diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000..0f4b96a830 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,10 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true + +[*.{c,h}] +tab_width = 8 +indent_style = tab diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 004711ae78..f28a747e82 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,10 +1,12 @@ -# Contributing to ZFS on Linux -

+# Contributing to OpenZFS +

+ OpenZFS Logo +

*First of all, thank you for taking the time to contribute!* -By using the following guidelines, you can help us make ZFS on Linux even -better. +By using the following guidelines, you can help us make OpenZFS even better. ## Table Of Contents [What should I know before I get @@ -32,17 +34,17 @@ started?](#what-should-i-know-before-i-get-started) Helpful resources - * [ZFS on Linux wiki](https://github.com/zfsonlinux/zfs/wiki) - * [OpenZFS Documentation](http://open-zfs.org/wiki/Developer_resources) - * [Git and GitHub for beginners](https://github.com/zfsonlinux/zfs/wiki/Git-and-GitHub-for-beginners) + * [OpenZFS Documentation](https://openzfs.github.io/openzfs-docs/) + * [OpenZFS Developer Resources](http://open-zfs.org/wiki/Developer_resources) + * [Git and GitHub for beginners](https://openzfs.github.io/openzfs-docs/Developer%20Resources/Git%20and%20GitHub%20for%20beginners.html) ## What should I know before I get started? ### Get ZFS You can build zfs packages by following [these -instructions](https://github.com/zfsonlinux/zfs/wiki/Building-ZFS), +instructions](https://openzfs.github.io/openzfs-docs/Developer%20Resources/Building%20ZFS.html), or install stable packages from [your distribution's -repository](https://github.com/zfsonlinux/zfs/wiki/Getting-Started). +repository](https://openzfs.github.io/openzfs-docs/Getting%20Started/index.html). ### Debug ZFS A variety of methods and tools are available to aid ZFS developers. @@ -51,29 +53,30 @@ configure option should be set. This will enable additional correctness checks and all the ASSERTs to help quickly catch potential issues. In addition, there are numerous utilities and debugging files which -provide visibility in to the inner workings of ZFS. The most useful -of these tools are discussed in detail on the [debugging ZFS wiki -page](https://github.com/zfsonlinux/zfs/wiki/Debugging). +provide visibility into the inner workings of ZFS. 
The most useful +of these tools are discussed in detail on the [Troubleshooting +page](https://openzfs.github.io/openzfs-docs/Basic%20Concepts/Troubleshooting.html). ### Where can I ask for help? -[The zfs-discuss mailing list or IRC](http://list.zfsonlinux.org) -are the best places to ask for help. Please do not file support requests -on the GitHub issue tracker. +The [zfs-discuss mailing +list](https://openzfs.github.io/openzfs-docs/Project%20and%20Community/Mailing%20Lists.html) +or IRC are the best places to ask for help. Please do not file +support requests on the GitHub issue tracker. ## How Can I Contribute? ### Reporting Bugs *Please* contact us via the [zfs-discuss mailing -list or IRC](http://list.zfsonlinux.org) if you aren't -certain that you are experiencing a bug. +list](https://openzfs.github.io/openzfs-docs/Project%20and%20Community/Mailing%20Lists.html) +or IRC if you aren't certain that you are experiencing a bug. If you run into an issue, please search our [issue -tracker](https://github.com/zfsonlinux/zfs/issues) *first* to ensure the +tracker](https://github.com/openzfs/zfs/issues) *first* to ensure the issue hasn't been reported before. Open a new issue only if you haven't found anything similar to your issue. You can open a new issue and search existing issues using the public [issue -tracker](https://github.com/zfsonlinux/zfs/issues). +tracker](https://github.com/openzfs/zfs/issues). #### When opening a new issue, please include the following information at the top of the issue: * What distribution (with version) you are using. @@ -105,13 +108,13 @@ information like: * Stack traces which may be logged to `dmesg`. ### Suggesting Enhancements -ZFS on Linux is a widely deployed production filesystem which is under -active development. The team's primary focus is on fixing known issues, -improving performance, and adding compelling new features. +OpenZFS is a widely deployed production filesystem which is under active +development. 
The team's primary focus is on fixing known issues, improving +performance, and adding compelling new features. You can view the list of proposed features -by filtering the issue tracker by the ["Feature" -label](https://github.com/zfsonlinux/zfs/issues?q=is%3Aopen+is%3Aissue+label%3AFeature). +by filtering the issue tracker by the ["Type: Feature" +label](https://github.com/openzfs/zfs/issues?q=is%3Aopen+is%3Aissue+label%3A%22Type%3A+Feature%22). If you have an idea for a feature first check this list. If your idea already appears then add a +1 to the top most comment, this helps us gauge interest in that feature. @@ -120,8 +123,11 @@ Otherwise, open a new issue and describe your proposed feature. Why is this feature needed? What problem does it solve? ### Pull Requests -* All pull requests must be based on the current master branch and apply -without conflicts. + +#### General + +* All pull requests, except backports and releases, must be based on the current master branch +and should apply without conflicts. * Please attempt to limit pull requests to a single commit which resolves one specific issue. * Make sure your commit messages are in the correct format. See the @@ -133,16 +139,28 @@ logically independent patches which build on each other. This makes large changes easier to review and approve which speeds up the merging process. * Try to keep pull requests simple. Simple code with comments is much easier to review and approve. +* All proposed changes must be approved by an OpenZFS organization member. +* If you have an idea you'd like to discuss or which requires additional testing, consider opening it as a draft pull request. +Once everything is in good shape and the details have been worked out you can remove its draft status. +Any required reviews can then be finalized and the pull request merged. 
+ +#### Tests and Benchmarks +* Every pull request will be tested by the buildbot on multiple platforms by running the [zfs-tests.sh and zloop.sh]( +https://openzfs.github.io/openzfs-docs/Developer%20Resources/Building%20ZFS.html#running-zloop-sh-and-zfs-tests-sh) test suites. +* To verify your changes conform to the [style guidelines]( +https://github.com/openzfs/zfs/blob/master/.github/CONTRIBUTING.md#style-guides +), please run `make checkstyle` and resolve any warnings. +* Static code analysis of each pull request is performed by the buildbot; run `make lint` to check your changes. * Test cases should be provided when appropriate. +This includes making sure new features have adequate code coverage. * If your pull request improves performance, please include some benchmarks. * The pull request must pass all required [ZFS Buildbot](http://build.zfsonlinux.org/) builders before being accepted. If you are experiencing intermittent TEST builder failures, you may be experiencing a [test suite -issue](https://github.com/zfsonlinux/zfs/issues?q=is%3Aissue+is%3Aopen+label%3A%22Test+Suite%22). -There are also various [buildbot options](https://github.com/zfsonlinux/zfs/wiki/Buildbot-Options) +issue](https://github.com/openzfs/zfs/issues?q=is%3Aissue+is%3Aopen+label%3A%22Type%3A+Test+Suite%22). +There are also various [buildbot options](https://openzfs.github.io/openzfs-docs/Developer%20Resources/Buildbot%20Options.html) to control how changes are tested. -* All proposed changes must be approved by a ZFS on Linux organization member. ### Testing All help is appreciated! If you're in a position to run the latest code @@ -152,16 +170,41 @@ range of realistic workloads, configurations and architectures we're better able quickly identify and resolve potential issues. 
Users can also run the [ZFS Test -Suite](https://github.com/zfsonlinux/zfs/tree/master/tests) on their systems +Suite](https://github.com/openzfs/zfs/tree/master/tests) on their systems to verify ZFS is behaving as intended. ## Style Guides +### Repository Structure + +OpenZFS uses a standardised branching structure. +- The "development and main branch", is the branch all development should be based on. +- "Release branches" contain the latest released code for said version. +- "Staging branches" contain selected commits prior to being released. + +**Branch Names:** +- Development and Main branch: `master` +- Release branches: `zfs-$VERSION-release` +- Staging branches: `zfs-$VERSION-staging` + +`$VERSION` should be replaced with the `major.minor` version number. +_(This is the version number without the `.patch` version at the end)_ + ### Coding Conventions We currently use [C Style and Coding Standards for SunOS](http://www.cis.upenn.edu/%7Elee/06cse480/data/cstyle.ms.pdf) as our coding convention. +This repository has an `.editorconfig` file. If your editor [supports +editorconfig](https://editorconfig.org/#download), it will +automatically respect most of this project's whitespace preferences. + +Additionally, Git can help warn on whitespace problems as well: + +``` +git config --local core.whitespace trailing-space,space-before-tab,indent-with-non-tab,-tab-in-indent +``` + ### Commit Message Formats #### New Changes Commit messages for new changes must meet the following guidelines: @@ -187,70 +230,6 @@ attempting to solve. Signed-off-by: Contributor ``` -#### OpenZFS Patch Ports -If you are porting OpenZFS patches, the commit message must meet -the following guidelines: -* The first line must be the summary line from the most important OpenZFS commit being ported. -It must begin with `OpenZFS dddd, dddd - ` where `dddd` are OpenZFS issue numbers. -* Provides a `Authored by:` line to attribute each patch for each original author. 
-* Provides the `Reviewed by:` and `Approved by:` lines from each original -OpenZFS commit. -* Provides a `Ported-by:` line with the developer's name followed by -their email for each OpenZFS commit. -* Provides a `OpenZFS-issue:` line with link for each original illumos -issue. -* Provides a `OpenZFS-commit:` line with link for each original OpenZFS commit. -* If necessary, provide some porting notes to describe any deviations from -the original OpenZFS commits. - -An example OpenZFS patch port commit message for a single patch is provided -below. -``` -OpenZFS 1234 - Summary from the original OpenZFS commit - -Authored by: Original Author -Reviewed by: Reviewer One -Reviewed by: Reviewer Two -Approved by: Approver One -Ported-by: ZFS Contributor - -Provide some porting notes here if necessary. - -OpenZFS-issue: https://www.illumos.org/issues/1234 -OpenZFS-commit: https://github.com/openzfs/openzfs/commit/abcd1234 -``` - -If necessary, multiple OpenZFS patches can be combined in a single port. -This is useful when you are porting a new patch and its subsequent bug -fixes. An example commit message is provided below. -``` -OpenZFS 1234, 5678 - Summary of most important OpenZFS commit - -1234 Summary from original OpenZFS commit for 1234 - -Authored by: Original Author -Reviewed by: Reviewer Two -Approved by: Approver One -Ported-by: ZFS Contributor - -Provide some porting notes here for 1234 if necessary. - -OpenZFS-issue: https://www.illumos.org/issues/1234 -OpenZFS-commit: https://github.com/openzfs/openzfs/commit/abcd1234 - -5678 Summary from original OpenZFS commit for 5678 - -Authored by: Original Author2 -Reviewed by: Reviewer One -Approved by: Approver Two -Ported-by: ZFS Contributor - -Provide some porting notes here for 5678 if necessary. 
- -OpenZFS-issue: https://www.illumos.org/issues/5678 -OpenZFS-commit: https://github.com/openzfs/openzfs/commit/efgh5678 -``` - #### Coverity Defect Fixes If you are submitting a fix to a [Coverity defect](https://scan.coverity.com/projects/zfsonlinux-zfs), @@ -290,3 +269,13 @@ Git can append the `Signed-off-by` line to your commit messages. Simply provide the `-s` or `--signoff` option when performing a `git commit`. For more information about writing commit messages, visit [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/). + +#### Co-authored By +If someone else had part in your pull request, please add the following to the commit: +`Co-authored-by: Name ` +This is useful if their authorship was lost during squashing, rebasing, etc., +but may be used in any situation where there are co-authors. + +The email address used here should be the same as on the GitHub profile of said user. +If said user does not have their email address public, please use the following instead: +`Co-authored-by: Name <[username]@users.noreply.github.com>` diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index e77ab39f35..0000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,48 +0,0 @@ - - - - -### System information - -Type | Version/Name - --- | --- -Distribution Name | -Distribution Version | -Linux Kernel | -Architecture | -ZFS Version | -SPL Version | - - -### Describe the problem you're observing - -### Describe how to reproduce the problem - -### Include any warning/errors/backtraces from the system logs - diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..92d0e03a9b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,55 @@ +--- +name: Bug report +about: Create a report to help us improve OpenZFS +title: '' +labels: 'Type: Defect' +assignees: '' + +--- + + + + + +### System information + +Type | Version/Name + 
--- | --- +Distribution Name | +Distribution Version | +Kernel Version | +Architecture | +OpenZFS Version | + + +### Describe the problem you're observing + +### Describe how to reproduce the problem + +### Include any warning/errors/backtraces from the system logs + + diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..ecaaa18210 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,14 @@ +blank_issues_enabled: false +contact_links: + - name: OpenZFS Questions + url: https://github.com/openzfs/zfs/discussions/new + about: Ask the community for help + - name: OpenZFS Community Support Mailing list (Linux) + url: https://zfsonlinux.topicbox.com/groups/zfs-discuss + about: Get community support for OpenZFS on Linux + - name: FreeBSD Community Support Mailing list + url: https://lists.freebsd.org/mailman/listinfo/freebsd-fs + about: Get community support for OpenZFS on FreeBSD + - name: OpenZFS on IRC + url: https://web.libera.chat/#openzfs + about: Use IRC to get community support for OpenZFS diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..9b50a4a3d9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,33 @@ +--- +name: Feature request +about: Suggest a feature for OpenZFS +title: '' +labels: 'Type: Feature' +assignees: '' + +--- + + + +### Describe the feature you would like to see added to OpenZFS + + + +### How will this feature improve OpenZFS? 
+ + + +### Additional context + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 699ca90780..465ee182c4 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -4,7 +4,7 @@ ### Motivation and Context @@ -19,6 +19,7 @@ https://github.com/zfsonlinux/zfs/wiki/Buildbot-Options + ### Types of changes @@ -27,14 +28,15 @@ https://github.com/zfsonlinux/zfs/wiki/Buildbot-Options - [ ] Performance enhancement (non-breaking change which improves efficiency) - [ ] Code cleanup (non-breaking change which makes code smaller or more readable) - [ ] Breaking change (fix or feature that would cause existing functionality to change) +- [ ] Library ABI change (libzfs, libzfs\_core, libnvpair, libuutil and libzfsbootenv) - [ ] Documentation (a change to man pages or other documentation) ### Checklist: -- [ ] My code follows the ZFS on Linux [code style requirements](https://github.com/zfsonlinux/zfs/blob/master/.github/CONTRIBUTING.md#coding-conventions). +- [ ] My code follows the OpenZFS [code style requirements](https://github.com/openzfs/zfs/blob/master/.github/CONTRIBUTING.md#coding-conventions). - [ ] I have updated the documentation accordingly. -- [ ] I have read the [**contributing** document](https://github.com/zfsonlinux/zfs/blob/master/.github/CONTRIBUTING.md). -- [ ] I have added [tests](https://github.com/zfsonlinux/zfs/tree/master/tests) to cover my changes. -- [ ] All new and existing tests passed. -- [ ] All commit messages are properly formatted and contain [`Signed-off-by`](https://github.com/zfsonlinux/zfs/blob/master/.github/CONTRIBUTING.md#signed-off-by). +- [ ] I have read the [**contributing** document](https://github.com/openzfs/zfs/blob/master/.github/CONTRIBUTING.md). +- [ ] I have added [tests](https://github.com/openzfs/zfs/tree/master/tests) to cover my changes. +- [ ] I have run the ZFS Test Suite with this change applied. 
+- [ ] All commit messages are properly formatted and contain [`Signed-off-by`](https://github.com/openzfs/zfs/blob/master/.github/CONTRIBUTING.md#signed-off-by). diff --git a/.github/codecov.yml b/.github/codecov.yml index 9ae962639e..6d4932680e 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -4,7 +4,8 @@ codecov: after_n_builds: 2 # user and kernel coverage: - precision: 2 # 2 digits of precision + precision: 0 # 0 decimals of precision + round: nearest # Round to nearest precision point range: "50...90" # red -> yellow -> green status: @@ -20,3 +21,5 @@ comment: layout: "reach, diff, flags, footer" behavior: once # update if exists; post new; skip if deleted require_changes: yes # only post when coverage changes + +# ignore: Please place any ignores in config/ax_code_coverage.m4 instead diff --git a/.github/no-response.yml b/.github/no-response.yml new file mode 100644 index 0000000000..ef2656ec96 --- /dev/null +++ b/.github/no-response.yml @@ -0,0 +1,13 @@ +# Configuration for probot-no-response - https://github.com/probot/no-response + +# Number of days of inactivity before an Issue is closed for lack of response +daysUntilClose: 31 +# Label requiring a response +responseRequiredLabel: "Status: Feedback requested" +# Comment to post when closing an Issue for lack of response. Set to `false` to disable +closeComment: > + This issue has been automatically closed because there has been no response + to our request for more information from the original author. With only the + information that is currently in the issue, we don't have enough information + to take action. Please reach out if you have or find the answers we need so + that we can investigate further. 
diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 0000000000..895cc8e803 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,26 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 365 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 90 +# Limit to only `issues` or `pulls` +only: issues +# Issues with these labels will never be considered stale +exemptLabels: + - "Type: Feature" + - "Bot: Not Stale" + - "Status: Work in Progress" +# Set to true to ignore issues in a project (defaults to false) +exemptProjects: true +# Set to true to ignore issues in a milestone (defaults to false) +exemptMilestones: true +# Set to true to ignore issues with an assignee (defaults to false) +exemptAssignees: true +# Label to use when marking an issue as stale +staleLabel: "Status: Stale" +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as "stale" because it has not had + any activity for a while. It will be closed in 90 days if no further activity occurs. + Thank you for your contributions. +# Limit the number of actions per hour, from 1-30. 
Default is 30 +limitPerRun: 6 diff --git a/.github/suppressions.txt b/.github/suppressions.txt deleted file mode 100644 index f9508a24b4..0000000000 --- a/.github/suppressions.txt +++ /dev/null @@ -1,3 +0,0 @@ -preprocessorErrorDirective:./module/zfs/vdev_raidz_math_avx512f.c:243 -preprocessorErrorDirective:./module/zfs/vdev_raidz_math_sse2.c:266 - diff --git a/.github/workflows/checkstyle.yaml b/.github/workflows/checkstyle.yaml new file mode 100644 index 0000000000..553d5df397 --- /dev/null +++ b/.github/workflows/checkstyle.yaml @@ -0,0 +1,50 @@ +name: checkstyle + +on: + push: + pull_request: + +jobs: + checkstyle: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install --yes -qq build-essential autoconf libtool gawk alien fakeroot linux-headers-$(uname -r) + sudo apt-get install --yes -qq zlib1g-dev uuid-dev libattr1-dev libblkid-dev libselinux-dev libudev-dev libssl-dev python-dev python-setuptools python-cffi python3 python3-dev python3-setuptools python3-cffi + # packages for tests + sudo apt-get install --yes -qq parted lsscsi ksh attr acl nfs-kernel-server fio + sudo apt-get install --yes -qq mandoc cppcheck pax-utils devscripts + sudo -E pip --quiet install flake8 + - name: Prepare + run: | + sh ./autogen.sh + ./configure + make -j$(nproc) + - name: Checkstyle + run: | + make checkstyle + - name: Lint + run: | + make lint + - name: CheckABI + id: CheckABI + run: | + sudo docker run -v $(pwd):/source ghcr.io/openzfs/libabigail make checkabi + - name: StoreABI + if: failure() && steps.CheckABI.outcome == 'failure' + run: | + sudo docker run -v $(pwd):/source ghcr.io/openzfs/libabigail make storeabi + - name: Prepare artifacts + if: failure() && steps.CheckABI.outcome == 'failure' + run: | + find -name *.abi | tar -cf abi_files.tar -T - + - uses: actions/upload-artifact@v2 + if: failure() && 
steps.CheckABI.outcome == 'failure' + with: + name: New ABI files (use only if you're sure about interface changes) + path: abi_files.tar diff --git a/.github/workflows/zfs-tests-functional.yml b/.github/workflows/zfs-tests-functional.yml new file mode 100644 index 0000000000..aad3d552b2 --- /dev/null +++ b/.github/workflows/zfs-tests-functional.yml @@ -0,0 +1,82 @@ +name: zfs-tests-functional + +on: + push: + pull_request: + +jobs: + tests-functional-ubuntu: + strategy: + fail-fast: false + matrix: + os: [18.04, 20.04] + runs-on: ubuntu-${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install --yes -qq build-essential autoconf libtool gdb lcov \ + git alien fakeroot wget curl bc fio acl \ + sysstat mdadm lsscsi parted gdebi attr dbench watchdog ksh \ + nfs-kernel-server samba rng-tools xz-utils \ + zlib1g-dev uuid-dev libblkid-dev libselinux-dev \ + xfslibs-dev libattr1-dev libacl1-dev libudev-dev libdevmapper-dev \ + libssl-dev libffi-dev libaio-dev libelf-dev libmount-dev \ + libpam0g-dev pamtester python-dev python-setuptools python-cffi \ + python-packaging python3 python3-dev python3-setuptools python3-cffi \ + libcurl4-openssl-dev python3-packaging + - name: Autogen.sh + run: | + sh autogen.sh + - name: Configure + run: | + ./configure --enable-debug --enable-debuginfo + - name: Make + run: | + make --no-print-directory -s pkg-utils pkg-kmod + - name: Install + run: | + sudo dpkg -i *.deb + # Update order of directories to search for modules, otherwise + # Ubuntu will load kernel-shipped ones. 
+ sudo sed -i.bak 's/updates/extra updates/' /etc/depmod.d/ubuntu.conf + sudo depmod + sudo modprobe zfs + # Workaround for cloud-init bug + # see https://github.com/openzfs/zfs/issues/12644 + FILE=/lib/udev/rules.d/10-cloud-init-hook-hotplug.rules + if [ -r "${FILE}" ]; then + HASH=$(md5sum "${FILE}" | awk '{ print $1 }') + if [ "${HASH}" = "121ff0ef1936cd2ef65aec0458a35772" ]; then + # Just shove a zd* exclusion right above the hotplug hook... + sudo sed -i -e s/'LABEL="cloudinit_hook"'/'KERNEL=="zd*", GOTO="cloudinit_end"\n&'/ "${FILE}" + sudo udevadm control --reload-rules + fi + fi + # Workaround to provide additional free space for testing. + # https://github.com/actions/virtual-environments/issues/2840 + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Tests + run: | + /usr/share/zfs/zfs-tests.sh -v -s 3G + - name: Prepare artifacts + if: failure() + run: | + RESULTS_PATH=$(readlink -f /var/tmp/test_results/current) + sudo dmesg > $RESULTS_PATH/dmesg + sudo cp /var/log/syslog $RESULTS_PATH/ + sudo chmod +r $RESULTS_PATH/* + # Replace ':' in dir names, actions/upload-artifact doesn't support it + for f in $(find $RESULTS_PATH -name '*:*'); do mv "$f" "${f//:/__}"; done + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: Test logs Ubuntu-${{ matrix.os }} + path: /var/tmp/test_results/20*/ + if-no-files-found: ignore diff --git a/.github/workflows/zfs-tests-sanity.yml b/.github/workflows/zfs-tests-sanity.yml new file mode 100644 index 0000000000..4df49461ed --- /dev/null +++ b/.github/workflows/zfs-tests-sanity.yml @@ -0,0 +1,78 @@ +name: zfs-tests-sanity + +on: + push: + pull_request: + +jobs: + tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install --yes -qq build-essential autoconf libtool gdb 
lcov \ + git alien fakeroot wget curl bc fio acl \ + sysstat mdadm lsscsi parted gdebi attr dbench watchdog ksh \ + nfs-kernel-server samba rng-tools xz-utils \ + zlib1g-dev uuid-dev libblkid-dev libselinux-dev \ + xfslibs-dev libattr1-dev libacl1-dev libudev-dev libdevmapper-dev \ + libssl-dev libffi-dev libaio-dev libelf-dev libmount-dev \ + libpam0g-dev pamtester python-dev python-setuptools python-cffi \ + python-packaging python3 python3-dev python3-setuptools python3-cffi \ + python3-packaging libcurl4-openssl-dev + - name: Autogen.sh + run: | + sh autogen.sh + - name: Configure + run: | + ./configure --enable-debug --enable-debuginfo + - name: Make + run: | + make --no-print-directory -s pkg-utils pkg-kmod + - name: Install + run: | + sudo dpkg -i *.deb + # Update order of directories to search for modules, otherwise + # Ubuntu will load kernel-shipped ones. + sudo sed -i.bak 's/updates/extra updates/' /etc/depmod.d/ubuntu.conf + sudo depmod + sudo modprobe zfs + # Workaround for cloud-init bug + # see https://github.com/openzfs/zfs/issues/12644 + FILE=/lib/udev/rules.d/10-cloud-init-hook-hotplug.rules + if [ -r "${FILE}" ]; then + HASH=$(md5sum "${FILE}" | awk '{ print $1 }') + if [ "${HASH}" = "121ff0ef1936cd2ef65aec0458a35772" ]; then + # Just shove a zd* exclusion right above the hotplug hook... + sudo sed -i -e s/'LABEL="cloudinit_hook"'/'KERNEL=="zd*", GOTO="cloudinit_end"\n&'/ "${FILE}" + sudo udevadm control --reload-rules + fi + fi + # Workaround to provide additional free space for testing. 
+ # https://github.com/actions/virtual-environments/issues/2840 + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Tests + run: | + /usr/share/zfs/zfs-tests.sh -v -s 3G -r sanity + - name: Prepare artifacts + if: failure() + run: | + RESULTS_PATH=$(readlink -f /var/tmp/test_results/current) + sudo dmesg > $RESULTS_PATH/dmesg + sudo cp /var/log/syslog $RESULTS_PATH/ + sudo chmod +r $RESULTS_PATH/* + # Replace ':' in dir names, actions/upload-artifact doesn't support it + for f in $(find $RESULTS_PATH -name '*:*'); do mv "$f" "${f//:/__}"; done + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: Test logs + path: /var/tmp/test_results/20*/ + if-no-files-found: ignore diff --git a/.github/workflows/zloop.yml b/.github/workflows/zloop.yml new file mode 100644 index 0000000000..cf81ad4bca --- /dev/null +++ b/.github/workflows/zloop.yml @@ -0,0 +1,67 @@ +name: zloop + +on: + push: + pull_request: + +jobs: + tests: + runs-on: ubuntu-latest + env: + TEST_DIR: /var/tmp/zloop + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install --yes -qq build-essential autoconf libtool gdb \ + git alien fakeroot \ + zlib1g-dev uuid-dev libblkid-dev libselinux-dev \ + xfslibs-dev libattr1-dev libacl1-dev libudev-dev libdevmapper-dev \ + libssl-dev libffi-dev libaio-dev libelf-dev libmount-dev \ + libpam0g-dev \ + python-dev python-setuptools python-cffi python-packaging \ + python3 python3-dev python3-setuptools python3-cffi python3-packaging + - name: Autogen.sh + run: | + sh autogen.sh + - name: Configure + run: | + ./configure --enable-debug --enable-debuginfo + - name: Make + run: | + make --no-print-directory -s pkg-utils pkg-kmod + - name: Install + run: | + sudo dpkg -i *.deb + # Update order of directories to search for modules, otherwise + # Ubuntu 
will load kernel-shipped ones. + sudo sed -i.bak 's/updates/extra updates/' /etc/depmod.d/ubuntu.conf + sudo depmod + sudo modprobe zfs + - name: Tests + run: | + sudo mkdir -p $TEST_DIR + # run for 20 minutes to have a total runner time of 30 minutes + sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1 -- -T 120 -P 60 + - name: Prepare artifacts + if: failure() + run: | + sudo chmod +r -R $TEST_DIR/ + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: Logs + path: | + /var/tmp/zloop/*/ + !/var/tmp/zloop/*/vdev/ + if-no-files-found: ignore + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: Pool files + path: | + /var/tmp/zloop/*/vdev/ + if-no-files-found: ignore diff --git a/.gitignore b/.gitignore index 549fa59f38..056bbb8f08 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ Makefile.in # Top level generated files specific to this top level dir # /bin +/build /configure /config.log /config.status @@ -61,5 +62,9 @@ cscope.* *.patch *.orig *.log +*.tmp venv +*.so +*.so.debug +*.so.full diff --git a/.gitmodules b/.gitmodules index d400f10a7e..9eaa2b0495 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "scripts/zfs-images"] path = scripts/zfs-images - url = https://github.com/zfsonlinux/zfs-images + url = https://github.com/openzfs/zfs-images diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 620c0432e2..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,38 +0,0 @@ -language: c -sudo: required -env: - global: - # Travis limits maximum log size, we have to cut tests output - - ZFS_TEST_TRAVIS_LOG_MAX_LENGTH=800 - matrix: - # tags are mainly in ascending order - - ZFS_TEST_TAGS='acl,atime,bootfs,cachefile,casenorm,chattr,checksum,clean_mirror,compression,ctime,delegate,devices,events,exec,fault,features,grow_pool,zdb,zfs,zfs_bookmark,zfs_change-key,zfs_clone,zfs_copies,zfs_create,zfs_diff,zfs_get,zfs_inherit,zfs_load-key,zfs_rename' - - 
ZFS_TEST_TAGS='cache,history,hkdf,inuse,zfs_property,zfs_receive,zfs_reservation,zfs_send,zfs_set,zfs_share,zfs_snapshot,zfs_unload-key,zfs_unmount,zfs_unshare,zfs_upgrade,zpool,zpool_add,zpool_attach,zpool_clear,zpool_create,zpool_destroy,zpool_detach' - - ZFS_TEST_TAGS='grow_replicas,mv_files,cli_user,zfs_mount,zfs_promote,zfs_rollback,zpool_events,zpool_expand,zpool_export,zpool_get,zpool_history,zpool_import,zpool_labelclear,zpool_offline,zpool_online,zpool_remove,zpool_reopen,zpool_replace,zpool_scrub,zpool_set,zpool_status,zpool_sync,zpool_upgrade' - - ZFS_TEST_TAGS='zfs_destroy,large_files,largest_pool,link_count,migration,mmap,mmp,mount,nestedfs,no_space,nopwrite,online_offline,pool_names,poolversion,privilege,quota,raidz,redundancy,rsend' - - ZFS_TEST_TAGS='inheritance,refquota,refreserv,rename_dirs,replacement,reservation,rootpool,scrub_mirror,slog,snapshot,snapused,sparse,threadsappend,tmpfile,truncate,upgrade,userquota,vdev_zaps,write_dirs,xattr,zvol,libzfs' -before_install: - - sudo apt-get -qq update - - sudo apt-get install --yes -qq build-essential autoconf libtool gawk alien fakeroot linux-headers-$(uname -r) - - sudo apt-get install --yes -qq zlib1g-dev uuid-dev libattr1-dev libblkid-dev libselinux-dev libudev-dev libssl-dev - # packages for tests - - sudo apt-get install --yes -qq parted lsscsi ksh attr acl nfs-kernel-server fio -install: - - git clone --depth=1 https://github.com/zfsonlinux/spl - - cd spl - - git checkout master - - sh autogen.sh - - ./configure - - make --no-print-directory -s pkg-utils pkg-kmod - - sudo dpkg -i *.deb - - cd .. 
- - sh autogen.sh - - ./configure - - make --no-print-directory -s pkg-utils pkg-kmod - - sudo dpkg -i *.deb -script: - - travis_wait 50 /usr/share/zfs/zfs-tests.sh -v -T $ZFS_TEST_TAGS -after_failure: - - find /var/tmp/test_results/current/log -type f -name '*' -printf "%f\n" -exec cut -c -$ZFS_TEST_TRAVIS_LOG_MAX_LENGTH {} \; -after_success: - - find /var/tmp/test_results/current/log -type f -name '*' -printf "%f\n" -exec cut -c -$ZFS_TEST_TRAVIS_LOG_MAX_LENGTH {} \; diff --git a/AUTHORS b/AUTHORS index 8314a1c214..aab8bf29c9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -83,6 +83,7 @@ CONTRIBUTORS: Christopher Voltz Chunwei Chen Clemens Fruhwirth + Coleman Kane Colin Ian King Craig Loomis Craig Sanders @@ -181,6 +182,7 @@ CONTRIBUTORS: Keith M Wesolowski Kevin Tanguy KireinaHoro + Kjeld Schouten-Lebbing Kohsuke Kawaguchi Kyle Blatter Kyle Fuller @@ -209,6 +211,7 @@ CONTRIBUTORS: Michael Gebetsroither Michael Kjorling Michael Martin + Michael Niewöhner Mike Gerdts Mike Harsch Mike Leddy @@ -257,6 +260,7 @@ CONTRIBUTORS: Saso Kiselkov Scot W. Stevenson Sean Eric Fagan + Sebastian Gottschall Sen Haerens Serapheim Dimitropoulos Seth Forshee diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index d314a66b4e..2dcc251e55 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,2 +1,2 @@ The [OpenZFS Code of Conduct](http://www.open-zfs.org/wiki/Code_of_Conduct) -applies to spaces associated with the ZFS on Linux project, including GitHub. +applies to spaces associated with the OpenZFS project, including GitHub. 
diff --git a/COPYRIGHT b/COPYRIGHT index 54fbceade1..85556b542f 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -19,7 +19,11 @@ notable exceptions and their respective licenses include: * AES Implementation: module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman * AES Implementation: module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl * PBKDF2 Implementation: lib/libzfs/THIRDPARTYLICENSE.openssl - * SPL Implementation: module/spl/THIRDPARTYLICENSE.gplv2 + * SPL Implementation: module/os/linux/spl/THIRDPARTYLICENSE.gplv2 + * GCM Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams + * GCM Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl + * GHASH Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams + * GHASH Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl This product includes software developed by the OpenSSL Project for use in the OpenSSL Toolkit (http://www.openssl.org/) diff --git a/META b/META index a93750eebd..8dacb8082f 100644 --- a/META +++ b/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 0.8.0 +Version: 2.1.99 Release: 1 Release-Tags: relext License: CDDL -Author: OpenZFS on Linux -Linux-Maximum: 5.1 -Linux-Minimum: 2.6.32 +Author: OpenZFS +Linux-Maximum: 5.14 +Linux-Minimum: 3.10 diff --git a/Makefile.am b/Makefile.am index 1ec2514922..34fe16ce41 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,12 +1,17 @@ +include $(top_srcdir)/config/Shellcheck.am + ACLOCAL_AMFLAGS = -I config -include config/rpm.am -include config/deb.am -include config/tgz.am +SUBDIRS = include +if BUILD_LINUX +SUBDIRS += rpm +endif -SUBDIRS = include rpm if CONFIG_USER -SUBDIRS += udev etc man scripts lib tests cmd contrib +SUBDIRS += man scripts lib tests cmd etc contrib +if BUILD_LINUX +SUBDIRS += udev +endif endif if CONFIG_KERNEL SUBDIRS += module @@ -14,33 +19,51 @@ SUBDIRS += module extradir = $(prefix)/src/zfs-$(VERSION) extra_HEADERS = zfs.release.in zfs_config.h.in +if BUILD_LINUX 
kerneldir = $(prefix)/src/zfs-$(VERSION)/$(LINUX_VERSION) nodist_kernel_HEADERS = zfs.release zfs_config.h module/$(LINUX_SYMBOLS) endif +endif AUTOMAKE_OPTIONS = foreign EXTRA_DIST = autogen.sh copy-builtin EXTRA_DIST += config/config.awk config/rpm.am config/deb.am config/tgz.am -EXTRA_DIST += META AUTHORS COPYRIGHT LICENSE NEWS NOTICE README.md -EXTRA_DIST += CODE_OF_CONDUCT.md +EXTRA_DIST += AUTHORS CODE_OF_CONDUCT.md COPYRIGHT LICENSE META NEWS NOTICE +EXTRA_DIST += README.md RELEASES.md +EXTRA_DIST += module/lua/README.zfs module/os/linux/spl/README.md # Include all the extra licensing information for modules -EXTRA_DIST += module/icp/algs/skein/THIRDPARTYLICENSE module/icp/algs/skein/THIRDPARTYLICENSE.descrip -EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip -EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip -EXTRA_DIST += module/spl/THIRDPARTYLICENSE.gplv2 module/spl/THIRDPARTYLICENSE.gplv2.descrip -EXTRA_DIST += module/zfs/THIRDPARTYLICENSE.cityhash module/zfs/THIRDPARTYLICENSE.cityhash.descrip +EXTRA_DIST += module/icp/algs/skein/THIRDPARTYLICENSE +EXTRA_DIST += module/icp/algs/skein/THIRDPARTYLICENSE.descrip +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip +EXTRA_DIST += module/os/linux/spl/THIRDPARTYLICENSE.gplv2 +EXTRA_DIST += module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip 
+EXTRA_DIST += module/zfs/THIRDPARTYLICENSE.cityhash +EXTRA_DIST += module/zfs/THIRDPARTYLICENSE.cityhash.descrip @CODE_COVERAGE_RULES@ -.PHONY: gitrev -gitrev: - -${top_srcdir}/scripts/make_gitrev.sh +GITREV = include/zfs_gitrev.h -BUILT_SOURCES = gitrev +PHONY = gitrev +gitrev: + $(AM_V_GEN)$(top_srcdir)/scripts/make_gitrev.sh $(GITREV) + +all: gitrev + +# Double-colon rules are allowed; there are multiple independent definitions. +maintainer-clean-local:: + -$(RM) $(GITREV) distclean-local:: - -$(RM) -R autom4te*.cache + -$(RM) -R autom4te*.cache build -find . \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS \ -o -name .pc -o -name .hg -o -name .git \) -prune -o \ \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ @@ -52,13 +75,15 @@ distclean-local:: -type f -print | xargs $(RM) all-local: - -${top_srcdir}/scripts/zfs-tests.sh -c + -[ -x ${top_builddir}/scripts/zfs-tests.sh ] && \ + ${top_builddir}/scripts/zfs-tests.sh -c -dist-hook: gitrev - cp ${top_srcdir}/include/zfs_gitrev.h $(distdir)/include; \ - sed -i 's/Release:[[:print:]]*/Release: $(RELEASE)/' \ +dist-hook: + $(AM_V_GEN)$(top_srcdir)/scripts/make_gitrev.sh -D $(distdir) $(GITREV) + $(SED) ${ac_inplace} -e 's/Release:[[:print:]]*/Release: $(RELEASE)/' \ $(distdir)/META +if BUILD_LINUX # For compatibility, create a matching spl-x.y.z directly which contains # symlinks to the updated header and object file locations. These # compatibility links will be removed in the next major release. 
@@ -75,75 +100,102 @@ install-data-hook: ln -fs zfs_config.h spl_config.h && \ ln -fs zfs.release spl.release endif +endif -codecheck: cstyle shellcheck flake8 mancheck testscheck vcscheck +PHONY += codecheck +codecheck: cstyle shellcheck checkbashisms flake8 mancheck testscheck vcscheck +PHONY += checkstyle checkstyle: codecheck commitcheck +PHONY += commitcheck commitcheck: @if git rev-parse --git-dir > /dev/null 2>&1; then \ ${top_srcdir}/scripts/commitcheck.sh; \ fi +PHONY += cstyle cstyle: - @find ${top_srcdir} -name '*.[hc]' ! -name 'zfs_config.*' \ - ! -name '*.mod.c' -type f \ + @find ${top_srcdir} -name build -prune \ + -o -type f -name '*.[hc]' \ + ! -name 'zfs_config.*' ! -name '*.mod.c' \ + ! -name 'opt_global.h' ! -name '*_if*.h' \ + ! -path './module/zstd/lib/*' \ -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} \+ -shellcheck: - @if type shellcheck > /dev/null 2>&1; then \ - shellcheck --exclude=SC1090 --format=gcc \ - $$(find ${top_srcdir}/scripts/*.sh -type f) \ - $$(find ${top_srcdir}/cmd/zed/zed.d/*.sh -type f) \ - $$(find ${top_srcdir}/cmd/zpool/zpool.d/* -executable); \ - else \ - echo "skipping shellcheck because shellcheck is not installed"; \ - fi +filter_executable = -exec test -x '{}' \; -print +SHELLCHECKDIRS = cmd contrib etc scripts tests +SHELLCHECKSCRIPTS = autogen.sh + +PHONY += checkabi storeabi + +checklibabiversion: + libabiversion=`abidw -v | $(SED) 's/[^0-9]//g'`; \ + if test $$libabiversion -lt "200"; then \ + /bin/echo -e "\n" \ + "*** Please use libabigail 2.0.0 version or newer;\n" \ + "*** otherwise results are not consistent!\n" \ + "(or see https://github.com/openzfs/libabigail-docker )\n"; \ + exit 1; \ + fi; + +checkabi: checklibabiversion lib + $(MAKE) -C lib checkabi + +storeabi: checklibabiversion lib + $(MAKE) -C lib storeabi + +PHONY += mancheck mancheck: - @if type mandoc > /dev/null 2>&1; then \ - find ${top_srcdir}/man/man8 -type f -name 'zfs.8' \ - -o -name 'zpool.8' -o -name 'zdb.8' \ - -o -name 'zgenhostid.8' | 
\ - xargs mandoc -Tlint -Werror; \ - else \ - echo "skipping mancheck because mandoc is not installed"; \ - fi + ${top_srcdir}/scripts/mancheck.sh ${top_srcdir}/man ${top_srcdir}/tests/test-runner/man +if BUILD_LINUX +stat_fmt = -c '%A %n' +else +stat_fmt = -f '%Sp %N' +endif + +PHONY += testscheck testscheck: @find ${top_srcdir}/tests/zfs-tests -type f \ - \( -name '*.ksh' -not -executable \) -o \ - \( -name '*.kshlib' -executable \) -o \ - \( -name '*.shlib' -executable \) -o \ - \( -name '*.cfg' -executable \) | \ - xargs -r stat -c '%A %n' | \ + \( -name '*.ksh' -not ${filter_executable} \) -o \ + \( -name '*.kshlib' ${filter_executable} \) -o \ + \( -name '*.shlib' ${filter_executable} \) -o \ + \( -name '*.cfg' ${filter_executable} \) | \ + xargs -r stat ${stat_fmt} | \ awk '{c++; print} END {if(c>0) exit 1}' +PHONY += vcscheck vcscheck: @if git rev-parse --git-dir > /dev/null 2>&1; then \ git ls-files . --exclude-standard --others | \ awk '{c++; print} END {if(c>0) exit 1}' ; \ fi +PHONY += lint lint: cppcheck paxcheck -cppcheck: - @if type cppcheck > /dev/null 2>&1; then \ - cppcheck --quiet --force --error-exitcode=2 --inline-suppr \ - --suppressions-list=.github/suppressions.txt \ - -UHAVE_SSE2 -UHAVE_AVX512F -UHAVE_UIO_ZEROCOPY \ - ${top_srcdir}; \ +CPPCHECKDIRS = cmd lib module +PHONY += cppcheck +cppcheck: $(CPPCHECKDIRS) + @if test -n "$(CPPCHECK)"; then \ + set -e ; for dir in $(CPPCHECKDIRS) ; do \ + $(MAKE) -C $$dir cppcheck ; \ + done \ else \ echo "skipping cppcheck because cppcheck is not installed"; \ fi +PHONY += paxcheck paxcheck: @if type scanelf > /dev/null 2>&1; then \ - ${top_srcdir}/scripts/paxcheck.sh ${top_srcdir}; \ + ${top_srcdir}/scripts/paxcheck.sh ${top_builddir}; \ else \ echo "skipping paxcheck because scanelf is not installed"; \ fi +PHONY += flake8 flake8: @if type flake8 > /dev/null 2>&1; then \ flake8 ${top_srcdir}; \ @@ -151,17 +203,34 @@ flake8: echo "skipping flake8 because flake8 is not installed"; \ fi +PHONY += ctags 
ctags: $(RM) tags - find $(top_srcdir) -name .git -prune -o -name '*.[hc]' | xargs ctags + find $(top_srcdir) -name '.?*' -prune \ + -o -type f -name '*.[hcS]' -print | xargs ctags -a +PHONY += etags etags: $(RM) TAGS - find $(top_srcdir) -name .pc -prune -o -name '*.[hc]' | xargs etags -a + find $(top_srcdir) -name '.?*' -prune \ + -o -type f -name '*.[hcS]' -print | xargs etags -a +PHONY += cscopelist +cscopelist: + find $(top_srcdir) -name '.?*' -prune \ + -o -type f -name '*.[hc]' -print >cscope.files + +PHONY += tags tags: ctags etags +PHONY += pkg pkg-dkms pkg-kmod pkg-utils pkg: @DEFAULT_PACKAGE@ pkg-dkms: @DEFAULT_PACKAGE@-dkms pkg-kmod: @DEFAULT_PACKAGE@-kmod pkg-utils: @DEFAULT_PACKAGE@-utils + +include config/rpm.am +include config/deb.am +include config/tgz.am + +.PHONY: $(PHONY) diff --git a/NEWS b/NEWS index bbdc2b69bb..3907ce5326 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,3 @@ Descriptions of all releases can be found on github: -https://github.com/zfsonlinux/zfs/releases +https://github.com/openzfs/zfs/releases diff --git a/README.md b/README.md index 59d167f8ec..d666df7af3 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,35 @@ -![img](http://zfsonlinux.org/images/zfs-linux.png) +![img](https://openzfs.github.io/openzfs-docs/_static/img/logo/480px-Open-ZFS-Secondary-Logo-Colour-halfsize.png) -ZFS on Linux is an advanced file system and volume manager which was originally +OpenZFS is an advanced file system and volume manager which was originally developed for Solaris and is now maintained by the OpenZFS community. +This repository contains the code for running OpenZFS on Linux and FreeBSD. 
-[![codecov](https://codecov.io/gh/zfsonlinux/zfs/branch/master/graph/badge.svg)](https://codecov.io/gh/zfsonlinux/zfs) -[![coverity](https://scan.coverity.com/projects/1973/badge.svg)](https://scan.coverity.com/projects/zfsonlinux-zfs) +[![codecov](https://codecov.io/gh/openzfs/zfs/branch/master/graph/badge.svg)](https://codecov.io/gh/openzfs/zfs) +[![coverity](https://scan.coverity.com/projects/1973/badge.svg)](https://scan.coverity.com/projects/openzfs-zfs) # Official Resources - * [Site](http://zfsonlinux.org) - * [Wiki](https://github.com/zfsonlinux/zfs/wiki) - * [Mailing lists](https://github.com/zfsonlinux/zfs/wiki/Mailing-Lists) - * [OpenZFS site](http://open-zfs.org/) + * [Documentation](https://openzfs.github.io/openzfs-docs/) - for using and developing this repo + * [ZoL Site](https://zfsonlinux.org) - Linux release info & links + * [Mailing lists](https://openzfs.github.io/openzfs-docs/Project%20and%20Community/Mailing%20Lists.html) + * [OpenZFS site](http://open-zfs.org/) - for conference videos and info on other platforms (illumos, OSX, Windows, etc) # Installation -Full documentation for installing ZoL on your favorite Linux distribution can -be found at [our site](http://zfsonlinux.org/). +Full documentation for installing OpenZFS on your favorite operating system can +be found at the [Getting Started Page](https://openzfs.github.io/openzfs-docs/Getting%20Started/index.html). # Contribute & Develop We have a separate document with [contribution guidelines](./.github/CONTRIBUTING.md). +We have a [Code of Conduct](./CODE_OF_CONDUCT.md). + # Release -ZFS on Linux is released under a CDDL license. +OpenZFS is released under a CDDL license. For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197` # Supported Kernels - * The `META` file contains the officially recognized supported kernel versions. + * The `META` file contains the officially recognized supported Linux kernel versions. 
+ * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE. diff --git a/RELEASES.md b/RELEASES.md new file mode 100644 index 0000000000..55bfdb80ef --- /dev/null +++ b/RELEASES.md @@ -0,0 +1,37 @@ +OpenZFS uses the MAJOR.MINOR.PATCH versioning scheme described here: + + * MAJOR - Incremented at the discretion of the OpenZFS developers to indicate + a particularly noteworthy feature or change. An increase in MAJOR number + does not indicate any incompatible on-disk format change. The ability + to import a ZFS pool is controlled by the feature flags enabled on the + pool and the feature flags supported by the installed OpenZFS version. + Increasing the MAJOR version is expected to be an infrequent occurrence. + + * MINOR - Incremented to indicate new functionality such as a new feature + flag, pool/dataset property, zfs/zpool sub-command, new user/kernel + interface, etc. MINOR releases may introduce incompatible changes to the + user space library APIs (libzfs.so). Existing user/kernel interfaces are + considered to be stable to maximize compatibility between OpenZFS releases. + Additions to the user/kernel interface are backwards compatible. + + * PATCH - Incremented when applying documentation updates, important bug + fixes, minor performance improvements, and kernel compatibility patches. + The user space library APIs and user/kernel interface are considered to + be stable. PATCH releases for a MAJOR.MINOR are published as needed. + +Two release branches are maintained for OpenZFS, they are: + + * OpenZFS LTS - A designated MAJOR.MINOR release with periodic PATCH + releases that incorporate important changes backported from newer OpenZFS + releases. This branch is intended for use in environments using an + LTS, enterprise, or similarly managed kernel (RHEL, Ubuntu LTS, Debian). + Minor changes to support these distribution kernels will be applied as + needed. 
New kernel versions released after the OpenZFS LTS release are + not supported. LTS releases will receive patches for at least 2 years. + The current LTS release is OpenZFS 2.1. + + * OpenZFS current - Tracks the newest MAJOR.MINOR release. This branch + includes support for the latest OpenZFS features and recently released + kernels. When a new MINOR release is tagged the previous MINOR release + will no longer be maintained (unless it is an LTS release). New MINOR + releases are planned to occur roughly annually. diff --git a/TEST b/TEST index ebe6ef963f..376d6eb691 100644 --- a/TEST +++ b/TEST @@ -48,64 +48,3 @@ #TEST_ZFSSTRESS_VDEV="/var/tmp/vdev" #TEST_ZFSSTRESS_DIR="/$TEST_ZFSSTRESS_POOL/$TEST_ZFSSTRESS_FS" #TEST_ZFSSTRESS_OPTIONS="" - -### per-builder customization -# -# BB_NAME=builder-name -# - distribution=Amazon,Debian,Fedora,RHEL,SUSE,Ubuntu -# - version=x.y -# - architecture=x86_64,i686,arm,aarch64 -# - type=build,test -# -case "$BB_NAME" in -Amazon*) - # ZFS enabled xfstests fails to build - TEST_XFSTESTS_SKIP="yes" - ;; -CentOS-7*) - # ZFS enabled xfstests fails to build - TEST_XFSTESTS_SKIP="yes" - ;; -CentOS-6*) - ;; -Debian*) - ;; -Fedora*) - ;; -RHEL*) - ;; -SUSE*) - ;; -Ubuntu-16.04*) - # ZFS enabled xfstests fails to build - TEST_XFSTESTS_SKIP="yes" - ;; -Ubuntu*) - ;; -*) - ;; -esac - -### -# -# Run ztest longer on the "coverage" builders to gain more code coverage -# data out of ztest, libzpool, etc. -# -case "$BB_NAME" in -*coverage*) - TEST_ZTEST_TIMEOUT=3600 - ;; -*) - TEST_ZTEST_TIMEOUT=900 - ;; -esac - -### -# -# Disable the following test suites on 32-bit systems. 
-# -if [ $(getconf LONG_BIT) = "32" ]; then - TEST_ZTEST_SKIP="yes" - TEST_XFSTESTS_SKIP="yes" - TEST_ZFSSTRESS_SKIP="yes" -fi diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 9dd7b8b4f0..5fc9e83971 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,3 +1,27 @@ -SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest -SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed -SUBDIRS += arc_summary raidz_test zgenhostid +include $(top_srcdir)/config/Shellcheck.am + +SUBDIRS = zfs zpool zdb zhack zinject zstream ztest +SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path +SUBDIRS += zpool_influxdb + +CPPCHECKDIRS = zfs zpool zdb zhack zinject zstream ztest +CPPCHECKDIRS += raidz_test zfs_ids_to_path zpool_influxdb + +# TODO: #12084: SHELLCHECKDIRS = fsck_zfs vdev_id zpool +SHELLCHECKDIRS = fsck_zfs zpool + +if USING_PYTHON +SUBDIRS += arcstat arc_summary dbufstat +endif + +if BUILD_LINUX +SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait +CPPCHECKDIRS += mount_zfs zed zgenhostid zvol_id +SHELLCHECKDIRS += zed +endif + +PHONY = cppcheck +cppcheck: $(CPPCHECKDIRS) + set -e ; for dir in $(CPPCHECKDIRS) ; do \ + $(MAKE) -C $$dir cppcheck ; \ + done diff --git a/cmd/arc_summary/.gitignore b/cmd/arc_summary/.gitignore new file mode 100644 index 0000000000..50ba15f034 --- /dev/null +++ b/cmd/arc_summary/.gitignore @@ -0,0 +1 @@ +arc_summary diff --git a/cmd/arc_summary/Makefile.am b/cmd/arc_summary/Makefile.am index a83edffadc..1a26c2c199 100644 --- a/cmd/arc_summary/Makefile.am +++ b/cmd/arc_summary/Makefile.am @@ -1,13 +1,13 @@ +bin_SCRIPTS = arc_summary + +CLEANFILES = arc_summary EXTRA_DIST = arc_summary2 arc_summary3 if USING_PYTHON_2 -dist_bin_SCRIPTS = arc_summary2 -install-exec-hook: - mv $(DESTDIR)$(bindir)/arc_summary2 $(DESTDIR)$(bindir)/arc_summary +SCRIPT = arc_summary2 +else +SCRIPT = arc_summary3 endif -if USING_PYTHON_3 -dist_bin_SCRIPTS = arc_summary3 -install-exec-hook: - mv $(DESTDIR)$(bindir)/arc_summary3 
$(DESTDIR)$(bindir)/arc_summary -endif +arc_summary: $(SCRIPT) + cp $< $@ diff --git a/cmd/arc_summary/arc_summary2 b/cmd/arc_summary/arc_summary2 index ab4a3c574a..3302a802d1 100755 --- a/cmd/arc_summary/arc_summary2 +++ b/cmd/arc_summary/arc_summary2 @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/env python2 # # $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $ # @@ -42,7 +42,7 @@ Provides basic information on the ARC, its efficiency, the L2ARC (if present), the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the in-source documentation and code at -https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. +https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. """ import getopt @@ -54,46 +54,64 @@ import errno from subprocess import Popen, PIPE from decimal import Decimal as D + +if sys.platform.startswith('freebsd'): + # Requires py27-sysctl on FreeBSD + import sysctl + + def is_value(ctl): + return ctl.type != sysctl.CTLTYPE_NODE + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + base = 'kstat.zfs.misc.%s.' 
% namespace + fmt = lambda kstat: (kstat.name, D(kstat.value)) + kstats = sysctl.filter(base) + return [fmt(kstat) for kstat in kstats if is_value(kstat)] + + def load_tunables(): + ctls = sysctl.filter('vfs.zfs') + return dict((ctl.name, ctl.value) for ctl in ctls if is_value(ctl)) + +elif sys.platform.startswith('linux'): + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + kstat = 'kstat.zfs.misc.%s.%%s' % namespace + path = '/proc/spl/kstat/zfs/%s' % namespace + with open(path) as f: + entries = [line.strip().split() for line in f][2:] # Skip header + return [(kstat % name, D(value)) for name, _, value in entries] + + def load_tunables(): + basepath = '/sys/module/zfs/parameters' + tunables = {} + for name in os.listdir(basepath): + if not name: + continue + path = '%s/%s' % (basepath, name) + with open(path) as f: + value = f.read() + tunables[name] = value.strip() + return tunables + + show_tunable_descriptions = False alternate_tunable_layout = False -def handle_Exception(ex_cls, ex, tb): - if ex is IOError: - if ex.errno == errno.EPIPE: - sys.exit() - - if ex is KeyboardInterrupt: - sys.exit() - - -sys.excepthook = handle_Exception - - def get_Kstat(): """Collect information on the ZFS subsystem from the /proc virtual file system. The name "kstat" is a holdover from the Solaris utility of the same name. 
""" - def load_proc_kstats(fn, namespace): - """Collect information on a specific subsystem of the ARC""" - - kstats = [line.strip() for line in open(fn)] - del kstats[0:2] - for kstat in kstats: - kstat = kstat.strip() - name, _, value = kstat.split() - Kstat[namespace + name] = D(value) - Kstat = {} - load_proc_kstats('/proc/spl/kstat/zfs/arcstats', - 'kstat.zfs.misc.arcstats.') - load_proc_kstats('/proc/spl/kstat/zfs/zfetchstats', - 'kstat.zfs.misc.zfetchstats.') - load_proc_kstats('/proc/spl/kstat/zfs/vdev_cache_stats', - 'kstat.zfs.misc.vdev_cache_stats.') - + Kstat.update(load_kstats('arcstats')) + Kstat.update(load_kstats('zfetchstats')) + Kstat.update(load_kstats('vdev_cache_stats')) return Kstat @@ -195,12 +213,30 @@ def get_arc_summary(Kstat): deleted = Kstat["kstat.zfs.misc.arcstats.deleted"] mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"] evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"] + evict_l2_cached = Kstat["kstat.zfs.misc.arcstats.evict_l2_cached"] + evict_l2_eligible = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible"] + evict_l2_eligible_mfu = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible_mfu"] + evict_l2_eligible_mru = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible_mru"] + evict_l2_ineligible = Kstat["kstat.zfs.misc.arcstats.evict_l2_ineligible"] + evict_l2_skip = Kstat["kstat.zfs.misc.arcstats.evict_l2_skip"] # ARC Misc. 
output["arc_misc"] = {} output["arc_misc"]["deleted"] = fHits(deleted) - output["arc_misc"]['mutex_miss'] = fHits(mutex_miss) - output["arc_misc"]['evict_skips'] = fHits(evict_skip) + output["arc_misc"]["mutex_miss"] = fHits(mutex_miss) + output["arc_misc"]["evict_skips"] = fHits(evict_skip) + output["arc_misc"]["evict_l2_skip"] = fHits(evict_l2_skip) + output["arc_misc"]["evict_l2_cached"] = fBytes(evict_l2_cached) + output["arc_misc"]["evict_l2_eligible"] = fBytes(evict_l2_eligible) + output["arc_misc"]["evict_l2_eligible_mfu"] = { + 'per': fPerc(evict_l2_eligible_mfu, evict_l2_eligible), + 'num': fBytes(evict_l2_eligible_mfu), + } + output["arc_misc"]["evict_l2_eligible_mru"] = { + 'per': fPerc(evict_l2_eligible_mru, evict_l2_eligible), + 'num': fBytes(evict_l2_eligible_mru), + } + output["arc_misc"]["evict_l2_ineligible"] = fBytes(evict_l2_ineligible) # ARC Sizing arc_size = Kstat["kstat.zfs.misc.arcstats.size"] @@ -316,8 +352,26 @@ def _arc_summary(Kstat): sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted']) sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" % arc['arc_misc']['mutex_miss']) - sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" % + sys.stdout.write("\tEviction Skips:\t\t\t\t%s\n" % arc['arc_misc']['evict_skips']) + sys.stdout.write("\tEviction Skips Due to L2 Writes:\t%s\n" % + arc['arc_misc']['evict_l2_skip']) + sys.stdout.write("\tL2 Cached Evictions:\t\t\t%s\n" % + arc['arc_misc']['evict_l2_cached']) + sys.stdout.write("\tL2 Eligible Evictions:\t\t\t%s\n" % + arc['arc_misc']['evict_l2_eligible']) + sys.stdout.write("\tL2 Eligible MFU Evictions:\t%s\t%s\n" % ( + arc['arc_misc']['evict_l2_eligible_mfu']['per'], + arc['arc_misc']['evict_l2_eligible_mfu']['num'], + ) + ) + sys.stdout.write("\tL2 Eligible MRU Evictions:\t%s\t%s\n" % ( + arc['arc_misc']['evict_l2_eligible_mru']['per'], + arc['arc_misc']['evict_l2_eligible_mru']['num'], + ) + ) + sys.stdout.write("\tL2 Ineligible Evictions:\t\t%s\n" % + 
arc['arc_misc']['evict_l2_ineligible']) sys.stdout.write("\n") # ARC Sizing @@ -653,6 +707,11 @@ def get_l2arc_summary(Kstat): l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"] l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"] l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"] + l2_mfu_asize = Kstat["kstat.zfs.misc.arcstats.l2_mfu_asize"] + l2_mru_asize = Kstat["kstat.zfs.misc.arcstats.l2_mru_asize"] + l2_prefetch_asize = Kstat["kstat.zfs.misc.arcstats.l2_prefetch_asize"] + l2_bufc_data_asize = Kstat["kstat.zfs.misc.arcstats.l2_bufc_data_asize"] + l2_bufc_metadata_asize = Kstat["kstat.zfs.misc.arcstats.l2_bufc_metadata_asize"] l2_access_total = (l2_hits + l2_misses) output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error) @@ -675,7 +734,7 @@ def get_l2arc_summary(Kstat): output["io_errors"] = fHits(l2_io_error) output["l2_arc_size"] = {} - output["l2_arc_size"]["adative"] = fBytes(l2_size) + output["l2_arc_size"]["adaptive"] = fBytes(l2_size) output["l2_arc_size"]["actual"] = { 'per': fPerc(l2_asize, l2_size), 'num': fBytes(l2_asize) @@ -684,6 +743,26 @@ def get_l2arc_summary(Kstat): 'per': fPerc(l2_hdr_size, l2_size), 'num': fBytes(l2_hdr_size), } + output["l2_arc_size"]["mfu_asize"] = { + 'per': fPerc(l2_mfu_asize, l2_asize), + 'num': fBytes(l2_mfu_asize), + } + output["l2_arc_size"]["mru_asize"] = { + 'per': fPerc(l2_mru_asize, l2_asize), + 'num': fBytes(l2_mru_asize), + } + output["l2_arc_size"]["prefetch_asize"] = { + 'per': fPerc(l2_prefetch_asize, l2_asize), + 'num': fBytes(l2_prefetch_asize), + } + output["l2_arc_size"]["bufc_data_asize"] = { + 'per': fPerc(l2_bufc_data_asize, l2_asize), + 'num': fBytes(l2_bufc_data_asize), + } + output["l2_arc_size"]["bufc_metadata_asize"] = { + 'per': fPerc(l2_bufc_metadata_asize, l2_asize), + 'num': fBytes(l2_bufc_metadata_asize), + } output["l2_arc_evicts"] = {} output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry) @@ -748,7 +827,7 @@ def 
_l2arc_summary(Kstat): sys.stdout.write("\n") sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" % - arc["l2_arc_size"]["adative"]) + arc["l2_arc_size"]["adaptive"]) sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % ( arc["l2_arc_size"]["actual"]["per"], arc["l2_arc_size"]["actual"]["num"], @@ -759,11 +838,36 @@ def _l2arc_summary(Kstat): arc["l2_arc_size"]["head_size"]["num"], ) ) + sys.stdout.write("\tMFU Alloc. Size:\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["mfu_asize"]["per"], + arc["l2_arc_size"]["mfu_asize"]["num"], + ) + ) + sys.stdout.write("\tMRU Alloc. Size:\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["mru_asize"]["per"], + arc["l2_arc_size"]["mru_asize"]["num"], + ) + ) + sys.stdout.write("\tPrefetch Alloc. Size:\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["prefetch_asize"]["per"], + arc["l2_arc_size"]["prefetch_asize"]["num"], + ) + ) + sys.stdout.write("\tData (buf content) Alloc. Size:\t%s\t%s\n" % ( + arc["l2_arc_size"]["bufc_data_asize"]["per"], + arc["l2_arc_size"]["bufc_data_asize"]["num"], + ) + ) + sys.stdout.write("\tMetadata (buf content) Size:\t%s\t%s\n" % ( + arc["l2_arc_size"]["bufc_metadata_asize"]["per"], + arc["l2_arc_size"]["bufc_metadata_asize"]["num"], + ) + ) sys.stdout.write("\n") if arc["l2_arc_evicts"]['lock_retries'] != '0' or \ arc["l2_arc_evicts"]["reading"] != '0': - sys.stdout.write("L2 ARC Evicts:\n") + sys.stdout.write("L2 ARC Evictions:\n") sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" % arc["l2_arc_evicts"]['lock_retries']) sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" % @@ -921,14 +1025,7 @@ def _tunable_summary(Kstat): global show_tunable_descriptions global alternate_tunable_layout - names = os.listdir("/sys/module/zfs/parameters/") - - values = {} - for name in names: - with open("/sys/module/zfs/parameters/" + name) as f: - value = f.read() - values[name] = value.strip() - + tunables = load_tunables() descriptions = {} if show_tunable_descriptions: @@ -966,22 +1063,17 @@ def _tunable_summary(Kstat): sys.stderr.write("Tunable 
descriptions will be disabled.\n") sys.stdout.write("ZFS Tunables:\n") - names.sort() if alternate_tunable_layout: fmt = "\t%s=%s\n" else: fmt = "\t%-50s%s\n" - for name in names: - - if not name: - continue - + for name in sorted(tunables.keys()): if show_tunable_descriptions and name in descriptions: sys.stdout.write("\t# %s\n" % descriptions[name]) - sys.stdout.write(fmt % (name, values[name])) + sys.stdout.write(fmt % (name, tunables[name])) unSub = [ @@ -1033,48 +1125,55 @@ def main(): global alternate_tunable_layout try: - opts, args = getopt.getopt( - sys.argv[1:], - "adp:h", ["alternate", "description", "page=", "help"] - ) - except getopt.error as e: - sys.stderr.write("Error: %s\n" % e.msg) - usage() - sys.exit(1) - - args = {} - for opt, arg in opts: - if opt in ('-a', '--alternate'): - args['a'] = True - if opt in ('-d', '--description'): - args['d'] = True - if opt in ('-p', '--page'): - args['p'] = arg - if opt in ('-h', '--help'): - usage() - sys.exit(0) - - Kstat = get_Kstat() - - alternate_tunable_layout = 'a' in args - show_tunable_descriptions = 'd' in args - - pages = [] - - if 'p' in args: try: - pages.append(unSub[int(args['p']) - 1]) - except IndexError: - sys.stderr.write('the argument to -p must be between 1 and ' + - str(len(unSub)) + '\n') + opts, args = getopt.getopt( + sys.argv[1:], + "adp:h", ["alternate", "description", "page=", "help"] + ) + except getopt.error as e: + sys.stderr.write("Error: %s\n" % e.msg) + usage() sys.exit(1) - else: - pages = unSub - zfs_header() - for page in pages: - page(Kstat) - sys.stdout.write("\n") + args = {} + for opt, arg in opts: + if opt in ('-a', '--alternate'): + args['a'] = True + if opt in ('-d', '--description'): + args['d'] = True + if opt in ('-p', '--page'): + args['p'] = arg + if opt in ('-h', '--help'): + usage() + sys.exit(0) + + Kstat = get_Kstat() + + alternate_tunable_layout = 'a' in args + show_tunable_descriptions = 'd' in args + + pages = [] + + if 'p' in args: + try: + 
pages.append(unSub[int(args['p']) - 1]) + except IndexError: + sys.stderr.write('the argument to -p must be between 1 and ' + + str(len(unSub)) + '\n') + sys.exit(1) + else: + pages = unSub + + zfs_header() + for page in pages: + page(Kstat) + sys.stdout.write("\n") + except IOError as ex: + if (ex.errno == errno.EPIPE): + sys.exit(0) + raise + except KeyboardInterrupt: + sys.exit(0) if __name__ == '__main__': diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3 index fc5e1e4b64..7b28012ede 100755 --- a/cmd/arc_summary/arc_summary3 +++ b/cmd/arc_summary/arc_summary3 @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # # Copyright (c) 2008 Ben Rockwood , # Copyright (c) 2010 Martin Matuska , @@ -32,7 +32,7 @@ Provides basic information on the ARC, its efficiency, the L2ARC (if present), the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the in-source documentation and code at -https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. +https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. The original introduction to arc_summary can be found at http://cuddletech.com/?p=454 """ @@ -42,13 +42,17 @@ import os import subprocess import sys import time +import errno -DECRIPTION = 'Print ARC and other statistics for ZFS on Linux' +# We can't use env -S portably, and we need python3 -u to handle pipes in +# the shell abruptly closing the way we want to, so... 
+import io +if isinstance(sys.__stderr__.buffer, io.BufferedWriter): + os.execv(sys.executable, [sys.executable, "-u"] + sys.argv) + +DESCRIPTION = 'Print ARC and other statistics for OpenZFS' INDENT = ' '*8 LINE_LENGTH = 72 -PROC_PATH = '/proc/spl/kstat/zfs/' -SPL_PATH = '/sys/module/spl/parameters/' -TUNABLES_PATH = '/sys/module/zfs/parameters/' DATE_FORMAT = '%a %b %d %H:%M:%S %Y' TITLE = 'ZFS Subsystem Report' @@ -61,11 +65,10 @@ SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats 'vdev': 'vdev_cache_stats', - 'xuio': 'xuio_stats', 'zfetch': 'zfetchstats', 'zil': 'zil'} -parser = argparse.ArgumentParser(description=DECRIPTION) +parser = argparse.ArgumentParser(description=DESCRIPTION) parser.add_argument('-a', '--alternate', action='store_true', default=False, help='use alternate formatting for tunables and SPL', dest='alt') @@ -83,6 +86,172 @@ parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP) ARGS = parser.parse_args() +if sys.platform.startswith('freebsd'): + # Requires py36-sysctl on FreeBSD + import sysctl + + VDEV_CACHE_SIZE = 'vdev.cache_size' + + def is_value(ctl): + return ctl.type != sysctl.CTLTYPE_NODE + + def namefmt(ctl, base='vfs.zfs.'): + # base is removed from the name + cut = len(base) + return ctl.name[cut:] + + def load_kstats(section): + base = 'kstat.zfs.misc.{section}.'.format(section=section) + fmt = lambda kstat: '{name} : {value}'.format(name=namefmt(kstat, base), + value=kstat.value) + kstats = sysctl.filter(base) + return [fmt(kstat) for kstat in kstats if is_value(kstat)] + + def get_params(base): + ctls = sysctl.filter(base) + return {namefmt(ctl): str(ctl.value) for ctl in ctls if is_value(ctl)} + + def get_tunable_params(): + return get_params('vfs.zfs') + + def get_vdev_params(): + return get_params('vfs.zfs.vdev') + + def get_version_impl(request): + # FreeBSD reports versions for zpl and spa instead of zfs and spl. 
+ name = {'zfs': 'zpl', + 'spl': 'spa'}[request] + mib = 'vfs.zfs.version.{}'.format(name) + version = sysctl.filter(mib)[0].value + return '{} version {}'.format(name, version) + + def get_descriptions(_request): + ctls = sysctl.filter('vfs.zfs') + return {namefmt(ctl): ctl.description for ctl in ctls if is_value(ctl)} + + +elif sys.platform.startswith('linux'): + KSTAT_PATH = '/proc/spl/kstat/zfs' + SPL_PATH = '/sys/module/spl/parameters' + TUNABLES_PATH = '/sys/module/zfs/parameters' + + VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' + + def load_kstats(section): + path = os.path.join(KSTAT_PATH, section) + with open(path) as f: + return list(f)[2:] # Get rid of header + + def get_params(basepath): + """Collect information on the Solaris Porting Layer (SPL) or the + tunables, depending on the basepath given. Does not check if basepath is + legal. + """ + result = {} + for name in os.listdir(basepath): + path = os.path.join(basepath, name) + with open(path) as f: + value = f.read() + result[name] = value.strip() + return result + + def get_spl_params(): + return get_params(SPL_PATH) + + def get_tunable_params(): + return get_params(TUNABLES_PATH) + + def get_vdev_params(): + return get_params(TUNABLES_PATH) + + def get_version_impl(request): + # The original arc_summary called /sbin/modinfo/{spl,zfs} to get + # the version information. We switch to /sys/module/{spl,zfs}/version + # to make sure we get what is really loaded in the kernel + try: + with open("/sys/module/{}/version".format(request)) as f: + return f.read().strip() + except: + return "(unknown)" + + def get_descriptions(request): + """Get the descriptions of the Solaris Porting Layer (SPL) or the + tunables, return with minimal formatting. 
+ """ + + if request not in ('spl', 'zfs'): + print('ERROR: description of "{0}" requested)'.format(request)) + sys.exit(1) + + descs = {} + target_prefix = 'parm:' + + # We would prefer to do this with /sys/modules -- see the discussion at + # get_version() -- but there isn't a way to get the descriptions from + # there, so we fall back on modinfo + command = ["/sbin/modinfo", request, "-0"] + + # The recommended way to do this is with subprocess.run(). However, + # some installed versions of Python are < 3.5, so we offer them + # the option of doing it the old way (for now) + info = '' + + try: + + if 'run' in dir(subprocess): + info = subprocess.run(command, stdout=subprocess.PIPE, + universal_newlines=True) + raw_output = info.stdout.split('\0') + else: + info = subprocess.check_output(command, + universal_newlines=True) + raw_output = info.split('\0') + + except subprocess.CalledProcessError: + print("Error: Descriptions not available", + "(can't access kernel module)") + sys.exit(1) + + for line in raw_output: + + if not line.startswith(target_prefix): + continue + + line = line[len(target_prefix):].strip() + name, raw_desc = line.split(':', 1) + desc = raw_desc.rsplit('(', 1)[0] + + if desc == '': + desc = '(No description found)' + + descs[name.strip()] = desc.strip() + + return descs + +def handle_unraisableException(exc_type, exc_value=None, exc_traceback=None, + err_msg=None, object=None): + handle_Exception(exc_type, object, exc_traceback) + +def handle_Exception(ex_cls, ex, tb): + if ex_cls is KeyboardInterrupt: + sys.exit() + + if ex_cls is BrokenPipeError: + # It turns out that while sys.exit() triggers an exception + # not handled message on Python 3.8+, os._exit() does not. 
+ os._exit(0) + + if ex_cls is OSError: + if ex.errno == errno.ENOTCONN: + sys.exit() + + raise ex + +if hasattr(sys,'unraisablehook'): # Python 3.8+ + sys.unraisablehook = handle_unraisableException +sys.excepthook = handle_Exception + + def cleanup_line(single_line): """Format a raw line of data from /proc and isolate the name value part, returning a tuple with each. Currently, this gets rid of the @@ -238,139 +407,48 @@ def format_raw_line(name, value): if ARGS.alt: result = '{0}{1}={2}'.format(INDENT, name, value) else: - spc = LINE_LENGTH-(len(INDENT)+len(value)) - result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc) + # Right-align the value within the line length if it fits, + # otherwise just separate it from the name by a single space. + fit = LINE_LENGTH - len(INDENT) - len(name) + overflow = len(value) + 1 + w = max(fit, overflow) + result = '{0}{1}{2:>{w}}'.format(INDENT, name, value, w=w) return result def get_kstats(): - """Collect information on the ZFS subsystem from the /proc Linux virtual - file system. The step does not perform any further processing, giving us - the option to only work on what is actually needed. The name "kstat" is a - holdover from the Solaris utility of the same name. + """Collect information on the ZFS subsystem. The step does not perform any + further processing, giving us the option to only work on what is actually + needed. The name "kstat" is a holdover from the Solaris utility of the same + name. """ result = {} - secs = SECTION_PATHS.values() - for section in secs: - - with open(PROC_PATH+section, 'r') as proc_location: - lines = [line for line in proc_location] - - del lines[0:2] # Get rid of header - result[section] = lines + for section in SECTION_PATHS.values(): + if section not in result: + result[section] = load_kstats(section) return result -def get_spl_tunables(PATH): - """Collect information on the Solaris Porting Layer (SPL) or the - tunables, depending on the PATH given. 
Does not check if PATH is - legal. - """ - - result = {} - parameters = os.listdir(PATH) - - for name in parameters: - - with open(PATH+name, 'r') as para_file: - value = para_file.read() - result[name] = value.strip() - - return result - - -def get_descriptions(request): - """Get the decriptions of the Solaris Porting Layer (SPL) or the - tunables, return with minimal formatting. - """ - - if request not in ('spl', 'zfs'): - print('ERROR: description of "{0}" requested)'.format(request)) - sys.exit(1) - - descs = {} - target_prefix = 'parm:' - - # We would prefer to do this with /sys/modules -- see the discussion at - # get_version() -- but there isn't a way to get the descriptions from - # there, so we fall back on modinfo - command = ["/sbin/modinfo", request, "-0"] - - # The recommended way to do this is with subprocess.run(). However, - # some installed versions of Python are < 3.5, so we offer them - # the option of doing it the old way (for now) - info = '' - - try: - - if 'run' in dir(subprocess): - info = subprocess.run(command, stdout=subprocess.PIPE, - universal_newlines=True) - raw_output = info.stdout.split('\0') - else: - info = subprocess.check_output(command, universal_newlines=True) - raw_output = info.split('\0') - - except subprocess.CalledProcessError: - print("Error: Descriptions not available (can't access kernel module)") - sys.exit(1) - - for line in raw_output: - - if not line.startswith(target_prefix): - continue - - line = line[len(target_prefix):].strip() - name, raw_desc = line.split(':', 1) - desc = raw_desc.rsplit('(', 1)[0] - - if desc == '': - desc = '(No description found)' - - descs[name.strip()] = desc.strip() - - return descs - - def get_version(request): """Get the version number of ZFS or SPL on this machine for header. Returns an error string, but does not raise an error, if we can't - get the ZFS/SPL version via modinfo. + get the ZFS/SPL version. 
""" if request not in ('spl', 'zfs'): error_msg = '(ERROR: "{0}" requested)'.format(request) return error_msg - # The original arc_summary called /sbin/modinfo/{spl,zfs} to get - # the version information. We switch to /sys/module/{spl,zfs}/version - # to make sure we get what is really loaded in the kernel - command = ["cat", "/sys/module/{0}/version".format(request)] - req = request.upper() - version = "(Can't get {0} version)".format(req) - - # The recommended way to do this is with subprocess.run(). However, - # some installed versions of Python are < 3.5, so we offer them - # the option of doing it the old way (for now) - info = '' - if 'run' in dir(subprocess): - info = subprocess.run(command, stdout=subprocess.PIPE, - universal_newlines=True) - version = info.stdout.strip() - else: - info = subprocess.check_output(command, universal_newlines=True) - version = info.strip() - - return version + return get_version_impl(request) def print_header(): """Print the initial heading with date and time as well as info on the - Linux and ZFS versions. This is not called for the graph. + kernel and ZFS versions. This is not called for the graph. 
""" # datetime is now recommended over time but we keep the exact formatting @@ -534,6 +612,20 @@ def section_arc(kstats_dict): prt_i1('Deleted:', f_hits(arc_stats['deleted'])) prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) + prt_i1('Eviction skips due to L2 writes:', + f_hits(arc_stats['evict_l2_skip'])) + prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached'])) + prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible'])) + prt_i2('L2 eligible MFU evictions:', + f_perc(arc_stats['evict_l2_eligible_mfu'], + arc_stats['evict_l2_eligible']), + f_bytes(arc_stats['evict_l2_eligible_mfu'])) + prt_i2('L2 eligible MRU evictions:', + f_perc(arc_stats['evict_l2_eligible_mru'], + arc_stats['evict_l2_eligible']), + f_bytes(arc_stats['evict_l2_eligible_mru'])) + prt_i1('L2 ineligible evictions:', + f_bytes(arc_stats['evict_l2_ineligible'])) print() @@ -672,15 +764,30 @@ def section_l2arc(kstats_dict): prt_i2('Header size:', f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), f_bytes(arc_stats['l2_hdr_size'])) + prt_i2('MFU allocated size:', + f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_mfu_asize'])) + prt_i2('MRU allocated size:', + f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_mru_asize'])) + prt_i2('Prefetch allocated size:', + f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_prefetch_asize'])) + prt_i2('Data (buffer content) allocated size:', + f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_bufc_data_asize'])) + prt_i2('Metadata (buffer content) allocated size:', + f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_bufc_metadata_asize'])) print() prt_1('L2ARC breakdown:', f_hits(l2_access_total)) prt_i2('Hit ratio:', f_perc(arc_stats['l2_hits'], l2_access_total), - 
f_bytes(arc_stats['l2_hits'])) + f_hits(arc_stats['l2_hits'])) prt_i2('Miss ratio:', f_perc(arc_stats['l2_misses'], l2_access_total), - f_bytes(arc_stats['l2_misses'])) + f_hits(arc_stats['l2_misses'])) prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) print() @@ -691,13 +798,13 @@ def section_l2arc(kstats_dict): prt_i2('Done ratio:', f_perc(arc_stats['l2_writes_done'], arc_stats['l2_writes_sent']), - f_bytes(arc_stats['l2_writes_done'])) + f_hits(arc_stats['l2_writes_done'])) prt_i2('Error ratio:', f_perc(arc_stats['l2_writes_error'], arc_stats['l2_writes_sent']), - f_bytes(arc_stats['l2_writes_error'])) + f_hits(arc_stats['l2_writes_error'])) else: - prt_i2('Writes sent:', '100 %', f_bytes(arc_stats['l2_writes_sent'])) + prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') @@ -708,10 +815,14 @@ def section_l2arc(kstats_dict): def section_spl(*_): """Print the SPL parameters, if requested with alternative format - and/or decriptions. This does not use kstats. + and/or descriptions. This does not use kstats. """ - spls = get_spl_tunables(SPL_PATH) + if sys.platform.startswith('freebsd'): + # No SPL support in FreeBSD + return + + spls = get_spl_params() keylist = sorted(spls.keys()) print('Solaris Porting Layer (SPL):') @@ -725,7 +836,7 @@ def section_spl(*_): try: print(INDENT+'#', descriptions[key]) except KeyError: - print(INDENT+'# (No decription found)') # paranoid + print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) @@ -734,10 +845,10 @@ def section_spl(*_): def section_tunables(*_): """Print the tunables, if requested with alternative format and/or - decriptions. This does not use kstasts. + descriptions. This does not use kstats. 
""" - tunables = get_spl_tunables(TUNABLES_PATH) + tunables = get_tunable_params() keylist = sorted(tunables.keys()) print('Tunables:') @@ -751,7 +862,7 @@ def section_tunables(*_): try: print(INDENT+'#', descriptions[key]) except KeyError: - print(INDENT+'# (No decription found)') # paranoid + print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) @@ -763,11 +874,11 @@ def section_vdev(kstats_dict): # Currently [Nov 2017] the VDEV cache is disabled, because it is actually # harmful. When this is the case, we just skip the whole entry. See - # https://github.com/zfsonlinux/zfs/blob/master/module/zfs/vdev_cache.c + # https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c # for details - tunables = get_spl_tunables(TUNABLES_PATH) + tunables = get_vdev_params() - if tunables['zfs_vdev_cache_size'] == '0': + if tunables[VDEV_CACHE_SIZE] == '0': print('VDEV cache disabled, skipping section\n') return @@ -789,7 +900,7 @@ def section_vdev(kstats_dict): def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. Some of the information - taken from https://github.com/zfsonlinux/zfs/blob/master/include/sys/zil.h + taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h """ zil_stats = isolate_section('zil', kstats_dict) diff --git a/cmd/arcstat/.gitignore b/cmd/arcstat/.gitignore new file mode 100644 index 0000000000..6d6cd1ab75 --- /dev/null +++ b/cmd/arcstat/.gitignore @@ -0,0 +1 @@ +arcstat diff --git a/cmd/arcstat/Makefile.am b/cmd/arcstat/Makefile.am index 462e9a6197..d1ba989a0c 100644 --- a/cmd/arcstat/Makefile.am +++ b/cmd/arcstat/Makefile.am @@ -1,13 +1,5 @@ -dist_bin_SCRIPTS = arcstat +include $(top_srcdir)/config/Substfiles.am -# -# The arcstat script is compatibile with both Python 2.6 and 3.4. -# As such the python 3 shebang can be replaced at install time when -# targeting a python 2 system. This allows us to maintain a single -# version of the source. 
-# -if USING_PYTHON_2 -install-exec-hook: - sed --in-place 's|^#!/usr/bin/python3|#!/usr/bin/python2|' \ - $(DESTDIR)$(bindir)/arcstat -endif +bin_SCRIPTS = arcstat + +SUBSTFILES += $(bin_SCRIPTS) diff --git a/cmd/arcstat/arcstat b/cmd/arcstat/arcstat.in similarity index 60% rename from cmd/arcstat/arcstat rename to cmd/arcstat/arcstat.in index 57a2d621f3..cd9a803a24 100755 --- a/cmd/arcstat/arcstat +++ b/cmd/arcstat/arcstat.in @@ -1,20 +1,25 @@ -#!/usr/bin/python3 +#!/usr/bin/env @PYTHON_SHEBANG@ # # Print out ZFS ARC Statistics exported via kstat(1) -# For a definition of fields, or usage, use arctstat.pl -v +# For a definition of fields, or usage, use arcstat -v # -# This script is a fork of the original arcstat.pl (0.1) by -# Neelakanth Nadgir, originally published on his Sun blog on +# This script was originally a fork of the original arcstat.pl (0.1) +# by Neelakanth Nadgir, originally published on his Sun blog on # 09/18/2007 # http://blogs.sun.com/realneel/entry/zfs_arc_statistics # -# This version aims to improve upon the original by adding features -# and fixing bugs as needed. This version is maintained by -# Mike Harsch and is hosted in a public open source repository: +# A new version aimed to improve upon the original by adding features +# and fixing bugs as needed. This version was maintained by Mike +# Harsch and was hosted in a public open source repository: # http://github.com/mharsch/arcstat # -# Comments, Questions, or Suggestions are always welcome. -# Contact the maintainer at ( mike at harschsystems dot com ) +# but has since moved to the illumos-gate repository. +# +# This Python port was written by John Hixson for FreeNAS, introduced +# in commit e2c29f: +# https://github.com/freenas/freenas +# +# and has been improved by many people since. 
# # CDDL HEADER START # @@ -51,16 +56,16 @@ import getopt import re import copy -from decimal import Decimal from signal import signal, SIGINT, SIGWINCH, SIG_DFL + cols = { # HDR: [Size, Scale, Description] "time": [8, -1, "Time"], "hits": [4, 1000, "ARC reads per second"], "miss": [4, 1000, "ARC misses per second"], "read": [4, 1000, "Total ARC accesses per second"], - "hit%": [4, 100, "ARC Hit percentage"], + "hit%": [4, 100, "ARC hit percentage"], "miss%": [5, 100, "ARC miss percentage"], "dhit": [4, 1000, "Demand hits per second"], "dmis": [4, 1000, "Demand misses per second"], @@ -75,13 +80,20 @@ cols = { "mread": [5, 1000, "Metadata accesses per second"], "mh%": [3, 100, "Metadata hit percentage"], "mm%": [3, 100, "Metadata miss percentage"], - "arcsz": [5, 1024, "ARC Size"], - "c": [4, 1024, "ARC Target Size"], - "mfu": [4, 1000, "MFU List hits per second"], - "mru": [4, 1000, "MRU List hits per second"], - "mfug": [4, 1000, "MFU Ghost List hits per second"], - "mrug": [4, 1000, "MRU Ghost List hits per second"], + "arcsz": [5, 1024, "ARC size"], + "size": [4, 1024, "ARC size"], + "c": [4, 1024, "ARC target size"], + "mfu": [4, 1000, "MFU list hits per second"], + "mru": [4, 1000, "MRU list hits per second"], + "mfug": [4, 1000, "MFU ghost list hits per second"], + "mrug": [4, 1000, "MRU ghost list hits per second"], "eskip": [5, 1000, "evict_skip per second"], + "el2skip": [7, 1000, "evict skip, due to l2 writes, per second"], + "el2cach": [7, 1024, "Size of L2 cached evictions per second"], + "el2el": [5, 1024, "Size of L2 eligible evictions per second"], + "el2mfu": [6, 1024, "Size of L2 eligible MFU evictions per second"], + "el2mru": [6, 1024, "Size of L2 eligible MRU evictions per second"], + "el2inel": [7, 1024, "Size of L2 ineligible evictions per second"], "mtxmis": [6, 1000, "mutex_miss per second"], "dread": [5, 1000, "Demand accesses per second"], "pread": [5, 1000, "Prefetch accesses per second"], @@ -90,17 +102,29 @@ cols = { "l2read": [6, 
1000, "Total L2ARC accesses per second"], "l2hit%": [6, 100, "L2ARC access hit percentage"], "l2miss%": [7, 100, "L2ARC access miss percentage"], + "l2pref": [6, 1024, "L2ARC prefetch allocated size"], + "l2mfu": [5, 1024, "L2ARC MFU allocated size"], + "l2mru": [5, 1024, "L2ARC MRU allocated size"], + "l2data": [6, 1024, "L2ARC data allocated size"], + "l2meta": [6, 1024, "L2ARC metadata allocated size"], + "l2pref%": [7, 100, "L2ARC prefetch percentage"], + "l2mfu%": [6, 100, "L2ARC MFU percentage"], + "l2mru%": [6, 100, "L2ARC MRU percentage"], + "l2data%": [7, 100, "L2ARC data percentage"], + "l2meta%": [7, 100, "L2ARC metadata percentage"], "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], "l2size": [6, 1024, "Size of the L2ARC"], - "l2bytes": [7, 1024, "bytes read per second from the L2ARC"], - "grow": [4, 1000, "ARC Grow disabled"], - "need": [4, 1024, "ARC Reclaim need"], - "free": [4, 1024, "ARC Free memory"], + "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"], + "grow": [4, 1000, "ARC grow disabled"], + "need": [4, 1024, "ARC reclaim need"], + "free": [4, 1024, "ARC free memory"], + "avail": [5, 1024, "ARC available memory"], + "waste": [5, 1024, "Wasted memory due to round up to pagesize"], } v = {} hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis", - "mm%", "arcsz", "c"] + "mm%", "size", "c", "avail"] xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread", "pread", "read"] sint = 1 # Default interval is 1 second @@ -110,12 +134,56 @@ opfile = None sep = " " # Default separator is 2 spaces version = "0.4" l2exist = False -cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " +cmd = ("Usage: arcstat [-havxp] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} out = None kstat = None +pretty_print = True + + +if sys.platform.startswith('freebsd'): + # Requires py-sysctl on FreeBSD + import sysctl + + def kstat_update(): + global kstat + + k = 
[ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats') + if ctl.type != sysctl.CTLTYPE_NODE] + + if not k: + sys.exit(1) + + kstat = {} + + for s in k: + if not s: + continue + + name, value = s.name, s.value + # Trims 'kstat.zfs.misc.arcstats' from the name + kstat[name[24:]] = int(value) + +elif sys.platform.startswith('linux'): + def kstat_update(): + global kstat + + k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] + + if not k: + sys.exit(1) + + del k[0:2] + kstat = {} + + for s in k: + if not s: + continue + + name, unused, value = s.split() + kstat[name] = int(value) def detailed_usage(): @@ -131,6 +199,7 @@ def detailed_usage(): def usage(): sys.stderr.write("%s\n" % cmd) sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -a : Print all possible stats\n") sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") @@ -138,6 +207,7 @@ def usage(): sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " "character or string\n") + sys.stderr.write("\t -p : Disable auto-scaling of numerical fields\n") sys.stderr.write("\nExamples:\n") sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") @@ -148,25 +218,6 @@ def usage(): sys.exit(1) -def kstat_update(): - global kstat - - k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] - - if not k: - sys.exit(1) - - del k[0:2] - kstat = {} - - for s in k: - if not s: - continue - - name, unused, value = s.split() - kstat[name] = Decimal(value) - - def snap_stats(): global cur global kstat @@ -197,7 +248,7 @@ def prettynum(sz, scale, num=0): elif 0 < num < 1: num = 0 - while num > scale and index < 5: + while abs(num) > scale and index < 5: save = num num = num / scale index += 1 @@ -205,7 +256,7 @@ def prettynum(sz, scale, num=0): if index == 0: 
return "%*d" % (sz, num) - if (save / scale) < 10: + if abs(save / scale) < 10: return "%*.1f%s" % (sz - 1, num, suffix[index]) else: return "%*d%s" % (sz - 1, num, suffix[index]) @@ -215,12 +266,14 @@ def print_values(): global hdr global sep global v + global pretty_print - for col in hdr: - sys.stdout.write("%s%s" % ( - prettynum(cols[col][0], cols[col][1], v[col]), - sep - )) + if pretty_print: + fmt = lambda col: prettynum(cols[col][0], cols[col][1], v[col]) + else: + fmt = lambda col: v[col] + + sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") sys.stdout.flush() @@ -228,9 +281,14 @@ def print_values(): def print_header(): global hdr global sep + global pretty_print - for col in hdr: - sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) + if pretty_print: + fmt = lambda col: "%*s" % (cols[col][0], col) + else: + fmt = lambda col: col + + sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") @@ -267,8 +325,10 @@ def init(): global sep global out global l2exist + global pretty_print desired_cols = None + aflag = False xflag = False hflag = False vflag = False @@ -277,14 +337,16 @@ def init(): try: opts, args = getopt.getopt( sys.argv[1:], - "xo:hvs:f:", + "axo:hvs:f:p", [ + "all", "extended", "outfile", "help", "verbose", "separator", - "columns" + "columns", + "parsable" ] ) except getopt.error as msg: @@ -293,6 +355,8 @@ def init(): opts = None for opt, arg in opts: + if opt in ('-a', '--all'): + aflag = True if opt in ('-x', '--extended'): xflag = True if opt in ('-o', '--outfile'): @@ -308,19 +372,13 @@ def init(): if opt in ('-f', '--columns'): desired_cols = arg i += 1 + if opt in ('-p', '--parsable'): + pretty_print = False i += 1 argv = sys.argv[i:] - sint = Decimal(argv[0]) if argv else sint - count = int(argv[1]) if len(argv) > 1 else count - - if len(argv) > 1: - sint = Decimal(argv[0]) - count = int(argv[1]) - - elif len(argv) > 0: - sint = Decimal(argv[0]) - count = 0 + sint = int(argv[0]) if argv else 
sint + count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) if hflag or (xflag and desired_cols): usage() @@ -360,6 +418,12 @@ def init(): incompat) usage() + if aflag: + if l2exist: + hdr = cols.keys() + else: + hdr = [col for col in cols.keys() if not col.startswith("l2")] + if opfile: try: out = open(opfile, "w") @@ -377,59 +441,79 @@ def calculate(): v = dict() v["time"] = time.strftime("%H:%M:%S", time.localtime()) - v["hits"] = d["hits"] / sint - v["miss"] = d["misses"] / sint + v["hits"] = d["hits"] // sint + v["miss"] = d["misses"] // sint v["read"] = v["hits"] + v["miss"] - v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 + v["hit%"] = 100 * v["hits"] // v["read"] if v["read"] > 0 else 0 v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0 - v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint - v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint + v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) // sint + v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) // sint v["dread"] = v["dhit"] + v["dmis"] - v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 + v["dh%"] = 100 * v["dhit"] // v["dread"] if v["dread"] > 0 else 0 v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0 - v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint + v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) // sint v["pmis"] = (d["prefetch_data_misses"] + - d["prefetch_metadata_misses"]) / sint + d["prefetch_metadata_misses"]) // sint v["pread"] = v["phit"] + v["pmis"] - v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 + v["ph%"] = 100 * v["phit"] // v["pread"] if v["pread"] > 0 else 0 v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0 v["mhit"] = (d["prefetch_metadata_hits"] + - d["demand_metadata_hits"]) / sint + d["demand_metadata_hits"]) // sint v["mmis"] = (d["prefetch_metadata_misses"] + - 
d["demand_metadata_misses"]) / sint + d["demand_metadata_misses"]) // sint v["mread"] = v["mhit"] + v["mmis"] - v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 + v["mh%"] = 100 * v["mhit"] // v["mread"] if v["mread"] > 0 else 0 v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0 v["arcsz"] = cur["size"] + v["size"] = cur["size"] v["c"] = cur["c"] - v["mfu"] = d["mfu_hits"] / sint - v["mru"] = d["mru_hits"] / sint - v["mrug"] = d["mru_ghost_hits"] / sint - v["mfug"] = d["mfu_ghost_hits"] / sint - v["eskip"] = d["evict_skip"] / sint - v["mtxmis"] = d["mutex_miss"] / sint + v["mfu"] = d["mfu_hits"] // sint + v["mru"] = d["mru_hits"] // sint + v["mrug"] = d["mru_ghost_hits"] // sint + v["mfug"] = d["mfu_ghost_hits"] // sint + v["eskip"] = d["evict_skip"] // sint + v["el2skip"] = d["evict_l2_skip"] // sint + v["el2cach"] = d["evict_l2_cached"] // sint + v["el2el"] = d["evict_l2_eligible"] // sint + v["el2mfu"] = d["evict_l2_eligible_mfu"] // sint + v["el2mru"] = d["evict_l2_eligible_mru"] // sint + v["el2inel"] = d["evict_l2_ineligible"] // sint + v["mtxmis"] = d["mutex_miss"] // sint if l2exist: - v["l2hits"] = d["l2_hits"] / sint - v["l2miss"] = d["l2_misses"] / sint + v["l2hits"] = d["l2_hits"] // sint + v["l2miss"] = d["l2_misses"] // sint v["l2read"] = v["l2hits"] + v["l2miss"] - v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0 + v["l2hit%"] = 100 * v["l2hits"] // v["l2read"] if v["l2read"] > 0 else 0 v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 v["l2asize"] = cur["l2_asize"] v["l2size"] = cur["l2_size"] - v["l2bytes"] = d["l2_read_bytes"] / sint + v["l2bytes"] = d["l2_read_bytes"] // sint + + v["l2pref"] = cur["l2_prefetch_asize"] + v["l2mfu"] = cur["l2_mfu_asize"] + v["l2mru"] = cur["l2_mru_asize"] + v["l2data"] = cur["l2_bufc_data_asize"] + v["l2meta"] = cur["l2_bufc_metadata_asize"] + v["l2pref%"] = 100 * v["l2pref"] // v["l2asize"] + v["l2mfu%"] = 100 * v["l2mfu"] // v["l2asize"] + v["l2mru%"] = 100 * 
v["l2mru"] // v["l2asize"] + v["l2data%"] = 100 * v["l2data"] // v["l2asize"] + v["l2meta%"] = 100 * v["l2meta"] // v["l2asize"] v["grow"] = 0 if cur["arc_no_grow"] else 1 v["need"] = cur["arc_need_free"] - v["free"] = cur["arc_sys_free"] + v["free"] = cur["memory_free_bytes"] + v["avail"] = cur["memory_available_bytes"] + v["waste"] = cur["abd_chunk_waste_size"] def main(): diff --git a/cmd/dbufstat/.gitignore b/cmd/dbufstat/.gitignore new file mode 100644 index 0000000000..2c2e913cef --- /dev/null +++ b/cmd/dbufstat/.gitignore @@ -0,0 +1 @@ +dbufstat diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am index 968a760779..e672a01a42 100644 --- a/cmd/dbufstat/Makefile.am +++ b/cmd/dbufstat/Makefile.am @@ -1,13 +1,5 @@ -dist_bin_SCRIPTS = dbufstat +include $(top_srcdir)/config/Substfiles.am -# -# The dbufstat script is compatibile with both Python 2.6 and 3.4. -# As such the python 3 shebang can be replaced at install time when -# targeting a python 2 system. This allows us to maintain a single -# version of the source. -# -if USING_PYTHON_2 -install-exec-hook: - sed --in-place 's|^#!/usr/bin/python3|#!/usr/bin/python2|' \ - $(DESTDIR)$(bindir)/dbufstat -endif +bin_SCRIPTS = dbufstat + +SUBSTFILES += $(bin_SCRIPTS) diff --git a/cmd/dbufstat/dbufstat b/cmd/dbufstat/dbufstat.in similarity index 97% rename from cmd/dbufstat/dbufstat rename to cmd/dbufstat/dbufstat.in index e6c947fbcb..82250353f5 100755 --- a/cmd/dbufstat/dbufstat +++ b/cmd/dbufstat/dbufstat.in @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env @PYTHON_SHEBANG@ # # Print out statistics for all cached dmu buffers. 
This information # is available through the dbufs kstat and may be post-processed as @@ -113,10 +113,25 @@ cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] " raw = 0 +if sys.platform.startswith("freebsd"): + import io + # Requires py-sysctl on FreeBSD + import sysctl + + def default_ifile(): + dbufs = sysctl.filter("kstat.zfs.misc.dbufs")[0].value + sys.stdin = io.StringIO(dbufs) + return "-" + +elif sys.platform.startswith("linux"): + def default_ifile(): + return "/proc/spl/kstat/zfs/dbufs" + + def print_incompat_helper(incompat): cnt = 0 for key in sorted(incompat): - if cnt is 0: + if cnt == 0: sys.stderr.write("\t") elif cnt > 8: sys.stderr.write(",\n\t") @@ -343,7 +358,7 @@ def get_compstring(c): "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", - "ZIO_COMPRESS_FUNCTION"] + "ZIO_COMPRESS_ZSTD", "ZIO_COMPRESS_FUNCTION"] # If "-rr" option is used, don't convert to string representation if raw > 1: @@ -645,9 +660,9 @@ def main(): sys.exit(1) if not ifile: - ifile = '/proc/spl/kstat/zfs/dbufs' + ifile = default_ifile() - if ifile is not "-": + if ifile != "-": try: tmp = open(ifile, "r") sys.stdin = tmp diff --git a/cmd/fsck_zfs/.gitignore b/cmd/fsck_zfs/.gitignore new file mode 100644 index 0000000000..0edf0309e9 --- /dev/null +++ b/cmd/fsck_zfs/.gitignore @@ -0,0 +1 @@ +/fsck.zfs diff --git a/cmd/fsck_zfs/Makefile.am b/cmd/fsck_zfs/Makefile.am index 2380f56fa4..f8139f117f 100644 --- a/cmd/fsck_zfs/Makefile.am +++ b/cmd/fsck_zfs/Makefile.am @@ -1 +1,6 @@ +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + dist_sbin_SCRIPTS = fsck.zfs + +SUBSTFILES += $(dist_sbin_SCRIPTS) diff --git a/cmd/fsck_zfs/fsck.zfs b/cmd/fsck_zfs/fsck.zfs deleted file mode 100755 index f1685db652..0000000000 --- a/cmd/fsck_zfs/fsck.zfs +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/sh -# -# fsck.zfs: A fsck helper to accomidate distributions that expect -# to 
be able to execute a fsck on all filesystem types. Currently -# this script does nothing but it could be extended to act as a -# compatibility wrapper for 'zpool scrub'. -# - -exit 0 diff --git a/cmd/fsck_zfs/fsck.zfs.in b/cmd/fsck_zfs/fsck.zfs.in new file mode 100755 index 0000000000..37096902cb --- /dev/null +++ b/cmd/fsck_zfs/fsck.zfs.in @@ -0,0 +1,44 @@ +#!/bin/sh +# +# fsck.zfs: A fsck helper to accommodate distributions that expect +# to be able to execute a fsck on all filesystem types. +# +# This script simply bubbles up some already-known-about errors, +# see fsck.zfs(8) +# + +if [ "$#" = "0" ]; then + echo "Usage: $0 [options] dataset…" >&2 + exit 16 +fi + +ret=0 +for dataset in "$@"; do + case "$dataset" in + -*) + continue + ;; + *) + ;; + esac + + pool="${dataset%%/*}" + + case "$(@sbindir@/zpool list -Ho health "$pool")" in + DEGRADED) + ret=$(( ret | 4 )) + ;; + FAULTED) + awk '!/^([[:space:]]*#.*)?$/ && $1 == "'"$dataset"'" && $3 == "zfs" {exit 1}' /etc/fstab || \ + ret=$(( ret | 8 )) + ;; + "") + # Pool not found, error printed by zpool(8) + ret=$(( ret | 8 )) + ;; + *) + ;; + esac +done + +exit "$ret" diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am index 7adedd63b6..3957602d27 100644 --- a/cmd/mount_zfs/Makefile.am +++ b/cmd/mount_zfs/Makefile.am @@ -1,9 +1,5 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - # # Ignore the prefix for the mount helper. It must be installed in /sbin/ # because this path is hardcoded in the mount(8) for security reasons. 
@@ -17,5 +13,10 @@ mount_zfs_SOURCES = \ mount_zfs.c mount_zfs_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs/libzfs.la + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +mount_zfs_LDADD += $(LTLIBINTL) + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/mount_zfs/mount_zfs.c b/cmd/mount_zfs/mount_zfs.c index a9b1e166b4..434d53cbad 100644 --- a/cmd/mount_zfs/mount_zfs.c +++ b/cmd/mount_zfs/mount_zfs.c @@ -42,247 +42,46 @@ libzfs_handle_t *g_zfs; -typedef struct option_map { - const char *name; - unsigned long mntmask; - unsigned long zfsmask; -} option_map_t; - -static const option_map_t option_map[] = { - /* Canonicalized filesystem independent options from mount(8) */ - { MNTOPT_NOAUTO, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_DEFAULTS, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_NODEVICES, MS_NODEV, ZS_COMMENT }, - { MNTOPT_DIRSYNC, MS_DIRSYNC, ZS_COMMENT }, - { MNTOPT_NOEXEC, MS_NOEXEC, ZS_COMMENT }, - { MNTOPT_GROUP, MS_GROUP, ZS_COMMENT }, - { MNTOPT_NETDEV, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_NOFAIL, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_NOSUID, MS_NOSUID, ZS_COMMENT }, - { MNTOPT_OWNER, MS_OWNER, ZS_COMMENT }, - { MNTOPT_REMOUNT, MS_REMOUNT, ZS_COMMENT }, - { MNTOPT_RO, MS_RDONLY, ZS_COMMENT }, - { MNTOPT_RW, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_SYNC, MS_SYNCHRONOUS, ZS_COMMENT }, - { MNTOPT_USER, MS_USERS, ZS_COMMENT }, - { MNTOPT_USERS, MS_USERS, ZS_COMMENT }, - /* acl flags passed with util-linux-2.24 mount command */ - { MNTOPT_ACL, MS_POSIXACL, ZS_COMMENT }, - { MNTOPT_NOACL, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_POSIXACL, MS_POSIXACL, ZS_COMMENT }, -#ifdef MS_NOATIME - { MNTOPT_NOATIME, MS_NOATIME, ZS_COMMENT }, -#endif -#ifdef MS_NODIRATIME - { MNTOPT_NODIRATIME, MS_NODIRATIME, ZS_COMMENT }, -#endif -#ifdef MS_RELATIME - { MNTOPT_RELATIME, MS_RELATIME, ZS_COMMENT }, -#endif -#ifdef MS_STRICTATIME - { 
MNTOPT_STRICTATIME, MS_STRICTATIME, ZS_COMMENT }, -#endif -#ifdef MS_LAZYTIME - { MNTOPT_LAZYTIME, MS_LAZYTIME, ZS_COMMENT }, -#endif - { MNTOPT_CONTEXT, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_FSCONTEXT, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_COMMENT }, -#ifdef MS_I_VERSION - { MNTOPT_IVERSION, MS_I_VERSION, ZS_COMMENT }, -#endif -#ifdef MS_MANDLOCK - { MNTOPT_NBMAND, MS_MANDLOCK, ZS_COMMENT }, -#endif - /* Valid options not found in mount(8) */ - { MNTOPT_BIND, MS_BIND, ZS_COMMENT }, -#ifdef MS_REC - { MNTOPT_RBIND, MS_BIND|MS_REC, ZS_COMMENT }, -#endif - { MNTOPT_COMMENT, MS_COMMENT, ZS_COMMENT }, -#ifdef MS_NOSUB - { MNTOPT_NOSUB, MS_NOSUB, ZS_COMMENT }, -#endif -#ifdef MS_SILENT - { MNTOPT_QUIET, MS_SILENT, ZS_COMMENT }, -#endif - /* Custom zfs options */ - { MNTOPT_XATTR, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_NOXATTR, MS_COMMENT, ZS_COMMENT }, - { MNTOPT_ZFSUTIL, MS_COMMENT, ZS_ZFSUTIL }, - { NULL, 0, 0 } }; - /* - * Break the mount option in to a name/value pair. The name is - * validated against the option map and mount flags set accordingly. + * Opportunistically convert a target string into a pool name. If the + * string does not represent a block device with a valid zfs label + * then it is passed through without modification. 
*/ -static int -parse_option(char *mntopt, unsigned long *mntflags, - unsigned long *zfsflags, int sloppy) +static void +parse_dataset(const char *target, char **dataset) { - const option_map_t *opt; - char *ptr, *name, *value = NULL; - int error = 0; - - name = strdup(mntopt); - if (name == NULL) - return (ENOMEM); - - for (ptr = name; ptr && *ptr; ptr++) { - if (*ptr == '=') { - *ptr = '\0'; - value = ptr+1; - VERIFY3P(value, !=, NULL); - break; - } - } - - for (opt = option_map; opt->name != NULL; opt++) { - if (strncmp(name, opt->name, strlen(name)) == 0) { - *mntflags |= opt->mntmask; - *zfsflags |= opt->zfsmask; - error = 0; - goto out; - } - } - - if (!sloppy) - error = ENOENT; -out: - /* If required further process on the value may be done here */ - free(name); - return (error); -} - -/* - * Translate the mount option string in to MS_* mount flags for the - * kernel vfs. When sloppy is non-zero unknown options will be ignored - * otherwise they are considered fatal are copied in to badopt. - */ -static int -parse_options(char *mntopts, unsigned long *mntflags, unsigned long *zfsflags, - int sloppy, char *badopt, char *mtabopt) -{ - int error = 0, quote = 0, flag = 0, count = 0; - char *ptr, *opt, *opts; - - opts = strdup(mntopts); - if (opts == NULL) - return (ENOMEM); - - *mntflags = 0; - opt = NULL; - /* - * Scan through all mount options which must be comma delimited. - * We must be careful to notice regions which are double quoted - * and skip commas in these regions. Each option is then checked - * to determine if it is a known option. + * Prior to util-linux 2.36.2, if a file or directory in the + * current working directory was named 'dataset' then mount(8) + * would prepend the current working directory to the dataset. + * Check for it and strip the prepended path when it is added. 
*/ - for (ptr = opts; ptr && !flag; ptr++) { - if (opt == NULL) - opt = ptr; - - if (*ptr == '"') - quote = !quote; - - if (quote) - continue; - - if (*ptr == '\0') - flag = 1; - - if ((*ptr == ',') || (*ptr == '\0')) { - *ptr = '\0'; - - error = parse_option(opt, mntflags, zfsflags, sloppy); - if (error) { - strcpy(badopt, opt); - goto out; - - } - - if (!(*mntflags & MS_REMOUNT) && - !(*zfsflags & ZS_ZFSUTIL)) { - if (count > 0) - strlcat(mtabopt, ",", MNT_LINE_MAX); - - strlcat(mtabopt, opt, MNT_LINE_MAX); - count++; - } - - opt = NULL; - } - } - -out: - free(opts); - return (error); -} - -/* - * Return the pool/dataset to mount given the name passed to mount. This - * is expected to be of the form pool/dataset, however may also refer to - * a block device if that device contains a valid zfs label. - */ -static char * -parse_dataset(char *dataset) -{ char cwd[PATH_MAX]; - struct stat64 statbuf; - int error; - int len; - - /* - * We expect a pool/dataset to be provided, however if we're - * given a device which is a member of a zpool we attempt to - * extract the pool name stored in the label. Given the pool - * name we can mount the root dataset. - */ - error = stat64(dataset, &statbuf); - if (error == 0) { - nvlist_t *config; - char *name; - int fd; - - fd = open(dataset, O_RDONLY); - if (fd < 0) - goto out; - - error = zpool_read_label(fd, &config, NULL); - (void) close(fd); - if (error) - goto out; - - error = nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, &name); - if (error) { - nvlist_free(config); - } else { - dataset = strdup(name); - nvlist_free(config); - return (dataset); - } + if (getcwd(cwd, PATH_MAX) == NULL) { + perror("getcwd"); + return; } -out: - /* - * If a file or directory in your current working directory is - * named 'dataset' then mount(8) will prepend your current working - * directory to the dataset. There is no way to prevent this - * behavior so we simply check for it and strip the prepended - * patch when it is added. 
- */ - if (getcwd(cwd, PATH_MAX) == NULL) - return (dataset); + int len = strlen(cwd); + if (strncmp(cwd, target, len) == 0) + target += len; - len = strlen(cwd); + /* Assume pool/dataset is more likely */ + strlcpy(*dataset, target, PATH_MAX); - /* Do not add one when cwd already ends in a trailing '/' */ - if (strncmp(cwd, dataset, len) == 0) - return (dataset + len + (cwd[len-1] != '/')); + int fd = open(target, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return; - return (dataset); + nvlist_t *cfg = NULL; + if (zpool_read_label(fd, &cfg, NULL) == 0) { + char *nm = NULL; + if (!nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &nm)) + strlcpy(*dataset, nm, PATH_MAX); + nvlist_free(cfg); + } + + if (close(fd)) + perror("close"); } /* @@ -326,8 +125,8 @@ mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) if (!fp) { (void) fprintf(stderr, gettext( "filesystem '%s' was mounted, but /etc/mtab " - "could not be opened due to error %d\n"), - dataset, errno); + "could not be opened due to error: %s\n"), + dataset, strerror(errno)); return (MOUNT_FILEIO); } @@ -335,8 +134,8 @@ mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) if (error) { (void) fprintf(stderr, gettext( "filesystem '%s' was mounted, but /etc/mtab " - "could not be updated due to error %d\n"), - dataset, errno); + "could not be updated due to error: %s\n"), + dataset, strerror(errno)); return (MOUNT_FILEIO); } @@ -345,34 +144,6 @@ mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) return (MOUNT_SUCCESS); } -static void -append_mntopt(const char *name, const char *val, char *mntopts, - char *mtabopt, boolean_t quote) -{ - char tmp[MNT_LINE_MAX]; - - snprintf(tmp, MNT_LINE_MAX, quote ? 
",%s=\"%s\"" : ",%s=%s", name, val); - - if (mntopts) - strlcat(mntopts, tmp, MNT_LINE_MAX); - - if (mtabopt) - strlcat(mtabopt, tmp, MNT_LINE_MAX); -} - -static void -zfs_selinux_setcontext(zfs_handle_t *zhp, zfs_prop_t zpt, const char *name, - char *mntopts, char *mtabopt) -{ - char context[ZFS_MAXPROPLEN]; - - if (zfs_prop_get(zhp, zpt, context, sizeof (context), - NULL, NULL, 0, B_FALSE) == 0) { - if (strcmp(context, "none") != 0) - append_mntopt(name, context, mntopts, mtabopt, B_TRUE); - } -} - int main(int argc, char **argv) { @@ -383,12 +154,13 @@ main(int argc, char **argv) char badopt[MNT_LINE_MAX] = { '\0' }; char mtabopt[MNT_LINE_MAX] = { '\0' }; char mntpoint[PATH_MAX]; - char *dataset; + char dataset[PATH_MAX], *pdataset = dataset; unsigned long mntflags = 0, zfsflags = 0, remount = 0; int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0; int error, c; (void) setlocale(LC_ALL, ""); + (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); opterr = 0; @@ -413,10 +185,11 @@ main(int argc, char **argv) break; case 'h': case '?': - (void) fprintf(stderr, gettext("Invalid option '%c'\n"), - optopt); + if (optopt) + (void) fprintf(stderr, + gettext("Invalid option '%c'\n"), optopt); (void) fprintf(stderr, gettext("Usage: mount.zfs " - "[-sfnv] [-o options] \n")); + "[-sfnvh] [-o options] \n")); return (MOUNT_USAGE); } } @@ -438,18 +211,18 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - dataset = parse_dataset(argv[0]); + parse_dataset(argv[0], &pdataset); /* canonicalize the mount point */ if (realpath(argv[1], mntpoint) == NULL) { (void) fprintf(stderr, gettext("filesystem '%s' cannot be " - "mounted at '%s' due to canonicalization error %d.\n"), - dataset, argv[1], errno); + "mounted at '%s' due to canonicalization error: %s\n"), + dataset, argv[1], strerror(errno)); return (MOUNT_SYSERR); } /* validate mount options and set mntflags */ - error = parse_options(mntopts, &mntflags, &zfsflags, sloppy, + error = 
zfs_parse_mount_options(mntopts, &mntflags, &zfsflags, sloppy, badopt, mtabopt); if (error) { switch (error) { @@ -489,7 +262,7 @@ main(int argc, char **argv) zfsutil = 1; if ((g_zfs = libzfs_init()) == NULL) { - (void) fprintf(stderr, "%s", libzfs_error_init(errno)); + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (MOUNT_SYSERR); } @@ -502,32 +275,7 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - /* - * Checks to see if the ZFS_PROP_SELINUX_CONTEXT exists - * if it does, create a tmp variable in case it's needed - * checks to see if the selinux context is set to the default - * if it is, allow the setting of the other context properties - * this is needed because the 'context' property overrides others - * if it is not the default, set the 'context' property - */ - if (zfs_prop_get(zhp, ZFS_PROP_SELINUX_CONTEXT, prop, sizeof (prop), - NULL, NULL, 0, B_FALSE) == 0) { - if (strcmp(prop, "none") == 0) { - zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_FSCONTEXT, - MNTOPT_FSCONTEXT, mntopts, mtabopt); - zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_DEFCONTEXT, - MNTOPT_DEFCONTEXT, mntopts, mtabopt); - zfs_selinux_setcontext(zhp, - ZFS_PROP_SELINUX_ROOTCONTEXT, MNTOPT_ROOTCONTEXT, - mntopts, mtabopt); - } else { - append_mntopt(MNTOPT_CONTEXT, prop, - mntopts, mtabopt, B_TRUE); - } - } - - /* A hint used to determine an auto-mounted snapshot mount point */ - append_mntopt(MNTOPT_MNTPOINT, mntpoint, mntopts, NULL, B_FALSE); + zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); /* treat all snapshots as legacy mount points */ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) @@ -620,8 +368,8 @@ main(int argc, char **argv) "mount the filesystem again.\n"), dataset); return (MOUNT_SYSERR); } - /* fallthru */ #endif + fallthrough; default: (void) fprintf(stderr, gettext("filesystem " "'%s' can not be mounted: %s\n"), dataset, diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am index a394a0dde3..983ff25dc9 100644 --- 
a/cmd/raidz_test/Makefile.am +++ b/cmd/raidz_test/Makefile.am @@ -4,11 +4,7 @@ include $(top_srcdir)/config/Rules.am AM_CFLAGS += $(FRAME_LARGER_THAN) # Unconditionally enable ASSERTs -AM_CPPFLAGS += -DDEBUG -UNDEBUG - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG bin_PROGRAMS = raidz_test @@ -18,6 +14,9 @@ raidz_test_SOURCES = \ raidz_bench.c raidz_test_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la -raidz_test_LDADD += -lm -ldl +raidz_test_LDADD += -lm + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index 4863b8d97b..f44d6fbde7 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -31,8 +31,6 @@ #include #include -#include - #include "raidz_test.h" #define GEN_BENCH_MEMORY (((uint64_t)1ULL)<<32) @@ -83,8 +81,17 @@ run_gen_bench_impl(const char *impl) /* create suitable raidz_map */ ncols = rto_opts.rto_dcols + fn + 1; zio_bench.io_size = 1ULL << ds; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, fn+1); + + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + zio_bench.io_size, zio_bench.io_offset, + rto_opts.rto_ashift, ncols+1, ncols, + fn+1, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + } /* estimate iteration count */ iter_cnt = GEN_BENCH_MEMORY; @@ -113,7 +120,7 @@ run_gen_bench_impl(const char *impl) } } -void +static void run_gen_bench(void) { char **impl_name; @@ -163,8 +170,16 @@ run_rec_bench_impl(const char *impl) (1ULL << BENCH_ASHIFT)) continue; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, PARITY_PQR); + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + 
zio_bench.io_size, zio_bench.io_offset, + BENCH_ASHIFT, ncols+1, ncols, + PARITY_PQR, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, PARITY_PQR); + } /* estimate iteration count */ iter_cnt = (REC_BENCH_MEMORY); @@ -197,7 +212,7 @@ run_rec_bench_impl(const char *impl) } } -void +static void run_rec_bench(void) { char **impl_name; diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index a05070399c..c1610a8d1b 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -37,11 +37,11 @@ static int *rand_data; raidz_test_opts_t rto_opts; -static char gdb[256]; -static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d"; +static char pid_s[16]; static void sig_handler(int signo) { + int old_errno = errno; struct sigaction action; /* * Restore default action and re-raise signal so SIGSEGV and @@ -52,10 +52,19 @@ static void sig_handler(int signo) action.sa_flags = 0; (void) sigaction(signo, &action, NULL); - if (rto_opts.rto_gdb) - if (system(gdb)) { } + if (rto_opts.rto_gdb) { + pid_t pid = fork(); + if (pid == 0) { + execlp("gdb", "gdb", "-ex", "set pagination 0", + "-p", pid_s, NULL); + _exit(-1); + } else if (pid > 0) + while (waitpid(pid, NULL, 0) == -1 && errno == EINTR) + ; + } raise(signo); + errno = old_errno; } static void print_opts(raidz_test_opts_t *opts, boolean_t force) @@ -77,16 +86,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force) (void) fprintf(stdout, DBLSEP "Running with options:\n" " (-a) zio ashift : %zu\n" " (-o) zio offset : 1 << %zu\n" + " (-e) expanded map : %s\n" + " (-r) reflow offset : %llx\n" " (-d) number of raidz data columns : %zu\n" " (-s) size of DATA : 1 << %zu\n" " (-S) sweep parameters : %s \n" " (-v) verbose : %s \n\n", - opts->rto_ashift, /* -a */ - ilog2(opts->rto_offset), /* -o */ - opts->rto_dcols, /* -d */ - ilog2(opts->rto_dsize), /* -s */ - opts->rto_sweep ? 
"yes" : "no", /* -S */ - verbose); /* -v */ + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)opts->rto_expand_offset, /* -r */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? "yes" : "no", /* -S */ + verbose); /* -v */ } } @@ -104,6 +117,8 @@ static void usage(boolean_t requested) "\t[-S parameter sweep (default: %s)]\n" "\t[-t timeout for parameter sweep test]\n" "\t[-B benchmark all raidz implementations]\n" + "\t[-e use expanded raidz map (default: %s)]\n" + "\t[-r expanded raidz map reflow offset (default: %llx)]\n" "\t[-v increase verbosity (default: %zu)]\n" "\t[-h (print help)]\n" "\t[-T test the test, see if failure would be detected]\n" @@ -114,6 +129,8 @@ static void usage(boolean_t requested) o->rto_dcols, /* -d */ ilog2(o->rto_dsize), /* -s */ rto_opts.rto_sweep ? "yes" : "no", /* -S */ + rto_opts.rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)o->rto_expand_offset, /* -r */ o->rto_v); /* -d */ exit(requested ? 
0 : 1); @@ -128,7 +145,7 @@ static void process_options(int argc, char **argv) bcopy(&rto_opts_defaults, o, sizeof (*o)); - while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { value = 0; switch (opt) { @@ -136,6 +153,12 @@ static void process_options(int argc, char **argv) value = strtoull(optarg, NULL, 0); o->rto_ashift = MIN(13, MAX(9, value)); break; + case 'e': + o->rto_expand = 1; + break; + case 'r': + o->rto_expand_offset = strtoull(optarg, NULL, 0); + break; case 'o': value = strtoull(optarg, NULL, 0); o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; @@ -179,25 +202,34 @@ static void process_options(int argc, char **argv) } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) -#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) +#define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd) +#define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) -#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) +#define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd) +#define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size) static int cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) { - int i, ret = 0; + int r, i, ret = 0; VERIFY(parity >= 1 && parity <= 3); - for (i = 0; i < parity; i++) { - if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) - != 0) { - ret++; - LOG_OPT(D_DEBUG, opts, - "\nParity block [%d] different!\n", i); + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t * const rr = rm->rm_row[r]; + raidz_row_t * const rrg = opts->rm_golden->rm_row[r]; + for (i = 0; i < parity; i++) { + if (CODE_COL_SIZE(rrg, i) == 0) { + VERIFY0(CODE_COL_SIZE(rr, i)); + continue; + } + + if (abd_cmp(CODE_COL(rr, i), + CODE_COL(rrg, i)) != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } } } return 
(ret); @@ -206,16 +238,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) static int cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) { - int i, ret = 0; - int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + int r, i, dcols, ret = 0; - for (i = 0; i < dcols; i++) { - if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) - != 0) { - ret++; + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + raidz_row_t *rrg = opts->rm_golden->rm_row[r]; + dcols = opts->rm_golden->rm_row[0]->rr_cols - + raidz_parity(opts->rm_golden); + for (i = 0; i < dcols; i++) { + if (DATA_COL_SIZE(rrg, i) == 0) { + VERIFY0(DATA_COL_SIZE(rr, i)); + continue; + } - LOG_OPT(D_DEBUG, opts, - "\nData block [%d] different!\n", i); + if (abd_cmp(DATA_COL(rrg, i), + DATA_COL(rr, i)) != 0) { + ret++; + + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } } } return (ret); @@ -236,12 +278,13 @@ init_rand(void *data, size_t size, void *private) static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { - int i; - raidz_col_t *col; - - for (i = 0; i < cnt; i++) { - col = &rm->rm_col[tgts[i]]; - abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + for (int i = 0; i < cnt; i++) { + raidz_col_t *col = &rr->rr_col[tgts[i]]; + abd_iterate_func(col->rc_abd, 0, col->rc_size, + init_rand, NULL); + } } } @@ -288,10 +331,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) VERIFY0(vdev_raidz_impl_set("original")); - opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, - opts->rto_ashift, total_ncols, parity); - rm_test = vdev_raidz_map_alloc(zio_test, - opts->rto_ashift, total_ncols, parity); + if (opts->rto_expand) { + opts->rm_golden = + vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, + opts->zio_golden->io_size, opts->zio_golden->io_offset, + opts->rto_ashift, total_ncols+1, 
total_ncols, + parity, opts->rto_expand_offset); + rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, + zio_test->io_size, zio_test->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + } VERIFY(opts->zio_golden); VERIFY(opts->rm_golden); @@ -312,6 +367,187 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } +/* + * If reflow is not in progress, reflow_offset should be UINT64_MAX. + * For each row, if the row is entirely before reflow_offset, it will + * come from the new location. Otherwise this row will come from the + * old location. Therefore, rows that straddle the reflow_offset will + * come from the old location. + * + * NOTE: Until raidz expansion is implemented this function is only + * needed by raidz_test.c to the multi-row raid_map_t functionality. + */ +raidz_map_t * +vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset) +{ + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t q, r, bc, devidx, asize = 0, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. 
+ */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + + for (uint64_t row = 0; row < rows; row++) { + raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, + rr_col[cols]), KM_SLEEP); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and any part of this + * row has not been copied, then use the old location of + * this row. + */ + int row_phys_cols = physical_cols; + if (b + (logical_cols - nparity) > reflow_offset >> ashift) + row_phys_cols--; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * We set cols to the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". 
+ */ + rr->rr_cols = cols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + rr->rr_col[c].rc_devidx = child_id; + rr->rr_col[c].rc_offset = child_offset; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, + B_TRUE); + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end, this for parity generation. + */ + rr->rr_col[c].rc_size = 0; + rr->rr_col[c].rc_abd = NULL; + } else { + /* + * "data column" (col excluding parity) + * Add an ASCII art diagram here + */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = abd_get_offset_struct( + &rr->rr_col[c].rc_abdstruct, + abd, off << ashift, 1 << ashift); + } + + asize += rr->rr_col[c].rc_size; + } + /* + * If all data stored spans all columns, there's a danger that + * parity will always be on the same device and, since parity + * isn't read during normal operation, that that device's I/O + * bandwidth won't be used effectively. We therefore switch + * the parity every 1MB. + * + * ...at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices + * evenly, we won't see any benefit. 
Further, occasional writes + * that aren't a multiple of the LCM of the number of children + * and the minimum stripe width are sufficient to avoid pessimal + * behavior. Unfortunately, this decision created an implicit + * on-disk format requirement that we need to support for all + * eternity, but only for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for + * padding we must make sure to note this swap. We will never + * intend to skip the first column since at least one data and + * one parity column must appear in each row. + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + devidx = rr->rr_col[0].rc_devidx; + uint64_t o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } + + } + ASSERT3U(asize, ==, tot << ashift); + + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -330,8 +566,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) (*zio)->io_abd = raidz_alloc(alloc_dsize); init_zio_abd(*zio); - rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, - total_ncols, parity); + if (opts->rto_expand) { + rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, + (*zio)->io_size, (*zio)->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + } VERIFY(rm); /* Make sure code columns are destroyed */ @@ -420,7 +663,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) if (fn < RAIDZ_REC_PQ) { /* can reconstruct 1 failed data disk */ for (x0 = 0; x0 < 
opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; /* Check if should stop */ @@ -445,10 +688,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else if (fn < RAIDZ_REC_PQR) { /* can reconstruct 2 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -475,14 +719,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else { /* can reconstruct 3 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { - if (x2 >= - rm->rm_cols - raidz_parity(rm)) + if (x2 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -700,6 +945,8 @@ run_sweep(void) opts->rto_dcols = dcols_v[d]; opts->rto_offset = (1 << ashift_v[a]) * rand(); opts->rto_dsize = size_v[s]; + opts->rto_expand = rto_opts.rto_expand; + opts->rto_expand_offset = rto_opts.rto_expand_offset; opts->rto_v = 0; /* be quiet */ VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, @@ -732,6 +979,7 @@ exit: return (sweep_state == SWEEP_ERROR ? 
SWEEP_ERROR : 0); } + int main(int argc, char **argv) { @@ -739,8 +987,8 @@ main(int argc, char **argv) struct sigaction action; int err = 0; - /* init gdb string early */ - (void) sprintf(gdb, gdb_tmpl, getpid()); + /* init gdb pid string early */ + (void) sprintf(pid_s, "%d", getpid()); action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); @@ -757,7 +1005,7 @@ main(int argc, char **argv) process_options(argc, argv); - kernel_init(FREAD); + kernel_init(SPA_MODE_READ); /* setup random data because rand() is not reentrant */ rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index a7fd26b8b2..0f7f4cee3e 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -38,18 +38,21 @@ static const char *raidz_impl_names[] = { "avx512bw", "aarch64_neon", "aarch64_neonx2", + "powerpc_altivec", NULL }; typedef struct raidz_test_opts { size_t rto_ashift; - size_t rto_offset; + uint64_t rto_offset; size_t rto_dcols; size_t rto_dsize; size_t rto_v; size_t rto_sweep; size_t rto_sweep_timeout; size_t rto_benchmark; + size_t rto_expand; + uint64_t rto_expand_offset; size_t rto_sanity; size_t rto_gdb; @@ -68,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = { .rto_v = 0, .rto_sweep = 0, .rto_benchmark = 0, + .rto_expand = 0, + .rto_expand_offset = -1ULL, .rto_sanity = 0, .rto_gdb = 0, .rto_should_stop = B_FALSE @@ -112,4 +117,7 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); +struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + #endif /* RAIDZ_TEST_H */ diff --git a/cmd/vdev_id/Makefile.am b/cmd/vdev_id/Makefile.am index fb815faad0..4071c6d5ed 100644 --- a/cmd/vdev_id/Makefile.am +++ b/cmd/vdev_id/Makefile.am @@ -1 +1,3 @@ +include $(top_srcdir)/config/Shellcheck.am + dist_udev_SCRIPTS = vdev_id diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id index 
3796ab4885..8cc4399a56 100755 --- a/cmd/vdev_id/vdev_id +++ b/cmd/vdev_id/vdev_id @@ -79,6 +79,34 @@ # channel 86:00.0 1 A # channel 86:00.0 0 B +# # +# # Example vdev_id.conf - multipath / multijbod-daisychaining +# # +# +# multipath yes +# multijbod yes +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 1 A +# channel 85:00.0 0 B +# channel 86:00.0 1 A +# channel 86:00.0 0 B + +# # +# # Example vdev_id.conf - multipath / mixed +# # +# +# multipath yes +# slot mix +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 3 A +# channel 85:00.0 2 B +# channel 86:00.0 3 A +# channel 86:00.0 2 B +# channel af:00.0 0 C +# channel af:00.0 1 C + # # # # Example vdev_id.conf - alias # # @@ -92,9 +120,10 @@ PATH=/bin:/sbin:/usr/bin:/usr/sbin CONFIG=/etc/zfs/vdev_id.conf PHYS_PER_PORT= DEV= -MULTIPATH= TOPOLOGY= BAY= +ENCL_ID="" +UNIQ_ENCL_ID="" usage() { cat << EOF @@ -102,71 +131,153 @@ Usage: vdev_id [-h] vdev_id <-d device> [-c config_file] [-p phys_per_port] [-g sas_direct|sas_switch|scsi] [-m] - -c specify name of alernate config file [default=$CONFIG] + -c specify name of an alternative config file [default=$CONFIG] -d specify basename of device (i.e. 
sda) -e Create enclose device symlinks only (/dev/by-enclosure) -g Storage network topology [default="$TOPOLOGY"] -m Run in multipath mode + -j Run in multijbod mode -p number of phy's per switch port [default=$PHYS_PER_PORT] -h show this summary EOF - exit 0 + exit 1 + # exit with error to avoid processing usage message by a udev rule } map_slot() { - local LINUX_SLOT=$1 - local CHANNEL=$2 - local MAPPED_SLOT= + LINUX_SLOT=$1 + CHANNEL=$2 - MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \ - \\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG` + MAPPED_SLOT=$(awk -v linux_slot="$LINUX_SLOT" -v channel="$CHANNEL" \ + '$1 == "slot" && $2 == linux_slot && \ + ($4 ~ "^"channel"$" || $4 ~ /^$/) { print $3; exit}' $CONFIG) if [ -z "$MAPPED_SLOT" ] ; then MAPPED_SLOT=$LINUX_SLOT fi - printf "%d" ${MAPPED_SLOT} + printf "%d" "${MAPPED_SLOT}" } map_channel() { - local MAPPED_CHAN= - local PCI_ID=$1 - local PORT=$2 + MAPPED_CHAN= + PCI_ID=$1 + PORT=$2 case $TOPOLOGY in "sas_switch") - MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \ - { print \\$3; exit }" $CONFIG` + MAPPED_CHAN=$(awk -v port="$PORT" \ + '$1 == "channel" && $2 == port \ + { print $3; exit }' $CONFIG) ;; "sas_direct"|"scsi") - MAPPED_CHAN=`awk "\\$1 == \"channel\" && \ - \\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \ - { print \\$4; exit }" $CONFIG` + MAPPED_CHAN=$(awk -v pciID="$PCI_ID" -v port="$PORT" \ + '$1 == "channel" && $2 == pciID && $3 == port \ + {print $4}' $CONFIG) ;; esac - printf "%s" ${MAPPED_CHAN} + printf "%s" "${MAPPED_CHAN}" +} + +get_encl_id() { + set -- $(echo $1) + count=$# + + i=1 + while [ $i -le $count ] ; do + d=$(eval echo '$'{$i}) + id=$(cat "/sys/class/enclosure/${d}/id") + ENCL_ID="${ENCL_ID} $id" + i=$((i + 1)) + done +} + +get_uniq_encl_id() { + for uuid in ${ENCL_ID}; do + found=0 + + for count in ${UNIQ_ENCL_ID}; do + if [ $count = $uuid ]; then + found=1 + break + fi + done + + if [ $found -eq 0 ]; then + UNIQ_ENCL_ID="${UNIQ_ENCL_ID} $uuid" + fi + 
done +} + +# map_jbod explainer: The bsg driver knows the difference between a SAS +# expander and fanout expander. Use hostX instance along with top-level +# (whole enclosure) expander instances in /sys/class/enclosure and +# matching a field in an array of expanders, using the index of the +# matched array field as the enclosure instance, thereby making jbod IDs +# dynamic. Avoids reliance on high overhead userspace commands like +# multipath and lsscsi and instead uses existing sysfs data. $HOSTCHAN +# variable derived from devpath gymnastics in sas_handler() function. +map_jbod() { + DEVEXP=$(ls -l "/sys/block/$DEV/device/" | grep enclos | awk -F/ '{print $(NF-1) }') + DEV=$1 + + # Use "set --" to create index values (Arrays) + set -- $(ls -l /sys/class/enclosure | grep -v "^total" | awk '{print $9}') + # Get count of total elements + JBOD_COUNT=$# + JBOD_ITEM=$* + + # Build JBODs (enclosure) id from sys/class/enclosure//id + get_encl_id "$JBOD_ITEM" + # Different expander instances for each paths. + # Filter out and keep only unique id. + get_uniq_encl_id + + # Identify final 'mapped jbod' + j=0 + for count in ${UNIQ_ENCL_ID}; do + i=1 + j=$((j + 1)) + while [ $i -le $JBOD_COUNT ] ; do + d=$(eval echo '$'{$i}) + id=$(cat "/sys/class/enclosure/${d}/id") + if [ "$d" = "$DEVEXP" ] && [ $id = $count ] ; then + MAPPED_JBOD=$j + break + fi + i=$((i + 1)) + done + done + + printf "%d" "${MAPPED_JBOD}" } sas_handler() { if [ -z "$PHYS_PER_PORT" ] ; then - PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ - {print \\$2; exit}" $CONFIG` + PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ + {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} - if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then + + if ! 
echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then - MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ - {print \\$2; exit}" $CONFIG` + MULTIPATH_MODE=$(awk '$1 == "multipath" \ + {print $2; exit}' $CONFIG) + fi + + if [ -z "$MULTIJBOD_MODE" ] ; then + MULTIJBOD_MODE=$(awk '$1 == "multijbod" \ + {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then - DM_NAME=`ls -l --full-time /dev/mapper | - awk "/\/$DEV$/{print \\$9}"` + DM_NAME=$(ls -l --full-time /dev/mapper | + grep "$DEV"$ | awk '{print $9}') fi # For raw disks udev exports DEVTYPE=partition when @@ -176,28 +287,50 @@ sas_handler() { # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then - PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + # Match p[number], remove the 'p' and prepend "-part" + PART=$(echo "$DM_NAME" | + awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi # Strip off partition information. - DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi - # Get the raw scsi device name from multipath -ll. Strip off - # leading pipe symbols to make field numbering consistent. - DEV=`multipath -ll $DM_NAME | - awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + # Utilize DM device name to gather subordinate block devices + # using sysfs to avoid userspace utilities + + # If our DEVNAME is something like /dev/dm-177, then we may be + # able to get our DMDEV from it. + DMDEV=$(echo $DEVNAME | sed 's;/dev/;;g') + if [ ! 
-e /sys/block/$DMDEV/slaves/* ] ; then + # It's not there, try looking in /dev/mapper + DMDEV=$(ls -l --full-time /dev/mapper | grep $DM_NAME | + awk '{gsub("../", " "); print $NF}') + fi + + # Use sysfs pointers in /sys/block/dm-X/slaves because using + # userspace tools creates lots of overhead and should be avoided + # whenever possible. Use awk to isolate lowest instance of + # sd device member in dm device group regardless of string + # length. + DEV=$(ls "/sys/block/$DMDEV/slaves" | awk ' + { len=sprintf ("%20s",length($0)); gsub(/ /,0,str); a[NR]=len "_" $0; } + END { + asort(a) + print substr(a[1],22) + }') + if [ -z "$DEV" ] ; then return fi fi - if echo $DEV | grep -q ^/devices/ ; then + if echo "$DEV" | grep -q ^/devices/ ; then sys_path=$DEV else - sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null) fi # Use positional parameters as an ad-hoc array @@ -207,84 +340,104 @@ sas_handler() { # Get path up to /sys/.../hostX i=1 - while [ $i -le $num_dirs ] ; do - d=$(eval echo \${$i}) + + while [ $i -le "$num_dirs" ] ; do + d=$(eval echo '$'{$i}) scsi_host_dir="$scsi_host_dir/$d" - echo $d | grep -q -E '^host[0-9]+$' && break - i=$(($i + 1)) + echo "$d" | grep -q -E '^host[0-9]+$' && break + i=$((i + 1)) done - if [ $i = $num_dirs ] ; then + # Lets grab the SAS host channel number and save it for JBOD sorting later + HOSTCHAN=$(echo "$d" | awk -F/ '{ gsub("host","",$NF); print $NF}') + + if [ $i = "$num_dirs" ] ; then return fi - PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}') # In sas_switch mode, the directory four levels beneath # /sys/.../hostX contains symlinks to phy devices that reveal # the switch port number. In sas_direct mode, the phy links one # directory down reveal the HBA port. 
port_dir=$scsi_host_dir + case $TOPOLOGY in - "sas_switch") j=$(($i + 4)) ;; - "sas_direct") j=$(($i + 1)) ;; + "sas_switch") j=$((i + 4)) ;; + "sas_direct") j=$((i + 1)) ;; esac - i=$(($i + 1)) + i=$((i + 1)) + while [ $i -le $j ] ; do - port_dir="$port_dir/$(eval echo \${$i})" - i=$(($i + 1)) + port_dir="$port_dir/$(eval echo '$'{$i})" + i=$((i + 1)) done - PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'` + PHY=$(ls -vd "$port_dir"/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}') if [ -z "$PHY" ] ; then PHY=0 fi - PORT=$(( $PHY / $PHYS_PER_PORT )) + PORT=$((PHY / PHYS_PER_PORT)) # Look in /sys/.../sas_device/end_device-X for the bay_identifier # attribute. end_device_dir=$port_dir - while [ $i -lt $num_dirs ] ; do - d=$(eval echo \${$i}) + + while [ $i -lt "$num_dirs" ] ; do + d=$(eval echo '$'{$i}) end_device_dir="$end_device_dir/$d" - if echo $d | grep -q '^end_device' ; then + if echo "$d" | grep -q '^end_device' ; then end_device_dir="$end_device_dir/sas_device/$d" break fi - i=$(($i + 1)) + i=$((i + 1)) done + # Add 'mix' slot type for environments where dm-multipath devices + # include end-devices connected via SAS expanders or direct connection + # to SAS HBA. A mixed connectivity environment such as pool devices + # contained in a SAS JBOD and spare drives or log devices directly + # connected in a server backplane without expanders in the I/O path. 
SLOT= + case $BAY in "bay") - SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null` + SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null) + ;; + "mix") + if [ $(cat "$end_device_dir/bay_identifier" 2>/dev/null) ] ; then + SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null) + else + SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null) + fi ;; "phy") - SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null` + SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null) ;; "port") - d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + d=$(eval echo '$'{$i}) + SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "id") - i=$(($i + 1)) - d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + i=$((i + 1)) + d=$(eval echo '$'{$i}) + SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "lun") - i=$(($i + 2)) - d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + i=$((i + 2)) + d=$(eval echo '$'{$i}) + SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "ses") # look for this SAS path in all SCSI Enclosure Services # (SES) enclosures - sas_address=`cat $end_device_dir/sas_address 2>/dev/null` - enclosures=`lsscsi -g | \ - sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'` + sas_address=$(cat "$end_device_dir/sas_address" 2>/dev/null) + enclosures=$(lsscsi -g | \ + sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p') for enclosure in $enclosures; do - set -- $(sg_ses -p aes $enclosure | \ + set -- $(sg_ses -p aes "$enclosure" | \ awk "/device slot number:/{slot=\$12} \ /SAS address: $sas_address/\ {print slot}") @@ -299,42 +452,55 @@ sas_handler() { return fi - CHAN=`map_channel $PCI_ID $PORT` - SLOT=`map_slot $SLOT $CHAN` - if [ -z "$CHAN" ] ; then - return + if [ "$MULTIJBOD_MODE" = "yes" ] ; then + CHAN=$(map_channel "$PCI_ID" "$PORT") + SLOT=$(map_slot "$SLOT" "$CHAN") + JBOD=$(map_jbod "$DEV") + + if [ -z "$CHAN" ] ; then + return + fi + echo "${CHAN}"-"${JBOD}"-"${SLOT}${PART}" + else + CHAN=$(map_channel "$PCI_ID" "$PORT") + SLOT=$(map_slot "$SLOT" "$CHAN") + + if [ -z "$CHAN" 
] ; then + return + fi + echo "${CHAN}${SLOT}${PART}" fi - echo ${CHAN}${SLOT}${PART} } scsi_handler() { if [ -z "$FIRST_BAY_NUMBER" ] ; then - FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \ - {print \\$2; exit}" $CONFIG` + FIRST_BAY_NUMBER=$(awk '$1 == "first_bay_number" \ + {print $2; exit}' $CONFIG) fi FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} if [ -z "$PHYS_PER_PORT" ] ; then - PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ - {print \\$2; exit}" $CONFIG` + PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ + {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} - if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then + + if ! echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then - MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ - {print \\$2; exit}" $CONFIG` + MULTIPATH_MODE=$(awk '$1 == "multipath" \ + {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then - DM_NAME=`ls -l --full-time /dev/mapper | - awk "/\/$DEV$/{print \\$9}"` + DM_NAME=$(ls -l --full-time /dev/mapper | + grep "$DEV"$ | awk '{print $9}') fi # For raw disks udev exports DEVTYPE=partition when @@ -344,28 +510,30 @@ scsi_handler() { # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then - PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + # Match p[number], remove the 'p' and prepend "-part" + PART=$(echo "$DM_NAME" | + awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi # Strip off partition information. - DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi # Get the raw scsi device name from multipath -ll. 
Strip off # leading pipe symbols to make field numbering consistent. - DEV=`multipath -ll $DM_NAME | - awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + DEV=$(multipath -ll "$DM_NAME" | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}') if [ -z "$DEV" ] ; then return fi fi - if echo $DEV | grep -q ^/devices/ ; then + if echo "$DEV" | grep -q ^/devices/ ; then sys_path=$DEV else - sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null) fi # expect sys_path like this, for example: @@ -378,44 +546,47 @@ scsi_handler() { # Get path up to /sys/.../hostX i=1 - while [ $i -le $num_dirs ] ; do - d=$(eval echo \${$i}) + + while [ $i -le "$num_dirs" ] ; do + d=$(eval echo '$'{$i}) scsi_host_dir="$scsi_host_dir/$d" - echo $d | grep -q -E '^host[0-9]+$' && break - i=$(($i + 1)) + + echo "$d" | grep -q -E '^host[0-9]+$' && break + i=$((i + 1)) done - if [ $i = $num_dirs ] ; then + if [ $i = "$num_dirs" ] ; then return fi - PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}') # In scsi mode, the directory two levels beneath # /sys/.../hostX reveals the port and slot. 
port_dir=$scsi_host_dir - j=$(($i + 2)) + j=$((i + 2)) - i=$(($i + 1)) + i=$((i + 1)) while [ $i -le $j ] ; do - port_dir="$port_dir/$(eval echo \${$i})" - i=$(($i + 1)) + port_dir="$port_dir/$(eval echo '$'{$i})" + i=$((i + 1)) done - set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') + set -- $(echo "$port_dir" | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') PORT=$1 - SLOT=$(($2 + $FIRST_BAY_NUMBER)) + SLOT=$(($2 + FIRST_BAY_NUMBER)) if [ -z "$SLOT" ] ; then return fi - CHAN=`map_channel $PCI_ID $PORT` - SLOT=`map_slot $SLOT $CHAN` + CHAN=$(map_channel "$PCI_ID" "$PORT") + SLOT=$(map_slot "$SLOT" "$CHAN") + if [ -z "$CHAN" ] ; then return fi - echo ${CHAN}${SLOT}${PART} + echo "${CHAN}${SLOT}${PART}" } # Figure out the name for the enclosure symlink @@ -426,7 +597,7 @@ enclosure_handler () { # Get the enclosure ID ("0:0:0:0") ENC=$(basename $(readlink -m "/sys/$DEVPATH/../..")) - if [ ! -d /sys/class/enclosure/$ENC ] ; then + if [ ! -d "/sys/class/enclosure/$ENC" ] ; then # Not an enclosure, bail out return fi @@ -434,14 +605,14 @@ enclosure_handler () { # Get the long sysfs device path to our enclosure. Looks like: # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0 - ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC) + ENC_DEVICE=$(readlink "/sys/class/enclosure/$ENC") # Grab the full path to the hosts port dir: # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0 - PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+') + PORT_DIR=$(echo "$ENC_DEVICE" | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+') # Get the port number - PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$") + PORT_ID=$(echo "$PORT_DIR" | grep -Eo "[0-9]+$") # The PCI directory is two directories up from the port directory # /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0 @@ -452,7 +623,7 @@ enclosure_handler () { # Name our device according to vdev_id.conf (like "L0" or "U1"). 
NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ - \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG) + \$3 == \"$PORT_ID\") {print \$4\$3}}" $CONFIG) echo "${NAME}" } @@ -487,10 +658,12 @@ alias_handler () { # digits as partitions, causing alias creation to fail. This # ambiguity seems unavoidable, so devices using this facility # must not use such names. - local DM_PART= - if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then + DM_PART= + if echo "$DM_NAME" | grep -q -E 'p[0-9][0-9]*$' ; then if [ "$DEVTYPE" != "partition" ] ; then - DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + # Match p[number], remove the 'p' and prepend "-part" + DM_PART=$(echo "$DM_NAME" | + awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi fi @@ -498,21 +671,25 @@ alias_handler () { for link in $DEVLINKS ; do # Remove partition information to match key of top-level device. if [ -n "$DM_PART" ] ; then - link=`echo $link | sed 's/p[0-9][0-9]*$//'` + link=$(echo "$link" | sed 's/p[0-9][0-9]*$//') fi # Check both the fully qualified and the base name of link. - for l in $link `basename $link` ; do - alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \ - { print \\$2; exit }" $CONFIG` - if [ -n "$alias" ] ; then - echo ${alias}${DM_PART} - return + for l in $link $(basename "$link") ; do + if [ ! -z "$l" ]; then + alias=$(awk -v var="$l" '($1 == "alias") && \ + ($3 == var) \ + { print $2; exit }' $CONFIG) + if [ -n "$alias" ] ; then + echo "${alias}${DM_PART}" + return + fi fi done done } -while getopts 'c:d:eg:mp:h' OPTION; do +# main +while getopts 'c:d:eg:jmp:h' OPTION; do case ${OPTION} in c) CONFIG=${OPTARG} @@ -525,7 +702,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do # create the enclosure device symlinks only. We also need # "enclosure_symlinks yes" set in vdev_id.config to actually create the # symlink. 
- ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG) + ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") \ + print $2}' "$CONFIG") + if [ "$ENCLOSURE_MODE" != "yes" ] ; then exit 0 fi @@ -536,6 +715,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do p) PHYS_PER_PORT=${OPTARG} ;; + j) + MULTIJBOD_MODE=yes + ;; m) MULTIPATH_MODE=yes ;; @@ -545,34 +727,35 @@ while getopts 'c:d:eg:mp:h' OPTION; do esac done -if [ ! -r $CONFIG ] ; then - exit 0 +if [ ! -r "$CONFIG" ] ; then + echo "Error: Config file \"$CONFIG\" not found" + exit 1 fi -if [ -z "$DEV" -a -z "$ENCLOSURE_MODE" ] ; then +if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then echo "Error: missing required option -d" exit 1 fi if [ -z "$TOPOLOGY" ] ; then - TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG` + TOPOLOGY=$(awk '($1 == "topology") {print $2; exit}' "$CONFIG") fi if [ -z "$BAY" ] ; then - BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG` + BAY=$(awk '($1 == "slot") {print $2; exit}' "$CONFIG") fi TOPOLOGY=${TOPOLOGY:-sas_direct} # Should we create /dev/by-enclosure symlinks? -if [ "$ENCLOSURE_MODE" = "yes" -a "$TOPOLOGY" = "sas_direct" ] ; then +if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then ID_ENCLOSURE=$(enclosure_handler) if [ -z "$ID_ENCLOSURE" ] ; then exit 0 fi # Just create the symlinks to the enclosure devices and then exit. - ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG) + ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' "$CONFIG") if [ -z "$ENCLOSURE_PREFIX" ] ; then ENCLOSURE_PREFIX="enc" fi @@ -582,16 +765,16 @@ if [ "$ENCLOSURE_MODE" = "yes" -a "$TOPOLOGY" = "sas_direct" ] ; then fi # First check if an alias was defined for this device. 
-ID_VDEV=`alias_handler` +ID_VDEV=$(alias_handler) if [ -z "$ID_VDEV" ] ; then BAY=${BAY:-bay} case $TOPOLOGY in sas_direct|sas_switch) - ID_VDEV=`sas_handler` + ID_VDEV=$(sas_handler) ;; scsi) - ID_VDEV=`scsi_handler` + ID_VDEV=$(scsi_handler) ;; *) echo "Error: unknown topology $TOPOLOGY" diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am index 1fa7ec651b..c5858c2980 100644 --- a/cmd/zdb/Makefile.am +++ b/cmd/zdb/Makefile.am @@ -1,11 +1,7 @@ include $(top_srcdir)/config/Rules.am # Unconditionally enable debugging for zdb -AM_CPPFLAGS += -DDEBUG -UNDEBUG - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG sbin_PROGRAMS = zdb @@ -15,5 +11,8 @@ zdb_SOURCES = \ zdb.h zdb_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzpool/libzpool.la + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 4b07cdb8e0..8bbb77479b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -21,16 +21,22 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2020 Datto Inc. + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. 
+ * Copyright (c) 2021 Allan Jude + * Copyright (c) 2021 Toomas Soome */ #include #include -#include #include #include #include @@ -50,23 +56,28 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include @@ -83,6 +94,13 @@ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) +/* Some platforms require part of inode IDs to be remapped */ +#ifdef __APPLE__ +#define ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2) +#else +#define ZDB_MAP_OBJECT_ID(obj) (obj) +#endif + static char * zdb_ot_name(dmu_object_type_t type) { @@ -97,25 +115,650 @@ zdb_ot_name(dmu_object_type_t type) extern int reference_tracking_enable; extern int zfs_recover; -extern uint64_t zfs_arc_max, zfs_arc_meta_limit; +extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; +extern boolean_t spa_mode_readable_spacemaps; extern int zfs_reconstruct_indirect_combinations_max; +extern int zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); -uint64_t *zopt_object = NULL; -static unsigned zopt_objects = 0; -uint64_t max_inflight = 1000; +uint64_t *zopt_metaslab = NULL; +static unsigned zopt_metaslab_args = 0; + +typedef struct zopt_object_range { + uint64_t zor_obj_start; + uint64_t zor_obj_end; + uint64_t zor_flags; +} zopt_object_range_t; +zopt_object_range_t *zopt_object_ranges = NULL; +static unsigned zopt_object_args = 0; + +static int flagbits[256]; + +#define ZOR_FLAG_PLAIN_FILE 0x0001 +#define ZOR_FLAG_DIRECTORY 0x0002 +#define ZOR_FLAG_SPACE_MAP 0x0004 +#define ZOR_FLAG_ZAP 0x0008 +#define ZOR_FLAG_ALL_TYPES -1 +#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ 
+ ZOR_FLAG_DIRECTORY | \ + ZOR_FLAG_SPACE_MAP | \ + ZOR_FLAG_ZAP) + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + +uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static range_tree_t *mos_refd_objs; -static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); +static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, + boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); +static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx); + +typedef struct sublivelist_verify { + /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ + zfs_btree_t sv_pair; + + /* ALLOC's without a matching FREE, accumulates across sub-livelists */ + zfs_btree_t sv_leftover; +} sublivelist_verify_t; + +static int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = larg; + const blkptr_t *r = rarg; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev, r_dva0_vdev; + l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + if (l_dva0_vdev < r_dva0_vdev) + return (-1); + else if (l_dva0_vdev > r_dva0_vdev) + return (+1); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset; + uint64_t r_dva0_offset; + l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset < r_dva0_offset) { + return (-1); + } else if (l_dva0_offset > r_dva0_offset) { + return (+1); + } + + /* + * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, + * it's possible the offsets are equal. 
In that case, sort by txg + */ + if (l->blk_birth < r->blk_birth) { + return (-1); + } else if (l->blk_birth > r->blk_birth) { + return (+1); + } + return (0); +} + +typedef struct sublivelist_verify_block { + dva_t svb_dva; + + /* + * We need this to check if the block marked as allocated + * in the livelist was freed (and potentially reallocated) + * in the metaslab spacemaps at a later TXG. + */ + uint64_t svb_allocated_txg; +} sublivelist_verify_block_t; + +static void zdb_print_blkptr(const blkptr_t *bp, int flags); + +typedef struct sublivelist_verify_block_refcnt { + /* block pointer entry in livelist being verified */ + blkptr_t svbr_blk; + + /* + * Refcount gets incremented to 1 when we encounter the first + * FREE entry for the svfbr block pointer and a node for it + * is created in our ZDB verification/tracking metadata. + * + * As we encounter more FREE entries we increment this counter + * and similarly decrement it whenever we find the respective + * ALLOC entries for this block. + * + * When the refcount gets to 0 it means that all the FREE and + * ALLOC entries of this block have paired up and we no longer + * need to track it in our verification logic (e.g. the node + * containing this struct in our verification data structure + * should be freed). + * + * [refer to sublivelist_verify_blkptr() for the actual code] + */ + uint32_t svbr_refcnt; +} sublivelist_verify_block_refcnt_t; + +static int +sublivelist_block_refcnt_compare(const void *larg, const void *rarg) +{ + const sublivelist_verify_block_refcnt_t *l = larg; + const sublivelist_verify_block_refcnt_t *r = rarg; + return (livelist_compare(&l->svbr_blk, &r->svbr_blk)); +} + +static int +sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx) +{ + ASSERT3P(tx, ==, NULL); + struct sublivelist_verify *sv = arg; + sublivelist_verify_block_refcnt_t current = { + .svbr_blk = *bp, + + /* + * Start with 1 in case this is the first free entry. 
+ * This field is not used for our B-Tree comparisons + * anyway. + */ + .svbr_refcnt = 1, + }; + + zfs_btree_index_t where; + sublivelist_verify_block_refcnt_t *pair = + zfs_btree_find(&sv->sv_pair, ¤t, &where); + if (free) { + if (pair == NULL) { + /* first free entry for this block pointer */ + zfs_btree_add(&sv->sv_pair, ¤t); + } else { + pair->svbr_refcnt++; + } + } else { + if (pair == NULL) { + /* block that is currently marked as allocated */ + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + if (DVA_IS_EMPTY(&bp->blk_dva[i])) + break; + sublivelist_verify_block_t svb = { + .svb_dva = bp->blk_dva[i], + .svb_allocated_txg = bp->blk_birth + }; + + if (zfs_btree_find(&sv->sv_leftover, &svb, + &where) == NULL) { + zfs_btree_add_idx(&sv->sv_leftover, + &svb, &where); + } + } + } else { + /* alloc matches a free entry */ + pair->svbr_refcnt--; + if (pair->svbr_refcnt == 0) { + /* all allocs and frees have been matched */ + zfs_btree_remove_idx(&sv->sv_pair, &where); + } + } + } + + return (0); +} + +static int +sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) +{ + int err; + struct sublivelist_verify *sv = args; + + zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, + sizeof (sublivelist_verify_block_refcnt_t)); + + err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, + sv, NULL); + + sublivelist_verify_block_refcnt_t *e; + zfs_btree_index_t *cookie = NULL; + while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), + &e->svbr_blk, B_TRUE); + (void) printf("\tERROR: %d unmatched FREE(s): %s\n", + e->svbr_refcnt, blkbuf); + } + zfs_btree_destroy(&sv->sv_pair); + + return (err); +} + +static int +livelist_block_compare(const void *larg, const void *rarg) +{ + const sublivelist_verify_block_t *l = larg; + const sublivelist_verify_block_t *r = rarg; + + if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) + return (-1); + 
else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) + return (+1); + + if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) + return (-1); + else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) + return (+1); + + if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) + return (-1); + else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) + return (+1); + + return (0); +} + +/* + * Check for errors in a livelist while tracking all unfreed ALLOCs in the + * sublivelist_verify_t: sv->sv_leftover + */ +static void +livelist_verify(dsl_deadlist_t *dl, void *arg) +{ + sublivelist_verify_t *sv = arg; + dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); +} + +/* + * Check for errors in the livelist entry and discard the intermediary + * data structures + */ +/* ARGSUSED */ +static int +sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) +{ + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + int err = sublivelist_verify_func(&sv, dle); + zfs_btree_clear(&sv.sv_leftover); + zfs_btree_destroy(&sv.sv_leftover); + return (err); +} + +typedef struct metaslab_verify { + /* + * Tree containing all the leftover ALLOCs from the livelists + * that are part of this metaslab. + */ + zfs_btree_t mv_livelist_allocs; + + /* + * Metaslab information. + */ + uint64_t mv_vdid; + uint64_t mv_msid; + uint64_t mv_start; + uint64_t mv_end; + + /* + * What's currently allocated for this metaslab. 
+ */ + range_tree_t *mv_allocated; +} metaslab_verify_t; + +typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); + +typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, + void *arg); + +typedef struct unflushed_iter_cb_arg { + spa_t *uic_spa; + uint64_t uic_txg; + void *uic_arg; + zdb_log_sm_cb_t uic_cb; +} unflushed_iter_cb_arg_t; + +static int +iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) +{ + unflushed_iter_cb_arg_t *uic = arg; + return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); +} + +static void +iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + unflushed_iter_cb_arg_t uic = { + .uic_spa = spa, + .uic_txg = sls->sls_txg, + .uic_arg = arg, + .uic_cb = cb + }; + VERIFY0(space_map_iterate(sm, space_map_length(sm), + iterate_through_spacemap_logs_cb, &uic)); + space_map_close(sm); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static void +verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, + uint64_t offset, uint64_t size) +{ + sublivelist_verify_block_t svb; + DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); + DVA_SET_OFFSET(&svb.svb_dva, offset); + DVA_SET_ASIZE(&svb.svb_dva, size); + zfs_btree_index_t where; + uint64_t end_offset = offset + size; + + /* + * Look for an exact match for spacemap entry in the livelist entries. 
+ * Then, look for other livelist entries that fall within the range + * of the spacemap entry as it may have been condensed + */ + sublivelist_verify_block_t *found = + zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); + if (found == NULL) { + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); + } + for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && + DVA_GET_OFFSET(&found->svb_dva) < end_offset; + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + if (found->svb_allocated_txg <= txg) { + (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " + "from TXG %llx FREED at TXG %llx\n", + (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), + (u_longlong_t)found->svb_allocated_txg, + (u_longlong_t)txg); + } + } +} + +static int +metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t txg = sme->sme_txg; + + if (sme->sme_type == SM_ALLOC) { + if (range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE ALLOC: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_add(mv->mv_allocated, + offset, size); + } + } else { + if (!range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE FREE: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_remove(mv->mv_allocated, + offset, size); + } + } + + if (sme->sme_type != SM_ALLOC) { + /* + * If something is freed in the spacemap, verify that + * it is not listed as allocated in the livelist. 
+ */ + verify_livelist_allocs(mv, txg, offset, size); + } + return (0); +} + +static int +spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + if (vdev_id != mv->mv_vdid) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + if (ms->ms_id != mv->mv_msid) + return (0); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + + ASSERT3U(txg, ==, sme->sme_txg); + return (metaslab_spacemap_validation_cb(sme, mv)); +} + +static void +spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) +{ + iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); +} + +static void +spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) +{ + if (sm == NULL) + return; + + VERIFY0(space_map_iterate(sm, space_map_length(sm), + metaslab_spacemap_validation_cb, mv)); +} + +static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); + +/* + * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if + * they are part of that metaslab (mv_msid). 
+ */ +static void +mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) +{ + zfs_btree_index_t where; + sublivelist_verify_block_t *svb; + ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); + for (svb = zfs_btree_first(&sv->sv_leftover, &where); + svb != NULL; + svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { + if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && + (DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) + continue; + + if ((DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + zfs_btree_add(&mv->mv_livelist_allocs, svb); + } + + for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); + svb != NULL; + svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + zfs_btree_remove(&sv->sv_leftover, svb); + } +} + +/* + * [Livelist Check] + * Iterate through all the sublivelists and: + * - report leftover frees (**) + * - record leftover ALLOCs together with their TXG [see Cross Check] + * + * (**) Note: Double ALLOCs are valid in datasets that have dedup + * enabled. Similarly double FREEs are allowed as well but + * only if they pair up with a corresponding ALLOC entry once + * we our done with our sublivelist iteration. 
+ * + * [Spacemap Check] + * for each metaslab: + * - iterate over spacemap and then the metaslab's entries in the + * spacemap log, then report any double FREEs and ALLOCs (do not + * blow up). + * + * [Cross Check] + * After finishing the Livelist Check phase and while being in the + * Spacemap Check phase, we find all the recorded leftover ALLOCs + * of the livelist check that are part of the metaslab that we are + * currently looking at in the Spacemap Check. We report any entries + * that are marked as ALLOCs in the livelists but have been actually + * freed (and potentially allocated again) after their TXG stamp in + * the spacemaps. Also report any ALLOCs from the livelists that + * belong to indirect vdevs (e.g. their vdev completed removal). + * + * Note that this will miss Log Spacemap entries that cancelled each other + * out before being flushed to the metaslab, so we are not guaranteed + * to match all erroneous ALLOCs. + */ +static void +livelist_metaslab_validate(spa_t *spa) +{ + (void) printf("Verifying deleted livelist entries\n"); + + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + iterate_deleted_livelists(spa, livelist_verify, &sv); + + (void) printf("Verifying metaslab entries\n"); + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (!vdev_is_concrete(vd)) + continue; + + for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { + metaslab_t *m = vd->vdev_ms[mid]; + + (void) fprintf(stderr, + "\rverifying concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)mid, + (longlong_t)vd->vdev_ms_count); + + uint64_t shift, start; + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, m, + &start, &shift); + metaslab_verify_t mv; + mv.mv_allocated = range_tree_create(NULL, + type, NULL, start, shift); + mv.mv_vdid = vd->vdev_id; + mv.mv_msid = 
m->ms_id; + mv.mv_start = m->ms_start; + mv.mv_end = m->ms_start + m->ms_size; + zfs_btree_create(&mv.mv_livelist_allocs, + livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + + mv_populate_livelist_allocs(&mv, &sv); + + spacemap_check_ms_sm(m->ms_sm, &mv); + spacemap_check_sm_log(spa, &mv); + + range_tree_vacate(mv.mv_allocated, NULL, NULL); + range_tree_destroy(mv.mv_allocated); + zfs_btree_clear(&mv.mv_livelist_allocs); + zfs_btree_destroy(&mv.mv_livelist_allocs); + } + } + (void) fprintf(stderr, "\n"); + + /* + * If there are any segments in the leftover tree after we walked + * through all the metaslabs in the concrete vdevs then this means + * that we have segments in the livelists that belong to indirect + * vdevs and are marked as allocated. + */ + if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { + zfs_btree_destroy(&sv.sv_leftover); + return; + } + (void) printf("ERROR: Found livelist blocks marked as allocated " + "for indirect vdevs:\n"); + + zfs_btree_index_t *where = NULL; + sublivelist_verify_block_t *svb; + while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != + NULL) { + int vdev_id = DVA_GET_VDEV(&svb->svb_dva); + ASSERT3U(vdev_id, <, rvd->vdev_children); + vdev_t *vd = rvd->vdev_child[vdev_id]; + ASSERT(!vdev_is_concrete(vd)); + (void) printf("<%d:%llx:%llx> TXG %llx\n", + vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), + (u_longlong_t)svb->svb_allocated_txg); + } + (void) printf("\n"); + zfs_btree_destroy(&sv.sv_leftover); +} /* * These libumem hooks provide a reasonable set of defaults for the allocator's @@ -137,31 +780,45 @@ static void usage(void) { (void) fprintf(stderr, - "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p ...]] " + "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... 
[-t ] [-U ] [-x ]\n" - "\t\t[ [ ...]]\n" - "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] \n" - "\t\t[ ...]\n" + "\t\t[[/] [ ...]]\n" + "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ]\n" + "\t\t[[/] [ ...]\n" + "\t%s [-v] \n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O \n" + "\t%s -r \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname); + cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); - (void) fprintf(stderr, " If object numbers are specified, only " - "those objects are dumped\n\n"); + (void) fprintf(stderr, " If object numbers or object number " + "ranges are specified, only those\n" + " objects or ranges are dumped.\n\n"); + (void) fprintf(stderr, + " Object ranges take the form :[:]\n" + " start Starting object number\n" + " end Ending object number, or -1 for no upper bound\n" + " flags Optional flags to select object types:\n" + " A All objects (this is the default)\n" + " d ZFS directories\n" + " f ZFS files \n" + " m SPA space maps\n" + " z ZAPs\n" + " - Negate effect of next flag\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " @@ -181,12 +838,15 @@ usage(void) (void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -O perform object lookups by path\n"); + (void) fprintf(stderr, " -r copy an object by path to file\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -s report stats on zdb's 
I/O\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all " - "others)\n\n"); + "others)\n"); + (void) fprintf(stderr, " -y perform livelist and metaslab " + "validation on any livelists being deleted\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " @@ -218,6 +878,7 @@ usage(void) "work with dataset)\n"); (void) fprintf(stderr, " -Y attempt all reconstruction " "combinations for split blocks\n"); + (void) fprintf(stderr, " -Z show ZSTD headers \n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); @@ -420,6 +1081,57 @@ dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { + uint64_t *arr; + uint64_t oursize; + if (dump_opt['d'] < 6) + return; + + if (data == NULL) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, object, &doi)); + size = doi.doi_max_offset; + /* + * We cap the size at 1 mebibyte here to prevent + * allocation failures and nigh-infinite printing if the + * object is extremely large. + */ + oursize = MIN(size, 1 << 20); + arr = kmem_alloc(oursize, KM_SLEEP); + + int err = dmu_read(os, object, 0, oursize, arr, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(arr, oursize); + return; + } + } else { + /* + * Even though the allocation is already done in this code path, + * we still cap the size to prevent excessive printing. 
+ */ + oursize = MIN(size, 1 << 20); + arr = data; + } + + if (size == 0) { + (void) printf("\t\t[]\n"); + return; + } + + (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); + for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { + if (i % 4 != 0) + (void) printf(", %0llx", (u_longlong_t)arr[i]); + else + (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); + } + if (oursize != size) + (void) printf(", ... "); + (void) printf("]\n"); + + if (data == NULL) + kmem_free(arr, oursize); } /*ARGSUSED*/ @@ -447,7 +1159,21 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size) (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1) { - (void) printf("%s", (char *)prop); + if (strcmp(attr.za_name, + DSL_CRYPTO_KEY_MASTER_KEY) == 0 || + strcmp(attr.za_name, + DSL_CRYPTO_KEY_HMAC_KEY) == 0 || + strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 || + strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 || + strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) { + uint8_t *u8 = prop; + + for (i = 0; i < attr.za_num_integers; i++) { + (void) printf("%02x", u8[i]); + } + } else { + (void) printf("%s", (char *)prop); + } } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { @@ -498,12 +1224,16 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } - if (size >= sizeof (*bpop)) { + if (size >= BPOBJ_SIZE_V2) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } + if (size >= sizeof (*bpop)) { + (void) printf("\t\tnum_freed = %llu\n", + (u_longlong_t)bpop->bpo_num_freed); + } if (dump_opt['d'] < 5) return; @@ -518,7 +1248,8 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) (void) printf("got error %u from dmu_read\n", err); break; } - 
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, + BP_GET_FREE(&bp)); (void) printf("\t%s\n", blkbuf); } } @@ -758,6 +1489,12 @@ get_checkpoint_refcount(vdev_t *vd) return (refcount); } +static int +get_log_spacemap_refcount(spa_t *spa) +{ + return (avl_numnodes(&spa->spa_sm_logs_by_txg)); +} + static int verify_spacemap_refcounts(spa_t *spa) { @@ -772,6 +1509,7 @@ verify_spacemap_refcounts(spa_t *spa) actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); + actual_refcount += get_log_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " @@ -815,11 +1553,20 @@ dump_spacemap(objset_t *os, space_map_t *sm) sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { - (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", - (u_longlong_t)entry_id, - ddata[SM_DEBUG_ACTION_DECODE(word)], - (u_longlong_t)SM_DEBUG_TXG_DECODE(word), - (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); + uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); + uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); + if (de_txg == 0) { + (void) printf( + "\t [%6llu] PADDING\n", + (u_longlong_t)entry_id); + } else { + (void) printf( + "\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, + ddata[SM_DEBUG_ACTION_DECODE(word)], + (u_longlong_t)de_txg, + (u_longlong_t)de_sync_pass); + } entry_id++; continue; } @@ -870,7 +1617,7 @@ dump_spacemap(objset_t *os, space_map_t *sm) alloc -= entry_run; entry_id++; } - if ((uint64_t)alloc != space_map_allocated(sm)) { + if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); @@ -882,16 +1629,16 @@ dump_metaslab_stats(metaslab_t *msp) { char 
maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); - zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", - "segments", avl_numnodes(t), "maxsize", maxbuf, + "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); @@ -934,25 +1681,51 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + if (vd->vdev_ops == &vdev_draid_ops) + ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); + else + ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); + + if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", + (u_longlong_t)metaslab_unflushed_txg(msp)); + } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; - const char *bias_str; + const char *bias_str = ""; + if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { + bias_str = VDEV_ALLOC_BIAS_LOG; + } else if (alloc_bias == VDEV_BIAS_SPECIAL) { + bias_str = VDEV_ALLOC_BIAS_SPECIAL; + } else if (alloc_bias == VDEV_BIAS_DEDUP) { + bias_str = VDEV_ALLOC_BIAS_DEDUP; + } - bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ? - VDEV_ALLOC_BIAS_LOG : - (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : - (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : - vd->vdev_islog ? 
"log" : ""; + uint64_t ms_flush_data_obj = 0; + if (vd->vdev_top_zap != 0) { + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &ms_flush_data_obj); + if (error != ENOENT) { + ASSERT0(error); + } + } - (void) printf("\tvdev %10llu %s\n" - "\t%-10s%5llu %-19s %-15s %-12s\n", - (u_longlong_t)vd->vdev_id, bias_str, + (void) printf("\tvdev %10llu %s", + (u_longlong_t)vd->vdev_id, bias_str); + + if (ms_flush_data_obj != 0) { + (void) printf(" ms_unflushed_phys object %llu", + (u_longlong_t)ms_flush_data_obj); + } + + (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", @@ -1082,24 +1855,24 @@ dump_metaslabs(spa_t *spa) (void) printf("\nMetaslabs:\n"); - if (!dump_opt['d'] && zopt_objects > 0) { - c = zopt_object[0]; + if (!dump_opt['d'] && zopt_metaslab_args > 0) { + c = zopt_metaslab[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); - if (zopt_objects > 1) { + if (zopt_metaslab_args > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); - for (m = 1; m < zopt_objects; m++) { - if (zopt_object[m] < vd->vdev_ms_count) + for (m = 1; m < zopt_metaslab_args; m++) { + if (zopt_metaslab[m] < vd->vdev_ms_count) dump_metaslab( - vd->vdev_ms[zopt_object[m]]); + vd->vdev_ms[zopt_metaslab[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", - (u_longlong_t)zopt_object[m]); + (u_longlong_t)zopt_metaslab[m]); } (void) printf("\n"); return; @@ -1118,6 +1891,27 @@ dump_metaslabs(spa_t *spa) } } +static void +dump_log_spacemaps(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + (void) printf("\nLog Space Maps in Pool:\n"); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, 
spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + (void) printf("Log Spacemap object %llu txg %llu\n", + (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); + dump_spacemap(spa->spa_meta_objset, sm); + space_map_close(sm); + } + (void) printf("\n"); +} + static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { @@ -1308,10 +2102,7 @@ dump_history(spa_t *spa) uint64_t resid, len, off = 0; uint_t num = 0; int error; - time_t tsec; - struct tm t; char tbuf[30]; - char internalstr[MAXPATHLEN]; if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", @@ -1337,38 +2128,81 @@ dump_history(spa_t *spa) (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { - uint64_t time, txg, ievent; - char *cmd, *intstr; boolean_t printed = B_FALSE; - if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, - &time) != 0) - goto next; - if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, - &cmd) != 0) { - if (nvlist_lookup_uint64(events[i], - ZPOOL_HIST_INT_EVENT, &ievent) != 0) - goto next; - verify(nvlist_lookup_uint64(events[i], - ZPOOL_HIST_TXG, &txg) == 0); - verify(nvlist_lookup_string(events[i], - ZPOOL_HIST_INT_STR, &intstr) == 0); + if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) { + time_t tsec; + struct tm t; + + tsec = fnvlist_lookup_uint64(events[i], + ZPOOL_HIST_TIME); + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + } else { + tbuf[0] = '\0'; + } + + if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) { + (void) printf("%s %s\n", tbuf, + fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD)); + } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) { + uint64_t ievent; + + ievent = fnvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; - (void) snprintf(internalstr, - sizeof (internalstr), - "[internal %s txg:%lld] %s", + (void) printf(" %s [internal %s 
txg:%ju] %s\n", + tbuf, zfs_history_event_names[ievent], - (longlong_t)txg, intstr); - cmd = internalstr; - } - tsec = time; - (void) localtime_r(&tsec, &t); - (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); - (void) printf("%s %s\n", tbuf, cmd); - printed = B_TRUE; + fnvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG), + fnvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) { + (void) printf("%s [txg:%ju] %s", tbuf, + fnvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG), + fnvlist_lookup_string(events[i], + ZPOOL_HIST_INT_NAME)); + if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) { + (void) printf(" %s (%llu)", + fnvlist_lookup_string(events[i], + ZPOOL_HIST_DSNAME), + (u_longlong_t)fnvlist_lookup_uint64( + events[i], + ZPOOL_HIST_DSID)); + } + + (void) printf(" %s\n", fnvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) { + (void) printf("%s ioctl %s\n", tbuf, + fnvlist_lookup_string(events[i], + ZPOOL_HIST_IOCTL)); + + if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) { + (void) printf(" input:\n"); + dump_nvlist(fnvlist_lookup_nvlist(events[i], + ZPOOL_HIST_INPUT_NVL), 8); + } + if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) { + (void) printf(" output:\n"); + dump_nvlist(fnvlist_lookup_nvlist(events[i], + ZPOOL_HIST_OUTPUT_NVL), 8); + } + if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) { + (void) printf(" errno: %lld\n", + (longlong_t)fnvlist_lookup_int64(events[i], + ZPOOL_HIST_ERRNO)); + } + } else { + goto next; + } + + printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) @@ -1404,7 +2238,69 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, } static void -snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) +snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, + const blkptr_t *bp) +{ + abd_t *pabd; + void *buf; + zio_t *zio; + zfs_zstdhdr_t zstd_hdr; + int error; + + if 
(BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) + return; + + if (BP_IS_HOLE(bp)) + return; + + if (BP_IS_EMBEDDED(bp)) { + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + (void) fprintf(stderr, "out of memory\n"); + exit(1); + } + decode_embedded_bp_compressed(bp, buf); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + free(buf); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", + zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), + zfs_get_hdrlevel(&zstd_hdr)); + return; + } + + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + zio = zio_root(spa, NULL, NULL, 0); + + /* Decrypt but don't decompress so we can read the compression header */ + zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, + NULL)); + error = zio_wait(zio); + if (error) { + (void) fprintf(stderr, "read failed: %d\n", error); + return; + } + buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:NORMAL", + zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), + zfs_get_hdrlevel(&zstd_hdr)); + + abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); +} + +static void +snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, + boolean_t bp_freed) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; @@ -1412,6 +2308,10 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); + if (bp_freed) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + } return; } @@ -1449,11 +2349,20 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (bp_freed) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx", + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); } } static void -print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, +print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; @@ -1476,7 +2385,9 @@ print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, } } - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); + if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) + snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } @@ -1489,7 +2400,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, if (bp->blk_birth == 0) return (0); - print_indirect(bp, zb, dnp); + print_indirect(spa, bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; @@ -1498,6 +2409,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; + ASSERT(!BP_IS_REDACTED(bp)); err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); @@ -1710,12 
+2622,12 @@ dump_bptree(objset_t *os, uint64_t obj, const char *name) /* ARGSUSED */ static int -dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); } @@ -1740,14 +2652,28 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); - (void) printf(" %*s: object %llu, %llu local blkptrs, " - "%llu subobjs in object, %llu, %s (%s/%s comp)\n", - indent * 8, name, - (u_longlong_t)bpo->bpo_object, - (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, - (u_longlong_t)bpo->bpo_phys->bpo_subobjs, - bytes, comp, uncomp); + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu freed, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } else { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; @@ -1767,11 +2693,22 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) 
bpobj_close(&subbpo); } } else { - (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", - indent * 8, name, - (u_longlong_t)bpo->bpo_object, - (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - bytes); + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%llu freed, %s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + bytes); + } else { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + bytes); + } } if (dump_opt['d'] < 5) @@ -1784,6 +2721,128 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) } } +static int +dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, + boolean_t print_list) +{ + int err = 0; + zfs_bookmark_phys_t prop; + objset_t *mos = dp->dp_spa->spa_meta_objset; + err = dsl_bookmark_lookup(dp, name, NULL, &prop); + + if (err != 0) { + return (err); + } + + (void) printf("\t#%s: ", strchr(name, '#') + 1); + (void) printf("{guid: %llx creation_txg: %llu creation_time: " + "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, + (u_longlong_t)prop.zbm_creation_txg, + (u_longlong_t)prop.zbm_creation_time, + (u_longlong_t)prop.zbm_redaction_obj); + + IMPLY(print_list, print_redact); + if (!print_redact || prop.zbm_redaction_obj == 0) + return (0); + + redaction_list_t *rl; + VERIFY0(dsl_redaction_list_hold_obj(dp, + prop.zbm_redaction_obj, FTAG, &rl)); + + redaction_list_phys_t *rlp = rl->rl_phys; + (void) printf("\tRedacted:\n\t\tProgress: "); + if (rlp->rlp_last_object != UINT64_MAX || + rlp->rlp_last_blkid != UINT64_MAX) { + (void) printf("%llu %llu (incomplete)\n", + (u_longlong_t)rlp->rlp_last_object, + (u_longlong_t)rlp->rlp_last_blkid); + } else { + (void) printf("complete\n"); + } + (void) printf("\t\tSnapshots: ["); + for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { + 
if (i > 0) + (void) printf(", "); + (void) printf("%0llu", + (u_longlong_t)rlp->rlp_snaps[i]); + } + (void) printf("]\n\t\tLength: %llu\n", + (u_longlong_t)rlp->rlp_num_entries); + + if (!print_list) { + dsl_redaction_list_rele(rl, FTAG); + return (0); + } + + if (rlp->rlp_num_entries == 0) { + dsl_redaction_list_rele(rl, FTAG); + (void) printf("\t\tRedaction List: []\n\n"); + return (0); + } + + redact_block_phys_t *rbp_buf; + uint64_t size; + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); + size = doi.doi_max_offset; + rbp_buf = kmem_alloc(size, KM_SLEEP); + + err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, + rbp_buf, 0); + if (err != 0) { + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + return (err); + } + + (void) printf("\t\tRedaction List: [{object: %llx, offset: " + "%llx, blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[0].rbp_object, + (u_longlong_t)rbp_buf[0].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[0])), + (u_longlong_t)redact_block_get_count(&rbp_buf[0])); + + for (size_t i = 1; i < rlp->rlp_num_entries; i++) { + (void) printf(",\n\t\t{object: %llx, offset: %llx, " + "blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[i].rbp_object, + (u_longlong_t)rbp_buf[i].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[i])), + (u_longlong_t)redact_block_get_count(&rbp_buf[i])); + } + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + (void) printf("]\n\n"); + return (0); +} + +static void +dump_bookmarks(objset_t *os, int verbosity) +{ + zap_cursor_t zc; + zap_attribute_t attr; + dsl_dataset_t *ds = dmu_objset_ds(os); + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + objset_t *mos = os->os_spa->spa_meta_objset; + if (verbosity < 4) + return; + dsl_pool_config_enter(dp, FTAG); + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + char osname[ZFS_MAX_DATASET_NAME_LEN]; + char 
buf[ZFS_MAX_DATASET_NAME_LEN]; + dmu_objset_name(os, osname); + VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname, + attr.za_name)); + (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); + } + zap_cursor_fini(&zc); + dsl_pool_config_exit(dp, FTAG); +} + static void bpobj_count_refd(bpobj_t *bpo) { @@ -1811,36 +2870,59 @@ bpobj_count_refd(bpobj_t *bpo) } } -static void -dump_deadlist(dsl_deadlist_t *dl) +static int +dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) +{ + spa_t *spa = arg; + uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; + if (dle->dle_bpobj.bpo_object != empty_bpobj) + bpobj_count_refd(&dle->dle_bpobj); + return (0); +} + +static int +dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) +{ + ASSERT(arg == NULL); + if (dump_opt['d'] >= 5) { + char buf[128]; + (void) snprintf(buf, sizeof (buf), + "mintxg %llu -> obj %llu", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + + dump_full_bpobj(&dle->dle_bpobj, buf, 0); + } else { + (void) printf("mintxg %llu -> obj %llu\n", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + } + return (0); +} + +static void +dump_blkptr_list(dsl_deadlist_t *dl, char *name) { - dsl_deadlist_entry_t *dle; - uint64_t unused; char bytes[32]; char comp[32]; char uncomp[32]; - uint64_t empty_bpobj = - dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj; - - /* force the tree to be loaded */ - dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); + char entries[32]; + spa_t *spa = dmu_objset_spa(dl->dl_os); + uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - if (dle->dle_bpobj.bpo_object != empty_bpobj) - bpobj_count_refd(&dle->dle_bpobj); - } + dsl_deadlist_iterate(dl, 
dsl_deadlist_entry_count_refd, spa); } /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; @@ -1853,30 +2935,65 @@ dump_deadlist(dsl_deadlist_t *dl) zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); - (void) printf("\n Deadlist: %s (%s/%s comp)\n", - bytes, comp, uncomp); + zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); + (void) printf("\n %s: %s (%s/%s comp), %s entries\n", + name, bytes, comp, uncomp, entries); if (dump_opt['d'] < 4) return; (void) printf("\n"); - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - if (dump_opt['d'] >= 5) { - char buf[128]; - (void) snprintf(buf, sizeof (buf), - "mintxg %llu -> obj %llu", - (longlong_t)dle->dle_mintxg, - (longlong_t)dle->dle_bpobj.bpo_object); + dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); +} - dump_full_bpobj(&dle->dle_bpobj, buf, 0); - } else { - (void) printf("mintxg %llu -> obj %llu\n", - (longlong_t)dle->dle_mintxg, - (longlong_t)dle->dle_bpobj.bpo_object); - } +static int +verify_dd_livelist(objset_t *os) +{ + uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + + ASSERT(!dmu_objset_is_snapshot(os)); + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return (0); + + /* Iterate through the livelist to check for duplicates */ + dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, + NULL); + + dsl_pool_config_enter(dp, FTAG); + dsl_deadlist_space(&dd->dd_livelist, &ll_used, + &ll_comp, &ll_uncomp); + + dsl_dataset_t *origin_ds; + ASSERT(dsl_pool_config_held(dp)); + VERIFY0(dsl_dataset_hold_obj(dp, + 
dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); + VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, + &used, &comp, &uncomp)); + dsl_dataset_rele(origin_ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + /* + * It's possible that the dataset's uncomp space is larger than the + * livelist's because livelists do not track embedded block pointers + */ + if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { + char nice_used[32], nice_comp[32], nice_uncomp[32]; + (void) printf("Discrepancy in space accounting:\n"); + zdb_nicenum(used, nice_used, sizeof (nice_used)); + zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("dir: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); + zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("livelist: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + return (1); } + return (0); } static avl_tree_t idx_tree; @@ -1886,19 +3003,26 @@ static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int -open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) +open_objset(const char *path, void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); - err = dmu_objset_own(path, type, B_TRUE, B_FALSE, tag, osp); + /* + * We can't own an objset if it's redacted. Therefore, we do this + * dance: hold the objset, then acquire a long hold on its dataset, then + * release the pool (which is held as part of holding the objset). 
+ */ + err = dmu_objset_hold(path, tag, osp); if (err != 0) { - (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, - strerror(err)); + (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", + path, strerror(err)); return (err); } + dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); + dsl_pool_rele(dmu_objset_pool(*osp), tag); if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, @@ -1912,7 +3036,8 @@ open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); - dmu_objset_disown(*osp, B_FALSE, tag); + dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); + dsl_dataset_rele(dmu_objset_ds(*osp), tag); *osp = NULL; } } @@ -1927,7 +3052,8 @@ close_objset(objset_t *os, void *tag) VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); - dmu_objset_disown(os, B_FALSE, tag); + dsl_dataset_long_rele(dmu_objset_ds(os), tag); + dsl_dataset_rele(dmu_objset_ds(os), tag); sa_attr_table = NULL; sa_os = NULL; } @@ -2042,6 +3168,23 @@ dump_znode_sa_xattr(sa_handle_t *hdl) free(sa_xattr_packed); } +static void +dump_znode_symlink(sa_handle_t *hdl) +{ + int sa_symlink_size = 0; + char linktarget[MAXPATHLEN]; + linktarget[0] = '\0'; + int error; + + error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); + if (error || sa_symlink_size == 0) { + return; + } + if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], + &linktarget, sa_symlink_size) == 0) + (void) printf("\ttarget %s\n", linktarget); +} + /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) @@ -2106,6 +3249,9 @@ dump_znode(objset_t *os, uint64_t object, void *data, size_t size) } (void) printf("\tpath %s\n", path); } + + if (S_ISLNK(mode)) + dump_znode_symlink(hdl); dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); @@ 
-2204,9 +3350,49 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_unknown, /* Unknown type, must be last */ }; +static boolean_t +match_object_type(dmu_object_type_t obj_type, uint64_t flags) +{ + boolean_t match = B_TRUE; + + switch (obj_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (!(flags & ZOR_FLAG_DIRECTORY)) + match = B_FALSE; + break; + case DMU_OT_PLAIN_FILE_CONTENTS: + if (!(flags & ZOR_FLAG_PLAIN_FILE)) + match = B_FALSE; + break; + case DMU_OT_SPACE_MAP: + if (!(flags & ZOR_FLAG_SPACE_MAP)) + match = B_FALSE; + break; + default: + if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { + if (!(flags & ZOR_FLAG_ZAP)) + match = B_FALSE; + break; + } + + /* + * If all bits except some of the supported flags are + * set, the user combined the all-types flag (A) with + * a negated flag to exclude some types (e.g. A-f to + * show all object types except plain files). + */ + if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) + match = B_FALSE; + + break; + } + + return (match); +} + static void -dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, - uint64_t *dnode_slots_used) +dump_object(objset_t *os, uint64_t object, int verbosity, + boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) { dmu_buf_t *db = NULL; dmu_object_info_t doi; @@ -2263,6 +3449,13 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, } } + /* + * Default to showing all object types if no flags were specified. 
+ */ + if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && + !match_object_type(doi.doi_type, flags)) + goto out; + if (dnode_slots_used) *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; @@ -2283,7 +3476,25 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } - if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { + if (doi.doi_compress == ZIO_COMPRESS_INHERIT && + ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { + const char *compname = NULL; + if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, + ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), + &compname) == 0) { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), " (Z=inherit=%s)", + compname); + } else { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), + " (Z=inherit=%s-unknown)", + ZDB_COMPRESS_NAME(os->os_compress)); + } + } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); + } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } @@ -2325,7 +3536,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, (void) printf("\t\t(object encrypted)\n"); } - *print_header = 1; + *print_header = B_TRUE; } if (verbosity >= 5) @@ -2366,6 +3577,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, } } +out: if (db != NULL) dmu_buf_rele(db, FTAG); if (dnode_held) @@ -2396,6 +3608,7 @@ count_ds_mos_objects(dsl_dataset_t *ds) mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); + mos_obj_refd(ds->ds_bookmarks_obj); if (!dsl_dataset_is_snapshot(ds)) { 
count_dir_mos_objects(ds->ds_dir); @@ -2405,10 +3618,118 @@ count_ds_mos_objects(dsl_dataset_t *ds) static const char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; -static void -dump_dir(objset_t *os) +/* + * Parse a string denoting a range of object IDs of the form + * [:[:flags]], and store the results in zor. + * Return 0 on success. On error, return 1 and update the msg + * pointer to point to a descriptive error message. + */ +static int +parse_object_range(char *range, zopt_object_range_t *zor, char **msg) { - dmu_objset_stats_t dds; + uint64_t flags = 0; + char *p, *s, *dup, *flagstr, *tmp = NULL; + size_t len; + int i; + int rc = 0; + + if (strchr(range, ':') == NULL) { + zor->zor_obj_start = strtoull(range, &p, 0); + if (*p != '\0') { + *msg = "Invalid characters in object ID"; + rc = 1; + } + zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); + zor->zor_obj_end = zor->zor_obj_start; + return (rc); + } + + if (strchr(range, ':') == range) { + *msg = "Invalid leading colon"; + rc = 1; + return (rc); + } + + len = strlen(range); + if (range[len - 1] == ':') { + *msg = "Invalid trailing colon"; + rc = 1; + return (rc); + } + + dup = strdup(range); + s = strtok_r(dup, ":", &tmp); + zor->zor_obj_start = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in start object ID"; + rc = 1; + goto out; + } + + s = strtok_r(NULL, ":", &tmp); + zor->zor_obj_end = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in end object ID"; + rc = 1; + goto out; + } + + if (zor->zor_obj_start > zor->zor_obj_end) { + *msg = "Start object ID may not exceed end object ID"; + rc = 1; + goto out; + } + + s = strtok_r(NULL, ":", &tmp); + if (s == NULL) { + zor->zor_flags = ZOR_FLAG_ALL_TYPES; + goto out; + } else if (strtok_r(NULL, ":", &tmp) != NULL) { + *msg = "Invalid colon-delimited field after flags"; + rc = 1; + goto out; + } + + flagstr = s; + for (i = 0; flagstr[i]; i++) { + int bit; + 
boolean_t negation = (flagstr[i] == '-'); + + if (negation) { + i++; + if (flagstr[i] == '\0') { + *msg = "Invalid trailing negation operator"; + rc = 1; + goto out; + } + } + bit = flagbits[(uchar_t)flagstr[i]]; + if (bit == 0) { + *msg = "Invalid flag"; + rc = 1; + goto out; + } + if (negation) + flags &= ~bit; + else + flags |= bit; + } + zor->zor_flags = flags; + + zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); + zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end); + +out: + free(dup); + return (rc); +} + +static void +dump_objset(objset_t *os) +{ + dmu_objset_stats_t dds = { 0 }; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; @@ -2416,12 +3737,15 @@ dump_dir(objset_t *os) char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; - int print_header = 1; + boolean_t print_header; unsigned i; int error; uint64_t total_slots_used = 0; uint64_t max_slot_used = 0; uint64_t dnode_slots; + uint64_t obj_start; + uint64_t obj_end; + uint64_t flags; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); @@ -2430,6 +3754,8 @@ dump_dir(objset_t *os) dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + print_header = B_TRUE; + if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; @@ -2463,10 +3789,26 @@ dump_dir(objset_t *os) numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? 
" (inconsistent)" : ""); - if (zopt_objects != 0) { - for (i = 0; i < zopt_objects; i++) - dump_object(os, zopt_object[i], verbosity, - &print_header, NULL); + for (i = 0; i < zopt_object_args; i++) { + obj_start = zopt_object_ranges[i].zor_obj_start; + obj_end = zopt_object_ranges[i].zor_obj_end; + flags = zopt_object_ranges[i].zor_flags; + + object = obj_start; + if (object == 0 || obj_start == obj_end) + dump_object(os, object, verbosity, &print_header, NULL, + flags); + else + object--; + + while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && + object <= obj_end) { + dump_object(os, object, verbosity, &print_header, NULL, + flags); + } + } + + if (zopt_object_args > 0) { (void) printf("\n"); return; } @@ -2476,39 +3818,49 @@ dump_dir(objset_t *os) if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); - dump_deadlist(&ds->ds_deadlist); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); + if (verify_dd_livelist(os) != 0) + fatal("livelist is incorrect"); + } if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); - dump_deadlist(&ds->ds_remap_deadlist); + dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); } count_ds_mos_objects(ds); } + if (dmu_objset_ds(os) != NULL) + dump_bookmarks(os, verbosity); + if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; - dump_object(os, 0, verbosity, &print_header, NULL); + dump_object(os, 0, verbosity, &print_header, NULL, 0); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, - NULL); + NULL, 0); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, - NULL); + NULL, 0); } if (DMU_PROJECTUSED_DNODE(os) != NULL && DMU_PROJECTUSED_DNODE(os)->dn_type != 0) dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, 
- &print_header, NULL); + &print_header, NULL, 0); object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - dump_object(os, object, verbosity, &print_header, &dnode_slots); + dump_object(os, object, verbosity, &print_header, &dnode_slots, + 0); object_count++; total_slots_used += dnode_slots; max_slot_used = object + dnode_slots - 1; @@ -2797,10 +4149,10 @@ cksum_record_compare(const void *x1, const void *x2) const cksum_record_t *l = (cksum_record_t *)x1; const cksum_record_t *r = (cksum_record_t *)x2; int arraysize = ARRAY_SIZE(l->cksum.zc_word); - int difference; + int difference = 0; for (int i = 0; i < arraysize; i++) { - difference = AVL_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); + difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } @@ -2893,6 +4245,270 @@ print_label_header(zdb_label_t *label, int l) label->header_printed = B_TRUE; } +static void +print_l2arc_header(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device header\n"); + (void) printf("------------------------------------\n"); +} + +static void +print_l2arc_log_blocks(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device log blocks\n"); + (void) printf("------------------------------------\n"); +} + +static void +dump_l2arc_log_entries(uint64_t log_entries, + l2arc_log_ent_phys_t *le, uint64_t i) +{ + for (int j = 0; j < log_entries; j++) { + dva_t dva = le[j].le_dva; + (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " + "vdev: %llu, offset: %llu\n", + (u_longlong_t)i, j + 1, + (u_longlong_t)DVA_GET_ASIZE(&dva), + (u_longlong_t)DVA_GET_VDEV(&dva), + (u_longlong_t)DVA_GET_OFFSET(&dva)); + (void) printf("|\t\t\t\tbirth: %llu\n", + (u_longlong_t)le[j].le_birth); + (void) printf("|\t\t\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tpsize: %llu\n", + 
(u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tcompr: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); + (void) printf("|\t\t\t\tcomplevel: %llu\n", + (u_longlong_t)(&le[j])->le_complevel); + (void) printf("|\t\t\t\ttype: %llu\n", + (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprotected: %llu\n", + (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprefetch: %llu\n", + (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); + (void) printf("|\t\t\t\taddress: %llu\n", + (u_longlong_t)le[j].le_daddr); + (void) printf("|\t\t\t\tARC state: %llu\n", + (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); + (void) printf("|\n"); + } + (void) printf("\n"); +} + +static void +dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) +{ + (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr); + (void) printf("|\t\tpayload_asize: %llu\n", + (u_longlong_t)lbps.lbp_payload_asize); + (void) printf("|\t\tpayload_start: %llu\n", + (u_longlong_t)lbps.lbp_payload_start); + (void) printf("|\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tasize: %llu\n", + (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tcompralgo: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); + (void) printf("|\t\tcksumalgo: %llu\n", + (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop)); + (void) printf("|\n\n"); +} + +static void +dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, + l2arc_dev_hdr_phys_t *rebuild) +{ + l2arc_log_blk_phys_t this_lb; + uint64_t asize; + l2arc_log_blkptr_t lbps[2]; + abd_t *abd; + zio_cksum_t cksum; + int failed = 0; + l2arc_dev_t dev; + + if (!dump_opt['q']) + print_l2arc_log_blocks(); + bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); + + dev.l2ad_evict = l2dhdr.dh_evict; + dev.l2ad_start = l2dhdr.dh_start; + dev.l2ad_end = l2dhdr.dh_end; + + if (l2dhdr.dh_start_lbps[0].lbp_daddr 
== 0) { + /* no log blocks to read */ + if (!dump_opt['q']) { + (void) printf("No log blocks to read\n"); + (void) printf("\n"); + } + return; + } else { + dev.l2ad_hand = lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + } + + dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + for (;;) { + if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) + break; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { + if (!dump_opt['q']) { + (void) printf("Error while reading next log " + "block\n\n"); + } + break; + } + + fletcher_4_native_varsize(&this_lb, asize, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { + failed++; + if (!dump_opt['q']) { + (void) printf("Invalid cksum\n"); + dump_l2arc_log_blkptr(lbps[0]); + } + break; + } + + switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + default: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, &this_lb, 0, asize); + zio_decompress_data(L2BLK_GET_COMPRESS( + (&lbps[0])->lbp_prop), abd, &this_lb, + asize, sizeof (this_lb), NULL); + abd_free(abd); + break; + } + + if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(&this_lb, sizeof (this_lb)); + if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { + if (!dump_opt['q']) + (void) printf("Invalid log block magic\n\n"); + break; + } + + rebuild->dh_lb_count++; + rebuild->dh_lb_asize += asize; + if (dump_opt['l'] > 1 && !dump_opt['q']) { + (void) printf("lb[%4llu]\tmagic: %llu\n", + (u_longlong_t)rebuild->dh_lb_count, + (u_longlong_t)this_lb.lb_magic); + dump_l2arc_log_blkptr(lbps[0]); + } + + if (dump_opt['l'] > 2 && !dump_opt['q']) + dump_l2arc_log_entries(l2dhdr.dh_log_entries, + this_lb.lb_entries, + rebuild->dh_lb_count); + + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev.l2ad_evict) && + 
!dev.l2ad_first) + break; + + lbps[0] = lbps[1]; + lbps[1] = this_lb.lb_prev_lbp; + } + + if (!dump_opt['q']) { + (void) printf("log_blk_count:\t %llu with valid cksum\n", + (u_longlong_t)rebuild->dh_lb_count); + (void) printf("\t\t %d with invalid cksum\n", failed); + (void) printf("log_blk_asize:\t %llu\n\n", + (u_longlong_t)rebuild->dh_lb_asize); + } +} + +static int +dump_l2arc_header(int fd) +{ + l2arc_dev_hdr_phys_t l2dhdr, rebuild; + int error = B_FALSE; + + bzero(&l2dhdr, sizeof (l2dhdr)); + bzero(&rebuild, sizeof (rebuild)); + + if (pread64(fd, &l2dhdr, sizeof (l2dhdr), + VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { + error = B_TRUE; + } else { + if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); + + if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) + error = B_TRUE; + } + + if (error) { + (void) printf("L2ARC device header not found\n\n"); + /* Do not return an error here for backward compatibility */ + return (0); + } else if (!dump_opt['q']) { + print_l2arc_header(); + + (void) printf(" magic: %llu\n", + (u_longlong_t)l2dhdr.dh_magic); + (void) printf(" version: %llu\n", + (u_longlong_t)l2dhdr.dh_version); + (void) printf(" pool_guid: %llu\n", + (u_longlong_t)l2dhdr.dh_spa_guid); + (void) printf(" flags: %llu\n", + (u_longlong_t)l2dhdr.dh_flags); + (void) printf(" start_lbps[0]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[0].lbp_daddr); + (void) printf(" start_lbps[1]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[1].lbp_daddr); + (void) printf(" log_blk_ent: %llu\n", + (u_longlong_t)l2dhdr.dh_log_entries); + (void) printf(" start: %llu\n", + (u_longlong_t)l2dhdr.dh_start); + (void) printf(" end: %llu\n", + (u_longlong_t)l2dhdr.dh_end); + (void) printf(" evict: %llu\n", + (u_longlong_t)l2dhdr.dh_evict); + (void) printf(" lb_asize_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_asize); + (void) printf(" lb_count_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_count); + (void) printf(" 
trim_action_time: %llu\n", + (u_longlong_t)l2dhdr.dh_trim_action_time); + (void) printf(" trim_state: %llu\n\n", + (u_longlong_t)l2dhdr.dh_trim_state); + } + + dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); + /* + * The total aligned size of log blocks and the number of log blocks + * reported in the header of the device may be less than what zdb + * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). + * This happens because dump_l2arc_log_blocks() lacks the memory + * pressure valve that l2arc_rebuild() has. Thus, if we are on a system + * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize + * and dh_lb_count will be lower to begin with than what exists on the + * device. This is normal and zdb should not exit with an error. The + * opposite case should never happen though, the values reported in the + * header should never be higher than what dump_l2arc_log_blocks() and + * l2arc_rebuild() report. If this happens there is a leak in the + * accounting of log blocks. + */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); +} + static void dump_config_from_label(zdb_label_t *label, size_t buflen, int l) { @@ -2959,10 +4575,10 @@ static char curpath[PATH_MAX]; * for the last one. 
*/ static int -dump_path_impl(objset_t *os, uint64_t obj, char *name) +dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj) { int err; - int header = 1; + boolean_t header = B_TRUE; uint64_t child_obj; char *s; dmu_buf_t *db; @@ -3009,10 +4625,15 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name) switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') - return (dump_path_impl(os, child_obj, s + 1)); - /*FALLTHROUGH*/ + return (dump_path_impl(os, child_obj, s + 1, retobj)); + fallthrough; case DMU_OT_PLAIN_FILE_CONTENTS: - dump_object(os, child_obj, dump_opt['v'], &header, NULL); + if (retobj != NULL) { + *retobj = child_obj; + } else { + dump_object(os, child_obj, dump_opt['v'], &header, + NULL, 0); + } return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " @@ -3027,13 +4648,13 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name) * Dump the blocks for the object specified by path inside the dataset. 
*/ static int -dump_path(char *ds, char *path) +dump_path(char *ds, char *path, uint64_t *retobj) { int err; objset_t *os; uint64_t root_obj; - err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); + err = open_objset(ds, FTAG, &os); if (err != 0) return (err); @@ -3041,27 +4662,105 @@ dump_path(char *ds, char *path) if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); - dmu_objset_disown(os, B_FALSE, FTAG); + close_objset(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); - err = dump_path_impl(os, root_obj, path); + err = dump_path_impl(os, root_obj, path, retobj); close_objset(os, FTAG); return (err); } +static int +zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) +{ + int err = 0; + uint64_t size, readsize, oursize, offset; + ssize_t writesize; + sa_handle_t *hdl; + + (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, + destfile); + + VERIFY3P(os, ==, sa_os); + if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { + (void) printf("Failed to get handle for SA znode\n"); + return (err); + } + if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { + (void) sa_handle_destroy(hdl); + return (err); + } + (void) sa_handle_destroy(hdl); + + (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, + size); + if (size == 0) { + return (EINVAL); + } + + int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); + /* + * We cap the size at 1 mebibyte here to prevent + * allocation failures and nigh-infinite printing if the + * object is extremely large. 
+ */ + oursize = MIN(size, 1 << 20); + offset = 0; + char *buf = kmem_alloc(oursize, KM_NOSLEEP); + if (buf == NULL) { + return (ENOMEM); + } + + while (offset < size) { + readsize = MIN(size - offset, 1 << 20); + err = dmu_read(os, srcobj, offset, readsize, buf, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(buf, oursize); + return (err); + } + if (dump_opt['v'] > 3) { + (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 + " error=%d\n", offset, readsize, err); + } + + writesize = write(fd, buf, readsize); + if (writesize < 0) { + err = errno; + break; + } else if (writesize != readsize) { + /* Incomplete write */ + (void) fprintf(stderr, "Short write, only wrote %llu of" + " %" PRIu64 " bytes, exiting...\n", + (u_longlong_t)writesize, readsize); + break; + } + + offset += readsize; + } + + (void) close(fd); + + if (buf != NULL) + kmem_free(buf, oursize); + + return (err); +} + static int dump_label(const char *dev) { char path[MAXPATHLEN]; zdb_label_t labels[VDEV_LABELS]; - uint64_t psize, ashift; + uint64_t psize, ashift, l2cache; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; + boolean_t read_l2arc_header = B_FALSE; avl_tree_t config_tree; avl_tree_t uberblock_tree; void *node, *cookie; @@ -3103,7 +4802,7 @@ dump_label(const char *dev) exit(1); } - if (S_ISBLK(statbuf.st_mode) && ioctl(fd, BLKFLSBUF) != 0) + if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) (void) printf("failed to invalidate cache '%s' : %s\n", path, strerror(errno)); @@ -3154,6 +4853,15 @@ dump_label(const char *dev) if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; + /* If the device is a cache device clear the header. 
*/ + if (!read_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + read_l2arc_header = B_TRUE; + } + } + fletcher_4_native_varsize(buf, size, &cksum); rec = cksum_record_insert(&config_tree, &cksum, l); @@ -3204,6 +4912,12 @@ dump_label(const char *dev) nvlist_free(label->config_nv); } + /* + * Dump the L2ARC header, if existent. + */ + if (read_l2arc_header) + error |= dump_l2arc_header(fd); + cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); @@ -3222,17 +4936,18 @@ dump_label(const char *dev) } static uint64_t dataset_feature_count[SPA_FEATURES]; +static uint64_t global_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ static int -dump_one_dir(const char *dsname, void *arg) +dump_one_objset(const char *dsname, void *arg) { int error; objset_t *os; spa_feature_t f; - error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); + error = open_objset(dsname, FTAG, &os); if (error != 0) return (0); @@ -3248,7 +4963,22 @@ dump_one_dir(const char *dsname, void *arg) remap_deadlist_count++; } - dump_dir(os); + for (dsl_bookmark_node_t *dbn = + avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; + dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { + mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); + if (dbn->dbn_phys.zbm_redaction_obj != 0) + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; + } + + if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + global_feature_count[SPA_FEATURE_LIVELIST]++; + } + + dump_objset(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); @@ -3285,6 +5015,7 @@ static const char *zdb_ot_extname[] = { }; #define ZB_TOTAL DN_MAX_LEVELS +#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) 
typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; @@ -3292,6 +5023,15 @@ typedef struct zdb_cb { uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_total; + uint64_t zcb_lsize_total; + uint64_t zcb_asize_total; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE + 1]; @@ -3315,6 +5055,172 @@ same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) return ((off1 >> ms_shift) == (off2 >> ms_shift)); } +/* + * Used to simplify reporting of the histogram data. + */ +typedef struct one_histo { + char *name; + uint64_t *count; + uint64_t *len; + uint64_t cumulative; +} one_histo_t; + +/* + * The number of separate histograms processed for psize, lsize and asize. + */ +#define NUM_HISTO 3 + +/* + * This routine will create a fixed column size output of three different + * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M + * the count, length and cumulative length of the psize, lsize and + * asize blocks. + * + * All three types of blocks are listed on a single line + * + * By default the table is printed in nicenumber format (e.g. 123K) but + * if the '-P' parameter is specified then the full raw number (parseable) + * is printed out. + */ +static void +dump_size_histograms(zdb_cb_t *zcb) +{ + /* + * A temporary buffer that allows us to convert a number into + * a string using zdb_nicenumber to allow either raw or human + * readable numbers to be output. + */ + char numbuf[32]; + + /* + * Define titles which are used in the headers of the tables + * printed by this routine. 
+ */ + const char blocksize_title1[] = "block"; + const char blocksize_title2[] = "size"; + const char count_title[] = "Count"; + const char length_title[] = "Size"; + const char cumulative_title[] = "Cum."; + + /* + * Setup the histogram arrays (psize, lsize, and asize). + */ + one_histo_t parm_histo[NUM_HISTO]; + + parm_histo[0].name = "psize"; + parm_histo[0].count = zcb->zcb_psize_count; + parm_histo[0].len = zcb->zcb_psize_len; + parm_histo[0].cumulative = 0; + + parm_histo[1].name = "lsize"; + parm_histo[1].count = zcb->zcb_lsize_count; + parm_histo[1].len = zcb->zcb_lsize_len; + parm_histo[1].cumulative = 0; + + parm_histo[2].name = "asize"; + parm_histo[2].count = zcb->zcb_asize_count; + parm_histo[2].len = zcb->zcb_asize_len; + parm_histo[2].cumulative = 0; + + + (void) printf("\nBlock Size Histogram\n"); + /* + * Print the first line titles + */ + if (dump_opt['P']) + (void) printf("\n%s\t", blocksize_title1); + else + (void) printf("\n%7s ", blocksize_title1); + + for (int j = 0; j < NUM_HISTO; j++) { + if (dump_opt['P']) { + if (j < NUM_HISTO - 1) { + (void) printf("%s\t\t\t", parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf(" %s", parm_histo[j].name); + } + } else { + if (j < NUM_HISTO - 1) { + /* Left aligned strings in the output */ + (void) printf("%-7s ", + parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf("%s", parm_histo[j].name); + } + } + } + (void) printf("\n"); + + /* + * Print the second line titles + */ + if (dump_opt['P']) { + (void) printf("%s\t", blocksize_title2); + } else { + (void) printf("%7s ", blocksize_title2); + } + + for (int i = 0; i < NUM_HISTO; i++) { + if (dump_opt['P']) { + (void) printf("%s\t%s\t%s\t", + count_title, length_title, cumulative_title); + } else { + (void) printf("%7s%7s%7s", + count_title, length_title, cumulative_title); + } + } + (void) printf("\n"); + + /* + * Print the rows + */ + for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) 
{ + + /* + * Print the first column showing the blocksize + */ + zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); + + if (dump_opt['P']) { + printf("%s", numbuf); + } else { + printf("%7s:", numbuf); + } + + /* + * Print the remaining set of 3 columns per size: + * for psize, lsize and asize + */ + for (int j = 0; j < NUM_HISTO; j++) { + parm_histo[j].cumulative += parm_histo[j].len[i]; + + zdb_nicenum(parm_histo[j].count[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].len[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].cumulative, + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + } + (void) printf("\n"); + } +} + static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) @@ -3408,6 +5314,28 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, [BPE_GET_PSIZE(bp)]++; return; } + /* + * The binning histogram bins by powers of two up to + * SPA_MAXBLOCKSIZE rather than creating bins for + * every possible blocksize found in the pool. 
+ */ + int bin = highbit64(BP_GET_PSIZE(bp)) - 1; + + zcb->zcb_psize_count[bin]++; + zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); + zcb->zcb_psize_total += BP_GET_PSIZE(bp); + + bin = highbit64(BP_GET_LSIZE(bp)) - 1; + + zcb->zcb_lsize_count[bin]++; + zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); + zcb->zcb_lsize_total += BP_GET_LSIZE(bp); + + bin = highbit64(BP_GET_ASIZE(bp)) - 1; + + zcb->zcb_asize_count[bin]++; + zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); + zcb->zcb_asize_total += BP_GET_ASIZE(bp); if (dump_opt['L']) return; @@ -3446,10 +5374,8 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - abd_free(zio->io_abd); - mutex_enter(&spa->spa_scrub_lock); - spa->spa_load_verify_ios--; + spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -3474,6 +5400,8 @@ zdb_blkptr_done(zio_t *zio) blkbuf); } mutex_exit(&spa->spa_scrub_lock); + + abd_free(zio->io_abd); } static int @@ -3484,7 +5412,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type; boolean_t is_metadata; - if (bp == NULL) + if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { @@ -3499,7 +5427,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, blkbuf); } - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); type = BP_GET_TYPE(bp); @@ -3520,9 +5448,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_load_verify_ios > max_inflight) + while (spa->spa_load_verify_bytes > max_inflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_load_verify_ios++; + spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, @@ -3542,9 +5470,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, 
const blkptr_t *bp, uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; - int kb_per_sec = + uint64_t kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); - int sec_remaining = + uint64_t sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ @@ -3552,8 +5480,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, - "\r%5s completed (%4dMB/s) " - "estimated time remaining: %uhr %02umin %02usec ", + "\r%5s completed (%4"PRIu64"MB/s) " + "estimated time remaining: " + "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, @@ -3578,6 +5507,35 @@ static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; +/* ARGSUSED */ +static int +load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + spa_vdev_removal_t *svr = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + + /* skip vdevs we don't care about */ + if (sme->sme_vdev != svr->svr_vdev_id) + return (0); + + vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (sme->sme_type == SM_ALLOC) + range_tree_add(svr->svr_allocd_segs, offset, size); + else + range_tree_remove(svr->svr_allocd_segs, offset, size); + + return (0); +} + /* ARGSUSED */ static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, @@ -3626,49 +5584,47 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + range_tree_t *allocs = range_tree_create(NULL, 
RANGE_SEG64, NULL, 0, 0); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; - if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) - break; - - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - - if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); - - /* - * Clear everything past what has been synced unless - * it's past the spacemap, because we have not allocated - * mappings for it yet. - */ - uint64_t vim_max_offset = - vdev_indirect_mapping_max_offset(vim); - uint64_t sm_end = msp->ms_sm->sm_start + - msp->ms_sm->sm_size; - if (sm_end > vim_max_offset) - range_tree_clear(svr->svr_allocd_segs, - vim_max_offset, sm_end - vim_max_offset); - } - - zcb->zcb_removing_size += - range_tree_space(svr->svr_allocd_segs); - range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); + ASSERT0(range_tree_space(allocs)); + if (msp->ms_sm != NULL) + VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); + range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); } + range_tree_destroy(allocs); + + iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for + * it yet. 
+ */ + range_tree_clear(svr->svr_allocd_segs, + vdev_indirect_mapping_max_offset(vim), + vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); + + zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); + range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ static int -increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; + ASSERT(!bp_freed); ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); @@ -3866,6 +5822,82 @@ zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) } } +static int +count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + int64_t *ualloc_space = arg; + + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (sme->sme_type == SM_ALLOC) + *ualloc_space += sme->sme_run; + else + *ualloc_space -= sme->sme_run; + + return (0); +} + +static int64_t +get_unflushed_alloc_space(spa_t *spa) +{ + if (dump_opt['L']) + return (0); + + int64_t ualloc_space = 0; + iterate_through_spacemap_logs(spa, count_unflushed_space_cb, + &ualloc_space); + return (ualloc_space); +} + +static int +load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) +{ + maptype_t *uic_maptype = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + 
metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (*uic_maptype == sme->sme_type) + range_tree_add(ms->ms_allocatable, offset, size); + else + range_tree_remove(ms->ms_allocatable, offset, size); + + return (0); +} + +static void +load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) +{ + iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); +} + static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { @@ -3889,7 +5921,7 @@ load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); - metaslab_unload(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -3906,6 +5938,8 @@ load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) mutex_exit(&msp->ms_lock); } } + + load_unflushed_to_ms_allocatables(spa, maptype); } /* @@ -3920,7 +5954,7 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); - metaslab_unload(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -3978,9 +6012,11 @@ zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) * metaslabs. We want to set them up for * zio_claim(). 
*/ + vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, 0)); - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + vdev_indirect_mapping_t *vim __maybe_unused = + vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { @@ -4016,6 +6052,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), @@ -4149,7 +6186,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; - ASSERTV(metaslab_group_t *mg = vd->vdev_mg); if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= zdb_check_for_obsolete_leaks(vd, zcb); @@ -4157,7 +6193,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(mg, ==, msp->ms_group); + ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == + spa_embedded_log_class(spa)) ? 
+ vd->vdev_log_mg : vd->vdev_mg); /* * ms_allocatable has been overloaded @@ -4179,7 +6217,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } - if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } @@ -4209,6 +6246,100 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +/* + * Iterate over livelists which have been destroyed by the user but + * are still present in the MOS, waiting to be freed + */ +static void +iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) +{ + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + ASSERT0(err); + + zap_cursor_t zc; + zap_attribute_t attr; + dsl_deadlist_t ll; + /* NULL out os prior to dsl_deadlist_open in case it's garbage */ + ll.dl_os = NULL; + for (zap_cursor_init(&zc, mos, zap_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + dsl_deadlist_open(&ll, mos, attr.za_first_integer); + func(&ll, arg); + dsl_deadlist_close(&ll); + } + zap_cursor_fini(&zc); +} + +static int +bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (count_block_cb(arg, bp, tx)); +} + +static int +livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) +{ + zdb_cb_t *zbc = args; + bplist_t blks; + bplist_create(&blks); + /* determine which blocks have been alloc'd but not freed */ + VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); + /* count those blocks */ + (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); + bplist_destroy(&blks); + return (0); +} + +static void +livelist_count_blocks(dsl_deadlist_t *ll, void *arg) +{ + dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); +} + +/* + * Count the blocks in the livelists that have been destroyed by the user + * but 
haven't yet been freed. + */ +static void +deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) +{ + iterate_deleted_livelists(spa, livelist_count_blocks, zbc); +} + +static void +dump_livelist_cb(dsl_deadlist_t *ll, void *arg) +{ + ASSERT3P(arg, ==, NULL); + global_feature_count[SPA_FEATURE_LIVELIST]++; + dump_blkptr_list(ll, "Deleted Livelist"); + dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); +} + +/* + * Print out, register object references to, and increment feature counts for + * livelists that have been destroyed by the user but haven't yet been freed. + */ +static void +deleted_livelists_dump_mos(spa_t *spa) +{ + uint64_t zap_obj; + objset_t *mos = spa->spa_meta_objset; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + mos_obj_refd(zap_obj); + iterate_deleted_livelists(spa, dump_livelist_cb, NULL); +} + static int dump_block_stats(spa_t *spa) { @@ -4248,11 +6379,11 @@ dump_block_stats(spa_t *spa) * If there's a deferred-free bplist, process that first. 
*/ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, - count_block_cb, &zcb, NULL); + bpobj_count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, - count_block_cb, &zcb, NULL); + bpobj_count_block_cb, &zcb, NULL); } zdb_claim_removing(spa, &zcb); @@ -4263,12 +6394,16 @@ dump_block_stats(spa_t *spa) &zcb, NULL)); } + deleted_livelists_count_blocks(spa, &zcb); + if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); + zcb.zcb_totalasize += + metaslab_class_get_alloc(spa_embedded_log_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); @@ -4285,6 +6420,7 @@ dump_block_stats(spa_t *spa) ZIO_FLAG_GODFATHER); } } + ASSERT0(spa->spa_load_verify_bytes); /* * Done after zio_wait() since zcb_haderrors is modified in @@ -4315,8 +6451,10 @@ dump_block_stats(spa_t *spa) total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + - metaslab_class_get_alloc(spa_dedup_class(spa)); + metaslab_class_get_alloc(spa_dedup_class(spa)) + + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; @@ -4359,7 +6497,7 @@ dump_block_stats(spa_t *spa) (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); - if (spa_special_class(spa)->mc_rotor != NULL) { + if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( @@ -4370,7 +6508,7 @@ 
dump_block_stats(spa_t *spa) 100.0 * alloc / space); } - if (spa_dedup_class(spa)->mc_rotor != NULL) { + if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( @@ -4381,6 +6519,17 @@ dump_block_stats(spa_t *spa) 100.0 * alloc / space); } + if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_embedded_log_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_embedded_log_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Embedded log class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; @@ -4514,6 +6663,11 @@ dump_block_stats(spa_t *spa) } } } + + /* Output a table summarizing block sizes in the pool */ + if (dump_opt['b'] >= 2) { + dump_size_histograms(&zcb); + } } (void) printf("\n"); @@ -4545,7 +6699,8 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { @@ -4778,7 +6933,7 @@ zdb_set_skip_mmp(char *target) * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of - * the original pool with the above suffix appened to it. In addition, + * the original pool with the above suffix appended to it. In addition, * if the target is not a pool name (e.g. a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. 
@@ -4793,6 +6948,7 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; + boolean_t freecfg = B_FALSE; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); @@ -4811,6 +6967,7 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) "spa_get_stats() failed with error %d\n", poolname, error); } + freecfg = B_TRUE; } if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) @@ -4820,6 +6977,8 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); + if (freecfg) + nvlist_free(cfg); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); @@ -4966,7 +7125,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; - ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); + VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL); } } @@ -5048,7 +7207,6 @@ verify_checkpoint_blocks(spa_t *spa) spa_t *checkpoint_spa; char *checkpoint_pool; - nvlist_t *config = NULL; int error = 0; /* @@ -5056,7 +7214,7 @@ verify_checkpoint_blocks(spa_t *spa) * name) so we can do verification on it against the current state * of the pool. 
*/ - checkpoint_pool = import_checkpointed_state(spa->spa_name, config, + checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); @@ -5187,12 +7345,25 @@ mos_obj_refd_multiple(uint64_t obj) range_tree_add(mos_refd_objs, obj, 1); } +static void +mos_leak_vdev_top_zap(vdev_t *vd) +{ + uint64_t ms_flush_data_obj; + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(ms_flush_data_obj); +} + static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); - mos_obj_refd(vd->vdev_top_zap); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); @@ -5210,11 +7381,33 @@ mos_leak_vdev(vdev_t *vd) mos_obj_refd(space_map_object(ms->ms_sm)); } + if (vd->vdev_top_zap != 0) { + mos_obj_refd(vd->vdev_top_zap); + mos_leak_vdev_top_zap(vd); + } + for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } +static void +mos_leak_log_spacemaps(spa_t *spa) +{ + uint64_t spacemap_zap; + int error = zap_lookup(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, + sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(spacemap_zap); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) + mos_obj_refd(sls->sls_sm_obj); +} + static int dump_mos_leaks(spa_t *spa) { @@ -5246,6 +7439,10 @@ dump_mos_leaks(spa_t *spa) mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); + if (spa->spa_syncing_log_sm != NULL) + mos_obj_refd(spa->spa_syncing_log_sm->sm_object); + mos_leak_log_spacemaps(spa); + 
mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. @@ -5257,6 +7454,7 @@ dump_mos_leaks(spa_t *spa) mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } + deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; @@ -5266,12 +7464,12 @@ dump_mos_leaks(spa_t *spa) dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); - dump_deadlist(&ds->ds_deadlist); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); - dump_deadlist(&dp->dp_origin_snap->ds_deadlist); + dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) @@ -5323,12 +7521,92 @@ dump_mos_leaks(spa_t *spa) return (rv); } +typedef struct log_sm_obsolete_stats_arg { + uint64_t lsos_current_txg; + + uint64_t lsos_total_entries; + uint64_t lsos_valid_entries; + + uint64_t lsos_sm_entries; + uint64_t lsos_valid_sm_entries; +} log_sm_obsolete_stats_arg_t; + +static int +log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + log_sm_obsolete_stats_arg_t *lsos = arg; + + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + if (lsos->lsos_current_txg == 0) { + /* this is the first log */ + lsos->lsos_current_txg = txg; + } else if (lsos->lsos_current_txg < txg) { + /* we just changed log - print stats and reset */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos->lsos_valid_sm_entries, + (u_longlong_t)lsos->lsos_sm_entries, + (u_longlong_t)lsos->lsos_current_txg); + lsos->lsos_valid_sm_entries = 0; + lsos->lsos_sm_entries = 0; + lsos->lsos_current_txg = txg; + } + ASSERT3U(lsos->lsos_current_txg, ==, txg); + + lsos->lsos_sm_entries++; + lsos->lsos_total_entries++; + + vdev_t 
*vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + lsos->lsos_valid_sm_entries++; + lsos->lsos_valid_entries++; + return (0); +} + +static void +dump_log_spacemap_obsolete_stats(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + log_sm_obsolete_stats_arg_t lsos; + bzero(&lsos, sizeof (lsos)); + + (void) printf("Log Space Map Obsolete Entry Statistics:\n"); + + iterate_through_spacemap_logs(spa, + log_spacemap_obsolete_stats_cb, &lsos); + + /* print stats for latest log */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos.lsos_valid_sm_entries, + (u_longlong_t)lsos.lsos_sm_entries, + (u_longlong_t)lsos.lsos_current_txg); + + (void) printf("%-8llu valid entries out of %-8llu - total\n\n", + (u_longlong_t)lsos.lsos_valid_entries, + (u_longlong_t)lsos.lsos_total_entries); +} + static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; + if (dump_opt['y']) { + livelist_metaslab_validate(spa); + } + if (dump_opt['S']) { dump_simulated_ddt(spa); return; @@ -5352,11 +7630,16 @@ dump_zpool(spa_t *spa) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); + if (dump_opt['d'] > 2 || dump_opt['m']) { + dump_log_spacemaps(spa); + dump_log_spacemap_obsolete_stats(spa); + } if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; - mos_refd_objs = range_tree_create(NULL, NULL); - dump_dir(dp->dp_meta_objset); + mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); + dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; @@ -5381,7 +7664,14 @@ dump_zpool(spa_t *spa) } dump_dtl(spa->spa_root_vdev, 0); } - (void) dmu_objset_find(spa_name(spa), dump_one_dir, + + for (spa_feature_t f = 0; f < SPA_FEATURES; 
f++) + global_feature_count[f] = UINT64_MAX; + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; + global_feature_count[SPA_FEATURE_LIVELIST] = 0; + + (void) dmu_objset_find(spa_name(spa), dump_one_objset, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) @@ -5390,21 +7680,31 @@ dump_zpool(spa_t *spa) for (f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; + uint64_t *arr; if (!(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET) || - !spa_feature_is_enabled(spa, f)) { - ASSERT0(dataset_feature_count[f]); - continue; + ZFEATURE_FLAG_PER_DATASET)) { + if (global_feature_count[f] == UINT64_MAX) + continue; + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(global_feature_count[f]); + continue; + } + arr = global_feature_count; + } else { + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(dataset_feature_count[f]); + continue; + } + arr = dataset_feature_count; } if (feature_get_refcount(spa, &spa_feature_table[f], &refcount) == ENOTSUP) continue; - if (dataset_feature_count[f] != refcount) { + if (arr[f] != refcount) { (void) printf("%s feature refcount mismatch: " - "%lld datasets != %lld refcount\n", + "%lld consumers != %lld refcount\n", spa_feature_table[f].fi_uname, - (longlong_t)dataset_feature_count[f], - (longlong_t)refcount); + (longlong_t)arr[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " @@ -5414,9 +7714,8 @@ dump_zpool(spa_t *spa) } } - if (rc == 0) { + if (rc == 0) rc = verify_device_removal_feature_counts(spa); - } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) @@ -5445,14 +7744,15 @@ dump_zpool(spa_t *spa) #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 -#define ZDB_FLAG_PHYS 0x0020 -#define ZDB_FLAG_RAW 0x0040 -#define ZDB_FLAG_PRINT_BLKPTR 0x0080 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 static int 
flagbits[256]; +static char flagbitstr[16]; static void -zdb_print_blkptr(blkptr_t *bp, int flags) +zdb_print_blkptr(const blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; @@ -5578,11 +7878,126 @@ name: return (NULL); } +static int +name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr) +{ + dsl_dataset_t *ds; + + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id, + NULL, &ds); + if (error != 0) { + (void) fprintf(stderr, "failed to hold objset %llu: %s\n", + (u_longlong_t)objset_id, strerror(error)); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + return (error); + } + dsl_dataset_name(ds, outstr); + dsl_dataset_rele(ds, NULL); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + return (0); +} + +static boolean_t +zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) +{ + char *s0, *s1, *tmp = NULL; + + if (sizes == NULL) + return (B_FALSE); + + s0 = strtok_r(sizes, "/", &tmp); + if (s0 == NULL) + return (B_FALSE); + s1 = strtok_r(NULL, "/", &tmp); + *lsize = strtoull(s0, NULL, 16); + *psize = s1 ? strtoull(s1, NULL, 16) : *lsize; + return (*lsize >= *psize && *psize > 0); +} + +#define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) + +static boolean_t +zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, + uint64_t psize, int flags) +{ + boolean_t exceeded = B_FALSE; + /* + * We don't know how the data was compressed, so just try + * every decompress function at every inflated blocksize. + */ + void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 }; + int *cfuncp = cfuncs; + uint64_t maxlsize = SPA_MAXBLOCKSIZE; + uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | + ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | + (getenv("ZDB_NO_ZLE") ? 
ZIO_COMPRESS_MASK(ZLE) : 0); + *cfuncp++ = ZIO_COMPRESS_LZ4; + *cfuncp++ = ZIO_COMPRESS_LZJB; + mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); + for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) + if (((1ULL << c) & mask) == 0) + *cfuncp++ = c; + + /* + * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this + * could take a while and we should let the user know + * we are not stuck. On the other hand, printing progress + * info gets old after a while. User can specify 'v' flag + * to see the progression. + */ + if (lsize == psize) + lsize += SPA_MINBLOCKSIZE; + else + maxlsize = lsize; + for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { + for (cfuncp = cfuncs; *cfuncp; cfuncp++) { + if (flags & ZDB_FLAG_VERBOSE) { + (void) fprintf(stderr, + "Trying %05llx -> %05llx (%s)\n", + (u_longlong_t)psize, + (u_longlong_t)lsize, + zio_compress_table[*cfuncp].\ + ci_name); + } + + /* + * We randomize lbuf2, and decompress to both + * lbuf and lbuf2. This way, we will know if + * decompression fill exactly to lsize. + */ + VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); + + if (zio_decompress_data(*cfuncp, pabd, + lbuf, psize, lsize, NULL) == 0 && + zio_decompress_data(*cfuncp, pabd, + lbuf2, psize, lsize, NULL) == 0 && + bcmp(lbuf, lbuf2, lsize) == 0) + break; + } + if (*cfuncp != 0) + break; + } + umem_free(lbuf2, SPA_MAXBLOCKSIZE); + + if (lsize > maxlsize) { + exceeded = B_TRUE; + } + if (*cfuncp == ZIO_COMPRESS_ZLE) { + printf("\nZLE decompression was selected. If you " + "suspect the results are wrong,\ntry avoiding ZLE " + "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); + } + + return (exceeded); +} + /* * Read a block from a pool and print it out. 
The syntax of the * block descriptor is: * - * pool:vdev_specifier:offset:size[:flags] + * pool:vdev_specifier:offset:[lsize/]psize[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) @@ -5590,15 +8005,14 @@ name: * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block - * *c: Calculate and display checksums + * c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block - * p: Do I/O to physical offset * r: Dump raw data to stdout + * v: Verbose * - * * = not yet implemented */ static void zdb_read_block(char *thing, spa_t *spa) @@ -5606,73 +8020,87 @@ zdb_read_block(char *thing, spa_t *spa) blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; - uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; + uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; - const char *s, *vdev; - char *p, *dup, *flagstr; + char *s, *p, *dup, *vdev, *flagstr, *sizes, *tmp = NULL; int i, error; - boolean_t borrowed = B_FALSE; + boolean_t borrowed = B_FALSE, found = B_FALSE; dup = strdup(thing); - s = strtok(dup, ":"); + s = strtok_r(dup, ":", &tmp); vdev = s ? s : ""; - s = strtok(NULL, ":"); + s = strtok_r(NULL, ":", &tmp); offset = strtoull(s ? s : "", NULL, 16); - s = strtok(NULL, ":"); - size = strtoull(s ? s : "", NULL, 16); - s = strtok(NULL, ":"); - if (s) - flagstr = strdup(s); - else - flagstr = strdup(""); + sizes = strtok_r(NULL, ":", &tmp); + s = strtok_r(NULL, ":", &tmp); + flagstr = strdup(s ? 
s : ""); s = NULL; - if (size == 0) - s = "size must not be zero"; - if (!IS_P2ALIGNED(size, DEV_BSIZE)) + tmp = NULL; + if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) + s = "invalid size(s)"; + if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) s = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) s = "offset must be a multiple of sector size"; if (s) { (void) printf("Invalid block specifier: %s - %s\n", thing, s); - free(flagstr); - free(dup); - return; + goto done; } - for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { - for (i = 0; flagstr[i]; i++) { + for (s = strtok_r(flagstr, ":", &tmp); + s != NULL; + s = strtok_r(NULL, ":", &tmp)) { + for (i = 0; i < strlen(flagstr); i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { - (void) printf("***Invalid flag: %c\n", - flagstr[i]); + (void) printf("***Ignoring flag: %c\n", + (uchar_t)flagstr[i]); continue; } + found = B_TRUE; flags |= bit; - /* If it's not something with an argument, keep going */ - if ((bit & (ZDB_FLAG_CHECKSUM | - ZDB_FLAG_PRINT_BLKPTR)) == 0) - continue; - p = &flagstr[i + 1]; - if (bit == ZDB_FLAG_PRINT_BLKPTR) { - blkptr_offset = strtoull(p, &p, 16); - i = p - &flagstr[i + 1]; - } if (*p != ':' && *p != '\0') { - (void) printf("***Invalid flag arg: '%s'\n", s); - free(flagstr); - free(dup); - return; + int j = 0, nextbit = flagbits[(uchar_t)*p]; + char *end, offstr[8] = { 0 }; + if ((bit == ZDB_FLAG_PRINT_BLKPTR) && + (nextbit == 0)) { + /* look ahead to isolate the offset */ + while (nextbit == 0 && + strchr(flagbitstr, *p) == NULL) { + offstr[j] = *p; + j++; + if (i + j > strlen(flagstr)) + break; + p++; + nextbit = flagbits[(uchar_t)*p]; + } + blkptr_offset = strtoull(offstr, &end, + 16); + i += j; + } else if (nextbit == 0) { + (void) printf("***Ignoring flag arg:" + " '%c'\n", (uchar_t)*p); + } } } } - free(flagstr); + if (blkptr_offset % sizeof (blkptr_t)) { + printf("Block pointer offset 0x%llx " + "must be 
divisible by 0x%x\n", + (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); + goto done; + } + if (found == B_FALSE && strlen(flagstr) > 0) { + printf("Invalid flag arg: '%s'\n", flagstr); + goto done; + } vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { @@ -5688,9 +8116,6 @@ zdb_read_block(char *thing, spa_t *spa) vd->vdev_ops->vdev_op_type); } - psize = size; - lsize = size; - pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); @@ -5728,10 +8153,9 @@ zdb_read_block(char *thing, spa_t *spa) */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | - ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, - NULL, NULL)); + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); @@ -5742,86 +8166,125 @@ zdb_read_block(char *thing, spa_t *spa) goto out; } + uint64_t orig_lsize = lsize; + buf = lbuf; if (flags & ZDB_FLAG_DECOMPRESS) { - /* - * We don't know how the data was compressed, so just try - * every decompress function at every inflated blocksize. - */ - enum zio_compress c; - void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - - /* - * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB, - * this could take a while and we should let the user know - * we are not stuck. On the other hand, printing progress - * info gets old after a while. What to do? - */ - for (lsize = psize + SPA_MINBLOCKSIZE; - lsize <= SPA_MAXBLOCKSIZE; lsize += SPA_MINBLOCKSIZE) { - for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { - /* - * ZLE can easily decompress non zle stream. - * So have an option to disable it. 
- */ - if (c == ZIO_COMPRESS_ZLE && - getenv("ZDB_NO_ZLE")) - continue; - - (void) fprintf(stderr, - "Trying %05llx -> %05llx (%s)\n", - (u_longlong_t)psize, (u_longlong_t)lsize, - zio_compress_table[c].ci_name); - - /* - * We randomize lbuf2, and decompress to both - * lbuf and lbuf2. This way, we will know if - * decompression fill exactly to lsize. - */ - VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); - - if (zio_decompress_data(c, pabd, - lbuf, psize, lsize) == 0 && - zio_decompress_data(c, pabd, - lbuf2, psize, lsize) == 0 && - bcmp(lbuf, lbuf2, lsize) == 0) - break; - } - if (c != ZIO_COMPRESS_FUNCTIONS) - break; - } - umem_free(lbuf2, SPA_MAXBLOCKSIZE); - - if (lsize > SPA_MAXBLOCKSIZE) { + boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, + lsize, psize, flags); + if (failed) { (void) printf("Decompress of %s failed\n", thing); goto out; } - buf = lbuf; - size = lsize; } else { - size = psize; - buf = abd_borrow_buf_copy(pabd, size); + buf = abd_borrow_buf_copy(pabd, lsize); borrowed = B_TRUE; } + /* + * Try to detect invalid block pointer. If invalid, try + * decompressing. 
+ */ + if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && + !(flags & ZDB_FLAG_DECOMPRESS)) { + const blkptr_t *b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == + B_FALSE) { + abd_return_buf_copy(pabd, buf, lsize); + borrowed = B_FALSE; + buf = lbuf; + boolean_t failed = zdb_decompress_block(pabd, buf, + lbuf, lsize, psize, flags); + b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (failed || zfs_blkptr_verify(spa, b, B_FALSE, + BLK_VERIFY_LOG) == B_FALSE) { + printf("invalid block pointer at this DVA\n"); + goto out; + } + } + } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) - zdb_dump_block_raw(buf, size, flags); + zdb_dump_block_raw(buf, lsize, flags); else if (flags & ZDB_FLAG_INDIRECT) - zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), - flags); + zdb_dump_indirect((blkptr_t *)buf, + orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else - zdb_dump_block(thing, buf, size, flags); + zdb_dump_block(thing, buf, lsize, flags); + + /* + * If :c was specified, iterate through the checksum table to + * calculate and display each checksum for our specified + * DVA and length. 
+ */ + if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && + !(flags & ZDB_FLAG_GBH)) { + zio_t *czio; + (void) printf("\n"); + for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; + ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { + + if ((zio_checksum_table[ck].ci_flags & + ZCHECKSUM_FLAG_EMBEDDED) || + ck == ZIO_CHECKSUM_NOPARITY) { + continue; + } + BP_SET_CHECKSUM(bp, ck); + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + czio->io_bp = bp; + + if (vd == vd->vdev_top) { + zio_nowait(zio_read(czio, spa, bp, pabd, psize, + NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_DONT_RETRY, NULL)); + } else { + zio_nowait(zio_vdev_child_io(czio, bp, vd, + offset, pabd, psize, ZIO_TYPE_READ, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + error = zio_wait(czio); + if (error == 0 || error == ECKSUM) { + zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); + ck_zio->io_offset = + DVA_GET_OFFSET(&bp->blk_dva[0]); + ck_zio->io_bp = bp; + zio_checksum_compute(ck_zio, ck, pabd, lsize); + printf("%12s\tcksum=%llx:%llx:%llx:%llx\n", + zio_checksum_table[ck].ci_name, + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); + zio_wait(ck_zio); + } else { + printf("error %d reading block\n", error); + } + spa_config_exit(spa, SCL_STATE, FTAG); + } + } if (borrowed) - abd_return_buf_copy(pabd, buf, size); + abd_return_buf_copy(pabd, buf, lsize); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); +done: + free(flagstr); free(dup); } @@ -5871,13 +8334,15 @@ main(int argc, char **argv) int error = 0; char **searchdirs = NULL; int nsearch = 0; - char *target, *target_pool; + char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN]; 
nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; + int64_t objset_id = -1; + uint64_t object; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; - char *spa_config_path_env; - boolean_t target_is_spa = B_TRUE; + char *spa_config_path_env, *objset_str; + boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; (void) setrlimit(RLIMIT_NOFILE, &rl); @@ -5894,8 +8359,15 @@ main(int argc, char **argv) if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; + /* + * For performance reasons, we set this tunable down. We do so before + * the arg parsing section so that the user can override this value if + * they choose. + */ + zfs_btree_verify_intensity = 3; + while ((c = getopt(argc, argv, - "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) { + "AbcCdDeEFGhiI:klLmMo:Op:PqrRsSt:uU:vVx:XYyZ")) != -1) { switch (c) { case 'b': case 'c': @@ -5910,10 +8382,13 @@ main(int argc, char **argv) case 'm': case 'M': case 'O': + case 'r': case 'R': case 's': case 'S': case 'u': + case 'y': + case 'Z': dump_opt[c]++; dump_all = 0; break; @@ -5933,10 +8408,10 @@ main(int argc, char **argv) break; /* NB: Sort single match options below. 
*/ case 'I': - max_inflight = strtoull(optarg, NULL, 0); - if (max_inflight == 0) { + max_inflight_bytes = strtoull(optarg, NULL, 0); + if (max_inflight_bytes == 0) { (void) fprintf(stderr, "maximum number " - "of inflight I/Os must be greater " + "of inflight bytes must be greater " "than 0\n"); usage(); } @@ -5997,12 +8472,38 @@ main(int argc, char **argv) (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } + if (dump_opt['d'] || dump_opt['r']) { + /* [/ is accepted */ + if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL && + objset_str++ != NULL) { + char *endptr; + errno = 0; + objset_id = strtoull(objset_str, &endptr, 0); + /* dataset 0 is the same as opening the pool */ + if (errno == 0 && endptr != objset_str && + objset_id != 0) { + target_is_spa = B_FALSE; + dataset_lookup = B_TRUE; + } else if (objset_id != 0) { + printf("failed to open objset %s " + "%llu %s", objset_str, + (u_longlong_t)objset_id, + strerror(errno)); + exit(1); + } + /* normal dataset name not an objset ID */ + if (endptr == objset_str) { + objset_id = -1; + } + } + } #if defined(_LP64) /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ + zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT; zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; #endif @@ -6024,24 +8525,28 @@ main(int argc, char **argv) */ spa_load_verify_dryrun = B_TRUE; - kernel_init(FREAD); + /* + * ZDB should have ability to read spacemaps. 
+ */ + spa_mode_readable_spacemaps = B_TRUE; + + kernel_init(SPA_MODE_READ); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) + if (dump_all && strchr("AeEFklLOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } - aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); + libspl_assert_ok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; - if (argc < 2 && dump_opt['R']) usage(); @@ -6067,7 +8572,13 @@ main(int argc, char **argv) if (argc != 2) usage(); dump_opt['v'] = verbose + 3; - return (dump_path(argv[0], argv[1])); + return (dump_path(argv[0], argv[1], NULL)); + } + if (dump_opt['r']) { + if (argc != 3) + usage(); + dump_opt['v'] = verbose; + error = dump_path(argv[0], argv[1], &object); } if (dump_opt['X'] || dump_opt['F']) @@ -6128,6 +8639,11 @@ main(int argc, char **argv) } } + if (searchdirs != NULL) { + umem_free(searchdirs, nsearch * sizeof (char *)); + searchdirs = NULL; + } + /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa @@ -6146,6 +8662,11 @@ main(int argc, char **argv) target = checkpoint_target; } + if (cfg != NULL) { + nvlist_free(cfg); + cfg = NULL; + } + if (target_pool != target) free(target_pool); @@ -6161,7 +8682,7 @@ main(int argc, char **argv) checkpoint_pool, error); } - } else if (target_is_spa || dump_opt['R']) { + } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); @@ -6184,9 +8705,38 @@ main(int argc, char **argv) FTAG, policy, NULL); } } + } else if (strpbrk(target, "#") != NULL) { + dsl_pool_t *dp; + error = dsl_pool_hold(target, FTAG, &dp); + if (error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + error = dump_bookmark(dp, target, B_TRUE, verbose > 1); + dsl_pool_rele(dp, FTAG); + if 
(error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + return (error); } else { zdb_set_skip_mmp(target); - error = open_objset(target, DMU_OST_ANY, FTAG, &os); + if (dataset_lookup == B_TRUE) { + /* + * Use the supplied id to get the name + * for open_objset. + */ + error = spa_open(target, &spa, FTAG); + if (error == 0) { + error = name_from_objset_id(spa, + objset_id, dsname); + spa_close(spa, FTAG); + if (error == 0) + target = dsname; + } + } + if (error == 0) + error = open_objset(target, FTAG, &os); if (error == 0) spa = dmu_objset_spa(os); } @@ -6206,22 +8756,45 @@ main(int argc, char **argv) argv++; argc--; - if (!dump_opt['R']) { - if (argc > 0) { - zopt_objects = argc; - zopt_object = calloc(zopt_objects, sizeof (uint64_t)); - for (unsigned i = 0; i < zopt_objects; i++) { + if (dump_opt['r']) { + error = zdb_copy_object(os, object, argv[1]); + } else if (!dump_opt['R']) { + flagbits['d'] = ZOR_FLAG_DIRECTORY; + flagbits['f'] = ZOR_FLAG_PLAIN_FILE; + flagbits['m'] = ZOR_FLAG_SPACE_MAP; + flagbits['z'] = ZOR_FLAG_ZAP; + flagbits['A'] = ZOR_FLAG_ALL_TYPES; + + if (argc > 0 && dump_opt['d']) { + zopt_object_args = argc; + zopt_object_ranges = calloc(zopt_object_args, + sizeof (zopt_object_range_t)); + for (unsigned i = 0; i < zopt_object_args; i++) { + int err; + char *msg = NULL; + + err = parse_object_range(argv[i], + &zopt_object_ranges[i], &msg); + if (err != 0) + fatal("Bad object or range: '%s': %s\n", + argv[i], msg ? 
msg : ""); + } + } else if (argc > 0 && dump_opt['m']) { + zopt_metaslab_args = argc; + zopt_metaslab = calloc(zopt_metaslab_args, + sizeof (uint64_t)); + for (unsigned i = 0; i < zopt_metaslab_args; i++) { errno = 0; - zopt_object[i] = strtoull(argv[i], NULL, 0); - if (zopt_object[i] == 0 && errno != 0) - fatal("bad number %s: %s", - argv[i], strerror(errno)); + zopt_metaslab[i] = strtoull(argv[i], NULL, 0); + if (zopt_metaslab[i] == 0 && errno != 0) + fatal("bad number %s: %s", argv[i], + strerror(errno)); } } if (os != NULL) { - dump_dir(os); - } else if (zopt_objects > 0 && !dump_opt['m']) { - dump_dir(spa->spa_meta_objset); + dump_objset(os); + } else if (zopt_object_args > 0 && !dump_opt['m']) { + dump_objset(spa->spa_meta_objset); } else { dump_zpool(spa); } @@ -6232,8 +8805,8 @@ main(int argc, char **argv) flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; - flagbits['p'] = ZDB_FLAG_PHYS; flagbits['r'] = ZDB_FLAG_RAW; + flagbits['v'] = ZDB_FLAG_VERBOSE; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); @@ -6245,10 +8818,11 @@ main(int argc, char **argv) free(checkpoint_target); } - if (os != NULL) + if (os != NULL) { close_objset(os, FTAG); - else + } else { spa_close(spa, FTAG); + } fuid_table_destroy(); diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index c12178effa..553765b717 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -62,9 +62,9 @@ print_log_bp(const blkptr_t *bp, const char *prefix) /* ARGSUSED */ static void -zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_create(zilog_t *zilog, int txtype, const void *arg) { - lr_create_t *lr = arg; + const lr_create_t *lr = arg; time_t crtime = lr->lr_crtime[0]; char *name, *link; lr_attr_t *lrattr; @@ -98,9 +98,9 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) /* ARGSUSED */ static void -zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_remove(zilog_t *zilog, int txtype, const void *arg) 
{ - lr_remove_t *lr = arg; + const lr_remove_t *lr = arg; (void) printf("%sdoid %llu, name %s\n", tab_prefix, (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); @@ -108,9 +108,9 @@ zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg) /* ARGSUSED */ static void -zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_link(zilog_t *zilog, int txtype, const void *arg) { - lr_link_t *lr = arg; + const lr_link_t *lr = arg; (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix, (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, @@ -119,9 +119,9 @@ zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg) /* ARGSUSED */ static void -zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_rename(zilog_t *zilog, int txtype, const void *arg) { - lr_rename_t *lr = arg; + const lr_rename_t *lr = arg; char *snm = (char *)(lr + 1); char *tnm = snm + strlen(snm) + 1; @@ -148,11 +148,11 @@ zil_prt_rec_write_cb(void *data, size_t len, void *unused) /* ARGSUSED */ static void -zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) { - lr_write_t *lr = arg; + const lr_write_t *lr = arg; abd_t *data; - blkptr_t *bp = &lr->lr_blkptr; + const blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -211,9 +211,9 @@ out: /* ARGSUSED */ static void -zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_truncate(zilog_t *zilog, int txtype, const void *arg) { - lr_truncate_t *lr = arg; + const lr_truncate_t *lr = arg; (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix, (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, @@ -222,9 +222,9 @@ zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg) /* ARGSUSED */ static void -zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_setattr(zilog_t *zilog, int txtype, const void *arg) { - lr_setattr_t *lr = arg; + const 
lr_setattr_t *lr = arg; time_t atime = (time_t)lr->lr_atime[0]; time_t mtime = (time_t)lr->lr_mtime[0]; @@ -268,15 +268,15 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg) /* ARGSUSED */ static void -zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg) +zil_prt_rec_acl(zilog_t *zilog, int txtype, const void *arg) { - lr_acl_t *lr = arg; + const lr_acl_t *lr = arg; (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); } -typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *); +typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *); typedef struct zil_rec_info { zil_prt_rec_func_t zri_print; const char *zri_name; @@ -309,7 +309,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { /* ARGSUSED */ static int -print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) +print_log_record(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t claim_txg) { int txtype; int verbose = MAX(dump_opt['d'], dump_opt['i']); @@ -343,7 +343,8 @@ print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) /* ARGSUSED */ static int -print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg, + uint64_t claim_txg) { char blkbuf[BP_SPRINTF_LEN + 10]; int verbose = MAX(dump_opt['d'], dump_opt['i']); diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index 9c11315f2a..7b662994d1 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -1,11 +1,10 @@ include $(top_srcdir)/config/Rules.am +include $(top_srcdir)/config/Shellcheck.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS) -EXTRA_DIST = zed.d/README \ - zed.d/history_event-zfs-list-cacher.sh.in +SUBDIRS = zed.d +SHELLCHECKDIRS = $(SUBDIRS) sbin_PROGRAMS = zed @@ -41,61 +40,14 @@ FMA_SRC = \ zed_SOURCES = $(ZED_SRC) $(FMA_SRC) zed_LDADD = \ - 
$(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libuutil/libuutil.la \ - $(top_builddir)/lib/libzfs/libzfs.la + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la -zed_LDADD += -lrt +zed_LDADD += -lrt $(LIBATOMIC_LIBS) $(LIBUDEV_LIBS) $(LIBUUID_LIBS) zed_LDFLAGS = -pthread -zedconfdir = $(sysconfdir)/zfs/zed.d +EXTRA_DIST = agents/README.md -dist_zedconf_DATA = \ - zed.d/zed-functions.sh \ - zed.d/zed.rc - -zedexecdir = $(zfsexecdir)/zed.d - -dist_zedexec_SCRIPTS = \ - zed.d/all-debug.sh \ - zed.d/all-syslog.sh \ - zed.d/data-notify.sh \ - zed.d/generic-notify.sh \ - zed.d/resilver_finish-notify.sh \ - zed.d/scrub_finish-notify.sh \ - zed.d/statechange-led.sh \ - zed.d/statechange-notify.sh \ - zed.d/vdev_clear-led.sh \ - zed.d/vdev_attach-led.sh \ - zed.d/pool_import-led.sh \ - zed.d/resilver_finish-start-scrub.sh - -nodist_zedexec_SCRIPTS = zed.d/history_event-zfs-list-cacher.sh - -$(nodist_zedexec_SCRIPTS): %: %.in - -$(SED) -e 's,@bindir\@,$(bindir),g' \ - -e 's,@runstatedir\@,$(runstatedir),g' \ - -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' - -zedconfdefaults = \ - all-syslog.sh \ - data-notify.sh \ - resilver_finish-notify.sh \ - scrub_finish-notify.sh \ - statechange-led.sh \ - statechange-notify.sh \ - vdev_clear-led.sh \ - vdev_attach-led.sh \ - pool_import-led.sh \ - resilver_finish-start-scrub.sh - -install-data-hook: - $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" - for f in $(zedconfdefaults); do \ - test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ - -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ - ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ - done - chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc" +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c index ae90a322cf..607b387ca3 100644 --- 
a/cmd/zed/agents/fmd_api.c +++ b/cmd/zed/agents/fmd_api.c @@ -25,7 +25,7 @@ */ /* - * This file imlements the minimal FMD module API required to support the + * This file implements the minimal FMD module API required to support the * fault logic modules in ZED. This support includes module registration, * memory allocation, module property accessors, basic case management, * one-shot timers and SERD engines. diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c index 043552862e..d4ec37fb76 100644 --- a/cmd/zed/agents/fmd_serd.c +++ b/cmd/zed/agents/fmd_serd.c @@ -281,7 +281,7 @@ fmd_serd_eng_empty(fmd_serd_eng_t *sgp) void fmd_serd_eng_reset(fmd_serd_eng_t *sgp) { - serd_log_msg(" SERD Engine: reseting %s", sgp->sg_name); + serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name); while (sgp->sg_count != 0) fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c index 6d392604bc..35dd818ff8 100644 --- a/cmd/zed/agents/zfs_agents.c +++ b/cmd/zed/agents/zfs_agents.c @@ -13,6 +13,7 @@ /* * Copyright (c) 2016, Intel Corporation. * Copyright (c) 2018, loli10K + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP */ #include @@ -116,7 +117,8 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) /* * On a devid match, grab the vdev guid and expansion time, if any. */ - if ((nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + if (gsp->gs_devid != NULL && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && (strcmp(gsp->gs_devid, path) == 0)) { (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &gsp->gs_vdev_guid); @@ -176,10 +178,12 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) } /* - * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED - * ereport from vdev_disk layer after a hot unplug. 
Fortunately we - * get a EC_DEV_REMOVE from our disk monitor and it is a suitable + * On Linux, we don't get the expected FM_RESOURCE_REMOVED ereport + * from the vdev_disk layer after a hot unplug. Fortunately we do + * get an EC_DEV_REMOVE from our disk monitor and it is a suitable * proxy so we remap it here for the benefit of the diagnosis engine. + * Starting in OpenZFS 2.0, we do get FM_RESOURCE_REMOVED from the spa + * layer. Processing multiple FM_RESOURCE_REMOVED events is not harmful. */ if ((strcmp(class, EC_DEV_REMOVE) == 0) && (strcmp(subclass, ESC_DISK) == 0) && @@ -208,12 +212,18 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or * ZFS_EV_POOL_GUID may be missing so find them. */ - (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, - &search.gs_devid); - (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); - pool_guid = search.gs_pool_guid; - vdev_guid = search.gs_vdev_guid; - devtype = search.gs_vdev_type; + if (pool_guid == 0 || vdev_guid == 0) { + if ((nvlist_lookup_string(nvl, DEV_IDENTIFIER, + &search.gs_devid) == 0) && + (zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search) + == 1)) { + if (pool_guid == 0) + pool_guid = search.gs_pool_guid; + if (vdev_guid == 0) + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; + } + } /* * We want to avoid reporting "remove" events coming from @@ -382,6 +392,7 @@ zfs_agent_init(libzfs_handle_t *zfs_hdl) list_destroy(&agent_events); zed_log_die("Failed to initialize agents"); } + pthread_setname_np(g_agents_tid, "agents"); } void diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 6d3e7cb112..3bcdf6e1d7 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -63,13 +63,10 @@ * If the device could not be replaced, then the second online attempt will * trigger the FMA fault that we skipped earlier. 
* - * ZFS on Linux porting notes: - * Linux udev provides a disk insert for both the disk and the partition - * + * On Linux udev provides a disk insert for both the disk and the partition. */ #include -#include #include #include #include @@ -157,7 +154,7 @@ zfs_unavail_pool(zpool_handle_t *zhp, void *data) * 1. physical match with no fs, no partition * tag it top, partition disk * - * 2. physical match again, see partion and tag + * 2. physical match again, see partition and tag * */ @@ -192,8 +189,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) char rawpath[PATH_MAX], fullpath[PATH_MAX]; char devpath[PATH_MAX]; int ret; - int is_dm = 0; - int is_sd = 0; + boolean_t is_dm = B_FALSE; + boolean_t is_sd = B_FALSE; uint_t c; vdev_stat_t *vs; @@ -221,8 +218,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) is_dm = zfs_dev_is_dm(path); zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" - " wholedisk %d, dm %d (%llu)", zpool_get_name(zhp), path, - physpath ? physpath : "NULL", wholedisk, is_dm, + " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path, + physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not", (long long unsigned int)guid); /* @@ -267,7 +264,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * testing) */ if (physpath != NULL && strcmp("scsidebug", physpath) == 0) - is_sd = 1; + is_sd = B_TRUE; /* * If the pool doesn't have the autoreplace property set, then use @@ -438,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) return; } - ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); + /* + * Prefer sequential resilvering when supported (mirrors and dRAID), + * otherwise fallback to a traditional healing resilver. 
+ */ + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE); + if (ret != 0) { + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, + B_TRUE, B_FALSE); + } zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : @@ -534,7 +539,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) (dp->dd_func)(zhp, nvl, dp->dd_islabeled); } -void +static void zfs_enable_ds(void *arg) { unavailpool_t *pool = (unavailpool_t *)arg; @@ -635,6 +640,27 @@ devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) return (data.dd_found); } +/* + * Given a device guid, find any vdevs with a matching guid. + */ +static boolean_t +guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid, + zfs_process_func_t func, boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_func = func; + data.dd_found = B_FALSE; + data.dd_pool_guid = pool_guid; + data.dd_vdev_guid = vdev_guid; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + /* * Handle a EC_DEV_ADD.ESC_DISK event. * @@ -658,15 +684,18 @@ static int zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) { char *devpath = NULL, *devid; + uint64_t pool_guid = 0, vdev_guid = 0; boolean_t is_slice; /* - * Expecting a devid string and an optional physical location + * Expecting a devid string and an optional physical location and guid */ if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) return (-1); (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); @@ -674,15 +703,19 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) devid, devpath ? 
devpath : "NULL", is_slice); /* - * Iterate over all vdevs looking for a match in the folllowing order: + * Iterate over all vdevs looking for a match in the following order: * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). - * - * For disks, we only want to pay attention to vdevs marked as whole - * disks or are a multipath device. + * 3. ZPOOL_CONFIG_GUID (identifies unique vdev). */ - if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) - (void) devphys_iter(devpath, devid, zfs_process_add, is_slice); + if (devid_iter(devid, zfs_process_add, is_slice)) + return (0); + if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add, + is_slice)) + return (0); + if (vdev_guid != 0) + (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add, + is_slice); return (0); } @@ -892,7 +925,7 @@ zfs_enum_pools(void *arg) * * sent messages from zevents or udev monitor * - * For now, each agent has it's own libzfs instance + * For now, each agent has its own libzfs instance */ int zfs_slm_init() @@ -913,6 +946,7 @@ zfs_slm_init() return (-1); } + pthread_setname_np(g_zfs_tid, "enum-pools"); list_create(&g_device_list, sizeof (struct pendingdev), offsetof(struct pendingdev, pd_node)); diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index f3dbb24b84..6c009bdc12 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -38,8 +38,10 @@ #include #include #include +#include #include #include +#include #include "zfs_agents.h" #include "fmd_api.h" @@ -219,12 +221,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) * replace it. 
*/ for (s = 0; s < nspares; s++) { - char *spare_name; + boolean_t rebuild = B_FALSE; + char *spare_name, *type; if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, &spare_name) != 0) continue; + /* prefer sequential resilvering for distributed spares */ + if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, + &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + rebuild = B_TRUE; + /* if set, add the "ashift" pool property to the spare nvlist */ if (source != ZPROP_SRC_DEFAULT) (void) nvlist_add_uint64(spares[s], @@ -234,10 +242,10 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) ZPOOL_CONFIG_CHILDREN, &spares[s], 1); fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", - dev_name, basename(spare_name)); + dev_name, zfs_basename(spare_name)); if (zpool_vdev_attach(zhp, dev_name, spare_name, - replacement, B_TRUE) == 0) { + replacement, B_TRUE, rebuild) == 0) { free(dev_name); nvlist_free(replacement); return (B_TRUE); @@ -319,12 +327,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); + /* * If this is a resource notifying us of device removal then simply * check for an available spare and continue unless the device is a * l2arc vdev, in which case we just offline it. 
*/ - if (strcmp(class, "resource.fs.zfs.removed") == 0) { + if (strcmp(class, "resource.fs.zfs.removed") == 0 || + (strcmp(class, "resource.fs.zfs.statechange") == 0 && + (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) { char *devtype; char *devname; @@ -347,9 +359,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, zpool_vdev_offline(zhp, devname, B_TRUE); } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") || replace_with_spare(hdl, zhp, vdev) == B_FALSE) { - /* Could not handle with spare: offline the device */ - fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname); - zpool_vdev_offline(zhp, devname, B_TRUE); + /* Could not handle with spare */ + fmd_hdl_debug(hdl, "no spare for '%s'", devname); } free(devname); @@ -361,12 +372,11 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, return; /* - * Note: on zfsonlinux statechange events are more than just + * Note: on Linux statechange events are more than just * healthy ones so we need to confirm the actual state value. */ if (strcmp(class, "resource.fs.zfs.statechange") == 0 && - nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, - &state) == 0 && state == VDEV_STATE_HEALTHY) { + state == VDEV_STATE_HEALTHY) { zfs_vdev_repair(hdl, nvl); return; } @@ -497,6 +507,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * Attempt to substitute a hot spare. */ (void) replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); } diff --git a/cmd/zed/zed.c b/cmd/zed/zed.c index bba8b8f647..e45176c00b 100644 --- a/cmd/zed/zed.c +++ b/cmd/zed/zed.c @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. 
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -60,8 +60,8 @@ _setup_sig_handlers(void) zed_log_die("Failed to initialize sigset"); sa.sa_flags = SA_RESTART; - sa.sa_handler = SIG_IGN; + sa.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &sa, NULL) < 0) zed_log_die("Failed to ignore SIGPIPE"); @@ -75,6 +75,10 @@ _setup_sig_handlers(void) sa.sa_handler = _hup_handler; if (sigaction(SIGHUP, &sa, NULL) < 0) zed_log_die("Failed to register SIGHUP handler"); + + (void) sigaddset(&sa.sa_mask, SIGCHLD); + if (pthread_sigmask(SIG_BLOCK, &sa.sa_mask, NULL) < 0) + zed_log_die("Failed to block SIGCHLD"); } /* @@ -212,22 +216,20 @@ _finish_daemonize(void) int main(int argc, char *argv[]) { - struct zed_conf *zcp; + struct zed_conf zcp; uint64_t saved_eid; int64_t saved_etime[2]; zed_log_init(argv[0]); zed_log_stderr_open(LOG_NOTICE); - zcp = zed_conf_create(); - zed_conf_parse_opts(zcp, argc, argv); - if (zcp->do_verbose) + zed_conf_init(&zcp); + zed_conf_parse_opts(&zcp, argc, argv); + if (zcp.do_verbose) zed_log_stderr_open(LOG_INFO); if (geteuid() != 0) zed_log_die("Must be run as root"); - zed_conf_parse_file(zcp); - zed_file_close_from(STDERR_FILENO + 1); (void) umask(0); @@ -235,47 +237,72 @@ main(int argc, char *argv[]) if (chdir("/") < 0) zed_log_die("Failed to change to root directory"); - if (zed_conf_scan_dir(zcp) < 0) + if (zed_conf_scan_dir(&zcp) < 0) exit(EXIT_FAILURE); - if (!zcp->do_foreground) { + if (!zcp.do_foreground) { _start_daemonize(); zed_log_syslog_open(LOG_DAEMON); } _setup_sig_handlers(); - if (zcp->do_memlock) + if (zcp.do_memlock) _lock_memory(); - if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force)) + if ((zed_conf_write_pid(&zcp) < 0) && (!zcp.do_force)) exit(EXIT_FAILURE); - if (!zcp->do_foreground) + if (!zcp.do_foreground) _finish_daemonize(); zed_log_msg(LOG_NOTICE, "ZFS Event 
Daemon %s-%s (PID %d)", ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid()); - if (zed_conf_open_state(zcp) < 0) + if (zed_conf_open_state(&zcp) < 0) exit(EXIT_FAILURE); - if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0) + if (zed_conf_read_state(&zcp, &saved_eid, saved_etime) < 0) exit(EXIT_FAILURE); - zed_event_init(zcp); - zed_event_seek(zcp, saved_eid, saved_etime); +idle: + /* + * If -I is specified, attempt to open /dev/zfs repeatedly until + * successful. + */ + do { + if (!zed_event_init(&zcp)) + break; + /* Wait for some time and try again. tunable? */ + sleep(30); + } while (!_got_exit && zcp.do_idle); + + if (_got_exit) + goto out; + + zed_event_seek(&zcp, saved_eid, saved_etime); while (!_got_exit) { + int rv; if (_got_hup) { _got_hup = 0; - (void) zed_conf_scan_dir(zcp); + (void) zed_conf_scan_dir(&zcp); } - zed_event_service(zcp); + rv = zed_event_service(&zcp); + + /* ENODEV: When kernel module is unloaded (osx) */ + if (rv != 0) + break; } + zed_log_msg(LOG_NOTICE, "Exiting"); - zed_event_fini(zcp); - zed_conf_destroy(zcp); + zed_event_fini(&zcp); + + if (zcp.do_idle && !_got_exit) + goto idle; + +out: + zed_conf_destroy(&zcp); zed_log_fini(); exit(EXIT_SUCCESS); } diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am new file mode 100644 index 0000000000..2c8173b3e7 --- /dev/null +++ b/cmd/zed/zed.d/Makefile.am @@ -0,0 +1,57 @@ +include $(top_srcdir)/config/Rules.am +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + +EXTRA_DIST += README + +zedconfdir = $(sysconfdir)/zfs/zed.d + +dist_zedconf_DATA = \ + zed-functions.sh \ + zed.rc + +zedexecdir = $(zfsexecdir)/zed.d + +dist_zedexec_SCRIPTS = \ + all-debug.sh \ + all-syslog.sh \ + data-notify.sh \ + generic-notify.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh \ + 
trim_finish-notify.sh + +nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh + +SUBSTFILES += $(nodist_zedexec_SCRIPTS) + +zedconfdefaults = \ + all-syslog.sh \ + data-notify.sh \ + history_event-zfs-list-cacher.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" + for f in $(zedconfdefaults); do \ + test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ + -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ + ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ + done + chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc" + +# False positive: 1>&"${ZED_FLOCK_FD}" looks suspiciously similar to a >&filename bash extension +CHECKBASHISMS_IGNORE = -e 'should be >word 2>&1' -e '&"$${ZED_FLOCK_FD}"' diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh index 14b39caacd..824c9fe423 100755 --- a/cmd/zed/zed.d/all-debug.sh +++ b/cmd/zed/zed.d/all-debug.sh @@ -12,15 +12,11 @@ zed_exit_if_ignoring_this_event -lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock" +zed_lock "${ZED_DEBUG_LOG}" +{ + printenv | sort + echo +} 1>&"${ZED_FLOCK_FD}" +zed_unlock "${ZED_DEBUG_LOG}" -umask 077 -zed_lock "${lockfile}" -exec >> "${ZED_DEBUG_LOG}" - -printenv | sort -echo - -exec >&- -zed_unlock "${lockfile}" exit 0 diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh index cb92865001..b07cf0f295 100755 --- a/cmd/zed/zed.d/all-syslog.sh +++ b/cmd/zed/zed.d/all-syslog.sh @@ -1,14 +1,51 @@ #!/bin/sh +# +# Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. +# Copyright (c) 2020 by Delphix. All rights reserved. +# + # # Log the zevent via syslog. +# [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . 
"${ZED_ZEDLET_DIR}/zed-functions.sh" zed_exit_if_ignoring_this_event -zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ - "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \ - "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \ - "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}" +# build a string of name=value pairs for this event +msg="eid=${ZEVENT_EID} class=${ZEVENT_SUBCLASS}" + +if [ "${ZED_SYSLOG_DISPLAY_GUIDS}" = "1" ]; then + [ -n "${ZEVENT_POOL_GUID}" ] && msg="${msg} pool_guid=${ZEVENT_POOL_GUID}" + [ -n "${ZEVENT_VDEV_GUID}" ] && msg="${msg} vdev_guid=${ZEVENT_VDEV_GUID}" +else + [ -n "${ZEVENT_POOL}" ] && msg="${msg} pool='${ZEVENT_POOL}'" + [ -n "${ZEVENT_VDEV_PATH}" ] && msg="${msg} vdev=$(basename "${ZEVENT_VDEV_PATH}")" +fi + +# log pool state if state is anything other than 'ACTIVE' +[ -n "${ZEVENT_POOL_STATE_STR}" ] && [ "$ZEVENT_POOL_STATE" -ne 0 ] && \ + msg="${msg} pool_state=${ZEVENT_POOL_STATE_STR}" + +# Log the following payload nvpairs if they are present +[ -n "${ZEVENT_VDEV_STATE_STR}" ] && msg="${msg} vdev_state=${ZEVENT_VDEV_STATE_STR}" +[ -n "${ZEVENT_CKSUM_ALGORITHM}" ] && msg="${msg} algorithm=${ZEVENT_CKSUM_ALGORITHM}" +[ -n "${ZEVENT_ZIO_SIZE}" ] && msg="${msg} size=${ZEVENT_ZIO_SIZE}" +[ -n "${ZEVENT_ZIO_OFFSET}" ] && msg="${msg} offset=${ZEVENT_ZIO_OFFSET}" +[ -n "${ZEVENT_ZIO_PRIORITY}" ] && msg="${msg} priority=${ZEVENT_ZIO_PRIORITY}" +[ -n "${ZEVENT_ZIO_ERR}" ] && msg="${msg} err=${ZEVENT_ZIO_ERR}" +[ -n "${ZEVENT_ZIO_FLAGS}" ] && msg="${msg} flags=$(printf '0x%x' "${ZEVENT_ZIO_FLAGS}")" + +# log delays that are >= 10 milisec +[ -n "${ZEVENT_ZIO_DELAY}" ] && [ "$ZEVENT_ZIO_DELAY" -gt 10000000 ] && \ + msg="${msg} delay=$((ZEVENT_ZIO_DELAY / 1000000))ms" + +# list the bookmark data together +# shellcheck disable=SC2153 +[ -n "${ZEVENT_ZIO_OBJSET}" ] && \ + msg="${msg} bookmark=${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}:${ZEVENT_ZIO_LEVEL}:${ZEVENT_ZIO_BLKID}" + +zed_log_msg "${msg}" + exit 0 
diff --git a/cmd/zed/zed.d/data-notify.sh b/cmd/zed/zed.d/data-notify.sh index 639b459bdd..792d30a66d 100755 --- a/cmd/zed/zed.d/data-notify.sh +++ b/cmd/zed/zed.d/data-notify.sh @@ -25,7 +25,7 @@ zed_rate_limit "${rate_limit_tag}" || exit 3 umask 077 note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)" -note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +note_pathname="$(mktemp)" { echo "ZFS has detected a data error:" echo diff --git a/cmd/zed/zed.d/generic-notify.sh b/cmd/zed/zed.d/generic-notify.sh index e438031a08..1db26980c1 100755 --- a/cmd/zed/zed.d/generic-notify.sh +++ b/cmd/zed/zed.d/generic-notify.sh @@ -31,7 +31,7 @@ umask 077 pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}" host_str=" on $(hostname)" note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}" -note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +note_pathname="$(mktemp)" { echo "ZFS has posted the following event:" echo diff --git a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in index c1513cf3a0..db40fa36d6 100755 --- a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in +++ b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in @@ -3,9 +3,8 @@ # Track changes to enumerated pools for use in early-boot set -ef -FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache" -FSLIST_TMP="@runstatedir@/zfs-list.cache.new" -FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}" +FSLIST="@sysconfdir@/zfs/zfs-list.cache/${ZEVENT_POOL}" +FSLIST_TMP="@runstatedir@/zfs-list.cache@${ZEVENT_POOL}" # If the pool specific cache file is not writeable, abort [ -w "${FSLIST}" ] || exit 0 @@ -13,21 +12,21 @@ FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}" [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . 
"${ZED_ZEDLET_DIR}/zed-functions.sh" -zed_exit_if_ignoring_this_event -zed_check_cmd "${ZFS}" sort diff grep +[ "$ZEVENT_SUBCLASS" != "history_event" ] && exit 0 +zed_check_cmd "${ZFS}" sort diff # If we are acting on a snapshot, we have nothing to do -printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0 +[ "${ZEVENT_HISTORY_DSNAME%@*}" = "${ZEVENT_HISTORY_DSNAME}" ] || exit 0 -# We obtain a lock on zfs-list to avoid any simultaneous writes. +# We lock the output file to avoid simultaneous writes. # If we run into trouble, log and drop the lock abort_alter() { - zed_log_msg "Error updating zfs-list.cache!" - zed_unlock zfs-list + zed_log_msg "Error updating zfs-list.cache for ${ZEVENT_POOL}!" + zed_unlock "${FSLIST}" } finished() { - zed_unlock zfs-list + zed_unlock "${FSLIST}" trap - EXIT exit 0 } @@ -37,7 +36,7 @@ case "${ZEVENT_HISTORY_INTERNAL_NAME}" in ;; export) - zed_lock zfs-list + zed_lock "${FSLIST}" trap abort_alter EXIT echo > "${FSLIST}" finished @@ -46,8 +45,13 @@ case "${ZEVENT_HISTORY_INTERNAL_NAME}" in set|inherit) # Only act if one of the tracked properties is altered. 
case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in - canmount|mountpoint|atime|relatime|devices|exec| \ - readonly|setuid|nbmand) ;; + canmount|mountpoint|atime|relatime|devices|exec|readonly| \ + setuid|nbmand|encroot|keylocation|org.openzfs.systemd:requires| \ + org.openzfs.systemd:requires-mounts-for| \ + org.openzfs.systemd:before|org.openzfs.systemd:after| \ + org.openzfs.systemd:wanted-by|org.openzfs.systemd:required-by| \ + org.openzfs.systemd:nofail|org.openzfs.systemd:ignore \ + ) ;; *) exit 0 ;; esac ;; @@ -58,11 +62,15 @@ case "${ZEVENT_HISTORY_INTERNAL_NAME}" in ;; esac -zed_lock zfs-list +zed_lock "${FSLIST}" trap abort_alter EXIT -PROPS="name,mountpoint,canmount,atime,relatime,devices,exec,readonly" -PROPS="${PROPS},setuid,nbmand" +PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\ +,readonly,setuid,nbmand,encroot,keylocation\ +,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for\ +,org.openzfs.systemd:before,org.openzfs.systemd:after\ +,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by\ +,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore" "${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}" @@ -70,7 +78,7 @@ PROPS="${PROPS},setuid,nbmand" sort "${FSLIST_TMP}" -o "${FSLIST_TMP}" # Don't modify the file if it hasn't changed -diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}" +diff -q "${FSLIST_TMP}" "${FSLIST}" || cat "${FSLIST_TMP}" > "${FSLIST}" rm -f "${FSLIST_TMP}" finished diff --git a/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/cmd/zed/zed.d/resilver_finish-start-scrub.sh index 6f9c0b3094..c7cfd1ddba 100755 --- a/cmd/zed/zed.d/resilver_finish-start-scrub.sh +++ b/cmd/zed/zed.d/resilver_finish-start-scrub.sh @@ -5,10 +5,12 @@ # Exit codes: # 1: Internal error # 2: Script wasn't enabled in zed.rc +# 3: Scrubs are automatically started for sequential resilvers [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . 
"${ZED_ZEDLET_DIR}/zed-functions.sh" [ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2 +[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3 [ -n "${ZEVENT_POOL}" ] || exit 1 [ -n "${ZEVENT_SUBCLASS}" ] || exit 1 zed_check_cmd "${ZPOOL}" || exit 1 diff --git a/cmd/zed/zed.d/scrub_finish-notify.sh b/cmd/zed/zed.d/scrub_finish-notify.sh index 2145a100a3..5c0124b8d7 100755 --- a/cmd/zed/zed.d/scrub_finish-notify.sh +++ b/cmd/zed/zed.d/scrub_finish-notify.sh @@ -41,7 +41,7 @@ fi umask 077 note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" -note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +note_pathname="$(mktemp)" { echo "ZFS has finished a ${action}:" echo diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh index 6484b79592..26e6064fa9 100755 --- a/cmd/zed/zed.d/statechange-led.sh +++ b/cmd/zed/zed.d/statechange-led.sh @@ -1,26 +1,26 @@ #!/bin/sh # -# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes. +# Turn off/on vdevs' enclosure fault LEDs when their pool's state changes. # -# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL. -# Turn the LED off when it's back ONLINE again. +# Turn a vdev's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL. +# Turn its LED off when it's back ONLINE again. # # This script run in two basic modes: # # 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then -# only set the LED for that particular VDEV. This is the case for statechange +# only set the LED for that particular vdev. This is the case for statechange # events and some vdev_* events. # -# 2. If those vars are not set, then check the state of all VDEVs in the pool +# 2. If those vars are not set, then check the state of all vdevs in the pool # and set the LEDs accordingly. This is the case for pool_import events. 
# # Note that this script requires that your enclosure be supported by the -# Linux SCSI enclosure services (ses) driver. The script will do nothing +# Linux SCSI Enclosure services (SES) driver. The script will do nothing # if you have no enclosure, or if your enclosure isn't supported. # # Exit codes: # 0: enclosure led successfully set -# 1: enclosure leds not not available +# 1: enclosure leds not available # 2: enclosure leds administratively disabled # 3: The led sysfs path passed from ZFS does not exist # 4: $ZPOOL not set @@ -29,7 +29,8 @@ [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . "${ZED_ZEDLET_DIR}/zed-functions.sh" -if [ ! -d /sys/class/enclosure ] ; then +if [ ! -d /sys/class/enclosure ] && [ ! -d /sys/bus/pci/slots ] ; then + # No JBOD enclosure or NVMe slots exit 1 fi @@ -59,6 +60,10 @@ check_and_set_led() file="$1" val="$2" + if [ -z "$val" ]; then + return 0 + fi + if [ ! -e "$file" ] ; then return 3 fi @@ -66,11 +71,11 @@ check_and_set_led() # If another process is accessing the LED when we attempt to update it, # the update will be lost so retry until the LED actually changes or we # timeout. - for _ in $(seq 1 5); do + for _ in 1 2 3 4 5; do # We want to check the current state first, since writing to the - # 'fault' entry always always causes a SES command, even if the + # 'fault' entry always causes a SES command, even if the # current state is already what you want. - current=$(cat "${file}") + read -r current < "${file}" # On some enclosures if you write 1 to fault, and read it back, # it will return 2. Treat all non-zero values as 1 for @@ -85,27 +90,84 @@ check_and_set_led() else break fi - done + done +} + +# Fault LEDs for JBODs and NVMe drives are handled a little differently. 
+# +# On JBODs the fault LED is called 'fault' and on a path like this: +# +# /sys/class/enclosure/0:0:1:0/SLOT 10/fault +# +# On NVMe it's called 'attention' and on a path like this: +# +# /sys/bus/pci/slots/0/attention +# +# This function returns the full path to the fault LED file for a given +# enclosure/slot directory. +# +path_to_led() +{ + dir=$1 + if [ -f "$dir/fault" ] ; then + echo "$dir/fault" + elif [ -f "$dir/attention" ] ; then + echo "$dir/attention" + fi } state_to_val() { state="$1" - if [ "$state" = "FAULTED" ] || [ "$state" = "DEGRADED" ] || \ - [ "$state" = "UNAVAIL" ] ; then - echo 1 - elif [ "$state" = "ONLINE" ] ; then - echo 0 - fi + case "$state" in + FAULTED|DEGRADED|UNAVAIL) + echo 1 + ;; + ONLINE) + echo 0 + ;; + esac } -# process_pool ([pool]) # -# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to -# the VDEV's state. +# Given an NVMe device name like 'nvme0n1', pass back its slot directory +# like "/sys/bus/pci/slots/0" +# +nvme_dev_to_slot() +{ + dev="$1" + + # Get the address "0000:01:00.0" + address=$(cat "/sys/class/block/$dev/device/address") + + # For each /sys/bus/pci/slots subdir that is an actual number + # (rather than weird directories like "1-3/"). + # shellcheck disable=SC2010 + for i in $(ls /sys/bus/pci/slots/ | grep -E "^[0-9]+$") ; do + this_address=$(cat "/sys/bus/pci/slots/$i/address") + + # The format of address is a little different between + # /sys/class/block/$dev/device/address and + # /sys/bus/pci/slots/ + # + # address= "0000:01:00.0" + # this_address = "0000:01:00" + # + if echo "$address" | grep -Eq ^"$this_address" ; then + echo "/sys/bus/pci/slots/$i" + break + fi + done +} + + +# process_pool (pool) +# +# Iterate through a pool and set the vdevs' enclosure slot LEDs to +# those vdevs' state. # # Arguments -# pool: Optional pool name. If not specified, iterate though all pools. +# pool: Pool name.
# # Return # 0 on success, 3 on missing sysfs path @@ -113,19 +175,27 @@ state_to_val() process_pool() { pool="$1" + + # The output will be the vdevs only (from "grep '/dev/'"): + # + # U45 ONLINE 0 0 0 /dev/sdk 0 + # U46 ONLINE 0 0 0 /dev/sdm 0 + # U47 ONLINE 0 0 0 /dev/sdn 0 + # U50 ONLINE 0 0 0 /dev/sdbn 0 + # + ZPOOL_SCRIPTS_AS_ROOT=1 $ZPOOL status -c upath,fault_led "$pool" | grep '/dev/' | ( rc=0 - - # Lookup all the current LED values and paths in parallel - #shellcheck disable=SC2016 - cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",' - out=$($ZPOOL status -vc "$cmd" "$pool" | grep 'led_token=') - - #shellcheck disable=SC2034 - echo "$out" | while read -r vdev state read write chksum therest; do + while read -r vdev state _ _ _ therest; do # Read out current LED value and path - tmp=$(echo "$therest" | sed 's/^.*led_token=//g') - vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}') - current_val=$(echo "$tmp" | awk -F ',' '{print $1}') + # Get dev name (like 'sda') + dev=$(basename "$(echo "$therest" | awk '{print $(NF-1)}')") + vdev_enc_sysfs_path=$(realpath "/sys/class/block/$dev/device/enclosure_device"*) + if [ ! -d "$vdev_enc_sysfs_path" ] ; then + # This is not a JBOD disk, but it could be a PCI NVMe drive + vdev_enc_sysfs_path=$(nvme_dev_to_slot "$dev") + fi + + current_val=$(echo "$therest" | awk '{print $NF}') if [ "$current_val" != "0" ] ; then current_val=1 @@ -136,40 +206,33 @@ process_pool() continue fi - if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then - #shellcheck disable=SC2030 - rc=1 - zed_log_msg "vdev $vdev '$file/fault' doesn't exist" - continue; + led_path=$(path_to_led "$vdev_enc_sysfs_path") + if [ ! -e "$led_path" ] ; then + rc=3 + zed_log_msg "vdev $vdev '$led_path' doesn't exist" + continue fi val=$(state_to_val "$state") if [ "$current_val" = "$val" ] ; then # LED is already set correctly - continue; + continue fi - if ! 
check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then - rc=1 + if ! check_and_set_led "$led_path" "$val"; then + rc=3 fi - done - - #shellcheck disable=SC2031 - if [ "$rc" = "0" ] ; then - return 0 - else - # We didn't see a sysfs entry that we wanted to set - return 3 - fi + exit "$rc"; ) } if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then - # Got a statechange for an individual VDEV + # Got a statechange for an individual vdev val=$(state_to_val "$ZEVENT_VDEV_STATE_STR") vdev=$(basename "$ZEVENT_VDEV_PATH") - check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val" + ledpath=$(path_to_led "$ZEVENT_VDEV_ENC_SYSFS_PATH") + check_and_set_led "$ledpath" "$val" else # Process the entire pool poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID") diff --git a/cmd/zed/zed.d/statechange-notify.sh b/cmd/zed/zed.d/statechange-notify.sh index f46080a032..ab11dfbc99 100755 --- a/cmd/zed/zed.d/statechange-notify.sh +++ b/cmd/zed/zed.d/statechange-notify.sh @@ -15,7 +15,7 @@ # Send notification in response to a fault induced statechange # # ZEVENT_SUBCLASS: 'statechange' -# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED' +# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED', 'REMOVED', or 'UNAVAIL' # # Exit codes: # 0: notification sent @@ -31,13 +31,14 @@ if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \ && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \ - && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then + && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "UNAVAIL" ]; then exit 3 fi umask 077 note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)" -note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +note_pathname="$(mktemp)" { if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then echo "The number of I/O errors associated with a ZFS device exceeded" diff --git a/cmd/zed/zed.d/trim_finish-notify.sh b/cmd/zed/zed.d/trim_finish-notify.sh new file mode 100755 index 
0000000000..8fdb64531d --- /dev/null +++ b/cmd/zed/zed.d/trim_finish-notify.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# +# Send notification in response to a TRIM_FINISH. The event +# will be received for each vdev in the pool which was trimmed. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +zed_check_cmd "${ZPOOL}" || exit 9 + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" +note_pathname="$(mktemp)" +{ + echo "ZFS has finished a trim:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + "${ZPOOL}" status -t "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/zed-functions.sh b/cmd/zed/zed.d/zed-functions.sh index a6e608573e..2ec0ea6948 100644 --- a/cmd/zed/zed.d/zed-functions.sh +++ b/cmd/zed/zed.d/zed-functions.sh @@ -126,10 +126,8 @@ zed_lock() # Obtain a lock on the file bound to the given file descriptor. # - eval "exec ${fd}> '${lockfile}'" - err="$(flock --exclusive "${fd}" 2>&1)" - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then + eval "exec ${fd}>> '${lockfile}'" + if ! err="$(flock --exclusive "${fd}" 2>&1)"; then zed_log_err "failed to lock \"${lockfile}\": ${err}" fi @@ -165,9 +163,7 @@ zed_unlock() fi # Release the lock and close the file descriptor. - err="$(flock --unlock "${fd}" 2>&1)" - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then + if ! 
err="$(flock --unlock "${fd}" 2>&1)"; then zed_log_err "failed to unlock \"${lockfile}\": ${err}" fi eval "exec ${fd}>&-" @@ -202,6 +198,14 @@ zed_notify() [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + zed_notify_slack_webhook "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + zed_notify_pushover "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + [ "${num_success}" -gt 0 ] && return 0 [ "${num_failure}" -gt 0 ] && return 1 return 2 @@ -263,7 +267,7 @@ zed_notify_email() -e "s/@SUBJECT@/${subject}/g")" # shellcheck disable=SC2086 - eval "${ZED_EMAIL_PROG}" ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1 + eval ${ZED_EMAIL_PROG} ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1 rv=$? if [ "${rv}" -ne 0 ]; then zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}" @@ -359,6 +363,158 @@ zed_notify_pushbullet() } +# zed_notify_slack_webhook (subject, pathname) +# +# Notification via Slack Webhook <https://api.slack.com/incoming-webhooks>. +# The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the +# Slack channel. +# +# Requires awk, curl, and sed executables to be installed in the standard PATH. +# +# References +# https://api.slack.com/incoming-webhooks +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_SLACK_WEBHOOK_URL +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_slack_webhook() +{ + [ -n "${ZED_SLACK_WEBHOOK_URL}" ] || return 2 + + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_tag + local msg_json + local msg_out + local msg_err + local url="${ZED_SLACK_WEBHOOK_URL}" + + [ -n "${subject}" ] || return 1 + if [ !
-r "${pathname}" ]; then + zed_log_err "slack webhook cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "awk" "curl" "sed" || return 1 + + # Escape the following characters in the message body for JSON: + # newline, backslash, double quote, horizontal tab, form feed, + # and carriage return. + # + msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\""); + gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \ + "${pathname}")" + + # Construct the JSON message for posting. + # + msg_json="$(printf '{"text": "*%s*\n%s"}' "${subject}" "${msg_body}" )" + + # Send the POST request and check for errors. + # + msg_out="$(curl -X POST "${url}" \ + --header "Content-Type: application/json" --data-binary "${msg_json}" \ + 2>/dev/null)"; rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "slack webhook \"${msg_err}"\" + return 1 + fi + return 0 +} + +# zed_notify_pushover (subject, pathname) +# +# Send a notification via Pushover <https://pushover.net/>. +# The access token (ZED_PUSHOVER_TOKEN) identifies this client to the +# Pushover server. The user token (ZED_PUSHOVER_USER) defines the user or +# group to which the notification will be sent. +# +# Requires curl and sed executables to be installed in the standard PATH.
+# +# References +# https://pushover.net/api +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_PUSHOVER_TOKEN +# ZED_PUSHOVER_USER +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_pushover() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_out + local msg_err + local url="https://api.pushover.net/1/messages.json" + + [ -n "${ZED_PUSHOVER_TOKEN}" ] && [ -n "${ZED_PUSHOVER_USER}" ] || return 2 + + if [ ! -r "${pathname}" ]; then + zed_log_err "pushover cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "curl" "sed" || return 1 + + # Read the message body in. + # + msg_body="$(cat "${pathname}")" + + if [ -z "${msg_body}" ] + then + msg_body=$subject + subject="" + fi + + # Send the POST request and check for errors. + # + msg_out="$( \ + curl \ + --form-string "token=${ZED_PUSHOVER_TOKEN}" \ + --form-string "user=${ZED_PUSHOVER_USER}" \ + --form-string "message=${msg_body}" \ + --form-string "title=${subject}" \ + "${url}" \ + 2>/dev/null \ + )"; rv=$? 
+ if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"errors" *:.*\[\(.*\)\].*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "pushover \"${msg_err}"\" + return 1 + fi + return 0 +} + + # zed_rate_limit (tag, [interval]) # # Check whether an event of a given type [tag] has already occurred within the @@ -433,10 +589,8 @@ zed_guid_to_pool() return fi - guid=$(printf "%llu" "$1") - if [ -n "$guid" ] ; then - $ZPOOL get -H -ovalue,name guid | awk '$1=='"$guid"' {print $2}' - fi + guid="$(printf "%u" "$1")" + $ZPOOL get -H -ovalue,name guid | awk '$1 == '"$guid"' {print $2; exit}' } # zed_exit_if_ignoring_this_event diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index 0ef7068490..3bbd701f33 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -74,6 +74,31 @@ # #ZED_PUSHBULLET_CHANNEL_TAG="" +## +# Slack Webhook URL. +# This allows posting to the given channel and includes an access token. +# +# Disabled by default; uncomment to enable. +# +#ZED_SLACK_WEBHOOK_URL="" + +## +# Pushover token. +# This defines the application from which the notification will be sent. +# +# Disabled by default; uncomment to enable. +# ZED_PUSHOVER_USER, below, must also be configured. +# +#ZED_PUSHOVER_TOKEN="" + +## +# Pushover user key. +# This defines which user or group will receive Pushover notifications. +# +# Disabled by default; uncomment to enable. +# ZED_PUSHOVER_TOKEN, above, must also be configured. +#ZED_PUSHOVER_USER="" + ## # Default directory for zed state files. # @@ -81,8 +106,8 @@ ## # Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for -# device mapper and multipath devices as well. Your enclosure must be -# supported by the Linux SES driver for this to work. +# device mapper and multipath devices as well. This works with JBOD enclosures +# and NVMe PCI drives (assuming they're supported by Linux in sysfs). 
# ZED_USE_ENCLOSURE_LEDS=1 @@ -110,5 +135,10 @@ ZED_USE_ENCLOSURE_LEDS=1 # Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the # matching subclasses are excluded from logging. #ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*" -#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event" +ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" + +## +# Use GUIDs instead of names when logging pool and vdevs +# Disabled by default, 1 to enable and 0 to disable. +#ZED_SYSLOG_DISPLAY_GUIDS=1 diff --git a/cmd/zed/zed.h b/cmd/zed/zed.h index 3ac0e63141..94f13c2c9d 100644 --- a/cmd/zed/zed.h +++ b/cmd/zed/zed.h @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -15,11 +15,6 @@ #ifndef ZED_H #define ZED_H -/* - * Absolute path for the default zed configuration file. - */ -#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf" - /* * Absolute path for the default zed pid file. */ @@ -35,16 +30,6 @@ */ #define ZED_ZEDLET_DIR SYSCONFDIR "/zfs/zed.d" -/* - * Reserved for future use. - */ -#define ZED_MAX_EVENTS 0 - -/* - * Reserved for future use. - */ -#define ZED_MIN_EVENTS 0 - /* * String prefix for ZED variables passed via environment variables. */ diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c index 86671369c1..59935102f1 100644 --- a/cmd/zed/zed_conf.c +++ b/cmd/zed/zed_conf.c @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). 
+ * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -32,43 +33,26 @@ #include "zed_strings.h" /* - * Return a new configuration with default values. + * Initialise the configuration with default values. */ -struct zed_conf * -zed_conf_create(void) +void +zed_conf_init(struct zed_conf *zcp) { - struct zed_conf *zcp; + memset(zcp, 0, sizeof (*zcp)); - zcp = calloc(1, sizeof (*zcp)); - if (!zcp) - goto nomem; + /* zcp->zfs_hdl opened in zed_event_init() */ + /* zcp->zedlets created in zed_conf_scan_dir() */ - zcp->syslog_facility = LOG_DAEMON; - zcp->min_events = ZED_MIN_EVENTS; - zcp->max_events = ZED_MAX_EVENTS; - zcp->pid_fd = -1; - zcp->zedlets = NULL; /* created via zed_conf_scan_dir() */ - zcp->state_fd = -1; /* opened via zed_conf_open_state() */ - zcp->zfs_hdl = NULL; /* opened via zed_event_init() */ - zcp->zevent_fd = -1; /* opened via zed_event_init() */ + zcp->pid_fd = -1; /* opened in zed_conf_write_pid() */ + zcp->state_fd = -1; /* opened in zed_conf_open_state() */ + zcp->zevent_fd = -1; /* opened in zed_event_init() */ - if (!(zcp->conf_file = strdup(ZED_CONF_FILE))) - goto nomem; + zcp->max_jobs = 16; - if (!(zcp->pid_file = strdup(ZED_PID_FILE))) - goto nomem; - - if (!(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR))) - goto nomem; - - if (!(zcp->state_file = strdup(ZED_STATE_FILE))) - goto nomem; - - return (zcp); - -nomem: - zed_log_die("Failed to create conf: %s", strerror(errno)); - return (NULL); + if (!(zcp->pid_file = strdup(ZED_PID_FILE)) || + !(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR)) 
|| + !(zcp->state_file = strdup(ZED_STATE_FILE))) + zed_log_die("Failed to create conf: %s", strerror(errno)); } /* @@ -79,9 +63,6 @@ nomem: void zed_conf_destroy(struct zed_conf *zcp) { - if (!zcp) - return; - if (zcp->state_fd >= 0) { if (close(zcp->state_fd) < 0) zed_log_msg(LOG_WARNING, @@ -102,10 +83,6 @@ zed_conf_destroy(struct zed_conf *zcp) zcp->pid_file, strerror(errno)); zcp->pid_fd = -1; } - if (zcp->conf_file) { - free(zcp->conf_file); - zcp->conf_file = NULL; - } if (zcp->pid_file) { free(zcp->pid_file); zcp->pid_file = NULL; @@ -122,7 +99,6 @@ zed_conf_destroy(struct zed_conf *zcp) zed_strings_destroy(zcp->zedlets); zcp->zedlets = NULL; } - free(zcp); } /* @@ -132,44 +108,52 @@ zed_conf_destroy(struct zed_conf *zcp) * otherwise, output to stderr and exit with a failure status. */ static void -_zed_conf_display_help(const char *prog, int got_err) +_zed_conf_display_help(const char *prog, boolean_t got_err) { + struct opt { const char *o, *d, *v; }; + FILE *fp = got_err ? stderr : stdout; - int w1 = 4; /* width of leading whitespace */ - int w2 = 8; /* width of L-justified option field */ + + struct opt *oo; + struct opt iopts[] = { + { .o = "-h", .d = "Display help" }, + { .o = "-L", .d = "Display license information" }, + { .o = "-V", .d = "Display version information" }, + {}, + }; + struct opt nopts[] = { + { .o = "-v", .d = "Be verbose" }, + { .o = "-f", .d = "Force daemon to run" }, + { .o = "-F", .d = "Run daemon in the foreground" }, + { .o = "-I", + .d = "Idle daemon until kernel module is (re)loaded" }, + { .o = "-M", .d = "Lock all pages in memory" }, + { .o = "-P", .d = "$PATH for ZED to use (only used by ZTS)" }, + { .o = "-Z", .d = "Zero state file" }, + {}, + }; + struct opt vopts[] = { + { .o = "-d DIR", .d = "Read enabled ZEDLETs from DIR.", + .v = ZED_ZEDLET_DIR }, + { .o = "-p FILE", .d = "Write daemon's PID to FILE.", + .v = ZED_PID_FILE }, + { .o = "-s FILE", .d = "Write daemon's state to FILE.", + .v = ZED_STATE_FILE }, + { .o = 
"-j JOBS", .d = "Start at most JOBS at once.", + .v = "16" }, + {}, + }; fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed")); fprintf(fp, "\n"); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h", - "Display help."); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L", - "Display license information."); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V", - "Display version information."); + for (oo = iopts; oo->o; ++oo) + fprintf(fp, " %*s %s\n", -8, oo->o, oo->d); fprintf(fp, "\n"); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v", - "Be verbose."); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f", - "Force daemon to run."); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F", - "Run daemon in the foreground."); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M", - "Lock all pages in memory."); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P", - "$PATH for ZED to use (only used by ZTS)."); - fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z", - "Zero state file."); + for (oo = nopts; oo->o; ++oo) + fprintf(fp, " %*s %s\n", -8, oo->o, oo->d); fprintf(fp, "\n"); -#if 0 - fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE", - "Read configuration from FILE.", ZED_CONF_FILE); -#endif - fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR", - "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR); - fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE", - "Write daemon's PID to FILE.", ZED_PID_FILE); - fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE", - "Write daemon's state to FILE.", ZED_STATE_FILE); + for (oo = vopts; oo->o; ++oo) + fprintf(fp, " %*s %s [%s]\n", -8, oo->o, oo->d, oo->v); fprintf(fp, "\n"); exit(got_err ? 
EXIT_FAILURE : EXIT_SUCCESS); @@ -181,20 +165,14 @@ _zed_conf_display_help(const char *prog, int got_err) static void _zed_conf_display_license(void) { - const char **pp; - const char *text[] = { - "The ZFS Event Daemon (ZED) is distributed under the terms of the", - " Common Development and Distribution License (CDDL-1.0)", - " .", - "", + printf( + "The ZFS Event Daemon (ZED) is distributed under the terms of the\n" + " Common Development and Distribution License (CDDL-1.0)\n" + " .\n" + "\n" "Developed at Lawrence Livermore National Laboratory" - " (LLNL-CODE-403049).", - "", - NULL - }; - - for (pp = text; *pp; pp++) - printf("%s\n", *pp); + " (LLNL-CODE-403049).\n" + "\n"); exit(EXIT_SUCCESS); } @@ -229,16 +207,19 @@ _zed_conf_parse_path(char **resultp, const char *path) if (path[0] == '/') { *resultp = strdup(path); - } else if (!getcwd(buf, sizeof (buf))) { - zed_log_die("Failed to get current working dir: %s", - strerror(errno)); - } else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) { - zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); - } else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) { - zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); } else { + if (!getcwd(buf, sizeof (buf))) + zed_log_die("Failed to get current working dir: %s", + strerror(errno)); + + if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf) || + strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) + zed_log_die("Failed to copy path: %s", + strerror(ENAMETOOLONG)); + *resultp = strdup(buf); } + if (!*resultp) zed_log_die("Failed to copy path: %s", strerror(ENOMEM)); } @@ -249,8 +230,9 @@ _zed_conf_parse_path(char **resultp, const char *path) void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) { - const char * const opts = ":hLVc:d:p:P:s:vfFMZ"; + const char * const opts = ":hLVd:p:P:s:vfFMZIj:"; int opt; + unsigned long raw; if (!zcp || !argv || !argv[0]) zed_log_die("Failed to parse options: Internal error"); @@ -260,7 +242,7 
@@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) while ((opt = getopt(argc, argv, opts)) != -1) { switch (opt) { case 'h': - _zed_conf_display_help(argv[0], EXIT_SUCCESS); + _zed_conf_display_help(argv[0], B_FALSE); break; case 'L': _zed_conf_display_license(); @@ -268,12 +250,12 @@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) case 'V': _zed_conf_display_version(); break; - case 'c': - _zed_conf_parse_path(&zcp->conf_file, optarg); - break; case 'd': _zed_conf_parse_path(&zcp->zedlet_dir, optarg); break; + case 'I': + zcp->do_idle = 1; + break; case 'p': _zed_conf_parse_path(&zcp->pid_file, optarg); break; @@ -298,31 +280,30 @@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) case 'Z': zcp->do_zero = 1; break; + case 'j': + errno = 0; + raw = strtoul(optarg, NULL, 0); + if (errno == ERANGE || raw > INT16_MAX) { + zed_log_die("%lu is too many jobs", raw); + } if (raw == 0) { + zed_log_die("0 jobs makes no sense"); + } else { + zcp->max_jobs = raw; + } + break; case '?': default: if (optopt == '?') - _zed_conf_display_help(argv[0], EXIT_SUCCESS); + _zed_conf_display_help(argv[0], B_FALSE); - fprintf(stderr, "%s: %s '-%c'\n\n", argv[0], - "Invalid option", optopt); - _zed_conf_display_help(argv[0], EXIT_FAILURE); + fprintf(stderr, "%s: Invalid option '-%c'\n\n", + argv[0], optopt); + _zed_conf_display_help(argv[0], B_TRUE); break; } } } -/* - * Parse the configuration file into the configuration [zcp]. - * - * FIXME: Not yet implemented. - */ -void -zed_conf_parse_file(struct zed_conf *zcp) -{ - if (!zcp) - zed_log_die("Failed to parse config: %s", strerror(EINVAL)); -} - /* * Scan the [zcp] zedlet_dir for files to exec based on the event class. * Files must be executable by user, but not writable by group or other. @@ -330,8 +311,6 @@ zed_conf_parse_file(struct zed_conf *zcp) * * Return 0 on success with an updated set of zedlets, * or -1 on error with errno set. 
- * - * FIXME: Check if zedlet_dir and all parent dirs are secure. */ int zed_conf_scan_dir(struct zed_conf *zcp) @@ -447,8 +426,6 @@ zed_conf_scan_dir(struct zed_conf *zcp) int zed_conf_write_pid(struct zed_conf *zcp) { - const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; - const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; char buf[PATH_MAX]; int n; char *p; @@ -476,7 +453,7 @@ zed_conf_write_pid(struct zed_conf *zcp) if (p) *p = '\0'; - if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) { + if ((mkdirp(buf, 0755) < 0) && (errno != EEXIST)) { zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s", buf, strerror(errno)); goto err; @@ -486,7 +463,7 @@ zed_conf_write_pid(struct zed_conf *zcp) */ mask = umask(0); umask(mask | 022); - zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode); + zcp->pid_fd = open(zcp->pid_file, O_RDWR | O_CREAT | O_CLOEXEC, 0644); umask(mask); if (zcp->pid_fd < 0) { zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s", @@ -523,7 +500,7 @@ zed_conf_write_pid(struct zed_conf *zcp) errno = ERANGE; zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", zcp->pid_file, strerror(errno)); - } else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) { + } else if (write(zcp->pid_fd, buf, n) != n) { zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", zcp->pid_file, strerror(errno)); } else if (fdatasync(zcp->pid_fd) < 0) { @@ -551,7 +528,6 @@ int zed_conf_open_state(struct zed_conf *zcp) { char dirbuf[PATH_MAX]; - mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; int n; char *p; int rv; @@ -573,7 +549,7 @@ zed_conf_open_state(struct zed_conf *zcp) if (p) *p = '\0'; - if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { + if ((mkdirp(dirbuf, 0755) < 0) && (errno != EEXIST)) { zed_log_msg(LOG_WARNING, "Failed to create directory \"%s\": %s", dirbuf, strerror(errno)); @@ -591,7 +567,7 @@ zed_conf_open_state(struct zed_conf *zcp) (void) unlink(zcp->state_file); 
zcp->state_fd = open(zcp->state_file, - (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); + O_RDWR | O_CREAT | O_CLOEXEC, 0644); if (zcp->state_fd < 0) { zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s", zcp->state_file, strerror(errno)); diff --git a/cmd/zed/zed_conf.h b/cmd/zed/zed_conf.h index 7d6b63b1d7..0b30a1503c 100644 --- a/cmd/zed/zed_conf.h +++ b/cmd/zed/zed_conf.h @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -20,42 +20,39 @@ #include "zed_strings.h" struct zed_conf { - unsigned do_force:1; /* true if force enabled */ - unsigned do_foreground:1; /* true if run in foreground */ - unsigned do_memlock:1; /* true if locking memory */ - unsigned do_verbose:1; /* true if verbosity enabled */ - unsigned do_zero:1; /* true if zeroing state */ - int syslog_facility; /* syslog facility value */ - int min_events; /* RESERVED FOR FUTURE USE */ - int max_events; /* RESERVED FOR FUTURE USE */ - char *conf_file; /* abs path to config file */ char *pid_file; /* abs path to pid file */ - int pid_fd; /* fd to pid file for lock */ char *zedlet_dir; /* abs path to zedlet dir */ - zed_strings_t *zedlets; /* names of enabled zedlets */ char *state_file; /* abs path to state file */ - int state_fd; /* fd to state file */ + libzfs_handle_t *zfs_hdl; /* handle to libzfs */ - int zevent_fd; /* fd for access to zevents */ + zed_strings_t *zedlets; /* names of enabled zedlets */ char *path; /* custom $PATH for zedlets 
to use */ + + int pid_fd; /* fd to pid file for lock */ + int state_fd; /* fd to state file */ + int zevent_fd; /* fd for access to zevents */ + + int16_t max_jobs; /* max zedlets to run at one time */ + + boolean_t do_force:1; /* true if force enabled */ + boolean_t do_foreground:1; /* true if run in foreground */ + boolean_t do_memlock:1; /* true if locking memory */ + boolean_t do_verbose:1; /* true if verbosity enabled */ + boolean_t do_zero:1; /* true if zeroing state */ + boolean_t do_idle:1; /* true if idle enabled */ }; -struct zed_conf *zed_conf_create(void); - +void zed_conf_init(struct zed_conf *zcp); void zed_conf_destroy(struct zed_conf *zcp); void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv); -void zed_conf_parse_file(struct zed_conf *zcp); - int zed_conf_scan_dir(struct zed_conf *zcp); int zed_conf_write_pid(struct zed_conf *zcp); int zed_conf_open_state(struct zed_conf *zcp); - int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]); - int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]); #endif /* !ZED_CONF_H */ diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 174d245232..94e2423606 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -72,6 +72,8 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); + if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE) + zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART); if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) @@ -379,6 +381,7 @@ zed_disk_event_init() return (-1); } + pthread_setname_np(g_mon_tid, "udev monitor"); zed_log_msg(LOG_INFO, "zed_disk_event_init"); return (0); diff --git 
a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c index 2a7ff16fd3..9eaad0e92f 100644 --- a/cmd/zed/zed_event.c +++ b/cmd/zed/zed_event.c @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -15,7 +15,7 @@ #include #include #include -#include /* FIXME: Replace with libzfs_core. */ +#include #include #include #include @@ -28,6 +28,7 @@ #include "zed.h" #include "zed_conf.h" #include "zed_disk_event.h" +#include "zed_event.h" #include "zed_exec.h" #include "zed_file.h" #include "zed_log.h" @@ -40,25 +41,36 @@ /* * Open the libzfs interface. 
*/ -void +int zed_event_init(struct zed_conf *zcp) { if (!zcp) zed_log_die("Failed zed_event_init: %s", strerror(EINVAL)); zcp->zfs_hdl = libzfs_init(); - if (!zcp->zfs_hdl) + if (!zcp->zfs_hdl) { + if (zcp->do_idle) + return (-1); zed_log_die("Failed to initialize libzfs"); + } - zcp->zevent_fd = open(ZFS_DEV, O_RDWR); - if (zcp->zevent_fd < 0) + zcp->zevent_fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); + if (zcp->zevent_fd < 0) { + if (zcp->do_idle) + return (-1); zed_log_die("Failed to open \"%s\": %s", ZFS_DEV, strerror(errno)); + } zfs_agent_init(zcp->zfs_hdl); - if (zed_disk_event_init() != 0) + if (zed_disk_event_init() != 0) { + if (zcp->do_idle) + return (-1); zed_log_die("Failed to initialize disk events"); + } + + return (0); } /* @@ -84,6 +96,47 @@ zed_event_fini(struct zed_conf *zcp) libzfs_fini(zcp->zfs_hdl); zcp->zfs_hdl = NULL; } + + zed_exec_fini(); +} + +static void +_bump_event_queue_length(void) +{ + int zzlm = -1, wr; + char qlen_buf[12] = {0}; /* parameter is int => max "-2147483647\n" */ + long int qlen; + + zzlm = open("/sys/module/zfs/parameters/zfs_zevent_len_max", O_RDWR); + if (zzlm < 0) + goto done; + + if (read(zzlm, qlen_buf, sizeof (qlen_buf)) < 0) + goto done; + qlen_buf[sizeof (qlen_buf) - 1] = '\0'; + + errno = 0; + qlen = strtol(qlen_buf, NULL, 10); + if (errno == ERANGE) + goto done; + + if (qlen <= 0) + qlen = 512; /* default zfs_zevent_len_max value */ + else + qlen *= 2; + + if (qlen > INT_MAX) + qlen = INT_MAX; + wr = snprintf(qlen_buf, sizeof (qlen_buf), "%ld", qlen); + + if (pwrite(zzlm, qlen_buf, wr, 0) < 0) + goto done; + + zed_log_msg(LOG_WARNING, "Bumping queue length to %ld", qlen); + +done: + if (zzlm > -1) + (void) close(zzlm); } /* @@ -124,10 +177,7 @@ zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[]) if (n_dropped > 0) { zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); - /* - * FIXME: Increase max size of event nvlist in - * /sys/module/zfs/parameters/zfs_zevent_len_max ? 
- */ + _bump_event_queue_length(); } if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); @@ -199,7 +249,7 @@ _zed_event_value_is_hex(const char *name) * * All environment variables in [zsp] should be added through this function. */ -static int +static __attribute__((format(printf, 5, 6))) int _zed_event_add_var(uint64_t eid, zed_strings_t *zsp, const char *prefix, const char *name, const char *fmt, ...) { @@ -574,8 +624,6 @@ _zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp, * Convert the nvpair [nvp] to a string which is added to the environment * of the child process. * Return 0 on success, -1 on error. - * - * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()? */ static void _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) @@ -674,23 +722,11 @@ _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) _zed_event_add_var(eid, zsp, prefix, name, "%llu", (u_longlong_t)i64); break; - case DATA_TYPE_NVLIST: - _zed_event_add_var(eid, zsp, prefix, name, - "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ - break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); _zed_event_add_var(eid, zsp, prefix, name, "%s", (str ? 
str : "")); break; - case DATA_TYPE_BOOLEAN_ARRAY: - _zed_event_add_var(eid, zsp, prefix, name, - "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ - break; - case DATA_TYPE_BYTE_ARRAY: - _zed_event_add_var(eid, zsp, prefix, name, - "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ - break; case DATA_TYPE_INT8_ARRAY: _zed_event_add_int8_array(eid, zsp, prefix, nvp); break; @@ -718,9 +754,11 @@ _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) case DATA_TYPE_STRING_ARRAY: _zed_event_add_string_array(eid, zsp, prefix, nvp); break; + case DATA_TYPE_NVLIST: + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_NVLIST_ARRAY: - _zed_event_add_var(eid, zsp, prefix, name, - "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + _zed_event_add_var(eid, zsp, prefix, name, "_NOT_IMPLEMENTED_"); break; default: errno = EINVAL; @@ -872,7 +910,7 @@ _zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) /* * Service the next zevent, blocking until one is available. */ -void +int zed_event_service(struct zed_conf *zcp) { nvlist_t *nvl; @@ -890,20 +928,17 @@ zed_event_service(struct zed_conf *zcp) errno = EINVAL; zed_log_msg(LOG_ERR, "Failed to service zevent: %s", strerror(errno)); - return; + return (EINVAL); } rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE, zcp->zevent_fd); if ((rv != 0) || !nvl) - return; + return (errno); if (n_dropped > 0) { zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); - /* - * FIXME: Increase max size of event nvlist in - * /sys/module/zfs/parameters/zfs_zevent_len_max ? 
- */ + _bump_event_queue_length(); } if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); @@ -941,12 +976,12 @@ zed_event_service(struct zed_conf *zcp) _zed_event_add_time_strings(eid, zsp, etime); - zed_exec_process(eid, class, subclass, - zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd); + zed_exec_process(eid, class, subclass, zcp, zsp); zed_conf_write_state(zcp, eid, etime); zed_strings_destroy(zsp); } nvlist_free(nvl); + return (0); } diff --git a/cmd/zed/zed_event.h b/cmd/zed/zed_event.h index 9f37b80fe6..5606f14a21 100644 --- a/cmd/zed/zed_event.h +++ b/cmd/zed/zed_event.h @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -17,13 +17,13 @@ #include -void zed_event_init(struct zed_conf *zcp); +int zed_event_init(struct zed_conf *zcp); void zed_event_fini(struct zed_conf *zcp); int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[]); -void zed_event_service(struct zed_conf *zcp); +int zed_event_service(struct zed_conf *zcp); #endif /* !ZED_EVENT_H */ diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c index 037037168d..03dcd03ace 100644 --- a/cmd/zed/zed_exec.c +++ b/cmd/zed/zed_exec.c @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). 
* Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -18,16 +18,55 @@ #include #include #include +#include +#include +#include #include #include #include #include -#include "zed_file.h" +#include +#include + +#include "zed_exec.h" #include "zed_log.h" #include "zed_strings.h" #define ZEVENT_FILENO 3 +struct launched_process_node { + avl_node_t node; + pid_t pid; + uint64_t eid; + char *name; +}; + +static int +_launched_process_node_compare(const void *x1, const void *x2) +{ + pid_t p1; + pid_t p2; + + assert(x1 != NULL); + assert(x2 != NULL); + + p1 = ((const struct launched_process_node *) x1)->pid; + p2 = ((const struct launched_process_node *) x2)->pid; + + if (p1 < p2) + return (-1); + else if (p1 == p2) + return (0); + else + return (1); +} + +static pthread_t _reap_children_tid = (pthread_t)-1; +static volatile boolean_t _reap_children_stop; +static avl_tree_t _launched_processes; +static pthread_mutex_t _launched_processes_lock = PTHREAD_MUTEX_INITIALIZER; +static int16_t _launched_processes_limit; + /* * Create an environment string array for passing to execve() using the * NAME=VALUE strings in container [zsp]. 
@@ -78,20 +117,26 @@ _zed_exec_create_env(zed_strings_t *zsp) */ static void _zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, - char *env[], int zfd) + char *env[], int zfd, boolean_t in_foreground) { char path[PATH_MAX]; int n; pid_t pid; int fd; - pid_t wpid; - int status; + struct launched_process_node *node; + sigset_t mask; + struct timespec launch_timeout = + { .tv_sec = 0, .tv_nsec = 200 * 1000 * 1000, }; assert(dir != NULL); assert(prog != NULL); assert(env != NULL); assert(zfd >= 0); + while (__atomic_load_n(&_launched_processes_limit, + __ATOMIC_SEQ_CST) <= 0) + (void) nanosleep(&launch_timeout, NULL); + n = snprintf(path, sizeof (path), "%s/%s", dir, prog); if ((n < 0) || (n >= sizeof (path))) { zed_log_msg(LOG_WARNING, @@ -99,100 +144,179 @@ _zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, prog, eid, strerror(ENAMETOOLONG)); return; } + (void) pthread_mutex_lock(&_launched_processes_lock); pid = fork(); if (pid < 0) { + (void) pthread_mutex_unlock(&_launched_processes_lock); zed_log_msg(LOG_WARNING, "Failed to fork \"%s\" for eid=%llu: %s", prog, eid, strerror(errno)); return; } else if (pid == 0) { + (void) sigemptyset(&mask); + (void) sigprocmask(SIG_SETMASK, &mask, NULL); + (void) umask(022); - if ((fd = open("/dev/null", O_RDWR)) != -1) { + if (in_foreground && /* we're already devnulled if daemonised */ + (fd = open("/dev/null", O_RDWR | O_CLOEXEC)) != -1) { (void) dup2(fd, STDIN_FILENO); (void) dup2(fd, STDOUT_FILENO); (void) dup2(fd, STDERR_FILENO); } (void) dup2(zfd, ZEVENT_FILENO); - zed_file_close_from(ZEVENT_FILENO + 1); execle(path, prog, NULL, env); _exit(127); } /* parent process */ + node = calloc(1, sizeof (*node)); + if (node) { + node->pid = pid; + node->eid = eid; + node->name = strdup(prog); + + avl_add(&_launched_processes, node); + } + (void) pthread_mutex_unlock(&_launched_processes_lock); + + __atomic_sub_fetch(&_launched_processes_limit, 1, __ATOMIC_SEQ_CST); zed_log_msg(LOG_INFO, 
"Invoking \"%s\" eid=%llu pid=%d", prog, eid, pid); +} - /* FIXME: Timeout rogue child processes with sigalarm? */ +static void +_nop(int sig) +{} - /* - * Wait for child process using WNOHANG to limit - * the time spent waiting to 10 seconds (10,000ms). - */ - for (n = 0; n < 1000; n++) { - wpid = waitpid(pid, &status, WNOHANG); - if (wpid == (pid_t)-1) { - if (errno == EINTR) - continue; - zed_log_msg(LOG_WARNING, - "Failed to wait for \"%s\" eid=%llu pid=%d", - prog, eid, pid); - break; - } else if (wpid == 0) { - struct timespec t; +static void * +_reap_children(void *arg) +{ + struct launched_process_node node, *pnode; + pid_t pid; + int status; + struct rusage usage; + struct sigaction sa = {}; - /* child still running */ - t.tv_sec = 0; - t.tv_nsec = 10000000; /* 10ms */ - (void) nanosleep(&t, NULL); - continue; - } + (void) sigfillset(&sa.sa_mask); + (void) sigdelset(&sa.sa_mask, SIGCHLD); + (void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); - if (WIFEXITED(status)) { - zed_log_msg(LOG_INFO, - "Finished \"%s\" eid=%llu pid=%d exit=%d", - prog, eid, pid, WEXITSTATUS(status)); - } else if (WIFSIGNALED(status)) { - zed_log_msg(LOG_INFO, - "Finished \"%s\" eid=%llu pid=%d sig=%d/%s", - prog, eid, pid, WTERMSIG(status), - strsignal(WTERMSIG(status))); + (void) sigemptyset(&sa.sa_mask); + sa.sa_handler = _nop; + sa.sa_flags = SA_NOCLDSTOP; + (void) sigaction(SIGCHLD, &sa, NULL); + + for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) { + (void) pthread_mutex_lock(&_launched_processes_lock); + pid = wait4(0, &status, WNOHANG, &usage); + + if (pid == 0 || pid == (pid_t)-1) { + (void) pthread_mutex_unlock(&_launched_processes_lock); + if (pid == 0 || errno == ECHILD) + pause(); + else if (errno != EINTR) + zed_log_msg(LOG_WARNING, + "Failed to wait for children: %s", + strerror(errno)); } else { - zed_log_msg(LOG_INFO, - "Finished \"%s\" eid=%llu pid=%d status=0x%X", - prog, eid, (unsigned int) status); + memset(&node, 0, sizeof (node)); + node.pid = 
pid; + pnode = avl_find(&_launched_processes, &node, NULL); + if (pnode) { + memcpy(&node, pnode, sizeof (node)); + + avl_remove(&_launched_processes, pnode); + free(pnode); + } + (void) pthread_mutex_unlock(&_launched_processes_lock); + __atomic_add_fetch(&_launched_processes_limit, 1, + __ATOMIC_SEQ_CST); + + usage.ru_utime.tv_sec += usage.ru_stime.tv_sec; + usage.ru_utime.tv_usec += usage.ru_stime.tv_usec; + usage.ru_utime.tv_sec += + usage.ru_utime.tv_usec / (1000 * 1000); + usage.ru_utime.tv_usec %= 1000 * 1000; + + if (WIFEXITED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d " + "time=%llu.%06us exit=%d", + node.name, node.eid, pid, + (unsigned long long) usage.ru_utime.tv_sec, + (unsigned int) usage.ru_utime.tv_usec, + WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d " + "time=%llu.%06us sig=%d/%s", + node.name, node.eid, pid, + (unsigned long long) usage.ru_utime.tv_sec, + (unsigned int) usage.ru_utime.tv_usec, + WTERMSIG(status), + strsignal(WTERMSIG(status))); + } else { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d " + "time=%llu.%06us status=0x%X", + node.name, node.eid, + (unsigned long long) usage.ru_utime.tv_sec, + (unsigned int) usage.ru_utime.tv_usec, + (unsigned int) status); + } + + free(node.name); } - break; } - /* - * kill child process after 10 seconds - */ - if (wpid == 0) { - zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d", - prog, pid); - (void) kill(pid, SIGKILL); + return (NULL); +} + +void +zed_exec_fini(void) +{ + struct launched_process_node *node; + void *ck = NULL; + + if (_reap_children_tid == (pthread_t)-1) + return; + + _reap_children_stop = B_TRUE; + (void) pthread_kill(_reap_children_tid, SIGCHLD); + (void) pthread_join(_reap_children_tid, NULL); + + while ((node = avl_destroy_nodes(&_launched_processes, &ck)) != NULL) { + free(node->name); + free(node); } + avl_destroy(&_launched_processes); + + (void) 
pthread_mutex_destroy(&_launched_processes_lock); + (void) pthread_mutex_init(&_launched_processes_lock, NULL); + + _reap_children_tid = (pthread_t)-1; } /* * Process the event [eid] by synchronously invoking all zedlets with a * matching class prefix. * - * Each executable in [zedlets] from the directory [dir] is matched against - * the event's [class], [subclass], and the "all" class (which matches - * all events). Every zedlet with a matching class prefix is invoked. + * Each executable in [zcp->zedlets] from the directory [zcp->zedlet_dir] + * is matched against the event's [class], [subclass], and the "all" class + * (which matches all events). + * Every zedlet with a matching class prefix is invoked. * The NAME=VALUE strings in [envs] will be passed to the zedlet as * environment variables. * - * The file descriptor [zfd] is the zevent_fd used to track the + * The file descriptor [zcp->zevent_fd] is the zevent_fd used to track the * current cursor location within the zevent nvlist. * * Return 0 on success, -1 on error. 
*/ int zed_exec_process(uint64_t eid, const char *class, const char *subclass, - const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd) + struct zed_conf *zcp, zed_strings_t *envs) { const char *class_strings[4]; const char *allclass = "all"; @@ -201,9 +325,22 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass, char **e; int n; - if (!dir || !zedlets || !envs || zfd < 0) + if (!zcp->zedlet_dir || !zcp->zedlets || !envs || zcp->zevent_fd < 0) return (-1); + if (_reap_children_tid == (pthread_t)-1) { + _launched_processes_limit = zcp->max_jobs; + + if (pthread_create(&_reap_children_tid, NULL, + _reap_children, NULL) != 0) + return (-1); + pthread_setname_np(_reap_children_tid, "reap ZEDLETs"); + + avl_create(&_launched_processes, _launched_process_node_compare, + sizeof (struct launched_process_node), + offsetof(struct launched_process_node, node)); + } + csp = class_strings; if (class) @@ -219,11 +356,13 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass, e = _zed_exec_create_env(envs); - for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) { + for (z = zed_strings_first(zcp->zedlets); z; + z = zed_strings_next(zcp->zedlets)) { for (csp = class_strings; *csp; csp++) { n = strlen(*csp); if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) - _zed_exec_fork_child(eid, dir, z, e, zfd); + _zed_exec_fork_child(eid, zcp->zedlet_dir, + z, e, zcp->zevent_fd, zcp->do_foreground); } } free(e); diff --git a/cmd/zed/zed_exec.h b/cmd/zed/zed_exec.h index 69179c92c5..e4c8d86335 100644 --- a/cmd/zed/zed_exec.h +++ b/cmd/zed/zed_exec.h @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. 
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -16,9 +16,12 @@ #define ZED_EXEC_H #include +#include "zed_strings.h" +#include "zed_conf.h" + +void zed_exec_fini(void); int zed_exec_process(uint64_t eid, const char *class, const char *subclass, - const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, - int zevent_fd); + struct zed_conf *zcp, zed_strings_t *envs); #endif /* !ZED_EXEC_H */ diff --git a/cmd/zed/zed_file.c b/cmd/zed/zed_file.c index 3a1a661faa..b62f68b261 100644 --- a/cmd/zed/zed_file.c +++ b/cmd/zed/zed_file.c @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -12,72 +12,17 @@ * You may not use this file except in compliance with the license. */ +#include #include #include #include #include -#include #include #include #include +#include "zed_file.h" #include "zed_log.h" -/* - * Read up to [n] bytes from [fd] into [buf]. - * Return the number of bytes read, 0 on EOF, or -1 on error. 
- */ -ssize_t -zed_file_read_n(int fd, void *buf, size_t n) -{ - unsigned char *p; - size_t n_left; - ssize_t n_read; - - p = buf; - n_left = n; - while (n_left > 0) { - if ((n_read = read(fd, p, n_left)) < 0) { - if (errno == EINTR) - continue; - else - return (-1); - - } else if (n_read == 0) { - break; - } - n_left -= n_read; - p += n_read; - } - return (n - n_left); -} - -/* - * Write [n] bytes from [buf] out to [fd]. - * Return the number of bytes written, or -1 on error. - */ -ssize_t -zed_file_write_n(int fd, void *buf, size_t n) -{ - const unsigned char *p; - size_t n_left; - ssize_t n_written; - - p = buf; - n_left = n; - while (n_left > 0) { - if ((n_written = write(fd, p, n_left)) < 0) { - if (errno == EINTR) - continue; - else - return (-1); - - } - n_left -= n_written; - p += n_written; - } - return (n); -} - /* * Set an exclusive advisory lock on the open file descriptor [fd]. * Return 0 on success, 1 if a conflicting lock is held by another process, @@ -159,6 +104,13 @@ zed_file_is_locked(int fd) return (lock.l_pid); } + +#if __APPLE__ +#define PROC_SELF_FD "/dev/fd" +#else /* Linux-compatible layout */ +#define PROC_SELF_FD "/proc/self/fd" +#endif + /* * Close all open file descriptors greater than or equal to [lowfd]. * Any errors encountered while closing file descriptors are ignored. 
@@ -166,51 +118,24 @@ zed_file_is_locked(int fd) void zed_file_close_from(int lowfd) { - const int maxfd_def = 256; - int errno_bak; - struct rlimit rl; - int maxfd; + int errno_bak = errno; + int maxfd = 0; int fd; + DIR *fddir; + struct dirent *fdent; - errno_bak = errno; - - if (getrlimit(RLIMIT_NOFILE, &rl) < 0) { - maxfd = maxfd_def; - } else if (rl.rlim_max == RLIM_INFINITY) { - maxfd = maxfd_def; + if ((fddir = opendir(PROC_SELF_FD)) != NULL) { + while ((fdent = readdir(fddir)) != NULL) { + fd = atoi(fdent->d_name); + if (fd > maxfd && fd != dirfd(fddir)) + maxfd = fd; + } + (void) closedir(fddir); } else { - maxfd = rl.rlim_max; + maxfd = sysconf(_SC_OPEN_MAX); } for (fd = lowfd; fd < maxfd; fd++) (void) close(fd); errno = errno_bak; } - -/* - * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically - * closed upon successful execution of one of the exec functions. - * Return 0 on success, or -1 on error. - * - * FIXME: No longer needed? - */ -int -zed_file_close_on_exec(int fd) -{ - int flags; - - if (fd < 0) { - errno = EBADF; - return (-1); - } - flags = fcntl(fd, F_GETFD); - if (flags == -1) - return (-1); - - flags |= FD_CLOEXEC; - - if (fcntl(fd, F_SETFD, flags) == -1) - return (-1); - - return (0); -} diff --git a/cmd/zed/zed_file.h b/cmd/zed/zed_file.h index 05f360d20e..7e3a0efcaf 100644 --- a/cmd/zed/zed_file.h +++ b/cmd/zed/zed_file.h @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). 
@@ -18,10 +18,6 @@ #include #include -ssize_t zed_file_read_n(int fd, void *buf, size_t n); - -ssize_t zed_file_write_n(int fd, void *buf, size_t n); - int zed_file_lock(int fd); int zed_file_unlock(int fd); @@ -30,6 +26,4 @@ pid_t zed_file_is_locked(int fd); void zed_file_close_from(int fd); -int zed_file_close_on_exec(int fd); - #endif /* !ZED_FILE_H */ diff --git a/cmd/zed/zed_log.c b/cmd/zed/zed_log.c index 5a3f2dbdb8..0c4ab6f47d 100644 --- a/cmd/zed/zed_log.c +++ b/cmd/zed/zed_log.c @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). diff --git a/cmd/zed/zed_log.h b/cmd/zed/zed_log.h index a03a4f5396..ed88ad41d7 100644 --- a/cmd/zed/zed_log.h +++ b/cmd/zed/zed_log.h @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). 
diff --git a/cmd/zed/zed_strings.c b/cmd/zed/zed_strings.c index 51b872ac73..52a86e9296 100644 --- a/cmd/zed/zed_strings.c +++ b/cmd/zed/zed_strings.c @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -108,7 +108,7 @@ _zed_strings_node_destroy(zed_strings_node_t *np) * If [key] is specified, it will be used to index the node; otherwise, * the string [val] will be used. */ -zed_strings_node_t * +static zed_strings_node_t * _zed_strings_node_create(const char *key, const char *val) { zed_strings_node_t *np; diff --git a/cmd/zed/zed_strings.h b/cmd/zed/zed_strings.h index 37a84cad7f..804639592f 100644 --- a/cmd/zed/zed_strings.h +++ b/cmd/zed/zed_strings.h @@ -1,9 +1,9 @@ /* - * This file is part of the ZFS Event Daemon (ZED) - * for ZFS on Linux (ZoL) . + * This file is part of the ZFS Event Daemon (ZED). + * * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. - * Refer to the ZoL git commit log for authoritative copyright attribution. + * Refer to the OpenZFS git commit log for authoritative copyright attribution. * * The contents of this file are subject to the terms of the * Common Development and Distribution License Version 1.0 (CDDL-1.0). 
diff --git a/cmd/zfs/Makefile.am b/cmd/zfs/Makefile.am index 8b6ddaa200..1ead457f0f 100644 --- a/cmd/zfs/Makefile.am +++ b/cmd/zfs/Makefile.am @@ -1,9 +1,5 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - sbin_PROGRAMS = zfs zfs_SOURCES = \ @@ -15,7 +11,15 @@ zfs_SOURCES = \ zfs_projectutil.h zfs_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libuutil/libuutil.la \ - $(top_builddir)/lib/libzfs/libzfs.la \ - $(top_builddir)/lib/libzfs_core/libzfs_core.la + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +zfs_LDADD += $(LTLIBINTL) + +if BUILD_FREEBSD +zfs_LDADD += -lgeom -ljail +endif + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index afaa5e881b..672c1e2ec2 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. @@ -29,10 +29,13 @@ * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K + * Copyright 2019 Joyent, Inc. + * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. 
*/ #include #include +#include #include #include #include @@ -49,8 +52,9 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -68,7 +72,6 @@ #include #include #include -#include #ifdef HAVE_IDMAP #include #include @@ -77,12 +80,10 @@ #include "zfs_iter.h" #include "zfs_util.h" #include "zfs_comutil.h" -#include "libzfs_impl.h" #include "zfs_projectutil.h" libzfs_handle_t *g_zfs; -static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; @@ -113,12 +114,18 @@ static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); static int zfs_do_bookmark(int argc, char **argv); static int zfs_do_channel_program(int argc, char **argv); -static int zfs_do_remap(int argc, char **argv); static int zfs_do_load_key(int argc, char **argv); static int zfs_do_unload_key(int argc, char **argv); static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); static int zfs_do_version(int argc, char **argv); +static int zfs_do_redact(int argc, char **argv); +static int zfs_do_wait(int argc, char **argv); + +#ifdef __FreeBSD__ +static int zfs_do_jail(int argc, char **argv); +static int zfs_do_unjail(int argc, char **argv); +#endif /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. 
@@ -167,13 +174,16 @@ typedef enum { HELP_HOLDS, HELP_RELEASE, HELP_DIFF, - HELP_REMAP, HELP_BOOKMARK, HELP_CHANNEL_PROGRAM, HELP_LOAD_KEY, HELP_UNLOAD_KEY, HELP_CHANGE_KEY, - HELP_VERSION + HELP_VERSION, + HELP_REDACT, + HELP_JAIL, + HELP_UNJAIL, + HELP_WAIT, } zfs_help_t; typedef struct zfs_command { @@ -234,10 +244,16 @@ static zfs_command_t command_table[] = { { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, - { "remap", zfs_do_remap, HELP_REMAP }, { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, + { "redact", zfs_do_redact, HELP_REDACT }, + { "wait", zfs_do_wait, HELP_WAIT }, + +#ifdef __FreeBSD__ + { "jail", zfs_do_jail, HELP_JAIL }, + { "unjail", zfs_do_unjail, HELP_UNJAIL }, +#endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -252,9 +268,9 @@ get_usage(zfs_help_t idx) return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: - return (gettext("\tcreate [-p] [-o property=value] ... " + return (gettext("\tcreate [-Pnpuv] [-o property=value] ... " "\n" - "\tcreate [-ps] [-b blocksize] [-o property=value] ... " + "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: return (gettext("\tdestroy [-fnpRrv] \n" @@ -279,30 +295,34 @@ get_usage(zfs_help_t idx) "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" - "\tmount [-lvO] [-o opts] <-a | filesystem>\n")); + "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: - return (gettext("\treceive [-vnsFhu] " + return (gettext("\treceive [-vMnsFhu] " "[-o =] ... [-x ] ...\n" "\t \n" - "\treceive [-vnsFhu] [-o =] ... " + "\treceive [-vMnsFhu] [-o =] ... " "[-x ] ... 
\n" "\t [-d | -e] \n" "\treceive -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" - "\trename [-f] -p \n" + "\trename -p [-f] \n" + "\trename -u [-f] \n" "\trename -r \n")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] " "\n" - "\tsend [-nvPLecw] [-i snapshot|bookmark] " + "\tsend [-DnvPLecw] [-i snapshot|bookmark] " "\n" - "\tsend [-nvPe] -t \n")); + "\tsend [-DnPpvLec] [-i bookmark|snapshot] " + "--redact \n" + "\tsend [-nvPe] -t \n" + "\tsend [-Pnv] --saved filesystem\n")); case HELP_SET: return (gettext("\tset ... " " ...\n")); @@ -312,7 +332,7 @@ get_usage(zfs_help_t idx) return (gettext("\tsnapshot [-r] [-o property=value] ... " "@ ...\n")); case HELP_UNMOUNT: - return (gettext("\tunmount [-f] " + return (gettext("\tunmount [-fu] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " @@ -341,16 +361,16 @@ get_usage(zfs_help_t idx) return (gettext("\tuserspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " - "\n")); + "\n")); case HELP_GROUPSPACE: return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " - "\n")); + "\n")); case HELP_PROJECTSPACE: return (gettext("\tprojectspace [-Hp] [-o field[,...]] " "[-s field] ... \n" - "\t [-S field] ... \n")); + "\t [-S field] ... 
\n")); case HELP_PROJECT: return (gettext("\tproject [-d|-r] \n" "\tproject -c [-0] [-d|-r] [-p id] \n" @@ -365,10 +385,9 @@ get_usage(zfs_help_t idx) case HELP_DIFF: return (gettext("\tdiff [-FHt] " "[snapshot|filesystem]\n")); - case HELP_REMAP: - return (gettext("\tremap \n")); case HELP_BOOKMARK: - return (gettext("\tbookmark \n")); + return (gettext("\tbookmark " + "\n")); case HELP_CHANNEL_PROGRAM: return (gettext("\tprogram [-jn] [-t ] " "[-m ]\n" @@ -381,15 +400,23 @@ get_usage(zfs_help_t idx) "<-a | filesystem|volume>\n")); case HELP_CHANGE_KEY: return (gettext("\tchange-key [-l] [-o keyformat=]\n" - "\t [-o keylocation=] [-o pbkfd2iters=]\n" + "\t [-o keylocation=] [-o pbkdf2iters=]\n" "\t \n" "\tchange-key -i [-l] \n")); case HELP_VERSION: return (gettext("\tversion\n")); + case HELP_REDACT: + return (gettext("\tredact " + " ...\n")); + case HELP_JAIL: + return (gettext("\tjail \n")); + case HELP_UNJAIL: + return (gettext("\tunjail \n")); + case HELP_WAIT: + return (gettext("\twait [-t ] \n")); + default: + __builtin_unreachable(); } - - abort(); - /* NOTREACHED */ } void @@ -414,7 +441,7 @@ safe_malloc(size_t size) return (data); } -void * +static void * safe_realloc(void *data, size_t size) { void *newp; @@ -543,6 +570,8 @@ usage(boolean_t requested) (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "written@"); (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "written#"); + (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); @@ -698,6 +727,32 @@ finish_progress(char *done) pt_header = NULL; } +/* This function checks if the passed fd refers to /dev/null or /dev/zero */ +#ifdef __linux__ +static boolean_t +is_dev_nullzero(int fd) +{ + struct stat st; + fstat(fd, &st); + return (major(st.st_rdev) == 1 && (minor(st.st_rdev) == 3 /* null */ || + minor(st.st_rdev) == 5 /* zero */)); +} +#endif + +static void +note_dev_error(int 
err, int fd) +{ +#ifdef __linux__ + if (err == EINVAL && is_dev_nullzero(fd)) { + (void) fprintf(stderr, + gettext("Error: Writing directly to /dev/{null,zero} files" + " on certain kernels is not currently implemented.\n" + "(As a workaround, " + "try \"zfs send [...] | cat > /dev/null\")\n")); + } +#endif +} + static int zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) { @@ -727,13 +782,12 @@ zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) */ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { - ret = zfs_mount(zhp, NULL, 0); - if (ret == EPERM && geteuid() != 0) { + if (zfs_mount_delegation_check()) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but it may only be " "mounted by root\n")); ret = 1; - } else if (ret != 0) { + } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); ret = 1; @@ -742,6 +796,7 @@ zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) "successfully created, but not shared\n")); ret = 1; } + zfs_commit_all_shares(); } zfs_close(zhp); @@ -861,8 +916,109 @@ usage: } /* - * zfs create [-p] [-o prop=value] ... fs - * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. 
+ */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + nvlist_t *tree, **vdevs; + uint_t nvdevs; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || + nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &vdevs, &nvdevs) != 0) { + return (ZVOL_DEFAULT_BLOCKSIZE); + } + + for (int i = 0; i < nvdevs; i++) { + nvlist_t *nv = vdevs[i]; + uint64_t ashift, ndata, nparity; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) + continue; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, + &ndata) == 0) { + /* dRAID minimum allocation width */ + asize = MAX(asize, ndata * (1ULL << ashift)); + } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &nparity) == 0) { + /* raidz minimum allocation width */ + if (nparity == 1) + asize = MAX(asize, 2 * (1ULL << ashift)); + else + asize = MAX(asize, 4 * (1ULL << ashift)); + } else { + /* mirror or (non-redundant) leaf vdev */ + asize = MAX(asize, 1ULL << ashift); + } + } + + /* + * Calculate the target volblocksize such that more than half + * of the asize is used. The following table is for 4k sectors. + * + * n asize blksz used | n asize blksz used + * -------------------------+--------------------------------- + * 1 4,096 8,192 100% | 9 36,864 32,768 88% + * 2 8,192 8,192 100% | 10 40,960 32,768 80% + * 3 12,288 8,192 66% | 11 45,056 32,768 72% + * 4 16,384 16,384 100% | 12 49,152 32,768 66% + * 5 20,480 16,384 80% | 13 53,248 32,768 61% + * 6 24,576 16,384 66% | 14 57,344 32,768 57% + * 7 28,672 16,384 57% | 15 61,440 32,768 53% + * 8 32,768 32,768 100% | 16 65,536 65,536 100% + * + * This is primarily a concern for dRAID which always allocates + * a full stripe width. For dRAID the default stripe width is + * n=8 in which case the volblocksize is set to 32k. Ignoring + * compression there are no unused sectors. 
This same reasoning + * applies to raidz[2,3] so target 4 sectors to minimize waste. + */ + uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + while (tgt_volblocksize * 2 <= asize) + tgt_volblocksize *= 2; + + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { + + /* Issue a warning when a non-optimal size is requested. */ + if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is less than the default " + "minimum block size (%llu).\nTo reduce wasted " + "space a volblocksize of %llu is recommended.\n"), + (u_longlong_t)volblocksize, + (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, + (u_longlong_t)tgt_volblocksize); + } else if (volblocksize < tgt_volblocksize) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is much less than the " + "minimum allocation\nunit (%llu), which wastes " + "at least %llu%% of space. To reduce wasted " + "space,\nuse a larger volblocksize (%llu is " + "recommended), fewer dRAID data disks\n" + "per group, or smaller sector size (ashift).\n"), + (u_longlong_t)volblocksize, (u_longlong_t)asize, + (u_longlong_t)((100 * (asize - volblocksize)) / + asize), (u_longlong_t)tgt_volblocksize); + } + } else { + volblocksize = tgt_volblocksize; + fnvlist_add_uint64(props, prop, volblocksize); + } + + return (volblocksize); +} + +/* + * zfs create [-Pnpv] [-o prop=value] ... fs + * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems * and volumes. Snapshot creation is handled by 'zfs snapshot'. @@ -874,25 +1030,42 @@ usage: * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. + * + * The '-n' flag is no-op (dry run) mode. 
This will perform a user-space sanity + * check of arguments and properties, but does not check for permissions, + * available space, etc. + * + * The '-u' flag prevents the newly created file system from being mounted. + * + * The '-v' flag is for verbose output. + * + * The '-P' flag is used for parseable output. It implies '-v'. */ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; + zpool_handle_t *zpool_handle = NULL; + nvlist_t *real_props = NULL; uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; + boolean_t dryrun = B_FALSE; + boolean_t nomount = B_FALSE; + boolean_t verbose = B_FALSE; + boolean_t parseable = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; + char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ - while ((c = getopt(argc, argv, ":V:b:so:p")) != -1) { + while ((c = getopt(argc, argv, ":PV:b:nso:puv")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; @@ -908,6 +1081,10 @@ zfs_do_create(int argc, char **argv) nomem(); volsize = intval; break; + case 'P': + verbose = B_TRUE; + parseable = B_TRUE; + break; case 'p': parents = B_TRUE; break; @@ -925,6 +1102,9 @@ zfs_do_create(int argc, char **argv) intval) != 0) nomem(); break; + case 'n': + dryrun = B_TRUE; + break; case 'o': if (!parseprop(props, optarg)) goto error; @@ -932,6 +1112,12 @@ zfs_do_create(int argc, char **argv) case 's': noreserve = B_TRUE; break; + case 'u': + nomount = B_TRUE; + break; + case 'v': + verbose = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); @@ -948,6 +1134,11 @@ zfs_do_create(int argc, char **argv) "used when creating a volume\n")); goto badusage; } + if (nomount && type != ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("'-u' can only be " + "used when creating a filesystem\n")); + goto badusage; + } argc -= optind; argv += optind; @@ -963,14 +1154,9 @@ 
zfs_do_create(int argc, char **argv) goto badusage; } - if (type == ZFS_TYPE_VOLUME && !noreserve) { - zpool_handle_t *zpool_handle; - nvlist_t *real_props = NULL; - uint64_t spa_version; + if (dryrun || type == ZFS_TYPE_VOLUME) { + char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; - zfs_prop_t resv_prop; - char *strval; - char msg[1024]; if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; @@ -979,6 +1165,51 @@ zfs_do_create(int argc, char **argv) *p = '/'; if (zpool_handle == NULL) goto error; + + (void) snprintf(msg, sizeof (msg), + dryrun ? gettext("cannot verify '%s'") : + gettext("cannot create '%s'"), argv[0]); + if (props && (real_props = zfs_valid_proplist(g_zfs, type, + props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { + zpool_close(zpool_handle); + goto error; + } + } + + if (type == ZFS_TYPE_VOLUME) { + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + uint64_t volblocksize = default_volblocksize(zpool_handle, + real_props); + + if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && + nvlist_lookup_string(props, prop, &strval) != 0) { + if (asprintf(&strval, "%llu", + (u_longlong_t)volblocksize) == -1) + nomem(); + nvlist_add_string(props, prop, strval); + free(strval); + } + + /* + * If volsize is not a multiple of volblocksize, round it + * up to the nearest multiple of the volblocksize. 
+ */ + if (volsize % volblocksize) { + volsize = P2ROUNDUP_TYPED(volsize, volblocksize, + uint64_t); + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { + nvlist_free(props); + nomem(); + } + } + } + + if (type == ZFS_TYPE_VOLUME && !noreserve) { + uint64_t spa_version; + zfs_prop_t resv_prop; + spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); if (spa_version >= SPA_VERSION_REFRESERVATION) @@ -986,17 +1217,8 @@ zfs_do_create(int argc, char **argv) else resv_prop = ZFS_PROP_RESERVATION; - (void) snprintf(msg, sizeof (msg), - gettext("cannot create '%s'"), argv[0]); - if (props && (real_props = zfs_valid_proplist(g_zfs, type, - props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { - zpool_close(zpool_handle); - goto error; - } - zpool_close(zpool_handle); - - volsize = zvol_volsize_to_reservation(volsize, real_props); - nvlist_free(real_props); + volsize = zvol_volsize_to_reservation(zpool_handle, volsize, + real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { @@ -1007,6 +1229,10 @@ zfs_do_create(int argc, char **argv) } } } + if (zpool_handle != NULL) { + zpool_close(zpool_handle); + nvlist_free(real_props); + } if (parents && zfs_name_valid(argv[0], type)) { /* @@ -1018,8 +1244,50 @@ zfs_do_create(int argc, char **argv) ret = 0; goto error; } - if (zfs_create_ancestors(g_zfs, argv[0]) != 0) - goto error; + if (verbose) { + (void) printf(parseable ? "create_ancestors\t%s\n" : + dryrun ? "would create ancestors of %s\n" : + "create ancestors of %s\n", argv[0]); + } + if (!dryrun) { + if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { + goto error; + } + } + } + + if (verbose) { + nvpair_t *nvp = NULL; + (void) printf(parseable ? "create\t%s\n" : + dryrun ? 
"would create %s\n" : "create %s\n", argv[0]); + while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { + uint64_t uval; + char *sval; + + switch (nvpair_type(nvp)) { + case DATA_TYPE_UINT64: + VERIFY0(nvpair_value_uint64(nvp, &uval)); + (void) printf(parseable ? + "property\t%s\t%llu\n" : "\t%s=%llu\n", + nvpair_name(nvp), (u_longlong_t)uval); + break; + case DATA_TYPE_STRING: + VERIFY0(nvpair_value_string(nvp, &sval)); + (void) printf(parseable ? + "property\t%s\t%s\n" : "\t%s=%s\n", + nvpair_name(nvp), sval); + break; + default: + (void) fprintf(stderr, "property '%s' " + "has illegal type %d\n", + nvpair_name(nvp), nvpair_type(nvp)); + abort(); + } + } + } + if (dryrun) { + ret = 0; + goto error; } /* pass to libzfs */ @@ -1031,6 +1299,11 @@ zfs_do_create(int argc, char **argv) log_history = B_FALSE; } + if (nomount) { + ret = 0; + goto error; + } + ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); @@ -1502,6 +1775,13 @@ zfs_do_destroy(int argc, char **argv) return (-1); } + /* + * Unfortunately, zfs_bookmark() doesn't honor the + * casesensitivity setting. However, we can't simply + * remove this check, because lzc_destroy_bookmarks() + * ignores non-existent bookmarks, so this is necessary + * to get a proper error message. 
+ */ if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); @@ -1881,7 +2161,7 @@ zfs_do_get(int argc, char **argv) flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", - "volume", "snapshot", "bookmark", + "volume", "snapshot", "snap", "bookmark", "all", NULL }; switch (getsubopt(&optarg, type_subopts, @@ -1893,12 +2173,13 @@ zfs_do_get(int argc, char **argv) types |= ZFS_TYPE_VOLUME; break; case 2: + case 3: types |= ZFS_TYPE_SNAPSHOT; break; - case 3: + case 4: types |= ZFS_TYPE_BOOKMARK; break; - case 4: + case 5: types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; break; @@ -1931,11 +2212,11 @@ zfs_do_get(int argc, char **argv) fields = argv[0]; /* - * Handle users who want to get all snapshots of the current - * dataset (ex. 'zfs get -t snapshot refer '). + * Handle users who want to get all snapshots or bookmarks + * of a dataset (ex. 'zfs get -t snapshot refer '). */ - if (types == ZFS_TYPE_SNAPSHOT && argc > 1 && - (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { + if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && + argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } @@ -2345,11 +2626,13 @@ zfs_do_upgrade(int argc, char **argv) /* * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] - * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot + * [-S field [-S field]...] [-t type[,...]] + * filesystem | snapshot | path * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] - * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot + * [-S field [-S field]...] [-t type[,...]] + * filesystem | snapshot | path * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...] - * [-S field [-S field]...] filesystem | snapshot + * [-S field [-S field]...] 
filesystem | snapshot | path * * -H Scripted mode; elide headers and separate columns by tabs. * -i Translate SID to POSIX ID. @@ -3055,10 +3338,10 @@ zfs_do_userspace(int argc, char **argv) } while (delim != NULL); } - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | + if ((zhp = zfs_path_to_zhandle(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) return (1); - if (zhp->zfs_head_type != ZFS_TYPE_FILESYSTEM) { + if (zfs_get_underlying_type(zhp) != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("operation is only applicable " "to filesystems and their snapshots\n")); zfs_close(zhp); @@ -3435,11 +3718,11 @@ zfs_do_list(int argc, char **argv) types &= ~ZFS_TYPE_SNAPSHOT; /* - * Handle users who want to list all snapshots of the current - * dataset (ex. 'zfs list -t snapshot '). + * Handle users who want to list all snapshots or bookmarks + * of the current dataset (ex. 'zfs list -t snapshot '). */ - if (types == ZFS_TYPE_SNAPSHOT && argc > 0 && - (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { + if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && + argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } @@ -3468,36 +3751,40 @@ zfs_do_list(int argc, char **argv) } /* - * zfs rename [-f] + * zfs rename [-fu] * zfs rename [-f] -p - * zfs rename -r + * zfs rename [-u] -r * * Renames the given dataset to another of the same type. * * The '-p' flag creates all the non-existing ancestors of the target first. + * The '-u' flag prevents file systems from being remounted during rename. 
*/ /* ARGSUSED */ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; + renameflags_t flags = { 0 }; int c; int ret = 0; - boolean_t recurse = B_FALSE; + int types; boolean_t parents = B_FALSE; - boolean_t force_unmount = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "prf")) != -1) { + while ((c = getopt(argc, argv, "pruf")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': - recurse = B_TRUE; + flags.recursive = B_TRUE; + break; + case 'u': + flags.nounmount = B_TRUE; break; case 'f': - force_unmount = B_TRUE; + flags.forceunmount = B_TRUE; break; case '?': default: @@ -3526,20 +3813,32 @@ zfs_do_rename(int argc, char **argv) usage(B_FALSE); } - if (recurse && parents) { + if (flags.recursive && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } - if (recurse && strchr(argv[0], '@') == 0) { + if (flags.nounmount && parents) { + (void) fprintf(stderr, gettext("-u and -p options are mutually " + "exclusive\n")); + usage(B_FALSE); + } + + if (flags.recursive && strchr(argv[0], '@') == 0) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } - if ((zhp = zfs_open(g_zfs, argv[0], parents ? ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME : ZFS_TYPE_DATASET)) == NULL) + if (flags.nounmount) + types = ZFS_TYPE_FILESYSTEM; + else if (parents) + types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + else + types = ZFS_TYPE_DATASET; + + if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. 
*/ @@ -3549,7 +3848,7 @@ zfs_do_rename(int argc, char **argv) return (1); } - ret = (zfs_rename(zhp, argv[1], recurse, force_unmount) != 0); + ret = (zfs_rename(zhp, argv[1], flags) != 0); zfs_close(zhp); return (ret); @@ -3596,6 +3895,82 @@ zfs_do_promote(int argc, char **argv) return (ret); } +static int +zfs_do_redact(int argc, char **argv) +{ + char *snap = NULL; + char *bookname = NULL; + char **rsnaps = NULL; + int numrsnaps = 0; + argv++; + argc--; + if (argc < 3) { + (void) fprintf(stderr, gettext("too few arguments\n")); + usage(B_FALSE); + } + + snap = argv[0]; + bookname = argv[1]; + rsnaps = argv + 2; + numrsnaps = argc - 2; + + nvlist_t *rsnapnv = fnvlist_alloc(); + + for (int i = 0; i < numrsnaps; i++) { + fnvlist_add_boolean(rsnapnv, rsnaps[i]); + } + + int err = lzc_redact(snap, bookname, rsnapnv); + fnvlist_free(rsnapnv); + + switch (err) { + case 0: + break; + case ENOENT: + (void) fprintf(stderr, + gettext("provided snapshot %s does not exist\n"), snap); + break; + case EEXIST: + (void) fprintf(stderr, gettext("specified redaction bookmark " + "(%s) provided already exists\n"), bookname); + break; + case ENAMETOOLONG: + (void) fprintf(stderr, gettext("provided bookmark name cannot " + "be used, final name would be too long\n")); + break; + case E2BIG: + (void) fprintf(stderr, gettext("too many redaction snapshots " + "specified\n")); + break; + case EINVAL: + if (strchr(bookname, '#') != NULL) + (void) fprintf(stderr, gettext( + "redaction bookmark name must not contain '#'\n")); + else + (void) fprintf(stderr, gettext( + "redaction snapshot must be descendent of " + "snapshot being redacted\n")); + break; + case EALREADY: + (void) fprintf(stderr, gettext("attempted to redact redacted " + "dataset or with respect to redacted dataset\n")); + break; + case ENOTSUP: + (void) fprintf(stderr, gettext("redaction bookmarks feature " + "not enabled\n")); + break; + case EXDEV: + (void) fprintf(stderr, gettext("potentially invalid redaction " + "snapshot; 
full dataset names required\n")); + break; + default: + (void) fprintf(stderr, gettext("internal error: %s\n"), + strerror(errno)); + } + + return (err); +} + /* * zfs rollback [-rRf] * @@ -4007,6 +4382,7 @@ usage: return (-1); } + /* * Send a backup stream to stdout. */ @@ -4021,10 +4397,12 @@ zfs_do_send(int argc, char **argv) sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; - boolean_t extraverbose = B_FALSE; + char *redactbook = NULL; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, + {"skip-missing", no_argument, NULL, 's'}, + {"redact", required_argument, NULL, 'd'}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, @@ -4037,12 +4415,13 @@ zfs_do_send(int argc, char **argv) {"raw", no_argument, NULL, 'w'}, {"backup", no_argument, NULL, 'b'}, {"holds", no_argument, NULL, 'h'}, + {"saved", no_argument, NULL, 'S'}, {0, 0, 0, 0} }; /* check options */ - while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwb", long_options, - NULL)) != -1) { + while ((c = getopt_long(argc, argv, ":i:I:RsDpvnPLeht:cwbd:S", + long_options, NULL)) != -1) { switch (c) { case 'i': if (fromname) @@ -4058,6 +4437,12 @@ zfs_do_send(int argc, char **argv) case 'R': flags.replicate = B_TRUE; break; + case 's': + flags.skipmissing = B_TRUE; + break; + case 'd': + redactbook = optarg; + break; case 'p': flags.props = B_TRUE; break; @@ -4069,16 +4454,16 @@ zfs_do_send(int argc, char **argv) break; case 'P': flags.parsable = B_TRUE; - flags.verbose = B_TRUE; break; case 'v': - if (flags.verbose) - extraverbose = B_TRUE; - flags.verbose = B_TRUE; + flags.verbosity++; flags.progress = B_TRUE; break; case 'D': - flags.dedup = B_TRUE; + (void) fprintf(stderr, + gettext("WARNING: deduplicated send is no " + "longer supported. 
A regular,\n" + "non-deduplicated stream will be generated.\n\n")); break; case 'n': flags.dryrun = B_TRUE; @@ -4101,6 +4486,9 @@ zfs_do_send(int argc, char **argv) flags.embed_data = B_TRUE; flags.largeblock = B_TRUE; break; + case 'S': + flags.saved = B_TRUE; + break; case ':': /* * If a parameter was not passed, optopt contains the @@ -4122,7 +4510,6 @@ zfs_do_send(int argc, char **argv) usage(B_FALSE); break; case '?': - /*FALLTHROUGH*/ default: /* * If an invalid flag was passed, optopt contains the @@ -4142,19 +4529,22 @@ zfs_do_send(int argc, char **argv) } } + if (flags.parsable && flags.verbosity == 0) + flags.verbosity = 1; + argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || - flags.backup || flags.dedup) { + flags.backup || flags.holds || + flags.saved || redactbook != NULL) { (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } - if (argc != 0) { - (void) fprintf(stderr, gettext("no additional " - "arguments are permitted with -t\n")); + if (argc > 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { @@ -4169,6 +4559,29 @@ zfs_do_send(int argc, char **argv) } } + if (flags.saved) { + if (fromname != NULL || flags.replicate || flags.props || + flags.doall || flags.backup || + flags.holds || flags.largeblock || flags.embed_data || + flags.compress || flags.raw || redactbook != NULL) { + (void) fprintf(stderr, gettext("incompatible flags " + "combined with saved send flag\n")); + usage(B_FALSE); + } + if (strchr(argv[0], '@') != NULL) { + (void) fprintf(stderr, gettext("saved send must " + "specify the dataset with partially-received " + "state\n")); + usage(B_FALSE); + } + } + + if (flags.raw && redactbook != NULL) { + (void) fprintf(stderr, + gettext("Error: raw sends may not be redacted.\n")); + return (1); + } + if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be 
written to a terminal.\n" @@ -4176,50 +4589,96 @@ zfs_do_send(int argc, char **argv) return (1); } - if (resume_token != NULL) { - return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, - resume_token)); - } - - /* - * Special case sending a filesystem, or from a bookmark. - */ - if (strchr(argv[0], '@') == NULL || - (fromname && strchr(fromname, '#') != NULL)) { - char frombuf[ZFS_MAX_DATASET_NAME_LEN]; - - if (flags.replicate || flags.doall || flags.props || - flags.backup || flags.dedup || flags.holds || - (strchr(argv[0], '@') == NULL && - (flags.dryrun || flags.verbose || flags.progress))) { - (void) fprintf(stderr, gettext("Error: " - "Unsupported flag with filesystem or bookmark.\n")); - return (1); - } - + if (flags.saved) { zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); + err = zfs_send_saved(zhp, &flags, STDOUT_FILENO, + resume_token); + if (err != 0) + note_dev_error(errno, STDOUT_FILENO); + zfs_close(zhp); + return (err != 0); + } else if (resume_token != NULL) { + err = zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, + resume_token); + if (err != 0) + note_dev_error(errno, STDOUT_FILENO); + return (err); + } + + if (flags.skipmissing && !flags.replicate) { + (void) fprintf(stderr, + gettext("skip-missing flag can only be used in " + "conjunction with replicate\n")); + usage(B_FALSE); + } + + /* + * For everything except -R and -I, use the new, cleaner code path. + */ + if (!(flags.replicate || flags.doall)) { + char frombuf[ZFS_MAX_DATASET_NAME_LEN]; + + if (fromname != NULL && (strchr(fromname, '#') == NULL && + strchr(fromname, '@') == NULL)) { + /* + * Neither bookmark nor snapshot was specified. Print a + * warning, and assume snapshot. + */ + (void) fprintf(stderr, "Warning: incremental source " + "didn't specify type, assuming snapshot. 
Use '@' " + "or '#' prefix to avoid ambiguity.\n"); + (void) snprintf(frombuf, sizeof (frombuf), "@%s", + fromname); + fromname = frombuf; + } if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. */ + char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; - (void) strlcat(frombuf, fromname, sizeof (frombuf)); + (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); fromname = frombuf; } - err = zfs_send_one(zhp, fromname, STDOUT_FILENO, flags); + + zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); + if (zhp == NULL) + return (1); + err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, + redactbook); zfs_close(zhp); + if (err != 0) + note_dev_error(errno, STDOUT_FILENO); return (err != 0); } - cp = strchr(argv[0], '@'); + if (fromname != NULL && strchr(fromname, '#')) { + (void) fprintf(stderr, + gettext("Error: multiple snapshots cannot be " + "sent from a bookmark.\n")); + return (1); + } + + if (redactbook != NULL) { + (void) fprintf(stderr, gettext("Error: multiple snapshots " + "cannot be sent redacted.\n")); + return (1); + } + + if ((cp = strchr(argv[0], '@')) == NULL) { + (void) fprintf(stderr, gettext("Error: " + "Unsupported flag with filesystem or bookmark.\n")); + return (1); + } *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); @@ -4262,9 +4721,9 @@ zfs_do_send(int argc, char **argv) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, - extraverbose ? &dbgnv : NULL); + flags.verbosity >= 3 ? &dbgnv : NULL); - if (extraverbose && dbgnv != NULL) { + if (flags.verbosity >= 3 && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. 
Make it print to stderr @@ -4275,6 +4734,7 @@ zfs_do_send(int argc, char **argv) nvlist_free(dbgnv); } zfs_close(zhp); + note_dev_error(errno, STDOUT_FILENO); return (err != 0); } @@ -4294,7 +4754,7 @@ zfs_do_receive(int argc, char **argv) nomem(); /* check options */ - while ((c = getopt(argc, argv, ":o:x:dehnuvFsA")) != -1) { + while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { @@ -4329,6 +4789,9 @@ zfs_do_receive(int argc, char **argv) case 'h': flags.skipholds = B_TRUE; break; + case 'M': + flags.forceunmount = B_TRUE; + break; case 'n': flags.dryrun = B_TRUE; break; @@ -4465,7 +4928,6 @@ zfs_do_receive(int argc, char **argv) #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" -#define ZFS_DELEG_PERM_REMAP "remap" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" @@ -4493,7 +4955,6 @@ static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, - { ZFS_DELEG_PERM_REMAP, ZFS_DELEG_NOTE_REMAP }, { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY }, { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY }, @@ -4894,7 +5355,6 @@ parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) zfs_deleg_who_type_t perm_type = name[0]; char perm_locality = name[1]; const char *perm_name = name + 3; - boolean_t is_set = B_TRUE; who_perm_t *who_perm = NULL; assert('$' == name[2]); @@ -4924,57 +5384,63 @@ parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) assert(!"unhandled zfs_deleg_who_type_t"); } - if (is_set) { - who_perm_node_t *found_node = NULL; - who_perm_node_t *node = safe_malloc( - sizeof (who_perm_node_t)); - who_perm = &node->who_perm; - uu_avl_index_t idx = 0; + who_perm_node_t *found_node = NULL; + who_perm_node_t *node = safe_malloc( + sizeof 
(who_perm_node_t)); + who_perm = &node->who_perm; + uu_avl_index_t idx = 0; - uu_avl_node_init(node, &node->who_avl_node, avl_pool); - who_perm_init(who_perm, fsperm, perm_type, perm_name); + uu_avl_node_init(node, &node->who_avl_node, avl_pool); + who_perm_init(who_perm, fsperm, perm_type, perm_name); - if ((found_node = uu_avl_find(avl, node, NULL, &idx)) - == NULL) { - if (avl == fsperm->fsp_uge_avl) { - uid_t rid = 0; - struct passwd *p = NULL; - struct group *g = NULL; - const char *nice_name = NULL; + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) { + if (avl == fsperm->fsp_uge_avl) { + uid_t rid = 0; + struct passwd *p = NULL; + struct group *g = NULL; + const char *nice_name = NULL; - switch (perm_type) { - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_USER: - rid = atoi(perm_name); - p = getpwuid(rid); - if (p) - nice_name = p->pw_name; - break; - case ZFS_DELEG_GROUP_SETS: - case ZFS_DELEG_GROUP: - rid = atoi(perm_name); - g = getgrgid(rid); - if (g) - nice_name = g->gr_name; - break; + switch (perm_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + rid = atoi(perm_name); + p = getpwuid(rid); + if (p) + nice_name = p->pw_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + rid = atoi(perm_name); + g = getgrgid(rid); + if (g) + nice_name = g->gr_name; + break; - default: - break; - } - - if (nice_name != NULL) - (void) strlcpy( - node->who_perm.who_ug_name, - nice_name, 256); + default: + break; } - uu_avl_insert(avl, node, idx); - } else { - node = found_node; - who_perm = &node->who_perm; + if (nice_name != NULL) { + (void) strlcpy( + node->who_perm.who_ug_name, + nice_name, 256); + } else { + /* User or group unknown */ + (void) snprintf( + node->who_perm.who_ug_name, + sizeof (node->who_perm.who_ug_name), + "(unknown: %d)", rid); + } } + + uu_avl_insert(avl, node, idx); + } else { + node = found_node; + who_perm = &node->who_perm; } - VERIFY3P(who_perm, !=, NULL); + + assert(who_perm != NULL); (void) 
parse_who_perm(who_perm, nvl2, perm_locality); } @@ -5497,9 +5963,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) if (p != NULL) rid = p->pw_uid; - else { + else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( - "invalid user %s"), curr); + "invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { @@ -5511,9 +5977,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) if (g != NULL) rid = g->gr_gid; - else { + else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( - "invalid group %s"), curr); + "invalid group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else { @@ -5539,7 +6005,7 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) rid = g->gr_gid; } else { (void) snprintf(errbuf, 256, gettext( - "invalid user/group %s"), curr); + "invalid user/group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } @@ -5954,7 +6420,7 @@ typedef struct holds_cbdata { size_t cb_max_taglen; } holds_cbdata_t; -#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y" +#define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" #define DATETIME_BUF_LEN (32) /* * @@ -6293,9 +6759,9 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use share(1M) to " - "share this filesystem, or set " - "sharenfs property on\n")); + (void) fprintf(stderr, gettext("use exports(5) or " + "smb.conf(5) to share this filesystem, or set " + "the sharenfs or sharesmb property\n")); return (1); } @@ -6310,7 +6776,7 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use %s(1M) to " + (void) fprintf(stderr, gettext("use %s(8) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } @@ 
-6343,7 +6809,18 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, zfs_get_name(zhp)); return (1); } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { - return (0); + /* + * When performing a 'zfs mount -a', we skip any mounts for + * datasets that have 'noauto' set. Sharing a dataset with + * 'noauto' set is only allowed if it's mounted. + */ + if (op == OP_MOUNT) + return (0); + if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { + /* also purge it from existing exports */ + zfs_unshareall_bypath(zhp, mountpoint); + return (0); + } } /* @@ -6380,6 +6857,17 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, return (1); } + if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Dataset is not complete, was created by receiving " + "a redacted zfs send stream.\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } + /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the @@ -6465,9 +6953,6 @@ report_mount_progress(int current, int total) time_t now = time(NULL); char info[32]; - /* report 1..n instead of 0..n-1 */ - ++current; - /* display header if we're here for the first time */ if (current == 1) { set_progress_header(gettext("Mounting ZFS filesystems")); @@ -6538,7 +7023,7 @@ share_mount(int op, int argc, char **argv) int flags = 0; /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:O" : "al")) + while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":alvo:Of" : "al")) != -1) { switch (c) { case 'a': @@ -6566,6 +7051,9 @@ share_mount(int op, int argc, char **argv) case 'O': flags |= MS_OVERLAY; break; + case 'f': + flags |= MS_FORCE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -6607,8 +7095,7 @@ share_mount(int op, int argc, char **argv) get_all_datasets(&cb, verbose); if (cb.cb_used == 0) { - if (options != NULL) - free(options); + free(options); return (0); } @@ -6630,12 +7117,15 @@ share_mount(int op, int argc, char **argv) zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, share_mount_one_cb, &share_mount_state, op == OP_MOUNT && !(flags & MS_CRYPT)); + zfs_commit_all_shares(); + ret = share_mount_state.sm_status; for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); } else if (argc == 0) { + FILE *mnttab; struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { @@ -6651,14 +7141,12 @@ share_mount(int op, int argc, char **argv) * automatically. */ - /* Reopen MNTTAB to prevent reading stale data from open file */ - if (freopen(MNTTAB, "r", mnttab_file) == NULL) { - if (options != NULL) - free(options); + if ((mnttab = fopen(MNTTAB, "re")) == NULL) { + free(options); return (ENOENT); } - while (getmntent(mnttab_file, &entry) == 0) { + while (getmntent(mnttab, &entry) == 0) { if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || strchr(entry.mnt_special, '@') != NULL) continue; @@ -6667,6 +7155,7 @@ share_mount(int op, int argc, char **argv) entry.mnt_mountp); } + (void) fclose(mnttab); } else { zfs_handle_t *zhp; @@ -6682,13 +7171,12 @@ share_mount(int op, int argc, char **argv) } else { ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, options); + zfs_commit_all_shares(); zfs_close(zhp); } } - if (options != NULL) - free(options); - + free(options); return (ret); } @@ -6747,33 +7235,11 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) const char *cmdname = (op == OP_SHARE) ? 
"unshare" : "unmount"; ino_t path_inode; - /* - * Search for the path in /proc/self/mounts. Rather than looking for the - * specific path, which can be fooled by non-standard paths (i.e. ".." - * or "//"), we stat() the path and search for the corresponding - * (major,minor) device pair. - */ - if (stat64(path, &statbuf) != 0) { - (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), - cmdname, path, strerror(errno)); - return (1); - } - path_inode = statbuf.st_ino; - /* * Search for the given (major,minor) pair in the mount table. */ - /* Reopen MNTTAB to prevent reading stale data from open file */ - if (freopen(MNTTAB, "r", mnttab_file) == NULL) - return (ENOENT); - - while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) { - if (entry.mnt_major == major(statbuf.st_dev) && - entry.mnt_minor == minor(statbuf.st_dev)) - break; - } - if (ret != 0) { + if (getextmntent(path, &entry, &statbuf) != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); @@ -6791,6 +7257,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) } return (ret != 0); } + path_inode = statbuf.st_ino; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " @@ -6833,6 +7300,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) "not currently shared\n"), path); } else { ret = zfs_unshareall_bypath(zhp, path); + zfs_commit_all_shares(); } } else { char mtpt_prop[ZFS_MAXPROPLEN]; @@ -6874,13 +7342,16 @@ unshare_unmount(int op, int argc, char **argv) char sharesmb[ZFS_MAXPROPLEN]; /* check options */ - while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "af")) != -1) { + while ((c = getopt(argc, argv, op == OP_SHARE ? 
":a" : "afu")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'f': - flags = MS_FORCE; + flags |= MS_FORCE; + break; + case 'u': + flags |= MS_CRYPT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -6912,6 +7383,7 @@ unshare_unmount(int op, int argc, char **argv) * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. */ + FILE *mnttab; struct mnttab entry; uu_avl_pool_t *pool; uu_avl_t *tree = NULL; @@ -6944,11 +7416,10 @@ unshare_unmount(int op, int argc, char **argv) ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) nomem(); - /* Reopen MNTTAB to prevent reading stale data from open file */ - if (freopen(MNTTAB, "r", mnttab_file) == NULL) + if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); - while (getmntent(mnttab_file, &entry) == 0) { + while (getmntent(mnttab, &entry) == 0) { /* ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) @@ -7000,6 +7471,7 @@ unshare_unmount(int op, int argc, char **argv) if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) continue; + break; default: break; } @@ -7018,6 +7490,7 @@ unshare_unmount(int op, int argc, char **argv) free(node); } } + (void) fclose(mnttab); /* * Walk the AVL tree in reverse, unmounting each filesystem and @@ -7028,8 +7501,9 @@ unshare_unmount(int op, int argc, char **argv) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { - uu_avl_remove(tree, node); + const char *mntarg = NULL; + uu_avl_remove(tree, node); switch (op) { case OP_SHARE: if (zfs_unshareall_bytype(node->un_zhp, @@ -7039,7 +7513,7 @@ unshare_unmount(int op, int argc, char **argv) case OP_MOUNT: if (zfs_unmount(node->un_zhp, - node->un_zhp->zfs_name, flags) != 0) + mntarg, flags) != 0) ret = 1; break; } @@ -7049,6 +7523,9 @@ unshare_unmount(int op, int argc, char **argv) free(node); } + if (op == OP_SHARE) + zfs_commit_shares(protocol); + uu_avl_walk_end(walk); uu_avl_destroy(tree); 
uu_avl_pool_destroy(pool); @@ -7099,8 +7576,8 @@ unshare_unmount(int op, int argc, char **argv) "unshare '%s': legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " - "unshare(1M) to unshare this " - "filesystem\n")); + "exports(5) or smb.conf(5) to unshare " + "this filesystem\n")); ret = 1; } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot " @@ -7118,7 +7595,7 @@ unshare_unmount(int op, int argc, char **argv) "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " - "umount(1M) to unmount this " + "umount(8) to unmount this " "filesystem\n")); ret = 1; } else if (!zfs_is_mounted(zhp, NULL)) { @@ -7140,8 +7617,8 @@ unshare_unmount(int op, int argc, char **argv) } /* - * zfs unmount -a - * zfs unmount filesystem + * zfs unmount [-fu] -a + * zfs unmount [-fu] filesystem * * Unmount all filesystems, or a specific ZFS filesystem. */ @@ -7163,21 +7640,6 @@ zfs_do_unshare(int argc, char **argv) return (unshare_unmount(OP_SHARE, argc, argv)); } -static int -disable_command_idx(char *command) -{ - for (int i = 0; i < NCOMMAND; i++) { - if (command_table[i].name == NULL) - continue; - - if (strcmp(command, command_table[i].name) == 0) { - command_table[i].name = NULL; - return (0); - } - } - return (1); -} - static int find_command_idx(char *command, int *idx) { @@ -7281,66 +7743,18 @@ out: return (err != 0); } - /* - * zfs remap + * zfs bookmark | * - * N.B. The remap command has been disabled and may be removed in the future. - * - * Remap the indirect blocks in the given filesystem or volume so that they no - * longer reference blocks on previously removed vdevs and we can eventually - * shrink the size of the indirect mapping objects for the previously removed - * vdevs. Note that remapping all blocks might not be possible and that - * references from snapshots will still exist and cannot be remapped. 
- * - * This functionality is no longer particularly useful now that the removal - * code can map large chunks. Furthermore, explaining what this command - * does and why it may be useful requires a detailed understanding of the - * internals of device removal. These are details users should not be - * bothered with. If required, the remap command can be re-enabled by - * setting the ZFS_REMAP_ENABLED environment variable. - * - * > ZFS_REMAP_ENABLED=yes zfs remap - */ -static int -zfs_do_remap(int argc, char **argv) -{ - const char *fsname; - int err = 0; - int c; - - /* check options */ - while ((c = getopt(argc, argv, "")) != -1) { - switch (c) { - case '?': - (void) fprintf(stderr, - gettext("invalid option '%c'\n"), optopt); - usage(B_FALSE); - } - } - - if (argc != 2) { - (void) fprintf(stderr, gettext("wrong number of arguments\n")); - usage(B_FALSE); - } - - fsname = argv[1]; - err = zfs_remap_indirects(g_zfs, fsname); - - return (err); -} - -/* - * zfs bookmark - * - * Creates a bookmark with the given name from the given snapshot. + * Creates a bookmark with the given name from the source snapshot + * or creates a copy of an existing source bookmark. 
*/ static int zfs_do_bookmark(int argc, char **argv) { - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - char bookname[ZFS_MAX_DATASET_NAME_LEN]; - zfs_handle_t *zhp; + char *source, *bookname; + char expbuf[ZFS_MAX_DATASET_NAME_LEN]; + int source_type; nvlist_t *nvl; int ret = 0; int c; @@ -7360,7 +7774,7 @@ zfs_do_bookmark(int argc, char **argv) /* check number of arguments */ if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); + (void) fprintf(stderr, gettext("missing source argument\n")); goto usage; } if (argc < 2) { @@ -7368,50 +7782,72 @@ zfs_do_bookmark(int argc, char **argv) goto usage; } - if (strchr(argv[0], '@') == NULL) { + source = argv[0]; + bookname = argv[1]; + + if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { (void) fprintf(stderr, - gettext("invalid snapshot name '%s': " - "must contain a '@'\n"), argv[0]); + gettext("invalid source name '%s': " + "must contain a '@' or '#'\n"), source); goto usage; } - if (strchr(argv[1], '#') == NULL) { + if (strchr(bookname, '#') == NULL) { (void) fprintf(stderr, gettext("invalid bookmark name '%s': " - "must contain a '#'\n"), argv[1]); + "must contain a '#'\n"), bookname); goto usage; } - if (argv[0][0] == '@') { - /* - * Snapshot name begins with @. - * Default to same fs as bookmark. - */ - (void) strlcpy(snapname, argv[1], sizeof (snapname)); - *strchr(snapname, '#') = '\0'; - (void) strlcat(snapname, argv[0], sizeof (snapname)); - } else { - (void) strlcpy(snapname, argv[0], sizeof (snapname)); - } - if (argv[1][0] == '#') { - /* - * Bookmark name begins with #. - * Default to same fs as snapshot. 
- */ - (void) strlcpy(bookname, argv[0], sizeof (bookname)); - *strchr(bookname, '@') = '\0'; - (void) strlcat(bookname, argv[1], sizeof (bookname)); - } else { - (void) strlcpy(bookname, argv[1], sizeof (bookname)); + /* + * expand source or bookname to full path: + * one of them may be specified as short name + */ + { + char **expand; + char *source_short, *bookname_short; + source_short = strpbrk(source, "@#"); + bookname_short = strpbrk(bookname, "#"); + if (source_short == source && + bookname_short == bookname) { + (void) fprintf(stderr, gettext( + "either source or bookmark must be specified as " + "full dataset paths")); + goto usage; + } else if (source_short != source && + bookname_short != bookname) { + expand = NULL; + } else if (source_short != source) { + strlcpy(expbuf, source, sizeof (expbuf)); + expand = &bookname; + } else if (bookname_short != bookname) { + strlcpy(expbuf, bookname, sizeof (expbuf)); + expand = &source; + } else { + abort(); + } + if (expand != NULL) { + *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ + (void) strlcat(expbuf, *expand, sizeof (expbuf)); + *expand = expbuf; + } } - zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT); + /* determine source type */ + switch (*strpbrk(source, "@#")) { + case '@': source_type = ZFS_TYPE_SNAPSHOT; break; + case '#': source_type = ZFS_TYPE_BOOKMARK; break; + default: abort(); + } + + /* test the source exists */ + zfs_handle_t *zhp; + zhp = zfs_open(g_zfs, source, source_type); if (zhp == NULL) goto usage; zfs_close(zhp); - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, bookname, snapname); + fnvlist_add_string(nvl, bookname, source); ret = lzc_bookmark(nvl, NULL); fnvlist_free(nvl); @@ -7427,6 +7863,10 @@ zfs_do_bookmark(int argc, char **argv) case EXDEV: err_msg = "bookmark is in a different pool"; break; + case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: + err_msg = "source is not an ancestor of the " + "new bookmark's dataset"; + break; case EEXIST: err_msg = "bookmark exists"; 
break; @@ -7905,7 +8345,7 @@ zfs_do_change_key(int argc, char **argv) * 4) zfs project [-p id] [-r] [-s] * Set project ID and/or inherit flag on the file(s) or directories. * -p: Set the project ID as the given id. - * -r: Set on subdirectorie recursively. If not specify "-p" option, + * -r: Set on subdirectories recursively. If not specify "-p" option, * it will use top-level directory's project ID as the given id, * then set both project ID and inherit flag on all descendants * of the top-level directory. @@ -8085,6 +8525,90 @@ zfs_do_project(int argc, char **argv) return (ret); } +static int +zfs_do_wait(int argc, char **argv) +{ + boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; + int error, i; + int c; + + /* By default, wait for all types of activity. */ + for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) + enabled[i] = B_TRUE; + + while ((c = getopt(argc, argv, "t:")) != -1) { + switch (c) { + case 't': + { + static char *col_subopts[] = { "deleteq", NULL }; + char *value; + + /* Reset activities array */ + bzero(&enabled, sizeof (enabled)); + while (*optarg != '\0') { + int activity = getsubopt(&optarg, col_subopts, + &value); + + if (activity < 0) { + (void) fprintf(stderr, + gettext("invalid activity '%s'\n"), + value); + usage(B_FALSE); + } + + enabled[activity] = B_TRUE; + } + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argv += optind; + argc -= optind; + if (argc < 1) { + (void) fprintf(stderr, gettext("missing 'filesystem' " + "argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + return (1); + + for (;;) { + boolean_t missing = B_FALSE; + boolean_t any_waited = B_FALSE; + + for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { + boolean_t waited; + + if (!enabled[i]) + continue; + + error = zfs_wait_status(zhp, 
i, &missing, &waited); + if (error != 0 || missing) + break; + + any_waited = (any_waited || waited); + } + + if (error != 0 || missing || !any_waited) + break; + } + + zfs_close(zhp); + + return (error); +} + /* * Display version message */ @@ -8106,6 +8630,7 @@ main(int argc, char **argv) char **newargv; (void) setlocale(LC_ALL, ""); + (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); opterr = 0; @@ -8138,13 +8663,6 @@ main(int argc, char **argv) if (strcmp(cmdname, "snap") == 0) cmdname = "snapshot"; - /* - * The 'remap' command has been disabled and may be removed in the - * future. See the comment above zfs_do_remap() for details. - */ - if (!libzfs_envvar_is_set("ZFS_REMAP_ENABLED")) - disable_command_idx("remap"); - /* * Special case '-?' */ @@ -8159,12 +8677,10 @@ main(int argc, char **argv) return (zfs_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { - (void) fprintf(stderr, "%s", libzfs_error_init(errno)); + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } - mnttab_file = g_zfs->libzfs_mnttab; - zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); @@ -8216,3 +8732,67 @@ main(int argc, char **argv) return (ret); } + +#ifdef __FreeBSD__ +#include +#include +/* + * Attach/detach the given dataset to/from the given jail + */ +/* ARGSUSED */ +static int +zfs_do_jail_impl(int argc, char **argv, boolean_t attach) +{ + zfs_handle_t *zhp; + int jailid, ret; + + /* check number of arguments */ + if (argc < 3) { + (void) fprintf(stderr, gettext("missing argument(s)\n")); + usage(B_FALSE); + } + if (argc > 3) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + jailid = jail_getid(argv[1]); + if (jailid < 0) { + (void) fprintf(stderr, gettext("invalid jail id or name\n")); + usage(B_FALSE); + } + + zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + return (1); + + ret = (zfs_jail(zhp, jailid, attach) != 
0); + + zfs_close(zhp); + return (ret); +} + +/* + * zfs jail jailid filesystem + * + * Attach the given dataset to the given jail + */ +/* ARGSUSED */ +static int +zfs_do_jail(int argc, char **argv) +{ + return (zfs_do_jail_impl(argc, argv, B_TRUE)); +} + +/* + * zfs unjail jailid filesystem + * + * Detach the given dataset from the given jail + */ +/* ARGSUSED */ +static int +zfs_do_unjail(int argc, char **argv) +{ + return (zfs_do_jail_impl(argc, argv, B_FALSE)); +} +#endif diff --git a/cmd/zfs/zfs_util.h b/cmd/zfs/zfs_util.h index 3ddff9e22d..a56af59adb 100644 --- a/cmd/zfs/zfs_util.h +++ b/cmd/zfs/zfs_util.h @@ -33,7 +33,7 @@ extern "C" { void * safe_malloc(size_t size); void nomem(void); -libzfs_handle_t *g_zfs; +extern libzfs_handle_t *g_zfs; #ifdef __cplusplus } diff --git a/cmd/zfs_ids_to_path/.gitignore b/cmd/zfs_ids_to_path/.gitignore new file mode 100644 index 0000000000..f95f853e48 --- /dev/null +++ b/cmd/zfs_ids_to_path/.gitignore @@ -0,0 +1 @@ +zfs_ids_to_path diff --git a/cmd/zfs_ids_to_path/Makefile.am b/cmd/zfs_ids_to_path/Makefile.am new file mode 100644 index 0000000000..5494267640 --- /dev/null +++ b/cmd/zfs_ids_to_path/Makefile.am @@ -0,0 +1,11 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zfs_ids_to_path + +zfs_ids_to_path_SOURCES = \ + zfs_ids_to_path.c + +zfs_ids_to_path_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zfs_ids_to_path/zfs_ids_to_path.c b/cmd/zfs_ids_to_path/zfs_ids_to_path.c new file mode 100644 index 0000000000..1d3bb6b29e --- /dev/null +++ b/cmd/zfs_ids_to_path/zfs_ids_to_path.c @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +libzfs_handle_t *g_zfs; + +static void +usage(int err) +{ + fprintf(stderr, "Usage: zfs_ids_to_path [-v] " + "\n"); + exit(err); +} + +int +main(int argc, char **argv) +{ + boolean_t verbose = B_FALSE; + int c; + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) { + case 'v': + verbose = B_TRUE; + break; + } + } + argc -= optind; + argv += optind; + + if (argc != 3) { + (void) fprintf(stderr, "Incorrect number of arguments: %d\n", + argc); + usage(1); + } + + uint64_t objset, object; + if (sscanf(argv[1], "%llu", (u_longlong_t *)&objset) != 1) { + (void) fprintf(stderr, "Invalid objset id: %s\n", argv[1]); + usage(2); + } + if (sscanf(argv[2], "%llu", (u_longlong_t *)&object) != 1) { + (void) fprintf(stderr, "Invalid object id: %s\n", argv[2]); + usage(3); + } + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (4); + } + zpool_handle_t *pool = zpool_open(g_zfs, argv[0]); + if (pool == NULL) { + fprintf(stderr, "Could not open pool %s\n", argv[0]); + libzfs_fini(g_zfs); + return (5); + } + + char pathname[PATH_MAX * 2]; + if (verbose) { + zpool_obj_to_path_ds(pool, objset, object, pathname, + sizeof (pathname)); + } else { + zpool_obj_to_path(pool, objset, object, pathname, + sizeof (pathname)); + } + printf("%s\n", pathname); + zpool_close(pool); + libzfs_fini(g_zfs); + 
return (0); +} diff --git a/cmd/zgenhostid/.gitignore b/cmd/zgenhostid/.gitignore new file mode 100644 index 0000000000..072246c735 --- /dev/null +++ b/cmd/zgenhostid/.gitignore @@ -0,0 +1 @@ +/zgenhostid diff --git a/cmd/zgenhostid/Makefile.am b/cmd/zgenhostid/Makefile.am index 69c99ca9d8..4526a90a1d 100644 --- a/cmd/zgenhostid/Makefile.am +++ b/cmd/zgenhostid/Makefile.am @@ -1 +1,7 @@ -dist_bin_SCRIPTS = zgenhostid +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zgenhostid + +zgenhostid_SOURCES = zgenhostid.c + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zgenhostid/zgenhostid b/cmd/zgenhostid/zgenhostid deleted file mode 100755 index db690eca32..0000000000 --- a/cmd/zgenhostid/zgenhostid +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Emulate genhostid(1) available on RHEL/CENTOS, for use on distros -# which do not provide that utility. -# -# Usage: -# zgenhostid -# zgenhostid -# -# If /etc/hostid already exists and is size > 0, the script exits immediately -# and changes nothing. Unlike genhostid, this generates an error message. -# -# The first form generates a random hostid and stores it in /etc/hostid. -# The second form checks that the provided value is between 0x1 and 0xFFFFFFFF -# and if so, stores it in /etc/hostid. This form is not supported by -# genhostid(1). - -hostid_file=/etc/hostid - -function usage { - echo "$0 [value]" - echo "If $hostid_file is not present, store a hostid in it." >&2 - echo "The optional value must be an 8-digit hex number between" >&2 - echo "1 and 2^32-1. If no value is provided, a random one will" >&2 - echo "be generated. The value must be unique among your systems." >&2 -} - -# hostid(1) ignores contents of /etc/hostid if size < 4 bytes. It would -# be better if this checked size >= 4 bytes but it the method must be -# widely portable. -if [ -s $hostid_file ]; then - echo "$hostid_file already exists. No change made." 
>&2 - exit 1 -fi - -if [ -n "$1" ]; then - host_id=$1 -else - # $RANDOM goes from 0..32k-1 - number=$((((RANDOM % 4) * 32768 + RANDOM) * 32768 + RANDOM)) - host_id=$(printf "%08x" $number) -fi - -if egrep -o '^0{8}$' <<< $host_id >/dev/null 2>&1; then - usage - exit 2 -fi - -if ! egrep -o '^[a-fA-F0-9]{8}$' <<< $host_id >/dev/null 2>&1; then - usage - exit 3 -fi - -a=${host_id:6:2} -b=${host_id:4:2} -c=${host_id:2:2} -d=${host_id:0:2} - -echo -ne \\x$a\\x$b\\x$c\\x$d > $hostid_file - -exit 0 diff --git a/cmd/zgenhostid/zgenhostid.c b/cmd/zgenhostid/zgenhostid.c new file mode 100644 index 0000000000..853931c6ad --- /dev/null +++ b/cmd/zgenhostid/zgenhostid.c @@ -0,0 +1,141 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Georgy Yakovlev. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static __attribute__((noreturn)) void +usage(void) +{ + (void) fprintf(stderr, + "usage: zgenhostid [-fh] [-o path] [value]\n\n" + " -f\t\t force hostid file write\n" + " -h\t\t print this usage and exit\n" + " -o \t write hostid to this file\n\n" + "If hostid file is not present, store a hostid in it.\n" + "The optional value should be an 8-digit hex number between" + " 1 and 2^32-1.\n" + "If the value is 0 or no value is provided, a random one" + " will be generated.\n" + "The value must be unique among your systems.\n"); + exit(EXIT_FAILURE); +} + +int +main(int argc, char **argv) +{ + /* default file path, can be optionally set by user */ + const char *path = "/etc/hostid"; + /* holds converted user input or lrand48() generated value */ + unsigned long input_i = 0; + + int opt; + int force_fwrite = 0; + while ((opt = getopt_long(argc, argv, "fo:h?", 0, 0)) != -1) { + switch (opt) { + case 'f': + force_fwrite = 1; + break; + case 'o': + path = optarg; + break; + case 'h': + case '?': + usage(); + } + } + + char *in_s = argv[optind]; + if (in_s != NULL) { + /* increment pointer by 2 if string is 0x prefixed */ + if (strncasecmp("0x", in_s, 2) == 0) { + in_s += 2; + } + + /* need to be exactly 8 characters */ + const char *hex = "0123456789abcdefABCDEF"; + if (strlen(in_s) != 8 || strspn(in_s, hex) != 8) { + fprintf(stderr, "%s\n", strerror(ERANGE)); + usage(); + } + + input_i = strtoul(in_s, NULL, 16); + if (errno != 0) { + perror("strtoul"); + exit(EXIT_FAILURE); + } + + if (input_i > UINT32_MAX) { + fprintf(stderr, "%s\n", strerror(ERANGE)); + usage(); + } + } + + struct stat fstat; + if (force_fwrite == 0 && stat(path, &fstat) == 0 && + S_ISREG(fstat.st_mode)) { + fprintf(stderr, "%s: %s\n", path, strerror(EEXIST)); + exit(EXIT_FAILURE); + } + + /* + * generate if not provided by user + * also handle unlikely zero return from 
lrand48() + */ + while (input_i == 0) { + srand48(getpid() ^ time(NULL)); + input_i = lrand48(); + } + + FILE *fp = fopen(path, "wb"); + if (!fp) { + perror("fopen"); + exit(EXIT_FAILURE); + } + + /* + * we need just 4 bytes in native endianness + * not using sethostid() because it may be missing or just a stub + */ + uint32_t hostid = input_i; + int written = fwrite(&hostid, 1, 4, fp); + if (written != 4) { + perror("fwrite"); + exit(EXIT_FAILURE); + } + + fclose(fp); + exit(EXIT_SUCCESS); +} diff --git a/cmd/zhack/Makefile.am b/cmd/zhack/Makefile.am index 6e3e706ec0..23f03ffd82 100644 --- a/cmd/zhack/Makefile.am +++ b/cmd/zhack/Makefile.am @@ -1,8 +1,7 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +# Unconditionally enable debugging for zhack +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG sbin_PROGRAMS = zhack @@ -10,5 +9,8 @@ zhack_SOURCES = \ zhack.c zhack_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzpool/libzpool.la + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index 57e497f62d..b27423f538 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -48,16 +48,15 @@ #include #include #include +#include #include -extern boolean_t zfeature_checks_disable; - const char cmdname[] = "zhack"; static importargs_t g_importargs; static char *g_pool; static boolean_t g_readonly; -static void +static __attribute__((noreturn)) void usage(void) { (void) fprintf(stderr, @@ -82,7 +81,7 @@ usage(void) } -static void +static __attribute__((noreturn)) __attribute__((format(printf, 3, 4))) void fatal(spa_t *spa, void *tag, const char *fmt, ...) { va_list ap; @@ -103,8 +102,8 @@ fatal(spa_t *spa, void *tag, const char *fmt, ...) 
/* ARGSUSED */ static int -space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp) +space_delta_cb(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) { /* * Is it a valid type of object to track? @@ -113,7 +112,6 @@ space_delta_cb(dmu_object_type_t bonustype, void *data, return (ENOENT); (void) fprintf(stderr, "modifying object that needs user accounting"); abort(); - /* NOTREACHED */ } /* @@ -126,7 +124,8 @@ zhack_import(char *target, boolean_t readonly) nvlist_t *props; int error; - kernel_init(readonly ? FREAD : (FREAD | FWRITE)); + kernel_init(readonly ? SPA_MODE_READ : + (SPA_MODE_READ | SPA_MODE_WRITE)); dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); @@ -149,6 +148,7 @@ zhack_import(char *target, boolean_t readonly) zfeature_checks_disable = B_TRUE; error = spa_import(target, config, props, (readonly ? ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL)); + fnvlist_free(config); zfeature_checks_disable = B_FALSE; if (error == EEXIST) error = 0; @@ -317,7 +317,8 @@ zhack_do_feature_enable(int argc, char **argv) mos = spa->spa_meta_objset; if (zfeature_is_supported(feature.fi_guid)) - fatal(spa, FTAG, "'%s' is a real feature, will not enable"); + fatal(spa, FTAG, "'%s' is a real feature, will not enable", + feature.fi_guid); if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid)) fatal(spa, FTAG, "feature already enabled: %s", feature.fi_guid); @@ -411,7 +412,8 @@ zhack_do_feature_ref(int argc, char **argv) if (zfeature_is_supported(feature.fi_guid)) { fatal(spa, FTAG, - "'%s' is a real feature, will not change refcount"); + "'%s' is a real feature, will not change refcount", + feature.fi_guid); } if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, diff --git a/cmd/zinject/Makefile.am b/cmd/zinject/Makefile.am index ab7f4de123..40f382c661 100644 --- a/cmd/zinject/Makefile.am +++ b/cmd/zinject/Makefile.am @@ -1,9 +1,5 @@ include $(top_srcdir)/config/Rules.am 
-DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - sbin_PROGRAMS = zinject zinject_SOURCES = \ @@ -12,5 +8,8 @@ zinject_SOURCES = \ zinject.h zinject_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs/libzfs.la + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c index 700961b06a..4939c0b85b 100644 --- a/cmd/zinject/translate.c +++ b/cmd/zinject/translate.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include @@ -85,8 +85,6 @@ parse_pathname(const char *inpath, char *dataset, char *relpath, struct stat64 *statbuf) { struct extmnttab mp; - FILE *fp; - int match; const char *rel; char fullpath[MAXPATHLEN]; @@ -99,35 +97,7 @@ parse_pathname(const char *inpath, char *dataset, char *relpath, return (-1); } - if (strlen(fullpath) >= MAXPATHLEN) { - (void) fprintf(stderr, "invalid object; pathname too long\n"); - return (-1); - } - - if (stat64(fullpath, statbuf) != 0) { - (void) fprintf(stderr, "cannot open '%s': %s\n", - fullpath, strerror(errno)); - return (-1); - } - -#ifdef HAVE_SETMNTENT - if ((fp = setmntent(MNTTAB, "r")) == NULL) { -#else - if ((fp = fopen(MNTTAB, "r")) == NULL) { -#endif - (void) fprintf(stderr, "cannot open %s\n", MNTTAB); - return (-1); - } - - match = 0; - while (getextmntent(fp, &mp, sizeof (mp)) == 0) { - if (makedev(mp.mnt_major, mp.mnt_minor) == statbuf->st_dev) { - match = 1; - break; - } - } - - if (!match) { + if (getextmntent(fullpath, &mp, statbuf) != 0) { (void) fprintf(stderr, "cannot find mountpoint for '%s'\n", fullpath); return (-1); @@ -176,7 +146,7 @@ object_from_path(const char 
*dataset, uint64_t object, zinject_record_t *record) } /* - * Intialize the range based on the type, level, and range given. + * Initialize the range based on the type, level, and range given. */ static int initialize_range(err_type_t type, int level, char *range, @@ -310,7 +280,7 @@ translate_record(err_type_t type, const char *object, const char *range, ziprintf("raw object: %llu\n", record->zi_object); /* - * For the given object, intialize the range in bytes + * For the given object, initialize the range in bytes */ if (initialize_range(type, level, (char *)range, record) != 0) goto err; @@ -418,7 +388,7 @@ translate_device(const char *pool, const char *device, err_type_t label_type, record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; break; case TYPE_LABEL_PAD2: - record->zi_start = offsetof(vdev_label_t, vl_pad2); + record->zi_start = offsetof(vdev_label_t, vl_be); record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; break; } diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index cff7f861a2..bf97b0d687 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -159,8 +159,6 @@ libzfs_handle_t *g_zfs; int zfs_fd; -#define ECKSUM EBADE - static const char *errtable[TYPE_INVAL] = { "data", "dnode", @@ -340,7 +338,7 @@ iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), zfs_cmd_t zc = {"\0"}; int ret; - while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) + while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) if ((ret = func((int)zc.zc_guid, zc.zc_name, &zc.zc_inject_record, data)) != 0) return (ret); @@ -508,7 +506,7 @@ cancel_one_handler(int id, const char *pool, zinject_record_t *record, zc.zc_guid = (uint64_t)id; - if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { + if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); @@ -541,7 +539,7 @@ cancel_handler(int id) zc.zc_guid = (uint64_t)id; - if (ioctl(zfs_fd, 
ZFS_IOC_CLEAR_FAULT, &zc) != 0) { + if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); @@ -565,7 +563,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record, zc.zc_inject_record = *record; zc.zc_guid = flags; - if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) { + if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to add handler: %s\n", errno == EDOM ? "block level exceeds max level of object" : strerror(errno)); @@ -615,7 +613,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record, return (0); } -int +static int perform_action(const char *pool, zinject_record_t *record, int cmd) { zfs_cmd_t zc = {"\0"}; @@ -625,7 +623,7 @@ perform_action(const char *pool, zinject_record_t *record, int cmd) zc.zc_guid = record->zi_guid; zc.zc_cookie = cmd; - if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (1); @@ -763,7 +761,7 @@ main(int argc, char **argv) uint32_t dvas = 0; if ((g_zfs = libzfs_init()) == NULL) { - (void) fprintf(stderr, "%s", libzfs_error_init(errno)); + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index c03da941db..fa494c030e 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -1,8 +1,9 @@ include $(top_srcdir)/config/Rules.am +include $(top_srcdir)/config/Shellcheck.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS) + +DEFAULT_INCLUDES += -I$(srcdir) sbin_PROGRAMS = zpool @@ -13,19 +14,37 @@ zpool_SOURCES = \ zpool_util.h \ zpool_vdev.c -zpool_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libuutil/libuutil.la \ - $(top_builddir)/lib/libzfs/libzfs.la +if BUILD_FREEBSD +zpool_SOURCES += 
os/freebsd/zpool_vdev_os.c +endif -zpool_LDADD += -lm $(LIBBLKID) +if BUILD_LINUX +zpool_SOURCES += os/linux/zpool_vdev_os.c +endif + +zpool_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la \ + $(abs_top_builddir)/lib/libzutil/libzutil.la + +zpool_LDADD += $(LTLIBINTL) + +if BUILD_FREEBSD +zpool_LDADD += -lgeom +endif +zpool_LDADD += -lm $(LIBBLKID_LIBS) $(LIBUUID_LIBS) + +include $(top_srcdir)/config/CppCheck.am zpoolconfdir = $(sysconfdir)/zfs/zpool.d zpoolexecdir = $(zfsexecdir)/zpool.d -EXTRA_DIST = zpool.d/README +EXTRA_DIST = zpool.d/README compatibility.d dist_zpoolexec_SCRIPTS = \ + zpool.d/dm-deps \ zpool.d/enc \ zpool.d/encdev \ zpool.d/fault_led \ @@ -40,7 +59,6 @@ dist_zpoolexec_SCRIPTS = \ zpool.d/serial \ zpool.d/ses \ zpool.d/size \ - zpool.d/slaves \ zpool.d/slot \ zpool.d/smart \ zpool.d/smartx \ @@ -70,6 +88,7 @@ dist_zpoolexec_SCRIPTS = \ zpool.d/test_ended zpoolconfdefaults = \ + dm-deps \ enc \ encdev \ fault_led \ @@ -84,7 +103,6 @@ zpoolconfdefaults = \ serial \ ses \ size \ - slaves \ slot \ smart \ smartx \ @@ -113,6 +131,52 @@ zpoolconfdefaults = \ test_progress \ test_ended +zpoolcompatdir = $(pkgdatadir)/compatibility.d + +dist_zpoolcompat_DATA = \ + compatibility.d/compat-2018 \ + compatibility.d/compat-2019 \ + compatibility.d/compat-2020 \ + compatibility.d/compat-2021 \ + compatibility.d/freebsd-11.0 \ + compatibility.d/freebsd-11.2 \ + compatibility.d/freebsd-11.3 \ + compatibility.d/freenas-9.10.2 \ + compatibility.d/grub2 \ + compatibility.d/openzfsonosx-1.7.0 \ + compatibility.d/openzfsonosx-1.8.1 \ + compatibility.d/openzfsonosx-1.9.3 \ + compatibility.d/openzfs-2.0-freebsd \ + compatibility.d/openzfs-2.0-linux \ + compatibility.d/openzfs-2.1-freebsd \ + compatibility.d/openzfs-2.1-linux \ + compatibility.d/zol-0.6.1 \ + compatibility.d/zol-0.6.4 \ + 
compatibility.d/zol-0.6.5 \ + compatibility.d/zol-0.7 \ + compatibility.d/zol-0.8 + +# canonical <- alias symbolic link pairs +# eg: "2018" is a link to "compat-2018" +zpoolcompatlinks = \ + "compat-2018 2018" \ + "compat-2019 2019" \ + "compat-2020 2020" \ + "compat-2021 2021" \ + "freebsd-11.0 freebsd-11.1" \ + "freebsd-11.0 freenas-11.0" \ + "freebsd-11.2 freenas-11.2" \ + "freebsd-11.3 freebsd-11.4" \ + "freebsd-11.3 freebsd-12.0" \ + "freebsd-11.3 freebsd-12.1" \ + "freebsd-11.3 freebsd-12.2" \ + "freebsd-11.3 freenas-11.3" \ + "freenas-11.0 freenas-11.1" \ + "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ + "openzfs-2.0-freebsd truenas-12.0" \ + "zol-0.7 ubuntu-18.04" \ + "zol-0.8 ubuntu-20.04" + install-data-hook: $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)" for f in $(zpoolconfdefaults); do \ @@ -120,3 +184,6 @@ install-data-hook: -L "$(DESTDIR)$(zpoolconfdir)/$${f}" || \ ln -s "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \ done + for l in $(zpoolcompatlinks); do \ + (cd "$(DESTDIR)$(zpoolcompatdir)"; ln -sf $${l} ); \ + done diff --git a/cmd/zpool/compatibility.d/compat-2018 b/cmd/zpool/compatibility.d/compat-2018 new file mode 100644 index 0000000000..7be44e1eee --- /dev/null +++ b/cmd/zpool/compatibility.d/compat-2018 @@ -0,0 +1,12 @@ +# Features supported by all Tier 1 platforms as of 2018 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/compat-2019 b/cmd/zpool/compatibility.d/compat-2019 new file mode 100644 index 0000000000..c105cc70c2 --- /dev/null +++ b/cmd/zpool/compatibility.d/compat-2019 @@ -0,0 +1,15 @@ +# Features supported by all Tier 1 platforms as of 2019 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +sha512 +skein +spacemap_histogram diff --git 
a/cmd/zpool/compatibility.d/compat-2020 b/cmd/zpool/compatibility.d/compat-2020 new file mode 100644 index 0000000000..8d46a571e6 --- /dev/null +++ b/cmd/zpool/compatibility.d/compat-2020 @@ -0,0 +1,15 @@ +# Features supported by all Tier 1 platforms as of 2020 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +sha512 +skein +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/compat-2021 b/cmd/zpool/compatibility.d/compat-2021 new file mode 100644 index 0000000000..f45c82d656 --- /dev/null +++ b/cmd/zpool/compatibility.d/compat-2021 @@ -0,0 +1,19 @@ +# Features supported by all Tier 1 platforms as of 2021 +async_destroy +bookmarks +device_removal +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +obsolete_counts +sha512 +skein +spacemap_histogram +spacemap_v2 +zpool_checkpoint diff --git a/cmd/zpool/compatibility.d/freebsd-11.0 b/cmd/zpool/compatibility.d/freebsd-11.0 new file mode 100644 index 0000000000..8718559ffb --- /dev/null +++ b/cmd/zpool/compatibility.d/freebsd-11.0 @@ -0,0 +1,15 @@ +# Features supported by FreeBSD 11.0 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +sha512 +skein +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/freebsd-11.2 b/cmd/zpool/compatibility.d/freebsd-11.2 new file mode 100644 index 0000000000..14d2d573b2 --- /dev/null +++ b/cmd/zpool/compatibility.d/freebsd-11.2 @@ -0,0 +1,18 @@ +# Features supported by FreeBSD 11.2 +async_destroy +bookmarks +device_removal +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +obsolete_counts +sha512 +skein +spacemap_histogram +zpool_checkpoint diff --git 
a/cmd/zpool/compatibility.d/freebsd-11.3 b/cmd/zpool/compatibility.d/freebsd-11.3 new file mode 100644 index 0000000000..802cc3630d --- /dev/null +++ b/cmd/zpool/compatibility.d/freebsd-11.3 @@ -0,0 +1,19 @@ +# Features supported by FreeBSD 11.3 +async_destroy +bookmarks +device_removal +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +obsolete_counts +sha512 +skein +spacemap_histogram +spacemap_v2 +zpool_checkpoint diff --git a/cmd/zpool/compatibility.d/freenas-9.10.2 b/cmd/zpool/compatibility.d/freenas-9.10.2 new file mode 100644 index 0000000000..10789c96cc --- /dev/null +++ b/cmd/zpool/compatibility.d/freenas-9.10.2 @@ -0,0 +1,13 @@ +# Features supported by FreeNAS 9.10.2 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/grub2 b/cmd/zpool/compatibility.d/grub2 new file mode 100644 index 0000000000..4e8f213625 --- /dev/null +++ b/cmd/zpool/compatibility.d/grub2 @@ -0,0 +1,12 @@ +# Features which are supported by GRUB2 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/openzfs-2.0-freebsd b/cmd/zpool/compatibility.d/openzfs-2.0-freebsd new file mode 100644 index 0000000000..e7ee2f2476 --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfs-2.0-freebsd @@ -0,0 +1,33 @@ +# Features supported by OpenZFS 2.0 on FreeBSD +allocation_classes +async_destroy +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +hole_birth +large_blocks +large_dnode +livelist +log_spacemap +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota 
+redacted_datasets +redaction_bookmarks +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +zpool_checkpoint +zstd_compress diff --git a/cmd/zpool/compatibility.d/openzfs-2.0-linux b/cmd/zpool/compatibility.d/openzfs-2.0-linux new file mode 100644 index 0000000000..ac0f5c8634 --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfs-2.0-linux @@ -0,0 +1,34 @@ +# Features supported by OpenZFS 2.0 on Linux +allocation_classes +async_destroy +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +hole_birth +large_blocks +large_dnode +livelist +log_spacemap +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +redacted_datasets +redaction_bookmarks +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +zpool_checkpoint +zstd_compress diff --git a/cmd/zpool/compatibility.d/openzfs-2.1-freebsd b/cmd/zpool/compatibility.d/openzfs-2.1-freebsd new file mode 100644 index 0000000000..9fde997e8c --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfs-2.1-freebsd @@ -0,0 +1,34 @@ +# Features supported by OpenZFS 2.1 on FreeBSD +allocation_classes +async_destroy +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +hole_birth +large_blocks +large_dnode +livelist +log_spacemap +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +redacted_datasets +redaction_bookmarks +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +zpool_checkpoint +zstd_compress diff --git a/cmd/zpool/compatibility.d/openzfs-2.1-linux b/cmd/zpool/compatibility.d/openzfs-2.1-linux new file mode 100644 index 0000000000..c3ff176bf8 --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfs-2.1-linux @@ -0,0 +1,35 @@ +# Features supported by OpenZFS 2.1 on Linux 
+allocation_classes +async_destroy +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +hole_birth +large_blocks +large_dnode +livelist +log_spacemap +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +redacted_datasets +redaction_bookmarks +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +zpool_checkpoint +zstd_compress diff --git a/cmd/zpool/compatibility.d/openzfsonosx-1.7.0 b/cmd/zpool/compatibility.d/openzfsonosx-1.7.0 new file mode 100644 index 0000000000..4ae87c964c --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfsonosx-1.7.0 @@ -0,0 +1,16 @@ +# Features supported by OpenZFSonOSX 1.7.0 +async_destroy +bookmarks +edonr +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +sha512 +skein +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 b/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 new file mode 100644 index 0000000000..162ff32a78 --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 @@ -0,0 +1,21 @@ +# Features supported by OpenZFSonOSX 1.8.1 +async_destroy +bookmarks +device_removal +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +multi_vdev_crash_dump +obsolete_counts +sha512 +skein +spacemap_histogram +spacemap_v2 +zpool_checkpoint diff --git a/cmd/zpool/compatibility.d/openzfsonosx-1.9.3 b/cmd/zpool/compatibility.d/openzfsonosx-1.9.3 new file mode 100644 index 0000000000..b0b28ec049 --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfsonosx-1.9.3 @@ -0,0 +1,27 @@ +# Features supported by OpenZFSonOSX 1.9.3 +allocation_classes +async_destroy +bookmark_v2 +bookmarks +device_removal +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset 
+filesystem_limits +hole_birth +large_blocks +large_dnode +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +zpool_checkpoint diff --git a/cmd/zpool/compatibility.d/zol-0.6.1 b/cmd/zpool/compatibility.d/zol-0.6.1 new file mode 100644 index 0000000000..9bc963ddcc --- /dev/null +++ b/cmd/zpool/compatibility.d/zol-0.6.1 @@ -0,0 +1,4 @@ +# Features supported by ZFSonLinux v0.6.1 +async_destroy +empty_bpobj +lz4_compress diff --git a/cmd/zpool/compatibility.d/zol-0.6.4 b/cmd/zpool/compatibility.d/zol-0.6.4 new file mode 100644 index 0000000000..82a2698c8c --- /dev/null +++ b/cmd/zpool/compatibility.d/zol-0.6.4 @@ -0,0 +1,10 @@ +# Features supported by ZFSonLinux v0.6.4 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +hole_birth +lz4_compress +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/zol-0.6.5 b/cmd/zpool/compatibility.d/zol-0.6.5 new file mode 100644 index 0000000000..cb9a94d889 --- /dev/null +++ b/cmd/zpool/compatibility.d/zol-0.6.5 @@ -0,0 +1,12 @@ +# Features supported by ZFSonLinux v0.6.5 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +spacemap_histogram diff --git a/cmd/zpool/compatibility.d/zol-0.7 b/cmd/zpool/compatibility.d/zol-0.7 new file mode 100644 index 0000000000..22a02936df --- /dev/null +++ b/cmd/zpool/compatibility.d/zol-0.7 @@ -0,0 +1,18 @@ +# Features supported by ZFSonLinux v0.7 +async_destroy +bookmarks +edonr +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +large_dnode +lz4_compress +multi_vdev_crash_dump +sha512 +skein +spacemap_histogram +userobj_accounting diff --git a/cmd/zpool/compatibility.d/zol-0.8 b/cmd/zpool/compatibility.d/zol-0.8 new file mode 100644 index 0000000000..762848ef7b --- /dev/null +++ 
b/cmd/zpool/compatibility.d/zol-0.8 @@ -0,0 +1,27 @@ +# Features supported by ZFSonLinux v0.8 +allocation_classes +async_destroy +bookmark_v2 +bookmarks +device_removal +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +hole_birth +large_blocks +large_dnode +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +zpool_checkpoint diff --git a/cmd/zpool/os/freebsd/zpool_vdev_os.c b/cmd/zpool/os/freebsd/zpool_vdev_os.c new file mode 100644 index 0000000000..66bfe28f13 --- /dev/null +++ b/cmd/zpool/os/freebsd/zpool_vdev_os.c @@ -0,0 +1,124 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov . + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) 
+ * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Using libdiskmgt, makes sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zpool_util.h" +#include + +int +check_device(const char *name, boolean_t force, boolean_t isspare, + boolean_t iswholedisk) +{ + char path[MAXPATHLEN]; + + if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) != 0) + snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name); + else + strlcpy(path, name, sizeof (path)); + + return (check_file(path, force, isspare)); +} + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + return (0); +} + +void +after_zpool_upgrade(zpool_handle_t *zhp) +{ + char bootfs[ZPOOL_MAXPROPLEN]; + + if (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs, + sizeof (bootfs), NULL, B_FALSE) == 0 && + strcmp(bootfs, "-") != 0) { + (void) printf(gettext("Pool '%s' has the bootfs " + "property set, you might need to update\nthe boot " + "code. See gptzfsboot(8) and loader.efi(8) for " + "details.\n"), zpool_get_name(zhp)); + } +} + +int +check_file(const char *file, boolean_t force, boolean_t isspare) +{ + return (check_file_generic(file, force, isspare)); +} diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c new file mode 100644 index 0000000000..10929fa65a --- /dev/null +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -0,0 +1,418 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov . + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Using libblkid to make sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zpool_util.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct vdev_disk_db_entry +{ + char id[24]; + int sector_size; +} vdev_disk_db_entry_t; + +/* + * Database of block devices that lie about physical sector sizes. The + * identification string must be precisely 24 characters to avoid false + * negatives + */ +static vdev_disk_db_entry_t vdev_disk_database[] = { + {"ATA ADATA SSD S396 3", 8192}, + {"ATA APPLE SSD SM128E", 8192}, + {"ATA APPLE SSD SM256E", 8192}, + {"ATA APPLE SSD SM512E", 8192}, + {"ATA APPLE SSD SM768E", 8192}, + {"ATA C400-MTFDDAC064M", 8192}, + {"ATA C400-MTFDDAC128M", 8192}, + {"ATA C400-MTFDDAC256M", 8192}, + {"ATA C400-MTFDDAC512M", 8192}, + {"ATA Corsair Force 3 ", 8192}, + {"ATA Corsair Force GS", 8192}, + {"ATA INTEL SSDSA2CT04", 8192}, + {"ATA INTEL SSDSA2BZ10", 8192}, + {"ATA INTEL SSDSA2BZ20", 8192}, + {"ATA INTEL SSDSA2BZ30", 8192}, + {"ATA INTEL SSDSA2CW04", 8192}, + {"ATA INTEL SSDSA2CW08", 8192}, + {"ATA INTEL SSDSA2CW12", 8192}, + {"ATA INTEL SSDSA2CW16", 8192}, + {"ATA INTEL SSDSA2CW30", 8192}, + {"ATA INTEL SSDSA2CW60", 8192}, + {"ATA INTEL SSDSC2CT06", 8192}, + {"ATA INTEL SSDSC2CT12", 8192}, + {"ATA INTEL SSDSC2CT18", 8192}, + {"ATA INTEL SSDSC2CT24", 8192}, + {"ATA INTEL SSDSC2CW06", 8192}, + {"ATA INTEL SSDSC2CW12", 8192}, + {"ATA INTEL SSDSC2CW18", 8192}, + {"ATA INTEL SSDSC2CW24", 8192}, + {"ATA INTEL SSDSC2CW48", 8192}, + {"ATA KINGSTON SH100S3", 8192}, + {"ATA KINGSTON SH103S3", 8192}, + {"ATA M4-CT064M4SSD2 ", 8192}, + {"ATA M4-CT128M4SSD2 ", 8192}, + {"ATA M4-CT256M4SSD2 ", 8192}, + {"ATA M4-CT512M4SSD2 ", 8192}, + {"ATA OCZ-AGILITY2 ", 8192}, + {"ATA OCZ-AGILITY3 ", 8192}, + {"ATA OCZ-VERTEX2 3.5 ", 8192}, + {"ATA OCZ-VERTEX3 ", 8192}, + {"ATA OCZ-VERTEX3 LT ", 8192}, + {"ATA OCZ-VERTEX3 MI ", 8192}, + {"ATA OCZ-VERTEX4 
", 8192}, + {"ATA SAMSUNG MZ7WD120", 8192}, + {"ATA SAMSUNG MZ7WD240", 8192}, + {"ATA SAMSUNG MZ7WD480", 8192}, + {"ATA SAMSUNG MZ7WD960", 8192}, + {"ATA SAMSUNG SSD 830 ", 8192}, + {"ATA Samsung SSD 840 ", 8192}, + {"ATA SanDisk SSD U100", 8192}, + {"ATA TOSHIBA THNSNH06", 8192}, + {"ATA TOSHIBA THNSNH12", 8192}, + {"ATA TOSHIBA THNSNH25", 8192}, + {"ATA TOSHIBA THNSNH51", 8192}, + {"ATA APPLE SSD TS064C", 4096}, + {"ATA APPLE SSD TS128C", 4096}, + {"ATA APPLE SSD TS256C", 4096}, + {"ATA APPLE SSD TS512C", 4096}, + {"ATA INTEL SSDSA2M040", 4096}, + {"ATA INTEL SSDSA2M080", 4096}, + {"ATA INTEL SSDSA2M160", 4096}, + {"ATA INTEL SSDSC2MH12", 4096}, + {"ATA INTEL SSDSC2MH25", 4096}, + {"ATA OCZ CORE_SSD ", 4096}, + {"ATA OCZ-VERTEX ", 4096}, + {"ATA SAMSUNG MCCOE32G", 4096}, + {"ATA SAMSUNG MCCOE64G", 4096}, + {"ATA SAMSUNG SSD PM80", 4096}, + /* Flash drives optimized for 4KB IOs on larger pages */ + {"ATA INTEL SSDSC2BA10", 4096}, + {"ATA INTEL SSDSC2BA20", 4096}, + {"ATA INTEL SSDSC2BA40", 4096}, + {"ATA INTEL SSDSC2BA80", 4096}, + {"ATA INTEL SSDSC2BB08", 4096}, + {"ATA INTEL SSDSC2BB12", 4096}, + {"ATA INTEL SSDSC2BB16", 4096}, + {"ATA INTEL SSDSC2BB24", 4096}, + {"ATA INTEL SSDSC2BB30", 4096}, + {"ATA INTEL SSDSC2BB40", 4096}, + {"ATA INTEL SSDSC2BB48", 4096}, + {"ATA INTEL SSDSC2BB60", 4096}, + {"ATA INTEL SSDSC2BB80", 4096}, + {"ATA INTEL SSDSC2BW24", 4096}, + {"ATA INTEL SSDSC2BW48", 4096}, + {"ATA INTEL SSDSC2BP24", 4096}, + {"ATA INTEL SSDSC2BP48", 4096}, + {"NA SmrtStorSDLKAE9W", 4096}, + {"NVMe Amazon EC2 NVMe ", 4096}, + /* Imported from Open Solaris */ + {"ATA MARVELL SD88SA02", 4096}, + /* Advanced format Hard drives */ + {"ATA Hitachi HDS5C303", 4096}, + {"ATA SAMSUNG HD204UI ", 4096}, + {"ATA ST2000DL004 HD20", 4096}, + {"ATA WDC WD10EARS-00M", 4096}, + {"ATA WDC WD10EARS-00S", 4096}, + {"ATA WDC WD10EARS-00Z", 4096}, + {"ATA WDC WD15EARS-00M", 4096}, + {"ATA WDC WD15EARS-00S", 4096}, + {"ATA WDC WD15EARS-00Z", 4096}, + {"ATA WDC WD20EARS-00M", 
4096}, + {"ATA WDC WD20EARS-00S", 4096}, + {"ATA WDC WD20EARS-00Z", 4096}, + {"ATA WDC WD1600BEVT-0", 4096}, + {"ATA WDC WD2500BEVT-0", 4096}, + {"ATA WDC WD3200BEVT-0", 4096}, + {"ATA WDC WD5000BEVT-0", 4096}, +}; + + +#define INQ_REPLY_LEN 96 +#define INQ_CMD_LEN 6 + +static const int vdev_disk_database_size = + sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]); + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + unsigned char inq_buff[INQ_REPLY_LEN]; + unsigned char sense_buffer[32]; + unsigned char inq_cmd_blk[INQ_CMD_LEN] = + {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0}; + sg_io_hdr_t io_hdr; + int error; + int fd; + int i; + + /* Prepare INQUIRY command */ + memset(&io_hdr, 0, sizeof (sg_io_hdr_t)); + io_hdr.interface_id = 'S'; + io_hdr.cmd_len = sizeof (inq_cmd_blk); + io_hdr.mx_sb_len = sizeof (sense_buffer); + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.dxfer_len = INQ_REPLY_LEN; + io_hdr.dxferp = inq_buff; + io_hdr.cmdp = inq_cmd_blk; + io_hdr.sbp = sense_buffer; + io_hdr.timeout = 10; /* 10 milliseconds is ample time */ + + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) + return (B_FALSE); + + error = ioctl(fd, SG_IO, (unsigned long) &io_hdr); + + (void) close(fd); + + if (error < 0) + return (B_FALSE); + + if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) + return (B_FALSE); + + for (i = 0; i < vdev_disk_database_size; i++) { + if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24)) + continue; + + *sector_size = vdev_disk_database[i].sector_size; + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) +{ + int err; + char *value; + + /* No valid type detected device is safe to use */ + value = blkid_get_tag_value(cache, "TYPE", path); + if (value == NULL) + return (0); + + /* + * If libblkid detects a ZFS device, we check the device + * using check_file() to see if it's safe. 
The one safe + * case is a spare device shared between multiple pools. + */ + if (strcmp(value, "zfs_member") == 0) { + err = check_file(path, force, isspare); + } else { + if (force) { + err = 0; + } else { + err = -1; + vdev_error(gettext("%s contains a filesystem of " + "type '%s'\n"), path, value); + } + } + + free(value); + + return (err); +} + +/* + * Validate that a disk including all partitions are safe to use. + * + * For EFI labeled disks this can done relatively easily with the libefi + * library. The partition numbers are extracted from the label and used + * to generate the expected /dev/ paths. Each partition can then be + * checked for conflicts. + * + * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible + * but due to the lack of a readily available libraries this scanning is + * not implemented. Instead only the device path as given is checked. + */ +static int +check_disk(const char *path, blkid_cache cache, int force, + boolean_t isspare, boolean_t iswholedisk) +{ + struct dk_gpt *vtoc; + char slice_path[MAXPATHLEN]; + int err = 0; + int fd, i; + int flags = O_RDONLY|O_DIRECT; + + if (!iswholedisk) + return (check_slice(path, cache, force, isspare)); + + /* only spares can be shared, other devices require exclusive access */ + if (!isspare) + flags |= O_EXCL; + + if ((fd = open(path, flags)) < 0) { + char *value = blkid_get_tag_value(cache, "TYPE", path); + (void) fprintf(stderr, gettext("%s is in use and contains " + "a %s filesystem.\n"), path, value ? value : "unknown"); + free(value); + return (-1); + } + + /* + * Expected to fail for non-EFI labeled disks. Just check the device + * as given and do not attempt to detect and scan partitions. + */ + err = efi_alloc_and_read(fd, &vtoc); + if (err) { + (void) close(fd); + return (check_slice(path, cache, force, isspare)); + } + + /* + * The primary efi partition label is damaged however the secondary + * label at the end of the device is intact. 
Rather than use this + * label we should play it safe and treat this as a non efi device. + */ + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + + if (force) { + /* Partitions will now be created using the backup */ + return (0); + } else { + vdev_error(gettext("%s contains a corrupt primary " + "EFI label.\n"), path); + return (-1); + } + } + + for (i = 0; i < vtoc->efi_nparts; i++) { + + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED || + uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) + continue; + + if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, "-part", i+1); + else + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, isdigit(path[strlen(path)-1]) ? + "p" : "", i+1); + + err = check_slice(slice_path, cache, force, isspare); + if (err) + break; + } + + efi_free(vtoc); + (void) close(fd); + + return (err); +} + +int +check_device(const char *path, boolean_t force, + boolean_t isspare, boolean_t iswholedisk) +{ + blkid_cache cache; + int error; + + error = blkid_get_cache(&cache, NULL); + if (error != 0) { + (void) fprintf(stderr, gettext("unable to access the blkid " + "cache.\n")); + return (-1); + } + + error = check_disk(path, cache, force, isspare, iswholedisk); + blkid_put_cache(cache); + + return (error); +} + +void +after_zpool_upgrade(zpool_handle_t *zhp) +{ +} + +int +check_file(const char *file, boolean_t force, boolean_t isspare) +{ + return (check_file_generic(file, force, isspare)); +} diff --git a/cmd/zpool/zpool.d/slaves b/cmd/zpool/zpool.d/dm-deps similarity index 64% rename from cmd/zpool/zpool.d/slaves rename to cmd/zpool/zpool.d/dm-deps index 9c16d6c4e5..ee39514e4d 100755 --- a/cmd/zpool/zpool.d/slaves +++ b/cmd/zpool/zpool.d/dm-deps @@ -1,14 +1,11 @@ #!/bin/sh # -# Show device mapper slave devices. This is useful for looking up the -# /dev/sd* devices associated with a dm or multipath device. 
For example: -# -# $ ls /sys/block/dm-113/slaves/ -# sddt sdjw +# Show device mapper dependent / underlying devices. This is useful for +# looking up the /dev/sd* devices associated with a dm or multipath device. # if [ "$1" = "-h" ] ; then - echo "Show device mapper slave devices." + echo "Show device mapper dependent (underlying) devices." exit fi @@ -29,4 +26,4 @@ if [ -d "/sys/class/block/$dev/slaves" ] ; then val=$(echo "$val" | sed -r 's/[[:blank:]]+/ /g') fi -echo "slaves=$val" +echo "dm-deps=$val" diff --git a/cmd/zpool/zpool.d/iostat b/cmd/zpool/zpool.d/iostat index f6452fb250..41a3acfae7 100755 --- a/cmd/zpool/zpool.d/iostat +++ b/cmd/zpool/zpool.d/iostat @@ -17,14 +17,14 @@ fi if [ "$script" = "iostat-1s" ] ; then # Do a single one-second sample - extra="1 1" + interval=1 # Don't show summary stats - y="-y" + brief="yes" elif [ "$script" = "iostat-10s" ] ; then # Do a single ten-second sample - extra="10 1" + interval=10 # Don't show summary stats - y="-y" + brief="yes" fi if [ -f "$VDEV_UPATH" ] ; then @@ -32,7 +32,19 @@ if [ -f "$VDEV_UPATH" ] ; then exit fi -out=$(eval "iostat $y -k -x $VDEV_UPATH $extra") +if [ "$(uname)" = "FreeBSD" ]; then + out=$(iostat -dKx \ + ${interval:+"-w $interval"} \ + ${interval:+"-c 1"} \ + "$VDEV_UPATH" | tail -n 2) +else + out=$(iostat -kx \ + ${brief:+"-y"} \ + ${interval:+"$interval"} \ + ${interval:+"1"} \ + "$VDEV_UPATH" | awk NF | tail -n 2) +fi + # Sample output (we want the last two lines): # @@ -46,16 +58,16 @@ out=$(eval "iostat $y -k -x $VDEV_UPATH $extra") # # Get the column names -cols=$(echo "$out" | grep Device) +cols=$(echo "$out" | head -n 1) # Get the values and tab separate them to make them cut-able. 
-vals="$(echo "$out" | grep -A1 Device | tail -n 1 | sed -r 's/[[:blank:]]+/\t/g')" +vals=$(echo "$out" | tail -n 1 | sed -r 's/[[:blank:]]+/\t/g') i=0 for col in $cols ; do i=$((i+1)) # Skip the first column since it's just the device name - if [ "$col" = "Device:" ] ; then + if [ $i -eq 1 ]; then continue fi diff --git a/cmd/zpool/zpool.d/media b/cmd/zpool/zpool.d/media index 05bc15918b..5683cdc3c0 100755 --- a/cmd/zpool/zpool.d/media +++ b/cmd/zpool/zpool.d/media @@ -4,7 +4,7 @@ # if [ "$1" = "-h" ] ; then - echo "Show whether a vdev is a file, hdd, or ssd." + echo "Show whether a vdev is a file, hdd, ssd, or iscsi." exit fi @@ -18,6 +18,13 @@ if [ -b "$VDEV_UPATH" ]; then if [ "$val" = "1" ]; then MEDIA="hdd" fi + + vpd_pg83="/sys/block/$device/device/vpd_pg83" + if [ -f "$vpd_pg83" ]; then + if grep -q --binary "iqn." "$vpd_pg83"; then + MEDIA="iscsi" + fi + fi else if [ -f "$VDEV_UPATH" ]; then MEDIA="file" diff --git a/cmd/zpool/zpool.d/ses b/cmd/zpool/zpool.d/ses index f6b7520dfb..b1836d6765 100755 --- a/cmd/zpool/zpool.d/ses +++ b/cmd/zpool/zpool.d/ses @@ -41,7 +41,13 @@ for i in $scripts ; do val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) ;; fault_led) - val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + # JBODs fault LED is called 'fault', NVMe fault LED is called + # 'attention'. 
+ if [ -f "$VDEV_ENC_SYSFS_PATH/fault" ] ; then + val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + elif [ -f "$VDEV_ENC_SYSFS_PATH/attention" ] ; then + val=$(cat "$VDEV_ENC_SYSFS_PATH/attention" 2>/dev/null) + fi ;; locate_led) val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null) diff --git a/cmd/zpool/zpool.d/smart b/cmd/zpool/zpool.d/smart index bd18e9d044..b95256d756 100755 --- a/cmd/zpool/zpool.d/smart +++ b/cmd/zpool/zpool.d/smart @@ -53,7 +53,7 @@ get_filename_from_dir() num_files=$(find "$dir" -maxdepth 1 -type f | wc -l) mod=$((pid % num_files)) i=0 - find "$dir" -type f -printf "%f\n" | while read -r file ; do + find "$dir" -type f -printf '%f\n' | while read -r file ; do if [ "$mod" = "$i" ] ; then echo "$file" break @@ -62,24 +62,22 @@ get_filename_from_dir() done } -script=$(basename "$0") +script="${0##*/}" if [ "$1" = "-h" ] ; then echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- exit fi -smartctl_path=$(command -v smartctl) - -if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then +if [ -b "$VDEV_UPATH" ] && PATH="/usr/sbin:$PATH" command -v smartctl > /dev/null || [ -n "$samples" ] ; then if [ -n "$samples" ] ; then # cat a smartctl output text file instead of running smartctl # on a vdev (only used for developer testing). - file=$(get_filename_from_dir $samples) + file=$(get_filename_from_dir "$samples") echo "file=$file" raw_out=$(cat "$samples/$file") else - raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH") + raw_out=$(sudo smartctl -a "$VDEV_UPATH") fi # What kind of drive are we? 
Look for the right line in smartctl: @@ -230,11 +228,11 @@ esac with_vals=$(echo "$out" | grep -E "$scripts") if [ -n "$with_vals" ]; then echo "$with_vals" - without_vals=$(echo "$scripts" | tr "|" "\n" | + without_vals=$(echo "$scripts" | tr '|' '\n' | grep -v -E "$(echo "$with_vals" | awk -F "=" '{print $1}')" | awk '{print $0"="}') else - without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}') + without_vals=$(echo "$scripts" | tr '|' '\n' | awk '{print $0"="}') fi if [ -n "$without_vals" ]; then diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 9927a9debc..abfa2b7f6b 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -56,6 +56,7 @@ typedef struct zpool_node { struct zpool_list { boolean_t zl_findall; + boolean_t zl_literal; uu_avl_t *zl_avl; uu_avl_pool_t *zl_pool; zprop_list_t **zl_proplist; @@ -88,7 +89,9 @@ add_pool(zpool_handle_t *zhp, void *data) uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { if (zlp->zl_proplist && - zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) { + zpool_expand_proplist(zhp, zlp->zl_proplist, + zlp->zl_literal) + != 0) { zpool_close(zhp); free(node); return (-1); @@ -110,7 +113,8 @@ add_pool(zpool_handle_t *zhp, void *data) * line. 
*/ zpool_list_t * -pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) +pool_list_get(int argc, char **argv, zprop_list_t **proplist, + boolean_t literal, int *err) { zpool_list_t *zlp; @@ -128,6 +132,8 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) zlp->zl_proplist = proplist; + zlp->zl_literal = literal; + if (argc == 0) { (void) zpool_iter(g_zfs, add_pool, zlp); zlp->zl_findall = B_TRUE; @@ -242,12 +248,12 @@ pool_list_count(zpool_list_t *zlp) */ int for_each_pool(int argc, char **argv, boolean_t unavail, - zprop_list_t **proplist, zpool_iter_f func, void *data) + zprop_list_t **proplist, boolean_t literal, zpool_iter_f func, void *data) { zpool_list_t *list; int ret = 0; - if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL) + if ((list = pool_list_get(argc, argv, proplist, literal, &ret)) == NULL) return (1); if (pool_list_iter(list, unavail, func, data) != 0) @@ -258,51 +264,6 @@ for_each_pool(int argc, char **argv, boolean_t unavail, return (ret); } -static int -for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func, - void *data) -{ - nvlist_t **child; - uint_t c, children; - int ret = 0; - int i; - char *type; - - const char *list[] = { - ZPOOL_CONFIG_SPARES, - ZPOOL_CONFIG_L2CACHE, - ZPOOL_CONFIG_CHILDREN - }; - - for (i = 0; i < ARRAY_SIZE(list); i++) { - if (nvlist_lookup_nvlist_array(nv, list[i], &child, - &children) == 0) { - for (c = 0; c < children; c++) { - uint64_t ishole = 0; - - (void) nvlist_lookup_uint64(child[c], - ZPOOL_CONFIG_IS_HOLE, &ishole); - - if (ishole) - continue; - - ret |= for_each_vdev_cb(zhp, child[c], func, - data); - } - } - } - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) - return (ret); - - /* Don't run our function on root vdevs */ - if (strcmp(type, VDEV_TYPE_ROOT) != 0) { - ret |= func(zhp, nv, data); - } - - return (ret); -} - /* * This is the equivalent of for_each_pool() for vdevs. 
It iterates thorough * all vdevs in the pool, ignoring root vdevs and holes, calling func() on @@ -321,7 +282,7 @@ for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data) verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); } - return (for_each_vdev_cb(zhp, nvroot, func, data)); + return (for_each_vdev_cb((void *) zhp, nvroot, func, data)); } /* @@ -488,19 +449,25 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) /* Setup our custom environment variables */ rc = asprintf(&env[1], "VDEV_PATH=%s", data->path ? data->path : ""); - if (rc == -1) + if (rc == -1) { + env[1] = NULL; goto out; + } rc = asprintf(&env[2], "VDEV_UPATH=%s", data->upath ? data->upath : ""); - if (rc == -1) + if (rc == -1) { + env[2] = NULL; goto out; + } rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", data->vdev_enc_sysfs_path ? data->vdev_enc_sysfs_path : ""); - if (rc == -1) + if (rc == -1) { + env[3] = NULL; goto out; + } /* Run the command */ rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, @@ -519,8 +486,7 @@ out: /* Start with i = 1 since env[0] was statically allocated */ for (i = 1; i < ARRAY_SIZE(env); i++) - if (env[i] != NULL) - free(env[i]); + free(env[i]); } /* @@ -592,7 +558,7 @@ vdev_run_cmd_thread(void *cb_cmd_data) /* For each vdev in the pool run a command */ static int -for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) +for_each_vdev_run_cb(void *zhp_data, nvlist_t *nv, void *cb_vcdl) { vdev_cmd_data_list_t *vcdl = cb_vcdl; vdev_cmd_data_t *data; @@ -600,6 +566,7 @@ for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) char *vname = NULL; char *vdev_enc_sysfs_path = NULL; int i, match = 0; + zpool_handle_t *zhp = zhp_data; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return (1); @@ -616,7 +583,7 @@ for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) } } - /* Check for whitelisted vdevs here, if any */ + /* Check for selected vdevs here, if any */ for 
(i = 0; i < vcdl->vdev_names_count; i++) { vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags); if (strcmp(vcdl->vdev_names[i], vname) == 0) { @@ -627,7 +594,7 @@ for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) free(vname); } - /* If we whitelisted vdevs, and this isn't one of them, then bail out */ + /* If we selected vdevs, and this isn't one of them, then bail out */ if (!match && vcdl->vdev_names_count) return (0); @@ -711,7 +678,7 @@ all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, vcdl->g_zfs = g_zfs; /* Gather our list of all vdevs in all pools */ - for_each_pool(argc, argv, B_TRUE, NULL, + for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, all_pools_for_each_vdev_gather_cb, vcdl); /* Run command on all vdevs in all pools */ diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index a3c76030d6..3a2caa9a81 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. @@ -31,6 +31,8 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. 
* Copyright (c) 2019, loli10K + * Copyright (c) 2021, Colm Buckley + * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include @@ -43,10 +45,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -73,6 +77,8 @@ #include "statcommon.h" +libzfs_handle_t *g_zfs; + static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); @@ -118,6 +124,11 @@ static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); +static int zpool_do_wait(int, char **); + +static zpool_compat_status_t zpool_do_load_compat( + const char *, boolean_t *); + /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. @@ -168,7 +179,8 @@ typedef enum { HELP_SYNC, HELP_REGUID, HELP_REOPEN, - HELP_VERSION + HELP_VERSION, + HELP_WAIT } zpool_help_t; @@ -199,7 +211,7 @@ enum iostat_type { * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. 
*/ -static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { +static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, @@ -211,6 +223,7 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, @@ -218,6 +231,7 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, @@ -226,6 +240,7 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, @@ -240,6 +255,8 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, + ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, + ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, NULL}, }; @@ -309,6 +326,8 @@ static zpool_command_t command_table[] = { { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, + { NULL }, + { "wait", zpool_do_wait, HELP_WAIT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) @@ -328,7 +347,7 @@ get_usage(zpool_help_t idx) return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: - return (gettext("\tattach [-f] [-o property=value] " + return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); @@ -337,7 +356,7 @@ get_usage(zpool_help_t idx) "\t [-O 
file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: - return (gettext("\tcheckpoint [--discard] ...\n")); + return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: @@ -371,21 +390,21 @@ get_usage(zpool_help_t idx) case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: - return (gettext("\treplace [-f] [-o property=value] " + return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: - return (gettext("\tremove [-nps] ...\n")); + return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: - return (gettext("\tinitialize [-c | -s] " + return (gettext("\tinitialize [-c | -s] [-w] " "[ ...]\n")); case HELP_SCRUB: - return (gettext("\tscrub [-s | -p] ...\n")); + return (gettext("\tscrub [-s | -p] [-w] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: - return (gettext("\ttrim [-d] [-r ] [-c | -s] " + return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [-c [script1,script2,...]] " @@ -412,10 +431,12 @@ get_usage(zpool_help_t idx) return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion\n")); + case HELP_WAIT: + return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " + " [interval]\n")); + default: + __builtin_unreachable(); } - - abort(); - /* NOTREACHED */ } static void @@ -432,7 +453,8 @@ zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); - if (strcmp(path, VDEV_TYPE_INDIRECT) != 0) + if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && + strcmp(path, VDEV_TYPE_HOLE) != 0) fnvlist_add_boolean(res, path); free(path); @@ -472,7 +494,7 @@ print_prop_cb(int prop, void *cb) * that command. 
Otherwise, iterate over the entire command table and display * a complete usage message. */ -void +static void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; @@ -515,7 +537,7 @@ usage(boolean_t requested) (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " - "appended with a feature name.\nSee zpool-features(5).\n")); + "appended with a feature name.\nSee zpool-features(7).\n")); } /* @@ -530,12 +552,13 @@ usage(boolean_t requested) } /* - * zpool initialize [-c | -s] [ ...] + * zpool initialize [-c | -s] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. + * -w Wait. Blocks until initializing has completed. */ int zpool_do_initialize(int argc, char **argv) @@ -545,15 +568,17 @@ zpool_do_initialize(int argc, char **argv) zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; + boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, + {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; - while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && @@ -573,6 +598,9 @@ zpool_do_initialize(int argc, char **argv) } cmd_type = POOL_INITIALIZE_SUSPEND; break; + case 'w': + wait = B_TRUE; + break; case '?': if (optopt != 0) { (void) fprintf(stderr, @@ -595,6 +623,12 @@ zpool_do_initialize(int argc, char **argv) return (-1); } + if (wait && (cmd_type != POOL_INITIALIZE_START)) { + (void) fprintf(stderr, gettext("-w cannot be used with -c or " + "-s\n")); + usage(B_FALSE); + } + poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) 
@@ -613,7 +647,10 @@ zpool_do_initialize(int argc, char **argv) } } - err = zpool_initialize(zhp, cmd_type, vdevs); + if (wait) + err = zpool_initialize_wait(zhp, cmd_type, vdevs); + else + err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); @@ -641,9 +678,16 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, } for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; + uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *class = ""; + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole == B_TRUE) { + continue; + } + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) @@ -664,6 +708,54 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, } } +/* + * Print the list of l2cache devices for dry runs. + */ +static void +print_cache_list(nvlist_t *nv, int indent) +{ + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0 && children > 0) { + (void) printf("\t%*s%s\n", indent, "", "cache"); + } else { + return; + } + for (c = 0; c < children; c++) { + char *vname; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); + (void) printf("\t%*s%s\n", indent + 2, "", vname); + free(vname); + } +} + +/* + * Print the list of spares for dry runs. 
+ */ +static void +print_spare_list(nvlist_t *nv, int indent) +{ + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0 && children > 0) { + (void) printf("\t%*s%s\n", indent, "", "spares"); + } else { + return; + } + for (c = 0; c < children; c++) { + char *vname; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); + (void) printf("\t%*s%s\n", indent + 2, "", vname); + free(vname); + } +} + static boolean_t prop_list_contains_feature(nvlist_t *proplist) { @@ -699,6 +791,8 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); + const char *cname = + zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && !zpool_prop_feature(propname)) { @@ -721,6 +815,22 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, return (2); } + /* + * if version is specified, only "legacy" compatibility + * may be requested + */ + if ((prop == ZPOOL_PROP_COMPATIBILITY && + strcmp(propval, ZPOOL_COMPAT_LEGACY) != 0 && + nvlist_exists(proplist, vname)) || + (prop == ZPOOL_PROP_VERSION && + nvlist_exists(proplist, cname) && + strcmp(fnvlist_lookup_string(proplist, cname), + ZPOOL_COMPAT_LEGACY) != 0)) { + (void) fprintf(stderr, gettext("when 'version' is " + "specified, the 'compatibility' feature may only " + "be set to '" ZPOOL_COMPAT_LEGACY "'\n")); + return (2); + } if (zpool_prop_feature(propname)) normnm = propname; @@ -893,16 +1003,16 @@ zpool_do_add(int argc, char **argv) if (dryrun) { nvlist_t *poolnvroot; - nvlist_t **l2child; - uint_t l2children, c; + nvlist_t **l2child, **sparechild; + uint_t l2children, sparechildren, c; char *vname; - boolean_t hadcache = B_FALSE; + boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to 
the following " - "configuration:\n"), zpool_get_name(zhp)); + "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", @@ -910,20 +1020,35 @@ zpool_do_add(int argc, char **argv) print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ - print_vdev_tree(zhp, "dedup", poolnvroot, 0, - VDEV_ALLOC_BIAS_DEDUP, name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, - name_flags); + if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { + print_vdev_tree(zhp, "dedup", poolnvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { + print_vdev_tree(zhp, "dedup", nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + } - print_vdev_tree(zhp, "special", poolnvroot, 0, - VDEV_ALLOC_BIAS_SPECIAL, name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, - name_flags); + if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { + print_vdev_tree(zhp, "special", poolnvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { + print_vdev_tree(zhp, "special", nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + } - print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, - name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, - name_flags); + if (num_logs(poolnvroot) > 0) { + print_vdev_tree(zhp, "logs", poolnvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + } else if (num_logs(nvroot) > 0) { + print_vdev_tree(zhp, "logs", nvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + } /* Do the same for the caches */ if 
(nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, @@ -948,6 +1073,29 @@ zpool_do_add(int argc, char **argv) free(vname); } } + /* And finally the spares */ + if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, + &sparechild, &sparechildren) == 0 && sparechildren > 0) { + hadspare = B_TRUE; + (void) printf(gettext("\tspares\n")); + for (c = 0; c < sparechildren; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + sparechild[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &sparechild, &sparechildren) == 0 && sparechildren > 0) { + if (!hadspare) + (void) printf(gettext("\tspares\n")); + for (c = 0; c < sparechildren; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + sparechild[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } ret = 0; } else { @@ -962,7 +1110,7 @@ zpool_do_add(int argc, char **argv) } /* - * zpool remove ... + * zpool remove [-npsw] ... * * Removes the given vdev from the pool. 
*/ @@ -976,9 +1124,10 @@ zpool_do_remove(int argc, char **argv) int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; + boolean_t wait = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "nps")) != -1) { + while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; @@ -989,6 +1138,9 @@ zpool_do_remove(int argc, char **argv) case 's': stop = B_TRUE; break; + case 'w': + wait = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -1022,6 +1174,11 @@ zpool_do_remove(int argc, char **argv) } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; + if (wait) { + (void) fprintf(stderr, gettext("invalid option " + "combination: -w cannot be used with -s\n")); + usage(B_FALSE); + } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); @@ -1053,12 +1210,35 @@ zpool_do_remove(int argc, char **argv) ret = 1; } } + + if (ret == 0 && wait) + ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } +/* + * Return 1 if a vdev is active (being used in a pool) + * Return 0 if a vdev is inactive (offlined or faulted, or not in active pool) + * + * This is useful for checking if a disk in an active pool is offlined or + * faulted. + */ +static int +vdev_is_active(char *vdev_path) +{ + int fd; + fd = open(vdev_path, O_EXCL); + if (fd < 0) { + return (1); /* cant open O_EXCL - disk is active */ + } + + close(fd); + return (0); /* disk is inactive in the pool */ +} + /* * zpool labelclear [-f] * @@ -1140,7 +1320,7 @@ zpool_do_labelclear(int argc, char **argv) * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. 
*/ - if ((ioctl(fd, BLKFLSBUF) != 0) && (errno != ENOTTY)) + if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); @@ -1168,9 +1348,23 @@ zpool_do_labelclear(int argc, char **argv) case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: + /* + * We allow the user to call 'zpool offline -f' + * on an offlined disk in an active pool. We can check if + * the disk is online by calling vdev_is_active(). + */ + if (force && !vdev_is_active(vdev)) + break; + (void) fprintf(stderr, gettext( - "%s is a member (%s) of pool \"%s\"\n"), + "%s is a member (%s) of pool \"%s\""), vdev, zpool_pool_state_to_name(state), name); + + if (force) { + (void) fprintf(stderr, gettext( + ". Offline the disk first to clear its label.")); + } + printf("\n"); ret = 1; goto errout; @@ -1241,13 +1435,15 @@ zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; - boolean_t enable_all_pool_feat = B_TRUE; + boolean_t enable_pool_features = B_TRUE; + int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; + char *compat = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; @@ -1263,7 +1459,7 @@ zpool_do_create(int argc, char **argv) dryrun = B_TRUE; break; case 'd': - enable_all_pool_feat = B_FALSE; + enable_pool_features = B_FALSE; break; case 'R': altroot = optarg; @@ -1301,11 +1497,14 @@ zpool_do_create(int argc, char **argv) ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { - enable_all_pool_feat = B_FALSE; + enable_pool_features = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; + if (zpool_name_to_prop(optarg) == + ZPOOL_PROP_COMPATIBILITY) + compat = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { @@ -1493,14 +1692,33 @@ zpool_do_create(int argc, char **argv) 
VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); + print_cache_list(nvroot, 0); + print_spare_list(nvroot, 0); ret = 0; } else { /* - * Hand off to libzfs. + * Load in feature set. + * Note: if compatibility property not given, we'll have + * NULL, which means 'all features'. */ - spa_feature_t i; - for (i = 0; i < SPA_FEATURES; i++) { + boolean_t requested_features[SPA_FEATURES]; + if (zpool_do_load_compat(compat, requested_features) != + ZPOOL_COMPATIBILITY_OK) + goto errout; + + /* + * props contains list of features to enable. + * For each feature: + * - remove it if feature@name=disabled + * - leave it there if feature@name=enabled + * - add it if: + * - enable_pool_features (ie: no '-d' or '-o version') + * - it's supported by the kernel module + * - it's in the requested feature set + * - warn if it's enabled but not in compat + */ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; char *propval; zfeature_info_t *feat = &spa_feature_table[i]; @@ -1508,17 +1726,22 @@ zpool_do_create(int argc, char **argv) (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); - /* - * Only features contained in props will be enabled: - * remove from the nvlist every ZFS_FEATURE_DISABLED - * value and add every missing ZFS_FEATURE_ENABLED if - * enable_all_pool_feat is set. 
- */ if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) (void) nvlist_remove_all(props, propname); - } else if (enable_all_pool_feat) { + if (strcmp(propval, + ZFS_FEATURE_ENABLED) == 0 && + !requested_features[i]) + (void) fprintf(stderr, gettext( + "Warning: feature \"%s\" enabled " + "but is not in specified " + "'compatibility' feature set.\n"), + feat->fi_uname); + } else if ( + enable_pool_features && + feat->fi_zfs_mod_supported && + requested_features[i]) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) @@ -1532,8 +1755,10 @@ zpool_do_create(int argc, char **argv) zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { - if (zfs_mount(pool, NULL, 0) == 0) + if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_shareall(pool); + zfs_commit_all_shares(); + } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { @@ -1634,7 +1859,7 @@ typedef struct export_cbdata { /* * Export one pool */ -int +static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; @@ -1705,7 +1930,7 @@ zpool_do_export(int argc, char **argv) } return (for_each_pool(argc, argv, B_TRUE, NULL, - zpool_export_one, &cb)); + B_FALSE, zpool_export_one, &cb)); } /* check arguments */ @@ -1714,7 +1939,8 @@ zpool_do_export(int argc, char **argv) usage(B_FALSE); } - ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, + &cb); return (ret); } @@ -1871,7 +2097,7 @@ zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) * Mark empty values with dashes to make output * awk-able. 
*/ - if (is_blank_str(val)) + if (val == NULL || is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); @@ -1995,15 +2221,37 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose) } } +/* + * Return the color associated with a health string. This includes returning + * NULL for no color change. + */ +static char * +health_str_to_color(const char *health) +{ + if (strcmp(health, gettext("FAULTED")) == 0 || + strcmp(health, gettext("SUSPENDED")) == 0 || + strcmp(health, gettext("UNAVAIL")) == 0) { + return (ANSI_RED); + } + + if (strcmp(health, gettext("OFFLINE")) == 0 || + strcmp(health, gettext("DEGRADED")) == 0 || + strcmp(health, gettext("REMOVED")) == 0) { + return (ANSI_YELLOW); + } + + return (NULL); +} + /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, - nvlist_t *nv, int depth, boolean_t isspare) + nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; - uint_t c, children; + uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; @@ -2013,13 +2261,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, const char *state; char *type; char *path = NULL; + char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); + (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); @@ -2027,34 +2276,54 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. 
*/ if (vs->vs_aux == VDEV_AUX_SPARED) - state = "INUSE"; + state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) - state = "AVAIL"; + state = gettext("AVAIL"); } - (void) printf("\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, + printf_color(health_str_to_color(state), + "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { + if (vs->vs_read_errors) + rcolor = ANSI_RED; + + if (vs->vs_write_errors) + wcolor = ANSI_RED; + + if (vs->vs_checksum_errors) + ccolor = ANSI_RED; + if (cb->cb_literal) { - printf(" %5llu %5llu %5llu", - (u_longlong_t)vs->vs_read_errors, - (u_longlong_t)vs->vs_write_errors, + printf(" "); + printf_color(rcolor, "%5llu", + (u_longlong_t)vs->vs_read_errors); + printf(" "); + printf_color(wcolor, "%5llu", + (u_longlong_t)vs->vs_write_errors); + printf(" "); + printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); - printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + printf(" "); + printf_color(rcolor, "%5s", rbuf); + printf(" "); + printf_color(wcolor, "%5s", wbuf); + printf(" "); + printf_color(ccolor, "%5s", cbuf); } - if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ @@ -2069,16 +2338,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, else printf(" %5s", rbuf); } - } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - (void) printf(" was %s", path); + (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); - + color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); @@ -2100,6 +2368,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char 
*name, (void) printf(gettext("unsupported feature(s)")); break; + case VDEV_AUX_ASHIFT_TOO_BIG: + (void) printf(gettext("unsupported minimum blocksize")); + break; + case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); @@ -2146,10 +2418,22 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) printf(gettext("all children offline")); break; + case VDEV_AUX_BAD_LABEL: + (void) printf(gettext("invalid label")); + break; + default: (void) printf(gettext("corrupted data")); break; } + color_end(); + } else if (children == 0 && !isspare && + getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && + VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift) { + (void) printf( + gettext(" block size: %dB configured, %dB native"), + 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } /* The root vdev has the scrub/resilver stats */ @@ -2168,6 +2452,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } + /* The top-level vdevs have the rebuild stats */ + if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && + children == 0) { + if (vs->vs_rebuild_processed != 0) { + (void) printf(gettext(" (resilvering)")); + } + } + if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); @@ -2175,7 +2467,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } - /* Display vdev initialization and trim status for leaves */ + /* Display vdev initialization and trim status for leaves. 
*/ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); @@ -2197,11 +2489,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; + /* Provide vdev_rebuild_stats to children if available */ + if (vrs == NULL) { + (void) nvlist_lookup_uint64_array(nv, + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i); + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); - print_status_config(zhp, cb, vname, child[c], depth + 2, - isspare); + isspare, vrs); free(vname); } } @@ -2266,6 +2564,10 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, (void) printf(gettext("all children offline")); break; + case VDEV_AUX_BAD_LABEL: + (void) printf(gettext("invalid label")); + break; + default: (void) printf(gettext("corrupted data")); break; @@ -2370,7 +2672,7 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, - B_FALSE); + B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); @@ -2380,8 +2682,8 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, /* * Display the status for the given pool. */ -static void -show_import(nvlist_t *config) +static int +show_import(nvlist_t *config, boolean_t report_error) { uint64_t pool_state; vdev_stat_t *vs; @@ -2413,6 +2715,13 @@ show_import(nvlist_t *config) reason = zpool_import_status(config, &msgid, &errata); + /* + * If we're importing using a cachefile, then we won't report any + * errors unless we are in the scan phase of the import. 
+ */ + if (reason != ZPOOL_STATUS_OK && !report_error) + return (reason); + (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); @@ -2424,14 +2733,16 @@ show_import(nvlist_t *config) case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext(" status: One or more devices are " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: - (void) printf(gettext(" status: One or more devices contains " - "corrupted data.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices contains" + " corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: @@ -2440,81 +2751,123 @@ show_import(nvlist_t *config) break; case ZPOOL_STATUS_OFFLINE_DEV: - (void) printf(gettext(" status: One or more devices " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: - (void) printf(gettext(" status: The pool metadata is " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext(" status: The pool is formatted using a " - "legacy on-disk version.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext(" status: The pool is formatted using an " - "incompatible version.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + "an incompatible 
version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: - (void) printf(gettext(" status: Some supported features are " - "not enabled on the pool.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Some supported " + "features are not enabled on the pool.\n\t" + "(Note that they may be intentionally disabled " + "if the\n\t'compatibility' property is set.)\n")); + break; + + case ZPOOL_STATUS_COMPATIBILITY_ERR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Error reading or parsing " + "the file(s) indicated by the 'compatibility'\n" + "property.\n")); + break; + + case ZPOOL_STATUS_INCOMPATIBLE_FEAT: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more features " + "are enabled on the pool despite not being\n" + "requested by the 'compatibility' property.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: - (void) printf(gettext("status: The pool uses the following " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); + color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); + color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - (void) printf(gettext("status: The pool can only be accessed " - "in read-only mode on this system. It\n\tcannot be " - "accessed in read-write mode because it uses the " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool can only be " + "accessed in read-only mode on this system. 
It\n\tcannot be" + " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); + color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); + color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: - (void) printf(gettext(" status: The pool is currently " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: - (void) printf(gettext(" status: The pool has the " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: - (void) printf(gettext(" status: The pool was last accessed by " - "another system.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool was last accessed " + "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: - (void) printf(gettext(" status: One or more devices are " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: - (void) printf(gettext(" status: An intent log record cannot be " - "read.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("An intent log record cannot " + "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: - (void) printf(gettext(" status: One or more devices were being " - "resilvered.\n")); + case ZPOOL_STATUS_REBUILDING: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices were " + "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: - (void) printf(gettext(" status: Errata #%d detected.\n"), + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, 
gettext("Errata #%d detected.\n"), errata); break; + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + break; + default: /* * No other status can be seen when importing pools. @@ -2532,6 +2885,12 @@ show_import(nvlist_t *config) "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); + } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { + (void) printf(gettext(" action: The pool can be " + "imported using its name or numeric\n\tidentifier, " + "though the file(s) indicated by its " + "'compatibility'\n\tproperty cannot be parsed at " + "this time.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " @@ -2606,13 +2965,15 @@ show_import(nvlist_t *config) "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: - (void) printf(gettext("action: The pool cannot be " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - (void) printf(gettext("action: The pool cannot be " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported in read-write mode. 
Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " @@ -2673,9 +3034,11 @@ show_import(nvlist_t *config) "the '-f' flag.\n")); } - if (msgid != NULL) - (void) printf(gettext(" see: http://zfsonlinux.org/msg/%s\n"), + if (msgid != NULL) { + (void) printf(gettext( + " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); + } (void) printf(gettext(" config:\n\n")); @@ -2695,6 +3058,7 @@ show_import(nvlist_t *config) "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } + return (0); } static boolean_t @@ -2833,6 +3197,121 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, return (ret); } +static int +import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, + char *orig_name, char *new_name, + boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, + importargs_t *import) +{ + nvlist_t *config = NULL; + nvlist_t *found_config = NULL; + uint64_t pool_state; + + /* + * At this point we have a list of import candidate configs. Even if + * we were searching by pool name or guid, we still need to + * post-process the list to deal with pool state and possible + * duplicate names. 
+ */ + int err = 0; + nvpair_t *elem = NULL; + boolean_t first = B_TRUE; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + + verify(nvpair_value_nvlist(elem, &config) == 0); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) + continue; + if (do_destroyed && pool_state != POOL_STATE_DESTROYED) + continue; + + verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, + import->policy) == 0); + + if (!pool_specified) { + if (first) + first = B_FALSE; + else if (!do_all) + (void) printf("\n"); + + if (do_all) { + err |= do_import(config, NULL, mntopts, + props, flags); + } else { + /* + * If we're importing from cachefile, then + * we don't want to report errors until we + * are in the scan phase of the import. If + * we get an error, then we return that error + * to invoke the scan phase. + */ + if (import->cachefile && !import->scan) + err = show_import(config, B_FALSE); + else + (void) show_import(config, B_TRUE); + } + } else if (import->poolname != NULL) { + char *name; + + /* + * We are searching for a pool based on name. + */ + verify(nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, &name) == 0); + + if (strcmp(name, import->poolname) == 0) { + if (found_config != NULL) { + (void) fprintf(stderr, gettext( + "cannot import '%s': more than " + "one matching pool\n"), + import->poolname); + (void) fprintf(stderr, gettext( + "import by numeric ID instead\n")); + err = B_TRUE; + } + found_config = config; + } + } else { + uint64_t guid; + + /* + * Search for a pool by guid. + */ + verify(nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_GUID, &guid) == 0); + + if (guid == import->guid) + found_config = config; + } + } + + /* + * If we were searching for a specific pool, verify that we found a + * pool, and then do the import. 
+ */ + if (pool_specified && err == 0) { + if (found_config == NULL) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "no such pool available\n"), orig_name); + err = B_TRUE; + } else { + err |= do_import(found_config, new_name, + mntopts, props, flags); + } + } + + /* + * If we were just looking for pools, report an error if none were + * found. + */ + if (!pool_specified && first) + (void) fprintf(stderr, + gettext("no pools available to import\n")); + return (err); +} + typedef struct target_exists_args { const char *poolname; uint64_t poolguid; @@ -2874,28 +3353,36 @@ name_or_guid_exists(zpool_handle_t *zhp, void *data) * -d Discard the checkpoint from a checkpointed * --discard pool. * + * -w Wait for discarding a checkpoint to complete. + * --wait + * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { - boolean_t discard; + boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, + {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; - while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) { + wait = B_FALSE; + while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; + case 'w': + wait = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -2903,6 +3390,12 @@ zpool_do_checkpoint(int argc, char **argv) } } + if (wait && !discard) { + (void) fprintf(stderr, gettext("--wait only valid when " + "--discard also specified\n")); + usage(B_FALSE); + } + argc -= optind; argv += optind; @@ -2928,10 +3421,13 @@ zpool_do_checkpoint(int argc, char **argv) return (1); } - if (discard) + if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); - else + if (err == 0 && wait) + err = zpool_wait(zhp, 
ZPOOL_WAIT_CKPT_DISCARD); + } else { err = (zpool_checkpoint(zhp) != 0); + } zpool_close(zhp); @@ -2943,51 +3439,54 @@ zpool_do_checkpoint(int argc, char **argv) /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] - * [-d dir | -c cachefile] [-f] -a + * [-d dir | -c cachefile | -s] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] - * [-d dir | -c cachefile] [-f] [-n] [-F] [newpool] + * [-d dir | -c cachefile | -s] [-f] [-n] [-F] + * [newpool] * - * -c Read pool information from a cachefile instead of searching - * devices. + * -c Read pool information from a cachefile instead of searching + * devices. If importing from a cachefile config fails, then + * fallback to searching for devices only in the directories that + * exist in the cachefile. * - * -d Scan in a specific directory, other than /dev/. More than + * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * - * -D Scan for previously destroyed pools or import all or only - * specified destroyed pools. + * -D Scan for previously destroyed pools or import all or only + * specified destroyed pools. * - * -R Temporarily import the pool, with all mountpoints relative to + * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * - * -V Import even in the presence of faulted vdevs. This is an - * intentionally undocumented option for testing purposes, and - * treats the pool configuration as complete, leaving any bad + * -V Import even in the presence of faulted vdevs. This is an + * intentionally undocumented option for testing purposes, and + * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * - * -f Force import, even if it appears that the pool is active. 
+ * -f Force import, even if it appears that the pool is active. * - * -F Attempt rewind if necessary. + * -F Attempt rewind if necessary. * - * -n See if rewind would work, but don't actually rewind. + * -n See if rewind would work, but don't actually rewind. * - * -N Import the pool but don't mount datasets. + * -N Import the pool but don't mount datasets. * - * -T Specify a starting txg to use for import. This option is - * intentionally undocumented option for testing purposes. + * -T Specify a starting txg to use for import. This option is + * intentionally undocumented option for testing purposes. * - * -a Import all pools found. + * -a Import all pools found. * - * -l Load encryption keys while importing. + * -l Load encryption keys while importing. * - * -o Set property=value and/or temporary mount options (without '='). + * -o Set property=value and/or temporary mount options (without '='). * - * -s Scan using the default search path, the libblkid cache will - * not be consulted. + * -s Scan using the default search path, the libblkid cache will + * not be consulted. * - * --rewind-to-checkpoint - * Import the pool and revert back to the checkpoint. + * --rewind-to-checkpoint + * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. 
@@ -3004,15 +3503,11 @@ zpool_do_import(int argc, char **argv) boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; - nvpair_t *elem; - nvlist_t *config; uint64_t searchguid = 0; char *searchname = NULL; char *propval; - nvlist_t *found_config; nvlist_t *policy = NULL; nvlist_t *props = NULL; - boolean_t first; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; @@ -3020,7 +3515,8 @@ zpool_do_import(int argc, char **argv) boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; - uint64_t pool_state, txg = -1ULL; + boolean_t pool_specified = B_FALSE; + uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; @@ -3041,16 +3537,8 @@ zpool_do_import(int argc, char **argv) cachefile = optarg; break; case 'd': - if (searchdirs == NULL) { - searchdirs = safe_malloc(sizeof (char *)); - } else { - char **tmp = safe_malloc((nsearch + 1) * - sizeof (char *)); - bcopy(searchdirs, tmp, nsearch * - sizeof (char *)); - free(searchdirs); - searchdirs = tmp; - } + searchdirs = safe_realloc(searchdirs, + (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = optarg; break; case 'D': @@ -3142,6 +3630,11 @@ zpool_do_import(int argc, char **argv) usage(B_FALSE); } + if (cachefile && do_scan) { + (void) fprintf(stderr, gettext("-c is incompatible with -s\n")); + usage(B_FALSE); + } + if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); @@ -3222,7 +3715,7 @@ zpool_do_import(int argc, char **argv) searchname = argv[0]; searchguid = 0; } - found_config = NULL; + pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. @@ -3235,24 +3728,16 @@ zpool_do_import(int argc, char **argv) * Check the environment for the preferred search path. 
*/ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { - char *dir; + char *dir, *tmp = NULL; envdup = strdup(env); - dir = strtok(envdup, ":"); - while (dir != NULL) { - if (searchdirs == NULL) { - searchdirs = safe_malloc(sizeof (char *)); - } else { - char **tmp = safe_malloc((nsearch + 1) * - sizeof (char *)); - bcopy(searchdirs, tmp, nsearch * - sizeof (char *)); - free(searchdirs); - searchdirs = tmp; - } + for (dir = strtok_r(envdup, ":", &tmp); + dir != NULL; + dir = strtok_r(NULL, ":", &tmp)) { + searchdirs = safe_realloc(searchdirs, + (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = dir; - dir = strtok(NULL, ":"); } } @@ -3291,116 +3776,49 @@ zpool_do_import(int argc, char **argv) } if (err == 1) { - if (searchdirs != NULL) - free(searchdirs); - if (envdup != NULL) - free(envdup); + free(searchdirs); + free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } + err = import_pools(pools, props, mntopts, flags, + argc >= 1 ? argv[0] : NULL, + argc >= 2 ? argv[1] : NULL, + do_destroyed, pool_specified, do_all, &idata); + /* - * At this point we have a list of import candidate configs. Even if - * we were searching by pool name or guid, we still need to - * post-process the list to deal with pool state and possible - * duplicate names. + * If we're using the cachefile and we failed to import, then + * fallback to scanning the directory for pools that match + * those in the cachefile. */ - err = 0; - elem = NULL; - first = B_TRUE; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + if (err != 0 && cachefile != NULL) { + (void) printf(gettext("cachefile import failed, retrying\n")); - verify(nvpair_value_nvlist(elem, &config) == 0); + /* + * We use the scan flag to gather the directories that exist + * in the cachefile. If we need to fallback to searching for + * the pool config, we will only search devices in these + * directories. 
+ */ + idata.scan = B_TRUE; + nvlist_free(pools); + pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &pool_state) == 0); - if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) - continue; - if (do_destroyed && pool_state != POOL_STATE_DESTROYED) - continue; - - verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, - policy) == 0); - - if (argc == 0) { - if (first) - first = B_FALSE; - else if (!do_all) - (void) printf("\n"); - - if (do_all) { - err |= do_import(config, NULL, mntopts, - props, flags); - } else { - show_import(config); - } - } else if (searchname != NULL) { - char *name; - - /* - * We are searching for a pool based on name. - */ - verify(nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, &name) == 0); - - if (strcmp(name, searchname) == 0) { - if (found_config != NULL) { - (void) fprintf(stderr, gettext( - "cannot import '%s': more than " - "one matching pool\n"), searchname); - (void) fprintf(stderr, gettext( - "import by numeric ID instead\n")); - err = B_TRUE; - } - found_config = config; - } - } else { - uint64_t guid; - - /* - * Search for a pool by guid. - */ - verify(nvlist_lookup_uint64(config, - ZPOOL_CONFIG_POOL_GUID, &guid) == 0); - - if (guid == searchguid) - found_config = config; - } + err = import_pools(pools, props, mntopts, flags, + argc >= 1 ? argv[0] : NULL, + argc >= 2 ? argv[1] : NULL, + do_destroyed, pool_specified, do_all, &idata); } - /* - * If we were searching for a specific pool, verify that we found a - * pool, and then do the import. - */ - if (argc != 0 && err == 0) { - if (found_config == NULL) { - (void) fprintf(stderr, gettext("cannot import '%s': " - "no such pool available\n"), argv[0]); - err = B_TRUE; - } else { - err |= do_import(found_config, argc == 1 ? NULL : - argv[1], mntopts, props, flags); - } - } - - /* - * If we were just looking for pools, report an error if none were - * found. 
- */ - if (argc == 0 && first) - (void) fprintf(stderr, - gettext("no pools available to import\n")); - error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); - if (searchdirs != NULL) - free(searchdirs); - if (envdup != NULL) - free(envdup); + free(searchdirs); + free(envdup); return (err ? 1 : 0); } @@ -3439,7 +3857,8 @@ zpool_do_sync(int argc, char **argv) argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ - ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, zpool_sync_one, + &force); return (ret); } @@ -3464,22 +3883,23 @@ typedef struct name_and_columns { unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; -#define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */ +#define IOSTAT_MAX_LABELS 15 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, - {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}}, + {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {"rebuild", 1}, + {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, - {"trimq_write", 2}, {NULL}}, + {"trimq_write", 2}, {"rebuildq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, - {"trim", 2}, {NULL}}, + {"trim", 2}, {"rebuild", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ @@ -3488,14 +3908,17 @@ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, 
{NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, - {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}}, + {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {"wait"}, + {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, - {"pend"}, {"activ"}, {NULL}}, + {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, - {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}}, + {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {"rebuild"}, + {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, - {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, + {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, + {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { @@ -3573,7 +3996,7 @@ default_column_width(iostat_cbdata_t *cb, enum iostat_type type) * If force_column_width is set, use it for the column width. If not set, use * the default column width. 
*/ -void +static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { @@ -3645,7 +4068,7 @@ print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ -void +static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; @@ -3665,7 +4088,7 @@ print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { - printf("%*s", vcdl->uniq_cols_width[i], + printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } @@ -4127,6 +4550,8 @@ print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, }; struct stat_array *nva; @@ -4166,6 +4591,7 @@ print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, }; struct stat_array *nva; @@ -4257,11 +4683,11 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, uint64_t tdelta; double scale; - calcvs = safe_malloc(sizeof (*calcvs)); - if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); + calcvs = safe_malloc(sizeof (*calcvs)); + if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); @@ -4495,7 +4921,7 @@ refresh_iostat(zpool_handle_t *zhp, void *data) /* * Callback to print out the iostats for the given pool. 
*/ -int +static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; @@ -4588,7 +5014,7 @@ get_interval_count(int *argcp, char **argv, float *iv, /* * Determine if the last argument is an integer or a pool name */ - if (argc > 0 && isnumber(argv[argc - 1])) { + if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; @@ -4596,8 +5022,8 @@ get_interval_count(int *argcp, char **argv, float *iv, if (*end == '\0' && errno == 0) { if (interval == 0) { - (void) fprintf(stderr, gettext("interval " - "cannot be zero\n")); + (void) fprintf(stderr, gettext( + "interval cannot be zero\n")); usage(B_FALSE); } /* @@ -4618,7 +5044,7 @@ get_interval_count(int *argcp, char **argv, float *iv, * If the last argument is also an integer, then we have both a count * and an interval. */ - if (argc > 0 && isnumber(argv[argc - 1])) { + if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; @@ -4627,8 +5053,8 @@ get_interval_count(int *argcp, char **argv, float *iv, if (*end == '\0' && errno == 0) { if (interval == 0) { - (void) fprintf(stderr, gettext("interval " - "cannot be zero\n")); + (void) fprintf(stderr, gettext( + "interval cannot be zero\n")); usage(B_FALSE); } @@ -4730,11 +5156,12 @@ get_stat_flags(zpool_list_t *list) * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. */ static int -is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) +is_vdev_cb(void *zhp_data, nvlist_t *nv, void *cb_data) { iostat_cbdata_t *cb = cb_data; char *name = NULL; int ret = 0; + zpool_handle_t *zhp = zhp_data; name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); @@ -4784,7 +5211,7 @@ are_vdevs_in_pool(int argc, char **argv, char *pool_name, /* Is this name a vdev in our pools? 
*/ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, - is_vdev, cb); + B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; @@ -4812,7 +5239,8 @@ is_pool_cb(zpool_handle_t *zhp, void *data) static int is_pool(char *name) { - return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); + return (for_each_pool(0, NULL, B_TRUE, NULL, B_FALSE, is_pool_cb, + name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ @@ -4907,6 +5335,24 @@ fsleep(float sec) nanosleep(&req, NULL); } +/* + * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or + * if we were unable to determine its size. + */ +static int +terminal_height(void) +{ + struct winsize win; + + if (isatty(STDOUT_FILENO) == 0) + return (-1); + + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) + return (win.ws_row); + + return (-1); +} + /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. @@ -4971,7 +5417,7 @@ print_zpool_dir_scripts(char *dirpath) static void print_zpool_script_list(char *subcommand) { - char *dir, *sp; + char *dir, *sp, *tmp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); @@ -4979,11 +5425,10 @@ print_zpool_script_list(char *subcommand) if (sp == NULL) return; - dir = strtok(sp, ":"); - while (dir != NULL) { + for (dir = strtok_r(sp, ":", &tmp); + dir != NULL; + dir = strtok_r(NULL, ":", &tmp)) print_zpool_dir_scripts(dir); - dir = strtok(NULL, ":"); - } free(sp); } @@ -4991,22 +5436,48 @@ print_zpool_script_list(char *subcommand) /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. 
+ * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; - int width, columns; + int width, available_width; + /* + * get_namewidth() returns the maximum width of any name in that column + * for any pool/vdev/device line that will be output. + */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); - columns = get_columns(); + /* + * The width we are calculating is the width of the header and also the + * padding width for names that are less than maximum width. The stats + * take up 42 characters, so the width available for names is: + */ + available_width = get_columns() - 42; + + /* + * If the maximum width fits on a screen, then great! Make everything + * line up by justifying all lines to the same width. If that max + * width is larger than what's available, the name plus stats won't fit + * on one line, and justifying to that width would cause every line to + * wrap on the screen. We only want lines with long names to wrap. + * Limit the padding to what won't wrap. + */ + if (width > available_width) + width = available_width; + + /* + * And regardless of whatever the screen width is (get_columns can + * return 0 if the width is not known or less than 42 for a narrow + * terminal) have the width be a minimum of 10. + */ if (width < 10) width = 10; - if (width > columns - 42) - width = columns - 42; + /* Save the calculated width */ cb->cb_namewidth = width; return (0); @@ -5047,7 +5518,6 @@ zpool_do_iostat(int argc, char **argv) int npools; float interval = 0; unsigned long count = 0; - struct winsize win; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; @@ -5221,7 +5691,7 @@ zpool_do_iostat(int argc, char **argv) * Construct the list of all interesting pools. 
*/ ret = 0; - if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) + if ((list = pool_list_get(argc, argv, NULL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { @@ -5335,25 +5805,19 @@ zpool_do_iostat(int argc, char **argv) cb.vcdl = NULL; } - /* - * Are we connected to TTY? If not, headers_once - * should be true, to avoid breaking scripts. - */ - if (isatty(fileno(stdout)) == 0) - headers_once = B_TRUE; /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ - if (headers_once == B_FALSE) { - if (ioctl(1, TIOCGWINSZ, &win) != -1 && - win.ws_row > 0) - winheight = win.ws_row; - else - headers_once = B_TRUE; - } + winheight = terminal_height(); + /* + * Are we connected to TTY? If not, headers_once + * should be true, to avoid breaking scripts. + */ + if (winheight < 0) + headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, @@ -5593,7 +6057,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str, break; case ZPOOL_PROP_HEALTH: width = 8; - snprintf(propval, sizeof (propval), "%-*s", (int)width, str); + (void) strlcpy(propval, str, sizeof (propval)); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); @@ -5612,7 +6076,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str, * print static default line per vdev * not compatible with '-o' option */ -void +static void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare) { @@ -5779,7 +6243,7 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, /* * Generic callback function to list a pool. 
*/ -int +static int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; @@ -5901,7 +6365,7 @@ zpool_do_list(int argc, char **argv) for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, - &ret)) == NULL) + cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) @@ -5943,6 +6407,8 @@ static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; + boolean_t rebuild = B_FALSE; + boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; @@ -5952,7 +6418,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) int ret; /* check options */ - while ((c = getopt(argc, argv, "fo:")) != -1) { + while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -5970,6 +6436,12 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; + case 's': + rebuild = B_TRUE; + break; + case 'w': + wait = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6051,7 +6523,12 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) return (1); } - ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); + ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, + rebuild); + + if (ret == 0 && wait) + ret = zpool_wait(zhp, + replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); nvlist_free(props); nvlist_free(nvroot); @@ -6061,9 +6538,12 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) } /* - * zpool replace [-f] + * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. + * -o Set property=value. + * -w Wait for replacing to complete before returning * * Replace with . 
*/ @@ -6075,10 +6555,12 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-f] [-o property=value] + * zpool attach [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. + * -w Wait for resilvering to complete before returning * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of @@ -6111,9 +6593,8 @@ zpool_do_detach(int argc, char **argv) int ret; /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { + while ((c = getopt(argc, argv, "")) != -1) { switch (c) { - case 'f': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6284,6 +6765,10 @@ zpool_do_split(int argc, char **argv) "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); + print_vdev_tree(NULL, "dedup", config, 0, + VDEV_ALLOC_BIAS_DEDUP, 0); + print_vdev_tree(NULL, "special", config, 0, + VDEV_ALLOC_BIAS_SPECIAL, 0); } } @@ -6342,12 +6827,11 @@ zpool_do_online(int argc, char **argv) int flags = 0; /* check options */ - while ((c = getopt(argc, argv, "et")) != -1) { + while ((c = getopt(argc, argv, "e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; - case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6637,7 +7121,7 @@ zpool_do_reopen(int argc, char **argv) argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ - ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one, + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); @@ -6645,8 +7129,6 @@ zpool_do_reopen(int argc, char **argv) typedef struct scrub_cbdata { int cb_type; - int cb_argc; - char **cb_argv; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; @@ -6676,7 +7158,7 @@ zpool_has_checkpoint(zpool_handle_t *zhp) return (B_FALSE); } 
-int +static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; @@ -6703,23 +7185,33 @@ scrub_callback(zpool_handle_t *zhp, void *data) return (err != 0); } +static int +wait_callback(zpool_handle_t *zhp, void *data) +{ + zpool_wait_activity_t *act = data; + return (zpool_wait(zhp, *act)); +} + /* - * zpool scrub [-s | -p] ... + * zpool scrub [-s | -p] [-w] ... * * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. + * -w Wait. Blocks until scrub has completed. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; + boolean_t wait = B_FALSE; + int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ - while ((c = getopt(argc, argv, "sp")) != -1) { + while ((c = getopt(argc, argv, "spw")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; @@ -6727,6 +7219,9 @@ zpool_do_scrub(int argc, char **argv) case 'p': cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; break; + case 'w': + wait = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6741,8 +7236,13 @@ zpool_do_scrub(int argc, char **argv) usage(B_FALSE); } - cb.cb_argc = argc; - cb.cb_argv = argv; + if (wait && (cb.cb_type == POOL_SCAN_NONE || + cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { + (void) fprintf(stderr, gettext("invalid option combination: " + "-w cannot be used with -p or -s\n")); + usage(B_FALSE); + } + argc -= optind; argv += optind; @@ -6751,7 +7251,16 @@ zpool_do_scrub(int argc, char **argv) usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); + error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + scrub_callback, &cb); + + if (wait && !error) { + zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; + error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + wait_callback, &act); + } + + return (error); } /* @@ -6767,8 +7276,6 @@ zpool_do_resilver(int argc, char **argv) cb.cb_type = POOL_SCAN_RESILVER; 
cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; - cb.cb_argc = argc; - cb.cb_argv = argv; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { @@ -6788,7 +7295,8 @@ zpool_do_resilver(int argc, char **argv) usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); + return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + scrub_callback, &cb)); } /* @@ -6799,6 +7307,7 @@ zpool_do_resilver(int argc, char **argv) * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. + * -w Wait. Blocks until trimming has completed. */ int zpool_do_trim(int argc, char **argv) @@ -6808,15 +7317,17 @@ zpool_do_trim(int argc, char **argv) {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, + {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; + boolean_t wait = B_FALSE; int c; - while ((c = getopt_long(argc, argv, "cdr:s", long_options, NULL)) + while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -6842,9 +7353,10 @@ zpool_do_trim(int argc, char **argv) "combined with the -c or -s options\n")); usage(B_FALSE); } - if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) { - (void) fprintf(stderr, - gettext("invalid value for rate\n")); + if (zfs_nicestrtonum(g_zfs, optarg, &rate) == -1) { + (void) fprintf(stderr, "%s: %s\n", + gettext("invalid value for rate"), + libzfs_error_description(g_zfs)); usage(B_FALSE); } break; @@ -6857,6 +7369,9 @@ zpool_do_trim(int argc, char **argv) } cmd_type = POOL_TRIM_SUSPEND; break; + case 'w': + wait = B_TRUE; + break; case '?': if (optopt != 0) { (void) fprintf(stderr, @@ -6879,6 +7394,12 @@ zpool_do_trim(int argc, char **argv) return (-1); } + if (wait && (cmd_type != POOL_TRIM_START)) { + (void) fprintf(stderr, 
gettext("-w cannot be used with -c or " + "-s\n")); + usage(B_FALSE); + } + char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) @@ -6887,6 +7408,7 @@ zpool_do_trim(int argc, char **argv) trimflags_t trim_flags = { .secure = secure, .rate = rate, + .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); @@ -6912,22 +7434,45 @@ zpool_do_trim(int argc, char **argv) return (error); } +/* + * Converts a total number of seconds to a human readable string broken + * down in to days/hours/minutes/seconds. + */ +static void +secs_to_dhms(uint64_t total, char *buf) +{ + uint64_t days = total / 60 / 60 / 24; + uint64_t hours = (total / 60 / 60) % 24; + uint64_t mins = (total / 60) % 60; + uint64_t secs = (total % 60); + + if (days > 0) { + (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", + (u_longlong_t)days, (u_longlong_t)hours, + (u_longlong_t)mins, (u_longlong_t)secs); + } else { + (void) sprintf(buf, "%02llu:%02llu:%02llu", + (u_longlong_t)hours, (u_longlong_t)mins, + (u_longlong_t)secs); + } +} + /* * Print out detailed scrub status. */ static void -print_scan_status(pool_scan_stat_t *ps) +print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t total_secs_left; - uint64_t elapsed, secs_left, mins_left, hours_left, days_left; uint64_t pass_scanned, scanned, pass_issued, issued, total; - uint64_t scan_rate, issue_rate; + uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; - char srate_buf[7], irate_buf[7]; + char srate_buf[7], irate_buf[7], time_buf[32]; - (void) printf(gettext(" scan: ")); + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || @@ -6947,26 +7492,18 @@ print_scan_status(pool_scan_stat_t *ps) /* Scan is finished or canceled. 
*/ if (ps->pss_state == DSS_FINISHED) { - total_secs_left = end - start; - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); + secs_to_dhms(end - start, time_buf); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { @@ -7014,13 +7551,9 @@ print_scan_status(pool_scan_stat_t *ps) scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; - total_secs_left = (issue_rate != 0 && total >= issued) ? + uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? 
((total - issued) / issue_rate) : UINT64_MAX; - - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); + secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); @@ -7050,10 +7583,84 @@ print_scan_status(pool_scan_stat_t *ps) if (pause == 0) { if (total_secs_left != UINT64_MAX && issue_rate >= 10 * 1024 * 1024) { - (void) printf(gettext(", %llu days " - "%02llu:%02llu:%02llu to go\n"), - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } +} + +static void +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +{ + if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) + return; + + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); + + uint64_t bytes_scanned = vrs->vrs_bytes_scanned; + uint64_t bytes_issued = vrs->vrs_bytes_issued; + uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; + uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / + (vrs->vrs_pass_time_ms + 1)) * 1000; + uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / + (vrs->vrs_pass_time_ms + 1)) * 1000; + double scan_pct = MIN((double)bytes_scanned * 100 / + (bytes_est + 1), 100); + + /* Format all of the numbers we will be reporting */ + char bytes_scanned_buf[7], bytes_issued_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; + zfs_nicebytes(bytes_scanned, bytes_scanned_buf, + sizeof (bytes_scanned_buf)); + zfs_nicebytes(bytes_issued, bytes_issued_buf, + sizeof (bytes_issued_buf)); + zfs_nicebytes(bytes_rebuilt, 
bytes_rebuilt_buf, + sizeof (bytes_rebuilt_buf)); + zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + + time_t start = vrs->vrs_start_time; + time_t end = vrs->vrs_end_time; + + /* Rebuild is finished or canceled. */ + if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { + secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); + (void) printf(gettext("resilvered (%s) %s in %s " + "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, + time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { + (void) printf(gettext("resilver (%s) canceled on %s"), + vdev_name, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + (void) printf(gettext("resilver (%s) in progress since %s"), + vdev_name, ctime(&start)); + } + + assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); + + secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / + MAX(scan_rate, 1), time_buf); + + (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " + "%s total\n"), bytes_scanned_buf, scan_rate_buf, + bytes_issued_buf, issue_rate_buf, bytes_est_buf); + (void) printf(gettext("\t%s resilvered, %.2f%% done"), + bytes_rebuilt_buf, scan_pct); + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + if (scan_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); @@ -7064,9 +7671,38 @@ print_scan_status(pool_scan_stat_t *ps) } /* - * As we don't scrub checkpointed blocks, we want to warn the - * user that we skipped scanning some blocks if a checkpoint exists - * or existed at any time during the scan. + * Print rebuild status for top-level vdevs. 
+ */ +static void +print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + char *name = zpool_vdev_name(g_zfs, zhp, + child[c], VDEV_NAME_TYPE_ID); + print_rebuild_status_impl(vrs, name); + free(name); + } + } +} + +/* + * As we don't scrub checkpointed blocks, we want to warn the user that we + * skipped scanning some blocks if a checkpoint exists or existed at any + * time during the scan. If a sequential instead of healing reconstruction + * was performed then the blocks were reconstructed. However, their checksums + * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) @@ -7097,6 +7733,95 @@ print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) } } +/* + * Returns B_TRUE if there is an active rebuild in progress. Otherwise, + * B_FALSE is returned and 'rebuild_end_time' is set to the end time for + * the last completed (or cancelled) rebuild. 
+ */ +static boolean_t +check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) +{ + nvlist_t **child; + uint_t children; + boolean_t rebuilding = B_FALSE; + uint64_t end_time = 0; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + + if (vrs->vrs_end_time > end_time) + end_time = vrs->vrs_end_time; + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + rebuilding = B_TRUE; + end_time = 0; + break; + } + } + } + + if (rebuild_end_time != NULL) + *rebuild_end_time = end_time; + + return (rebuilding); +} + +/* + * Print the scan status. + */ +static void +print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + uint64_t rebuild_end_time = 0, resilver_end_time = 0; + boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; + boolean_t active_resilver = B_FALSE; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *ps = NULL; + uint_t c; + + if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c) == 0) { + if (ps->pss_func == POOL_SCAN_RESILVER) { + resilver_end_time = ps->pss_end_time; + active_resilver = (ps->pss_state == DSS_SCANNING); + } + + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); + have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); + } + + boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); + boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); + + /* Always print the scrub status when available. */ + if (have_scrub) + print_scan_scrub_resilver_status(ps); + + /* + * When there is an active resilver or rebuild print its status. + * Otherwise print the status of the last resilver or rebuild. 
+ */ + if (active_resilver || (!active_rebuild && have_resilver && + resilver_end_time && resilver_end_time > rebuild_end_time)) { + print_scan_scrub_resilver_status(ps); + } else if (active_rebuild || (!active_resilver && have_rebuild && + rebuild_end_time && rebuild_end_time > resilver_end_time)) { + print_rebuild_status(zhp, nvroot); + } + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + print_checkpoint_scan_warning(ps, pcs); +} + /* * Print out detailed removal status. */ @@ -7125,7 +7850,7 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); - (void) printf(gettext("remove: ")); + printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; @@ -7181,8 +7906,8 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) * do not print estimated time if hours_left is more than * 30 days */ - (void) printf(gettext(" %s copied out of %s at %s/s, " - "%.2f%% done"), + (void) printf(gettext( + "\t%s copied out of %s at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), @@ -7192,12 +7917,13 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) ", (copy is slow, no estimated time)\n")); } } + free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); - (void) printf(gettext(" %s memory used for " - "removed device mappings\n"), + (void) printf(gettext( + "\t%s memory used for removed device mappings\n"), mem_buf); } } @@ -7282,7 +8008,7 @@ print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); - print_status_config(zhp, cb, name, spares[i], 2, B_TRUE); + 
print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } @@ -7302,7 +8028,8 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); - print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE); + print_status_config(zhp, cb, name, l2cache[i], 2, + B_FALSE, NULL); free(name); } } @@ -7352,7 +8079,7 @@ print_dedup_stats(nvlist_t *config) * pool: tank * status: DEGRADED * reason: One or more devices ... - * see: http://zfsonlinux.org/msg/ZFS-xxxx-01 + * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK @@ -7361,7 +8088,7 @@ print_dedup_stats(nvlist_t *config) * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ -int +static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; @@ -7385,7 +8112,9 @@ status_callback(zpool_handle_t *zhp, void *data) if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || - reason == ZPOOL_STATUS_FEAT_DISABLED)) { + reason == ZPOOL_STATUS_FEAT_DISABLED || + reason == ZPOOL_STATUS_COMPATIBILITY_ERR || + reason == ZPOOL_STATUS_INCOMPATIBLE_FEAT)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); @@ -7406,38 +8135,52 @@ status_callback(zpool_handle_t *zhp, void *data) health = zpool_get_state_str(zhp); - (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp)); - (void) printf(gettext(" state: %s\n"), health); + printf(" "); + printf_color(ANSI_BOLD, gettext("pool:")); + printf(" %s\n", zpool_get_name(zhp)); + printf(" "); + printf_color(ANSI_BOLD, gettext("state: ")); + + printf_color(health_str_to_color(health), "%s", health); + + printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: - (void) printf(gettext("status: One or 
more devices could not " - "be opened. Sufficient replicas exist for\n\tthe pool to " - "continue functioning in a degraded state.\n")); - (void) printf(gettext("action: Attach the missing device and " - "online it using 'zpool online'.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be opened. Sufficient replicas exist for\n\tthe pool " + "to continue functioning in a degraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Attach the missing device " + "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: - (void) printf(gettext("status: One or more devices could not " - "be opened. There are insufficient\n\treplicas for the " - "pool to continue functioning.\n")); - (void) printf(gettext("action: Attach the missing device and " - "online it using 'zpool online'.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be opened. There are insufficient\n\treplicas for the" + " pool to continue functioning.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Attach the missing device " + "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: - (void) printf(gettext("status: One or more devices could not " - "be used because the label is missing or\n\tinvalid. " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be used because the label is missing or\n\tinvalid. 
" "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); - (void) printf(gettext("action: Replace the device using " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: - (void) printf(gettext("status: One or more devices could not " - "be used because the label is missing \n\tor invalid. " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), @@ -7445,175 +8188,255 @@ status_callback(zpool_handle_t *zhp, void *data) break; case ZPOOL_STATUS_FAILING_DEV: - (void) printf(gettext("status: One or more devices has " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. 
Applications are " "unaffected.\n")); - (void) printf(gettext("action: Determine if the device needs " - "to be replaced, and clear the errors\n\tusing " - "'zpool clear' or replace the device with 'zpool " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Determine if the " + "device needs to be replaced, and clear the errors\n\tusing" + " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: - (void) printf(gettext("status: One or more devices has " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); - (void) printf(gettext("action: Online the device using " - "'zpool online' or replace the device with\n\t'zpool " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Online the device " + "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: - (void) printf(gettext("status: One or more devices has " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); - (void) printf(gettext("action: Online the device using " - "'zpool online' or replace the device with\n\t'zpool " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Online the device " + "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: - (void) printf(gettext("status: One or more devices is " + case ZPOOL_STATUS_REBUILDING: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being
resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); - (void) printf(gettext("action: Wait for the resilver to " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " "complete.\n")); break; + case ZPOOL_STATUS_REBUILD_SCRUB: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices have " + "been sequentially resilvered, scrubbing\n\tthe pool " + "is recommended.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " + "verify all data checksums.\n")); + break; + case ZPOOL_STATUS_CORRUPT_DATA: - (void) printf(gettext("status: One or more devices has " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); - (void) printf(gettext("action: Restore the file in question " - "if possible. Otherwise restore the\n\tentire pool from " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Restore the file in question" + " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: - (void) printf(gettext("status: The pool metadata is corrupted " - "and the pool cannot be opened.\n")); + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool metadata is " + "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext("status: The pool is formatted using a " - "legacy on-disk format. The pool can\n\tstill be used, " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + "a legacy on-disk format. 
The pool can\n\tstill be used, " "but some features are unavailable.\n")); - (void) printf(gettext("action: Upgrade the pool using 'zpool " - "upgrade'. Once this is done, the\n\tpool will no longer " - "be accessible on software that does not support\n\t" + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " + "'zpool upgrade'. Once this is done, the\n\tpool will no " + "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext("status: The pool has been upgraded to a " - "newer, incompatible on-disk version.\n\tThe pool cannot " - "be accessed on this system.\n")); - (void) printf(gettext("action: Access the pool from a system " - "running more recent software, or\n\trestore the pool from " - "backup.\n")); - break; - - case ZPOOL_STATUS_FEAT_DISABLED: - (void) printf(gettext("status: Some supported features are not " - "enabled on the pool. The pool can\n\tstill be used, but " - "some features are unavailable.\n")); - (void) printf(gettext("action: Enable all features using " - "'zpool upgrade'. Once this is done,\n\tthe pool may no " - "longer be accessible by software that does not support\n\t" - "the features. 
See zpool-features(5) for details.\n")); - break; - - case ZPOOL_STATUS_UNSUP_FEAT_READ: - (void) printf(gettext("status: The pool cannot be accessed on " - "this system because it uses the\n\tfollowing feature(s) " - "not supported on this system:\n")); - zpool_print_unsup_feat(config); - (void) printf("\n"); - (void) printf(gettext("action: Access the pool from a system " - "that supports the required feature(s),\n\tor restore the " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " + "to a newer, incompatible on-disk version.\n\tThe pool " + "cannot be accessed on this system.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Access the pool from a " + "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; + case ZPOOL_STATUS_FEAT_DISABLED: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Some supported and " + "requested features are not enabled on the pool.\n\t" + "The pool can still be used, but some features are " + "unavailable.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Enable all features using " + "'zpool upgrade'. Once this is done,\n\tthe pool may no " + "longer be accessible by software that does not support\n\t" + "the features. See zpool-features(7) for details.\n")); + break; + + case ZPOOL_STATUS_COMPATIBILITY_ERR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("This pool has a " + "compatibility list specified, but it could not be\n\t" + "read/parsed at this time. 
The pool can still be used, " + "but this\n\tshould be investigated.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Check the value of the " + "'compatibility' property against the\n\t" + "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or " + ZPOOL_DATA_COMPAT_D ".\n")); + break; + + case ZPOOL_STATUS_INCOMPATIBLE_FEAT: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more features " + "are enabled on the pool despite not being\n\t" + "requested by the 'compatibility' property.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Consider setting " + "'compatibility' to an appropriate value, or\n\t" + "adding needed features to the relevant file in\n\t" + ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_READ: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " + "on this system because it uses the\n\tfollowing feature(s)" + " not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Access the pool from a " + "system that supports the required feature(s),\n\tor " + "restore the pool from backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - (void) printf(gettext("status: The pool can only be accessed " - "in read-only mode on this system. It\n\tcannot be " - "accessed in read-write mode because it uses the " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool can only be " + "accessed in read-only mode on this system. 
It\n\tcannot be" + " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); - (void) printf(gettext("action: The pool cannot be accessed in " - "read-write mode. Import the pool with\n" + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " + "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: - (void) printf(gettext("status: One or more devices are " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); - (void) printf(gettext("action: Replace the faulted device, " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: - (void) printf(gettext("status: One or more devices are " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); - (void) printf(gettext("action: Destroy and re-create the pool " - "from a backup source. Manually marking the device\n" + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " + "pool from a backup source. 
Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: - (void) printf(gettext("status: The pool is suspended because " - "multihost writes failed or were delayed;\n\tanother " - "system could import the pool undetected.\n")); - (void) printf(gettext("action: Make sure the pool's devices " - "are connected, then reboot your system and\n\timport the " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is suspended " + "because multihost writes failed or were delayed;\n\t" + "another system could import the pool undetected.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" + " are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: - (void) printf(gettext("status: One or more devices are " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to IO failures.\n")); - (void) printf(gettext("action: Make sure the affected devices " - "are connected, then run 'zpool clear'.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Make sure the affected " + "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: - (void) printf(gettext("status: An intent log record " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); - (void) printf(gettext("action: Either restore the affected " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " 
"'zpool clear'.\n")); break; + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + (void) printf(gettext("status: One or more devices are " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + (void) printf(gettext("action: Replace affected devices with " + "devices that support the\n\tconfigured block size, or " + "migrate data to a properly configured\n\tpool.\n")); + break; + case ZPOOL_STATUS_HOSTID_MISMATCH: - (void) printf(gettext("status: Mismatch between pool hostid " - "and system hostid on imported pool.\n\tThis pool was " + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" + " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); - (void) printf(gettext("action: Export this pool on all systems " - "on which it is imported.\n" + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Export this pool on all " + "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: - (void) printf(gettext("status: Errata #%d detected.\n"), + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); switch (errata) { @@ -7621,16 +8444,18 @@ status_callback(zpool_handle_t *zhp, void *data) break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: - (void) printf(gettext("action: To correct the issue " - "run 'zpool scrub'.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the issue" + " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); - (void) printf(gettext("action: To correct the issue " - "backup existing encrypted datasets to 
new\n\t" + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the issue" + " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); @@ -7641,13 +8466,14 @@ status_callback(zpool_handle_t *zhp, void *data) "and bookmarks contain an on-disk\n\tincompat" "ibility. This may cause on-disk corruption if " "they are used\n\twith 'zfs recv'.\n")); - (void) printf(gettext("action: To correct the issue, " - "enable the bookmark_v2 feature. No additional\n\t" - "action is needed if there are no encrypted " - "snapshots or bookmarks.\n\tIf preserving the " - "encrypted snapshots and bookmarks is required, " - "use\n\ta non-raw send to backup and restore them. " - "Alternately, they may be\n\tremoved to resolve " + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the" + " issue, enable the bookmark_v2 feature. No " + "additional\n\taction is needed if there are no " + "encrypted snapshots or bookmarks.\n\tIf preserving" + " the encrypted snapshots and bookmarks is required," + " use\n\ta non-raw send to backup and restore them."
+ " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; @@ -7667,28 +8493,29 @@ status_callback(zpool_handle_t *zhp, void *data) assert(reason == ZPOOL_STATUS_OK); } - if (msgid != NULL) - (void) printf(gettext(" see: http://zfsonlinux.org/msg/%s\n"), + if (msgid != NULL) { + printf(" "); + printf_color(ANSI_BOLD, gettext("see:")); + printf(gettext( + " https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); + } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; - pool_scan_stat_t *ps = NULL; pool_removal_stat_t *prs = NULL; + print_scan_status(zhp, nvroot); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + print_removal_status(zhp, prs); + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); - - print_scan_status(ps); - print_checkpoint_scan_warning(ps, pcs); - print_removal_status(zhp, prs); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, @@ -7696,13 +8523,16 @@ status_callback(zpool_handle_t *zhp, void *data) if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; + color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); + color_end(); - if (cbp->cb_print_slow_ios) - (void) printf(" %5s", gettext("SLOW")); + if (cbp->cb_print_slow_ios) { + printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); + } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); @@ -7710,7 +8540,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, - B_FALSE); + B_FALSE, NULL); 
print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); @@ -7889,7 +8719,7 @@ zpool_do_status(int argc, char **argv) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); - ret = for_each_pool(argc, argv, B_TRUE, NULL, + ret = for_each_pool(argc, argv, B_TRUE, NULL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) @@ -7954,6 +8784,11 @@ upgrade_version(zpool_handle_t *zhp, uint64_t version) verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); + char compat[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, + ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) + compat[0] = '\0'; + assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); @@ -7968,6 +8803,13 @@ upgrade_version(zpool_handle_t *zhp, uint64_t version) return (1); } + if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { + (void) fprintf(stderr, gettext("Upgrade not performed because " + "'compatibility' property set to '" + ZPOOL_COMPAT_LEGACY "'.\n")); + return (1); + } + ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); @@ -7993,11 +8835,25 @@ upgrade_enable_all(zpool_handle_t *zhp, int *countp) boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); + char compat[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, + ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) + compat[0] = '\0'; + + boolean_t requested_features[SPA_FEATURES]; + if (zpool_do_load_compat(compat, requested_features) != + ZPOOL_COMPATIBILITY_OK) + return (-1); + count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; - if (!nvlist_exists(enabled, fguid)) { + + if (!spa_feature_table[i].fi_zfs_mod_supported) + continue; + + if (!nvlist_exists(enabled, fguid) && requested_features[i]) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); 
ret = zpool_set_prop(zhp, propname, @@ -8030,7 +8886,7 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; - boolean_t printnl = B_FALSE; + boolean_t modified_pool = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); @@ -8044,7 +8900,7 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); - printnl = B_TRUE; + modified_pool = B_TRUE; /* * If they did "zpool upgrade -a", then we could @@ -8064,12 +8920,13 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) if (count > 0) { cbp->cb_first = B_FALSE; - printnl = B_TRUE; + modified_pool = B_TRUE; } } - if (printnl) { - (void) printf(gettext("\n")); + if (modified_pool) { + (void) printf("\n"); + (void) after_zpool_upgrade(zhp); } return (0); @@ -8095,7 +8952,10 @@ upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " - "support feature\nflags.\n\n")); + "support feature\nflags.\n\n" + "Note that setting a pool's 'compatibility' " + "feature to '" ZPOOL_COMPAT_LEGACY "' will\n" + "inhibit upgrades.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; @@ -8127,6 +8987,10 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; + + if (!spa_feature_table[i].fi_zfs_mod_supported) + continue; + if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " @@ -8136,8 +9000,12 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) "pool may become incompatible with " "software\nthat does not support " "the feature. 
See " - "zpool-features(5) for " - "details.\n\n")); + "zpool-features(7) for " + "details.\n\n" + "Note that the pool " + "'compatibility' feature can be " + "used to inhibit\nfeature " + "upgrades.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" @@ -8171,7 +9039,7 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) static int upgrade_one(zpool_handle_t *zhp, void *data) { - boolean_t printnl = B_FALSE; + boolean_t modified_pool = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; @@ -8199,7 +9067,7 @@ upgrade_one(zpool_handle_t *zhp, void *data) } if (cur_version != cbp->cb_version) { - printnl = B_TRUE; + modified_pool = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); @@ -8212,16 +9080,17 @@ upgrade_one(zpool_handle_t *zhp, void *data) return (ret); if (count != 0) { - printnl = B_TRUE; + modified_pool = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " - "supported features enabled.\n"), + "supported and requested features enabled.\n"), zpool_get_name(zhp)); } } - if (printnl) { - (void) printf(gettext("\n")); + if (modified_pool) { + (void) printf("\n"); + (void) after_zpool_upgrade(zhp); } return (0); @@ -8316,6 +9185,8 @@ zpool_do_upgrade(int argc, char **argv) "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; + if (!fi->fi_zfs_mod_supported) + continue; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
" (read-only compatible)" : ""; @@ -8376,8 +9247,8 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " - "pool already has all supported features " - "enabled.\n")); + "pool already has all supported and " + "requested features enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), @@ -8403,12 +9274,12 @@ zpool_do_upgrade(int argc, char **argv) if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " - "all supported features enabled.\n")); + "all supported and requested features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { - ret = for_each_pool(argc, argv, B_FALSE, NULL, + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, upgrade_one, &cb); } @@ -8421,30 +9292,18 @@ typedef struct hist_cbdata { boolean_t internal; } hist_cbdata_t; -/* - * Print out the command history for a specific pool. 
- */ -static int -get_history_one(zpool_handle_t *zhp, void *data) +static void +print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { - nvlist_t *nvhis; nvlist_t **records; uint_t numrecords; - int ret, i; - hist_cbdata_t *cb = (hist_cbdata_t *)data; - - cb->first = B_FALSE; - - (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); - - if ((ret = zpool_get_history(zhp, &nvhis)) != 0) - return (ret); + int i; verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; - char tbuf[30] = ""; + char tbuf[64] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; @@ -8456,6 +9315,14 @@ get_history_one(zpool_handle_t *zhp, void *data) (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } + if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) { + uint64_t elapsed_ns = fnvlist_lookup_int64(records[i], + ZPOOL_HIST_ELAPSED_NS); + (void) snprintf(tbuf + strlen(tbuf), + sizeof (tbuf) - strlen(tbuf), + " (%lldms)", (long long)elapsed_ns / 1000 / 1000); + } + if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); @@ -8506,6 +9373,12 @@ get_history_one(zpool_handle_t *zhp, void *data) dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } + if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { + (void) printf(" output nvlist omitted; " + "original size: %lldKB\n", + (longlong_t)fnvlist_lookup_int64(rec, + ZPOOL_HIST_OUTPUT_SIZE) / 1024); + } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, @@ -8542,8 +9415,32 @@ get_history_one(zpool_handle_t *zhp, void *data) (void) printf("]"); (void) printf("\n"); } +} + +/* + * Print out the command history for a specific pool. 
+ */ +static int +get_history_one(zpool_handle_t *zhp, void *data) +{ + nvlist_t *nvhis; + int ret; + hist_cbdata_t *cb = (hist_cbdata_t *)data; + uint64_t off = 0; + boolean_t eof = B_FALSE; + + cb->first = B_FALSE; + + (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); + + while (!eof) { + if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) + return (ret); + + print_history_records(nvhis, cb); + nvlist_free(nvhis); + } (void) printf("\n"); - nvlist_free(nvhis); return (ret); } @@ -8579,7 +9476,7 @@ zpool_do_history(int argc, char **argv) argc -= optind; argv += optind; - ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { @@ -8608,9 +9505,9 @@ zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); - (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ - (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ - (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ + (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ + (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ + (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); @@ -9142,7 +10039,7 @@ zpool_do_get(int argc, char **argv) cb.cb_proplist = &fake_name; } - ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) @@ -9159,12 +10056,69 @@ typedef struct set_cbdata { boolean_t cb_any_successful; } set_cbdata_t; -int +static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; + /* Check if we have 
out-of-bounds features */ + if (strcmp(cb->cb_propname, ZPOOL_CONFIG_COMPATIBILITY) == 0) { + boolean_t features[SPA_FEATURES]; + if (zpool_do_load_compat(cb->cb_value, features) != + ZPOOL_COMPATIBILITY_OK) + return (-1); + + nvlist_t *enabled = zpool_get_features(zhp); + spa_feature_t i; + for (i = 0; i < SPA_FEATURES; i++) { + const char *fguid = spa_feature_table[i].fi_guid; + if (nvlist_exists(enabled, fguid) && !features[i]) + break; + } + if (i < SPA_FEATURES) + (void) fprintf(stderr, gettext("Warning: one or " + "more features already enabled on pool '%s'\n" + "are not present in this compatibility set.\n"), + zpool_get_name(zhp)); + } + + /* if we're setting a feature, check it's in compatibility set */ + if (zpool_prop_feature(cb->cb_propname) && + strcmp(cb->cb_value, ZFS_FEATURE_ENABLED) == 0) { + char *fname = strchr(cb->cb_propname, '@') + 1; + spa_feature_t f; + + if (zfeature_lookup_name(fname, &f) == 0) { + char compat[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, + compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) + compat[0] = '\0'; + + boolean_t features[SPA_FEATURES]; + if (zpool_do_load_compat(compat, features) != + ZPOOL_COMPATIBILITY_OK) { + (void) fprintf(stderr, gettext("Error: " + "cannot enable feature '%s' on pool '%s'\n" + "because the pool's 'compatibility' " + "property cannot be parsed.\n"), + fname, zpool_get_name(zhp)); + return (-1); + } + + if (!features[f]) { + (void) fprintf(stderr, gettext("Error: " + "cannot enable feature '%s' on pool '%s'\n" + "as it is not specified in this pool's " + "current compatibility set.\n" + "Consider setting 'compatibility' to a " + "less restrictive set, or to 'off'.\n"), + fname, zpool_get_name(zhp)); + return (-1); + } + } + } + error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) @@ -9212,12 +10166,446 @@ zpool_do_set(int argc, char **argv) *(cb.cb_value) = '\0'; cb.cb_value++; - error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, + error = 
for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, B_FALSE, set_callback, &cb); return (error); } +/* Add up the total number of bytes left to initialize/trim across all vdevs */ +static uint64_t +vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) +{ + uint64_t bytes_remaining; + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + + assert(activity == ZPOOL_WAIT_INITIALIZE || + activity == ZPOOL_WAIT_TRIM); + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + if (activity == ZPOOL_WAIT_INITIALIZE && + vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) + bytes_remaining = vs->vs_initialize_bytes_est - + vs->vs_initialize_bytes_done; + else if (activity == ZPOOL_WAIT_TRIM && + vs->vs_trim_state == VDEV_TRIM_ACTIVE) + bytes_remaining = vs->vs_trim_bytes_est - + vs->vs_trim_bytes_done; + else + bytes_remaining = 0; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (c = 0; c < children; c++) + bytes_remaining += vdev_activity_remaining(child[c], activity); + + return (bytes_remaining); +} + +/* Add up the total number of bytes left to rebuild across top-level vdevs */ +static uint64_t +vdev_activity_top_remaining(nvlist_t *nv) +{ + uint64_t bytes_remaining = 0; + nvlist_t **child; + uint_t children; + int error; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + error = nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); + if (error == 0) { + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + bytes_remaining += (vrs->vrs_bytes_est - + vrs->vrs_bytes_rebuilt); + } + } + } + + return (bytes_remaining); +} + +/* Whether any vdevs are 'spare' or 'replacing' vdevs */ +static boolean_t +vdev_any_spare_replacing(nvlist_t *nv) +{ + nvlist_t **child; + uint_t c, 
children; + char *vdev_type; + + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); + + if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || + strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || + strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { + return (B_TRUE); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (c = 0; c < children; c++) { + if (vdev_any_spare_replacing(child[c])) + return (B_TRUE); + } + + return (B_FALSE); +} + +typedef struct wait_data { + char *wd_poolname; + boolean_t wd_scripted; + boolean_t wd_exact; + boolean_t wd_headers_once; + boolean_t wd_should_exit; + /* Which activities to wait for */ + boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; + float wd_interval; + pthread_cond_t wd_cv; + pthread_mutex_t wd_mutex; +} wait_data_t; + +/* + * Print to stdout a single line, containing one column for each activity that + * we are waiting for specifying how many bytes of work are left for that + * activity. + */ +static void +print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) +{ + nvlist_t *config, *nvroot; + uint_t c; + int i; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *pss = NULL; + pool_removal_stat_t *prs = NULL; + char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", + "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; + + /* Calculate the width of each column */ + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + /* + * Make sure we have enough space in the col for pretty-printed + * numbers and for the column header, and then leave a couple + * spaces between cols for readability. 
+ */ + col_widths[i] = MAX(strlen(headers[i]), 6) + 2; + } + + /* Print header if appropriate */ + int term_height = terminal_height(); + boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && + row % (term_height-1) == 0); + if (!wd->wd_scripted && (row == 0 || reprint_header)) { + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + if (wd->wd_enabled[i]) + (void) printf("%*s", col_widths[i], headers[i]); + } + (void) printf("\n"); + } + + /* Bytes of work remaining in each activity */ + int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; + + bytes_rem[ZPOOL_WAIT_FREE] = + zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); + + config = zpool_get_config(zhp, NULL); + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) + bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + if (prs != NULL && prs->prs_state == DSS_SCANNING) + bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - + prs->prs_copied; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); + if (pss != NULL && pss->pss_state == DSS_SCANNING && + pss->pss_pass_scrub_pause == 0) { + int64_t rem = pss->pss_to_examine - pss->pss_issued; + if (pss->pss_func == POOL_SCAN_SCRUB) + bytes_rem[ZPOOL_WAIT_SCRUB] = rem; + else + bytes_rem[ZPOOL_WAIT_RESILVER] = rem; + } else if (check_rebuilding(nvroot, NULL)) { + bytes_rem[ZPOOL_WAIT_RESILVER] = + vdev_activity_top_remaining(nvroot); + } + + bytes_rem[ZPOOL_WAIT_INITIALIZE] = + vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); + bytes_rem[ZPOOL_WAIT_TRIM] = + vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); + + /* + * A replace finishes after resilvering finishes, so the amount of work + * left for a replace is the same as for 
resilvering. + * + * It isn't quite correct to say that if we have any 'spare' or + * 'replacing' vdevs and a resilver is happening, then a replace is in + * progress, like we do here. When a hot spare is used, the faulted vdev + * is not removed after the hot spare is resilvered, so parent 'spare' + * vdev is not removed either. So we could have a 'spare' vdev, but be + * resilvering for a different reason. However, we use it as a heuristic + * because we don't have access to the DTLs, which could tell us whether + * or not we have really finished resilvering a hot spare. + */ + if (vdev_any_spare_replacing(nvroot)) + bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + char buf[64]; + if (!wd->wd_enabled[i]) + continue; + + if (wd->wd_exact) + (void) snprintf(buf, sizeof (buf), "%" PRIi64, + bytes_rem[i]); + else + zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + + if (wd->wd_scripted) + (void) printf(i == 0 ? "%s" : "\t%s", buf); + else + (void) printf(" %*s", col_widths[i] - 1, buf); + } + (void) printf("\n"); + (void) fflush(stdout); +} + +static void * +wait_status_thread(void *arg) +{ + wait_data_t *wd = (wait_data_t *)arg; + zpool_handle_t *zhp; + + if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) + return (void *)(1); + + for (int row = 0; ; row++) { + boolean_t missing; + struct timespec timeout; + int ret = 0; + (void) clock_gettime(CLOCK_REALTIME, &timeout); + + if (zpool_refresh_stats(zhp, &missing) != 0 || missing || + zpool_props_refresh(zhp) != 0) { + zpool_close(zhp); + return (void *)(uintptr_t)(missing ? 
0 : 1); + } + + print_wait_status_row(wd, zhp, row); + + timeout.tv_sec += floor(wd->wd_interval); + long nanos = timeout.tv_nsec + + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; + if (nanos >= NANOSEC) { + timeout.tv_sec++; + timeout.tv_nsec = nanos - NANOSEC; + } else { + timeout.tv_nsec = nanos; + } + pthread_mutex_lock(&wd->wd_mutex); + if (!wd->wd_should_exit) + ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, + &timeout); + pthread_mutex_unlock(&wd->wd_mutex); + if (ret == 0) { + break; /* signaled by main thread */ + } else if (ret != ETIMEDOUT) { + (void) fprintf(stderr, gettext("pthread_cond_timedwait " + "failed: %s\n"), strerror(ret)); + zpool_close(zhp); + return (void *)(uintptr_t)(1); + } + } + + zpool_close(zhp); + return (void *)(0); +} + +int +zpool_do_wait(int argc, char **argv) +{ + boolean_t verbose = B_FALSE; + int c; + char *value; + int i; + unsigned long count; + pthread_t status_thr; + int error = 0; + zpool_handle_t *zhp; + + wait_data_t wd; + wd.wd_scripted = B_FALSE; + wd.wd_exact = B_FALSE; + wd.wd_headers_once = B_FALSE; + wd.wd_should_exit = B_FALSE; + + pthread_mutex_init(&wd.wd_mutex, NULL); + pthread_cond_init(&wd.wd_cv, NULL); + + /* By default, wait for all types of activity. 
*/ + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) + wd.wd_enabled[i] = B_TRUE; + + while ((c = getopt(argc, argv, "HpT:t:")) != -1) { + switch (c) { + case 'H': + wd.wd_scripted = B_TRUE; + break; + case 'n': + wd.wd_headers_once = B_TRUE; + break; + case 'p': + wd.wd_exact = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 't': + { + static char *col_subopts[] = { "discard", "free", + "initialize", "replace", "remove", "resilver", + "scrub", "trim", NULL }; + + /* Reset activities array */ + bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); + while (*optarg != '\0') { + int activity = getsubopt(&optarg, col_subopts, + &value); + + if (activity < 0) { + (void) fprintf(stderr, + gettext("invalid activity '%s'\n"), + value); + usage(B_FALSE); + } + + wd.wd_enabled[activity] = B_TRUE; + } + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &wd.wd_interval, &count); + if (count != 0) { + /* This subcmd only accepts an interval, not a count */ + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (wd.wd_interval != 0) + verbose = B_TRUE; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing 'pool' argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + wd.wd_poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) + return (1); + + if (verbose) { + /* + * We use a separate thread for printing status updates because + * the main thread will call lzc_wait(), which blocks as long + * as an activity is in progress, which can be a long time. 
+ */ + if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) + != 0) { + (void) fprintf(stderr, gettext("failed to create status" + "thread: %s\n"), strerror(errno)); + zpool_close(zhp); + return (1); + } + } + + /* + * Loop over all activities that we are supposed to wait for until none + * of them are in progress. Note that this means we can end up waiting + * for more activities to complete than just those that were in progress + * when we began waiting; if an activity we are interested in begins + * while we are waiting for another activity, we will wait for both to + * complete before exiting. + */ + for (;;) { + boolean_t missing = B_FALSE; + boolean_t any_waited = B_FALSE; + + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + boolean_t waited; + + if (!wd.wd_enabled[i]) + continue; + + error = zpool_wait_status(zhp, i, &missing, &waited); + if (error != 0 || missing) + break; + + any_waited = (any_waited || waited); + } + + if (error != 0 || missing || !any_waited) + break; + } + + zpool_close(zhp); + + if (verbose) { + uintptr_t status; + pthread_mutex_lock(&wd.wd_mutex); + wd.wd_should_exit = B_TRUE; + pthread_cond_signal(&wd.wd_cv); + pthread_mutex_unlock(&wd.wd_mutex); + (void) pthread_join(status_thr, (void *)&status); + if (status != 0) + error = status; + } + + pthread_mutex_destroy(&wd.wd_mutex); + pthread_cond_destroy(&wd.wd_cv); + return (error); +} + static int find_command_idx(char *command, int *idx) { @@ -9247,6 +10635,36 @@ zpool_do_version(int argc, char **argv) return (0); } +/* + * Do zpool_load_compat() and print error message on failure + */ +static zpool_compat_status_t +zpool_do_load_compat(const char *compat, boolean_t *list) +{ + char report[1024]; + + zpool_compat_status_t ret; + + ret = zpool_load_compat(compat, list, report, 1024); + switch (ret) { + + case ZPOOL_COMPATIBILITY_OK: + break; + + case ZPOOL_COMPATIBILITY_NOFILES: + case ZPOOL_COMPATIBILITY_BADFILE: + case ZPOOL_COMPATIBILITY_BADTOKEN: + (void) 
fprintf(stderr, "Error: %s\n", report); + break; + + case ZPOOL_COMPATIBILITY_WARNTOKEN: + (void) fprintf(stderr, "Warning: %s\n", report); + ret = ZPOOL_COMPATIBILITY_OK; + break; + } + return (ret); +} + int main(int argc, char **argv) { @@ -9256,6 +10674,7 @@ main(int argc, char **argv) char **newargv; (void) setlocale(LC_ALL, ""); + (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); @@ -9284,7 +10703,7 @@ main(int argc, char **argv) return (zpool_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { - (void) fprintf(stderr, "%s", libzfs_error_init(errno)); + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c index c26c0eb396..1c64c83d8f 100644 --- a/cmd/zpool/zpool_util.c +++ b/cmd/zpool/zpool_util.c @@ -49,6 +49,22 @@ safe_malloc(size_t size) return (data); } +/* + * Utility function to guarantee realloc() success. + */ +void * +safe_realloc(void *from, size_t size) +{ + void *data; + + if ((data = realloc(from, size)) == NULL) { + (void) fprintf(stderr, "internal error: out of memory\n"); + exit(1); + } + + return (data); +} + /* * Display an out of memory error message and abort the current program. */ @@ -98,20 +114,6 @@ array64_max(uint64_t array[], unsigned int len) return (max); } -/* - * Return 1 if "str" is a number string, 0 otherwise. Works for integer and - * floating point numbers. - */ -int -isnumber(char *str) -{ - for (; *str; str++) - if (!(isdigit(*str) || (*str == '.'))) - return (0); - - return (1); -} - /* * Find highest one bit set. * Returns bit number + 1 of highest bit that is set, otherwise returns 0. 
diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index 3afc82d54b..6665eaf0d4 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -27,6 +27,7 @@ #include #include +#include #ifdef __cplusplus extern "C" { @@ -39,10 +40,10 @@ extern "C" { * Basic utility functions */ void *safe_malloc(size_t); +void *safe_realloc(void *, size_t); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); uint64_t array64_max(uint64_t array[], unsigned int len); -int isnumber(char *str); int highbit64(uint64_t i); int lowbit64(uint64_t i); @@ -65,22 +66,21 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, * Pool list functions */ int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, - zpool_iter_f, void *); + boolean_t, zpool_iter_f, void *); /* Vdev list functions */ -typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); typedef struct zpool_list zpool_list_t; -zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); +zpool_list_t *pool_list_get(int, char **, zprop_list_t **, boolean_t, int *); void pool_list_update(zpool_list_t *); int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); void pool_list_free(zpool_list_t *); int pool_list_count(zpool_list_t *); void pool_list_remove(zpool_list_t *, zpool_handle_t *); -libzfs_handle_t *g_zfs; +extern libzfs_handle_t *g_zfs; typedef struct vdev_cmd_data @@ -104,7 +104,7 @@ typedef struct vdev_cmd_data_list char *cmd; /* Command to run */ unsigned int count; /* Number of vdev_cmd_data items (vdevs) */ - /* vars to whitelist only certain vdevs, if requested */ + /* fields used to select only certain vdevs, if requested */ libzfs_handle_t *g_zfs; char **vdev_names; int vdev_names_count; @@ -125,6 +125,14 @@ vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); +int check_device(const char 
*path, boolean_t force, + boolean_t isspare, boolean_t iswholedisk); +boolean_t check_sector_size_database(char *path, int *sector_size); +void vdev_error(const char *fmt, ...) __attribute__((format(printf, 1, 2))); +int check_file(const char *file, boolean_t force, boolean_t isspare); +void after_zpool_upgrade(zpool_handle_t *zhp); +int check_file_generic(const char *file, boolean_t force, boolean_t isspare); + #ifdef __cplusplus } #endif diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 7ea9d74200..dcc67e7e20 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -64,7 +64,6 @@ #include #include -#include #include #include #include @@ -72,19 +71,12 @@ #include #include #include -#include -#include #include #include #include -#include -#include -#include -#include -#include -#include #include "zpool_util.h" #include +#include /* * For any given vdev specification, we can have multiple errors. The @@ -94,191 +86,7 @@ boolean_t error_seen; boolean_t is_force; -typedef struct vdev_disk_db_entry -{ - char id[24]; - int sector_size; -} vdev_disk_db_entry_t; - -/* - * Database of block devices that lie about physical sector sizes. 
The - * identification string must be precisely 24 characters to avoid false - * negatives - */ -static vdev_disk_db_entry_t vdev_disk_database[] = { - {"ATA ADATA SSD S396 3", 8192}, - {"ATA APPLE SSD SM128E", 8192}, - {"ATA APPLE SSD SM256E", 8192}, - {"ATA APPLE SSD SM512E", 8192}, - {"ATA APPLE SSD SM768E", 8192}, - {"ATA C400-MTFDDAC064M", 8192}, - {"ATA C400-MTFDDAC128M", 8192}, - {"ATA C400-MTFDDAC256M", 8192}, - {"ATA C400-MTFDDAC512M", 8192}, - {"ATA Corsair Force 3 ", 8192}, - {"ATA Corsair Force GS", 8192}, - {"ATA INTEL SSDSA2CT04", 8192}, - {"ATA INTEL SSDSA2BZ10", 8192}, - {"ATA INTEL SSDSA2BZ20", 8192}, - {"ATA INTEL SSDSA2BZ30", 8192}, - {"ATA INTEL SSDSA2CW04", 8192}, - {"ATA INTEL SSDSA2CW08", 8192}, - {"ATA INTEL SSDSA2CW12", 8192}, - {"ATA INTEL SSDSA2CW16", 8192}, - {"ATA INTEL SSDSA2CW30", 8192}, - {"ATA INTEL SSDSA2CW60", 8192}, - {"ATA INTEL SSDSC2CT06", 8192}, - {"ATA INTEL SSDSC2CT12", 8192}, - {"ATA INTEL SSDSC2CT18", 8192}, - {"ATA INTEL SSDSC2CT24", 8192}, - {"ATA INTEL SSDSC2CW06", 8192}, - {"ATA INTEL SSDSC2CW12", 8192}, - {"ATA INTEL SSDSC2CW18", 8192}, - {"ATA INTEL SSDSC2CW24", 8192}, - {"ATA INTEL SSDSC2CW48", 8192}, - {"ATA KINGSTON SH100S3", 8192}, - {"ATA KINGSTON SH103S3", 8192}, - {"ATA M4-CT064M4SSD2 ", 8192}, - {"ATA M4-CT128M4SSD2 ", 8192}, - {"ATA M4-CT256M4SSD2 ", 8192}, - {"ATA M4-CT512M4SSD2 ", 8192}, - {"ATA OCZ-AGILITY2 ", 8192}, - {"ATA OCZ-AGILITY3 ", 8192}, - {"ATA OCZ-VERTEX2 3.5 ", 8192}, - {"ATA OCZ-VERTEX3 ", 8192}, - {"ATA OCZ-VERTEX3 LT ", 8192}, - {"ATA OCZ-VERTEX3 MI ", 8192}, - {"ATA OCZ-VERTEX4 ", 8192}, - {"ATA SAMSUNG MZ7WD120", 8192}, - {"ATA SAMSUNG MZ7WD240", 8192}, - {"ATA SAMSUNG MZ7WD480", 8192}, - {"ATA SAMSUNG MZ7WD960", 8192}, - {"ATA SAMSUNG SSD 830 ", 8192}, - {"ATA Samsung SSD 840 ", 8192}, - {"ATA SanDisk SSD U100", 8192}, - {"ATA TOSHIBA THNSNH06", 8192}, - {"ATA TOSHIBA THNSNH12", 8192}, - {"ATA TOSHIBA THNSNH25", 8192}, - {"ATA TOSHIBA THNSNH51", 8192}, - {"ATA APPLE SSD TS064C", 4096}, 
- {"ATA APPLE SSD TS128C", 4096}, - {"ATA APPLE SSD TS256C", 4096}, - {"ATA APPLE SSD TS512C", 4096}, - {"ATA INTEL SSDSA2M040", 4096}, - {"ATA INTEL SSDSA2M080", 4096}, - {"ATA INTEL SSDSA2M160", 4096}, - {"ATA INTEL SSDSC2MH12", 4096}, - {"ATA INTEL SSDSC2MH25", 4096}, - {"ATA OCZ CORE_SSD ", 4096}, - {"ATA OCZ-VERTEX ", 4096}, - {"ATA SAMSUNG MCCOE32G", 4096}, - {"ATA SAMSUNG MCCOE64G", 4096}, - {"ATA SAMSUNG SSD PM80", 4096}, - /* Flash drives optimized for 4KB IOs on larger pages */ - {"ATA INTEL SSDSC2BA10", 4096}, - {"ATA INTEL SSDSC2BA20", 4096}, - {"ATA INTEL SSDSC2BA40", 4096}, - {"ATA INTEL SSDSC2BA80", 4096}, - {"ATA INTEL SSDSC2BB08", 4096}, - {"ATA INTEL SSDSC2BB12", 4096}, - {"ATA INTEL SSDSC2BB16", 4096}, - {"ATA INTEL SSDSC2BB24", 4096}, - {"ATA INTEL SSDSC2BB30", 4096}, - {"ATA INTEL SSDSC2BB40", 4096}, - {"ATA INTEL SSDSC2BB48", 4096}, - {"ATA INTEL SSDSC2BB60", 4096}, - {"ATA INTEL SSDSC2BB80", 4096}, - {"ATA INTEL SSDSC2BW24", 4096}, - {"ATA INTEL SSDSC2BW48", 4096}, - {"ATA INTEL SSDSC2BP24", 4096}, - {"ATA INTEL SSDSC2BP48", 4096}, - {"NA SmrtStorSDLKAE9W", 4096}, - {"NVMe Amazon EC2 NVMe ", 4096}, - /* Imported from Open Solaris */ - {"ATA MARVELL SD88SA02", 4096}, - /* Advanced format Hard drives */ - {"ATA Hitachi HDS5C303", 4096}, - {"ATA SAMSUNG HD204UI ", 4096}, - {"ATA ST2000DL004 HD20", 4096}, - {"ATA WDC WD10EARS-00M", 4096}, - {"ATA WDC WD10EARS-00S", 4096}, - {"ATA WDC WD10EARS-00Z", 4096}, - {"ATA WDC WD15EARS-00M", 4096}, - {"ATA WDC WD15EARS-00S", 4096}, - {"ATA WDC WD15EARS-00Z", 4096}, - {"ATA WDC WD20EARS-00M", 4096}, - {"ATA WDC WD20EARS-00S", 4096}, - {"ATA WDC WD20EARS-00Z", 4096}, - {"ATA WDC WD1600BEVT-0", 4096}, - {"ATA WDC WD2500BEVT-0", 4096}, - {"ATA WDC WD3200BEVT-0", 4096}, - {"ATA WDC WD5000BEVT-0", 4096}, - /* Virtual disks: Assume zvols with default volblocksize */ -#if 0 - {"ATA QEMU HARDDISK ", 8192}, - {"IET VIRTUAL-DISK ", 8192}, - {"OI COMSTAR ", 8192}, - {"SUN COMSTAR ", 8192}, - {"NETAPP LUN ", 8192}, 
-#endif -}; - -static const int vdev_disk_database_size = - sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]); - -#define INQ_REPLY_LEN 96 -#define INQ_CMD_LEN 6 - -static boolean_t -check_sector_size_database(char *path, int *sector_size) -{ - unsigned char inq_buff[INQ_REPLY_LEN]; - unsigned char sense_buffer[32]; - unsigned char inq_cmd_blk[INQ_CMD_LEN] = - {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0}; - sg_io_hdr_t io_hdr; - int error; - int fd; - int i; - - /* Prepare INQUIRY command */ - memset(&io_hdr, 0, sizeof (sg_io_hdr_t)); - io_hdr.interface_id = 'S'; - io_hdr.cmd_len = sizeof (inq_cmd_blk); - io_hdr.mx_sb_len = sizeof (sense_buffer); - io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; - io_hdr.dxfer_len = INQ_REPLY_LEN; - io_hdr.dxferp = inq_buff; - io_hdr.cmdp = inq_cmd_blk; - io_hdr.sbp = sense_buffer; - io_hdr.timeout = 10; /* 10 milliseconds is ample time */ - - if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) - return (B_FALSE); - - error = ioctl(fd, SG_IO, (unsigned long) &io_hdr); - - (void) close(fd); - - if (error < 0) - return (B_FALSE); - - if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) - return (B_FALSE); - - for (i = 0; i < vdev_disk_database_size; i++) { - if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24)) - continue; - - *sector_size = vdev_disk_database[i].sector_size; - return (B_TRUE); - } - - return (B_FALSE); -} - -/*PRINTFLIKE1*/ -static void +void vdev_error(const char *fmt, ...) { va_list ap; @@ -303,8 +111,8 @@ vdev_error(const char *fmt, ...) * Check that a file is valid. All we can do in this case is check that it's * not in use by another pool, and not in use by swap. 
*/ -static int -check_file(const char *file, boolean_t force, boolean_t isspare) +int +check_file_generic(const char *file, boolean_t force, boolean_t isspare) { char *name; int fd; @@ -367,149 +175,6 @@ check_file(const char *file, boolean_t force, boolean_t isspare) return (ret); } -static int -check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) -{ - int err; - char *value; - - /* No valid type detected device is safe to use */ - value = blkid_get_tag_value(cache, "TYPE", path); - if (value == NULL) - return (0); - - /* - * If libblkid detects a ZFS device, we check the device - * using check_file() to see if it's safe. The one safe - * case is a spare device shared between multiple pools. - */ - if (strcmp(value, "zfs_member") == 0) { - err = check_file(path, force, isspare); - } else { - if (force) { - err = 0; - } else { - err = -1; - vdev_error(gettext("%s contains a filesystem of " - "type '%s'\n"), path, value); - } - } - - free(value); - - return (err); -} - -/* - * Validate that a disk including all partitions are safe to use. - * - * For EFI labeled disks this can done relatively easily with the libefi - * library. The partition numbers are extracted from the label and used - * to generate the expected /dev/ paths. Each partition can then be - * checked for conflicts. - * - * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible - * but due to the lack of a readily available libraries this scanning is - * not implemented. Instead only the device path as given is checked. 
- */ -static int -check_disk(const char *path, blkid_cache cache, int force, - boolean_t isspare, boolean_t iswholedisk) -{ - struct dk_gpt *vtoc; - char slice_path[MAXPATHLEN]; - int err = 0; - int fd, i; - int flags = O_RDONLY|O_DIRECT; - - if (!iswholedisk) - return (check_slice(path, cache, force, isspare)); - - /* only spares can be shared, other devices require exclusive access */ - if (!isspare) - flags |= O_EXCL; - - if ((fd = open(path, flags)) < 0) { - char *value = blkid_get_tag_value(cache, "TYPE", path); - (void) fprintf(stderr, gettext("%s is in use and contains " - "a %s filesystem.\n"), path, value ? value : "unknown"); - return (-1); - } - - /* - * Expected to fail for non-EFI labled disks. Just check the device - * as given and do not attempt to detect and scan partitions. - */ - err = efi_alloc_and_read(fd, &vtoc); - if (err) { - (void) close(fd); - return (check_slice(path, cache, force, isspare)); - } - - /* - * The primary efi partition label is damaged however the secondary - * label at the end of the device is intact. Rather than use this - * label we should play it safe and treat this as a non efi device. - */ - if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { - efi_free(vtoc); - (void) close(fd); - - if (force) { - /* Partitions will now be created using the backup */ - return (0); - } else { - vdev_error(gettext("%s contains a corrupt primary " - "EFI label.\n"), path); - return (-1); - } - } - - for (i = 0; i < vtoc->efi_nparts; i++) { - - if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED || - uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) - continue; - - if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) - (void) snprintf(slice_path, sizeof (slice_path), - "%s%s%d", path, "-part", i+1); - else - (void) snprintf(slice_path, sizeof (slice_path), - "%s%s%d", path, isdigit(path[strlen(path)-1]) ? 
- "p" : "", i+1); - - err = check_slice(slice_path, cache, force, isspare); - if (err) - break; - } - - efi_free(vtoc); - (void) close(fd); - - return (err); -} - -static int -check_device(const char *path, boolean_t force, - boolean_t isspare, boolean_t iswholedisk) -{ - blkid_cache cache; - int error; - - error = blkid_get_cache(&cache, NULL); - if (error != 0) { - (void) fprintf(stderr, gettext("unable to access the blkid " - "cache.\n")); - return (-1); - } - - error = check_disk(path, cache, force, isspare, iswholedisk); - blkid_put_cache(cache); - - return (error); -} - /* * This may be a shorthand device path or it could be total gibberish. * Check to see if it is a known device available in zfs_vdev_paths. @@ -553,6 +218,9 @@ is_spare(nvlist_t *config, const char *path) uint_t i, nspares; boolean_t inuse; + if (zpool_is_draid_spare(path)) + return (B_TRUE); + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); @@ -598,9 +266,10 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx + * draid* Virtual dRAID spare */ static nvlist_t * -make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) +make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; @@ -640,6 +309,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (zpool_is_draid_spare(arg)) { + if (!is_primary) { + (void) fprintf(stderr, + gettext("cannot open '%s': dRAID spares can only " + "be used to replace primary vdevs\n"), arg); + return (NULL); + } + + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -668,17 +348,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * 
Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. + */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -689,10 +371,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); - verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_LOG) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -763,11 +442,16 @@ typedef struct replication_level { #define ZPOOL_FUZZ (16 * 1024 * 1024) +/* + * N.B. For the purposes of comparing replication levels dRAID can be + * considered functionally equivalent to raidz. + */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { - if (strcmp(a->zprl_type, "raidz") == 0 && + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; @@ -776,6 +460,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b, return (B_FALSE); } +/* + * Comparison for determining if dRAID and raidz were passed in either order. 
+ */ +static boolean_t +is_raidz_draid(replication_level_t *a, replication_level_t *b) +{ + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && + (strcmp(b->zprl_type, "raidz") == 0 || + strcmp(b->zprl_type, "draid") == 0)) { + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then @@ -828,7 +528,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_children = 1; rep.zprl_parity = 0; } else { - uint64_t vdev_size; + int64_t vdev_size; /* * This is a mirror or RAID-Z vdev. Go through and make @@ -842,7 +542,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -858,12 +559,12 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) */ type = NULL; dontreport = 0; - vdev_size = -1ULL; + vdev_size = -1LL; for (c = 0; c < children; c++) { nvlist_t *cnv = child[c]; char *path; struct stat64 statbuf; - uint64_t size = -1ULL; + int64_t size = -1LL; char *childtype; int fd, err; @@ -954,8 +655,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * (~16MB) then report an error. 
*/ if (!dontreport && - (vdev_size != -1ULL && - (labs(size - vdev_size) > + (vdev_size != -1LL && + (llabs(size - vdev_size) > ZPOOL_FUZZ))) { if (ret != NULL) free(ret); @@ -1002,12 +703,40 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) "are present\n"), raidz->zprl_type, mirror->zprl_type, + (u_longlong_t) raidz->zprl_parity, + (u_longlong_t) mirror->zprl_children - 1, + (u_longlong_t) mirror->zprl_children); else return (NULL); } + } else if (is_raidz_draid(&lastrep, &rep)) { + /* + * Accepted raidz and draid when they can + * handle the same number of disk failures. + */ + if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s and %s vdevs " + "with different " + "redundancy, %llu vs. " + "%llu are present\n"), + lastrep.zprl_type, + rep.zprl_type, + (u_longlong_t) + lastrep.zprl_parity, + (u_longlong_t) + rep.zprl_parity); + else + return (NULL); + } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) @@ -1030,8 +759,9 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) "mismatched replication level: " "both %llu and %llu device parity " "%s vdevs are present\n"), + (u_longlong_t) lastrep.zprl_parity, - rep.zprl_parity, + (u_longlong_t)rep.zprl_parity, rep.zprl_type); else return (NULL); @@ -1044,7 +774,9 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) "mismatched replication level: " "both %llu-way and %llu-way %s " "vdevs are present\n"), + (u_longlong_t) lastrep.zprl_children, + (u_longlong_t) rep.zprl_children, rep.zprl_type); else @@ -1129,9 +861,9 @@ check_replication(nvlist_t *config, nvlist_t *newroot) "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), raidz->zprl_type, mirror->zprl_type, - raidz->zprl_parity, - mirror->zprl_children - 1, - mirror->zprl_children); + (u_longlong_t)raidz->zprl_parity, + (u_longlong_t)mirror->zprl_children - 1, + (u_longlong_t)mirror->zprl_children); ret = -1; } } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { @@ -1144,14 +876,17 @@ check_replication(nvlist_t *config, nvlist_t *newroot) vdev_error(gettext( "mismatched replication level: pool uses %llu " "device parity and new vdev uses %llu\n"), - current->zprl_parity, new->zprl_parity); + (u_longlong_t)current->zprl_parity, + (u_longlong_t)new->zprl_parity); ret = -1; } else if (current->zprl_children != new->zprl_children) { vdev_error(gettext( "mismatched replication level: pool uses %llu-way " "%s and new vdev uses %llu-way %s\n"), - current->zprl_children, current->zprl_type, - new->zprl_children, new->zprl_type); + (u_longlong_t)current->zprl_children, + current->zprl_type, + (u_longlong_t)new->zprl_children, + new->zprl_type); ret = -1; } } @@ -1272,6 +1007,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) if (fd == -1) { if (errno == EBUSY) is_exclusive = 1; +#ifdef __FreeBSD__ + if (errno == EPERM) + is_exclusive = 1; +#endif } else { (void) close(fd); } @@ -1430,31 +1169,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, return (anyinuse); } +/* + * Returns the parity level extracted from a raidz or draid type. + * If the parity cannot be determined zero is returned. 
+ */ +static int +get_parity(const char *type) +{ + long parity = 0; + const char *p; + + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { + p = type + strlen(VDEV_TYPE_RAIDZ); + + if (*p == '\0') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 0-3, no suffixes allowed */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || *end != '\0' || + parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { + return (0); + } + } + } else if (strncmp(type, VDEV_TYPE_DRAID, + strlen(VDEV_TYPE_DRAID)) == 0) { + p = type + strlen(VDEV_TYPE_DRAID); + + if (*p == '\0' || *p == ':') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 0-3, allowed suffixes: '\0' or ':' */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || + parity < 1 || parity > VDEV_DRAID_MAXPARITY || + (*end != '\0' && *end != ':')) { + return (0); + } + } + } + + return ((int)parity); +} + +/* + * Assign the minimum and maximum number of devices allowed for + * the specified type. On error NULL is returned, otherwise the + * type prefix is returned (raidz, mirror, etc). 
+ */ static const char * is_grouping(const char *type, int *mindev, int *maxdev) { - if (strncmp(type, "raidz", 5) == 0) { - const char *p = type + 5; - char *end; - long nparity; - - if (*p == '\0') { - nparity = 1; - } else if (*p == '0') { - return (NULL); /* no zero prefixes allowed */ - } else { - errno = 0; - nparity = strtol(p, &end, 10); - if (errno != 0 || nparity < 1 || nparity >= 255 || - *end != '\0') - return (NULL); - } + int nparity; + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { + nparity = get_parity(type); + if (nparity == 0) + return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + + if (strncmp(type, VDEV_TYPE_RAIDZ, + strlen(VDEV_TYPE_RAIDZ)) == 0) { + return (VDEV_TYPE_RAIDZ); + } else { + return (VDEV_TYPE_DRAID); + } } if (maxdev != NULL) @@ -1494,19 +1289,176 @@ is_grouping(const char *type, int *mindev, int *maxdev) return (NULL); } +/* + * Extract the configuration parameters encoded in the dRAID type and + * use them to generate a dRAID configuration. The expected format is: + * + * draid[<parity>][:<data>d][:<children>c][:<spares>s] + * + * The intent is to be able to generate a good configuration when no + * additional information is provided. The only mandatory component + * of the 'type' is the 'draid' prefix. If a value is not provided + * then reasonable defaults are used. The optional components may + * appear in any order but the d/s/c suffix is required. 
+ * + * Valid inputs: + * - data: number of data devices per group (1-255) + * - parity: number of parity blocks per group (1-3) + * - spares: number of distributed spare (0-100) + * - children: total number of devices (1-255) + * + * Examples: + * - zpool create tank draid + * - zpool create tank draid2:8d:51c:2s + */ +static int +draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) +{ + uint64_t nparity = 1; + uint64_t nspares = 0; + uint64_t ndata = UINT64_MAX; + uint64_t ngroups = 1; + long value; + + if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + if (nparity == 0) + return (EINVAL); + + char *p = (char *)type; + while ((p = strchr(p, ':')) != NULL) { + char *end; + + p = p + 1; + errno = 0; + + if (!isdigit(p[0])) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:] not '%s'\n"), + type); + return (EINVAL); + } + + /* Expected non-zero value with c/d/s suffix */ + value = strtol(p, &end, 10); + char suffix = tolower(*end); + if (errno != 0 || + (suffix != 'c' && suffix != 'd' && suffix != 's')) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:] not '%s'\n"), + type); + return (EINVAL); + } + + if (suffix == 'c') { + if ((uint64_t)value != children) { + fprintf(stderr, + gettext("invalid number of dRAID children; " + "%llu required but %llu provided\n"), + (u_longlong_t)value, + (u_longlong_t)children); + return (EINVAL); + } + } else if (suffix == 'd') { + ndata = (uint64_t)value; + } else if (suffix == 's') { + nspares = (uint64_t)value; + } else { + verify(0); /* Unreachable */ + } + } + + /* + * When a specific number of data disks is not provided limit a + * redundancy group to 8 data disks. This value was selected to + * provide a reasonable tradeoff between capacity and performance. 
+ */ + if (ndata == UINT64_MAX) { + if (children > nspares + nparity) { + ndata = MIN(children - nspares - nparity, 8); + } else { + fprintf(stderr, gettext("request number of " + "distributed spares %llu and parity level %llu\n" + "leaves no disks available for data\n"), + (u_longlong_t)nspares, (u_longlong_t)nparity); + return (EINVAL); + } + } + + /* Verify the maximum allowed group size is never exceeded. */ + if (ndata == 0 || (ndata + nparity > children - nspares)) { + fprintf(stderr, gettext("requested number of dRAID data " + "disks per group %llu is too high,\nat most %llu disks " + "are available for data\n"), (u_longlong_t)ndata, + (u_longlong_t)(children - nspares - nparity)); + return (EINVAL); + } + + if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + fprintf(stderr, + gettext("invalid dRAID parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_DRAID_MAXPARITY); + return (EINVAL); + } + + /* + * Verify the requested number of spares can be satisfied. + * An arbitrary limit of 100 distributed spares is applied. + */ + if (nspares > 100 || nspares > (children - (ndata + nparity))) { + fprintf(stderr, + gettext("invalid number of dRAID spares %llu; additional " + "disks would be required\n"), (u_longlong_t)nspares); + return (EINVAL); + } + + /* Verify the requested number children is sufficient. */ + if (children < (ndata + nparity + nspares)) { + fprintf(stderr, gettext("%llu disks were provided, but at " + "least %llu disks are required for this config\n"), + (u_longlong_t)children, + (u_longlong_t)(ndata + nparity + nspares)); + } + + if (children > VDEV_DRAID_MAX_CHILDREN) { + fprintf(stderr, gettext("%llu disks were provided, but " + "dRAID only supports up to %u disks"), + (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); + } + + /* + * Calculate the minimum number of groups required to fill a slice. + * This is the LCM of the stripe width (ndata + nparity) and the + * number of data drives (children - nspares). 
+ */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + + return (0); +} + /* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. * Note: we don't bother freeing anything in the error paths * because the program is just going to exit anyway. */ -nvlist_t * +static nvlist_t * construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; - const char *type; - uint64_t is_log, is_special, is_dedup; + const char *type, *fulltype; + boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; @@ -1516,18 +1468,20 @@ construct_spec(nvlist_t *props, int argc, char **argv) nspares = 0; nlogs = 0; nl2cache = 0; - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { + fulltype = argv[0]; nv = NULL; /* - * If it's a mirror or raidz, the subsequent arguments are - * its leaves -- until we encounter the next mirror or raidz. + * If it's a mirror, raidz, or draid the subsequent arguments + * are its leaves -- until we encounter the next mirror, + * raidz or draid. 
*/ - if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { + if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1539,6 +1493,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } + is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } @@ -1552,8 +1507,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) } seen_logs = B_TRUE; is_log = B_TRUE; - is_special = B_FALSE; - is_dedup = B_FALSE; + is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* @@ -1565,8 +1519,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; - is_log = B_FALSE; - is_dedup = B_FALSE; + is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; @@ -1574,8 +1527,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; - is_log = B_FALSE; - is_special = B_FALSE; + is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; @@ -1589,7 +1541,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = B_FALSE; + is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { @@ -1607,13 +1560,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], - B_FALSE)) == NULL) { + !(is_log || is_special || is_dedup || + is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); @@ -1662,10 +1617,11 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, 
is_log) == 0); - if (is_log) + if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); + } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, @@ -1681,6 +1637,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } + if (strcmp(type, VDEV_TYPE_DRAID) == 0) { + if (draid_config_by_type(nv, + fulltype, children) != 0) { + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, children) == 0); @@ -1694,12 +1659,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. */ - if ((nv = make_leaf_vdev(props, argv[0], - is_log)) == NULL) + if ((nv = make_leaf_vdev(props, argv[0], !(is_log || + is_special || is_dedup || is_spare))) == NULL) goto spec_out; - if (is_log) + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; + } + if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, @@ -1866,7 +1838,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, } /* - * Validate each device to make sure that its not shared with another + * Validate each device to make sure that it's not shared with another * subsystem. We do this even if 'force' is set, because there are some * uses (such as a dedicated dump device) that even '-f' cannot * override. 
diff --git a/cmd/zpool_influxdb/.gitignore b/cmd/zpool_influxdb/.gitignore new file mode 100644 index 0000000000..bd765d1882 --- /dev/null +++ b/cmd/zpool_influxdb/.gitignore @@ -0,0 +1 @@ +/zpool_influxdb diff --git a/cmd/zpool_influxdb/Makefile.am b/cmd/zpool_influxdb/Makefile.am new file mode 100644 index 0000000000..a59217570b --- /dev/null +++ b/cmd/zpool_influxdb/Makefile.am @@ -0,0 +1,13 @@ +include $(top_srcdir)/config/Rules.am + +zfsexec_PROGRAMS = zpool_influxdb + +zpool_influxdb_SOURCES = \ + zpool_influxdb.c + +zpool_influxdb_LDADD = \ + $(top_builddir)/lib/libspl/libspl.la \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs/libzfs.la + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zpool_influxdb/README.md b/cmd/zpool_influxdb/README.md new file mode 100644 index 0000000000..864d674983 --- /dev/null +++ b/cmd/zpool_influxdb/README.md @@ -0,0 +1,294 @@ +# Influxdb Metrics for ZFS Pools +The _zpool_influxdb_ program produces +[influxdb](https://github.com/influxdata/influxdb) line protocol +compatible metrics from zpools. In the UNIX tradition, _zpool_influxdb_ +does one thing: read statistics from a pool and print them to +stdout. In many ways, this is a metrics-friendly output of +statistics normally observed via the `zpool` command. + +## Usage +When run without arguments, _zpool_influxdb_ runs once, reading data +from all imported pools, and prints to stdout. +```shell +zpool_influxdb [options] [poolname] +``` +If no poolname is specified, then all pools are sampled. + +| option | short option | description | +|---|---|---| +| --execd | -e | For use with telegraf's `execd` plugin. When [enter] is pressed, the pools are sampled. To exit, use [ctrl+D] | +| --no-histogram | -n | Do not print histogram information | +| --signed-int | -i | Use signed integer data type (default=unsigned) | +| --sum-histogram-buckets | -s | Sum histogram bucket values | +| --tags key=value[,key=value...] | -t | Add tags to data points. 
No tag sanity checking is performed. | +| --help | -h | Print a short usage message | + +#### Histogram Bucket Values +The histogram data collected by ZFS is stored as independent bucket values. +This works well out-of-the-box with an influxdb data source and grafana's +heatmap visualization. The influxdb query for a grafana heatmap +visualization looks like: +``` +field(disk_read) last() non_negative_derivative(1s) +``` + +Another method for storing histogram data sums the values for lower-value +buckets. For example, a latency bucket tagged "le=10" includes the values +in the bucket "le=1". +This method is often used for prometheus histograms. +The `zpool_influxdb --sum-histogram-buckets` option presents the data from ZFS +as summed values. + +## Measurements +The following measurements are collected: + +| measurement | description | zpool equivalent | +|---|---|---| +| zpool_stats | general size and data | zpool list | +| zpool_scan_stats | scrub, rebuild, and resilver statistics (omitted if no scan has been requested) | zpool status | +| zpool_vdev_stats | per-vdev statistics | zpool iostat -q | +| zpool_io_size | per-vdev I/O size histogram | zpool iostat -r | +| zpool_latency | per-vdev I/O latency histogram | zpool iostat -w | +| zpool_vdev_queue | per-vdev instantaneous queue depth | zpool iostat -q | + +### zpool_stats Description +zpool_stats contains top-level summary statistics for the pool. +Performance counters measure the I/Os to the pool's devices. 
+ +#### zpool_stats Tags + +| label | description | +|---|---| +| name | pool name | +| path | for leaf vdevs, the pathname | +| state | pool state, as shown by _zpool status_ | +| vdev | vdev name (root = entire pool) | + +#### zpool_stats Fields + +| field | units | description | +|---|---|---| +| alloc | bytes | allocated space | +| free | bytes | unallocated space | +| size | bytes | total pool size | +| read_bytes | bytes | bytes read since pool import | +| read_errors | count | number of read errors | +| read_ops | count | number of read operations | +| write_bytes | bytes | bytes written since pool import | +| write_errors | count | number of write errors | +| write_ops | count | number of write operations | + +### zpool_scan_stats Description +Once a pool has been scrubbed, resilvered, or rebuilt, the zpool_scan_stats +contain information about the status and performance of the operation. +Otherwise, the zpool_scan_stats do not exist in the kernel, and therefore +cannot be reported by this collector. 
+ +#### zpool_scan_stats Tags + +| label | description | +|---|---| +| name | pool name | +| function | name of the scan function running or recently completed | +| state | scan state, as shown by _zpool status_ | + +#### zpool_scan_stats Fields + +| field | units | description | +|---|---|---| +| errors | count | number of errors encountered by scan | +| examined | bytes | total data examined during scan | +| to_examine | bytes | prediction of total bytes to be scanned | +| pass_examined | bytes | data examined during current scan pass | +| issued | bytes | size of I/Os issued to disks | +| pass_issued | bytes | size of I/Os issued to disks for current pass | +| processed | bytes | data reconstructed during scan | +| to_process | bytes | total bytes to be repaired | +| rate | bytes/sec | examination rate | +| start_ts | epoch timestamp | start timestamp for scan | +| pause_ts | epoch timestamp | timestamp for a scan pause request | +| end_ts | epoch timestamp | completion timestamp for scan | +| paused_t | seconds | elapsed time while paused | +| remaining_t | seconds | estimate of time remaining for scan | + +### zpool_vdev_stats Description +The ZFS I/O (ZIO) scheduler uses five queues to schedule I/Os to each vdev. +These queues are further divided into active and pending states. +An I/O is pending prior to being issued to the vdev. An active +I/O has been issued to the vdev. +The scheduler and its tunable parameters are described in the +[ZFS documentation for ZIO Scheduler](https://openzfs.github.io/openzfs-docs/Performance%20and%20Tuning/ZIO%20Scheduler.html). + +The ZIO scheduler reports the queue depths as gauges where the value +represents an instantaneous snapshot of the queue depth at +the sample time. Therefore, it is not unusual to see all zeroes +for an idle pool. 
+ +#### zpool_vdev_stats Tags +| label | description | +|---|---| +| name | pool name | +| vdev | vdev name (root = entire pool) | + +#### zpool_vdev_stats Fields +| field | units | description | +|---|---|---| +| sync_r_active_queue | entries | synchronous read active queue depth | +| sync_w_active_queue | entries | synchronous write active queue depth | +| async_r_active_queue | entries | asynchronous read active queue depth | +| async_w_active_queue | entries | asynchronous write active queue depth | +| async_scrub_active_queue | entries | asynchronous scrub active queue depth | +| sync_r_pend_queue | entries | synchronous read pending queue depth | +| sync_w_pend_queue | entries | synchronous write pending queue depth | +| async_r_pend_queue | entries | asynchronous read pending queue depth | +| async_w_pend_queue | entries | asynchronous write pending queue depth | +| async_scrub_pend_queue | entries | asynchronous scrub pending queue depth | + +### zpool_latency Histogram +ZFS tracks the latency of each I/O in the ZIO pipeline. This latency can +be useful for observing latency-related issues that are not easily observed +using the averaged latency statistics. + +The histogram fields show cumulative values from lowest to highest. +The largest bucket is tagged "le=+Inf", representing the total count +of I/Os by type and vdev. 
+ +#### zpool_latency Histogram Tags +| label | description | +|---|---| +| le | bucket for histogram, latency is less than or equal to bucket value in seconds | +| name | pool name | +| path | for leaf vdevs, the device path name, otherwise omitted | +| vdev | vdev name (root = entire pool) | + +#### zpool_latency Histogram Fields +| field | units | description | +|---|---|---| +| total_read | operations | read operations of all types | +| total_write | operations | write operations of all types | +| disk_read | operations | disk read operations | +| disk_write | operations | disk write operations | +| sync_read | operations | ZIO sync reads | +| sync_write | operations | ZIO sync writes | +| async_read | operations | ZIO async reads | +| async_write | operations | ZIO async writes | +| scrub | operations | ZIO scrub/scan reads | +| trim | operations | ZIO trim (aka unmap) writes | + +### zpool_io_size Histogram +ZFS tracks I/O throughout the ZIO pipeline. The size of each I/O is used +to create a histogram of the size by I/O type and vdev. For example, a +4KiB write to a mirrored pool will show a 4KiB write to the top-level vdev +(root) and a 4KiB write to each of the mirror leaf vdevs. + +The ZIO pipeline can aggregate I/O operations. For example, a contiguous +series of writes can be aggregated into a single, larger I/O to the leaf +vdev. The independent I/O operations reflect the logical operations and +the aggregated I/O operations reflect the physical operations. + +The histogram fields show cumulative values from lowest to highest. +The largest bucket is tagged "le=+Inf", representing the total count +of I/Os by type and vdev. + +Note: trim I/Os can be larger than 16MiB, but the larger sizes are +accounted for in the 16MiB bucket. 
+ +#### zpool_io_size Histogram Tags +| label | description | +|---|---| +| le | bucket for histogram, I/O size is less than or equal to bucket value in bytes | +| name | pool name | +| path | for leaf vdevs, the device path name, otherwise omitted | +| vdev | vdev name (root = entire pool) | + +#### zpool_io_size Histogram Fields +| field | units | description | +|---|---|---| +| sync_read_ind | blocks | independent sync reads | +| sync_write_ind | blocks | independent sync writes | +| async_read_ind | blocks | independent async reads | +| async_write_ind | blocks | independent async writes | +| scrub_read_ind | blocks | independent scrub/scan reads | +| trim_write_ind | blocks | independent trim (aka unmap) writes | +| sync_read_agg | blocks | aggregated sync reads | +| sync_write_agg | blocks | aggregated sync writes | +| async_read_agg | blocks | aggregated async reads | +| async_write_agg | blocks | aggregated async writes | +| scrub_read_agg | blocks | aggregated scrub/scan reads | +| trim_write_agg | blocks | aggregated trim (aka unmap) writes | + +#### About unsigned integers +Telegraf v1.6.2 and later support unsigned 64-bit integers which more +closely matches the uint64_t values used by ZFS. By default, zpool_influxdb +uses ZFS' uint64_t values and influxdb line protocol unsigned integer type. +If you are using old telegraf or influxdb where unsigned integers are not +available, use the `--signed-int` option. + +## Using _zpool_influxdb_ + +The simplest method is to use the execd input agent in telegraf. For older +versions of telegraf which lack execd, the exec input agent can be used. +For convenience, one of the sample config files below can be placed in the +telegraf config-directory (often /etc/telegraf/telegraf.d). Telegraf can +be restarted to read the config-directory files. 
+ +### Example telegraf execd configuration +```toml +# # Read metrics from zpool_influxdb +[[inputs.execd]] +# ## default installation location for zpool_influxdb command + command = ["/usr/libexec/zfs/zpool_influxdb", "--execd"] + + ## Define how the process is signaled on each collection interval. + ## Valid values are: + ## "none" : Do not signal anything. (Recommended for service inputs) + ## The process must output metrics by itself. + ## "STDIN" : Send a newline on STDIN. (Recommended for gather inputs) + ## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended) + ## "SIGUSR1" : Send a USR1 signal. Not available on Windows. + ## "SIGUSR2" : Send a USR2 signal. Not available on Windows. + signal = "STDIN" + + ## Delay before the process is restarted after an unexpected termination + restart_delay = "10s" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" +``` + +### Example telegraf exec configuration +```toml +# # Read metrics from zpool_influxdb +[[inputs.exec]] +# ## default installation location for zpool_influxdb command + commands = ["/usr/libexec/zfs/zpool_influxdb"] + data_format = "influx" +``` + +## Caveat Emptor +* Like the _zpool_ command, _zpool_influxdb_ takes a reader + lock on spa_config for each imported pool. If this lock blocks, + then the command will also block indefinitely and might be + unkillable. This is not a normal condition, but can occur if + there are bugs in the kernel modules. 
+ For this reason, care should be taken: + * avoid spawning many of these commands hoping that one might + finish + * avoid frequent updates or short sample time + intervals, because the locks can interfere with the performance + of other instances of _zpool_ or _zpool_influxdb_ + +## Other collectors +There are a few other collectors for zpool statistics roaming around +the Internet. Many attempt to screen-scrape `zpool` output in various +ways. The screen-scrape method works poorly for `zpool` output because +of its human-friendly nature. Also, they suffer from the same caveats +as this implementation. This implementation is optimized for directly +collecting the metrics and is much more efficient than the screen-scrapers. + +## Feedback Encouraged +Pull requests and issues are greatly appreciated at +https://github.com/openzfs/zfs diff --git a/cmd/zpool_influxdb/dashboards/README.md b/cmd/zpool_influxdb/dashboards/README.md new file mode 100644 index 0000000000..2fdbe49834 --- /dev/null +++ b/cmd/zpool_influxdb/dashboards/README.md @@ -0,0 +1,3 @@ +### Dashboards for zpool_influxdb +This directory contains a collection of dashboards related to ZFS with data +collected from the zpool_influxdb collector. 
diff --git a/cmd/zpool_influxdb/dashboards/grafana/ZFS-pool-latency-heatmaps-influxdb.json b/cmd/zpool_influxdb/dashboards/grafana/ZFS-pool-latency-heatmaps-influxdb.json new file mode 100644 index 0000000000..70260ae408 --- /dev/null +++ b/cmd/zpool_influxdb/dashboards/grafana/ZFS-pool-latency-heatmaps-influxdb.json @@ -0,0 +1,1667 @@ +{ + "__inputs": [ + { + "name": "DS_MACBOOK-INFLUX", + "label": "macbook-influx", + "description": "", + "type": "datasource", + "pluginId": "influxdb", + "pluginName": "InfluxDB" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.7.3" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "influxdb", + "name": "InfluxDB", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "jdbranham-diagram-panel", + "name": "Diagram", + "version": "1.4.5" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "annotations": { + "list": [ + { + "$$hashKey": "object:1627", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Top-level ZFS pool latency by ZIO type", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "iteration": 1590445168391, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "panels": [], + "title": "Total Reads and Writes", + "type": "row" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the total reads 
of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 2, + "legend": { + "show": true + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "total_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Reads", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the total writes of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "heatmap": {}, 
+ "hideZeroBuckets": false, + "highlightCards": true, + "id": 3, + "legend": { + "show": true + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "total_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Writes", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 8, + "panels": [], + "title": "ZIO Scheduler Queues for Read Operations", + "type": "row" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the synchronous reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": 
{ + "h": 8, + "w": 5, + "x": 0, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 6, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "sync_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Sync Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the asynchronous reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 5, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 9, + "legend": { + "show": false + 
}, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "async_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Async Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the scrub or scan reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 10, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 10, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + 
"params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "scrub" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Scrub/Scan Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the actual disk reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 15, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 11, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": 
"ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "disk_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 13, + "panels": [], + "title": "ZIO Scheduler Queues for Write Operations", + "type": "row" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the synchronous writes of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 0, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 14, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + 
"type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "sync_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Sync Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the asynchronous writes of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 5, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 15, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + 
"select": [ + [ + { + "params": [ + "async_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Async Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the trim or unmap operations of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 10, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 16, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "trim" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + 
"type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Trim Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the disk write operations of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 15, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 17, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "disk_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": 
"AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 19, + "panels": [], + "title": "About", + "type": "row" + }, + { + "content": "I/O requests that are satisfied by accessing pool devices are managed by the ZIO scheduler.\nThe total latency is measured from the start of the I/O to completion by the disk.\nLatency through each queue is shown prior to its submission to the disk queue.\n\nThis view is useful for observing the effects of tuning the ZIO scheduler min and max values\n(see zfs(4) and [ZFS on Linux Module Parameters](https://openzfs.github.io/openzfs-docs/Performance%20and%20tuning/ZFS%20on%20Linux%20Module%20Parameters.html)):\n+ *zfs_vdev_max_active* controls the ZIO scheduler's disk queue depth (do not confuse with the block device's nr_requests)\n+ *zfs_vdev_sync_read_min_active* and *zfs_vdev_sync_read_max_active* control the synchronous queue for reads: most reads are sync\n+ *zfs_vdev_sync_write_min_active* and *zfs_vdev_sync_write_max_active* control the synchronous queue for writes: \nusually metadata or user data depending on the \"sync\" property setting or I/Os that are requested to be flushed\n+ *zfs_vdev_async_read_min_active* and *zfs_vdev_async_read_max_active* control the asynchronous queue for reads: usually prefetches\n+ *zfs_vdev_async_write_min_active* and *zfs_vdev_async_write_max_active* control the asynchronous queue for 
writes: \nusually the bulk of all writes at transaction group (txg) commit\n+ *zfs_vdev_scrub_min_active* and *zfs_vdev_scrub_max_active* controls the scan reads: usually scrub or resilver\n\n", + "datasource": "${DS_MACBOOK-INFLUX}", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 16, + "x": 0, + "y": 29 + }, + "id": 21, + "mode": "markdown", + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "About ZFS Pool All Queues Read/Write Latency Histograms", + "type": "text" + }, + { + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "composites": [], + "content": "graph LR\nIO((I/O request)) --> SR(sync read queue)\nIO --> SW(sync write queue)\nIO --> AR(async read queue)\nIO --> AW(async write queue)\nIO --> SCRUB(scrub queue)\nIO --> TRIM(trim queue)\nSR --> DISKQ(disk queue)\nSW --> DISKQ\nAR --> DISKQ\nAW --> DISKQ\nSCRUB --> DISKQ\nTRIM --> DISKQ\nDISKQ --> DISK((disk))\n", + "datasource": "${DS_MACBOOK-INFLUX}", + "decimals": 2, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "none", + "graphId": "diagram_23", + "gridPos": { + "h": 15, + "w": 7, + "x": 16, + "y": 29 + }, + "id": 23, + "init": { + "arrowMarkerAbsolute": true, + "cloneCssStyles": true, + "flowchart": { + "htmlLabels": true, + "useMaxWidth": true + }, + "gantt": { + "barGap": 4, + "barHeight": 20, + "fontFamily": "\"Open-Sans\", \"sans-serif\"", + "fontSize": 11, + "gridLineStartPadding": 35, + "leftPadding": 75, + "numberSectionStyles": 3, + 
"titleTopMargin": 25, + "topPadding": 50 + }, + "logLevel": 3, + "securityLevel": "loose", + "sequence": { + "actorMargin": 50, + "bottomMarginAdj": 1, + "boxMargin": 10, + "boxTextMargin": 5, + "diagramMarginX": 50, + "diagramMarginY": 10, + "height": 65, + "messageMargin": 35, + "mirrorActors": true, + "noteMargin": 10, + "useMaxWidth": true, + "width": 150 + }, + "startOnLoad": false, + "theme": "dark" + }, + "legend": { + "avg": true, + "current": true, + "gradient": { + "enabled": true, + "show": true + }, + "max": true, + "min": true, + "show": false, + "total": true + }, + "mappingType": 1, + "mappingTypes": [ + { + "$$hashKey": "object:155", + "name": "value to text", + "value": 1 + }, + { + "$$hashKey": "object:156", + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxWidth": false, + "mermaidServiceUrl": "", + "metricCharacterReplacements": [], + "moddedSeriesVal": 0, + "mode": "content", + "nullPointMode": "connected", + "seriesOverrides": [], + "style": "", + "styleValues": {}, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "themes": [ + "default", + "dark", + "forest", + "neutral" + ], + "thresholds": "0,10", + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "type": "jdbranham-diagram-panel", + "valueMaps": [ + { + "$$hashKey": "object:151", + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg", + "valueOptions": [ + "avg", + "min", + "max", + "total", + "current" + ] + } + ], + "refresh": false, + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ZFS", + "Latency", + "Histogram" + ], + "templating": { + "list": [ + { + 
"allValue": null, + "current": {}, + "datasource": "${DS_MACBOOK-INFLUX}", + "definition": "show tag values from \"zpool_latency\" with key = \"host\"", + "hide": 0, + "includeAll": false, + "index": -1, + "label": null, + "multi": false, + "name": "hostname", + "options": [], + "query": "show tag values from \"zpool_latency\" with key = \"host\"", + "refresh": 1, + "regex": "/([-a-zA-Z-0-9]+)/", + "skipUrlSync": false, + "sort": 5, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_MACBOOK-INFLUX}", + "definition": "show tag values from \"zpool_latency\" with key = \"name\" where \"host\" =~ /^$hostname/", + "hide": 0, + "includeAll": false, + "index": -1, + "label": null, + "multi": false, + "name": "poolname", + "options": [], + "query": "show tag values from \"zpool_latency\" with key = \"name\" where \"host\" =~ /^$hostname/", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 5, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "2020-05-25T21:34:30.137Z", + "to": "2020-05-25T21:39:54.445Z" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "ZFS Pool Latency Heatmaps Influxdb", + "uid": "TbB4-DkGz", + "variables": { + "list": [] + }, + "version": 2 +} diff --git a/cmd/zpool_influxdb/telegraf.d/README.md b/cmd/zpool_influxdb/telegraf.d/README.md new file mode 100644 index 0000000000..74f411a15d --- /dev/null +++ b/cmd/zpool_influxdb/telegraf.d/README.md @@ -0,0 +1,7 @@ +This directory contains sample telegraf configurations for +adding `zpool_influxdb` as an input plugin. 
Depending on your +telegraf configuration, the installation can be as simple as +copying one of these to the `/etc/telegraf/telegraf.d` directory +and restarting telegraf with `systemctl restart telegraf`. + +See the telegraf docs for more information on input plugins. diff --git a/cmd/zpool_influxdb/telegraf.d/exec_zpool_influxdb.conf b/cmd/zpool_influxdb/telegraf.d/exec_zpool_influxdb.conf new file mode 100644 index 0000000000..a2efa61892 --- /dev/null +++ b/cmd/zpool_influxdb/telegraf.d/exec_zpool_influxdb.conf @@ -0,0 +1,15 @@ +# # Read metrics from zpool_influxdb +[[inputs.exec]] +# ## default installation location for zpool_influxdb command + commands = ["/usr/local/libexec/zfs/zpool_influxdb"] +# ## Timeout for each command to complete. +# timeout = "5s" +# +# ## measurement name suffix (for separating different commands) +# name_suffix = "_mycollector" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" diff --git a/cmd/zpool_influxdb/telegraf.d/execd_zpool_influxdb.conf b/cmd/zpool_influxdb/telegraf.d/execd_zpool_influxdb.conf new file mode 100644 index 0000000000..90737b8cb7 --- /dev/null +++ b/cmd/zpool_influxdb/telegraf.d/execd_zpool_influxdb.conf @@ -0,0 +1,23 @@ +# # Read metrics from zpool_influxdb +[[inputs.execd]] +# ## default installation location for zpool_influxdb command + command = ["/usr/local/libexec/zfs/zpool_influxdb", "--execd"] + + ## Define how the process is signaled on each collection interval. + ## Valid values are: + ## "none" : Do not signal anything. (Recommended for service inputs) + ## The process must output metrics by itself. + ## "STDIN" : Send a newline on STDIN. (Recommended for gather inputs) + ## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended) + ## "SIGUSR1" : Send a USR1 signal. Not available on Windows. 
+ ## "SIGUSR2" : Send a USR2 signal. Not available on Windows. + signal = "STDIN" + + ## Delay before the process is restarted after an unexpected termination + restart_delay = "10s" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" diff --git a/cmd/zpool_influxdb/zpool_influxdb.c b/cmd/zpool_influxdb/zpool_influxdb.c new file mode 100644 index 0000000000..f326b0420e --- /dev/null +++ b/cmd/zpool_influxdb/zpool_influxdb.c @@ -0,0 +1,851 @@ +/* + * Gather top-level ZFS pool and resilver/scan statistics and print using + * influxdb line protocol + * usage: [options] [pool_name] + * where options are: + * --execd, -e run in telegraf execd input plugin mode, [CR] on + * stdin causes a sample to be printed and wait for + * the next [CR] + * --no-histograms, -n don't print histogram data (reduces cardinality + * if you don't care about histograms) + * --sum-histogram-buckets, -s sum histogram bucket values + * + * To integrate into telegraf use one of: + * 1. the `inputs.execd` plugin with the `--execd` option + * 2. the `inputs.exec` plugin to simply run with no options + * + * NOTE: libzfs is an unstable interface. YMMV. + * + * The design goals of this software include: + * + be as lightweight as possible + * + reduce the number of external dependencies as far as possible, hence + * there is no dependency on a client library for managing the metric + * collection -- info is printed, KISS + * + broken pools or kernel bugs can cause this process to hang in an + * unkillable state. For this reason, it is best to keep the damage limited + * to a small process like zpool_influxdb rather than a larger collector. + * + * Copyright 2018-2020 Richard Elling + * + * This software is dual-licensed MIT and CDDL. 
+ * + * The MIT License (MIT) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + * + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * CDDL HEADER END + */ +#include +#include +#include +#include +#include +#include + +#define POOL_MEASUREMENT "zpool_stats" +#define SCAN_MEASUREMENT "zpool_scan_stats" +#define VDEV_MEASUREMENT "zpool_vdev_stats" +#define POOL_LATENCY_MEASUREMENT "zpool_latency" +#define POOL_QUEUE_MEASUREMENT "zpool_vdev_queue" +#define MIN_LAT_INDEX 10 /* minimum latency index 10 = 1024ns */ +#define POOL_IO_SIZE_MEASUREMENT "zpool_io_size" +#define MIN_SIZE_INDEX 9 /* minimum size index 9 = 512 bytes */ + +/* global options */ +int execd_mode = 0; +int no_histograms = 0; +int sum_histogram_buckets = 0; +char metric_data_type = 'u'; +uint64_t metric_value_mask = UINT64_MAX; +uint64_t timestamp = 0; +int complained_about_sync = 0; +char *tags = ""; + +typedef int (*stat_printer_f)(nvlist_t *, const char *, const char *); + +/* + * influxdb line protocol rules for escaping are important because the + * zpool name can include characters that need to be escaped + * + * caller is responsible for freeing result + */ +static char * +escape_string(const char *s) +{ + const char *c; + char *d; + char *t = (char *)malloc(ZFS_MAX_DATASET_NAME_LEN * 2); + if (t == NULL) { + fprintf(stderr, "error: cannot allocate memory\n"); + exit(1); + } + + for (c = s, d = t; *c != '\0'; c++, d++) { + switch (*c) { + case ' ': + case ',': + case '=': + case '\\': + *d++ = '\\'; + fallthrough; + default: + *d = *c; + } + } + *d = '\0'; + return (t); +} + +/* + * print key=value where value is a uint64_t + */ +static void +print_kv(char *key, uint64_t value) +{ + printf("%s=%llu%c", key, + (u_longlong_t)value & metric_value_mask, metric_data_type); +} + +/* + * print_scan_status() prints the details as often seen in the "zpool status" + * output. However, unlike the zpool command, which is intended for humans, + * this output is suitable for long-term tracking in influxdb. 
+ * TODO: update to include issued scan data + */ +static int +print_scan_status(nvlist_t *nvroot, const char *pool_name) +{ + uint_t c; + int64_t elapsed; + uint64_t examined, pass_exam, paused_time, paused_ts, rate; + uint64_t remaining_time; + pool_scan_stat_t *ps = NULL; + double pct_done; + char *state[DSS_NUM_STATES] = { + "none", "scanning", "finished", "canceled"}; + char *func; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c); + + /* + * ignore if there are no stats + */ + if (ps == NULL) + return (0); + + /* + * return error if state is bogus + */ + if (ps->pss_state >= DSS_NUM_STATES || + ps->pss_func >= POOL_SCAN_FUNCS) { + if (complained_about_sync % 1000 == 0) { + fprintf(stderr, "error: cannot decode scan stats: " + "ZFS is out of sync with compiled zpool_influxdb"); + complained_about_sync++; + } + return (1); + } + + switch (ps->pss_func) { + case POOL_SCAN_NONE: + func = "none_requested"; + break; + case POOL_SCAN_SCRUB: + func = "scrub"; + break; + case POOL_SCAN_RESILVER: + func = "resilver"; + break; +#ifdef POOL_SCAN_REBUILD + case POOL_SCAN_REBUILD: + func = "rebuild"; + break; +#endif + default: + func = "scan"; + } + + /* overall progress */ + examined = ps->pss_examined ? ps->pss_examined : 1; + pct_done = 0.0; + if (ps->pss_to_examine > 0) + pct_done = 100.0 * examined / ps->pss_to_examine; + +#ifdef EZFS_SCRUB_PAUSED + paused_ts = ps->pss_pass_scrub_pause; + paused_time = ps->pss_pass_scrub_spent_paused; +#else + paused_ts = 0; + paused_time = 0; +#endif + + /* calculations for this pass */ + if (ps->pss_state == DSS_SCANNING) { + elapsed = (int64_t)time(NULL) - (int64_t)ps->pss_pass_start - + (int64_t)paused_time; + elapsed = (elapsed > 0) ? elapsed : 1; + pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; + rate = pass_exam / elapsed; + rate = (rate > 0) ? 
rate : 1; + remaining_time = ps->pss_to_examine - examined / rate; + } else { + elapsed = + (int64_t)ps->pss_end_time - (int64_t)ps->pss_pass_start - + (int64_t)paused_time; + elapsed = (elapsed > 0) ? elapsed : 1; + pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; + rate = pass_exam / elapsed; + remaining_time = 0; + } + rate = rate ? rate : 1; + + /* influxdb line protocol format: "tags metrics timestamp" */ + printf("%s%s,function=%s,name=%s,state=%s ", + SCAN_MEASUREMENT, tags, func, pool_name, state[ps->pss_state]); + print_kv("end_ts", ps->pss_end_time); + print_kv(",errors", ps->pss_errors); + print_kv(",examined", examined); + print_kv(",issued", ps->pss_issued); + print_kv(",pass_examined", pass_exam); + print_kv(",pass_issued", ps->pss_pass_issued); + print_kv(",paused_ts", paused_ts); + print_kv(",paused_t", paused_time); + printf(",pct_done=%.2f", pct_done); + print_kv(",processed", ps->pss_processed); + print_kv(",rate", rate); + print_kv(",remaining_t", remaining_time); + print_kv(",start_ts", ps->pss_start_time); + print_kv(",to_examine", ps->pss_to_examine); + print_kv(",to_process", ps->pss_to_process); + printf(" %llu\n", (u_longlong_t)timestamp); + return (0); +} + +/* + * get a vdev name that corresponds to the top-level vdev names + * printed by `zpool status` + */ +static char * +get_vdev_name(nvlist_t *nvroot, const char *parent_name) +{ + static char vdev_name[256]; + char *vdev_type = NULL; + uint64_t vdev_id = 0; + + if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, + &vdev_type) != 0) { + vdev_type = "unknown"; + } + if (nvlist_lookup_uint64( + nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0) { + vdev_id = UINT64_MAX; + } + if (parent_name == NULL) { + (void) snprintf(vdev_name, sizeof (vdev_name), "%s", + vdev_type); + } else { + (void) snprintf(vdev_name, sizeof (vdev_name), + "%s/%s-%llu", + parent_name, vdev_type, (u_longlong_t)vdev_id); + } + return (vdev_name); +} + +/* + * get a string suitable for an influxdb tag that describes 
this vdev
+ *
+ * By default only the vdev hierarchical name is shown, separated by '/'
+ * If the vdev has an associated path, which is typical of leaf vdevs,
+ * then the path is added.
+ * It would be nice to have the devid instead of the path, but under
+ * Linux we cannot be sure a devid will exist and we'd rather have
+ * something than nothing, so we'll use path instead.
+ *
+ * The result is "vdev=<name>" or "path=<path>,vdev=<name>" and lives in a
+ * static buffer that is overwritten by the next call.
+ */
+static char *
+get_vdev_desc(nvlist_t *nvroot, const char *parent_name)
+{
+	static char vdev_desc[2 * MAXPATHLEN];
+	char *vdev_type = NULL;
+	uint64_t vdev_id = 0;
+	char vdev_value[MAXPATHLEN];
+	char *vdev_path = NULL;
+	char *s, *t;
+
+	/* fall back to placeholder values when the config lacks a field */
+	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &vdev_type) != 0) {
+		vdev_type = "unknown";
+	}
+	if (nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0) {
+		vdev_id = UINT64_MAX;
+	}
+	if (nvlist_lookup_string(
+	    nvroot, ZPOOL_CONFIG_PATH, &vdev_path) != 0) {
+		vdev_path = NULL;
+	}
+
+	/* build the escaped "vdev=" tag first */
+	if (parent_name == NULL) {
+		s = escape_string(vdev_type);
+		(void) snprintf(vdev_value, sizeof (vdev_value), "vdev=%s", s);
+		free(s);
+	} else {
+		s = escape_string((char *)parent_name);
+		t = escape_string(vdev_type);
+		(void) snprintf(vdev_value, sizeof (vdev_value),
+		    "vdev=%s/%s-%llu", s, t, (u_longlong_t)vdev_id);
+		free(s);
+		free(t);
+	}
+	/* then prepend the escaped "path=" tag when a path is known */
+	if (vdev_path == NULL) {
+		(void) snprintf(vdev_desc, sizeof (vdev_desc), "%s",
+		    vdev_value);
+	} else {
+		s = escape_string(vdev_path);
+		(void) snprintf(vdev_desc, sizeof (vdev_desc), "path=%s,%s",
+		    s, vdev_value);
+		free(s);
+	}
+	return (vdev_desc);
+}
+
+/*
+ * vdev summary stats are a combination of the data shown by
+ * `zpool status` and `zpool list -v`
+ *
+ * Returns 1 when the vdev has no stats array, otherwise prints one line
+ * and returns 0.
+ */
+static int
+print_summary_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t c;
+	vdev_stat_t *vs;
+	char *vdev_desc = NULL;
+	vdev_desc = get_vdev_desc(nvroot, parent_name);
+	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) != 0) {
+		return (1);
+	}
+	printf("%s%s,name=%s,state=%s,%s ", POOL_MEASUREMENT, tags,
+	    pool_name, zpool_state_to_name((vdev_state_t)vs->vs_state,
+	    (vdev_aux_t)vs->vs_aux), vdev_desc);
+	print_kv("alloc", vs->vs_alloc);
+	/* free space is derived rather than read from the stats */
+	print_kv(",free", vs->vs_space - vs->vs_alloc);
+	print_kv(",size", vs->vs_space);
+	print_kv(",read_bytes", vs->vs_bytes[ZIO_TYPE_READ]);
+	print_kv(",read_errors", vs->vs_read_errors);
+	print_kv(",read_ops", vs->vs_ops[ZIO_TYPE_READ]);
+	print_kv(",write_bytes", vs->vs_bytes[ZIO_TYPE_WRITE]);
+	print_kv(",write_errors", vs->vs_write_errors);
+	print_kv(",write_ops", vs->vs_ops[ZIO_TYPE_WRITE]);
+	print_kv(",checksum_errors", vs->vs_checksum_errors);
+	print_kv(",fragmentation", vs->vs_fragmentation);
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * vdev latency stats are histograms stored as nvlist arrays of uint64.
+ * Latency stats include the ZIO scheduler classes plus lower-level
+ * vdev latencies.
+ *
+ * In many cases, the top-level "root" view obscures the underlying
+ * top-level vdev operations. For example, if a pool has a log, special,
+ * or cache device, then each can behave very differently. It is useful
+ * to see how each is responding.
+ */
+static int
+print_vdev_latency_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t c, end = 0;
+	nvlist_t *nv_ex;
+	char *vdev_desc = NULL;
+
+	/* short_names become part of the metric name and are influxdb-ready */
+	struct lat_lookup {
+	    char *name;
+	    char *short_name;
+	    uint64_t sum;
+	    uint64_t *array;
+	};
+	struct lat_lookup lat_type[] = {
+	    {ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, "total_read", 0},
+	    {ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, "total_write", 0},
+	    {ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, "disk_read", 0},
+	    {ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, "disk_write", 0},
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, "sync_read", 0},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, "sync_write", 0},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, "async_read", 0},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, "async_write", 0},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, "scrub", 0},
+#ifdef ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO
+	    {ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, "trim", 0},
+#endif
+	    {ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, "rebuild", 0},
+	    {NULL, NULL}
+	};
+
+	/* 6: this vdev carries no extended stats */
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	vdev_desc = get_vdev_desc(nvroot, parent_name);
+
+	/* fetch every histogram up front; 3: a histogram is missing */
+	for (int i = 0; lat_type[i].name; i++) {
+		if (nvlist_lookup_uint64_array(nv_ex,
+		    lat_type[i].name, &lat_type[i].array, &c) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    lat_type[i].name);
+			return (3);
+		}
+		/*
+		 * index of the last bucket, all of the arrays are the
+		 * same size
+		 * NOTE(review): c is unsigned, so an empty array would make
+		 * this wrap -- confirm arrays are never empty.
+		 */
+		end = c - 1;
+	}
+
+	/* one output line per bucket from MIN_LAT_INDEX upward */
+	for (int bucket = 0; bucket <= end; bucket++) {
+		if (bucket < MIN_LAT_INDEX) {
+			/* don't print, but collect the sum */
+			for (int i = 0; lat_type[i].name; i++) {
+				lat_type[i].sum += lat_type[i].array[bucket];
+			}
+			continue;
+		}
+		/* the last bucket is labelled +Inf instead of a bound */
+		if (bucket < end) {
+			printf("%s%s,le=%0.6f,name=%s,%s ",
+			    POOL_LATENCY_MEASUREMENT, tags,
+			    (float)(1ULL << bucket) * 1e-9,
+			    pool_name, vdev_desc);
+		} else {
+			printf("%s%s,le=+Inf,name=%s,%s ",
+			    POOL_LATENCY_MEASUREMENT, tags, pool_name,
+			    vdev_desc);
+		}
+		for (int i = 0; lat_type[i].name; i++) {
+			/*
+			 * The first printed bucket always includes the
+			 * skipped buckets' total; later buckets are
+			 * cumulative only when sum_histogram_buckets is set.
+			 */
+			if (bucket <= MIN_LAT_INDEX || sum_histogram_buckets) {
+				lat_type[i].sum += lat_type[i].array[bucket];
+			} else {
+				lat_type[i].sum = lat_type[i].array[bucket];
+			}
+			print_kv(lat_type[i].short_name, lat_type[i].sum);
+			if (lat_type[i + 1].name != NULL) {
+				printf(",");
+			}
+		}
+		printf(" %llu\n", (u_longlong_t)timestamp);
+	}
+	return (0);
+}
+
+/*
+ * vdev request size stats are histograms stored as nvlist arrays of uint64.
+ * Request size stats include the ZIO scheduler classes plus lower-level
+ * vdev sizes. Both independent (ind) and aggregated (agg) sizes are reported.
+ *
+ * In many cases, the top-level "root" view obscures the underlying
+ * top-level vdev operations. For example, if a pool has a log, special,
+ * or cache device, then each can behave very differently. It is useful
+ * to see how each is responding.
+ */
+static int
+print_vdev_size_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t c, end = 0;
+	nvlist_t *nv_ex;
+	char *vdev_desc = NULL;
+
+	/* short_names become the field name */
+	struct size_lookup {
+	    char *name;
+	    char *short_name;
+	    uint64_t sum;
+	    uint64_t *array;
+	};
+	struct size_lookup size_type[] = {
+	    {ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, "sync_read_ind"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, "sync_write_ind"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, "async_read_ind"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, "async_write_ind"},
+	    {ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, "scrub_read_ind"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, "sync_read_agg"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, "sync_write_agg"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, "async_read_agg"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, "async_write_agg"},
+	    {ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, "scrub_read_agg"},
+#ifdef ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO
+	    {ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, "trim_write_ind"},
+	    {ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, "trim_write_agg"},
+#endif
+	    {ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, "rebuild_write_ind"},
+	    {ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, "rebuild_write_agg"},
+	    {NULL, NULL}
+	};
+
+	/* 6: this vdev carries no extended stats */
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	vdev_desc = get_vdev_desc(nvroot, parent_name);
+
+	/* fetch every histogram up front; 3: a histogram is missing */
+	for (int i = 0; size_type[i].name; i++) {
+		if (nvlist_lookup_uint64_array(nv_ex, size_type[i].name,
+		    &size_type[i].array, &c) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    size_type[i].name);
+			return (3);
+		}
+		/*
+		 * index of the last bucket, all of the arrays are the
+		 * same size
+		 * NOTE(review): c is unsigned, so an empty array would make
+		 * this wrap -- confirm arrays are never empty.
+		 */
+		end = c - 1;
+	}
+
+	/* one output line per bucket from MIN_SIZE_INDEX upward */
+	for (int bucket = 0; bucket <= end; bucket++) {
+		if (bucket < MIN_SIZE_INDEX) {
+			/* don't print, but collect the sum */
+			for (int i = 0; size_type[i].name; i++) {
+				size_type[i].sum += size_type[i].array[bucket];
+			}
+			continue;
+		}
+
+		/* the last bucket is labelled +Inf instead of a bound */
+		if (bucket < end) {
+			printf("%s%s,le=%llu,name=%s,%s ",
+			    POOL_IO_SIZE_MEASUREMENT, tags, 1ULL << bucket,
+			    pool_name, vdev_desc);
+		} else {
+			printf("%s%s,le=+Inf,name=%s,%s ",
+			    POOL_IO_SIZE_MEASUREMENT, tags, pool_name,
+			    vdev_desc);
+		}
+		for (int i = 0; size_type[i].name; i++) {
+			/*
+			 * The first printed bucket always includes the
+			 * skipped buckets' total; later buckets are
+			 * cumulative only when sum_histogram_buckets is set.
+			 */
+			if (bucket <= MIN_SIZE_INDEX || sum_histogram_buckets) {
+				size_type[i].sum += size_type[i].array[bucket];
+			} else {
+				size_type[i].sum = size_type[i].array[bucket];
+			}
+			print_kv(size_type[i].short_name, size_type[i].sum);
+			if (size_type[i + 1].name != NULL) {
+				printf(",");
+			}
+		}
+		printf(" %llu\n", (u_longlong_t)timestamp);
+	}
+	return (0);
+}
+
+/*
+ * ZIO scheduler queue stats are stored as gauges. This is unfortunate
+ * because the values can change very rapidly and any point-in-time
+ * value will quickly be obsoleted. It is also not easy to downsample.
+ * Thus only the top-level queue stats might be beneficial... maybe.
+ */
+static int
+print_queue_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	nvlist_t *nv_ex;
+	uint64_t value;
+
+	/* short_names are used for the field name */
+	struct queue_lookup {
+	    char *name;
+	    char *short_name;
+	};
+	struct queue_lookup queue_type[] = {
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active"},
+	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, "rebuild_active"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend"},
+	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, "rebuild_pend"},
+	    {NULL, NULL}
+	};
+
+	/* 6: this vdev carries no extended stats */
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	/*
+	 * The measurement and tags are written before the fields are looked
+	 * up, so a failed lookup (return 3) leaves an unterminated line on
+	 * stdout.
+	 */
+	printf("%s%s,name=%s,%s ", POOL_QUEUE_MEASUREMENT, tags, pool_name,
+	    get_vdev_desc(nvroot, parent_name));
+	for (int i = 0; queue_type[i].name; i++) {
+		if (nvlist_lookup_uint64(nv_ex,
+		    queue_type[i].name, &value) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    queue_type[i].name);
+			return (3);
+		}
+		print_kv(queue_type[i].short_name, value);
+		/* comma between fields, none after the last one */
+		if (queue_type[i + 1].name != NULL) {
+			printf(",");
+		}
+	}
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * top-level vdev stats are at the pool level
+ *
+ * Prints the same queue gauges as print_queue_stats(), but under
+ * VDEV_MEASUREMENT with a fixed "vdev=root" tag and "_queue" suffixed
+ * field names. Returns 6 when extended stats are absent, 3 when a gauge
+ * is missing, 0 otherwise.
+ */
+static int
+print_top_level_vdev_stats(nvlist_t *nvroot, const char *pool_name)
+{
+	nvlist_t *nv_ex;
+	uint64_t value;
+
+	/* short_names become part of the metric name */
+	struct queue_lookup {
+	    char *name;
+	    char *short_name;
+	};
+	struct queue_lookup queue_type[] = {
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, "rebuild_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, "rebuild_pend_queue"},
+	    {NULL, NULL}
+	};
+
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	printf("%s%s,name=%s,vdev=root ", VDEV_MEASUREMENT, tags,
+	    pool_name);
+	for (int i = 0; queue_type[i].name; i++) {
+		if (nvlist_lookup_uint64(nv_ex,
+		    queue_type[i].name, &value) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    queue_type[i].name);
+			return (3);
+		}
+		/* comma before every field except the first */
+		if (i > 0)
+			printf(",");
+		print_kv(queue_type[i].short_name, value);
+	}
+
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * recursive stats printer
+ *
+ * Calls func on nvroot and, when descend is non-zero, on every child vdev
+ * beneath it. An error from func on nvroot itself is returned to the
+ * caller; return values from the recursive calls on children are
+ * discarded.
+ */
+static int
+print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
+    const char *pool_name, const char *parent_name, int descend)
+{
+	uint_t c, children;
+	nvlist_t **child;
+	char vdev_name[256];
+	int err;
+
+	err = func(nvroot, pool_name, parent_name);
+	if (err)
+		return (err);
+
+	if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		/*
+		 * copy the name into a local buffer so that each level of
+		 * the recursion keeps its own parent name
+		 */
+		(void) strlcpy(vdev_name, get_vdev_name(nvroot, parent_name),
+		    sizeof (vdev_name));
+
+		for (c = 0; c < children; c++) {
+			print_recursive_stats(func, child[c], pool_name,
+			    vdev_name, descend);
+		}
+	}
+	return (0);
+}
+
+/*
+ * call-back to print the stats from the pool
config + * + * Note: if the pool is broken, this can hang indefinitely and perhaps in an + * unkillable state. + */ +static int +print_stats(zpool_handle_t *zhp, void *data) +{ + uint_t c; + int err; + boolean_t missing; + nvlist_t *config, *nvroot; + vdev_stat_t *vs; + struct timespec tv; + char *pool_name; + + /* if not this pool return quickly */ + if (data && + strncmp(data, zpool_get_name(zhp), ZFS_MAX_DATASET_NAME_LEN) != 0) { + zpool_close(zhp); + return (0); + } + + if (zpool_refresh_stats(zhp, &missing) != 0) { + zpool_close(zhp); + return (1); + } + + config = zpool_get_config(zhp, NULL); + if (clock_gettime(CLOCK_REALTIME, &tv) != 0) + timestamp = (uint64_t)time(NULL) * 1000000000; + else + timestamp = + ((uint64_t)tv.tv_sec * 1000000000) + (uint64_t)tv.tv_nsec; + + if (nvlist_lookup_nvlist( + config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) { + zpool_close(zhp); + return (2); + } + if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) != 0) { + zpool_close(zhp); + return (3); + } + + pool_name = escape_string(zpool_get_name(zhp)); + err = print_recursive_stats(print_summary_stats, nvroot, + pool_name, NULL, 1); + /* if any of these return an error, skip the rest */ + if (err == 0) + err = print_top_level_vdev_stats(nvroot, pool_name); + + if (no_histograms == 0) { + if (err == 0) + err = print_recursive_stats(print_vdev_latency_stats, nvroot, + pool_name, NULL, 1); + if (err == 0) + err = print_recursive_stats(print_vdev_size_stats, nvroot, + pool_name, NULL, 1); + if (err == 0) + err = print_recursive_stats(print_queue_stats, nvroot, + pool_name, NULL, 0); + } + if (err == 0) + err = print_scan_status(nvroot, pool_name); + + free(pool_name); + zpool_close(zhp); + return (err); +} + +static void +usage(char *name) +{ + fprintf(stderr, "usage: %s [--execd][--no-histograms]" + "[--sum-histogram-buckets] [--signed-int] [poolname]\n", name); + exit(EXIT_FAILURE); +} + +int +main(int argc, char *argv[]) +{ + int opt; + int ret 
= 8; + char *line = NULL; + size_t len, tagslen = 0; + struct option long_options[] = { + {"execd", no_argument, NULL, 'e'}, + {"help", no_argument, NULL, 'h'}, + {"no-histograms", no_argument, NULL, 'n'}, + {"signed-int", no_argument, NULL, 'i'}, + {"sum-histogram-buckets", no_argument, NULL, 's'}, + {"tags", required_argument, NULL, 't'}, + {0, 0, 0, 0} + }; + while ((opt = getopt_long( + argc, argv, "ehinst:", long_options, NULL)) != -1) { + switch (opt) { + case 'e': + execd_mode = 1; + break; + case 'i': + metric_data_type = 'i'; + metric_value_mask = INT64_MAX; + break; + case 'n': + no_histograms = 1; + break; + case 's': + sum_histogram_buckets = 1; + break; + case 't': + tagslen = strlen(optarg) + 2; + tags = calloc(tagslen, 1); + if (tags == NULL) { + fprintf(stderr, + "error: cannot allocate memory " + "for tags\n"); + exit(1); + } + (void) snprintf(tags, tagslen, ",%s", optarg); + break; + default: + usage(argv[0]); + } + } + + libzfs_handle_t *g_zfs; + if ((g_zfs = libzfs_init()) == NULL) { + fprintf(stderr, + "error: cannot initialize libzfs. 
" + "Is the zfs module loaded or zrepl running?\n"); + exit(EXIT_FAILURE); + } + if (execd_mode == 0) { + ret = zpool_iter(g_zfs, print_stats, argv[optind]); + return (ret); + } + while (getline(&line, &len, stdin) != -1) { + ret = zpool_iter(g_zfs, print_stats, argv[optind]); + fflush(stdout); + } + return (ret); +} diff --git a/cmd/zstream/.gitignore b/cmd/zstream/.gitignore new file mode 100644 index 0000000000..fd1240d55c --- /dev/null +++ b/cmd/zstream/.gitignore @@ -0,0 +1 @@ +zstream diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am new file mode 100644 index 0000000000..8e813027fa --- /dev/null +++ b/cmd/zstream/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zstream + +zstream_SOURCES = \ + zstream.c \ + zstream.h \ + zstream_dump.c \ + zstream_redup.c \ + zstream_token.c + +zstream_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +include $(top_srcdir)/config/CppCheck.am + +install-exec-hook: + cd $(DESTDIR)$(sbindir) && $(LN_S) -f zstream zstreamdump diff --git a/cmd/zstream/zstream.c b/cmd/zstream/zstream.c new file mode 100644 index 0000000000..523ae06897 --- /dev/null +++ b/cmd/zstream/zstream.c @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + * Copyright (c) 2020 by Datto Inc. All rights reserved. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zstream.h" + +void +zstream_usage(void) +{ + (void) fprintf(stderr, + "usage: zstream command args ...\n" + "Available commands are:\n" + "\n" + "\tzstream dump [-vCd] FILE\n" + "\t... | zstream dump [-vCd]\n" + "\n" + "\tzstream token resume_token\n" + "\n" + "\tzstream redup [-v] FILE | ...\n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + char *basename = strrchr(argv[0], '/'); + basename = basename ? (basename + 1) : argv[0]; + if (argc >= 1 && strcmp(basename, "zstreamdump") == 0) + return (zstream_do_dump(argc, argv)); + + if (argc < 2) + zstream_usage(); + + char *subcommand = argv[1]; + + if (strcmp(subcommand, "dump") == 0) { + return (zstream_do_dump(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "token") == 0) { + return (zstream_do_token(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "redup") == 0) { + return (zstream_do_redup(argc - 1, argv + 1)); + } else { + zstream_usage(); + } +} diff --git a/cmd/zstream/zstream.h b/cmd/zstream/zstream.h new file mode 100644 index 0000000000..319fecb287 --- /dev/null +++ b/cmd/zstream/zstream.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. 
+ */ + +#ifndef _ZSTREAM_H +#define _ZSTREAM_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zstream_do_redup(int, char *[]); +extern int zstream_do_dump(int, char *[]); +extern int zstream_do_token(int, char *[]); +extern void zstream_usage(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTREAM_H */ diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstream/zstream_dump.c similarity index 84% rename from cmd/zstreamdump/zstreamdump.c rename to cmd/zstream/zstream_dump.c index a162eceda5..45cf7b97a1 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstream/zstream_dump.c @@ -42,6 +42,7 @@ #include #include #include +#include "zstream.h" /* * If dump mode is enabled, the number of bytes to print per line @@ -53,23 +54,11 @@ */ #define DUMP_GROUPING 4 -uint64_t total_write_size = 0; uint64_t total_stream_len = 0; FILE *send_stream = 0; boolean_t do_byteswap = B_FALSE; boolean_t do_cksum = B_TRUE; -static void -usage(void) -{ - (void) fprintf(stderr, "usage: zstreamdump [-v] [-C] [-d] < file\n"); - (void) fprintf(stderr, "\t -v -- verbose\n"); - (void) fprintf(stderr, "\t -C -- suppress checksum verification\n"); - (void) fprintf(stderr, "\t -d -- dump contents of blocks modified, " - "implies verbose\n"); - exit(1); -} - static void * safe_malloc(size_t size) { @@ -118,7 +107,8 @@ read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum) sizeof (zio_cksum_t), cksum); if (r == 0) return (0); - if (!ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) && + if (do_cksum && + !ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) && !ZIO_CHECKSUM_EQUAL(saved_cksum, drr->drr_u.drr_checksum.drr_checksum)) { fprintf(stderr, "invalid checksum\n"); @@ -198,7 +188,7 @@ print_block(char *buf, int length) } /* - * Print an array of bytes to stdout as hexidecimal characters. str must + * Print an array of bytes to stdout as hexadecimal characters. str must * have buf_len * 2 + 1 bytes of space. 
*/ static void @@ -215,10 +205,13 @@ sprintf_bytes(char *str, uint8_t *buf, uint_t buf_len) } int -main(int argc, char *argv[]) +zstream_do_dump(int argc, char *argv[]) { char *buf = safe_malloc(SPA_MAXBLOCKSIZE); uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; + uint64_t total_payload_size = 0; + uint64_t total_overhead_size = 0; + uint64_t drr_byte_count[DRR_NUMTYPES] = { 0 }; char salt[ZIO_DATA_SALT_LEN * 2 + 1]; char iv[ZIO_DATA_IV_LEN * 2 + 1]; char mac[ZIO_DATA_MAC_LEN * 2 + 1]; @@ -236,6 +229,7 @@ main(int argc, char *argv[]) struct drr_spill *drrs = &thedrr.drr_u.drr_spill; struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; struct drr_object_range *drror = &thedrr.drr_u.drr_object_range; + struct drr_redact *drrr = &thedrr.drr_u.drr_redact; struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum; int c; boolean_t verbose = B_FALSE; @@ -269,26 +263,39 @@ main(int argc, char *argv[]) case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); - usage(); + zstream_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); - usage(); + zstream_usage(); break; } } - if (isatty(STDIN_FILENO)) { - (void) fprintf(stderr, - "Error: Backup stream can not be read " - "from a terminal.\n" - "You must redirect standard input.\n"); - exit(1); + if (argc > optind) { + const char *filename = argv[optind]; + send_stream = fopen(filename, "r"); + if (send_stream == NULL) { + (void) fprintf(stderr, + "Error while opening file '%s': %s\n", + filename, strerror(errno)); + exit(1); + } + } else { + if (isatty(STDIN_FILENO)) { + (void) fprintf(stderr, + "Error: The send stream is a binary format " + "and can not be read from a\n" + "terminal. 
Standard input must be redirected, " + "or a file must be\n" + "specified as a command-line argument.\n"); + exit(1); + } + send_stream = stdin; } fletcher_4_init(); - send_stream = stdin; while (read_hdr(drr, &zc)) { /* @@ -336,7 +343,9 @@ main(int argc, char *argv[]) } drr_record_count[drr->drr_type]++; + total_overhead_size += sizeof (*drr); total_records++; + payload_size = 0; switch (drr->drr_type) { case DRR_BEGIN: @@ -369,6 +378,8 @@ main(int argc, char *argv[]) (void) printf("\tfromguid = %llx\n", (u_longlong_t)drrb->drr_fromguid); (void) printf("\ttoname = %s\n", drrb->drr_toname); + (void) printf("\tpayloadlen = %u\n", + drr->drr_payloadlen); if (verbose) (void) printf("\n"); @@ -390,6 +401,7 @@ main(int argc, char *argv[]) nvlist_print(stdout, nv); nvlist_free(nv); } + payload_size = sz; } break; @@ -554,7 +566,6 @@ main(int argc, char *argv[]) if (dump) { print_block(buf, payload_size); } - total_write_size += payload_size; break; case DRR_WRITE_BYREF: @@ -683,6 +694,7 @@ main(int argc, char *argv[]) print_block(buf, P2ROUNDUP(drrwe->drr_psize, 8)); } + payload_size = P2ROUNDUP(drrwe->drr_psize, 8); break; case DRR_OBJECT_RANGE: if (do_byteswap) { @@ -711,6 +723,21 @@ main(int argc, char *argv[]) mac); } break; + case DRR_REDACT: + if (do_byteswap) { + drrr->drr_object = BSWAP_64(drrr->drr_object); + drrr->drr_offset = BSWAP_64(drrr->drr_offset); + drrr->drr_length = BSWAP_64(drrr->drr_length); + drrr->drr_toguid = BSWAP_64(drrr->drr_toguid); + } + if (verbose) { + (void) printf("REDACT object = %llu offset = " + "%llu length = %llu\n", + (u_longlong_t)drrr->drr_object, + (u_longlong_t)drrr->drr_offset, + (u_longlong_t)drrr->drr_length); + } + break; case DRR_NUMTYPES: /* should never be reached */ exit(1); @@ -723,6 +750,8 @@ main(int argc, char *argv[]) (longlong_t)drrc->drr_checksum.zc_word[3]); } pcksum = zc; + drr_byte_count[drr->drr_type] += payload_size; + total_payload_size += payload_size; } free(buf); fletcher_4_fini(); @@ -730,28 +759,40 @@ 
main(int argc, char *argv[]) /* Print final summary */ (void) printf("SUMMARY:\n"); - (void) printf("\tTotal DRR_BEGIN records = %lld\n", - (u_longlong_t)drr_record_count[DRR_BEGIN]); - (void) printf("\tTotal DRR_END records = %lld\n", - (u_longlong_t)drr_record_count[DRR_END]); - (void) printf("\tTotal DRR_OBJECT records = %lld\n", - (u_longlong_t)drr_record_count[DRR_OBJECT]); - (void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n", - (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]); - (void) printf("\tTotal DRR_WRITE records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE]); - (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]); - (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]); - (void) printf("\tTotal DRR_FREE records = %lld\n", - (u_longlong_t)drr_record_count[DRR_FREE]); - (void) printf("\tTotal DRR_SPILL records = %lld\n", - (u_longlong_t)drr_record_count[DRR_SPILL]); + (void) printf("\tTotal DRR_BEGIN records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_BEGIN], + (u_longlong_t)drr_byte_count[DRR_BEGIN]); + (void) printf("\tTotal DRR_END records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_END], + (u_longlong_t)drr_byte_count[DRR_END]); + (void) printf("\tTotal DRR_OBJECT records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_OBJECT], + (u_longlong_t)drr_byte_count[DRR_OBJECT]); + (void) printf("\tTotal DRR_FREEOBJECTS records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREEOBJECTS], + (u_longlong_t)drr_byte_count[DRR_FREEOBJECTS]); + (void) printf("\tTotal DRR_WRITE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE], + (u_longlong_t)drr_byte_count[DRR_WRITE]); + (void) printf("\tTotal DRR_WRITE_BYREF records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE_BYREF], + (u_longlong_t)drr_byte_count[DRR_WRITE_BYREF]); + (void) 
printf("\tTotal DRR_WRITE_EMBEDDED records = %lld (%llu " + "bytes)\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED], + (u_longlong_t)drr_byte_count[DRR_WRITE_EMBEDDED]); + (void) printf("\tTotal DRR_FREE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREE], + (u_longlong_t)drr_byte_count[DRR_FREE]); + (void) printf("\tTotal DRR_SPILL records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_SPILL], + (u_longlong_t)drr_byte_count[DRR_SPILL]); (void) printf("\tTotal records = %lld\n", (u_longlong_t)total_records); - (void) printf("\tTotal write size = %lld (0x%llx)\n", - (u_longlong_t)total_write_size, (u_longlong_t)total_write_size); + (void) printf("\tTotal payload size = %lld (0x%llx)\n", + (u_longlong_t)total_payload_size, (u_longlong_t)total_payload_size); + (void) printf("\tTotal header overhead = %lld (0x%llx)\n", + (u_longlong_t)total_overhead_size, + (u_longlong_t)total_overhead_size); (void) printf("\tTotal stream length = %lld (0x%llx)\n", (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len); return (0); diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c new file mode 100644 index 0000000000..474527e76e --- /dev/null +++ b/cmd/zstream/zstream_redup.c @@ -0,0 +1,469 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_fletcher.h" +#include "zstream.h" + + +#define MAX_RDT_PHYSMEM_PERCENT 20 +#define SMALLEST_POSSIBLE_MAX_RDT_MB 128 + +typedef struct redup_entry { + struct redup_entry *rde_next; + uint64_t rde_guid; + uint64_t rde_object; + uint64_t rde_offset; + uint64_t rde_stream_offset; +} redup_entry_t; + +typedef struct redup_table { + redup_entry_t **redup_hash_array; + umem_cache_t *ddecache; + uint64_t ddt_count; + int numhashbits; +} redup_table_t; + +int +highbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); +} + +static void * +safe_calloc(size_t n) +{ + void *rv = calloc(1, n); + if (rv == NULL) { + fprintf(stderr, + "Error: could not allocate %u bytes of memory\n", + (int)n); + exit(1); + } + return (rv); +} + +/* + * Safe version of fread(), exits on error. + */ +static int +sfread(void *buf, size_t size, FILE *fp) +{ + int rv = fread(buf, size, 1, fp); + if (rv == 0 && ferror(fp)) { + (void) fprintf(stderr, "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } + return (rv); +} + +/* + * Safe version of pread(), exits on error. 
+ */ +static void +spread(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t err = pread(fd, buf, count, offset); + if (err == -1) { + (void) fprintf(stderr, + "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } else if (err != count) { + (void) fprintf(stderr, + "Error while reading file: short read\n"); + exit(1); + } +} + +static int +dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, + zio_cksum_t *zc, int outfd) +{ + assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) + == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); + if (drr->drr_type != DRR_BEGIN) { + assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. + drr_checksum.drr_checksum)); + drr->drr_u.drr_checksum.drr_checksum = *zc; + } + fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), zc); + if (write(outfd, drr, sizeof (*drr)) == -1) + return (errno); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, zc); + if (write(outfd, payload, payload_len) == -1) + return (errno); + } + return (0); +} + +static void +rdt_insert(redup_table_t *rdt, + uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) +{ + uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); + redup_entry_t **rdepp; + + rdepp = &(rdt->redup_hash_array[hashcode]); + redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL); + rde->rde_next = *rdepp; + rde->rde_guid = guid; + rde->rde_object = object; + rde->rde_offset = offset; + rde->rde_stream_offset = stream_offset; + *rdepp = rde; + rdt->ddt_count++; +} + +static void +rdt_lookup(redup_table_t *rdt, + uint64_t guid, uint64_t object, uint64_t offset, + uint64_t *stream_offsetp) +{ + uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t hashcode = BF64_GET(ch, 0, 
rdt->numhashbits); + + for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; + rde != NULL; rde = rde->rde_next) { + if (rde->rde_guid == guid && + rde->rde_object == object && + rde->rde_offset == offset) { + *stream_offsetp = rde->rde_stream_offset; + return; + } + } + assert(!"could not find expected redup table entry"); +} + +/* + * Convert a dedup stream (generated by "zfs send -D") to a + * non-deduplicated stream. The entire infd will be converted, including + * any substreams in a stream package (generated by "zfs send -RD"). The + * infd must be seekable. + */ +static void +zfs_redup_stream(int infd, int outfd, boolean_t verbose) +{ + int bufsz = SPA_MAXBLOCKSIZE; + dmu_replay_record_t thedrr = { 0 }; + dmu_replay_record_t *drr = &thedrr; + redup_table_t rdt; + zio_cksum_t stream_cksum; + uint64_t numbuckets; + uint64_t num_records = 0; + uint64_t num_write_byref_records = 0; + +#ifdef _ILP32 + uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; +#else + uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); + uint64_t max_rde_size = + MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, + SMALLEST_POSSIBLE_MAX_RDT_MB << 20); +#endif + + numbuckets = max_rde_size / (sizeof (redup_entry_t)); + + /* + * numbuckets must be a power of 2. Increase number to + * a power of 2 if necessary. + */ + if (!ISP2(numbuckets)) + numbuckets = 1ULL << highbit64(numbuckets); + + rdt.redup_hash_array = + safe_calloc(numbuckets * sizeof (redup_entry_t *)); + rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); + rdt.numhashbits = highbit64(numbuckets) - 1; + rdt.ddt_count = 0; + + char *buf = safe_calloc(bufsz); + FILE *ofp = fdopen(infd, "r"); + long offset = ftell(ofp); + while (sfread(drr, sizeof (*drr), ofp) != 0) { + num_records++; + + /* + * We need to regenerate the checksum. 
+ */ + if (drr->drr_type != DRR_BEGIN) { + bzero(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (drr->drr_u.drr_checksum.drr_checksum)); + } + + uint64_t payload_size = 0; + switch (drr->drr_type) { + case DRR_BEGIN: + { + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int fflags; + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + + assert(drrb->drr_magic == DMU_BACKUP_MAGIC); + + /* clear the DEDUP feature flag for this stream */ + fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + /* cppcheck-suppress syntaxError */ + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); + + int sz = drr->drr_payloadlen; + if (sz != 0) { + if (sz > bufsz) { + free(buf); + buf = safe_calloc(sz); + bufsz = sz; + } + (void) sfread(buf, sz, ofp); + } + payload_size = sz; + break; + } + + case DRR_END: + { + struct drr_end *drre = &drr->drr_u.drr_end; + /* + * Use the recalculated checksum, unless this is + * the END record of a stream package, which has + * no checksum. + */ + if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) + drre->drr_checksum = stream_cksum; + break; + } + + case DRR_OBJECT: + { + struct drr_object *drro = &drr->drr_u.drr_object; + + if (drro->drr_bonuslen > 0) { + payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); + (void) sfread(buf, payload_size, ofp); + } + break; + } + + case DRR_SPILL: + { + struct drr_spill *drrs = &drr->drr_u.drr_spill; + payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); + (void) sfread(buf, payload_size, ofp); + break; + } + + case DRR_WRITE_BYREF: + { + struct drr_write_byref drrwb = + drr->drr_u.drr_write_byref; + + num_write_byref_records++; + + /* + * Look up in hash table by drrwb->drr_refguid, + * drr_refobject, drr_refoffset. Replace this + * record with the found WRITE record, but with + * drr_object,drr_offset,drr_toguid replaced with ours. 
+ */ + uint64_t stream_offset = 0; + rdt_lookup(&rdt, drrwb.drr_refguid, + drrwb.drr_refobject, drrwb.drr_refoffset, + &stream_offset); + + spread(infd, drr, sizeof (*drr), stream_offset); + + assert(drr->drr_type == DRR_WRITE); + struct drr_write *drrw = &drr->drr_u.drr_write; + assert(drrw->drr_toguid == drrwb.drr_refguid); + assert(drrw->drr_object == drrwb.drr_refobject); + assert(drrw->drr_offset == drrwb.drr_refoffset); + + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + spread(infd, buf, payload_size, + stream_offset + sizeof (*drr)); + + drrw->drr_toguid = drrwb.drr_toguid; + drrw->drr_object = drrwb.drr_object; + drrw->drr_offset = drrwb.drr_offset; + break; + } + + case DRR_WRITE: + { + struct drr_write *drrw = &drr->drr_u.drr_write; + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + (void) sfread(buf, payload_size, ofp); + + rdt_insert(&rdt, drrw->drr_toguid, + drrw->drr_object, drrw->drr_offset, offset); + break; + } + + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &drr->drr_u.drr_write_embedded; + payload_size = + P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); + (void) sfread(buf, payload_size, ofp); + break; + } + + case DRR_FREEOBJECTS: + case DRR_FREE: + case DRR_OBJECT_RANGE: + break; + + default: + (void) fprintf(stderr, "INVALID record type 0x%x\n", + drr->drr_type); + /* should never happen, so assert */ + assert(B_FALSE); + } + + if (feof(ofp)) { + fprintf(stderr, "Error: unexpected end-of-file\n"); + exit(1); + } + if (ferror(ofp)) { + fprintf(stderr, "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } + + /* + * We need to recalculate the checksum, and it needs to be + * initially zero to do that. BEGIN records don't have + * a checksum. 
+ */ + if (drr->drr_type != DRR_BEGIN) { + bzero(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (drr->drr_u.drr_checksum.drr_checksum)); + } + if (dump_record(drr, buf, payload_size, + &stream_cksum, outfd) != 0) + break; + if (drr->drr_type == DRR_END) { + /* + * Typically the END record is either the last + * thing in the stream, or it is followed + * by a BEGIN record (which also zeros the checksum). + * However, a stream package ends with two END + * records. The last END record's checksum starts + * from zero. + */ + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + } + offset = ftell(ofp); + } + + if (verbose) { + char mem_str[16]; + zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), + mem_str, sizeof (mem_str)); + fprintf(stderr, "converted stream with %llu total records, " + "including %llu dedup records, using %sB memory.\n", + (long long)num_records, + (long long)num_write_byref_records, + mem_str); + } + + umem_cache_destroy(rdt.ddecache); + free(rdt.redup_hash_array); + free(buf); + (void) fclose(ofp); +} + +int +zstream_do_redup(int argc, char *argv[]) +{ + boolean_t verbose = B_FALSE; + int c; + + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) { + case 'v': + verbose = B_TRUE; + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + zstream_usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) + zstream_usage(); + + const char *filename = argv[0]; + + if (isatty(STDOUT_FILENO)) { + (void) fprintf(stderr, + "Error: Stream can not be written to a terminal.\n" + "You must redirect standard output.\n"); + return (1); + } + + int fd = open(filename, O_RDONLY); + if (fd == -1) { + (void) fprintf(stderr, + "Error while opening file '%s': %s\n", + filename, strerror(errno)); + exit(1); + } + + fletcher_4_init(); + zfs_redup_stream(fd, STDOUT_FILENO, verbose); + fletcher_4_fini(); + + close(fd); + + return (0); +} diff --git a/cmd/zstream/zstream_token.c b/cmd/zstream/zstream_token.c new file 
mode 100644 index 0000000000..36a76a4bb8 --- /dev/null +++ b/cmd/zstream/zstream_token.c @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Portions Copyright 2012 Martin Matuska + */ + +/* + * Copyright (c) 2020 by Datto Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include "zstream.h" + +int +zstream_do_token(int argc, char *argv[]) +{ + char *resume_token = NULL; + + if (argc < 2) { + (void) fprintf(stderr, "Need to pass the resume token\n"); + zstream_usage(); + } + + resume_token = argv[1]; + + libzfs_handle_t *hdl = libzfs_init(); + + nvlist_t *resume_nvl = + zfs_send_resume_token_to_nvlist(hdl, resume_token); + + if (resume_nvl == NULL) { + (void) fprintf(stderr, + "Unable to parse resume token: %s\n", + libzfs_error_description(hdl)); + libzfs_fini(hdl); + return (1); + } + + dump_nvlist(resume_nvl, 5); + nvlist_free(resume_nvl); + + libzfs_fini(hdl); + return (0); +} diff --git a/cmd/zstreamdump/.gitignore b/cmd/zstreamdump/.gitignore deleted file mode 100644 index ca44a529eb..0000000000 --- a/cmd/zstreamdump/.gitignore +++ /dev/null @@ -1 +0,0 @@ -zstreamdump diff --git a/cmd/zstreamdump/Makefile.am b/cmd/zstreamdump/Makefile.am deleted file mode 100644 index f80b5018e0..0000000000 --- a/cmd/zstreamdump/Makefile.am +++ /dev/null @@ -1,14 +0,0 @@ -include $(top_srcdir)/config/Rules.am - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - -sbin_PROGRAMS = zstreamdump - -zstreamdump_SOURCES = \ - zstreamdump.c - -zstreamdump_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs/libzfs.la diff --git a/cmd/ztest/Makefile.am b/cmd/ztest/Makefile.am index 55af416805..d5e335e6d2 100644 --- a/cmd/ztest/Makefile.am +++ b/cmd/ztest/Makefile.am @@ -7,11 +7,7 @@ AM_CFLAGS += $(NO_FORMAT_TRUNCATION) AM_CFLAGS += $(FRAME_LARGER_THAN) # Unconditionally enable ASSERTs -AM_CPPFLAGS += -DDEBUG -UNDEBUG - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG sbin_PROGRAMS = ztest @@ -19,8 +15,11 @@ ztest_SOURCES = \ ztest.c ztest_LDADD = \ - 
$(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzpool/libzpool.la + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la ztest_LDADD += -lm ztest_LDFLAGS = -pthread + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 9c2cf95018..5a5c381409 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -74,7 +74,7 @@ * * To turn this into an overnight stress test, use -T to specify run time. * - * You can ask more more vdevs [-v], datasets [-d], or threads [-t] + * You can ask more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * * Use the -k option to set the desired frequency of kills. @@ -104,9 +104,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -115,14 +117,14 @@ #include #include #include -#include +#include #include #include #include #include -#include #include #include +#include #include #include #include @@ -132,7 +134,7 @@ #include #include #include -#ifdef __GLIBC__ +#if (__GLIBC__ && !__UCLIBC__) #include /* for backtrace() */ #endif @@ -157,6 +159,9 @@ enum ztest_class_state { ZTEST_VDEV_CLASS_RND }; +#define ZO_GVARS_MAX_ARGLEN ((size_t)64) +#define ZO_GVARS_MAX_COUNT ((size_t)10) + typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; @@ -167,8 +172,11 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; - int zo_raidz; - int zo_raidz_parity; + int zo_raid_children; + int zo_raid_parity; + char zo_raid_type[8]; + int zo_draid_data; + int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; @@ -181,30 +189,64 @@ typedef struct ztest_shared_opts { int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; + int zo_gvars_count; + char 
zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; } ztest_shared_opts_t; +/* Default values for command line options. */ +#define DEFAULT_POOL "ztest" +#define DEFAULT_VDEV_DIR "/tmp" +#define DEFAULT_VDEV_COUNT 5 +#define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ +#define DEFAULT_VDEV_SIZE_STR "256M" +#define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT +#define DEFAULT_MIRRORS 2 +#define DEFAULT_RAID_CHILDREN 4 +#define DEFAULT_RAID_PARITY 1 +#define DEFAULT_DRAID_DATA 4 +#define DEFAULT_DRAID_SPARES 1 +#define DEFAULT_DATASETS_COUNT 7 +#define DEFAULT_THREADS 23 +#define DEFAULT_RUN_TIME 300 /* 300 seconds */ +#define DEFAULT_RUN_TIME_STR "300 sec" +#define DEFAULT_PASS_TIME 60 /* 60 seconds */ +#define DEFAULT_PASS_TIME_STR "60 sec" +#define DEFAULT_KILL_RATE 70 /* 70% kill rate */ +#define DEFAULT_KILLRATE_STR "70%" +#define DEFAULT_INITS 1 +#define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ +#define DEFAULT_FORCE_GANGING (64 << 10) +#define DEFAULT_FORCE_GANGING_STR "64K" + +/* Simplifying assumption: -1 is not a valid default. 
*/ +#define NO_DEFAULT -1 + static const ztest_shared_opts_t ztest_opts_defaults = { - .zo_pool = "ztest", - .zo_dir = "/tmp", + .zo_pool = DEFAULT_POOL, + .zo_dir = DEFAULT_VDEV_DIR, .zo_alt_ztest = { '\0' }, .zo_alt_libpath = { '\0' }, - .zo_vdevs = 5, - .zo_ashift = SPA_MINBLOCKSHIFT, - .zo_mirrors = 2, - .zo_raidz = 4, - .zo_raidz_parity = 1, - .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ - .zo_datasets = 7, - .zo_threads = 23, - .zo_passtime = 60, /* 60 seconds */ - .zo_killrate = 70, /* 70% kill rate */ + .zo_vdevs = DEFAULT_VDEV_COUNT, + .zo_ashift = DEFAULT_ASHIFT, + .zo_mirrors = DEFAULT_MIRRORS, + .zo_raid_children = DEFAULT_RAID_CHILDREN, + .zo_raid_parity = DEFAULT_RAID_PARITY, + .zo_raid_type = VDEV_TYPE_RAIDZ, + .zo_vdev_size = DEFAULT_VDEV_SIZE, + .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ + .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ + .zo_datasets = DEFAULT_DATASETS_COUNT, + .zo_threads = DEFAULT_THREADS, + .zo_passtime = DEFAULT_PASS_TIME, + .zo_killrate = DEFAULT_KILL_RATE, .zo_verbose = 0, .zo_mmp_test = 0, - .zo_init = 1, - .zo_time = 300, /* 5 minutes */ - .zo_maxloops = 50, /* max loops during spa_freeze() */ - .zo_metaslab_force_ganging = 64 << 10, + .zo_init = DEFAULT_INITS, + .zo_time = DEFAULT_RUN_TIME, + .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ + .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, + .zo_gvars_count = 0, }; extern uint64_t metaslab_force_ganging; @@ -232,7 +274,7 @@ static ztest_shared_ds_t *ztest_shared_ds; #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ - (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) + (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -359,7 +401,6 @@ ztest_func_t ztest_dsl_prop_get_set; ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; 
-ztest_func_t ztest_ddt_repair; ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; @@ -414,7 +455,6 @@ ztest_info_t ztest_info[] = { ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), - ZTI_INIT(ztest_ddt_repair, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), ZTI_INIT(ztest_reguid, 1, &zopt_rarely), @@ -558,7 +598,7 @@ dump_debug_buffer(void) static void sig_handler(int signo) { struct sigaction action; -#ifdef __GLIBC__ /* backtrace() is a GNU extension */ +#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ int nptrs; void *buffer[BACKTRACE_SZ]; @@ -582,7 +622,7 @@ static void sig_handler(int signo) char *fatal_msg; -static void +static __attribute__((noreturn)) __attribute__((format(printf, 2, 3))) void fatal(int do_perror, char *message, ...) { va_list args; @@ -634,7 +674,6 @@ str2shift(const char *buf) } (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); - /* NOTREACHED */ } static uint64_t @@ -650,7 +689,12 @@ nicenumtoull(const char *buf) } else if (end[0] == '.') { double fval = strtod(buf, &end); fval *= pow(2, str2shift(end)); - if (fval > UINT64_MAX) { + /* + * UINT64_MAX is not exactly representable as a double. + * The closest representation is UINT64_MAX + 1, so we + * use a >= comparison instead of > for the bounds check. 
+ */ + if (fval >= (double)UINT64_MAX) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); @@ -668,66 +712,172 @@ nicenumtoull(const char *buf) return (val); } +typedef struct ztest_option { + const char short_opt; + const char *long_opt; + const char *long_opt_param; + const char *comment; + unsigned int default_int; + char *default_str; +} ztest_option_t; + +/* + * The following option_table is used for generating the usage info as well as + * the long and short option information for calling getopt_long(). + */ +static ztest_option_t option_table[] = { + { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, + NULL}, + { 's', "vdev-size", "INTEGER", "Size of each vdev", + NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, + { 'a', "alignment-shift", "INTEGER", + "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, + { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", + DEFAULT_MIRRORS, NULL}, + { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", + DEFAULT_RAID_CHILDREN, NULL}, + { 'R', "raid-parity", "INTEGER", "Raid parity", + DEFAULT_RAID_PARITY, NULL}, + { 'K', "raid-kind", "raidz|draid|random", "Raid kind", + NO_DEFAULT, "random"}, + { 'D', "draid-data", "INTEGER", "Number of draid data drives", + DEFAULT_DRAID_DATA, NULL}, + { 'S', "draid-spares", "INTEGER", "Number of draid spares", + DEFAULT_DRAID_SPARES, NULL}, + { 'd', "datasets", "INTEGER", "Number of datasets", + DEFAULT_DATASETS_COUNT, NULL}, + { 't', "threads", "INTEGER", "Number of ztest threads", + DEFAULT_THREADS, NULL}, + { 'g', "gang-block-threshold", "INTEGER", + "Metaslab gang block threshold", + NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, + { 'i', "init-count", "INTEGER", "Number of times to initialize pool", + DEFAULT_INITS, NULL}, + { 'k', "kill-percentage", "INTEGER", "Kill percentage", + NO_DEFAULT, DEFAULT_KILLRATE_STR}, + { 'p', "pool-name", "STRING", "Pool name", + NO_DEFAULT, DEFAULT_POOL}, + { 'f', "vdev-file-directory", "PATH", "File 
directory for vdev files", + NO_DEFAULT, DEFAULT_VDEV_DIR}, + { 'M', "multi-host", NULL, + "Multi-host; simulate pool imported on remote host", + NO_DEFAULT, NULL}, + { 'E', "use-existing-pool", NULL, + "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, + { 'T', "run-time", "INTEGER", "Total run time", + NO_DEFAULT, DEFAULT_RUN_TIME_STR}, + { 'P', "pass-time", "INTEGER", "Time per pass", + NO_DEFAULT, DEFAULT_PASS_TIME_STR}, + { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", + DEFAULT_MAX_LOOPS, NULL}, + { 'B', "alt-ztest", "PATH", "Alternate ztest path", + NO_DEFAULT, NULL}, + { 'C', "vdev-class-state", "on|off|random", "vdev class state", + NO_DEFAULT, "random"}, + { 'o', "option", "\"OPTION=INTEGER\"", + "Set global variable to an unsigned 32-bit integer value", + NO_DEFAULT, NULL}, + { 'G', "dump-debug-msg", NULL, + "Dump zfs_dbgmsg buffer before exiting due to an error", + NO_DEFAULT, NULL}, + { 'V', "verbose", NULL, + "Verbose (use multiple times for ever more verbosity)", + NO_DEFAULT, NULL}, + { 'h', "help", NULL, "Show this help", + NO_DEFAULT, NULL}, + {0, 0, 0, 0, 0, 0} +}; + +static struct option *long_opts = NULL; +static char *short_opts = NULL; + +static void +init_options(void) +{ + ASSERT3P(long_opts, ==, NULL); + ASSERT3P(short_opts, ==, NULL); + + int count = sizeof (option_table) / sizeof (option_table[0]); + long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); + + short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); + int short_opt_index = 0; + + for (int i = 0; i < count; i++) { + long_opts[i].val = option_table[i].short_opt; + long_opts[i].name = option_table[i].long_opt; + long_opts[i].has_arg = option_table[i].long_opt_param != NULL + ? 
required_argument : no_argument; + long_opts[i].flag = NULL; + short_opts[short_opt_index++] = option_table[i].short_opt; + if (option_table[i].long_opt_param != NULL) { + short_opts[short_opt_index++] = ':'; + } + } +} + +static void +fini_options(void) +{ + int count = sizeof (option_table) / sizeof (option_table[0]); + + umem_free(long_opts, sizeof (struct option) * count); + umem_free(short_opts, sizeof (char) * 2 * count); + + long_opts = NULL; + short_opts = NULL; +} + static void usage(boolean_t requested) { - const ztest_shared_opts_t *zo = &ztest_opts_defaults; - - char nice_vdev_size[NN_NUMBUF_SZ]; - char nice_force_ganging[NN_NUMBUF_SZ]; + char option[80]; FILE *fp = requested ? stdout : stderr; - nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); - nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, - sizeof (nice_force_ganging)); + (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); + for (int i = 0; option_table[i].short_opt != 0; i++) { + if (option_table[i].long_opt_param != NULL) { + (void) sprintf(option, " -%c --%s=%s", + option_table[i].short_opt, + option_table[i].long_opt, + option_table[i].long_opt_param); + } else { + (void) sprintf(option, " -%c --%s", + option_table[i].short_opt, + option_table[i].long_opt); + } + (void) fprintf(fp, " %-40s%s", option, + option_table[i].comment); - (void) fprintf(fp, "Usage: %s\n" - "\t[-v vdevs (default: %llu)]\n" - "\t[-s size_of_each_vdev (default: %s)]\n" - "\t[-a alignment_shift (default: %d)] use 0 for random\n" - "\t[-m mirror_copies (default: %d)]\n" - "\t[-r raidz_disks (default: %d)]\n" - "\t[-R raidz_parity (default: %d)]\n" - "\t[-d datasets (default: %d)]\n" - "\t[-t threads (default: %d)]\n" - "\t[-g gang_block_threshold (default: %s)]\n" - "\t[-i init_count (default: %d)] initialize pool i times\n" - "\t[-k kill_percentage (default: %llu%%)]\n" - "\t[-p pool_name (default: %s)]\n" - "\t[-f dir (default: %s)] file directory for vdev files\n" - "\t[-M] 
Multi-host simulate pool imported on remote host\n" - "\t[-V] verbose (use multiple times for ever more blather)\n" - "\t[-E] use existing pool instead of creating new one\n" - "\t[-T time (default: %llu sec)] total run time\n" - "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" - "\t[-P passtime (default: %llu sec)] time per pass\n" - "\t[-B alt_ztest (default: )] alternate ztest path\n" - "\t[-C vdev class state (default: random)] special=on|off|random\n" - "\t[-o variable=value] ... set global variable to an unsigned\n" - "\t 32-bit integer value\n" - "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" - "\t[-h] (print help)\n" - "", - zo->zo_pool, - (u_longlong_t)zo->zo_vdevs, /* -v */ - nice_vdev_size, /* -s */ - zo->zo_ashift, /* -a */ - zo->zo_mirrors, /* -m */ - zo->zo_raidz, /* -r */ - zo->zo_raidz_parity, /* -R */ - zo->zo_datasets, /* -d */ - zo->zo_threads, /* -t */ - nice_force_ganging, /* -g */ - zo->zo_init, /* -i */ - (u_longlong_t)zo->zo_killrate, /* -k */ - zo->zo_pool, /* -p */ - zo->zo_dir, /* -f */ - (u_longlong_t)zo->zo_time, /* -T */ - (u_longlong_t)zo->zo_maxloops, /* -F */ - (u_longlong_t)zo->zo_passtime); + if (option_table[i].long_opt_param != NULL) { + if (option_table[i].default_str != NULL) { + (void) fprintf(fp, " (default: %s)", + option_table[i].default_str); + } else if (option_table[i].default_int != NO_DEFAULT) { + (void) fprintf(fp, " (default: %u)", + option_table[i].default_int); + } + } + (void) fprintf(fp, "\n"); + } exit(requested ? 
0 : 1); } +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + ASSERT3S(ztest_fd_rand, >=, 0); + + if (range == 0) + return (0); + + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) + fatal(B_TRUE, "short read from /dev/urandom"); + + return (r % range); +} static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) @@ -777,11 +927,14 @@ process_options(int argc, char **argv) int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; + char raid_kind[8] = { "random" }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); - while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { + init_options(); + + while ((opt = getopt_long(argc, argv, short_opts, long_opts, + NULL)) != EOF) { value = 0; switch (opt) { case 'v': @@ -790,6 +943,8 @@ process_options(int argc, char **argv) case 'm': case 'r': case 'R': + case 'D': + case 'S': case 'd': case 't': case 'g': @@ -814,10 +969,19 @@ process_options(int argc, char **argv) zo->zo_mirrors = value; break; case 'r': - zo->zo_raidz = MAX(1, value); + zo->zo_raid_children = MAX(1, value); break; case 'R': - zo->zo_raidz_parity = MIN(MAX(value, 1), 3); + zo->zo_raid_parity = MIN(MAX(value, 1), 3); + break; + case 'K': + (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); + break; + case 'D': + zo->zo_draid_data = MAX(1, value); + break; + case 'S': + zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); @@ -876,8 +1040,21 @@ process_options(int argc, char **argv) ztest_parse_name_value(optarg, zo); break; case 'o': - if (set_global_var(optarg) != 0) + if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { + (void) fprintf(stderr, + "max global var count (%zu) exceeded\n", + ZO_GVARS_MAX_COUNT); usage(B_FALSE); + } + char *v = zo->zo_gvars[zo->zo_gvars_count]; + if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= + ZO_GVARS_MAX_ARGLEN) { + (void) fprintf(stderr, + "global var option '%s' is too long\n", + optarg); + usage(B_FALSE); + } + 
zo->zo_gvars_count++; break; case 'G': zo->zo_dump_dbgmsg = 1; @@ -892,7 +1069,56 @@ process_options(int argc, char **argv) } } - zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); + fini_options(); + + /* When raid choice is 'random' add a draid pool 50% of the time */ + if (strcmp(raid_kind, "random") == 0) { + (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? + "draid" : "raidz", sizeof (raid_kind)); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("choosing RAID type '%s'\n", raid_kind); + } + + if (strcmp(raid_kind, "draid") == 0) { + uint64_t min_devsize; + + /* With fewer disk use 256M, otherwise 128M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? + (256ULL << 20) : (128ULL << 20); + + /* No top-level mirrors with dRAID for now */ + zo->zo_mirrors = 0; + + /* Use more appropriate defaults for dRAID */ + if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) + zo->zo_vdevs = 1; + if (zo->zo_raid_children == + ztest_opts_defaults.zo_raid_children) + zo->zo_raid_children = 16; + if (zo->zo_ashift < 12) + zo->zo_ashift = 12; + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; + + if (zo->zo_draid_data + zo->zo_raid_parity > + zo->zo_raid_children - zo->zo_draid_spares) { + (void) fprintf(stderr, "error: too few draid " + "children (%d) for stripe width (%d)\n", + zo->zo_raid_children, + zo->zo_draid_data + zo->zo_raid_parity); + usage(B_FALSE); + } + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, + sizeof (zo->zo_raid_type)); + + } else /* using raidz */ { + ASSERT0(strcmp(raid_kind, "raidz")); + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? 
zo->zo_time * NANOSEC / zo->zo_vdevs : @@ -909,13 +1135,13 @@ process_options(int argc, char **argv) cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); - VERIFY(NULL != realpath(getexecname(), cmd)); + VERIFY3P(NULL, !=, realpath(getexecname(), cmd)); if (0 != access(altdir, F_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest path: %s", altdir); } - VERIFY(NULL != realpath(altdir, realaltdir)); + VERIFY3P(NULL, !=, realpath(altdir, realaltdir)); /* * 'cmd' should be of the form "/usr/bin//ztest". @@ -963,22 +1189,6 @@ ztest_kill(ztest_shared_t *zs) (void) kill(getpid(), SIGKILL); } -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - ASSERT3S(ztest_fd_rand, >=, 0); - - if (range == 0) - return (0); - - if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - /* ARGSUSED */ static void ztest_record_enospc(const char *s) @@ -994,12 +1204,26 @@ ztest_get_ashift(void) return (ztest_opts.zo_ashift); } +static boolean_t +ztest_is_draid_spare(const char *name) +{ + uint64_t spare_id = 0, parity = 0, vdev_id = 0; + + if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", + &parity, &vdev_id, &spare_id) == 3) { + return (B_TRUE); + } + + return (B_FALSE); +} + static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char *pathbuf; uint64_t vdev; nvlist_t *file; + boolean_t draid_spare = B_FALSE; pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -1021,31 +1245,34 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); } + } else { + draid_spare = ztest_is_draid_spare(path); } - if (size != 0) { + if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) - fatal(1, "can't open %s", path); + fatal(B_TRUE, "can't open %s", path); if (ftruncate(fd, size) != 0) - fatal(1, "can't ftruncate %s", path); + fatal(B_TRUE, "can't ftruncate %s", path); (void) close(fd); } - VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); - VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); + file = fnvlist_alloc(); + fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, + draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); + fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); + fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * -make_vdev_raidz(char *path, char *aux, char *pool, size_t size, +make_vdev_raid(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { - nvlist_t *raidz, **child; + nvlist_t *raid, **child; int c; if (r < 2) @@ -1055,20 +1282,40 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size, for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); - VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_RAIDZ) == 0); - VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - ztest_opts.zo_raidz_parity) == 0); - VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, - child, r) == 0); + raid = fnvlist_alloc(); + fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, + ztest_opts.zo_raid_type); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, + ztest_opts.zo_raid_parity); + fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, child, r); + + if (strcmp(ztest_opts.zo_raid_type, 
VDEV_TYPE_DRAID) == 0) { + uint64_t ndata = ztest_opts.zo_draid_data; + uint64_t nparity = ztest_opts.zo_raid_parity; + uint64_t nspares = ztest_opts.zo_draid_spares; + uint64_t children = ztest_opts.zo_raid_children; + uint64_t ngroups = 1; + + /* + * Calculate the minimum number of groups required to fill a + * slice. This is the LCM of the stripe width (data + parity) + * and the number of data drives (children - spares). + */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + } for (c = 0; c < r; c++) - nvlist_free(child[c]); + fnvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); - return (raidz); + return (raid); } static nvlist_t * @@ -1079,21 +1326,19 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size, int c; if (m < 1) - return (make_vdev_raidz(path, aux, pool, size, ashift, r)); + return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); + child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); - VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_MIRROR) == 0); - VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, - child, m) == 0); + mirror = fnvlist_alloc(); + fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); + fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, child, m); for (c = 0; c < m; c++) - nvlist_free(child[c]); + fnvlist_free(child[c]); umem_free(child, m * sizeof (nvlist_t *)); @@ -1108,7 +1353,7 @@ make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int c; boolean_t log; - 
ASSERT(t > 0); + ASSERT3S(t, >, 0); log = (class != NULL && strcmp(class, "log") == 0); @@ -1117,23 +1362,22 @@ make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, for (c = 0; c < t; c++) { child[c] = make_vdev_mirror(path, aux, pool, size, ashift, r, m); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - log) == 0); + fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); if (class != NULL && class[0] != '\0') { ASSERT(m > 1 || log); /* expecting a mirror */ - VERIFY(nvlist_add_string(child[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); + fnvlist_add_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, class); } } - VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, - child, t) == 0); + root = fnvlist_alloc(); + fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); + fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, + child, t); for (c = 0; c < t; c++) - nvlist_free(child[c]); + fnvlist_free(child[c]); umem_free(child, t * sizeof (nvlist_t *)); @@ -1164,7 +1408,7 @@ ztest_random_spa_version(uint64_t initial_version) static int ztest_random_blocksize(void) { - ASSERT(ztest_spa->spa_max_ashift != 0); + ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); /* * Choose a block size >= the ashift. 
@@ -1222,7 +1466,7 @@ ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); do { top = ztest_random(rvd->vdev_children); @@ -1290,12 +1534,12 @@ ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) nvlist_t *props = NULL; int error; - VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); + props = fnvlist_alloc(); + fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); error = spa_prop_set(spa, props); - nvlist_free(props); + fnvlist_free(props); if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -1329,7 +1573,11 @@ ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); - dsl_crypto_params_free(dcp, B_FALSE); + /* + * Note: if there was an error loading, the wkey was not + * consumed, and needs to be freed. 
+ */ + dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err == EINVAL) { @@ -1369,8 +1617,8 @@ ztest_rll_init(rll_t *rll) static void ztest_rll_destroy(rll_t *rll) { - ASSERT(rll->rll_writer == NULL); - ASSERT(rll->rll_readers == 0); + ASSERT3P(rll->rll_writer, ==, NULL); + ASSERT0(rll->rll_readers); mutex_destroy(&rll->rll_lock); cv_destroy(&rll->rll_cv); } @@ -1399,11 +1647,11 @@ ztest_rll_unlock(rll_t *rll) mutex_enter(&rll->rll_lock); if (rll->rll_writer) { - ASSERT(rll->rll_readers == 0); + ASSERT0(rll->rll_readers); rll->rll_writer = NULL; } else { - ASSERT(rll->rll_readers != 0); - ASSERT(rll->rll_writer == NULL); + ASSERT3S(rll->rll_readers, >, 0); + ASSERT3P(rll->rll_writer, ==, NULL); rll->rll_readers--; } @@ -1509,7 +1757,7 @@ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) error = dmu_tx_assign(tx, txg_how); if (error) { if (error == ERESTART) { - ASSERT(txg_how == TXG_NOWAIT); + ASSERT3U(txg_how, ==, TXG_NOWAIT); dmu_tx_wait(tx); } else { ASSERT3U(error, ==, ENOSPC); @@ -1519,35 +1767,10 @@ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) return (0); } txg = dmu_tx_get_txg(tx); - ASSERT(txg != 0); + ASSERT3U(txg, !=, 0); return (txg); } -static void -ztest_pattern_set(void *buf, uint64_t size, uint64_t value) -{ - uint64_t *ip = buf; - uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); - - while (ip < ip_end) - *ip++ = value; -} - -#ifndef NDEBUG -static boolean_t -ztest_pattern_match(void *buf, uint64_t size, uint64_t value) -{ - uint64_t *ip = buf; - uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); - uint64_t diff = 0; - - while (ip < ip_end) - diff |= (value - *ip++); - - return (diff == 0); -} -#endif - static void ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, @@ -1606,7 +1829,7 @@ ztest_bt_bonus(dmu_buf_t *db) * helps ensure that all dnode traversal code properly skips 
the * interior regions of large dnodes. */ -void +static void ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { @@ -1625,7 +1848,7 @@ ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, * Verify that the unused area of a bonus buffer is filled with the * expected tokens. */ -void +static void ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { @@ -1692,7 +1915,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) if (zil_replaying(zd->zd_zilog, tx)) return; - if (lr->lr_length > ZIL_MAX_LOG_DATA) + if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, @@ -1767,8 +1990,8 @@ ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ASSERT(lr->lr_doid == ZTEST_DIROBJ); - ASSERT(name[0] != '\0'); + ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); + ASSERT3S(name[0], !=, '\0'); tx = dmu_tx_create(os); @@ -1784,7 +2007,7 @@ ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) if (txg == 0) return (ENOSPC); - ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { @@ -1816,13 +2039,13 @@ ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) return (error); } - ASSERT(lr->lr_foid != 0); + ASSERT3U(lr->lr_foid, !=, 0); if (lr->lrz_type != DMU_OT_ZAP_OTHER) - VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, + VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, lr->lrz_blocksize, lr->lrz_ibshift, tx)); - VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); bbt = ztest_bt_bonus(db); dmu_buf_will_dirty(db, tx); ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, @@ -1830,7 +2053,7 @@ ztest_replay_create(void 
*arg1, void *arg2, boolean_t byteswap) ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); dmu_buf_rele(db, FTAG); - VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, + VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, &lr->lr_foid, tx)); (void) ztest_log_create(zd, tx, lr); @@ -1854,16 +2077,16 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ASSERT(lr->lr_doid == ZTEST_DIROBJ); - ASSERT(name[0] != '\0'); + ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); + ASSERT3S(name[0], !=, '\0'); - VERIFY3U(0, ==, + VERIFY0( zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); - ASSERT(object != 0); + ASSERT3U(object, !=, 0); ztest_object_lock(zd, object, RL_WRITER); - VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); + VERIFY0(dmu_object_info(os, object, &doi)); tx = dmu_tx_create(os); @@ -1877,12 +2100,12 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) } if (doi.doi_type == DMU_OT_ZAP_OTHER) { - VERIFY3U(0, ==, zap_destroy(os, object, tx)); + VERIFY0(zap_destroy(os, object, tx)); } else { - VERIFY3U(0, ==, dmu_object_free(os, object, tx)); + VERIFY0(dmu_object_free(os, object, tx)); } - VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); + VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); (void) ztest_log_remove(zd, tx, lr, object); @@ -1934,7 +2157,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); - VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); dmu_object_info_from_db(db, &doi); @@ -1968,7 +2191,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. 
*/ - ASSERT(offset % doi.doi_data_block_size == 0); + ASSERT0(offset % doi.doi_data_block_size); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; @@ -2049,8 +2272,8 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) return (ENOSPC); } - VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, - lr->lr_length, tx) == 0); + VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, + lr->lr_length, tx)); (void) ztest_log_truncate(zd, tx, lr); @@ -2078,7 +2301,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) ztest_object_lock(zd, lr->lr_foid, RL_WRITER); - VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, lr->lr_foid); @@ -2097,9 +2320,9 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) dnodesize = bbt->bt_dnodesize; if (zd->zd_zilog->zl_replay) { - ASSERT(lr->lr_size != 0); - ASSERT(lr->lr_mode != 0); - ASSERT(lrtxg != 0); + ASSERT3U(lr->lr_size, !=, 0); + ASSERT3U(lr->lr_mode, !=, 0); + ASSERT3U(lrtxg, !=, 0); } else { /* * Randomly change the size and increment the generation. 
@@ -2107,7 +2330,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * sizeof (*bbt); lr->lr_mode = bbt->bt_gen + 1; - ASSERT(lrtxg == 0); + ASSERT0(lrtxg); } /* @@ -2181,8 +2404,8 @@ ztest_get_done(zgd_t *zgd, int error) } static int -ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, - zio_t *zio) +ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, + struct lwb *lwb, zio_t *zio) { ztest_ds_t *zd = arg; objset_t *os = zd->zd_os; @@ -2224,22 +2447,22 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, + zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); - ASSERT(error == 0); + ASSERT0(error); } else { size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); } else { - ASSERT(offset < size); + ASSERT3U(offset, <, size); offset = 0; } - zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, + zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, @@ -2251,8 +2474,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zgd->zgd_db = db; zgd->zgd_bp = bp; - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); + ASSERT3U(db->db_offset, ==, offset); + ASSERT3U(db->db_size, ==, size); error = dmu_sync(zio, lr->lr_common.lrc_txg, ztest_get_done, zgd); @@ -2281,7 +2504,7 @@ ztest_lr_alloc(size_t lrsize, char *name) return (lr); } -void +static void ztest_lr_free(void *lr, size_t lrsize, char *name) { size_t namesize = name ? 
strlen(name) + 1 : 0; @@ -2306,20 +2529,20 @@ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, sizeof (uint64_t), 1, &od->od_object); if (error) { - ASSERT(error == ENOENT); - ASSERT(od->od_object == 0); + ASSERT3S(error, ==, ENOENT); + ASSERT0(od->od_object); missing++; } else { dmu_buf_t *db; ztest_block_tag_t *bbt; dmu_object_info_t doi; - ASSERT(od->od_object != 0); - ASSERT(missing == 0); /* there should be no gaps */ + ASSERT3U(od->od_object, !=, 0); + ASSERT0(missing); /* there should be no gaps */ ztest_object_lock(zd, od->od_object, RL_READER); - VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, - od->od_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, + FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); @@ -2362,7 +2585,7 @@ ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) lr->lr_crtime[0] = time(NULL); if (ztest_replay_create(zd, lr, B_FALSE) != 0) { - ASSERT(missing == 0); + ASSERT0(missing); od->od_object = 0; missing++; } else { @@ -2370,7 +2593,7 @@ ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) od->od_type = od->od_crtype; od->od_blocksize = od->od_crblocksize; od->od_gen = od->od_crgen; - ASSERT(od->od_object != 0); + ASSERT3U(od->od_object, !=, 0); } ztest_lr_free(lr, sizeof (*lr), od->od_name); @@ -2521,7 +2744,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) uint64_t blocksize; void *data; - VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); + VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); blocksize = doi.doi_data_block_size; data = umem_alloc(blocksize, UMEM_NOFAIL); @@ -2616,8 +2839,9 @@ ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, od->od_blocksize = 0; od->od_gen = 0; - (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", - tag, (longlong_t)id, (u_longlong_t)index); + (void) snprintf(od->od_name, sizeof (od->od_name), + 
"%s(%"PRId64")[%"PRIu64"]", + tag, id, index); } /* @@ -2659,7 +2883,7 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id) * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); - ASSERT(zd->zd_shared != NULL); + ASSERT3P(zd->zd_shared, !=, NULL); ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); @@ -2697,7 +2921,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) zil_close(zd->zd_zilog); /* zfsvfs_setup() */ - VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); + VERIFY3P(zil_open(os, ztest_get_data), ==, zd->zd_zilog); zil_replay(os, zd, ztest_replay_vector); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); @@ -2726,7 +2950,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); - nvlist_free(nvroot); + fnvlist_free(nvroot); /* * Attempt to create using a bad mirror. @@ -2734,7 +2958,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); - nvlist_free(nvroot); + fnvlist_free(nvroot); /* * Attempt to create an existing pool. It shouldn't matter @@ -2744,9 +2968,25 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); - nvlist_free(nvroot); - VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); - VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); + fnvlist_free(nvroot); + + /* + * We open a reference to the spa and then we try to export it + * expecting one of the following errors: + * + * EBUSY + * Because of the reference we just opened. 
+ * + * ZFS_ERR_EXPORT_IN_PROGRESS + * For the case that there is another ztest thread doing + * an export concurrently. + */ + VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); + int error = spa_destroy(zo->zo_pool); + if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { + fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", + spa->spa_name, error); + } spa_close(spa, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); @@ -2815,6 +3055,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_mmp_test) return; + /* dRAID added after feature flags, skip upgrade test. */ + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) + return; + mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); @@ -2824,13 +3068,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. 
*/ - switch (ztest_opts.zo_raidz_parity) { + switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; @@ -2854,17 +3098,18 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), version); - VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0); + VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); - VERIFY3S(spa_open(name, &spa, FTAG), ==, 0); + VERIFY0(spa_open(name, &spa, FTAG)); VERIFY3U(spa_version(spa), ==, version); newversion = ztest_random_spa_version(version + 1); if (ztest_opts.zo_verbose >= 4) { - (void) printf("upgrading spa version from %llu to %llu\n", - (u_longlong_t)version, (u_longlong_t)newversion); + (void) printf("upgrading spa version from " + "%"PRIu64" to %"PRIu64"\n", + version, newversion); } spa_upgrade(spa, newversion); @@ -2873,7 +3118,7 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) zpool_prop_to_name(ZPOOL_PROP_VERSION))); spa_close(spa, FTAG); - strfree(name); + kmem_strfree(name); mutex_exit(&ztest_vdev_lock); } @@ -2894,7 +3139,7 @@ ztest_spa_checkpoint(spa_t *spa) ztest_record_enospc(FTAG); break; default: - fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); + fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); } } @@ -2911,7 +3156,7 @@ ztest_spa_discard_checkpoint(spa_t *spa) case ZFS_ERR_NO_CHECKPOINT: break; default: - fatal(0, "spa_discard_checkpoint(%s) = %d", + fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", spa->spa_name, error); } @@ -2950,24 +3195,12 @@ vdev_lookup_by_path(vdev_t *vd, const char *path) return (NULL); } -/* - * Find the first available hole which can be used as a top-level. 
- */ -int -find_vdev_hole(spa_t *spa) +static int +spa_num_top_vdevs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; - int c; - - ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); - - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - - if (cvd->vdev_ishole) - break; - } - return (c); + ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); + return (rvd->vdev_children); } /* @@ -2988,11 +3221,12 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. @@ -3003,7 +3237,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) /* * find the first real slog in log allocation class */ - mg = spa_log_class(spa)->mc_rotor; + mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; while (!mg->mg_vd->vdev_islog) mg = mg->mg_next; @@ -3032,7 +3266,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: - fatal(0, "spa_vdev_remove() = %d", error); + fatal(B_FALSE, "spa_vdev_remove() = %d", error); } } else { spa_config_exit(spa, SCL_VDEV, FTAG); @@ -3042,10 +3276,11 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 
- "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + 1); error = spa_vdev_add(spa, nvroot); - nvlist_free(nvroot); + fnvlist_free(nvroot); switch (error) { case 0: @@ -3054,7 +3289,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) ztest_record_enospc("spa_vdev_add"); break; default: - fatal(0, "spa_vdev_add() = %d", error); + fatal(B_FALSE, "spa_vdev_add() = %d", error); } } @@ -3096,22 +3331,23 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); - nvlist_free(nvroot); + fnvlist_free(nvroot); if (error == ENOSPC) ztest_record_enospc("spa_vdev_add"); else if (error != 0) - fatal(0, "spa_vdev_add() = %d", error); + fatal(B_FALSE, "spa_vdev_add() = %d", error); /* * 50% of the time allow small blocks in the special class @@ -3152,7 +3388,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) char *aux; char *path; uint64_t guid = 0; - int error; + int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; @@ -3175,7 +3411,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) /* * Pick a random device to remove. 
*/ - guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; + + /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ + if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) + ignore_err = ENOTSUP; + + guid = svd->vdev_guid; } else { /* * Find an unused device we can add. @@ -3211,9 +3453,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) case 0: break; default: - fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); + fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); } - nvlist_free(nvroot); + fnvlist_free(nvroot); } else { /* * Remove an existing device. Sometimes, dirty its @@ -3232,7 +3474,10 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: - fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); + if (error != ignore_err) + fatal(B_FALSE, + "spa_vdev_remove(%"PRIu64") = %d", + guid, error); } } @@ -3261,7 +3506,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) mutex_enter(&ztest_vdev_lock); /* ensure we have a usable config; mirrors of raidz aren't supported */ - if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } @@ -3273,12 +3518,11 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) /* generate a config from the existing config */ mutex_enter(&spa->spa_props_lock); - VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, - &tree) == 0); + tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); mutex_exit(&spa->spa_props_lock); - VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0); + VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &child, &children)); schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); for (c = 0; c < children; c++) { @@ -3287,37 +3531,35 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) uint_t 
mchildren; if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { - VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, - 0) == 0); - VERIFY(nvlist_add_string(schild[schildren], - ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); - VERIFY(nvlist_add_uint64(schild[schildren], - ZPOOL_CONFIG_IS_HOLE, 1) == 0); + schild[schildren] = fnvlist_alloc(); + fnvlist_add_string(schild[schildren], + ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); + fnvlist_add_uint64(schild[schildren], + ZPOOL_CONFIG_IS_HOLE, 1); if (lastlogid == 0) lastlogid = schildren; ++schildren; continue; } lastlogid = 0; - VERIFY(nvlist_lookup_nvlist_array(child[c], - ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); - VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); + VERIFY0(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); + schild[schildren++] = fnvlist_dup(mchild[0]); } /* OK, create a config that can be used to split */ - VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, - lastlogid != 0 ? lastlogid : schildren) == 0); + split = fnvlist_alloc(); + fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); + fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, + lastlogid != 0 ? 
lastlogid : schildren); - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); + config = fnvlist_alloc(); + fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); for (c = 0; c < schildren; c++) - nvlist_free(schild[c]); + fnvlist_free(schild[c]); free(schild); - nvlist_free(split); + fnvlist_free(split); spa_config_exit(spa, SCL_VDEV, FTAG); @@ -3325,7 +3567,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); (void) pthread_rwlock_unlock(&ztest_name_lock); - nvlist_free(config); + fnvlist_free(config); if (error == 0) { (void) printf("successful split - results:\n"); @@ -3361,6 +3603,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; + int newvd_is_dspare = B_FALSE; int oldvd_is_log; int error, expected_error; @@ -3371,7 +3614,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3383,8 +3626,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&ztest_vdev_lock); - return; + goto out; } /* @@ -3409,16 +3651,19 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) /* pick a child from the mirror */ if (zs->zs_mirrors >= 1) { - ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); - ASSERT(oldvd->vdev_children >= zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; + ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); + ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; } /* pick a child out of the raidz group */ 
- if (ztest_opts.zo_raidz > 1) { - ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); - ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; + if (ztest_opts.zo_raid_children > 1) { + if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) + ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); + else + ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; } /* @@ -3427,7 +3672,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; - ASSERT(oldvd->vdev_children >= 2); + ASSERT3U(oldvd->vdev_children, >=, 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } @@ -3454,7 +3699,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && error != ZFS_ERR_DISCARDING_CHECKPOINT) - fatal(0, "detach (%s) returned %d", oldpath, error); + fatal(B_FALSE, "detach (%s) returned %d", + oldpath, error); goto out; } @@ -3465,6 +3711,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; + + if (newvd->vdev_ops == &vdev_draid_spare_ops) + newvd_is_dspare = B_TRUE; + (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, @@ -3498,6 +3748,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. + * + * If newvd is a distributed spare and it's being attached to a + * dRAID which is not its parent it should fail with EINVAL. 
*/ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || @@ -3510,10 +3763,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = replacing ? 0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; - else if (newsize < oldsize) + else if (!newvd_is_dspare && newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; + else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) + expected_error = ENOTSUP; else expected_error = 0; @@ -3525,9 +3780,18 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, NULL, 0, 0, 1); - error = spa_vdev_attach(spa, oldguid, root, replacing); + /* + * When supported select either a healing or sequential resilver. + */ + boolean_t rebuilding = B_FALSE; + if (pvd->vdev_ops == &vdev_mirror_ops || + pvd->vdev_ops == &vdev_root_ops) { + rebuilding = !!ztest_random(2); + } - nvlist_free(root); + error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); + + fnvlist_free(root); /* * If our parent was the replacing vdev, but the replace completed, @@ -3545,12 +3809,13 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || - error == ZFS_ERR_DISCARDING_CHECKPOINT) + error == ZFS_ERR_DISCARDING_CHECKPOINT || + error == ZFS_ERR_RESILVER_IN_PROGRESS || + error == ZFS_ERR_REBUILD_IN_PROGRESS) expected_error = error; - /* XXX workaround 6690467 */ if (error != expected_error && expected_error != EBUSY) { - fatal(0, "attach (%s %llu, %s %llu, %d) " + fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " "returned %d, expected %d", oldpath, oldsize, newpath, newsize, replacing, error, expected_error); @@ -3627,22 +3892,22 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id) /* * Callback function which expands the physical size of 
the vdev. */ -vdev_t * +static vdev_t * grow_vdev(vdev_t *vd, void *arg) { - ASSERTV(spa_t *spa = vd->vdev_spa); + spa_t *spa __maybe_unused = vd->vdev_spa; size_t *newsize = arg; size_t fsize; int fd; - ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); if ((fd = open(vd->vdev_path, O_RDWR)) == -1) return (vd); fsize = lseek(fd, 0, SEEK_END); - VERIFY(ftruncate(fd, *newsize) == 0); + VERIFY0(ftruncate(fd, *newsize)); if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", @@ -3656,7 +3921,7 @@ grow_vdev(vdev_t *vd, void *arg) * Callback function which expands a given vdev by calling vdev_online(). */ /* ARGSUSED */ -vdev_t * +static vdev_t * online_vdev(vdev_t *vd, void *arg) { spa_t *spa = vd->vdev_spa; @@ -3666,7 +3931,7 @@ online_vdev(vdev_t *vd, void *arg) vdev_state_t newstate = VDEV_STATE_UNKNOWN; int error; - ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* Calling vdev_online will initialize the new metaslabs */ @@ -3681,8 +3946,8 @@ online_vdev(vdev_t *vd, void *arg) */ if (error || newstate != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { - (void) printf("Unable to expand vdev, state %llu, " - "error %d\n", (u_longlong_t)newstate, error); + (void) printf("Unable to expand vdev, state %u, " + "error %d\n", newstate, error); } return (vd); } @@ -3697,12 +3962,12 @@ online_vdev(vdev_t *vd, void *arg) if (generation != spa->spa_config_generation) { if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " - "guid %llu, state %llu, expected gen %llu, " - "got gen %llu\n", - (u_longlong_t)guid, - (u_longlong_t)tvd->vdev_state, - (u_longlong_t)generation, - (u_longlong_t)spa->spa_config_generation); + "guid %"PRIu64", state %"PRIu64", " + "expected gen 
%"PRIu64", got gen %"PRIu64"\n", + guid, + tvd->vdev_state, + generation, + spa->spa_config_generation); } return (vd); } @@ -3716,7 +3981,7 @@ online_vdev(vdev_t *vd, void *arg) * If a NULL callback is passed, then we just return back the first * leaf vdev we encounter. */ -vdev_t * +static vdev_t * vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) { uint_t c; @@ -3797,7 +4062,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) mutex_exit(&ztest_checkpoint_lock); return; } - ASSERT(psize > 0); + ASSERT3U(psize, >, 0); newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); ASSERT3U(newsize, >, psize); @@ -3863,7 +4128,8 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) * Make sure we were able to grow the vdev. */ if (new_ms_count <= old_ms_count) { - fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", + fatal(B_FALSE, + "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", old_ms_count, new_ms_count); } @@ -3871,7 +4137,8 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) * Make sure we were able to grow the pool. */ if (new_class_space <= old_class_space) { - fatal(0, "LUN expansion failed: class_space %llu < %llu\n", + fatal(B_FALSE, + "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", old_class_space, new_class_space); } @@ -3899,8 +4166,8 @@ ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) /* * Create the objects common to all ztest datasets. 
*/ - VERIFY(zap_create_claim(os, ZTEST_DIROBJ, - DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); + VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, + DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); } static int @@ -4018,7 +4285,7 @@ ztest_snapshot_create(char *osname, uint64_t id) char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; - (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); + (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); error = dmu_objset_snapshot_one(osname, snapname); if (error == ENOSPC) { @@ -4026,7 +4293,7 @@ ztest_snapshot_create(char *osname, uint64_t id) return (B_FALSE); } if (error != 0 && error != EEXIST) { - fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, + fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, snapname, error); } return (B_TRUE); @@ -4038,12 +4305,13 @@ ztest_snapshot_destroy(char *osname, uint64_t id) char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; - (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, - (u_longlong_t)id); + (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", + osname, id); error = dsl_destroy_snapshot(snapname, B_FALSE); if (error != 0 && error != ENOENT) - fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); + fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", + snapname, error); return (B_TRUE); } @@ -4063,8 +4331,8 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) (void) pthread_rwlock_rdlock(&ztest_name_lock); - (void) snprintf(name, sizeof (name), "%s/temp_%llu", - ztest_opts.zo_pool, (u_longlong_t)id); + (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", + ztest_opts.zo_pool, id); /* * If this dataset exists from a previous run, process its replay log @@ -4103,7 +4371,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) ztest_record_enospc(FTAG); goto out; } - fatal(0, "dmu_objset_create(%s) = %d", name, error); + fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); } 
VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, @@ -4136,7 +4404,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Verify that we can hold an objset that is also owned. */ - VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); + VERIFY0(dmu_objset_hold(name, FTAG, &os2)); dmu_objset_rele(os2, FTAG); /* @@ -4169,7 +4437,7 @@ ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Cleanup non-standard snapshots and clones. */ -void +static void ztest_dsl_dataset_cleanup(char *osname, uint64_t id) { char *snap1name; @@ -4185,32 +4453,35 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t id) clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); - (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, - "%s@s1_%llu", osname, (u_longlong_t)id); - (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, - "%s/c1_%llu", osname, (u_longlong_t)id); - (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, - "%s@s2_%llu", clone1name, (u_longlong_t)id); - (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, - "%s/c2_%llu", osname, (u_longlong_t)id); - (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, - "%s@s3_%llu", clone1name, (u_longlong_t)id); + (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", + osname, id); + (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", + osname, id); + (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", + clone1name, id); + (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", + osname, id); + (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", + clone1name, id); error = dsl_destroy_head(clone2name); if (error && error != ENOENT) - fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); + fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); error = dsl_destroy_snapshot(snap3name, B_FALSE); if (error && error != 
ENOENT) - fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); + fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", + snap3name, error); error = dsl_destroy_snapshot(snap2name, B_FALSE); if (error && error != ENOENT) - fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); + fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", + snap2name, error); error = dsl_destroy_head(clone1name); if (error && error != ENOENT) - fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); + fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); error = dsl_destroy_snapshot(snap1name, B_FALSE); if (error && error != ENOENT) - fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); + fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", + snap1name, error); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); @@ -4244,16 +4515,16 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) ztest_dsl_dataset_cleanup(osname, id); - (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, - "%s@s1_%llu", osname, (u_longlong_t)id); - (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, - "%s/c1_%llu", osname, (u_longlong_t)id); - (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, - "%s@s2_%llu", clone1name, (u_longlong_t)id); - (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, - "%s/c2_%llu", osname, (u_longlong_t)id); - (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, - "%s@s3_%llu", clone1name, (u_longlong_t)id); + (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", + osname, id); + (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", + osname, id); + (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", + clone1name, id); + (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", + osname, id); + (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", + clone1name, id); error = dmu_objset_snapshot_one(osname, strchr(snap1name, 
'@') + 1); if (error && error != EEXIST) { @@ -4261,7 +4532,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) ztest_record_enospc(FTAG); goto out; } - fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); + fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); } error = dmu_objset_clone(clone1name, snap1name); @@ -4270,7 +4541,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) ztest_record_enospc(FTAG); goto out; } - fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); + fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); @@ -4279,7 +4550,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) ztest_record_enospc(FTAG); goto out; } - fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); + fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); @@ -4288,7 +4559,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) ztest_record_enospc(FTAG); goto out; } - fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); + fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); } error = dmu_objset_clone(clone2name, snap3name); @@ -4297,13 +4568,13 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) ztest_record_enospc(FTAG); goto out; } - fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); + fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); } error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os); if (error) - fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); + fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); if (error == ENOSPC) { dmu_objset_disown(os, B_TRUE, FTAG); @@ -4311,8 +4582,8 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) goto out; } if (error != EBUSY) - fatal(0, 
"dsl_dataset_promote(%s), %d, not EBUSY", clone2name, - error); + fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", + clone2name, error); dmu_objset_disown(os, B_TRUE, FTAG); out: @@ -4399,8 +4670,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); dmu_tx_t *tx; - int i, freeit, error; - uint64_t n, s, txg; + int freeit, error; + uint64_t i, n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); @@ -4448,7 +4719,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) bigobj = od[0].od_object; packobj = od[1].od_object; chunksize = od[0].od_gen; - ASSERT(chunksize == od[1].od_gen); + ASSERT3U(chunksize, ==, od[1].od_gen); /* * Prefetch a random chunk of the big object. @@ -4543,22 +4814,26 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; - ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); - ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); + ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) - fatal(0, "future leak: got %llx, open txg is %llx", + fatal(B_FALSE, + "future leak: got %"PRIx64", open txg is %"PRIx64"", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) - fatal(0, "wrong index: got %llx, wanted %llx+%llx", + fatal(B_FALSE, "wrong index: " + "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + fatal(B_FALSE, "pack/bigH mismatch in %p/%p", + pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + fatal(B_FALSE, "pack/bigT mismatch in %p/%p", + pack, bigT); if (freeit) { 
bzero(pack, sizeof (bufwad_t)); @@ -4579,20 +4854,16 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) if (freeit) { if (ztest_opts.zo_verbose >= 7) { - (void) printf("freeing offset %llx size %llx" - " txg %llx\n", - (u_longlong_t)bigoff, - (u_longlong_t)bigsize, - (u_longlong_t)txg); + (void) printf("freeing offset %"PRIx64" size %"PRIx64"" + " txg %"PRIx64"\n", + bigoff, bigsize, txg); } - VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); + VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { if (ztest_opts.zo_verbose >= 7) { - (void) printf("writing offset %llx size %llx" - " txg %llx\n", - (u_longlong_t)bigoff, - (u_longlong_t)bigsize, - (u_longlong_t)txg); + (void) printf("writing offset %"PRIx64" size %"PRIx64"" + " txg %"PRIx64"\n", + bigoff, bigsize, txg); } dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } @@ -4606,13 +4877,13 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, packobj, packoff, + VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); - VERIFY(0 == dmu_read(os, bigobj, bigoff, + VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); - ASSERT(bcmp(packbuf, packcheck, packsize) == 0); - ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + ASSERT0(bcmp(packbuf, packcheck, packsize)); + ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); @@ -4623,7 +4894,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) umem_free(od, size); } -void +static void compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { @@ -4646,22 +4917,26 @@ compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; - ASSERT((uintptr_t)bigH - 
(uintptr_t)bigbuf < bigsize); - ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); + ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) - fatal(0, "future leak: got %llx, open txg is %llx", + fatal(B_FALSE, + "future leak: got %"PRIx64", open txg is %"PRIx64"", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) - fatal(0, "wrong index: got %llx, wanted %llx+%llx", + fatal(B_FALSE, "wrong index: " + "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + fatal(B_FALSE, "pack/bigH mismatch in %p/%p", + pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + fatal(B_FALSE, "pack/bigT mismatch in %p/%p", + pack, bigT); pack->bw_index = n + i; pack->bw_txg = txg; @@ -4732,12 +5007,12 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) packobj = od[1].od_object; blocksize = od[0].od_blocksize; chunksize = blocksize; - ASSERT(chunksize == od[1].od_gen); + ASSERT3U(chunksize, ==, od[1].od_gen); - VERIFY(dmu_object_info(os, bigobj, &doi) == 0); + VERIFY0(dmu_object_info(os, bigobj, &doi)); VERIFY(ISP2(doi.doi_data_block_size)); - VERIFY(chunksize == doi.doi_data_block_size); - VERIFY(chunksize >= 2 * sizeof (bufwad_t)); + VERIFY3U(chunksize, ==, doi.doi_data_block_size); + VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); /* * Pick a random index and compute the offsets into packobj and bigobj. 
@@ -4754,7 +5029,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) packbuf = umem_zalloc(packsize, UMEM_NOFAIL); bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); - VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); + VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); @@ -4840,11 +5115,9 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (ztest_opts.zo_verbose >= 7) { - (void) printf("writing offset %llx size %llx" - " txg %llx\n", - (u_longlong_t)bigoff, - (u_longlong_t)bigsize, - (u_longlong_t)txg); + (void) printf("writing offset %"PRIx64" size %"PRIx64"" + " txg %"PRIx64"\n", + bigoff, bigsize, txg); } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; @@ -4888,13 +5161,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, packobj, packoff, + VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); - VERIFY(0 == dmu_read(os, bigobj, bigoff, + VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); - ASSERT(bcmp(packbuf, packcheck, packsize) == 0); - ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + ASSERT0(bcmp(packbuf, packcheck, packsize)); + ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); @@ -5023,19 +5296,19 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) goto out; for (i = 0; i < 2; i++) { value[i] = i; - VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), + VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); } for (i = 0; i < 2; i++) { VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); - VERIFY3U(0, ==, + VERIFY0( zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 
ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); } for (i = 0; i < 2; i++) { - VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); + VERIFY0(zap_remove(os, object, hc[i], tx)); } dmu_tx_commit(tx); @@ -5045,8 +5318,8 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); - (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + (void) sprintf(propname, "prop_%"PRIu64"", prop); + (void) sprintf(txgname, "txg_%"PRIu64"", prop); bzero(value, sizeof (value)); last_txg = 0; @@ -5058,17 +5331,17 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); - VERIFY(zap_lookup(os, object, txgname, zl_intsize, - zl_ints, &last_txg) == 0); + VERIFY0(zap_lookup(os, object, txgname, zl_intsize, + zl_ints, &last_txg)); - VERIFY(zap_length(os, object, propname, &zl_intsize, - &zl_ints) == 0); + VERIFY0(zap_length(os, object, propname, &zl_intsize, + &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, ints); - VERIFY(zap_lookup(os, object, propname, zl_intsize, - zl_ints, value) == 0); + VERIFY0(zap_lookup(os, object, propname, zl_intsize, + zl_ints, value)); for (i = 0; i < ints; i++) { ASSERT3U(value[i], ==, last_txg + object + i); @@ -5091,14 +5364,15 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) goto out; if (last_txg > txg) - fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); + fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", + last_txg, txg); for (i = 0; i < ints; i++) value[i] = txg + object + i; - VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), + VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx)); - VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), + VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), ints, value, tx)); 
dmu_tx_commit(tx); @@ -5107,8 +5381,8 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) * Remove a random pair of entries. */ prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); - (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + (void) sprintf(propname, "prop_%"PRIu64"", prop); + (void) sprintf(txgname, "txg_%"PRIu64"", prop); error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); @@ -5122,8 +5396,8 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; - VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); - VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); + VERIFY0(zap_remove(os, object, txgname, tx)); + VERIFY0(zap_remove(os, object, propname, tx)); dmu_tx_commit(tx); out: umem_free(od, sizeof (ztest_od_t)); @@ -5137,8 +5411,7 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; - uint64_t object, txg; - int i; + uint64_t object, txg, value; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); @@ -5153,14 +5426,13 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) * and gets upgraded to a fatzap. Also, since we are adding * 2050 entries we should see ptrtbl growth and leaf-block split. */ - for (i = 0; i < 2050; i++) { + for (value = 0; value < 2050; value++) { char name[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t value = i; dmu_tx_t *tx; int error; - (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", - (u_longlong_t)id, (u_longlong_t)value); + (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", + id, value); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, name); @@ -5225,7 +5497,7 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) count = -1ULL; VERIFY0(zap_count(os, object, &count)); - ASSERT(count != -1ULL); + ASSERT3S(count, !=, -1ULL); /* * Select an operation: length, lookup, add, update, remove. 
@@ -5264,8 +5536,8 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) if (error == 0) { if (data == string_value && bcmp(name, data, namelen) != 0) - fatal(0, "name '%s' != val '%s' len %d", - name, data, namelen); + fatal(B_FALSE, "name '%s' != val '%s' len %d", + name, (char *)data, namelen); } else { ASSERT3U(error, ==, ENOENT); } @@ -5277,7 +5549,7 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) break; case 3: - VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); break; case 4: @@ -5311,15 +5583,16 @@ ztest_commit_callback(void *arg, int error) ztest_cb_data_t *data = arg; uint64_t synced_txg; - VERIFY(data != NULL); + VERIFY3P(data, !=, NULL); VERIFY3S(data->zcd_expected_err, ==, error); VERIFY(!data->zcd_called); synced_txg = spa_last_synced_txg(data->zcd_spa); if (data->zcd_txg > synced_txg) - fatal(0, "commit callback of txg %" PRIu64 " called prematurely" - ", last synced txg = %" PRIu64 "\n", data->zcd_txg, - synced_txg); + fatal(B_FALSE, + "commit callback of txg %"PRIu64" called prematurely, " + "last synced txg = %"PRIu64"\n", + data->zcd_txg, synced_txg); data->zcd_called = B_TRUE; @@ -5437,11 +5710,12 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) /* * Read existing data to make sure there isn't a future leak. 
*/ - VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t), + VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), &old_txg, DMU_READ_PREFETCH)); if (old_txg > txg) - fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, + fatal(B_FALSE, + "future leak: got %"PRIu64", open txg is %"PRIu64"", old_txg, txg); dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); @@ -5463,8 +5737,10 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { - fatal(0, "Commit callback threshold exceeded, oldest txg: %" - PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); + fatal(B_FALSE, + "Commit callback threshold exceeded, " + "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", + tmp_cb->zcd_txg, txg); } /* @@ -5574,9 +5850,6 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) (void) pthread_rwlock_rdlock(&ztest_name_lock); - (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, - ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); - (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); VERIFY0(spa_prop_get(ztest_spa, &props)); @@ -5584,7 +5857,7 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); - nvlist_free(props); + fnvlist_free(props); (void) pthread_rwlock_unlock(&ztest_name_lock); } @@ -5625,12 +5898,11 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) dmu_objset_name(os, osname); - (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", - (u_longlong_t)id); + (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); - (void) snprintf(clonename, sizeof (clonename), - "%s/ch1_%llu", osname, (u_longlong_t)id); - (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); + (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", + osname, id); + 
(void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); /* * Clean up from any previous run. @@ -5655,7 +5927,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) ztest_record_enospc("dmu_objset_snapshot"); goto out; } - fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); } error = dmu_objset_clone(clonename, fullname); @@ -5664,22 +5936,22 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) ztest_record_enospc("dmu_objset_clone"); goto out; } - fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); + fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { - fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", + fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = dsl_destroy_head(clonename); if (error) - fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); + fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); error = dmu_objset_hold(fullname, FTAG, &origin); if (error != ENOENT) - fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); + fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); /* * Create snapshot, add temporary hold, verify that we can't @@ -5692,7 +5964,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) ztest_record_enospc("dmu_objset_snapshot"); goto out; } - fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); } holds = fnvlist_alloc(); @@ -5704,25 +5976,26 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) ztest_record_enospc("dsl_dataset_user_hold"); goto out; } else if (error) { - fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", + fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", fullname, tag, error); } error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { - fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", + fatal(B_FALSE, 
"dsl_destroy_snapshot(%s, B_FALSE) = %d", fullname, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { - fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", + fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = user_release_one(fullname, tag); if (error) - fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); + fatal(B_FALSE, "user_release_one(%s, %s) = %d", + fullname, tag, error); VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); @@ -5772,11 +6045,11 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); - ASSERT(leaves >= 1); + ASSERT3U(leaves, >=, 1); /* * While ztest is running the number of leaves will not change. This @@ -5838,7 +6111,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) vd0->vdev_resilver_txg != 0)) { /* * Make vd0 explicitly claim to be unreadable, - * or unwriteable, or reach behind its back + * or unwritable, or reach behind its back * and close the underlying fd. 
We can do this if * maxfaults == 0 because we'll fail and reexecute, * and we can do it if maxfaults >= 2 because we'll @@ -5852,8 +6125,8 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) (long long)vd0->vdev_id, (int)maxfaults); if (vf != NULL && ztest_random(3) == 0) { - (void) close(vf->vf_vnode->v_fd); - vf->vf_vnode->v_fd = -1; + (void) close(vf->vf_file->f_fd); + vf->vf_file->f_fd = -1; } else if (ztest_random(2) == 0) { vd0->vdev_cant_read = B_TRUE; } else { @@ -5905,7 +6178,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) if (islog) (void) pthread_rwlock_wrlock(&ztest_name_lock); - VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); + VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); if (islog) (void) pthread_rwlock_unlock(&ztest_name_lock); @@ -5945,24 +6218,26 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * on two different leaf devices, because ZFS can not * tolerate that (if maxfaults==1). * - * We divide each leaf into chunks of size - * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk - * there is a series of ranges to which we can inject errors. - * Each range can accept errors on only a single leaf vdev. - * The error injection ranges are separated by ranges - * which we will not inject errors on any device (DMZs). - * Each DMZ must be large enough such that a single block - * can not straddle it, so that a single block can not be - * a target in two different injection ranges (on different - * leaf vdevs). + * To achieve this we divide each leaf device into + * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). + * Each chunk is further divided into error-injection + * ranges (can accept errors) and clear ranges (we do + * not inject errors in those). Each error-injection + * range can accept errors only for a single leaf vdev. + * Error-injection ranges are separated by clear ranges. 
* * For example, with 3 leaves, each chunk looks like: * 0 to 32M: injection range for leaf 0 - * 32M to 64M: DMZ - no injection allowed + * 32M to 64M: clear range - no injection allowed * 64M to 96M: injection range for leaf 1 - * 96M to 128M: DMZ - no injection allowed + * 96M to 128M: clear range - no injection allowed * 128M to 160M: injection range for leaf 2 - * 160M to 192M: DMZ - no injection allowed + * 160M to 192M: clear range - no injection allowed + * + * Each clear range must be large enough such that a + * single block cannot straddle it. This way a block + * can't be a target in two different injection ranges + * (on different leaf vdevs). */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + @@ -6001,14 +6276,15 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) - fatal(1, "can't inject bad word at 0x%llx in %s", + fatal(B_TRUE, + "can't inject bad word at 0x%"PRIx64" in %s", offset, pathrand); mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," - " offset 0x%llx\n", pathrand, (u_longlong_t)offset); + " offset 0x%"PRIx64"\n", pathrand, offset); } (void) close(fd); @@ -6017,140 +6293,10 @@ out: umem_free(pathrand, MAXPATHLEN); } -/* - * Verify that DDT repair works as expected. 
- */ -void -ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - objset_t *os = zd->zd_os; - ztest_od_t *od; - uint64_t object, blocksize, txg, pattern; - enum zio_checksum checksum = spa_dedup_checksum(spa); - dmu_buf_t *db; - dmu_tx_t *tx; - - od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); - - if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { - umem_free(od, sizeof (ztest_od_t)); - return; - } - - /* - * Take the name lock as writer to prevent anyone else from changing - * the pool and dataset properties we need to maintain during this test. - */ - (void) pthread_rwlock_wrlock(&ztest_name_lock); - - if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, - B_FALSE) != 0 || - ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, - B_FALSE) != 0) { - (void) pthread_rwlock_unlock(&ztest_name_lock); - umem_free(od, sizeof (ztest_od_t)); - return; - } - - dmu_objset_stats_t dds; - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - dmu_objset_fast_stat(os, &dds); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - - object = od[0].od_object; - blocksize = od[0].od_blocksize; - pattern = zs->zs_guid ^ dds.dds_guid; - - /* - * The numbers of copies written must always be greater than or - * equal to the threshold set by the dedupditto property. This - * is initialized in ztest_run() and then randomly changed by - * ztest_spa_prop_get_set(), these function will never set it - * larger than 2 * ZIO_DEDUPDITTO_MIN. - */ - int copies = 2 * ZIO_DEDUPDITTO_MIN; - - /* - * The block size is limited by DMU_MAX_ACCESS (64MB) which - * caps the maximum transaction size. 
A block size of up to - * SPA_OLD_MAXBLOCKSIZE is allowed which results in a maximum - * transaction size of: 128K * 200 (copies) = ~25MB - * - * The actual block size is checked here, rather than requested - * above, because the way ztest_od_init() is implemented it does - * not guarantee the block size requested will be used. - */ - if (blocksize > SPA_OLD_MAXBLOCKSIZE) { - (void) pthread_rwlock_unlock(&ztest_name_lock); - umem_free(od, sizeof (ztest_od_t)); - return; - } - - ASSERT(object != 0); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, object, 0, copies * blocksize); - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - if (txg == 0) { - (void) pthread_rwlock_unlock(&ztest_name_lock); - umem_free(od, sizeof (ztest_od_t)); - return; - } - - /* - * Write all the copies of our block. - */ - for (int i = 0; i < copies; i++) { - uint64_t offset = i * blocksize; - int error = dmu_buf_hold(os, object, offset, FTAG, &db, - DMU_READ_NO_PREFETCH); - if (error != 0) { - fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", - os, (long long)object, (long long) offset, error); - } - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == blocksize); - ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || - ztest_pattern_match(db->db_data, db->db_size, 0ULL)); - dmu_buf_will_fill(db, tx); - ztest_pattern_set(db->db_data, db->db_size, pattern); - dmu_buf_rele(db, FTAG); - } - - dmu_tx_commit(tx); - txg_wait_synced(spa_get_dsl(spa), txg); - - /* - * Find out what block we got. - */ - VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); - blkptr_t blk = *((dmu_buf_impl_t *)db)->db_blkptr; - dmu_buf_rele(db, FTAG); - - /* - * Damage the block. Dedup-ditto will save us when we read it later. 
- */ - uint64_t psize = BP_GET_PSIZE(&blk); - abd_t *abd = abd_alloc_linear(psize, B_TRUE); - ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); - - (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, - abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - - abd_free(abd); - - (void) pthread_rwlock_unlock(&ztest_name_lock); - umem_free(od, sizeof (ztest_od_t)); -} - /* * By design ztest will never inject uncorrectable damage in to the pool. * Issue a scrub, wait for it to complete, and verify there is never any - * any persistent damage. + * persistent damage. * * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). @@ -6226,8 +6372,8 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) return; if (ztest_opts.zo_verbose >= 4) { - (void) printf("Changed guid old %llu -> %llu\n", - (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); + (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", + orig, spa_guid(spa)); } VERIFY3U(orig, !=, spa_guid(spa)); @@ -6379,6 +6525,75 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) } } +static int +ztest_set_global_vars(void) +{ + for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { + char *kv = ztest_opts.zo_gvars[i]; + VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); + VERIFY3U(strlen(kv), >, 0); + int err = set_global_var(kv); + if (ztest_opts.zo_verbose > 0) { + (void) printf("setting global var %s ... %s\n", kv, + err ? 
"failed" : "ok"); + } + if (err != 0) { + (void) fprintf(stderr, + "failed to set global var '%s'\n", kv); + return (err); + } + } + return (0); +} + +static char ** +ztest_global_vars_to_zdb_args(void) +{ + char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); + char **cur = args; + for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { + char *kv = ztest_opts.zo_gvars[i]; + *cur = "-o"; + cur++; + *cur = strdup(kv); + cur++; + } + ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); + *cur = NULL; + return (args); +} + +/* The end of strings is indicated by a NULL element */ +static char * +join_strings(char **strings, const char *sep) +{ + size_t totallen = 0; + for (char **sp = strings; *sp != NULL; sp++) { + totallen += strlen(*sp); + totallen += strlen(sep); + } + if (totallen > 0) { + ASSERT(totallen >= strlen(sep)); + totallen -= strlen(sep); + } + + size_t buflen = totallen + 1; + char *o = malloc(buflen); /* trailing 0 byte */ + o[0] = '\0'; + for (char **sp = strings; *sp != NULL; sp++) { + size_t would; + would = strlcat(o, *sp, buflen); + VERIFY3U(would, <, buflen); + if (*(sp+1) == NULL) { + break; + } + would = strlcat(o, sep, buflen); + VERIFY3U(would, <, buflen); + } + ASSERT3S(strlen(o), ==, totallen); + return (o); +} + static int ztest_check_path(char *path) { @@ -6399,12 +6614,12 @@ ztest_get_zdb_bin(char *bin, int len) strlcpy(bin, zdb_path, len); /* In env */ if (!ztest_check_path(bin)) { ztest_dump_core = 0; - fatal(1, "invalid ZDB_PATH '%s'", bin); + fatal(B_TRUE, "invalid ZDB_PATH '%s'", bin); } return; } - VERIFY(realpath(getexecname(), bin) != NULL); + VERIFY3P(realpath(getexecname(), bin), !=, NULL); if (strstr(bin, "/ztest/")) { strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */ strcat(bin, "/zdb/zdb"); @@ -6434,7 +6649,7 @@ ztest_random_concrete_vdev_leaf(vdev_t *vd) eligible[eligible_idx++] = cvd; } } - VERIFY(eligible_idx > 0); + VERIFY3S(eligible_idx, >, 0); uint64_t child_no = ztest_random(eligible_idx); return 
(ztest_random_concrete_vdev_leaf(eligible[child_no])); @@ -6468,7 +6683,7 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id) char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_initialize_thread != NULL; - zfs_dbgmsg("vd %px, guid %llu", rand_vd, guid); + zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); @@ -6540,7 +6755,7 @@ ztest_trim(ztest_ds_t *zd, uint64_t id) char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_trim_thread != NULL; - zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); + zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); @@ -6607,13 +6822,21 @@ ztest_run_zdb(char *pool) ztest_get_zdb_bin(bin, len); - (void) sprintf(zdb, - "%s -bcc%s%s -G -d -Y -U %s %s", + char **set_gvars_args = ztest_global_vars_to_zdb_args(); + char *set_gvars_args_joined = join_strings(set_gvars_args, " "); + free(set_gvars_args); + + size_t would = snprintf(zdb, len, + "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", - spa_config_path, + set_gvars_args_joined, + ztest_opts.zo_dir, pool); + ASSERT3U(would, <, len); + + free(set_gvars_args_joined); if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); @@ -6631,9 +6854,10 @@ ztest_run_zdb(char *pool) ztest_dump_core = 0; if (WIFEXITED(status)) - fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); + fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); else - fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); + fatal(B_FALSE, "'%s' died with signal %d", + zdb, WTERMSIG(status)); out: umem_free(bin, len); umem_free(zdb, len); @@ -6676,7 +6900,7 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Get the pool's configuration and guid. 
*/ - VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); + VERIFY0(spa_open(oldname, &spa, FTAG)); /* * Kick off a scrub to tickle scrub/export races. @@ -6692,7 +6916,7 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Export it. */ - VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); + VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); @@ -6700,8 +6924,8 @@ ztest_spa_import_export(char *oldname, char *newname) * Try to import it. */ newconfig = spa_tryimport(config); - ASSERT(newconfig != NULL); - nvlist_free(newconfig); + ASSERT3P(newconfig, !=, NULL); + fnvlist_free(newconfig); /* * Import it under the new name. @@ -6733,11 +6957,11 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Verify that we can open and close the pool using the new name. */ - VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); - ASSERT(pool_guid == spa_guid(spa)); + VERIFY0(spa_open(newname, &spa, FTAG)); + ASSERT3U(pool_guid, ==, spa_guid(spa)); spa_close(spa, FTAG); - nvlist_free(config); + fnvlist_free(config); } static void @@ -6804,7 +7028,8 @@ ztest_deadman_thread(void *arg) * I/Os then it will end up aborting the tests. 
*/ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { - fatal(0, "aborting test after %llu seconds because " + fatal(B_FALSE, + "aborting test after %lu seconds because " "pool has transitioned to a suspended state.", zfs_deadman_synctime_ms / 1000); } @@ -6817,7 +7042,8 @@ ztest_deadman_thread(void *arg) */ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); if (gethrtime() > overdue) { - fatal(0, "aborting test after %llu seconds because " + fatal(B_FALSE, + "aborting test after %llu seconds because " "the process is overdue for termination.", (gethrtime() - zs->zs_proc_start) / NANOSEC); } @@ -6941,7 +7167,7 @@ ztest_dataset_dirobj_verify(ztest_ds_t *zd) * That's because zap_count() returns the open-context value, * while dmu_objset_space() returns the rootbp fill count. */ - VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); + VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); ASSERT3U(dirobjs + 1, ==, usedobjs); } @@ -6978,7 +7204,8 @@ ztest_dataset_open(int d) if (zilog->zl_header->zh_claim_lr_seq != 0 && zilog->zl_header->zh_claim_lr_seq < committed_seq) - fatal(0, "missing log records: claimed %llu < committed %llu", + fatal(B_FALSE, "missing log records: " + "claimed %"PRIu64" < committed %"PRIu64"", zilog->zl_header->zh_claim_lr_seq, committed_seq); ztest_dataset_dirobj_verify(zd); @@ -6988,17 +7215,19 @@ ztest_dataset_open(int d) ztest_dataset_dirobj_verify(zd); if (ztest_opts.zo_verbose >= 6) - (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + (void) printf("%s replay %"PRIu64" blocks, " + "%"PRIu64" records, seq %"PRIu64"\n", zd->zd_name, - (u_longlong_t)zilog->zl_parse_blk_count, - (u_longlong_t)zilog->zl_parse_lr_count, - (u_longlong_t)zilog->zl_replaying_seq); + zilog->zl_parse_blk_count, + zilog->zl_parse_lr_count, + zilog->zl_replaying_seq); zilog = zil_open(os, ztest_get_data); if (zilog->zl_replaying_seq != 0 && 
zilog->zl_replaying_seq < committed_seq) - fatal(0, "missing log records: replayed %llu < committed %llu", + fatal(B_FALSE, "missing log records: " + "replayed %"PRIu64" < committed %"PRIu64"", zilog->zl_replaying_seq, committed_seq); return (0); @@ -7035,11 +7264,12 @@ ztest_replay_zil_cb(const char *name, void *arg) ztest_opts.zo_verbose >= 6) { zilog_t *zilog = dmu_objset_zil(os); - (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + (void) printf("%s replay %"PRIu64" blocks, " + "%"PRIu64" records, seq %"PRIu64"\n", name, - (u_longlong_t)zilog->zl_parse_blk_count, - (u_longlong_t)zilog->zl_parse_lr_count, - (u_longlong_t)zilog->zl_replaying_seq); + zilog->zl_parse_blk_count, + zilog->zl_parse_lr_count, + zilog->zl_replaying_seq); } umem_free(zdtmp, sizeof (ztest_ds_t)); @@ -7048,6 +7278,150 @@ ztest_replay_zil_cb(const char *name, void *arg) return (0); } +static void +ztest_freeze(void) +{ + ztest_ds_t *zd = &ztest_ds[0]; + spa_t *spa; + int numloops = 0; + + if (ztest_opts.zo_verbose >= 3) + (void) printf("testing spa_freeze()...\n"); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + VERIFY0(ztest_dataset_open(0)); + ztest_spa = spa; + + /* + * Force the first log block to be transactionally allocated. + * We have to do this before we freeze the pool -- otherwise + * the log chain won't be anchored. + */ + while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { + ztest_dmu_object_alloc_free(zd, 0); + zil_commit(zd->zd_zilog, 0); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Freeze the pool. This stops spa_sync() from doing anything, + * so that the only way to record changes from now on is the ZIL. + */ + spa_freeze(spa); + + /* + * Because it is hard to predict how much space a write will actually + * require beforehand, we leave ourselves some fudge space to write over + * capacity. 
+ */ + uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; + + /* + * Run tests that generate log records but don't alter the pool config + * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). + * We do a txg_wait_synced() after each iteration to force the txg + * to increase well beyond the last synced value in the uberblock. + * The ZIL should be OK with that. + * + * Run a random number of times less than zo_maxloops and ensure we do + * not run out of space on the pool. + */ + while (ztest_random(10) != 0 && + numloops++ < ztest_opts.zo_maxloops && + metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { + ztest_od_t od; + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); + VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); + ztest_io(zd, od.od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + txg_wait_synced(spa_get_dsl(spa), 0); + } + + /* + * Commit all of the changes we just generated. + */ + zil_commit(zd->zd_zilog, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Close our dataset and close the pool. + */ + ztest_dataset_close(0); + spa_close(spa, FTAG); + kernel_fini(); + + /* + * Open and close the pool and dataset to induce log replay. 
+ */ + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); + VERIFY0(ztest_dataset_open(0)); + ztest_spa = spa; + txg_wait_synced(spa_get_dsl(spa), 0); + ztest_dataset_close(0); + ztest_reguid(NULL, 0); + + spa_close(spa, FTAG); + kernel_fini(); +} + +static void +ztest_import_impl(ztest_shared_t *zs) +{ + importargs_t args = { 0 }; + nvlist_t *cfg = NULL; + int nsearch = 1; + char *searchdirs[nsearch]; + int flags = ZFS_IMPORT_MISSING_LOG; + + searchdirs[0] = ztest_opts.zo_dir; + args.paths = nsearch; + args.path = searchdirs; + args.can_be_active = B_FALSE; + + VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, + &libzpool_config_ops)); + VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); + fnvlist_free(cfg); +} + +/* + * Import a storage pool with the given name. + */ +static void +ztest_import(ztest_shared_t *zs) +{ + spa_t *spa; + + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + + ztest_import_impl(zs); + + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + spa_close(spa, FTAG); + + kernel_fini(); + + if (!ztest_opts.zo_mmp_test) { + ztest_run_zdb(ztest_opts.zo_pool); + ztest_freeze(); + ztest_run_zdb(ztest_opts.zo_pool); + } + + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + /* * Kick off threads to run tests on all datasets in parallel. */ @@ -7087,13 +7461,24 @@ ztest_run(ztest_shared_t *zs) offsetof(ztest_cb_data_t, zcd_node)); /* - * Open our pool. + * Open our pool. It may need to be imported first depending on + * what tests were running when the previous pass was terminated. 
*/ - kernel_init(FREAD | FWRITE); - VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + error = spa_open(ztest_opts.zo_pool, &spa, FTAG); + if (error) { + VERIFY3S(error, ==, ENOENT); + ztest_import_impl(zs); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + } + metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; + VERIFY0(vdev_raidz_impl_set("cycle")); + dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); @@ -7103,8 +7488,6 @@ ztest_run(ztest_shared_t *zs) zs->zs_guid = dds.dds_guid; dmu_objset_disown(os, B_TRUE, FTAG); - spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; - /* * Create a thread to periodically resume suspended I/O. */ @@ -7269,96 +7652,6 @@ ztest_run(ztest_shared_t *zs) } static void -ztest_freeze(void) -{ - ztest_ds_t *zd = &ztest_ds[0]; - spa_t *spa; - int numloops = 0; - - if (ztest_opts.zo_verbose >= 3) - (void) printf("testing spa_freeze()...\n"); - - kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); - VERIFY3U(0, ==, ztest_dataset_open(0)); - ztest_spa = spa; - - /* - * Force the first log block to be transactionally allocated. - * We have to do this before we freeze the pool -- otherwise - * the log chain won't be anchored. - */ - while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { - ztest_dmu_object_alloc_free(zd, 0); - zil_commit(zd->zd_zilog, 0); - } - - txg_wait_synced(spa_get_dsl(spa), 0); - - /* - * Freeze the pool. This stops spa_sync() from doing anything, - * so that the only way to record changes from now on is the ZIL. - */ - spa_freeze(spa); - - /* - * Because it is hard to predict how much space a write will actually - * require beforehand, we leave ourselves some fudge space to write over - * capacity. 
- */ - uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; - - /* - * Run tests that generate log records but don't alter the pool config - * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). - * We do a txg_wait_synced() after each iteration to force the txg - * to increase well beyond the last synced value in the uberblock. - * The ZIL should be OK with that. - * - * Run a random number of times less than zo_maxloops and ensure we do - * not run out of space on the pool. - */ - while (ztest_random(10) != 0 && - numloops++ < ztest_opts.zo_maxloops && - metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { - ztest_od_t od; - ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); - VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); - ztest_io(zd, od.od_object, - ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); - txg_wait_synced(spa_get_dsl(spa), 0); - } - - /* - * Commit all of the changes we just generated. - */ - zil_commit(zd->zd_zilog, 0); - txg_wait_synced(spa_get_dsl(spa), 0); - - /* - * Close our dataset and close the pool. - */ - ztest_dataset_close(0); - spa_close(spa, FTAG); - kernel_fini(); - - /* - * Open and close the pool and dataset to induce log replay. 
- */ - kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); - ASSERT(spa_freeze_txg(spa) == UINT64_MAX); - VERIFY3U(0, ==, ztest_dataset_open(0)); - ztest_spa = spa; - txg_wait_synced(spa_get_dsl(spa), 0); - ztest_dataset_close(0); - ztest_reguid(NULL, 0); - - spa_close(spa, FTAG); - kernel_fini(); -} - -void print_time(hrtime_t t, char *timebuf) { hrtime_t s = t / NANOSEC; @@ -7388,67 +7681,17 @@ make_random_props(void) { nvlist_t *props; - VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); + props = fnvlist_alloc(); if (ztest_random(2) == 0) return (props); - VERIFY0(nvlist_add_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1)); + fnvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); return (props); } -/* - * Import a storage pool with the given name. - */ -static void -ztest_import(ztest_shared_t *zs) -{ - importargs_t args = { 0 }; - spa_t *spa; - nvlist_t *cfg = NULL; - int nsearch = 1; - char *searchdirs[nsearch]; - char *name = ztest_opts.zo_pool; - int flags = ZFS_IMPORT_MISSING_LOG; - int error; - - mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); - VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); - - kernel_init(FREAD | FWRITE); - - searchdirs[0] = ztest_opts.zo_dir; - args.paths = nsearch; - args.path = searchdirs; - args.can_be_active = B_FALSE; - - error = zpool_find_config(NULL, name, &cfg, &args, - &libzpool_config_ops); - if (error) - (void) fatal(0, "No pools found\n"); - - VERIFY0(spa_import(name, cfg, NULL, flags)); - VERIFY0(spa_open(name, &spa, FTAG)); - zs->zs_metaslab_sz = - 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; - spa_close(spa, FTAG); - - kernel_fini(); - - if (!ztest_opts.zo_mmp_test) { - ztest_run_zdb(ztest_opts.zo_pool); - ztest_freeze(); - ztest_run_zdb(ztest_opts.zo_pool); - } - - (void) pthread_rwlock_destroy(&ztest_name_lock); - mutex_destroy(&ztest_vdev_lock); - 
mutex_destroy(&ztest_checkpoint_lock); -} - /* * Create a storage pool with the given name and initial vdev size. * Then test spa_freeze() functionality. @@ -7464,7 +7707,7 @@ ztest_init(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); - kernel_init(FREAD | FWRITE); + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* * Create the storage pool. @@ -7474,7 +7717,7 @@ ztest_init(ztest_shared_t *zs) zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_props(); /* @@ -7482,23 +7725,35 @@ ztest_init(ztest_shared_t *zs) * in which case ztest_fault_inject() temporarily takes away * the only valid replica. */ - VERIFY0(nvlist_add_uint64(props, + fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), - MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT)); + MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); for (i = 0; i < SPA_FEATURES; i++) { char *buf; + + if (!spa_feature_table[i].fi_zfs_mod_supported) + continue; + + /* + * 75% chance of using the log space map feature. We want ztest + * to exercise both the code paths that use the log space map + * feature and the ones that don't. 
+ */ + if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) + continue; + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); - VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); + fnvlist_add_uint64(props, buf, 0); free(buf); } VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); - nvlist_free(nvroot); - nvlist_free(props); + fnvlist_free(nvroot); + fnvlist_free(props); - VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); @@ -7548,9 +7803,9 @@ setup_hdr(void) hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); - ASSERT(hdr != MAP_FAILED); + ASSERT3P(hdr, !=, MAP_FAILED); - VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); + VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); @@ -7561,7 +7816,7 @@ setup_hdr(void) hdr->zh_ds_count = ztest_opts.zo_datasets; size = shared_data_size(hdr); - VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); + VERIFY0(ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } @@ -7575,14 +7830,14 @@ setup_data(void) hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ, MAP_SHARED, ztest_fd_data, 0); - ASSERT(hdr != MAP_FAILED); + ASSERT3P(hdr, !=, MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); - ASSERT(hdr != MAP_FAILED); + ASSERT3P(hdr, !=, MAP_FAILED); buf = (uint8_t *)hdr; offset = hdr->zh_hdr_size; @@ -7611,7 +7866,7 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, 
int *statusp) } if (pid == -1) - fatal(1, "fork failed"); + fatal(B_TRUE, "fork failed"); if (pid == 0) { /* child */ char *emptyargv[2] = { cmd, NULL }; @@ -7621,12 +7876,13 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) (void) setrlimit(RLIMIT_NOFILE, &rl); (void) close(ztest_fd_rand); - VERIFY(11 >= snprintf(fd_data_str, 12, "%d", ztest_fd_data)); - VERIFY(0 == setenv("ZTEST_FD_DATA", fd_data_str, 1)); + VERIFY3S(11, >=, + snprintf(fd_data_str, 12, "%d", ztest_fd_data)); + VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); (void) enable_extended_FILE_stdio(-1, -1); if (libpath != NULL) - VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); + VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); (void) execv(cmd, emptyargv); ztest_dump_core = B_FALSE; fatal(B_TRUE, "exec failed: %s", cmd); @@ -7659,7 +7915,6 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) } else { (void) fprintf(stderr, "something strange happened to child\n"); exit(4); - /* NOTREACHED */ } } @@ -7710,7 +7965,7 @@ main(int argc, char **argv) char numbuf[NN_NUMBUF_SZ]; char *cmd; boolean_t hasalt; - int f; + int f, err; char *fd_data_str = getenv("ZTEST_FD_DATA"); struct sigaction action; @@ -7777,9 +8032,18 @@ main(int argc, char **argv) } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); + err = ztest_set_global_vars(); + if (err != 0 && !fd_data_str) { + /* error message done by ztest_set_global_vars */ + exit(EXIT_FAILURE); + } else { + /* children should not be spawned if setting gvars fails */ + VERIFY3S(err, ==, 0); + } + /* Override location of zpool.cache */ - VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache", - ztest_opts.zo_dir) != -1); + VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", + ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); @@ -7800,12 +8064,14 @@ main(int argc, char **argv) hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 
if (ztest_opts.zo_verbose >= 1) { - (void) printf("%llu vdevs, %d datasets, %d threads," - " %llu seconds...\n", - (u_longlong_t)ztest_opts.zo_vdevs, + (void) printf("%"PRIu64" vdevs, %d datasets, %d threads," + "%d %s disks, %"PRIu64" seconds...\n\n", + ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, - (u_longlong_t)ztest_opts.zo_time); + ztest_opts.zo_raid_children, + ztest_opts.zo_raid_type, + ztest_opts.zo_time); } cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); @@ -7887,11 +8153,11 @@ main(int argc, char **argv) print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf, sizeof (numbuf)); - (void) printf("Pass %3d, %8s, %3llu ENOSPC, " + (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", iters, WIFEXITED(status) ? "Complete" : "SIGKILL", - (u_longlong_t)zs->zs_enospc_count, + zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / @@ -7908,8 +8174,8 @@ main(int argc, char **argv) zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); print_time(zc->zc_time, timebuf); - (void) printf("%7llu %9s %s\n", - (u_longlong_t)zc->zc_count, timebuf, + (void) printf("%7"PRIu64" %9s %s\n", + zc->zc_count, timebuf, zi->zi_funcname); } (void) printf("\n"); diff --git a/cmd/zvol_id/Makefile.am b/cmd/zvol_id/Makefile.am index d131c6386f..bb7e31a059 100644 --- a/cmd/zvol_id/Makefile.am +++ b/cmd/zvol_id/Makefile.am @@ -1,14 +1,12 @@ include $(top_srcdir)/config/Rules.am # Disable GCC stack protection for zvol_id. This is a kludge and should be -# removed once https://github.com/zfsonlinux/zfs/issues/569 is resolved. +# removed once https://github.com/openzfs/zfs/issues/569 is resolved. 
AM_CFLAGS += -fno-stack-protector -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - udev_PROGRAMS = zvol_id zvol_id_SOURCES = \ zvol_id_main.c + +include $(top_srcdir)/config/CppCheck.am diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c index 4a2d74cc20..22f2e848cb 100644 --- a/cmd/zvol_id/zvol_id_main.c +++ b/cmd/zvol_id/zvol_id_main.c @@ -38,40 +38,39 @@ static int ioctl_get_msg(char *var, int fd) { - int error = 0; + int ret; char msg[ZFS_MAX_DATASET_NAME_LEN]; - error = ioctl(fd, BLKZNAME, msg); - if (error < 0) { - return (error); + ret = ioctl(fd, BLKZNAME, msg); + if (ret < 0) { + return (ret); } snprintf(var, ZFS_MAX_DATASET_NAME_LEN, "%s", msg); - return (error); + return (ret); } int main(int argc, char **argv) { - int fd, error = 0; + int fd = -1, ret = 0, status = EXIT_FAILURE; char zvol_name[ZFS_MAX_DATASET_NAME_LEN]; char *zvol_name_part = NULL; char *dev_name; struct stat64 statbuf; int dev_minor, dev_part; int i; - int rc; if (argc < 2) { - printf("Usage: %s /dev/zvol_device_node\n", argv[0]); - return (EINVAL); + fprintf(stderr, "Usage: %s /dev/zvol_device_node\n", argv[0]); + goto fail; } dev_name = argv[1]; - error = stat64(dev_name, &statbuf); - if (error != 0) { - printf("Unable to access device file: %s\n", dev_name); - return (errno); + ret = stat64(dev_name, &statbuf); + if (ret != 0) { + fprintf(stderr, "Unable to access device file: %s\n", dev_name); + goto fail; } dev_minor = minor(statbuf.st_rdev); @@ -79,23 +78,23 @@ main(int argc, char **argv) fd = open(dev_name, O_RDONLY); if (fd < 0) { - printf("Unable to open device file: %s\n", dev_name); - return (errno); + fprintf(stderr, "Unable to open device file: %s\n", dev_name); + goto fail; } - error = ioctl_get_msg(zvol_name, fd); - if (error < 0) { - printf("ioctl_get_msg failed:%s\n", strerror(errno)); - return (errno); + ret = ioctl_get_msg(zvol_name, fd); + if (ret < 0) { + fprintf(stderr, "ioctl_get_msg failed: %s\n", 
strerror(errno)); + goto fail; } if (dev_part > 0) - rc = asprintf(&zvol_name_part, "%s-part%d", zvol_name, + ret = asprintf(&zvol_name_part, "%s-part%d", zvol_name, dev_part); else - rc = asprintf(&zvol_name_part, "%s", zvol_name); + ret = asprintf(&zvol_name_part, "%s", zvol_name); - if (rc == -1 || zvol_name_part == NULL) - goto error; + if (ret == -1 || zvol_name_part == NULL) + goto fail; for (i = 0; i < strlen(zvol_name_part); i++) { if (isblank(zvol_name_part[i])) @@ -103,8 +102,13 @@ main(int argc, char **argv) } printf("%s\n", zvol_name_part); - free(zvol_name_part); -error: - close(fd); - return (error); + status = EXIT_SUCCESS; + +fail: + if (zvol_name_part) + free(zvol_name_part); + if (fd >= 0) + close(fd); + + return (status); } diff --git a/cmd/zvol_wait/Makefile.am b/cmd/zvol_wait/Makefile.am new file mode 100644 index 0000000000..2e5bf33233 --- /dev/null +++ b/cmd/zvol_wait/Makefile.am @@ -0,0 +1,3 @@ +include $(top_srcdir)/config/Shellcheck.am + +dist_bin_SCRIPTS = zvol_wait diff --git a/cmd/zvol_wait/zvol_wait b/cmd/zvol_wait/zvol_wait new file mode 100755 index 0000000000..2aa929b0ca --- /dev/null +++ b/cmd/zvol_wait/zvol_wait @@ -0,0 +1,113 @@ +#!/bin/sh + +count_zvols() { + if [ -z "$zvols" ]; then + echo 0 + else + echo "$zvols" | wc -l + fi +} + +filter_out_zvols_with_links() { + echo "$zvols" | tr ' ' '+' | while read -r zvol; do + if ! [ -L "/dev/zvol/$zvol" ]; then + echo "$zvol" + fi + done | tr '+' ' ' +} + +filter_out_deleted_zvols() { + OIFS="$IFS" + IFS=" +" + # shellcheck disable=SC2086 + zfs list -H -o name $zvols 2>/dev/null + IFS="$OIFS" +} + +list_zvols() { + read -r default_volmode < /sys/module/zfs/parameters/zvol_volmode + zfs list -t volume -H -o \ + name,volmode,receive_resume_token,redact_snaps | + while IFS=" " read -r name volmode token redacted; do # IFS=\t here! + + # /dev links are not created for zvols with volmode = "none" + # or for redacted zvols. 
+ [ "$volmode" = "none" ] && continue + [ "$volmode" = "default" ] && [ "$default_volmode" = "3" ] && + continue + [ "$redacted" = "-" ] || continue + + # We also ignore partially received zvols if it is + # not an incremental receive, as those won't even have a block + # device minor node created yet. + if [ "$token" != "-" ]; then + + # Incremental receives create an invisible clone that + # is not automatically displayed by zfs list. + if ! zfs list "$name/%recv" >/dev/null 2>&1; then + continue + fi + fi + echo "$name" + done +} + +zvols=$(list_zvols) +zvols_count=$(count_zvols) +if [ "$zvols_count" -eq 0 ]; then + echo "No zvols found, nothing to do." + exit 0 +fi + +echo "Testing $zvols_count zvol links" + +outer_loop=0 +while [ "$outer_loop" -lt 20 ]; do + outer_loop=$((outer_loop + 1)) + + old_zvols_count=$(count_zvols) + + inner_loop=0 + while [ "$inner_loop" -lt 30 ]; do + inner_loop=$((inner_loop + 1)) + + zvols="$(filter_out_zvols_with_links)" + + zvols_count=$(count_zvols) + if [ "$zvols_count" -eq 0 ]; then + echo "All zvol links are now present." + exit 0 + fi + sleep 1 + done + + echo "Still waiting on $zvols_count zvol links ..." + # + # Although zvols should normally not be deleted at boot time, + # if that is the case then their links will be missing and + # we would stall. + # + if [ "$old_zvols_count" -eq "$zvols_count" ]; then + echo "No progress since last loop." + echo "Checking if any zvols were deleted." + + zvols=$(filter_out_deleted_zvols) + zvols_count=$(count_zvols) + + if [ "$old_zvols_count" -ne "$zvols_count" ]; then + echo "$((old_zvols_count - zvols_count)) zvol(s) deleted." + fi + + if [ "$zvols_count" -ne 0 ]; then + echo "Remaining zvols:" + echo "$zvols" + else + echo "All zvol links are now present." 
+ exit 0 + fi + fi +done + +echo "Timed out waiting on zvol links" +exit 1 diff --git a/config/Abigail.am b/config/Abigail.am new file mode 100644 index 0000000000..94687b90ee --- /dev/null +++ b/config/Abigail.am @@ -0,0 +1,33 @@ +# +# When performing an ABI check the following options are applied: +# +# --no-unreferenced-symbols: Exclude symbols which are not referenced by +# any debug information. Without this _init() and _fini() are incorrectly +# reported on CentOS7 for libuutil.so. +# +# --headers-dir1: Limit ABI checks to public OpenZFS headers, otherwise +# changes in public system headers are also reported. +# +# --suppressions: Honor a suppressions file for each library to provide +# a mechanism for suppressing harmless warnings. +# + +PHONY += checkabi storeabi + +checkabi: + for lib in $(lib_LTLIBRARIES) ; do \ + abidiff --no-unreferenced-symbols \ + --headers-dir1 ../../include \ + --suppressions $${lib%.la}.suppr \ + $${lib%.la}.abi .libs/$${lib%.la}.so ; \ + done + +storeabi: + cd .libs ; \ + for lib in $(lib_LTLIBRARIES) ; do \ + abidw --no-show-locs \ + --no-corpus-path \ + --no-comp-dir-path \ + --type-id-style hash \ + $${lib%.la}.so > ../$${lib%.la}.abi ; \ + done diff --git a/config/CppCheck.am b/config/CppCheck.am new file mode 100644 index 0000000000..e53013bd01 --- /dev/null +++ b/config/CppCheck.am @@ -0,0 +1,11 @@ +# +# Default rules for running cppcheck against the user space components. +# + +PHONY += cppcheck + +CPPCHECKFLAGS = --std=c99 --quiet --max-configs=1 --error-exitcode=2 +CPPCHECKFLAGS += --inline-suppr -U_KERNEL + +cppcheck: + $(CPPCHECK) -j$(CPU_COUNT) $(CPPCHECKFLAGS) $(DEFAULT_INCLUDES) $(SOURCES) diff --git a/config/Rules.am b/config/Rules.am index 1e569d3419..20779ba492 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -3,28 +3,66 @@ # should include these rules and override or extend them as needed. 
# -DEFAULT_INCLUDES = -include ${top_builddir}/zfs_config.h +PHONY = +DEFAULT_INCLUDES = \ + -include $(top_builddir)/zfs_config.h \ + -I$(top_builddir)/include \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/module/icp/include \ + -I$(top_srcdir)/lib/libspl/include + +if BUILD_LINUX +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/lib/libspl/include/os/linux +endif + +if BUILD_FREEBSD +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/lib/libspl/include/os/freebsd +endif AM_LIBTOOLFLAGS = --silent -AM_CFLAGS = -std=gnu99 -Wall -Wstrict-prototypes -fno-strict-aliasing +AM_CFLAGS = -std=gnu99 -Wall -Wstrict-prototypes -Wmissing-prototypes +AM_CFLAGS += -fno-strict-aliasing AM_CFLAGS += $(NO_OMIT_FRAME_POINTER) +AM_CFLAGS += $(IMPLICIT_FALLTHROUGH) AM_CFLAGS += $(DEBUG_CFLAGS) AM_CFLAGS += $(ASAN_CFLAGS) -AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) +AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) $(NO_FORMAT_ZERO_LENGTH) +if BUILD_FREEBSD +AM_CFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion +AM_CFLAGS += -include $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h +AM_CFLAGS += -I/usr/include -I/usr/local/include +endif AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE -AM_CPPFLAGS += -DHAVE_LARGE_STACKS=1 -AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-linux-user\" AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" +AM_CPPFLAGS += -DPKGDATADIR=\"$(pkgdatadir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) +if BUILD_LINUX +AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-linux-user\" +endif +if BUILD_FREEBSD +AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-freebsd-user\" +endif +AM_CPPFLAGS += -D"strtok(...)=strtok(__VA_ARGS__) __attribute__((deprecated(\"Use strtok_r(3) instead!\")))" +AM_CPPFLAGS += -D"__xpg_basename(...)=__xpg_basename(__VA_ARGS__) 
__attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" +AM_CPPFLAGS += -D"basename(...)=basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" +AM_CPPFLAGS += -D"dirname(...)=dirname(__VA_ARGS__) __attribute__((deprecated(\"dirname(3) is underspecified. Use zfs_dirnamelen() instead!\")))" AM_LDFLAGS = $(DEBUG_LDFLAGS) AM_LDFLAGS += $(ASAN_LDFLAGS) + +if BUILD_FREEBSD +AM_LDFLAGS += -fstack-protector-strong -shared +AM_LDFLAGS += -Wl,-x -Wl,--fatal-warnings -Wl,--warn-shared-textrel +AM_LDFLAGS += -lm +endif diff --git a/config/Shellcheck.am b/config/Shellcheck.am new file mode 100644 index 0000000000..6b805b797d --- /dev/null +++ b/config/Shellcheck.am @@ -0,0 +1,22 @@ +.PHONY: shellcheck +shellcheck: $(SCRIPTS) $(SHELLCHECKSCRIPTS) +if HAVE_SHELLCHECK + [ -z "$(SCRIPTS)$(SHELLCHECKSCRIPTS)" ] && exit; shellcheck $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") --exclude=SC1090,SC1091$(SHELLCHECK_IGNORE) --format=gcc $(SCRIPTS) $(SHELLCHECKSCRIPTS) +else + @[ -z "$(SCRIPTS)$(SHELLCHECKSCRIPTS)" ] && exit; echo "skipping shellcheck of" $(SCRIPTS) $(SHELLCHECKSCRIPTS) "because shellcheck is not installed" +endif + @set -e; for dir in $(SHELLCHECKDIRS); do $(MAKE) -C $$dir shellcheck; done + + +# command -v *is* specified by POSIX and every shell in existence supports it +.PHONY: checkbashisms +checkbashisms: $(SCRIPTS) $(SHELLCHECKSCRIPTS) +if HAVE_CHECKBASHISMS + [ -z "$(SCRIPTS)$(SHELLCHECKSCRIPTS)" ] && exit; ! 
if [ -z "$(SHELLCHECK_SHELL)" ]; then \ + checkbashisms -npx $(SCRIPTS) $(SHELLCHECKSCRIPTS); else \ + for f in $(SCRIPTS) $(SHELLCHECKSCRIPTS); do echo $$f >&3; { echo '#!/bin/$(SHELLCHECK_SHELL)'; cat $$f; } | checkbashisms -npx; done; \ + fi 3>&2 2>&1 | grep -vFe "'command' with option other than -p" -e 'command -v' $(CHECKBASHISMS_IGNORE) >&2 +else + @[ -z "$(SCRIPTS)$(SHELLCHECKSCRIPTS)" ] && exit; echo "skipping checkbashisms of" $(SCRIPTS) $(SHELLCHECKSCRIPTS) "because checkbashisms is not installed" +endif + @set -e; for dir in $(SHELLCHECKDIRS); do $(MAKE) -C $$dir checkbashisms; done diff --git a/config/Substfiles.am b/config/Substfiles.am new file mode 100644 index 0000000000..911903e10e --- /dev/null +++ b/config/Substfiles.am @@ -0,0 +1,36 @@ +subst_sed_cmd = \ + -e 's|@bindir[@]|$(bindir)|g' \ + -e 's|@sbindir[@]|$(sbindir)|g' \ + -e 's|@datadir[@]|$(datadir)|g' \ + -e 's|@sysconfdir[@]|$(sysconfdir)|g' \ + -e 's|@runstatedir[@]|$(runstatedir)|g' \ + -e 's|@initconfdir[@]|$(initconfdir)|g' \ + -e 's|@initdir[@]|$(initdir)|g' \ + -e 's|@mounthelperdir[@]|$(mounthelperdir)|g' \ + -e 's|@systemdgeneratordir[@]|$(systemdgeneratordir)|g' \ + -e 's|@systemdunitdir[@]|$(systemdunitdir)|g' \ + -e 's|@udevdir[@]|$(udevdir)|g' \ + -e 's|@udevruledir[@]|$(udevruledir)|g' \ + -e 's|@zfsexecdir[@]|$(zfsexecdir)|g' \ + -e 's|@PYTHON[@]|$(PYTHON)|g' \ + -e 's|@PYTHON_SHEBANG[@]|$(PYTHON_SHEBANG)|g' \ + -e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \ + -e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' \ + -e 's|@LIBFETCH_DYNAMIC[@]|$(LIBFETCH_DYNAMIC)|g' \ + -e 's|@LIBFETCH_SONAME[@]|$(LIBFETCH_SONAME)|g' + +SUBSTFILES = +CLEANFILES = $(SUBSTFILES) +EXTRA_DIST = $(SUBSTFILES:=.in) + +$(SUBSTFILES):%:%.in Makefile + $(AM_V_GEN)set -e; \ + $(MKDIR_P) $$(dirname $@); \ + $(RM) $@~; \ + $(SED) $(subst_sed_cmd) $< >$@~; \ + if grep -E '@[a-zA-Z0-9_]+@' $@~ >&2; then \ + echo "Undefined substitution" >&2; \ + exit 1; \ + else test $$? 
-eq 1; fi; \ + test -x $< && chmod +x $@~; \ + mv -f $@~ $@ diff --git a/config/always-arch.m4 b/config/always-arch.m4 index c3e6b4a978..25e8c963a4 100644 --- a/config/always-arch.m4 +++ b/config/always-arch.m4 @@ -1,22 +1,41 @@ dnl # -dnl # Set the target arch for libspl atomic implementation and the icp +dnl # Set the target cpu architecture. This allows the +dnl # following syntax to be used in a Makefile.am. +dnl # +dnl # ifeq ($(TARGET_CPU),x86_64) +dnl # ... +dnl # endif +dnl # +dnl # if TARGET_CPU_POWERPC +dnl # ... +dnl # else +dnl # ... +dnl # endif dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [ - AC_MSG_CHECKING(for target asm dir) - TARGET_ARCH=`echo ${target_cpu} | sed -e s/i.86/i386/` - - case $TARGET_ARCH in - i386|x86_64) - TARGET_ASM_DIR=asm-${TARGET_ARCH} + case $target_cpu in + i?86) + TARGET_CPU=i386 ;; - *) - TARGET_ASM_DIR=asm-generic + amd64|x86_64) + TARGET_CPU=x86_64 + ;; + powerpc*) + TARGET_CPU=powerpc + ;; + aarch64*) + TARGET_CPU=aarch64 + ;; + sparc64) + TARGET_CPU=sparc64 ;; esac - AC_SUBST([TARGET_ASM_DIR]) - AM_CONDITIONAL([TARGET_ASM_X86_64], test $TARGET_ASM_DIR = asm-x86_64) - AM_CONDITIONAL([TARGET_ASM_I386], test $TARGET_ASM_DIR = asm-i386) - AM_CONDITIONAL([TARGET_ASM_GENERIC], test $TARGET_ASM_DIR = asm-generic) - AC_MSG_RESULT([$TARGET_ASM_DIR]) + AC_SUBST(TARGET_CPU) + + AM_CONDITIONAL([TARGET_CPU_I386], test $TARGET_CPU = i386) + AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64) + AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc) + AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64) + AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64) ]) diff --git a/config/always-compiler-options.m4 b/config/always-compiler-options.m4 index e187f6ff8f..ce84f7e606 100644 --- a/config/always-compiler-options.m4 +++ b/config/always-compiler-options.m4 @@ -22,7 +22,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_ASAN], [ AS_IF([ test "$enable_asan" = "yes" ], [ AC_MSG_CHECKING([whether $CC supports 
-fsanitize=address]) saved_cflags="$CFLAGS" - CFLAGS="$CFLAGS -fsanitize=address" + CFLAGS="$CFLAGS -Werror -fsanitize=address" AC_LINK_IFELSE([ AC_LANG_SOURCE([[ int main() { return 0; } ]]) ], [ @@ -52,7 +52,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN], [ AC_MSG_CHECKING([whether $CC supports -Wframe-larger-than=]) saved_flags="$CFLAGS" - CFLAGS="$CFLAGS -Wframe-larger-than=4096" + CFLAGS="$CFLAGS -Werror -Wframe-larger-than=4096" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ FRAME_LARGER_THAN="-Wframe-larger-than=4096" @@ -73,7 +73,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION], [ AC_MSG_CHECKING([whether $CC supports -Wno-format-truncation]) saved_flags="$CFLAGS" - CFLAGS="$CFLAGS -Wno-format-truncation" + CFLAGS="$CFLAGS -Werror -Wno-format-truncation" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_FORMAT_TRUNCATION=-Wno-format-truncation @@ -87,6 +87,27 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION], [ AC_SUBST([NO_FORMAT_TRUNCATION]) ]) +dnl # +dnl # Check if gcc supports -Wno-format-zero-length option. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH], [ + AC_MSG_CHECKING([whether $CC supports -Wno-format-zero-length]) + + saved_flags="$CFLAGS" + CFLAGS="$CFLAGS -Werror -Wno-format-zero-length" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ + NO_FORMAT_ZERO_LENGTH=-Wno-format-zero-length + AC_MSG_RESULT([yes]) + ], [ + NO_FORMAT_ZERO_LENGTH= + AC_MSG_RESULT([no]) + ]) + + CFLAGS="$saved_flags" + AC_SUBST([NO_FORMAT_ZERO_LENGTH]) +]) + dnl # dnl # Check if gcc supports -Wno-bool-compare option. 
@@ -100,7 +121,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_BOOL_COMPARE], [ AC_MSG_CHECKING([whether $CC supports -Wno-bool-compare]) saved_flags="$CFLAGS" - CFLAGS="$CFLAGS -Wbool-compare" + CFLAGS="$CFLAGS -Werror -Wbool-compare" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_BOOL_COMPARE=-Wno-bool-compare @@ -126,7 +147,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_UNUSED_BUT_SET_VARIABLE], [ AC_MSG_CHECKING([whether $CC supports -Wno-unused-but-set-variable]) saved_flags="$CFLAGS" - CFLAGS="$CFLAGS -Wunused-but-set-variable" + CFLAGS="$CFLAGS -Werror -Wunused-but-set-variable" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_UNUSED_BUT_SET_VARIABLE=-Wno-unused-but-set-variable @@ -140,6 +161,29 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_UNUSED_BUT_SET_VARIABLE], [ AC_SUBST([NO_UNUSED_BUT_SET_VARIABLE]) ]) +dnl # +dnl # Check if gcc supports -Wimplicit-fallthrough option. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_IMPLICIT_FALLTHROUGH], [ + AC_MSG_CHECKING([whether $CC supports -Wimplicit-fallthrough]) + + saved_flags="$CFLAGS" + CFLAGS="$CFLAGS -Werror -Wimplicit-fallthrough" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ + IMPLICIT_FALLTHROUGH=-Wimplicit-fallthrough + AC_DEFINE([HAVE_IMPLICIT_FALLTHROUGH], 1, + [Define if compiler supports -Wimplicit-fallthrough]) + AC_MSG_RESULT([yes]) + ], [ + IMPLICIT_FALLTHROUGH= + AC_MSG_RESULT([no]) + ]) + + CFLAGS="$saved_flags" + AC_SUBST([IMPLICIT_FALLTHROUGH]) +]) + dnl # dnl # Check if gcc supports -fno-omit-frame-pointer option. 
dnl # @@ -147,7 +191,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER], [ AC_MSG_CHECKING([whether $CC supports -fno-omit-frame-pointer]) saved_flags="$CFLAGS" - CFLAGS="$CFLAGS -fno-omit-frame-pointer" + CFLAGS="$CFLAGS -Werror -fno-omit-frame-pointer" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_OMIT_FRAME_POINTER=-fno-omit-frame-pointer @@ -160,3 +204,24 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER], [ CFLAGS="$saved_flags" AC_SUBST([NO_OMIT_FRAME_POINTER]) ]) + +dnl # +dnl # Check if cc supports -fno-ipa-sra option. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA], [ + AC_MSG_CHECKING([whether $CC supports -fno-ipa-sra]) + + saved_flags="$CFLAGS" + CFLAGS="$CFLAGS -Werror -fno-ipa-sra" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ + NO_IPA_SRA=-fno-ipa-sra + AC_MSG_RESULT([yes]) + ], [ + NO_IPA_SRA= + AC_MSG_RESULT([no]) + ]) + + CFLAGS="$saved_flags" + AC_SUBST([NO_IPA_SRA]) +]) diff --git a/config/always-cppcheck.m4 b/config/always-cppcheck.m4 new file mode 100644 index 0000000000..c7c134a3e8 --- /dev/null +++ b/config/always-cppcheck.m4 @@ -0,0 +1,6 @@ +dnl # +dnl # Check if cppcheck is available. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CPPCHECK], [ + AC_CHECK_PROG([CPPCHECK], [cppcheck], [cppcheck]) +]) diff --git a/config/always-python.m4 b/config/always-python.m4 index 858ab7b015..76b06fcd84 100644 --- a/config/always-python.m4 +++ b/config/always-python.m4 @@ -1,36 +1,3 @@ -dnl # -dnl # ZFS_AC_PYTHON_VERSION(version, [action-if-true], [action-if-false]) -dnl # -dnl # Verify Python version -dnl # -AC_DEFUN([ZFS_AC_PYTHON_VERSION], [ - ver_check=`$PYTHON -c "import sys; print (sys.version.split()[[0]] $1)"` - AS_IF([test "$ver_check" = "True"], [ - m4_ifvaln([$2], [$2]) - ], [ - m4_ifvaln([$3], [$3]) - ]) -]) - -dnl # -dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false]) -dnl # -dnl # Checks for Python module. 
Freely inspired by AX_PYTHON_MODULE -dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html -dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS. -dnl # -AC_DEFUN([ZFS_AC_PYTHON_MODULE], [ - PYTHON_NAME=`basename $PYTHON` - AC_MSG_CHECKING([for $PYTHON_NAME module: $1]) - AS_IF([$PYTHON -c "import $1" 2>/dev/null], [ - AC_MSG_RESULT(yes) - m4_ifvaln([$2], [$2]) - ], [ - AC_MSG_RESULT(no) - m4_ifvaln([$3], [$3]) - ]) -]) - dnl # dnl # The majority of the python scripts are written to be compatible dnl # with Python 2.6 and Python 3.4. Therefore, they may be installed @@ -40,56 +7,57 @@ dnl # set the PYTHON environment variable accordingly. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYTHON], [ AC_ARG_WITH([python], - AC_HELP_STRING([--with-python[=VERSION]], + AS_HELP_STRING([--with-python[=VERSION]], [default system python version @<:@default=check@:>@]), [with_python=$withval], [with_python=check]) AS_CASE([$with_python], - [check], - [AS_IF([test -x /usr/bin/python3], - [PYTHON="python3"], - [AS_IF([test -x /usr/bin/python2], - [PYTHON="python2"], - [PYTHON=""] - )] - )], + [check], [AC_CHECK_PROGS([PYTHON], [python3 python2], [:])], [2*], [PYTHON="python${with_python}"], [*python2*], [PYTHON="${with_python}"], [3*], [PYTHON="python${with_python}"], [*python3*], [PYTHON="${with_python}"], - [no], [PYTHON=""], + [no], [PYTHON=":"], [AC_MSG_ERROR([Unknown --with-python value '$with_python'])] ) - AS_IF([$PYTHON --version >/dev/null 2>&1], [ /bin/true ], [ - AC_MSG_ERROR([Cannot find $PYTHON in your system path]) - ]) - - AM_PATH_PYTHON([2.6], [], [:]) - AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :]) - AM_CONDITIONAL([USING_PYTHON_2], [test "${PYTHON_VERSION:0:2}" = "2."]) - AM_CONDITIONAL([USING_PYTHON_3], [test "${PYTHON_VERSION:0:2}" = "3."]) - dnl # dnl # Minimum supported Python versions for utilities: - dnl # Python 2.6.x, or Python 3.4.x + dnl # Python 2.6 or Python 3.4 dnl # - AS_IF([test "${PYTHON_VERSION:0:2}" = "2."], [ - 
ZFS_AC_PYTHON_VERSION([>= '2.6'], [ /bin/true ], - [AC_MSG_ERROR("Python >= 2.6.x is not available")]) + AM_PATH_PYTHON([], [], [:]) + AS_IF([test -z "$PYTHON_VERSION"], [ + PYTHON_VERSION=$(basename $PYTHON | tr -cd 0-9.) ]) + PYTHON_MINOR=${PYTHON_VERSION#*\.} - AS_IF([test "${PYTHON_VERSION:0:2}" = "3."], [ - ZFS_AC_PYTHON_VERSION([>= '3.4'], [ /bin/true ], - [AC_MSG_ERROR("Python >= 3.4.x is not available")]) - ]) + AS_CASE([$PYTHON_VERSION], + [2.*], [ + AS_IF([test $PYTHON_MINOR -lt 6], + [AC_MSG_ERROR("Python >= 2.6 is required")]) + ], + [3.*], [ + AS_IF([test $PYTHON_MINOR -lt 4], + [AC_MSG_ERROR("Python >= 3.4 is required")]) + ], + [:|2|3], [], + [PYTHON_VERSION=3] + ) + + AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :]) + AM_CONDITIONAL([USING_PYTHON_2], [test "x${PYTHON_VERSION%%\.*}" = x2]) + AM_CONDITIONAL([USING_PYTHON_3], [test "x${PYTHON_VERSION%%\.*}" = x3]) + + AM_COND_IF([USING_PYTHON_2], + [AC_SUBST([PYTHON_SHEBANG], [python2])], + [AC_SUBST([PYTHON_SHEBANG], [python3])]) dnl # dnl # Request that packages be built for a specific Python version. dnl # - AS_IF([test $with_python != check], [ - PYTHON_PKG_VERSION=`echo ${PYTHON} | tr -d 'a-zA-Z.'` + AS_IF([test "x$with_python" != xcheck], [ + PYTHON_PKG_VERSION=$(echo $PYTHON_VERSION | tr -d .) DEFINE_PYTHON_PKG_VERSION='--define "__use_python_pkg_version '${PYTHON_PKG_VERSION}'"' DEFINE_PYTHON_VERSION='--define "__use_python '${PYTHON}'"' ], [ diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4 index d74d6f1a75..fa39fd8851 100644 --- a/config/always-pyzfs.m4 +++ b/config/always-pyzfs.m4 @@ -1,9 +1,28 @@ dnl # -dnl # Determines if pyzfs can be built, requires Python 2.7 or latter. +dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false]) +dnl # +dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE +dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html +dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS. 
+dnl # +AC_DEFUN([ZFS_AC_PYTHON_MODULE], [ + PYTHON_NAME=$(basename $PYTHON) + AC_MSG_CHECKING([for $PYTHON_NAME module: $1]) + AS_IF([$PYTHON -c "import $1" 2>/dev/null], [ + AC_MSG_RESULT(yes) + m4_ifvaln([$2], [$2]) + ], [ + AC_MSG_RESULT(no) + m4_ifvaln([$3], [$3]) + ]) +]) + +dnl # +dnl # Determines if pyzfs can be built, requires Python 2.7 or later. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ AC_ARG_ENABLE([pyzfs], - AC_HELP_STRING([--enable-pyzfs], + AS_HELP_STRING([--enable-pyzfs], [install libzfs_core python bindings @<:@default=check@:>@]), [enable_pyzfs=$enableval], [enable_pyzfs=check]) @@ -18,28 +37,44 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ DEFINE_PYZFS='--without pyzfs' ]) ], [ - DEFINE_PYZFS='' + AS_IF([test "$PYTHON" != :], [ + DEFINE_PYZFS='' + ], [ + enable_pyzfs=no + DEFINE_PYZFS='--without pyzfs' + ]) ]) AC_SUBST(DEFINE_PYZFS) + dnl # + dnl # Python "packaging" (or, failing that, "distlib") module is required to build and install pyzfs + dnl # + AS_IF([test "x$enable_pyzfs" = xcheck -o "x$enable_pyzfs" = xyes], [ + ZFS_AC_PYTHON_MODULE([packaging], [], [ + ZFS_AC_PYTHON_MODULE([distlib], [], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + AC_MSG_ERROR("Python $PYTHON_VERSION packaging and distlib modules are not installed") + ], [test "x$enable_pyzfs" != xno], [ + enable_pyzfs=no + ]) + ]) + ]) + ]) + dnl # dnl # Require python-devel libraries dnl # AS_IF([test "x$enable_pyzfs" = xcheck -o "x$enable_pyzfs" = xyes], [ - AS_IF([test "${PYTHON_VERSION:0:2}" = "2."], [ - PYTHON_REQUIRED_VERSION=">= '2.7.0'" - ], [ - AS_IF([test "${PYTHON_VERSION:0:2}" = "3."], [ - PYTHON_REQUIRED_VERSION=">= '3.4.0'" - ], [ - AC_MSG_ERROR("Python $PYTHON_VERSION unknown") - ]) - ]) + AS_CASE([$PYTHON_VERSION], + [3.*], [PYTHON_REQUIRED_VERSION=">= '3.4.0'"], + [2.*], [PYTHON_REQUIRED_VERSION=">= '2.7.0'"], + [AC_MSG_ERROR("Python $PYTHON_VERSION unknown")] + ) AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [ AS_IF([test "x$enable_pyzfs" = xyes], [ 
AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -52,7 +87,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ ZFS_AC_PYTHON_MODULE([setuptools], [], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_VERSION setuptools is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -65,7 +100,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ ZFS_AC_PYTHON_MODULE([cffi], [], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_VERSION cffi is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -76,7 +111,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ dnl # AS_IF([test "x$enable_pyzfs" = xcheck], [enable_pyzfs=yes]) - AM_CONDITIONAL([PYZFS_ENABLED], [test x$enable_pyzfs = xyes]) + AM_CONDITIONAL([PYZFS_ENABLED], [test "x$enable_pyzfs" = xyes]) AC_SUBST([PYZFS_ENABLED], [$enable_pyzfs]) AC_SUBST(pythonsitedir, [$PYTHON_SITE_PKG]) diff --git a/config/always-sed.m4 b/config/always-sed.m4 new file mode 100644 index 0000000000..3d7ae285ba --- /dev/null +++ b/config/always-sed.m4 @@ -0,0 +1,16 @@ +dnl # +dnl # Set the flags used for sed in-place edits. 
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_SED], [ + AC_REQUIRE([AC_PROG_SED])dnl + AC_CACHE_CHECK([for sed --in-place], [ac_cv_inplace], [ + tmpfile=$(mktemp conftest.XXXXXX) + echo foo >$tmpfile + AS_IF([$SED --in-place 's#foo#bar#' $tmpfile 2>/dev/null], + [ac_cv_inplace="--in-place"], + [$SED -i '' 's#foo#bar#' $tmpfile 2>/dev/null], + [ac_cv_inplace="-i ''"], + [AC_MSG_ERROR([$SED does not support in-place])]) + ]) + AC_SUBST([ac_inplace], [$ac_cv_inplace]) +]) diff --git a/config/always-shellcheck.m4 b/config/always-shellcheck.m4 new file mode 100644 index 0000000000..2a9a099746 --- /dev/null +++ b/config/always-shellcheck.m4 @@ -0,0 +1,10 @@ +dnl # +dnl # Check if shellcheck and/or checkbashisms are available. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_SHELLCHECK], [ + AC_CHECK_PROG([SHELLCHECK], [shellcheck], [yes]) + AC_CHECK_PROG([CHECKBASHISMS], [checkbashisms], [yes]) + + AM_CONDITIONAL([HAVE_SHELLCHECK], [test "x$SHELLCHECK" = "xyes"]) + AM_CONDITIONAL([HAVE_CHECKBASHISMS], [test "x$CHECKBASHISMS" = "xyes"]) +]) diff --git a/config/always-system.m4 b/config/always-system.m4 new file mode 100644 index 0000000000..3225a52af8 --- /dev/null +++ b/config/always-system.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # Set the target system +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_SYSTEM], [ + AC_MSG_CHECKING([for system type ($host_os)]) + case $host_os in + *linux*) + AC_DEFINE([SYSTEM_LINUX], [1], + [True if ZFS is to be compiled for a Linux system]) + ac_system="Linux" + ;; + *freebsd*) + AC_DEFINE([SYSTEM_FREEBSD], [1], + [True if ZFS is to be compiled for a FreeBSD system]) + ac_system="FreeBSD" + ;; + *) + ac_system="unknown" + ;; + esac + AC_MSG_RESULT([$ac_system]) + AC_SUBST([ac_system]) + + AM_CONDITIONAL([BUILD_LINUX], [test "x$ac_system" = "xLinux"]) + AM_CONDITIONAL([BUILD_FREEBSD], [test "x$ac_system" = "xFreeBSD"]) +]) diff --git a/config/ax_code_coverage.m4 b/config/ax_code_coverage.m4 index 4417d4444a..3e3c666f3c 100644 --- a/config/ax_code_coverage.m4 +++ 
b/config/ax_code_coverage.m4 @@ -50,7 +50,7 @@ # CODE_COVERAGE_LIBS is preferred for clarity; CODE_COVERAGE_LDFLAGS is # deprecated. They have the same value. # -# This code was derived from Makefile.decl in GLib, originally licenced +# This code was derived from Makefile.decl in GLib, originally licensed # under LGPLv2.1+. # # LICENSE @@ -142,7 +142,7 @@ AC_DEFUN([AX_CODE_COVERAGE],[ '] [CODE_COVERAGE_RULES_CAPTURE=' $(code_coverage_v_lcov_cap)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --capture --output-file "$(CODE_COVERAGE_OUTPUT_FILE).tmp" --test-name "$(call code_coverage_sanitize,$(PACKAGE_NAME)-$(PACKAGE_VERSION))" --no-checksum --compat-libtool $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_OPTIONS) - $(code_coverage_v_lcov_ign)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --remove "$(CODE_COVERAGE_OUTPUT_FILE).tmp" "/tmp/*" $(CODE_COVERAGE_IGNORE_PATTERN) --output-file "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_RMOPTS) + $(code_coverage_v_lcov_ign)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --remove "$(CODE_COVERAGE_OUTPUT_FILE).tmp" $(CODE_COVERAGE_IGNORE_PATTERN) --output-file "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_RMOPTS) -@rm -f $(CODE_COVERAGE_OUTPUT_FILE).tmp $(code_coverage_v_genhtml)LANG=C $(GENHTML) $(code_coverage_quiet) $(addprefix --prefix ,$(CODE_COVERAGE_DIRECTORY)) --output-directory "$(CODE_COVERAGE_OUTPUT_DIRECTORY)" --title "$(PACKAGE_NAME)-$(PACKAGE_VERSION) Code Coverage" --legend --show-details "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_GENHTML_OPTIONS) @echo "file://$(abs_builddir)/$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html" @@ -219,7 +219,11 @@ CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT ?=\ $(if $(CODE_COVERAGE_BRANCH_COVERAGE),\ --rc genhtml_branch_coverage=$(CODE_COVERAGE_BRANCH_COVERAGE)) CODE_COVERAGE_GENHTML_OPTIONS ?= 
$(CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT) -CODE_COVERAGE_IGNORE_PATTERN ?= + +# Add any folders you want to ignore here +# Ignore tmp and tests themselves +CODE_COVERAGE_IGNORE_PATTERN ?= "/tmp/*" "*/tests/*" +CODE_COVERAGE_IGNORE_PATTERN += "*/module/zstd/lib/*" GITIGNOREFILES ?= GITIGNOREFILES += $(CODE_COVERAGE_OUTPUT_FILE) $(CODE_COVERAGE_OUTPUT_DIRECTORY) diff --git a/config/ax_count_cpus.m4 b/config/ax_count_cpus.m4 new file mode 100644 index 0000000000..5db8925534 --- /dev/null +++ b/config/ax_count_cpus.m4 @@ -0,0 +1,101 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_count_cpus.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_COUNT_CPUS([ACTION-IF-DETECTED],[ACTION-IF-NOT-DETECTED]) +# +# DESCRIPTION +# +# Attempt to count the number of logical processor cores (including +# virtual and HT cores) currently available to use on the machine and +# place detected value in CPU_COUNT variable. +# +# On successful detection, ACTION-IF-DETECTED is executed if present. If +# the detection fails, then ACTION-IF-NOT-DETECTED is triggered. The +# default ACTION-IF-NOT-DETECTED is to set CPU_COUNT to 1. +# +# LICENSE +# +# Copyright (c) 2014,2016 Karlson2k (Evgeny Grin) +# Copyright (c) 2012 Brian Aker +# Copyright (c) 2008 Michael Paul Bailey +# Copyright (c) 2008 Christophe Tournayre +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 22 + + AC_DEFUN([AX_COUNT_CPUS],[dnl + AC_REQUIRE([AC_CANONICAL_HOST])dnl + AC_REQUIRE([AC_PROG_EGREP])dnl + AC_MSG_CHECKING([the number of available CPUs]) + CPU_COUNT="0" + + # Try generic methods + + # 'getconf' is POSIX utility, but '_NPROCESSORS_ONLN' and + # 'NPROCESSORS_ONLN' are platform-specific + command -v getconf >/dev/null 2>&1 && \ + CPU_COUNT=`getconf _NPROCESSORS_ONLN 2>/dev/null || getconf NPROCESSORS_ONLN 2>/dev/null` || CPU_COUNT="0" + AS_IF([[test "$CPU_COUNT" -gt "0" 2>/dev/null || ! command -v nproc >/dev/null 2>&1]],[[: # empty]],[dnl + # 'nproc' is part of GNU Coreutils and is widely available + CPU_COUNT=`OMP_NUM_THREADS='' nproc 2>/dev/null` || CPU_COUNT=`nproc 2>/dev/null` || CPU_COUNT="0" + ])dnl + + AS_IF([[test "$CPU_COUNT" -gt "0" 2>/dev/null]],[[: # empty]],[dnl + # Try platform-specific preferred methods + AS_CASE([[$host_os]],dnl + [[*linux*]],[[CPU_COUNT=`lscpu -p 2>/dev/null | $EGREP -e '^@<:@0-9@:>@+,' -c` || CPU_COUNT="0"]],dnl + [[*darwin*]],[[CPU_COUNT=`sysctl -n hw.logicalcpu 2>/dev/null` || CPU_COUNT="0"]],dnl + [[freebsd*]],[[command -v sysctl >/dev/null 2>&1 && CPU_COUNT=`sysctl -n kern.smp.cpus 2>/dev/null` || CPU_COUNT="0"]],dnl + [[netbsd*]], [[command -v sysctl >/dev/null 2>&1 && CPU_COUNT=`sysctl -n hw.ncpuonline 2>/dev/null` || CPU_COUNT="0"]],dnl + [[solaris*]],[[command -v psrinfo >/dev/null 2>&1 && CPU_COUNT=`psrinfo 2>/dev/null | $EGREP -e '^@<:@0-9@:>@.*on-line' -c 2>/dev/null` || CPU_COUNT="0"]],dnl + [[mingw*]],[[CPU_COUNT=`ls -qpU1 /proc/registry/HKEY_LOCAL_MACHINE/HARDWARE/DESCRIPTION/System/CentralProcessor/ 2>/dev/null | $EGREP -e '^@<:@0-9@:>@+/' -c` || CPU_COUNT="0"]],dnl + [[msys*]],[[CPU_COUNT=`ls -qpU1 /proc/registry/HKEY_LOCAL_MACHINE/HARDWARE/DESCRIPTION/System/CentralProcessor/ 2>/dev/null | $EGREP -e '^@<:@0-9@:>@+/' -c` || CPU_COUNT="0"]],dnl + [[cygwin*]],[[CPU_COUNT=`ls -qpU1 /proc/registry/HKEY_LOCAL_MACHINE/HARDWARE/DESCRIPTION/System/CentralProcessor/ 2>/dev/null | $EGREP -e 
'^@<:@0-9@:>@+/' -c` || CPU_COUNT="0"]]dnl + )dnl + ])dnl + + AS_IF([[test "$CPU_COUNT" -gt "0" 2>/dev/null || ! command -v sysctl >/dev/null 2>&1]],[[: # empty]],[dnl + # Try less preferred generic method + # 'hw.ncpu' exists on many platforms, but not on GNU/Linux + CPU_COUNT=`sysctl -n hw.ncpu 2>/dev/null` || CPU_COUNT="0" + ])dnl + + AS_IF([[test "$CPU_COUNT" -gt "0" 2>/dev/null]],[[: # empty]],[dnl + # Try platform-specific fallback methods + # They can be less accurate and slower than preferred methods + AS_CASE([[$host_os]],dnl + [[*linux*]],[[CPU_COUNT=`$EGREP -e '^processor' -c /proc/cpuinfo 2>/dev/null` || CPU_COUNT="0"]],dnl + [[*darwin*]],[[CPU_COUNT=`system_profiler SPHardwareDataType 2>/dev/null | $EGREP -i -e 'number of cores:'|cut -d : -f 2 -s|tr -d ' '` || CPU_COUNT="0"]],dnl + [[freebsd*]],[[CPU_COUNT=`dmesg 2>/dev/null| $EGREP -e '^cpu@<:@0-9@:>@+: '|sort -u|$EGREP -e '^' -c` || CPU_COUNT="0"]],dnl + [[netbsd*]], [[CPU_COUNT=`command -v cpuctl >/dev/null 2>&1 && cpuctl list 2>/dev/null| $EGREP -e '^@<:@0-9@:>@+ .* online ' -c` || \ + CPU_COUNT=`dmesg 2>/dev/null| $EGREP -e '^cpu@<:@0-9@:>@+ at'|sort -u|$EGREP -e '^' -c` || CPU_COUNT="0"]],dnl + [[solaris*]],[[command -v kstat >/dev/null 2>&1 && CPU_COUNT=`kstat -m cpu_info -s state -p 2>/dev/null | $EGREP -c -e 'on-line'` || \ + CPU_COUNT=`kstat -m cpu_info 2>/dev/null | $EGREP -c -e 'module: cpu_info'` || CPU_COUNT="0"]],dnl + [[mingw*]],[AS_IF([[CPU_COUNT=`reg query 'HKLM\\Hardware\\Description\\System\\CentralProcessor' 2>/dev/null | $EGREP -e '\\\\@<:@0-9@:>@+$' -c`]],dnl + [[: # empty]],[[test "$NUMBER_OF_PROCESSORS" -gt "0" 2>/dev/null && CPU_COUNT="$NUMBER_OF_PROCESSORS"]])],dnl + [[msys*]],[[test "$NUMBER_OF_PROCESSORS" -gt "0" 2>/dev/null && CPU_COUNT="$NUMBER_OF_PROCESSORS"]],dnl + [[cygwin*]],[[test "$NUMBER_OF_PROCESSORS" -gt "0" 2>/dev/null && CPU_COUNT="$NUMBER_OF_PROCESSORS"]]dnl + )dnl + ])dnl + + AS_IF([[test "x$CPU_COUNT" != "x0" && test "$CPU_COUNT" -gt 0 2>/dev/null]],[dnl +
AC_MSG_RESULT([[$CPU_COUNT]]) + m4_ifvaln([$1],[$1],)dnl + ],[dnl + m4_ifval([$2],[dnl + AS_UNSET([[CPU_COUNT]]) + AC_MSG_RESULT([[unable to detect]]) + $2 + ], [dnl + CPU_COUNT="1" + AC_MSG_RESULT([[unable to detect (assuming 1)]]) + ])dnl + ])dnl + ])dnl diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4 index c51b45b7d5..7adcf01a04 100644 --- a/config/ax_python_devel.m4 +++ b/config/ax_python_devel.m4 @@ -97,9 +97,18 @@ AC_DEFUN([AX_PYTHON_DEVEL],[ # Check for a version of Python >= 2.1.0 # AC_MSG_CHECKING([for a version of Python >= '2.1.0']) - ac_supports_python_ver=`$PYTHON -c "import sys; \ - ver = sys.version.split ()[[0]]; \ - print (ver >= '2.1.0')"` + ac_supports_python_ver=`cat<= '3.11.0' + ac_supports_python_ver=`cat<&1` - if test $? -eq 0; then + if ac_distutils_result=`$PYTHON -c "import distutils" 2>&1`; then AC_MSG_RESULT([yes]) else AC_MSG_RESULT([no]) @@ -204,7 +224,7 @@ EOD` ac_python_version=$PYTHON_VERSION else ac_python_version=`$PYTHON -c "import sys; \ - print (sys.version[[:3]])"` + print ('.'.join(sys.version.split('.')[[:2]]))"` fi fi diff --git a/config/config.rpath b/config/config.rpath old mode 100644 new mode 100755 index 7b9da3c6c4..be202c1a9e --- a/config/config.rpath +++ b/config/config.rpath @@ -1 +1,684 @@ -# `make distclean` deletes files with size 0. This text is to avoid that. +#! /bin/sh +# Output a system dependent set of variables, describing how to set the +# run time search path of shared libraries in an executable. +# +# Copyright 1996-2019 Free Software Foundation, Inc. +# Taken from GNU libtool, 2001 +# Originally by Gordon Matzigkeit , 1996 +# +# This file is free software; the Free Software Foundation gives +# unlimited permission to copy and/or distribute it, with or without +# modifications, as long as this notice is preserved. 
+# +# The first argument passed to this file is the canonical host specification, +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# The environment variables CC, GCC, LDFLAGS, LD, with_gnu_ld +# should be set by the caller. +# +# The set of defined variables is at the end of this script. + +# Known limitations: +# - On IRIX 6.5 with CC="cc", the run time search path must not be longer +# than 256 bytes, otherwise the compiler driver will dump core. The only +# known workaround is to choose shorter directory names for the build +# directory and/or the installation directory. + +# All known linkers require a '.a' archive for static linking (except MSVC, +# which needs '.lib'). +libext=a +shrext=.so + +host="$1" +host_cpu=`echo "$host" | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'` +host_vendor=`echo "$host" | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'` +host_os=`echo "$host" | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'` + +# Code taken from libtool.m4's _LT_CC_BASENAME. + +for cc_temp in $CC""; do + case $cc_temp in + compile | *[\\/]compile | ccache | *[\\/]ccache ) ;; + distcc | *[\\/]distcc | purify | *[\\/]purify ) ;; + \-*) ;; + *) break;; + esac +done +cc_basename=`echo "$cc_temp" | sed -e 's%^.*/%%'` + +# Code taken from libtool.m4's _LT_COMPILER_PIC.
+ +wl= +if test "$GCC" = yes; then + wl='-Wl,' +else + case "$host_os" in + aix*) + wl='-Wl,' + ;; + mingw* | cygwin* | pw32* | os2* | cegcc*) + ;; + hpux9* | hpux10* | hpux11*) + wl='-Wl,' + ;; + irix5* | irix6* | nonstopux*) + wl='-Wl,' + ;; + linux* | k*bsd*-gnu | kopensolaris*-gnu) + case $cc_basename in + ecc*) + wl='-Wl,' + ;; + icc* | ifort*) + wl='-Wl,' + ;; + lf95*) + wl='-Wl,' + ;; + nagfor*) + wl='-Wl,-Wl,,' + ;; + pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*) + wl='-Wl,' + ;; + ccc*) + wl='-Wl,' + ;; + xl* | bgxl* | bgf* | mpixl*) + wl='-Wl,' + ;; + como) + wl='-lopt=' + ;; + *) + case `$CC -V 2>&1 | sed 5q` in + *Sun\ F* | *Sun*Fortran*) + wl= + ;; + *Sun\ C*) + wl='-Wl,' + ;; + esac + ;; + esac + ;; + newsos6) + ;; + *nto* | *qnx*) + ;; + osf3* | osf4* | osf5*) + wl='-Wl,' + ;; + rdos*) + ;; + solaris*) + case $cc_basename in + f77* | f90* | f95* | sunf77* | sunf90* | sunf95*) + wl='-Qoption ld ' + ;; + *) + wl='-Wl,' + ;; + esac + ;; + sunos4*) + wl='-Qoption ld ' + ;; + sysv4 | sysv4.2uw2* | sysv4.3*) + wl='-Wl,' + ;; + sysv4*MP*) + ;; + sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*) + wl='-Wl,' + ;; + unicos*) + wl='-Wl,' + ;; + uts4*) + ;; + esac +fi + +# Code taken from libtool.m4's _LT_LINKER_SHLIBS. + +hardcode_libdir_flag_spec= +hardcode_libdir_separator= +hardcode_direct=no +hardcode_minus_L=no + +case "$host_os" in + cygwin* | mingw* | pw32* | cegcc*) + # FIXME: the MSVC++ port hasn't been tested in a loooong time + # When not using gcc, we currently assume that we are using + # Microsoft Visual C++. + if test "$GCC" != yes; then + with_gnu_ld=no + fi + ;; + interix*) + # we just hope/assume this is gcc and not c89 (= MSVC++) + with_gnu_ld=yes + ;; + openbsd*) + with_gnu_ld=no + ;; +esac + +ld_shlibs=yes +if test "$with_gnu_ld" = yes; then + # Set some defaults for GNU ld with shared library support. These + # are reset later if shared libraries are not supported. Putting them + # here allows them to be overridden if necessary. 
+ # Unlike libtool, we use -rpath here, not --rpath, since the documented + # option of GNU ld is called -rpath, not --rpath. + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + case "$host_os" in + aix[3-9]*) + # On AIX/PPC, the GNU linker is very broken + if test "$host_cpu" != ia64; then + ld_shlibs=no + fi + ;; + amigaos*) + case "$host_cpu" in + powerpc) + ;; + m68k) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + ;; + esac + ;; + beos*) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + cygwin* | mingw* | pw32* | cegcc*) + # hardcode_libdir_flag_spec is actually meaningless, as there is + # no search path for DLLs. + hardcode_libdir_flag_spec='-L$libdir' + if $LD --help 2>&1 | grep 'auto-import' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + haiku*) + ;; + interix[3-9]*) + hardcode_direct=no + hardcode_libdir_flag_spec='${wl}-rpath,$libdir' + ;; + gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + netbsd*) + ;; + solaris*) + if $LD -v 2>&1 | grep 'BFD 2\.8' > /dev/null; then + ld_shlibs=no + elif $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX*) + case `$LD -v 2>&1` in + *\ [01].* | *\ 2.[0-9].* | *\ 2.1[0-5].*) + ld_shlibs=no + ;; + *) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + hardcode_libdir_flag_spec='`test -z "$SCOABSPATH" && echo ${wl}-rpath,$libdir`' + else + ld_shlibs=no + fi + ;; + esac + ;; + sunos4*) + hardcode_direct=yes + ;; + *) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + esac + if test "$ld_shlibs" = no; then + hardcode_libdir_flag_spec= + fi +else + case "$host_os" in + aix3*) + # Note: this linker hardcodes the directories in 
LIBPATH if there + # are no directories specified by -L. + hardcode_minus_L=yes + if test "$GCC" = yes; then + # Neither direct hardcoding nor static linking is supported with a + # broken collect2. + hardcode_direct=unsupported + fi + ;; + aix[4-9]*) + if test "$host_cpu" = ia64; then + # On IA64, the linker does run time linking by default, so we don't + # have to do anything special. + aix_use_runtimelinking=no + else + aix_use_runtimelinking=no + # Test if we are trying to use run time linking or normal + # AIX style linking. If -brtl is somewhere in LDFLAGS, we + # need to do runtime linking. + case $host_os in aix4.[23]|aix4.[23].*|aix[5-9]*) + for ld_flag in $LDFLAGS; do + if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl"); then + aix_use_runtimelinking=yes + break + fi + done + ;; + esac + fi + hardcode_direct=yes + hardcode_libdir_separator=':' + if test "$GCC" = yes; then + case $host_os in aix4.[012]|aix4.[012].*) + collect2name=`${CC} -print-prog-name=collect2` + if test -f "$collect2name" && \ + strings "$collect2name" | grep resolve_lib_name >/dev/null + then + # We have reworked collect2 + : + else + # We have old collect2 + hardcode_direct=unsupported + hardcode_minus_L=yes + hardcode_libdir_flag_spec='-L$libdir' + hardcode_libdir_separator= + fi + ;; + esac + fi + # Begin _LT_AC_SYS_LIBPATH_AIX. + echo 'int main () { return 0; }' > conftest.c + ${CC} ${LDFLAGS} conftest.c -o conftest + aix_libpath=`dump -H conftest 2>/dev/null | sed -n -e '/Import File Strings/,/^$/ { /^0/ { s/^0 *\(.*\)$/\1/; p; } +}'` + if test -z "$aix_libpath"; then + aix_libpath=`dump -HX64 conftest 2>/dev/null | sed -n -e '/Import File Strings/,/^$/ { /^0/ { s/^0 *\(.*\)$/\1/; p; } +}'` + fi + if test -z "$aix_libpath"; then + aix_libpath="/usr/lib:/lib" + fi + rm -f conftest.c conftest + # End _LT_AC_SYS_LIBPATH_AIX. 
+ if test "$aix_use_runtimelinking" = yes; then + hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:'"$aix_libpath" + else + if test "$host_cpu" = ia64; then + hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib' + else + hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:'"$aix_libpath" + fi + fi + ;; + amigaos*) + case "$host_cpu" in + powerpc) + ;; + m68k) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + ;; + esac + ;; + bsdi[45]*) + ;; + cygwin* | mingw* | pw32* | cegcc*) + # When not using gcc, we currently assume that we are using + # Microsoft Visual C++. + # hardcode_libdir_flag_spec is actually meaningless, as there is + # no search path for DLLs. + hardcode_libdir_flag_spec=' ' + libext=lib + ;; + darwin* | rhapsody*) + hardcode_direct=no + if { case $cc_basename in ifort*) true;; *) test "$GCC" = yes;; esac; }; then + : + else + ld_shlibs=no + fi + ;; + dgux*) + hardcode_libdir_flag_spec='-L$libdir' + ;; + freebsd2.[01]*) + hardcode_direct=yes + hardcode_minus_L=yes + ;; + freebsd* | dragonfly*) + hardcode_libdir_flag_spec='-R$libdir' + hardcode_direct=yes + ;; + hpux9*) + hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir' + hardcode_libdir_separator=: + hardcode_direct=yes + # hardcode_minus_L: Not really in the search PATH, + # but as the default location of the library. + hardcode_minus_L=yes + ;; + hpux10*) + if test "$with_gnu_ld" = no; then + hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir' + hardcode_libdir_separator=: + hardcode_direct=yes + # hardcode_minus_L: Not really in the search PATH, + # but as the default location of the library. + hardcode_minus_L=yes + fi + ;; + hpux11*) + if test "$with_gnu_ld" = no; then + hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir' + hardcode_libdir_separator=: + case $host_cpu in + hppa*64*|ia64*) + hardcode_direct=no + ;; + *) + hardcode_direct=yes + # hardcode_minus_L: Not really in the search PATH, + # but as the default location of the library. 
+ hardcode_minus_L=yes + ;; + esac + fi + ;; + irix5* | irix6* | nonstopux*) + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + netbsd*) + hardcode_libdir_flag_spec='-R$libdir' + hardcode_direct=yes + ;; + newsos6) + hardcode_direct=yes + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + *nto* | *qnx*) + ;; + openbsd*) + if test -f /usr/libexec/ld.so; then + hardcode_direct=yes + if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then + hardcode_libdir_flag_spec='${wl}-rpath,$libdir' + else + case "$host_os" in + openbsd[01].* | openbsd2.[0-7] | openbsd2.[0-7].*) + hardcode_libdir_flag_spec='-R$libdir' + ;; + *) + hardcode_libdir_flag_spec='${wl}-rpath,$libdir' + ;; + esac + fi + else + ld_shlibs=no + fi + ;; + os2*) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + ;; + osf3*) + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + osf4* | osf5*) + if test "$GCC" = yes; then + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + else + # Both cc and cxx compiler support -rpath directly + hardcode_libdir_flag_spec='-rpath $libdir' + fi + hardcode_libdir_separator=: + ;; + solaris*) + hardcode_libdir_flag_spec='-R$libdir' + ;; + sunos4*) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_direct=yes + hardcode_minus_L=yes + ;; + sysv4) + case $host_vendor in + sni) + hardcode_direct=yes # is this really true??? 
+ ;; + siemens) + hardcode_direct=no + ;; + motorola) + hardcode_direct=no #Motorola manual says yes, but my tests say they lie + ;; + esac + ;; + sysv4.3*) + ;; + sysv4*MP*) + if test -d /usr/nec; then + ld_shlibs=yes + fi + ;; + sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[01].[10]* | unixware7* | sco3.2v5.0.[024]*) + ;; + sysv5* | sco3.2v5* | sco5v6*) + hardcode_libdir_flag_spec='`test -z "$SCOABSPATH" && echo ${wl}-R,$libdir`' + hardcode_libdir_separator=':' + ;; + uts4*) + hardcode_libdir_flag_spec='-L$libdir' + ;; + *) + ld_shlibs=no + ;; + esac +fi + +# Check dynamic linker characteristics +# Code taken from libtool.m4's _LT_SYS_DYNAMIC_LINKER. +# Unlike libtool.m4, here we don't care about _all_ names of the library, but +# only about the one the linker finds when passed -lNAME. This is the last +# element of library_names_spec in libtool.m4, or possibly two of them if the +# linker has special search rules. +library_names_spec= # the last element of library_names_spec in libtool.m4 +libname_spec='lib$name' +case "$host_os" in + aix3*) + library_names_spec='$libname.a' + ;; + aix[4-9]*) + library_names_spec='$libname$shrext' + ;; + amigaos*) + case "$host_cpu" in + powerpc*) + library_names_spec='$libname$shrext' ;; + m68k) + library_names_spec='$libname.a' ;; + esac + ;; + beos*) + library_names_spec='$libname$shrext' + ;; + bsdi[45]*) + library_names_spec='$libname$shrext' + ;; + cygwin* | mingw* | pw32* | cegcc*) + shrext=.dll + library_names_spec='$libname.dll.a $libname.lib' + ;; + darwin* | rhapsody*) + shrext=.dylib + library_names_spec='$libname$shrext' + ;; + dgux*) + library_names_spec='$libname$shrext' + ;; + freebsd[23].*) + library_names_spec='$libname$shrext$versuffix' + ;; + freebsd* | dragonfly*) + library_names_spec='$libname$shrext' + ;; + gnu*) + library_names_spec='$libname$shrext' + ;; + haiku*) + library_names_spec='$libname$shrext' + ;; + hpux9* | hpux10* | hpux11*) + case $host_cpu in + ia64*) + shrext=.so + ;; + hppa*64*) + 
shrext=.sl + ;; + *) + shrext=.sl + ;; + esac + library_names_spec='$libname$shrext' + ;; + interix[3-9]*) + library_names_spec='$libname$shrext' + ;; + irix5* | irix6* | nonstopux*) + library_names_spec='$libname$shrext' + case "$host_os" in + irix5* | nonstopux*) + libsuff= shlibsuff= + ;; + *) + case $LD in + *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ") libsuff= shlibsuff= ;; + *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ") libsuff=32 shlibsuff=N32 ;; + *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ") libsuff=64 shlibsuff=64 ;; + *) libsuff= shlibsuff= ;; + esac + ;; + esac + ;; + linux*oldld* | linux*aout* | linux*coff*) + ;; + linux* | k*bsd*-gnu | kopensolaris*-gnu) + library_names_spec='$libname$shrext' + ;; + knetbsd*-gnu) + library_names_spec='$libname$shrext' + ;; + netbsd*) + library_names_spec='$libname$shrext' + ;; + newsos6) + library_names_spec='$libname$shrext' + ;; + *nto* | *qnx*) + library_names_spec='$libname$shrext' + ;; + openbsd*) + library_names_spec='$libname$shrext$versuffix' + ;; + os2*) + libname_spec='$name' + shrext=.dll + library_names_spec='$libname.a' + ;; + osf3* | osf4* | osf5*) + library_names_spec='$libname$shrext' + ;; + rdos*) + ;; + solaris*) + library_names_spec='$libname$shrext' + ;; + sunos4*) + library_names_spec='$libname$shrext$versuffix' + ;; + sysv4 | sysv4.3*) + library_names_spec='$libname$shrext' + ;; + sysv4*MP*) + library_names_spec='$libname$shrext' + ;; + sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*) + library_names_spec='$libname$shrext' + ;; + tpf*) + library_names_spec='$libname$shrext' + ;; + uts4*) + library_names_spec='$libname$shrext' + ;; +esac + +sed_quote_subst='s/\(["`$\\]\)/\\\1/g' +escaped_wl=`echo "X$wl" | sed -e 's/^X//' -e "$sed_quote_subst"` +shlibext=`echo "$shrext" | sed -e 's,^\.,,'` +escaped_libname_spec=`echo "X$libname_spec" | sed -e 's/^X//' -e "$sed_quote_subst"` +escaped_library_names_spec=`echo "X$library_names_spec" | sed -e 's/^X//' -e "$sed_quote_subst"` 
+escaped_hardcode_libdir_flag_spec=`echo "X$hardcode_libdir_flag_spec" | sed -e 's/^X//' -e "$sed_quote_subst"` + +LC_ALL=C sed -e 's/^\([a-zA-Z0-9_]*\)=/acl_cv_\1=/' < $${path_prepend}/dh_shlibdeps; \ echo "`which dh_shlibdeps` -- \ - -xlibuutil1linux -xlibnvpair1linux -xlibzfs2linux -xlibzpool2linux" \ + -xlibuutil3linux -xlibnvpair3linux -xlibzfs5linux -xlibzpool5linux" \ >> $${path_prepend}/dh_shlibdeps; \ ## These -x arguments are passed to dpkg-shlibdeps, which exclude the ## Debianized packages from the auto-generated dependencies of the new debs, @@ -63,7 +77,7 @@ deb-utils: deb-local rpm-utils env PATH=$${path_prepend}:$${PATH} \ fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch \ $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \ - $$pkg8 $$pkg9 $$pkg10; \ + $$pkg8 $$pkg9 $$pkg10 || exit 1; \ $(RM) $${path_prepend}/dh_shlibdeps; \ rmdir $${path_prepend}; \ $(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \ diff --git a/config/find_system_library.m4 b/config/find_system_library.m4 index 9d22bcfab5..310b44112a 100644 --- a/config/find_system_library.m4 +++ b/config/find_system_library.m4 @@ -4,70 +4,95 @@ dnl requires pkg.m4 from pkg-config dnl requires ax_save_flags.m4 from autoconf-archive dnl requires ax_restore_flags.m4 from autoconf-archive -dnl FIND_SYSTEM_LIBRARY(VARIABLE-PREFIX, MODULE, HEADER, HEADER-PREFIXES, LIBRARY, FUNCTIONS, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +dnl ZFS_AC_FIND_SYSTEM_LIBRARY(VARIABLE-PREFIX, MODULE, HEADER, HEADER-PREFIXES, LIBRARY, FUNCTIONS, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -AC_DEFUN([FIND_SYSTEM_LIBRARY], [ +AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [ AC_REQUIRE([PKG_PROG_PKG_CONFIG]) + _header_found= _library_found= + _pc_found= - PKG_CHECK_MODULES([$1], [$2], [_library_found=1], [ - AS_IF([test -f /usr/include/[$3]], [ - AC_SUBST([$1][_CFLAGS], []) - AC_SUBST([$1][_LIBS], ["-l[$5]]") - _library_found=1 - ],[ AS_IF([test -f /usr/local/include/[$3]], [ - AC_SUBST([$1][_CFLAGS], 
["-I/usr/local/include"]) - AC_SUBST([$1][_LIBS], ["-L/usr/local -l[$5]]") - _library_found=1 - ],[dnl ELSE - m4_foreach([prefix], [$4], [ - AS_IF([test "x$_library_found" != "x1"], [ - AS_IF([test -f [/usr/include/]prefix[/][$3]], [ - AC_SUBST([$1][_CFLAGS], ["[-I/usr/include/]prefix["]]) - AC_SUBST([$1][_LIBS], ["-l[$5]]") - _library_found=1 - ],[ AS_IF([test -f [/usr/local/include/]prefix[/][$3]], [ - AC_SUBST([$1][_CFLAGS], ["[-I/usr/local/include/]prefix["]]) - AC_SUBST([$1][_LIBS], ["-L/usr/local -l[$5]"]) - _library_found=1 - ])]) - ]) - ]) - ])]) + AS_IF([test -n "$2"], [PKG_CHECK_MODULES([$1], [$2], [ + _header_found=1 + _library_found=1 + _pc_found=1 + ], [:])]) - AS_IF([test -z "$_library_found"], [ - AC_MSG_WARN([cannot find [$2] via pkg-config or in the standard locations]) - ]) + # set _header_found/_library_found if the user passed in CFLAGS/LIBS + AS_IF([test "x$[$1][_CFLAGS]" != x], [_header_found=1]) + AS_IF([test "x$[$1][_LIBS]" != x], [_library_found=1]) + + AX_SAVE_FLAGS + + orig_CFLAGS="$CFLAGS" + + for _prefixdir in /usr /usr/local + do + AS_VAR_PUSHDEF([header_cache], [ac_cv_header_$3]) + AS_IF([test "x$_prefixdir" != "x/usr"], [ + [$1][_CFLAGS]="-I$lt_sysroot$_prefixdir/include" + AS_IF([test "x$_library_found" = x], [ + [$1][_LIBS]="-L$lt_sysroot$_prefixdir/lib" + ]) + ]) + CFLAGS="$orig_CFLAGS $[$1][_CFLAGS]" + AS_UNSET([header_cache]) + AC_CHECK_HEADER([$3], [ + _header_found=1 + break + ], [AS_IF([test "x$_header_found" = "x1"], [ + # if pkg-config or the user set CFLAGS, fail if the header is unusable + AC_MSG_FAILURE([header [$3] for library [$5] is not usable]) + ])], [AC_INCLUDES_DEFAULT]) + # search for header under HEADER-PREFIXES + m4_foreach_w([prefix], [$4], [ + [$1][_CFLAGS]=["-I$lt_sysroot$_prefixdir/include/]prefix["] + CFLAGS="$orig_CFLAGS $[$1][_CFLAGS]" + AS_UNSET([header_cache]) + AC_CHECK_HEADER([$3], [ + _header_found=1 + break + ], [], [AC_INCLUDES_DEFAULT]) + ]) + AS_VAR_POPDEF([header_cache]) + done + + AS_IF([test 
"x$_header_found" = "x1"], [ + AS_IF([test "x$_library_found" = x], [ + [$1][_LIBS]="$[$1]_LIBS -l[$5]" + ]) + LDFLAGS="$LDFLAGS $[$1][_LIBS]" + + _libcheck=1 + m4_ifval([$6], + [m4_foreach_w([func], [$6], [AC_CHECK_LIB([$5], func, [:], [_libcheck=])])], + [AC_CHECK_LIB([$5], [main], [:], [_libcheck=])]) + + AS_IF([test "x$_libcheck" = "x1"], [_library_found=1], + [test "x$_library_found" = "x1"], [ + # if pkg-config or the user set LIBS, fail if the library is unusable + AC_MSG_FAILURE([library [$5] is not usable]) + ]) + ], [test "x$_library_found" = "x1"], [ + # if the user set LIBS, fail if we didn't find the header + AC_MSG_FAILURE([cannot find header [$3] for library [$5]]) ]) - dnl do some further sanity checks + AX_RESTORE_FLAGS - AS_IF([test -n "$_library_found"], [ - AX_SAVE_FLAGS - - CPPFLAGS="$CPPFLAGS $(echo $[$1][_CFLAGS] | sed 's/-include */-include-/g; s/^/ /; s/ [^-][^ ]*//g; s/ -[^Ii][^ ]*//g; s/-include-/-include /g; s/^ //;')" - CFLAGS="$CFLAGS $[$1][_CFLAGS]" - LDFLAGS="$LDFLAGS $[$1][_LIBS]" - - AC_CHECK_HEADER([$3], [], [ - AC_MSG_WARN([header [$3] for library [$2] is not usable]) - _library_found= - ]) - - m4_foreach([func], [$6], [ - AC_CHECK_LIB([$5], func, [], [ - AC_MSG_WARN([cannot find ]func[ in library [$5]]) - _library_found= - ]) - ]) - - AX_RESTORE_FLAGS - ]) - - AS_IF([test -n "$_library_found"], [ - :;$7 + AS_IF([test "x$_header_found" = "x1" && test "x$_library_found" = "x1"], [ + AC_SUBST([$1]_CFLAGS) + AC_SUBST([$1]_LIBS) + AS_IF([test "x$_pc_found" = "x1"], [ + AC_SUBST([$1]_PC, [$2]) + ]) + AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]]) + $7 ],[dnl ELSE - :;$8 + AC_SUBST([$1]_CFLAGS, []) + AC_SUBST([$1]_LIBS, []) + AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations]) + $8 ]) ]) diff --git a/config/iconv.m4 b/config/iconv.m4 index a285e9daa5..99b339a9f8 100644 --- a/config/iconv.m4 +++ b/config/iconv.m4 @@ -29,9 +29,9 @@ AC_DEFUN([AM_ICONV_LINK], AC_REQUIRE([AM_ICONV_LINKFLAGS_BODY]) dnl Add 
$INCICONV to CPPFLAGS before performing the following checks, - dnl because if the user has installed libiconv and not disabled its use - dnl via --without-libiconv-prefix, he wants to use it. The first - dnl AC_LINK_IFELSE will then fail, the second AC_LINK_IFELSE will succeed. + dnl so that if libiconv is installed, it will be used (unless disabled + dnl via --without-libiconv-prefix). The first AC_LINK_IFELSE will + dnl then fail, the second AC_LINK_IFELSE will succeed. am_save_CPPFLAGS="$CPPFLAGS" AC_LIB_APPENDTOVAR([CPPFLAGS], [$INCICONV]) @@ -269,8 +269,7 @@ size_t iconv(); [am_cv_proto_iconv_arg1="const"]) am_cv_proto_iconv="extern size_t iconv (iconv_t cd, $am_cv_proto_iconv_arg1 char * *inbuf, size_t *inbytesleft, char * *outbuf, size_t *outbytesleft);"]) am_cv_proto_iconv=`echo "[$]am_cv_proto_iconv" | tr -s ' ' | sed -e 's/( /(/'` - AC_MSG_RESULT([ - $am_cv_proto_iconv]) + AC_MSG_RESULT([$am_cv_proto_iconv]) else dnl When compiling GNU libiconv on a system that does not have iconv yet, dnl pick the POSIX compliant declaration without 'const'. 
diff --git a/config/kernel-access-ok-type.m4 b/config/kernel-access-ok-type.m4 index 3b2878a55c..dc94334587 100644 --- a/config/kernel-access-ok-type.m4 +++ b/config/kernel-access-ok-type.m4 @@ -4,17 +4,23 @@ dnl # dnl # - access_ok(type, addr, size) dnl # + access_ok(addr, size) dnl # -AC_DEFUN([ZFS_AC_KERNEL_ACCESS_OK_TYPE], [ - AC_MSG_CHECKING([whether access_ok() has 'type' parameter]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE], [ + ZFS_LINUX_TEST_SRC([access_ok_type], [ #include ],[ - const void __user __attribute__((unused)) *addr = (void *) 0xdeadbeef; + const void __user __attribute__((unused)) *addr = + (void *) 0xdeadbeef; unsigned long __attribute__((unused)) size = 1; int error __attribute__((unused)) = access_ok(0, addr, size); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_ACCESS_OK_TYPE], [ + AC_MSG_CHECKING([whether access_ok() has 'type' parameter]) + ZFS_LINUX_TEST_RESULT([access_ok_type], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_ACCESS_OK_TYPE, 1, [kernel has access_ok with 'type' parameter]) + AC_DEFINE(HAVE_ACCESS_OK_TYPE, 1, + [kernel has access_ok with 'type' parameter]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-acl-refcount.m4 b/config/kernel-acl-refcount.m4 deleted file mode 100644 index 43e3c442dc..0000000000 --- a/config/kernel-acl-refcount.m4 +++ /dev/null @@ -1,20 +0,0 @@ -dnl # -dnl # 4.16 kernel: check if struct posix_acl acl.a_refcount is a refcount_t. -dnl # It's an atomic_t on older kernels. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_ACL_HAS_REFCOUNT], [ - AC_MSG_CHECKING([whether posix_acl has refcount_t]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - #include - ],[ - struct posix_acl acl; - refcount_t *r __attribute__ ((unused)) = &acl.a_refcount; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_ACL_REFCOUNT, 1, [posix_acl has refcount_t]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-acl.m4 b/config/kernel-acl.m4 index 02cc020e5c..a155b59d00 100644 --- a/config/kernel-acl.m4 +++ b/config/kernel-acl.m4 @@ -3,32 +3,26 @@ dnl # Check if posix_acl_release can be used from a ZFS_META_LICENSED dnl # module. The is_owner_or_cap macro was replaced by dnl # inode_owner_or_capable dnl # -AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_RELEASE], [ - AC_MSG_CHECKING([whether posix_acl_release() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_RELEASE], [ + ZFS_LINUX_TEST_SRC([posix_acl_release], [ #include #include #include - ],[ - struct posix_acl* tmp = posix_acl_alloc(1, 0); + ], [ + struct posix_acl *tmp = posix_acl_alloc(1, 0); posix_acl_release(tmp); - ],[ + ], [], [ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_RELEASE], [ + AC_MSG_CHECKING([whether posix_acl_release() is available]) + ZFS_LINUX_TEST_RESULT([posix_acl_release], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_POSIX_ACL_RELEASE, 1, [posix_acl_release() is available]) AC_MSG_CHECKING([whether posix_acl_release() is GPL-only]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - #include - #include - - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ - struct posix_acl* tmp = posix_acl_alloc(1, 0); - posix_acl_release(tmp); - ],[ + ZFS_LINUX_TEST_RESULT([posix_acl_release_license], [ AC_MSG_RESULT(no) ],[ AC_MSG_RESULT(yes) @@ -46,24 +40,25 @@ dnl # set_cached_acl() and forget_cached_acl() changed from inline to dnl # EXPORT_SYMBOL. In the former case, they may not be usable because of dnl # posix_acl_release. In the latter case, we can always use them. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE], [ - AC_MSG_CHECKING([whether set_cached_acl() is usable]) - ZFS_LINUX_TRY_COMPILE([ - #include +AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_CACHED_ACL_USABLE], [ + ZFS_LINUX_TEST_SRC([set_cached_acl], [ #include #include #include - - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ + ], [ struct inode *ip = NULL; struct posix_acl *acl = posix_acl_alloc(1, 0); set_cached_acl(ip, ACL_TYPE_ACCESS, acl); forget_cached_acl(ip, ACL_TYPE_ACCESS); - ],[ + ], [], [ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE], [ + AC_MSG_CHECKING([whether set_cached_acl() is usable]) + ZFS_LINUX_TEST_RESULT([set_cached_acl_license], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_SET_CACHED_ACL_USABLE, 1, - [posix_acl_release() is usable]) + [set_cached_acl() is usable]) ],[ AC_MSG_RESULT(no) ]) @@ -71,58 +66,67 @@ AC_DEFUN([ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE], [ dnl # dnl # 3.1 API change, -dnl # posix_acl_chmod_masq() is not exported anymore and posix_acl_chmod() -dnl # was introduced to replace it. +dnl # posix_acl_chmod() was added as the preferred interface. 
dnl # dnl # 3.14 API change, -dnl # posix_acl_chmod() is changed to __posix_acl_chmod() +dnl # posix_acl_chmod() was changed to __posix_acl_chmod() dnl # -AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_CHMOD], [ - AC_MSG_CHECKING([whether posix_acl_chmod exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_CHMOD], [ + ZFS_LINUX_TEST_SRC([posix_acl_chmod], [ #include #include ],[ posix_acl_chmod(NULL, 0, 0) - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_POSIX_ACL_CHMOD, 1, [posix_acl_chmod() exists]) - ],[ - AC_MSG_RESULT(no) ]) - AC_MSG_CHECKING([whether __posix_acl_chmod exists]) - ZFS_LINUX_TRY_COMPILE([ + ZFS_LINUX_TEST_SRC([__posix_acl_chmod], [ #include #include ],[ __posix_acl_chmod(NULL, 0, 0) - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_CHMOD], [ + AC_MSG_CHECKING([whether __posix_acl_chmod exists]) + ZFS_LINUX_TEST_RESULT([__posix_acl_chmod], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE___POSIX_ACL_CHMOD, 1, [__posix_acl_chmod() exists]) + AC_DEFINE(HAVE___POSIX_ACL_CHMOD, 1, + [__posix_acl_chmod() exists]) ],[ AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether posix_acl_chmod exists]) + ZFS_LINUX_TEST_RESULT([posix_acl_chmod], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_POSIX_ACL_CHMOD, 1, + [posix_acl_chmod() exists]) + ],[ + ZFS_LINUX_TEST_ERROR([posix_acl_chmod()]) + ]) ]) ]) dnl # dnl # 3.1 API change, -dnl # posix_acl_equiv_mode now wants an umode_t* instead of a mode_t* +dnl # posix_acl_equiv_mode now wants an umode_t instead of a mode_t dnl # -AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T], [ - AC_MSG_CHECKING([whether posix_acl_equiv_mode() wants umode_t]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T], [ + ZFS_LINUX_TEST_SRC([posix_acl_equiv_mode], [ #include #include ],[ umode_t tmp; - posix_acl_equiv_mode(NULL,&tmp); - ],[ + posix_acl_equiv_mode(NULL, &tmp); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T], [ + AC_MSG_CHECKING([whether posix_acl_equiv_mode() wants 
umode_t]) + ZFS_LINUX_TEST_RESULT([posix_acl_equiv_mode], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T, 1, - [ posix_acl_equiv_mode wants umode_t*]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([posix_acl_equiv_mode()]) ]) ]) @@ -130,9 +134,8 @@ dnl # dnl # 4.8 API change, dnl # The function posix_acl_valid now must be passed a namespace. dnl # -AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS], [ - AC_MSG_CHECKING([whether posix_acl_valid() wants user namespace]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_VALID_WITH_NS], [ + ZFS_LINUX_TEST_SRC([posix_acl_valid_with_ns], [ #include #include ],[ @@ -141,7 +144,12 @@ AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS], [ int error; error = posix_acl_valid(user_ns, acl); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS], [ + AC_MSG_CHECKING([whether posix_acl_valid() wants user namespace]) + ZFS_LINUX_TEST_RESULT([posix_acl_valid_with_ns], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_POSIX_ACL_VALID_WITH_NS, 1, [posix_acl_valid() wants user namespace]) @@ -150,118 +158,15 @@ AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS], [ ]) ]) -dnl # -dnl # 2.6.27 API change, -dnl # Check if inode_operations contains the function permission -dnl # and expects the nameidata structure to have been removed. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION], [ - AC_MSG_CHECKING([whether iops->permission() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int permission_fn(struct inode *inode, int mask) { return 0; } - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .permission = permission_fn, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PERMISSION, 1, [iops->permission() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # 2.6.26 API change, -dnl # Check if inode_operations contains the function permission -dnl # and expects the nameidata structure to be passed. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION_WITH_NAMEIDATA], [ - AC_MSG_CHECKING([whether iops->permission() wants nameidata]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - - int permission_fn(struct inode *inode, int mask, - struct nameidata *nd) { return 0; } - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .permission = permission_fn, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PERMISSION, 1, [iops->permission() exists]) - AC_DEFINE(HAVE_PERMISSION_WITH_NAMEIDATA, 1, - [iops->permission() with nameidata exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # 2.6.32 API change, -dnl # Check if inode_operations contains the function check_acl -dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL], [ - AC_MSG_CHECKING([whether iops->check_acl() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int check_acl_fn(struct inode *inode, int mask) { return 0; } - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .check_acl = check_acl_fn, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CHECK_ACL, 1, [iops->check_acl() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # 2.6.38 API change, -dnl # The function check_acl gained a new parameter: flags -dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL_WITH_FLAGS], [ - AC_MSG_CHECKING([whether iops->check_acl() wants flags]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int check_acl_fn(struct inode *inode, int mask, - unsigned int flags) { return 0; } - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .check_acl = check_acl_fn, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CHECK_ACL, 1, [iops->check_acl() exists]) - AC_DEFINE(HAVE_CHECK_ACL_WITH_FLAGS, 1, - [iops->check_acl() wants flags]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - dnl # dnl # 3.1 API change, dnl # Check if inode_operations contains the function get_acl dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL], [ - 
AC_MSG_CHECKING([whether iops->get_acl() exists]) - ZFS_LINUX_TRY_COMPILE([ +dnl # 5.15 API change, +dnl # Added the bool rcu argument to get_acl for rcu path walk. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL], [ + ZFS_LINUX_TEST_SRC([inode_operations_get_acl], [ #include struct posix_acl *get_acl_fn(struct inode *inode, int type) @@ -271,12 +176,33 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL], [ iops __attribute__ ((unused)) = { .get_acl = get_acl_fn, }; - ],[ - ],[ + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_get_acl_rcu], [ + #include + + struct posix_acl *get_acl_fn(struct inode *inode, int type, + bool rcu) { return NULL; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .get_acl = get_acl_fn, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL], [ + AC_MSG_CHECKING([whether iops->get_acl() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_get_acl], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_GET_ACL, 1, [iops->get_acl() exists]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_RESULT([inode_operations_get_acl_rcu], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_ACL_RCU, 1, [iops->get_acl() takes rcu]) + ],[ + ZFS_LINUX_TEST_ERROR([iops->get_acl()]) + ]) ]) ]) @@ -284,24 +210,48 @@ dnl # dnl # 3.14 API change, dnl # Check if inode_operations contains the function set_acl dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL], [ - AC_MSG_CHECKING([whether iops->set_acl() exists]) - ZFS_LINUX_TRY_COMPILE([ +dnl # 5.12 API change, +dnl # set_acl() added a user_namespace* parameter first +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL], [ + ZFS_LINUX_TEST_SRC([inode_operations_set_acl_userns], [ #include - int set_acl_fn(struct inode *inode, struct posix_acl *acl, int type) - { return 0; } + int set_acl_fn(struct user_namespace *userns, + struct inode *inode, struct posix_acl *acl, + int type) { return 0; } static const struct inode_operations iops __attribute__ ((unused)) = { .set_acl = 
set_acl_fn, }; - ],[ - ],[ + ],[]) + ZFS_LINUX_TEST_SRC([inode_operations_set_acl], [ + #include + + int set_acl_fn(struct inode *inode, struct posix_acl *acl, + int type) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .set_acl = set_acl_fn, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL], [ + AC_MSG_CHECKING([whether iops->set_acl() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_set_acl_userns], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists]) + AC_DEFINE(HAVE_SET_ACL_USERNS, 1, [iops->set_acl() takes 4 args]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_RESULT([inode_operations_set_acl], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists, takes 3 args]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) @@ -311,16 +261,71 @@ dnl # The kernel get_acl will now check cache before calling i_op->get_acl and dnl # do set_cached_acl after that, so i_op->get_acl don't need to do that dnl # anymore. dnl # -AC_DEFUN([ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE], [ - AC_MSG_CHECKING([whether uncached_acl_sentinel() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_ACL_HANDLE_CACHE], [ + ZFS_LINUX_TEST_SRC([get_acl_handle_cache], [ #include ],[ - void *sentinel __attribute__ ((unused)) = uncached_acl_sentinel(NULL); - ],[ + void *sentinel __attribute__ ((unused)) = + uncached_acl_sentinel(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE], [ + AC_MSG_CHECKING([whether uncached_acl_sentinel() exists]) + ZFS_LINUX_TEST_RESULT([get_acl_handle_cache], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_KERNEL_GET_ACL_HANDLE_CACHE, 1, [uncached_acl_sentinel() exists]) + AC_DEFINE(HAVE_KERNEL_GET_ACL_HANDLE_CACHE, 1, + [uncached_acl_sentinel() exists]) ],[ AC_MSG_RESULT(no) ]) ]) + +dnl # +dnl # 4.16 kernel: check if struct posix_acl acl.a_refcount is a refcount_t. +dnl # It's an atomic_t on older kernels. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_ACL_HAS_REFCOUNT], [ + ZFS_LINUX_TEST_SRC([acl_refcount], [ + #include + #include + #include + ],[ + struct posix_acl acl; + refcount_t *r __attribute__ ((unused)) = &acl.a_refcount; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_ACL_HAS_REFCOUNT], [ + AC_MSG_CHECKING([whether posix_acl has refcount_t]) + ZFS_LINUX_TEST_RESULT([acl_refcount], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ACL_REFCOUNT, 1, [posix_acl has refcount_t]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_ACL], [ + ZFS_AC_KERNEL_SRC_POSIX_ACL_RELEASE + ZFS_AC_KERNEL_SRC_SET_CACHED_ACL_USABLE + ZFS_AC_KERNEL_SRC_POSIX_ACL_CHMOD + ZFS_AC_KERNEL_SRC_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T + ZFS_AC_KERNEL_SRC_POSIX_ACL_VALID_WITH_NS + ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL + ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL + ZFS_AC_KERNEL_SRC_GET_ACL_HANDLE_CACHE + ZFS_AC_KERNEL_SRC_ACL_HAS_REFCOUNT +]) + +AC_DEFUN([ZFS_AC_KERNEL_ACL], [ + ZFS_AC_KERNEL_POSIX_ACL_RELEASE + ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE + ZFS_AC_KERNEL_POSIX_ACL_CHMOD + ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T + ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS + ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL + ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL + ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE + ZFS_AC_KERNEL_ACL_HAS_REFCOUNT +]) diff --git a/config/kernel-aio-fsync.m4 b/config/kernel-aio-fsync.m4 index 41b7a98a6b..b4dbf29ba7 100644 --- a/config/kernel-aio-fsync.m4 +++ b/config/kernel-aio-fsync.m4 @@ -1,21 +1,23 @@ dnl # dnl # Linux 4.9-rc5+ ABI, removal of the .aio_fsync field dnl # -AC_DEFUN([ZFS_AC_KERNEL_AIO_FSYNC], [ - AC_MSG_CHECKING([whether fops->aio_fsync() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_AIO_FSYNC], [ + ZFS_LINUX_TEST_SRC([aio_fsync], [ #include static const struct file_operations fops __attribute__ ((unused)) = { .aio_fsync = NULL, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_AIO_FSYNC], [ + AC_MSG_CHECKING([whether fops->aio_fsync() exists]) + 
ZFS_LINUX_TEST_RESULT([aio_fsync], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_FILE_AIO_FSYNC, 1, [fops->aio_fsync() exists]) ],[ AC_MSG_RESULT(no) ]) ]) - diff --git a/config/kernel-automount.m4 b/config/kernel-automount.m4 index 1ee4c168d4..f7bb63c681 100644 --- a/config/kernel-automount.m4 +++ b/config/kernel-automount.m4 @@ -5,19 +5,21 @@ dnl # solution to handling automounts. Prior to this cifs/nfs clients dnl # which required automount support would abuse the follow_link() dnl # operation on directories for this purpose. dnl # -AC_DEFUN([ZFS_AC_KERNEL_AUTOMOUNT], [ - AC_MSG_CHECKING([whether dops->d_automount() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_AUTOMOUNT], [ + ZFS_LINUX_TEST_SRC([dentry_operations_d_automount], [ #include struct vfsmount *d_automount(struct path *p) { return NULL; } struct dentry_operations dops __attribute__ ((unused)) = { .d_automount = d_automount, }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_AUTOMOUNT, 1, [dops->automount() exists]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_AUTOMOUNT], [ + AC_MSG_CHECKING([whether dops->d_automount() exists]) + ZFS_LINUX_TEST_RESULT([dentry_operations_d_automount], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([dops->d_automount()]) ]) ]) diff --git a/config/kernel-bdev-logical-size.m4 b/config/kernel-bdev-logical-size.m4 deleted file mode 100644 index a6194577ab..0000000000 --- a/config/kernel-bdev-logical-size.m4 +++ /dev/null @@ -1,25 +0,0 @@ -dnl # -dnl # 2.6.30 API change -dnl # bdev_hardsect_size() replaced with bdev_logical_block_size(). While -dnl # it has been true for a while that there was no strict 1:1 mapping -dnl # between physical sector size and logical block size this change makes -dnl # it explicit. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE], [ - AC_MSG_CHECKING([whether bdev_logical_block_size() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct block_device *bdev = NULL; - bdev_logical_block_size(bdev); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BDEV_LOGICAL_BLOCK_SIZE, 1, - [bdev_logical_block_size() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-bdev-physical-size.m4 b/config/kernel-bdev-physical-size.m4 deleted file mode 100644 index 77746ee916..0000000000 --- a/config/kernel-bdev-physical-size.m4 +++ /dev/null @@ -1,39 +0,0 @@ -dnl # -dnl # 2.6.30 API change -dnl # -dnl # The bdev_physical_block_size() interface was added to provide a way -dnl # to determine the smallest write which can be performed without a -dnl # read-modify-write operation. From the kernel documentation: -dnl # -dnl # What: /sys/block//queue/physical_block_size -dnl # Date: May 2009 -dnl # Contact: Martin K. Petersen -dnl # Description: -dnl # This is the smallest unit the storage device can write -dnl # without resorting to read-modify-write operation. It is -dnl # usually the same as the logical block size but may be -dnl # bigger. One example is SATA drives with 4KB sectors -dnl # that expose a 512-byte logical block size to the -dnl # operating system. -dnl # -dnl # Unfortunately, this interface isn't entirely reliable because -dnl # drives are sometimes known to misreport this value. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE], [ - AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct block_device *bdev = NULL; - bdev_physical_block_size(bdev); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BDEV_PHYSICAL_BLOCK_SIZE, 1, - [bdev_physical_block_size() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-bdi.m4 b/config/kernel-bdi.m4 index cb7479ee9c..9758863a9c 100644 --- a/config/kernel-bdi.m4 +++ b/config/kernel-bdi.m4 @@ -1,56 +1,80 @@ dnl # -dnl # 2.6.32 - 2.6.33, bdi_setup_and_register() is not exported. -dnl # 2.6.34 - 3.19, bdi_setup_and_register() takes 3 arguments. -dnl # 4.0 - 4.11, bdi_setup_and_register() takes 2 arguments. -dnl # 4.12 - x.y, super_setup_bdi_name() new interface. +dnl # Check available BDI interfaces. dnl # -AC_DEFUN([ZFS_AC_KERNEL_BDI], [ - AC_MSG_CHECKING([whether super_setup_bdi_name() exists]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_BDI], [ + ZFS_LINUX_TEST_SRC([super_setup_bdi_name], [ #include struct super_block sb; ], [ char *name = "bdi"; atomic_long_t zfs_bdi_seq; + int error __attribute__((unused)); + atomic_long_set(&zfs_bdi_seq, 0); + error = + super_setup_bdi_name(&sb, "%.28s-%ld", name, + atomic_long_inc_return(&zfs_bdi_seq)); + ]) + + ZFS_LINUX_TEST_SRC([bdi_setup_and_register], [ + #include + struct backing_dev_info bdi; + ], [ + char *name = "bdi"; int error __attribute__((unused)) = - super_setup_bdi_name(&sb, "%.28s-%ld", name, atomic_long_inc_return(&zfs_bdi_seq)); - ], [super_setup_bdi_name], [fs/super.c], [ + bdi_setup_and_register(&bdi, name); + ]) + + ZFS_LINUX_TEST_SRC([bdi_setup_and_register_3args], [ + #include + struct backing_dev_info bdi; + ], [ + char *name = "bdi"; + unsigned int cap = BDI_CAP_MAP_COPY; + int error __attribute__((unused)) = + 
bdi_setup_and_register(&bdi, name, cap); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BDI], [ + dnl # + dnl # 4.12, super_setup_bdi_name() introduced. + dnl # + AC_MSG_CHECKING([whether super_setup_bdi_name() exists]) + ZFS_LINUX_TEST_RESULT_SYMBOL([super_setup_bdi_name], + [super_setup_bdi_name], [fs/super.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_SUPER_SETUP_BDI_NAME, 1, [super_setup_bdi_name() exits]) ], [ AC_MSG_RESULT(no) + + dnl # + dnl # 4.0 - 4.11, bdi_setup_and_register() takes 2 arguments. + dnl # AC_MSG_CHECKING( [whether bdi_setup_and_register() wants 2 args]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - struct backing_dev_info bdi; - ], [ - char *name = "bdi"; - int error __attribute__((unused)) = - bdi_setup_and_register(&bdi, name); - ], [bdi_setup_and_register], [mm/backing-dev.c], [ + ZFS_LINUX_TEST_RESULT_SYMBOL([bdi_setup_and_register], + [bdi_setup_and_register], [mm/backing-dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_2ARGS_BDI_SETUP_AND_REGISTER, 1, [bdi_setup_and_register() wants 2 args]) ], [ AC_MSG_RESULT(no) + + dnl # + dnl # 2.6.34 - 3.19, bdi_setup_and_register() + dnl # takes 3 arguments. 
+ dnl # AC_MSG_CHECKING( [whether bdi_setup_and_register() wants 3 args]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - struct backing_dev_info bdi; - ], [ - char *name = "bdi"; - unsigned int cap = BDI_CAP_MAP_COPY; - int error __attribute__((unused)) = - bdi_setup_and_register(&bdi, name, cap); - ], [bdi_setup_and_register], [mm/backing-dev.c], [ + ZFS_LINUX_TEST_RESULT_SYMBOL( + [bdi_setup_and_register_3args], + [bdi_setup_and_register], [mm/backing-dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_3ARGS_BDI_SETUP_AND_REGISTER, 1, [bdi_setup_and_register() wants 3 args]) ], [ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([bdi_setup]) ]) ]) ]) diff --git a/config/kernel-bio-bvec-iter.m4 b/config/kernel-bio-bvec-iter.m4 deleted file mode 100644 index 64c989386b..0000000000 --- a/config/kernel-bio-bvec-iter.m4 +++ /dev/null @@ -1,20 +0,0 @@ -dnl # -dnl # 3.14 API change, -dnl # Immutable biovecs. A number of fields of struct bio are moved to -dnl # struct bvec_iter. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BIO_BVEC_ITER], [ - AC_MSG_CHECKING([whether bio has bi_iter]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct bio bio; - bio.bi_iter.bi_sector = 0; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_BVEC_ITER, 1, [bio has bi_iter]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - diff --git a/config/kernel-bio-end-io-t-args.m4 b/config/kernel-bio-end-io-t-args.m4 deleted file mode 100644 index 3c420cc0c3..0000000000 --- a/config/kernel-bio-end-io-t-args.m4 +++ /dev/null @@ -1,46 +0,0 @@ -dnl # -dnl # 4.3 API change -dnl # Error argument dropped from bio_endio in favor of newly introduced -dnl # bio->bi_error. This also replaces bio->bi_flags value BIO_UPTODATE. -dnl # Introduced by torvalds/linux@4246a0b63bd8f56a1469b12eafeb875b1041a451 -dnl # ("block: add a bi_error field to struct bio"). 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_BIO_END_IO_T_ARGS], [ - AC_MSG_CHECKING([whether bio_end_io_t wants 1 arg]) - ZFS_LINUX_TRY_COMPILE([ - #include - - void wanted_end_io(struct bio *bio) { return; } - - bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_1ARG_BIO_END_IO_T, 1, - [bio_end_io_t wants 1 arg]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # 4.13 API change -dnl # The bio->bi_error field was replaced with bio->bi_status which is an -dnl # enum which describes all possible error types. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BIO_BI_STATUS], [ - AC_MSG_CHECKING([whether bio->bi_status exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct bio bio __attribute__ ((unused)); - blk_status_t status __attribute__ ((unused)) = BLK_STS_OK; - - bio.bi_status = status; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_BI_STATUS, 1, [bio->bi_status exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-bio-failfast.m4 b/config/kernel-bio-failfast.m4 deleted file mode 100644 index cfbec05238..0000000000 --- a/config/kernel-bio-failfast.m4 +++ /dev/null @@ -1,39 +0,0 @@ -dnl # -dnl # Preferred interface for setting FAILFAST on a bio: -dnl # 2.6.28-2.6.35: BIO_RW_FAILFAST_{DEV|TRANSPORT|DRIVER} -dnl # >= 2.6.36: REQ_FAILFAST_{DEV|TRANSPORT|DRIVER} -dnl # - -AC_DEFUN([ZFS_AC_KERNEL_BIO_FAILFAST_DTD], [ - AC_MSG_CHECKING([whether BIO_RW_FAILFAST_* are defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int flags __attribute__ ((unused)); - flags = ((1 << BIO_RW_FAILFAST_DEV) | - (1 << BIO_RW_FAILFAST_TRANSPORT) | - (1 << BIO_RW_FAILFAST_DRIVER)); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_RW_FAILFAST_DTD, 1, - [BIO_RW_FAILFAST_* are defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_REQ_FAILFAST_MASK], [ - AC_MSG_CHECKING([whether REQ_FAILFAST_MASK is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int flags __attribute__ ((unused)); - flags = REQ_FAILFAST_MASK; - 
],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_REQ_FAILFAST_MASK, 1, - [REQ_FAILFAST_MASK is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-bio-op.m4 b/config/kernel-bio-op.m4 deleted file mode 100644 index 8299e490c2..0000000000 --- a/config/kernel-bio-op.m4 +++ /dev/null @@ -1,84 +0,0 @@ -dnl # -dnl # Linux 4.8 API, -dnl # -dnl # The bio_op() helper was introduced as a replacement for explicitly -dnl # checking the bio->bi_rw flags. The following checks are used to -dnl # detect if a specific operation is supported. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_REQ_OP_DISCARD], [ - AC_MSG_CHECKING([whether REQ_OP_DISCARD is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int op __attribute__ ((unused)) = REQ_OP_DISCARD; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_REQ_OP_DISCARD, 1, - [REQ_OP_DISCARD is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_REQ_OP_SECURE_ERASE], [ - AC_MSG_CHECKING([whether REQ_OP_SECURE_ERASE is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int op __attribute__ ((unused)) = REQ_OP_SECURE_ERASE; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_REQ_OP_SECURE_ERASE, 1, - [REQ_OP_SECURE_ERASE is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - - -AC_DEFUN([ZFS_AC_KERNEL_REQ_OP_FLUSH], [ - AC_MSG_CHECKING([whether REQ_OP_FLUSH is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int op __attribute__ ((unused)) = REQ_OP_FLUSH; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_REQ_OP_FLUSH, 1, - [REQ_OP_FLUSH is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_BIO_BI_OPF], [ - AC_MSG_CHECKING([whether bio->bi_opf is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct bio bio __attribute__ ((unused)); - bio.bi_opf = 0; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_BI_OPF, 1, [bio->bi_opf is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS], [ - AC_MSG_CHECKING([whether bio_set_op_attrs is available]) - ZFS_LINUX_TRY_COMPILE([ - 
#include - ],[ - struct bio *bio __attribute__ ((unused)) = NULL; - - bio_set_op_attrs(bio, 0, 0); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_SET_OP_ATTRS, 1, - [bio_set_op_attrs is available]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-bio-rw-barrier.m4 b/config/kernel-bio-rw-barrier.m4 deleted file mode 100644 index bcf0f7ea00..0000000000 --- a/config/kernel-bio-rw-barrier.m4 +++ /dev/null @@ -1,25 +0,0 @@ -dnl # -dnl # Interface for issuing a discard bio: -dnl # 2.6.28-2.6.35: BIO_RW_BARRIER -dnl # 2.6.36-3.x: REQ_BARRIER -dnl # - -dnl # Since REQ_BARRIER is a preprocessor definition, there is no need for an -dnl # autotools check for it. Also, REQ_BARRIER existed in the request layer -dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the -dnl # request layer and bio layer flags, so it would be wrong to assume that -dnl # the APIs are mutually exclusive contrary to the typical case. -AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_BARRIER], [ - AC_MSG_CHECKING([whether BIO_RW_BARRIER is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int flags __attribute__ ((unused)); - flags = BIO_RW_BARRIER; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_RW_BARRIER, 1, [BIO_RW_BARRIER is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-bio-rw-discard.m4 b/config/kernel-bio-rw-discard.m4 deleted file mode 100644 index 0554b9a9da..0000000000 --- a/config/kernel-bio-rw-discard.m4 +++ /dev/null @@ -1,25 +0,0 @@ -dnl # -dnl # Interface for issuing a discard bio: -dnl # 2.6.28-2.6.35: BIO_RW_DISCARD -dnl # 2.6.36-3.x: REQ_DISCARD -dnl # - -dnl # Since REQ_DISCARD is a preprocessor definition, there is no need for an -dnl # autotools check for it. Also, REQ_DISCARD existed in the request layer -dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the -dnl # request layer and bio layer flags, so it would be wrong to assume that -dnl # the APIs are mutually exclusive contrary to the typical case. 
-AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_DISCARD], [ - AC_MSG_CHECKING([whether BIO_RW_DISCARD is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int flags __attribute__ ((unused)); - flags = BIO_RW_DISCARD; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_RW_DISCARD, 1, [BIO_RW_DISCARD is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-bio.m4 b/config/kernel-bio.m4 new file mode 100644 index 0000000000..aad4d31cf2 --- /dev/null +++ b/config/kernel-bio.m4 @@ -0,0 +1,431 @@ +dnl # +dnl # 2.6.36 API change, +dnl # REQ_FAILFAST_{DEV|TRANSPORT|DRIVER} +dnl # REQ_DISCARD +dnl # REQ_FLUSH +dnl # +dnl # 4.8 - 4.9 API, +dnl # REQ_FLUSH was renamed to REQ_PREFLUSH +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REQ], [ + ZFS_LINUX_TEST_SRC([req_failfast_mask], [ + #include + ],[ + int flags __attribute__ ((unused)); + flags = REQ_FAILFAST_MASK; + ]) + + ZFS_LINUX_TEST_SRC([req_discard], [ + #include + ],[ + int flags __attribute__ ((unused)); + flags = REQ_DISCARD; + ]) + + ZFS_LINUX_TEST_SRC([req_flush], [ + #include + ],[ + int flags __attribute__ ((unused)); + flags = REQ_FLUSH; + ]) + + ZFS_LINUX_TEST_SRC([req_preflush], [ + #include + ],[ + int flags __attribute__ ((unused)); + flags = REQ_PREFLUSH; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_FAILFAST_MASK], [ + AC_MSG_CHECKING([whether REQ_FAILFAST_MASK is defined]) + ZFS_LINUX_TEST_RESULT([req_failfast_mask], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([REQ_FAILFAST_MASK]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_DISCARD], [ + AC_MSG_CHECKING([whether REQ_DISCARD is defined]) + ZFS_LINUX_TEST_RESULT([req_discard], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_DISCARD, 1, [REQ_DISCARD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_FLUSH], [ + AC_MSG_CHECKING([whether REQ_FLUSH is defined]) + ZFS_LINUX_TEST_RESULT([req_flush], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_FLUSH, 1, [REQ_FLUSH is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + 
+AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_PREFLUSH], [ + AC_MSG_CHECKING([whether REQ_PREFLUSH is defined]) + ZFS_LINUX_TEST_RESULT([req_preflush], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_PREFLUSH, 1, [REQ_PREFLUSH is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # Linux 4.8 API, +dnl # +dnl # The bio_op() helper was introduced as a replacement for explicitly +dnl # checking the bio->bi_rw flags. The following checks are used to +dnl # detect if a specific operation is supported. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_OPS], [ + ZFS_LINUX_TEST_SRC([req_op_discard], [ + #include + ],[ + int op __attribute__ ((unused)) = REQ_OP_DISCARD; + ]) + + ZFS_LINUX_TEST_SRC([req_op_secure_erase], [ + #include + ],[ + int op __attribute__ ((unused)) = REQ_OP_SECURE_ERASE; + ]) + + ZFS_LINUX_TEST_SRC([req_op_flush], [ + #include + ],[ + int op __attribute__ ((unused)) = REQ_OP_FLUSH; + ]) + + ZFS_LINUX_TEST_SRC([bio_bi_opf], [ + #include + ],[ + struct bio bio __attribute__ ((unused)); + bio.bi_opf = 0; + ]) + + ZFS_LINUX_TEST_SRC([bio_set_op_attrs], [ + #include + ],[ + struct bio *bio __attribute__ ((unused)) = NULL; + bio_set_op_attrs(bio, 0, 0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_OP_DISCARD], [ + AC_MSG_CHECKING([whether REQ_OP_DISCARD is defined]) + ZFS_LINUX_TEST_RESULT([req_op_discard], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_OP_DISCARD, 1, [REQ_OP_DISCARD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_OP_SECURE_ERASE], [ + AC_MSG_CHECKING([whether REQ_OP_SECURE_ERASE is defined]) + ZFS_LINUX_TEST_RESULT([req_op_secure_erase], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_OP_SECURE_ERASE, 1, + [REQ_OP_SECURE_ERASE is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_OP_FLUSH], [ + AC_MSG_CHECKING([whether REQ_OP_FLUSH is defined]) + ZFS_LINUX_TEST_RESULT([req_op_flush], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_OP_FLUSH, 1, [REQ_OP_FLUSH is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + 
+AC_DEFUN([ZFS_AC_KERNEL_BIO_BI_OPF], [ + AC_MSG_CHECKING([whether bio->bi_opf is defined]) + ZFS_LINUX_TEST_RESULT([bio_bi_opf], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_BI_OPF, 1, [bio->bi_opf is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_OP_ATTRS], [ + AC_MSG_CHECKING([whether bio_set_op_attrs is available]) + ZFS_LINUX_TEST_RESULT([bio_set_op_attrs], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_SET_OP_ATTRS, 1, + [bio_set_op_attrs is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # Linux 4.14 API, +dnl # +dnl # The bio_set_dev() helper macro was introduced as part of the transition +dnl # to have struct gendisk in struct bio. +dnl # +dnl # Linux 5.0 API, +dnl # +dnl # The bio_set_dev() helper macro was updated to internally depend on +dnl # bio_associate_blkg() symbol which is exported GPL-only. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_SET_DEV], [ + ZFS_LINUX_TEST_SRC([bio_set_dev], [ + #include + #include + ],[ + struct block_device *bdev = NULL; + struct bio *bio = NULL; + bio_set_dev(bio, bdev); + ], [], [ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_DEV], [ + AC_MSG_CHECKING([whether bio_set_dev() is available]) + ZFS_LINUX_TEST_RESULT([bio_set_dev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_SET_DEV, 1, [bio_set_dev() is available]) + + AC_MSG_CHECKING([whether bio_set_dev() is GPL-only]) + ZFS_LINUX_TEST_RESULT([bio_set_dev_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_SET_DEV_GPL_ONLY, 1, + [bio_set_dev() GPL-only]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.3 API change +dnl # Error argument dropped from bio_endio in favor of newly introduced +dnl # bio->bi_error. This also replaces bio->bi_flags value BIO_UPTODATE. +dnl # Introduced by torvalds/linux@4246a0b63bd8f56a1469b12eafeb875b1041a451 +dnl # ("block: add a bi_error field to struct bio"). 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_END_IO_T_ARGS], [ + ZFS_LINUX_TEST_SRC([bio_end_io_t_args], [ + #include + void wanted_end_io(struct bio *bio) { return; } + bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io; + ], []) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_END_IO_T_ARGS], [ + AC_MSG_CHECKING([whether bio_end_io_t wants 1 arg]) + ZFS_LINUX_TEST_RESULT([bio_end_io_t_args], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_1ARG_BIO_END_IO_T, 1, + [bio_end_io_t wants 1 arg]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.13 API change +dnl # The bio->bi_error field was replaced with bio->bi_status which is an +dnl # enum which describes all possible error types. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_BI_STATUS], [ + ZFS_LINUX_TEST_SRC([bio_bi_status], [ + #include + ], [ + struct bio bio __attribute__ ((unused)); + blk_status_t status __attribute__ ((unused)) = BLK_STS_OK; + bio.bi_status = status; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_BI_STATUS], [ + AC_MSG_CHECKING([whether bio->bi_status exists]) + ZFS_LINUX_TEST_RESULT([bio_bi_status], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_BI_STATUS, 1, [bio->bi_status exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 3.14 API change, +dnl # Immutable biovecs. A number of fields of struct bio are moved to +dnl # struct bvec_iter. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_BVEC_ITER], [ + ZFS_LINUX_TEST_SRC([bio_bvec_iter], [ + #include + ],[ + struct bio bio; + bio.bi_iter.bi_sector = 0; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_BVEC_ITER], [ + AC_MSG_CHECKING([whether bio has bi_iter]) + ZFS_LINUX_TEST_RESULT([bio_bvec_iter], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_BVEC_ITER, 1, [bio has bi_iter]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.8 API change +dnl # The rw argument has been removed from submit_bio/submit_bio_wait. +dnl # Callers are now expected to set bio->bi_rw instead of passing it in. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_SUBMIT_BIO], [ + ZFS_LINUX_TEST_SRC([submit_bio], [ + #include + ],[ + struct bio *bio = NULL; + (void) submit_bio(bio); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_SUBMIT_BIO], [ + AC_MSG_CHECKING([whether submit_bio() wants 1 arg]) + ZFS_LINUX_TEST_RESULT([submit_bio], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_1ARG_SUBMIT_BIO, 1, [submit_bio() wants 1 arg]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.34 API change +dnl # current->bio_list +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_CURRENT_BIO_LIST], [ + ZFS_LINUX_TEST_SRC([current_bio_list], [ + #include + ], [ + current->bio_list = (struct bio_list *) NULL; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_CURRENT_BIO_LIST], [ + AC_MSG_CHECKING([whether current->bio_list exists]) + ZFS_LINUX_TEST_RESULT([current_bio_list], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bio_list]) + ]) +]) + +dnl # +dnl # Linux 5.5 API, +dnl # +dnl # The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by +dnl # blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). +dnl # As a side effect the function was converted to GPL-only. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKG_TRYGET], [ + ZFS_LINUX_TEST_SRC([blkg_tryget], [ + #include + #include + #include + ],[ + struct blkcg_gq blkg __attribute__ ((unused)) = {}; + bool rc __attribute__ ((unused)); + rc = blkg_tryget(&blkg); + ], [], [ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKG_TRYGET], [ + AC_MSG_CHECKING([whether blkg_tryget() is available]) + ZFS_LINUX_TEST_RESULT([blkg_tryget], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKG_TRYGET, 1, [blkg_tryget() is available]) + + AC_MSG_CHECKING([whether blkg_tryget() is GPL-only]) + ZFS_LINUX_TEST_RESULT([blkg_tryget_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKG_TRYGET_GPL_ONLY, 1, + [blkg_tryget() GPL-only]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # Linux 5.12 API, +dnl # +dnl # The Linux 5.12 kernel updated struct bio to create a new bi_bdev member +dnl # and bio->bi_disk was moved to bio->bi_bdev->bd_disk +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_BDEV_DISK], [ + ZFS_LINUX_TEST_SRC([bio_bdev_disk], [ + #include + #include + ],[ + struct bio *b = NULL; + struct gendisk *d = b->bi_bdev->bd_disk; + blk_register_queue(d); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_BDEV_DISK], [ + AC_MSG_CHECKING([whether bio->bi_bdev->bd_disk exists]) + ZFS_LINUX_TEST_RESULT([bio_bdev_disk], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_BDEV_DISK, 1, [bio->bi_bdev->bd_disk exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO], [ + ZFS_AC_KERNEL_SRC_REQ + ZFS_AC_KERNEL_SRC_BIO_OPS + ZFS_AC_KERNEL_SRC_BIO_SET_DEV + ZFS_AC_KERNEL_SRC_BIO_END_IO_T_ARGS + ZFS_AC_KERNEL_SRC_BIO_BI_STATUS + ZFS_AC_KERNEL_SRC_BIO_BVEC_ITER + ZFS_AC_KERNEL_SRC_BIO_SUBMIT_BIO + ZFS_AC_KERNEL_SRC_BIO_CURRENT_BIO_LIST + ZFS_AC_KERNEL_SRC_BLKG_TRYGET + ZFS_AC_KERNEL_SRC_BIO_BDEV_DISK +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO], [ + ZFS_AC_KERNEL_BIO_REQ_FAILFAST_MASK + ZFS_AC_KERNEL_BIO_REQ_DISCARD + ZFS_AC_KERNEL_BIO_REQ_FLUSH + ZFS_AC_KERNEL_BIO_REQ_PREFLUSH + + 
ZFS_AC_KERNEL_BIO_REQ_OP_DISCARD + ZFS_AC_KERNEL_BIO_REQ_OP_SECURE_ERASE + ZFS_AC_KERNEL_BIO_REQ_OP_FLUSH + ZFS_AC_KERNEL_BIO_BI_OPF + ZFS_AC_KERNEL_BIO_SET_OP_ATTRS + + ZFS_AC_KERNEL_BIO_SET_DEV + ZFS_AC_KERNEL_BIO_END_IO_T_ARGS + ZFS_AC_KERNEL_BIO_BI_STATUS + ZFS_AC_KERNEL_BIO_BVEC_ITER + ZFS_AC_KERNEL_BIO_SUBMIT_BIO + ZFS_AC_KERNEL_BIO_CURRENT_BIO_LIST + ZFS_AC_KERNEL_BLKG_TRYGET + ZFS_AC_KERNEL_BIO_BDEV_DISK +]) diff --git a/config/kernel-bio_max_segs.m4 b/config/kernel-bio_max_segs.m4 new file mode 100644 index 0000000000..a90d75455c --- /dev/null +++ b/config/kernel-bio_max_segs.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # 5.12 API change removes BIO_MAX_PAGES in favor of bio_max_segs() +dnl # which will handle the logic of setting the upper-bound to a +dnl # BIO_MAX_PAGES, internally. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS], [ + ZFS_LINUX_TEST_SRC([bio_max_segs], [ + #include + ],[ + bio_max_segs(1); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_MAX_SEGS], [ + AC_MSG_CHECKING([whether bio_max_segs() exists]) + ZFS_LINUX_TEST_RESULT([bio_max_segs], [ + AC_MSG_RESULT(yes) + + AC_DEFINE([HAVE_BIO_MAX_SEGS], 1, [bio_max_segs() is implemented]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-bio_set_dev.m4 b/config/kernel-bio_set_dev.m4 deleted file mode 100644 index 71d47a8930..0000000000 --- a/config/kernel-bio_set_dev.m4 +++ /dev/null @@ -1,53 +0,0 @@ -dnl # -dnl # Linux 4.14 API, -dnl # -dnl # The bio_set_dev() helper macro was introduced as part of the transition -dnl # to have struct gendisk in struct bio. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_DEV_MACRO], [ - AC_MSG_CHECKING([whether bio_set_dev() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - ],[ - struct block_device *bdev = NULL; - struct bio *bio = NULL; - bio_set_dev(bio, bdev); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_SET_DEV, 1, [bio_set_dev() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # Linux 5.0 API, -dnl # -dnl # The bio_set_dev() helper macro was updated to internally depend on -dnl # bio_associate_blkg() symbol which is exported GPL-only. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_DEV_GPL_ONLY], [ - AC_MSG_CHECKING([whether bio_set_dev() is GPL-only]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - #include - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ - struct block_device *bdev = NULL; - struct bio *bio = NULL; - bio_set_dev(bio, bdev); - ],[ - AC_MSG_RESULT(no) - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_SET_DEV_GPL_ONLY, 1, - [bio_set_dev() GPL-only]) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_DEV], [ - ZFS_AC_KERNEL_BIO_SET_DEV_MACRO - ZFS_AC_KERNEL_BIO_SET_DEV_GPL_ONLY -]) diff --git a/config/kernel-blk-queue-bdi.m4 b/config/kernel-blk-queue-bdi.m4 deleted file mode 100644 index 816471166a..0000000000 --- a/config/kernel-blk-queue-bdi.m4 +++ /dev/null @@ -1,20 +0,0 @@ -dnl # -dnl # 2.6.32 - 4.11, statically allocated bdi in request_queue -dnl # 4.12 - x.y, dynamically allocated bdi in request_queue -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [ - AC_MSG_CHECKING([whether blk_queue bdi is dynamic]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct request_queue q; - struct backing_dev_info bdi; - q.backing_dev_info = &bdi; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_BDI_DYNAMIC, 1, - [blk queue backing_dev_info is dynamic]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-blk-queue-discard.m4 b/config/kernel-blk-queue-discard.m4 deleted file mode 100644 index addbba8144..0000000000 --- a/config/kernel-blk-queue-discard.m4 +++ 
/dev/null @@ -1,65 +0,0 @@ -dnl # -dnl # 2.6.32 - 4.x API, -dnl # blk_queue_discard() -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [ - AC_MSG_CHECKING([whether blk_queue_discard() is available]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct request_queue *q __attribute__ ((unused)) = NULL; - int value __attribute__ ((unused)); - - value = blk_queue_discard(q); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_DISCARD, 1, - [blk_queue_discard() is available]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # 4.8 - 4.x API, -dnl # blk_queue_secure_erase() -dnl # -dnl # 2.6.36 - 4.7 API, -dnl # blk_queue_secdiscard() -dnl # -dnl # 2.6.x - 2.6.35 API, -dnl # Unsupported by kernel -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE], [ - AC_MSG_CHECKING([whether blk_queue_secure_erase() is available]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct request_queue *q __attribute__ ((unused)) = NULL; - int value __attribute__ ((unused)); - - value = blk_queue_secure_erase(q); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_SECURE_ERASE, 1, - [blk_queue_secure_erase() is available]) - ],[ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether blk_queue_secdiscard() is available]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct request_queue *q __attribute__ ((unused)) = NULL; - int value __attribute__ ((unused)); - - value = blk_queue_secdiscard(q); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_SECDISCARD, 1, - [blk_queue_secdiscard() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - ]) -]) diff --git a/config/kernel-blk-queue-flags.m4 b/config/kernel-blk-queue-flags.m4 deleted file mode 100644 index b570245c74..0000000000 --- a/config/kernel-blk-queue-flags.m4 +++ /dev/null @@ -1,38 +0,0 @@ -dnl # -dnl # API change -dnl # https://github.com/torvalds/linux/commit/8814ce8 -dnl # Introduction of blk_queue_flag_set and blk_queue_flag_clear -dnl # - -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET], [ - AC_MSG_CHECKING([whether blk_queue_flag_set() 
exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - ],[ - struct request_queue *q = NULL; - blk_queue_flag_set(0, q); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_FLAG_SET, 1, [blk_queue_flag_set() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR], [ - AC_MSG_CHECKING([whether blk_queue_flag_clear() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - ],[ - struct request_queue *q = NULL; - blk_queue_flag_clear(0, q); - ],[ - - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_FLAG_CLEAR, 1, [blk_queue_flag_clear() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-blk-queue-flush.m4 b/config/kernel-blk-queue-flush.m4 deleted file mode 100644 index 1baab83a4e..0000000000 --- a/config/kernel-blk-queue-flush.m4 +++ /dev/null @@ -1,85 +0,0 @@ -dnl # -dnl # 2.6.36 API change -dnl # In 2.6.36 kernels the blk_queue_ordered() interface has been -dnl # replaced by the simpler blk_queue_flush(). However, while the -dnl # old interface was available to all the new one is GPL-only. -dnl # Thus in addition to detecting if this function is available -dnl # we determine if it is GPL-only. If the GPL-only interface is -dnl # there we implement our own compatibility function, otherwise -dnl # we use the function. The hope is that long term this function -dnl # will be opened up. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLUSH], [ - AC_MSG_CHECKING([whether blk_queue_flush() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct request_queue *q = NULL; - (void) blk_queue_flush(q, REQ_FLUSH); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_FLUSH, 1, - [blk_queue_flush() is available]) - - AC_MSG_CHECKING([whether blk_queue_flush() is GPL-only]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ - struct request_queue *q = NULL; - (void) blk_queue_flush(q, REQ_FLUSH); - ],[ - AC_MSG_RESULT(no) - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY, 1, - [blk_queue_flush() is GPL-only]) - ]) - ],[ - AC_MSG_RESULT(no) - ]) - - dnl # - dnl # 4.7 API change - dnl # Replace blk_queue_flush with blk_queue_write_cache - dnl # - AC_MSG_CHECKING([whether blk_queue_write_cache() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - - ],[ - struct request_queue *q = NULL; - blk_queue_write_cache(q, true, true); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE, 1, - [blk_queue_write_cache() exists]) - - AC_MSG_CHECKING([whether blk_queue_write_cache() is GPL-only]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - #include - - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ - struct request_queue *q = NULL; - blk_queue_write_cache(q, true, true); - ],[ - AC_MSG_RESULT(no) - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY, 1, - [blk_queue_write_cache() is GPL-only]) - ]) - ],[ - AC_MSG_RESULT(no) - ]) - - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-blk-queue-max-hw-sectors.m4 b/config/kernel-blk-queue-max-hw-sectors.m4 deleted file mode 100644 index 2f5515dc6b..0000000000 --- a/config/kernel-blk-queue-max-hw-sectors.m4 +++ /dev/null @@ -1,22 +0,0 @@ -dnl # -dnl # 2.6.34 API change -dnl # blk_queue_max_hw_sectors() replaces blk_queue_max_sectors(). 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [ - AC_MSG_CHECKING([whether blk_queue_max_hw_sectors() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct request_queue *q = NULL; - (void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_MAX_HW_SECTORS, 1, - [blk_queue_max_hw_sectors() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-blk-queue-max-segments.m4 b/config/kernel-blk-queue-max-segments.m4 deleted file mode 100644 index b2a40423a5..0000000000 --- a/config/kernel-blk-queue-max-segments.m4 +++ /dev/null @@ -1,23 +0,0 @@ -dnl # -dnl # 2.6.34 API change -dnl # blk_queue_max_segments() consolidates blk_queue_max_hw_segments() -dnl # and blk_queue_max_phys_segments(). -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [ - AC_MSG_CHECKING([whether blk_queue_max_segments() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct request_queue *q = NULL; - (void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_MAX_SEGMENTS, 1, - [blk_queue_max_segments() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-blk-queue-unplug.m4 b/config/kernel-blk-queue-unplug.m4 deleted file mode 100644 index 075fbccd1a..0000000000 --- a/config/kernel-blk-queue-unplug.m4 +++ /dev/null @@ -1,44 +0,0 @@ -dnl # -dnl # 2.6.32-2.6.35 API - The BIO_RW_UNPLUG enum can be used as a hint -dnl # to unplug the queue. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BIO_RW_UNPLUG], [ - AC_MSG_CHECKING([whether the BIO_RW_UNPLUG enum is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - extern enum bio_rw_flags rw; - - rw = BIO_RW_UNPLUG; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG, 1, - [BIO_RW_UNPLUG is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) - -AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG], [ - AC_MSG_CHECKING([whether struct blk_plug is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct blk_plug plug; - - blk_start_plug(&plug); - blk_finish_plug(&plug); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_HAVE_BLK_PLUG, 1, - [struct blk_plug is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 new file mode 100644 index 0000000000..ff5d2d370e --- /dev/null +++ b/config/kernel-blk-queue.m4 @@ -0,0 +1,342 @@ +dnl # +dnl # 2.6.39 API change, +dnl # blk_start_plug() and blk_finish_plug() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG], [ + ZFS_LINUX_TEST_SRC([blk_plug], [ + #include + ],[ + struct blk_plug plug __attribute__ ((unused)); + + blk_start_plug(&plug); + blk_finish_plug(&plug); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [ + AC_MSG_CHECKING([whether struct blk_plug is available]) + ZFS_LINUX_TEST_RESULT([blk_plug], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([blk_plug]) + ]) +]) + +dnl # +dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue +dnl # 4.12: dynamically allocated bdi in request_queue +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [ + ZFS_LINUX_TEST_SRC([blk_queue_bdi], [ + #include + ],[ + struct request_queue q; + struct backing_dev_info bdi; + q.backing_dev_info = &bdi; + ]) +]) + 
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [ + AC_MSG_CHECKING([whether blk_queue bdi is dynamic]) + ZFS_LINUX_TEST_RESULT([blk_queue_bdi], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_BDI_DYNAMIC, 1, + [blk queue backing_dev_info is dynamic]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 5.9: added blk_queue_update_readahead(), +dnl # 5.15: renamed to disk_update_readahead() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD], [ + ZFS_LINUX_TEST_SRC([blk_queue_update_readahead], [ + #include + ],[ + struct request_queue q; + blk_queue_update_readahead(&q); + ]) + + ZFS_LINUX_TEST_SRC([disk_update_readahead], [ + #include + ],[ + struct gendisk disk; + disk_update_readahead(&disk); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD], [ + AC_MSG_CHECKING([whether blk_queue_update_readahead() exists]) + ZFS_LINUX_TEST_RESULT([blk_queue_update_readahead], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_UPDATE_READAHEAD, 1, + [blk_queue_update_readahead() exists]) + ],[ + AC_MSG_CHECKING([whether disk_update_readahead() exists]) + ZFS_LINUX_TEST_RESULT([disk_update_readahead], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DISK_UPDATE_READAHEAD, 1, + [disk_update_readahead() exists]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) +]) + +dnl # +dnl # 2.6.32 API, +dnl # blk_queue_discard() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD], [ + ZFS_LINUX_TEST_SRC([blk_queue_discard], [ + #include + ],[ + struct request_queue *q __attribute__ ((unused)) = NULL; + int value __attribute__ ((unused)); + value = blk_queue_discard(q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [ + AC_MSG_CHECKING([whether blk_queue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_discard], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([blk_queue_discard]) + ]) +]) + +dnl # +dnl # 4.8 API, +dnl # blk_queue_secure_erase() +dnl # +dnl # 2.6.36 - 4.7 API, +dnl # blk_queue_secdiscard() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE], [ + 
ZFS_LINUX_TEST_SRC([blk_queue_secure_erase], [ + #include + ],[ + struct request_queue *q __attribute__ ((unused)) = NULL; + int value __attribute__ ((unused)); + value = blk_queue_secure_erase(q); + ]) + + ZFS_LINUX_TEST_SRC([blk_queue_secdiscard], [ + #include + ],[ + struct request_queue *q __attribute__ ((unused)) = NULL; + int value __attribute__ ((unused)); + value = blk_queue_secdiscard(q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE], [ + AC_MSG_CHECKING([whether blk_queue_secure_erase() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_secure_erase], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_SECURE_ERASE, 1, + [blk_queue_secure_erase() is available]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether blk_queue_secdiscard() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_secdiscard], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_SECDISCARD, 1, + [blk_queue_secdiscard() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([blk_queue_secure_erase]) + ]) + ]) +]) + +dnl # +dnl # 4.16 API change, +dnl # Introduction of blk_queue_flag_set and blk_queue_flag_clear +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_SET], [ + ZFS_LINUX_TEST_SRC([blk_queue_flag_set], [ + #include + #include + ],[ + struct request_queue *q = NULL; + blk_queue_flag_set(0, q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET], [ + AC_MSG_CHECKING([whether blk_queue_flag_set() exists]) + ZFS_LINUX_TEST_RESULT([blk_queue_flag_set], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_FLAG_SET, 1, + [blk_queue_flag_set() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_CLEAR], [ + ZFS_LINUX_TEST_SRC([blk_queue_flag_clear], [ + #include + #include + ],[ + struct request_queue *q = NULL; + blk_queue_flag_clear(0, q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR], [ + AC_MSG_CHECKING([whether blk_queue_flag_clear() exists]) + ZFS_LINUX_TEST_RESULT([blk_queue_flag_clear], [ + AC_MSG_RESULT(yes) + 
AC_DEFINE(HAVE_BLK_QUEUE_FLAG_CLEAR, 1, + [blk_queue_flag_clear() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.36 API change, +dnl # Added blk_queue_flush() interface, while the previous interface +dnl # was available to all the new one is GPL-only. Thus in addition to +dnl # detecting if this function is available we determine if it is +dnl # GPL-only. If the GPL-only interface is there we implement our own +dnl # compatibility function, otherwise we use the function. The hope +dnl # is that long term this function will be opened up. +dnl # +dnl # 4.7 API change, +dnl # Replace blk_queue_flush with blk_queue_write_cache +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH], [ + ZFS_LINUX_TEST_SRC([blk_queue_flush], [ + #include + ], [ + struct request_queue *q = NULL; + (void) blk_queue_flush(q, REQ_FLUSH); + ], [$NO_UNUSED_BUT_SET_VARIABLE], [ZFS_META_LICENSE]) + + ZFS_LINUX_TEST_SRC([blk_queue_write_cache], [ + #include + #include + ], [ + struct request_queue *q = NULL; + blk_queue_write_cache(q, true, true); + ], [$NO_UNUSED_BUT_SET_VARIABLE], [ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLUSH], [ + AC_MSG_CHECKING([whether blk_queue_flush() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_flush], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_FLUSH, 1, + [blk_queue_flush() is available]) + + AC_MSG_CHECKING([whether blk_queue_flush() is GPL-only]) + ZFS_LINUX_TEST_RESULT([blk_queue_flush_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY, 1, + [blk_queue_flush() is GPL-only]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) + + dnl # + dnl # 4.7 API change + dnl # Replace blk_queue_flush with blk_queue_write_cache + dnl # + AC_MSG_CHECKING([whether blk_queue_write_cache() exists]) + ZFS_LINUX_TEST_RESULT([blk_queue_write_cache], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE, 1, + [blk_queue_write_cache() exists]) + + AC_MSG_CHECKING([whether blk_queue_write_cache() is 
GPL-only]) + ZFS_LINUX_TEST_RESULT([blk_queue_write_cache_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY, 1, + [blk_queue_write_cache() is GPL-only]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.34 API change +dnl # blk_queue_max_hw_sectors() replaces blk_queue_max_sectors(). +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS], [ + ZFS_LINUX_TEST_SRC([blk_queue_max_hw_sectors], [ + #include + ], [ + struct request_queue *q = NULL; + (void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); + ], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [ + AC_MSG_CHECKING([whether blk_queue_max_hw_sectors() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors]) + ]) +]) + +dnl # +dnl # 2.6.34 API change +dnl # blk_queue_max_segments() consolidates blk_queue_max_hw_segments() +dnl # and blk_queue_max_phys_segments(). 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS], [ + ZFS_LINUX_TEST_SRC([blk_queue_max_segments], [ + #include + ], [ + struct request_queue *q = NULL; + (void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS); + ], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [ + AC_MSG_CHECKING([whether blk_queue_max_segments() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blk_queue_max_segments]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ + ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG + ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI + ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD + ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD + ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE + ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_SET + ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_CLEAR + ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH + ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS + ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [ + ZFS_AC_KERNEL_BLK_QUEUE_PLUG + ZFS_AC_KERNEL_BLK_QUEUE_BDI + ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD + ZFS_AC_KERNEL_BLK_QUEUE_DISCARD + ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE + ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET + ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR + ZFS_AC_KERNEL_BLK_QUEUE_FLUSH + ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS + ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS +]) diff --git a/config/kernel-blkdev-get-by-path.m4 b/config/kernel-blkdev-get-by-path.m4 deleted file mode 100644 index 40ecc06b6c..0000000000 --- a/config/kernel-blkdev-get-by-path.m4 +++ /dev/null @@ -1,19 +0,0 @@ -dnl # -dnl # 2.6.38 API change -dnl # open_bdev_exclusive() changed to blkdev_get_by_path() -dnl # close_bdev_exclusive() changed to blkdev_put() -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], - [AC_MSG_CHECKING([whether blkdev_get_by_path() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - blkdev_get_by_path(NULL, 0, NULL); - ], [blkdev_get_by_path], [fs/block_dev.c], [ - 
AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH, 1, - [blkdev_get_by_path() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-blkdev-reread-part.m4 b/config/kernel-blkdev-reread-part.m4 deleted file mode 100644 index 5664769a30..0000000000 --- a/config/kernel-blkdev-reread-part.m4 +++ /dev/null @@ -1,21 +0,0 @@ -dnl # -dnl # 4.1 API, exported blkdev_reread_part() symbol, backported to the -dnl # 3.10.0 CentOS 7.x enterprise kernels. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ - AC_MSG_CHECKING([whether blkdev_reread_part() is available]) - ZFS_LINUX_TRY_COMPILE([ - #include - ], [ - struct block_device *bdev = NULL; - int error; - - error = blkdev_reread_part(bdev); - ], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, - [blkdev_reread_part() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 new file mode 100644 index 0000000000..61e66421f8 --- /dev/null +++ b/config/kernel-blkdev.m4 @@ -0,0 +1,321 @@ +dnl # +dnl # 2.6.38 API change, +dnl # Added blkdev_get_by_path() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ + ZFS_LINUX_TEST_SRC([blkdev_get_by_path], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + + bdev = blkdev_get_by_path(path, mode, holder); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ + AC_MSG_CHECKING([whether blkdev_get_by_path() exists]) + ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) +]) + +dnl # +dnl # 2.6.38 API change, +dnl # Added blkdev_put() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ + ZFS_LINUX_TEST_SRC([blkdev_put], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + fmode_t mode = 0; + + blkdev_put(bdev, mode); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ + 
AC_MSG_CHECKING([whether blkdev_put() exists]) + ZFS_LINUX_TEST_RESULT([blkdev_put], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_put()]) + ]) +]) + +dnl # +dnl # 4.1 API, exported blkdev_reread_part() symbol, back ported to the +dnl # 3.10.0 CentOS 7.x enterprise kernels. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART], [ + ZFS_LINUX_TEST_SRC([blkdev_reread_part], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + int error; + + error = blkdev_reread_part(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ + AC_MSG_CHECKING([whether blkdev_reread_part() exists]) + ZFS_LINUX_TEST_RESULT([blkdev_reread_part], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, + [blkdev_reread_part() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # check_disk_change() was removed in 5.10 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE], [ + ZFS_LINUX_TEST_SRC([check_disk_change], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + bool error; + + error = check_disk_change(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ + AC_MSG_CHECKING([whether check_disk_change() exists]) + ZFS_LINUX_TEST_RESULT([check_disk_change], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CHECK_DISK_CHANGE, 1, + [check_disk_change() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 5.10 API, check_disk_change() is removed, in favor of +dnl # bdev_check_media_change(), which doesn't force revalidation +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ + ZFS_LINUX_TEST_SRC([bdev_check_media_change], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + int error; + + error = bdev_check_media_change(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ + AC_MSG_CHECKING([whether bdev_check_media_change() exists]) + ZFS_LINUX_TEST_RESULT([bdev_check_media_change], [ + AC_MSG_RESULT(yes) + 
AC_DEFINE(HAVE_BDEV_CHECK_MEDIA_CHANGE, 1, + [bdev_check_media_change() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.22 API change +dnl # Single argument invalidate_bdev() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV], [ + ZFS_LINUX_TEST_SRC([invalidate_bdev], [ + #include + #include + ],[ + struct block_device *bdev = NULL; + invalidate_bdev(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV], [ + AC_MSG_CHECKING([whether invalidate_bdev() exists]) + ZFS_LINUX_TEST_RESULT([invalidate_bdev], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([invalidate_bdev()]) + ]) +]) + +dnl # +dnl # 5.11 API, lookup_bdev() takes dev_t argument. +dnl # 2.6.27 API, lookup_bdev() was first exported. +dnl # 4.4.0-6.21 API, lookup_bdev() on Ubuntu takes mode argument. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV], [ + ZFS_LINUX_TEST_SRC([lookup_bdev_devt], [ + #include + ], [ + int error __attribute__ ((unused)); + const char path[] = "/example/path"; + dev_t dev; + + error = lookup_bdev(path, &dev); + ]) + + ZFS_LINUX_TEST_SRC([lookup_bdev_1arg], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)); + const char path[] = "/example/path"; + + bdev = lookup_bdev(path); + ]) + + ZFS_LINUX_TEST_SRC([lookup_bdev_mode], [ + #include + ], [ + struct block_device *bdev __attribute__ ((unused)); + const char path[] = "/example/path"; + + bdev = lookup_bdev(path, FMODE_READ); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV], [ + AC_MSG_CHECKING([whether lookup_bdev() wants dev_t arg]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_devt], + [lookup_bdev], [fs/block_dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DEVT_LOOKUP_BDEV, 1, + [lookup_bdev() wants dev_t arg]) + ], [ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], + [lookup_bdev], [fs/block_dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, + 
[lookup_bdev() wants 1 arg]) + ], [ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether lookup_bdev() wants mode arg]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_mode], + [lookup_bdev], [fs/block_dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MODE_LOOKUP_BDEV, 1, + [lookup_bdev() wants mode arg]) + ], [ + ZFS_LINUX_TEST_ERROR([lookup_bdev()]) + ]) + ]) + ]) +]) + +dnl # +dnl # 2.6.30 API change +dnl # +dnl # The bdev_physical_block_size() interface was added to provide a way +dnl # to determine the smallest write which can be performed without a +dnl # read-modify-write operation. +dnl # +dnl # Unfortunately, this interface isn't entirely reliable because +dnl # drives are sometimes known to misreport this value. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ + ZFS_LINUX_TEST_SRC([bdev_physical_block_size], [ + #include + ],[ + struct block_device *bdev __attribute__ ((unused)) = NULL; + bdev_physical_block_size(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ + AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) + ZFS_LINUX_TEST_RESULT([bdev_physical_block_size], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bdev_physical_block_size()]) + ]) +]) + +dnl # +dnl # 2.6.30 API change +dnl # Added bdev_logical_block_size(). +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ + ZFS_LINUX_TEST_SRC([bdev_logical_block_size], [ + #include + ],[ + struct block_device *bdev __attribute__ ((unused)) = NULL; + bdev_logical_block_size(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ + AC_MSG_CHECKING([whether bdev_logical_block_size() is available]) + ZFS_LINUX_TEST_RESULT([bdev_logical_block_size], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bdev_logical_block_size()]) + ]) +]) + +dnl # +dnl # 5.11 API change +dnl # Added bdev_whole() helper. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE], [ + ZFS_LINUX_TEST_SRC([bdev_whole], [ + #include + ],[ + struct block_device *bdev = NULL; + bdev = bdev_whole(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ + AC_MSG_CHECKING([whether bdev_whole() is available]) + ZFS_LINUX_TEST_RESULT([bdev_whole], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_WHOLE, 1, [bdev_whole() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ + ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_SRC_BLKDEV_PUT + ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART + ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV + ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE + ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ + ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_BLKDEV_PUT + ZFS_AC_KERNEL_BLKDEV_REREAD_PART + ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV + ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV + ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE + ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE + ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE + ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE +]) diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4 index 5f2811c153..a48618185b 100644 --- a/config/kernel-block-device-operations.m4 +++ b/config/kernel-block-device-operations.m4 @@ -1,11 +1,8 @@ dnl # dnl # 2.6.38 API change dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ - AC_MSG_CHECKING([whether bops->check_events() exists]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ + ZFS_LINUX_TEST_SRC([block_device_operations_check_events], [ #include 
unsigned int blk_check_events(struct gendisk *disk, @@ -15,25 +12,23 @@ AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ bops __attribute__ ((unused)) = { .check_events = blk_check_events, }; - ],[ - ],[ + ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ + AC_MSG_CHECKING([whether bops->check_events() exists]) + ZFS_LINUX_TEST_RESULT([block_device_operations_check_events], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS, 1, - [bops->check_events() exists]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([bops->check_events()]) ]) - EXTRA_KCFLAGS="$tmp_flags" ]) dnl # dnl # 3.10.x API change dnl # -AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ - AC_MSG_CHECKING([whether bops->release() is void]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ + ZFS_LINUX_TEST_SRC([block_device_operations_release_void], [ #include void blk_release(struct gendisk *g, fmode_t mode) { return; } @@ -45,13 +40,56 @@ AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ .ioctl = NULL, .compat_ioctl = NULL, }; - ],[ - ],[ - AC_MSG_RESULT(void) - AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID, 1, - [bops->release() returns void]) - ],[ - AC_MSG_RESULT(int) - ]) - EXTRA_KCFLAGS="$tmp_flags" + ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ + AC_MSG_CHECKING([whether bops->release() is void]) + ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bops->release()]) + ]) +]) + +dnl # +dnl # 5.13 API change +dnl # block_device_operations->revalidate_disk() was removed +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ + ZFS_LINUX_TEST_SRC([block_device_operations_revalidate_disk], [ + 
#include + + int blk_revalidate_disk(struct gendisk *disk) { + return(0); + } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .revalidate_disk = blk_revalidate_disk, + }; + ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ + AC_MSG_CHECKING([whether bops->revalidate_disk() exists]) + ZFS_LINUX_TEST_RESULT([block_device_operations_revalidate_disk], [ + AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [1], + [Define if revalidate_disk() in block_device_operations]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [ + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS], [ + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) diff --git a/config/kernel-clear-inode.m4 b/config/kernel-clear-inode.m4 index 8d880fcd8f..3f454d7ec0 100644 --- a/config/kernel-clear-inode.m4 +++ b/config/kernel-clear-inode.m4 @@ -19,13 +19,18 @@ dnl # Therefore, to ensure we have the correct API we only allow the dnl # clear_inode() compatibility code to be defined iff the evict_inode() dnl # functionality is also detected. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_CLEAR_INODE], - [AC_MSG_CHECKING([whether clear_inode() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_CLEAR_INODE], [ + ZFS_LINUX_TEST_SRC([clear_inode], [ #include ], [ clear_inode(NULL); - ], [clear_inode], [fs/inode.c], [ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CLEAR_INODE], [ + AC_MSG_CHECKING([whether clear_inode() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([clear_inode], + [clear_inode], [fs/inode.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_CLEAR_INODE, 1, [clear_inode() is available]) ], [ diff --git a/config/kernel-commit-metadata.m4 b/config/kernel-commit-metadata.m4 index b66a16fd21..7df9b98029 100644 --- a/config/kernel-commit-metadata.m4 +++ b/config/kernel-commit-metadata.m4 @@ -4,20 +4,21 @@ dnl # Added eops->commit_metadata() callback to allow the underlying dnl # filesystem to determine the most efficient way to commit the inode. dnl # Prior to this the nfs server would issue an explicit fsync(). dnl # -AC_DEFUN([ZFS_AC_KERNEL_COMMIT_METADATA], [ - AC_MSG_CHECKING([whether eops->commit_metadata() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_COMMIT_METADATA], [ + ZFS_LINUX_TEST_SRC([export_operations_commit_metadata], [ #include int commit_metadata(struct inode *inode) { return 0; } static struct export_operations eops __attribute__ ((unused))={ .commit_metadata = commit_metadata, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_COMMIT_METADATA], [ + AC_MSG_CHECKING([whether eops->commit_metadata() exists]) + ZFS_LINUX_TEST_RESULT([export_operations_commit_metadata], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_COMMIT_METADATA, 1, - [eops->commit_metadata() exists]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([eops->commit_metadata()]) ]) ]) diff --git a/config/kernel-config-defined.m4 b/config/kernel-config-defined.m4 new file mode 100644 index 0000000000..c7d18b49b1 --- /dev/null +++ b/config/kernel-config-defined.m4 @@ -0,0 +1,152 @@ +dnl # +dnl # Certain kernel build 
options are not supported. These must be +dnl # detected at configure time and cause a build failure. Otherwise +dnl # modules may be successfully built that behave incorrectly. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [ + AS_IF([test "x$cross_compiling" != xyes], [ + AC_RUN_IFELSE([ + AC_LANG_PROGRAM([ + #include "$LINUX/include/linux/license.h" + ], [ + return !license_is_gpl_compatible( + "$ZFS_META_LICENSE"); + ]) + ], [ + AC_DEFINE([ZFS_IS_GPL_COMPATIBLE], [1], + [Define to 1 if GPL-only symbols can be used]) + ], [ + ]) + ]) + + ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC + ZFS_AC_KERNEL_SRC_CONFIG_TRIM_UNUSED_KSYMS + ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_INFLATE + ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_DEFLATE + + AC_MSG_CHECKING([for kernel config option compatibility]) + ZFS_LINUX_TEST_COMPILE_ALL([config]) + AC_MSG_RESULT([done]) + + ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC + ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS + ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE + ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE +]) + +dnl # +dnl # Check CONFIG_DEBUG_LOCK_ALLOC +dnl # +dnl # This is typically only set for debug kernels because it comes with +dnl # a performance penalty. However, when it is set it maps the non-GPL +dnl # symbol mutex_lock() to the GPL-only mutex_lock_nested() symbol. +dnl # This will cause a failure at link time which we'd rather know about +dnl # at compile time. +dnl # +dnl # Since we plan to pursue making mutex_lock_nested() a non-GPL symbol +dnl # with the upstream community we add a check to detect this case. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC], [ + ZFS_LINUX_TEST_SRC([config_debug_lock_alloc], [ + #include + ],[ + struct mutex lock; + + mutex_init(&lock); + mutex_lock(&lock); + mutex_unlock(&lock); + ], [], [ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC], [ + AC_MSG_CHECKING([whether mutex_lock() is GPL-only]) + ZFS_LINUX_TEST_RESULT([config_debug_lock_alloc_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_MSG_ERROR([ + *** Kernel built with CONFIG_DEBUG_LOCK_ALLOC which is incompatible + *** with the CDDL license and will prevent the module linking stage + *** from succeeding. You must rebuild your kernel without this + *** option enabled.]) + ]) +]) + +dnl # +dnl # Check CONFIG_TRIM_UNUSED_KSYMS +dnl # +dnl # Verify the kernel has CONFIG_TRIM_UNUSED_KSYMS disabled. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_TRIM_UNUSED_KSYMS], [ + ZFS_LINUX_TEST_SRC([config_trim_unusued_ksyms], [ + #if defined(CONFIG_TRIM_UNUSED_KSYMS) + #error CONFIG_TRIM_UNUSED_KSYMS not defined + #endif + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS], [ + AC_MSG_CHECKING([whether CONFIG_TRIM_UNUSED_KSYM is disabled]) + ZFS_LINUX_TEST_RESULT([config_trim_unusued_ksyms], [ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AS_IF([test "x$enable_linux_builtin" != xyes], [ + AC_MSG_ERROR([ + *** This kernel has unused symbols trimming enabled, please disable. + *** Rebuild the kernel with CONFIG_TRIM_UNUSED_KSYMS=n set.]) + ]) + ]) +]) + +dnl # +dnl # Check CONFIG_ZLIB_INFLATE +dnl # +dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_INFLATE], [ + ZFS_LINUX_TEST_SRC([config_zlib_inflate], [ + #if !defined(CONFIG_ZLIB_INFLATE) && \ + !defined(CONFIG_ZLIB_INFLATE_MODULE) + #error CONFIG_ZLIB_INFLATE not defined + #endif + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE], [ + AC_MSG_CHECKING([whether CONFIG_ZLIB_INFLATE is defined]) + ZFS_LINUX_TEST_RESULT([config_zlib_inflate], [ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([ + *** This kernel does not include the required zlib inflate support. + *** Rebuild the kernel with CONFIG_ZLIB_INFLATE=y|m set.]) + ]) +]) + +dnl # +dnl # Check CONFIG_ZLIB_DEFLATE +dnl # +dnl # Verify the kernel has CONFIG_ZLIB_DEFLATE support enabled. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_DEFLATE], [ + ZFS_LINUX_TEST_SRC([config_zlib_deflate], [ + #if !defined(CONFIG_ZLIB_DEFLATE) && \ + !defined(CONFIG_ZLIB_DEFLATE_MODULE) + #error CONFIG_ZLIB_DEFLATE not defined + #endif + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE], [ + AC_MSG_CHECKING([whether CONFIG_ZLIB_DEFLATE is defined]) + ZFS_LINUX_TEST_RESULT([config_zlib_deflate], [ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([ + *** This kernel does not include the required zlib deflate support. 
+ *** Rebuild the kernel with CONFIG_ZLIB_DEFLATE=y|m set.]) + ]) +]) diff --git a/config/kernel-create-nameidata.m4 b/config/kernel-create-nameidata.m4 deleted file mode 100644 index d4c155c57f..0000000000 --- a/config/kernel-create-nameidata.m4 +++ /dev/null @@ -1,30 +0,0 @@ -dnl # -dnl # 3.6 API change -dnl # -AC_DEFUN([ZFS_AC_KERNEL_CREATE_NAMEIDATA], [ - AC_MSG_CHECKING([whether iops->create() passes nameidata]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - - #ifdef HAVE_MKDIR_UMODE_T - int inode_create(struct inode *inode ,struct dentry *dentry, - umode_t umode, struct nameidata *nidata) { return 0; } - #else - int inode_create(struct inode *inode,struct dentry *dentry, - int umode, struct nameidata * nidata) { return 0; } - #endif - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .create = inode_create, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CREATE_NAMEIDATA, 1, - [iops->create() passes nameidata]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-ctl-table-name.m4 b/config/kernel-ctl-table-name.m4 deleted file mode 100644 index 3ce499968f..0000000000 --- a/config/kernel-ctl-table-name.m4 +++ /dev/null @@ -1,18 +0,0 @@ -dnl # -dnl # 2.6.33 API change, -dnl # Removed .ctl_name from struct ctl_table. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_CTL_NAME], [ - AC_MSG_CHECKING([whether struct ctl_table has ctl_name]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct ctl_table ctl __attribute__ ((unused)); - ctl.ctl_name = 0; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CTL_NAME, 1, [struct ctl_table has ctl_name]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-current-time.m4 b/config/kernel-current-time.m4 index c7d5c9b520..3ceb5f63ef 100644 --- a/config/kernel-current-time.m4 +++ b/config/kernel-current-time.m4 @@ -2,14 +2,19 @@ dnl # dnl # 4.9, current_time() added dnl # 4.18, return type changed from timespec to timespec64 dnl # -AC_DEFUN([ZFS_AC_KERNEL_CURRENT_TIME], - [AC_MSG_CHECKING([whether current_time() exists]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_CURRENT_TIME], [ + ZFS_LINUX_TEST_SRC([current_time], [ #include ], [ struct inode ip __attribute__ ((unused)); ip.i_atime = current_time(&ip); - ], [current_time], [fs/inode.c], [ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CURRENT_TIME], [ + AC_MSG_CHECKING([whether current_time() exists]) + ZFS_LINUX_TEST_RESULT_SYMBOL([current_time], + [current_time], [fs/inode.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_CURRENT_TIME, 1, [current_time() exists]) ], [ diff --git a/config/kernel-current_bio_tail.m4 b/config/kernel-current_bio_tail.m4 deleted file mode 100644 index b72f21e8a3..0000000000 --- a/config/kernel-current_bio_tail.m4 +++ /dev/null @@ -1,33 +0,0 @@ -dnl # -dnl # 2.6.34 API change -dnl # current->bio_tail and current->bio_list were struct bio pointers prior to -dnl # Linux 2.6.34. They were refactored into a struct bio_list pointer called -dnl # current->bio_list in Linux 2.6.34. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_CURRENT_BIO_TAIL], [ - AC_MSG_CHECKING([whether current->bio_tail exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - current->bio_tail = (struct bio **) NULL; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CURRENT_BIO_TAIL, 1, - [current->bio_tail exists]) - ],[ - AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether current->bio_list exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - current->bio_list = (struct bio_list *) NULL; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CURRENT_BIO_LIST, 1, - [current->bio_list exists]) - ],[ - AC_MSG_ERROR(no - Please file a bug report at - https://github.com/zfsonlinux/zfs/issues/new) - ]) - ]) -]) diff --git a/config/kernel-d-make-root.m4 b/config/kernel-d-make-root.m4 deleted file mode 100644 index 9c2b73dcbf..0000000000 --- a/config/kernel-d-make-root.m4 +++ /dev/null @@ -1,17 +0,0 @@ -dnl # -dnl # 3.4.0 API change -dnl # Added d_make_root() to replace previous d_alloc_root() function. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_D_MAKE_ROOT], - [AC_MSG_CHECKING([whether d_make_root() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - d_make_root(NULL); - ], [d_make_root], [fs/dcache.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_D_MAKE_ROOT, 1, [d_make_root() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-d-obtain-alias.m4 b/config/kernel-d-obtain-alias.m4 deleted file mode 100644 index 2b4b11eccc..0000000000 --- a/config/kernel-d-obtain-alias.m4 +++ /dev/null @@ -1,18 +0,0 @@ -dnl # -dnl # 2.6.28 API change -dnl # Added d_obtain_alias() helper function. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_D_OBTAIN_ALIAS], - [AC_MSG_CHECKING([whether d_obtain_alias() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - d_obtain_alias(NULL); - ], [d_obtain_alias], [fs/dcache.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_D_OBTAIN_ALIAS, 1, - [d_obtain_alias() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-d-prune-aliases.m4 b/config/kernel-d-prune-aliases.m4 deleted file mode 100644 index d9c521b1d4..0000000000 --- a/config/kernel-d-prune-aliases.m4 +++ /dev/null @@ -1,19 +0,0 @@ -dnl # -dnl # 2.6.12 API change -dnl # d_prune_aliases() helper function available. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_D_PRUNE_ALIASES], - [AC_MSG_CHECKING([whether d_prune_aliases() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - struct inode *ip = NULL; - d_prune_aliases(ip); - ], [d_prune_aliases], [fs/dcache.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_D_PRUNE_ALIASES, 1, - [d_prune_aliases() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-declare-event-class.m4 b/config/kernel-declare-event-class.m4 index 7867d75174..6c78ee858d 100644 --- a/config/kernel-declare-event-class.m4 +++ b/config/kernel-declare-event-class.m4 @@ -2,13 +2,10 @@ dnl # dnl # Ensure the DECLARE_EVENT_CLASS macro is available to non-GPL modules. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_DECLARE_EVENT_CLASS], [ - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-I\$(src)" - AC_MSG_CHECKING([whether DECLARE_EVENT_CLASS() is available]) ZFS_LINUX_TRY_COMPILE_HEADER([ #include - MODULE_LICENSE(ZFS_META_LICENSE); + MODULE_LICENSE("$ZFS_META_LICENSE"); #define CREATE_TRACE_POINTS #include "conftest.h" @@ -18,7 +15,7 @@ AC_DEFUN([ZFS_AC_KERNEL_DECLARE_EVENT_CLASS], [ ],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DECLARE_EVENT_CLASS, 1, - [DECLARE_EVENT_CLASS() is available]) + [DECLARE_EVENT_CLASS() is available]) ],[ AC_MSG_RESULT(no) ],[ @@ -55,5 +52,4 @@ AC_DEFUN([ZFS_AC_KERNEL_DECLARE_EVENT_CLASS], [ #define TRACE_INCLUDE_FILE conftest #include ]) - EXTRA_KCFLAGS="$tmp_flags" ]) diff --git a/config/kernel-dentry-operations.m4 b/config/kernel-dentry-operations.m4 index 61f5a27af5..dd470d7607 100644 --- a/config/kernel-dentry-operations.m4 +++ b/config/kernel-dentry-operations.m4 @@ -1,9 +1,100 @@ +dnl # +dnl # 3.4.0 API change +dnl # Added d_make_root() to replace previous d_alloc_root() function. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_MAKE_ROOT], [ + ZFS_LINUX_TEST_SRC([d_make_root], [ + #include + ], [ + d_make_root(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_MAKE_ROOT], [ + AC_MSG_CHECKING([whether d_make_root() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_make_root], + [d_make_root], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_MAKE_ROOT, 1, [d_make_root() is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.28 API change +dnl # Added d_obtain_alias() helper function. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS], [ + ZFS_LINUX_TEST_SRC([d_obtain_alias], [ + #include + ], [ + d_obtain_alias(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_OBTAIN_ALIAS], [ + AC_MSG_CHECKING([whether d_obtain_alias() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_obtain_alias], + [d_obtain_alias], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([d_obtain_alias()]) + ]) +]) + +dnl # +dnl # 2.6.12 API change +dnl # d_prune_aliases() helper function available. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_PRUNE_ALIASES], [ + ZFS_LINUX_TEST_SRC([d_prune_aliases], [ + #include + ], [ + struct inode *ip = NULL; + d_prune_aliases(ip); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_PRUNE_ALIASES], [ + AC_MSG_CHECKING([whether d_prune_aliases() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_prune_aliases], + [d_prune_aliases], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_PRUNE_ALIASES, 1, + [d_prune_aliases() is available]) + ], [ + ZFS_LINUX_TEST_ERROR([d_prune_aliases()]) + ]) +]) + +dnl # +dnl # 2.6.38 API change +dnl # Added d_set_d_op() helper function. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [ + ZFS_LINUX_TEST_SRC([d_set_d_op], [ + #include + ], [ + d_set_d_op(NULL, NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], [ + AC_MSG_CHECKING([whether d_set_d_op() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_set_d_op], + [d_set_d_op], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([d_set_d_op]) + ]) +]) + dnl # dnl # 3.6 API change dnl # -AC_DEFUN([ZFS_AC_KERNEL_D_REVALIDATE_NAMEIDATA], [ - AC_MSG_CHECKING([whether dops->d_revalidate() takes struct nameidata]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_REVALIDATE_NAMEIDATA], [ + ZFS_LINUX_TEST_SRC([dentry_operations_revalidate], [ #include #include @@ -14,11 +105,15 @@ AC_DEFUN([ZFS_AC_KERNEL_D_REVALIDATE_NAMEIDATA], [ dops __attribute__ ((unused)) = { .d_revalidate = revalidate, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_REVALIDATE_NAMEIDATA], [ + AC_MSG_CHECKING([whether dops->d_revalidate() takes struct nameidata]) + ZFS_LINUX_TEST_RESULT([dentry_operations_revalidate], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_D_REVALIDATE_NAMEIDATA, 1, - [dops->d_revalidate() operation takes nameidata]) + [dops->d_revalidate() operation takes nameidata]) ],[ AC_MSG_RESULT(no) ]) @@ -28,9 +123,8 @@ dnl # dnl # 2.6.30 API change dnl # The 'struct dentry_operations' was constified in the dentry structure. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONST_DENTRY_OPERATIONS], [ - AC_MSG_CHECKING([whether dentry uses const struct dentry_operations]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONST_DENTRY_OPERATIONS], [ + ZFS_LINUX_TEST_SRC([dentry_operations_const], [ #include const struct dentry_operations test_d_op = { @@ -38,51 +132,59 @@ AC_DEFUN([ZFS_AC_KERNEL_CONST_DENTRY_OPERATIONS], [ }; ],[ struct dentry d __attribute__ ((unused)); - d.d_op = &test_d_op; - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONST_DENTRY_OPERATIONS], [ + AC_MSG_CHECKING([whether dentry uses const struct dentry_operations]) + ZFS_LINUX_TEST_RESULT([dentry_operations_const], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_CONST_DENTRY_OPERATIONS, 1, - [dentry uses const struct dentry_operations]) + [dentry uses const struct dentry_operations]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([const dentry_operations]) ]) ]) dnl # dnl # 2.6.38 API change -dnl # Added d_set_d_op() helper function. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], - [AC_MSG_CHECKING([whether d_set_d_op() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - d_set_d_op(NULL, NULL); - ], [d_set_d_op], [fs/dcache.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_D_SET_D_OP, 1, - [d_set_d_op() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # 2.6.38 API chage dnl # Added sb->s_d_op default dentry_operations member dnl # -AC_DEFUN([ZFS_AC_KERNEL_S_D_OP], - [AC_MSG_CHECKING([whether super_block has s_d_op]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_S_D_OP], [ + ZFS_LINUX_TEST_SRC([super_block_s_d_op], [ #include ],[ struct super_block sb __attribute__ ((unused)); sb.s_d_op = NULL; - ], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_S_D_OP, 1, [struct super_block has s_d_op]) - ], [ - AC_MSG_RESULT(no) ]) ]) + +AC_DEFUN([ZFS_AC_KERNEL_S_D_OP], [ + AC_MSG_CHECKING([whether super_block has s_d_op]) + ZFS_LINUX_TEST_RESULT([super_block_s_d_op], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([super_block 
s_d_op]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_DENTRY], [ + ZFS_AC_KERNEL_SRC_D_MAKE_ROOT + ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS + ZFS_AC_KERNEL_SRC_D_PRUNE_ALIASES + ZFS_AC_KERNEL_SRC_D_SET_D_OP + ZFS_AC_KERNEL_SRC_D_REVALIDATE_NAMEIDATA + ZFS_AC_KERNEL_SRC_CONST_DENTRY_OPERATIONS + ZFS_AC_KERNEL_SRC_S_D_OP +]) + +AC_DEFUN([ZFS_AC_KERNEL_DENTRY], [ + ZFS_AC_KERNEL_D_MAKE_ROOT + ZFS_AC_KERNEL_D_OBTAIN_ALIAS + ZFS_AC_KERNEL_D_PRUNE_ALIASES + ZFS_AC_KERNEL_D_SET_D_OP + ZFS_AC_KERNEL_D_REVALIDATE_NAMEIDATA + ZFS_AC_KERNEL_CONST_DENTRY_OPERATIONS + ZFS_AC_KERNEL_S_D_OP +]) diff --git a/config/kernel-dirty-inode.m4 b/config/kernel-dirty-inode.m4 index ffd87bb146..dc7667fa48 100644 --- a/config/kernel-dirty-inode.m4 +++ b/config/kernel-dirty-inode.m4 @@ -4,9 +4,8 @@ dnl # The sops->dirty_inode() callbacks were updated to take a flags dnl # argument. This allows the greater control over whether the dnl # filesystem needs to push out a transaction or not. dnl # -AC_DEFUN([ZFS_AC_KERNEL_DIRTY_INODE_WITH_FLAGS], [ - AC_MSG_CHECKING([whether sops->dirty_inode() wants flags]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_DIRTY_INODE], [ + ZFS_LINUX_TEST_SRC([dirty_inode_with_flags], [ #include void dirty_inode(struct inode *a, int b) { return; } @@ -15,11 +14,15 @@ AC_DEFUN([ZFS_AC_KERNEL_DIRTY_INODE_WITH_FLAGS], [ sops __attribute__ ((unused)) = { .dirty_inode = dirty_inode, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_DIRTY_INODE], [ + AC_MSG_CHECKING([whether sops->dirty_inode() wants flags]) + ZFS_LINUX_TEST_RESULT([dirty_inode_with_flags], [ AC_MSG_RESULT([yes]) AC_DEFINE(HAVE_DIRTY_INODE_WITH_FLAGS, 1, - [sops->dirty_inode() wants flags]) + [sops->dirty_inode() wants flags]) ],[ AC_MSG_RESULT([no]) ]) diff --git a/config/kernel-discard-granularity.m4 b/config/kernel-discard-granularity.m4 index 2c677c9096..61326e6773 100644 --- a/config/kernel-discard-granularity.m4 +++ b/config/kernel-discard-granularity.m4 @@ -2,19 +2,20 @@ dnl # dnl # 2.6.33 API change 
dnl # Discard granularity and alignment restrictions may now be set. dnl # -AC_DEFUN([ZFS_AC_KERNEL_DISCARD_GRANULARITY], [ - AC_MSG_CHECKING([whether ql->discard_granularity is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY], [ + ZFS_LINUX_TEST_SRC([discard_granularity], [ #include ],[ struct queue_limits ql __attribute__ ((unused)); - ql.discard_granularity = 0; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_DISCARD_GRANULARITY, 1, - [ql->discard_granularity is available]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_DISCARD_GRANULARITY], [ + AC_MSG_CHECKING([whether ql->discard_granularity is available]) + ZFS_LINUX_TEST_RESULT([discard_granularity], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([ql->discard_granularity]) ]) ]) diff --git a/config/kernel-elevator-change.m4 b/config/kernel-elevator-change.m4 deleted file mode 100644 index eba252579b..0000000000 --- a/config/kernel-elevator-change.m4 +++ /dev/null @@ -1,25 +0,0 @@ -dnl # -dnl # 2.6.36 API, exported elevator_change() symbol -dnl # 4.12 API, removed elevator_change() symbol -dnl # -AC_DEFUN([ZFS_AC_KERNEL_ELEVATOR_CHANGE], [ - AC_MSG_CHECKING([whether elevator_change() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ - #include - #include - ],[ - int ret; - struct request_queue *q = NULL; - char *elevator = NULL; - ret = elevator_change(q, elevator); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_ELEVATOR_CHANGE, 1, - [elevator_change() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-encode-fh-inode.m4 b/config/kernel-encode-fh-inode.m4 index 287f62a5ed..9d4ba5f0f6 100644 --- a/config/kernel-encode-fh-inode.m4 +++ b/config/kernel-encode-fh-inode.m4 @@ -4,20 +4,23 @@ dnl # torvalds/linux@b0b0382bb4904965a9e9fca77ad87514dfda0d1c changed the dnl # ->encode_fh() callback to pass the child inode and its parents inode dnl # 
rather than a dentry and a boolean saying whether we want the parent. dnl # -AC_DEFUN([ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE], [ - AC_MSG_CHECKING([whether eops->encode_fh() wants inode]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE], [ + ZFS_LINUX_TEST_SRC([export_operations_encode_fh], [ #include int encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { return 0; } static struct export_operations eops __attribute__ ((unused))={ .encode_fh = encode_fh, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE], [ + AC_MSG_CHECKING([whether eops->encode_fh() wants inode]) + ZFS_LINUX_TEST_RESULT([export_operations_encode_fh], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_ENCODE_FH_WITH_INODE, 1, - [eops->encode_fh() wants child and parent inodes]) + [eops->encode_fh() wants child and parent inodes]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-evict-inode.m4 b/config/kernel-evict-inode.m4 index 683cedb6d3..66f10492de 100644 --- a/config/kernel-evict-inode.m4 +++ b/config/kernel-evict-inode.m4 @@ -3,19 +3,22 @@ dnl # 2.6.36 API change dnl # The sops->delete_inode() and sops->clear_inode() callbacks have dnl # replaced by a single sops->evict_inode() callback. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_EVICT_INODE], [ - AC_MSG_CHECKING([whether sops->evict_inode() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_EVICT_INODE], [ + ZFS_LINUX_TEST_SRC([evict_inode], [ #include void evict_inode (struct inode * t) { return; } static struct super_operations sops __attribute__ ((unused)) = { .evict_inode = evict_inode, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_EVICT_INODE], [ + AC_MSG_CHECKING([whether sops->evict_inode() exists]) + ZFS_LINUX_TEST_RESULT([evict_inode], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_EVICT_INODE, 1, [sops->evict_inode() exists]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([evict_inode]) ]) ]) diff --git a/config/kernel-fallocate-pax.m4 b/config/kernel-fallocate-pax.m4 deleted file mode 100644 index e8948be176..0000000000 --- a/config/kernel-fallocate-pax.m4 +++ /dev/null @@ -1,19 +0,0 @@ -dnl # -dnl # PaX Linux 2.6.38 - 3.x API -dnl # -AC_DEFUN([ZFS_AC_PAX_KERNEL_FILE_FALLOCATE], [ - AC_MSG_CHECKING([whether fops->fallocate() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - long (*fallocate) (struct file *, int, loff_t, loff_t) = NULL; - struct file_operations_no_const fops __attribute__ ((unused)) = { - .fallocate = fallocate, - }; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FILE_FALLOCATE, 1, [fops->fallocate() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-fallocate.m4 b/config/kernel-fallocate.m4 index 5509064725..7a8550f7e7 100644 --- a/config/kernel-fallocate.m4 +++ b/config/kernel-fallocate.m4 @@ -1,9 +1,10 @@ dnl # dnl # Linux 2.6.38 - 3.x API +dnl # The fallocate callback was moved from the inode_operations +dnl # structure to the file_operations structure. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_FILE_FALLOCATE], [ - AC_MSG_CHECKING([whether fops->fallocate() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FALLOCATE], [ + ZFS_LINUX_TEST_SRC([file_fallocate], [ #include long test_fallocate(struct file *file, int mode, @@ -13,44 +14,14 @@ AC_DEFUN([ZFS_AC_KERNEL_FILE_FALLOCATE], [ fops __attribute__ ((unused)) = { .fallocate = test_fallocate, }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FILE_FALLOCATE, 1, [fops->fallocate() exists]) - ],[ - AC_MSG_RESULT(no) - ]) + ], []) ]) -dnl # -dnl # Linux 2.6.x - 2.6.37 API -dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_FALLOCATE], [ - AC_MSG_CHECKING([whether iops->fallocate() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - - long test_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) { return 0; } - - static const struct inode_operations - fops __attribute__ ((unused)) = { - .fallocate = test_fallocate, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INODE_FALLOCATE, 1, [fops->fallocate() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # The fallocate callback was moved from the inode_operations -dnl # structure to the file_operations structure. -dnl # AC_DEFUN([ZFS_AC_KERNEL_FALLOCATE], [ - ZFS_AC_KERNEL_FILE_FALLOCATE - ZFS_AC_KERNEL_INODE_FALLOCATE + AC_MSG_CHECKING([whether fops->fallocate() exists]) + ZFS_LINUX_TEST_RESULT([file_fallocate], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([file_fallocate]) + ]) ]) diff --git a/config/kernel-file-dentry.m4 b/config/kernel-file-dentry.m4 index daf742ee1b..9cb5869c38 100644 --- a/config/kernel-file-dentry.m4 +++ b/config/kernel-file-dentry.m4 @@ -4,14 +4,18 @@ dnl # struct access file->f_path.dentry was replaced by accessor function dnl # since fix torvalds/linux@4bacc9c9234c ("overlayfs: Make f_path always dnl # point to the overlay and f_inode to the underlay"). 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_FILE_DENTRY], [ - AC_MSG_CHECKING([whether file_dentry() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILE_DENTRY], [ + ZFS_LINUX_TEST_SRC([file_dentry], [ #include ],[ struct file *f = NULL; file_dentry(f); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILE_DENTRY], [ + AC_MSG_CHECKING([whether file_dentry() is available]) + ZFS_LINUX_TEST_RESULT([file_dentry], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_FILE_DENTRY, 1, [file_dentry() is available]) ],[ diff --git a/config/kernel-file-inode.m4 b/config/kernel-file-inode.m4 index 300188fa3a..00a3621657 100644 --- a/config/kernel-file-inode.m4 +++ b/config/kernel-file-inode.m4 @@ -3,14 +3,18 @@ dnl # 3.19 API change dnl # struct access f->f_dentry->d_inode was replaced by accessor function dnl # file_inode(f) dnl # -AC_DEFUN([ZFS_AC_KERNEL_FILE_INODE], [ - AC_MSG_CHECKING([whether file_inode() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILE_INODE], [ + ZFS_LINUX_TEST_SRC([file_inode], [ #include ],[ struct file *f = NULL; file_inode(f); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILE_INODE], [ + AC_MSG_CHECKING([whether file_inode() is available]) + ZFS_LINUX_TEST_RESULT([file_inode], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_FILE_INODE, 1, [file_inode() is available]) ],[ diff --git a/config/kernel-fmode-t.m4 b/config/kernel-fmode-t.m4 index 4a23c391d3..5f111e21b4 100644 --- a/config/kernel-fmode-t.m4 +++ b/config/kernel-fmode-t.m4 @@ -2,17 +2,19 @@ dnl # dnl # 2.6.28 API change, dnl # check if fmode_t typedef is defined dnl # -AC_DEFUN([ZFS_AC_KERNEL_TYPE_FMODE_T], - [AC_MSG_CHECKING([whether kernel defines fmode_t]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FMODE_T], [ + ZFS_LINUX_TEST_SRC([type_fmode_t], [ #include ],[ fmode_t *ptr __attribute__ ((unused)); - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_FMODE_T, 1, - [kernel defines fmode_t]) - ],[ - AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FMODE_T], [ + 
AC_MSG_CHECKING([whether kernel defines fmode_t]) + ZFS_LINUX_TEST_RESULT([type_fmode_t], [ + AC_MSG_RESULT([yes]) + ],[ + ZFS_LINUX_TEST_ERROR([type_fmode_t]) ]) ]) diff --git a/config/kernel-follow-down-one.m4 b/config/kernel-follow-down-one.m4 index 63fa779d85..38c460d350 100644 --- a/config/kernel-follow-down-one.m4 +++ b/config/kernel-follow-down-one.m4 @@ -3,18 +3,20 @@ dnl # 2.6.38 API change dnl # follow_down() renamed follow_down_one(). The original follow_down() dnl # symbol still exists but will traverse down all the layers. dnl # -AC_DEFUN([ZFS_AC_KERNEL_FOLLOW_DOWN_ONE], [ - AC_MSG_CHECKING([whether follow_down_one() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE], [ + ZFS_LINUX_TEST_SRC([follow_down_one], [ #include ],[ struct path *p = NULL; follow_down_one(p); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FOLLOW_DOWN_ONE, 1, - [follow_down_one() is available]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FOLLOW_DOWN_ONE], [ + AC_MSG_CHECKING([whether follow_down_one() is available]) + ZFS_LINUX_TEST_RESULT([follow_down_one], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([follow_down_one()]) ]) ]) diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 index 5fff79a74c..4d6fe05228 100644 --- a/config/kernel-fpu.m4 +++ b/config/kernel-fpu.m4 @@ -2,8 +2,9 @@ dnl # dnl # Handle differences in kernel FPU code. dnl # dnl # Kernel -dnl # 5.0: All kernel fpu functions are GPL only, so we can't use them. -dnl # (nothing defined) +dnl # 5.0: Wrappers have been introduced to save/restore the FPU state. +dnl # This change was made to the 4.19.38 and 4.14.120 LTS kernels. 
+dnl # HAVE_KERNEL_FPU_INTERNAL dnl # dnl # 4.2: Use __kernel_fpu_{begin,end}() dnl # HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU @@ -11,55 +12,120 @@ dnl # dnl # Pre-4.2: Use kernel_fpu_{begin,end}() dnl # HAVE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU dnl # -AC_DEFUN([ZFS_AC_KERNEL_FPU], [ - AC_MSG_CHECKING([which kernel_fpu header to use]) +dnl # N.B. The header check is performed before all other checks since it +dnl # depends on HAVE_KERNEL_FPU_API_HEADER being set in confdefs.h. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_FPU_HEADER], [ + AC_MSG_CHECKING([whether fpu headers are available]) ZFS_LINUX_TRY_COMPILE([ #include #include ],[ ],[ - AC_DEFINE(HAVE_KERNEL_FPU_API_HEADER, 1, [kernel has asm/fpu/api.h]) + AC_DEFINE(HAVE_KERNEL_FPU_API_HEADER, 1, + [kernel has asm/fpu/api.h]) AC_MSG_RESULT(asm/fpu/api.h) ],[ AC_MSG_RESULT(i387.h & xcr.h) ]) +]) - AC_MSG_CHECKING([which kernel_fpu function to use]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include +AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ + ZFS_LINUX_TEST_SRC([kernel_fpu], [ + #include #ifdef HAVE_KERNEL_FPU_API_HEADER #include #else #include #include #endif - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ + ], [ kernel_fpu_begin(); kernel_fpu_end(); - ], [kernel_fpu_begin], [arch/x86/kernel/fpu/core.c], [ - AC_MSG_RESULT(kernel_fpu_*) - AC_DEFINE(HAVE_KERNEL_FPU, 1, [kernel has kernel_fpu_* functions]) - AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) + ], [], [ZFS_META_LICENSE]) + + ZFS_LINUX_TEST_SRC([__kernel_fpu], [ + #include + #ifdef HAVE_KERNEL_FPU_API_HEADER + #include + #else + #include + #include + #endif + ], [ + __kernel_fpu_begin(); + __kernel_fpu_end(); + ], [], [ZFS_META_LICENSE]) + + ZFS_LINUX_TEST_SRC([fpu_internal], [ + #if defined(__x86_64) || defined(__x86_64__) || \ + defined(__i386) || defined(__i386__) + #if !defined(__x86) + #define __x86 + #endif + #endif + + #if !defined(__x86) + #error Unsupported architecture + #endif + + #include + #ifdef HAVE_KERNEL_FPU_API_HEADER + #include + 
#include + #else + #include + #include + #endif + + #if !defined(XSTATE_XSAVE) + #error XSTATE_XSAVE not defined + #endif + + #if !defined(XSTATE_XRESTORE) + #error XSTATE_XRESTORE not defined + #endif ],[ - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - #ifdef HAVE_KERNEL_FPU_API_HEADER - #include - #else - #include - #include - #endif - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ - __kernel_fpu_begin(); - __kernel_fpu_end(); - ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ + struct fpu *fpu = ¤t->thread.fpu; + union fpregs_state *st = &fpu->state; + struct fregs_state *fr __attribute__ ((unused)) = &st->fsave; + struct fxregs_state *fxr __attribute__ ((unused)) = &st->fxsave; + struct xregs_state *xr __attribute__ ((unused)) = &st->xsave; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FPU], [ + dnl # + dnl # Legacy kernel + dnl # + AC_MSG_CHECKING([whether kernel fpu is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([kernel_fpu_license], + [kernel_fpu_begin], [arch/x86/kernel/fpu/core.c], [ + AC_MSG_RESULT(kernel_fpu_*) + AC_DEFINE(HAVE_KERNEL_FPU, 1, + [kernel has kernel_fpu_* functions]) + AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, + [kernel exports FPU functions]) + ],[ + dnl # + dnl # Linux 4.2 kernel + dnl # + ZFS_LINUX_TEST_RESULT_SYMBOL([__kernel_fpu_license], + [__kernel_fpu_begin], + [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ AC_MSG_RESULT(__kernel_fpu_*) - AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions]) - AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) + AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, + [kernel has __kernel_fpu_* functions]) + AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, + [kernel exports FPU functions]) ],[ - AC_MSG_RESULT(not exported) + ZFS_LINUX_TEST_RESULT([fpu_internal], [ + AC_MSG_RESULT(internal) + AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1, + [kernel fpu internal]) + ],[ + AC_MSG_RESULT(unavailable) + ]) ]) ]) ]) diff --git a/config/kernel-fst-mount.m4 
b/config/kernel-fst-mount.m4 index a8ac50bdd5..576f5f0129 100644 --- a/config/kernel-fst-mount.m4 +++ b/config/kernel-fst-mount.m4 @@ -3,9 +3,8 @@ dnl # 2.6.38 API change dnl # The .get_sb callback has been replaced by a .mount callback dnl # in the file_system_type structure. dnl # -AC_DEFUN([ZFS_AC_KERNEL_FST_MOUNT], [ - AC_MSG_CHECKING([whether fst->mount() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FST_MOUNT], [ + ZFS_LINUX_TEST_SRC([file_system_type_mount], [ #include static struct dentry * @@ -18,11 +17,14 @@ AC_DEFUN([ZFS_AC_KERNEL_FST_MOUNT], [ static struct file_system_type fst __attribute__ ((unused)) = { .mount = mount, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FST_MOUNT], [ + AC_MSG_CHECKING([whether fst->mount() exists]) + ZFS_LINUX_TEST_RESULT([file_system_type_mount], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FST_MOUNT, 1, [fst->mount() exists]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([fst->mount()]) ]) ]) diff --git a/config/kernel-fsync.m4 b/config/kernel-fsync.m4 index e1f2d68b9b..d198191d3a 100644 --- a/config/kernel-fsync.m4 +++ b/config/kernel-fsync.m4 @@ -1,31 +1,8 @@ dnl # -dnl # Linux 2.6.x - 2.6.34 API +dnl # Check file_operations->fsync interface. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_FSYNC_WITH_DENTRY], [ - ZFS_LINUX_TRY_COMPILE([ - #include - - int test_fsync(struct file *f, struct dentry *dentry, int x) - { return 0; } - - static const struct file_operations - fops __attribute__ ((unused)) = { - .fsync = test_fsync, - }; - ],[ - ],[ - AC_MSG_RESULT([dentry]) - AC_DEFINE(HAVE_FSYNC_WITH_DENTRY, 1, - [fops->fsync() with dentry]) - ],[ - ]) -]) - -dnl # -dnl # Linux 2.6.35 - Linux 3.0 API -dnl # -AC_DEFUN([ZFS_AC_KERNEL_FSYNC_WITHOUT_DENTRY], [ - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FSYNC], [ + ZFS_LINUX_TEST_SRC([fsync_without_dentry], [ #include int test_fsync(struct file *f, int x) { return 0; } @@ -34,20 +11,9 @@ AC_DEFUN([ZFS_AC_KERNEL_FSYNC_WITHOUT_DENTRY], [ fops __attribute__ ((unused)) = { .fsync = test_fsync, }; - ],[ - ],[ - AC_MSG_RESULT([no dentry]) - AC_DEFINE(HAVE_FSYNC_WITHOUT_DENTRY, 1, - [fops->fsync() without dentry]) - ],[ - ]) -]) + ],[]) -dnl # -dnl # Linux 3.1 - 3.x API -dnl # -AC_DEFUN([ZFS_AC_KERNEL_FSYNC_RANGE], [ - ZFS_LINUX_TRY_COMPILE([ + ZFS_LINUX_TEST_SRC([fsync_range], [ #include int test_fsync(struct file *f, loff_t a, loff_t b, int c) @@ -57,18 +23,31 @@ AC_DEFUN([ZFS_AC_KERNEL_FSYNC_RANGE], [ fops __attribute__ ((unused)) = { .fsync = test_fsync, }; - ],[ - ],[ - AC_MSG_RESULT([range]) - AC_DEFINE(HAVE_FSYNC_RANGE, 1, - [fops->fsync() with range]) - ],[ - ]) + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_FSYNC], [ - AC_MSG_CHECKING([whether fops->fsync() wants]) - ZFS_AC_KERNEL_FSYNC_WITH_DENTRY - ZFS_AC_KERNEL_FSYNC_WITHOUT_DENTRY - ZFS_AC_KERNEL_FSYNC_RANGE + dnl # + dnl # Linux 2.6.35 - Linux 3.0 API + dnl # + AC_MSG_CHECKING([whether fops->fsync() wants no dentry]) + ZFS_LINUX_TEST_RESULT([fsync_without_dentry], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FSYNC_WITHOUT_DENTRY, 1, + [fops->fsync() without dentry]) + ],[ + AC_MSG_RESULT([no]) + + dnl # + dnl # Linux 3.1 - 3.x API + dnl # + AC_MSG_CHECKING([whether fops->fsync() wants range]) + 
ZFS_LINUX_TEST_RESULT([fsync_range], [ + AC_MSG_RESULT([range]) + AC_DEFINE(HAVE_FSYNC_RANGE, 1, + [fops->fsync() with range]) + ],[ + ZFS_LINUX_TEST_ERROR([fops->fsync]) + ]) + ]) ]) diff --git a/config/kernel-generic_fillattr.m4 b/config/kernel-generic_fillattr.m4 new file mode 100644 index 0000000000..0acd5d5310 --- /dev/null +++ b/config/kernel-generic_fillattr.m4 @@ -0,0 +1,28 @@ +dnl # +dnl # 5.12 API +dnl # +dnl # generic_fillattr in linux/fs.h now requires a struct user_namespace* +dnl # as the first arg, to support idmapped mounts. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR_USERNS], [ + ZFS_LINUX_TEST_SRC([generic_fillattr_userns], [ + #include + ],[ + struct user_namespace *userns = NULL; + struct inode *in = NULL; + struct kstat *k = NULL; + generic_fillattr(userns, in, k); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR_USERNS], [ + AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, + [generic_fillattr requires struct user_namespace*]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + diff --git a/config/kernel-generic_io_acct.m4 b/config/kernel-generic_io_acct.m4 index 0aa7621622..0f4381db4c 100644 --- a/config/kernel-generic_io_acct.m4 +++ b/config/kernel-generic_io_acct.m4 @@ -1,12 +1,29 @@ dnl # -dnl # 3.19 API addition +dnl # Check for generic io accounting interface. dnl # -dnl # torvalds/linux@394ffa503bc40e32d7f54a9b817264e81ce131b4 allows us to -dnl # increment iostat counters without generic_make_request(). 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT_3ARG], [ - AC_MSG_CHECKING([whether 3 arg generic IO accounting symbols are available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT], [ + ZFS_LINUX_TEST_SRC([disk_io_acct], [ + #include + ], [ + struct gendisk *disk = NULL; + struct bio *bio = NULL; + unsigned long start_time; + + start_time = disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio)); + disk_end_io_acct(disk, bio_op(bio), start_time); + ]) + + ZFS_LINUX_TEST_SRC([bio_io_acct], [ + #include + ], [ + struct bio *bio = NULL; + unsigned long start_time; + + start_time = bio_start_io_acct(bio); + bio_end_io_acct(bio, start_time); + ]) + + ZFS_LINUX_TEST_SRC([generic_acct_3args], [ #include void (*generic_start_io_acct_f)(int, unsigned long, @@ -16,24 +33,9 @@ AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT_3ARG], [ ], [ generic_start_io_acct(0, 0, NULL); generic_end_io_acct(0, NULL, 0); - ], [generic_start_io_acct], [block/bio.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_IO_ACCT_3ARG, 1, - [generic_start_io_acct()/generic_end_io_acct() available]) - ], [ - AC_MSG_RESULT(no) ]) -]) -dnl # -dnl # Linux 4.14 API, -dnl # -dnl # generic_start_io_acct/generic_end_io_acct now require request_queue to be -dnl # provided. 
No functional changes, but preparation for inflight accounting -dnl # -AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT_4ARG], [ - AC_MSG_CHECKING([whether 4 arg generic IO accounting symbols are available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ + ZFS_LINUX_TEST_SRC([generic_acct_4args], [ #include void (*generic_start_io_acct_f)(struct request_queue *, int, @@ -43,11 +45,68 @@ AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT_4ARG], [ ], [ generic_start_io_acct(NULL, 0, 0, NULL); generic_end_io_acct(NULL, 0, NULL, 0); - ], [generic_start_io_acct], [block/bio.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_IO_ACCT_4ARG, 1, - [generic_start_io_acct()/generic_end_io_acct() 4 arg available]) - ], [ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT], [ + dnl # + dnl # 5.12 API, + dnl # + dnl # bio_start_io_acct() and bio_end_io_acct() became GPL-exported + dnl # so use disk_start_io_acct() and disk_end_io_acct() instead + dnl # + AC_MSG_CHECKING([whether generic disk_*_io_acct() are available]) + ZFS_LINUX_TEST_RESULT([disk_io_acct], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DISK_IO_ACCT, 1, [disk_*_io_acct() available]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 5.7 API, + dnl # + dnl # Added bio_start_io_acct() and bio_end_io_acct() helpers. + dnl # + AC_MSG_CHECKING([whether generic bio_*_io_acct() are available]) + ZFS_LINUX_TEST_RESULT([bio_io_acct], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_IO_ACCT, 1, [bio_*_io_acct() available]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.14 API, + dnl # + dnl # generic_start_io_acct/generic_end_io_acct now require + dnl # request_queue to be provided. No functional changes, + dnl # but preparation for inflight accounting. 
+ dnl # + AC_MSG_CHECKING([whether generic_*_io_acct wants 4 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_4args], + [generic_start_io_acct], [block/bio.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_IO_ACCT_4ARG, 1, + [generic_*_io_acct() 4 arg available]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 3.19 API addition + dnl # + dnl # torvalds/linux@394ffa50 allows us to increment + dnl # iostat counters without generic_make_request(). + dnl # + AC_MSG_CHECKING( + [whether generic_*_io_acct wants 3 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_3args], + [generic_start_io_acct], [block/bio.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_IO_ACCT_3ARG, 1, + [generic_*_io_acct() 3 arg available]) + ], [ + AC_MSG_RESULT(no) + ]) + ]) + ]) ]) ]) diff --git a/config/kernel-generic_readlink.m4 b/config/kernel-generic_readlink.m4 index 914431de4f..a7a33b408a 100644 --- a/config/kernel-generic_readlink.m4 +++ b/config/kernel-generic_readlink.m4 @@ -4,18 +4,21 @@ dnl # dnl # NULL inode_operations.readlink implies generic_readlink(), which dnl # has been made static. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL], [ - AC_MSG_CHECKING([whether generic_readlink is global]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL], [ + ZFS_LINUX_TEST_SRC([generic_readlink_global], [ #include ],[ int i __attribute__ ((unused)); - i = generic_readlink(NULL, NULL, 0); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL], [ + AC_MSG_CHECKING([whether generic_readlink is global]) + ZFS_LINUX_TEST_RESULT([generic_readlink_global], [ AC_MSG_RESULT([yes]) AC_DEFINE(HAVE_GENERIC_READLINK, 1, - [generic_readlink is global]) + [generic_readlink is global]) ],[ AC_MSG_RESULT([no]) ]) diff --git a/config/kernel-get-disk-and-module.m4 b/config/kernel-get-disk-and-module.m4 deleted file mode 100644 index 2a51a5af7d..0000000000 --- a/config/kernel-get-disk-and-module.m4 +++ /dev/null @@ -1,19 +0,0 @@ -dnl # -dnl # 4.16 API change -dnl # Verify if get_disk_and_module() symbol is available. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_GET_DISK_AND_MODULE], - [AC_MSG_CHECKING([whether get_disk_and_module() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - struct gendisk *disk = NULL; - (void) get_disk_and_module(disk); - ], [get_disk_and_module], [block/genhd.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GET_DISK_AND_MODULE, - 1, [get_disk_and_module() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-get-disk-ro.m4 b/config/kernel-get-disk-ro.m4 index 13ed81217e..8a379c7669 100644 --- a/config/kernel-get-disk-ro.m4 +++ b/config/kernel-get-disk-ro.m4 @@ -1,21 +1,20 @@ dnl # dnl # 2.6.x API change dnl # -AC_DEFUN([ZFS_AC_KERNEL_GET_DISK_RO], [ - AC_MSG_CHECKING([whether get_disk_ro() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_DISK_RO], [ + ZFS_LINUX_TEST_SRC([get_disk_ro], [ #include ],[ struct gendisk *disk = NULL; (void) get_disk_ro(disk); - ],[ - AC_MSG_RESULT(yes) - 
AC_DEFINE(HAVE_GET_DISK_RO, 1, - [blk_disk_ro() is available]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" + ], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GET_DISK_RO], [ + AC_MSG_CHECKING([whether get_disk_ro() is available]) + ZFS_LINUX_TEST_RESULT([get_disk_ro], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([get_disk_ro()]) + ]) ]) diff --git a/config/kernel-get-link.m4 b/config/kernel-get-link.m4 index 3cda08c1b4..e4f478e37c 100644 --- a/config/kernel-get-link.m4 +++ b/config/kernel-get-link.m4 @@ -1,13 +1,29 @@ dnl # dnl # Supported get_link() interfaces checked newest to oldest. +dnl # Note this interface used to be named follow_link. dnl # -AC_DEFUN([ZFS_AC_KERNEL_FOLLOW_LINK], [ - dnl # - dnl # 4.2 API change - dnl # - This kernel retired the nameidata structure. - dnl # - AC_MSG_CHECKING([whether iops->follow_link() passes cookie]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_LINK], [ + ZFS_LINUX_TEST_SRC([inode_operations_get_link], [ + #include + const char *get_link(struct dentry *de, struct inode *ip, + struct delayed_call *done) { return "symlink"; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .get_link = get_link, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_get_link_cookie], [ + #include + const char *get_link(struct dentry *de, struct + inode *ip, void **cookie) { return "symlink"; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .get_link = get_link, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_follow_link], [ #include const char *follow_link(struct dentry *de, void **cookie) { return "symlink"; } @@ -15,35 +31,17 @@ AC_DEFUN([ZFS_AC_KERNEL_FOLLOW_LINK], [ iops __attribute__ ((unused)) = { .follow_link = follow_link, }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FOLLOW_LINK_COOKIE, 1, - [iops->follow_link() cookie]) - ],[ - dnl # - dnl # 2.6.32 API - dnl # - AC_MSG_RESULT(no) - AC_MSG_CHECKING( - [whether 
iops->follow_link() passes nameidata]) - ZFS_LINUX_TRY_COMPILE([ + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_follow_link_nameidata], [ #include - void *follow_link(struct dentry *de, struct - nameidata *nd) { return (void *)NULL; } - static struct inode_operations - iops __attribute__ ((unused)) = { - .follow_link = follow_link, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FOLLOW_LINK_NAMEIDATA, 1, - [iops->follow_link() nameidata]) - ],[ - AC_MSG_ERROR(no; please file a bug report) - ]) - ]) + void *follow_link(struct dentry *de, struct + nameidata *nd) { return (void *)NULL; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .follow_link = follow_link, + }; + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_GET_LINK], [ @@ -53,20 +51,12 @@ AC_DEFUN([ZFS_AC_KERNEL_GET_LINK], [ dnl # used it to retire the put_link() interface. dnl # AC_MSG_CHECKING([whether iops->get_link() passes delayed]) - ZFS_LINUX_TRY_COMPILE([ - #include - const char *get_link(struct dentry *de, struct inode *ip, - struct delayed_call *done) { return "symlink"; } - static struct inode_operations - iops __attribute__ ((unused)) = { - .get_link = get_link, - }; - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([inode_operations_get_link], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GET_LINK_DELAYED, 1, - [iops->get_link() delayed]) + AC_DEFINE(HAVE_GET_LINK_DELAYED, 1, [iops->get_link() delayed]) ],[ + AC_MSG_RESULT(no) + dnl # dnl # 4.5 API change dnl # The follow_link() interface has been replaced by @@ -74,27 +64,41 @@ AC_DEFUN([ZFS_AC_KERNEL_GET_LINK], [ dnl # - An inode is passed as a separate argument dnl # - When called in RCU mode a NULL dentry is passed. 
dnl # - AC_MSG_RESULT(no) AC_MSG_CHECKING([whether iops->get_link() passes cookie]) - ZFS_LINUX_TRY_COMPILE([ - #include - const char *get_link(struct dentry *de, struct - inode *ip, void **cookie) { return "symlink"; } - static struct inode_operations - iops __attribute__ ((unused)) = { - .get_link = get_link, - }; - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([inode_operations_get_link_cookie], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_GET_LINK_COOKIE, 1, [iops->get_link() cookie]) ],[ - dnl # - dnl # Check for the follow_link APIs. - dnl # AC_MSG_RESULT(no) - ZFS_AC_KERNEL_FOLLOW_LINK + + dnl # + dnl # 4.2 API change + dnl # This kernel retired the nameidata structure. + dnl # + AC_MSG_CHECKING( + [whether iops->follow_link() passes cookie]) + ZFS_LINUX_TEST_RESULT([inode_operations_follow_link], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FOLLOW_LINK_COOKIE, 1, + [iops->follow_link() cookie]) + ],[ + AC_MSG_RESULT(no) + + dnl # + dnl # 2.6.32 API + dnl # + AC_MSG_CHECKING( + [whether iops->follow_link() passes nameidata]) + ZFS_LINUX_TEST_RESULT( + [inode_operations_follow_link_nameidata],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FOLLOW_LINK_NAMEIDATA, 1, + [iops->follow_link() nameidata]) + ],[ + ZFS_LINUX_TEST_ERROR([get_link]) + ]) + ]) ]) ]) ]) diff --git a/config/kernel-global_page_state.m4 b/config/kernel-global_page_state.m4 index f4a40011f6..badb5e5d2e 100644 --- a/config/kernel-global_page_state.m4 +++ b/config/kernel-global_page_state.m4 @@ -4,16 +4,21 @@ dnl # dnl # 75ef71840539 mm, vmstat: add infrastructure for per-node vmstats dnl # 599d0c954f91 mm, vmscan: move LRU lists to node dnl # -AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_NODE_PAGE_STATE], [ - AC_MSG_CHECKING([whether global_node_page_state() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_NODE_PAGE_STATE], [ + ZFS_LINUX_TEST_SRC([global_node_page_state], [ #include #include ],[ (void) global_node_page_state(0); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_NODE_PAGE_STATE], [ + 
AC_MSG_CHECKING([whether global_node_page_state() exists]) + ZFS_LINUX_TEST_RESULT([global_node_page_state], [ AC_MSG_RESULT(yes) - AC_DEFINE(ZFS_GLOBAL_NODE_PAGE_STATE, 1, [global_node_page_state() exists]) + AC_DEFINE(ZFS_GLOBAL_NODE_PAGE_STATE, 1, + [global_node_page_state() exists]) ],[ AC_MSG_RESULT(no) ]) @@ -24,16 +29,21 @@ dnl # 4.14 API change dnl # dnl # c41f012ade0b mm: rename global_page_state to global_zone_page_state dnl # -AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE], [ - AC_MSG_CHECKING([whether global_zone_page_state() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_ZONE_PAGE_STATE], [ + ZFS_LINUX_TEST_SRC([global_zone_page_state], [ #include #include ],[ (void) global_zone_page_state(0); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE], [ + AC_MSG_CHECKING([whether global_zone_page_state() exists]) + ZFS_LINUX_TEST_RESULT([global_zone_page_state], [ AC_MSG_RESULT(yes) - AC_DEFINE(ZFS_GLOBAL_ZONE_PAGE_STATE, 1, [global_zone_page_state() exists]) + AC_DEFINE(ZFS_GLOBAL_ZONE_PAGE_STATE, 1, + [global_zone_page_state() exists]) ],[ AC_MSG_RESULT(no) ]) @@ -44,9 +54,11 @@ dnl # Create a define and autoconf variable for an enum member dnl # AC_DEFUN([ZFS_AC_KERNEL_ENUM_MEMBER], [ AC_MSG_CHECKING([whether enum $2 contains $1]) - AS_IF([AC_TRY_COMMAND("${srcdir}/scripts/enum-extract.pl" "$2" "$3" | egrep -qx $1)],[ + AS_IF([AC_TRY_COMMAND( + "${srcdir}/scripts/enum-extract.pl" "$2" "$3" | egrep -qx $1)],[ AC_MSG_RESULT([yes]) - AC_DEFINE(m4_join([_], [ZFS_ENUM], m4_toupper($2), $1), 1, [enum $2 contains $1]) + AC_DEFINE(m4_join([_], [ZFS_ENUM], m4_toupper($2), $1), 1, + [enum $2 contains $1]) m4_join([_], [ZFS_ENUM], m4_toupper($2), $1)=1 ],[ AC_MSG_RESULT([no]) @@ -59,8 +71,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_ERROR],[ AC_MSG_RESULT(no) AC_MSG_RESULT([$1 in either node_stat_item or zone_stat_item: $2]) - AC_MSG_RESULT([configure needs updating, see: config/kernel-global_page_state.m4]) - 
AC_MSG_FAILURE([SHUT 'ER DOWN CLANCY, SHE'S PUMPIN' MUD!]) + ZFS_LINUX_TEST_ERROR([global page state]) ]) AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK], [ @@ -75,19 +86,23 @@ AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK], [ ]) dnl # -dnl # Ensure the config tests are finding one and only one of each enum of interest +dnl # Ensure the config tests are finding one and only one of each enum. dnl # AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE_SANITY], [ - AC_MSG_CHECKING([global_page_state enums are sane]) + AC_MSG_CHECKING([whether global_page_state enums are sane]) ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_FILE_PAGES]) ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_INACTIVE_ANON]) ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_INACTIVE_FILE]) - ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_SLAB_RECLAIMABLE]) AC_MSG_RESULT(yes) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE], [ + ZFS_AC_KERNEL_SRC_GLOBAL_NODE_PAGE_STATE + ZFS_AC_KERNEL_SRC_GLOBAL_ZONE_PAGE_STATE +]) + dnl # dnl # enum members in which we're interested dnl # @@ -95,15 +110,19 @@ AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE], [ ZFS_AC_KERNEL_GLOBAL_NODE_PAGE_STATE ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE - ZFS_AC_KERNEL_ENUM_MEMBER([NR_FILE_PAGES], [node_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_ANON], [node_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_FILE], [node_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_SLAB_RECLAIMABLE], [node_stat_item], [$LINUX/include/linux/mmzone.h]) + ZFS_AC_KERNEL_ENUM_MEMBER([NR_FILE_PAGES], + [node_stat_item], [$LINUX/include/linux/mmzone.h]) + ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_ANON], + [node_stat_item], [$LINUX/include/linux/mmzone.h]) + ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_FILE], + [node_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_FILE_PAGES], [zone_stat_item], [$LINUX/include/linux/mmzone.h]) - 
ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_ANON], [zone_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_FILE], [zone_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_SLAB_RECLAIMABLE], [zone_stat_item], [$LINUX/include/linux/mmzone.h]) + ZFS_AC_KERNEL_ENUM_MEMBER([NR_FILE_PAGES], + [zone_stat_item], [$LINUX/include/linux/mmzone.h]) + ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_ANON], + [zone_stat_item], [$LINUX/include/linux/mmzone.h]) + ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_FILE], + [zone_stat_item], [$LINUX/include/linux/mmzone.h]) ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE_SANITY ]) diff --git a/config/kernel-group-info.m4 b/config/kernel-group-info.m4 index 849a1e246a..0fee1d36d5 100644 --- a/config/kernel-group-info.m4 +++ b/config/kernel-group-info.m4 @@ -2,20 +2,21 @@ dnl # dnl # 4.9 API change dnl # group_info changed from 2d array via >blocks to 1d array via ->gid dnl # -AC_DEFUN([ZFS_AC_KERNEL_GROUP_INFO_GID], [ - AC_MSG_CHECKING([whether group_info->gid exists]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_GROUP_INFO_GID], [ + ZFS_LINUX_TEST_SRC([group_info_gid], [ #include ],[ struct group_info *gi = groups_alloc(1); gi->gid[0] = KGIDT_INIT(0); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GROUP_INFO_GID], [ + AC_MSG_CHECKING([whether group_info->gid exists]) + ZFS_LINUX_TEST_RESULT([group_info_gid], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_GROUP_INFO_GID, 1, [group_info->gid exists]) ],[ AC_MSG_RESULT(no) ]) - EXTRA_KCFLAGS="$tmp_flags" ]) diff --git a/config/kernel-hotplug.m4 b/config/kernel-hotplug.m4 new file mode 100644 index 0000000000..e796a6d2e8 --- /dev/null +++ b/config/kernel-hotplug.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # 4.6 API change +dnl # Added CPU hotplug APIs +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CPU_HOTPLUG], [ + ZFS_LINUX_TEST_SRC([cpu_hotplug], [ + #include + ],[ + enum cpuhp_state state = CPUHP_ONLINE; + int (*fp)(unsigned int, 
struct hlist_node *) = NULL; + cpuhp_state_add_instance_nocalls(0, (struct hlist_node *)NULL); + cpuhp_state_remove_instance_nocalls(0, (struct hlist_node *)NULL); + cpuhp_setup_state_multi(state, "", fp, fp); + cpuhp_remove_multi_state(0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CPU_HOTPLUG], [ + AC_MSG_CHECKING([whether CPU hotplug APIs exist]) + ZFS_LINUX_TEST_RESULT([cpu_hotplug], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CPU_HOTPLUG, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-in-compat-syscall.m4 b/config/kernel-in-compat-syscall.m4 index 9fca9da20e..baaac8c4fd 100644 --- a/config/kernel-in-compat-syscall.m4 +++ b/config/kernel-in-compat-syscall.m4 @@ -4,13 +4,17 @@ dnl # Added in_compat_syscall() which can be overridden on a per- dnl # architecture basis. Prior to this is_compat_task() was the dnl # provided interface. dnl # -AC_DEFUN([ZFS_AC_KERNEL_IN_COMPAT_SYSCALL], [ - AC_MSG_CHECKING([whether in_compat_syscall() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL], [ + ZFS_LINUX_TEST_SRC([in_compat_syscall], [ #include ],[ in_compat_syscall(); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_IN_COMPAT_SYSCALL], [ + AC_MSG_CHECKING([whether in_compat_syscall() is available]) + ZFS_LINUX_TEST_RESULT([in_compat_syscall], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IN_COMPAT_SYSCALL, 1, [in_compat_syscall() is available]) diff --git a/config/kernel-inode-create.m4 b/config/kernel-inode-create.m4 new file mode 100644 index 0000000000..a6ea11fb61 --- /dev/null +++ b/config/kernel-inode-create.m4 @@ -0,0 +1,53 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE], [ + dnl # + dnl # 5.12 API change that added the struct user_namespace* arg + dnl # to the front of this function type's arg list. 
+ dnl # + ZFS_LINUX_TEST_SRC([create_userns], [ + #include + #include + + int inode_create(struct user_namespace *userns, + struct inode *inode ,struct dentry *dentry, + umode_t umode, bool flag) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .create = inode_create, + }; + ],[]) + + dnl # + dnl # 3.6 API change + dnl # + ZFS_LINUX_TEST_SRC([create_flags], [ + #include + #include + + int inode_create(struct inode *inode ,struct dentry *dentry, + umode_t umode, bool flag) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .create = inode_create, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CREATE], [ + AC_MSG_CHECKING([whether iops->create() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([create_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_CREATE_USERNS, 1, + [iops->create() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether iops->create() passes flags]) + ZFS_LINUX_TEST_RESULT([create_flags], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([iops->create()]) + ]) + ]) +]) diff --git a/config/kernel-inode-getattr.m4 b/config/kernel-inode-getattr.m4 index f10e0b2510..f62e82f523 100644 --- a/config/kernel-inode-getattr.m4 +++ b/config/kernel-inode-getattr.m4 @@ -1,10 +1,30 @@ -dnl # -dnl # Linux 4.11 API -dnl # See torvalds/linux@a528d35 -dnl # -AC_DEFUN([ZFS_AC_PATH_KERNEL_IOPS_GETATTR], [ - AC_MSG_CHECKING([whether iops->getattr() takes a path]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ + dnl # + dnl # Linux 5.12 API + dnl # The getattr I/O operations handler type was extended to require + dnl # a struct user_namespace* as its first arg, to support idmapped + dnl # mounts. 
+ dnl # + ZFS_LINUX_TEST_SRC([inode_operations_getattr_userns], [ + #include + + int test_getattr( + struct user_namespace *userns, + const struct path *p, struct kstat *k, + u32 request_mask, unsigned int query_flags) + { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .getattr = test_getattr, + }; + ],[]) + + dnl # + dnl # Linux 4.11 API + dnl # See torvalds/linux@a528d35 + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_getattr_path], [ #include int test_getattr( @@ -16,24 +36,9 @@ AC_DEFUN([ZFS_AC_PATH_KERNEL_IOPS_GETATTR], [ iops __attribute__ ((unused)) = { .getattr = test_getattr, }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PATH_IOPS_GETATTR, 1, - [iops->getattr() takes a path]) - ],[ - AC_MSG_RESULT(no) - ]) -]) + ],[]) - - -dnl # -dnl # Linux 3.9 - 4.10 API -dnl # -AC_DEFUN([ZFS_AC_VFSMOUNT_KERNEL_IOPS_GETATTR], [ - AC_MSG_CHECKING([whether iops->getattr() takes a vfsmount]) - ZFS_LINUX_TRY_COMPILE([ + ZFS_LINUX_TEST_SRC([inode_operations_getattr_vfsmount], [ #include int test_getattr( @@ -45,23 +50,43 @@ AC_DEFUN([ZFS_AC_VFSMOUNT_KERNEL_IOPS_GETATTR], [ iops __attribute__ ((unused)) = { .getattr = test_getattr, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_GETATTR], [ + dnl # + dnl # Kernel 5.12 test + dnl # + AC_MSG_CHECKING([whether iops->getattr() takes user_namespace]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_VFSMOUNT_IOPS_GETATTR, 1, - [iops->getattr() takes a vfsmount]) + AC_DEFINE(HAVE_USERNS_IOPS_GETATTR, 1, + [iops->getattr() takes struct user_namespace*]) ],[ AC_MSG_RESULT(no) + + dnl # + dnl # Kernel 4.11 test + dnl # + AC_MSG_CHECKING([whether iops->getattr() takes a path]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_path], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PATH_IOPS_GETATTR, 1, + [iops->getattr() takes a path]) + ],[ + AC_MSG_RESULT(no) + + dnl # + dnl # Kernel < 4.11 test + dnl # + AC_MSG_CHECKING([whether 
iops->getattr() takes a vfsmount]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_vfsmount], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFSMOUNT_IOPS_GETATTR, 1, + [iops->getattr() takes a vfsmount]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) ]) ]) - - -dnl # -dnl # The interface of the getattr callback from the inode_operations -dnl # structure changed. Also, the interface of the simple_getattr() -dnl # function provided by the kernel changed. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GETATTR], [ - ZFS_AC_PATH_KERNEL_IOPS_GETATTR - ZFS_AC_VFSMOUNT_KERNEL_IOPS_GETATTR -]) diff --git a/config/kernel-inode-lock.m4 b/config/kernel-inode-lock.m4 index 8dee014227..5eb04af787 100644 --- a/config/kernel-inode-lock.m4 +++ b/config/kernel-inode-lock.m4 @@ -4,20 +4,21 @@ dnl # i_mutex is changed to i_rwsem. Instead of directly using dnl # i_mutex/i_rwsem, we should use inode_lock() and inode_lock_shared() dnl # We test inode_lock_shared because inode_lock is introduced earlier. dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_LOCK], [ - AC_MSG_CHECKING([whether inode_lock_shared() exists]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_LOCK], [ + ZFS_LINUX_TEST_SRC([inode_lock], [ #include ],[ struct inode *inode = NULL; inode_lock_shared(inode); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_LOCK], [ + AC_MSG_CHECKING([whether inode_lock_shared() exists]) + ZFS_LINUX_TEST_RESULT([inode_lock], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_LOCK_SHARED, 1, [yes]) ],[ AC_MSG_RESULT(no) ]) - EXTRA_KCFLAGS="$tmp_flags" ]) diff --git a/config/kernel-inode-lookup.m4 b/config/kernel-inode-lookup.m4 new file mode 100644 index 0000000000..1a56e69b04 --- /dev/null +++ b/config/kernel-inode-lookup.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # 3.6 API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS], [ + ZFS_LINUX_TEST_SRC([lookup_flags], [ + #include + #include + + struct dentry *inode_lookup(struct inode *inode, + struct dentry *dentry, 
unsigned int flags) { return NULL; } + + static const struct inode_operations iops + __attribute__ ((unused)) = { + .lookup = inode_lookup, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_LOOKUP_FLAGS], [ + AC_MSG_CHECKING([whether iops->lookup() passes flags]) + ZFS_LINUX_TEST_RESULT([lookup_flags], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([iops->lookup()]) + ]) +]) diff --git a/config/kernel-inode-set-flags.m4 b/config/kernel-inode-set-flags.m4 index e0ad26796d..133f666a95 100644 --- a/config/kernel-inode-set-flags.m4 +++ b/config/kernel-inode-set-flags.m4 @@ -2,14 +2,18 @@ dnl # dnl # 3.15 API change dnl # inode_set_flags introduced to set i_flags dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_SET_FLAGS], [ - AC_MSG_CHECKING([whether inode_set_flags() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS], [ + ZFS_LINUX_TEST_SRC([inode_set_flags], [ #include ],[ struct inode inode; inode_set_flags(&inode, S_IMMUTABLE, S_IMMUTABLE); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_SET_FLAGS], [ + AC_MSG_CHECKING([whether inode_set_flags() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_flags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_SET_FLAGS, 1, [inode_set_flags() exists]) ],[ diff --git a/config/kernel-inode-set-iversion.m4 b/config/kernel-inode-set-iversion.m4 index 9a7d7890e5..dd415de324 100644 --- a/config/kernel-inode-set-iversion.m4 +++ b/config/kernel-inode-set-iversion.m4 @@ -2,14 +2,18 @@ dnl # dnl # 4.16 API change dnl # inode_set_iversion introduced to set i_version dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_SET_IVERSION], [ - AC_MSG_CHECKING([whether inode_set_iversion() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION], [ + ZFS_LINUX_TEST_SRC([inode_set_iversion], [ #include ],[ struct inode inode; inode_set_iversion(&inode, 1); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_SET_IVERSION], [ + AC_MSG_CHECKING([whether inode_set_iversion() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_iversion], [ 
AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_SET_IVERSION, 1, [inode_set_iversion() exists]) diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4 index f5818411aa..9c016c7900 100644 --- a/config/kernel-inode-times.m4 +++ b/config/kernel-inode-times.m4 @@ -1,12 +1,24 @@ -dnl # -dnl # 4.18 API change -dnl # i_atime, i_mtime, and i_ctime changed from timespec to timespec64. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ - AC_MSG_CHECKING([whether inode->i_*time's are timespec64]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [ + + dnl # + dnl # 5.6 API change + dnl # timespec64_trunc() replaced by timestamp_truncate() interface. + dnl # + ZFS_LINUX_TEST_SRC([timestamp_truncate], [ + #include + ],[ + struct timespec64 ts; + struct inode ip; + + memset(&ts, 0, sizeof(ts)); + ts = timestamp_truncate(ts, &ip); + ]) + + dnl # + dnl # 4.18 API change + dnl # i_atime, i_mtime, and i_ctime changed from timespec to timespec64. 
+ dnl # + ZFS_LINUX_TEST_SRC([inode_times], [ #include ],[ struct inode ip; @@ -14,12 +26,25 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ memset(&ip, 0, sizeof(ip)); ts = ip.i_mtime; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ + AC_MSG_CHECKING([whether timestamp_truncate() exists]) + ZFS_LINUX_TEST_RESULT([timestamp_truncate], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_TIMESTAMP_TRUNCATE, 1, + [timestamp_truncate() exists]) ],[ AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode->i_*time's are timespec64]) + ZFS_LINUX_TEST_RESULT([inode_times], [ + AC_MSG_RESULT(no) ],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1, [inode->i_*time's are timespec64]) ]) - EXTRA_KCFLAGS="$tmp_flags" ]) diff --git a/config/kernel-insert-inode-locked.m4 b/config/kernel-insert-inode-locked.m4 index da141d180a..348aff9a57 100644 --- a/config/kernel-insert-inode-locked.m4 +++ b/config/kernel-insert-inode-locked.m4 @@ -2,17 +2,20 @@ dnl # dnl # 2.6.28 API change dnl # Added insert_inode_locked() helper function. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_INSERT_INODE_LOCKED], - [AC_MSG_CHECKING([whether insert_inode_locked() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED], [ + ZFS_LINUX_TEST_SRC([insert_inode_locked], [ #include ], [ insert_inode_locked(NULL); - ], [insert_inode_locked], [fs/inode.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INSERT_INODE_LOCKED, 1, - [insert_inode_locked() is available]) - ], [ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INSERT_INODE_LOCKED], [ + AC_MSG_CHECKING([whether insert_inode_locked() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([insert_inode_locked], + [insert_inode_locked], [fs/inode.c], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([insert_inode_locked()]) ]) ]) diff --git a/config/kernel-invalidate-bdev-args.m4 b/config/kernel-invalidate-bdev-args.m4 deleted file mode 100644 index 09c2ebf26e..0000000000 --- a/config/kernel-invalidate-bdev-args.m4 +++ /dev/null @@ -1,19 +0,0 @@ -dnl # -dnl # 2.6.22 API change -dnl # Unused destroy_dirty_buffers arg removed from prototype. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS], [ - AC_MSG_CHECKING([whether invalidate_bdev() wants 1 arg]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct block_device *bdev = NULL; - invalidate_bdev(bdev); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_1ARG_INVALIDATE_BDEV, 1, - [invalidate_bdev() wants 1 arg]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-is_owner_or_cap.m4 b/config/kernel-is_owner_or_cap.m4 index da07e58dda..a90cf3da64 100644 --- a/config/kernel-is_owner_or_cap.m4 +++ b/config/kernel-is_owner_or_cap.m4 @@ -4,33 +4,43 @@ dnl # The is_owner_or_cap() macro was renamed to inode_owner_or_capable(), dnl # This is used for permission checks in the xattr and file attribute call dnl # paths. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [ - AC_MSG_CHECKING([whether inode_owner_or_capable() exists]) - ZFS_LINUX_TRY_COMPILE([ +dnl # 5.12 API change, +dnl # inode_owner_or_capable() now takes struct user_namespace * +dnl # to support idmapped mounts +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE], [ + ZFS_LINUX_TEST_SRC([inode_owner_or_capable], [ #include ],[ struct inode *ip = NULL; (void) inode_owner_or_capable(ip); + ]) + + ZFS_LINUX_TEST_SRC([inode_owner_or_capable_idmapped], [ + #include ],[ + struct inode *ip = NULL; + (void) inode_owner_or_capable(&init_user_ns, ip); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [ + AC_MSG_CHECKING([whether inode_owner_or_capable() exists]) + ZFS_LINUX_TEST_RESULT([inode_owner_or_capable], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE, 1, [inode_owner_or_capable() exists]) - ],[ + ], [ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether is_owner_or_cap() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - ],[ - struct inode *ip = NULL; - (void) is_owner_or_cap(ip); - ],[ + + AC_MSG_CHECKING( + [whether inode_owner_or_capable() takes user_ns]) + ZFS_LINUX_TEST_RESULT([inode_owner_or_capable_idmapped], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IS_OWNER_OR_CAP, 1, - [is_owner_or_cap() exists]) + AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE_IDMAPPED, 1, + [inode_owner_or_capable() takes user_ns]) ],[ - AC_MSG_ERROR(no - Please file a bug report at - https://github.com/zfsonlinux/zfs/issues/new) + ZFS_LINUX_TEST_ERROR([capability]) ]) ]) ]) diff --git a/config/kernel-kmap-atomic-args.m4 b/config/kernel-kmap-atomic-args.m4 index beb1692e72..1172505afc 100644 --- a/config/kernel-kmap-atomic-args.m4 +++ b/config/kernel-kmap-atomic-args.m4 @@ -3,18 +3,20 @@ dnl # 2.6.37 API change dnl # kmap_atomic changed from assigning hard-coded named slot to using dnl # push/pop based dynamical allocation. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [ - AC_MSG_CHECKING([whether kmap_atomic wants 1 args]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS], [ + ZFS_LINUX_TEST_SRC([kmap_atomic], [ #include ],[ struct page page; kmap_atomic(&page); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_1ARG_KMAP_ATOMIC, 1, - [kmap_atomic wants 1 args]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [ + AC_MSG_CHECKING([whether kmap_atomic wants 1 args]) + ZFS_LINUX_TEST_RESULT([kmap_atomic], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([kmap_atomic()]) ]) ]) diff --git a/config/kernel-kmem-cache.m4 b/config/kernel-kmem-cache.m4 index 21cc53d349..0e9fe9eb2a 100644 --- a/config/kernel-kmem-cache.m4 +++ b/config/kernel-kmem-cache.m4 @@ -1,55 +1,12 @@ -dnl # -dnl # 2.6.35 API change, -dnl # The cachep->gfpflags member was renamed cachep->allocflags. These are -dnl # private allocation flags which are applied when allocating a new slab -dnl # in kmem_getpages(). Unfortunately there is no public API for setting -dnl # non-default flags. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_KMEM_CACHE_ALLOCFLAGS], [ - AC_MSG_CHECKING([whether struct kmem_cache has allocflags]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct kmem_cache cachep __attribute__ ((unused)); - cachep.allocflags = GFP_KERNEL; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_KMEM_CACHE_ALLOCFLAGS, 1, - [struct kmem_cache has allocflags]) - ],[ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether struct kmem_cache has gfpflags]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct kmem_cache cachep __attribute__ ((unused)); - cachep.gfpflags = GFP_KERNEL; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_KMEM_CACHE_GFPFLAGS, 1, - [struct kmem_cache has gfpflags]) - ],[ - AC_MSG_RESULT(no) - ]) - ]) -]) - dnl # dnl # grsecurity API change, dnl # kmem_cache_create() with SLAB_USERCOPY flag replaced by dnl # kmem_cache_create_usercopy(). 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_KMEM_CACHE_CREATE_USERCOPY], [ - AC_MSG_CHECKING([whether kmem_cache_create_usercopy() exists]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_KMEM_CACHE_CREATE_USERCOPY], [ + ZFS_LINUX_TEST_SRC([kmem_cache_create_usercopy], [ #include - static void ctor(void *foo) - { - // fake ctor - } + static void ctor(void *foo) { /* fake ctor */ } ],[ struct kmem_cache *skc_linux_cache; const char *name = "test"; @@ -60,13 +17,25 @@ AC_DEFUN([ZFS_AC_KERNEL_KMEM_CACHE_CREATE_USERCOPY], [ size_t usersize = size - useroffset; skc_linux_cache = kmem_cache_create_usercopy( - name, size, align, flags, useroffset, usersize, ctor); - ],[ + name, size, align, flags, useroffset, usersize, ctor); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KMEM_CACHE_CREATE_USERCOPY], [ + AC_MSG_CHECKING([whether kmem_cache_create_usercopy() exists]) + ZFS_LINUX_TEST_RESULT([kmem_cache_create_usercopy], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KMEM_CACHE_CREATE_USERCOPY, 1, - [kmem_cache_create_usercopy() exists]) + [kmem_cache_create_usercopy() exists]) ],[ AC_MSG_RESULT(no) ]) - EXTRA_KCFLAGS="$tmp_flags" +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_KMEM_CACHE], [ + ZFS_AC_KERNEL_SRC_KMEM_CACHE_CREATE_USERCOPY +]) + +AC_DEFUN([ZFS_AC_KERNEL_KMEM_CACHE], [ + ZFS_AC_KERNEL_KMEM_CACHE_CREATE_USERCOPY ]) diff --git a/config/kernel-kmem.m4 b/config/kernel-kmem.m4 index cc055e530c..43f9e72f88 100644 --- a/config/kernel-kmem.m4 +++ b/config/kernel-kmem.m4 @@ -56,3 +56,53 @@ AC_DEFUN([SPL_AC_DEBUG_KMEM_TRACKING], [ AC_MSG_CHECKING([whether detailed kmem tracking is enabled]) AC_MSG_RESULT([$enable_debug_kmem_tracking]) ]) + +dnl # +dnl # 4.12 API, +dnl # Added kvmalloc allocation strategy +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_KVMALLOC], [ + ZFS_LINUX_TEST_SRC([kvmalloc], [ + #include + ],[ + void *p __attribute__ ((unused)); + + p = kvmalloc(0, GFP_KERNEL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KVMALLOC], [ + AC_MSG_CHECKING([whether 
kvmalloc(ptr, flags) is available]) + ZFS_LINUX_TEST_RESULT([kvmalloc], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KVMALLOC, 1, [kvmalloc exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 5.8 API, +dnl # __vmalloc PAGE_KERNEL removal +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL], [ + ZFS_LINUX_TEST_SRC([__vmalloc], [ + #include + #include + ],[ + void *p __attribute__ ((unused)); + + p = __vmalloc(0, GFP_KERNEL, PAGE_KERNEL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL], [ + AC_MSG_CHECKING([whether __vmalloc(ptr, flags, pageflags) is available]) + ZFS_LINUX_TEST_RESULT([__vmalloc], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VMALLOC_PAGE_KERNEL, 1, [__vmalloc page flags exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) +- \ No newline at end of file diff --git a/config/kernel-kstrtoul.m4 b/config/kernel-kstrtoul.m4 index 5530e0e2d8..8e4b542978 100644 --- a/config/kernel-kstrtoul.m4 +++ b/config/kernel-kstrtoul.m4 @@ -1,21 +1,21 @@ dnl # dnl # 2.6.39 API change +dnl # Added kstrtoul() dnl # -dnl # 33ee3b2e2eb9 kstrto*: converting strings to integers done (hopefully) right -dnl # -dnl # If kstrtoul() doesn't exist, fallback to use strict_strtoul() which has -dnl # existed since 2.6.25. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_KSTRTOUL], [ - AC_MSG_CHECKING([whether kstrtoul() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_KSTRTOUL], [ + ZFS_LINUX_TEST_SRC([kstrtoul], [ #include ],[ int ret __attribute__ ((unused)) = kstrtoul(NULL, 10, NULL); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KSTRTOUL], [ + AC_MSG_CHECKING([whether kstrtoul() exists]) + ZFS_LINUX_TEST_RESULT([kstrtoul], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KSTRTOUL, 1, [kstrtoul() exists]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([kstrtoul()]) ]) ]) diff --git a/config/kernel-ktime.m4 b/config/kernel-ktime.m4 new file mode 100644 index 0000000000..64c3b5f903 --- /dev/null +++ b/config/kernel-ktime.m4 @@ -0,0 +1,55 @@ +dnl # +dnl # 4.18: ktime_get_coarse_real_ts64() replaces current_kernel_time64(). +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_KTIME_GET_COARSE_REAL_TS64], [ + ZFS_LINUX_TEST_SRC([ktime_get_coarse_real_ts64], [ + #include + ], [ + struct timespec64 ts; + ktime_get_coarse_real_ts64(&ts); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64], [ + AC_MSG_CHECKING([whether ktime_get_coarse_real_ts64() exists]) + ZFS_LINUX_TEST_RESULT([ktime_get_coarse_real_ts64], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KTIME_GET_COARSE_REAL_TS64, 1, + [ktime_get_coarse_real_ts64() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.18: ktime_get_raw_ts64() replaces getrawmonotonic64(). 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_KTIME_GET_RAW_TS64], [ + ZFS_LINUX_TEST_SRC([ktime_get_raw_ts64], [ + #include + ], [ + struct timespec64 ts; + ktime_get_raw_ts64(&ts); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KTIME_GET_RAW_TS64], [ + AC_MSG_CHECKING([whether ktime_get_raw_ts64() exists]) + ZFS_LINUX_TEST_RESULT([ktime_get_raw_ts64], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KTIME_GET_RAW_TS64, 1, + [ktime_get_raw_ts64() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_KTIME], [ + ZFS_AC_KERNEL_SRC_KTIME_GET_COARSE_REAL_TS64 + ZFS_AC_KERNEL_SRC_KTIME_GET_RAW_TS64 +]) + +AC_DEFUN([ZFS_AC_KERNEL_KTIME], [ + ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64 + ZFS_AC_KERNEL_KTIME_GET_RAW_TS64 +]) diff --git a/config/kernel-ktime_get_coarse_real_ts64.m4 b/config/kernel-ktime_get_coarse_real_ts64.m4 deleted file mode 100644 index d6be8c4185..0000000000 --- a/config/kernel-ktime_get_coarse_real_ts64.m4 +++ /dev/null @@ -1,18 +0,0 @@ -dnl # -dnl # 4.18: ktime_get_coarse_real_ts64() added. Use it in place of -dnl # current_kernel_time64(). -dnl # -AC_DEFUN([ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64], - [AC_MSG_CHECKING([whether ktime_get_coarse_real_ts64() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - ], [ - struct timespec64 ts; - ktime_get_coarse_real_ts64(&ts); - ], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_KTIME_GET_COARSE_REAL_TS64, 1, [ktime_get_coarse_real_ts64() exists]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-kuid-helpers.m4 b/config/kernel-kuid-helpers.m4 index 60713b9d31..38a439fa6e 100644 --- a/config/kernel-kuid-helpers.m4 +++ b/config/kernel-kuid-helpers.m4 @@ -5,18 +5,20 @@ dnl # became necessary to go through one more level of indirection dnl # when dealing with uid/gid - namely the kuid type. 
dnl # dnl # -AC_DEFUN([ZFS_AC_KERNEL_KUID_HELPERS], [ - AC_MSG_CHECKING([whether i_(uid|gid)_(read|write) exist]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_KUID_HELPERS], [ + ZFS_LINUX_TEST_SRC([i_uid_read], [ #include ],[ struct inode *ip = NULL; (void) i_uid_read(ip); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_KUID_HELPERS, 1, - [i_(uid|gid)_(read|write) exist]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KUID_HELPERS], [ + AC_MSG_CHECKING([whether i_(uid|gid)_(read|write) exist]) + ZFS_LINUX_TEST_RESULT([i_uid_read], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([i_uid_read]) ]) ]) diff --git a/config/kernel-kuidgid.m4 b/config/kernel-kuidgid.m4 index 82685d2636..b7e441408c 100644 --- a/config/kernel-kuidgid.m4 +++ b/config/kernel-kuidgid.m4 @@ -1,28 +1,21 @@ dnl # -dnl # User namespaces, use kuid_t in place of uid_t -dnl # where available. Not strictly a user namespaces thing -dnl # but it should prevent surprises +dnl # 3.8 API change, +dnl # User namespaces, use kuid_t in place of uid_t where available. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_KUIDGID_T], [ - AC_MSG_CHECKING([whether kuid_t/kgid_t is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_KUIDGID_T], [ + ZFS_LINUX_TEST_SRC([kuidgid_t], [ #include ], [ kuid_t userid __attribute__ ((unused)) = KUIDT_INIT(0); kgid_t groupid __attribute__ ((unused)) = KGIDT_INIT(0); - ],[ - ZFS_LINUX_TRY_COMPILE([ - #include - ], [ - kuid_t userid __attribute__ ((unused)) = 0; - kgid_t groupid __attribute__ ((unused)) = 0; - ],[ - AC_MSG_RESULT(yes; optional) - ],[ - AC_MSG_RESULT(yes; mandatory) - AC_DEFINE(HAVE_KUIDGID_T, 1, [kuid_t/kgid_t in use]) - ]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_KUIDGID_T], [ + AC_MSG_CHECKING([whether kuid_t/kgid_t is available]) + ZFS_LINUX_TEST_RESULT([kuidgid_t], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([kuid_t/kgid_t]) ]) ]) diff --git a/config/kernel-lookup-bdev.m4 b/config/kernel-lookup-bdev.m4 deleted file mode 100644 index abbf55d9bb..0000000000 --- a/config/kernel-lookup-bdev.m4 +++ /dev/null @@ -1,29 +0,0 @@ -dnl # -dnl # 2.6.27, lookup_bdev() was exported. -dnl # 4.4.0-6.21 - x.y on Ubuntu, lookup_bdev() takes 2 arguments. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_LOOKUP_BDEV], - [AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - lookup_bdev(NULL); - ], [lookup_bdev], [fs/block_dev.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, [lookup_bdev() wants 1 arg]) - ], [ - AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether lookup_bdev() wants 2 args]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - lookup_bdev(NULL, FMODE_READ); - ], [lookup_bdev], [fs/block_dev.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_2ARGS_LOOKUP_BDEV, 1, - [lookup_bdev() wants 2 args]) - ], [ - AC_MSG_RESULT(no) - ]) - ]) -]) \ No newline at end of file diff --git a/config/kernel-lookup-nameidata.m4 b/config/kernel-lookup-nameidata.m4 deleted file mode 100644 index 5453be5e8e..0000000000 --- a/config/kernel-lookup-nameidata.m4 +++ /dev/null @@ -1,26 +0,0 @@ -dnl # -dnl # 3.6 API change -dnl # -AC_DEFUN([ZFS_AC_KERNEL_LOOKUP_NAMEIDATA], [ - AC_MSG_CHECKING([whether iops->lookup() passes nameidata]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - - struct dentry *inode_lookup(struct inode *inode, - struct dentry *dentry, struct nameidata *nidata) - { return NULL; } - - static const struct inode_operations iops - __attribute__ ((unused)) = { - .lookup = inode_lookup, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_LOOKUP_NAMEIDATA, 1, - [iops->lookup() passes nameidata]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-lseek-execute.m4 b/config/kernel-lseek-execute.m4 index 8c4032b92c..652f611f8d 100644 --- a/config/kernel-lseek-execute.m4 +++ b/config/kernel-lseek-execute.m4 @@ -2,9 +2,8 @@ dnl # dnl # 3.11 API change dnl # lseek_execute helper exported dnl # -AC_DEFUN([ZFS_AC_KERNEL_LSEEK_EXECUTE], - [AC_MSG_CHECKING([whether lseek_execute() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE], [ + ZFS_LINUX_TEST_SRC([lseek_execute], [ #include ], [ struct file *fp __attribute__ ((unused)) = NULL; 
@@ -13,10 +12,15 @@ AC_DEFUN([ZFS_AC_KERNEL_LSEEK_EXECUTE], loff_t maxsize __attribute__ ((unused)) = 0; lseek_execute(fp, ip, offset, maxsize); - ], [lseek_exclusive], [fs/read_write.c], [ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_LSEEK_EXECUTE], [ + AC_MSG_CHECKING([whether lseek_execute() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lseek_execute], + [lseek_exclusive], [fs/read_write.c], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_LSEEK_EXECUTE, 1, - [lseek_execute() is available]) + AC_DEFINE(HAVE_LSEEK_EXECUTE, 1, [lseek_execute() is available]) ], [ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 new file mode 100644 index 0000000000..86b202a7a2 --- /dev/null +++ b/config/kernel-make-request-fn.m4 @@ -0,0 +1,160 @@ +dnl # +dnl # Check for make_request_fn interface. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ + ZFS_LINUX_TEST_SRC([make_request_fn_void], [ + #include + void make_request(struct request_queue *q, + struct bio *bio) { return; } + ],[ + blk_queue_make_request(NULL, &make_request); + ]) + + ZFS_LINUX_TEST_SRC([make_request_fn_blk_qc_t], [ + #include + blk_qc_t make_request(struct request_queue *q, + struct bio *bio) { return (BLK_QC_T_NONE); } + ],[ + blk_queue_make_request(NULL, &make_request); + ]) + + ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn], [ + #include + blk_qc_t make_request(struct request_queue *q, + struct bio *bio) { return (BLK_QC_T_NONE); } + ],[ + struct request_queue *q __attribute__ ((unused)); + q = blk_alloc_queue(make_request, NUMA_NO_NODE); + ]) + + ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn_rh], [ + #include + blk_qc_t make_request(struct request_queue *q, + struct bio *bio) { return (BLK_QC_T_NONE); } + ],[ + struct request_queue *q __attribute__ ((unused)); + q = blk_alloc_queue_rh(make_request, NUMA_NO_NODE); + ]) + + ZFS_LINUX_TEST_SRC([block_device_operations_submit_bio], [ + #include + ],[ + struct block_device_operations o; + o.submit_bio = NULL; + ]) + + 
ZFS_LINUX_TEST_SRC([blk_alloc_disk], [ + #include + ],[ + struct gendisk *disk __attribute__ ((unused)); + disk = blk_alloc_disk(NUMA_NO_NODE); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ + dnl # Checked as part of the blk_alloc_queue_request_fn test + dnl # + dnl # Linux 5.9 API Change + dnl # make_request_fn was moved into block_device_operations->submit_bio + dnl # + AC_MSG_CHECKING([whether submit_bio is member of struct block_device_operations]) + ZFS_LINUX_TEST_RESULT([block_device_operations_submit_bio], [ + AC_MSG_RESULT(yes) + + AC_DEFINE(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS, 1, + [submit_bio is member of struct block_device_operations]) + + dnl # + dnl # Linux 5.14 API Change: + dnl # blk_alloc_queue() + alloc_disk() combo replaced by + dnl # a single call to blk_alloc_disk(). + dnl # + AC_MSG_CHECKING([whether blk_alloc_disk() exists]) + ZFS_LINUX_TEST_RESULT([blk_alloc_disk], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_ALLOC_DISK], 1, [blk_alloc_disk() exists]) + ], [ + AC_MSG_RESULT(no) + ]) + ],[ + AC_MSG_RESULT(no) + + dnl # Checked as part of the blk_alloc_queue_request_fn test + dnl # + dnl # Linux 5.7 API Change + dnl # blk_alloc_queue() expects request function. + dnl # + AC_MSG_CHECKING([whether blk_alloc_queue() expects request function]) + ZFS_LINUX_TEST_RESULT([blk_alloc_queue_request_fn], [ + AC_MSG_RESULT(yes) + + dnl # This is currently always the case. + AC_MSG_CHECKING([whether make_request_fn() returns blk_qc_t]) + AC_MSG_RESULT(yes) + + AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN, 1, + [blk_alloc_queue() expects request function]) + AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t, + [make_request_fn() return type]) + AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1, + [Noting that make_request_fn() returns blk_qc_t]) + ],[ + dnl # + dnl # CentOS Stream 4.18.0-257 API Change + dnl # The Linux 5.7 blk_alloc_queue() change was back- + dnl # ported and the symbol renamed blk_alloc_queue_rh(). 
+ dnl # As of this kernel version they're not providing + dnl # any compatibility code in the kernel for this. + dnl # + ZFS_LINUX_TEST_RESULT([blk_alloc_queue_request_fn_rh], [ + AC_MSG_RESULT(yes) + + dnl # This is currently always the case. + AC_MSG_CHECKING([whether make_request_fn_rh() returns blk_qc_t]) + AC_MSG_RESULT(yes) + + AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH, 1, + [blk_alloc_queue_rh() expects request function]) + AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t, + [make_request_fn() return type]) + AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1, + [Noting that make_request_fn() returns blk_qc_t]) + ],[ + AC_MSG_RESULT(no) + + dnl # + dnl # Linux 3.2 API Change + dnl # make_request_fn returns void. + dnl # + AC_MSG_CHECKING( + [whether make_request_fn() returns void]) + ZFS_LINUX_TEST_RESULT([make_request_fn_void], [ + AC_MSG_RESULT(yes) + AC_DEFINE(MAKE_REQUEST_FN_RET, void, + [make_request_fn() return type]) + AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_VOID, 1, + [Noting that make_request_fn() returns void]) + ],[ + AC_MSG_RESULT(no) + + dnl # + dnl # Linux 4.4 API Change + dnl # make_request_fn returns blk_qc_t. + dnl # + AC_MSG_CHECKING( + [whether make_request_fn() returns blk_qc_t]) + ZFS_LINUX_TEST_RESULT([make_request_fn_blk_qc_t], [ + AC_MSG_RESULT(yes) + AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t, + [make_request_fn() return type]) + AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1, + [Noting that make_request_fn() ] + [returns blk_qc_t]) + ],[ + ZFS_LINUX_TEST_ERROR([make_request_fn]) + ]) + ]) + ]) + ]) + ]) +]) diff --git a/config/kernel-misc-minor.m4 b/config/kernel-misc-minor.m4 index a020d2ebca..20fe2cd2f3 100644 --- a/config/kernel-misc-minor.m4 +++ b/config/kernel-misc-minor.m4 @@ -6,7 +6,7 @@ dnl # number. Start with a large known available unreserved minor and work dnl # our way down to lower value if a collision is detected. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_MISC_MINOR], [ - AC_MSG_CHECKING([for available /dev/zfs minor]) + AC_MSG_CHECKING([whether /dev/zfs minor is available]) for i in $(seq 249 -1 200); do if ! grep -q "^#define\s\+.*_MINOR\s\+.*$i" \ diff --git a/config/kernel-mk-request-fn.m4 b/config/kernel-mk-request-fn.m4 deleted file mode 100644 index 57eebe23de..0000000000 --- a/config/kernel-mk-request-fn.m4 +++ /dev/null @@ -1,65 +0,0 @@ -dnl # -dnl # Linux 3.2 API Change -dnl # make_request_fn returns void instead of int. -dnl # -dnl # Linux 4.4 API Change -dnl # make_request_fn returns blk_qc_t. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ - AC_MSG_CHECKING([whether make_request_fn() returns int]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int make_request(struct request_queue *q, struct bio *bio) - { - return (0); - } - ],[ - blk_queue_make_request(NULL, &make_request); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(MAKE_REQUEST_FN_RET, int, - [make_request_fn() returns int]) - AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_INT, 1, - [Noting that make_request_fn() returns int]) - ],[ - AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether make_request_fn() returns void]) - ZFS_LINUX_TRY_COMPILE([ - #include - - void make_request(struct request_queue *q, struct bio *bio) - { - return; - } - ],[ - blk_queue_make_request(NULL, &make_request); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(MAKE_REQUEST_FN_RET, void, - [make_request_fn() returns void]) - ],[ - AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether make_request_fn() returns blk_qc_t]) - ZFS_LINUX_TRY_COMPILE([ - #include - - blk_qc_t make_request(struct request_queue *q, struct bio *bio) - { - return (BLK_QC_T_NONE); - } - ],[ - blk_queue_make_request(NULL, &make_request); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t, - [make_request_fn() returns blk_qc_t]) - AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1, - [Noting that make_request_fn() returns blk_qc_t]) - ],[ - AC_MSG_ERROR(no - Please file a bug report at - 
https://github.com/zfsonlinux/zfs/issues/new) - ]) - ]) - ]) -]) diff --git a/config/kernel-mkdir-umode-t.m4 b/config/kernel-mkdir-umode-t.m4 deleted file mode 100644 index ebc21be9ec..0000000000 --- a/config/kernel-mkdir-umode-t.m4 +++ /dev/null @@ -1,29 +0,0 @@ -dnl # -dnl # 3.3 API change -dnl # The VFS .create, .mkdir and .mknod callbacks were updated to take a -dnl # umode_t type rather than an int. The expectation is that any backport -dnl # would also change all three prototypes. However, if it turns out that -dnl # some distribution doesn't backport the whole thing this could be -dnl # broken apart in to three separate checks. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_MKDIR_UMODE_T], [ - AC_MSG_CHECKING([whether iops->create()/mkdir()/mknod() take umode_t]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int mkdir(struct inode *inode, struct dentry *dentry, - umode_t umode) { return 0; } - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .mkdir = mkdir, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_MKDIR_UMODE_T, 1, - [iops->create()/mkdir()/mknod() take umode_t]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-mkdir.m4 b/config/kernel-mkdir.m4 new file mode 100644 index 0000000000..a162bcd880 --- /dev/null +++ b/config/kernel-mkdir.m4 @@ -0,0 +1,65 @@ +dnl # +dnl # Supported mkdir() interfaces checked newest to oldest. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR], [ + dnl # + dnl # 5.12 API change + dnl # The struct user_namespace arg was added as the first argument to + dnl # mkdir() + dnl # + ZFS_LINUX_TEST_SRC([mkdir_user_namespace], [ + #include + + int mkdir(struct user_namespace *userns, + struct inode *inode, struct dentry *dentry, + umode_t umode) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mkdir = mkdir, + }; + ],[]) + + dnl # + dnl # 3.3 API change + dnl # The VFS .create, .mkdir and .mknod callbacks were updated to take a + dnl # umode_t type rather than an int. 
The expectation is that any backport + dnl # would also change all three prototypes. However, if it turns out that + dnl # some distribution doesn't backport the whole thing this could be + dnl # broken apart into three separate checks. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_mkdir], [ + #include + + int mkdir(struct inode *inode, struct dentry *dentry, + umode_t umode) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mkdir = mkdir, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_MKDIR], [ + dnl # + dnl # 5.12 API change + dnl # The struct user_namespace arg was added as the first argument to + dnl # mkdir() of the iops structure. + dnl # + AC_MSG_CHECKING([whether iops->mkdir() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([mkdir_user_namespace], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_MKDIR_USERNS, 1, + [iops->mkdir() takes struct user_namespace*]) + ],[ + AC_MSG_CHECKING([whether iops->mkdir() takes umode_t]) + ZFS_LINUX_TEST_RESULT([inode_operations_mkdir], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MKDIR_UMODE_T, 1, + [iops->mkdir() takes umode_t]) + ],[ + ZFS_LINUX_TEST_ERROR([mkdir()]) + ]) + ]) +]) diff --git a/config/kernel-mknod.m4 b/config/kernel-mknod.m4 new file mode 100644 index 0000000000..ffe4510600 --- /dev/null +++ b/config/kernel-mknod.m4 @@ -0,0 +1,30 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MKNOD], [ + dnl # + dnl # 5.12 API change that added the struct user_namespace* arg + dnl # to the front of this function type's arg list. 
+ dnl # + ZFS_LINUX_TEST_SRC([mknod_userns], [ + #include + #include + + int tmp_mknod(struct user_namespace *userns, + struct inode *inode ,struct dentry *dentry, + umode_t u, dev_t d) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mknod = tmp_mknod, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_MKNOD], [ + AC_MSG_CHECKING([whether iops->mknod() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([mknod_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_MKNOD_USERNS, 1, + [iops->mknod() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-mod-param.m4 b/config/kernel-mod-param.m4 index b72be684a4..e00f19d61e 100644 --- a/config/kernel-mod-param.m4 +++ b/config/kernel-mod-param.m4 @@ -2,9 +2,8 @@ dnl # dnl # Grsecurity kernel API change dnl # constified parameters of module_param_call() methods dnl # -AC_DEFUN([ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST], [ - AC_MSG_CHECKING([whether module_param_call() is hardened]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MODULE_PARAM_CALL_CONST], [ + ZFS_LINUX_TEST_SRC([module_param_call], [ #include #include @@ -19,8 +18,12 @@ AC_DEFUN([ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST], [ } module_param_call(p, param_set, param_get, NULL, 0644); - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST], [ + AC_MSG_CHECKING([whether module_param_call() is hardened]) + ZFS_LINUX_TEST_RESULT([module_param_call], [ AC_MSG_RESULT(yes) AC_DEFINE(MODULE_PARAM_CALL_CONST, 1, [hardened module_param_call]) diff --git a/config/kernel-objtool.m4 b/config/kernel-objtool.m4 index 467329b254..f9f9d657d8 100644 --- a/config/kernel-objtool.m4 +++ b/config/kernel-objtool.m4 @@ -1,41 +1,70 @@ dnl # -dnl # 4.6 API for compile-time stack validation +dnl # Detect objtool functionality. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [ - AC_MSG_CHECKING([for compile-time stack validation (objtool)]) + +dnl # +dnl # Kernel 5.10: linux/frame.h was renamed linux/objtool.h +dnl # +AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL_HEADER], [ + AC_MSG_CHECKING([whether objtool header is available]) ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + ],[ + AC_DEFINE(HAVE_KERNEL_OBJTOOL_HEADER, 1, + [kernel has linux/objtool.h]) + AC_MSG_RESULT(linux/objtool.h) + ],[ + AC_MSG_RESULT(linux/frame.h) + ]) +]) + +dnl # +dnl # Check for objtool support. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_OBJTOOL], [ + + dnl # 4.6 API for compile-time stack validation + ZFS_LINUX_TEST_SRC([objtool], [ #undef __ASSEMBLY__ + #include #include ],[ #if !defined(FRAME_BEGIN) - CTASSERT(1); + #error "FRAME_BEGIN is not defined" + #endif + ]) + + dnl # 4.6 API added STACK_FRAME_NON_STANDARD macro + ZFS_LINUX_TEST_SRC([stack_frame_non_standard], [ + #ifdef HAVE_KERNEL_OBJTOOL_HEADER + #include + #else + #include #endif ],[ + #if !defined(STACK_FRAME_NON_STANDARD) + #error "STACK_FRAME_NON_STANDARD is not defined." 
+ #endif + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [ + AC_MSG_CHECKING( + [whether compile-time stack validation (objtool) is available]) + ZFS_LINUX_TEST_RESULT([objtool], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KERNEL_OBJTOOL, 1, [kernel does stack verification]) - ZFS_AC_KERNEL_STACK_FRAME_NON_STANDARD - ],[ - AC_MSG_RESULT(no) - ]) -]) - -dnl # -dnl # 4.6 API added STACK_FRAME_NON_STANDARD macro -dnl # -AC_DEFUN([ZFS_AC_KERNEL_STACK_FRAME_NON_STANDARD], [ - AC_MSG_CHECKING([whether STACK_FRAME_NON_STANDARD is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - #if !defined(STACK_FRAME_NON_STANDARD) - CTASSERT(1); - #endif - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_STACK_FRAME_NON_STANDARD, 1, - [STACK_FRAME_NON_STANDARD is defined]) + AC_MSG_CHECKING([whether STACK_FRAME_NON_STANDARD is defined]) + ZFS_LINUX_TEST_RESULT([stack_frame_non_standard], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_STACK_FRAME_NON_STANDARD, 1, + [STACK_FRAME_NON_STANDARD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-open-bdev-exclusive.m4 b/config/kernel-open-bdev-exclusive.m4 deleted file mode 100644 index 0661315a61..0000000000 --- a/config/kernel-open-bdev-exclusive.m4 +++ /dev/null @@ -1,18 +0,0 @@ -dnl # -dnl # 2.6.28 API change -dnl # open/close_bdev_excl() renamed to open/close_bdev_exclusive() -dnl # -AC_DEFUN([ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE], - [AC_MSG_CHECKING([whether open_bdev_exclusive() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - open_bdev_exclusive(NULL, 0, NULL); - ], [open_bdev_exclusive], [fs/block_dev.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_OPEN_BDEV_EXCLUSIVE, 1, - [open_bdev_exclusive() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-pde-data.m4 b/config/kernel-pde-data.m4 index 8aa4c2204e..f866d77a11 100644 --- a/config/kernel-pde-data.m4 +++ b/config/kernel-pde-data.m4 @@ -2,16 +2,19 @@ dnl # dnl # 3.10 API change, dnl # PDE is replaced by PDE_DATA dnl # 
-AC_DEFUN([ZFS_AC_KERNEL_PDE_DATA], [ - AC_MSG_CHECKING([whether PDE_DATA() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_PDE_DATA], [ + ZFS_LINUX_TEST_SRC([pde_data], [ #include ], [ PDE_DATA(NULL); - ], [PDE_DATA], [], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PDE_DATA, 1, [yes]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PDE_DATA], [ + AC_MSG_CHECKING([whether PDE_DATA() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([pde_data], [PDE_DATA], [], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([PDE_DATA]) ]) ]) diff --git a/config/kernel-percpu.m4 b/config/kernel-percpu.m4 new file mode 100644 index 0000000000..5125dd5c5b --- /dev/null +++ b/config/kernel-percpu.m4 @@ -0,0 +1,87 @@ +dnl # +dnl # 3.18 API change, +dnl # The function percpu_counter_init now must be passed a GFP mask. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_INIT], [ + ZFS_LINUX_TEST_SRC([percpu_counter_init_with_gfp], [ + #include + #include + ],[ + struct percpu_counter counter; + int error; + + error = percpu_counter_init(&counter, 0, GFP_KERNEL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PERCPU_COUNTER_INIT], [ + AC_MSG_CHECKING([whether percpu_counter_init() wants gfp_t]) + ZFS_LINUX_TEST_RESULT([percpu_counter_init_with_gfp], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PERCPU_COUNTER_INIT_WITH_GFP, 1, + [percpu_counter_init() wants gfp_t]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.13 API change, +dnl # __percpu_counter_add() was renamed to percpu_counter_add_batch(). 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_ADD_BATCH], [ + ZFS_LINUX_TEST_SRC([percpu_counter_add_batch], [ + #include + ],[ + struct percpu_counter counter; + + percpu_counter_add_batch(&counter, 1, 1); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PERCPU_COUNTER_ADD_BATCH], [ + AC_MSG_CHECKING([whether percpu_counter_add_batch() is defined]) + ZFS_LINUX_TEST_RESULT([percpu_counter_add_batch], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PERCPU_COUNTER_ADD_BATCH, 1, + [percpu_counter_add_batch() is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 5.10 API change, +dnl # The "count" was moved into ref->data, from ref +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU_REF_COUNT_IN_DATA], [ + ZFS_LINUX_TEST_SRC([percpu_ref_count_in_data], [ + #include + ],[ + struct percpu_ref_data d; + + atomic_long_set(&d.count, 1L); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PERCPU_REF_COUNT_IN_DATA], [ + AC_MSG_CHECKING([whether count is inside percpu_ref.data]) + ZFS_LINUX_TEST_RESULT([percpu_ref_count_in_data], [ + AC_MSG_RESULT(yes) + AC_DEFINE(ZFS_PERCPU_REF_COUNT_IN_DATA, 1, + [count is located in percpu_ref.data]) + ],[ + AC_MSG_RESULT(no) + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU], [ + ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_INIT + ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_ADD_BATCH + ZFS_AC_KERNEL_SRC_PERCPU_REF_COUNT_IN_DATA +]) + +AC_DEFUN([ZFS_AC_KERNEL_PERCPU], [ + ZFS_AC_KERNEL_PERCPU_COUNTER_INIT + ZFS_AC_KERNEL_PERCPU_COUNTER_ADD_BATCH + ZFS_AC_KERNEL_PERCPU_REF_COUNT_IN_DATA +]) diff --git a/config/kernel-proc-operations.m4 b/config/kernel-proc-operations.m4 new file mode 100644 index 0000000000..df216222ec --- /dev/null +++ b/config/kernel-proc-operations.m4 @@ -0,0 +1,41 @@ +dnl # +dnl # 5.6 API Change +dnl # The proc_ops structure was introduced to replace the use of +dnl # the file_operations structure when registering proc handlers. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_OPERATIONS], [ + ZFS_LINUX_TEST_SRC([proc_ops_struct], [ + #include + + int test_open(struct inode *ip, struct file *fp) { return 0; } + ssize_t test_read(struct file *fp, char __user *ptr, + size_t size, loff_t *offp) { return 0; } + ssize_t test_write(struct file *fp, const char __user *ptr, + size_t size, loff_t *offp) { return 0; } + loff_t test_lseek(struct file *fp, loff_t off, int flag) + { return 0; } + int test_release(struct inode *ip, struct file *fp) + { return 0; } + + const struct proc_ops test_ops __attribute__ ((unused)) = { + .proc_open = test_open, + .proc_read = test_read, + .proc_write = test_write, + .proc_lseek = test_lseek, + .proc_release = test_release, + }; + ], [ + struct proc_dir_entry *entry __attribute__ ((unused)) = + proc_create_data("test", 0444, NULL, &test_ops, NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PROC_OPERATIONS], [ + AC_MSG_CHECKING([whether proc_ops structure exists]) + ZFS_LINUX_TEST_RESULT([proc_ops_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PROC_OPS_STRUCT, 1, [proc_ops structure exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-put-link.m4 b/config/kernel-put-link.m4 index a0bb36ef27..4234861f33 100644 --- a/config/kernel-put-link.m4 +++ b/config/kernel-put-link.m4 @@ -1,17 +1,35 @@ dnl # dnl # Supported symlink APIs dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PUT_LINK], [ + ZFS_LINUX_TEST_SRC([put_link_cookie], [ + #include + void put_link(struct inode *ip, void *cookie) + { return; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .put_link = put_link, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([put_link_nameidata], [ + #include + void put_link(struct dentry *de, struct + nameidata *nd, void *ptr) { return; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .put_link = put_link, + }; + ],[]) +]) + AC_DEFUN([ZFS_AC_KERNEL_PUT_LINK], [ dnl # dnl # 4.5 API change dnl # get_link() uses delayed done, there is no put_link() 
interface. + dnl # This check initially uses the inode_operations_get_link result dnl # - ZFS_LINUX_TRY_COMPILE([ - #if !defined(HAVE_GET_LINK_DELAYED) - #error "Expecting get_link() delayed done" - #endif - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([inode_operations_get_link], [ AC_DEFINE(HAVE_PUT_LINK_DELAYED, 1, [iops->put_link() delayed]) ],[ dnl # @@ -19,41 +37,24 @@ AC_DEFUN([ZFS_AC_KERNEL_PUT_LINK], [ dnl # This kernel retired the nameidata structure. dnl # AC_MSG_CHECKING([whether iops->put_link() passes cookie]) - ZFS_LINUX_TRY_COMPILE([ - #include - void put_link(struct inode *ip, void *cookie) - { return; } - static struct inode_operations - iops __attribute__ ((unused)) = { - .put_link = put_link, - }; - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([put_link_cookie], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_PUT_LINK_COOKIE, 1, [iops->put_link() cookie]) ],[ + AC_MSG_RESULT(no) + dnl # dnl # 2.6.32 API dnl # - AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether iops->put_link() passes nameidata]) - ZFS_LINUX_TRY_COMPILE([ - #include - void put_link(struct dentry *de, struct - nameidata *nd, void *ptr) { return; } - static struct inode_operations - iops __attribute__ ((unused)) = { - .put_link = put_link, - }; - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([put_link_nameidata], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_PUT_LINK_NAMEIDATA, 1, [iops->put_link() nameidata]) ],[ - AC_MSG_ERROR(no; please file a bug report) + ZFS_LINUX_TEST_ERROR([put_link]) ]) ]) ]) diff --git a/config/kernel-rename.m4 b/config/kernel-rename.m4 index 9f894fb4db..302db43f57 100644 --- a/config/kernel-rename.m4 +++ b/config/kernel-rename.m4 @@ -1,11 +1,10 @@ -dnl # -dnl # 4.9 API change, -dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants -dnl # flags. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_RENAME_WANTS_FLAGS], [ - AC_MSG_CHECKING([whether iops->rename() wants flags]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ + dnl # + dnl # 4.9 API change, + dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants + dnl # flags. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [ #include int rename_fn(struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, @@ -15,11 +14,43 @@ AC_DEFUN([ZFS_AC_KERNEL_RENAME_WANTS_FLAGS], [ iops __attribute__ ((unused)) = { .rename = rename_fn, }; - ],[ - ],[ + ],[]) + + dnl # + dnl # 5.12 API change, + dnl # + dnl # Linux 5.12 introduced passing struct user_namespace* as the first argument + dnl # of the rename() and other inode_operations members. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_rename_userns], [ + #include + int rename_fn(struct user_namespace *user_ns, struct inode *sip, + struct dentry *sdp, struct inode *tip, struct dentry *tdp, + unsigned int flags) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .rename = rename_fn, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_RENAME], [ + AC_MSG_CHECKING([whether iops->rename() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, [iops->rename() wants flags]) + AC_DEFINE(HAVE_IOPS_RENAME_USERNS, 1, + [iops->rename() takes struct user_namespace*]) ],[ AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether iops->rename() wants flags]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, + [iops->rename() wants flags]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-revalidate-disk-size.m4 b/config/kernel-revalidate-disk-size.m4 new file mode 100644 index 0000000000..a7d0cb3cda --- /dev/null +++ b/config/kernel-revalidate-disk-size.m4 @@ -0,0 +1,46 @@ +dnl # 
+dnl # 5.11 API change +dnl # revalidate_disk_size() has been removed entirely. +dnl # +dnl # 5.10 API change +dnl # revalidate_disk() was replaced by revalidate_disk_size() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REVALIDATE_DISK], [ + + ZFS_LINUX_TEST_SRC([revalidate_disk_size], [ + #include + ], [ + struct gendisk *disk = NULL; + (void) revalidate_disk_size(disk, false); + ]) + + ZFS_LINUX_TEST_SRC([revalidate_disk], [ + #include + ], [ + struct gendisk *disk = NULL; + (void) revalidate_disk(disk); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REVALIDATE_DISK], [ + + AC_MSG_CHECKING([whether revalidate_disk_size() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([revalidate_disk_size], + [revalidate_disk_size], [block/genhd.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REVALIDATE_DISK_SIZE, 1, + [revalidate_disk_size() is available]) + ], [ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether revalidate_disk() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([revalidate_disk], + [revalidate_disk], [block/genhd.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REVALIDATE_DISK, 1, + [revalidate_disk() is available]) + ], [ + AC_MSG_RESULT(no) + ]) + ]) +]) diff --git a/config/kernel-rw.m4 b/config/kernel-rw.m4 index 1c8a265e0a..85b47d5c6f 100644 --- a/config/kernel-rw.m4 +++ b/config/kernel-rw.m4 @@ -3,11 +3,8 @@ dnl # 4.14 API change dnl # kernel_write() which was introduced in 3.9 was updated to take dnl # the offset as a pointer which is needed by vn_rdwr(). 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_WRITE], [ - AC_MSG_CHECKING([whether kernel_write() takes loff_t pointer]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITE], [ + ZFS_LINUX_TEST_SRC([kernel_write], [ #include ],[ struct file *file = NULL; @@ -17,14 +14,18 @@ AC_DEFUN([ZFS_AC_KERNEL_WRITE], [ ssize_t ret; ret = kernel_write(file, buf, count, pos); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_WRITE], [ + AC_MSG_CHECKING([whether kernel_write() takes loff_t pointer]) + ZFS_LINUX_TEST_RESULT([kernel_write], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KERNEL_WRITE_PPOS, 1, [kernel_write() take loff_t pointer]) ],[ AC_MSG_RESULT(no) ]) - EXTRA_KCFLAGS="$tmp_flags" ]) dnl # @@ -32,11 +33,8 @@ dnl # 4.14 API change dnl # kernel_read() which has existed for forever was updated to take dnl # the offset as a pointer which is needed by vn_rdwr(). dnl # -AC_DEFUN([ZFS_AC_KERNEL_READ], [ - AC_MSG_CHECKING([whether kernel_read() takes loff_t pointer]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_READ], [ + ZFS_LINUX_TEST_SRC([kernel_read], [ #include ],[ struct file *file = NULL; @@ -46,12 +44,26 @@ AC_DEFUN([ZFS_AC_KERNEL_READ], [ ssize_t ret; ret = kernel_read(file, buf, count, pos); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_READ], [ + AC_MSG_CHECKING([whether kernel_read() takes loff_t pointer]) + ZFS_LINUX_TEST_RESULT([kernel_read], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KERNEL_READ_PPOS, 1, [kernel_read() take loff_t pointer]) ],[ AC_MSG_RESULT(no) ]) - EXTRA_KCFLAGS="$tmp_flags" +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_RW], [ + ZFS_AC_KERNEL_SRC_WRITE + ZFS_AC_KERNEL_SRC_READ +]) + +AC_DEFUN([ZFS_AC_KERNEL_RW], [ + ZFS_AC_KERNEL_WRITE + ZFS_AC_KERNEL_READ ]) diff --git a/config/kernel-rwsem.m4 b/config/kernel-rwsem.m4 index 532c227181..d3a64a8efa 100644 --- a/config/kernel-rwsem.m4 +++ b/config/kernel-rwsem.m4 @@ -1,52 +1,26 @@ -dnl # -dnl # 3.1 API Change -dnl # 
-dnl # The rw_semaphore.wait_lock member was changed from spinlock_t to -dnl # raw_spinlock_t at commit ddb6c9b58a19edcfac93ac670b066c836ff729f1. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_RWSEM_SPINLOCK_IS_RAW], [ - AC_MSG_CHECKING([whether struct rw_semaphore member wait_lock is raw]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct rw_semaphore dummy_semaphore __attribute__ ((unused)); - raw_spinlock_t dummy_lock __attribute__ ((unused)) = - __RAW_SPIN_LOCK_INITIALIZER(dummy_lock); - dummy_semaphore.wait_lock = dummy_lock; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(RWSEM_SPINLOCK_IS_RAW, 1, - [struct rw_semaphore member wait_lock is raw_spinlock_t]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) - dnl # dnl # 3.16 API Change dnl # dnl # rwsem-spinlock "->activity" changed to "->count" dnl # -AC_DEFUN([ZFS_AC_KERNEL_RWSEM_ACTIVITY], [ - AC_MSG_CHECKING([whether struct rw_semaphore has member activity]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM_ACTIVITY], [ + ZFS_LINUX_TEST_SRC([rwsem_activity], [ #include ],[ struct rw_semaphore dummy_semaphore __attribute__ ((unused)); dummy_semaphore.activity = 0; - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_RWSEM_ACTIVITY], [ + AC_MSG_CHECKING([whether struct rw_semaphore has member activity]) + ZFS_LINUX_TEST_RESULT([rwsem_activity], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1, - [struct rw_semaphore has member activity]) + [struct rw_semaphore has member activity]) ],[ AC_MSG_RESULT(no) ]) - EXTRA_KCFLAGS="$tmp_flags" ]) dnl # @@ -54,22 +28,33 @@ dnl # 4.8 API Change dnl # dnl # rwsem "->count" changed to atomic_long_t type dnl # -AC_DEFUN([ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT], [ - AC_MSG_CHECKING( - [whether struct rw_semaphore has atomic_long_t member count]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ 
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM_ATOMIC_LONG_COUNT], [ + ZFS_LINUX_TEST_SRC([rwsem_atomic_long_count], [ #include ],[ DECLARE_RWSEM(dummy_semaphore); (void) atomic_long_read(&dummy_semaphore.count); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT], [ + AC_MSG_CHECKING( + [whether struct rw_semaphore has atomic_long_t member count]) + ZFS_LINUX_TEST_RESULT([rwsem_atomic_long_count], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_RWSEM_ATOMIC_LONG_COUNT, 1, - [struct rw_semaphore has atomic_long_t member count]) + [struct rw_semaphore has atomic_long_t member count]) ],[ AC_MSG_RESULT(no) ]) - EXTRA_KCFLAGS="$tmp_flags" +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM], [ + ZFS_AC_KERNEL_SRC_RWSEM_ACTIVITY + ZFS_AC_KERNEL_SRC_RWSEM_ATOMIC_LONG_COUNT +]) + +AC_DEFUN([ZFS_AC_KERNEL_RWSEM], [ + ZFS_AC_KERNEL_RWSEM_ACTIVITY + ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT ]) diff --git a/config/kernel-sched.m4 b/config/kernel-sched.m4 index 640b008aab..17e49fbdf4 100644 --- a/config/kernel-sched.m4 +++ b/config/kernel-sched.m4 @@ -2,18 +2,21 @@ dnl # dnl # 3.9 API change, dnl # Moved things from linux/sched.h to linux/sched/rt.h dnl # -AC_DEFUN([ZFS_AC_KERNEL_SCHED_RT_HEADER], - [AC_MSG_CHECKING([whether header linux/sched/rt.h exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SCHED_RT_HEADER], [ + ZFS_LINUX_TEST_SRC([sched_rt_header], [ #include #include ],[ return 0; - ],[ - AC_DEFINE(HAVE_SCHED_RT_HEADER, 1, [linux/sched/rt.h exists]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SCHED_RT_HEADER], [ + AC_MSG_CHECKING([whether header linux/sched/rt.h exists]) + ZFS_LINUX_TEST_RESULT([sched_rt_header], [ AC_MSG_RESULT(yes) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([sched_rt_header]) ]) ]) @@ -21,36 +24,59 @@ dnl # dnl # 4.11 API change, dnl # Moved things from linux/sched.h to linux/sched/signal.h dnl # -AC_DEFUN([ZFS_AC_KERNEL_SCHED_SIGNAL_HEADER], - [AC_MSG_CHECKING([whether header linux/sched/signal.h exists]) - ZFS_LINUX_TRY_COMPILE([ 
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SCHED_SIGNAL_HEADER], [ + ZFS_LINUX_TEST_SRC([sched_signal_header], [ #include #include ],[ return 0; - ],[ - AC_DEFINE(HAVE_SCHED_SIGNAL_HEADER, 1, [linux/sched/signal.h exists]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SCHED_SIGNAL_HEADER], [ + AC_MSG_CHECKING([whether header linux/sched/signal.h exists]) + ZFS_LINUX_TEST_RESULT([sched_signal_header], [ + AC_DEFINE(HAVE_SCHED_SIGNAL_HEADER, 1, + [linux/sched/signal.h exists]) AC_MSG_RESULT(yes) ],[ AC_MSG_RESULT(no) ]) ]) + dnl # dnl # 3.19 API change dnl # The io_schedule_timeout() function is present in all 2.6.32 kernels dnl # but it was not exported until Linux 3.19. The RHEL 7.x kernels which dnl # are based on a 3.10 kernel do export this symbol. dnl # -AC_DEFUN([ZFS_AC_KERNEL_IO_SCHEDULE_TIMEOUT], [ - AC_MSG_CHECKING([whether io_schedule_timeout() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_IO_SCHEDULE_TIMEOUT], [ + ZFS_LINUX_TEST_SRC([io_schedule_timeout], [ #include ], [ (void) io_schedule_timeout(1); - ], [io_schedule_timeout], [], [ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_IO_SCHEDULE_TIMEOUT], [ + AC_MSG_CHECKING([whether io_schedule_timeout() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([io_schedule_timeout], + [io_schedule_timeout], [], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IO_SCHEDULE_TIMEOUT, 1, [yes]) ],[ AC_MSG_RESULT(no) ]) ]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_SCHED], [ + ZFS_AC_KERNEL_SRC_SCHED_RT_HEADER + ZFS_AC_KERNEL_SRC_SCHED_SIGNAL_HEADER + ZFS_AC_KERNEL_SRC_IO_SCHEDULE_TIMEOUT +]) + +AC_DEFUN([ZFS_AC_KERNEL_SCHED], [ + ZFS_AC_KERNEL_SCHED_RT_HEADER + ZFS_AC_KERNEL_SCHED_SIGNAL_HEADER + ZFS_AC_KERNEL_IO_SCHEDULE_TIMEOUT +]) diff --git a/config/kernel-security-inode-init.m4 b/config/kernel-security-inode-init.m4 index a62176d42b..4e4bfd29b2 100644 --- a/config/kernel-security-inode-init.m4 +++ b/config/kernel-security-inode-init.m4 @@ -1,32 +1,3 @@ -dnl # -dnl # 2.6.39 API change -dnl # The security_inode_init_security() function now takes an 
additional -dnl # qstr argument which must be passed in from the dentry if available. -dnl # Passing a NULL is safe when no qstr is available the relevant -dnl # security checks will just be skipped. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_6ARGS_SECURITY_INODE_INIT_SECURITY], [ - AC_MSG_CHECKING([whether security_inode_init_security wants 6 args]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct inode *ip __attribute__ ((unused)) = NULL; - struct inode *dip __attribute__ ((unused)) = NULL; - const struct qstr *str __attribute__ ((unused)) = NULL; - char *name __attribute__ ((unused)) = NULL; - void *value __attribute__ ((unused)) = NULL; - size_t len __attribute__ ((unused)) = 0; - - security_inode_init_security(ip, dip, str, &name, &value, &len); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY, 1, - [security_inode_init_security wants 6 args]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - dnl # dnl # 3.2 API change dnl # The security_inode_init_security() API has been changed to include @@ -34,9 +5,8 @@ dnl # a filesystem specific callback to write security extended attributes. dnl # This was done to support the initialization of multiple LSM xattrs dnl # and the EVM xattr. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_CALLBACK_SECURITY_INODE_INIT_SECURITY], [ - AC_MSG_CHECKING([whether security_inode_init_security wants callback]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SECURITY_INODE_INIT_SECURITY_CALLBACK], [ + ZFS_LINUX_TEST_SRC([security_inode_init_security], [ #include ],[ struct inode *ip __attribute__ ((unused)) = NULL; @@ -45,11 +15,22 @@ AC_DEFUN([ZFS_AC_KERNEL_CALLBACK_SECURITY_INODE_INIT_SECURITY], [ initxattrs func __attribute__ ((unused)) = NULL; security_inode_init_security(ip, dip, str, func, NULL); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY, 1, - [security_inode_init_security wants callback]) - ],[ - AC_MSG_RESULT(no) ]) ]) + +AC_DEFUN([ZFS_AC_KERNEL_SECURITY_INODE_INIT_SECURITY_CALLBACK], [ + AC_MSG_CHECKING([whether security_inode_init_security wants callback]) + ZFS_LINUX_TEST_RESULT([security_inode_init_security], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([security_inode_init_security callback]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_SECURITY_INODE], [ + ZFS_AC_KERNEL_SRC_SECURITY_INODE_INIT_SECURITY_CALLBACK +]) + +AC_DEFUN([ZFS_AC_KERNEL_SECURITY_INODE], [ + ZFS_AC_KERNEL_SECURITY_INODE_INIT_SECURITY_CALLBACK +]) diff --git a/config/kernel-set-nlink.m4 b/config/kernel-set-nlink.m4 index f7ffc0d3a5..fa4f928b27 100644 --- a/config/kernel-set-nlink.m4 +++ b/config/kernel-set-nlink.m4 @@ -1,20 +1,22 @@ dnl # -dnl # Linux v3.2-rc1 API change -dnl # SHA: bfe8684869601dacfcb2cd69ef8cfd9045f62170 +dnl # Linux 3.2 API change +dnl # set_nlink() dnl # -AC_DEFUN([ZFS_AC_KERNEL_SET_NLINK], [ - AC_MSG_CHECKING([whether set_nlink() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_NLINK], [ + ZFS_LINUX_TEST_SRC([set_nlink], [ #include ],[ struct inode node; unsigned int link = 0; (void) set_nlink(&node, link); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SET_NLINK, 1, - [set_nlink() is available]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + 
+AC_DEFUN([ZFS_AC_KERNEL_SET_NLINK], [ + AC_MSG_CHECKING([whether set_nlink() is available]) + ZFS_LINUX_TEST_RESULT([set_nlink], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([set_nlink()]) ]) ]) diff --git a/config/kernel-setattr-prepare.m4 b/config/kernel-setattr-prepare.m4 index 32f7deb77a..24245aa534 100644 --- a/config/kernel-setattr-prepare.m4 +++ b/config/kernel-setattr-prepare.m4 @@ -1,23 +1,52 @@ -dnl # -dnl # 4.9 API change -dnl # The inode_change_ok() function has been renamed setattr_prepare() -dnl # and updated to take a dentry rather than an inode. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SETATTR_PREPARE], - [AC_MSG_CHECKING([whether setattr_prepare() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SETATTR_PREPARE], [ + dnl # + dnl # 4.9 API change + dnl # The inode_change_ok() function has been renamed setattr_prepare() + dnl # and updated to take a dentry rather than an inode. + dnl # + ZFS_LINUX_TEST_SRC([setattr_prepare], [ #include ], [ struct dentry *dentry = NULL; struct iattr *attr = NULL; - int error; + int error __attribute__ ((unused)) = + setattr_prepare(dentry, attr); + ]) - error = setattr_prepare(dentry, attr); - ], [setattr_prepare], [fs/attr.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SETATTR_PREPARE, 1, - [setattr_prepare() is available]) + dnl # + dnl # 5.12 API change + dnl # The setattr_prepare() function has been changed to accept a new argument + dnl # for struct user_namespace* + dnl # + ZFS_LINUX_TEST_SRC([setattr_prepare_userns], [ + #include ], [ - AC_MSG_RESULT(no) + struct dentry *dentry = NULL; + struct iattr *attr = NULL; + struct user_namespace *userns = NULL; + int error __attribute__ ((unused)) = + setattr_prepare(userns, dentry, attr); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SETATTR_PREPARE], [ + AC_MSG_CHECKING([whether setattr_prepare() is available and accepts struct user_namespace*]) + ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare_userns], + [setattr_prepare], [fs/attr.c], [ + 
AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SETATTR_PREPARE_USERNS, 1, + [setattr_prepare() accepts user_namespace]) + ], [ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether setattr_prepare() is available, doesn't accept user_namespace]) + ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare], + [setattr_prepare], [fs/attr.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SETATTR_PREPARE_NO_USERNS, 1, + [setattr_prepare() is available, doesn't accept user_namespace]) + ], [ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-sget-args.m4 b/config/kernel-sget-args.m4 index 9d1745925f..afa62c797d 100644 --- a/config/kernel-sget-args.m4 +++ b/config/kernel-sget-args.m4 @@ -2,9 +2,8 @@ dnl # dnl # 3.6 API change, dnl # 'sget' now takes the mount flags as an argument. dnl # -AC_DEFUN([ZFS_AC_KERNEL_5ARG_SGET], - [AC_MSG_CHECKING([whether sget() wants 5 args]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SGET], [ + ZFS_LINUX_TEST_SRC([sget_5args], [ #include ],[ struct file_system_type *type = NULL; @@ -13,11 +12,14 @@ AC_DEFUN([ZFS_AC_KERNEL_5ARG_SGET], int flags = 0; void *data = NULL; (void) sget(type, test, set, flags, data); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_5ARG_SGET, 1, [sget() wants 5 args]) - ],[ - AC_MSG_RESULT(no) ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SGET], [ + AC_MSG_CHECKING([whether sget() wants 5 args]) + ZFS_LINUX_TEST_RESULT([sget_5args], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([sget()]) + ]) +]) diff --git a/config/kernel-show-options.m4 b/config/kernel-show-options.m4 index 67d683c55e..93bd5fbfbb 100644 --- a/config/kernel-show-options.m4 +++ b/config/kernel-show-options.m4 @@ -1,22 +1,25 @@ dnl # dnl # Linux 3.3 API dnl # -AC_DEFUN([ZFS_AC_KERNEL_SHOW_OPTIONS], [ - AC_MSG_CHECKING([whether sops->show_options() wants dentry]) - - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHOW_OPTIONS], [ + ZFS_LINUX_TEST_SRC([super_operations_show_options], [ #include - int show_options (struct seq_file * x, struct dentry * y) { return 0; }; + int 
show_options(struct seq_file * x, struct dentry * y) { + return 0; + }; + static struct super_operations sops __attribute__ ((unused)) = { .show_options = show_options, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SHOW_OPTIONS], [ + AC_MSG_CHECKING([whether sops->show_options() wants dentry]) + ZFS_LINUX_TEST_RESULT([super_operations_show_options], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_SHOW_OPTIONS_WITH_DENTRY, 1, - [sops->show_options() with dentry]) ],[ - AC_MSG_RESULT([no]) + ZFS_LINUX_TEST_ERROR([sops->show_options()]) ]) ]) diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4 index 405cbf42cf..a40c86d5c5 100644 --- a/config/kernel-shrink.m4 +++ b/config/kernel-shrink.m4 @@ -4,9 +4,8 @@ dnl # The super_block structure now stores a per-filesystem shrinker. dnl # This interface is preferable because it can be used to specifically dnl # target only the zfs filesystem for pruning. dnl # -AC_DEFUN([ZFS_AC_KERNEL_SHRINK], [ - AC_MSG_CHECKING([whether super_block has s_shrink]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [ + ZFS_LINUX_TEST_SRC([super_block_s_shrink], [ #include int shrink(struct shrinker *s, struct shrink_control *sc) @@ -14,99 +13,18 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK], [ static const struct super_block sb __attribute__ ((unused)) = { - .s_shrink.shrink = shrink, .s_shrink.seeks = DEFAULT_SEEKS, .s_shrink.batch = 0, }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SHRINK, 1, [struct super_block has s_shrink]) - - ],[ - AC_MSG_RESULT(no) - ]) + ],[]) ]) -dnl # -dnl # 3.3 API change -dnl # The super_block structure was changed to use an hlist_node instead -dnl # of a list_head for the .s_instance linkage. -dnl # -dnl # This was done in part to resolve a race in the iterate_supers_type() -dnl # function which was introduced in Linux 3.0 kernel. The iterator -dnl # was supposed to provide a safe way to call an arbitrary function on -dnl # all super blocks of a specific type. 
Unfortunately, because a -dnl # list_head was used it was possible for iterate_supers_type() to -dnl # get stuck spinning a super block which was just deactivated. -dnl # -dnl # This can occur because when the list head is removed from the -dnl # fs_supers list it is reinitialized to point to itself. If the -dnl # iterate_supers_type() function happened to be processing the -dnl # removed list_head it will get stuck spinning on that list_head. -dnl # -dnl # To resolve the issue for existing 3.0 - 3.2 kernels we detect when -dnl # a list_head is used. Then to prevent the spinning from occurring -dnl # the .next pointer is set to the fs_supers list_head which ensures -dnl # the iterate_supers_type() function will always terminate. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_S_INSTANCES_LIST_HEAD], [ - AC_MSG_CHECKING([whether super_block has s_instances list_head]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct super_block sb __attribute__ ((unused)); - - INIT_LIST_HEAD(&sb.s_instances); - ],[ +AC_DEFUN([ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK], [ + AC_MSG_CHECKING([whether super_block has s_shrink]) + ZFS_LINUX_TEST_RESULT([super_block_s_shrink], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_S_INSTANCES_LIST_HEAD, 1, - [struct super_block has s_instances list_head]) ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_NR_CACHED_OBJECTS], [ - AC_MSG_CHECKING([whether sops->nr_cached_objects() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int nr_cached_objects(struct super_block *sb) { return 0; } - - static const struct super_operations - sops __attribute__ ((unused)) = { - .nr_cached_objects = nr_cached_objects, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_NR_CACHED_OBJECTS, 1, - [sops->nr_cached_objects() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_FREE_CACHED_OBJECTS], [ - AC_MSG_CHECKING([whether sops->free_cached_objects() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - - void free_cached_objects(struct super_block *sb, int x) - { 
return; } - - static const struct super_operations - sops __attribute__ ((unused)) = { - .free_cached_objects = free_cached_objects, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FREE_CACHED_OBJECTS, 1, - [sops->free_cached_objects() exists]) - ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([sb->s_shrink()]) ]) ]) @@ -115,15 +33,19 @@ dnl # 3.12 API change dnl # The nid member was added to struct shrink_control to support dnl # NUMA-aware shrinkers. dnl # -AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID], [ - AC_MSG_CHECKING([whether shrink_control has nid]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID], [ + ZFS_LINUX_TEST_SRC([shrink_control_nid], [ #include ],[ struct shrink_control sc __attribute__ ((unused)); unsigned long scnidsize __attribute__ ((unused)) = sizeof(sc.nid); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID], [ + AC_MSG_CHECKING([whether shrink_control has nid]) + ZFS_LINUX_TEST_RESULT([shrink_control_nid], [ AC_MSG_RESULT(yes) AC_DEFINE(SHRINK_CONTROL_HAS_NID, 1, [struct shrink_control has nid]) @@ -132,137 +54,98 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID], [ ]) ]) - -AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - dnl # - dnl # 2.6.23 to 2.6.34 API change - dnl # ->shrink(int nr_to_scan, gfp_t gfp_mask) - dnl # - AC_MSG_CHECKING([whether old 2-argument shrinker exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [ + ZFS_LINUX_TEST_SRC([shrinker_cb_shrink_control], [ #include - - int shrinker_cb(int nr_to_scan, gfp_t gfp_mask) { - return 0; - } + int shrinker_cb(struct shrinker *shrink, + struct shrink_control *sc) { return 0; } ],[ struct shrinker cache_shrinker = { .shrink = shrinker_cb, .seeks = DEFAULT_SEEKS, }; register_shrinker(&cache_shrinker); + ]) + + ZFS_LINUX_TEST_SRC([shrinker_cb_shrink_control_split], [ + #include + unsigned long shrinker_cb(struct shrinker *shrink, + struct 
shrink_control *sc) { return 0; } ],[ + struct shrinker cache_shrinker = { + .count_objects = shrinker_cb, + .scan_objects = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + register_shrinker(&cache_shrinker); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ + dnl # + dnl # 3.0 - 3.11 API change + dnl # ->shrink(struct shrinker *, struct shrink_control *sc) + dnl # + AC_MSG_CHECKING([whether new 2-argument shrinker exists]) + ZFS_LINUX_TEST_RESULT([shrinker_cb_shrink_control], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_2ARGS_OLD_SHRINKER_CALLBACK, 1, - [old shrinker callback wants 2 args]) + AC_DEFINE(HAVE_SINGLE_SHRINKER_CALLBACK, 1, + [new shrinker callback wants 2 args]) ],[ AC_MSG_RESULT(no) - dnl # - dnl # 2.6.35 - 2.6.39 API change - dnl # ->shrink(struct shrinker *, - dnl # int nr_to_scan, gfp_t gfp_mask) - dnl # - AC_MSG_CHECKING([whether old 3-argument shrinker exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - int shrinker_cb(struct shrinker *shrink, int nr_to_scan, - gfp_t gfp_mask) { - return 0; - } - ],[ - struct shrinker cache_shrinker = { - .shrink = shrinker_cb, - .seeks = DEFAULT_SEEKS, - }; - register_shrinker(&cache_shrinker); - ],[ + dnl # + dnl # 3.12 API change, + dnl # ->shrink() is logically split in to + dnl # ->count_objects() and ->scan_objects() + dnl # + AC_MSG_CHECKING([whether ->count_objects callback exists]) + ZFS_LINUX_TEST_RESULT([shrinker_cb_shrink_control_split], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1, - [old shrinker callback wants 3 args]) + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1, + [->count_objects exists]) ],[ - AC_MSG_RESULT(no) - dnl # - dnl # 3.0 - 3.11 API change - dnl # ->shrink(struct shrinker *, - dnl # struct shrink_control *sc) - dnl # - AC_MSG_CHECKING( - [whether new 2-argument shrinker exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int shrinker_cb(struct shrinker *shrink, - struct shrink_control *sc) { - return 0; - } - ],[ - struct shrinker cache_shrinker = { - .shrink = shrinker_cb, 
- .seeks = DEFAULT_SEEKS, - }; - register_shrinker(&cache_shrinker); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_2ARGS_NEW_SHRINKER_CALLBACK, 1, - [new shrinker callback wants 2 args]) - ],[ - AC_MSG_RESULT(no) - dnl # - dnl # 3.12 API change, - dnl # ->shrink() is logically split in to - dnl # ->count_objects() and ->scan_objects() - dnl # - AC_MSG_CHECKING( - [whether ->count_objects callback exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - - unsigned long shrinker_cb( - struct shrinker *shrink, - struct shrink_control *sc) { - return 0; - } - ],[ - struct shrinker cache_shrinker = { - .count_objects = shrinker_cb, - .scan_objects = shrinker_cb, - .seeks = DEFAULT_SEEKS, - }; - register_shrinker(&cache_shrinker); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, - 1, [->count_objects exists]) - ],[ - AC_MSG_ERROR(error) - ]) - ]) + ZFS_LINUX_TEST_ERROR([shrinker]) ]) ]) - EXTRA_KCFLAGS="$tmp_flags" ]) dnl # dnl # 2.6.39 API change, dnl # Shrinker adjust to use common shrink_control structure. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [ - AC_MSG_CHECKING([whether struct shrink_control exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT], [ + ZFS_LINUX_TEST_SRC([shrink_control_struct], [ #include ],[ struct shrink_control sc __attribute__ ((unused)); sc.nr_to_scan = 0; sc.gfp_mask = GFP_KERNEL; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SHRINK_CONTROL_STRUCT, 1, - [struct shrink_control exists]) - ],[ - AC_MSG_RESULT(no) ]) ]) + +AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [ + AC_MSG_CHECKING([whether struct shrink_control exists]) + ZFS_LINUX_TEST_RESULT([shrink_control_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SHRINK_CONTROL_STRUCT, 1, + [struct shrink_control exists]) + ],[ + ZFS_LINUX_TEST_ERROR([shrink_control]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [ + ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK + ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID + ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK + ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT +]) + +AC_DEFUN([ZFS_AC_KERNEL_SHRINKER], [ + ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK + ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID + ZFS_AC_KERNEL_SHRINKER_CALLBACK + ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT +]) diff --git a/config/kernel-siginfo.m4 b/config/kernel-siginfo.m4 new file mode 100644 index 0000000000..6ddb0dcc37 --- /dev/null +++ b/config/kernel-siginfo.m4 @@ -0,0 +1,21 @@ +dnl # +dnl # 4.20 API change +dnl # Added kernel_siginfo_t +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SIGINFO], [ + ZFS_LINUX_TEST_SRC([siginfo], [ + #include + ],[ + kernel_siginfo_t info __attribute__ ((unused)); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SIGINFO], [ + AC_MSG_CHECKING([whether kernel_siginfo_t tyepedef exists]) + ZFS_LINUX_TEST_RESULT([siginfo], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SIGINFO, 1, [kernel_siginfo_t exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-signal-stop.m4 b/config/kernel-signal-stop.m4 new file mode 100644 index 0000000000..6cb86e7c4c --- /dev/null +++ 
b/config/kernel-signal-stop.m4 @@ -0,0 +1,21 @@ +dnl # +dnl # 4.4 API change +dnl # Added kernel_signal_stop +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SIGNAL_STOP], [ + ZFS_LINUX_TEST_SRC([signal_stop], [ + #include + ],[ + kernel_signal_stop(); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SIGNAL_STOP], [ + AC_MSG_CHECKING([whether signal_stop() exists]) + ZFS_LINUX_TEST_RESULT([signal_stop], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SIGNAL_STOP, 1, [signal_stop() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-special-state.m4 b/config/kernel-special-state.m4 new file mode 100644 index 0000000000..aa60aabebc --- /dev/null +++ b/config/kernel-special-state.m4 @@ -0,0 +1,21 @@ +dnl # +dnl # 4.17 API change +dnl # Added set_special_state() function +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE], [ + ZFS_LINUX_TEST_SRC([set_special_state], [ + #include + ],[ + set_special_state(TASK_STOPPED); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SET_SPECIAL_STATE], [ + AC_MSG_CHECKING([whether set_special_state() exists]) + ZFS_LINUX_TEST_RESULT([set_special_state], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_SPECIAL_STATE, 1, [set_special_state() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-spinlock.m4 b/config/kernel-spinlock.m4 deleted file mode 100644 index d6d6640070..0000000000 --- a/config/kernel-spinlock.m4 +++ /dev/null @@ -1,24 +0,0 @@ -dnl # -dnl # 2.6.36 API change, -dnl # The 'struct fs_struct->lock' was changed from a rwlock_t to -dnl # a spinlock_t to improve the fastpath performance. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_FS_STRUCT_SPINLOCK], [ - AC_MSG_CHECKING([whether struct fs_struct uses spinlock_t]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - ZFS_LINUX_TRY_COMPILE([ - #include - #include - ],[ - static struct fs_struct fs; - spin_lock_init(&fs.lock); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FS_STRUCT_SPINLOCK, 1, - [struct fs_struct uses spinlock_t]) - ],[ - AC_MSG_RESULT(no) - ]) - EXTRA_KCFLAGS="$tmp_flags" -]) diff --git a/config/kernel-stdarg.m4 b/config/kernel-stdarg.m4 new file mode 100644 index 0000000000..5bc8dd859d --- /dev/null +++ b/config/kernel-stdarg.m4 @@ -0,0 +1,32 @@ +dnl # +dnl # Linux 5.15 gets rid of -isystem and external inclusion +dnl # and ships its own . Check if this header file does +dnl # exist and provide all necessary definitions for variable argument +dnl # functions. Adjust the inclusion of according to the +dnl # results. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG], [ + ZFS_LINUX_TEST_SRC([has_standalone_linux_stdarg], [ + #include + + #if !defined(va_start) || !defined(va_end) || \ + !defined(va_arg) || !defined(va_copy) + #error " is invalid" + #endif + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG], [ + dnl # + dnl # Linux 5.15 ships its own stdarg.h and doesn't allow to + dnl # include compiler headers. + dnl # + AC_MSG_CHECKING([whether standalone exists]) + ZFS_LINUX_TEST_RESULT([has_standalone_linux_stdarg], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_STANDALONE_LINUX_STDARG, 1, + [standalone exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel-submit_bio.m4 b/config/kernel-submit_bio.m4 deleted file mode 100644 index da5f85ca72..0000000000 --- a/config/kernel-submit_bio.m4 +++ /dev/null @@ -1,20 +0,0 @@ -dnl # -dnl # 4.8 API change -dnl # The rw argument has been removed from submit_bio/submit_bio_wait. -dnl # Callers are now expected to set bio->bi_rw instead of passing it in. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_SUBMIT_BIO], [ - AC_MSG_CHECKING([whether submit_bio() wants 1 arg]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - blk_qc_t blk_qc; - struct bio *bio = NULL; - blk_qc = submit_bio(bio); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_1ARG_SUBMIT_BIO, 1, [submit_bio() wants 1 arg]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-super-userns.m4 b/config/kernel-super-userns.m4 index de94ad967a..1ad35f2d19 100644 --- a/config/kernel-super-userns.m4 +++ b/config/kernel-super-userns.m4 @@ -3,15 +3,19 @@ dnl # 4.8 API change dnl # struct user_namespace was added to struct super_block as dnl # super->s_user_ns member dnl # -AC_DEFUN([ZFS_AC_KERNEL_SUPER_USER_NS], [ - AC_MSG_CHECKING([whether super_block->s_user_ns exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_USER_NS], [ + ZFS_LINUX_TEST_SRC([super_user_ns], [ #include #include - ],[ + ], [ struct super_block super; super.s_user_ns = (struct user_namespace *)NULL; - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SUPER_USER_NS], [ + AC_MSG_CHECKING([whether super_block->s_user_ns exists]) + ZFS_LINUX_TEST_RESULT([super_user_ns], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_SUPER_USER_NS, 1, [super_block->s_user_ns exists]) diff --git a/config/kernel-symlink.m4 b/config/kernel-symlink.m4 new file mode 100644 index 0000000000..d90366d04b --- /dev/null +++ b/config/kernel-symlink.m4 @@ -0,0 +1,30 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SYMLINK], [ + dnl # + dnl # 5.12 API change that added the struct user_namespace* arg + dnl # to the front of this function type's arg list. 
+ dnl # + ZFS_LINUX_TEST_SRC([symlink_userns], [ + #include + #include + + int tmp_symlink(struct user_namespace *userns, + struct inode *inode ,struct dentry *dentry, + const char *path) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .symlink = tmp_symlink, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SYMLINK], [ + AC_MSG_CHECKING([whether iops->symlink() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([symlink_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_SYMLINK_USERNS, 1, + [iops->symlink() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4 index b0e1afa153..403cff3f41 100644 --- a/config/kernel-timer.m4 +++ b/config/kernel-timer.m4 @@ -6,15 +6,11 @@ dnl # (older kernels). Also sanity check the from_timer() and timer_setup() dnl # macros are available as well, since they will be used in the same newer dnl # kernels that support the new timer_list.func signature. dnl # -dnl # Also check for the existance of flags in struct timer_list, they were +dnl # Also check for the existence of flags in struct timer_list, they were dnl # added in 4.1-rc8 via 0eeda71bc30d. 
- -AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [ - AC_MSG_CHECKING([whether timer_setup() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" - - ZFS_LINUX_TRY_COMPILE([ +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_TIMER_SETUP], [ + ZFS_LINUX_TEST_SRC([timer_setup], [ #include struct my_task_timer { @@ -24,13 +20,34 @@ AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [ void task_expire(struct timer_list *tl) { - struct my_task_timer *task_timer = from_timer(task_timer, tl, timer); + struct my_task_timer *task_timer = + from_timer(task_timer, tl, timer); task_timer->data = 42; } ],[ struct my_task_timer task_timer; timer_setup(&task_timer.timer, task_expire, 0); + ]) + + ZFS_LINUX_TEST_SRC([timer_list_function], [ + #include + void task_expire(struct timer_list *tl) {} ],[ + struct timer_list tl; + tl.function = task_expire; + ]) + + ZFS_LINUX_TEST_SRC([timer_list_flags], [ + #include + ],[ + struct timer_list tl; + tl.flags = 2; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [ + AC_MSG_CHECKING([whether timer_setup() is available]) + ZFS_LINUX_TEST_RESULT([timer_setup], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KERNEL_TIMER_SETUP, 1, [timer_setup() is available]) @@ -39,14 +56,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [ ]) AC_MSG_CHECKING([whether timer function expects timer_list]) - - ZFS_LINUX_TRY_COMPILE([ - #include - void task_expire(struct timer_list *tl) {} - ],[ - struct timer_list tl; - tl.function = task_expire; - ],[ + ZFS_LINUX_TEST_RESULT([timer_list_function], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST, 1, [timer_list.function gets a timer_list]) @@ -55,19 +65,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [ ]) AC_MSG_CHECKING([whether struct timer_list has flags]) - - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct timer_list tl; - tl.flags = 2; - ],[ + ZFS_LINUX_TEST_RESULT([timer_list_flags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_KERNEL_TIMER_LIST_FLAGS, 1, [struct timer_list has a flags member]) ],[ AC_MSG_RESULT(no) ]) - - 
EXTRA_KCFLAGS="$tmp_flags" ]) diff --git a/config/kernel-tmpfile.m4 b/config/kernel-tmpfile.m4 index 5aad90450e..45c2e6ceea 100644 --- a/config/kernel-tmpfile.m4 +++ b/config/kernel-tmpfile.m4 @@ -2,22 +2,44 @@ dnl # dnl # 3.11 API change dnl # Add support for i_op->tmpfile dnl # -AC_DEFUN([ZFS_AC_KERNEL_TMPFILE], [ - AC_MSG_CHECKING([whether i_op->tmpfile() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [ + dnl # + dnl # 5.11 API change + dnl # add support for userns parameter to tmpfile + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_tmpfile_userns], [ #include - int tmpfile(struct inode *inode, struct dentry *dentry, + int tmpfile(struct user_namespace *userns, + struct inode *inode, struct dentry *dentry, umode_t mode) { return 0; } static struct inode_operations iops __attribute__ ((unused)) = { .tmpfile = tmpfile, }; - ],[ - ],[ + ],[]) + ZFS_LINUX_TEST_SRC([inode_operations_tmpfile], [ + #include + int tmpfile(struct inode *inode, struct dentry *dentry, + umode_t mode) { return 0; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .tmpfile = tmpfile, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_TMPFILE], [ + AC_MSG_CHECKING([whether i_op->tmpfile() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_TMPFILE, 1, - [i_op->tmpfile() exists]) + AC_DEFINE(HAVE_TMPFILE, 1, [i_op->tmpfile() exists]) + AC_DEFINE(HAVE_TMPFILE_USERNS, 1, [i_op->tmpfile() has userns]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_TMPFILE, 1, [i_op->tmpfile() exists]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-totalhigh_pages.m4 b/config/kernel-totalhigh_pages.m4 index b22e86d4db..4ecb03a50a 100644 --- a/config/kernel-totalhigh_pages.m4 +++ b/config/kernel-totalhigh_pages.m4 @@ -1,16 +1,18 @@ dnl # dnl # 5.0 API change dnl # -dnl # ca79b0c211af mm: convert totalram_pages and totalhigh_pages 
variables to atomic -dnl # -AC_DEFUN([ZFS_AC_KERNEL_TOTALHIGH_PAGES], [ - AC_MSG_CHECKING([whether totalhigh_pages() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES], [ + ZFS_LINUX_TEST_SRC([totalhigh_pages], [ #include ],[ unsigned long pages __attribute__ ((unused)); pages = totalhigh_pages(); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_TOTALHIGH_PAGES], [ + AC_MSG_CHECKING([whether totalhigh_pages() exists]) + ZFS_LINUX_TEST_RESULT([totalhigh_pages], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_TOTALHIGH_PAGES, 1, [totalhigh_pages() exists]) ],[ diff --git a/config/kernel-totalram-pages-func.m4 b/config/kernel-totalram-pages-func.m4 index a6eac64543..d0e812a8d2 100644 --- a/config/kernel-totalram-pages-func.m4 +++ b/config/kernel-totalram-pages-func.m4 @@ -2,16 +2,21 @@ dnl # dnl # Linux 5.0: totalram_pages is no longer a global variable, and must be dnl # read via the totalram_pages() helper function. dnl # -AC_DEFUN([ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC], [ - AC_MSG_CHECKING([whether totalram_pages() exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC], [ + ZFS_LINUX_TEST_SRC([totalram_pages], [ #include ],[ unsigned long pages __attribute__ ((unused)); pages = totalram_pages(); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC], [ + AC_MSG_CHECKING([whether totalram_pages() exists]) + ZFS_LINUX_TEST_RESULT([totalram_pages], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_TOTALRAM_PAGES_FUNC, 1, [kernel has totalram_pages()]) + AC_DEFINE(HAVE_TOTALRAM_PAGES_FUNC, 1, + [kernel has totalram_pages()]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-truncate-range.m4 b/config/kernel-truncate-range.m4 deleted file mode 100644 index da2cb50fcb..0000000000 --- a/config/kernel-truncate-range.m4 +++ /dev/null @@ -1,24 +0,0 @@ -dnl # -dnl # 3.5.0 API change -dnl # torvalds/linux@17cf28afea2a1112f240a3a2da8af883be024811 removed -dnl # truncate_range(). 
The file hole punching functionality is now -dnl # provided by fallocate() -dnl # -AC_DEFUN([ZFS_AC_KERNEL_TRUNCATE_RANGE], [ - AC_MSG_CHECKING([whether iops->truncate_range() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - void truncate_range(struct inode *inode, loff_t start, - loff_t end) { return; } - static struct inode_operations iops __attribute__ ((unused)) = { - .truncate_range = truncate_range, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INODE_TRUNCATE_RANGE, 1, - [iops->truncate_range() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-truncate-setsize.m4 b/config/kernel-truncate-setsize.m4 index 7e4aff479a..76c82ef302 100644 --- a/config/kernel-truncate-setsize.m4 +++ b/config/kernel-truncate-setsize.m4 @@ -2,17 +2,20 @@ dnl # dnl # 2.6.35 API change dnl # Added truncate_setsize() helper function. dnl # -AC_DEFUN([ZFS_AC_KERNEL_TRUNCATE_SETSIZE], - [AC_MSG_CHECKING([whether truncate_setsize() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE], [ + ZFS_LINUX_TEST_SRC([truncate_setsize], [ #include ], [ truncate_setsize(NULL, 0); - ], [truncate_setsize], [mm/truncate.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_TRUNCATE_SETSIZE, 1, - [truncate_setsize() is available]) - ], [ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_TRUNCATE_SETSIZE], [ + AC_MSG_CHECKING([whether truncate_setsize() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([truncate_setsize], + [truncate_setsize], [mm/truncate.c], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([truncate_setsize]) ]) ]) diff --git a/config/kernel-userns-capabilities.m4 b/config/kernel-userns-capabilities.m4 index fa3381978b..026503623a 100644 --- a/config/kernel-userns-capabilities.m4 +++ b/config/kernel-userns-capabilities.m4 @@ -2,16 +2,45 @@ dnl # dnl # 2.6.38 API change dnl # ns_capable() was introduced dnl # -AC_DEFUN([ZFS_AC_KERNEL_NS_CAPABLE], [ - AC_MSG_CHECKING([whether ns_capable exists]) - ZFS_LINUX_TRY_COMPILE([ 
+AC_DEFUN([ZFS_AC_KERNEL_SRC_NS_CAPABLE], [ + ZFS_LINUX_TEST_SRC([ns_capable], [ #include ],[ ns_capable((struct user_namespace *)NULL, CAP_SYS_ADMIN); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_NS_CAPABLE], [ + AC_MSG_CHECKING([whether ns_capable exists]) + ZFS_LINUX_TEST_RESULT([ns_capable], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_NS_CAPABLE, 1, - [ns_capable exists]) + ],[ + ZFS_LINUX_TEST_ERROR([ns_capable()]) + ]) +]) + +dnl # +dnl # 4.10 API change +dnl # has_capability() was exported. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_HAS_CAPABILITY], [ + ZFS_LINUX_TEST_SRC([has_capability], [ + #include + ],[ + struct task_struct *task = NULL; + int cap = 0; + bool result __attribute__ ((unused)); + + result = has_capability(task, cap); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_HAS_CAPABILITY], [ + AC_MSG_CHECKING([whether has_capability() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([has_capability], + [has_capability], [kernel/capability.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_HAS_CAPABILITY, 1, [has_capability() is available]) ],[ AC_MSG_RESULT(no) ]) @@ -19,23 +48,23 @@ AC_DEFUN([ZFS_AC_KERNEL_NS_CAPABLE], [ dnl # dnl # 2.6.39 API change -dnl # struct user_namespace was added to struct cred_t as -dnl # cred->user_ns member -dnl # Note that current_user_ns() was added in 2.6.28. 
+dnl # struct user_namespace was added to struct cred_t as cred->user_ns member dnl # -AC_DEFUN([ZFS_AC_KERNEL_CRED_USER_NS], [ - AC_MSG_CHECKING([whether cred_t->user_ns exists]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_CRED_USER_NS], [ + ZFS_LINUX_TEST_SRC([cred_user_ns], [ #include ],[ struct cred cr; cr.user_ns = (struct user_namespace *)NULL; - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CRED_USER_NS], [ + AC_MSG_CHECKING([whether cred_t->user_ns exists]) + ZFS_LINUX_TEST_RESULT([cred_user_ns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CRED_USER_NS, 1, - [cred_t->user_ns exists]) ],[ - AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([cred_t->user_ns()]) ]) ]) @@ -44,24 +73,34 @@ dnl # 3.4 API change dnl # kuid_has_mapping() and kgid_has_mapping() were added to distinguish dnl # between internal kernel uids/gids and user namespace uids/gids. dnl # -AC_DEFUN([ZFS_AC_KERNEL_KUID_HAS_MAPPING], [ - AC_MSG_CHECKING([whether kuid_has_mapping/kgid_has_mapping exist]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_KUID_HAS_MAPPING], [ + ZFS_LINUX_TEST_SRC([kuid_has_mapping], [ #include ],[ kuid_has_mapping((struct user_namespace *)NULL, KUIDT_INIT(0)); kgid_has_mapping((struct user_namespace *)NULL, KGIDT_INIT(0)); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_KUID_HAS_MAPPING, 1, - [kuid_has_mapping/kgid_has_mapping exist]) - ],[ - AC_MSG_RESULT(no) ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_KUID_HAS_MAPPING], [ + AC_MSG_CHECKING([whether kuid_has_mapping/kgid_has_mapping exist]) + ZFS_LINUX_TEST_RESULT([kuid_has_mapping], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([kuid_has_mapping()]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES], [ + ZFS_AC_KERNEL_SRC_NS_CAPABLE + ZFS_AC_KERNEL_SRC_HAS_CAPABILITY + ZFS_AC_KERNEL_SRC_CRED_USER_NS + ZFS_AC_KERNEL_SRC_KUID_HAS_MAPPING +]) + AC_DEFUN([ZFS_AC_KERNEL_USERNS_CAPABILITIES], [ ZFS_AC_KERNEL_NS_CAPABLE + ZFS_AC_KERNEL_HAS_CAPABILITY ZFS_AC_KERNEL_CRED_USER_NS ZFS_AC_KERNEL_KUID_HAS_MAPPING ]) diff --git 
a/config/kernel-urange-sleep.m4 b/config/kernel-usleep_range.m4 similarity index 61% rename from config/kernel-urange-sleep.m4 rename to config/kernel-usleep_range.m4 index b5764de3ed..06eb381a3c 100644 --- a/config/kernel-urange-sleep.m4 +++ b/config/kernel-usleep_range.m4 @@ -1,21 +1,23 @@ dnl # -dnl # 2.6.36 API compatibility. -dnl # Added usleep_range timer. +dnl # 2.6.36 API compatibility- Added usleep_range timer. +dnl # dnl # usleep_range is a finer precision implementation of msleep dnl # designed to be a drop-in replacement for udelay where a precise dnl # sleep / busy-wait is unnecessary. dnl # -AC_DEFUN([ZFS_AC_KERNEL_USLEEP_RANGE], [ - AC_MSG_CHECKING([whether usleep_range() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_USLEEP_RANGE], [ + ZFS_LINUX_TEST_SRC([usleep_range], [ #include ],[ usleep_range(0, 0); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_USLEEP_RANGE, 1, - [usleep_range is available]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_USLEEP_RANGE], [ + AC_MSG_CHECKING([whether usleep_range() is available]) + ZFS_LINUX_TEST_RESULT([usleep_range], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([usleep_range()]) ]) ]) diff --git a/config/kernel-vfs-direct_IO.m4 b/config/kernel-vfs-direct_IO.m4 index cc50bfbe4e..82583d52fc 100644 --- a/config/kernel-vfs-direct_IO.m4 +++ b/config/kernel-vfs-direct_IO.m4 @@ -1,9 +1,8 @@ dnl # -dnl # Linux 4.6.x API change +dnl # Check for direct IO interfaces. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER], [ - AC_MSG_CHECKING([whether aops->direct_IO() uses iov_iter]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ + ZFS_LINUX_TEST_SRC([direct_io_iter], [ #include ssize_t test_direct_IO(struct kiocb *kiocb, @@ -13,24 +12,9 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER], [ aops __attribute__ ((unused)) = { .direct_IO = test_direct_IO, }; - ],[ - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER, 1, - [aops->direct_IO() uses iov_iter without rw]) - zfs_ac_direct_io="yes" - ],[ - AC_MSG_RESULT([no]) - ]) -]) + ],[]) -dnl # -dnl # Linux 4.1.x API change -dnl # -AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER_OFFSET], [ - AC_MSG_CHECKING( - [whether aops->direct_IO() uses iov_iter with offset]) - ZFS_LINUX_TRY_COMPILE([ + ZFS_LINUX_TEST_SRC([direct_io_iter_offset], [ #include ssize_t test_direct_IO(struct kiocb *kiocb, @@ -40,24 +24,9 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER_OFFSET], [ aops __attribute__ ((unused)) = { .direct_IO = test_direct_IO, }; - ],[ - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_OFFSET, 1, - [aops->direct_IO() uses iov_iter with offset]) - zfs_ac_direct_io="yes" - ],[ - AC_MSG_RESULT([no]) - ]) -]) + ],[]) -dnl # -dnl # Linux 3.16.x API change -dnl # -AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER_RW_OFFSET], [ - AC_MSG_CHECKING( - [whether aops->direct_IO() uses iov_iter with rw and offset]) - ZFS_LINUX_TRY_COMPILE([ + ZFS_LINUX_TEST_SRC([direct_io_iter_rw_offset], [ #include ssize_t test_direct_IO(int rw, struct kiocb *kiocb, @@ -67,23 +36,9 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER_RW_OFFSET], [ aops __attribute__ ((unused)) = { .direct_IO = test_direct_IO, }; - ],[ - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET, 1, - [aops->direct_IO() uses iov_iter with rw and offset]) - zfs_ac_direct_io="yes" - ],[ - AC_MSG_RESULT([no]) - ]) -]) + ],[]) -dnl # -dnl # Ancient Linux API (predates git) -dnl # 
-AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_IOVEC], [ - AC_MSG_CHECKING([whether aops->direct_IO() uses iovec]) - ZFS_LINUX_TRY_COMPILE([ + ZFS_LINUX_TEST_SRC([direct_io_iovec], [ #include ssize_t test_direct_IO(int rw, struct kiocb *kiocb, @@ -94,37 +49,61 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO_IOVEC], [ aops __attribute__ ((unused)) = { .direct_IO = test_direct_IO, }; - ],[ - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_DIRECT_IO_IOVEC, 1, - [aops->direct_IO() uses iovec]) - zfs_ac_direct_io="yes" - ],[ - AC_MSG_RESULT([no]) - ]) + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO], [ - zfs_ac_direct_io="no" + dnl # + dnl # Linux 4.6.x API change + dnl # + AC_MSG_CHECKING([whether aops->direct_IO() uses iov_iter]) + ZFS_LINUX_TEST_RESULT([direct_io_iter], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER, 1, + [aops->direct_IO() uses iov_iter without rw]) + ],[ + AC_MSG_RESULT([no]) - if test "$zfs_ac_direct_io" = "no"; then - ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER - fi + dnl # + dnl # Linux 4.1.x API change + dnl # + AC_MSG_CHECKING( + [whether aops->direct_IO() uses offset]) + ZFS_LINUX_TEST_RESULT([direct_io_iter_offset], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_OFFSET, 1, + [aops->direct_IO() uses iov_iter with offset]) - if test "$zfs_ac_direct_io" = "no"; then - ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER_OFFSET - fi + ],[ + AC_MSG_RESULT([no]) - if test "$zfs_ac_direct_io" = "no"; then - ZFS_AC_KERNEL_VFS_DIRECT_IO_ITER_RW_OFFSET - fi + dnl # + dnl # Linux 3.16.x API change + dnl # + AC_MSG_CHECKING( + [whether aops->direct_IO() uses rw and offset]) + ZFS_LINUX_TEST_RESULT([direct_io_iter_rw_offset], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET, 1, + [aops->direct_IO() uses iov_iter with ] + [rw and offset]) + ],[ + AC_MSG_RESULT([no]) - if test "$zfs_ac_direct_io" = "no"; then - ZFS_AC_KERNEL_VFS_DIRECT_IO_IOVEC - fi - - if test "$zfs_ac_direct_io" = "no"; then - AC_MSG_ERROR([no; unknown direct IO interface]) - fi + 
dnl # + dnl # Ancient Linux API (predates git) + dnl # + AC_MSG_CHECKING( + [whether aops->direct_IO() uses iovec]) + ZFS_LINUX_TEST_RESULT([direct_io_iovec], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_DIRECT_IO_IOVEC, 1, + [aops->direct_IO() uses iovec]) + ],[ + ZFS_LINUX_TEST_ERROR([direct IO]) + AC_MSG_RESULT([no]) + ]) + ]) + ]) + ]) ]) diff --git a/config/kernel-vfs-fsync.m4 b/config/kernel-vfs-fsync.m4 index a474f9f174..159efca453 100644 --- a/config/kernel-vfs-fsync.m4 +++ b/config/kernel-vfs-fsync.m4 @@ -2,16 +2,19 @@ dnl # dnl # 2.6.35 API change, dnl # Unused 'struct dentry *' removed from vfs_fsync() prototype. dnl # -AC_DEFUN([ZFS_AC_KERNEL_2ARGS_VFS_FSYNC], [ - AC_MSG_CHECKING([whether vfs_fsync() wants 2 args]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS], [ + ZFS_LINUX_TEST_SRC([vfs_fsync_2args], [ #include ],[ vfs_fsync(NULL, 0); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_2ARGS_VFS_FSYNC, 1, [vfs_fsync() wants 2 args]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_FSYNC_2ARGS], [ + AC_MSG_CHECKING([whether vfs_fsync() wants 2 args]) + ZFS_LINUX_TEST_RESULT([vfs_fsync_2args], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([vfs_fsync()]) ]) ]) diff --git a/config/kernel-vfs-getattr.m4 b/config/kernel-vfs-getattr.m4 index b13723538f..eb07853cc4 100644 --- a/config/kernel-vfs-getattr.m4 +++ b/config/kernel-vfs-getattr.m4 @@ -2,19 +2,23 @@ dnl # dnl # 4.11 API, a528d35e@torvalds/linux dnl # vfs_getattr(const struct path *p, struct kstat *s, u32 m, unsigned int f) dnl # -AC_DEFUN([ZFS_AC_KERNEL_4ARGS_VFS_GETATTR], [ - AC_MSG_CHECKING([whether vfs_getattr() wants 4 args]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR_4ARGS], [ + ZFS_LINUX_TEST_SRC([vfs_getattr_4args], [ #include ],[ vfs_getattr((const struct path *)NULL, (struct kstat *)NULL, (u32)0, (unsigned int)0); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR_4ARGS], [ + AC_MSG_CHECKING([whether vfs_getattr() wants 4 
args]) + ZFS_LINUX_TEST_RESULT([vfs_getattr_4args], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_4ARGS_VFS_GETATTR, 1, - [vfs_getattr wants 4 args]) + [vfs_getattr wants 4 args]) ],[ AC_MSG_RESULT(no) ]) @@ -24,17 +28,21 @@ dnl # dnl # 3.9 API dnl # vfs_getattr(struct path *p, struct kstat *s) dnl # -AC_DEFUN([ZFS_AC_KERNEL_2ARGS_VFS_GETATTR], [ - AC_MSG_CHECKING([whether vfs_getattr() wants 2 args]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR_2ARGS], [ + ZFS_LINUX_TEST_SRC([vfs_getattr_2args], [ #include ],[ vfs_getattr((struct path *) NULL, (struct kstat *)NULL); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR_2ARGS], [ + AC_MSG_CHECKING([whether vfs_getattr() wants 2 args]) + ZFS_LINUX_TEST_RESULT([vfs_getattr_2args], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_2ARGS_VFS_GETATTR, 1, - [vfs_getattr wants 2 args]) + [vfs_getattr wants 2 args]) ],[ AC_MSG_RESULT(no) ]) @@ -44,19 +52,35 @@ dnl # dnl # <3.9 API dnl # vfs_getattr(struct vfsmount *v, struct dentry *d, struct kstat *k) dnl # -AC_DEFUN([ZFS_AC_KERNEL_3ARGS_VFS_GETATTR], [ - AC_MSG_CHECKING([whether vfs_getattr() wants 3 args]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR_3ARGS], [ + ZFS_LINUX_TEST_SRC([vfs_getattr_3args], [ #include ],[ vfs_getattr((struct vfsmount *)NULL, (struct dentry *)NULL, (struct kstat *)NULL); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR_3ARGS], [ + AC_MSG_CHECKING([whether vfs_getattr() wants 3 args]) + ZFS_LINUX_TEST_RESULT([vfs_getattr_3args], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_3ARGS_VFS_GETATTR, 1, - [vfs_getattr wants 3 args]) + [vfs_getattr wants 3 args]) ],[ AC_MSG_RESULT(no) ]) ]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR], [ + ZFS_AC_KERNEL_SRC_VFS_GETATTR_4ARGS + ZFS_AC_KERNEL_SRC_VFS_GETATTR_2ARGS + ZFS_AC_KERNEL_SRC_VFS_GETATTR_3ARGS +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR], [ + ZFS_AC_KERNEL_VFS_GETATTR_4ARGS + ZFS_AC_KERNEL_VFS_GETATTR_2ARGS + ZFS_AC_KERNEL_VFS_GETATTR_3ARGS +]) diff --git 
a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 new file mode 100644 index 0000000000..bee6d0be96 --- /dev/null +++ b/config/kernel-vfs-iov_iter.m4 @@ -0,0 +1,162 @@ +dnl # +dnl # Check for available iov_iter functionality. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ + ZFS_LINUX_TEST_SRC([iov_iter_types], [ + #include + #include + ],[ + int type __attribute__ ((unused)) = + ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE; + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_advance], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t advance = 512; + + iov_iter_advance(&iter, advance); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_revert], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t revert = 512; + + iov_iter_revert(&iter, revert); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_fault_in_readable], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t size = 512; + int error __attribute__ ((unused)); + + error = iov_iter_fault_in_readable(&iter, size); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_count], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t bytes __attribute__ ((unused)); + + bytes = iov_iter_count(&iter); + ]) + + ZFS_LINUX_TEST_SRC([copy_to_iter], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + char buf[512] = { 0 }; + size_t size = 512; + size_t bytes __attribute__ ((unused)); + + bytes = copy_to_iter((const void *)&buf, size, &iter); + ]) + + ZFS_LINUX_TEST_SRC([copy_from_iter], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + char buf[512] = { 0 }; + size_t size = 512; + size_t bytes __attribute__ ((unused)); + + bytes = copy_from_iter((void *)&buf, size, &iter); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ + enable_vfs_iov_iter="yes" + + AC_MSG_CHECKING([whether iov_iter types are available]) + ZFS_LINUX_TEST_RESULT([iov_iter_types], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_TYPES, 1, + [iov_iter types are available]) + ],[ + 
AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether iov_iter_advance() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_advance], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_ADVANCE, 1, + [iov_iter_advance() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether iov_iter_revert() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_revert], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_REVERT, 1, + [iov_iter_revert() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether iov_iter_fault_in_readable() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_fault_in_readable], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_FAULT_IN_READABLE, 1, + [iov_iter_fault_in_readable() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether iov_iter_count() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_count], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_COUNT, 1, + [iov_iter_count() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether copy_to_iter() is available]) + ZFS_LINUX_TEST_RESULT([copy_to_iter], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_COPY_TO_ITER, 1, + [copy_to_iter() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether copy_from_iter() is available]) + ZFS_LINUX_TEST_RESULT([copy_from_iter], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_COPY_FROM_ITER, 1, + [copy_from_iter() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + dnl # + dnl # As of the 4.9 kernel support is provided for iovecs, kvecs, + dnl # bvecs and pipes in the iov_iter structure. As long as the + dnl # other support interfaces are all available the iov_iter can + dnl # be correctly used in the uio structure. 
+ dnl # + AS_IF([test "x$enable_vfs_iov_iter" = "xyes"], [ + AC_DEFINE(HAVE_VFS_IOV_ITER, 1, + [All required iov_iter interfaces are available]) + ]) +]) diff --git a/config/kernel-vfs-iterate.m4 b/config/kernel-vfs-iterate.m4 index 5de901d446..172118eac8 100644 --- a/config/kernel-vfs-iterate.m4 +++ b/config/kernel-vfs-iterate.m4 @@ -1,9 +1,5 @@ -AC_DEFUN([ZFS_AC_KERNEL_VFS_ITERATE], [ - dnl # - dnl # 4.7 API change - dnl # - AC_MSG_CHECKING([whether fops->iterate_shared() is available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_ITERATE], [ + ZFS_LINUX_TEST_SRC([file_operations_iterate_shared], [ #include int iterate(struct file *filp, struct dir_context * context) { return 0; } @@ -12,11 +8,44 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_ITERATE], [ __attribute__ ((unused)) = { .iterate_shared = iterate, }; - ],[ - ],[ + ],[]) + + ZFS_LINUX_TEST_SRC([file_operations_iterate], [ + #include + int iterate(struct file *filp, + struct dir_context *context) { return 0; } + + static const struct file_operations fops + __attribute__ ((unused)) = { + .iterate = iterate, + }; + + #if defined(FMODE_KABI_ITERATE) + #error "RHEL 7.5, FMODE_KABI_ITERATE interface" + #endif + ],[]) + + ZFS_LINUX_TEST_SRC([file_operations_readdir], [ + #include + int readdir(struct file *filp, void *entry, + filldir_t func) { return 0; } + + static const struct file_operations fops + __attribute__ ((unused)) = { + .readdir = readdir, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_ITERATE], [ + dnl # + dnl # 4.7 API change + dnl # + AC_MSG_CHECKING([whether fops->iterate_shared() is available]) + ZFS_LINUX_TEST_RESULT([file_operations_iterate_shared], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_VFS_ITERATE_SHARED, 1, - [fops->iterate_shared() is available]) + [fops->iterate_shared() is available]) ],[ AC_MSG_RESULT(no) @@ -31,44 +60,23 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_ITERATE], [ dnl # to using fops.readdir() to retain KABI compatibility. 
dnl # AC_MSG_CHECKING([whether fops->iterate() is available]) - ZFS_LINUX_TRY_COMPILE([ - #include - int iterate(struct file *filp, - struct dir_context *context) { return 0; } - - static const struct file_operations fops - __attribute__ ((unused)) = { - .iterate = iterate, - }; - - #if defined(FMODE_KABI_ITERATE) - #error "RHEL 7.5, FMODE_KABI_ITERATE interface" - #endif - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([file_operations_iterate], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_VFS_ITERATE, 1, - [fops->iterate() is available]) + [fops->iterate() is available]) ],[ AC_MSG_RESULT(no) + dnl # + dnl # readdir interface introduced + dnl # AC_MSG_CHECKING([whether fops->readdir() is available]) - ZFS_LINUX_TRY_COMPILE([ - #include - int readdir(struct file *filp, void *entry, - filldir_t func) { return 0; } - - static const struct file_operations fops - __attribute__ ((unused)) = { - .readdir = readdir, - }; - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([file_operations_readdir], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_VFS_READDIR, 1, - [fops->readdir() is available]) + [fops->readdir() is available]) ],[ - AC_MSG_ERROR(no; file a bug report with ZoL) + ZFS_LINUX_TEST_ERROR([vfs_iterate]) ]) ]) ]) diff --git a/config/kernel-vfs-rw-iterate.m4 b/config/kernel-vfs-rw-iterate.m4 index ace54f7071..000353ec15 100644 --- a/config/kernel-vfs-rw-iterate.m4 +++ b/config/kernel-vfs-rw-iterate.m4 @@ -1,9 +1,8 @@ dnl # dnl # Linux 3.16 API dnl # -AC_DEFUN([ZFS_AC_KERNEL_VFS_RW_ITERATE], - [AC_MSG_CHECKING([whether fops->read/write_iter() are available]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE], [ + ZFS_LINUX_TEST_SRC([file_operations_rw], [ #include ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to) @@ -16,39 +15,41 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_RW_ITERATE], .read_iter = test_read, .write_iter = test_write, }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_VFS_RW_ITERATE, 1, - [fops->read/write_iter() are available]) + ],[]) - ZFS_AC_KERNEL_NEW_SYNC_READ + 
ZFS_LINUX_TEST_SRC([new_sync_rw], [ + #include ],[ - AC_MSG_RESULT(no) + ssize_t ret __attribute__ ((unused)); + struct file *filp = NULL; + char __user *rbuf = NULL; + const char __user *wbuf = NULL; + size_t len = 0; + loff_t ppos; + + ret = new_sync_read(filp, rbuf, len, &ppos); + ret = new_sync_write(filp, wbuf, len, &ppos); ]) ]) -dnl # -dnl # Linux 4.1 API -dnl # -AC_DEFUN([ZFS_AC_KERNEL_NEW_SYNC_READ], - [AC_MSG_CHECKING([whether new_sync_read/write() are available]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - ssize_t ret __attribute__ ((unused)); - struct file *filp = NULL; - char __user *rbuf = NULL; - const char __user *wbuf = NULL; - size_t len = 0; - loff_t ppos; - - ret = new_sync_read(filp, rbuf, len, &ppos); - ret = new_sync_write(filp, wbuf, len, &ppos); - ],[ +AC_DEFUN([ZFS_AC_KERNEL_VFS_RW_ITERATE], [ + AC_MSG_CHECKING([whether fops->read/write_iter() are available]) + ZFS_LINUX_TEST_RESULT([file_operations_rw], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_NEW_SYNC_READ, 1, - [new_sync_read()/new_sync_write() are available]) + AC_DEFINE(HAVE_VFS_RW_ITERATE, 1, + [fops->read/write_iter() are available]) + + dnl # + dnl # Linux 4.1 API + dnl # + AC_MSG_CHECKING([whether new_sync_read/write() are available]) + ZFS_LINUX_TEST_RESULT([new_sync_rw], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_NEW_SYNC_READ, 1, + [new_sync_read()/new_sync_write() are available]) + ],[ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) ]) @@ -57,19 +58,22 @@ AC_DEFUN([ZFS_AC_KERNEL_NEW_SYNC_READ], dnl # dnl # Linux 4.1.x API dnl # -AC_DEFUN([ZFS_AC_KERNEL_GENERIC_WRITE_CHECKS], - [AC_MSG_CHECKING([whether generic_write_checks() takes kiocb]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS], [ + ZFS_LINUX_TEST_SRC([generic_write_checks], [ #include - ],[ struct kiocb *iocb = NULL; struct iov_iter *iov = NULL; generic_write_checks(iocb, iov); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS], [ + AC_MSG_CHECKING([whether 
generic_write_checks() takes kiocb]) + ZFS_LINUX_TEST_RESULT([generic_write_checks], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_GENERIC_WRITE_CHECKS_KIOCB, 1, - [generic_write_checks() takes kiocb]) + [generic_write_checks() takes kiocb]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-vfs-set_page_dirty.m4 b/config/kernel-vfs-set_page_dirty.m4 new file mode 100644 index 0000000000..a9d252e4e0 --- /dev/null +++ b/config/kernel-vfs-set_page_dirty.m4 @@ -0,0 +1,34 @@ +dnl # +dnl # Linux 5.14 adds a change to require set_page_dirty to be manually +dnl # wired up in struct address_space_operations. Determine if this needs +dnl # to be done. This patch set also introduced __set_page_dirty_nobuffers +dnl # declaration in linux/pagemap.h, so these tests look for the presence +dnl # of that function to tell the compiler to assign set_page_dirty in +dnl # module/os/linux/zfs/zpl_file.c +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS], [ + ZFS_LINUX_TEST_SRC([vfs_has_set_page_dirty_nobuffers], [ + #include + #include + + static const struct address_space_operations + aops __attribute__ ((unused)) = { + .set_page_dirty = __set_page_dirty_nobuffers, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS], [ + dnl # + dnl # Linux 5.14 change requires set_page_dirty() to be assigned + dnl # in address_space_operations() + dnl # + AC_MSG_CHECKING([__set_page_dirty_nobuffers exists]) + ZFS_LINUX_TEST_RESULT([vfs_has_set_page_dirty_nobuffers], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS, 1, + [__set_page_dirty_nobuffers exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel-wait.m4 b/config/kernel-wait.m4 index d6442c1df6..0414242bf6 100644 --- a/config/kernel-wait.m4 +++ b/config/kernel-wait.m4 @@ -1,31 +1,11 @@ dnl # -dnl # 3.17 API change, -dnl # wait_on_bit() no longer requires an action argument. 
The former -dnl # "wait_on_bit" interface required an 'action' function to be provided -dnl # which does the actual waiting. There were over 20 such functions in the -dnl # kernel, many of them identical, though most cases can be satisfied by one -dnl # of just two functions: one which uses io_schedule() and one which just -dnl # uses schedule(). This API change was made to consolidate all of those -dnl # redundant wait functions. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_WAIT_ON_BIT], [ - AC_MSG_CHECKING([whether wait_on_bit() takes an action]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int (*action)(void *) = NULL; - wait_on_bit(NULL, 0, action, 0); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_WAIT_ON_BIT_ACTION, 1, [yes]) - ],[ - AC_MSG_RESULT(no) - ]) -]) -dnl # dnl # 4.13 API change dnl # Renamed struct wait_queue -> struct wait_queue_entry. dnl # +dnl # N.B. The type check is performed before all other checks +dnl # since ZFS_AC_KERNEL_SRC_WAIT_QUEUE_HEAD_ENTRY depends on +dnl # HAVE_WAIT_QUEUE_ENTRY_T being set in confdefs.h. +dnl # AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T], [ AC_MSG_CHECKING([whether wait_queue_entry_t exists]) ZFS_LINUX_TRY_COMPILE([ @@ -41,14 +21,42 @@ AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T], [ ]) ]) +dnl # +dnl # 3.17 API change, +dnl # wait_on_bit() no longer requires an action argument. The former +dnl # "wait_on_bit" interface required an 'action' function to be provided +dnl # which does the actual waiting. There were over 20 such functions in the +dnl # kernel, many of them identical, though most cases can be satisfied by one +dnl # of just two functions: one which uses io_schedule() and one which just +dnl # uses schedule(). This API change was made to consolidate all of those +dnl # redundant wait functions. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_WAIT_ON_BIT], [ + ZFS_LINUX_TEST_SRC([wait_on_bit], [ + #include + ],[ + int (*action)(void *) = NULL; + wait_on_bit(NULL, 0, action, 0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_WAIT_ON_BIT], [ + AC_MSG_CHECKING([whether wait_on_bit() takes an action]) + ZFS_LINUX_TEST_RESULT([wait_on_bit], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_WAIT_ON_BIT_ACTION, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # 4.13 API change dnl # Renamed wait_queue_head::task_list -> wait_queue_head::head dnl # Renamed wait_queue_entry::task_list -> wait_queue_entry::entry dnl # -AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY], [ - AC_MSG_CHECKING([whether wq_head->head and wq_entry->entry exist]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_WAIT_QUEUE_HEAD_ENTRY], [ + ZFS_LINUX_TEST_SRC([wait_queue_head_entry], [ #include #ifdef HAVE_WAIT_QUEUE_ENTRY_T @@ -66,7 +74,12 @@ AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY], [ head = &wq_head.head; entry = &wq_entry.entry; - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY], [ + AC_MSG_CHECKING([whether wq_head->head and wq_entry->entry exist]) + ZFS_LINUX_TEST_RESULT([wait_queue_head_entry], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_WAIT_QUEUE_HEAD_ENTRY, 1, [wq_head->head and wq_entry->entry exist]) @@ -74,3 +87,13 @@ AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY], [ AC_MSG_RESULT(no) ]) ]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_WAIT], [ + ZFS_AC_KERNEL_SRC_WAIT_ON_BIT + ZFS_AC_KERNEL_SRC_WAIT_QUEUE_HEAD_ENTRY +]) + +AC_DEFUN([ZFS_AC_KERNEL_WAIT], [ + ZFS_AC_KERNEL_WAIT_ON_BIT + ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY +]) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index 0b61b85b1d..00b1e74a9c 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -3,9 +3,8 @@ dnl # 2.6.35 API change, dnl # The 'struct xattr_handler' was constified in the generic dnl # super_block structure. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONST_XATTR_HANDLER], [ - AC_MSG_CHECKING([whether super_block uses const struct xattr_handler]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONST_XATTR_HANDLER], [ + ZFS_LINUX_TEST_SRC([const_xattr_handler], [ #include #include @@ -22,13 +21,15 @@ AC_DEFUN([ZFS_AC_KERNEL_CONST_XATTR_HANDLER], [ const struct super_block sb __attribute__ ((unused)) = { .s_xattr = xattr_handlers, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONST_XATTR_HANDLER], [ + AC_MSG_CHECKING([whether super_block uses const struct xattr_handler]) + ZFS_LINUX_TEST_RESULT([const_xattr_handler], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_CONST_XATTR_HANDLER, 1, - [super_block uses const struct xattr_handler]) ],[ - AC_MSG_RESULT([no]) + ZFS_LINUX_TEST_ERROR([const xattr_handler]) ]) ]) @@ -38,17 +39,20 @@ dnl # struct xattr_handler added new member "name". dnl # xattr_handler which matches to whole name rather than prefix should use dnl # "name" instead of "prefix", e.g. "system.posix_acl_access" dnl # -AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_NAME], [ - AC_MSG_CHECKING([whether xattr_handler has name]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_NAME], [ + ZFS_LINUX_TEST_SRC([xattr_handler_name], [ #include static const struct xattr_handler xops __attribute__ ((unused)) = { .name = XATTR_NAME_POSIX_ACL_ACCESS, }; - ],[ - ],[ + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_NAME], [ + AC_MSG_CHECKING([whether xattr_handler has name]) + ZFS_LINUX_TEST_RESULT([xattr_handler_name], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_HANDLER_NAME, 1, [xattr_handler has name]) @@ -57,42 +61,11 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_NAME], [ ]) ]) -dnl # -dnl # 4.9 API change, -dnl # iops->{set,get,remove}xattr and generic_{set,get,remove}xattr are -dnl # removed. xattr operations will directly go through sb->s_xattr. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_HAVE_GENERIC_SETXATTR], [ - AC_MSG_CHECKING([whether generic_setxattr() exists]) - ZFS_LINUX_TRY_COMPILE([ - #include - #include - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .setxattr = generic_setxattr - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_SETXATTR, 1, - [generic_setxattr() exists]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - dnl # dnl # Supported xattr handler get() interfaces checked newest to oldest. dnl # -AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ - dnl # - dnl # 4.7 API change, - dnl # The xattr_handler->get() callback was changed to take both - dnl # dentry and inode. - dnl # - AC_MSG_CHECKING([whether xattr_handler->get() wants both dentry and inode]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ + ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode], [ #include int get(const struct xattr_handler *handler, @@ -102,8 +75,41 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ xops __attribute__ ((unused)) = { .get = get, }; - ],[ - ],[ + ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_get_xattr_handler], [ + #include + + int get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .get = get, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry], [ + #include + + int get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) + { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .get = get, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ + dnl # + dnl # 4.7 API change, + dnl # The xattr_handler->get() callback was changed to take both + dnl # dentry and inode. 
+ dnl # + AC_MSG_CHECKING([whether xattr_handler->get() wants dentry and inode]) + ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_GET_DENTRY_INODE, 1, [xattr_handler->get() wants both dentry and inode]) @@ -115,70 +121,28 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ dnl # should be accessed by handler->flags. dnl # AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether xattr_handler->get() wants xattr_handler]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .get = get, - }; - ],[ - ],[ + AC_MSG_CHECKING( + [whether xattr_handler->get() wants xattr_handler]) + ZFS_LINUX_TEST_RESULT([xattr_handler_get_xattr_handler], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_GET_HANDLER, 1, [xattr_handler->get() wants xattr_handler]) ],[ dnl # dnl # 2.6.33 API change, - dnl # The xattr_handler->get() callback was changed to take - dnl # a dentry instead of an inode, and a handler_flags - dnl # argument was added. + dnl # The xattr_handler->get() callback was changed + dnl # to take a dentry instead of an inode, and a + dnl # handler_flags argument was added. 
dnl # AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether xattr_handler->get() wants dentry]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int handler_flags) - { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .get = get, - }; - ],[ - ],[ + AC_MSG_CHECKING( + [whether xattr_handler->get() wants dentry]) + ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_GET_DENTRY, 1, [xattr_handler->get() wants dentry]) ],[ - dnl # - dnl # 2.6.32 API - dnl # - AC_MSG_RESULT(no) - AC_MSG_CHECKING( - [whether xattr_handler->get() wants inode]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int get(struct inode *ip, const char *name, - void *buffer, size_t size) { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .get = get, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_GET_INODE, 1, - [xattr_handler->get() wants inode]) - ],[ - AC_MSG_ERROR([no; please file a bug report]) - ]) + ZFS_LINUX_TEST_ERROR([xattr get()]) ]) ]) ]) @@ -187,14 +151,23 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ dnl # dnl # Supported xattr handler set() interfaces checked newest to oldest. dnl # -AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ - dnl # - dnl # 4.7 API change, - dnl # The xattr_handler->set() callback was changed to take both - dnl # dentry and inode. 
- dnl # - AC_MSG_CHECKING([whether xattr_handler->set() wants both dentry and inode]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ + ZFS_LINUX_TEST_SRC([xattr_handler_set_userns], [ + #include + + int set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) + { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .set = set, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry_inode], [ #include int set(const struct xattr_handler *handler, @@ -206,84 +179,87 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ xops __attribute__ ((unused)) = { .set = set, }; - ],[ - ],[ + ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_set_xattr_handler], [ + #include + + int set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) + { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .set = set, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry], [ + #include + + int set(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, + int handler_flags) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .set = set, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ + dnl # + dnl # 5.12 API change, + dnl # The xattr_handler->set() callback was changed to 8 arguments, and + dnl # struct user_namespace* was inserted as arg #2 + dnl # + AC_MSG_CHECKING([whether xattr_handler->set() wants dentry, inode, and user_namespace]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_DENTRY_INODE, 1, - [xattr_handler->set() wants both dentry and inode]) + AC_DEFINE(HAVE_XATTR_SET_USERNS, 1, + [xattr_handler->set() takes user_namespace]) ],[ dnl # - dnl # 4.4 
API change, - dnl # The xattr_handler->set() callback was changed to take a - dnl # xattr_handler, and handler_flags argument was removed and - dnl # should be accessed by handler->flags. + dnl # 4.7 API change, + dnl # The xattr_handler->set() callback was changed to take both + dnl # dentry and inode. dnl # AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether xattr_handler->set() wants xattr_handler]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) - { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .set = set, - }; - ],[ - ],[ + AC_MSG_CHECKING([whether xattr_handler->set() wants dentry and inode]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry_inode], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_HANDLER, 1, - [xattr_handler->set() wants xattr_handler]) + AC_DEFINE(HAVE_XATTR_SET_DENTRY_INODE, 1, + [xattr_handler->set() wants both dentry and inode]) ],[ dnl # - dnl # 2.6.33 API change, + dnl # 4.4 API change, dnl # The xattr_handler->set() callback was changed to take a - dnl # dentry instead of an inode, and a handler_flags - dnl # argument was added. + dnl # xattr_handler, and handler_flags argument was removed and + dnl # should be accessed by handler->flags. 
dnl # AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether xattr_handler->set() wants dentry]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int set(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, - int handler_flags) { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .set = set, - }; - ],[ - ],[ + AC_MSG_CHECKING( + [whether xattr_handler->set() wants xattr_handler]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_xattr_handler], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_DENTRY, 1, - [xattr_handler->set() wants dentry]) + AC_DEFINE(HAVE_XATTR_SET_HANDLER, 1, + [xattr_handler->set() wants xattr_handler]) ],[ dnl # - dnl # 2.6.32 API + dnl # 2.6.33 API change, + dnl # The xattr_handler->set() callback was changed + dnl # to take a dentry instead of an inode, and a + dnl # handler_flags argument was added. dnl # AC_MSG_RESULT(no) AC_MSG_CHECKING( - [whether xattr_handler->set() wants inode]) - ZFS_LINUX_TRY_COMPILE([ - #include - - int set(struct inode *ip, const char *name, - const void *buffer, size_t size, int flags) - { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .set = set, - }; - ],[ - ],[ + [whether xattr_handler->set() wants dentry]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_INODE, 1, - [xattr_handler->set() wants inode]) + AC_DEFINE(HAVE_XATTR_SET_DENTRY, 1, + [xattr_handler->set() wants dentry]) ],[ - AC_MSG_ERROR([no; please file a bug report]) + ZFS_LINUX_TEST_ERROR([xattr set()]) ]) ]) ]) @@ -293,12 +269,8 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ dnl # dnl # Supported xattr handler list() interfaces checked newest to oldest. dnl # -AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ - dnl # 4.5 API change, - dnl # The xattr_handler->list() callback was changed to take only a - dnl # dentry and it only needs to return if it's accessible. 
- AC_MSG_CHECKING([whether xattr_handler->list() wants simple]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST], [ + ZFS_LINUX_TEST_SRC([xattr_handler_list_simple], [ #include bool list(struct dentry *dentry) { return 0; } @@ -306,8 +278,40 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ xops __attribute__ ((unused)) = { .list = list, }; - ],[ - ],[ + ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_list_xattr_handler], [ + #include + + size_t list(const struct xattr_handler *handler, + struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .list = list, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_list_dentry], [ + #include + + size_t list(struct dentry *dentry, + char *list, size_t list_size, + const char *name, size_t name_len, + int handler_flags) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .list = list, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ + dnl # 4.5 API change, + dnl # The xattr_handler->list() callback was changed to take only a + dnl # dentry and it only needs to return if it's accessible. 
+ AC_MSG_CHECKING([whether xattr_handler->list() wants simple]) + ZFS_LINUX_TEST_RESULT([xattr_handler_list_simple], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_LIST_SIMPLE, 1, [xattr_handler->list() wants simple]) @@ -321,18 +325,7 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->list() wants xattr_handler]) - ZFS_LINUX_TRY_COMPILE([ - #include - - size_t list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .list = list, - }; - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([xattr_handler_list_xattr_handler], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_LIST_HANDLER, 1, [xattr_handler->list() wants xattr_handler]) @@ -346,48 +339,12 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->list() wants dentry]) - ZFS_LINUX_TRY_COMPILE([ - #include - - size_t list(struct dentry *dentry, - char *list, size_t list_size, - const char *name, size_t name_len, - int handler_flags) { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .list = list, - }; - ],[ - ],[ + ZFS_LINUX_TEST_RESULT([xattr_handler_list_dentry], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_LIST_DENTRY, 1, [xattr_handler->list() wants dentry]) ],[ - dnl # - dnl # 2.6.32 API - dnl # - AC_MSG_RESULT(no) - AC_MSG_CHECKING( - [whether xattr_handler->list() wants inode]) - ZFS_LINUX_TRY_COMPILE([ - #include - - size_t list(struct inode *ip, char *lst, - size_t list_size, const char *name, - size_t name_len) { return 0; } - static const struct xattr_handler - xops __attribute__ ((unused)) = { - .list = list, - }; - ],[ - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_LIST_INODE, 1, - [xattr_handler->list() wants inode]) - ],[ - AC_MSG_ERROR( - [no; please file a bug report]) - ]) + ZFS_LINUX_TEST_ERROR([xattr list()]) 
]) ]) ]) @@ -398,20 +355,71 @@ dnl # 3.7 API change, dnl # The posix_acl_{from,to}_xattr functions gained a new dnl # parameter: user_ns dnl # -AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS], [ - AC_MSG_CHECKING([whether posix_acl_from_xattr() needs user_ns]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_FROM_XATTR_USERNS], [ + ZFS_LINUX_TEST_SRC([posix_acl_from_xattr_userns], [ #include #include #include ],[ posix_acl_from_xattr(&init_user_ns, NULL, 0); - ],[ + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS], [ + AC_MSG_CHECKING([whether posix_acl_from_xattr() needs user_ns]) + ZFS_LINUX_TEST_RESULT([posix_acl_from_xattr_userns], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_POSIX_ACL_FROM_XATTR_USERNS, 1, [posix_acl_from_xattr() needs user_ns]) + ],[ + ZFS_LINUX_TEST_ERROR([posix_acl_from_xattr()]) + ]) +]) + +dnl # +dnl # 4.9 API change, +dnl # iops->{set,get,remove}xattr and generic_{set,get,remove}xattr are +dnl # removed. xattr operations will directly go through sb->s_xattr. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_SETXATTR], [ + ZFS_LINUX_TEST_SRC([have_generic_setxattr], [ + #include + #include + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .setxattr = generic_setxattr + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_SETXATTR], [ + AC_MSG_CHECKING([whether generic_setxattr() exists]) + ZFS_LINUX_TEST_RESULT([have_generic_setxattr], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_SETXATTR, 1, + [generic_setxattr() exists]) ],[ AC_MSG_RESULT(no) ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR], [ + ZFS_AC_KERNEL_SRC_CONST_XATTR_HANDLER + ZFS_AC_KERNEL_SRC_XATTR_HANDLER_NAME + ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET + ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET + ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST + ZFS_AC_KERNEL_SRC_POSIX_ACL_FROM_XATTR_USERNS + ZFS_AC_KERNEL_SRC_GENERIC_SETXATTR +]) + +AC_DEFUN([ZFS_AC_KERNEL_XATTR], [ + ZFS_AC_KERNEL_CONST_XATTR_HANDLER + ZFS_AC_KERNEL_XATTR_HANDLER_NAME + ZFS_AC_KERNEL_XATTR_HANDLER_GET + ZFS_AC_KERNEL_XATTR_HANDLER_SET + ZFS_AC_KERNEL_XATTR_HANDLER_LIST + ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS + ZFS_AC_KERNEL_GENERIC_SETXATTR +]) diff --git a/config/kernel-zlib.m4 b/config/kernel-zlib.m4 index 3ca7cf682d..752d388389 100644 --- a/config/kernel-zlib.m4 +++ b/config/kernel-zlib.m4 @@ -1,63 +1,26 @@ -dnl # -dnl # zlib inflate compat, -dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE], [ - AC_MSG_CHECKING([whether CONFIG_ZLIB_INFLATE is defined]) - ZFS_LINUX_TRY_COMPILE([ - #if !defined(CONFIG_ZLIB_INFLATE) && \ - !defined(CONFIG_ZLIB_INFLATE_MODULE) - #error CONFIG_ZLIB_INFLATE not defined - #endif - ],[ ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_ERROR([ - *** This kernel does not include the required zlib inflate support. 
- *** Rebuild the kernel with CONFIG_ZLIB_INFLATE=y|m set.]) - ]) -]) - -dnl # -dnl # zlib deflate compat, -dnl # Verify the kernel has CONFIG_ZLIB_DEFLATE support enabled. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE], [ - AC_MSG_CHECKING([whether CONFIG_ZLIB_DEFLATE is defined]) - ZFS_LINUX_TRY_COMPILE([ - #if !defined(CONFIG_ZLIB_DEFLATE) && \ - !defined(CONFIG_ZLIB_DEFLATE_MODULE) - #error CONFIG_ZLIB_DEFLATE not defined - #endif - ],[ ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_ERROR([ - *** This kernel does not include the required zlib deflate support. - *** Rebuild the kernel with CONFIG_ZLIB_DEFLATE=y|m set.]) - ]) -]) - dnl # dnl # 2.6.39 API compat, +dnl dnl # The function zlib_deflate_workspacesize() now take 2 arguments. dnl # This was done to avoid always having to allocate the maximum size dnl # workspace (268K). The caller can now specific the windowBits and dnl # memLevel compression parameters to get a smaller workspace. dnl # -AC_DEFUN([ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE], - [AC_MSG_CHECKING([whether zlib_deflate_workspacesize() wants 2 args]) - ZFS_LINUX_TRY_COMPILE([ +AC_DEFUN([ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE], [ + ZFS_LINUX_TEST_SRC([2args_zlib_deflate_workspacesize], [ #include ],[ return zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE, 1, - [zlib_deflate_workspacesize() wants 2 args]) - ],[ - AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE], [ + AC_MSG_CHECKING([whether zlib_deflate_workspacesize() wants 2 args]) + ZFS_LINUX_TEST_RESULT([2args_zlib_deflate_workspacesize], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE, 1, + [zlib_deflate_workspacesize() wants 2 args]) + ],[ + ZFS_LINUX_TEST_ERROR([zlib_deflate_workspacesize()]) ]) ]) diff --git a/config/kernel.m4 b/config/kernel.m4 index ef875efa87..0b94f3bd9c 100644 --- a/config/kernel.m4 
+++ b/config/kernel.m4 @@ -2,177 +2,245 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - ZFS_AC_KERNEL - ZFS_AC_QAT + AM_COND_IF([BUILD_LINUX], [ + dnl # Setup the kernel build environment. + ZFS_AC_KERNEL + ZFS_AC_QAT + + dnl # Sanity checks for module building and CONFIG_* defines + ZFS_AC_KERNEL_TEST_MODULE + ZFS_AC_KERNEL_CONFIG_DEFINED + + dnl # Sequential ZFS_LINUX_TRY_COMPILE tests + ZFS_AC_KERNEL_FPU_HEADER + ZFS_AC_KERNEL_OBJTOOL_HEADER + ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T + ZFS_AC_KERNEL_MISC_MINOR + ZFS_AC_KERNEL_DECLARE_EVENT_CLASS + + dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests + ZFS_AC_KERNEL_TEST_SRC + ZFS_AC_KERNEL_TEST_RESULT + + AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ + KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" + ]) + + AC_SUBST(KERNEL_MAKE) + ]) +]) + +dnl # +dnl # Generate and compile all of the kernel API test cases to determine +dnl # which interfaces are available. By invoking the kernel build system +dnl # only once the compilation can be done in parallel significantly +dnl # speeding up the process. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ + ZFS_AC_KERNEL_SRC_OBJTOOL + ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE + ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE + ZFS_AC_KERNEL_SRC_PDE_DATA + ZFS_AC_KERNEL_SRC_FALLOCATE + ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE + ZFS_AC_KERNEL_SRC_RWSEM + ZFS_AC_KERNEL_SRC_SCHED + ZFS_AC_KERNEL_SRC_USLEEP_RANGE + ZFS_AC_KERNEL_SRC_KMEM_CACHE + ZFS_AC_KERNEL_SRC_KVMALLOC + ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL + ZFS_AC_KERNEL_SRC_WAIT + ZFS_AC_KERNEL_SRC_INODE_TIMES + ZFS_AC_KERNEL_SRC_INODE_LOCK + ZFS_AC_KERNEL_SRC_GROUP_INFO_GID + ZFS_AC_KERNEL_SRC_RW + ZFS_AC_KERNEL_SRC_TIMER_SETUP + ZFS_AC_KERNEL_SRC_SUPER_USER_NS + ZFS_AC_KERNEL_SRC_PROC_OPERATIONS + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS + ZFS_AC_KERNEL_SRC_BIO + ZFS_AC_KERNEL_SRC_BLKDEV + ZFS_AC_KERNEL_SRC_BLK_QUEUE + ZFS_AC_KERNEL_SRC_REVALIDATE_DISK + ZFS_AC_KERNEL_SRC_GET_DISK_RO + ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL + ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY + ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE + ZFS_AC_KERNEL_SRC_XATTR + ZFS_AC_KERNEL_SRC_ACL + ZFS_AC_KERNEL_SRC_INODE_GETATTR + ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS + ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION + ZFS_AC_KERNEL_SRC_SHOW_OPTIONS + ZFS_AC_KERNEL_SRC_FILE_INODE + ZFS_AC_KERNEL_SRC_FILE_DENTRY + ZFS_AC_KERNEL_SRC_FSYNC + ZFS_AC_KERNEL_SRC_AIO_FSYNC + ZFS_AC_KERNEL_SRC_EVICT_INODE + ZFS_AC_KERNEL_SRC_DIRTY_INODE + ZFS_AC_KERNEL_SRC_SHRINKER + ZFS_AC_KERNEL_SRC_MKDIR + ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS + ZFS_AC_KERNEL_SRC_CREATE + ZFS_AC_KERNEL_SRC_GET_LINK + ZFS_AC_KERNEL_SRC_PUT_LINK + ZFS_AC_KERNEL_SRC_TMPFILE + ZFS_AC_KERNEL_SRC_AUTOMOUNT + ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE + ZFS_AC_KERNEL_SRC_COMMIT_METADATA + ZFS_AC_KERNEL_SRC_CLEAR_INODE + ZFS_AC_KERNEL_SRC_SETATTR_PREPARE + ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED + ZFS_AC_KERNEL_SRC_DENTRY + ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE + ZFS_AC_KERNEL_SRC_SECURITY_INODE + ZFS_AC_KERNEL_SRC_FST_MOUNT + ZFS_AC_KERNEL_SRC_BDI + ZFS_AC_KERNEL_SRC_SET_NLINK + 
ZFS_AC_KERNEL_SRC_SGET + ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE + ZFS_AC_KERNEL_SRC_VFS_GETATTR + ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS + ZFS_AC_KERNEL_SRC_VFS_ITERATE + ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO + ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE + ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS + ZFS_AC_KERNEL_SRC_VFS_IOV_ITER + ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS + ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE + ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN + ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT + ZFS_AC_KERNEL_SRC_FPU + ZFS_AC_KERNEL_SRC_FMODE_T + ZFS_AC_KERNEL_SRC_KUIDGID_T + ZFS_AC_KERNEL_SRC_KUID_HELPERS + ZFS_AC_KERNEL_SRC_MODULE_PARAM_CALL_CONST + ZFS_AC_KERNEL_SRC_RENAME + ZFS_AC_KERNEL_SRC_CURRENT_TIME + ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES + ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL + ZFS_AC_KERNEL_SRC_KTIME + ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC + ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES + ZFS_AC_KERNEL_SRC_KSTRTOUL + ZFS_AC_KERNEL_SRC_PERCPU + ZFS_AC_KERNEL_SRC_CPU_HOTPLUG + ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR_USERNS + ZFS_AC_KERNEL_SRC_MKNOD + ZFS_AC_KERNEL_SRC_SYMLINK + ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS + ZFS_AC_KERNEL_SRC_SIGNAL_STOP + ZFS_AC_KERNEL_SRC_SIGINFO + ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE + ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS + ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG + + AC_MSG_CHECKING([for available kernel interfaces]) + ZFS_LINUX_TEST_COMPILE_ALL([kabi]) + AC_MSG_RESULT([done]) +]) + +dnl # +dnl # Check results of kernel interface tests. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_ACCESS_OK_TYPE - ZFS_AC_TEST_MODULE - ZFS_AC_KERNEL_MISC_MINOR + ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL - ZFS_AC_KERNEL_CONFIG - ZFS_AC_KERNEL_CTL_NAME ZFS_AC_KERNEL_PDE_DATA - ZFS_AC_KERNEL_2ARGS_VFS_FSYNC - ZFS_AC_KERNEL_FS_STRUCT_SPINLOCK - ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE - ZFS_AC_KERNEL_RWSEM_SPINLOCK_IS_RAW - ZFS_AC_KERNEL_RWSEM_ACTIVITY - ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT - ZFS_AC_KERNEL_SCHED_RT_HEADER - ZFS_AC_KERNEL_SCHED_SIGNAL_HEADER - ZFS_AC_KERNEL_IO_SCHEDULE_TIMEOUT - ZFS_AC_KERNEL_4ARGS_VFS_GETATTR - ZFS_AC_KERNEL_3ARGS_VFS_GETATTR - ZFS_AC_KERNEL_2ARGS_VFS_GETATTR + ZFS_AC_KERNEL_RWSEM + ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_USLEEP_RANGE - ZFS_AC_KERNEL_KMEM_CACHE_ALLOCFLAGS - ZFS_AC_KERNEL_KMEM_CACHE_CREATE_USERCOPY - ZFS_AC_KERNEL_WAIT_ON_BIT - ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T - ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY + ZFS_AC_KERNEL_KMEM_CACHE + ZFS_AC_KERNEL_KVMALLOC + ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL + ZFS_AC_KERNEL_WAIT ZFS_AC_KERNEL_INODE_TIMES ZFS_AC_KERNEL_INODE_LOCK ZFS_AC_KERNEL_GROUP_INFO_GID - ZFS_AC_KERNEL_WRITE - ZFS_AC_KERNEL_READ + ZFS_AC_KERNEL_RW ZFS_AC_KERNEL_TIMER_SETUP - ZFS_AC_KERNEL_DECLARE_EVENT_CLASS - ZFS_AC_KERNEL_CURRENT_BIO_TAIL ZFS_AC_KERNEL_SUPER_USER_NS - ZFS_AC_KERNEL_SUBMIT_BIO - ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS - ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID - ZFS_AC_KERNEL_TYPE_FMODE_T - ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH - ZFS_AC_KERNEL_BLKDEV_REREAD_PART - ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE - ZFS_AC_KERNEL_LOOKUP_BDEV - ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS - ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE - ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE - ZFS_AC_KERNEL_BIO_BVEC_ITER - ZFS_AC_KERNEL_BIO_FAILFAST_DTD - ZFS_AC_KERNEL_BIO_SET_DEV - ZFS_AC_KERNEL_REQ_FAILFAST_MASK - ZFS_AC_KERNEL_REQ_OP_DISCARD - ZFS_AC_KERNEL_REQ_OP_SECURE_ERASE - ZFS_AC_KERNEL_REQ_OP_FLUSH - 
ZFS_AC_KERNEL_BIO_BI_OPF - ZFS_AC_KERNEL_BIO_END_IO_T_ARGS - ZFS_AC_KERNEL_BIO_BI_STATUS - ZFS_AC_KERNEL_BIO_RW_BARRIER - ZFS_AC_KERNEL_BIO_RW_DISCARD - ZFS_AC_KERNEL_BLK_QUEUE_BDI - ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR - ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET - ZFS_AC_KERNEL_BLK_QUEUE_FLUSH - ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS - ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS - ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BIO_RW_UNPLUG - ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG - ZFS_AC_KERNEL_GET_DISK_AND_MODULE + ZFS_AC_KERNEL_PROC_OPERATIONS + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS + ZFS_AC_KERNEL_BIO + ZFS_AC_KERNEL_BLKDEV + ZFS_AC_KERNEL_BLK_QUEUE + ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO - ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY - ZFS_AC_KERNEL_CONST_XATTR_HANDLER - ZFS_AC_KERNEL_XATTR_HANDLER_NAME - ZFS_AC_KERNEL_XATTR_HANDLER_GET - ZFS_AC_KERNEL_XATTR_HANDLER_SET - ZFS_AC_KERNEL_XATTR_HANDLER_LIST ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE - ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS - ZFS_AC_KERNEL_POSIX_ACL_RELEASE - ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE - ZFS_AC_KERNEL_POSIX_ACL_CHMOD - ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T - ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS - ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION - ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION_WITH_NAMEIDATA - ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL - ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL_WITH_FLAGS - ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL - ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL - ZFS_AC_KERNEL_INODE_OPERATIONS_GETATTR + ZFS_AC_KERNEL_XATTR + ZFS_AC_KERNEL_ACL + ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_INODE_SET_FLAGS ZFS_AC_KERNEL_INODE_SET_IVERSION - ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY ZFS_AC_KERNEL_FSYNC - ZFS_AC_KERNEL_EVICT_INODE - ZFS_AC_KERNEL_DIRTY_INODE_WITH_FLAGS - ZFS_AC_KERNEL_NR_CACHED_OBJECTS - ZFS_AC_KERNEL_FREE_CACHED_OBJECTS - 
ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_AIO_FSYNC - ZFS_AC_KERNEL_MKDIR_UMODE_T - ZFS_AC_KERNEL_LOOKUP_NAMEIDATA - ZFS_AC_KERNEL_CREATE_NAMEIDATA + ZFS_AC_KERNEL_EVICT_INODE + ZFS_AC_KERNEL_DIRTY_INODE + ZFS_AC_KERNEL_SHRINKER + ZFS_AC_KERNEL_MKDIR + ZFS_AC_KERNEL_LOOKUP_FLAGS + ZFS_AC_KERNEL_CREATE ZFS_AC_KERNEL_GET_LINK ZFS_AC_KERNEL_PUT_LINK ZFS_AC_KERNEL_TMPFILE - ZFS_AC_KERNEL_TRUNCATE_RANGE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_CLEAR_INODE ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED - ZFS_AC_KERNEL_D_MAKE_ROOT - ZFS_AC_KERNEL_D_OBTAIN_ALIAS - ZFS_AC_KERNEL_D_PRUNE_ALIASES - ZFS_AC_KERNEL_D_SET_D_OP - ZFS_AC_KERNEL_D_REVALIDATE_NAMEIDATA - ZFS_AC_KERNEL_CONST_DENTRY_OPERATIONS + ZFS_AC_KERNEL_DENTRY ZFS_AC_KERNEL_TRUNCATE_SETSIZE - ZFS_AC_KERNEL_6ARGS_SECURITY_INODE_INIT_SECURITY - ZFS_AC_KERNEL_CALLBACK_SECURITY_INODE_INIT_SECURITY + ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT - ZFS_AC_KERNEL_SHRINK - ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID - ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT - ZFS_AC_KERNEL_SHRINKER_CALLBACK - ZFS_AC_KERNEL_S_INSTANCES_LIST_HEAD - ZFS_AC_KERNEL_S_D_OP ZFS_AC_KERNEL_BDI ZFS_AC_KERNEL_SET_NLINK - ZFS_AC_KERNEL_ELEVATOR_CHANGE - ZFS_AC_KERNEL_5ARG_SGET + ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_LSEEK_EXECUTE + ZFS_AC_KERNEL_VFS_GETATTR + ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE - ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_DIRECT_IO - ZFS_AC_KERNEL_GENERIC_WRITE_CHECKS + ZFS_AC_KERNEL_VFS_RW_ITERATE + ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS + ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN - ZFS_AC_KERNEL_GENERIC_IO_ACCT_3ARG - ZFS_AC_KERNEL_GENERIC_IO_ACCT_4ARG + ZFS_AC_KERNEL_GENERIC_IO_ACCT ZFS_AC_KERNEL_FPU + ZFS_AC_KERNEL_FMODE_T + ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST - ZFS_AC_KERNEL_RENAME_WANTS_FLAGS - 
ZFS_AC_KERNEL_HAVE_GENERIC_SETXATTR + ZFS_AC_KERNEL_RENAME ZFS_AC_KERNEL_CURRENT_TIME - ZFS_AC_KERNEL_GLOBAL_PAGE_STATE - ZFS_AC_KERNEL_ACL_HAS_REFCOUNT ZFS_AC_KERNEL_USERNS_CAPABILITIES ZFS_AC_KERNEL_IN_COMPAT_SYSCALL - ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64 + ZFS_AC_KERNEL_KTIME ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_TOTALHIGH_PAGES - ZFS_AC_KERNEL_BLK_QUEUE_DISCARD - ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE ZFS_AC_KERNEL_KSTRTOUL - - AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ - KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" - ]) - - AC_SUBST(KERNEL_MAKE) + ZFS_AC_KERNEL_PERCPU + ZFS_AC_KERNEL_CPU_HOTPLUG + ZFS_AC_KERNEL_GENERIC_FILLATTR_USERNS + ZFS_AC_KERNEL_MKNOD + ZFS_AC_KERNEL_SYMLINK + ZFS_AC_KERNEL_BIO_MAX_SEGS + ZFS_AC_KERNEL_SIGNAL_STOP + ZFS_AC_KERNEL_SIGINFO + ZFS_AC_KERNEL_SET_SPECIAL_STATE + ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS + ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ]) dnl # @@ -191,9 +259,10 @@ AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution - *** is installed. If you are building with a custom kernel, make sure the - *** kernel is configured, built, and the '--with-linux=PATH' configure - *** option refers to the location of the kernel source.]) + *** is installed. If you are building with a custom kernel, make sure + *** the kernel is configured, built, and the '--with-linux=PATH' + *** configure option refers to the location of the kernel source. 
+ ]) ]) ], [ LINUX_SYMBOLS=NONE @@ -271,27 +340,27 @@ AC_DEFUN([ZFS_AC_KERNEL], [ utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && fgrep -q UTS_RELEASE $utsrelease1], [ - utsrelease=linux/version.h + utsrelease=$utsrelease1 ], [test -r $utsrelease2 && fgrep -q UTS_RELEASE $utsrelease2], [ - utsrelease=linux/utsrelease.h + utsrelease=$utsrelease2 ], [test -r $utsrelease3 && fgrep -q UTS_RELEASE $utsrelease3], [ - utsrelease=generated/utsrelease.h + utsrelease=$utsrelease3 ]) - AS_IF([test "$utsrelease"], [ - kernsrcver=`(echo "#include <$utsrelease>"; - echo "kernsrcver=UTS_RELEASE") | - ${CPP} -I $kernelbuild/include - | - grep "^kernsrcver=" | cut -d \" -f 2` - + AS_IF([test -n "$utsrelease"], [ + kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) - AC_MSG_ERROR([*** Cannot determine kernel version.]) + AC_MSG_ERROR([ + *** Cannot determine kernel version. + ]) ]) ], [ AC_MSG_RESULT([Not found]) if test "x$enable_linux_builtin" != xyes; then - AC_MSG_ERROR([*** Cannot find UTS_RELEASE definition.]) + AC_MSG_ERROR([ + *** Cannot find UTS_RELEASE definition. + ]) else AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. @@ -301,6 +370,13 @@ AC_DEFUN([ZFS_AC_KERNEL], [ AC_MSG_RESULT([$kernsrcver]) + AS_VERSION_COMPARE([$kernsrcver], [$ZFS_META_KVER_MIN], [ + AC_MSG_ERROR([ + *** Cannot build against kernel version $kernsrcver. + *** The minimum supported kernel version is $ZFS_META_KVER_MIN. 
+ ]) + ]) + LINUX=${kernelsrc} LINUX_OBJ=${kernelbuild} LINUX_VERSION=${kernsrcver} @@ -313,24 +389,27 @@ AC_DEFUN([ZFS_AC_KERNEL], [ ]) dnl # -dnl # Detect the QAT module to be built against -dnl # QAT provides hardware acceleration for data compression: -dnl # https://01.org/intel-quickassist-technology -dnl # * Download and install QAT driver from the above link -dnl # * Start QAT driver in your system: -dnl # service qat_service start -dnl # * Enable QAT in ZFS, e.g.: -dnl # ./configure --with-qat=/QAT1.6 -dnl # make -dnl # * Set GZIP compression in ZFS dataset: -dnl # zfs set compression = gzip -dnl # Then the data written to this ZFS pool is compressed -dnl # by QAT accelerator automatically, and de-compressed by -dnl # QAT when read from the pool. -dnl # * Get QAT hardware statistics by: -dnl # cat /proc/icp_dh895xcc_dev/qat -dnl # * To disable QAT: -dnl # insmod zfs.ko zfs_qat_disable=1 +dnl # Detect the QAT module to be built against, QAT provides hardware +dnl # acceleration for data compression: +dnl # +dnl # https://01.org/intel-quickassist-technology +dnl # +dnl # 1) Download and install QAT driver from the above link +dnl # 2) Start QAT driver in your system: +dnl # service qat_service start +dnl # 3) Enable QAT in ZFS, e.g.: +dnl # ./configure --with-qat=/QAT1.6 +dnl # make +dnl # 4) Set GZIP compression in ZFS dataset: +dnl # zfs set compression = gzip +dnl # +dnl # Then the data written to this ZFS pool is compressed by QAT accelerator +dnl # automatically, and de-compressed by QAT when read from the pool. +dnl # +dnl # 1) Get QAT hardware statistics with: +dnl # cat /proc/icp_dh895xcc_dev/qat +dnl # 2) To disable QAT: +dnl # insmod zfs.ko zfs_qat_disable=1 dnl # AC_DEFUN([ZFS_AC_QAT], [ AC_ARG_WITH([qat], @@ -351,11 +430,11 @@ AC_DEFUN([ZFS_AC_QAT], [ QAT_SRC="${qatsrc}/quickassist" AS_IF([ test ! 
-e "$QAT_SRC/include/cpa.h"], [ AC_MSG_ERROR([ - *** Please make sure the qat driver package is installed - *** and specify the location of the qat source with the - *** '--with-qat=PATH' option then try again. Failed to - *** find cpa.h in: - ${QAT_SRC}/include]) + *** Please make sure the qat driver package is installed + *** and specify the location of the qat source with the + *** '--with-qat=PATH' option then try again. Failed to + *** find cpa.h in: + ${QAT_SRC}/include]) ]) ]) @@ -369,9 +448,9 @@ AC_DEFUN([ZFS_AC_QAT], [ QAT_OBJ=${qatbuild} AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ AC_MSG_ERROR([ - *** Please make sure the qat driver is installed then try again. - *** Failed to find icp_qa_al.ko or qat_api.ko in: - $QAT_OBJ]) + *** Please make sure the qat driver is installed then try again. + *** Failed to find icp_qa_al.ko or qat_api.ko in: + $QAT_OBJ]) ]) AC_SUBST(QAT_SRC) @@ -392,10 +471,10 @@ AC_DEFUN([ZFS_AC_QAT], [ AC_MSG_RESULT([$QAT_SYMBOLS]) AC_SUBST(QAT_SYMBOLS) ],[ - AC_MSG_ERROR([ - *** Please make sure the qat driver is installed then try again. - *** Failed to find Module.symvers in: - $QAT_SYMBOLS]) + AC_MSG_ERROR([ + *** Please make sure the qat driver is installed then try again. + *** Failed to find Module.symvers in: + $QAT_SYMBOLS ]) ]) ]) @@ -404,14 +483,16 @@ AC_DEFUN([ZFS_AC_QAT], [ dnl # dnl # Basic toolchain sanity check. dnl # -AC_DEFUN([ZFS_AC_TEST_MODULE], [ +AC_DEFUN([ZFS_AC_KERNEL_TEST_MODULE], [ AC_MSG_CHECKING([whether modules can be built]) - ZFS_LINUX_TRY_COMPILE([],[],[ + ZFS_LINUX_TRY_COMPILE([], [], [ AC_MSG_RESULT([yes]) ],[ AC_MSG_RESULT([no]) if test "x$enable_linux_builtin" != xyes; then - AC_MSG_ERROR([*** Unable to build an empty module.]) + AC_MSG_ERROR([ + *** Unable to build an empty module. + ]) else AC_MSG_ERROR([ *** Unable to build an empty module. @@ -420,126 +501,12 @@ AC_DEFUN([ZFS_AC_TEST_MODULE], [ ]) ]) -dnl # -dnl # Certain kernel build options are not supported. 
These must be -dnl # detected at configure time and cause a build failure. Otherwise -dnl # modules may be successfully built that behave incorrectly. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONFIG], [ - AS_IF([test "x$cross_compiling" != xyes], [ - AC_RUN_IFELSE([ - AC_LANG_PROGRAM([ - #include "$LINUX/include/linux/license.h" - ], [ - return !license_is_gpl_compatible("$ZFS_META_LICENSE"); - ]) - ], [ - AC_DEFINE([ZFS_IS_GPL_COMPATIBLE], [1], - [Define to 1 if GPL-only symbols can be used]) - ], [ - ]) - ]) - - ZFS_AC_KERNEL_CONFIG_THREAD_SIZE - ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC - ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS - ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE - ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE -]) - -dnl # -dnl # Check configured THREAD_SIZE -dnl # -dnl # The stack size will vary by architecture, but as of Linux 3.15 on x86_64 -dnl # the default thread stack size was increased to 16K from 8K. Therefore, -dnl # on newer kernels and some architectures stack usage optimizations can be -dnl # conditionally applied to improve performance without negatively impacting -dnl # stability. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONFIG_THREAD_SIZE], [ - AC_MSG_CHECKING([whether kernel was built with 16K or larger stacks]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - #if (THREAD_SIZE < 16384) - #error "THREAD_SIZE is less than 16K" - #endif - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_LARGE_STACKS, 1, [kernel has large stacks]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) - -dnl # -dnl # Check CONFIG_DEBUG_LOCK_ALLOC -dnl # -dnl # This is typically only set for debug kernels because it comes with -dnl # a performance penalty. However, when it is set it maps the non-GPL -dnl # symbol mutex_lock() to the GPL-only mutex_lock_nested() symbol. -dnl # This will cause a failure at link time which we'd rather know about -dnl # at compile time. -dnl # -dnl # Since we plan to pursue making mutex_lock_nested() a non-GPL symbol -dnl # with the upstream community we add a check to detect this case. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC], [ - - ZFS_LINUX_CONFIG([DEBUG_LOCK_ALLOC], [ - AC_MSG_CHECKING([whether mutex_lock() is GPL-only]) - tmp_flags="$EXTRA_KCFLAGS" - ZFS_LINUX_TRY_COMPILE([ - #include - #include - - MODULE_LICENSE("$ZFS_META_LICENSE"); - ],[ - struct mutex lock; - - mutex_init(&lock); - mutex_lock(&lock); - mutex_unlock(&lock); - ],[ - AC_MSG_RESULT(no) - ],[ - AC_MSG_RESULT(yes) - AC_MSG_ERROR([ - *** Kernel built with CONFIG_DEBUG_LOCK_ALLOC which is incompatible - *** with the CDDL license and will prevent the module linking stage - *** from succeeding. You must rebuild your kernel without this - *** option enabled.]) - ]) - EXTRA_KCFLAGS="$tmp_flags" - ], []) -]) - -dnl # -dnl # Check CONFIG_TRIM_UNUSED_KSYMS -dnl # -dnl # Verify the kernel has CONFIG_TRIM_UNUSED_KSYMS disabled. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS], [ - AC_MSG_CHECKING([whether CONFIG_TRIM_UNUSED_KSYM is disabled]) - ZFS_LINUX_TRY_COMPILE([ - #if defined(CONFIG_TRIM_UNUSED_KSYMS) - #error CONFIG_TRIM_UNUSED_KSYMS not defined - #endif - ],[ ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_ERROR([ - *** This kernel has unused symbols trimming enabled, please disable. 
- *** Rebuild the kernel with CONFIG_TRIM_UNUSED_KSYMS=n set.]) - ]) -]) - dnl # dnl # ZFS_LINUX_CONFTEST_H dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ -cat - <<_ACEOF >conftest.h +test -d build/$2 || mkdir -p build/$2 +cat - <<_ACEOF >build/$2/$2.h $1 _ACEOF ]) @@ -548,79 +515,320 @@ dnl # dnl # ZFS_LINUX_CONFTEST_C dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ -cat confdefs.h - <<_ACEOF >conftest.c +test -d build/$2 || mkdir -p build/$2 +cat confdefs.h - <<_ACEOF >build/$2/$2.c $1 _ACEOF ]) dnl # -dnl # ZFS_LANG_PROGRAM(C)([PROLOGUE], [BODY]) +dnl # ZFS_LINUX_CONFTEST_MAKEFILE dnl # -m4_define([ZFS_LANG_PROGRAM], [ +dnl # $1 - test case name +dnl # $2 - add to top-level Makefile +dnl # $3 - additional build flags +dnl # +AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ + test -d build || mkdir -p build + test -d build/$1 || mkdir -p build/$1 + + file=build/$1/Makefile + + dnl # Example command line to manually build source. + cat - <<_ACEOF >$file +# Example command line to manually build source +# make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 + +ccflags-y := -Werror $FRAME_LARGER_THAN +_ACEOF + + dnl # Additional custom CFLAGS as requested. + m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) + + dnl # Test case source + echo "obj-m := $1.o" >>$file + + AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) +]) + +dnl # +dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) +dnl # +m4_define([ZFS_LINUX_TEST_PROGRAM], [ +#include $1 + int main (void) { -dnl Do *not* indent the following line: there may be CPP directives. -dnl Don't move the `;' right after for the same reason. $2 - ; - return 0; + ; + return 0; } + +MODULE_DESCRIPTION("conftest"); +MODULE_AUTHOR(ZFS_META_AUTHOR); +MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +MODULE_LICENSE($3); ]) dnl # -dnl # ZFS_LINUX_COMPILE_IFELSE / like AC_COMPILE_IFELSE +dnl # ZFS_LINUX_TEST_REMOVE +dnl # +dnl # Removes the specified test source and results. 
+dnl # +AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ + test -d build/$1 && rm -Rf build/$1 + test -f build/Makefile && sed '/$1/d' build/Makefile +]) + +dnl # +dnl # ZFS_LINUX_COMPILE +dnl # +dnl # $1 - build dir +dnl # $2 - test command +dnl # $3 - pass command +dnl # $4 - fail command +dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' +dnl # $6 - set KBUILD_MODPOST_WARN='yes' +dnl # +dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} +dnl # +AC_DEFUN([ZFS_LINUX_COMPILE], [ + AC_TRY_COMMAND([ + KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" + make modules -k -j$TEST_JOBS -C $LINUX_OBJ $ARCH_UM + M=$PWD/$1 >$1/build.log 2>&1]) + AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) +]) + +dnl # +dnl # ZFS_LINUX_TEST_COMPILE +dnl # +dnl # Perform a full compile excluding the final modpost phase. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ + ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ + mv $2/Makefile $2/Makefile.compile.$1 + mv $2/build.log $2/build.log.$1 + ],[ + AC_MSG_ERROR([ + *** Unable to compile test source to determine kernel interfaces.]) + ], [yes], []) +]) + +dnl # +dnl # ZFS_LINUX_TEST_MODPOST +dnl # +dnl # Perform a full compile including the modpost phase. This may +dnl # be an incremental build if the objects have already been built. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ + ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ + mv $2/Makefile $2/Makefile.modpost.$1 + cat $2/build.log >>build/build.log.$1 + ],[ + AC_MSG_ERROR([ + *** Unable to modpost test source to determine kernel interfaces.]) + ], [], [yes]) +]) + +dnl # +dnl # Perform the compilation of the test cases in two phases. +dnl # +dnl # Phase 1) attempt to build the object files for all of the tests +dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not +dnl # perform the final modpost stage. +dnl # +dnl # Phase 2) disable all tests which failed the initial compilation, +dnl # then invoke the final modpost step for the remaining tests. 
+dnl # +dnl # This allows us efficiently build the test cases in parallel while +dnl # remaining resilient to build failures which are expected when +dnl # detecting the available kernel interfaces. +dnl # +dnl # The maximum allowed parallelism can be controlled by setting the +dnl # TEST_JOBS environment variable. Otherwise, it default to $(nproc). +dnl # +AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ + dnl # Phase 1 - Compilation only, final linking is skipped. + ZFS_LINUX_TEST_COMPILE([$1], [build]) + + dnl # + dnl # Phase 2 - When building external modules disable test cases + dnl # which failed to compile and invoke modpost to verify the + dnl # final linking. + dnl # + dnl # Test names suffixed with '_license' call modpost independently + dnl # to ensure that a single incompatibility does not result in the + dnl # modpost phase exiting early. This check is not performed on + dnl # every symbol since the majority are compatible and doing so + dnl # would significantly slow down this phase. + dnl # + dnl # When configuring for builtin (--enable-linux-builtin) + dnl # fake the linking step artificially create the expected .ko + dnl # files for tests which did compile. This is required for + dnl # kernels which do not have loadable module support or have + dnl # not yet been built. 
+ dnl # + AS_IF([test "x$enable_linux_builtin" = "xno"], [ + for dir in $(awk '/^obj-m/ { print [$]3 }' \ + build/Makefile.compile.$1); do + name=${dir%/} + AS_IF([test -f build/$name/$name.o], [ + AS_IF([test "${name##*_}" = "license"], [ + ZFS_LINUX_TEST_MODPOST([$1], + [build/$name]) + echo "obj-n += $dir" >>build/Makefile + ], [ + echo "obj-m += $dir" >>build/Makefile + ]) + ], [ + echo "obj-n += $dir" >>build/Makefile + ]) + done + + ZFS_LINUX_TEST_MODPOST([$1], [build]) + ], [ + for dir in $(awk '/^obj-m/ { print [$]3 }' \ + build/Makefile.compile.$1); do + name=${dir%/} + AS_IF([test -f build/$name/$name.o], [ + touch build/$name/$name.ko + ]) + done + ]) +]) + +dnl # +dnl # ZFS_LINUX_TEST_SRC +dnl # +dnl # $1 - name +dnl # $2 - global +dnl # $3 - source +dnl # $4 - extra cflags +dnl # $5 - check license-compatibility +dnl # +dnl # Check if the test source is buildable at all and then if it is +dnl # license compatible. +dnl # +dnl # N.B because all of the test cases are compiled in parallel they +dnl # must never depend on the results of previous tests. Each test +dnl # needs to be entirely independent. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_SRC], [ + ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], + [["Dual BSD/GPL"]])], [$1]) + ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) + + AS_IF([ test -n "$5" ], [ + ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( + [[$2]], [[$3]], [[$5]])], [$1_license]) + ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) + ]) +]) + +dnl # +dnl # ZFS_LINUX_TEST_RESULT +dnl # +dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC) +dnl # $2 - run on success (valid .ko generated) +dnl # $3 - run on failure (unable to compile) +dnl # +AC_DEFUN([ZFS_LINUX_TEST_RESULT], [ + AS_IF([test -d build/$1], [ + AS_IF([test -f build/$1/$1.ko], [$2], [$3]) + ], [ + AC_MSG_ERROR([ + *** No matching source for the "$1" test, check that + *** both the test source and result macros refer to the same name. 
+ ]) + ]) +]) + +dnl # +dnl # ZFS_LINUX_TEST_ERROR +dnl # +dnl # Generic error message which can be used when none of the expected +dnl # kernel interfaces were detected. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_ERROR], [ + AC_MSG_ERROR([ + *** None of the expected "$1" interfaces were detected. + *** This may be because your kernel version is newer than what is + *** supported, or you are using a patched custom kernel with + *** incompatible modifications. + *** + *** ZFS Version: $ZFS_META_ALIAS + *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX + ]) +]) + +dnl # +dnl # ZFS_LINUX_TEST_RESULT_SYMBOL +dnl # +dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to +dnl # verify symbol exports, unless --enable-linux-builtin was provided to +dnl # configure. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [ + AS_IF([ ! test -f build/$1/$1.ko], [ + $5 + ], [ + AS_IF([test "x$enable_linux_builtin" != "xyes"], [ + ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5]) + ], [ + $4 + ]) + ]) +]) + +dnl # +dnl # ZFS_LINUX_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ - m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1])]) - m4_ifvaln([$6], [ZFS_LINUX_CONFTEST_H([$6])], [ZFS_LINUX_CONFTEST_H([])]) - rm -Rf build && mkdir -p build && touch build/conftest.mod.c - echo "obj-m := conftest.o" >build/Makefile - modpost_flag='' - test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage - AS_IF( - [AC_TRY_COMMAND(cp conftest.c conftest.h build && make [$2] -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $FRAME_LARGER_THAN $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag) >/dev/null && AC_TRY_COMMAND([$3])], - [$4], - [_AC_MSG_LOG_CONFTEST m4_ifvaln([$5],[$5])] - ) - rm -Rf build + ZFS_LINUX_TEST_REMOVE([conftest]) + + m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])]) + m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])], + [ZFS_LINUX_CONFTEST_H([], [conftest])]) + + ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no], + 
[m4_ifvaln([$5], [-I$PWD/build/conftest], [])]) + ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], []) ]) dnl # -dnl # ZFS_LINUX_TRY_COMPILE like AC_TRY_COMPILE +dnl # ZFS_LINUX_TRY_COMPILE dnl # -AC_DEFUN([ZFS_LINUX_TRY_COMPILE], - [ZFS_LINUX_COMPILE_IFELSE( - [AC_LANG_SOURCE([ZFS_LANG_PROGRAM([[$1]], [[$2]])])], - [modules], - [test -s build/conftest.o], - [$3], [$4]) -]) - +dnl # $1 - global +dnl # $2 - source +dnl # $3 - run on success (valid .ko generated) +dnl # $4 - run on failure (unable to compile) dnl # -dnl # ZFS_LINUX_CONFIG +dnl # When configuring as builtin (--enable-linux-builtin) for kernels +dnl # without loadable module support (CONFIG_MODULES=n) only the object +dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details. dnl # -AC_DEFUN([ZFS_LINUX_CONFIG], - [AC_MSG_CHECKING([whether kernel was built with CONFIG_$1]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - #ifndef CONFIG_$1 - #error CONFIG_$1 not #defined - #endif - ],[ - AC_MSG_RESULT([yes]) - $2 - ],[ - AC_MSG_RESULT([no]) - $3 +AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ + AS_IF([test "x$enable_linux_builtin" = "xyes"], [ + ZFS_LINUX_COMPILE_IFELSE( + [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], + [[ZFS_META_LICENSE]])], + [test -f build/conftest/conftest.o], [$3], [$4]) + ], [ + ZFS_LINUX_COMPILE_IFELSE( + [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], + [[ZFS_META_LICENSE]])], + [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) dnl # dnl # ZFS_CHECK_SYMBOL_EXPORT -dnl # check symbol exported or not +dnl # +dnl # Check if a symbol is exported on not by consulting the symbols +dnl # file, or optionally the source code. 
dnl # AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ @@ -649,8 +857,10 @@ AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ dnl # dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL -dnl # like ZFS_LINUX_TRY_COMPILE, except ZFS_CHECK_SYMBOL_EXPORT -dnl # is called if not compiling for builtin +dnl # +dnl # Like ZFS_LINUX_TRY_COMPILER except ZFS_CHECK_SYMBOL_EXPORT is called +dnl # to verify symbol exports, unless --enable-linux-builtin was provided +dnl # to configure. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) @@ -673,10 +883,9 @@ dnl # ZFS_LINUX_TRY_COMPILE_HEADER dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are dnl # provided via the fifth parameter dnl # -AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], - [ZFS_LINUX_COMPILE_IFELSE( - [AC_LANG_SOURCE([ZFS_LANG_PROGRAM([[$1]], [[$2]])])], - [modules], - [test -s build/conftest.o], - [$3], [$4], [$5]) +AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ + ZFS_LINUX_COMPILE_IFELSE( + [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], + [test -f build/conftest/conftest.ko], + [$3], [$4], [$5]) ]) diff --git a/config/lib-link.m4 b/config/lib-link.m4 index 0ff10731fa..041f976d79 100644 --- a/config/lib-link.m4 +++ b/config/lib-link.m4 @@ -67,8 +67,8 @@ AC_DEFUN([AC_LIB_HAVE_LINKFLAGS], AC_LIB_LINKFLAGS_BODY([$1], [$2]) dnl Add $INC[]NAME to CPPFLAGS before performing the following checks, - dnl because if the user has installed lib[]Name and not disabled its use - dnl via --without-lib[]Name-prefix, he wants to use it. + dnl so that if lib[]Name is installed, it will be used (unless + dnl disabled via --without-lib[]Name-prefix). ac_save_CPPFLAGS="$CPPFLAGS" AC_LIB_APPENDTOVAR([CPPFLAGS], [$INC]NAME) @@ -216,7 +216,7 @@ AC_DEFUN([AC_LIB_LINKFLAGS_BODY], fi ]) dnl Search the library and its dependencies in $additional_libdir and - dnl $LDFLAGS. Using breadth-first-seach. + dnl $LDFLAGS. Using breadth-first-search. 
LIB[]NAME= LTLIB[]NAME= INC[]NAME= diff --git a/config/lib-prefix.m4 b/config/lib-prefix.m4 index 8adb17bb91..f7db2371db 100644 --- a/config/lib-prefix.m4 +++ b/config/lib-prefix.m4 @@ -8,10 +8,9 @@ dnl From Bruno Haible. dnl AC_LIB_PREFIX adds to the CPPFLAGS and LDFLAGS the flags that are needed dnl to access previously installed libraries. The basic assumption is that -dnl a user will want packages to use other packages he previously installed -dnl with the same --prefix option. -dnl This macro is not needed if only AC_LIB_LINKFLAGS is used to locate -dnl libraries, but is otherwise very convenient. +dnl packages should use other packages that are installed with the same +dnl --prefix option. This macro is not needed if only AC_LIB_LINKFLAGS is +dnl used to locate libraries, but is otherwise very convenient. AC_DEFUN([AC_LIB_PREFIX], [ AC_BEFORE([$0], [AC_LIB_LINKFLAGS]) diff --git a/config/mount-helper.m4 b/config/mount-helper.m4 index 0a6c767084..e559b9ab27 100644 --- a/config/mount-helper.m4 +++ b/config/mount-helper.m4 @@ -1,6 +1,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_MOUNT_HELPER], [ AC_ARG_WITH(mounthelperdir, - AC_HELP_STRING([--with-mounthelperdir=DIR], + AS_HELP_STRING([--with-mounthelperdir=DIR], [install mount.zfs in dir [[/sbin]]]), mounthelperdir=$withval,mounthelperdir=/sbin) diff --git a/config/pkg.m4 b/config/pkg.m4 index 13a8890178..f9075e56c8 100644 --- a/config/pkg.m4 +++ b/config/pkg.m4 @@ -86,7 +86,7 @@ dnl Check to see whether a particular set of modules exists. Similar to dnl PKG_CHECK_MODULES(), but does not set variables or print errors. 
dnl dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -dnl only at the first occurence in configure.ac, so if the first place +dnl only at the first occurrence in configure.ac, so if the first place dnl it's called might be skipped (such as if it is within an "if", you dnl have to call PKG_CHECK_EXISTS manually AC_DEFUN([PKG_CHECK_EXISTS], diff --git a/config/rpm.am b/config/rpm.am index 51a20b3e6a..13bd54a625 100644 --- a/config/rpm.am +++ b/config/rpm.am @@ -6,6 +6,12 @@ # Build targets for RPM packages. ############################################################################### +PHONY += srpm srpms srpm-kmod srpm-dkms srpm-utils +PHONY += rpm rpms rpm-kmod rpm-dkms rpm-utils rpm-utils-initramfs +PHONY += srpm-common rpm-common rpm-local + +srpm-kmod srpm-dkms srpm-utils: dist + srpm-kmod: $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}-kmod" \ def='${SRPM_DEFINE_COMMON} ${SRPM_DEFINE_KMOD}' srpm-common @@ -29,10 +35,22 @@ rpm-dkms: srpm-dkms $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}-dkms" \ def='${RPM_DEFINE_COMMON} ${RPM_DEFINE_DKMS}' rpm-common +# The rpm-utils and rpm-utils-initramfs targets are identical except for the +# zfs-initramfs package: rpm-utils never includes it, rpm-utils-initramfs +# includes it if detected at configure time. The zfs-initramfs package does +# not work on any known RPM-based distribution and the resulting RPM is only +# used to create a Debian package. The rpm-utils-initramfs target is not +# intended to be specified by the user directly, it is provided as a +# dependency of the deb-utils target. 
+ rpm-utils: srpm-utils $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}" \ def='${RPM_DEFINE_COMMON} ${RPM_DEFINE_UTIL}' rpm-common +rpm-utils-initramfs: srpm-utils + $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}" \ + def='${RPM_DEFINE_COMMON} ${RPM_DEFINE_UTIL} ${RPM_DEFINE_INITRAMFS}' rpm-common + rpm: rpm-kmod rpm-dkms rpm-utils rpms: rpm-kmod rpm-dkms rpm-utils @@ -54,7 +72,7 @@ rpm-local: cp $(top_srcdir)/scripts/kmodtool $(rpmbuild)/SOURCES && \ cp $(distdir).tar.gz $(rpmbuild)/SOURCES) -srpm-common: dist +srpm-common: @(dist=`$(RPM) --eval %{?dist}`; \ rpmpkg=$(pkg)-$(VERSION)-$(RELEASE)$$dist*src.rpm; \ rpmspec=$(pkg).spec; \ diff --git a/config/suppressed-warnings.txt b/config/suppressed-warnings.txt deleted file mode 100644 index 621e3cdba7..0000000000 --- a/config/suppressed-warnings.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# Expected warnings which should be suppressed by buildbot -# -None : ^libtool: install: warning: relinking `.*'$ -None : ^libtool: install: warning: remember to run `libtool --finish .*'$ -None : ^libtool: install: warning: `.*' has not been installed in `.*'$ -None : ^warning: File listed twice:.* diff --git a/config/tgz.am b/config/tgz.am index 0657d045d1..2499ba4230 100644 --- a/config/tgz.am +++ b/config/tgz.am @@ -1,3 +1,5 @@ +PHONY += tgz tgz-kmod tgz-utils tgz-local + tgz-local: @(if test "${HAVE_ALIEN}" = "no"; then \ echo -e "\n" \ @@ -8,17 +10,14 @@ tgz-local: fi) tgz-kmod: tgz-local rpm-kmod -if CONFIG_KERNEL name=${PACKAGE}; \ version=${VERSION}-${RELEASE}; \ arch=`$(RPM) -qp $${name}-kmod-$${version}.src.rpm --qf %{arch} | tail -1`; \ pkg1=kmod-$${name}*$${version}.$${arch}.rpm; \ fakeroot $(ALIEN) --scripts --to-tgz $$pkg1; \ $(RM) $$pkg1 -endif tgz-utils: tgz-local rpm-utils -if CONFIG_USER name=${PACKAGE}; \ version=${VERSION}-${RELEASE}; \ arch=`$(RPM) -qp $${name}-$${version}.src.rpm --qf %{arch} | tail -1`; \ @@ -27,6 +26,5 @@ if CONFIG_USER pkg3=$${name}-test-$${version}.$${arch}.rpm; \ fakeroot $(ALIEN) --scripts --to-tgz $$pkg1 
$$pkg2 $$pkg3; \ $(RM) $$pkg1 $$pkg2 $$pkg3 -endif tgz: tgz-kmod tgz-utils diff --git a/config/toolchain-simd.m4 b/config/toolchain-simd.m4 index 37627b813b..1153cd6941 100644 --- a/config/toolchain-simd.m4 +++ b/config/toolchain-simd.m4 @@ -3,7 +3,7 @@ dnl # Checks if host toolchain supports SIMD instructions dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [ case "$host_cpu" in - x86_64 | x86 | i686) + amd64 | x86_64 | x86 | i686) ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2 ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3 @@ -23,6 +23,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE ;; esac ]) @@ -401,3 +402,23 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ], [ AC_MSG_RESULT([no]) ]) ]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [ + AC_MSG_CHECKING([whether host toolchain supports MOVBE]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("movbe 0(%eax), %eax"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_MOVBE], 1, [Define if host toolchain supports MOVBE]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/user-clock_gettime.m4 b/config/user-clock_gettime.m4 new file mode 100644 index 0000000000..c96024da79 --- /dev/null +++ b/config/user-clock_gettime.m4 @@ -0,0 +1,12 @@ +dnl # +dnl # Check if librt is required for clock_gettime. +dnl # clock_gettime is generally available in libc on modern systems. 
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_CLOCK_GETTIME], [ + AC_CHECK_FUNC([clock_gettime], [], [ + AC_CHECK_LIB([rt], [clock_gettime], [ + AC_SUBST([LIBCLOCK_GETTIME], [-lrt])], [ + AC_MSG_FAILURE([*** clock_gettime is missing in libc and librt]) + ]) + ]) +]) diff --git a/config/user-dracut.m4 b/config/user-dracut.m4 index 95f800bda4..b9705297f7 100644 --- a/config/user-dracut.m4 +++ b/config/user-dracut.m4 @@ -1,7 +1,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_DRACUT], [ AC_MSG_CHECKING(for dracut directory) AC_ARG_WITH([dracutdir], - AC_HELP_STRING([--with-dracutdir=DIR], + AS_HELP_STRING([--with-dracutdir=DIR], [install dracut helpers @<:@default=check@:>@]), [dracutdir=$withval], [dracutdir=check]) diff --git a/config/user-gettext.m4 b/config/user-gettext.m4 index 89d1d45bf1..824318eab9 100644 --- a/config/user-gettext.m4 +++ b/config/user-gettext.m4 @@ -2,7 +2,5 @@ dnl # dnl # Check if libintl and possibly libiconv are needed for gettext() functionality dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_GETTEXT], [ - AM_ICONV AM_GNU_GETTEXT([external]) - LIBS="$LIBS $LTLIBINTL $LTLIBICONV" ]) diff --git a/config/user-libaio.m4 b/config/user-libaio.m4 index d7a7cb508d..95c144d76b 100644 --- a/config/user-libaio.m4 +++ b/config/user-libaio.m4 @@ -2,13 +2,5 @@ dnl # dnl # Check for libaio - only used for libaiot test cases. dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_LIBAIO], [ - LIBAIO= - - AC_CHECK_HEADER([libaio.h], [ - user_libaio=yes - AC_SUBST([LIBAIO], ["-laio"]) - AC_DEFINE([HAVE_LIBAIO], 1, [Define if you have libaio]) - ], [ - user_libaio=no - ]) + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBAIO, [], [libaio.h], [], [aio], [], [user_libaio=yes], [user_libaio=no]) ]) diff --git a/config/user-libatomic.m4 b/config/user-libatomic.m4 new file mode 100644 index 0000000000..d15069f9c4 --- /dev/null +++ b/config/user-libatomic.m4 @@ -0,0 +1,28 @@ +dnl # +dnl # If -latomic exists and atomic.c doesn't link without it, +dnl # it's needed for __atomic intrinsics. 
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBATOMIC], [ + AC_MSG_CHECKING([whether -latomic is required]) + + saved_libs="$LIBS" + LIBS="$LIBS -latomic" + LIBATOMIC_LIBS="" + + AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])], [ + LIBS="$saved_libs" + saved_cflags="$CFLAGS" + CFLAGS="$CFLAGS -isystem lib/libspl/include" + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include "lib/libspl/atomic.c"], [])], [], [LIBATOMIC_LIBS="-latomic"]) + CFLAGS="$saved_cflags" + ]) + + if test -n "$LIBATOMIC_LIBS"; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + LIBS="$saved_libs" + AC_SUBST([LIBATOMIC_LIBS]) +]) diff --git a/config/user-libblkid.m4 b/config/user-libblkid.m4 index 88e6f990b7..f2016dcb15 100644 --- a/config/user-libblkid.m4 +++ b/config/user-libblkid.m4 @@ -3,11 +3,7 @@ dnl # Check for libblkid. Basic support for detecting ZFS pools dnl # has existing in blkid since 2008. dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - LIBBLKID= - - AC_CHECK_HEADER([blkid/blkid.h], [], [AC_MSG_FAILURE([ - *** blkid.h missing, libblkid-devel package required])]) - - AC_SUBST([LIBBLKID], ["-lblkid"]) - AC_DEFINE([HAVE_LIBBLKID], 1, [Define if you have libblkid]) + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBBLKID, [blkid], [blkid/blkid.h], [], [blkid], [], [], [ + AC_MSG_FAILURE([ + *** blkid.h missing, libblkid-devel package required])]) ]) diff --git a/config/user-libcrypto.m4 b/config/user-libcrypto.m4 new file mode 100644 index 0000000000..7293e1b0b4 --- /dev/null +++ b/config/user-libcrypto.m4 @@ -0,0 +1,8 @@ +dnl # +dnl # Check for libcrypto. Used for userspace password derivation via PBKDF2. 
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBCRYPTO], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBCRYPTO, [libcrypto], [openssl/evp.h], [], [crypto], [PKCS5_PBKDF2_HMAC_SHA1], [], [ + AC_MSG_FAILURE([ + *** evp.h missing, libssl-devel package required])]) +]) diff --git a/config/user-libexec.m4 b/config/user-libexec.m4 index 31bcea3fcf..5379c25b4a 100644 --- a/config/user-libexec.m4 +++ b/config/user-libexec.m4 @@ -1,6 +1,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_ZFSEXEC], [ AC_ARG_WITH(zfsexecdir, - AC_HELP_STRING([--with-zfsexecdir=DIR], + AS_HELP_STRING([--with-zfsexecdir=DIR], [install scripts [[@<:@libexecdir@:>@/zfs]]]), [zfsexecdir=$withval], [zfsexecdir="${libexecdir}/zfs"]) diff --git a/config/user-libfetch.m4 b/config/user-libfetch.m4 new file mode 100644 index 0000000000..f5149fc1a5 --- /dev/null +++ b/config/user-libfetch.m4 @@ -0,0 +1,71 @@ +dnl # +dnl # Check for a libfetch - either fetch(3) or libcurl. +dnl # +dnl # There are two configuration dimensions: +dnl # * fetch(3) vs libcurl +dnl # * static vs dynamic +dnl # +dnl # fetch(3) is only dynamic. +dnl # We use sover 6, which first appeared in FreeBSD 8.0-RELEASE. 
+dnl # +dnl # libcurl development packages include curl-config(1) – we want: +dnl # * HTTPS support +dnl # * version at least 7.16 (October 2006), for sover 4 +dnl # * to decide if it's static or not +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBFETCH], [ + AC_MSG_CHECKING([for libfetch]) + LIBFETCH_LIBS= + LIBFETCH_IS_FETCH=0 + LIBFETCH_IS_LIBCURL=0 + LIBFETCH_DYNAMIC=0 + LIBFETCH_SONAME= + have_libfetch= + + saved_libs="$LIBS" + LIBS="$LIBS -lfetch" + AC_LINK_IFELSE([AC_LANG_PROGRAM([[ + #include + #include + #include + ]], [fetchGetURL("", "");])], [ + have_libfetch=1 + LIBFETCH_IS_FETCH=1 + LIBFETCH_DYNAMIC=1 + LIBFETCH_SONAME='"libfetch.so.6"' + LIBFETCH_LIBS="-ldl" + AC_MSG_RESULT([fetch(3)]) + ], []) + LIBS="$saved_libs" + + if test -z "$have_libfetch"; then + if curl-config --protocols 2>/dev/null | grep -q HTTPS && + test "$(printf "%u" "0x$(curl-config --vernum)")" -ge "$(printf "%u" "0x071000")"; then + have_libfetch=1 + LIBFETCH_IS_LIBCURL=1 + if test "$(curl-config --built-shared)" = "yes"; then + LIBFETCH_DYNAMIC=1 + LIBFETCH_SONAME='"libcurl.so.4"' + LIBFETCH_LIBS="-ldl" + AC_MSG_RESULT([libcurl]) + else + LIBFETCH_LIBS="$(curl-config --libs)" + AC_MSG_RESULT([libcurl (static)]) + fi + + CCFLAGS="$CCFLAGS $(curl-config --cflags)" + fi + fi + + if test -z "$have_libfetch"; then + AC_MSG_RESULT([none]) + fi + + AC_SUBST([LIBFETCH_LIBS]) + AC_SUBST([LIBFETCH_DYNAMIC]) + AC_SUBST([LIBFETCH_SONAME]) + AC_DEFINE_UNQUOTED([LIBFETCH_IS_FETCH], [$LIBFETCH_IS_FETCH], [libfetch is fetch(3)]) + AC_DEFINE_UNQUOTED([LIBFETCH_IS_LIBCURL], [$LIBFETCH_IS_LIBCURL], [libfetch is libcurl]) + AC_DEFINE_UNQUOTED([LIBFETCH_DYNAMIC], [$LIBFETCH_DYNAMIC], [whether the chosen libfetch is to be loaded at run-time]) + AC_DEFINE_UNQUOTED([LIBFETCH_SONAME], [$LIBFETCH_SONAME], [soname of chosen libfetch]) +]) diff --git a/config/user-libssl.m4 b/config/user-libssl.m4 deleted file mode 100644 index f6824510fd..0000000000 --- a/config/user-libssl.m4 +++ /dev/null @@ -1,12 +0,0 @@ -dnl # 
-dnl # Check for libssl. Used for userspace password derivation via PBKDF2. -dnl # -AC_DEFUN([ZFS_AC_CONFIG_USER_LIBSSL], [ - LIBSSL= - - AC_CHECK_HEADER([openssl/evp.h], [], [AC_MSG_FAILURE([ - *** evp.h missing, libssl-devel package required])]) - - AC_SUBST([LIBSSL], ["-lssl -lcrypto"]) - AC_DEFINE([HAVE_LIBSSL], 1, [Define if you have libssl]) -]) diff --git a/config/user-libtirpc.m4 b/config/user-libtirpc.m4 index 19c02c9d54..aa7ab4a1fd 100644 --- a/config/user-libtirpc.m4 +++ b/config/user-libtirpc.m4 @@ -19,7 +19,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBTIRPC], [ ]) AS_IF([test "x$have_xdr" = "x"], [ - FIND_SYSTEM_LIBRARY(LIBTIRPC, [libtirpc], [rpc/xdr.h], [tirpc], [tirpc], [xdrmem_create], [], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBTIRPC, [libtirpc], [rpc/xdr.h], [tirpc], [tirpc], [xdrmem_create], [], [ AS_IF([test "x$with_tirpc" = "xyes"], [ AC_MSG_FAILURE([--with-tirpc was given, but libtirpc is not available, try installing libtirpc-devel]) ],[dnl ELSE diff --git a/config/user-libudev.m4 b/config/user-libudev.m4 index 9b7454927e..8c3c1d7e00 100644 --- a/config/user-libudev.m4 +++ b/config/user-libudev.m4 @@ -2,18 +2,16 @@ dnl # dnl # Check for libudev - needed for vdev auto-online and auto-replace dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUDEV], [ - LIBUDEV= + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUDEV, [libudev], [libudev.h], [], [udev], [], [user_libudev=yes], [user_libudev=no]) - AC_CHECK_HEADER([libudev.h], [ - user_libudev=yes - AC_SUBST([LIBUDEV], ["-ludev"]) - AC_DEFINE([HAVE_LIBUDEV], 1, [Define if you have libudev]) - ], [ - user_libudev=no + AS_IF([test "x$user_libudev" = xyes], [ + AX_SAVE_FLAGS + + CFLAGS="$CFLAGS $LIBUDEV_CFLAGS" + LIBS="$LIBUDEV_LIBS $LIBS" + + AC_CHECK_FUNCS([udev_device_get_is_initialized]) + + AX_RESTORE_FLAGS ]) - - AC_SEARCH_LIBS([udev_device_get_is_initialized], [udev], [ - AC_DEFINE([HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED], 1, [ - Define if udev_device_get_is_initialized is available])], []) - ]) diff --git a/config/user-libuuid.m4 
b/config/user-libuuid.m4 index f0da671a3f..0cfa83c992 100644 --- a/config/user-libuuid.m4 +++ b/config/user-libuuid.m4 @@ -2,17 +2,7 @@ dnl # dnl # Check for libuuid dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUUID], [ - LIBUUID= - - AC_CHECK_HEADER([uuid/uuid.h], [], [AC_MSG_FAILURE([ - *** uuid/uuid.h missing, libuuid-devel package required])]) - - AC_SEARCH_LIBS([uuid_generate], [uuid], [], [AC_MSG_FAILURE([ - *** uuid_generate() missing, libuuid-devel package required])]) - - AC_SEARCH_LIBS([uuid_is_null], [uuid], [], [AC_MSG_FAILURE([ - *** uuid_is_null() missing, libuuid-devel package required])]) - - AC_SUBST([LIBUUID], ["-luuid"]) - AC_DEFINE([HAVE_LIBUUID], 1, [Define if you have libuuid]) + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUUID, [uuid], [uuid/uuid.h], [], [uuid], [uuid_generate uuid_is_null], [], [ + AC_MSG_FAILURE([*** libuuid-devel package required]) + ]) ]) diff --git a/config/user-makedev.m4 b/config/user-makedev.m4 index 4383681a8f..8986107aef 100644 --- a/config/user-makedev.m4 +++ b/config/user-makedev.m4 @@ -3,13 +3,12 @@ dnl # glibc 2.25 dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS], [ AC_MSG_CHECKING([makedev() is declared in sys/sysmacros.h]) - AC_TRY_COMPILE( - [ + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #include - ],[ + ]], [[ int k; k = makedev(0,0); - ],[ + ]])],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_MAKEDEV_IN_SYSMACROS, 1, [makedev() is declared in sys/sysmacros.h]) @@ -23,13 +22,12 @@ dnl # glibc X < Y < 2.25 dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV], [ AC_MSG_CHECKING([makedev() is declared in sys/mkdev.h]) - AC_TRY_COMPILE( - [ + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #include - ],[ + ]], [[ int k; k = makedev(0,0); - ],[ + ]])],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_MAKEDEV_IN_MKDEV, 1, [makedev() is declared in sys/mkdev.h]) diff --git a/config/user-pam.m4 b/config/user-pam.m4 new file mode 100644 index 0000000000..9db35808c3 --- /dev/null +++ b/config/user-pam.m4 @@ -0,0 +1,38 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_PAM], [ + 
AC_ARG_ENABLE([pam], + AS_HELP_STRING([--enable-pam], + [install pam_zfs_key module [[default: check]]]), + [enable_pam=$enableval], + [enable_pam=check]) + + AC_ARG_WITH(pammoduledir, + AS_HELP_STRING([--with-pammoduledir=DIR], + [install pam module in dir [[$libdir/security]]]), + [pammoduledir="$withval"],[pammoduledir=$libdir/security]) + + AC_ARG_WITH(pamconfigsdir, + AS_HELP_STRING([--with-pamconfigsdir=DIR], + [install pam-config files in dir [DATADIR/pam-configs]]), + [pamconfigsdir="$withval"], + [pamconfigsdir='${datadir}/pam-configs']) + + AS_IF([test "x$enable_pam" != "xno"], [ + AC_CHECK_HEADERS([security/pam_modules.h], [ + enable_pam=yes + ], [ + AS_IF([test "x$enable_pam" = "xyes"], [ + AC_MSG_FAILURE([ + *** security/pam_modules.h missing, libpam0g-dev package required + ]) + ],[ + enable_pam=no + ]) + ]) + ]) + AS_IF([test "x$enable_pam" = "xyes"], [ + DEFINE_PAM='--with pam' + ]) + AC_SUBST(DEFINE_PAM) + AC_SUBST(pammoduledir) + AC_SUBST(pamconfigsdir) +]) diff --git a/config/user-systemd.m4 b/config/user-systemd.m4 index 3e6a4a281f..63f02ad2a8 100644 --- a/config/user-systemd.m4 +++ b/config/user-systemd.m4 @@ -1,27 +1,27 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_SYSTEMD], [ AC_ARG_ENABLE(systemd, - AC_HELP_STRING([--enable-systemd], + AS_HELP_STRING([--enable-systemd], [install systemd unit/preset files [[default: yes]]]), [enable_systemd=$enableval], [enable_systemd=check]) AC_ARG_WITH(systemdunitdir, - AC_HELP_STRING([--with-systemdunitdir=DIR], + AS_HELP_STRING([--with-systemdunitdir=DIR], [install systemd unit files in dir [[/usr/lib/systemd/system]]]), systemdunitdir=$withval,systemdunitdir=/usr/lib/systemd/system) AC_ARG_WITH(systemdpresetdir, - AC_HELP_STRING([--with-systemdpresetdir=DIR], + AS_HELP_STRING([--with-systemdpresetdir=DIR], [install systemd preset files in dir [[/usr/lib/systemd/system-preset]]]), systemdpresetdir=$withval,systemdpresetdir=/usr/lib/systemd/system-preset) AC_ARG_WITH(systemdmodulesloaddir, - 
AC_HELP_STRING([--with-systemdmodulesloaddir=DIR], + AS_HELP_STRING([--with-systemdmodulesloaddir=DIR], [install systemd module load files into dir [[/usr/lib/modules-load.d]]]), systemdmodulesloaddir=$withval,systemdmodulesloaddir=/usr/lib/modules-load.d) AC_ARG_WITH(systemdgeneratordir, - AC_HELP_STRING([--with-systemdgeneratordir=DIR], + AS_HELP_STRING([--with-systemdgeneratordir=DIR], [install systemd generators in dir [[/usr/lib/systemd/system-generators]]]), systemdgeneratordir=$withval,systemdgeneratordir=/usr/lib/systemd/system-generators) diff --git a/config/user-sysvinit.m4 b/config/user-sysvinit.m4 index 65dcc38192..b6b63f1cfa 100644 --- a/config/user-sysvinit.m4 +++ b/config/user-sysvinit.m4 @@ -1,6 +1,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_SYSVINIT], [ AC_ARG_ENABLE(sysvinit, - AC_HELP_STRING([--enable-sysvinit], + AS_HELP_STRING([--enable-sysvinit], [install SysV init scripts [default: yes]]), [],enable_sysvinit=yes) diff --git a/config/user-udev.m4 b/config/user-udev.m4 index 65dc79fb48..e6120fc8fe 100644 --- a/config/user-udev.m4 +++ b/config/user-udev.m4 @@ -1,7 +1,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_UDEV], [ AC_MSG_CHECKING(for udev directories) AC_ARG_WITH(udevdir, - AC_HELP_STRING([--with-udevdir=DIR], + AS_HELP_STRING([--with-udevdir=DIR], [install udev helpers @<:@default=check@:>@]), [udevdir=$withval], [udevdir=check]) @@ -18,7 +18,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_UDEV], [ ]) AC_ARG_WITH(udevruledir, - AC_HELP_STRING([--with-udevruledir=DIR], + AS_HELP_STRING([--with-udevruledir=DIR], [install udev rules [[UDEVDIR/rules.d]]]), [udevruledir=$withval], [udevruledir="${udevdir}/rules.d"]) diff --git a/config/user-zlib.m4 b/config/user-zlib.m4 index 82c0962e45..1f3792829b 100644 --- a/config/user-zlib.m4 +++ b/config/user-zlib.m4 @@ -2,20 +2,7 @@ dnl # dnl # Check for zlib dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_ZLIB], [ - ZLIB= - - AC_CHECK_HEADER([zlib.h], [], [AC_MSG_FAILURE([ - *** zlib.h missing, zlib-devel package required])]) - - 
AC_SEARCH_LIBS([compress2], [z], [], [AC_MSG_FAILURE([ - *** compress2() missing, zlib-devel package required])]) - - AC_SEARCH_LIBS([uncompress], [z], [], [AC_MSG_FAILURE([ - *** uncompress() missing, zlib-devel package required])]) - - AC_SEARCH_LIBS([crc32], [z], [], [AC_MSG_FAILURE([ - *** crc32() missing, zlib-devel package required])]) - - AC_SUBST([ZLIB], ["-lz"]) - AC_DEFINE([HAVE_ZLIB], 1, [Define if you have zlib]) + ZFS_AC_FIND_SYSTEM_LIBRARY(ZLIB, [zlib], [zlib.h], [], [z], [compress2 uncompress crc32], [], [ + AC_MSG_FAILURE([*** zlib-devel package required]) + ]) ]) diff --git a/config/user.m4 b/config/user.m4 index 1ee9dbe263..670820b377 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -4,30 +4,39 @@ dnl # AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_GETTEXT ZFS_AC_CONFIG_USER_MOUNT_HELPER - ZFS_AC_CONFIG_USER_UDEV - ZFS_AC_CONFIG_USER_SYSTEMD ZFS_AC_CONFIG_USER_SYSVINIT ZFS_AC_CONFIG_USER_DRACUT + AM_COND_IF([BUILD_FREEBSD], [ + PKG_INSTALLDIR(['${prefix}/libdata/pkgconfig'])], [ + PKG_INSTALLDIR + ]) ZFS_AC_CONFIG_USER_ZLIB - ZFS_AC_CONFIG_USER_LIBUUID + AM_COND_IF([BUILD_LINUX], [ + ZFS_AC_CONFIG_USER_UDEV + ZFS_AC_CONFIG_USER_SYSTEMD + ZFS_AC_CONFIG_USER_LIBUUID + ZFS_AC_CONFIG_USER_LIBBLKID + ]) ZFS_AC_CONFIG_USER_LIBTIRPC - ZFS_AC_CONFIG_USER_LIBBLKID ZFS_AC_CONFIG_USER_LIBUDEV - ZFS_AC_CONFIG_USER_LIBSSL + ZFS_AC_CONFIG_USER_LIBCRYPTO ZFS_AC_CONFIG_USER_LIBAIO + ZFS_AC_CONFIG_USER_LIBATOMIC + ZFS_AC_CONFIG_USER_LIBFETCH + ZFS_AC_CONFIG_USER_CLOCK_GETTIME + ZFS_AC_CONFIG_USER_PAM ZFS_AC_CONFIG_USER_RUNSTATEDIR ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - ZFS_AC_TEST_FRAMEWORK - AC_CHECK_FUNCS([mlockall strlcat strlcpy]) + AC_CHECK_FUNCS([issetugid mlockall strlcat strlcpy]) ]) dnl # dnl # Setup the environment for the ZFS Test Suite. 
Currently only -dnl # Linux sytle systems are supported but this infrastructure can +dnl # Linux style systems are supported but this infrastructure can dnl # be extended to support other platforms if needed. dnl # AC_DEFUN([ZFS_AC_TEST_FRAMEWORK], [ diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 8e221f2d7d..ec4a2026bf 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -11,6 +11,7 @@ AC_DEFUN([ZFS_AC_DEBUG_ENABLE], [ DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_with_debug" + WITH_DEBUG="true" AC_DEFINE(ZFS_DEBUG, 1, [zfs debugging enabled]) KERNEL_DEBUG_CFLAGS="-Werror" @@ -22,6 +23,7 @@ AC_DEFUN([ZFS_AC_DEBUG_DISABLE], [ DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_without_debug" + WITH_DEBUG="" KERNEL_DEBUG_CFLAGS="" KERNEL_DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" @@ -32,6 +34,9 @@ dnl # When debugging is enabled: dnl # - Enable all ASSERTs (-DDEBUG) dnl # - Promote all compiler warnings to errors (-Werror) dnl # +dnl # (If INVARIANTS is detected, we need to force DEBUG, or strange panics +dnl # can ensue.) 
+dnl # AC_DEFUN([ZFS_AC_DEBUG], [ AC_MSG_CHECKING([whether assertion support will be enabled]) AC_ARG_ENABLE([debug], @@ -47,10 +52,25 @@ AC_DEFUN([ZFS_AC_DEBUG], [ [ZFS_AC_DEBUG_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debug])]) + AS_CASE(["x$enable_invariants"], + ["xyes"], + [], + ["xno"], + [], + [ZFS_AC_DEBUG_INVARIANTS_DETECT]) + + AS_CASE(["x$enable_invariants"], + ["xyes"], + [ZFS_AC_DEBUG_ENABLE], + ["xno"], + [], + [AC_MSG_ERROR([Unknown option $enable_invariants])]) + AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUG_CPPFLAGS) AC_SUBST(DEBUG_LDFLAGS) AC_SUBST(DEBUG_ZFS) + AC_SUBST(WITH_DEBUG) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) @@ -59,9 +79,9 @@ AC_DEFUN([ZFS_AC_DEBUG], [ ]) AC_DEFUN([ZFS_AC_DEBUGINFO_ENABLE], [ - DEBUG_CFLAGS="$DEBUG_CFLAGS -g -fno-inline" + DEBUG_CFLAGS="$DEBUG_CFLAGS -g -fno-inline $NO_IPA_SRA" - KERNEL_DEBUG_CFLAGS="$KERNEL_DEBUG_CFLAGS -fno-inline" + KERNEL_DEBUG_CFLAGS="$KERNEL_DEBUG_CFLAGS -fno-inline $NO_IPA_SRA" KERNEL_MAKE="$KERNEL_MAKE CONFIG_DEBUG_INFO=y" DEBUGINFO_ZFS="_with_debuginfo" @@ -110,7 +130,7 @@ AC_DEFUN([ZFS_AC_DEBUG_KMEM], [ [enable_debug_kmem=no]) AS_IF([test "x$enable_debug_kmem" = xyes], [ - KERNEL_DEBUG_CPPFLAGS+=" -DDEBUG_KMEM" + KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM" DEBUG_KMEM_ZFS="_with_debug_kmem" ], [ DEBUG_KMEM_ZFS="_without_debug_kmem" @@ -140,7 +160,7 @@ AC_DEFUN([ZFS_AC_DEBUG_KMEM_TRACKING], [ [enable_debug_kmem_tracking=no]) AS_IF([test "x$enable_debug_kmem_tracking" = xyes], [ - KERNEL_DEBUG_CPPFLAGS+=" -DDEBUG_KMEM_TRACKING" + KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM_TRACKING" DEBUG_KMEM_TRACKING_ZFS="_with_debug_kmem_tracking" ], [ DEBUG_KMEM_TRACKING_ZFS="_without_debug_kmem_tracking" @@ -152,27 +172,74 @@ AC_DEFUN([ZFS_AC_DEBUG_KMEM_TRACKING], [ AC_MSG_RESULT([$enable_debug_kmem_tracking]) ]) +AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], [ + AS_IF([sysctl -n kern.conftxt | fgrep -qx $'options\tINVARIANTS'], + 
[enable_invariants="yes"], + [enable_invariants="no"]) +]) + +AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT], [ + AM_COND_IF([BUILD_FREEBSD], + [ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], + [enable_invariants="no"]) +]) + +dnl # +dnl # Detected for the running kernel by default, enables INVARIANTS features +dnl # in the FreeBSD kernel module. This feature must be used when building +dnl # for a FreeBSD kernel with "options INVARIANTS" in the KERNCONF and must +dnl # not be used when the INVARIANTS option is absent. +dnl # +AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [ + AC_MSG_CHECKING([whether FreeBSD kernel INVARIANTS checks are enabled]) + AC_ARG_ENABLE([invariants], + [AS_HELP_STRING([--enable-invariants], + [Enable FreeBSD kernel INVARIANTS checks [[default: detect]]])], + [], [ZFS_AC_DEBUG_INVARIANTS_DETECT]) + + AS_IF([test "x$enable_invariants" = xyes], + [WITH_INVARIANTS="true"], + [WITH_INVARIANTS=""]) + AC_SUBST(WITH_INVARIANTS) + + AC_MSG_RESULT([$enable_invariants]) +]) + AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ + AX_COUNT_CPUS([]) + AC_SUBST(CPU_COUNT) + ZFS_AC_CONFIG_ALWAYS_CC_NO_UNUSED_BUT_SET_VARIABLE ZFS_AC_CONFIG_ALWAYS_CC_NO_BOOL_COMPARE + ZFS_AC_CONFIG_ALWAYS_CC_IMPLICIT_FALLTHROUGH ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION + ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER + ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_CC_ASAN ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD + ZFS_AC_CONFIG_ALWAYS_SYSTEM ZFS_AC_CONFIG_ALWAYS_ARCH ZFS_AC_CONFIG_ALWAYS_PYTHON ZFS_AC_CONFIG_ALWAYS_PYZFS + ZFS_AC_CONFIG_ALWAYS_SED + ZFS_AC_CONFIG_ALWAYS_CPPCHECK + ZFS_AC_CONFIG_ALWAYS_SHELLCHECK ]) AC_DEFUN([ZFS_AC_CONFIG], [ + + dnl # Remove the previous build test directory. 
+ rm -Rf build + ZFS_CONFIG=all AC_ARG_WITH([config], AS_HELP_STRING([--with-config=CONFIG], [Config file 'kernel|user|all|srpm']), [ZFS_CONFIG="$withval"]) AC_ARG_ENABLE([linux-builtin], - [AC_HELP_STRING([--enable-linux-builtin], + [AS_HELP_STRING([--enable-linux-builtin], [Configure for builtin in-tree kernel modules @<:@default=no@:>@])], [], [enable_linux_builtin=no]) @@ -183,6 +250,14 @@ AC_DEFUN([ZFS_AC_CONFIG], [ ZFS_AC_CONFIG_ALWAYS + AM_COND_IF([BUILD_LINUX], [ + AC_ARG_VAR([TEST_JOBS], [simultaneous jobs during configure]) + if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then + TEST_JOBS=$CPU_COUNT + fi + AC_SUBST(TEST_JOBS) + ]) + case "$ZFS_CONFIG" in kernel) ZFS_AC_CONFIG_KERNEL ;; user) ZFS_AC_CONFIG_USER ;; @@ -205,6 +280,7 @@ AC_DEFUN([ZFS_AC_CONFIG], [ [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) + AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) ]) dnl # @@ -242,12 +318,12 @@ AC_DEFUN([ZFS_AC_RPM], [ ]) RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"' - RPM_DEFINE_COMMON+=' --define "$(DEBUG_KMEM_ZFS) 1"' - RPM_DEFINE_COMMON+=' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' - RPM_DEFINE_COMMON+=' --define "$(DEBUGINFO_ZFS) 1"' - RPM_DEFINE_COMMON+=' --define "$(ASAN_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUGINFO_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' - RPM_DEFINE_UTIL=' --define "_initconfdir $(DEFAULT_INITCONF_DIR)"' + RPM_DEFINE_UTIL=' --define "_initconfdir $(initconfdir)"' dnl # Make the next three RPM_DEFINE_UTIL additions conditional, since dnl # their values may not be set when running: @@ -255,19 +331,19 @@ AC_DEFUN([ZFS_AC_RPM], [ dnl # ./configure --with-config=srpm dnl # AS_IF([test -n "$dracutdir" 
], [ - RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)"' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_dracutdir $(dracutdir)"' ]) AS_IF([test -n "$udevdir" ], [ - RPM_DEFINE_UTIL+=' --define "_udevdir $(udevdir)"' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevdir $(udevdir)"' ]) AS_IF([test -n "$udevruledir" ], [ - RPM_DEFINE_UTIL+=' --define "_udevdir $(udevruledir)"' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' ]) - RPM_DEFINE_UTIL+=' $(DEFINE_INITRAMFS)' - RPM_DEFINE_UTIL+=' $(DEFINE_SYSTEMD)' - RPM_DEFINE_UTIL+=' $(DEFINE_PYZFS)' - RPM_DEFINE_UTIL+=' $(DEFINE_PYTHON_VERSION)' - RPM_DEFINE_UTIL+=' $(DEFINE_PYTHON_PKG_VERSION)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_VERSION)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_PKG_VERSION)' dnl # Override default lib directory on Debian/Ubuntu systems. The dnl # provided /usr/lib/rpm/platform//macros files do not @@ -279,14 +355,20 @@ AC_DEFUN([ZFS_AC_RPM], [ dnl # AS_IF([test "$DEFAULT_PACKAGE" = "deb"], [ MULTIARCH_LIBDIR="lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)" - RPM_DEFINE_UTIL+=' --define "_lib $(MULTIARCH_LIBDIR)"' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_lib $(MULTIARCH_LIBDIR)"' AC_SUBST(MULTIARCH_LIBDIR) ]) - RPM_DEFINE_KMOD='--define "kernels $(LINUX_VERSION)"' - RPM_DEFINE_KMOD+=' --define "ksrc $(LINUX)"' - RPM_DEFINE_KMOD+=' --define "kobj $(LINUX_OBJ)"' - RPM_DEFINE_KMOD+=' --define "_wrong_version_format_terminate_build 0"' + dnl # Make RPM_DEFINE_KMOD additions conditional on CONFIG_KERNEL, + dnl # since the values will not be set otherwise. The spec files + dnl # provide defaults for them. 
+ dnl # + RPM_DEFINE_KMOD='--define "_wrong_version_format_terminate_build 0"' + AM_COND_IF([CONFIG_KERNEL], [ + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernels $(LINUX_VERSION)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "ksrc $(LINUX)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kobj $(LINUX_OBJ)"' + ]) RPM_DEFINE_DKMS='' @@ -374,6 +456,9 @@ AC_DEFUN([ZFS_AC_ALIEN], [ AC_MSG_CHECKING([whether $ALIEN is available]) AS_IF([tmp=$($ALIEN --version 2>/dev/null)], [ ALIEN_VERSION=$(echo $tmp | $AWK '{ print $[3] }') + ALIEN_MAJOR=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[1] }') + ALIEN_MINOR=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[2] }') + ALIEN_POINT=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[3] }') HAVE_ALIEN=yes AC_MSG_RESULT([$HAVE_ALIEN ($ALIEN_VERSION)]) ],[ @@ -384,6 +469,9 @@ AC_DEFUN([ZFS_AC_ALIEN], [ AC_SUBST(HAVE_ALIEN) AC_SUBST(ALIEN) AC_SUBST(ALIEN_VERSION) + AC_SUBST(ALIEN_MAJOR) + AC_SUBST(ALIEN_MINOR) + AC_SUBST(ALIEN_POINT) ]) dnl # @@ -391,32 +479,44 @@ dnl # Using the VENDOR tag from config.guess set the default dnl # package type for 'make pkg': (rpm | deb | tgz) dnl # AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ - AC_MSG_CHECKING([linux distribution]) - if test -f /etc/toss-release ; then - VENDOR=toss ; - elif test -f /etc/fedora-release ; then - VENDOR=fedora ; - elif test -f /etc/redhat-release ; then - VENDOR=redhat ; - elif test -f /etc/gentoo-release ; then - VENDOR=gentoo ; - elif test -f /etc/arch-release ; then - VENDOR=arch ; - elif test -f /etc/SuSE-release ; then - VENDOR=sles ; - elif test -f /etc/slackware-version ; then - VENDOR=slackware ; - elif test -f /etc/lunar.release ; then - VENDOR=lunar ; - elif test -f /etc/lsb-release ; then - VENDOR=ubuntu ; - elif test -f /etc/debian_version ; then - VENDOR=debian ; - elif test -f /etc/alpine-release ; then - VENDOR=alpine ; - else - VENDOR= ; - fi + AC_MSG_CHECKING([os distribution]) + AC_ARG_WITH([vendor], + [AS_HELP_STRING([--with-vendor], + [Distribution 
vendor @<:@default=check@:>@])], + [with_vendor=$withval], + [with_vendor=check]) + AS_IF([test "x$with_vendor" = "xcheck"],[ + if test -f /etc/toss-release ; then + VENDOR=toss ; + elif test -f /etc/fedora-release ; then + VENDOR=fedora ; + elif test -f /etc/redhat-release ; then + VENDOR=redhat ; + elif test -f /etc/gentoo-release ; then + VENDOR=gentoo ; + elif test -f /etc/arch-release ; then + VENDOR=arch ; + elif test -f /etc/SuSE-release ; then + VENDOR=sles ; + elif test -f /etc/slackware-version ; then + VENDOR=slackware ; + elif test -f /etc/lunar.release ; then + VENDOR=lunar ; + elif test -f /etc/lsb-release ; then + VENDOR=ubuntu ; + elif test -f /etc/debian_version ; then + VENDOR=debian ; + elif test -f /etc/alpine-release ; then + VENDOR=alpine ; + elif test -f /bin/freebsd-version ; then + VENDOR=freebsd ; + else + VENDOR= ; + fi], + [ test "x${with_vendor}" != x],[ + VENDOR="$with_vendor" ], + [ VENDOR= ; ] + ) AC_MSG_RESULT([$VENDOR]) AC_SUBST(VENDOR) @@ -433,17 +533,21 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ lunar) DEFAULT_PACKAGE=tgz ;; ubuntu) DEFAULT_PACKAGE=deb ;; debian) DEFAULT_PACKAGE=deb ;; + freebsd) DEFAULT_PACKAGE=pkg ;; *) DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) - DEFAULT_INIT_DIR=$sysconfdir/init.d AC_MSG_CHECKING([default init directory]) - AC_MSG_RESULT([$DEFAULT_INIT_DIR]) - AC_SUBST(DEFAULT_INIT_DIR) + case "$VENDOR" in + freebsd) initdir=$sysconfdir/rc.d ;; + *) initdir=$sysconfdir/init.d;; + esac + AC_MSG_RESULT([$initdir]) + AC_SUBST(initdir) - AC_MSG_CHECKING([default init script type]) + AC_MSG_CHECKING([default init script type and shell]) case "$VENDOR" in toss) DEFAULT_INIT_SCRIPT=redhat ;; redhat) DEFAULT_INIT_SCRIPT=redhat ;; @@ -456,35 +560,53 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ lunar) DEFAULT_INIT_SCRIPT=lunar ;; ubuntu) DEFAULT_INIT_SCRIPT=lsb ;; debian) DEFAULT_INIT_SCRIPT=lsb ;; + freebsd) DEFAULT_INIT_SCRIPT=freebsd;; *) DEFAULT_INIT_SCRIPT=lsb ;; esac - 
AC_MSG_RESULT([$DEFAULT_INIT_SCRIPT]) - AC_SUBST(DEFAULT_INIT_SCRIPT) - AC_MSG_CHECKING([default init config direectory]) case "$VENDOR" in - alpine) DEFAULT_INITCONF_DIR=/etc/conf.d ;; - gentoo) DEFAULT_INITCONF_DIR=/etc/conf.d ;; - toss) DEFAULT_INITCONF_DIR=/etc/sysconfig ;; - redhat) DEFAULT_INITCONF_DIR=/etc/sysconfig ;; - fedora) DEFAULT_INITCONF_DIR=/etc/sysconfig ;; - sles) DEFAULT_INITCONF_DIR=/etc/sysconfig ;; - ubuntu) DEFAULT_INITCONF_DIR=/etc/default ;; - debian) DEFAULT_INITCONF_DIR=/etc/default ;; - *) DEFAULT_INITCONF_DIR=/etc/default ;; + gentoo) DEFAULT_INIT_SHELL="/sbin/openrc-run";; + alpine) DEFAULT_INIT_SHELL="/sbin/openrc-run";; + *) DEFAULT_INIT_SHELL="/bin/sh" ;; esac - AC_MSG_RESULT([$DEFAULT_INITCONF_DIR]) - AC_SUBST(DEFAULT_INITCONF_DIR) + + AC_MSG_RESULT([$DEFAULT_INIT_SCRIPT:$DEFAULT_INIT_SHELL]) + AC_SUBST(DEFAULT_INIT_SCRIPT) + AC_SUBST(DEFAULT_INIT_SHELL) + + AC_MSG_CHECKING([default nfs server init script]) + AS_IF([test "$VENDOR" = "debian"], + [DEFAULT_INIT_NFS_SERVER="nfs-kernel-server"], + [DEFAULT_INIT_NFS_SERVER="nfs"] + ) + AC_MSG_RESULT([$DEFAULT_INIT_NFS_SERVER]) + AC_SUBST(DEFAULT_INIT_NFS_SERVER) + + AC_MSG_CHECKING([default init config directory]) + case "$VENDOR" in + alpine) initconfdir=/etc/conf.d ;; + gentoo) initconfdir=/etc/conf.d ;; + toss) initconfdir=/etc/sysconfig ;; + redhat) initconfdir=/etc/sysconfig ;; + fedora) initconfdir=/etc/sysconfig ;; + sles) initconfdir=/etc/sysconfig ;; + ubuntu) initconfdir=/etc/default ;; + debian) initconfdir=/etc/default ;; + freebsd) initconfdir=$sysconfdir/rc.conf.d;; + *) initconfdir=/etc/default ;; + esac + AC_MSG_RESULT([$initconfdir]) + AC_SUBST(initconfdir) AC_MSG_CHECKING([whether initramfs-tools is available]) if test -d /usr/share/initramfs-tools ; then - DEFINE_INITRAMFS='--define "_initramfs 1"' + RPM_DEFINE_INITRAMFS='--define "_initramfs 1"' AC_MSG_RESULT([yes]) else - DEFINE_INITRAMFS='' + RPM_DEFINE_INITRAMFS='' AC_MSG_RESULT([no]) fi - 
AC_SUBST(DEFINE_INITRAMFS) + AC_SUBST(RPM_DEFINE_INITRAMFS) ]) dnl # @@ -492,7 +614,9 @@ dnl # Default ZFS package configuration dnl # AC_DEFUN([ZFS_AC_PACKAGE], [ ZFS_AC_DEFAULT_PACKAGE - ZFS_AC_RPM - ZFS_AC_DPKG - ZFS_AC_ALIEN + AS_IF([test x$VENDOR != xfreebsd], [ + ZFS_AC_RPM + ZFS_AC_DPKG + ZFS_AC_ALIEN + ]) ]) diff --git a/config/zfs-meta.m4 b/config/zfs-meta.m4 index aa0fc14209..1c9d246124 100644 --- a/config/zfs-meta.m4 +++ b/config/zfs-meta.m4 @@ -73,14 +73,14 @@ AC_DEFUN([ZFS_AC_META], [ if test ! -f ".nogitrelease" && git rev-parse --git-dir > /dev/null 2>&1; then _match="${ZFS_META_NAME}-${ZFS_META_VERSION}" _alias=$(git describe --match=${_match} 2>/dev/null) - _release=$(echo ${_alias}|cut -f3- -d'-'|sed 's/-/_/g') + _release=$(echo ${_alias}|sed "s/${ZFS_META_NAME}//"|cut -f3- -d'-'|sed 's/-/_/g') if test -n "${_release}"; then ZFS_META_RELEASE=${_release} _zfs_ac_meta_type="git describe" else _match="${ZFS_META_NAME}-${ZFS_META_VERSION}-${ZFS_META_RELEASE}" _alias=$(git describe --match=${_match} 2>/dev/null) - _release=$(echo ${_alias}|cut -f3- -d'-'|sed 's/-/_/g') + _release=$(echo ${_alias}|sed 's/${ZFS_META_NAME}//'|cut -f3- -d'-'|sed 's/-/_/g') if test -n "${_release}"; then ZFS_META_RELEASE=${_release} _zfs_ac_meta_type="git describe" @@ -138,6 +138,24 @@ AC_DEFUN([ZFS_AC_META], [ AC_SUBST([ZFS_META_AUTHOR]) fi + ZFS_META_KVER_MIN=_ZFS_AC_META_GETVAL([Linux-Minimum]); + if test -n "$ZFS_META_KVER_MIN"; then + AC_DEFINE_UNQUOTED([ZFS_META_KVER_MIN], + ["$ZFS_META_KVER_MIN"], + [Define the minimum compatible kernel version.] + ) + AC_SUBST([ZFS_META_KVER_MIN]) + fi + + ZFS_META_KVER_MAX=_ZFS_AC_META_GETVAL([Linux-Maximum]); + if test -n "$ZFS_META_KVER_MAX"; then + AC_DEFINE_UNQUOTED([ZFS_META_KVER_MAX], + ["$ZFS_META_KVER_MAX"], + [Define the maximum compatible kernel version.] 
+ ) + AC_SUBST([ZFS_META_KVER_MAX]) + fi + m4_pattern_allow([^LT_(CURRENT|REVISION|AGE)$]) ZFS_META_LT_CURRENT=_ZFS_AC_META_GETVAL([LT_Current]); ZFS_META_LT_REVISION=_ZFS_AC_META_GETVAL([LT_Revision]); diff --git a/configure.ac b/configure.ac index db614084e3..ebc7b276a6 100644 --- a/configure.ac +++ b/configure.ac @@ -36,7 +36,7 @@ AC_LANG(C) ZFS_AC_META AC_CONFIG_AUX_DIR([config]) AC_CONFIG_MACRO_DIR([config]) -AC_CANONICAL_SYSTEM +AC_CANONICAL_TARGET AM_MAINTAINER_MODE m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AM_INIT_AUTOMAKE([subdir-objects]) @@ -45,9 +45,10 @@ AC_CONFIG_HEADERS([zfs_config.h], [ awk -f ${ac_srcdir}/config/config.awk zfs_config.h.tmp >zfs_config.h && rm zfs_config.h.tmp) || exit 1]) +LT_INIT AC_PROG_INSTALL AC_PROG_CC -AC_PROG_LIBTOOL +AC_PROG_LN_S PKG_PROG_PKG_CONFIG AM_PROG_AS AM_PROG_CC_C_O @@ -55,122 +56,163 @@ AX_CODE_COVERAGE _AM_PROG_TAR(pax) ZFS_AC_LICENSE -ZFS_AC_PACKAGE ZFS_AC_CONFIG +ZFS_AC_PACKAGE ZFS_AC_DEBUG ZFS_AC_DEBUGINFO ZFS_AC_DEBUG_KMEM ZFS_AC_DEBUG_KMEM_TRACKING +ZFS_AC_DEBUG_INVARIANTS AC_CONFIG_FILES([ Makefile - udev/Makefile - udev/rules.d/Makefile + cmd/Makefile + cmd/arc_summary/Makefile + cmd/arcstat/Makefile + cmd/dbufstat/Makefile + cmd/fsck_zfs/Makefile + cmd/mount_zfs/Makefile + cmd/raidz_test/Makefile + cmd/vdev_id/Makefile + cmd/zdb/Makefile + cmd/zed/Makefile + cmd/zed/zed.d/Makefile + cmd/zfs/Makefile + cmd/zfs_ids_to_path/Makefile + cmd/zgenhostid/Makefile + cmd/zhack/Makefile + cmd/zinject/Makefile + cmd/zpool/Makefile + cmd/zstream/Makefile + cmd/ztest/Makefile + cmd/zvol_id/Makefile + cmd/zvol_wait/Makefile + cmd/zpool_influxdb/Makefile + contrib/Makefile + contrib/bash_completion.d/Makefile + contrib/bpftrace/Makefile + contrib/dracut/02zfsexpandknowledge/Makefile + contrib/dracut/90zfs/Makefile + contrib/dracut/Makefile + contrib/initramfs/Makefile + contrib/initramfs/conf.d/Makefile + contrib/initramfs/conf-hooks.d/Makefile + contrib/initramfs/hooks/Makefile + 
contrib/initramfs/scripts/Makefile + contrib/initramfs/scripts/local-top/Makefile + contrib/pam_zfs_key/Makefile + contrib/pyzfs/Makefile + contrib/pyzfs/setup.py + contrib/zcp/Makefile etc/Makefile + etc/default/Makefile etc/init.d/Makefile - etc/zfs/Makefile - etc/systemd/Makefile - etc/systemd/system/Makefile - etc/systemd/system-generators/Makefile - etc/sudoers.d/Makefile etc/modules-load.d/Makefile - man/Makefile - man/man1/Makefile - man/man5/Makefile - man/man8/Makefile + etc/sudoers.d/Makefile + etc/systemd/Makefile + etc/systemd/system-generators/Makefile + etc/systemd/system/Makefile + etc/zfs/Makefile + include/Makefile + include/os/Makefile + include/os/freebsd/Makefile + include/os/freebsd/linux/Makefile + include/os/freebsd/spl/Makefile + include/os/freebsd/spl/acl/Makefile + include/os/freebsd/spl/rpc/Makefile + include/os/freebsd/spl/sys/Makefile + include/os/freebsd/zfs/Makefile + include/os/freebsd/zfs/sys/Makefile + include/os/linux/Makefile + include/os/linux/kernel/Makefile + include/os/linux/kernel/linux/Makefile + include/os/linux/spl/Makefile + include/os/linux/spl/rpc/Makefile + include/os/linux/spl/sys/Makefile + include/os/linux/zfs/Makefile + include/os/linux/zfs/sys/Makefile + include/sys/Makefile + include/sys/crypto/Makefile + include/sys/fm/Makefile + include/sys/fm/fs/Makefile + include/sys/fs/Makefile + include/sys/lua/Makefile + include/sys/sysevent/Makefile + include/sys/zstd/Makefile lib/Makefile - lib/libspl/Makefile - lib/libspl/asm-generic/Makefile - lib/libspl/asm-i386/Makefile - lib/libspl/asm-x86_64/Makefile - lib/libspl/include/Makefile - lib/libspl/include/ia32/Makefile - lib/libspl/include/ia32/sys/Makefile - lib/libspl/include/rpc/Makefile - lib/libspl/include/sys/Makefile - lib/libspl/include/sys/dktp/Makefile - lib/libspl/include/util/Makefile lib/libavl/Makefile lib/libefi/Makefile lib/libicp/Makefile lib/libnvpair/Makefile - lib/libzutil/Makefile + lib/libshare/Makefile + lib/libspl/Makefile + 
lib/libspl/include/Makefile + lib/libspl/include/ia32/Makefile + lib/libspl/include/ia32/sys/Makefile + lib/libspl/include/os/Makefile + lib/libspl/include/os/freebsd/Makefile + lib/libspl/include/os/freebsd/sys/Makefile + lib/libspl/include/os/linux/Makefile + lib/libspl/include/os/linux/sys/Makefile + lib/libspl/include/rpc/Makefile + lib/libspl/include/sys/Makefile + lib/libspl/include/sys/dktp/Makefile + lib/libspl/include/util/Makefile lib/libtpool/Makefile lib/libunicode/Makefile lib/libuutil/Makefile - lib/libzpool/Makefile - lib/libzfs/libzfs.pc - lib/libzfs/libzfs_core.pc lib/libzfs/Makefile + lib/libzfs/libzfs.pc + lib/libzfsbootenv/Makefile + lib/libzfsbootenv/libzfsbootenv.pc lib/libzfs_core/Makefile - lib/libshare/Makefile - cmd/Makefile - cmd/zdb/Makefile - cmd/zhack/Makefile - cmd/zfs/Makefile - cmd/zinject/Makefile - cmd/zpool/Makefile - cmd/zstreamdump/Makefile - cmd/ztest/Makefile - cmd/mount_zfs/Makefile - cmd/fsck_zfs/Makefile - cmd/zvol_id/Makefile - cmd/vdev_id/Makefile - cmd/arcstat/Makefile - cmd/dbufstat/Makefile - cmd/arc_summary/Makefile - cmd/zed/Makefile - cmd/raidz_test/Makefile - cmd/zgenhostid/Makefile - contrib/Makefile - contrib/bash_completion.d/Makefile - contrib/dracut/Makefile - contrib/dracut/02zfsexpandknowledge/Makefile - contrib/dracut/90zfs/Makefile - contrib/initramfs/Makefile - contrib/initramfs/hooks/Makefile - contrib/initramfs/scripts/Makefile - contrib/initramfs/scripts/local-top/Makefile - contrib/pyzfs/Makefile - contrib/pyzfs/setup.py + lib/libzfs_core/libzfs_core.pc + lib/libzpool/Makefile + lib/libzstd/Makefile + lib/libzutil/Makefile + man/Makefile + module/Kbuild module/Makefile module/avl/Makefile + module/icp/Makefile + module/lua/Makefile module/nvpair/Makefile + module/os/linux/spl/Makefile + module/os/linux/zfs/Makefile + module/spl/Makefile module/unicode/Makefile module/zcommon/Makefile module/zfs/Makefile - module/lua/Makefile - module/icp/Makefile - module/spl/Makefile - include/Makefile - 
include/linux/Makefile - include/spl/Makefile - include/spl/rpc/Makefile - include/spl/sys/Makefile - include/sys/Makefile - include/sys/fs/Makefile - include/sys/fm/Makefile - include/sys/fm/fs/Makefile - include/sys/crypto/Makefile - include/sys/sysevent/Makefile - include/sys/lua/Makefile + module/zstd/Makefile + rpm/Makefile + rpm/generic/Makefile + rpm/generic/zfs-dkms.spec + rpm/generic/zfs-kmod.spec + rpm/generic/zfs.spec + rpm/redhat/Makefile + rpm/redhat/zfs-dkms.spec + rpm/redhat/zfs-kmod.spec + rpm/redhat/zfs.spec scripts/Makefile tests/Makefile + tests/runfiles/Makefile tests/test-runner/Makefile tests/test-runner/bin/Makefile tests/test-runner/include/Makefile tests/test-runner/man/Makefile - tests/runfiles/Makefile tests/zfs-tests/Makefile tests/zfs-tests/callbacks/Makefile tests/zfs-tests/cmd/Makefile + tests/zfs-tests/cmd/badsend/Makefile + tests/zfs-tests/cmd/btree_test/Makefile tests/zfs-tests/cmd/chg_usr_exec/Makefile - tests/zfs-tests/cmd/user_ns_exec/Makefile tests/zfs-tests/cmd/devname2devid/Makefile + tests/zfs-tests/cmd/draid/Makefile tests/zfs-tests/cmd/dir_rd_update/Makefile tests/zfs-tests/cmd/file_check/Makefile tests/zfs-tests/cmd/file_trunc/Makefile tests/zfs-tests/cmd/file_write/Makefile + tests/zfs-tests/cmd/get_diff/Makefile tests/zfs-tests/cmd/largest_file/Makefile tests/zfs-tests/cmd/libzfs_input_check/Makefile tests/zfs-tests/cmd/mkbusy/Makefile @@ -179,6 +221,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/cmd/mktree/Makefile tests/zfs-tests/cmd/mmap_exec/Makefile tests/zfs-tests/cmd/mmap_libaio/Makefile + tests/zfs-tests/cmd/mmap_seek/Makefile tests/zfs-tests/cmd/mmapwrite/Makefile tests/zfs-tests/cmd/nvlist_to_lua/Makefile tests/zfs-tests/cmd/randfree_file/Makefile @@ -186,27 +229,35 @@ AC_CONFIG_FILES([ tests/zfs-tests/cmd/readmmap/Makefile tests/zfs-tests/cmd/rename_dir/Makefile tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile + tests/zfs-tests/cmd/send_doall/Makefile + tests/zfs-tests/cmd/stride_dd/Makefile 
tests/zfs-tests/cmd/threadsappend/Makefile + tests/zfs-tests/cmd/user_ns_exec/Makefile tests/zfs-tests/cmd/xattrtest/Makefile tests/zfs-tests/include/Makefile tests/zfs-tests/tests/Makefile tests/zfs-tests/tests/functional/Makefile tests/zfs-tests/tests/functional/acl/Makefile + tests/zfs-tests/tests/functional/acl/off/Makefile tests/zfs-tests/tests/functional/acl/posix/Makefile + tests/zfs-tests/tests/functional/acl/posix-sa/Makefile + tests/zfs-tests/tests/functional/alloc_class/Makefile tests/zfs-tests/tests/functional/arc/Makefile tests/zfs-tests/tests/functional/atime/Makefile tests/zfs-tests/tests/functional/bootfs/Makefile + tests/zfs-tests/tests/functional/btree/Makefile tests/zfs-tests/tests/functional/cache/Makefile tests/zfs-tests/tests/functional/cachefile/Makefile tests/zfs-tests/tests/functional/casenorm/Makefile - tests/zfs-tests/tests/functional/checksum/Makefile tests/zfs-tests/tests/functional/channel_program/Makefile tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile tests/zfs-tests/tests/functional/chattr/Makefile + tests/zfs-tests/tests/functional/checksum/Makefile tests/zfs-tests/tests/functional/clean_mirror/Makefile tests/zfs-tests/tests/functional/cli_root/Makefile tests/zfs-tests/tests/functional/cli_root/zdb/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile @@ -215,15 +266,15 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_diff/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_get/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_inherit/Makefile + 
tests/zfs-tests/tests/functional/cli_root/zfs_jail/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile - tests/zfs-tests/tests/functional/cli_root/zfs/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_program/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_promote/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_property/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile - tests/zfs-tests/tests/functional/cli_root/zfs_remap/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_reservation/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_rollback/Makefile @@ -236,6 +287,8 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_attach/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_clear/Makefile @@ -251,13 +304,12 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/Makefile - tests/zfs-tests/tests/functional/cli_root/zpool/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_offline/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_online/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_remove/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile - tests/zfs-tests/tests/functional/cli_root/zpool_resilver/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_replace/Makefile + 
tests/zfs-tests/tests/functional/cli_root/zpool_resilver/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_set/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile @@ -266,34 +318,41 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile tests/zfs-tests/tests/functional/cli_user/Makefile tests/zfs-tests/tests/functional/cli_user/misc/Makefile tests/zfs-tests/tests/functional/cli_user/zfs_list/Makefile tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile + tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile tests/zfs-tests/tests/functional/compression/Makefile tests/zfs-tests/tests/functional/cp_files/Makefile + tests/zfs-tests/tests/functional/crtime/Makefile tests/zfs-tests/tests/functional/ctime/Makefile tests/zfs-tests/tests/functional/deadman/Makefile tests/zfs-tests/tests/functional/delegate/Makefile tests/zfs-tests/tests/functional/devices/Makefile tests/zfs-tests/tests/functional/events/Makefile tests/zfs-tests/tests/functional/exec/Makefile + tests/zfs-tests/tests/functional/fallocate/Makefile tests/zfs-tests/tests/functional/fault/Makefile + tests/zfs-tests/tests/functional/features/Makefile tests/zfs-tests/tests/functional/features/async_destroy/Makefile tests/zfs-tests/tests/functional/features/large_dnode/Makefile - tests/zfs-tests/tests/functional/features/Makefile tests/zfs-tests/tests/functional/grow/Makefile tests/zfs-tests/tests/functional/history/Makefile tests/zfs-tests/tests/functional/hkdf/Makefile tests/zfs-tests/tests/functional/inheritance/Makefile 
tests/zfs-tests/tests/functional/inuse/Makefile tests/zfs-tests/tests/functional/io/Makefile + tests/zfs-tests/tests/functional/l2arc/Makefile tests/zfs-tests/tests/functional/large_files/Makefile tests/zfs-tests/tests/functional/largest_pool/Makefile - tests/zfs-tests/tests/functional/link_count/Makefile tests/zfs-tests/tests/functional/libzfs/Makefile tests/zfs-tests/tests/functional/limits/Makefile + tests/zfs-tests/tests/functional/link_count/Makefile + tests/zfs-tests/tests/functional/log_spacemap/Makefile tests/zfs-tests/tests/functional/migration/Makefile tests/zfs-tests/tests/functional/mmap/Makefile tests/zfs-tests/tests/functional/mmp/Makefile @@ -303,8 +362,9 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/no_space/Makefile tests/zfs-tests/tests/functional/nopwrite/Makefile tests/zfs-tests/tests/functional/online_offline/Makefile - tests/zfs-tests/tests/functional/pool_names/Makefile + tests/zfs-tests/tests/functional/pam/Makefile tests/zfs-tests/tests/functional/pool_checkpoint/Makefile + tests/zfs-tests/tests/functional/pool_names/Makefile tests/zfs-tests/tests/functional/poolversion/Makefile tests/zfs-tests/tests/functional/privilege/Makefile tests/zfs-tests/tests/functional/procfs/Makefile @@ -312,6 +372,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/pyzfs/Makefile tests/zfs-tests/tests/functional/quota/Makefile tests/zfs-tests/tests/functional/raidz/Makefile + tests/zfs-tests/tests/functional/redacted_send/Makefile tests/zfs-tests/tests/functional/redundancy/Makefile tests/zfs-tests/tests/functional/refquota/Makefile tests/zfs-tests/tests/functional/refreserv/Makefile @@ -326,20 +387,21 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/snapshot/Makefile tests/zfs-tests/tests/functional/snapused/Makefile tests/zfs-tests/tests/functional/sparse/Makefile - tests/zfs-tests/tests/functional/alloc_class/Makefile + tests/zfs-tests/tests/functional/suid/Makefile tests/zfs-tests/tests/functional/threadsappend/Makefile 
tests/zfs-tests/tests/functional/tmpfile/Makefile tests/zfs-tests/tests/functional/trim/Makefile tests/zfs-tests/tests/functional/truncate/Makefile + tests/zfs-tests/tests/functional/upgrade/Makefile tests/zfs-tests/tests/functional/user_namespace/Makefile tests/zfs-tests/tests/functional/userquota/Makefile - tests/zfs-tests/tests/functional/upgrade/Makefile tests/zfs-tests/tests/functional/vdev_zaps/Makefile tests/zfs-tests/tests/functional/write_dirs/Makefile tests/zfs-tests/tests/functional/xattr/Makefile + tests/zfs-tests/tests/functional/zpool_influxdb/Makefile tests/zfs-tests/tests/functional/zvol/Makefile - tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile + tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile tests/zfs-tests/tests/perf/Makefile @@ -347,15 +409,8 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/perf/regression/Makefile tests/zfs-tests/tests/perf/scripts/Makefile tests/zfs-tests/tests/stress/Makefile - rpm/Makefile - rpm/redhat/Makefile - rpm/redhat/zfs.spec - rpm/redhat/zfs-kmod.spec - rpm/redhat/zfs-dkms.spec - rpm/generic/Makefile - rpm/generic/zfs.spec - rpm/generic/zfs-kmod.spec - rpm/generic/zfs-dkms.spec + udev/Makefile + udev/rules.d/Makefile zfs.release ]) diff --git a/contrib/Makefile.am b/contrib/Makefile.am index 81926a83ee..5ec13ece53 100644 --- a/contrib/Makefile.am +++ b/contrib/Makefile.am @@ -1,2 +1,12 @@ -SUBDIRS = bash_completion.d dracut initramfs pyzfs -DIST_SUBDIRS = bash_completion.d dracut initramfs pyzfs +include $(top_srcdir)/config/Shellcheck.am + +SUBDIRS = bash_completion.d pyzfs zcp +if BUILD_LINUX +SUBDIRS += bpftrace dracut initramfs +endif +if PAM_ZFS_ENABLED +SUBDIRS += pam_zfs_key +endif +DIST_SUBDIRS = bash_completion.d bpftrace dracut initramfs pam_zfs_key pyzfs zcp + +SHELLCHECKDIRS = bash_completion.d bpftrace dracut initramfs diff 
--git a/contrib/bash_completion.d/.gitignore b/contrib/bash_completion.d/.gitignore new file mode 100644 index 0000000000..0fd9cc63af --- /dev/null +++ b/contrib/bash_completion.d/.gitignore @@ -0,0 +1 @@ +/zfs diff --git a/contrib/bash_completion.d/Makefile.am b/contrib/bash_completion.d/Makefile.am index 4f13af6b3c..8c8d1acebe 100644 --- a/contrib/bash_completion.d/Makefile.am +++ b/contrib/bash_completion.d/Makefile.am @@ -1,5 +1,13 @@ +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + bashcompletiondir = $(sysconfdir)/bash_completion.d noinst_DATA = zfs -EXTRA_DIST = $(noinst_DATA) +EXTRA_DIST += $(noinst_DATA) +SUBSTFILES += $(noinst_DATA) + +SHELLCHECKSCRIPTS = $(noinst_DATA) +SHELLCHECK_SHELL = bash +SHELLCHECK_IGNORE = ,SC2207 diff --git a/contrib/bash_completion.d/zfs b/contrib/bash_completion.d/zfs.in similarity index 60% rename from contrib/bash_completion.d/zfs rename to contrib/bash_completion.d/zfs.in index 914db43cba..41ce2f871e 100644 --- a/contrib/bash_completion.d/zfs +++ b/contrib/bash_completion.d/zfs.in @@ -1,4 +1,4 @@ -# Copyright (c) 2013, Aneurin Price +# Copyright (c) 2010-2016, Aneurin Price # Permission is hereby granted, free of charge, to any person # obtaining a copy of this software and associated documentation @@ -21,13 +21,14 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. -if [[ -w /dev/zfs ]]; then - __ZFS_CMD="zfs" - __ZPOOL_CMD="zpool" -else - __ZFS_CMD="sudo zfs" - __ZPOOL_CMD="sudo zpool" -fi +__ZFS_CMD="@sbindir@/zfs" +__ZPOOL_CMD="@sbindir@/zpool" + +# Disable bash's built-in hostname completion, as this makes it impossible to +# provide completions containing an @-sign, which is necessary for completing +# snapshot names. If bash_completion is in use, this will already be disabled +# and replaced with better completions anyway. 
+shopt -u hostcomplete __zfs_get_commands() { @@ -51,59 +52,104 @@ __zfs_get_inheritable_properties() __zfs_list_datasets() { - $__ZFS_CMD list -H -o name -t filesystem,volume + $__ZFS_CMD list -H -o name -s name -t filesystem,volume "$@" } __zfs_list_filesystems() { - $__ZFS_CMD list -H -o name -t filesystem + $__ZFS_CMD list -H -o name -s name -t filesystem } __zfs_match_snapshot() { - local base_dataset=${cur%@*} - if [[ $base_dataset != $cur ]] + local base_dataset="${cur%@*}" + if [ "$base_dataset" != "$cur" ] then - $__ZFS_CMD list -H -o name -t snapshot -d 1 $base_dataset + $__ZFS_CMD list -H -o name -s name -t snapshot -d 1 "$base_dataset" else - $__ZFS_CMD list -H -o name -t filesystem,volume | awk '{print $1"@"}' + if [ "$cur" != "" ] && __zfs_list_datasets "$cur" &> /dev/null + then + $__ZFS_CMD list -H -o name -s name -t filesystem -r "$cur" | tail -n +2 + # We output the base dataset name even though we might be + # completing a command that can only take a snapshot, because it + # prevents bash from considering the completion finished when it + # ends in the bare @. + echo "$cur" + echo "$cur@" + else + local datasets + datasets="$(__zfs_list_datasets)" + # As above + echo "$datasets" + if [[ "$cur" == */ ]] + then + # If the current command ends with a slash, then the only way + # it can be completed with a single tab press (ie. in this pass) + # is if it has exactly one child, so that's the only time we + # need to offer a suggestion with an @ appended. 
+ local num_children + # This is actually off by one as zfs list includes the named + # dataset in addition to its children + num_children=$(__zfs_list_datasets -d 1 "${cur%/}" 2> /dev/null | wc -l) + if [[ $num_children != 2 ]] + then + return 0 + fi + fi + echo "$datasets" | awk '{print $1 "@"}' + fi fi } -__zfs_match_explicit_snapshot() +__zfs_match_snapshot_or_bookmark() { - local base_dataset=${cur%@*} - if [[ $base_dataset != $cur ]] + local base_dataset="${cur%[#@]*}" + if [ "$base_dataset" != "$cur" ] then - $__ZFS_CMD list -H -o name -t snapshot -d 1 $base_dataset + if [[ $cur == *@* ]] + then + $__ZFS_CMD list -H -o name -s name -t snapshot -d 1 "$base_dataset" + else + $__ZFS_CMD list -H -o name -s name -t bookmark -d 1 "$base_dataset" + fi + else + $__ZFS_CMD list -H -o name -s name -t filesystem,volume + if [ -e "$cur" ] && $__ZFS_CMD list -H -o name -s name -t filesystem,volume "$cur" &> /dev/null + then + echo "$cur@" + echo "$cur#" + fi fi } __zfs_match_multiple_snapshots() { - local existing_opts=$(expr "$cur" : '\(.*\)[%,]') - if [[ $existing_opts ]] + local existing_opts + existing_opts="$(expr "$cur" : '\(.*\)[%,]')" + if [ -e "$existing_opts" ] then - local base_dataset=${cur%@*} - if [[ $base_dataset != $cur ]] + local base_dataset="${cur%@*}" + if [ "$base_dataset" != "$cur" ] then - local cur=${cur##*,} + local cur="${cur##*,}" if [[ $cur =~ ^%|%.*% ]] then # correct range syntax is start%end return 1 fi - local range_start=$(expr "$cur" : '\(.*%\)') - $__ZFS_CMD list -H -o name -t snapshot -d 1 $base_dataset | sed 's$.*@$'$range_start'$g' + local range_start + range_start="$(expr "$cur" : '\(.*%\)')" + # shellcheck disable=SC2016 + $__ZFS_CMD list -H -o name -s name -t snapshot -d 1 "$base_dataset" | sed 's$.*@$'"$range_start"'$g' fi else - __zfs_match_explicit_snapshot; __zfs_list_datasets + __zfs_match_snapshot_or_bookmark fi } __zfs_list_volumes() { - $__ZFS_CMD list -H -o name -t volume + $__ZFS_CMD list -H -o name -s name -t volume } 
__zfs_argument_chosen() @@ -114,13 +160,13 @@ __zfs_argument_chosen() local prev="${COMP_WORDS[$word]}" if [[ ${COMP_WORDS[$word-1]} != -[tos] ]] then - if [[ "$prev" == [^,]*,* ]] || [[ "$prev" == *[@:]* ]] + if [[ "$prev" == [^,]*,* ]] || [[ "$prev" == *[@:\#]* ]] then return 0 fi - for property in $@ + for property in "$@" do - if [[ $prev == "$property" ]] + if [[ $prev == "$property"* ]] then return 0 fi @@ -136,6 +182,7 @@ __zfs_complete_ordered_arguments() local list2=$2 local cur=$3 local extra=$4 + # shellcheck disable=SC2086 if __zfs_argument_chosen $list1 then COMPREPLY=($(compgen -W "$list2 $extra" -- "$cur")) @@ -148,10 +195,11 @@ __zfs_complete_multiple_options() { local options=$1 local cur=$2 + local existing_opts COMPREPLY=($(compgen -W "$options" -- "${cur##*,}")) - local existing_opts=$(expr "$cur" : '\(.*,\)') - if [[ $existing_opts ]] + existing_opts=$(expr "$cur" : '\(.*,\)') + if [[ $existing_opts ]] then COMPREPLY=( "${COMPREPLY[@]/#/${existing_opts}}" ) fi @@ -169,12 +217,28 @@ __zfs_complete_switch() fi } +__zfs_complete_nospace() +{ + # Google indicates that there may still be bash versions out there that + # don't have compopt. 
+ if type compopt &> /dev/null + then + compopt -o nospace + fi +} + __zfs_complete() { local cur prev cmd cmds COMPREPLY=() - # Don't split on colon - _get_comp_words_by_ref -n : -c cur -p prev -w COMP_WORDS -i COMP_CWORD + if type _get_comp_words_by_ref &> /dev/null + then + # Don't split on colon + _get_comp_words_by_ref -n : -c cur -p prev -w COMP_WORDS -i COMP_CWORD + else + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + fi cmd="${COMP_WORDS[1]}" if [[ ${prev##*/} == zfs ]] @@ -185,10 +249,19 @@ __zfs_complete() fi case "${cmd}" in + bookmark) + if __zfs_argument_chosen + then + COMPREPLY=($(compgen -W "${prev%@*}# ${prev/@/#}" -- "$cur")) + else + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + fi + ;; clone) case "${prev}" in -o) COMPREPLY=($(compgen -W "$(__zfs_get_editable_properties)" -- "$cur")) + __zfs_complete_nospace ;; *) if ! __zfs_complete_switch "o,p" @@ -209,10 +282,10 @@ __zfs_complete() COMPREPLY=($(compgen -W "" -- "$cur")) ;; -t) - __zfs_complete_multiple_options "filesystem volume snapshot all" "$cur" + __zfs_complete_multiple_options "filesystem volume snapshot bookmark all" "$cur" ;; -s) - __zfs_complete_multiple_options "local default inherited temporary none" "$cur" + __zfs_complete_multiple_options "local default inherited temporary received none" "$cur" ;; -o) __zfs_complete_multiple_options "name property value source received all" "$cur" @@ -220,9 +293,10 @@ __zfs_complete() *) if ! __zfs_complete_switch "H,r,p,d,o,t,s" then + # shellcheck disable=SC2046 if __zfs_argument_chosen $(__zfs_get_properties) then - COMPREPLY=($(compgen -W "$(__zfs_match_explicit_snapshot) $(__zfs_list_datasets)" -- "$cur")) + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) else __zfs_complete_multiple_options "$(__zfs_get_properties)" "$cur" fi @@ -233,7 +307,7 @@ __zfs_complete() inherit) if ! 
__zfs_complete_switch "r" then - __zfs_complete_ordered_arguments "$(__zfs_get_inheritable_properties)" "$(__zfs_match_explicit_snapshot) $(__zfs_list_datasets)" $cur + __zfs_complete_ordered_arguments "$(__zfs_get_inheritable_properties)" "$(__zfs_match_snapshot)" "$cur" fi ;; list) @@ -242,7 +316,7 @@ __zfs_complete() COMPREPLY=($(compgen -W "" -- "$cur")) ;; -t) - __zfs_complete_multiple_options "filesystem volume snapshot all" "$cur" + __zfs_complete_multiple_options "filesystem volume snapshot bookmark all" "$cur" ;; -o) __zfs_complete_multiple_options "$(__zfs_get_properties)" "$cur" @@ -253,7 +327,7 @@ __zfs_complete() *) if ! __zfs_complete_switch "H,r,d,o,t,s,S" then - COMPREPLY=($(compgen -W "$(__zfs_match_explicit_snapshot) $(__zfs_list_datasets)" -- "$cur")) + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) fi ;; esac @@ -268,26 +342,39 @@ __zfs_complete() fi ;; send) - if ! __zfs_complete_switch "d,n,P,p,R,v,i,I" + if ! __zfs_complete_switch "D,n,P,p,R,v,e,L,i,I" then - COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + if __zfs_argument_chosen + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + else + if [[ $prev == -*i* ]] + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot_or_bookmark)" -- "$cur")) + else + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + fi + fi fi ;; snapshot) case "${prev}" in -o) COMPREPLY=($(compgen -W "$(__zfs_get_editable_properties)" -- "$cur")) + __zfs_complete_nospace ;; *) if ! 
__zfs_complete_switch "o,r" then - COMPREPLY=($(compgen -W "$(__zfs_list_datasets | awk '{print $1"@"}')" -- "$cur")) + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + __zfs_complete_nospace fi ;; esac ;; set) - __zfs_complete_ordered_arguments "$(__zfs_get_editable_properties)" "$(__zfs_match_explicit_snapshot) $(__zfs_list_datasets)" $cur + __zfs_complete_ordered_arguments "$(__zfs_get_editable_properties)" "$(__zfs_match_snapshot)" "$cur" + __zfs_complete_nospace ;; upgrade) case "${prev}" in @@ -305,14 +392,18 @@ __zfs_complete() destroy) if ! __zfs_complete_switch "d,f,n,p,R,r,v" then - __zfs_complete_multiple_options "$(__zfs_match_multiple_snapshots)" $cur + __zfs_complete_multiple_options "$(__zfs_match_multiple_snapshots)" "$cur" + __zfs_complete_nospace fi ;; *) - COMPREPLY=($(compgen -W "$(__zfs_match_explicit_snapshot) $(__zfs_list_datasets)" -- "$cur")) + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) ;; esac - __ltrim_colon_completions "$cur" + if type __ltrim_colon_completions &> /dev/null + then + __ltrim_colon_completions "$cur" + fi return 0 } @@ -338,7 +429,7 @@ __zpool_list_pools() __zpool_complete() { - local cur prev cmd cmds + local cur prev cmd cmds pools COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" prev="${COMP_WORDS[COMP_CWORD-1]}" @@ -353,7 +444,7 @@ __zpool_complete() case "${cmd}" in get) - __zfs_complete_ordered_arguments "$(__zpool_get_properties)" "$(__zpool_list_pools)" $cur + __zfs_complete_ordered_arguments "$(__zpool_get_properties)" "$(__zpool_list_pools)" "$cur" return 0 ;; import) @@ -366,11 +457,13 @@ __zpool_complete() return 0 ;; set) - __zfs_complete_ordered_arguments "$(__zpool_get_editable_properties)" "$(__zpool_list_pools)" $cur + __zfs_complete_ordered_arguments "$(__zpool_get_editable_properties)" "$(__zpool_list_pools)" "$cur" + __zfs_complete_nospace return 0 ;; add|attach|clear|create|detach|offline|online|remove|replace) - local pools="$(__zpool_list_pools)" + 
pools="$(__zpool_list_pools)" + # shellcheck disable=SC2086 if __zfs_argument_chosen $pools then _filedir diff --git a/contrib/bpftrace/Makefile.am b/contrib/bpftrace/Makefile.am new file mode 100644 index 0000000000..05e4f1c507 --- /dev/null +++ b/contrib/bpftrace/Makefile.am @@ -0,0 +1,7 @@ +include $(top_srcdir)/config/Shellcheck.am + +EXTRA_DIST = \ + taskqlatency.bt \ + zfs-trace.sh + +SHELLCHECKSCRIPTS = zfs-trace.sh diff --git a/contrib/bpftrace/taskqlatency.bt b/contrib/bpftrace/taskqlatency.bt new file mode 100644 index 0000000000..598f9882b3 --- /dev/null +++ b/contrib/bpftrace/taskqlatency.bt @@ -0,0 +1,54 @@ +#include + +kprobe:trace_zfs_taskq_ent__birth +{ + $tqent = (struct taskq_ent *)arg0; + + $tqent_id = $tqent->tqent_id; + $tq_name = str($tqent->tqent_taskq->tq_name); + + @birth[$tq_name, $tqent_id] = nsecs; +} + +kprobe:trace_zfs_taskq_ent__start +{ + $tqent = (struct taskq_ent *)arg0; + + @tqent_id[tid] = $tqent->tqent_id; + @tq_name[tid] = str($tqent->tqent_taskq->tq_name); + + @start[@tq_name[tid], @tqent_id[tid]] = nsecs; +} + +kprobe:trace_zfs_taskq_ent__start +/ @birth[@tq_name[tid], @tqent_id[tid]] / +{ + @queue_lat_us[@tq_name[tid]] = + hist((nsecs - @birth[@tq_name[tid], @tqent_id[tid]])/1000); + delete(@birth[@tq_name[tid], @tqent_id[tid]]); +} + +kprobe:trace_zfs_taskq_ent__finish +/ @start[@tq_name[tid], @tqent_id[tid]] / +{ + $tqent = (struct taskq_ent *)arg0; + + @exec_lat_us[@tq_name[tid], ksym($tqent->tqent_func)] = + hist((nsecs - @start[@tq_name[tid], @tqent_id[tid]])/1000); + delete(@start[@tq_name[tid], @tqent_id[tid]]); +} + +kprobe:trace_zfs_taskq_ent__finish +{ + delete(@tq_name[tid]); + delete(@tqent_id[tid]); +} + +END +{ + clear(@birth); + clear(@start); + + clear(@tq_name); + clear(@tqent_id); +} diff --git a/contrib/bpftrace/zfs-trace.sh b/contrib/bpftrace/zfs-trace.sh new file mode 100755 index 0000000000..54f66f3ba3 --- /dev/null +++ b/contrib/bpftrace/zfs-trace.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +ZVER=$(cut -f 1 -d 
'-' /sys/module/zfs/version) +KVER=$(uname -r) + +exec bpftrace \ + --include "/usr/src/zfs-$ZVER/$KVER/zfs_config.h" \ + -I "/usr/src/zfs-$ZVER/include" \ + -I "/usr/src/zfs-$ZVER/include/spl" \ + "$@" diff --git a/contrib/dracut/02zfsexpandknowledge/Makefile.am b/contrib/dracut/02zfsexpandknowledge/Makefile.am index a5c567c161..b1bbb6bd3a 100644 --- a/contrib/dracut/02zfsexpandknowledge/Makefile.am +++ b/contrib/dracut/02zfsexpandknowledge/Makefile.am @@ -1,22 +1,8 @@ +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + pkgdracutdir = $(dracutdir)/modules.d/02zfsexpandknowledge pkgdracut_SCRIPTS = \ module-setup.sh -EXTRA_DIST = \ - $(top_srcdir)/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in - -$(pkgdracut_SCRIPTS):%:%.in - -$(SED) -e 's,@bindir\@,$(bindir),g' \ - -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@datadir\@,$(datadir),g' \ - -e 's,@dracutdir\@,$(dracutdir),g' \ - -e 's,@udevdir\@,$(udevdir),g' \ - -e 's,@udevruledir\@,$(udevruledir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' - -clean-local:: - -$(RM) $(pkgdracut_SCRIPTS) - -distclean-local:: - -$(RM) $(pkgdracut_SCRIPTS) +SUBSTFILES += $(pkgdracut_SCRIPTS) diff --git a/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in b/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in index c22141f00f..d21ab74cc0 100755 --- a/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in +++ b/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash get_devtype() { local typ @@ -14,22 +14,16 @@ get_pool_devices() { local poolconfigtemp local poolconfigoutput local pooldev - local prefix local resolved - poolconfigtemp=`mktemp` - @sbindir@/zpool list -v -H -P "$1" > "$poolconfigtemp" 2>&1 - if [ "$?" != "0" ] ; then - poolconfigoutput=$(cat "$poolconfigtemp") + poolconfigtemp="$(mktemp)" + if ! 
@sbindir@/zpool list -v -H -P "$1" > "$poolconfigtemp" 2>&1 ; then + poolconfigoutput="$(cat "$poolconfigtemp")" dinfo "zfsexpandknowledge: pool $1 cannot be listed: $poolconfigoutput" else - cat "$poolconfigtemp" | awk -F '\t' '/\t\/dev/ { print $2 }' | \ - while read pooldev ; do - if [ -n "$pooldev" -a -e "$pooldev" ] ; then - if [ -h "$pooldev" ] ; then - resolved=`readlink -f "$pooldev"` - else - resolved="$pooldev" - fi + awk -F '\t' '/\t\/dev/ { print $2 }' "$poolconfigtemp" | \ + while read -r pooldev ; do + if [ -e "$pooldev" ] ; then + resolved="$(readlink -f "$pooldev")" dinfo "zfsexpandknowledge: pool $1 has device $pooldev (which resolves to $resolved)" echo "$resolved" fi @@ -40,22 +34,20 @@ get_pool_devices() { find_zfs_block_devices() { local dev - local blockdev local mp local fstype local pool - local key - local n - local poolconfigoutput - numfields=`head -1 /proc/self/mountinfo | awk '{print NF}'` - if [ "$numfields" == "10" ] ; then - fields="n n n n mp n n fstype dev n" + local _ + numfields="$(awk '{print NF; exit}' /proc/self/mountinfo)" + if [ "$numfields" = "10" ] ; then + fields="_ _ _ _ mp _ _ fstype dev _" else - fields="n n n n mp n n n fstype dev n" + fields="_ _ _ _ mp _ _ _ fstype dev _" fi - while read $fields ; do - if [ "$fstype" != "zfs" ]; then continue ; fi - if [ "$mp" == "$1" ]; then + # shellcheck disable=SC2086 + while read -r ${fields?} ; do + [ "$fstype" = "zfs" ] || continue + if [ "$mp" = "$1" ]; then pool=$(echo "$dev" | cut -d / -f 1) get_pool_devices "$pool" fi @@ -74,13 +66,12 @@ check() { local blockdevs local fstype local majmin - local _slavedev - local _slavedevname - local _slavedevtype - local _slavemajmin - local _dev + local _depdev + local _depdevname + local _depdevtype -if [[ $hostonly ]]; then +# shellcheck disable=SC2154 +if [ -n "$hostonly" ]; then for mp in \ "/" \ @@ -100,23 +91,22 @@ if [[ $hostonly ]]; then mountpoint "$mp" >/dev/null 2>&1 || continue blockdevs=$(find_zfs_block_devices "$mp") if 
[ -z "$blockdevs" ] ; then continue ; fi - dinfo "zfsexpandknowledge: block devices backing ZFS dataset $mp: $blockdevs" + dinfo "zfsexpandknowledge: block devices backing ZFS dataset $mp: ${blockdevs//$'\n'/ }" for dev in $blockdevs do array_contains "$dev" "${host_devs[@]}" || host_devs+=("$dev") fstype=$(get_devtype "$dev") host_fs_types["$dev"]="$fstype" majmin=$(get_maj_min "$dev") - if [[ -d /sys/dev/block/$majmin/slaves ]] ; then - for _slavedev in /sys/dev/block/$majmin/slaves/*; do - [[ -f $_slavedev/dev ]] || continue - _slavedev=/dev/$(basename "$_slavedev") - _slavedevname=$(udevadm info --query=property --name="$_slavedev" | grep "^DEVNAME=" | sed 's|^DEVNAME=||') - _slavedevtype=$(get_devtype "$_slavedevname") - _slavemajmin=$(get_maj_min "$_slavedevname") - dinfo "zfsexpandknowledge: slave block device backing ZFS dataset $mp: $_slavedevname" - array_contains "$_slavedevname" "${host_devs[@]}" || host_devs+=("$_slavedevname") - host_fs_types["$_slavedevname"]="$_slavedevtype" + if [ -d "/sys/dev/block/$majmin/slaves" ] ; then + for _depdev in "/sys/dev/block/$majmin/slaves"/*; do + [[ -f $_depdev/dev ]] || continue + _depdev=/dev/$(basename "$_depdev") + _depdevname=$(udevadm info --query=property --name="$_depdev" | grep "^DEVNAME=" | sed 's|^DEVNAME=||') + _depdevtype=$(get_devtype "$_depdevname") + dinfo "zfsexpandknowledge: underlying block device backing ZFS dataset $mp: ${_depdevname//$'\n'/ }" + array_contains "$_depdevname" "${host_devs[@]}" || host_devs+=("$_depdevname") + host_fs_types["$_depdevname"]="$_depdevtype" done fi done diff --git a/contrib/dracut/90zfs/.gitignore b/contrib/dracut/90zfs/.gitignore index 85c23f75ec..cb84212f3a 100644 --- a/contrib/dracut/90zfs/.gitignore +++ b/contrib/dracut/90zfs/.gitignore @@ -1,9 +1,2 @@ -export-zfs.sh -module-setup.sh -mount-zfs.sh -parse-zfs.sh -zfs-generator.sh -zfs-lib.sh -zfs-load-key.sh -zfs-needshutdown.sh -zfs-env-bootfs.service +*.sh +*.service diff --git 
a/contrib/dracut/90zfs/Makefile.am b/contrib/dracut/90zfs/Makefile.am index 0a557f57f2..3f70503009 100644 --- a/contrib/dracut/90zfs/Makefile.am +++ b/contrib/dracut/90zfs/Makefile.am @@ -1,3 +1,6 @@ +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + pkgdracutdir = $(dracutdir)/modules.d/90zfs pkgdracut_SCRIPTS = \ export-zfs.sh \ @@ -7,31 +10,15 @@ pkgdracut_SCRIPTS = \ zfs-generator.sh \ zfs-load-key.sh \ zfs-needshutdown.sh \ - zfs-lib.sh + zfs-lib.sh \ + import-opts-generator.sh pkgdracut_DATA = \ - zfs-env-bootfs.service + zfs-env-bootfs.service \ + zfs-snapshot-bootfs.service \ + zfs-rollback-bootfs.service -EXTRA_DIST = \ - $(top_srcdir)/contrib/dracut/90zfs/export-zfs.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/module-setup.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/mount-zfs.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/parse-zfs.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/zfs-generator.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/zfs-load-key.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/zfs-needshutdown.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/zfs-lib.sh.in \ - $(top_srcdir)/contrib/dracut/90zfs/zfs-env-bootfs.service.in +SUBSTFILES += $(pkgdracut_SCRIPTS) $(pkgdracut_DATA) -$(pkgdracut_SCRIPTS) $(pkgdracut_DATA) :%:%.in - -$(SED) -e 's,@bindir\@,$(bindir),g' \ - -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@udevdir\@,$(udevdir),g' \ - -e 's,@udevruledir\@,$(udevruledir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - -e 's,@systemdunitdir\@,$(systemdunitdir),g' \ - -e 's,@mounthelperdir\@,$(mounthelperdir),g' \ - $< >'$@' - -distclean-local:: - -$(RM) $(pkgdracut_SCRIPTS) $(pkgdracut_DATA) +# Provided by /bin/sleep, and, again, every implementation of that supports this +CHECKBASHISMS_IGNORE = -e 'sleep only takes one integer' -e 'sleep 0.' 
diff --git a/contrib/dracut/90zfs/import-opts-generator.sh.in b/contrib/dracut/90zfs/import-opts-generator.sh.in new file mode 100755 index 0000000000..8bc8c9b35b --- /dev/null +++ b/contrib/dracut/90zfs/import-opts-generator.sh.in @@ -0,0 +1,5 @@ +#!/bin/sh + +. /lib/dracut-zfs-lib.sh + +echo ZPOOL_IMPORT_OPTS="$ZPOOL_IMPORT_OPTS" diff --git a/contrib/dracut/90zfs/module-setup.sh.in b/contrib/dracut/90zfs/module-setup.sh.in index 4efc4b0186..a4b62da1f7 100755 --- a/contrib/dracut/90zfs/module-setup.sh.in +++ b/contrib/dracut/90zfs/module-setup.sh.in @@ -1,15 +1,14 @@ -#!/bin/bash +#!/usr/bin/env bash +# shellcheck disable=SC2154 check() { # We depend on udev-rules being loaded [ "${1}" = "-d" ] && return 0 # Verify the zfs tool chain - for tool in "@sbindir@/zpool" "@sbindir@/zfs" "@mounthelperdir@/mount.zfs" ; do + for tool in "@sbindir@/zgenhostid" "@sbindir@/zpool" "@sbindir@/zfs" "@mounthelperdir@/mount.zfs" ; do test -x "$tool" || return 1 done - # Verify grep exists - which grep >/dev/null 2>&1 || return 1 return 0 } @@ -38,22 +37,34 @@ install() { inst_rules @udevruledir@/60-zvol.rules dracut_install hostid dracut_install grep + dracut_install @sbindir@/zgenhostid dracut_install @sbindir@/zfs dracut_install @sbindir@/zpool - # Workaround for zfsonlinux/zfs#4749 by ensuring libgcc_s.so(.1) is included - if [[ -n "$(ldd @sbindir@/zpool | grep -F 'libgcc_s.so')" ]]; then + # Workaround for https://github.com/openzfs/zfs/issues/4749 by + # ensuring libgcc_s.so(.1) is included + if ldd @sbindir@/zpool | grep -qF 'libgcc_s.so'; then # Dracut will have already tracked and included it :; - elif command -v gcc-config 2>&1 1>/dev/null; then + elif command -v gcc-config >/dev/null 2>&1; then # On systems with gcc-config (Gentoo, Funtoo, etc.): # Use the current profile to resolve the appropriate path - dracut_install "/usr/lib/gcc/$(s=$(gcc-config -c); echo ${s%-*}/${s##*-})/libgcc_s.so.1" - elif [[ -n "$(ls /usr/lib/libgcc_s.so* 2>/dev/null)" ]]; then + 
s="$(gcc-config -c)" + dracut_install "/usr/lib/gcc/${s%-*}/${s##*-}/libgcc_s.so"* + elif [ "$(echo /usr/lib/libgcc_s.so*)" != "/usr/lib/libgcc_s.so*" ]; then # Try a simple path first dracut_install /usr/lib/libgcc_s.so* + elif [ "$(echo /lib*/libgcc_s.so*)" != "/lib*/libgcc_s.so*" ]; then + # SUSE + dracut_install /lib*/libgcc_s.so* else # Fallback: Guess the path and include all matches - dracut_install /usr/lib/gcc/*/*/libgcc_s.so* + dracut_install /usr/lib*/gcc/**/libgcc_s.so* + fi + # shellcheck disable=SC2050 + if [ @LIBFETCH_DYNAMIC@ != 0 ]; then + for d in $libdirs; do + [ -e "$d/"@LIBFETCH_SONAME@ ] && dracut_install "$d/"@LIBFETCH_SONAME@ + done fi dracut_install @mounthelperdir@/mount.zfs dracut_install @udevdir@/vdev_id @@ -83,31 +94,50 @@ install() { fi # Synchronize initramfs and system hostid - AA=`hostid | cut -b 1,2` - BB=`hostid | cut -b 3,4` - CC=`hostid | cut -b 5,6` - DD=`hostid | cut -b 7,8` - echo -ne "\\x${DD}\\x${CC}\\x${BB}\\x${AA}" > "${initdir}/etc/hostid" + if [ -f @sysconfdir@/hostid ]; then + inst @sysconfdir@/hostid + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @sysconfdir@/hostid + elif HOSTID="$(hostid 2>/dev/null)" && [ "${HOSTID}" != "00000000" ]; then + zgenhostid -o "${initdir}@sysconfdir@/hostid" "${HOSTID}" + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @sysconfdir@/hostid + fi if dracut_module_included "systemd"; then mkdir -p "${initdir}/$systemdsystemunitdir/zfs-import.target.wants" - for _item in scan cache ; do - dracut_install @systemdunitdir@/zfs-import-$_item.service - if ! 
[ -L "${initdir}/$systemdsystemunitdir/zfs-import.target.wants"/zfs-import-$_item.service ]; then - ln -s ../zfs-import-$_item.service "${initdir}/$systemdsystemunitdir/zfs-import.target.wants"/zfs-import-$_item.service - type mark_hostonly >/dev/null 2>&1 && mark_hostonly @systemdunitdir@/zfs-import-$_item.service + for _service in "zfs-import-scan.service" "zfs-import-cache.service" ; do + dracut_install "@systemdunitdir@/$_service" + if ! [ -L "${initdir}/$systemdsystemunitdir/zfs-import.target.wants/$_service" ]; then + ln -sf ../$_service "${initdir}/$systemdsystemunitdir/zfs-import.target.wants/$_service" + type mark_hostonly >/dev/null 2>&1 && mark_hostonly "@systemdunitdir@/$_service" fi done + inst "${moddir}"/zfs-env-bootfs.service "${systemdsystemunitdir}"/zfs-env-bootfs.service ln -s ../zfs-env-bootfs.service "${initdir}/${systemdsystemunitdir}/zfs-import.target.wants"/zfs-env-bootfs.service type mark_hostonly >/dev/null 2>&1 && mark_hostonly @systemdunitdir@/zfs-env-bootfs.service + dracut_install systemd-ask-password dracut_install systemd-tty-ask-password-agent + mkdir -p "${initdir}/$systemdsystemunitdir/initrd.target.wants" dracut_install @systemdunitdir@/zfs-import.target if ! [ -L "${initdir}/$systemdsystemunitdir/initrd.target.wants"/zfs-import.target ]; then ln -s ../zfs-import.target "${initdir}/$systemdsystemunitdir/initrd.target.wants"/zfs-import.target type mark_hostonly >/dev/null 2>&1 && mark_hostonly @systemdunitdir@/zfs-import.target fi + + for _service in zfs-snapshot-bootfs.service zfs-rollback-bootfs.service ; do + inst "${moddir}/$_service" "${systemdsystemunitdir}/$_service" + if ! 
[ -L "${initdir}/$systemdsystemunitdir/initrd.target.wants/$_service" ]; then + ln -s "../$_service" "${initdir}/$systemdsystemunitdir/initrd.target.wants/$_service" + fi + done + + # There isn't a pkg-config variable for this, + # and dracut doesn't automatically resolve anything this'd be next to + local systemdsystemenvironmentgeneratordir + systemdsystemenvironmentgeneratordir="$(pkg-config --variable=prefix systemd || echo "/usr")/lib/systemd/system-environment-generators" + mkdir -p "${initdir}/${systemdsystemenvironmentgeneratordir}" + inst "${moddir}"/import-opts-generator.sh "${systemdsystemenvironmentgeneratordir}"/zfs-import-opts.sh fi } diff --git a/contrib/dracut/90zfs/mount-zfs.sh.in b/contrib/dracut/90zfs/mount-zfs.sh.in index 23f7e3e295..68e3f0e0d6 100755 --- a/contrib/dracut/90zfs/mount-zfs.sh.in +++ b/contrib/dracut/90zfs/mount-zfs.sh.in @@ -1,4 +1,5 @@ #!/bin/sh +# shellcheck disable=SC2034,SC2154 . /lib/dracut-zfs-lib.sh @@ -38,11 +39,10 @@ modprobe zfs 2>/dev/null udevadm settle if [ "${root}" = "zfs:AUTO" ] ; then - ZFS_DATASET="$(find_bootfs)" - if [ $? -ne 0 ] ; then + if ! ZFS_DATASET="$(find_bootfs)" ; then + # shellcheck disable=SC2086 zpool import -N -a ${ZPOOL_IMPORT_OPTS} - ZFS_DATASET="$(find_bootfs)" - if [ $? -ne 0 ] ; then + if ! ZFS_DATASET="$(find_bootfs)" ; then warn "ZFS: No bootfs attribute found in importable pools." export_all -F @@ -58,15 +58,19 @@ ZFS_POOL="${ZFS_DATASET%%/*}" if import_pool "${ZFS_POOL}" ; then # Load keys if we can or if we need to - if [ $(zpool list -H -o feature@encryption $(echo "${ZFS_POOL}" | awk -F\/ '{print $1}')) = 'active' ]; then + if [ "$(zpool list -H -o feature@encryption "${ZFS_POOL}")" = 'active' ]; then # if the root dataset has encryption enabled ENCRYPTIONROOT="$(zfs get -H -o value encryptionroot "${ZFS_DATASET}")" if ! 
[ "${ENCRYPTIONROOT}" = "-" ]; then - # decrypt them - ask_for_password \ - --tries 5 \ - --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}: " \ - --cmd "zfs load-key '${ENCRYPTIONROOT}'" + KEYSTATUS="$(zfs get -H -o value keystatus "${ENCRYPTIONROOT}")" + # if the key needs to be loaded + if [ "$KEYSTATUS" = "unavailable" ]; then + # decrypt them + ask_for_password \ + --tries 5 \ + --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}: " \ + --cmd "zfs load-key '${ENCRYPTIONROOT}'" + fi fi fi # Let us tell the initrd to run on shutdown. diff --git a/contrib/dracut/90zfs/parse-zfs.sh.in b/contrib/dracut/90zfs/parse-zfs.sh.in index eccfdc6bcb..fe786a8806 100755 --- a/contrib/dracut/90zfs/parse-zfs.sh.in +++ b/contrib/dracut/90zfs/parse-zfs.sh.in @@ -1,4 +1,5 @@ #!/bin/sh +# shellcheck disable=SC2034,SC2154 . /lib/dracut-lib.sh @@ -6,11 +7,7 @@ spl_hostid=$(getarg spl_hostid=) if [ -n "${spl_hostid}" ] ; then info "ZFS: Using hostid from command line: ${spl_hostid}" - AA=$(echo "${spl_hostid}" | cut -b 1,2) - BB=$(echo "${spl_hostid}" | cut -b 3,4) - CC=$(echo "${spl_hostid}" | cut -b 5,6) - DD=$(echo "${spl_hostid}" | cut -b 7,8) - echo -ne "\\x${DD}\\x${CC}\\x${BB}\\x${AA}" >/etc/hostid + zgenhostid -f "${spl_hostid}" elif [ -f "/etc/hostid" ] ; then info "ZFS: Using hostid from /etc/hostid: $(hostid)" else @@ -32,7 +29,7 @@ case "${root}" in info "ZFS: Enabling autodetection of bootfs after udev settles." ;; - ZFS\=*|zfs:*|zfs:FILESYSTEM\=*|FILESYSTEM\=*) + ZFS=*|zfs:*|FILESYSTEM=*) # root is explicit ZFS root. Parse it now. We can handle # a root=... 
param in any of the following formats: # root=ZFS=rpool/ROOT diff --git a/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/contrib/dracut/90zfs/zfs-env-bootfs.service.in index 3cdf69100d..e143cb5ec1 100644 --- a/contrib/dracut/90zfs/zfs-env-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -8,7 +8,7 @@ Before=zfs-import.target [Service] Type=oneshot -ExecStart=/bin/sh -c "/bin/systemctl set-environment BOOTFS=$(@sbindir@/zpool list -H -o bootfs | grep -m1 -v '^-$')" +ExecStart=/bin/sh -c "exec systemctl set-environment BOOTFS=$(@sbindir@/zpool list -H -o bootfs | grep -m1 -v '^-$')" [Install] WantedBy=zfs-import.target diff --git a/contrib/dracut/90zfs/zfs-generator.sh.in b/contrib/dracut/90zfs/zfs-generator.sh.in index 0b8a8aaca7..b57c64c688 100755 --- a/contrib/dracut/90zfs/zfs-generator.sh.in +++ b/contrib/dracut/90zfs/zfs-generator.sh.in @@ -1,6 +1,8 @@ -#!/bin/bash +#!/bin/sh +# shellcheck disable=SC2016,SC1004 -echo "zfs-generator: starting" >> /dev/kmsg +grep -wq debug /proc/cmdline && debug=1 +[ -n "$debug" ] && echo "zfs-generator: starting" >> /dev/kmsg GENERATOR_DIR="$1" [ -n "$GENERATOR_DIR" ] || { @@ -10,52 +12,108 @@ GENERATOR_DIR="$1" [ -f /lib/dracut-lib.sh ] && dracutlib=/lib/dracut-lib.sh [ -f /usr/lib/dracut/modules.d/99base/dracut-lib.sh ] && dracutlib=/usr/lib/dracut/modules.d/99base/dracut-lib.sh - -type getarg >/dev/null 2>&1 || { - echo "zfs-generator: loading Dracut library from $dracutlib" >> /dev/kmsg +command -v getarg >/dev/null 2>&1 || { + [ -n "$debug" ] && echo "zfs-generator: loading Dracut library from $dracutlib" >> /dev/kmsg . "$dracutlib" } +. /lib/dracut-zfs-lib.sh + [ -z "$root" ] && root=$(getarg root=) [ -z "$rootfstype" ] && rootfstype=$(getarg rootfstype=) [ -z "$rootflags" ] && rootflags=$(getarg rootflags=) # If root is not ZFS= or zfs: or rootfstype is not zfs # then we are not supposed to handle it. 
-[ "${root##zfs:}" = "${root}" -a "${root##ZFS=}" = "${root}" -a "$rootfstype" != "zfs" ] && exit 0 +[ "${root##zfs:}" = "${root}" ] && + [ "${root##ZFS=}" = "${root}" ] && + [ "$rootfstype" != "zfs" ] && + exit 0 -rootfstype=zfs -if echo "${rootflags}" | grep -Eq '^zfsutil$|^zfsutil,|,zfsutil$|,zfsutil,' ; then - true -elif test -n "${rootflags}" ; then - rootflags="zfsutil,${rootflags}" -else - rootflags=zfsutil +case ",${rootflags}," in + *,zfsutil,*) ;; + ,,) rootflags=zfsutil ;; + *) rootflags="zfsutil,${rootflags}" ;; +esac + +if [ "${root}" != "zfs:AUTO" ]; then + root="${root##zfs:}" + root="${root##ZFS=}" fi -echo "zfs-generator: writing extension for sysroot.mount to $GENERATOR_DIR"/sysroot.mount.d/zfs-enhancement.conf >> /dev/kmsg +[ -n "$debug" ] && echo "zfs-generator: writing extension for sysroot.mount to $GENERATOR_DIR/sysroot.mount.d/zfs-enhancement.conf" >> /dev/kmsg -[ -d "$GENERATOR_DIR" ] || mkdir "$GENERATOR_DIR" -[ -d "$GENERATOR_DIR"/sysroot.mount.d ] || mkdir "$GENERATOR_DIR"/sysroot.mount.d +mkdir -p "$GENERATOR_DIR"/sysroot.mount.d "$GENERATOR_DIR"/initrd-root-fs.target.requires "$GENERATOR_DIR"/dracut-pre-mount.service.d { echo "[Unit]" echo "Before=initrd-root-fs.target" echo "After=zfs-import.target" + echo echo "[Mount]" - if [ "${root}" = "zfs:AUTO" ] ; then + if [ "${root}" = "zfs:AUTO" ]; then echo "PassEnvironment=BOOTFS" echo 'What=${BOOTFS}' else - root="${root##zfs:}" - root="${root##ZFS=}" echo "What=${root}" fi - echo "Type=${rootfstype}" + echo "Type=zfs" echo "Options=${rootflags}" } > "$GENERATOR_DIR"/sysroot.mount.d/zfs-enhancement.conf +ln -fs ../sysroot.mount "$GENERATOR_DIR"/initrd-root-fs.target.requires/sysroot.mount -[ -d "$GENERATOR_DIR"/initrd-root-fs.target.requires ] || mkdir -p "$GENERATOR_DIR"/initrd-root-fs.target.requires -ln -s ../sysroot.mount "$GENERATOR_DIR"/initrd-root-fs.target.requires/sysroot.mount -echo "zfs-generator: finished" >> /dev/kmsg \ No newline at end of file +if [ "${root}" = "zfs:AUTO" 
]; then + { + echo "[Unit]" + echo "Before=initrd-root-fs.target" + echo "After=sysroot.mount" + echo "DefaultDependencies=no" + echo + echo "[Service]" + echo "Type=oneshot" + echo "PassEnvironment=BOOTFS" + echo "ExecStart=/bin/sh -c '" ' \ + . /lib/dracut-zfs-lib.sh; \ + _zfs_nonroot_necessities_cb() { \ + zfs mount | grep -m1 -q "^$1 " && return 0; \ + echo "Mounting $1 on /sysroot$2"; \ + mount -o zfsutil -t zfs "$1" "/sysroot$2"; \ + }; \ + for_relevant_root_children "${BOOTFS}" _zfs_nonroot_necessities_cb;' \ + "'" + } > "$GENERATOR_DIR"/zfs-nonroot-necessities.service + ln -fs ../zfs-nonroot-necessities.service "$GENERATOR_DIR"/initrd-root-fs.target.requires/zfs-nonroot-necessities.service +else + # We can solve this statically at generation time, so do! + _zfs_generator_cb() { + dset="${1}" + mpnt="${2}" + unit="sysroot$(echo "$mpnt" | sed 's;/;-;g').mount" + + { + echo "[Unit]" + echo "Before=initrd-root-fs.target" + echo "After=sysroot.mount" + echo + echo "[Mount]" + echo "Where=/sysroot${mpnt}" + echo "What=${dset}" + echo "Type=zfs" + echo "Options=zfsutil" + } > "$GENERATOR_DIR/${unit}" + ln -fs ../"${unit}" "$GENERATOR_DIR"/initrd-root-fs.target.requires/"${unit}" + } + + for_relevant_root_children "${root}" _zfs_generator_cb +fi + + +{ + echo "[Unit]" + echo "After=zfs-import.target" +} > "$GENERATOR_DIR"/dracut-pre-mount.service.d/zfs-enhancement.conf + +[ -n "$debug" ] && echo "zfs-generator: finished" >> /dev/kmsg + +exit 0 diff --git a/contrib/dracut/90zfs/zfs-lib.sh.in b/contrib/dracut/90zfs/zfs-lib.sh.in index 23c07af9e8..defc0bfc8e 100755 --- a/contrib/dracut/90zfs/zfs-lib.sh.in +++ b/contrib/dracut/90zfs/zfs-lib.sh.in @@ -5,12 +5,8 @@ command -v getargbool >/dev/null || { # Compatibility with older Dracut versions. # With apologies to the Dracut developers. getargbool() { - if ! [ -z "$_b" ]; then - unset _b - fi _default="$1"; shift - _b=$(getarg "$@") - [ $? -ne 0 ] && [ -z "$_b" ] && _b="$_default" + ! 
_b=$(getarg "$@") && [ -z "$_b" ] && _b="$_default" if [ -n "$_b" ]; then [ "$_b" = "0" ] && return 1 [ "$_b" = "no" ] && return 1 @@ -23,6 +19,7 @@ command -v getargbool >/dev/null || { OLDIFS="${IFS}" NEWLINE=" " +TAB=" " ZPOOL_IMPORT_OPTS="" if getargbool 0 zfs_force -y zfs.force -y zfsforce ; then @@ -58,10 +55,11 @@ find_bootfs() { # import_pool POOL # imports the given zfs pool if it isn't imported already. import_pool() { - pool="${1}" + pool="${1}" if ! zpool list -H "${pool}" > /dev/null 2>&1; then info "ZFS: Importing pool ${pool}..." + # shellcheck disable=SC2086 if ! zpool import -N ${ZPOOL_IMPORT_OPTS} "${pool}" ; then warn "ZFS: Unable to import pool ${pool}" return 1 @@ -71,32 +69,67 @@ import_pool() { return 0 } +_mount_dataset_cb() { + mount -o zfsutil -t zfs "${1}" "${NEWROOT}${2}" +} + # mount_dataset DATASET # mounts the given zfs dataset. mount_dataset() { - dataset="${1}" + dataset="${1}" mountpoint="$(zfs get -H -o value mountpoint "${dataset}")" + ret=0 # We need zfsutil for non-legacy mounts and not for legacy mounts. if [ "${mountpoint}" = "legacy" ] ; then - mount -t zfs "${dataset}" "${NEWROOT}" + mount -t zfs "${dataset}" "${NEWROOT}" || ret=$? else - mount -o zfsutil -t zfs "${dataset}" "${NEWROOT}" + mount -o zfsutil -t zfs "${dataset}" "${NEWROOT}" || ret=$? + + if [ "$ret" = "0" ]; then + for_relevant_root_children "${dataset}" _mount_dataset_cb || ret=$? + fi fi - return $? + return ${ret} +} + +# for_relevant_root_children DATASET EXEC +# Runs "EXEC dataset mountpoint" for all children of DATASET that are needed for system bringup +# Used by zfs-generator.sh and friends, too! 
+for_relevant_root_children() { + dataset="${1}" + exec="${2}" + + zfs list -t filesystem -Ho name,mountpoint,canmount -r "${dataset}" | + ( + _ret=0 + while IFS="${TAB}" read -r dataset mountpoint canmount; do + [ "$canmount" != "on" ] && continue + + case "$mountpoint" in + /etc|/bin|/lib|/lib??|/libx32|/usr) + # If these aren't mounted we may not be able to get to the real init at all, or pollute the dataset holding the rootfs + "${exec}" "${dataset}" "${mountpoint}" || _ret=$? + ;; + *) + # Up to the real init to remount everything else it might need + ;; + esac + done + exit ${_ret} + ) } # export_all OPTS # exports all imported zfs pools. export_all() { - opts="${@}" ret=0 IFS="${NEWLINE}" for pool in $(zpool list -H -o name) ; do if zpool list -H "${pool}" > /dev/null 2>&1; then - zpool export "${pool}" ${opts} || ret=$? + zpool export "${pool}" "$@" || ret=$? fi done IFS="${OLDIFS}" @@ -144,10 +177,10 @@ ask_for_password() { { flock -s 9; # Prompt for password with plymouth, if installed and running. - if whereis plymouth >/dev/null 2>&1 && plymouth --ping 2>/dev/null; then + if plymouth --ping 2>/dev/null; then plymouth ask-for-password \ - --prompt "$ply_prompt" --number-of-tries="$ply_tries" \ - --command="$ply_cmd" + --prompt "$ply_prompt" --number-of-tries="$ply_tries" | \ + eval "$ply_cmd" ret=$? 
else if [ "$tty_echo_off" = yes ]; then diff --git a/contrib/dracut/90zfs/zfs-load-key.sh.in b/contrib/dracut/90zfs/zfs-load-key.sh.in index 9e7adfc797..2138ff943c 100755 --- a/contrib/dracut/90zfs/zfs-load-key.sh.in +++ b/contrib/dracut/90zfs/zfs-load-key.sh.in @@ -1,7 +1,8 @@ #!/bin/sh +# shellcheck disable=SC2154 # only run this on systemd systems, we handle the decrypt in mount-zfs.sh in the mount hook otherwise -[ -e /bin/systemctl ] || return 0 +[ -e /bin/systemctl ] || [ -e /usr/bin/systemctl ] || return 0 # This script only gets executed on systemd systems, see mount-zfs.sh for non-systemd systems @@ -17,31 +18,56 @@ [ "${root##zfs:}" = "${root}" ] && [ "${root##ZFS=}" = "${root}" ] && [ "$rootfstype" != "zfs" ] && exit 0 # There is a race between the zpool import and the pre-mount hooks, so we wait for a pool to be imported -while true; do - zpool list -H | grep -q -v '^$' && break - [ "$(systemctl is-failed zfs-import-cache.service)" = 'failed' ] && exit 1 - [ "$(systemctl is-failed zfs-import-scan.service)" = 'failed' ] && exit 1 +while [ "$(zpool list -H)" = "" ]; do + systemctl is-failed --quiet zfs-import-cache.service zfs-import-scan.service && exit 1 sleep 0.1s done # run this after import as zfs-import-cache/scan service is confirmed good +# we do not overwrite the ${root} variable, but create a new one, BOOTFS, to hold the dataset if [ "${root}" = "zfs:AUTO" ] ; then - root="$(zpool list -H -o bootfs | awk '$1 != "-" {print; exit}')" + BOOTFS="$(zpool list -H -o bootfs | awk '$1 != "-" {print; exit}')" else - root="${root##zfs:}" - root="${root##ZFS=}" + BOOTFS="${root##zfs:}" + BOOTFS="${BOOTFS##ZFS=}" fi # if pool encryption is active and the zfs command understands '-o encryption' -if [ "$(zpool list -H -o feature@encryption $(echo "${root}" | awk -F\/ '{print $1}'))" = 'active' ]; then +if [ "$(zpool list -H -o feature@encryption "${BOOTFS%%/*}")" = 'active' ]; then # if the root dataset has encryption enabled - ENCRYPTIONROOT=$(zfs get -H -o 
value encryptionroot "${root}") + ENCRYPTIONROOT="$(zfs get -H -o value encryptionroot "${BOOTFS}")" if ! [ "${ENCRYPTIONROOT}" = "-" ]; then - # decrypt them - TRY_COUNT=5 - while [ $TRY_COUNT -gt 0 ]; do - systemd-ask-password "Encrypted ZFS password for ${root}" --no-tty | zfs load-key "${ENCRYPTIONROOT}" && break - TRY_COUNT=$((TRY_COUNT - 1)) - done + KEYSTATUS="$(zfs get -H -o value keystatus "${ENCRYPTIONROOT}")" + # continue only if the key needs to be loaded + [ "$KEYSTATUS" = "unavailable" ] || exit 0 + + KEYLOCATION="$(zfs get -H -o value keylocation "${ENCRYPTIONROOT}")" + case "${KEYLOCATION%%://*}" in + prompt) + for _ in 1 2 3; do + systemd-ask-password "Encrypted ZFS password for ${BOOTFS}" --no-tty | zfs load-key "${ENCRYPTIONROOT}" && break + done + ;; + http*) + systemctl start network-online.target + zfs load-key "${ENCRYPTIONROOT}" + ;; + file) + KEYFILE="${KEYLOCATION#file://}" + [ -r "${KEYFILE}" ] || udevadm settle + [ -r "${KEYFILE}" ] || { + info "Waiting for key ${KEYFILE} for ${ENCRYPTIONROOT}..." + for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do + sleep 0.5s + [ -r "${KEYFILE}" ] && break + done + } + [ -r "${KEYFILE}" ] || warn "Key ${KEYFILE} for ${ENCRYPTIONROOT} hasn't appeared. Trying anyway." + zfs load-key "${ENCRYPTIONROOT}" + ;; + *) + zfs load-key "${ENCRYPTIONROOT}" + ;; + esac fi fi diff --git a/contrib/dracut/90zfs/zfs-needshutdown.sh.in b/contrib/dracut/90zfs/zfs-needshutdown.sh.in index e3d1b59cca..dd6de30c27 100755 --- a/contrib/dracut/90zfs/zfs-needshutdown.sh.in +++ b/contrib/dracut/90zfs/zfs-needshutdown.sh.in @@ -1,6 +1,6 @@ #!/bin/sh -type getarg >/dev/null 2>&1 || . /lib/dracut-lib.sh +command -v getarg >/dev/null 2>&1 || . /lib/dracut-lib.sh if zpool list 2>&1 | grep -q 'no pools available' ; then info "ZFS: No active pools, no need to export anything." 
diff --git a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in new file mode 100644 index 0000000000..bdc2469432 --- /dev/null +++ b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in @@ -0,0 +1,14 @@ +[Unit] +Description=Rollback bootfs just before it is mounted +Requisite=zfs-import.target +After=zfs-import.target zfs-snapshot-bootfs.service +Before=dracut-mount.service +DefaultDependencies=no +ConditionKernelCommandLine=bootfs.rollback + +[Service] +# ${BOOTFS} should have been set by zfs-env-bootfs.service +Type=oneshot +ExecStartPre=/bin/sh -c 'test -n "${BOOTFS}"' +ExecStart=/bin/sh -c '. /lib/dracut-lib.sh; SNAPNAME="$(getarg bootfs.rollback)"; exec @sbindir@/zfs rollback -Rf "${BOOTFS}@${SNAPNAME:-%v}"' +RemainAfterExit=yes diff --git a/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in b/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in new file mode 100644 index 0000000000..6ea13850c3 --- /dev/null +++ b/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in @@ -0,0 +1,14 @@ +[Unit] +Description=Snapshot bootfs just before it is mounted +Requisite=zfs-import.target +After=zfs-import.target +Before=dracut-mount.service +DefaultDependencies=no +ConditionKernelCommandLine=bootfs.snapshot + +[Service] +# ${BOOTFS} should have been set by zfs-env-bootfs.service +Type=oneshot +ExecStartPre=/bin/sh -c 'test -n "${BOOTFS}"' +ExecStart=-/bin/sh -c '. 
/lib/dracut-lib.sh; SNAPNAME="$(getarg bootfs.snapshot)"; exec @sbindir@/zfs snapshot "${BOOTFS}@${SNAPNAME:-%v}"' +RemainAfterExit=yes diff --git a/contrib/dracut/Makefile.am b/contrib/dracut/Makefile.am index 1065e5e94f..8c9a6be089 100644 --- a/contrib/dracut/Makefile.am +++ b/contrib/dracut/Makefile.am @@ -1,3 +1,6 @@ +include $(top_srcdir)/config/Shellcheck.am + SUBDIRS = 02zfsexpandknowledge 90zfs +SHELLCHECKDIRS = $(SUBDIRS) EXTRA_DIST = README.dracut.markdown diff --git a/contrib/dracut/README.dracut.markdown b/contrib/dracut/README.dracut.markdown index b5fb288a13..f31543c3cf 100644 --- a/contrib/dracut/README.dracut.markdown +++ b/contrib/dracut/README.dracut.markdown @@ -59,6 +59,30 @@ to recover from this, you may use the `zfs_force` option or boot from a different filesystem and `zpool import -f` then `zpool export` the pool before rebooting with the new hostid. +* `bootfs.snapshot`: If listed, enables the zfs-snapshot-bootfs service on a Dracut system. The zfs-snapshot-bootfs service simply runs `zfs snapshot $BOOTFS@%v` after the pool has been imported but before the bootfs is mounted. `$BOOTFS` is substituted with the value of the bootfs setting on the pool. `%v` is substituted with the version string of the kernel currently being booted (e.g. 5.6.6-200.fc31.x86\_64). Failure to create the snapshot (e.g. because one with the same name already exists) will be logged, but will not otherwise interrupt the boot process. + + It is safe to leave the bootfs.snapshot flag set persistently on your kernel command line so that a new snapshot of your bootfs will be created on every kernel update. If you leave bootfs.snapshot set persistently on your kernel command line, you may find the below script helpful for automatically removing old snapshots of the bootfs along with their associated kernel. 
+ + #!/usr/bin/sh + + if [[ "$1" == "remove" ]] && grep -q "\bbootfs.snapshot\b" /proc/cmdline; then + zfs destroy $(findmnt -n -o source /)@$2 &> /dev/null + fi + + exit 0 + + To use the above script place it in a plain text file named /etc/kernel/install.d/99-zfs-cleanup.install and mark it executable with the following command: + + $ chmod +x /etc/kernel/install.d/99-zfs-cleanup.install + + On Red Hat based systems, you can change the value of `installonly_limit` in /etc/dnf/dnf.conf to adjust the number of kernels and their associated snapshots that are kept. + +* `bootfs.snapshot=<snapname>`: Is identical to the bootfs.snapshot parameter explained above except that the value substituted for \<snapname\> will be used when creating the snapshot instead of the version string of the kernel currently being booted. + +* `bootfs.rollback`: If listed, enables the zfs-rollback-bootfs service on a Dracut system. The zfs-rollback-bootfs service simply runs `zfs rollback -Rf $BOOTFS@%v` after the pool has been imported but before the bootfs is mounted. If the rollback operation fails, the boot process will be interrupted with a Dracut rescue shell. __Use this parameter with caution. Intermediate snapshots of the bootfs will be destroyed!__ TIP: Keep your user data (e.g. /home) on separate file systems (it can be in the same pool though). + +* `bootfs.rollback=<snapname>`: Is identical to the bootfs.rollback parameter explained above except that the value substituted for \<snapname\> will be used when rolling back the bootfs instead of the version string of the kernel currently being booted. If you use this form, choose a snapshot that is new enough to contain the needed kernel modules under /lib/modules or use a kernel that has all the needed modules built-in. 
+ How it Works ============ diff --git a/contrib/initramfs/Makefile.am b/contrib/initramfs/Makefile.am index 87ec7a86f5..931ceb1316 100644 --- a/contrib/initramfs/Makefile.am +++ b/contrib/initramfs/Makefile.am @@ -1,23 +1,12 @@ -initrddir = $(datarootdir)/initramfs-tools +include $(top_srcdir)/config/Shellcheck.am -initrd_SCRIPTS = \ - conf.d/zfs conf-hooks.d/zfs hooks/zfs scripts/zfs scripts/local-top/zfs +initrddir = /usr/share/initramfs-tools -SUBDIRS = hooks scripts +dist_initrd_SCRIPTS = \ + zfsunlock + +SUBDIRS = conf.d conf-hooks.d hooks scripts +SHELLCHECKDIRS = hooks scripts EXTRA_DIST = \ - $(top_srcdir)/contrib/initramfs/conf.d/zfs \ - $(top_srcdir)/contrib/initramfs/conf-hooks.d/zfs \ - $(top_srcdir)/contrib/initramfs/README.initramfs.markdown - -install-initrdSCRIPTS: $(EXTRA_DIST) - for d in conf.d conf-hooks.d hooks scripts scripts/local-top; do \ - $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ - cp $(top_srcdir)/contrib/initramfs/$$d/zfs \ - $(DESTDIR)$(initrddir)/$$d/; \ - done - if [ -f etc/init.d/zfs ]; then \ - $(MKDIR_P) $(DESTDIR)$(DEFAULT_INITCONF_DIR); \ - cp $(top_srcdir)/etc/init.d/zfs \ - $(DESTDIR)$(DEFAULT_INITCONF_DIR)/; \ - fi + README.initramfs.markdown diff --git a/contrib/initramfs/README.initramfs.markdown b/contrib/initramfs/README.initramfs.markdown index fa19f001af..34e9bab3c7 100644 --- a/contrib/initramfs/README.initramfs.markdown +++ b/contrib/initramfs/README.initramfs.markdown @@ -1,94 +1,84 @@ -DESCRIPTION - These scripts are intended to be used with initramfs-tools, which is a similar - software product to "dracut" (which is used in RedHat based distributions), - and is mainly used by Debian GNU/Linux and derivatives to create an initramfs - so that the system can be booted off a ZFS filesystem. If you have no need or - interest in this, then it can safely be ignored. +## Description - These script were written with the primary intention of being portable and - usable on as many systems as possible. 
+These scripts are intended to be used with `initramfs-tools`, which is a +similar software product to `dracut` (which is used in Red Hat based +distributions), and is mainly used by Debian GNU/Linux and derivatives. - This is, in practice, usually not possible. But the intention is there. - And it is a good one. +These scripts share some common functionality with the SysV init scripts, +primarily the `/etc/zfs/zfs-functions` script. - They have been tested successfully on: +## Configuration - * Debian GNU/Linux Wheezy - * Debian GNU/Linux Jessie +### Root pool/filesystem - It uses some functionality common with the SYSV init scripts, primarily - the "/etc/zfs/zfs-functions" script. +Different distributions have their own standard on what to specify on the +kernel command line to boot off a ZFS filesystem. -FUNCTIONALITY - * Supports booting of a ZFS snapshot. - Do this by cloning the snapshot into a dataset. If this, the resulting - dataset, already exists, destroy it. Then mount it as the root filesystem. - * If snapshot does not exist, use base dataset (the part before '@') - as boot filesystem instead. - * Clone with 'mountpoint=none' and 'canmount=noauto' - we mount manually - and explicitly. - * Allow rollback of snapshots instead of clone it and boot from the clone. - * If no snapshot is specified on the 'root=' kernel command line, but - there is an '@', then get a list of snapshots below that filesystem - and ask the user which to use. +This script supports the following kernel command line argument combinations +(in this order - first match wins): - * Support all currently used kernel command line arguments - * Core options: - All the different distributions have their own standard on what to specify - on the kernel command line to boot of a ZFS filesystem. 
+* `rpool=<pool>` +* `bootfs=<pool>/<dataset>` +* `rpool=<pool> bootfs=<pool>/<dataset>` +* `-B zfs-bootfs=<pool>/<dataset>` +* `root=<pool>/<dataset>` +* `root=ZFS=<pool>/<dataset>` +* `root=zfs:AUTO` +* `root=zfs:<pool>/<dataset>` +* `rpool=rpool` - Supports the following kernel command line argument combinations - (in this order - first match win): - * rpool= (tries to finds bootfs automatically) - * bootfs=/ (uses this for rpool - first part) - * rpool= bootfs=/ - * -B zfs-bootfs=/ (uses this for rpool - first part) - * rpool=rpool (default if none of the above is used) - * root=/ (uses this for rpool - first part) - * root=ZFS=/ (uses this for rpool - first part, without 'ZFS=') - * root=zfs:AUTO (tries to detect both pool and rootfs - * root=zfs:/ (uses this for rpool - first part, without 'zfs:') +If a pool is specified, it will be used. Otherwise, in `AUTO` mode, all pools +will be searched. Pools may be excluded from the search by listing them in +`ZFS_POOL_EXCEPTIONS` in `/etc/default/zfs`. - Option could also be - * Extra (control) options: - * zfsdebug=(on,yes,1) Show extra debugging information - * zfsforce=(on,yes,1) Force import the pool - * rollback=(on,yes,1) Rollback (instead of clone) the snapshot +Pools will be imported as follows: - * 'Smarter' way to import pools. Don't just try cache file or /dev. - * Try to use /dev/disk/by-vdev (if /etc/zfs/vdev_id.conf exists), - * Try /dev/mapper (to be able to use LUKS backed pools as well as - multi-path devices). - * /dev/disk/by-id and any other /dev/disk/by-* directory that may exist. - * Use /dev as a last ditch attempt. - * Fallback to using the cache file if that exist if nothing else worked. - * Only try to import pool if it haven't already been imported - * This will negate the need to force import a pool that have not been - exported cleanly. - * Support exclusion of pools to import by setting ZFS_POOL_EXCEPTIONS - in /etc/default/zfs. +* Try `/dev/disk/by-vdev` if it exists; see `/etc/zfs/vdev_id.conf`. +* Try `/dev/disk/by-id` and any other `/dev/disk/by-*` directories. +* Try `/dev`. 
+* Use the cache file if nothing else worked. - Controlling in which order devices is searched for is controlled by - ZPOOL_IMPORT_PATH variable set in /etc/defaults/zfs. +This order may be modified by setting `ZPOOL_IMPORT_PATH` in +`/etc/default/zfs`. - * Support additional configuration variable ZFS_INITRD_ADDITIONAL_DATASETS - to mount additional filesystems not located under your root dataset. +If a dataset is specified, it will be used as the root filesystem. Otherwise, +this script will attempt to find a root filesystem automatically (in the +specified pool or all pools, as described above). - For example, if the root fs is specified as 'rpool/ROOT/rootfs', it will - automatically and without specific configuration mount any filesystems - below this on the mount point specified in the 'mountpoint' property. - Such as 'rpool/root/rootfs/var', 'rpool/root/rootfs/usr' etc) +Filesystems below the root filesystem will be automatically mounted with no +additional configuration necessary. For example, if the root filesystem is +`rpool/ROOT/rootfs`, `rpool/root/rootfs/var`, `rpool/root/rootfs/usr`, etc. +will be mounted (if they exist). - However, if one prefer to have separate filesystems, not located below - the root fs (such as 'rpool/var', 'rpool/ROOT/opt' etc), special - configuration needs to be done. This is what the variable, set in - /etc/defaults/zfs file, needs to be configured. The 'mountpoint' - property needs to be correct for this to work though. +### Snapshots - * Allows mounting a rootfs with mountpoint=legacy set. +The `<dataset>` can be a snapshot. In this case, the snapshot will be cloned +and the clone used as the root filesystem. Note: - * Include /etc/modprobe.d/{zfs,spl}.conf in the initrd if it/they exist. 
+* The clone is created with `mountpoint=none` and `canmount=noauto`. The root + filesystem is mounted manually by the initramfs script. +* If no snapshot is specified on the `root=` kernel command line, but + there is an `@`, the user will be prompted to choose a snapshot to use. - * Include the udev rule to use by-vdev for pool imports. +### Extra options - * Include the /etc/default/zfs file to the initrd. +The following kernel command line arguments are supported: + +* `zfsdebug=(on,yes,1)`: Show extra debugging information +* `zfsforce=(on,yes,1)`: Force import the pool +* `rollback=(on,yes,1)`: Rollback to (instead of clone) the snapshot + +### Unlocking a ZFS encrypted root over SSH + +To use this feature: + +1. Install the `dropbear-initramfs` package. You may wish to uninstall the + `cryptsetup-initramfs` package to avoid warnings. +2. Add your SSH key(s) to `/etc/dropbear-initramfs/authorized_keys`. Note + that Dropbear does not support ed25519 keys before version 2020.79; + in that case, use RSA (2048-bit or more) instead. +3. Rebuild the initramfs with your keys: `update-initramfs -u` +4. During the system boot, login via SSH and run: `zfsunlock` diff --git a/contrib/initramfs/conf-hooks.d/Makefile.am b/contrib/initramfs/conf-hooks.d/Makefile.am new file mode 100644 index 0000000000..f84ba5cc7e --- /dev/null +++ b/contrib/initramfs/conf-hooks.d/Makefile.am @@ -0,0 +1,4 @@ +confhooksddir = /usr/share/initramfs-tools/conf-hooks.d + +dist_confhooksd_DATA = \ + zfs diff --git a/contrib/initramfs/conf-hooks.d/zfs b/contrib/initramfs/conf-hooks.d/zfs index 29950cac04..b86d36223e 100644 --- a/contrib/initramfs/conf-hooks.d/zfs +++ b/contrib/initramfs/conf-hooks.d/zfs @@ -1,2 +1,9 @@ # Force the inclusion of Busybox in the initramfs. BUSYBOX=y + +# Setup the keyboard mapping so passphrases can be entered correctly. +KEYMAP=y + +# Require the plymouth script to guarantee working video for the passphrase +# prompting. 
+FRAMEBUFFER=y diff --git a/contrib/initramfs/conf.d/Makefile.am b/contrib/initramfs/conf.d/Makefile.am new file mode 100644 index 0000000000..5ef27e0aa1 --- /dev/null +++ b/contrib/initramfs/conf.d/Makefile.am @@ -0,0 +1,4 @@ +confddir = /usr/share/initramfs-tools/conf.d + +dist_confd_DATA = \ + zfs diff --git a/contrib/initramfs/hooks/.gitignore b/contrib/initramfs/hooks/.gitignore index 73304bc2cd..4e1604e188 100644 --- a/contrib/initramfs/hooks/.gitignore +++ b/contrib/initramfs/hooks/.gitignore @@ -1 +1,2 @@ zfs +zfsunlock diff --git a/contrib/initramfs/hooks/Makefile.am b/contrib/initramfs/hooks/Makefile.am index c866b4fb6c..0cd1aafcd3 100644 --- a/contrib/initramfs/hooks/Makefile.am +++ b/contrib/initramfs/hooks/Makefile.am @@ -1,21 +1,10 @@ -hooksdir = $(datarootdir)/initramfs-tools/hooks +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + +hooksdir = /usr/share/initramfs-tools/hooks hooks_SCRIPTS = \ - zfs + zfs \ + zfsunlock -EXTRA_DIST = \ - $(top_srcdir)/contrib/initramfs/hooks/zfs.in - -$(hooks_SCRIPTS):%:%.in - -$(SED) -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - -e 's,@udevdir\@,$(udevdir),g' \ - -e 's,@udevruledir\@,$(udevruledir),g' \ - -e 's,@mounthelperdir\@,$(mounthelperdir),g' \ - $< >'$@' - -clean-local:: - -$(RM) $(hooks_SCRIPTS) - -distclean-local:: - -$(RM) $(hooks_SCRIPTS) +SUBSTFILES += $(hooks_SCRIPTS) diff --git a/contrib/initramfs/hooks/zfs.in b/contrib/initramfs/hooks/zfs.in index e35354141d..9d5c397cf2 100755 --- a/contrib/initramfs/hooks/zfs.in +++ b/contrib/initramfs/hooks/zfs.in @@ -1,106 +1,56 @@ #!/bin/sh # -# Add ZoL filesystem capabilities to an initrd, usually for a native ZFS root. +# Add OpenZFS filesystem capabilities to an initrd, usually for a native ZFS root. # -# This hook installs udev rules for ZoL. -PREREQ="zdev" - -# These prerequisites are provided by the zfsutils package. 
The zdb utility is -# not strictly required, but it can be useful at the initramfs recovery prompt. -COPY_EXEC_LIST="@sbindir@/zdb @sbindir@/zpool @sbindir@/zfs" -COPY_EXEC_LIST="$COPY_EXEC_LIST @mounthelperdir@/mount.zfs @udevdir@/vdev_id" -COPY_FILE_LIST="/etc/hostid @sysconfdir@/zfs/zpool.cache" -COPY_FILE_LIST="$COPY_FILE_LIST @sysconfdir@/default/zfs" -COPY_FILE_LIST="$COPY_FILE_LIST @sysconfdir@/zfs/zfs-functions" -COPY_FILE_LIST="$COPY_FILE_LIST @sysconfdir@/zfs/vdev_id.conf" -COPY_FILE_LIST="$COPY_FILE_LIST @udevruledir@/69-vdev.rules" - -# These prerequisites are provided by the base system. -COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/dirname /bin/hostname /sbin/blkid" -COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/env" - -# Explicitly specify all kernel modules because automatic dependency resolution -# is unreliable on many systems. -BASE_MODULES="zlib_deflate spl zavl zcommon znvpair zunicode zlua zfs icp" -CRPT_MODULES="sun-ccm sun-gcm sun-ctr" -MANUAL_ADD_MODULES_LIST="$BASE_MODULES" - -# Generic result code. -RC=0 - -case $1 in -prereqs) - echo "$PREREQ" - exit 0 - ;; -esac - -for ii in $COPY_EXEC_LIST -do - if [ ! -x "$ii" ] - then - echo "Error: $ii is not executable." - RC=2 - fi -done - -if [ "$RC" -ne 0 ] -then - exit "$RC" +if [ "$1" = "prereqs" ]; then + echo "udev" + exit fi . /usr/share/initramfs-tools/hook-functions -mkdir -p "$DESTDIR/etc/" - -# ZDB uses pthreads for some functions, but the library dependency is not -# automatically detected. The `find` utility and extended `cp` options are -# used here because libgcc_s.so could be in a subdirectory of /lib for -# multi-arch installations. -cp --target-directory="$DESTDIR" --parents $(find /lib/ -type f -name libgcc_s.so.1) - -for ii in $COPY_EXEC_LIST -do - copy_exec "$ii" +for req in "@sbindir@/zpool" "@sbindir@/zfs" "@mounthelperdir@/mount.zfs"; do + copy_exec "$req" || { + echo "$req not available!" 
>&2 + exit 2 + } done -for ii in $COPY_FILE_LIST -do - dir=$(dirname "$ii") - [ -d "$dir" ] && mkdir -p "$DESTDIR/$dir" - [ -f "$ii" ] && cp -p "$ii" "$DESTDIR/$ii" +copy_exec "@udevdir@/vdev_id" +copy_exec "@udevdir@/zvol_id" +if command -v systemd-ask-password > /dev/null; then + copy_exec "$(command -v systemd-ask-password)" +fi + +# We use pthreads, but i-t from buster doesn't automatically +# copy this indirect dependency: this can be removed when buster finally dies. +find /lib/ -type f -name "libgcc_s.so.[1-9]" | while read -r libgcc; do + copy_exec "$libgcc" done -for ii in $MANUAL_ADD_MODULES_LIST -do - manual_add_modules "$ii" -done +# shellcheck disable=SC2050 +if [ @LIBFETCH_DYNAMIC@ != 0 ]; then + find /lib/ -name @LIBFETCH_SONAME@ | while read -r libfetch; do + copy_exec "$libfetch" + done +fi -if [ -f "/etc/hostname" ] -then - cp -p "/etc/hostname" "$DESTDIR/etc/" +copy_file config "/etc/hostid" +copy_file cache "@sysconfdir@/zfs/zpool.cache" +copy_file config "@initconfdir@/zfs" +copy_file config "@sysconfdir@/zfs/zfs-functions" +copy_file config "@sysconfdir@/zfs/vdev_id.conf" +copy_file rule "@udevruledir@/60-zvol.rules" +copy_file rule "@udevruledir@/69-vdev.rules" + +manual_add_modules zfs + +if [ -f "/etc/hostname" ]; then + copy_file config "/etc/hostname" else - hostname >"$DESTDIR/etc/hostname" + hostname="$(mktemp -t hostname.XXXXXXXXXX)" + hostname > "$hostname" + copy_file config "$hostname" "/etc/hostname" + rm -f "$hostname" fi - -for ii in zfs zfs.conf spl spl.conf -do - if [ -f "/etc/modprobe.d/$ii" ]; then - if [ ! -d "$DESTDIR/etc/modprobe.d" ]; then - mkdir -p $DESTDIR/etc/modprobe.d - fi - cp -p "/etc/modprobe.d/$ii" $DESTDIR/etc/modprobe.d/ - fi -done - -# With pull request #1476 (not yet merged) comes a verbose warning -# if /usr/bin/net doesn't exist or isn't executable. Just create -# a dummy... -[ ! -d "$DESTDIR/usr/bin" ] && mkdir -p "$DESTDIR/usr/bin" -if [ ! 
-x "$DESTDIR/usr/bin/net" ]; then - touch "$DESTDIR/usr/bin/net" - chmod +x "$DESTDIR/usr/bin/net" -fi - -exit 0 diff --git a/contrib/initramfs/hooks/zfsunlock.in b/contrib/initramfs/hooks/zfsunlock.in new file mode 100644 index 0000000000..4776087d9a --- /dev/null +++ b/contrib/initramfs/hooks/zfsunlock.in @@ -0,0 +1,10 @@ +#!/bin/sh + +if [ "$1" = "prereqs" ]; then + echo "dropbear" + exit +fi + +. /usr/share/initramfs-tools/hook-functions + +copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin/zfsunlock diff --git a/contrib/initramfs/scripts/Makefile.am b/contrib/initramfs/scripts/Makefile.am index a550311cd7..444a5f374b 100644 --- a/contrib/initramfs/scripts/Makefile.am +++ b/contrib/initramfs/scripts/Makefile.am @@ -1,20 +1,11 @@ -scriptsdir = $(datarootdir)/initramfs-tools/scripts +include $(top_srcdir)/config/Shellcheck.am -scripts_DATA = \ +scriptsdir = /usr/share/initramfs-tools/scripts + +dist_scripts_SCRIPTS = \ zfs SUBDIRS = local-top -EXTRA_DIST = \ - $(top_srcdir)/contrib/initramfs/scripts/zfs.in - -$(scripts_DATA):%:%.in - -$(SED) -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' - -clean-local:: - -$(RM) $(scripts_SCRIPTS) - -distclean-local:: - -$(RM) $(scripts_SCRIPTS) +SHELLCHECKDIRS = $(SUBDIRS) +SHELLCHECK_SHELL = sh diff --git a/contrib/initramfs/scripts/local-top/Makefile.am b/contrib/initramfs/scripts/local-top/Makefile.am index 88aa2d4ffa..897f9b2e21 100644 --- a/contrib/initramfs/scripts/local-top/Makefile.am +++ b/contrib/initramfs/scripts/local-top/Makefile.am @@ -1,3 +1,6 @@ -localtopdir = $(datarootdir)/initramfs-tools/scripts/local-top +include $(top_srcdir)/config/Shellcheck.am -EXTRA_DIST = zfs +localtopdir = /usr/share/initramfs-tools/scripts/local-top + +dist_localtop_SCRIPTS = \ + zfs diff --git a/contrib/initramfs/scripts/local-top/zfs b/contrib/initramfs/scripts/local-top/zfs index e8e5cd2645..6b80e9f436 100755 --- a/contrib/initramfs/scripts/local-top/zfs +++ 
b/contrib/initramfs/scripts/local-top/zfs @@ -1,18 +1,11 @@ #!/bin/sh -PREREQ="mdadm mdrun multipath" +# shellcheck disable=SC2154 -prereqs() -{ - echo "$PREREQ" -} -case $1 in -# get pre-requisites -prereqs) - prereqs +if [ "$1" = "prereqs" ]; then + echo mdadm mdrun multipath exit 0 - ;; -esac +fi # @@ -20,10 +13,10 @@ esac # message() { - if [ -x /bin/plymouth ] && plymouth --ping; then - plymouth message --text="$@" + if plymouth --ping 2>/dev/null; then + plymouth message --text="$*" else - echo "$@" >&2 + echo "$*" >&2 fi return 0 } diff --git a/contrib/initramfs/scripts/zfs.in b/contrib/initramfs/scripts/zfs similarity index 78% rename from contrib/initramfs/scripts/zfs.in rename to contrib/initramfs/scripts/zfs index 36b7f436c1..35502291e6 100644 --- a/contrib/initramfs/scripts/zfs.in +++ b/contrib/initramfs/scripts/zfs @@ -5,26 +5,20 @@ # # Enable this by passing boot=zfs on the kernel command line. # +# $quiet, $root, $rpool, $bootfs come from the cmdline: +# shellcheck disable=SC2154 -# Source the common init script +# Source the common functions . /etc/zfs/zfs-functions -# Paths to what we need - in the initrd, these paths are hardcoded, -# so override the defines in zfs-functions. -ZFS="@sbindir@/zfs" -ZPOOL="@sbindir@/zpool" -ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache" -export ZFS ZPOOL ZPOOL_CACHE - - # Start interactive shell. # Use debian's panic() if defined, because it allows to prevent shell access # by setting panic in cmdline (e.g. panic=0 or panic=15). # See "4.5 Disable root prompt on the initramfs" of Securing Debian Manual: # https://www.debian.org/doc/manuals/securing-debian-howto/ch4.en.html shell() { - if type panic > /dev/null 2>&1; then - panic $@ + if command -v panic > /dev/null 2>&1; then + panic else /bin/sh fi @@ -34,22 +28,23 @@ shell() { # pools and mounting any filesystems. 
pre_mountroot() { - if type run_scripts > /dev/null 2>&1 && \ - [ -f "/scripts/local-top" -o -d "/scripts/local-top" ] + if command -v run_scripts > /dev/null 2>&1 then - [ "$quiet" != "y" ] && \ - zfs_log_begin_msg "Running /scripts/local-top" - run_scripts /scripts/local-top - [ "$quiet" != "y" ] && zfs_log_end_msg - fi + if [ -f "/scripts/local-top" ] || [ -d "/scripts/local-top" ] + then + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Running /scripts/local-top" + run_scripts /scripts/local-top + [ "$quiet" != "y" ] && zfs_log_end_msg + fi - if type run_scripts > /dev/null 2>&1 && \ - [ -f "/scripts/local-premount" -o -d "/scripts/local-premount" ] - then - [ "$quiet" != "y" ] && \ - zfs_log_begin_msg "Running /scripts/local-premount" - run_scripts /scripts/local-premount - [ "$quiet" != "y" ] && zfs_log_end_msg + if [ -f "/scripts/local-premount" ] || [ -d "/scripts/local-premount" ] + then + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Running /scripts/local-premount" + run_scripts /scripts/local-premount + [ "$quiet" != "y" ] && zfs_log_end_msg + fi fi } @@ -65,10 +60,10 @@ disable_plymouth() # Get a ZFS filesystem property value. get_fs_value() { - local fs="$1" - local value=$2 + fs="$1" + value=$2 - "${ZFS}" get -H -ovalue $value "$fs" 2> /dev/null + "${ZFS}" get -H -ovalue "$value" "$fs" 2> /dev/null } # Find the 'bootfs' property on pool $1. @@ -76,9 +71,9 @@ get_fs_value() # pool by exporting it again. find_rootfs() { - local pool="$1" + pool="$1" - # If 'POOL_IMPORTED' isn't set, no pool imported and therefor + # If 'POOL_IMPORTED' isn't set, no pool imported and therefore # we won't be able to find a root fs. [ -z "${POOL_IMPORTED}" ] && return 1 @@ -92,7 +87,7 @@ find_rootfs() # Make sure it's not '-' and that it starts with /. 
if [ "${ZFS_BOOTFS}" != "-" ] && \ - $(get_fs_value "${ZFS_BOOTFS}" mountpoint | grep -q '^/$') + get_fs_value "${ZFS_BOOTFS}" mountpoint | grep -q '^/$' then # Keep it mounted POOL_IMPORTED=1 @@ -101,23 +96,18 @@ find_rootfs() # Not boot fs here, export it and later try again.. "${ZPOOL}" export "$pool" - POOL_IMPORTED="" - + POOL_IMPORTED= + ZFS_BOOTFS= return 1 } # Support function to get a list of all pools, separated with ';' find_pools() { - local CMD="$*" - local pools pool - - pools=$($CMD 2> /dev/null | \ + pools=$("$@" 2> /dev/null | \ grep -E "pool:|^[a-zA-Z0-9]" | \ sed 's@.*: @@' | \ - while read pool; do \ - echo -n "$pool;" - done) + tr '\n' ';') echo "${pools%%;}" # Return without the last ';'. } @@ -125,8 +115,6 @@ find_pools() # Get a list of all available pools get_pools() { - local available_pools npools - if [ -n "${ZFS_POOL_IMPORT}" ]; then echo "$ZFS_POOL_IMPORT" return 0 @@ -135,7 +123,7 @@ get_pools() # Get the base list of available pools. available_pools=$(find_pools "$ZPOOL" import) - # Just in case - seen it happen (that a pool isn't visable/found + # Just in case - seen it happen (that a pool isn't visible/found # with a simple "zpool import" but only when using the "-d" # option or setting ZPOOL_IMPORT_PATH). if [ -d "/dev/disk/by-id" ] @@ -167,9 +155,8 @@ get_pools() # Filter out any exceptions... if [ -n "$ZFS_POOL_EXCEPTIONS" ] then - local found="" - local apools="" - local pool exception + found="" + apools="" OLD_IFS="$IFS" ; IFS=";" for pool in $available_pools @@ -202,8 +189,7 @@ get_pools() # Import given pool $1 import_pool() { - local pool="$1" - local dirs dir + pool="$1" # Verify that the pool isn't already imported # Make as sure as we can to not require '-f' to import. @@ -213,15 +199,15 @@ import_pool() # to something we can use later with the real import(s). We want to # make sure we find all by* dirs, BUT by-vdev should be first (if it # exists). 
- if [ -n "$USE_DISK_BY_ID" -a -z "$ZPOOL_IMPORT_PATH" ] + if [ -n "$USE_DISK_BY_ID" ] && [ -z "$ZPOOL_IMPORT_PATH" ] then - dirs="$(for dir in $(echo /dev/disk/by-*) + dirs="$(for dir in /dev/disk/by-* do # Ignore by-vdev here - we want it first! echo "$dir" | grep -q /by-vdev && continue [ ! -d "$dir" ] && continue - echo -n "$dir:" + printf "%s" "$dir:" done | sed 's,:$,,g')" if [ -d "/dev/disk/by-vdev" ] @@ -285,7 +271,9 @@ import_pool() # with more logging etc. load_module_initrd() { - if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" > 0 ] + [ -n "$ROOTDELAY" ] && ZFS_INITRD_PRE_MOUNTROOT_SLEEP="$ROOTDELAY" + + if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ] 2>/dev/null then if [ "$quiet" != "y" ]; then zfs_log_begin_msg "Sleeping for" \ @@ -296,9 +284,9 @@ load_module_initrd() fi # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear. - if type wait_for_udev > /dev/null 2>&1 ; then + if command -v wait_for_udev > /dev/null 2>&1 ; then wait_for_udev 10 - elif type wait_for_dev > /dev/null 2>&1 ; then + elif command -v wait_for_dev > /dev/null 2>&1 ; then wait_for_dev fi @@ -308,7 +296,7 @@ load_module_initrd() # Load the module load_module "zfs" || return 1 - if [ "$ZFS_INITRD_POST_MODPROBE_SLEEP" > 0 ] + if [ "$ZFS_INITRD_POST_MODPROBE_SLEEP" -gt 0 ] 2>/dev/null then if [ "$quiet" != "y" ]; then zfs_log_begin_msg "Sleeping for" \ @@ -324,12 +312,10 @@ load_module_initrd() # Mount a given filesystem mount_fs() { - local fs="$1" - local mountpoint + fs="$1" # Check that the filesystem exists - "${ZFS}" list -oname -tfilesystem -H "${fs}" > /dev/null 2>&1 - [ "$?" -ne 0 ] && return 1 + "${ZFS}" list -oname -tfilesystem -H "${fs}" > /dev/null 2>&1 || return 1 # Skip filesystems with canmount=off. The root fs should not have # canmount=off, but ignore it for backwards compatibility just in case. @@ -341,14 +327,15 @@ mount_fs() # Need the _original_ datasets mountpoint! 
mountpoint=$(get_fs_value "$fs" mountpoint) - if [ "$mountpoint" = "legacy" -o "$mountpoint" = "none" ]; then + ZFS_CMD="mount -o zfsutil -t zfs" + if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in # clone_snap() if that's usable. mountpoint=$(get_fs_value "$fs" org.zol:mountpoint) - if [ "$mountpoint" = "legacy" -o \ - "$mountpoint" = "none" -o \ - "$mountpoint" = "-" ] + if [ "$mountpoint" = "legacy" ] || + [ "$mountpoint" = "none" ] || + [ "$mountpoint" = "-" ] then if [ "$fs" != "${ZFS_BOOTFS}" ]; then # We don't have a proper mountpoint and this @@ -360,15 +347,11 @@ mount_fs() fi fi + # If it's not a legacy filesystem, it can only be a + # native one... if [ "$mountpoint" = "legacy" ]; then ZFS_CMD="mount -t zfs" - else - # If it's not a legacy filesystem, it can only be a - # native one... - ZFS_CMD="mount -o zfsutil -t zfs" fi - else - ZFS_CMD="mount -o zfsutil -t zfs" fi # Possibly decrypt a filesystem using native encryption. @@ -401,39 +384,54 @@ mount_fs() return 0 } -# Unlock a ZFS native crypted filesystem. +# Unlock a ZFS native encrypted filesystem. decrypt_fs() { - local fs="$1" - + fs="$1" + # If pool encryption is active and the zfs command understands '-o encryption' - if [ "$(zpool list -H -o feature@encryption $(echo "${fs}" | awk -F\/ '{print $1}'))" = 'active' ]; then + if [ "$(zpool list -H -o feature@encryption "${fs%%/*}")" = 'active' ]; then # Determine dataset that holds key for root dataset - ENCRYPTIONROOT=$(${ZFS} get -H -o value encryptionroot "${fs}") - DECRYPT_CMD="${ZFS} load-key '${ENCRYPTIONROOT}'" + ENCRYPTIONROOT="$(get_fs_value "${fs}" encryptionroot)" + KEYLOCATION="$(get_fs_value "${ENCRYPTIONROOT}" keylocation)" + + echo "${ENCRYPTIONROOT}" > /run/zfs_fs_name # If root dataset is encrypted... if ! 
[ "${ENCRYPTIONROOT}" = "-" ]; then + KEYSTATUS="$(get_fs_value "${ENCRYPTIONROOT}" keystatus)" + # Continue only if the key needs to be loaded + [ "$KEYSTATUS" = "unavailable" ] || return 0 + + # Do not prompt if key is stored noninteractively, + if ! [ "${KEYLOCATION}" = "prompt" ]; then + $ZFS load-key "${ENCRYPTIONROOT}" # Prompt with plymouth, if active - if [ -e /bin/plymouth ] && /bin/plymouth --ping 2>/dev/null; then - plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" \ - --number-of-tries="3" \ - --command="${DECRYPT_CMD}" + elif /bin/plymouth --ping 2>/dev/null; then + echo "plymouth" > /run/zfs_console_askpwd_cmd + for _ in 1 2 3; do + plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" | \ + $ZFS load-key "${ENCRYPTIONROOT}" && break + done - # Prompt with systemd, if active + # Prompt with systemd, if active elif [ -e /run/systemd/system ]; then - TRY_COUNT=3 - while [ $TRY_COUNT -gt 0 ]; do + echo "systemd-ask-password" > /run/zfs_console_askpwd_cmd + for _ in 1 2 3; do systemd-ask-password "Encrypted ZFS password for ${ENCRYPTIONROOT}" --no-tty | \ - ${DECRYPT_CMD} && break - TRY_COUNT=$((TRY_COUNT - 1)) + $ZFS load-key "${ENCRYPTIONROOT}" && break done # Prompt with ZFS tty, otherwise else - eval "${DECRYPT_CMD}" + # Temporarily setting "printk" to "7" allows the prompt to appear even when the "quiet" kernel option has been used + echo "load-key" > /run/zfs_console_askpwd_cmd + storeprintk="$(awk '{print $1}' /proc/sys/kernel/printk)" + echo 7 > /proc/sys/kernel/printk + $ZFS load-key "${ENCRYPTIONROOT}" + echo "$storeprintk" > /proc/sys/kernel/printk fi fi fi @@ -444,7 +442,7 @@ decrypt_fs() # Destroy a given filesystem. destroy_fs() { - local fs="$1" + fs="$1" [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Destroying '$fs'" @@ -479,9 +477,9 @@ destroy_fs() # mounted with a 'zfs mount -a' in the init/systemd scripts). 
clone_snap() { - local snap="$1" - local destfs="$2" - local mountpoint="$3" + snap="$1" + destfs="$2" + mountpoint="$3" [ "$quiet" != "y" ] && zfs_log_begin_msg "Cloning '$snap' to '$destfs'" @@ -519,7 +517,7 @@ clone_snap() # Rollback a given snapshot. rollback_snap() { - local snap="$1" + snap="$1" [ "$quiet" != "y" ] && zfs_log_begin_msg "Rollback $snap" @@ -549,9 +547,7 @@ rollback_snap() # to the user to choose from. ask_user_snap() { - local fs="$1" - local i=1 - local SNAP snapnr snap debug + fs="$1" # We need to temporarily disable debugging. Set 'debug' so we # remember to enabled it again. @@ -564,16 +560,25 @@ ask_user_snap() # Because we need the resulting snapshot, which is sent on # stdout to the caller, we use stderr for our questions. echo "What snapshot do you want to boot from?" > /dev/stderr - while read snap; do - echo " $i: ${snap}" > /dev/stderr - eval `echo SNAP_$i=$snap` - i=$((i + 1)) - done < /dev/stderr - read snapnr + i=1 + for snap in "$@"; do + echo " $i: $snap" + i=$((i + 1)) + done > /dev/stderr + + # expr instead of test here because [ a -lt 0 ] errors out, + # but expr falls back to lexicographical, which works out right + snapnr=0 + while expr "$snapnr" "<" 1 > /dev/null || + expr "$snapnr" ">" "$#" > /dev/null + do + printf "%s" "Snap nr [1-$#]? " > /dev/stderr + read -r snapnr + done # Re-enable debugging. if [ -n "${debug}" ]; then @@ -581,16 +586,16 @@ EOT set -x fi - echo "$(eval echo "$"SNAP_$snapnr)" + eval echo '$'"$snapnr" } setup_snapshot_booting() { - local snap="$1" - local s destfs subfs mountpoint retval=0 filesystems fs + snap="$1" + retval=0 - # Make sure that the snapshot specified actually exist. - if [ ! $(get_fs_value "${snap}" type) ] + # Make sure that the snapshot specified actually exists. + if [ ! "$(get_fs_value "${snap}" type)" ] then # Snapshot does not exist (...@ ?) # ask the user for a snapshot to use. @@ -606,8 +611,8 @@ setup_snapshot_booting() if ! 
grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline then # If the destination dataset for the clone - # already exists, destroy it. Recursivly - if [ $(get_fs_value "${rootfs}_${snapname}" type) ]; then + # already exists, destroy it. Recursively + if [ "$(get_fs_value "${rootfs}_${snapname}" type)" ]; then filesystems=$("${ZFS}" list -oname -tfilesystem -H \ -r -Sname "${ZFS_BOOTFS}") for fs in $filesystems; do @@ -616,7 +621,7 @@ setup_snapshot_booting() fi fi - # Get all snapshots, recursivly (might need to clone /usr, /var etc + # Get all snapshots, recursively (might need to clone /usr, /var etc # as well). for s in $("${ZFS}" list -H -oname -tsnapshot -r "${rootfs}" | \ grep "${snapname}") @@ -625,6 +630,7 @@ setup_snapshot_booting() then # Rollback snapshot rollback_snap "$s" || retval=$((retval + 1)) + ZFS_BOOTFS="${rootfs}" else # Setup a destination filesystem name. # Ex: Called with 'rpool/ROOT/debian@snap2' @@ -642,8 +648,8 @@ setup_snapshot_booting() # with clone_snap(). If legacy or none, then use # the sub fs value. mountpoint=$(get_fs_value "${s%%@*}" mountpoint) - if [ "$mountpoint" = "legacy" -o \ - "$mountpoint" = "none" ] + if [ "$mountpoint" = "legacy" ] || \ + [ "$mountpoint" = "none" ] then if [ -n "${subfs}" ]; then mountpoint="${subfs}" @@ -668,8 +674,6 @@ setup_snapshot_booting() # This is the main function. mountroot() { - local snaporig snapsub destfs pool POOLS - # ---------------------------------------------------------------- # I N I T I A L S E T U P @@ -703,7 +707,8 @@ mountroot() # ------------ # Look for the cache file (if any). - [ ! -f ${ZPOOL_CACHE} ] && unset ZPOOL_CACHE + [ -f "${ZPOOL_CACHE}" ] || unset ZPOOL_CACHE + [ -s "${ZPOOL_CACHE}" ] || unset ZPOOL_CACHE # ------------ # Compatibility: 'ROOT' is for Debian GNU/Linux (etc), @@ -732,7 +737,7 @@ mountroot() # No longer set in the defaults file, but it could have been set in # get_pools() in some circumstances. 
If it's something, but not 'yes', # it's no good to us. - [ -n "$USE_DISK_BY_ID" -a "$USE_DISK_BY_ID" != 'yes' ] && \ + [ -n "$USE_DISK_BY_ID" ] && [ "$USE_DISK_BY_ID" != 'yes' ] && \ unset USE_DISK_BY_ID # ---------------------------------------------------------------- @@ -778,12 +783,12 @@ mountroot() # ------------ # If we have 'ROOT' (see above), but not 'ZFS_BOOTFS', then use # 'ROOT' - [ -n "$ROOT" -a -z "${ZFS_BOOTFS}" ] && ZFS_BOOTFS="$ROOT" + [ -n "$ROOT" ] && [ -z "${ZFS_BOOTFS}" ] && ZFS_BOOTFS="$ROOT" # ------------ # Check for the `-B zfs-bootfs=%s/%u,...` kind of parameter. # NOTE: Only use the pool name and dataset. The rest is not - # supported by ZoL (whatever it's for). + # supported by OpenZFS (whatever it's for). if [ -z "$ZFS_RPOOL" ] then # The ${zfs-bootfs} variable is set at the kernel command @@ -793,17 +798,18 @@ mountroot() # # Reassign the variable by dumping the environment and # stripping the zfs-bootfs= prefix. Let the shell handle - # quoting through the eval command. + # quoting through the eval command: + # shellcheck disable=SC2046 eval ZFS_RPOOL=$(set | sed -n -e 's,^zfs-bootfs=,,p') fi # ------------ # No root fs or pool specified - do auto detect. - if [ -z "$ZFS_RPOOL" -a -z "${ZFS_BOOTFS}" ] + if [ -z "$ZFS_RPOOL" ] && [ -z "${ZFS_BOOTFS}" ] then # Do auto detect. Do this by 'cheating' - set 'root=zfs:AUTO' # which will be caught later - ROOT=zfs:AUTO + ROOT='zfs:AUTO' fi # ---------------------------------------------------------------- @@ -814,6 +820,11 @@ mountroot() then # Try to detect both pool and root fs. + # If we got here, that means we don't have a hint so as to + # the root dataset, but with root=zfs:AUTO on cmdline, + # this says "zfs:AUTO" here and interferes with checks later + ZFS_BOOTFS= + [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Attempting to import additional pools." 
@@ -831,8 +842,8 @@ mountroot() do [ -z "$pool" ] && continue - import_pool "$pool" - find_rootfs "$pool" + IFS="$OLD_IFS" import_pool "$pool" + IFS="$OLD_IFS" find_rootfs "$pool" && break done IFS="$OLD_IFS" @@ -843,12 +854,12 @@ mountroot() # Strip 'zfs:' and 'ZFS='. ZFS_BOOTFS="${ROOT#*[:=]}" - # Stip everything after the first slash. + # Strip everything after the first slash. ZFS_RPOOL="${ZFS_BOOTFS%%/*}" fi # Import the pool (if not already done so in the AUTO check above). - if [ -n "$ZFS_RPOOL" -a -z "${POOL_IMPORTED}" ] + if [ -n "$ZFS_RPOOL" ] && [ -z "${POOL_IMPORTED}" ] then [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Importing ZFS root pool '$ZFS_RPOOL'" @@ -870,7 +881,7 @@ mountroot() echo "" echo "No pool imported. Manually import the root pool" echo "at the command prompt and then exit." - echo "Hint: Try: zpool import -R ${rootmnt} -N ${ZFS_RPOOL}" + echo "Hint: Try: zpool import -N ${ZFS_RPOOL}" shell fi @@ -878,27 +889,12 @@ mountroot() pool="$("${ZPOOL}" get name,guid -o name,value -H | \ awk -v pool="${ZFS_RPOOL}" '$2 == pool { print $1 }')" if [ -n "$pool" ]; then - ZFS_BOOTFS="${pool}/${ZFS_BOOTFS#*/}" + # If $ZFS_BOOTFS contains guid, replace the guid portion with $pool + ZFS_BOOTFS=$(echo "$ZFS_BOOTFS" | \ + sed -e "s/$("${ZPOOL}" get guid -o value "$pool" -H)/$pool/g") ZFS_RPOOL="${pool}" fi - # Set elevator=noop on the root pool's vdevs' disks. ZFS already - # does this for wholedisk vdevs (for all pools), so this is only - # important for partitions. 
- "${ZPOOL}" status -L "${ZFS_RPOOL}" 2> /dev/null | - awk '/^\t / && !/(mirror|raidz)/ { - dev=$1; - sub(/[0-9]+$/, "", dev); - print dev - }' | - while read i - do - if grep -sq noop /sys/block/$i/queue/scheduler - then - echo noop > "/sys/block/$i/queue/scheduler" - fi - done - # ---------------------------------------------------------------- # P R E P A R E R O O T F I L E S Y S T E M @@ -944,12 +940,22 @@ mountroot() # Go through the complete list (recursively) of all filesystems below # the real root dataset - filesystems=$("${ZFS}" list -oname -tfilesystem -H -r "${ZFS_BOOTFS}") - for fs in $filesystems $ZFS_INITRD_ADDITIONAL_DATASETS - do + filesystems="$("${ZFS}" list -oname -tfilesystem -H -r "${ZFS_BOOTFS}")" + OLD_IFS="$IFS" ; IFS=" +" + for fs in $filesystems; do + IFS="$OLD_IFS" mount_fs "$fs" + done + IFS="$OLD_IFS" + for fs in $ZFS_INITRD_ADDITIONAL_DATASETS; do mount_fs "$fs" done + touch /run/zfs_unlock_complete + if [ -e /run/zfs_unlock_complete_notify ]; then + read -r < /run/zfs_unlock_complete_notify + fi + # ------------ # Debugging information if [ -n "${ZFS_DEBUG}" ] @@ -965,8 +971,8 @@ mountroot() echo echo "=> waiting for ENTER before continuing because of 'zfsdebug=1'. " - echo -n " 'c' for shell, 'r' for reboot, 'ENTER' to continue. " - read b + printf "%s" " 'c' for shell, 'r' for reboot, 'ENTER' to continue. 
" + read -r b [ "$b" = "c" ] && /bin/sh [ "$b" = "r" ] && reboot -f @@ -976,12 +982,14 @@ mountroot() # ------------ # Run local bottom script - if type run_scripts > /dev/null 2>&1 && \ - [ -f "/scripts/local-bottom" -o -d "/scripts/local-bottom" ] + if command -v run_scripts > /dev/null 2>&1 then - [ "$quiet" != "y" ] && \ - zfs_log_begin_msg "Running /scripts/local-bottom" - run_scripts /scripts/local-bottom - [ "$quiet" != "y" ] && zfs_log_end_msg + if [ -f "/scripts/local-bottom" ] || [ -d "/scripts/local-bottom" ] + then + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Running /scripts/local-bottom" + run_scripts /scripts/local-bottom + [ "$quiet" != "y" ] && zfs_log_end_msg + fi fi } diff --git a/contrib/initramfs/zfsunlock b/contrib/initramfs/zfsunlock new file mode 100755 index 0000000000..cf8e452490 --- /dev/null +++ b/contrib/initramfs/zfsunlock @@ -0,0 +1,42 @@ +#!/bin/sh + +set -eu +if [ ! -e /run/zfs_fs_name ]; then + echo "Wait for the root pool to be imported or press Ctrl-C to exit." +fi +while [ ! -e /run/zfs_fs_name ]; do + if [ -e /run/zfs_unlock_complete ]; then + exit 0 + fi + sleep 1 +done +echo +echo "Unlocking encrypted ZFS filesystems..." +echo "Enter the password or press Ctrl-C to exit." +echo +zfs_fs_name="" +if [ ! -e /run/zfs_unlock_complete_notify ]; then + mkfifo /run/zfs_unlock_complete_notify +fi +while [ ! -e /run/zfs_unlock_complete ]; do + zfs_fs_name=$(cat /run/zfs_fs_name) + zfs_console_askpwd_cmd=$(cat /run/zfs_console_askpwd_cmd) + systemd-ask-password "Encrypted ZFS password for ${zfs_fs_name}:" | \ + /sbin/zfs load-key "$zfs_fs_name" || true + if [ "$(/sbin/zfs get -H -ovalue keystatus "$zfs_fs_name" 2> /dev/null)" = "available" ]; then + echo "Password for $zfs_fs_name accepted." + zfs_console_askpwd_pid=$(ps | awk '!'"/awk/ && /$zfs_console_askpwd_cmd/ { print \$1; exit }") + if [ -n "$zfs_console_askpwd_pid" ]; then + kill "$zfs_console_askpwd_pid" + fi + # Wait for another filesystem to unlock. 
+ while [ "$(cat /run/zfs_fs_name)" = "$zfs_fs_name" ] && [ ! -e /run/zfs_unlock_complete ]; do + sleep 1 + done + else + echo "Wrong password. Try again." + fi +done +echo "Unlocking complete. Resuming boot sequence..." +echo "Please reconnect in a while." +echo "ok" > /run/zfs_unlock_complete_notify diff --git a/contrib/intel_qat/patch/0001-cryptohash.diff b/contrib/intel_qat/patch/0001-cryptohash.diff new file mode 100644 index 0000000000..2d87c8f362 --- /dev/null +++ b/contrib/intel_qat/patch/0001-cryptohash.diff @@ -0,0 +1,17 @@ +cryptohash.h was dropped and merged with crypto/sha.h in 5.8 kernel. Details in: +https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=228c4f265c6eb60eaa4ed0edb3bf7c113173576c + +--- +diff --git a/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c b/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c +index 4c389da..e602377 100644 +--- a/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c ++++ b/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c +@@ -66,7 +66,7 @@ + + #include "Osal.h" + #include +-#include ++#include + #include + #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)) + #include diff --git a/contrib/intel_qat/patch/0001-pci_aer.diff b/contrib/intel_qat/patch/0001-pci_aer.diff new file mode 100644 index 0000000000..7516ac4fee --- /dev/null +++ b/contrib/intel_qat/patch/0001-pci_aer.diff @@ -0,0 +1,20 @@ +In kernel 5.7 the pci_cleanup_aer_uncorrect_error_status() function was +renamed with the following commit: + +git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=894020fdd88c1e9a74c60b67c0f19f1c7696ba2f + +This simply updates the function call with the proper name (pci_aer_clear_nonfatal_status()). 
+ +--- +diff --git a/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c b/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c +index a6ce6df..545bb79 100644 +--- a/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c ++++ b/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c +@@ -304,7 +304,7 @@ static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev) + pr_err("QAT: Can't find acceleration device\n"); + return PCI_ERS_RESULT_DISCONNECT; + } +- pci_cleanup_aer_uncorrect_error_status(pdev); ++ pci_aer_clear_nonfatal_status(pdev); + if (adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_SYNC)) + return PCI_ERS_RESULT_DISCONNECT; diff --git a/contrib/intel_qat/patch/0001-timespec.diff b/contrib/intel_qat/patch/0001-timespec.diff new file mode 100644 index 0000000000..04fb053e1f --- /dev/null +++ b/contrib/intel_qat/patch/0001-timespec.diff @@ -0,0 +1,35 @@ +This patch attempts to expose timespec and getnstimeofday which were +explicitly hidden in the 5.6 kernel with the introduction of the +following commits: + +git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c766d1472c70d25ad475cf56042af1652e792b23 +git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=412c53a680a97cb1ae2c0ab60230e193bee86387 + +Code received from users@dpdk.org, issue tracked under QATE-59888. 
+ +--- +diff --git a/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c b/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c +index 4639834..523e376 100644 +--- a/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c ++++ b/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c +@@ -107,6 +107,8 @@ atomic_t arrived; + extern struct device perf_device; + #endif + ++#define timespec timespec64 ++#define getnstimeofday ktime_get_real_ts64 + + /* Define a number for timeout */ + #define SAMPLE_CODE_MAX_LONG (0x7FFFFFFF) +diff --git a/quickassist/qat/compat/qat_compat.h b/quickassist/qat/compat/qat_compat.h +index 2a02eaf..3515092 100644 +--- a/quickassist/qat/compat/qat_compat.h ++++ b/quickassist/qat/compat/qat_compat.h +@@ -466,4 +466,7 @@ static inline void pci_ignore_hotplug(struct pci_dev *dev) + #if (RHEL_RELEASE_CODE && RHEL_RELEASE_VERSION(7, 3) <= RHEL_RELEASE_CODE) + #define QAT_KPT_CAP_DISCOVERY + #endif ++ ++#define timespec timespec64 ++#define getnstimeofday ktime_get_real_ts64 + #endif /* _QAT_COMPAT_H_ */ diff --git a/contrib/intel_qat/patch/LICENSE b/contrib/intel_qat/patch/LICENSE new file mode 100644 index 0000000000..8e12726c0a --- /dev/null +++ b/contrib/intel_qat/patch/LICENSE @@ -0,0 +1,30 @@ +BSD LICENSE + +Copyright (c) Intel Corporation. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/contrib/intel_qat/readme.md b/contrib/intel_qat/readme.md new file mode 100644 index 0000000000..7e45d395bb --- /dev/null +++ b/contrib/intel_qat/readme.md @@ -0,0 +1,27 @@ +# Intel_QAT easy install script + +This contrib contains community compatibility patches to get Intel QAT working on the following kernel versions: +- 5.6 +- 5.7 +- 5.8 + +These patches are based on the following Intel QAT version: +[1.7.l.4.10.0-00014](https://01.org/sites/default/files/downloads/qat1.7.l.4.10.0-00014.tar.gz) + +When using QAT with the above kernel versions, the following patches need to be applied using: +patch -p1 < _$PATCH_ +_Where $PATCH refers to the path of the patch in question_ + +### 5.6 +/patch/0001-timespec.diff + +### 5.7 +/patch/0001-pci_aer.diff + +### 5.8 +/patch/0001-cryptohash.diff + + +_Patches are supplied by [Storage Performance Development Kit (SPDK)](https://github.com/spdk/spdk)_ + + diff --git a/contrib/pam_zfs_key/Makefile.am b/contrib/pam_zfs_key/Makefile.am new file mode 100644 index 0000000000..f0f2550afc --- /dev/null +++ b/contrib/pam_zfs_key/Makefile.am @@ -0,0 +1,19 @@ +include $(top_srcdir)/config/Rules.am + +AM_CFLAGS += $(LIBCRYPTO_CFLAGS) + +pammodule_LTLIBRARIES=pam_zfs_key.la + +pam_zfs_key_la_SOURCES = pam_zfs_key.c + +pam_zfs_key_la_LIBADD = \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la + +pam_zfs_key_la_LDFLAGS = -version-info 1:0:0 -avoid-version -module -shared + +pam_zfs_key_la_LIBADD += -lpam $(LIBCRYPTO_LIBS) + +dist_pamconfigs_DATA = zfs_key diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c new file mode 100644 index 0000000000..dead090f97 --- /dev/null +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -0,0 +1,835 @@ +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the 
following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Copyright (c) 2020, Felix Dörre + * All rights reserved. + */ + +#include +#include +#include + +#include + +#include +#include + +#define PAM_SM_AUTH +#define PAM_SM_PASSWORD +#define PAM_SM_SESSION +#include + +#if defined(__linux__) +#include +#define MAP_FLAGS MAP_PRIVATE | MAP_ANONYMOUS +#elif defined(__FreeBSD__) +#include +static void +pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + vsyslog(loglevel, fmt, args); + va_end(args); +} +#define MAP_FLAGS MAP_PRIVATE | MAP_ANON | MAP_NOCORE +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include + +static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; + +static libzfs_handle_t *g_zfs; + +static void destroy_pw(pam_handle_t *pamh, void *data, int errcode); + +typedef int (*mlock_func_t) (const void *, size_t); + +typedef struct { + size_t len; + char *value; +} pw_password_t; + +/* + * Try to mlock(2) or munlock(2) addr, retrying up to ten times with 10 ms + * sleeps (0.1 s total) while the call fails with errno set to EAGAIN. + * lock_func must point to either mlock(2) or munlock(2). + */ +static int +try_lock(mlock_func_t lock_func, const void *addr, size_t len) +{ + int err; + int retries = 10; + useconds_t sleep_dur = 10 * 1000; + + if ((err = (*lock_func)(addr, len)) == 0 || errno != EAGAIN) { + return (err); + } + for (int i = retries; i > 0; --i) { + (void) usleep(sleep_dur); + if ((err = (*lock_func)(addr, len)) == 0 || errno != EAGAIN) { + break; + } + } + return (err); +} + + +static pw_password_t * +alloc_pw_size(size_t len) +{ + pw_password_t *pw = malloc(sizeof (pw_password_t)); + if (!pw) { + return (NULL); + } + pw->len = len; + /* + * We use mmap(2) rather than malloc(3) since later on we mlock(2) the + * memory region. Since mlock(2) and munlock(2) operate on whole memory + * pages we should allocate a whole page here as mmap(2) does. Further + * this ensures that the addresses passed to mlock(2) and munlock(2) are + * on a page boundary as suggested by FreeBSD and required by some + * other implementations. Finally we avoid inadvertently munlocking + * memory mlocked by a concurrently running instance of us. 
+ */ + pw->value = mmap(NULL, pw->len, PROT_READ | PROT_WRITE, MAP_FLAGS, + -1, 0); + + if (pw->value == MAP_FAILED) { + free(pw); + return (NULL); + } + if (try_lock(mlock, pw->value, pw->len) != 0) { + (void) munmap(pw->value, pw->len); + free(pw); + return (NULL); + } + return (pw); +} + +static pw_password_t * +alloc_pw_string(const char *source) +{ + size_t len = strlen(source) + 1; + pw_password_t *pw = alloc_pw_size(len); + + if (!pw) { + return (NULL); + } + memcpy(pw->value, source, pw->len); + return (pw); +} + +static void +pw_free(pw_password_t *pw) +{ + bzero(pw->value, pw->len); + if (try_lock(munlock, pw->value, pw->len) == 0) { + (void) munmap(pw->value, pw->len); + } + free(pw); +} + +static pw_password_t * +pw_fetch(pam_handle_t *pamh) +{ + const char *token; + if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) { + pam_syslog(pamh, LOG_ERR, + "couldn't get password from PAM stack"); + return (NULL); + } + if (!token) { + pam_syslog(pamh, LOG_ERR, + "token from PAM stack is null"); + return (NULL); + } + return (alloc_pw_string(token)); +} + +static const pw_password_t * +pw_fetch_lazy(pam_handle_t *pamh) +{ + pw_password_t *pw = pw_fetch(pamh); + if (pw == NULL) { + return (NULL); + } + int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw); + if (ret != PAM_SUCCESS) { + pw_free(pw); + pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); + return (NULL); + } + return (pw); +} + +static const pw_password_t * +pw_get(pam_handle_t *pamh) +{ + const pw_password_t *authtok = NULL; + int ret = pam_get_data(pamh, PASSWORD_VAR_NAME, + (const void**)(&authtok)); + if (ret == PAM_SUCCESS) + return (authtok); + if (ret == PAM_NO_MODULE_DATA) + return (pw_fetch_lazy(pamh)); + pam_syslog(pamh, LOG_ERR, "password not available"); + return (NULL); +} + +static int +pw_clear(pam_handle_t *pamh) +{ + int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL); + if (ret != PAM_SUCCESS) { + pam_syslog(pamh, LOG_ERR, "clearing password 
failed"); + return (-1); + } + return (0); +} + +static void +destroy_pw(pam_handle_t *pamh, void *data, int errcode) +{ + if (data != NULL) { + pw_free((pw_password_t *)data); + } +} + +static int +pam_zfs_init(pam_handle_t *pamh) +{ + int error = 0; + if ((g_zfs = libzfs_init()) == NULL) { + error = errno; + pam_syslog(pamh, LOG_ERR, "Zfs initialization error: %s", + libzfs_error_init(error)); + } + return (error); +} + +static void +pam_zfs_free(void) +{ + libzfs_fini(g_zfs); +} + +static pw_password_t * +prepare_passphrase(pam_handle_t *pamh, zfs_handle_t *ds, + const char *passphrase, nvlist_t *nvlist) +{ + pw_password_t *key = alloc_pw_size(WRAPPING_KEY_LEN); + if (!key) { + return (NULL); + } + uint64_t salt; + uint64_t iters; + if (nvlist != NULL) { + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + pw_free(key); + return (NULL); + } + int bytes_read = 0; + char *buf = (char *)&salt; + size_t bytes = sizeof (uint64_t); + while (bytes_read < bytes) { + ssize_t len = read(fd, buf + bytes_read, bytes + - bytes_read); + if (len < 0) { + close(fd); + pw_free(key); + return (NULL); + } + bytes_read += len; + } + close(fd); + + if (nvlist_add_uint64(nvlist, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt)) { + pam_syslog(pamh, LOG_ERR, + "failed to add salt to nvlist"); + pw_free(key); + return (NULL); + } + iters = DEFAULT_PBKDF2_ITERATIONS; + if (nvlist_add_uint64(nvlist, zfs_prop_to_name( + ZFS_PROP_PBKDF2_ITERS), iters)) { + pam_syslog(pamh, LOG_ERR, + "failed to add iters to nvlist"); + pw_free(key); + return (NULL); + } + } else { + salt = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_SALT); + iters = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_ITERS); + } + + salt = LE_64(salt); + if (!PKCS5_PBKDF2_HMAC_SHA1((char *)passphrase, + strlen(passphrase), (uint8_t *)&salt, + sizeof (uint64_t), iters, WRAPPING_KEY_LEN, + (uint8_t *)key->value)) { + pam_syslog(pamh, LOG_ERR, "pbkdf failed"); + pw_free(key); + return (NULL); + } + return (key); +} + +static int 
+is_key_loaded(pam_handle_t *pamh, const char *ds_name) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + int keystatus = zfs_prop_get_int(ds, ZFS_PROP_KEYSTATUS); + zfs_close(ds); + return (keystatus != ZFS_KEYSTATUS_UNAVAILABLE); +} + +static int +change_key(pam_handle_t *pamh, const char *ds_name, + const char *passphrase) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + nvlist_t *nvlist = fnvlist_alloc(); + pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, nvlist); + if (key == NULL) { + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + if (nvlist_add_string(nvlist, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + "prompt")) { + pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keylocation"); + pw_free(key); + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + if (nvlist_add_uint64(nvlist, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), + ZFS_KEYFORMAT_PASSPHRASE)) { + pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keyformat"); + pw_free(key); + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + int ret = lzc_change_key(ds_name, DCP_CMD_NEW_KEY, nvlist, + (uint8_t *)key->value, WRAPPING_KEY_LEN); + pw_free(key); + if (ret) { + pam_syslog(pamh, LOG_ERR, "change_key failed: %d", ret); + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + nvlist_free(nvlist); + zfs_close(ds); + return (0); +} + +static int +decrypt_mount(pam_handle_t *pamh, const char *ds_name, + const char *passphrase) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, NULL); + if (key == NULL) { + zfs_close(ds); + return (-1); + } + int 
ret = lzc_load_key(ds_name, B_FALSE, (uint8_t *)key->value, + WRAPPING_KEY_LEN); + pw_free(key); + if (ret) { + pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); + zfs_close(ds); + return (-1); + } + ret = zfs_mount(ds, NULL, 0); + if (ret) { + pam_syslog(pamh, LOG_ERR, "mount failed: %d", ret); + zfs_close(ds); + return (-1); + } + zfs_close(ds); + return (0); +} + +static int +unmount_unload(pam_handle_t *pamh, const char *ds_name) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + int ret = zfs_unmount(ds, NULL, 0); + if (ret) { + pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret); + zfs_close(ds); + return (-1); + } + + ret = lzc_unload_key(ds_name); + if (ret) { + pam_syslog(pamh, LOG_ERR, "unload_key failed with: %d", ret); + zfs_close(ds); + return (-1); + } + zfs_close(ds); + return (0); +} + +typedef struct { + char *homes_prefix; + char *runstatedir; + char *homedir; + char *dsname; + uid_t uid; + const char *username; + int unmount_and_unload; +} zfs_key_config_t; + +static int +zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, + int argc, const char **argv) +{ + config->homes_prefix = strdup("rpool/home"); + if (config->homes_prefix == NULL) { + pam_syslog(pamh, LOG_ERR, "strdup failure"); + return (-1); + } + config->runstatedir = strdup(RUNSTATEDIR "/pam_zfs_key"); + if (config->runstatedir == NULL) { + pam_syslog(pamh, LOG_ERR, "strdup failure"); + free(config->homes_prefix); + return (-1); + } + const char *name; + if (pam_get_user(pamh, &name, NULL) != PAM_SUCCESS) { + pam_syslog(pamh, LOG_ERR, + "couldn't get username from PAM stack"); + free(config->runstatedir); + free(config->homes_prefix); + return (-1); + } + struct passwd *entry = getpwnam(name); + if (!entry) { + free(config->runstatedir); + free(config->homes_prefix); + return (-1); + } + config->uid = entry->pw_uid; + config->username = 
name; + config->unmount_and_unload = 1; + config->dsname = NULL; + config->homedir = NULL; + for (int c = 0; c < argc; c++) { + if (strncmp(argv[c], "homes=", 6) == 0) { + free(config->homes_prefix); + config->homes_prefix = strdup(argv[c] + 6); + } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { + free(config->runstatedir); + config->runstatedir = strdup(argv[c] + 12); + } else if (strcmp(argv[c], "nounmount") == 0) { + config->unmount_and_unload = 0; + } else if (strcmp(argv[c], "prop_mountpoint") == 0) { + config->homedir = strdup(entry->pw_dir); + } + } + return (0); +} + +static void +zfs_key_config_free(zfs_key_config_t *config) +{ + free(config->homes_prefix); + free(config->runstatedir); + free(config->homedir); + free(config->dsname); +} + +static int +find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) +{ + zfs_type_t type = zfs_get_type(zhp); + zfs_key_config_t *target = data; + char mountpoint[ZFS_MAXPROPLEN]; + + /* Skip any datasets whose type does not match */ + if ((type & ZFS_TYPE_FILESYSTEM) == 0) { + zfs_close(zhp); + return (0); + } + + /* Skip any datasets whose mountpoint does not match */ + (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE); + if (strcmp(target->homedir, mountpoint) != 0) { + zfs_close(zhp); + return (0); + } + + target->dsname = strdup(zfs_get_name(zhp)); + zfs_close(zhp); + return (1); +} + +static char * +zfs_key_config_get_dataset(zfs_key_config_t *config) +{ + if (config->homedir != NULL && + config->homes_prefix != NULL) { + zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, + ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) { + /* zfs_open() failed: no handle to close */ + pam_syslog(NULL, LOG_ERR, "dataset %s not found", + config->homes_prefix); + return (NULL); + } + + (void) zfs_iter_filesystems(zhp, find_dsname_by_prop_value, + config); + zfs_close(zhp); + char *dsname = config->dsname; + config->dsname = NULL; + return (dsname); + } + + size_t len = ZFS_MAX_DATASET_NAME_LEN; + 
size_t total_len = strlen(config->homes_prefix) + 1 + + strlen(config->username); + if (total_len > len) { + return (NULL); + } + char *ret = malloc(len + 1); + if (!ret) { + return (NULL); + } + ret[0] = 0; + strcat(ret, config->homes_prefix); + strcat(ret, "/"); + strcat(ret, config->username); + return (ret); +} + +static int +zfs_key_config_modify_session_counter(pam_handle_t *pamh, + zfs_key_config_t *config, int delta) +{ + const char *runtime_path = config->runstatedir; + if (mkdir(runtime_path, S_IRWXU) != 0 && errno != EEXIST) { + pam_syslog(pamh, LOG_ERR, "Can't create runtime path: %d", + errno); + return (-1); + } + if (chown(runtime_path, 0, 0) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d", + errno); + return (-1); + } + if (chmod(runtime_path, S_IRWXU) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't chmod runtime path: %d", + errno); + return (-1); + } + size_t runtime_path_len = strlen(runtime_path); + size_t counter_path_len = runtime_path_len + 1 + 10; + char *counter_path = malloc(counter_path_len + 1); + if (!counter_path) { + return (-1); + } + counter_path[0] = 0; + strcat(counter_path, runtime_path); + snprintf(counter_path + runtime_path_len, counter_path_len, "/%d", + config->uid); + const int fd = open(counter_path, + O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, + S_IRUSR | S_IWUSR); + free(counter_path); + if (fd < 0) { + pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", errno); + return (-1); + } + if (flock(fd, LOCK_EX) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't lock counter file: %d", errno); + close(fd); + return (-1); + } + char counter[20]; + char *pos = counter; + int remaining = sizeof (counter) - 1; + int ret; + counter[sizeof (counter) - 1] = 0; + while (remaining > 0 && (ret = read(fd, pos, remaining)) > 0) { + remaining -= ret; + pos += ret; + } + *pos = 0; + long int counter_value = strtol(counter, NULL, 10); + counter_value += delta; + if (counter_value < 0) { + counter_value = 0; + } + lseek(fd, 0, 
SEEK_SET); + if (ftruncate(fd, 0) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't truncate counter file: %d", + errno); + close(fd); + return (-1); + } + snprintf(counter, sizeof (counter), "%ld", counter_value); + remaining = strlen(counter); + pos = counter; + while (remaining > 0 && (ret = write(fd, pos, remaining)) > 0) { + remaining -= ret; + pos += ret; + } + close(fd); + return (counter_value); +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_authenticate(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (pw_fetch_lazy(pamh) == NULL) { + return (PAM_AUTH_ERR); + } + + return (PAM_SUCCESS); +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_setcred(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + return (PAM_SUCCESS); +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_chauthtok(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (geteuid() != 0) { + pam_syslog(pamh, LOG_ERR, + "Cannot zfs_mount when not being root."); + return (PAM_PERM_DENIED); + } + zfs_key_config_t config; + if (zfs_key_config_load(pamh, &config, argc, argv) == -1) { + return (PAM_SERVICE_ERR); + } + if (config.uid < 1000) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + { + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + int key_loaded = is_key_loaded(pamh, dataset); + if (key_loaded == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + free(dataset); + pam_zfs_free(); + if (! 
key_loaded) { + pam_syslog(pamh, LOG_ERR, + "key not loaded, returning try_again"); + zfs_key_config_free(&config); + return (PAM_PERM_DENIED); + } + } + + if ((flags & PAM_UPDATE_AUTHTOK) != 0) { + const pw_password_t *token = pw_get(pamh); + if (token == NULL) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + if (change_key(pamh, dataset, token->value) == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + if (pw_clear(pamh) == -1) { + return (PAM_SERVICE_ERR); + } + } else { + zfs_key_config_free(&config); + } + return (PAM_SUCCESS); +} + +PAM_EXTERN int +pam_sm_open_session(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (geteuid() != 0) { + pam_syslog(pamh, LOG_ERR, + "Cannot zfs_mount when not being root."); + return (PAM_SUCCESS); + } + zfs_key_config_t config; + zfs_key_config_load(pamh, &config, argc, argv); + if (config.uid < 1000) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + int counter = zfs_key_config_modify_session_counter(pamh, &config, 1); + if (counter != 1) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + const pw_password_t *token = pw_get(pamh); + if (token == NULL) { + zfs_key_config_free(&config); + return (PAM_SESSION_ERR); + } + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + if (decrypt_mount(pamh, dataset, token->value) == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + 
return (PAM_SERVICE_ERR); + } + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + if (pw_clear(pamh) == -1) { + return (PAM_SERVICE_ERR); + } + return (PAM_SUCCESS); + +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_close_session(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (geteuid() != 0) { + pam_syslog(pamh, LOG_ERR, + "Cannot zfs_mount when not being root."); + return (PAM_SUCCESS); + } + zfs_key_config_t config; + zfs_key_config_load(pamh, &config, argc, argv); + if (config.uid < 1000) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + int counter = zfs_key_config_modify_session_counter(pamh, &config, -1); + if (counter != 0) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + if (config.unmount_and_unload) { + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SESSION_ERR); + } + if (unmount_unload(pamh, dataset) == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SESSION_ERR); + } + free(dataset); + pam_zfs_free(); + } + + zfs_key_config_free(&config); + return (PAM_SUCCESS); +} diff --git a/contrib/pam_zfs_key/zfs_key b/contrib/pam_zfs_key/zfs_key new file mode 100644 index 0000000000..e3ed5c4f2f --- /dev/null +++ b/contrib/pam_zfs_key/zfs_key @@ -0,0 +1,13 @@ +Name: Unlock zfs datasets for user +Default: yes +Priority: 128 +Auth-Type: Additional +Auth: + optional pam_zfs_key.so +Session-Interactive-Only: yes +Session-Type: Additional +Session: + optional pam_zfs_key.so +Password-Type: Additional +Password: + optional pam_zfs_key.so diff --git a/contrib/pyzfs/Makefile.am b/contrib/pyzfs/Makefile.am index 1549bf2379..fa1bb32ce2 100644 --- a/contrib/pyzfs/Makefile.am +++ b/contrib/pyzfs/Makefile.am @@ -24,7 +24,7 @@ all-local: # files are later created 
by manually loading the Python modules. # install-exec-local: - $(PYTHON) $(srcdir)/setup.py install \ + $(PYTHON) $(builddir)/setup.py install \ --prefix $(prefix) \ --root $(DESTDIR)/ \ --install-lib $(pythonsitedir) \ diff --git a/contrib/pyzfs/README b/contrib/pyzfs/README index 52983e5a90..bd22409795 100644 --- a/contrib/pyzfs/README +++ b/contrib/pyzfs/README @@ -25,4 +25,4 @@ a temporary directory specified by, for instance, TMP environment variable on a memory backed filesystem. Package documentation: http://pyzfs.readthedocs.org -Package development: https://github.com/zfsonlinux/zfs +Package development: https://github.com/openzfs/zfs diff --git a/contrib/pyzfs/docs/source/conf.py b/contrib/pyzfs/docs/source/conf.py index 4ffd7c93e5..4bbb938b62 100644 --- a/contrib/pyzfs/docs/source/conf.py +++ b/contrib/pyzfs/docs/source/conf.py @@ -291,7 +291,7 @@ autodoc_member_order = 'bysource' ####################### # Neutralize effects of function wrapping on documented signatures. -# The affected signatures could be explcitly placed into the +# The affected signatures could be explicitly placed into the # documentation (either in .rst files or as a first line of a # docstring). import functools diff --git a/contrib/pyzfs/libzfs_core/__init__.py b/contrib/pyzfs/libzfs_core/__init__.py index a195b05f52..25ea3e495b 100644 --- a/contrib/pyzfs/libzfs_core/__init__.py +++ b/contrib/pyzfs/libzfs_core/__init__.py @@ -32,7 +32,7 @@ of the error codes to the exceptions by interpreting a context in which the error code is produced. To submit an issue or contribute to development of this package -please visit its `GitHub repository `_. +please visit its `GitHub repository `_. .. 
data:: MAXNAMELEN @@ -73,7 +73,6 @@ from ._libzfs_core import ( lzc_receive_with_cmdprops, lzc_receive_with_header, lzc_release, - lzc_remap, lzc_reopen, lzc_rollback, lzc_rollback_to, @@ -129,7 +128,6 @@ __all__ = [ 'lzc_receive_with_cmdprops', 'lzc_receive_with_header', 'lzc_release', - 'lzc_remap', 'lzc_reopen', 'lzc_rollback', 'lzc_rollback_to', diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py index 55de55d422..2dfed224c2 100644 --- a/contrib/pyzfs/libzfs_core/_constants.py +++ b/contrib/pyzfs/libzfs_core/_constants.py @@ -19,14 +19,31 @@ Important `libzfs_core` constants. """ from __future__ import absolute_import, division, print_function +import errno +import sys + + +# Compat for platform-specific errnos +if sys.platform.startswith('freebsd'): + ECHRNG = errno.ENXIO + ECKSUM = 97 # EINTEGRITY + ETIME = errno.ETIMEDOUT +else: + ECHRNG = errno.ECHRNG + ECKSUM = errno.EBADE + ETIME = errno.ETIME # https://stackoverflow.com/a/1695250 -def enum(*sequential, **named): - enums = dict(((b, a) for a, b in enumerate(sequential)), **named) +def enum_with_offset(offset, sequential, named): + enums = dict(((b, a + offset) for a, b in enumerate(sequential)), **named) return type('Enum', (), enums) +def enum(*sequential, **named): + return enum_with_offset(0, sequential, named) + + #: Maximum length of any ZFS name. 
MAXNAMELEN = 255 #: Default channel program limits @@ -60,12 +77,38 @@ zio_encrypt = enum( 'ZIO_CRYPT_AES_256_GCM' ) # ZFS-specific error codes -ZFS_ERR_CHECKPOINT_EXISTS = 1024 -ZFS_ERR_DISCARDING_CHECKPOINT = 1025 -ZFS_ERR_NO_CHECKPOINT = 1026 -ZFS_ERR_DEVRM_IN_PROGRESS = 1027 -ZFS_ERR_VDEV_TOO_BIG = 1028 -ZFS_ERR_WRONG_PARENT = 1033 - +zfs_errno = enum_with_offset(1024, [ + 'ZFS_ERR_CHECKPOINT_EXISTS', + 'ZFS_ERR_DISCARDING_CHECKPOINT', + 'ZFS_ERR_NO_CHECKPOINT', + 'ZFS_ERR_DEVRM_IN_PROGRESS', + 'ZFS_ERR_VDEV_TOO_BIG', + 'ZFS_ERR_IOC_CMD_UNAVAIL', + 'ZFS_ERR_IOC_ARG_UNAVAIL', + 'ZFS_ERR_IOC_ARG_REQUIRED', + 'ZFS_ERR_IOC_ARG_BADTYPE', + 'ZFS_ERR_WRONG_PARENT', + 'ZFS_ERR_FROM_IVSET_GUID_MISSING', + 'ZFS_ERR_FROM_IVSET_GUID_MISMATCH', + 'ZFS_ERR_SPILL_BLOCK_FLAG_MISSING', + 'ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE', + 'ZFS_ERR_EXPORT_IN_PROGRESS', + 'ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR', + 'ZFS_ERR_STREAM_TRUNCATED', + 'ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH', + 'ZFS_ERR_RESILVER_IN_PROGRESS', + 'ZFS_ERR_REBUILD_IN_PROGRESS', + 'ZFS_ERR_BADPROP', + ], + {} +) +# compat before we used the enum helper for these values +ZFS_ERR_CHECKPOINT_EXISTS = zfs_errno.ZFS_ERR_CHECKPOINT_EXISTS +assert(ZFS_ERR_CHECKPOINT_EXISTS == 1024) +ZFS_ERR_DISCARDING_CHECKPOINT = zfs_errno.ZFS_ERR_DISCARDING_CHECKPOINT +ZFS_ERR_NO_CHECKPOINT = zfs_errno.ZFS_ERR_NO_CHECKPOINT +ZFS_ERR_DEVRM_IN_PROGRESS = zfs_errno.ZFS_ERR_DEVRM_IN_PROGRESS +ZFS_ERR_VDEV_TOO_BIG = zfs_errno.ZFS_ERR_VDEV_TOO_BIG +ZFS_ERR_WRONG_PARENT = zfs_errno.ZFS_ERR_WRONG_PARENT # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py index b888fd7255..f494461f63 100644 --- a/contrib/pyzfs/libzfs_core/_error_translation.py +++ b/contrib/pyzfs/libzfs_core/_error_translation.py @@ -33,13 +33,17 @@ import re import string from . 
import exceptions as lzc_exc from ._constants import ( + ECHRNG, + ECKSUM, + ETIME, MAXNAMELEN, ZFS_ERR_CHECKPOINT_EXISTS, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, - ZFS_ERR_WRONG_PARENT + ZFS_ERR_WRONG_PARENT, + zfs_errno ) @@ -55,6 +59,8 @@ def lzc_create_translate_error(ret, name, ds_type, props): raise lzc_exc.ParentNotFound(name) if ret == ZFS_ERR_WRONG_PARENT: raise lzc_exc.WrongParent(_fs_name(name)) + if ret == zfs_errno.ZFS_ERR_BADPROP: + raise lzc_exc.PropertyInvalid(name) raise _generic_exception(ret, name, "Failed to create filesystem") @@ -147,21 +153,36 @@ def lzc_destroy_snaps_translate_errors(ret, errlist, snaps, defer): def lzc_bookmark_translate_errors(ret, errlist, bookmarks): + if ret == 0: return def _map(ret, name): + source = bookmarks[name] if ret == errno.EINVAL: if name: - snap = bookmarks[name] pool_names = map(_pool_name, bookmarks.keys()) - if not _is_valid_bmark_name(name): - return lzc_exc.BookmarkNameInvalid(name) - elif not _is_valid_snap_name(snap): - return lzc_exc.SnapshotNameInvalid(snap) - elif _fs_name(name) != _fs_name(snap): - return lzc_exc.BookmarkMismatch(name) - elif any(x != _pool_name(name) for x in pool_names): + + # use _validate* functions for MAXNAMELEN check + try: + _validate_bmark_name(name) + except lzc_exc.ZFSError as e: + return e + + try: + _validate_snap_name(source) + source_is_snap = True + except lzc_exc.ZFSError: + source_is_snap = False + try: + _validate_bmark_name(source) + source_is_bmark = True + except lzc_exc.ZFSError: + source_is_bmark = False + if not source_is_snap and not source_is_bmark: + return lzc_exc.BookmarkSourceInvalid(source) + + if any(x != _pool_name(name) for x in pool_names): return lzc_exc.PoolsDiffer(name) else: invalid_names = [ @@ -174,6 +195,8 @@ def lzc_bookmark_translate_errors(ret, errlist, bookmarks): return lzc_exc.SnapshotNotFound(name) if ret == errno.ENOTSUP: return lzc_exc.BookmarkNotSupported(name) + if 
ret == zfs_errno.ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: + return lzc_exc.BookmarkMismatch(source) return _generic_exception(ret, name, "Failed to create bookmark") _handle_err_list( @@ -399,6 +422,8 @@ def lzc_receive_translate_errors( def _map(ret, name): if ret == errno.EINVAL: return lzc_exc.PropertyInvalid(name) + if ret == zfs_errno.ZFS_ERR_BADPROP: + return lzc_exc.PropertyInvalid(name) return _generic_exception(ret, name, "Failed to set property") _handle_err_list( errno.EINVAL, properrs, [snapname], @@ -444,10 +469,14 @@ def lzc_receive_translate_errors( raise lzc_exc.ReadOnlyPool(_pool_name(snapname)) if ret == errno.EAGAIN: raise lzc_exc.SuspendedPool(_pool_name(snapname)) - if ret == errno.EBADE: # ECKSUM + if ret == ECKSUM: raise lzc_exc.BadStream() if ret == ZFS_ERR_WRONG_PARENT: raise lzc_exc.WrongParent(_fs_name(snapname)) + if ret == zfs_errno.ZFS_ERR_STREAM_TRUNCATED: + raise lzc_exc.StreamTruncated() + if ret == zfs_errno.ZFS_ERR_BADPROP: + raise lzc_exc.PropertyInvalid(snapname) raise lzc_exc.StreamIOError(ret) @@ -532,7 +561,7 @@ def lzc_channel_program_translate_error(ret, name, error): return if ret == errno.ENOENT: raise lzc_exc.PoolNotFound(name) - if ret == errno.ETIME: + if ret == ETIME: raise lzc_exc.ZCPTimeout() if ret == errno.ENOMEM: raise lzc_exc.ZCPMemoryError() @@ -540,7 +569,7 @@ def lzc_channel_program_translate_error(ret, name, error): raise lzc_exc.ZCPSpaceError() if ret == errno.EPERM: raise lzc_exc.ZCPPermissionError() - if ret == errno.ECHRNG: + if ret == ECHRNG: raise lzc_exc.ZCPRuntimeError(error) if ret == errno.EINVAL: if error is None: @@ -550,18 +579,6 @@ def lzc_channel_program_translate_error(ret, name, error): raise _generic_exception(ret, name, "Failed to execute channel program") -def lzc_remap_translate_error(ret, name): - if ret == 0: - return - if ret == errno.ENOENT: - raise lzc_exc.DatasetNotFound(name) - if ret == errno.EINVAL: - _validate_fs_name(name) - if ret == errno.ENOTSUP: - return 
lzc_exc.FeatureNotSupported(name) - raise _generic_exception(ret, name, "Failed to remap dataset") - - def lzc_pool_checkpoint_translate_error(ret, name, discard=False): if ret == 0: return diff --git a/contrib/pyzfs/libzfs_core/_libzfs_core.py b/contrib/pyzfs/libzfs_core/_libzfs_core.py index 5c8a1f5e69..fcfa5be31b 100644 --- a/contrib/pyzfs/libzfs_core/_libzfs_core.py +++ b/contrib/pyzfs/libzfs_core/_libzfs_core.py @@ -300,7 +300,7 @@ def lzc_destroy_snaps(snaps, defer): Typical error is :exc:`SnapshotIsCloned` if `defer` is `False`. The snapshot names are validated quite loosely and invalid names are - typically ignored as nonexisiting snapshots. + typically ignored as nonexistent snapshots. A snapshot name referring to a filesystem that doesn't exist is ignored. @@ -319,14 +319,15 @@ def lzc_bookmark(bookmarks): Create bookmarks. :param bookmarks: a dict that maps names of wanted bookmarks to names of - existing snapshots. + existing snapshots or bookmarks. :type bookmarks: dict of bytes to bytes :raises BookmarkFailure: if any of the bookmarks can not be created for any reason. The bookmarks `dict` maps from name of the bookmark (e.g. :file:`{pool}/{fs}#{bmark}`) to the name of the snapshot - (e.g. :file:`{pool}/{fs}@{snap}`). All the bookmarks and snapshots must + (e.g. :file:`{pool}/{fs}@{snap}`) or existing bookmark + :file:`{pool}/{fs}#{bmark}`. All the bookmarks and snapshots must be in the same pool. ''' errlist = {} @@ -470,7 +471,7 @@ def lzc_hold(holds, fd=None): Holds for snapshots which don't exist will be skipped and have an entry added to the return value, but will not cause an overall failure. No exceptions is raised if all holds, for snapshots that existed, were - succesfully created. + successfully created. Otherwise :exc:`.HoldFailure` exception is raised and no holds will be created. 
:attr:`.HoldFailure.errors` may contain a single element for an error that @@ -654,7 +655,7 @@ def lzc_send_space(snapname, fromsnap=None, flags=None): should be done. :param fromsnap: the optional starting snapshot name. If not `None` then an incremental stream size is estimated, otherwise - a full stream is esimated. + a full stream is estimated. :type fromsnap: `bytes` or `None` :param flags: the flags that control what enhanced features can be used in the stream. @@ -1178,11 +1179,11 @@ def receive_header(fd): the type of the dataset for which the stream has been created (volume, filesystem) ''' - # read sizeof(dmu_replay_record_t) bytes directly into the memort backing + # read sizeof(dmu_replay_record_t) bytes directly into the memory backing # 'record' record = _ffi.new("dmu_replay_record_t *") _ffi.buffer(record)[:] = os.read(fd, _ffi.sizeof(record[0])) - # get drr_begin member and its representation as a Pythn dict + # get drr_begin member and its representation as a Python dict drr_begin = record.drr_u.drr_begin header = {} for field, descr in _ffi.typeof(drr_begin).fields: @@ -1562,22 +1563,6 @@ def lzc_promote(name): errors.lzc_promote_translate_error(ret, name) -@_uncommitted() -def lzc_remap(name): - ''' - Remaps the ZFS dataset. - - :param bytes name: the name of the dataset to remap. - :raises NameInvalid: if the dataset name is invalid. - :raises NameTooLong: if the dataset name is too long. - :raises DatasetNotFound: if the dataset does not exist. - :raises FeatureNotSupported: if the pool containing the dataset does not - have the *obsolete_counts* feature enabled. - ''' - ret = _lib.lzc_remap(name) - errors.lzc_remap_translate_error(ret, name) - - @_uncommitted() def lzc_pool_checkpoint(name): ''' @@ -1704,7 +1689,7 @@ def lzc_set_props(name, prop, val): # As the extended API is not committed yet, the names of the new interfaces # are not settled down yet. 
# It's not clear if atomically setting multiple properties is an achievable -# goal and an interface acting on mutiple entities must do so atomically +# goal and an interface acting on multiple entities must do so atomically # by convention. # Being able to set a single property at a time is sufficient for ClusterHQ. lzc_set_prop = lzc_set_props @@ -1741,7 +1726,7 @@ def lzc_list(name, options): Absence of this option implies all types. The first of the returned file descriptors can be used to - read the listing in a binary encounded format. The data is + read the listing in a binary encoded format. The data is a series of variable sized records each starting with a fixed size header, the header is followed by a serialized ``nvlist``. Each record describes a single element and contains the element's diff --git a/contrib/pyzfs/libzfs_core/_nvlist.py b/contrib/pyzfs/libzfs_core/_nvlist.py index fe4239a3c0..dc6d820bde 100644 --- a/contrib/pyzfs/libzfs_core/_nvlist.py +++ b/contrib/pyzfs/libzfs_core/_nvlist.py @@ -113,7 +113,7 @@ def packed_nvlist_out(packed_nvlist, packed_size): :param bytes packed_nvlist: packed nvlist_t. :param int packed_size: nvlist_t packed size. - :return: an `dict` of values representing the data containted by nvlist_t. + :return: a `dict` of values representing the data contained by nvlist_t. 
:rtype: dict """ props = {} diff --git a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py index ce2d9d62c3..1b46a08919 100644 --- a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py +++ b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py @@ -127,7 +127,6 @@ CDEF = """ int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); int lzc_sync(const char *, nvlist_t *, nvlist_t **); int lzc_unload_key(const char *); - int lzc_remap(const char *); int lzc_pool_checkpoint(const char *); int lzc_pool_checkpoint_discard(const char *); int lzc_rename(const char *, const char *); diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py index f465cd3d93..e484b07b64 100644 --- a/contrib/pyzfs/libzfs_core/exceptions.py +++ b/contrib/pyzfs/libzfs_core/exceptions.py @@ -21,12 +21,16 @@ from __future__ import absolute_import, division, print_function import errno from ._constants import ( + ECHRNG, + ECKSUM, + ETIME, ZFS_ERR_CHECKPOINT_EXISTS, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, - ZFS_ERR_WRONG_PARENT + ZFS_ERR_WRONG_PARENT, + zfs_errno ) @@ -77,7 +81,7 @@ class MultipleOperationsFailure(ZFSError): ZFSError.__str__(self), len(self.errors), self.suppressed_count) def __repr__(self): - return "%s(%r, %r, errors=%r, supressed=%r)" % ( + return "%s(%r, %r, errors=%r, suppressed=%r)" % ( self.__class__.__name__, self.errno, self.message, self.errors, self.suppressed_count) @@ -227,7 +231,15 @@ class BookmarkNotFound(ZFSError): class BookmarkMismatch(ZFSError): errno = errno.EINVAL - message = "Bookmark is not in snapshot's filesystem" + message = "source is not an ancestor of the new bookmark's dataset" + + def __init__(self, name): + self.name = name + + +class BookmarkSourceInvalid(ZFSError): + errno = errno.EINVAL + message = "Bookmark source is not a valid snapshot or existing bookmark" def __init__(self, name): self.name = name @@ 
-316,7 +328,7 @@ class DestinationModified(ZFSError): class BadStream(ZFSError): - errno = errno.EBADE + errno = ECKSUM message = "Bad backup stream" @@ -340,6 +352,11 @@ class StreamFeatureIncompatible(ZFSError): message = "Incompatible embedded feature with encrypted receive" +class StreamTruncated(ZFSError): + errno = zfs_errno.ZFS_ERR_STREAM_TRUNCATED + message = "incomplete stream" + + class ReceivePropertyFailure(MultipleOperationsFailure): message = "Receiving of properties failed for one or more reasons" @@ -372,7 +389,7 @@ class NoSpace(ZFSError): class QuotaExceeded(ZFSError): errno = errno.EDQUOT - message = "Quouta exceeded" + message = "Quota exceeded" def __init__(self, name): self.name = name @@ -524,7 +541,7 @@ class ZCPSyntaxError(ZCPError): class ZCPRuntimeError(ZCPError): - errno = errno.ECHRNG + errno = ECHRNG message = "Channel programs encountered a runtime error" def __init__(self, details): @@ -537,7 +554,7 @@ class ZCPLimitInvalid(ZCPError): class ZCPTimeout(ZCPError): - errno = errno.ETIME + errno = ETIME message = "Channel program timed out" diff --git a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py index 25f20a4aee..d949d88d5a 100644 --- a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py +++ b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py @@ -154,8 +154,8 @@ def os_open(name, mode): @contextlib.contextmanager def dev_null(): - with os_open('/dev/null', os.O_WRONLY) as fd: - yield fd + with tempfile.TemporaryFile(suffix='.zstream') as fd: + yield fd.fileno() @contextlib.contextmanager @@ -252,9 +252,9 @@ def skipUnlessBookmarksSupported(f): def snap_always_unmounted_before_destruction(): - # Apparently ZoL automatically unmounts the snapshot + # Apparently OpenZFS automatically unmounts the snapshot # only if it is mounted at its default .zfs/snapshot - # mountpoint. + # mountpoint under Linux. 
return ( platform.system() != 'Linux', 'snapshot is not auto-unmounted') @@ -1032,17 +1032,37 @@ class ZFSTest(unittest.TestCase): bmarks = [ZFSTest.pool.makeName( b'fs1#bmark1'), ZFSTest.pool.makeName(b'fs2#bmark1')] bmark_dict = {x: y for x, y in zip(bmarks, snaps)} - lzc.lzc_snapshot(snaps) lzc.lzc_bookmark(bmark_dict) lzc.lzc_destroy_snaps(snaps, defer=False) + @skipUnlessBookmarksSupported + def test_bookmark_copying(self): + snaps = [ZFSTest.pool.makeName(s) for s in [ + b'fs1@snap1', b'fs1@snap2', b'fs2@snap1']] + bmarks = [ZFSTest.pool.makeName(x) for x in [ + b'fs1#bmark1', b'fs1#bmark2', b'fs2#bmark1']] + bmarks_copies = [ZFSTest.pool.makeName(x) for x in [ + b'fs1#bmark1_copy', b'fs1#bmark2_copy', b'fs2#bmark1_copy']] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + bmark_copies_dict = {x: y for x, y in zip(bmarks_copies, bmarks)} + + for snap in snaps: + lzc.lzc_snapshot([snap]) + lzc.lzc_bookmark(bmark_dict) + + lzc.lzc_bookmark(bmark_copies_dict) + lzc.lzc_destroy_bookmarks(bmarks_copies) + + lzc.lzc_destroy_bookmarks(bmarks) + lzc.lzc_destroy_snaps(snaps, defer=False) + @skipUnlessBookmarksSupported def test_bookmarks_empty(self): lzc.lzc_bookmark({}) @skipUnlessBookmarksSupported - def test_bookmarks_mismatching_name(self): + def test_bookmarks_foreign_source(self): snaps = [ZFSTest.pool.makeName(b'fs1@snap1')] bmarks = [ZFSTest.pool.makeName(b'fs2#bmark1')] bmark_dict = {x: y for x, y in zip(bmarks, snaps)} @@ -1107,7 +1127,7 @@ class ZFSTest(unittest.TestCase): self.assertIsInstance(e, lzc_exc.NameTooLong) @skipUnlessBookmarksSupported - def test_bookmarks_mismatching_names(self): + def test_bookmarks_foreign_sources(self): snaps = [ZFSTest.pool.makeName( b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] bmarks = [ZFSTest.pool.makeName( @@ -1122,7 +1142,7 @@ class ZFSTest(unittest.TestCase): self.assertIsInstance(e, lzc_exc.BookmarkMismatch) @skipUnlessBookmarksSupported - def test_bookmarks_partially_mismatching_names(self): + def 
test_bookmarks_partially_foreign_sources(self): snaps = [ZFSTest.pool.makeName( b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] bmarks = [ZFSTest.pool.makeName( @@ -1154,33 +1174,48 @@ class ZFSTest(unittest.TestCase): @skipUnlessBookmarksSupported def test_bookmarks_missing_snap(self): + fss = [ZFSTest.pool.makeName(b'fs1'), ZFSTest.pool.makeName(b'fs2')] snaps = [ZFSTest.pool.makeName( b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] bmarks = [ZFSTest.pool.makeName( b'fs1#bmark1'), ZFSTest.pool.makeName(b'fs2#bmark1')] bmark_dict = {x: y for x, y in zip(bmarks, snaps)} - lzc.lzc_snapshot(snaps[0:1]) + lzc.lzc_snapshot(snaps[0:1]) # only create fs1@snap1 + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: lzc.lzc_bookmark(bmark_dict) for e in ctx.exception.errors: self.assertIsInstance(e, lzc_exc.SnapshotNotFound) + # no new bookmarks are created if one or more sources do not exist + for fs in fss: + fsbmarks = lzc.lzc_get_bookmarks(fs) + self.assertEqual(len(fsbmarks), 0) + @skipUnlessBookmarksSupported def test_bookmarks_missing_snaps(self): + fss = [ZFSTest.pool.makeName(b'fs1'), ZFSTest.pool.makeName(b'fs2')] snaps = [ZFSTest.pool.makeName( b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] bmarks = [ZFSTest.pool.makeName( b'fs1#bmark1'), ZFSTest.pool.makeName(b'fs2#bmark1')] bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + # do not create any snapshots + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: lzc.lzc_bookmark(bmark_dict) for e in ctx.exception.errors: self.assertIsInstance(e, lzc_exc.SnapshotNotFound) + # no new bookmarks are created if one or more sources do not exist + for fs in fss: + fsbmarks = lzc.lzc_get_bookmarks(fs) + self.assertEqual(len(fsbmarks), 0) + @skipUnlessBookmarksSupported def test_bookmarks_for_the_same_snap(self): snap = ZFSTest.pool.makeName(b'fs1@snap1') @@ -1913,7 +1948,7 @@ class ZFSTest(unittest.TestCase): filecmp.cmp( os.path.join(mnt1, name), os.path.join(mnt2, name), False)) - # This test case 
fails unless unless a patch from + # This test case fails unless a patch from # https://clusterhq.atlassian.net/browse/ZFS-20 # is applied to libzfs_core, otherwise it succeeds. @unittest.skip("fails with unpatched libzfs_core") @@ -2160,7 +2195,7 @@ class ZFSTest(unittest.TestCase): with streams(srcfs, src1, src2) as (_, (full, incr)): lzc.lzc_receive(dst1, full.fileno()) lzc.lzc_snapshot([dst_snap]) - # becase cannot receive incremental and set origin on a non-clone + # because cannot receive incremental and set origin on a non-clone with self.assertRaises(lzc_exc.BadStream): lzc.lzc_receive(dst2, incr.fileno(), origin=dst1) @@ -2375,7 +2410,7 @@ class ZFSTest(unittest.TestCase): for i in range(1024): f.write(b'x' * 1024) lzc.lzc_receive(dst, stream.fileno(), force=True) - # The temporary file dissappears and any access, even close(), + # The temporary file disappears and any access, even close(), # results in EIO. self.assertFalse(os.path.exists(f.name)) with self.assertRaises(IOError): @@ -2462,7 +2497,7 @@ class ZFSTest(unittest.TestCase): for i in range(1024): f.write(b'x' * 1024) lzc.lzc_receive(dst2, incr.fileno(), force=True) - # The temporary file dissappears and any access, even close(), + # The temporary file disappears and any access, even close(), # results in EIO. 
self.assertFalse(os.path.exists(f.name)) with self.assertRaises(IOError): @@ -2679,7 +2714,7 @@ class ZFSTest(unittest.TestCase): lzc.lzc_send(src, None, stream.fileno()) stream.seek(0) stream.truncate(1024 * 3) - with self.assertRaises(lzc_exc.BadStream): + with self.assertRaises(lzc_exc.StreamTruncated): lzc.lzc_receive_resumable(dst, stream.fileno()) # Resume token code from zfs_send_resume_token_to_nvlist() # XXX: if used more than twice move this code into an external func @@ -2736,7 +2771,7 @@ class ZFSTest(unittest.TestCase): lzc.lzc_send(snap2, snap1, stream.fileno()) stream.seek(0) stream.truncate(1024 * 3) - with self.assertRaises(lzc_exc.BadStream): + with self.assertRaises(lzc_exc.StreamTruncated): lzc.lzc_receive_resumable(dst2, stream.fileno()) # Resume token code from zfs_send_resume_token_to_nvlist() # format: --- @@ -3632,31 +3667,6 @@ zfs.sync.snapshot('""" + pool + b"""@zcp') with self.assertRaises(lzc_exc.EncryptionKeyNotLoaded): lzc.lzc_unload_key(fs) - def test_remap_missing_fs(self): - name = b"nonexistent" - - with self.assertRaises(lzc_exc.DatasetNotFound): - lzc.lzc_remap(name) - - def test_remap_invalid_fs(self): - ds = ZFSTest.pool.makeName(b"fs1") - snap = ds + b"@snap1" - - lzc.lzc_snapshot([snap]) - with self.assertRaises(lzc_exc.NameInvalid): - lzc.lzc_remap(snap) - - def test_remap_too_long_fs_name(self): - name = ZFSTest.pool.makeTooLongName() - - with self.assertRaises(lzc_exc.NameTooLong): - lzc.lzc_remap(name) - - def test_remap(self): - name = ZFSTest.pool.makeName(b"fs1") - - lzc.lzc_remap(name) - def test_checkpoint(self): pool = ZFSTest.pool.getRoot().getName() diff --git a/contrib/zcp/Makefile.am b/contrib/zcp/Makefile.am new file mode 100644 index 0000000000..e6a777ad7b --- /dev/null +++ b/contrib/zcp/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = autosnap.lua diff --git a/contrib/zcp/autosnap.lua b/contrib/zcp/autosnap.lua new file mode 100644 index 0000000000..d9ae32ce45 --- /dev/null +++ b/contrib/zcp/autosnap.lua @@ -0,0 
+1,75 @@ +-- Recursively snapshot every dataset with a given property +-- +-- Usage: zfs program <pool> autosnap.lua -- [-n] [-p <property>] <snapshot> + +results = {} + +args = ... +argv = args["argv"] +usage = [[ + + +usage: zfs program <pool> autosnap.lua -- [-n] [-p <property>] <snapshot> + + -n: performs checks only, does not take snapshots + -p <property>: property to check. [default: com.sun:auto-snapshot] + <snapshot>: root snapshot to create [example: tank/data@backup] +]] + +property = "com.sun:auto-snapshot" +noop = false +root_snap = nil + +for i, arg in ipairs(argv) do + if arg == "-n" then + noop = true + elseif arg == "-p" then + elseif argv[i-1] == "-p" then + property = arg + else + root_snap = arg + end +end + +if root_snap == nil or property == nil then + error(usage) +end + +root_ds_name = "" +snap_name = "" +for i = 1, #root_snap do + if root_snap:sub(i, i) == "@" then + root_ds_name = root_snap:sub(1, i-1) + snap_name = root_snap:sub(i+1, root_snap:len()) + end +end + +function auto_snap(root) + auto, source = zfs.get_prop(root, property) + if auto == "true" then + ds_snap_name = root .. "@" .. snap_name + err = 0 + if noop then + err = zfs.check.snapshot(ds_snap_name) + else + err = zfs.sync.snapshot(ds_snap_name) + end + results[ds_snap_name] = err + end + for child in zfs.list.children(root) do + auto_snap(child) + end +end + +auto_snap(root_ds_name) +err_txt = "" +for ds, err in pairs(results) do + if err ~= 0 then + err_txt = err_txt .. "failed to create " .. ds .. ": " .. err .. 
"\n" + end +end +if err_txt ~= "" then + error(err_txt) +end + +return results diff --git a/copy-builtin b/copy-builtin index 1dcfcb961e..cd6f259092 100755 --- a/copy-builtin +++ b/copy-builtin @@ -1,6 +1,6 @@ -#!/bin/bash +#!/bin/sh -set -e +set -ef usage() { @@ -9,93 +9,57 @@ usage() } [ "$#" -eq 1 ] || usage -KERNEL_DIR="$(readlink --canonicalize-existing "$1")" - -MODULES=() -MODULES+="spl" -for MODULE_DIR in module/* -do - [ -d "$MODULE_DIR" ] || continue - [ "spl" = "${MODULE_DIR##*/}" ] && continue - MODULES+=("${MODULE_DIR##*/}") -done +KERNEL_DIR="$1" if ! [ -e 'zfs_config.h' ] then - echo >&2 - echo " $0: you did not run configure, or you're not in the ZFS source directory." >&2 - echo " $0: run configure with --with-linux=$KERNEL_DIR and --enable-linux-builtin." >&2 - echo >&2 - exit 1 -fi + echo "$0: you did not run configure, or you're not in the ZFS source directory." + echo "$0: run configure with --with-linux=$KERNEL_DIR and --enable-linux-builtin." -make clean || true -scripts/make_gitrev.sh || true + exit 1 +fi >&2 + +make clean ||: +make gitrev rm -rf "$KERNEL_DIR/include/zfs" "$KERNEL_DIR/fs/zfs" -cp --recursive include "$KERNEL_DIR/include/zfs" -cp --recursive module "$KERNEL_DIR/fs/zfs" +cp -R include "$KERNEL_DIR/include/zfs" +cp -R module "$KERNEL_DIR/fs/zfs" cp zfs_config.h "$KERNEL_DIR/include/zfs/" -for MODULE in "${MODULES[@]}" -do - sed -i.bak '/obj =/d' "$KERNEL_DIR/fs/zfs/$MODULE/Makefile" - sed -i.bak '/src =/d' "$KERNEL_DIR/fs/zfs/$MODULE/Makefile" -done - -cat > "$KERNEL_DIR/fs/zfs/Kconfig" <<"EOF" +cat > "$KERNEL_DIR/fs/zfs/Kconfig" < "$FILE.new" @@ -106,8 +70,5 @@ add_after() add_after "$KERNEL_DIR/fs/Kconfig" 'if BLOCK' 'source "fs/zfs/Kconfig"' add_after "$KERNEL_DIR/fs/Makefile" 'endif' 'obj-$(CONFIG_ZFS) += zfs/' -echo >&2 -echo " $0: done." >&2 -echo " $0: now you can build the kernel with ZFS support." >&2 -echo " $0: make sure you enable ZFS support (CONFIG_ZFS) before building." >&2 -echo >&2 +echo "$0: done. 
now you can build the kernel with ZFS support." >&2 +echo "$0: make sure you enable ZFS support (CONFIG_ZFS) before building." >&2 diff --git a/etc/Makefile.am b/etc/Makefile.am index 28b955106e..aa9ff182c8 100644 --- a/etc/Makefile.am +++ b/etc/Makefile.am @@ -1,2 +1,9 @@ -SUBDIRS = zfs sudoers.d $(ZFS_INIT_SYSTEMD) $(ZFS_INIT_SYSV) $(ZFS_MODULE_LOAD) -DIST_SUBDIRS = init.d zfs systemd modules-load.d sudoers.d +include $(top_srcdir)/config/Shellcheck.am + +SUBDIRS = zfs sudoers.d +SHELLCHECKDIRS = zfs +if BUILD_LINUX +SHELLCHECKDIRS += default $(ZFS_INIT_SYSV) +SUBDIRS += default $(ZFS_INIT_SYSTEMD) $(ZFS_INIT_SYSV) $(ZFS_MODULE_LOAD) +endif +DIST_SUBDIRS = default init.d zfs systemd modules-load.d sudoers.d diff --git a/etc/default/.gitignore b/etc/default/.gitignore new file mode 100644 index 0000000000..73304bc2cd --- /dev/null +++ b/etc/default/.gitignore @@ -0,0 +1 @@ +zfs diff --git a/etc/default/Makefile.am b/etc/default/Makefile.am new file mode 100644 index 0000000000..b88eb54949 --- /dev/null +++ b/etc/default/Makefile.am @@ -0,0 +1,9 @@ +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + +initconf_SCRIPTS = zfs + +SUBSTFILES += $(initconf_SCRIPTS) + +SHELLCHECK_SHELL = sh +SHELLCHECK_IGNORE = ,SC2034 diff --git a/etc/init.d/zfs.in b/etc/default/zfs.in similarity index 71% rename from etc/init.d/zfs.in rename to etc/default/zfs.in index 7998569b2c..3b6e5486dd 100644 --- a/etc/init.d/zfs.in +++ b/etc/default/zfs.in @@ -1,5 +1,11 @@ # ZoL userland configuration. +# NOTE: This file is intended for sysv init and initramfs. +# Changing some of these settings may not make any difference on +# systemd-based setup, e.g. setting ZFS_MOUNT=no will not prevent systemd +# from launching zfs-mount.service during boot. +# See: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=901436 + # To enable a boolean setting, set it to yes, on, true, or 1. # Anything else will be interpreted as unset. 
@@ -46,13 +52,6 @@ ZPOOL_IMPORT_ALL_VISIBLE='no' # This is a space separated list. #ZFS_POOL_EXCEPTIONS="test2" -# List of pools that SHOULD be imported at boot by the initramfs -# instead of trying to import all available pools. If this is set -# then ZFS_POOL_EXCEPTIONS is ignored. -# Only applicable for Debian GNU/Linux {dkms,initramfs}. -# This is a semi-colon separated list. -#ZFS_POOL_IMPORT="pool1;pool2" - # Should the datasets be mounted verbosely? # A mount counter will be used when mounting if set to 'yes'. VERBOSE_MOUNT='no' @@ -91,38 +90,14 @@ MOUNT_EXTRA_OPTIONS="" # Only applicable for Debian GNU/Linux {dkms,initramfs}. ZFS_DKMS_ENABLE_DEBUG='no' +# Build kernel modules with the --enable-debuginfo switch? +# Only applicable for Debian GNU/Linux {dkms,initramfs}. +ZFS_DKMS_ENABLE_DEBUGINFO='no' + # Keep debugging symbols in kernel modules? # Only applicable for Debian GNU/Linux {dkms,initramfs}. ZFS_DKMS_DISABLE_STRIP='no' -# Wait for this many seconds in the initrd pre_mountroot? -# This delays startup and should be '0' on most systems. -# Only applicable for Debian GNU/Linux {dkms,initramfs}. -ZFS_INITRD_PRE_MOUNTROOT_SLEEP='0' - -# Wait for this many seconds in the initrd mountroot? -# This delays startup and should be '0' on most systems. This might help on -# systems which have their ZFS root on a USB disk that takes just a little -# longer to be available -# Only applicable for Debian GNU/Linux {dkms,initramfs}. -ZFS_INITRD_POST_MODPROBE_SLEEP='0' - -# List of additional datasets to mount after the root dataset is mounted? -# -# The init script will use the mountpoint specified in the 'mountpoint' -# property value in the dataset to determine where it should be mounted. -# -# This is a space separated list, and will be mounted in the order specified, -# so if one filesystem depends on a previous mountpoint, make sure to put -# them in the right order. -# -# It is not necessary to add filesystems below the root fs here. 
It is -# taken care of by the initrd script automatically. These are only for -# additional filesystems needed. Such as /opt, /usr/local which is not -# located under the root fs. -# Example: If root FS is 'rpool/ROOT/rootfs', this would make sense. -#ZFS_INITRD_ADDITIONAL_DATASETS="rpool/ROOT/usr rpool/ROOT/var" - # Optional arguments for the ZFS Event Daemon (ZED). # See zed(8) for more information on available options. #ZED_ARGS="-M" diff --git a/etc/init.d/.gitignore b/etc/init.d/.gitignore index 3f16b08ecc..43a673d553 100644 --- a/etc/init.d/.gitignore +++ b/etc/init.d/.gitignore @@ -1,4 +1,3 @@ -zfs-functions zfs-import zfs-mount zfs-share diff --git a/etc/init.d/Makefile.am b/etc/init.d/Makefile.am index 93432386a2..f93af1fd77 100644 --- a/etc/init.d/Makefile.am +++ b/etc/init.d/Makefile.am @@ -1,44 +1,10 @@ -initdir = $(DEFAULT_INIT_DIR) +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + +EXTRA_DIST += README.md + init_SCRIPTS = zfs-import zfs-mount zfs-share zfs-zed -initcommondir = $(sysconfdir)/zfs -initcommon_SCRIPTS = zfs-functions +SUBSTFILES += $(init_SCRIPTS) -initconfdir = $(DEFAULT_INITCONF_DIR) -initconf_SCRIPTS = zfs - -EXTRA_DIST = \ - $(top_srcdir)/etc/init.d/zfs-functions.in \ - $(top_srcdir)/etc/init.d/zfs-share.in \ - $(top_srcdir)/etc/init.d/zfs-import.in \ - $(top_srcdir)/etc/init.d/zfs-mount.in \ - $(top_srcdir)/etc/init.d/zfs-zed.in \ - $(top_srcdir)/etc/init.d/zfs.in - -$(init_SCRIPTS) $(initconf_SCRIPTS) $(initcommon_SCRIPTS):%:%.in - -(if [ -e /etc/debian_version ]; then \ - NFS_SRV=nfs-kernel-server; \ - else \ - NFS_SRV=nfs; \ - fi; \ - if [ -e /sbin/openrc-run ]; then \ - SHELL=/sbin/openrc-run; \ - else \ - SHELL=/bin/sh; \ - fi; \ - $(SED) -e 's,@bindir\@,$(bindir),g' \ - -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@udevdir\@,$(udevdir),g' \ - -e 's,@udevruledir\@,$(udevruledir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - -e 's,@initconfdir\@,$(initconfdir),g' \ - -e 
's,@initdir\@,$(initdir),g' \ - -e 's,@runstatedir\@,$(runstatedir),g' \ - -e "s,@SHELL\@,$$SHELL,g" \ - -e "s,@NFS_SRV\@,$$NFS_SRV,g" \ - $< >'$@'; \ - [ '$@' = 'zfs-functions' -o '$@' = 'zfs' ] || \ - chmod +x '$@') - -distclean-local:: - -$(RM) $(init_SCRIPTS) $(initcommon_SCRIPTS) $(initconf_SCRIPTS) +SHELLCHECK_SHELL = dash # local variables diff --git a/etc/init.d/README.md b/etc/init.d/README.md index 89edb1da31..c14b01937d 100644 --- a/etc/init.d/README.md +++ b/etc/init.d/README.md @@ -16,7 +16,7 @@ DESCRIPTION SUPPORT If you find that they don't work for your platform, please report this - at the ZFS On Linux issue tracker at https://github.com/zfsonlinux/zfs/issues. + at the OpenZFS issue tracker at https://github.com/openzfs/zfs/issues. Please include: @@ -35,7 +35,7 @@ SUPPORT If you're making your own distribution and you want the scripts to work on that, the biggest problem you'll (probably) have is the part - at the beginning of the "zfs-functions.in" file which sets up the + at the beginning of the "zfs-functions" file which sets up the logging output. INSTALLING INIT SCRIPT LINKS diff --git a/etc/init.d/zfs-import.in b/etc/init.d/zfs-import.in old mode 100644 new mode 100755 index 420d2e8a7a..e4bc7b8339 --- a/etc/init.d/zfs-import.in +++ b/etc/init.d/zfs-import.in @@ -1,4 +1,4 @@ -#!@SHELL@ +#!@DEFAULT_INIT_SHELL@ # # zfs-import This script will import ZFS pools # @@ -26,10 +26,8 @@ # # Released under the 2-clause BSD license. # -# The original script that acted as a template for this script came from -# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a -# licensing stansa) in the commit dated Mar 24, 2011: -# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a +# This script is based on debian/zfsutils.zfs.init from the +# Debian GNU/kFreeBSD zfsutils 8.1-3 package, written by Aurelien Jarno. # Source the common init script . 
@sysconfdir@/zfs/zfs-functions @@ -56,16 +54,13 @@ do_verbatim_import() # Support function to get a list of all pools, separated with ';' find_pools() { - local CMD="$*" local pools - pools=$($CMD 2> /dev/null | \ + pools=$("$@" 2> /dev/null | \ grep -E "pool:|^[a-zA-Z0-9]" | \ sed 's@.*: @@' | \ sort | \ - while read pool; do \ - echo -n "$pool;" - done) + tr '\n' ';') echo "${pools%%;}" # Return without the last ';'. } @@ -77,10 +72,11 @@ do_import_all_visible() local exception dir ZPOOL_IMPORT_PATH RET=0 r=1 # In case not shutdown cleanly. + # shellcheck disable=SC2154 [ -n "$init" ] && rm -f /etc/dfs/sharetab # Just simplify code later on. - if [ -n "$USE_DISK_BY_ID" -a "$USE_DISK_BY_ID" != 'yes' ] + if [ -n "$USE_DISK_BY_ID" ] && [ "$USE_DISK_BY_ID" != 'yes' ] then # It's something, but not 'yes' so it's no good to us. unset USE_DISK_BY_ID @@ -90,7 +86,7 @@ do_import_all_visible() already_imported=$(find_pools "$ZPOOL" list -H -oname) available_pools=$(find_pools "$ZPOOL" import) - # Just in case - seen it happen (that a pool isn't visable/found + # Just in case - seen it happen (that a pool isn't visible/found # with a simple "zpool import" but only when using the "-d" # option or setting ZPOOL_IMPORT_PATH). if [ -d "/dev/disk/by-id" ] @@ -153,7 +149,7 @@ do_import_all_visible() # to something we can use later with the real import(s). We want to # make sure we find all by* dirs, BUT by-vdev should be first (if it # exists). - if [ -n "$USE_DISK_BY_ID" -a -z "$ZPOOL_IMPORT_PATH" ] + if [ -n "$USE_DISK_BY_ID" ] && [ -z "$ZPOOL_IMPORT_PATH" ] then local dirs dirs="$(for dir in $(echo /dev/disk/by-*) @@ -162,7 +158,7 @@ do_import_all_visible() echo "$dir" | grep -q /by-vdev && continue [ ! -d "$dir" ] && continue - echo -n "$dir:" + printf "%s" "$dir:" done | sed 's,:$,,g')" if [ -d "/dev/disk/by-vdev" ] @@ -187,7 +183,7 @@ do_import_all_visible() # Needs to be exported for "zpool" to catch it. 
[ -n "$ZPOOL_IMPORT_PATH" ] && export ZPOOL_IMPORT_PATH - # Mount all availible pools (except those set in ZFS_POOL_EXCEPTIONS. + # Mount all available pools (except those set in ZFS_POOL_EXCEPTIONS. # # If not interactive (run from init - variable init='/sbin/init') # we get ONE line for all pools being imported, with just a dot @@ -219,6 +215,7 @@ do_import_all_visible() # Import by using ZPOOL_IMPORT_PATH (either set above or in # the config file) _or_ with the 'built in' default search # paths. This is the preferred way. + # shellcheck disable=SC2086 "$ZPOOL" import -N ${ZPOOL_IMPORT_OPTS} "$pool" 2> /dev/null r="$?" ; RET=$((RET + r)) if [ "$r" -eq 0 ] @@ -231,7 +228,7 @@ do_import_all_visible() # using the cache file soon and that might succeed. [ ! -f "$ZPOOL_CACHE" ] && zfs_log_end_msg "$RET" - if [ "$r" -gt 0 -a -f "$ZPOOL_CACHE" ] + if [ "$r" -gt 0 ] && [ -f "$ZPOOL_CACHE" ] then # Failed to import without a cache file. Try WITH... if [ -z "$init" ] && check_boolean "$VERBOSE_MOUNT" @@ -240,6 +237,7 @@ do_import_all_visible() zfs_log_progress_msg " using cache file" fi + # shellcheck disable=SC2086 "$ZPOOL" import -c "$ZPOOL_CACHE" -N ${ZPOOL_IMPORT_OPTS} \ "$pool" 2> /dev/null r="$?" ; RET=$((RET + r)) @@ -254,7 +252,7 @@ do_import_all_visible() [ -n "$init" ] && zfs_log_end_msg "$RET" IFS="$OLD_IFS" - [ -n "$already_imported" -a -z "$available_pools" ] && return 0 + [ -n "$already_imported" ] && [ -z "$available_pools" ] && return 0 return "$RET" } diff --git a/etc/init.d/zfs-mount.in b/etc/init.d/zfs-mount.in old mode 100644 new mode 100755 index fa954e0939..000619b671 --- a/etc/init.d/zfs-mount.in +++ b/etc/init.d/zfs-mount.in @@ -1,4 +1,4 @@ -#!@SHELL@ +#!@DEFAULT_INIT_SHELL@ # # zfs-mount This script will mount/umount the zfs filesystems. # @@ -23,10 +23,8 @@ # # Released under the 2-clause BSD license. 
# -# The original script that acted as a template for this script came from -# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a -# licensing stansa) in the commit dated Mar 24, 2011: -# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a +# This script is based on debian/zfsutils.zfs.init from the +# Debian GNU/kFreeBSD zfsutils 8.1-3 package, written by Aurelien Jarno. # Source the common init script . @sysconfdir@/zfs/zfs-functions @@ -34,9 +32,8 @@ # ---------------------------------------------------- chkroot() { - while read line; do - set -- $line - if [ "$2" = "/" ]; then + while read -r _ mp _; do + if [ "$mp" = "/" ]; then return 0 fi done < /proc/self/mounts @@ -65,7 +62,7 @@ do_depend() # Mount all datasets/filesystems do_mount() { - local verbose overlay i mntpt val + local verbose overlay i mntpt check_boolean "$VERBOSE_MOUNT" && verbose=v check_boolean "$DO_OVERLAY_MOUNTS" && overlay=O @@ -83,11 +80,11 @@ do_mount() read_mtab "^/dev/(zd|zvol)" read_fstab "^/dev/(zd|zvol)" - i=0; var=$(eval echo FSTAB_$i) - while [ -n "$(eval echo "$""$var")" ] + i=0; var="FSTAB_0" + while [ -n "$(eval echo "\$$var")" ] do - mntpt=$(eval echo "$""$var") - dev=$(eval echo "$"FSTAB_dev_$i) + mntpt=$(eval echo "\$$var") + dev=$(eval echo "\$FSTAB_dev_$i") if ! in_mtab "$mntpt" && ! is_mounted "$mntpt" && [ -e "$dev" ] then check_boolean "$VERBOSE_MOUNT" && \ @@ -96,15 +93,15 @@ do_mount() fi i=$((i + 1)) - var=$(eval echo FSTAB_$i) + var=$(eval echo "FSTAB_$i") done read_mtab "[[:space:]]zfs[[:space:]]" read_fstab "[[:space:]]zfs[[:space:]]" - i=0; var=$(eval echo FSTAB_$i) - while [ -n "$(eval echo "$""$var")" ] + i=0; var=$(eval echo "FSTAB_$i") + while [ -n "$(eval echo "\$$var")" ] do - mntpt=$(eval echo "$""$var") + mntpt=$(eval echo "\$$var") if ! in_mtab "$mntpt" && ! 
is_mounted "$mntpt" then check_boolean "$VERBOSE_MOUNT" && \ @@ -113,7 +110,7 @@ do_mount() fi i=$((i + 1)) - var=$(eval echo FSTAB_$i) + var=$(eval echo "FSTAB_$i") done check_boolean "$VERBOSE_MOUNT" && zfs_log_end_msg 0 @@ -136,11 +133,11 @@ do_unmount() read_mtab "^/dev/(zd|zvol)" read_fstab "^/dev/(zd|zvol)" - i=0; var=$(eval echo FSTAB_$i) - while [ -n "$(eval echo "$""$var")" ] + i=0; var="FSTAB_0" + while [ -n "$(eval echo "\$$var")" ] do - mntpt=$(eval echo "$""$var") - dev=$(eval echo "$"FSTAB_dev_$i) + mntpt=$(eval echo "\$$var") + dev=$(eval echo "\$FSTAB_dev_$i") if in_mtab "$mntpt" then check_boolean "$VERBOSE_MOUNT" && \ @@ -149,15 +146,15 @@ do_unmount() fi i=$((i + 1)) - var=$(eval echo FSTAB_$i) + var=$(eval echo "FSTAB_$i") done read_mtab "[[:space:]]zfs[[:space:]]" read_fstab "[[:space:]]zfs[[:space:]]" - i=0; var=$(eval echo FSTAB_$i) - while [ -n "$(eval echo "$""$var")" ] + i=0; var="FSTAB_0" + while [ -n "$(eval echo "\$$var")" ] do - mntpt=$(eval echo "$""$var") + mntpt=$(eval echo "\$$var") if in_mtab "$mntpt"; then check_boolean "$VERBOSE_MOUNT" && \ zfs_log_progress_msg "$mntpt " @@ -165,7 +162,7 @@ do_unmount() fi i=$((i + 1)) - var=$(eval echo FSTAB_$i) + var=$(eval echo "FSTAB_$i") done check_boolean "$VERBOSE_MOUNT" && zfs_log_end_msg 0 diff --git a/etc/init.d/zfs-share.in b/etc/init.d/zfs-share.in old mode 100644 new mode 100755 index bdbadf6fef..ef628fe463 --- a/etc/init.d/zfs-share.in +++ b/etc/init.d/zfs-share.in @@ -1,4 +1,4 @@ -#!@SHELL@ +#!@DEFAULT_INIT_SHELL@ # # zfs-share This script will network share zfs filesystems and volumes. 
# @@ -13,8 +13,8 @@ # Required-Stop: $local_fs $network $remote_fs zfs-mount # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 -# Should-Start: iscsi iscsitarget istgt scst @NFS_SRV@ samba samba4 zfs-mount zfs-zed -# Should-Stop: iscsi iscsitarget istgt scst @NFS_SRV@ samba samba4 zfs-mount zfs-zed +# Should-Start: iscsi iscsitarget istgt scst @DEFAULT_INIT_NFS_SERVER@ samba samba4 zfs-mount zfs-zed +# Should-Stop: iscsi iscsitarget istgt scst @DEFAULT_INIT_NFS_SERVER@ samba samba4 zfs-mount zfs-zed # Short-Description: Network share ZFS datasets and volumes. # Description: Run the `zfs share -a` or `zfs unshare -a` commands # for controlling iSCSI, NFS, or CIFS network shares. @@ -22,10 +22,8 @@ # # Released under the 2-clause BSD license. # -# The original script that acted as a template for this script came from -# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a -# licensing stansa) in the commit dated Mar 24, 2011: -# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a +# This script is based on debian/zfsutils.zfs.init from the +# Debian GNU/kFreeBSD zfsutils 8.1-3 package, written by Aurelien Jarno. # Source the common init script . @sysconfdir@/zfs/zfs-functions diff --git a/etc/init.d/zfs-zed.in b/etc/init.d/zfs-zed.in old mode 100644 new mode 100755 index fe3c22594c..e5256cbc62 --- a/etc/init.d/zfs-zed.in +++ b/etc/init.d/zfs-zed.in @@ -1,4 +1,4 @@ -#!@SHELL@ +#!@DEFAULT_INIT_SHELL@ # # zfs-zed # @@ -21,10 +21,8 @@ # # Released under the 2-clause BSD license. # -# The original script that acted as a template for this script came from -# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a -# licensing stansa) in the commit dated Mar 24, 2011: -# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a +# This script is based on debian/zfsutils.zfs.init from the +# Debian GNU/kFreeBSD zfsutils 8.1-3 package, written by Aurelien Jarno. 
# Source the common init script . @sysconfdir@/zfs/zfs-functions @@ -32,6 +30,7 @@ ZED_NAME="zed" ZED_PIDFILE="@runstatedir@/$ZED_NAME.pid" +# shellcheck disable=SC2034 extra_started_commands="reload" # Exit if the package is not installed @@ -57,24 +56,20 @@ do_start() do_stop() { - local pools RET + local pools check_module_loaded "zfs" || exit 0 zfs_action "Stopping ZFS Event Daemon" zfs_daemon_stop \ - "$ZED_PIDFILE" "$ZED" "$ZED_NAME" - if [ "$?" -eq "0" ] + "$ZED_PIDFILE" "$ZED" "$ZED_NAME" || return "$?" + + # Let's see if we have any pools imported + pools=$("$ZPOOL" list -H -oname) + if [ -z "$pools" ] then - # Let's see if we have any pools imported - pools=$("$ZPOOL" list -H -oname) - if [ -z "$pools" ] - then - # No pools imported, it is/should be safe/possible to - # unload modules. - zfs_action "Unloading modules" rmmod zfs zunicode \ - zavl zcommon znvpair zlua spl - return "$?" - fi - else + # No pools imported, it is/should be safe/possible to + # unload modules. + zfs_action "Unloading modules" rmmod zfs zunicode \ + zavl zcommon znvpair zlua spl return "$?" 
fi } diff --git a/etc/modules-load.d/Makefile.am b/etc/modules-load.d/Makefile.am index 58c7acd44e..8a2955767b 100644 --- a/etc/modules-load.d/Makefile.am +++ b/etc/modules-load.d/Makefile.am @@ -1,13 +1,2 @@ -modulesload_DATA = \ +dist_modulesload_DATA = \ zfs.conf - -EXTRA_DIST = \ - $(top_srcdir)/etc/modules-load.d/zfs.conf.in - -$(modulesload_DATA):%:%.in - -$(SED) \ - -e '' \ - $< >'$@' - -distclean-local:: - -$(RM) $(modulesload_DATA) diff --git a/etc/modules-load.d/zfs.conf.in b/etc/modules-load.d/zfs.conf similarity index 100% rename from etc/modules-load.d/zfs.conf.in rename to etc/modules-load.d/zfs.conf diff --git a/etc/sudoers.d/Makefile.am b/etc/sudoers.d/Makefile.am index ca9186a7ea..6f7ac8dbfd 100644 --- a/etc/sudoers.d/Makefile.am +++ b/etc/sudoers.d/Makefile.am @@ -2,4 +2,4 @@ sudoersddir = $(sysconfdir)/sudoers.d sudoersd_DATA = zfs EXTRA_DIST = \ - $(top_srcdir)/etc/sudoers.d/zfs + zfs diff --git a/etc/sudoers.d/zfs b/etc/sudoers.d/zfs index f66ebad216..82a25ba81e 100644 --- a/etc/sudoers.d/zfs +++ b/etc/sudoers.d/zfs @@ -3,6 +3,7 @@ ## to read basic SMART health statistics for a pool. ## ## CAUTION: Any syntax error introduced here will break sudo. 
+## Editing with 'visudo' is recommended: visudo -f /etc/sudoers.d/zfs ## # ALL ALL = (root) NOPASSWD: /usr/sbin/smartctl -a /dev/[hsv]d[a-z0-9]* diff --git a/etc/systemd/Makefile.am b/etc/systemd/Makefile.am index 7b47b93fc1..66232a5ff1 100644 --- a/etc/systemd/Makefile.am +++ b/etc/systemd/Makefile.am @@ -1 +1,4 @@ +include $(top_srcdir)/config/Shellcheck.am + SUBDIRS = system system-generators +SHELLCHECKDIRS = system-generators diff --git a/etc/systemd/system-generators/Makefile.am b/etc/systemd/system-generators/Makefile.am index c730982a51..e5920bf392 100644 --- a/etc/systemd/system-generators/Makefile.am +++ b/etc/systemd/system-generators/Makefile.am @@ -1,15 +1,14 @@ -systemdgenerator_SCRIPTS = \ +include $(top_srcdir)/config/Rules.am + +systemdgenerator_PROGRAMS = \ zfs-mount-generator -EXTRA_DIST = \ - $(top_srcdir)/etc/systemd/system-generators/zfs-mount-generator.in +zfs_mount_generator_SOURCES = \ + zfs-mount-generator.c -$(systemdgenerator_SCRIPTS): %: %.in - -$(SED) -e 's,@bindir\@,$(bindir),g' \ - -e 's,@runstatedir\@,$(runstatedir),g' \ - -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' +zfs_mount_generator_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la -distclean-local:: - -$(RM) $(systemdgenerator_SCRIPTS) +zfs_mount_generator_LDFLAGS = -pthread + +include $(top_srcdir)/config/CppCheck.am diff --git a/etc/systemd/system-generators/zfs-mount-generator.c b/etc/systemd/system-generators/zfs-mount-generator.c new file mode 100644 index 0000000000..b806339deb --- /dev/null +++ b/etc/systemd/system-generators/zfs-mount-generator.c @@ -0,0 +1,1083 @@ +/* + * Copyright (c) 2017 Antonio Russo + * Copyright (c) 2020 InsanePrawn + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + 
* distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STRCMP ((int(*)(const void *, const void *))&strcmp) +#define PID_T_CMP ((int(*)(const void *, const void *))&pid_t_cmp) + +static int +pid_t_cmp(const pid_t *lhs, const pid_t *rhs) +{ + /* + * This is always valid, quoth sys_types.h(7posix): + * > blksize_t, pid_t, and ssize_t shall be signed integer types. 
+ */ + return (*lhs - *rhs); +} + +#define EXIT_ENOMEM() \ + do { \ + fprintf(stderr, PROGNAME "[%d]: " \ + "not enough memory (L%d)!\n", getpid(), __LINE__); \ + _exit(1); \ + } while (0) + + +#define PROGNAME "zfs-mount-generator" +#define FSLIST SYSCONFDIR "/zfs/zfs-list.cache" +#define ZFS SBINDIR "/zfs" + +#define OUTPUT_HEADER \ + "# Automatically generated by " PROGNAME "\n" \ + "\n" + +/* + * Starts like the one in libzfs_util.c but also matches "//" + * and captures until the end, since we actually use it for path extraction + */ +#define URI_REGEX_S "^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):\\/\\/\\(.*\\)$" +static regex_t uri_regex; + +static char *argv0; + +static const char *destdir = "/tmp"; +static int destdir_fd = -1; + +static void *known_pools = NULL; /* tsearch() of C strings */ +static struct { + sem_t noauto_not_on_sem; + + sem_t noauto_names_sem; + size_t noauto_names_len; + size_t noauto_names_max; + char noauto_names[][NAME_MAX]; +} *noauto_files; + + +static char * +systemd_escape(const char *input, const char *prepend, const char *append) +{ + size_t len = strlen(input); + size_t applen = strlen(append); + size_t prelen = strlen(prepend); + char *ret = malloc(4 * len + prelen + applen + 1); + if (!ret) + EXIT_ENOMEM(); + + memcpy(ret, prepend, prelen); + char *out = ret + prelen; + + const char *cur = input; + if (*cur == '.') { + memcpy(out, "\\x2e", 4); + out += 4; + ++cur; + } + for (; *cur; ++cur) { + if (*cur == '/') + *(out++) = '-'; + else if (strchr( + "0123456789" + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + ":_.", *cur)) + *(out++) = *cur; + else { /* exactly 4 chars (\xNN); never sign-extend the byte */ + sprintf(out, "\\x%02x", (unsigned char)*cur); + out += 4; + } + } + + memcpy(out, append, applen + 1); + return (ret); +} + +static void +simplify_path(char *path) +{ + char *out = path; + for (char *cur = path; *cur; ++cur) { + if (*cur == '/') { + while (*(cur + 1) == '/') + ++cur; + *(out++) = '/'; + } else + *(out++) = *cur; + } + + *(out++) = '\0'; +} + +static bool
+strendswith(const char *what, const char *suff) +{ + size_t what_l = strlen(what); + size_t suff_l = strlen(suff); + + return ((what_l >= suff_l) && + (strcmp(what + what_l - suff_l, suff) == 0)); +} + +/* Assumes already-simplified path, doesn't modify input */ +static char * +systemd_escape_path(char *input, const char *prepend, const char *append) +{ + if (strcmp(input, "/") == 0) { + char *ret; + if (asprintf(&ret, "%s-%s", prepend, append) == -1) + EXIT_ENOMEM(); + return (ret); + } else { + /* + * path_is_normalized() (flattened for absolute paths here), + * required for proper escaping + */ + if (strstr(input, "/./") || strstr(input, "/../") || + strendswith(input, "/.") || strendswith(input, "/..")) + return (NULL); + + + if (input[0] == '/') + ++input; + + char *back = &input[strlen(input) - 1]; + bool deslash = *back == '/'; + if (deslash) + *back = '\0'; + + char *ret = systemd_escape(input, prepend, append); + + if (deslash) + *back = '/'; + return (ret); + } +} + +static FILE * +fopenat(int dirfd, const char *pathname, int flags, + const char *stream_mode, mode_t mode) +{ + int fd = openat(dirfd, pathname, flags, mode); + if (fd < 0) + return (NULL); + + return (fdopen(fd, stream_mode)); +} + +static int +line_worker(char *line, const char *cachefile) +{ + char *toktmp; + /* BEGIN CSTYLED */ + const char *dataset = strtok_r(line, "\t", &toktmp); + char *p_mountpoint = strtok_r(NULL, "\t", &toktmp); + const char *p_canmount = strtok_r(NULL, "\t", &toktmp); + const char *p_atime = strtok_r(NULL, "\t", &toktmp); + const char *p_relatime = strtok_r(NULL, "\t", &toktmp); + const char *p_devices = strtok_r(NULL, "\t", &toktmp); + const char *p_exec = strtok_r(NULL, "\t", &toktmp); + const char *p_readonly = strtok_r(NULL, "\t", &toktmp); + const char *p_setuid = strtok_r(NULL, "\t", &toktmp); + const char *p_nbmand = strtok_r(NULL, "\t", &toktmp); + const char *p_encroot = strtok_r(NULL, "\t", &toktmp) ?: "-"; + char *p_keyloc = strtok_r(NULL, "\t", 
&toktmp) ?: strdupa("none"); + const char *p_systemd_requires = strtok_r(NULL, "\t", &toktmp) ?: "-"; + const char *p_systemd_requiresmountsfor = strtok_r(NULL, "\t", &toktmp) ?: "-"; + const char *p_systemd_before = strtok_r(NULL, "\t", &toktmp) ?: "-"; + const char *p_systemd_after = strtok_r(NULL, "\t", &toktmp) ?: "-"; + char *p_systemd_wantedby = strtok_r(NULL, "\t", &toktmp) ?: strdupa("-"); + char *p_systemd_requiredby = strtok_r(NULL, "\t", &toktmp) ?: strdupa("-"); + const char *p_systemd_nofail = strtok_r(NULL, "\t", &toktmp) ?: "-"; + const char *p_systemd_ignore = strtok_r(NULL, "\t", &toktmp) ?: "-"; + /* END CSTYLED */ + + const char *pool = dataset; + if ((toktmp = strchr(pool, '/')) != NULL) + pool = strndupa(pool, toktmp - pool); + + if (p_nbmand == NULL) { + fprintf(stderr, PROGNAME "[%d]: %s: not enough tokens!\n", + getpid(), dataset); + return (1); + } + + strncpy(argv0, dataset, strlen(argv0)); + + /* Minimal pre-requisites to mount a ZFS dataset */ + const char *after = "zfs-import.target"; + const char *wants = "zfs-import.target"; + const char *bindsto = NULL; + char *wantedby = NULL; + char *requiredby = NULL; + bool noauto = false; + bool wantedby_append = true; + + /* + * zfs-import.target is not needed if the pool is already imported. 
+ * This avoids a dependency loop on root-on-ZFS systems: + * systemd-random-seed.service After (via RequiresMountsFor) + * var-lib.mount After + * zfs-import.target After + * zfs-import-{cache,scan}.service After + * cryptsetup.service After + * systemd-random-seed.service + */ + if (tfind(pool, &known_pools, STRCMP)) { + after = ""; + wants = ""; + } + + if (strcmp(p_systemd_after, "-") == 0) + p_systemd_after = NULL; + if (strcmp(p_systemd_before, "-") == 0) + p_systemd_before = NULL; + if (strcmp(p_systemd_requires, "-") == 0) + p_systemd_requires = NULL; + if (strcmp(p_systemd_requiresmountsfor, "-") == 0) + p_systemd_requiresmountsfor = NULL; + + + if (strcmp(p_encroot, "-") != 0) { + char *keyloadunit = + systemd_escape(p_encroot, "zfs-load-key@", ".service"); + + if (strcmp(dataset, p_encroot) == 0) { + const char *keymountdep = NULL; + bool is_prompt = false; + + regmatch_t uri_matches[3]; + if (regexec(&uri_regex, p_keyloc, + sizeof (uri_matches) / sizeof (*uri_matches), + uri_matches, 0) == 0) { + p_keyloc[uri_matches[2].rm_eo] = '\0'; + const char *path = + &p_keyloc[uri_matches[2].rm_so]; + + /* + * Assumes all URI keylocations need + * the mount for their path; + * http://, for example, wouldn't + * (but it'd need network-online.target et al.) 
+ */ + keymountdep = path; + } else { + if (strcmp(p_keyloc, "prompt") != 0) + fprintf(stderr, PROGNAME "[%d]: %s: " + "unknown non-URI keylocation=%s\n", + getpid(), dataset, p_keyloc); + + is_prompt = true; + } + + + /* Generate the key-load .service unit */ + FILE *keyloadunit_f = fopenat(destdir_fd, keyloadunit, + O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, "w", + 0644); + if (!keyloadunit_f) { + fprintf(stderr, PROGNAME "[%d]: %s: " + "couldn't open %s under %s: %s\n", + getpid(), dataset, keyloadunit, destdir, + strerror(errno)); + return (1); + } + + fprintf(keyloadunit_f, + OUTPUT_HEADER + "[Unit]\n" + "Description=Load ZFS key for %s\n" + "SourcePath=" FSLIST "/%s\n" + "Documentation=man:zfs-mount-generator(8)\n" + "DefaultDependencies=no\n" + "Wants=%s\n" + "After=%s\n", + dataset, cachefile, wants, after); + + if (p_systemd_requires) + fprintf(keyloadunit_f, + "Requires=%s\n", p_systemd_requires); + + if (p_systemd_requiresmountsfor || keymountdep) { + fprintf(keyloadunit_f, "RequiresMountsFor="); + if (p_systemd_requiresmountsfor) + fprintf(keyloadunit_f, + "%s ", p_systemd_requiresmountsfor); + if (keymountdep) + fprintf(keyloadunit_f, + "'%s'", keymountdep); + fprintf(keyloadunit_f, "\n"); + } + + /* BEGIN CSTYLED */ + fprintf(keyloadunit_f, + "\n" + "[Service]\n" + "Type=oneshot\n" + "RemainAfterExit=yes\n" + "# This avoids a dependency loop involving systemd-journald.socket if this\n" + "# dataset is a parent of the root filesystem.\n" + "StandardOutput=null\n" + "StandardError=null\n" + "ExecStart=/bin/sh -euc '" + "[ \"$$(" ZFS " get -H -o value keystatus \"%s\")\" = \"unavailable\" ] || exit 0;", + dataset); + if (is_prompt) + fprintf(keyloadunit_f, + "for i in 1 2 3; do " + "systemd-ask-password --id=\"zfs:%s\" \"Enter passphrase for %s:\" |" + "" ZFS " load-key \"%s\" && exit 0;" + "done;" + "exit 1", + dataset, dataset, dataset); + else + fprintf(keyloadunit_f, + "exec " ZFS " load-key \"%s\"", + dataset); + + fprintf(keyloadunit_f, + "'\n" + 
"ExecStop=/bin/sh -euc '" + "[ \"$$(" ZFS " get -H -o value keystatus \"%s\")\" = \"available\" ] || exit 0;" + "exec " ZFS " unload-key \"%s\"" + "'\n", + dataset, dataset); + /* END CSTYLED */ + + (void) fclose(keyloadunit_f); + } + + /* Update dependencies for the mount file to want this */ + bindsto = keyloadunit; + if (after[0] == '\0') + after = keyloadunit; + else if (asprintf(&toktmp, "%s %s", after, keyloadunit) != -1) + after = toktmp; + else + EXIT_ENOMEM(); + } + + + /* Skip generation of the mount unit if org.openzfs.systemd:ignore=on */ + if (strcmp(p_systemd_ignore, "-") == 0 || + strcmp(p_systemd_ignore, "off") == 0) { + /* ok */ + } else if (strcmp(p_systemd_ignore, "on") == 0) + return (0); + else { + fprintf(stderr, PROGNAME "[%d]: %s: " + "invalid org.openzfs.systemd:ignore=%s\n", + getpid(), dataset, p_systemd_ignore); + return (1); + } + + /* Check for canmount */ + if (strcmp(p_canmount, "on") == 0) { + /* ok */ + } else if (strcmp(p_canmount, "noauto") == 0) + noauto = true; + else if (strcmp(p_canmount, "off") == 0) + return (0); + else { + fprintf(stderr, PROGNAME "[%d]: %s: invalid canmount=%s\n", + getpid(), dataset, p_canmount); + return (1); + } + + /* Check for legacy and blank mountpoints */ + if (strcmp(p_mountpoint, "legacy") == 0 || + strcmp(p_mountpoint, "none") == 0) + return (0); + else if (p_mountpoint[0] != '/') { + fprintf(stderr, PROGNAME "[%d]: %s: invalid mountpoint=%s\n", + getpid(), dataset, p_mountpoint); + return (1); + } + + /* Escape the mountpoint per systemd policy */ + simplify_path(p_mountpoint); + const char *mountfile = systemd_escape_path(p_mountpoint, "", ".mount"); + if (mountfile == NULL) { + fprintf(stderr, + PROGNAME "[%d]: %s: abnormal simplified mountpoint: %s\n", + getpid(), dataset, p_mountpoint); + return (1); + } + + + /* + * Parse options, cf. lib/libzfs/libzfs_mount.c:zfs_add_options + * + * The longest string achievable here is + * ",atime,strictatime,nodev,noexec,rw,nosuid,nomand". 
+ */ + char opts[64] = ""; + + /* atime */ + if (strcmp(p_atime, "on") == 0) { + /* relatime */ + if (strcmp(p_relatime, "on") == 0) + strcat(opts, ",atime,relatime"); + else if (strcmp(p_relatime, "off") == 0) + strcat(opts, ",atime,strictatime"); + else + fprintf(stderr, + PROGNAME "[%d]: %s: invalid relatime=%s\n", + getpid(), dataset, p_relatime); + } else if (strcmp(p_atime, "off") == 0) { + strcat(opts, ",noatime"); + } else + fprintf(stderr, PROGNAME "[%d]: %s: invalid atime=%s\n", + getpid(), dataset, p_atime); + + /* devices */ + if (strcmp(p_devices, "on") == 0) + strcat(opts, ",dev"); + else if (strcmp(p_devices, "off") == 0) + strcat(opts, ",nodev"); + else + fprintf(stderr, PROGNAME "[%d]: %s: invalid devices=%s\n", + getpid(), dataset, p_devices); + + /* exec */ + if (strcmp(p_exec, "on") == 0) + strcat(opts, ",exec"); + else if (strcmp(p_exec, "off") == 0) + strcat(opts, ",noexec"); + else + fprintf(stderr, PROGNAME "[%d]: %s: invalid exec=%s\n", + getpid(), dataset, p_exec); + + /* readonly */ + if (strcmp(p_readonly, "on") == 0) + strcat(opts, ",ro"); + else if (strcmp(p_readonly, "off") == 0) + strcat(opts, ",rw"); + else + fprintf(stderr, PROGNAME "[%d]: %s: invalid readonly=%s\n", + getpid(), dataset, p_readonly); + + /* setuid */ + if (strcmp(p_setuid, "on") == 0) + strcat(opts, ",suid"); + else if (strcmp(p_setuid, "off") == 0) + strcat(opts, ",nosuid"); + else + fprintf(stderr, PROGNAME "[%d]: %s: invalid setuid=%s\n", + getpid(), dataset, p_setuid); + + /* nbmand */ + if (strcmp(p_nbmand, "on") == 0) + strcat(opts, ",mand"); + else if (strcmp(p_nbmand, "off") == 0) + strcat(opts, ",nomand"); + else + fprintf(stderr, PROGNAME "[%d]: %s: invalid nbmand=%s\n", + getpid(), dataset, p_nbmand); + + if (strcmp(p_systemd_wantedby, "-") != 0) { + noauto = true; + + if (strcmp(p_systemd_wantedby, "none") != 0) + wantedby = p_systemd_wantedby; + } + + if (strcmp(p_systemd_requiredby, "-") != 0) { + noauto = true; + + if (strcmp(p_systemd_requiredby,
"none") != 0) + requiredby = p_systemd_requiredby; + } + + /* + * For datasets with canmount=on, a dependency is created for + * local-fs.target by default. To avoid regressions, this dependency + * is reduced to "wants" rather than "requires" when nofail!=off. + * **THIS MAY CHANGE** + * noauto=on disables this behavior completely. + */ + if (!noauto) { + if (strcmp(p_systemd_nofail, "off") == 0) + requiredby = strdupa("local-fs.target"); + else { + wantedby = strdupa("local-fs.target"); + wantedby_append = strcmp(p_systemd_nofail, "on") != 0; + } + } + + /* + * Handle existing files: + * 1. We never overwrite existing files, although we may delete + * files if we're sure they were created by us. (see 5.) + * 2. We handle files differently based on canmount. + * Units with canmount=on always have precedence over noauto. + * This is enforced by the noauto_not_on_sem semaphore, + * which is only unlocked when the last canmount=on process exits. + * It is important to use p_canmount and not noauto here, + * since we categorise by canmount while other properties, + * e.g. org.openzfs.systemd:wanted-by, also modify noauto. + * 3. If no unit file exists for a noauto dataset, we create one. + * Additionally, we use noauto_files to track the unit file names + * (which are the systemd-escaped mountpoints) of all (exclusively) + * noauto datasets that had a file created. + * 4. If the file to be created is found in the tracking array, + * we do NOT create it. + * 5. If a file exists for a noauto dataset, + * we check whether the file name is in the array. + * If it is, we have multiple noauto datasets for the same + * mountpoint. In such cases, we remove the file for safety. + * We leave the file name in the tracking array to avoid + * further noauto datasets creating a file for this path again. + */ + + { + sem_t *our_sem = (strcmp(p_canmount, "on") == 0) ? 
+ &noauto_files->noauto_names_sem : + &noauto_files->noauto_not_on_sem; + while (sem_wait(our_sem) == -1 && errno == EINTR) + ; + } + + struct stat stbuf; + bool already_exists = fstatat(destdir_fd, mountfile, &stbuf, 0) == 0; + + bool is_known = false; + for (size_t i = 0; i < noauto_files->noauto_names_len; ++i) { + if (strncmp( + noauto_files->noauto_names[i], mountfile, NAME_MAX) == 0) { + is_known = true; + break; + } + } + + if (already_exists) { + if (is_known) { + /* If it's in $noauto_files, we must be noauto too */ + + /* See 5; errno stays 0 unless the unlink fails */ + errno = 0; + (void) unlinkat(destdir_fd, mountfile, 0); + + /* See 2 */ + fprintf(stderr, PROGNAME "[%d]: %s: " + "removing duplicate noauto unit %s%s%s\n", + getpid(), dataset, mountfile, + errno ? " failed: " : "", + errno ? strerror(errno) : ""); + } else { + /* Don't log for canmount=noauto */ + if (strcmp(p_canmount, "on") == 0) + fprintf(stderr, PROGNAME "[%d]: %s: " + "%s already exists. Skipping.\n", + getpid(), dataset, mountfile); + } + + /* File exists: skip current dataset */ + if (strcmp(p_canmount, "on") == 0) + sem_post(&noauto_files->noauto_names_sem); + return (0); + } else { + if (is_known) { + /* See 4 */ + if (strcmp(p_canmount, "on") == 0) + sem_post(&noauto_files->noauto_names_sem); + return (0); + } else if (strcmp(p_canmount, "noauto") == 0) { + if (noauto_files->noauto_names_len == + noauto_files->noauto_names_max) + fprintf(stderr, PROGNAME "[%d]: %s: " + "noauto dataset limit (%zu) reached! " + "Not tracking %s.
Please report this to " + "https://github.com/openzfs/zfs\n", + getpid(), dataset, + noauto_files->noauto_names_max, mountfile); + else { + strncpy(noauto_files->noauto_names[ + noauto_files->noauto_names_len], + mountfile, NAME_MAX); + ++noauto_files->noauto_names_len; + } + } + } + + + FILE *mountfile_f = fopenat(destdir_fd, mountfile, + O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, "w", 0644); + if (strcmp(p_canmount, "on") == 0) + sem_post(&noauto_files->noauto_names_sem); + if (!mountfile_f) { + fprintf(stderr, + PROGNAME "[%d]: %s: couldn't open %s under %s: %s\n", + getpid(), dataset, mountfile, destdir, strerror(errno)); + return (1); + } + + fprintf(mountfile_f, + OUTPUT_HEADER + "[Unit]\n" + "SourcePath=" FSLIST "/%s\n" + "Documentation=man:zfs-mount-generator(8)\n" + "\n" + "Before=", + cachefile); + + if (p_systemd_before) + fprintf(mountfile_f, "%s ", p_systemd_before); + fprintf(mountfile_f, "zfs-mount.service"); /* Ensures we don't race */ + if (requiredby) + fprintf(mountfile_f, " %s", requiredby); + if (wantedby && wantedby_append) + fprintf(mountfile_f, " %s", wantedby); + + fprintf(mountfile_f, + "\n" + "After="); + if (p_systemd_after) + fprintf(mountfile_f, "%s ", p_systemd_after); + fprintf(mountfile_f, "%s\n", after); + + fprintf(mountfile_f, "Wants=%s\n", wants); + + if (bindsto) + fprintf(mountfile_f, "BindsTo=%s\n", bindsto); + if (p_systemd_requires) + fprintf(mountfile_f, "Requires=%s\n", p_systemd_requires); + if (p_systemd_requiresmountsfor) + fprintf(mountfile_f, + "RequiresMountsFor=%s\n", p_systemd_requiresmountsfor); + + fprintf(mountfile_f, + "\n" + "[Mount]\n" + "Where=%s\n" + "What=%s\n" + "Type=zfs\n" + "Options=defaults%s,zfsutil\n", + p_mountpoint, dataset, opts); + + (void) fclose(mountfile_f); + + if (!requiredby && !wantedby) + return (0); + + /* Finally, create the appropriate dependencies */ + char *linktgt; + if (asprintf(&linktgt, "../%s", mountfile) == -1) + EXIT_ENOMEM(); + + char *dependencies[][2] = { + {"wants", 
wantedby}, + {"requires", requiredby}, + {} + }; + for (__typeof__(&*dependencies) dep = &*dependencies; **dep; ++dep) { + if (!(*dep)[1]) + continue; + + for (char *reqby = strtok_r((*dep)[1], " ", &toktmp); + reqby; + reqby = strtok_r(NULL, " ", &toktmp)) { + char *depdir; + if (asprintf(&depdir, "%s.%s", reqby, (*dep)[0]) == -1) + EXIT_ENOMEM(); + + (void) mkdirat(destdir_fd, depdir, 0755); + int depdir_fd = openat(destdir_fd, depdir, + O_PATH | O_DIRECTORY | O_CLOEXEC); + if (depdir_fd < 0) { + fprintf(stderr, PROGNAME "[%d]: %s: " + "couldn't open %s under %s: %s\n", + getpid(), dataset, depdir, destdir, + strerror(errno)); + free(depdir); + continue; + } + + if (symlinkat(linktgt, depdir_fd, mountfile) == -1) + fprintf(stderr, PROGNAME "[%d]: %s: " + "couldn't symlink at " + "%s under %s under %s: %s\n", + getpid(), dataset, mountfile, + depdir, destdir, strerror(errno)); + + (void) close(depdir_fd); + free(depdir); + } + } + + return (0); +}
+
+
+/*
+ * zpool_iter() callback: remember the name of one imported pool.
+ *
+ * A private copy of the pool's name is inserted into the known_pools
+ * search tree and the pool handle is closed before returning.
+ *
+ * Returns 0 on success, or ENOMEM if the name could not be duplicated or
+ * inserted into the tree (the copy, if any, is freed in that case).
+ */
+static int
+pool_enumerator(zpool_handle_t *pool, void *data __attribute__((unused)))
+{
+	int ret = 0;
+
+	/*
+	 * Pools are guaranteed-unique by the kernel,
+	 * no risk of leaking dupes here
+	 */
+	char *name = strdup(zpool_get_name(pool));
+	if (!name || !tsearch(name, &known_pools, STRCMP)) {
+		free(name);
+		ret = ENOMEM;
+	}
+
+	zpool_close(pool);
+	return (ret);
+}
+
+/*
+ * Entry point.
+ *
+ * Accepts either no arguments (the default destination directory is used)
+ * or one or three arguments, the first of which is the destination
+ * directory; any other count is reported and the process exits with 1.
+ *
+ * Outline:
+ *   1. Point stderr at /dev/kmsg when it can be opened.
+ *   2. Open the destination directory and the FSLIST cache directory;
+ *      a cache directory that cannot be opened ends the run with status 0
+ *      (logged unless the error is ENOENT).
+ *   3. Record the imported pools via libzfs (failures are logged and
+ *      ignored), compile the URI regex, and set up the shared-memory
+ *      region holding the noauto bookkeeping and its two semaphores.
+ *   4. Fork one child per line of every cache file; each child runs
+ *      line_worker() on that line.  The parent only looks at the third
+ *      tab-separated field (canmount) to sort children into "on" and
+ *      "not on".
+ *   5. Reap all children.  Once every canmount=on child has exited, the
+ *      noauto_not_on_sem semaphore is posted once per remaining child.
+ *   6. With debugging enabled (ZFS_DEBUG, or "debug" on the kernel
+ *      command line), print resource usage and wall-clock timings.
+ *
+ * The exit status is the bitwise OR of every child's exit status and
+ * terminating signal number.
+ */
+int
+main(int argc, char **argv)
+{
+	struct timespec time_init = {};
+	clock_gettime(CLOCK_MONOTONIC_RAW, &time_init);
+
+	{
+		int kmfd = open("/dev/kmsg", O_WRONLY | O_CLOEXEC);
+		if (kmfd >= 0) {
+			(void) dup2(kmfd, STDERR_FILENO);
+			(void) close(kmfd);
+		}
+	}
+
+	uint8_t debug = 0;
+
+	argv0 = argv[0];
+	switch (argc) {
+	case 1:
+		/* Use default */
+		break;
+	case 2:
+	case 4:
+		destdir = argv[1];
+		break;
+	default:
+		fprintf(stderr,
+		    PROGNAME "[%d]: wrong argument count: %d\n",
+		    getpid(), argc - 1);
+		_exit(1);
+	}
+
+	{
+		destdir_fd = open(destdir, O_PATH | O_DIRECTORY | O_CLOEXEC);
+		if (destdir_fd < 0) {
+			fprintf(stderr, PROGNAME "[%d]: "
+			    "can't open destination directory %s: %s\n",
+			    getpid(), destdir, strerror(errno));
+			_exit(1);
+		}
+	}
+
+	DIR *fslist_dir = opendir(FSLIST);
+	if (!fslist_dir) {
+		if (errno != ENOENT)
+			fprintf(stderr,
+			    PROGNAME "[%d]: couldn't open " FSLIST ": %s\n",
+			    getpid(), strerror(errno));
+		_exit(0);
+	}
+
+	{
+		libzfs_handle_t *libzfs = libzfs_init();
+		if (libzfs) {
+			if (zpool_iter(libzfs, pool_enumerator, NULL) != 0)
+				fprintf(stderr, PROGNAME "[%d]: "
+				    "error listing pools, ignoring\n",
+				    getpid());
+			libzfs_fini(libzfs);
+		} else
+			fprintf(stderr, PROGNAME "[%d]: "
+			    "couldn't start libzfs, ignoring\n",
+			    getpid());
+	}
+
+	{
+		int regerr = regcomp(&uri_regex, URI_REGEX_S, 0);
+		if (regerr != 0) {
+			fprintf(stderr,
+			    PROGNAME "[%d]: invalid regex: %d\n",
+			    getpid(), regerr);
+			_exit(1);
+		}
+	}
+
+	{
+		/*
+		 * We could just get a gigabyte here and Not Care,
+		 * but if vm.overcommit_memory=2, then MAP_NORESERVE is ignored
+		 * and we'd try (and likely fail) to rip it out of swap
+		 */
+		noauto_files = mmap(NULL, 4 * 1024 * 1024,
+		    PROT_READ | PROT_WRITE,
+		    MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+		if (noauto_files == MAP_FAILED) {
+			fprintf(stderr,
+			    PROGNAME "[%d]: couldn't allocate IPC region: %s\n",
+			    getpid(), strerror(errno));
+			_exit(1);
+		}
+
+		/* Both semaphores are process-shared; see step 5 above */
+		sem_init(&noauto_files->noauto_not_on_sem, true, 0);
+		sem_init(&noauto_files->noauto_names_sem, true, 1);
+		noauto_files->noauto_names_len = 0;
+		/* Works out to 16447ish, *well* enough */
+		noauto_files->noauto_names_max =
+		    (4 * 1024 * 1024 - sizeof (*noauto_files)) / NAME_MAX;
+	}
+
+	char *line = NULL;
+	size_t linelen = 0;
+	struct timespec time_start = {};
+	{
+		const char *dbgenv = getenv("ZFS_DEBUG");
+		if (dbgenv)
+			debug = atoi(dbgenv);
+		else {
+			FILE *cmdline = fopen("/proc/cmdline", "re");
+			if (cmdline != NULL) {
+				if (getline(&line, &linelen, cmdline) >= 0)
+					debug = strstr(line, "debug") ? 2 : 0;
+				(void) fclose(cmdline);
+			}
+		}
+
+		if (debug && !isatty(STDOUT_FILENO))
+			dup2(STDERR_FILENO, STDOUT_FILENO);
+	}
+
+	size_t forked_canmount_on = 0;
+	size_t forked_canmount_not_on = 0;
+	size_t canmount_on_pids_len = 128;
+	pid_t *canmount_on_pids =
+	    malloc(canmount_on_pids_len * sizeof (*canmount_on_pids));
+	if (canmount_on_pids == NULL)
+		canmount_on_pids_len = 0;
+
+	if (debug)
+		clock_gettime(CLOCK_MONOTONIC_RAW, &time_start);
+
+	ssize_t read;
+	pid_t pid;
+	struct dirent *cachent;
+	while ((cachent = readdir(fslist_dir)) != NULL) {
+		if (strcmp(cachent->d_name, ".") == 0 ||
+		    strcmp(cachent->d_name, "..") == 0)
+			continue;
+
+		FILE *cachefile = fopenat(dirfd(fslist_dir), cachent->d_name,
+		    O_RDONLY | O_CLOEXEC, "r", 0);
+		if (!cachefile) {
+			fprintf(stderr, PROGNAME "[%d]: "
+			    "couldn't open %s under " FSLIST ": %s\n",
+			    getpid(), cachent->d_name, strerror(errno));
+			continue;
+		}
+
+		while ((read = getline(&line, &linelen, cachefile)) >= 0) {
+			/*
+			 * Strip the trailing newline, but only if there is
+			 * one: the last line of a cache file may lack it, and
+			 * chopping a byte blindly would truncate its last
+			 * field.
+			 */
+			if (read > 0 && line[read - 1] == '\n')
+				line[read - 1] = '\0';
+
+			switch (pid = fork()) {
+			case -1:
+				fprintf(stderr,
+				    PROGNAME "[%d]: couldn't fork for %s: %s\n",
+				    getpid(), line, strerror(errno));
+				break;
+			case 0: /* child */
+				_exit(line_worker(line, cachent->d_name));
+			default: { /* parent */
+				/* Fields: dataset, (skipped), canmount, ... */
+				char *tmp;
+				char *dset = strtok_r(line, "\t", &tmp);
+				strtok_r(NULL, "\t", &tmp);
+				char *canmount = strtok_r(NULL, "\t", &tmp);
+				/*
+				 * Exact match, like the
+				 * strcmp(p_canmount, "on") checks made
+				 * elsewhere in this file, so parent and
+				 * child always classify a line the same way.
+				 */
+				bool canmount_on =
+				    canmount && strcmp(canmount, "on") == 0;
+
+				if (debug >= 2)
+					printf(PROGNAME ": forked %d, "
+					    "canmount_on=%d, dataset=%s\n",
+					    (int)pid, canmount_on, dset);
+
+				/* Grow the pid list when it is full */
+				if (canmount_on &&
+				    forked_canmount_on ==
+				    canmount_on_pids_len) {
+					size_t new_len =
+					    (canmount_on_pids_len ?: 16) * 2;
+					void *new_pidlist =
+					    realloc(canmount_on_pids,
+					    new_len *
+					    sizeof (*canmount_on_pids));
+					if (!new_pidlist) {
+						fprintf(stderr,
+						    PROGNAME "[%d]: "
+						    "out of memory! "
+						    "Mount ordering may be "
+						    "affected.\n", getpid());
+						/* Child stays untracked */
+						continue;
+					}
+
+					canmount_on_pids = new_pidlist;
+					canmount_on_pids_len = new_len;
+				}
+
+				if (canmount_on) {
+					canmount_on_pids[forked_canmount_on] =
+					    pid;
+					++forked_canmount_on;
+				} else
+					++forked_canmount_not_on;
+				break;
+			}
+			}
+		}
+
+		(void) fclose(cachefile);
+	}
+	free(line);
+
+	if (forked_canmount_on == 0) {
+		/* No canmount=on processes to finish, so don't deadlock here */
+		for (size_t i = 0; i < forked_canmount_not_on; ++i)
+			sem_post(&noauto_files->noauto_not_on_sem);
+	} else {
+		/* Likely a no-op, since we got these from a narrow fork loop */
+		qsort(canmount_on_pids, forked_canmount_on,
+		    sizeof (*canmount_on_pids), PID_T_CMP);
+	}
+
+	int status, ret = 0;
+	struct rusage usage;
+	/* The sorted list keeps its full length while the counter drops */
+	size_t forked_canmount_on_max = forked_canmount_on;
+	while ((pid = wait4(-1, &status, 0, &usage)) != -1) {
+		ret |= WEXITSTATUS(status) | WTERMSIG(status);
+
+		if (forked_canmount_on != 0) {
+			if (bsearch(&pid, canmount_on_pids,
+			    forked_canmount_on_max, sizeof (*canmount_on_pids),
+			    PID_T_CMP))
+				--forked_canmount_on;
+
+			if (forked_canmount_on == 0) {
+				/*
+				 * All canmount=on processes have finished,
+				 * let all the lower-priority ones finish now
+				 */
+				for (size_t i = 0;
+				    i < forked_canmount_not_on; ++i)
+					sem_post(
+					    &noauto_files->noauto_not_on_sem);
+			}
+		}
+
+		if (debug >= 2)
+			printf(PROGNAME ": %d done, user=%llu.%06us, "
+			    "system=%llu.%06us, maxrss=%ldB, ex=0x%x\n",
+			    (int)pid,
+			    (unsigned long long) usage.ru_utime.tv_sec,
+			    (unsigned int) usage.ru_utime.tv_usec,
+			    (unsigned long long) usage.ru_stime.tv_sec,
+			    (unsigned int) usage.ru_stime.tv_usec,
+			    usage.ru_maxrss * 1024, status);
+	}
+
+	if (debug) {
+		struct timespec time_end = {};
+		clock_gettime(CLOCK_MONOTONIC_RAW, &time_end);
+
+		getrusage(RUSAGE_SELF, &usage);
+		printf(
+		    "\n"
+		    PROGNAME ": self : "
+		    "user=%llu.%06us, system=%llu.%06us, maxrss=%ldB\n",
+		    (unsigned long long) usage.ru_utime.tv_sec,
+		    (unsigned int) usage.ru_utime.tv_usec,
+		    (unsigned long long) usage.ru_stime.tv_sec,
+		    (unsigned int) usage.ru_stime.tv_usec,
+		    usage.ru_maxrss * 1024);
+
+		getrusage(RUSAGE_CHILDREN, &usage);
+		printf(PROGNAME ": children: "
+		    "user=%llu.%06us, system=%llu.%06us, maxrss=%ldB\n",
+		    (unsigned long long) usage.ru_utime.tv_sec,
+		    (unsigned int) usage.ru_utime.tv_usec,
+		    (unsigned long long) usage.ru_stime.tv_sec,
+		    (unsigned int) usage.ru_stime.tv_usec,
+		    usage.ru_maxrss * 1024);
+
+		/* time_end := time_end - time_start (the "real" phase) */
+		if (time_start.tv_nsec > time_end.tv_nsec) {
+			time_end.tv_nsec =
+			    1000000000 + time_end.tv_nsec - time_start.tv_nsec;
+			time_end.tv_sec -= 1;
+		} else
+			time_end.tv_nsec -= time_start.tv_nsec;
+		time_end.tv_sec -= time_start.tv_sec;
+
+		/* time_start := time_start - time_init (the "init" phase) */
+		if (time_init.tv_nsec > time_start.tv_nsec) {
+			time_start.tv_nsec =
+			    1000000000 + time_start.tv_nsec - time_init.tv_nsec;
+			time_start.tv_sec -= 1;
+		} else
+			time_start.tv_nsec -= time_init.tv_nsec;
+		time_start.tv_sec -= time_init.tv_sec;
+
+		/* time_init := init + real, carrying nanoseconds over */
+		time_init.tv_nsec = time_start.tv_nsec + time_end.tv_nsec;
+		time_init.tv_sec =
+		    time_start.tv_sec + time_end.tv_sec +
+		    time_init.tv_nsec / 1000000000;
+		time_init.tv_nsec %= 1000000000;
+
+		printf(PROGNAME ": wall : "
+		    "total=%llu.%09llus = "
+		    "init=%llu.%09llus + real=%llu.%09llus\n",
+		    (unsigned long long) time_init.tv_sec,
+		    (unsigned long long) time_init.tv_nsec,
+		    (unsigned long long) time_start.tv_sec,
+		    (unsigned long long) time_start.tv_nsec,
+		    (unsigned long long) time_end.tv_sec,
+		    (unsigned long long) time_end.tv_nsec);
+	}
+
+	_exit(ret);
+}
diff --git a/etc/systemd/system-generators/zfs-mount-generator.in b/etc/systemd/system-generators/zfs-mount-generator.in deleted file mode 100755 index 5428eb25d9..0000000000 --- a/etc/systemd/system-generators/zfs-mount-generator.in +++ /dev/null @@ -1,205 +0,0 @@ -#!/bin/sh - -# zfs-mount-generator - generates systemd mount units for zfs -# Copyright (c) 2017 Antonio Russo -# -# Permission is hereby granted, free of charge, to any person obtaining -# a copy of this software and associated
documentation files (the -# "Software"), to deal in the Software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to -# the following conditions: -# -# The above copyright notice and this permission notice shall be -# included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -set -e - -FSLIST="@sysconfdir@/zfs/zfs-list.cache" - -[ -d "${FSLIST}" ] || exit 0 - -do_fail() { - printf 'zfs-mount-generator: %s\n' "$*" > /dev/kmsg - exit 1 -} - -# see systemd.generator -if [ $# -eq 0 ] ; then - dest_norm="/tmp" -elif [ $# -eq 3 ] ; then - dest_norm="${1}" -else - do_fail "zero or three arguments required" -fi - -# For ZFSs marked "auto", a dependency is created for local-fs.target. To -# avoid regressions, this dependency is reduced to "wants" rather than -# "requires". **THIS MAY CHANGE** -req_dir="${dest_norm}/local-fs.target.wants/" -mkdir -p "${req_dir}" - -# All needed information about each ZFS is available from -# zfs list -H -t filesystem -o -# cached in $FSLIST, and each line is processed by the following function: -# See the list below for the properties and their order - -process_line() { - - # zfs list -H -o name,... 
- # fields are tab separated - IFS="$(printf '\t')" - # protect against special characters in, e.g., mountpoints - set -f - set -- $1 - dataset="${1}" - p_mountpoint="${2}" - p_canmount="${3}" - p_atime="${4}" - p_relatime="${5}" - p_devices="${6}" - p_exec="${7}" - p_readonly="${8}" - p_setuid="${9}" - p_nbmand="${10}" - - # Check for canmount=off . - if [ "${p_canmount}" = "off" ] ; then - return - elif [ "${p_canmount}" = "noauto" ] ; then - # Don't let a noauto marked mountpoint block an "auto" market mountpoint - return - elif [ "${p_canmount}" = "on" ] ; then - : # This is OK - else - do_fail "invalid canmount" - fi - - # Check for legacy and blank mountpoints. - if [ "${p_mountpoint}" = "legacy" ] ; then - return - elif [ "${p_mountpoint}" = "none" ] ; then - return - elif [ "${p_mountpoint%"${p_mountpoint#?}"}" != "/" ] ; then - do_fail "invalid mountpoint $*" - fi - - # Escape the mountpoint per systemd policy. - mountfile="$(systemd-escape "${p_mountpoint#?}").mount" - - # Parse options - # see lib/libzfs/libzfs_mount.c:zfs_add_options - opts="" - - # atime - if [ "${p_atime}" = on ] ; then - # relatime - if [ "${p_relatime}" = on ] ; then - opts="${opts},atime,relatime" - elif [ "${p_relatime}" = off ] ; then - opts="${opts},atime,strictatime" - else - printf 'zfs-mount-generator: (%s) invalid relatime\n' \ - "${dataset}" >/dev/kmsg - fi - elif [ "${p_atime}" = off ] ; then - opts="${opts},noatime" - else - printf 'zfs-mount-generator: (%s) invalid atime\n' \ - "${dataset}" >/dev/kmsg - fi - - # devices - if [ "${p_devices}" = on ] ; then - opts="${opts},dev" - elif [ "${p_devices}" = off ] ; then - opts="${opts},nodev" - else - printf 'zfs-mount-generator: (%s) invalid devices\n' \ - "${dataset}" >/dev/kmsg - fi - - # exec - if [ "${p_exec}" = on ] ; then - opts="${opts},exec" - elif [ "${p_exec}" = off ] ; then - opts="${opts},noexec" - else - printf 'zfs-mount-generator: (%s) invalid exec\n' \ - "${dataset}" >/dev/kmsg - fi - - # readonly - if [ 
"${p_readonly}" = on ] ; then - opts="${opts},ro" - elif [ "${p_readonly}" = off ] ; then - opts="${opts},rw" - else - printf 'zfs-mount-generator: (%s) invalid readonly\n' \ - "${dataset}" >/dev/kmsg - fi - - # setuid - if [ "${p_setuid}" = on ] ; then - opts="${opts},suid" - elif [ "${p_setuid}" = off ] ; then - opts="${opts},nosuid" - else - printf 'zfs-mount-generator: (%s) invalid setuid\n' \ - "${dataset}" >/dev/kmsg - fi - - # nbmand - if [ "${p_nbmand}" = on ] ; then - opts="${opts},mand" - elif [ "${p_nbmand}" = off ] ; then - opts="${opts},nomand" - else - printf 'zfs-mount-generator: (%s) invalid nbmand\n' \ - "${dataset}" >/dev/kmsg - fi - - # If the mountpoint has already been created, give it precedence. - if [ -e "${dest_norm}/${mountfile}" ] ; then - printf 'zfs-mount-generator: %s already exists\n' "${mountfile}" \ - >/dev/kmsg - return - fi - - # By ordering before zfs-mount.service, we avoid race conditions. - cat > "${dest_norm}/${mountfile}" << EOF -# Automatically generated by zfs-mount-generator - -[Unit] -SourcePath=${cachefile} -Documentation=man:zfs-mount-generator(8) -Before=local-fs.target zfs-mount.service -After=zfs-import.target -Wants=zfs-import.target - -[Mount] -Where=${p_mountpoint} -What=${dataset} -Type=zfs -Options=defaults${opts},zfsutil -EOF - - # Finally, create the appropriate dependency - ln -s "../${mountfile}" "${req_dir}" -} - -# Feed each line into process_line -for cachefile in "${FSLIST}/"* ; do - while read -r fs ; do - process_line "${fs}" - done < "${cachefile}" -done diff --git a/etc/systemd/system/50-zfs.preset.in b/etc/systemd/system/50-zfs.preset.in index 884a69b5b6..e4056a92cd 100644 --- a/etc/systemd/system/50-zfs.preset.in +++ b/etc/systemd/system/50-zfs.preset.in @@ -5,4 +5,5 @@ enable zfs-import.target enable zfs-mount.service enable zfs-share.service enable zfs-zed.service +enable zfs-volume-wait.service enable zfs.target diff --git a/etc/systemd/system/Makefile.am b/etc/systemd/system/Makefile.am index 
1586209caa..c374a52ac7 100644 --- a/etc/systemd/system/Makefile.am +++ b/etc/systemd/system/Makefile.am @@ -1,3 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + systemdpreset_DATA = \ 50-zfs.preset @@ -7,25 +9,13 @@ systemdunit_DATA = \ zfs-import-scan.service \ zfs-mount.service \ zfs-share.service \ + zfs-volume-wait.service \ zfs-import.target \ + zfs-volumes.target \ zfs.target -EXTRA_DIST = \ - $(top_srcdir)/etc/systemd/system/zfs-zed.service.in \ - $(top_srcdir)/etc/systemd/system/zfs-import-cache.service.in \ - $(top_srcdir)/etc/systemd/system/zfs-import-scan.service.in \ - $(top_srcdir)/etc/systemd/system/zfs-mount.service.in \ - $(top_srcdir)/etc/systemd/system/zfs-share.service.in \ - $(top_srcdir)/etc/systemd/system/zfs-import.target.in \ - $(top_srcdir)/etc/systemd/system/zfs.target.in \ - $(top_srcdir)/etc/systemd/system/50-zfs.preset.in +SUBSTFILES += $(systemdpreset_DATA) $(systemdunit_DATA) -$(systemdunit_DATA) $(systemdpreset_DATA):%:%.in - -$(SED) -e 's,@bindir\@,$(bindir),g' \ - -e 's,@runstatedir\@,$(runstatedir),g' \ - -e 's,@sbindir\@,$(sbindir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' - -distclean-local:: - -$(RM) $(systemdunit_DATA) $(systemdpreset_DATA) +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(systemdunitdir)" + ln -sf /dev/null "$(DESTDIR)$(systemdunitdir)/zfs-import.service" diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in index cacb536515..5e5c6281c9 100644 --- a/etc/systemd/system/zfs-import-cache.service.in +++ b/etc/systemd/system/zfs-import-cache.service.in @@ -5,14 +5,16 @@ DefaultDependencies=no Requires=systemd-udev-settle.service After=systemd-udev-settle.service After=cryptsetup.target +After=multipathd.target After=systemd-remount-fs.service Before=zfs-import.target -ConditionPathExists=@sysconfdir@/zfs/zpool.cache +ConditionFileNotEmpty=@sysconfdir@/zfs/zpool.cache +ConditionPathIsDirectory=/sys/module/zfs [Service] Type=oneshot 
RemainAfterExit=yes -ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN +ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in index 4aae9f06e5..d3c083f7e9 100644 --- a/etc/systemd/system/zfs-import-scan.service.in +++ b/etc/systemd/system/zfs-import-scan.service.in @@ -5,13 +5,15 @@ DefaultDependencies=no Requires=systemd-udev-settle.service After=systemd-udev-settle.service After=cryptsetup.target +After=multipathd.target Before=zfs-import.target -ConditionPathExists=!@sysconfdir@/zfs/zpool.cache +ConditionFileNotEmpty=!@sysconfdir@/zfs/zpool.cache +ConditionPathIsDirectory=/sys/module/zfs [Service] Type=oneshot RemainAfterExit=yes -ExecStart=@sbindir@/zpool import -aN -o cachefile=none +ExecStart=@sbindir@/zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in index a18691a468..3ab82fb033 100644 --- a/etc/systemd/system/zfs-mount.service.in +++ b/etc/systemd/system/zfs-mount.service.in @@ -6,6 +6,7 @@ After=systemd-udev-settle.service After=zfs-import.target After=systemd-remount-fs.service Before=local-fs.target +ConditionPathIsDirectory=/sys/module/zfs [Service] Type=oneshot diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index 75ff6e9467..745077513c 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -5,13 +5,14 @@ After=nfs-server.service nfs-kernel-server.service After=smb.service Before=rpc-statd-notify.service Wants=zfs-mount.service +After=zfs-mount.service PartOf=nfs-server.service nfs-kernel-server.service PartOf=smb.service +ConditionPathIsDirectory=/sys/module/zfs [Service] Type=oneshot RemainAfterExit=yes -ExecStartPre=-/bin/rm -f 
/etc/dfs/sharetab ExecStart=@sbindir@/zfs share -a [Install] diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in new file mode 100644 index 0000000000..4c77724d8b --- /dev/null +++ b/etc/systemd/system/zfs-volume-wait.service.in @@ -0,0 +1,14 @@ +[Unit] +Description=Wait for ZFS Volume (zvol) links in /dev +DefaultDependencies=no +After=systemd-udev-settle.service +After=zfs-import.target +ConditionPathIsDirectory=/sys/module/zfs + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@bindir@/zvol_wait + +[Install] +WantedBy=zfs-volumes.target diff --git a/etc/systemd/system/zfs-volumes.target.in b/etc/systemd/system/zfs-volumes.target.in new file mode 100644 index 0000000000..5cb9a10f49 --- /dev/null +++ b/etc/systemd/system/zfs-volumes.target.in @@ -0,0 +1,7 @@ +[Unit] +Description=ZFS volumes are ready +After=zfs-volume-wait.service +Requires=zfs-volume-wait.service + +[Install] +WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-zed.service.in b/etc/systemd/system/zfs-zed.service.in index f4313625ee..008075138f 100644 --- a/etc/systemd/system/zfs-zed.service.in +++ b/etc/systemd/system/zfs-zed.service.in @@ -1,6 +1,7 @@ [Unit] Description=ZFS Event Daemon (zed) Documentation=man:zed(8) +ConditionPathIsDirectory=/sys/module/zfs [Service] ExecStart=@sbindir@/zed -F diff --git a/etc/zfs/.gitignore b/etc/zfs/.gitignore new file mode 100644 index 0000000000..1b2d752deb --- /dev/null +++ b/etc/zfs/.gitignore @@ -0,0 +1 @@ +zfs-functions diff --git a/etc/zfs/Makefile.am b/etc/zfs/Makefile.am index 52f6634df6..3dee81c758 100644 --- a/etc/zfs/Makefile.am +++ b/etc/zfs/Makefile.am @@ -1,10 +1,18 @@ +include $(top_srcdir)/config/Substfiles.am +include $(top_srcdir)/config/Shellcheck.am + pkgsysconfdir = $(sysconfdir)/zfs -pkgsysconf_DATA = \ +dist_pkgsysconf_DATA = \ vdev_id.conf.alias.example \ vdev_id.conf.sas_direct.example \ vdev_id.conf.sas_switch.example \ vdev_id.conf.multipath.example \ 
vdev_id.conf.scsi.example -EXTRA_DIST = $(pkgsysconf_DATA) +pkgsysconf_SCRIPTS = \ + zfs-functions + +SUBSTFILES += $(pkgsysconf_SCRIPTS) + +SHELLCHECK_SHELL = dash # local variables diff --git a/etc/zfs/vdev_id.conf.sas_direct.example b/etc/zfs/vdev_id.conf.sas_direct.example index 0a6f130cb2..d17ed149d8 100644 --- a/etc/zfs/vdev_id.conf.sas_direct.example +++ b/etc/zfs/vdev_id.conf.sas_direct.example @@ -2,7 +2,7 @@ multipath no topology sas_direct phys_per_port 4 -# Additionally create /dev/by-enclousure/ symlinks for enclosure devices +# Additionally create /dev/by-enclosure/ symlinks for enclosure devices enclosure_symlinks yes # PCI_ID HBA PORT CHANNEL NAME diff --git a/etc/init.d/zfs-functions.in b/etc/zfs/zfs-functions.in similarity index 75% rename from etc/init.d/zfs-functions.in rename to etc/zfs/zfs-functions.in index 490503e913..2fb065afdb 100644 --- a/etc/init.d/zfs-functions.in +++ b/etc/zfs/zfs-functions.in @@ -5,22 +5,20 @@ # # Released under the 2-clause BSD license. # -# The original script that acted as a template for this script came from -# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a -# licensing stansa) in the commit dated Mar 24, 2011: -# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a +# This script is based on debian/zfsutils.zfs.init from the +# Debian GNU/kFreeBSD zfsutils 8.1-3 package, written by Aurelien Jarno. PATH=/sbin:/bin:/usr/bin:/usr/sbin # Source function library if [ -f /etc/rc.d/init.d/functions ]; then - # RedHat and derivates + # RedHat and derivatives . /etc/rc.d/init.d/functions elif [ -L /etc/init.d/functions.sh ]; then # Gentoo . /etc/init.d/functions.sh elif [ -f /lib/lsb/init-functions ]; then - # LSB, Debian GNU/Linux and derivates + # LSB, Debian, and derivatives . 
/lib/lsb/init-functions fi @@ -46,7 +44,7 @@ elif type success > /dev/null 2>&1 ; then fi } - zfs_log_begin_msg() { echo -n "$1 "; } + zfs_log_begin_msg() { printf "%s" "$1 "; } zfs_log_end_msg() { zfs_set_ifs "$OLD_IFS" if [ "$1" -eq 0 ]; then @@ -63,17 +61,17 @@ elif type success > /dev/null 2>&1 ; then echo zfs_set_ifs "$TMP_IFS" } - zfs_log_progress_msg() { echo -n $"$1"; } + zfs_log_progress_msg() { printf "%s" "$""$1"; } elif type einfo > /dev/null 2>&1 ; then # Gentoo functions zfs_log_begin_msg() { ebegin "$1"; } zfs_log_end_msg() { eend "$1"; } zfs_log_failure_msg() { eend "$1"; } -# zfs_log_progress_msg() { echo -n "$1"; } - zfs_log_progress_msg() { echo -n; } +# zfs_log_progress_msg() { printf "%s" "$1"; } + zfs_log_progress_msg() { :; } else - # Unknown - simple substitues. - zfs_log_begin_msg() { echo -n "$1"; } + # Unknown - simple substitutes. + zfs_log_begin_msg() { printf "%s" "$1"; } zfs_log_end_msg() { ret=$1 if [ "$ret" -ge 1 ]; then @@ -84,7 +82,7 @@ else return "$ret" } zfs_log_failure_msg() { echo "$1"; } - zfs_log_progress_msg() { echo -n "$1"; } + zfs_log_progress_msg() { printf "%s" "$1"; } fi # Paths to what we need @@ -96,8 +94,8 @@ ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache" # Sensible defaults ZFS_MOUNT='yes' ZFS_UNMOUNT='yes' - -export ZFS ZED ZPOOL ZPOOL_CACHE ZFS_MOUNT ZFS_UNMOUNT +ZFS_SHARE='yes' +ZFS_UNSHARE='yes' # Source zfs configuration, overriding the defaults if [ -f @initconfdir@/zfs ]; then @@ -106,6 +104,8 @@ fi # ---------------------------------------------------- +export ZFS ZED ZPOOL ZPOOL_CACHE ZFS_MOUNT ZFS_UNMOUNT ZFS_SHARE ZFS_UNSHARE + zfs_action() { local MSG="$1"; shift @@ -134,27 +134,28 @@ zfs_daemon_start() { local PIDFILE="$1"; shift local DAEMON_BIN="$1"; shift - local DAEMON_ARGS="$*" if type start-stop-daemon > /dev/null 2>&1 ; then # LSB functions start-stop-daemon --start --quiet --pidfile "$PIDFILE" \ --exec "$DAEMON_BIN" --test > /dev/null || return 1 - start-stop-daemon --start --quiet --exec 
"$DAEMON_BIN" -- \ - $DAEMON_ARGS || return 2 + # shellcheck disable=SC2086 + start-stop-daemon --start --quiet --exec "$DAEMON_BIN" -- \ + "$@" || return 2 - # On Debian GNU/Linux, there's a 'sendsigs' script that will + # On Debian, there's a 'sendsigs' script that will # kill basically everything quite early and zed is stopped # much later than that. We don't want zed to be among them, # so add the zed pid to list of pids to ignore. - if [ -f "$PIDFILE" -a -d /run/sendsigs.omit.d ] + if [ -f "$PIDFILE" ] && [ -d /run/sendsigs.omit.d ] then ln -sf "$PIDFILE" /run/sendsigs.omit.d/zed fi elif type daemon > /dev/null 2>&1 ; then - # Fedora/RedHat functions - daemon --pidfile "$PIDFILE" "$DAEMON_BIN" $DAEMON_ARGS + # Fedora/RedHat functions + # shellcheck disable=SC2086 + daemon --pidfile "$PIDFILE" "$DAEMON_BIN" "$@" return $? else # Unsupported @@ -180,15 +181,17 @@ zfs_daemon_stop() # LSB functions start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 \ --pidfile "$PIDFILE" --name "$DAEMON_NAME" - [ "$?" = 0 ] && rm -f "$PIDFILE" + ret="$?" + [ "$ret" = 0 ] && rm -f "$PIDFILE" - return $? + return "$ret" elif type killproc > /dev/null 2>&1 ; then # Fedora/RedHat functions killproc -p "$PIDFILE" "$DAEMON_NAME" - [ "$?" = 0 ] && rm -f "$PIDFILE" + ret="$?" + [ "$ret" = 0 ] && rm -f "$PIDFILE" - return $? + return "$ret" else # Unsupported return 3 @@ -232,7 +235,7 @@ zfs_daemon_reload() return $? elif type killproc > /dev/null 2>&1 ; then # Fedora/RedHat functions - killproc -p "$PIDFILE" "$DAEMON_NAME" -HUP + killproc -p "$PIDFILE" "$DAEMON_NAME" -HUP return $? else # Unsupported @@ -283,8 +286,9 @@ checksystem() # Called with zfs=(off|no|0) - bail because we don't # want anything import, mounted or shared. # HOWEVER, only do this if we're called at the boot up - # (from init), not if we're running interactivly (as in + # (from init), not if we're running interactively (as in # from the shell - we know what we're doing). 
+ # shellcheck disable=SC2154 [ -n "$init" ] && exit 3 fi @@ -294,18 +298,12 @@ checksystem() # Just make sure that /dev/zfs is created. udev_trigger - if ! [ "$(uname -m)" = "x86_64" ]; then - echo "Warning: You're not running 64bit. Currently native zfs in"; - echo " Linux is only supported and tested on 64bit."; - # should we break here? People doing this should know what they - # do, thus i'm not breaking here. - fi - return 0 } get_root_pool() { + # shellcheck disable=SC2046 set -- $(mount | grep ' on / ') [ "$5" = "zfs" ] && echo "${1%%/*}" } @@ -343,9 +341,10 @@ load_module() read_mtab() { local match="$1" - local fs mntpnt fstype opts rest TMPFILE + local fs mntpnt fstype opts rest # Unset all MTAB_* variables + # shellcheck disable=SC2046 unset $(env | grep ^MTAB_ | sed 's,=.*,,') while read -r fs mntpnt fstype opts rest; do @@ -357,8 +356,8 @@ read_mtab() # * We need to use the external echo, because the # internal one would interpret the backslash code # (incorrectly), giving us a  instead. - mntpnt=$(/bin/echo "$mntpnt" | sed "s,\\\0,\\\00,g") - fs=$(/bin/echo "$fs" | sed "s,\\\0,\\\00,") + mntpnt=$(/bin/echo "$mntpnt" | sed 's,\\0,\\00,g') + fs=$(/bin/echo "$fs" | sed 's,\\0,\\00,') # Remove 'unwanted' characters. mntpnt=$(printf '%b\n' "$mntpnt" | sed -e 's,/,,g' \ @@ -366,17 +365,20 @@ read_mtab() fs=$(printf '%b\n' "$fs") # Set the variable. - eval export MTAB_$mntpnt=\"$fs\" + eval export "MTAB_$mntpnt=\"$fs\"" fi done < /proc/self/mounts } in_mtab() { - local fs="$(echo "$1" | sed 's,/,_,g')" + local mntpnt="$1" + # Remove 'unwanted' characters. + mntpnt=$(printf '%b\n' "$mntpnt" | sed -e 's,/,,g' \ + -e 's,-,,g' -e 's,\.,,g' -e 's, ,,g') local var - var="$(eval echo MTAB_$fs)" + var="$(eval echo "MTAB_$mntpnt")" [ "$(eval echo "$""$var")" != "" ] return "$?" 
} @@ -385,21 +387,22 @@ in_mtab() read_fstab() { local match="$1" - local i var TMPFILE + local i var # Unset all FSTAB_* variables + # shellcheck disable=SC2046 unset $(env | grep ^FSTAB_ | sed 's,=.*,,') i=0 while read -r fs mntpnt fstype opts; do - echo "$fs" | egrep -qE '^#|^$' && continue - echo "$mntpnt" | egrep -qE '^none|^swap' && continue - echo "$fstype" | egrep -qE '^swap' && continue + echo "$fs" | grep -qE '^#|^$' && continue + echo "$mntpnt" | grep -qE '^none|^swap' && continue + echo "$fstype" | grep -qE '^swap' && continue if echo "$fs $mntpnt $fstype $opts" | grep -qE "$match"; then - eval export FSTAB_dev_$i="$fs" + eval export "FSTAB_dev_$i=$fs" fs=$(printf '%b\n' "$fs" | sed 's,/,_,g') - eval export FSTAB_$i="$mntpnt" + eval export "FSTAB_$i=$mntpnt" i=$((i + 1)) fi @@ -410,7 +413,7 @@ in_fstab() { local var - var="$(eval echo FSTAB_$1)" + var="$(eval echo "FSTAB_$1")" [ "${var}" != "" ] return $? } @@ -418,19 +421,11 @@ in_fstab() is_mounted() { local mntpt="$1" - local line + local mp - mount | \ - while read line; do - if echo "$line" | grep -q " on $mntpt "; then - # returns: - # 0 on unsuccessful match - # 1 on a successful match - return 1 - fi - done + while read -r _ mp _; do + [ "$mp" = "$mntpt" ] && return 0 + done < /proc/self/mounts - # The negation will flip the subshell return result where the default - # return value is 0 when a match is not found. - return $(( !$? 
)) + return 1 } diff --git a/include/Makefile.am b/include/Makefile.am index bac47d98d9..4da43afd85 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -1,25 +1,24 @@ -SUBDIRS = linux spl sys +SUBDIRS = sys os COMMON_H = \ - $(top_srcdir)/include/zfeature_common.h \ - $(top_srcdir)/include/zfs_comutil.h \ - $(top_srcdir)/include/zfs_deleg.h \ - $(top_srcdir)/include/zfs_fletcher.h \ - $(top_srcdir)/include/zfs_namecheck.h \ - $(top_srcdir)/include/zfs_prop.h + cityhash.h \ + zfeature_common.h \ + zfs_comutil.h \ + zfs_deleg.h \ + zfs_fletcher.h \ + zfs_namecheck.h \ + zfs_prop.h USER_H = \ - $(top_srcdir)/include/libnvpair.h \ - $(top_srcdir)/include/libuutil_common.h \ - $(top_srcdir)/include/libuutil.h \ - $(top_srcdir)/include/libuutil_impl.h \ - $(top_srcdir)/include/libzfs.h \ - $(top_srcdir)/include/libzfs_core.h \ - $(top_srcdir)/include/libzfs_impl.h \ - $(top_srcdir)/include/libzutil.h \ - $(top_srcdir)/include/thread_pool.h - -EXTRA_DIST = $(COMMON_H) $(USER_H) + libnvpair.h \ + libuutil_common.h \ + libuutil.h \ + libuutil_impl.h \ + libzfs.h \ + libzfsbootenv.h \ + libzfs_core.h \ + libzutil.h \ + thread_pool.h if CONFIG_USER libzfsdir = $(includedir)/libzfs @@ -27,6 +26,8 @@ libzfs_HEADERS = $(COMMON_H) $(USER_H) endif if CONFIG_KERNEL +if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include kernel_HEADERS = $(COMMON_H) endif +endif diff --git a/include/sys/cityhash.h b/include/cityhash.h similarity index 90% rename from include/sys/cityhash.h rename to include/cityhash.h index 33c3b7bc25..3b2d1e84b5 100644 --- a/include/sys/cityhash.h +++ b/include/cityhash.h @@ -24,7 +24,7 @@ */ #ifndef _SYS_CITYHASH_H -#define _SYS_CITYHASH_H +#define _SYS_CITYHASH_H extern __attribute__((visibility("default"))) #include @@ -32,7 +32,7 @@ extern "C" { #endif -uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t); +_SYS_CITYHASH_H uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t); #ifdef __cplusplus } diff --git a/include/libnvpair.h 
b/include/libnvpair.h index 5277f9574d..bc50c3b7e1 100644 --- a/include/libnvpair.h +++ b/include/libnvpair.h @@ -24,7 +24,7 @@ */ #ifndef _LIBNVPAIR_H -#define _LIBNVPAIR_H +#define _LIBNVPAIR_H extern __attribute__((visibility("default"))) #include #include @@ -42,13 +42,13 @@ extern "C" { * are all imported from included above. */ -extern int nvpair_value_match(nvpair_t *, int, char *, char **); -extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, +_LIBNVPAIR_H int nvpair_value_match(nvpair_t *, int, char *, char **); +_LIBNVPAIR_H int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **); -extern void nvlist_print(FILE *, nvlist_t *); -int nvlist_print_json(FILE *, nvlist_t *); -extern void dump_nvlist(nvlist_t *, int); +_LIBNVPAIR_H void nvlist_print(FILE *, nvlist_t *); +_LIBNVPAIR_H int nvlist_print_json(FILE *, nvlist_t *); +_LIBNVPAIR_H void dump_nvlist(nvlist_t *, int); /* * Private nvlist printing interface that allows the caller some control @@ -88,18 +88,18 @@ enum nvlist_indent_mode { NVLIST_INDENT_TABBED /* Indent with tabstops */ }; -extern nvlist_prtctl_t nvlist_prtctl_alloc(void); -extern void nvlist_prtctl_free(nvlist_prtctl_t); -extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t); +_LIBNVPAIR_H nvlist_prtctl_t nvlist_prtctl_alloc(void); +_LIBNVPAIR_H void nvlist_prtctl_free(nvlist_prtctl_t); +_LIBNVPAIR_H void nvlist_prt(nvlist_t *, nvlist_prtctl_t); /* Output stream */ -extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *); -extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t); +_LIBNVPAIR_H void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *); +_LIBNVPAIR_H FILE *nvlist_prtctl_getdest(nvlist_prtctl_t); /* Indentation mode, start indent, indent increment; default tabbed/0/1 */ -extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode, - int, int); -extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int); +_LIBNVPAIR_H void nvlist_prtctl_setindent(nvlist_prtctl_t, + enum 
nvlist_indent_mode, int, int); +_LIBNVPAIR_H void nvlist_prtctl_doindent(nvlist_prtctl_t, int); enum nvlist_prtctl_fmt { NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */ @@ -107,9 +107,10 @@ enum nvlist_prtctl_fmt { NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */ }; -extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, +_LIBNVPAIR_H void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, const char *); -extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...); +_LIBNVPAIR_H void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, + ...); /* * Function prototypes for interfaces that appoint a new rendering function @@ -139,7 +140,7 @@ extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...); */ #define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \ - extern void funcname(nvlist_prtctl_t, \ + _LIBNVPAIR_H void funcname(nvlist_prtctl_t, \ int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \ void *) @@ -170,7 +171,7 @@ NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *); * Return values as above. */ #define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \ - extern void funcname(nvlist_prtctl_t, \ + _LIBNVPAIR_H void funcname(nvlist_prtctl_t, \ int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \ void *) diff --git a/include/libuutil.h b/include/libuutil.h index d0248901b4..cadc20d2d8 100644 --- a/include/libuutil.h +++ b/include/libuutil.h @@ -81,15 +81,18 @@ const char *uu_strerror(uint32_t); extern void uu_alt_exit(int); extern const char *uu_setpname(char *); extern const char *uu_getpname(void); -/*PRINTFLIKE1*/ -extern void uu_warn(const char *, ...); -extern void uu_vwarn(const char *, va_list); -/*PRINTFLIKE1*/ -extern void uu_die(const char *, ...) __NORETURN; -extern void uu_vdie(const char *, va_list) __NORETURN; -/*PRINTFLIKE2*/ -extern void uu_xdie(int, const char *, ...) 
__NORETURN; -extern void uu_vxdie(int, const char *, va_list) __NORETURN; +extern void uu_warn(const char *, ...) + __attribute__((format(printf, 1, 2))); +extern void uu_vwarn(const char *, va_list) + __attribute__((format(printf, 1, 0))); +extern void uu_die(const char *, ...) + __attribute__((format(printf, 1, 2))) __NORETURN; +extern void uu_vdie(const char *, va_list) + __attribute__((format(printf, 1, 0))) __NORETURN; +extern void uu_xdie(int, const char *, ...) + __attribute__((format(printf, 2, 3))) __NORETURN; +extern void uu_vxdie(int, const char *, va_list) + __attribute__((format(printf, 2, 0))) __NORETURN; /* * Exit status functions (not to be used directly) @@ -98,28 +101,6 @@ extern int *uu_exit_ok(void); extern int *uu_exit_fatal(void); extern int *uu_exit_usage(void); -/* - * Debug print facility functions. - */ -typedef struct uu_dprintf uu_dprintf_t; - -typedef enum { - UU_DPRINTF_SILENT, - UU_DPRINTF_FATAL, - UU_DPRINTF_WARNING, - UU_DPRINTF_NOTICE, - UU_DPRINTF_INFO, - UU_DPRINTF_DEBUG -} uu_dprintf_severity_t; - -extern uu_dprintf_t *uu_dprintf_create(const char *, uu_dprintf_severity_t, - uint_t); -/*PRINTFLIKE3*/ -extern void uu_dprintf(uu_dprintf_t *, uu_dprintf_severity_t, - const char *, ...); -extern void uu_dprintf_destroy(uu_dprintf_t *); -extern const char *uu_dprintf_getname(uu_dprintf_t *); - /* * Identifier test flags and function. */ @@ -128,18 +109,13 @@ extern const char *uu_dprintf_getname(uu_dprintf_t *); int uu_check_name(const char *, uint_t); -/* - * File creation functions. - */ -extern int uu_open_tmp(const char *dir, uint_t uflags); - /* * Convenience functions. */ #define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0])) -/*PRINTFLIKE1*/ -extern char *uu_msprintf(const char *format, ...); +extern char *uu_msprintf(const char *format, ...) 
+ __attribute__((format(printf, 1, 2))); extern void *uu_zalloc(size_t); extern char *uu_strdup(const char *); extern void uu_free(void *); @@ -149,7 +125,6 @@ extern boolean_t uu_streq(const char *a, const char *b); extern char *uu_strndup(const char *s, size_t n); extern boolean_t uu_strbw(const char *a, const char *b); extern void *uu_memdup(const void *buf, size_t sz); -extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len); /* * Comparison function type definition. diff --git a/include/libuutil_impl.h b/include/libuutil_impl.h index f978b475ef..753bbff246 100644 --- a/include/libuutil_impl.h +++ b/include/libuutil_impl.h @@ -42,16 +42,9 @@ extern "C" { void uu_set_error(uint_t); -/*PRINTFLIKE1*/ -void uu_panic(const char *format, ...); +void uu_panic(const char *format, ...) __attribute__((format(printf, 1, 2))); -struct uu_dprintf { - char *uud_name; - uu_dprintf_severity_t uud_severity; - uint_t uud_flags; -}; - /* * For debugging purposes, libuutil keeps around linked lists of all uu_lists * and uu_avls, along with pointers to their parents. These can cause false diff --git a/include/libzfs.h b/include/libzfs.h index e2ec2d9bce..c0883a9836 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -21,17 +21,18 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. 
+ * Copyright (c) 2021, Colm Buckley */ #ifndef _LIBZFS_H -#define _LIBZFS_H +#define _LIBZFS_H extern __attribute__((visibility("default"))) #include #include @@ -79,7 +80,7 @@ typedef enum zfs_error { EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ - EZFS_RESILVERING, /* currently resilvering */ + EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ @@ -88,8 +89,8 @@ typedef enum zfs_error { EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ - EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ - EZFS_SHARENFSFAILED, /* share(1M) failed */ + EZFS_UNSHARENFSFAILED, /* failed to unshare over nfs */ + EZFS_SHARENFSFAILED, /* failed to share over nfs */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ @@ -147,6 +148,8 @@ typedef enum zfs_error { EZFS_NO_TRIM, /* no active trim */ EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ + EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ + EZFS_REBUILDING, /* resilvering (sequential reconstruction) */ EZFS_UNKNOWN } zfs_error_t; @@ -193,60 +196,64 @@ typedef struct zfs_handle zfs_handle_t; typedef struct zpool_handle zpool_handle_t; typedef struct libzfs_handle libzfs_handle_t; +_LIBZFS_H int zpool_wait(zpool_handle_t *, zpool_wait_activity_t); +_LIBZFS_H int zpool_wait_status(zpool_handle_t *, zpool_wait_activity_t, + boolean_t *, boolean_t *); + /* * Library initialization */ -extern libzfs_handle_t *libzfs_init(void); -extern void libzfs_fini(libzfs_handle_t *); +_LIBZFS_H libzfs_handle_t *libzfs_init(void); +_LIBZFS_H void libzfs_fini(libzfs_handle_t *); -extern 
libzfs_handle_t *zpool_get_handle(zpool_handle_t *); -extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); +_LIBZFS_H libzfs_handle_t *zpool_get_handle(zpool_handle_t *); +_LIBZFS_H libzfs_handle_t *zfs_get_handle(zfs_handle_t *); -extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); +_LIBZFS_H void libzfs_print_on_error(libzfs_handle_t *, boolean_t); -extern void zfs_save_arguments(int argc, char **, char *, int); -extern int zpool_log_history(libzfs_handle_t *, const char *); +_LIBZFS_H void zfs_save_arguments(int argc, char **, char *, int); +_LIBZFS_H int zpool_log_history(libzfs_handle_t *, const char *); -extern int libzfs_errno(libzfs_handle_t *); -extern const char *libzfs_error_init(int); -extern const char *libzfs_error_action(libzfs_handle_t *); -extern const char *libzfs_error_description(libzfs_handle_t *); -extern int zfs_standard_error(libzfs_handle_t *, int, const char *); -extern void libzfs_mnttab_init(libzfs_handle_t *); -extern void libzfs_mnttab_fini(libzfs_handle_t *); -extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); -extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, +_LIBZFS_H int libzfs_errno(libzfs_handle_t *); +_LIBZFS_H const char *libzfs_error_init(int); +_LIBZFS_H const char *libzfs_error_action(libzfs_handle_t *); +_LIBZFS_H const char *libzfs_error_description(libzfs_handle_t *); +_LIBZFS_H int zfs_standard_error(libzfs_handle_t *, int, const char *); +_LIBZFS_H void libzfs_mnttab_init(libzfs_handle_t *); +_LIBZFS_H void libzfs_mnttab_fini(libzfs_handle_t *); +_LIBZFS_H void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); +_LIBZFS_H int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); -extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, +_LIBZFS_H void libzfs_mnttab_add(libzfs_handle_t *, const char *, const char *, const char *); -extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); +_LIBZFS_H void libzfs_mnttab_remove(libzfs_handle_t *, const 
char *); /* * Basic handle functions */ -extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); -extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); -extern void zpool_close(zpool_handle_t *); -extern const char *zpool_get_name(zpool_handle_t *); -extern int zpool_get_state(zpool_handle_t *); -extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); -extern const char *zpool_pool_state_to_name(pool_state_t); -extern void zpool_free_handles(libzfs_handle_t *); +_LIBZFS_H zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); +_LIBZFS_H zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); +_LIBZFS_H void zpool_close(zpool_handle_t *); +_LIBZFS_H const char *zpool_get_name(zpool_handle_t *); +_LIBZFS_H int zpool_get_state(zpool_handle_t *); +_LIBZFS_H const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); +_LIBZFS_H const char *zpool_pool_state_to_name(pool_state_t); +_LIBZFS_H void zpool_free_handles(libzfs_handle_t *); /* * Iterate over all active pools in the system. 
*/ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); -extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); -extern boolean_t zpool_skip_pool(const char *); +_LIBZFS_H int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); +_LIBZFS_H boolean_t zpool_skip_pool(const char *); /* * Functions to create and destroy pools */ -extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, +_LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); -extern int zpool_destroy(zpool_handle_t *, const char *); -extern int zpool_add(zpool_handle_t *, nvlist_t *); +_LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); +_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { /* do not split, but return the config that would be split off */ @@ -264,6 +271,9 @@ typedef struct trimflags { /* request a secure trim, requires support from device */ boolean_t secure; + /* after starting trim, block until trim completes */ + boolean_t wait; + /* trim at the requested rate in bytes/second */ uint64_t rate; } trimflags_t; @@ -271,54 +281,60 @@ typedef struct trimflags { /* * Functions to manipulate pool and vdev state */ -extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); -extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, +_LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); +_LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); -extern int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, +_LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, + nvlist_t *); +_LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, trimflags_t *); -extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); -extern int zpool_reguid(zpool_handle_t *); -extern int zpool_reopen_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_clear(zpool_handle_t *, 
const char *, nvlist_t *); +_LIBZFS_H int zpool_reguid(zpool_handle_t *); +_LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); -extern int zpool_sync_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); -extern int zpool_vdev_online(zpool_handle_t *, const char *, int, +_LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); -extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); -extern int zpool_vdev_attach(zpool_handle_t *, const char *, - const char *, nvlist_t *, int); -extern int zpool_vdev_detach(zpool_handle_t *, const char *); -extern int zpool_vdev_remove(zpool_handle_t *, const char *); -extern int zpool_vdev_remove_cancel(zpool_handle_t *); -extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); -extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, - splitflags_t); +_LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); +_LIBZFS_H int zpool_vdev_attach(zpool_handle_t *, const char *, + const char *, nvlist_t *, int, boolean_t); +_LIBZFS_H int zpool_vdev_detach(zpool_handle_t *, const char *); +_LIBZFS_H int zpool_vdev_remove(zpool_handle_t *, const char *); +_LIBZFS_H int zpool_vdev_remove_cancel(zpool_handle_t *); +_LIBZFS_H int zpool_vdev_indirect_size(zpool_handle_t *, const char *, + uint64_t *); +_LIBZFS_H int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, + nvlist_t *, splitflags_t); -extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); -extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); -extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); +_LIBZFS_H int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); +_LIBZFS_H int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); +_LIBZFS_H int zpool_vdev_clear(zpool_handle_t *, uint64_t); -extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, +_LIBZFS_H 
nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); -extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, +_LIBZFS_H nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); -extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); -extern uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); +_LIBZFS_H int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, + const char *); +_LIBZFS_H uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, + const char *path); -const char *zpool_get_state_str(zpool_handle_t *); +_LIBZFS_H const char *zpool_get_state_str(zpool_handle_t *); /* * Functions to manage pool properties */ -extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); -extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, +_LIBZFS_H int zpool_set_prop(zpool_handle_t *, const char *, const char *); +_LIBZFS_H int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t literal); -extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, +_LIBZFS_H uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); +_LIBZFS_H int zpool_props_refresh(zpool_handle_t *); -extern const char *zpool_prop_to_name(zpool_prop_t); -extern const char *zpool_prop_values(zpool_prop_t); +_LIBZFS_H const char *zpool_prop_to_name(zpool_prop_t); +_LIBZFS_H const char *zpool_prop_values(zpool_prop_t); /* * Pool health statistics. @@ -376,6 +392,11 @@ typedef enum { ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ + ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ + ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 
512e dev with ashift of 9) */ + ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ + ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ /* * Finally, the following indicates a healthy pool. @@ -383,36 +404,36 @@ typedef enum { ZPOOL_STATUS_OK } zpool_status_t; -extern zpool_status_t zpool_get_status(zpool_handle_t *, char **, +_LIBZFS_H zpool_status_t zpool_get_status(zpool_handle_t *, char **, zpool_errata_t *); -extern zpool_status_t zpool_import_status(nvlist_t *, char **, +_LIBZFS_H zpool_status_t zpool_import_status(nvlist_t *, char **, zpool_errata_t *); /* * Statistics and configuration functions. */ -extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); -extern nvlist_t *zpool_get_features(zpool_handle_t *); -extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); -extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); +_LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); +_LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); +_LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); +_LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **); /* * Import and export functions */ -extern int zpool_export(zpool_handle_t *, boolean_t, const char *); -extern int zpool_export_force(zpool_handle_t *, const char *); -extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, +_LIBZFS_H int zpool_export(zpool_handle_t *, boolean_t, const char *); +_LIBZFS_H int zpool_export_force(zpool_handle_t *, const char *); +_LIBZFS_H int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); -extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, +_LIBZFS_H int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); -extern void zpool_print_unsup_feat(nvlist_t *config); +_LIBZFS_H void zpool_print_unsup_feat(nvlist_t *config); /* * Miscellaneous pool functions */ struct zfs_cmd; -extern const char 
*zfs_history_event_names[]; +_LIBZFS_H const char *zfs_history_event_names[]; typedef enum { VDEV_NAME_PATH = 1 << 0, @@ -421,34 +442,39 @@ typedef enum { VDEV_NAME_TYPE_ID = 1 << 3, } vdev_name_t; -extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, +_LIBZFS_H char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, int name_flags); -extern int zpool_upgrade(zpool_handle_t *, uint64_t); -extern int zpool_get_history(zpool_handle_t *, nvlist_t **); -extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, +_LIBZFS_H int zpool_upgrade(zpool_handle_t *, uint64_t); +_LIBZFS_H int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *, + boolean_t *); +_LIBZFS_H int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, int); -extern int zpool_events_clear(libzfs_handle_t *, int *); -extern int zpool_events_seek(libzfs_handle_t *, uint64_t, int); -extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, - size_t len); -extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); -extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); -extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, +_LIBZFS_H int zpool_events_clear(libzfs_handle_t *, int *); +_LIBZFS_H int zpool_events_seek(libzfs_handle_t *, uint64_t, int); +_LIBZFS_H void zpool_obj_to_path_ds(zpool_handle_t *, uint64_t, uint64_t, + char *, size_t); +_LIBZFS_H void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, + size_t); +_LIBZFS_H int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); +_LIBZFS_H int zpool_get_physpath(zpool_handle_t *, char *, size_t); +_LIBZFS_H void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); -extern int zpool_checkpoint(zpool_handle_t *); -extern int zpool_discard_checkpoint(zpool_handle_t *); +_LIBZFS_H int zpool_checkpoint(zpool_handle_t *); +_LIBZFS_H int zpool_discard_checkpoint(zpool_handle_t *); +_LIBZFS_H 
boolean_t zpool_is_draid_spare(const char *); /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. */ -extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); -extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); -extern void zfs_close(zfs_handle_t *); -extern zfs_type_t zfs_get_type(const zfs_handle_t *); -extern const char *zfs_get_name(const zfs_handle_t *); -extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); -extern const char *zfs_get_pool_name(const zfs_handle_t *); +_LIBZFS_H zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); +_LIBZFS_H zfs_handle_t *zfs_handle_dup(zfs_handle_t *); +_LIBZFS_H void zfs_close(zfs_handle_t *); +_LIBZFS_H zfs_type_t zfs_get_type(const zfs_handle_t *); +_LIBZFS_H zfs_type_t zfs_get_underlying_type(const zfs_handle_t *); +_LIBZFS_H const char *zfs_get_name(const zfs_handle_t *); +_LIBZFS_H zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); +_LIBZFS_H const char *zfs_get_pool_name(const zfs_handle_t *); /* * Property management functions. 
Some functions are shared with the kernel, @@ -458,55 +484,60 @@ extern const char *zfs_get_pool_name(const zfs_handle_t *); /* * zfs dataset property management */ -extern const char *zfs_prop_default_string(zfs_prop_t); -extern uint64_t zfs_prop_default_numeric(zfs_prop_t); -extern const char *zfs_prop_column_name(zfs_prop_t); -extern boolean_t zfs_prop_align_right(zfs_prop_t); +_LIBZFS_H const char *zfs_prop_default_string(zfs_prop_t); +_LIBZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); +_LIBZFS_H const char *zfs_prop_column_name(zfs_prop_t); +_LIBZFS_H boolean_t zfs_prop_align_right(zfs_prop_t); -extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, - uint64_t, zfs_handle_t *, zpool_handle_t *, boolean_t, const char *); +_LIBZFS_H nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, + nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, boolean_t, + const char *); -extern const char *zfs_prop_to_name(zfs_prop_t); -extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); -extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); -extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, +_LIBZFS_H const char *zfs_prop_to_name(zfs_prop_t); +_LIBZFS_H int zfs_prop_set(zfs_handle_t *, const char *, const char *); +_LIBZFS_H int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); +_LIBZFS_H int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); -extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, +_LIBZFS_H int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, boolean_t); -extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, +_LIBZFS_H int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); -extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, - uint64_t *propvalue); -extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char 
*propname, +_LIBZFS_H int zfs_prop_get_userquota_int(zfs_handle_t *zhp, + const char *propname, uint64_t *propvalue); +_LIBZFS_H int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); -extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, +_LIBZFS_H int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); -extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, +_LIBZFS_H int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); -extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, +_LIBZFS_H int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, char *buf, size_t len); -extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); -extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); -extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); -extern const char *zfs_prop_values(zfs_prop_t); -extern int zfs_prop_is_string(zfs_prop_t prop); -extern nvlist_t *zfs_get_all_props(zfs_handle_t *); -extern nvlist_t *zfs_get_user_props(zfs_handle_t *); -extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); -extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); +_LIBZFS_H uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); +_LIBZFS_H uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); +_LIBZFS_H int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); +_LIBZFS_H const char *zfs_prop_values(zfs_prop_t); +_LIBZFS_H int zfs_prop_is_string(zfs_prop_t prop); +_LIBZFS_H nvlist_t *zfs_get_all_props(zfs_handle_t *); +_LIBZFS_H nvlist_t *zfs_get_user_props(zfs_handle_t *); +_LIBZFS_H nvlist_t *zfs_get_recvd_props(zfs_handle_t *); +_LIBZFS_H nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); + +_LIBZFS_H int zfs_wait_status(zfs_handle_t *, zfs_wait_activity_t, + boolean_t *, boolean_t *); /* * zfs encryption management */ 
-extern int zfs_crypto_get_encryption_root(zfs_handle_t *, boolean_t *, char *); -extern int zfs_crypto_create(libzfs_handle_t *, char *, nvlist_t *, nvlist_t *, - boolean_t stdin_available, uint8_t **, uint_t *); -extern int zfs_crypto_clone_check(libzfs_handle_t *, zfs_handle_t *, char *, +_LIBZFS_H int zfs_crypto_get_encryption_root(zfs_handle_t *, boolean_t *, + char *); +_LIBZFS_H int zfs_crypto_create(libzfs_handle_t *, char *, nvlist_t *, + nvlist_t *, boolean_t stdin_available, uint8_t **, uint_t *); +_LIBZFS_H int zfs_crypto_clone_check(libzfs_handle_t *, zfs_handle_t *, char *, nvlist_t *); -extern int zfs_crypto_attempt_load_keys(libzfs_handle_t *, char *); -extern int zfs_crypto_load_key(zfs_handle_t *, boolean_t, char *); -extern int zfs_crypto_unload_key(zfs_handle_t *); -extern int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, boolean_t); +_LIBZFS_H int zfs_crypto_attempt_load_keys(libzfs_handle_t *, char *); +_LIBZFS_H int zfs_crypto_load_key(zfs_handle_t *, boolean_t, char *); +_LIBZFS_H int zfs_crypto_unload_key(zfs_handle_t *); +_LIBZFS_H int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, boolean_t); typedef struct zprop_list { int pl_prop; @@ -518,9 +549,9 @@ typedef struct zprop_list { boolean_t pl_fixed; } zprop_list_t; -extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, +_LIBZFS_H int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, boolean_t); -extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); +_LIBZFS_H void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" @@ -535,22 +566,23 @@ extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); /* * zpool property management */ -extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); -extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, +_LIBZFS_H int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **, + boolean_t); +_LIBZFS_H int 
zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); -extern const char *zpool_prop_default_string(zpool_prop_t); -extern uint64_t zpool_prop_default_numeric(zpool_prop_t); -extern const char *zpool_prop_column_name(zpool_prop_t); -extern boolean_t zpool_prop_align_right(zpool_prop_t); +_LIBZFS_H const char *zpool_prop_default_string(zpool_prop_t); +_LIBZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); +_LIBZFS_H const char *zpool_prop_column_name(zpool_prop_t); +_LIBZFS_H boolean_t zpool_prop_align_right(zpool_prop_t); /* * Functions shared by zfs and zpool property management. */ -extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, +_LIBZFS_H int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type); -extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, +_LIBZFS_H int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); -extern void zprop_free_list(zprop_list_t *); +_LIBZFS_H void zprop_free_list(zprop_list_t *); #define ZFS_GET_NCOLS 5 @@ -577,7 +609,7 @@ typedef struct zprop_get_cbdata { zfs_type_t cb_type; } zprop_get_cbdata_t; -void zprop_print_one_property(const char *, zprop_get_cbdata_t *, +_LIBZFS_H void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); @@ -585,17 +617,19 @@ void zprop_print_one_property(const char *, zprop_get_cbdata_t *, * Iterator functions. 
*/ typedef int (*zfs_iter_f)(zfs_handle_t *, void *); -extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); -extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, +_LIBZFS_H int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, + void *); +_LIBZFS_H int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, uint64_t, uint64_t); -extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, +_LIBZFS_H int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, uint64_t, uint64_t); -extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); -extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, + void *); +_LIBZFS_H int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; @@ -603,41 +637,58 @@ typedef struct get_all_cb { size_t cb_used; } get_all_cb_t; -void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, - zfs_iter_f, void *, boolean_t); -void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); +_LIBZFS_H void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, + size_t, zfs_iter_f, void *, boolean_t); +_LIBZFS_H void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* * Functions to create and destroy datasets. 
*/ -extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, +_LIBZFS_H int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); -extern int zfs_create_ancestors(libzfs_handle_t *, const char *); -extern int zfs_destroy(zfs_handle_t *, boolean_t); -extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); -extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); -extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); -extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); -extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, +_LIBZFS_H int zfs_create_ancestors(libzfs_handle_t *, const char *); +_LIBZFS_H int zfs_destroy(zfs_handle_t *, boolean_t); +_LIBZFS_H int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); +_LIBZFS_H int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); +_LIBZFS_H int zfs_destroy_snaps_nvl_os(libzfs_handle_t *, nvlist_t *); +_LIBZFS_H int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); +_LIBZFS_H int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, + nvlist_t *); +_LIBZFS_H int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props); -extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); -extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); +_LIBZFS_H int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); + +typedef struct renameflags { + /* recursive rename */ + int recursive : 1; + + /* don't unmount file systems */ + int nounmount : 1; + + /* force unmount file systems */ + int forceunmount : 1; +} renameflags_t; + +_LIBZFS_H int zfs_rename(zfs_handle_t *, const char *, renameflags_t); typedef struct sendflags { - /* print informational messages (ie, -v was specified) */ - boolean_t verbose; + /* Amount of extra information to print. 
*/ + int verbosity; /* recursive send (ie, -R) */ boolean_t replicate; + /* for recursive send, skip sending missing snapshots */ + boolean_t skipmissing; + /* for incrementals, do all intermediate snapshots */ boolean_t doall; /* if dataset is a clone, do incremental from its origin */ boolean_t fromorigin; - /* do deduplication */ - boolean_t dedup; + /* field no longer used, maintained for backwards compatibility */ + boolean_t pad; /* send properties (ie, -p) */ boolean_t props; @@ -668,34 +719,42 @@ typedef struct sendflags { /* include snapshot holds in send stream */ boolean_t holds; + + /* stream represents a partially received dataset */ + boolean_t saved; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); -extern int zfs_send(zfs_handle_t *, const char *, const char *, +_LIBZFS_H int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); -extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t flags); -extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, +_LIBZFS_H int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t *, const char *); -extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, +_LIBZFS_H int zfs_send_progress(zfs_handle_t *, int, uint64_t *, uint64_t *); +_LIBZFS_H int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, + const char *); +_LIBZFS_H int zfs_send_saved(zfs_handle_t *, sendflags_t *, int, const char *); +_LIBZFS_H nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token); -extern int zfs_promote(zfs_handle_t *); -extern int zfs_hold(zfs_handle_t *, const char *, const char *, +_LIBZFS_H int zfs_promote(zfs_handle_t *); +_LIBZFS_H int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, int); -extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); -extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); -extern int 
zfs_get_holds(zfs_handle_t *, nvlist_t **); -extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); +_LIBZFS_H int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); +_LIBZFS_H int zfs_release(zfs_handle_t *, const char *, const char *, + boolean_t); +_LIBZFS_H int zfs_get_holds(zfs_handle_t *, nvlist_t **); +_LIBZFS_H uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, + nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); -extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, +_LIBZFS_H int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); -extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); -extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); +_LIBZFS_H int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); +_LIBZFS_H int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ @@ -736,9 +795,15 @@ typedef struct recvflags { /* skip receive of snapshot holds */ boolean_t skipholds; + + /* mount the filesystem unless nomount is specified */ + boolean_t domount; + + /* force unmount while recv snapshot (private) */ + boolean_t forceunmount; } recvflags_t; -extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, +_LIBZFS_H int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { @@ -747,57 +812,71 @@ typedef enum diff_flags { ZFS_DIFF_CLASSIFY = 0x4 } diff_flags_t; -extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, +_LIBZFS_H int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, int); /* * Miscellaneous functions. 
*/ -extern const char *zfs_type_to_name(zfs_type_t); -extern void zfs_refresh_properties(zfs_handle_t *); -extern int zfs_name_valid(const char *, zfs_type_t); -extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t); -extern int zfs_parent_name(zfs_handle_t *, char *, size_t); -extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, +_LIBZFS_H const char *zfs_type_to_name(zfs_type_t); +_LIBZFS_H void zfs_refresh_properties(zfs_handle_t *); +_LIBZFS_H int zfs_name_valid(const char *, zfs_type_t); +_LIBZFS_H zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, const char *, zfs_type_t); -extern int zfs_spa_version(zfs_handle_t *, int *); -extern boolean_t zfs_bookmark_exists(const char *path); +_LIBZFS_H int zfs_parent_name(zfs_handle_t *, char *, size_t); +_LIBZFS_H boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, + zfs_type_t); +_LIBZFS_H int zfs_spa_version(zfs_handle_t *, int *); +_LIBZFS_H boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. 
*/ -extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); -extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); -extern int zfs_mount(zfs_handle_t *, const char *, int); -extern int zfs_unmount(zfs_handle_t *, const char *, int); -extern int zfs_unmountall(zfs_handle_t *, int); +_LIBZFS_H boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); +_LIBZFS_H boolean_t zfs_is_mounted(zfs_handle_t *, char **); +_LIBZFS_H int zfs_mount(zfs_handle_t *, const char *, int); +_LIBZFS_H int zfs_mount_at(zfs_handle_t *, const char *, int, const char *); +_LIBZFS_H int zfs_unmount(zfs_handle_t *, const char *, int); +_LIBZFS_H int zfs_unmountall(zfs_handle_t *, int); +_LIBZFS_H int zfs_mount_delegation_check(void); + +#if defined(__linux__) || defined(__APPLE__) +_LIBZFS_H int zfs_parse_mount_options(char *mntopts, unsigned long *mntflags, + unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt); +_LIBZFS_H void zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, + char *mntopts, char *mtabopt); +#endif /* * Share support functions. */ -extern boolean_t zfs_is_shared(zfs_handle_t *); -extern int zfs_share(zfs_handle_t *); -extern int zfs_unshare(zfs_handle_t *); +_LIBZFS_H boolean_t zfs_is_shared(zfs_handle_t *); +_LIBZFS_H int zfs_share(zfs_handle_t *); +_LIBZFS_H int zfs_unshare(zfs_handle_t *); /* * Protocol-specific share support functions. 
*/ -extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); -extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); -extern int zfs_share_nfs(zfs_handle_t *); -extern int zfs_share_smb(zfs_handle_t *); -extern int zfs_shareall(zfs_handle_t *); -extern int zfs_unshare_nfs(zfs_handle_t *, const char *); -extern int zfs_unshare_smb(zfs_handle_t *, const char *); -extern int zfs_unshareall_nfs(zfs_handle_t *); -extern int zfs_unshareall_smb(zfs_handle_t *); -extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); -extern int zfs_unshareall_bytype(zfs_handle_t *, const char *, const char *); -extern int zfs_unshareall(zfs_handle_t *); -extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, +_LIBZFS_H boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); +_LIBZFS_H boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); +_LIBZFS_H int zfs_share_nfs(zfs_handle_t *); +_LIBZFS_H int zfs_share_smb(zfs_handle_t *); +_LIBZFS_H int zfs_shareall(zfs_handle_t *); +_LIBZFS_H int zfs_unshare_nfs(zfs_handle_t *, const char *); +_LIBZFS_H int zfs_unshare_smb(zfs_handle_t *, const char *); +_LIBZFS_H int zfs_unshareall_nfs(zfs_handle_t *); +_LIBZFS_H int zfs_unshareall_smb(zfs_handle_t *); +_LIBZFS_H int zfs_unshareall_bypath(zfs_handle_t *, const char *); +_LIBZFS_H int zfs_unshareall_bytype(zfs_handle_t *, const char *, const char *); +_LIBZFS_H int zfs_unshareall(zfs_handle_t *); +_LIBZFS_H int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); +_LIBZFS_H void zfs_commit_nfs_shares(void); +_LIBZFS_H void zfs_commit_smb_shares(void); +_LIBZFS_H void zfs_commit_all_shares(void); +_LIBZFS_H void zfs_commit_shares(const char *); -extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); +_LIBZFS_H int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* * Utility functions to run an external process. 
@@ -806,51 +885,83 @@ extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); #define STDERR_VERBOSE 0x02 #define NO_DEFAULT_PATH 0x04 /* Don't use $PATH to lookup the command */ -int libzfs_run_process(const char *, char **, int flags); -int libzfs_run_process_get_stdout(const char *path, char *argv[], char *env[], - char **lines[], int *lines_cnt); -int libzfs_run_process_get_stdout_nopath(const char *path, char *argv[], - char *env[], char **lines[], int *lines_cnt); +_LIBZFS_H int libzfs_run_process(const char *, char **, int); +_LIBZFS_H int libzfs_run_process_get_stdout(const char *, char *[], char *[], + char **[], int *); +_LIBZFS_H int libzfs_run_process_get_stdout_nopath(const char *, char *[], + char *[], char **[], int *); -void libzfs_free_str_array(char **strs, int count); +_LIBZFS_H void libzfs_free_str_array(char **, int); -int libzfs_envvar_is_set(char *envvar); +_LIBZFS_H int libzfs_envvar_is_set(char *); /* * Utility functions for zfs version */ -extern void zfs_version_userland(char *, int); -extern int zfs_version_kernel(char *, int); -extern int zfs_version_print(void); +_LIBZFS_H void zfs_version_userland(char *, int); +_LIBZFS_H int zfs_version_kernel(char *, int); +_LIBZFS_H int zfs_version_print(void); /* * Given a device or file, determine if it is part of a pool. */ -extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, +_LIBZFS_H int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* * Label manipulation. 
*/ -extern int zpool_clear_label(int); +_LIBZFS_H int zpool_clear_label(int); +_LIBZFS_H int zpool_set_bootenv(zpool_handle_t *, const nvlist_t *); +_LIBZFS_H int zpool_get_bootenv(zpool_handle_t *, nvlist_t **); /* * Management interfaces for SMB ACL files */ -int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); -int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); -int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); -int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); +_LIBZFS_H int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); +_LIBZFS_H int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); +_LIBZFS_H int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); +_LIBZFS_H int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, + char *); /* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ -extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); -extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); +_LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int); +_LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t); +_LIBZFS_H void zpool_disable_datasets_os(zpool_handle_t *, boolean_t); +_LIBZFS_H void zpool_disable_volume_os(const char *); -extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *); +/* + * Parse a features file for -o compatibility + */ +typedef enum { + ZPOOL_COMPATIBILITY_OK, + ZPOOL_COMPATIBILITY_WARNTOKEN, + ZPOOL_COMPATIBILITY_BADTOKEN, + ZPOOL_COMPATIBILITY_BADFILE, + ZPOOL_COMPATIBILITY_NOFILES +} zpool_compat_status_t; + +_LIBZFS_H zpool_compat_status_t zpool_load_compat(const char *, + boolean_t *, char *, size_t); + +#ifdef __FreeBSD__ + +/* + * Attach/detach the given filesystem to/from the given jail. + */ +_LIBZFS_H int zfs_jail(zfs_handle_t *zhp, int jailid, int attach); + +/* + * Set loader options for next boot. 
+ */ +_LIBZFS_H int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, + const char *); + +#endif /* __FreeBSD__ */ #ifdef __cplusplus } diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 74a64d1077..9020d70db3 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -20,14 +20,14 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. */ #ifndef _LIBZFS_CORE_H -#define _LIBZFS_CORE_H +#define _LIBZFS_CORE_H extern __attribute__((visibility("default"))) #include #include @@ -38,88 +38,114 @@ extern "C" { #endif -int libzfs_core_init(void); -void libzfs_core_fini(void); +_LIBZFS_CORE_H int libzfs_core_init(void); +_LIBZFS_CORE_H void libzfs_core_fini(void); + +struct zfs_cmd; +_LIBZFS_CORE_H int lzc_ioctl_fd(int, unsigned long, struct zfs_cmd *); /* - * NB: this type should be kept binary compatible with dmu_objset_type_t. + * NB: this type should be kept binary-compatible with dmu_objset_type_t. 
*/ enum lzc_dataset_type { LZC_DATSET_TYPE_ZFS = 2, LZC_DATSET_TYPE_ZVOL }; -int lzc_remap(const char *fsname); -int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); -int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *, uint8_t *, +_LIBZFS_CORE_H int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *, + uint8_t *, uint_t); +_LIBZFS_CORE_H int lzc_clone(const char *, const char *, nvlist_t *); +_LIBZFS_CORE_H int lzc_promote(const char *, char *, int); +_LIBZFS_CORE_H int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); +_LIBZFS_CORE_H int lzc_bookmark(nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_get_bookmark_props(const char *, nvlist_t **); +_LIBZFS_CORE_H int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t); +_LIBZFS_CORE_H int lzc_unload_key(const char *); +_LIBZFS_CORE_H int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t); -int lzc_clone(const char *, const char *, nvlist_t *); -int lzc_promote(const char *, char *, int); -int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); -int lzc_bookmark(nvlist_t *, nvlist_t **); -int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); -int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); -int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t); -int lzc_unload_key(const char *); -int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t); -int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, - nvlist_t **); -int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t, +_LIBZFS_CORE_H int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t, + nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int 
lzc_redact(const char *, const char *, nvlist_t *); -int lzc_snaprange_space(const char *, const char *, uint64_t *); +_LIBZFS_CORE_H int lzc_snaprange_space(const char *, const char *, uint64_t *); -int lzc_hold(nvlist_t *, int, nvlist_t **); -int lzc_release(nvlist_t *, nvlist_t **); -int lzc_get_holds(const char *, nvlist_t **); +_LIBZFS_CORE_H int lzc_hold(nvlist_t *, int, nvlist_t **); +_LIBZFS_CORE_H int lzc_release(nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_get_holds(const char *, nvlist_t **); enum lzc_send_flags { LZC_SEND_FLAG_EMBED_DATA = 1 << 0, LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1, LZC_SEND_FLAG_COMPRESS = 1 << 2, LZC_SEND_FLAG_RAW = 1 << 3, + LZC_SEND_FLAG_SAVED = 1 << 4, }; -int lzc_send(const char *, const char *, int, enum lzc_send_flags); -int lzc_send_resume(const char *, const char *, int, +_LIBZFS_CORE_H int lzc_send(const char *, const char *, int, + enum lzc_send_flags); +_LIBZFS_CORE_H int lzc_send_resume(const char *, const char *, int, enum lzc_send_flags, uint64_t, uint64_t); -int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); +_LIBZFS_CORE_H int lzc_send_space(const char *, const char *, + enum lzc_send_flags, uint64_t *); struct dmu_replay_record; -int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, boolean_t, - int); -int lzc_receive_resumable(const char *, nvlist_t *, const char *, boolean_t, - boolean_t, int); -int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t, - boolean_t, boolean_t, int, const struct dmu_replay_record *); -int lzc_receive_one(const char *, nvlist_t *, const char *, boolean_t, - boolean_t, boolean_t, int, const struct dmu_replay_record *, int, +_LIBZFS_CORE_H int lzc_send_redacted(const char *, const char *, int, + enum lzc_send_flags, const char *); +_LIBZFS_CORE_H int lzc_send_resume_redacted(const char *, const char *, int, + enum lzc_send_flags, uint64_t, uint64_t, const char *); +_LIBZFS_CORE_H int lzc_receive(const char *, nvlist_t 
*, const char *, + boolean_t, boolean_t, int); +_LIBZFS_CORE_H int lzc_receive_resumable(const char *, nvlist_t *, const char *, + boolean_t, boolean_t, int); +_LIBZFS_CORE_H int lzc_receive_with_header(const char *, nvlist_t *, + const char *, boolean_t, boolean_t, boolean_t, int, + const struct dmu_replay_record *); +_LIBZFS_CORE_H int lzc_receive_one(const char *, nvlist_t *, const char *, + boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); -int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *, - uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, - const struct dmu_replay_record *, int, uint64_t *, uint64_t *, - uint64_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_receive_with_cmdprops(const char *, nvlist_t *, + nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t, + boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, + uint64_t *, uint64_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_send_space(const char *, const char *, + enum lzc_send_flags, uint64_t *); +_LIBZFS_CORE_H int lzc_send_space_resume_redacted(const char *, const char *, + enum lzc_send_flags, uint64_t, uint64_t, uint64_t, const char *, + int, uint64_t *); +_LIBZFS_CORE_H uint64_t lzc_send_progress(int); -boolean_t lzc_exists(const char *); +_LIBZFS_CORE_H boolean_t lzc_exists(const char *); -int lzc_rollback(const char *, char *, int); -int lzc_rollback_to(const char *, const char *); +_LIBZFS_CORE_H int lzc_rollback(const char *, char *, int); +_LIBZFS_CORE_H int lzc_rollback_to(const char *, const char *); -int lzc_rename(const char *, const char *); -int lzc_destroy(const char *); +_LIBZFS_CORE_H int lzc_rename(const char *, const char *); +_LIBZFS_CORE_H int lzc_destroy(const char *); -int lzc_channel_program(const char *, const char *, uint64_t, - uint64_t, nvlist_t *, nvlist_t **); -int lzc_channel_program_nosync(const char *, const char *, uint64_t, 
+_LIBZFS_CORE_H int lzc_channel_program(const char *, const char *, uint64_t, uint64_t, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_channel_program_nosync(const char *, const char *, + uint64_t, uint64_t, nvlist_t *, nvlist_t **); -int lzc_sync(const char *, nvlist_t *, nvlist_t **); -int lzc_reopen(const char *, boolean_t); +_LIBZFS_CORE_H int lzc_sync(const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_reopen(const char *, boolean_t); -int lzc_pool_checkpoint(const char *); -int lzc_pool_checkpoint_discard(const char *); +_LIBZFS_CORE_H int lzc_pool_checkpoint(const char *); +_LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *); +_LIBZFS_CORE_H int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); +_LIBZFS_CORE_H int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, + boolean_t *); +_LIBZFS_CORE_H int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); + +_LIBZFS_CORE_H int lzc_set_bootenv(const char *, const nvlist_t *); +_LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **); #ifdef __cplusplus } #endif diff --git a/include/libzfs_impl.h b/include/libzfs_impl.h deleted file mode 100644 index 9a46b9f129..0000000000 --- a/include/libzfs_impl.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * CDDL HEADER SART - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2018 Datto Inc. - */ - -#ifndef _LIBZFS_IMPL_H -#define _LIBZFS_IMPL_H - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct libzfs_handle { - int libzfs_error; - int libzfs_fd; - FILE *libzfs_mnttab; - FILE *libzfs_sharetab; - zpool_handle_t *libzfs_pool_handles; - uu_avl_pool_t *libzfs_ns_avlpool; - uu_avl_t *libzfs_ns_avl; - uint64_t libzfs_ns_gen; - int libzfs_desc_active; - char libzfs_action[1024]; - char libzfs_desc[1024]; - int libzfs_printerr; - int libzfs_storeerr; /* stuff error messages into buffer */ - void *libzfs_sharehdl; /* libshare handle */ - uint_t libzfs_shareflags; - boolean_t libzfs_mnttab_enable; - /* - * We need a lock to handle the case where parallel mount - * threads are populating the mnttab cache simultaneously. The - * lock only protects the integrity of the avl tree, and does - * not protect the contents of the mnttab entries themselves. 
- */ - pthread_mutex_t libzfs_mnttab_cache_lock; - avl_tree_t libzfs_mnttab_cache; - int libzfs_pool_iter; - char libzfs_chassis_id[256]; - boolean_t libzfs_prop_debug; -}; - -#define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */ - -struct zfs_handle { - libzfs_handle_t *zfs_hdl; - zpool_handle_t *zpool_hdl; - char zfs_name[ZFS_MAX_DATASET_NAME_LEN]; - zfs_type_t zfs_type; /* type including snapshot */ - zfs_type_t zfs_head_type; /* type excluding snapshot */ - dmu_objset_stats_t zfs_dmustats; - nvlist_t *zfs_props; - nvlist_t *zfs_user_props; - nvlist_t *zfs_recvd_props; - boolean_t zfs_mntcheck; - char *zfs_mntopts; - uint8_t *zfs_props_table; -}; - -/* - * This is different from checking zfs_type, because it will also catch - * snapshots of volumes. - */ -#define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) - -struct zpool_handle { - libzfs_handle_t *zpool_hdl; - zpool_handle_t *zpool_next; - char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; - int zpool_state; - size_t zpool_config_size; - nvlist_t *zpool_config; - nvlist_t *zpool_old_config; - nvlist_t *zpool_props; - diskaddr_t zpool_start_block; -}; - -typedef enum { - PROTO_NFS = 0, - PROTO_SMB = 1, - PROTO_END = 2 -} zfs_share_proto_t; - -/* - * The following can be used as a bitmask and any new values - * added must preserve that capability. 
- */ -typedef enum { - SHARED_NOT_SHARED = 0x0, - SHARED_NFS = 0x2, - SHARED_SMB = 0x4 -} zfs_share_type_t; - -#define CONFIG_BUF_MINSIZE 262144 - -int zfs_error(libzfs_handle_t *, int, const char *); -int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...); -void zfs_error_aux(libzfs_handle_t *, const char *, ...); -void *zfs_alloc(libzfs_handle_t *, size_t); -void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); -char *zfs_asprintf(libzfs_handle_t *, const char *, ...); -char *zfs_strdup(libzfs_handle_t *, const char *); -int no_memory(libzfs_handle_t *); - -int zfs_standard_error(libzfs_handle_t *, int, const char *); -int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); -int zpool_standard_error(libzfs_handle_t *, int, const char *); -int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); - -zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); -zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); - -int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, - nvlist_t *, char **, uint64_t *, const char *); -int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, - zfs_type_t type); - -/* - * Use this changelist_gather() flag to force attempting mounts - * on each change node regardless of whether or not it is currently - * mounted. 
- */ -#define CL_GATHER_MOUNT_ALWAYS 1 -/* - * changelist_gather() flag to force it to iterate on mounted datasets only - */ -#define CL_GATHER_ITER_MOUNTED 2 - -typedef struct prop_changelist prop_changelist_t; - -int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); -int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); -int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); -int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); -int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); -void zcmd_free_nvlists(zfs_cmd_t *); - -int changelist_prefix(prop_changelist_t *); -int changelist_postfix(prop_changelist_t *); -void changelist_rename(prop_changelist_t *, const char *, const char *); -void changelist_remove(prop_changelist_t *, const char *); -void changelist_free(prop_changelist_t *); -prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int); -int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); -int changelist_haszonedchild(prop_changelist_t *); - -void remove_mountpoint(zfs_handle_t *); -int create_parents(libzfs_handle_t *, char *, int); -boolean_t isa_child_of(const char *dataset, const char *parent); - -zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); -zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *, - nvlist_t *props); - -int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); - -boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); - -int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, - boolean_t modifying); - -void namespace_clear(libzfs_handle_t *); - -/* - * libshare (sharemgr) interfaces used internally. 
- */ - -extern int zfs_init_libshare(libzfs_handle_t *, int); -extern void zfs_uninit_libshare(libzfs_handle_t *); -extern int zfs_parse_options(char *, zfs_share_proto_t); - -extern int zfs_unshare_proto(zfs_handle_t *, - const char *, zfs_share_proto_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _LIBZFS_IMPL_H */ diff --git a/include/libzfsbootenv.h b/include/libzfsbootenv.h new file mode 100644 index 0000000000..cbc8751dc5 --- /dev/null +++ b/include/libzfsbootenv.h @@ -0,0 +1,43 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Toomas Soome + */ + +#ifndef _LIBZFSBOOTENV_H +#define _LIBZFSBOOTENV_H extern __attribute__((visibility("default"))) + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum lzbe_flags { + lzbe_add, /* add data to existing nvlist */ + lzbe_replace /* replace current nvlist */ +} lzbe_flags_t; + +_LIBZFSBOOTENV_H int lzbe_nvlist_get(const char *, const char *, void **); +_LIBZFSBOOTENV_H int lzbe_nvlist_set(const char *, const char *, void *); +_LIBZFSBOOTENV_H void lzbe_nvlist_free(void *); +_LIBZFSBOOTENV_H int lzbe_add_pair(void *, const char *, const char *, void *, + size_t); +_LIBZFSBOOTENV_H int lzbe_remove_pair(void *, const char *); +_LIBZFSBOOTENV_H int lzbe_set_boot_device(const char *, lzbe_flags_t, + const char *); +_LIBZFSBOOTENV_H int lzbe_get_boot_device(const char *, char **); +_LIBZFSBOOTENV_H int lzbe_bootenv_print(const char *, const char *, FILE *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFSBOOTENV_H */ diff --git a/include/libzutil.h b/include/libzutil.h index 69d1e6bbd6..c0a660ea70 100644 --- 
a/include/libzutil.h +++ b/include/libzutil.h @@ -24,7 +24,7 @@ */ #ifndef _LIBZUTIL_H -#define _LIBZUTIL_H +#define _LIBZUTIL_H extern __attribute__((visibility("default"))) #include #include @@ -56,8 +56,8 @@ typedef const struct pool_config_ops { /* * An instance of pool_config_ops_t is expected in the caller's binary. */ -extern const pool_config_ops_t libzfs_config_ops; -extern const pool_config_ops_t libzpool_config_ops; +_LIBZUTIL_H const pool_config_ops_t libzfs_config_ops; +_LIBZUTIL_H const pool_config_ops_t libzpool_config_ops; typedef struct importargs { char **path; /* a list of paths to search */ @@ -70,26 +70,21 @@ typedef struct importargs { nvlist_t *policy; /* load policy (max txg, rewind, etc.) */ } importargs_t; -extern nvlist_t *zpool_search_import(void *, importargs_t *, - const pool_config_ops_t *); -extern int zpool_find_config(void *, const char *, nvlist_t **, importargs_t *, +_LIBZUTIL_H nvlist_t *zpool_search_import(void *, importargs_t *, const pool_config_ops_t *); +_LIBZUTIL_H int zpool_find_config(void *, const char *, nvlist_t **, + importargs_t *, const pool_config_ops_t *); -extern const char * const * zpool_default_search_paths(size_t *count); -extern int zpool_read_label(int, nvlist_t **, int *); -extern int zpool_label_disk_wait(const char *, int); +_LIBZUTIL_H const char * const * zpool_default_search_paths(size_t *count); +_LIBZUTIL_H int zpool_read_label(int, nvlist_t **, int *); +_LIBZUTIL_H int zpool_label_disk_wait(const char *, int); -#ifdef HAVE_LIBUDEV struct udev_device; -extern int zfs_device_get_devid(struct udev_device *, char *, size_t); -extern int zfs_device_get_physical(struct udev_device *, char *, size_t); -#else -#define zfs_device_get_devid(dev, bufptr, buflen) (ENODATA) -#define zfs_device_get_physical(dev, bufptr, buflen) (ENODATA) -#endif +_LIBZUTIL_H int zfs_device_get_devid(struct udev_device *, char *, size_t); +_LIBZUTIL_H int zfs_device_get_physical(struct udev_device *, char *, size_t); -extern 
void update_vdev_config_dev_strs(nvlist_t *); +_LIBZUTIL_H void update_vdev_config_dev_strs(nvlist_t *); /* * Default device paths @@ -98,24 +93,24 @@ extern void update_vdev_config_dev_strs(nvlist_t *); #define UDISK_ROOT "/dev/disk" #define ZVOL_ROOT "/dev/zvol" -extern int zfs_append_partition(char *path, size_t max_len); -extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); +_LIBZUTIL_H int zfs_append_partition(char *path, size_t max_len); +_LIBZUTIL_H int zfs_resolve_shortname(const char *name, char *path, + size_t pathlen); -extern char *zfs_strip_partition(char *); -extern char *zfs_strip_partition_path(char *); +_LIBZUTIL_H char *zfs_strip_partition(char *); +_LIBZUTIL_H char *zfs_strip_path(char *); -extern int zfs_strcmp_pathname(const char *, const char *, int); +_LIBZUTIL_H int zfs_strcmp_pathname(const char *, const char *, int); -extern int zfs_dev_is_dm(const char *); -extern int zfs_dev_is_whole_disk(const char *); -extern char *zfs_get_underlying_path(const char *); -extern char *zfs_get_enclosure_sysfs_path(const char *); +_LIBZUTIL_H boolean_t zfs_dev_is_dm(const char *); +_LIBZUTIL_H boolean_t zfs_dev_is_whole_disk(const char *); +_LIBZUTIL_H int zfs_dev_flush(int); +_LIBZUTIL_H char *zfs_get_underlying_path(const char *); +_LIBZUTIL_H char *zfs_get_enclosure_sysfs_path(const char *); -#ifdef HAVE_LIBUDEV -extern boolean_t is_mpath_whole_disk(const char *); -#else -#define is_mpath_whole_disk(path) (B_FALSE) -#endif +_LIBZUTIL_H boolean_t is_mpath_whole_disk(const char *); + +_LIBZUTIL_H boolean_t zfs_isnumber(const char *); /* * Formats for iostat numbers. Examples: "12K", "30ms", "4B", "2321234", "-". @@ -137,18 +132,46 @@ enum zfs_nicenum_format { /* * Convert a number to a human-readable form. 
*/ -extern void zfs_nicebytes(uint64_t, char *, size_t); -extern void zfs_nicenum(uint64_t, char *, size_t); -extern void zfs_nicenum_format(uint64_t, char *, size_t, +_LIBZUTIL_H void zfs_nicebytes(uint64_t, char *, size_t); +_LIBZUTIL_H void zfs_nicenum(uint64_t, char *, size_t); +_LIBZUTIL_H void zfs_nicenum_format(uint64_t, char *, size_t, enum zfs_nicenum_format); -extern void zfs_nicetime(uint64_t, char *, size_t); +_LIBZUTIL_H void zfs_nicetime(uint64_t, char *, size_t); +_LIBZUTIL_H void zfs_niceraw(uint64_t, char *, size_t); #define nicenum(num, buf, size) zfs_nicenum(num, buf, size) -extern void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); -extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, +_LIBZUTIL_H void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); +_LIBZUTIL_H int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); +struct zfs_cmd; + +/* + * List of colors to use + */ +#define ANSI_RED "\033[0;31m" +#define ANSI_YELLOW "\033[0;33m" +#define ANSI_RESET "\033[0m" +#define ANSI_BOLD "\033[1m" + +_LIBZUTIL_H void color_start(char *color); +_LIBZUTIL_H void color_end(void); +_LIBZUTIL_H int printf_color(char *color, char *format, ...); + +_LIBZUTIL_H const char *zfs_basename(const char *path); +_LIBZUTIL_H ssize_t zfs_dirnamelen(const char *path); + +/* + * These functions are used by the ZFS libraries and cmd/zpool code, but are + * not exported in the ABI. 
+ */ +typedef int (*pool_vdev_iter_f)(void *, nvlist_t *, void *); +int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, + void *data); +int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, + void *data); +void update_vdevs_config_dev_sysfs_path(nvlist_t *config); #ifdef __cplusplus } #endif diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am deleted file mode 100644 index efb49520e6..0000000000 --- a/include/linux/Makefile.am +++ /dev/null @@ -1,28 +0,0 @@ -COMMON_H = - -KERNEL_H = \ - $(top_srcdir)/include/linux/dcache_compat.h \ - $(top_srcdir)/include/linux/xattr_compat.h \ - $(top_srcdir)/include/linux/vfs_compat.h \ - $(top_srcdir)/include/linux/blkdev_compat.h \ - $(top_srcdir)/include/linux/utsname_compat.h \ - $(top_srcdir)/include/linux/kmap_compat.h \ - $(top_srcdir)/include/linux/simd_x86.h \ - $(top_srcdir)/include/linux/simd_aarch64.h \ - $(top_srcdir)/include/linux/mod_compat.h \ - $(top_srcdir)/include/linux/page_compat.h \ - $(top_srcdir)/include/linux/compiler_compat.h - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) - -if CONFIG_USER -libzfsdir = $(includedir)/libzfs/linux -libzfs_HEADERS = $(COMMON_H) $(USER_H) -endif - -if CONFIG_KERNEL -kerneldir = @prefix@/src/zfs-$(VERSION)/include/linux -kernel_HEADERS = $(COMMON_H) $(KERNEL_H) -endif diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h deleted file mode 100644 index 12cd746778..0000000000 --- a/include/linux/simd_x86.h +++ /dev/null @@ -1,733 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (C) 2016 Gvozden Neskovic . - */ - -/* - * USER API: - * - * Kernel fpu methods: - * kfpu_begin() - * kfpu_end() - * - * SIMD support: - * - * Following functions should be called to determine whether CPU feature - * is supported. All functions are usable in kernel and user space. - * If a SIMD algorithm is using more than one instruction set - * all relevant feature test functions should be called. - * - * Supported features: - * zfs_sse_available() - * zfs_sse2_available() - * zfs_sse3_available() - * zfs_ssse3_available() - * zfs_sse4_1_available() - * zfs_sse4_2_available() - * - * zfs_avx_available() - * zfs_avx2_available() - * - * zfs_bmi1_available() - * zfs_bmi2_available() - * - * zfs_avx512f_available() - * zfs_avx512cd_available() - * zfs_avx512er_available() - * zfs_avx512pf_available() - * zfs_avx512bw_available() - * zfs_avx512dq_available() - * zfs_avx512vl_available() - * zfs_avx512ifma_available() - * zfs_avx512vbmi_available() - * - * NOTE(AVX-512VL): If using AVX-512 instructions with 128Bit registers - * also add zfs_avx512vl_available() to feature check. 
- */ - -#ifndef _SIMD_X86_H -#define _SIMD_X86_H - -#include - -/* only for __x86 */ -#if defined(__x86) - -#include - -#if defined(_KERNEL) -#include -#else -#include -#endif - -#if defined(_KERNEL) - -#if defined(HAVE_KERNEL_FPU_API_HEADER) -#include -#include -#else -#include -#include -#endif - -#if defined(HAVE_UNDERSCORE_KERNEL_FPU) -#define kfpu_begin() \ -{ \ - preempt_disable(); \ - __kernel_fpu_begin(); \ -} -#define kfpu_end() \ -{ \ - __kernel_fpu_end(); \ - preempt_enable(); \ -} -#elif defined(HAVE_KERNEL_FPU) -#define kfpu_begin() kernel_fpu_begin() -#define kfpu_end() kernel_fpu_end() -#else -/* Kernel doesn't export any kernel_fpu_* functions */ -#include /* For kernel xgetbv() */ -#define kfpu_begin() panic("This code should never run") -#define kfpu_end() panic("This code should never run") -#endif /* defined(HAVE_KERNEL_FPU) */ - -#else -/* - * fpu dummy methods for userspace - */ -#define kfpu_begin() do {} while (0) -#define kfpu_end() do {} while (0) -#endif /* defined(_KERNEL) */ - -/* - * CPUID feature tests for user-space. Linux kernel provides an interface for - * CPU feature testing. - */ -#if !defined(_KERNEL) - -/* - * x86 registers used implicitly by CPUID - */ -typedef enum cpuid_regs { - EAX = 0, - EBX, - ECX, - EDX, - CPUID_REG_CNT = 4 -} cpuid_regs_t; - -/* - * List of instruction sets identified by CPUID - */ -typedef enum cpuid_inst_sets { - SSE = 0, - SSE2, - SSE3, - SSSE3, - SSE4_1, - SSE4_2, - OSXSAVE, - AVX, - AVX2, - BMI1, - BMI2, - AVX512F, - AVX512CD, - AVX512DQ, - AVX512BW, - AVX512IFMA, - AVX512VBMI, - AVX512PF, - AVX512ER, - AVX512VL, - AES, - PCLMULQDQ -} cpuid_inst_sets_t; - -/* - * Instruction set descriptor. 
- */ -typedef struct cpuid_feature_desc { - uint32_t leaf; /* CPUID leaf */ - uint32_t subleaf; /* CPUID sub-leaf */ - uint32_t flag; /* bit mask of the feature */ - cpuid_regs_t reg; /* which CPUID return register to test */ -} cpuid_feature_desc_t; - -#define _AVX512F_BIT (1U << 16) -#define _AVX512CD_BIT (_AVX512F_BIT | (1U << 28)) -#define _AVX512DQ_BIT (_AVX512F_BIT | (1U << 17)) -#define _AVX512BW_BIT (_AVX512F_BIT | (1U << 30)) -#define _AVX512IFMA_BIT (_AVX512F_BIT | (1U << 21)) -#define _AVX512VBMI_BIT (1U << 1) /* AVX512F_BIT is on another leaf */ -#define _AVX512PF_BIT (_AVX512F_BIT | (1U << 26)) -#define _AVX512ER_BIT (_AVX512F_BIT | (1U << 27)) -#define _AVX512VL_BIT (1U << 31) /* if used also check other levels */ -#define _AES_BIT (1U << 25) -#define _PCLMULQDQ_BIT (1U << 1) - -/* - * Descriptions of supported instruction sets - */ -static const cpuid_feature_desc_t cpuid_features[] = { - [SSE] = {1U, 0U, 1U << 25, EDX }, - [SSE2] = {1U, 0U, 1U << 26, EDX }, - [SSE3] = {1U, 0U, 1U << 0, ECX }, - [SSSE3] = {1U, 0U, 1U << 9, ECX }, - [SSE4_1] = {1U, 0U, 1U << 19, ECX }, - [SSE4_2] = {1U, 0U, 1U << 20, ECX }, - [OSXSAVE] = {1U, 0U, 1U << 27, ECX }, - [AVX] = {1U, 0U, 1U << 28, ECX }, - [AVX2] = {7U, 0U, 1U << 5, EBX }, - [BMI1] = {7U, 0U, 1U << 3, EBX }, - [BMI2] = {7U, 0U, 1U << 8, EBX }, - [AVX512F] = {7U, 0U, _AVX512F_BIT, EBX }, - [AVX512CD] = {7U, 0U, _AVX512CD_BIT, EBX }, - [AVX512DQ] = {7U, 0U, _AVX512DQ_BIT, EBX }, - [AVX512BW] = {7U, 0U, _AVX512BW_BIT, EBX }, - [AVX512IFMA] = {7U, 0U, _AVX512IFMA_BIT, EBX }, - [AVX512VBMI] = {7U, 0U, _AVX512VBMI_BIT, ECX }, - [AVX512PF] = {7U, 0U, _AVX512PF_BIT, EBX }, - [AVX512ER] = {7U, 0U, _AVX512ER_BIT, EBX }, - [AVX512VL] = {7U, 0U, _AVX512ER_BIT, EBX }, - [AES] = {1U, 0U, _AES_BIT, ECX }, - [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, -}; - -/* - * Check if OS supports AVX and AVX2 by checking XCR0 - * Only call this function if CPUID indicates that AVX feature is - * supported by the CPU, otherwise it 
might be an illegal instruction. - */ -static inline uint64_t -xgetbv(uint32_t index) -{ - uint32_t eax, edx; - /* xgetbv - instruction byte code */ - __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" - : "=a" (eax), "=d" (edx) - : "c" (index)); - - return ((((uint64_t)edx)<<32) | (uint64_t)eax); -} - -/* - * Check if CPU supports a feature - */ -static inline boolean_t -__cpuid_check_feature(const cpuid_feature_desc_t *desc) -{ - uint32_t r[CPUID_REG_CNT]; - - if (__get_cpuid_max(0, NULL) >= desc->leaf) { - /* - * __cpuid_count is needed to properly check - * for AVX2. It is a macro, so return parameters - * are passed by value. - */ - __cpuid_count(desc->leaf, desc->subleaf, - r[EAX], r[EBX], r[ECX], r[EDX]); - return ((r[desc->reg] & desc->flag) == desc->flag); - } - return (B_FALSE); -} - -#define CPUID_FEATURE_CHECK(name, id) \ -static inline boolean_t \ -__cpuid_has_ ## name(void) \ -{ \ - return (__cpuid_check_feature(&cpuid_features[id])); \ -} - -/* - * Define functions for user-space CPUID features testing - */ -CPUID_FEATURE_CHECK(sse, SSE); -CPUID_FEATURE_CHECK(sse2, SSE2); -CPUID_FEATURE_CHECK(sse3, SSE3); -CPUID_FEATURE_CHECK(ssse3, SSSE3); -CPUID_FEATURE_CHECK(sse4_1, SSE4_1); -CPUID_FEATURE_CHECK(sse4_2, SSE4_2); -CPUID_FEATURE_CHECK(avx, AVX); -CPUID_FEATURE_CHECK(avx2, AVX2); -CPUID_FEATURE_CHECK(osxsave, OSXSAVE); -CPUID_FEATURE_CHECK(bmi1, BMI1); -CPUID_FEATURE_CHECK(bmi2, BMI2); -CPUID_FEATURE_CHECK(avx512f, AVX512F); -CPUID_FEATURE_CHECK(avx512cd, AVX512CD); -CPUID_FEATURE_CHECK(avx512dq, AVX512DQ); -CPUID_FEATURE_CHECK(avx512bw, AVX512BW); -CPUID_FEATURE_CHECK(avx512ifma, AVX512IFMA); -CPUID_FEATURE_CHECK(avx512vbmi, AVX512VBMI); -CPUID_FEATURE_CHECK(avx512pf, AVX512PF); -CPUID_FEATURE_CHECK(avx512er, AVX512ER); -CPUID_FEATURE_CHECK(avx512vl, AVX512VL); -CPUID_FEATURE_CHECK(aes, AES); -CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); - -#endif /* !defined(_KERNEL) */ - - -/* - * Detect register set support - */ -static inline boolean_t 
-__simd_state_enabled(const uint64_t state) -{ - boolean_t has_osxsave; - uint64_t xcr0; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU) - has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE); -#else - has_osxsave = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_osxsave = __cpuid_has_osxsave(); -#endif - - if (!has_osxsave) - return (B_FALSE); - - xcr0 = xgetbv(0); - return ((xcr0 & state) == state); -} - -#define _XSTATE_SSE_AVX (0x2 | 0x4) -#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) - -#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) -#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) - - -/* - * Check if SSE instruction set is available - */ -static inline boolean_t -zfs_sse_available(void) -{ -#if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_XMM)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_sse()); -#endif -} - -/* - * Check if SSE2 instruction set is available - */ -static inline boolean_t -zfs_sse2_available(void) -{ -#if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_XMM2)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_sse2()); -#endif -} - -/* - * Check if SSE3 instruction set is available - */ -static inline boolean_t -zfs_sse3_available(void) -{ -#if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_XMM3)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_sse3()); -#endif -} - -/* - * Check if SSSE3 instruction set is available - */ -static inline boolean_t -zfs_ssse3_available(void) -{ -#if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_SSSE3)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_ssse3()); -#endif -} - -/* - * Check if SSE4.1 instruction set is available - */ 
-static inline boolean_t -zfs_sse4_1_available(void) -{ -#if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_XMM4_1)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_sse4_1()); -#endif -} - -/* - * Check if SSE4.2 instruction set is available - */ -static inline boolean_t -zfs_sse4_2_available(void) -{ -#if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_XMM4_2)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_sse4_2()); -#endif -} - -/* - * Check if AVX instruction set is available - */ -static inline boolean_t -zfs_avx_available(void) -{ - boolean_t has_avx; -#if defined(_KERNEL) -#if defined(KERNEL_EXPORTS_X86_FPU) - has_avx = !!boot_cpu_has(X86_FEATURE_AVX); -#else - has_avx = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx = __cpuid_has_avx(); -#endif - - return (has_avx && __ymm_enabled()); -} - -/* - * Check if AVX2 instruction set is available - */ -static inline boolean_t -zfs_avx2_available(void) -{ - boolean_t has_avx2; -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2); -#else - has_avx2 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx2 = __cpuid_has_avx2(); -#endif - - return (has_avx2 && __ymm_enabled()); -} - -/* - * Check if BMI1 instruction set is available - */ -static inline boolean_t -zfs_bmi1_available(void) -{ -#if defined(_KERNEL) -#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_BMI1)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_bmi1()); -#endif -} - -/* - * Check if BMI2 instruction set is available - */ -static inline boolean_t -zfs_bmi2_available(void) -{ -#if defined(_KERNEL) -#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_BMI2)); -#else - return 
(B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_bmi2()); -#endif -} - -/* - * Check if AES instruction set is available - */ -static inline boolean_t -zfs_aes_available(void) -{ -#if defined(_KERNEL) -#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_AES)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_aes()); -#endif -} - -/* - * Check if PCLMULQDQ instruction set is available - */ -static inline boolean_t -zfs_pclmulqdq_available(void) -{ -#if defined(_KERNEL) -#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU) - return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ)); -#else - return (B_FALSE); -#endif -#elif !defined(_KERNEL) - return (__cpuid_has_pclmulqdq()); -#endif -} - -/* - * AVX-512 family of instruction sets: - * - * AVX512F Foundation - * AVX512CD Conflict Detection Instructions - * AVX512ER Exponential and Reciprocal Instructions - * AVX512PF Prefetch Instructions - * - * AVX512BW Byte and Word Instructions - * AVX512DQ Double-word and Quadword Instructions - * AVX512VL Vector Length Extensions - * - * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) - * AVX512VBMI Vector Byte Manipulation Instructions - */ - - -/* Check if AVX512F instruction set is available */ -static inline boolean_t -zfs_avx512f_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512f(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512CD instruction set is available */ -static inline boolean_t -zfs_avx512cd_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = 
boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512CD); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512cd(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512ER instruction set is available */ -static inline boolean_t -zfs_avx512er_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512ER); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512er(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512PF instruction set is available */ -static inline boolean_t -zfs_avx512pf_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512PF); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512pf(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512BW instruction set is available */ -static inline boolean_t -zfs_avx512bw_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512BW); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512bw(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512DQ instruction set is available */ -static inline boolean_t -zfs_avx512dq_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && - 
boot_cpu_has(X86_FEATURE_AVX512DQ); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512dq(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512VL instruction set is available */ -static inline boolean_t -zfs_avx512vl_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512vl(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512IFMA instruction set is available */ -static inline boolean_t -zfs_avx512ifma_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512IFMA); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512ifma(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -/* Check if AVX512VBMI instruction set is available */ -static inline boolean_t -zfs_avx512vbmi_available(void) -{ - boolean_t has_avx512 = B_FALSE; - -#if defined(_KERNEL) -#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU) - has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VBMI); -#else - has_avx512 = B_FALSE; -#endif -#elif !defined(_KERNEL) - has_avx512 = __cpuid_has_avx512f() && - __cpuid_has_avx512vbmi(); -#endif - - return (has_avx512 && __zmm_enabled()); -} - -#endif /* defined(__x86) */ - -#endif /* _SIMD_X86_H */ diff --git a/include/os/Makefile.am b/include/os/Makefile.am new file mode 100644 index 0000000000..7eab1abde9 --- /dev/null +++ b/include/os/Makefile.am @@ -0,0 +1,6 @@ +if BUILD_LINUX +SUBDIRS = linux +endif +if BUILD_FREEBSD 
+SUBDIRS = freebsd +endif diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am new file mode 100644 index 0000000000..3c87d4a0e7 --- /dev/null +++ b/include/os/freebsd/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = linux spl zfs diff --git a/include/os/freebsd/linux/Makefile.am b/include/os/freebsd/linux/Makefile.am new file mode 100644 index 0000000000..00cff7f5dc --- /dev/null +++ b/include/os/freebsd/linux/Makefile.am @@ -0,0 +1,5 @@ +KERNEL_H = \ + compiler.h \ + types.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h new file mode 100644 index 0000000000..20903717b5 --- /dev/null +++ b/include/os/freebsd/linux/compiler.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iXsystems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. + * Copyright (c) 2015 François Tigeot + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef _LINUX_COMPILER_H_ +#define _LINUX_COMPILER_H_ + +#include + +#define __user +#define __kernel +#define __safe +#define __force +#define __nocast +#define __iomem +#define __chk_user_ptr(x) ((void)0) +#define __chk_io_ptr(x) ((void)0) +#define __builtin_warning(x, y...) (1) +#define __acquires(x) +#define __releases(x) +#define __acquire(x) do { } while (0) +#define __release(x) do { } while (0) +#define __cond_lock(x, c) (c) +#define __bitwise +#define __devinitdata +#define __deprecated +#define __init +#define __initconst +#define __devinit +#define __devexit +#define __exit +#define __rcu +#define __percpu +#define __weak __weak_symbol +#define __malloc +#define ___stringify(...) #__VA_ARGS__ +#define __stringify(...) 
___stringify(__VA_ARGS__) +#define __attribute_const__ __attribute__((__const__)) +#undef __always_inline +#define __always_inline inline +#define noinline __noinline +#define ____cacheline_aligned __aligned(CACHE_LINE_SIZE) +#define fallthrough __attribute__((__fallthrough__)) + +#if !defined(_KERNEL) && !defined(_STANDALONE) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#define typeof(x) __typeof(x) + +#define uninitialized_var(x) x = x +#define __maybe_unused __unused +#define __always_unused __unused +#define __must_check __result_use_check + +#define __printf(a, b) __printflike(a, b) + +#define barrier() __asm__ __volatile__("": : :"memory") +#define smp_rmb() rmb() +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + +#define ACCESS_ONCE(x) (*(volatile __typeof(x) *)&(x)) + +#define WRITE_ONCE(x, v) do { \ + barrier(); \ + ACCESS_ONCE(x) = (v); \ + barrier(); \ +} while (0) + +#define lockless_dereference(p) READ_ONCE(p) + +#define _AT(T, X) ((T)(X)) + +#endif /* _LINUX_COMPILER_H_ */ diff --git a/include/os/freebsd/linux/types.h b/include/os/freebsd/linux/types.h new file mode 100644 index 0000000000..d290317cc0 --- /dev/null +++ b/include/os/freebsd/linux/types.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iXsystems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef _LINUX_TYPES_H_ +#define _LINUX_TYPES_H_ + +#include + + +#ifndef __bitwise__ +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#endif + +typedef uint16_t __le16; +typedef uint16_t __be16; +typedef uint32_t __le32; +typedef uint32_t __be32; +typedef uint64_t __le64; +typedef uint64_t __be64; + +typedef unsigned gfp_t; +typedef off_t loff_t; +typedef vm_paddr_t resource_size_t; +typedef uint16_t __bitwise__ __sum16; +typedef unsigned long pgoff_t; +typedef unsigned __poll_t; + +typedef uint64_t u64; +typedef u64 phys_addr_t; + +typedef size_t __kernel_size_t; + +#define DECLARE_BITMAP(n, bits) \ + unsigned long n[howmany(bits, sizeof (long) * 8)] + +typedef unsigned long irq_hw_number_t; + +struct rcu_head { + void *raw[2]; +} __aligned(sizeof (void *)); + +typedef void (*rcu_callback_t)(struct rcu_head *head); +typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); +typedef int linux_task_fn_t(void *data); + +#endif /* 
_LINUX_TYPES_H_ */ diff --git a/include/os/freebsd/spl/Makefile.am b/include/os/freebsd/spl/Makefile.am new file mode 100644 index 0000000000..b321825cb7 --- /dev/null +++ b/include/os/freebsd/spl/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = acl rpc sys diff --git a/include/os/freebsd/spl/acl/Makefile.am b/include/os/freebsd/spl/acl/Makefile.am new file mode 100644 index 0000000000..5c0698d02e --- /dev/null +++ b/include/os/freebsd/spl/acl/Makefile.am @@ -0,0 +1,4 @@ +KERNEL_H = \ + acl_common.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/spl/acl/acl_common.h b/include/os/freebsd/spl/acl/acl_common.h new file mode 100644 index 0000000000..44f5bed592 --- /dev/null +++ b/include/os/freebsd/spl/acl/acl_common.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#ifndef _ACL_COMMON_H +#define _ACL_COMMON_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; + +extern int acltrivial(const char *); +extern void adjust_ace_pair(ace_t *pair, mode_t mode); +extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t); +extern int ace_trivial_common(void *, int, + uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, + uint32_t *mask)); +#if !defined(_KERNEL) +extern acl_t *acl_alloc(acl_type_t); +extern void acl_free(acl_t *aclp); +extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, + uid_t owner, gid_t group); +#endif /* !_KERNEL */ +int cmp2acls(void *a, void *b); +int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count); +void acl_trivial_access_masks(mode_t mode, boolean_t isdir, + trivial_acl_t *masks); + +#ifdef __cplusplus +} +#endif + +#endif /* _ACL_COMMON_H */ diff --git a/include/os/freebsd/spl/rpc/Makefile.am b/include/os/freebsd/spl/rpc/Makefile.am new file mode 100644 index 0000000000..f6faf4b188 --- /dev/null +++ b/include/os/freebsd/spl/rpc/Makefile.am @@ -0,0 +1,4 @@ +KERNEL_H = \ + xdr.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h new file mode 100644 index 0000000000..c98466e9d1 --- /dev/null +++ b/include/os/freebsd/spl/rpc/xdr.h @@ -0,0 +1,71 @@ +/* + * Sun RPC is a product of Sun Microsystems, Inc. and is provided for + * unrestricted use provided that this legend is included on all tape + * media and as a part of the software program in whole or part. 
Users + * may copy or modify Sun RPC without charge, but are not authorized + * to license or distribute it to anyone else except as part of a product or + * program developed by the user. + * + * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE + * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. + * + * Sun RPC is provided with no support and without any obligation on the + * part of Sun Microsystems, Inc. to assist in its use, correction, + * modification or enhancement. + * + * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE + * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC + * OR ANY PART THEREOF. + * + * In no event will Sun Microsystems, Inc. be liable for any lost revenue + * or profits or other special, indirect and consequential damages, even if + * Sun has been advised of the possibility of such damages. + * + * Sun Microsystems, Inc. + * 2550 Garcia Avenue + * Mountain View, California 94043 + */ + +#ifndef _OPENSOLARIS_RPC_XDR_H_ +#define _OPENSOLARIS_RPC_XDR_H_ + +#include +#include_next + +#if !defined(_KERNEL) && !defined(_STANDALONE) + +#include + +/* + * Taken from sys/xdr/xdr_mem.c. + * + * FreeBSD's userland XDR doesn't implement control method (only the kernel), + * but OpenSolaris nvpair still depend on it, so we have to implement it here. + */ +static __inline bool_t +xdrmem_control(XDR *xdrs, int request, void *info) +{ + xdr_bytesrec *xptr; + + switch (request) { + case XDR_GET_BYTES_AVAIL: + xptr = (xdr_bytesrec *)info; + xptr->xc_is_last_record = TRUE; + xptr->xc_num_avail = xdrs->x_handy; + return (TRUE); + default: + assert(!"unexpected request"); + } + return (FALSE); +} + +#undef XDR_CONTROL +#define XDR_CONTROL(xdrs, req, op) \ + (((xdrs)->x_ops->x_control == NULL) ? 
\ + xdrmem_control((xdrs), (req), (op)) : \ + (*(xdrs)->x_ops->x_control)(xdrs, req, op)) + +#endif /* !_KERNEL && !_STANDALONE */ + +#endif /* !_OPENSOLARIS_RPC_XDR_H_ */ diff --git a/include/os/freebsd/spl/sys/Makefile.am b/include/os/freebsd/spl/sys/Makefile.am new file mode 100644 index 0000000000..232aaf569f --- /dev/null +++ b/include/os/freebsd/spl/sys/Makefile.am @@ -0,0 +1,75 @@ +KERNEL_H = \ + acl_impl.h \ + acl.h \ + atomic.h \ + byteorder.h \ + callb.h \ + ccompat.h \ + ccompile.h \ + cmn_err.h \ + condvar.h \ + cred.h \ + ctype.h \ + debug.h \ + dirent.h \ + disp.h \ + dkio.h \ + extdirent.h \ + fcntl.h \ + file.h \ + freebsd_rwlock.h \ + idmap.h \ + inttypes.h \ + isa_defs.h \ + kmem_cache.h \ + kidmap.h \ + kmem.h \ + kstat.h \ + list_impl.h \ + list.h \ + lock.h \ + Makefile.am \ + misc.h \ + mod_os.h \ + mode.h \ + mount.h \ + mutex.h \ + param.h \ + policy.h \ + proc.h \ + processor.h \ + procfs_list.h \ + random.h \ + rwlock.h \ + sdt.h \ + sid.h \ + sig.h \ + simd_x86.h \ + simd.h \ + spl_condvar.h \ + string.h \ + strings.h \ + sunddi.h \ + sysmacros.h \ + systeminfo.h \ + systm.h \ + taskq.h \ + thread.h \ + time.h \ + timer.h \ + trace_zfs.h \ + trace.h \ + types.h \ + types32.h \ + uio.h \ + uuid.h \ + vfs.h \ + vm.h \ + vmsystm.h \ + vnode_impl.h \ + vnode.h \ + wmsum.h \ + zmod.h \ + zone.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/spl/sys/acl.h b/include/os/freebsd/spl/sys/acl.h new file mode 100644 index 0000000000..ee50b0a183 --- /dev/null +++ b/include/os/freebsd/spl/sys/acl.h @@ -0,0 +1,216 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Garrett D'Amore + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2017 RackTop Systems. + */ + +#ifndef _SYS_ACL_H +#define _SYS_ACL_H + +#include +#include + +/* + * When compiling OpenSolaris kernel code, this file is included instead of the + * FreeBSD one. Include the original sys/acl.h as well. + */ +#undef _SYS_ACL_H +#include_next +#define _SYS_ACL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_ACL_ENTRIES (1024) /* max entries of each type */ +typedef struct { + int a_type; /* the type of ACL entry */ + uid_t a_id; /* the entry in -uid or gid */ + o_mode_t a_perm; /* the permission field */ +} aclent_t; + +typedef struct ace { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ +} ace_t; + +/* + * The following are Defined types for an aclent_t. 
+ */ +#define USER_OBJ (0x01) /* object owner */ +#define USER (0x02) /* additional users */ +#define GROUP_OBJ (0x04) /* owning group of the object */ +#define GROUP (0x08) /* additional groups */ +#define CLASS_OBJ (0x10) /* file group class and mask entry */ +#define OTHER_OBJ (0x20) /* other entry for the object */ +#define ACL_DEFAULT (0x1000) /* default flag */ +/* default object owner */ +#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) +/* default additional users */ +#define DEF_USER (ACL_DEFAULT | USER) +/* default owning group */ +#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) +/* default additional groups */ +#define DEF_GROUP (ACL_DEFAULT | GROUP) +/* default mask entry */ +#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) +/* default other entry */ +#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) + +/* + * The following are defined for ace_t. + */ +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define 
ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ + ACL_DEFAULTED) + +/* + * These are only applicable in a CIFS context. + */ +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) + +#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + +#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ + ACE_READ_NAMED_ATTRS) + +#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS) + +#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE) +/* + * The following flags are supported by both NFSv4 ACLs and ace_t. + */ +#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | \ + ACE_INHERIT_ONLY_ACE | \ + ACE_INHERITED_ACE | \ + ACE_IDENTIFIER_GROUP) + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \ + ACE_IDENTIFIER_GROUP) +#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACE_INHERITED_ACE| \ + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + +/* cmd args to acl(2) for aclent_t */ +#define GETACL 1 +#define SETACL 2 +#define GETACLCNT 3 + +/* cmd's to manipulate ace acls. 
*/ +#define ACE_GETACL 4 +#define ACE_SETACL 5 +#define ACE_GETACLCNT 6 + +/* minimal acl entries from GETACLCNT */ +#define MIN_ACL_ENTRIES 4 + +extern void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp); +extern int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries); +extern void ksort(caddr_t, int, int, int (*)(void *, void *)); +extern int cmp2acls(void *, void *); + +extern int acl(const char *path, int cmd, int cnt, void *buf); +extern int facl(int fd, int cmd, int cnt, void *buf); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_H */ diff --git a/include/os/freebsd/spl/sys/acl_impl.h b/include/os/freebsd/spl/sys/acl_impl.h new file mode 100644 index 0000000000..1efbd6d73b --- /dev/null +++ b/include/os/freebsd/spl/sys/acl_impl.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ACL_IMPL_H +#define _SYS_ACL_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * acl flags + * + * ACL_AUTO_INHERIT, ACL_PROTECTED and ACL_DEFAULTED + * flags can also be stored in this field. 
+ */ +#define ACL_IS_TRIVIAL 0x10000 +#define ACL_IS_DIR 0x20000 + +typedef enum acl_type { + ACLENT_T = 0, + ACE_T = 1 +} zfs_acl_type_t; + +struct acl_info { + zfs_acl_type_t acl_type; /* style of acl */ + int acl_cnt; /* number of acl entries */ + int acl_entry_size; /* sizeof acl entry */ + int acl_flags; /* special flags about acl */ + void *acl_aclp; /* the acl */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/atomic.h b/include/os/freebsd/spl/sys/atomic.h new file mode 100644 index 0000000000..1a68bfc4de --- /dev/null +++ b/include/os/freebsd/spl/sys/atomic.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_ATOMIC_H_ +#define _OPENSOLARIS_SYS_ATOMIC_H_ + +#ifndef _STANDALONE + +#include +#include + +#define atomic_sub_64 atomic_subtract_64 + +#if defined(__i386__) && (defined(_KERNEL) || defined(KLD_MODULE)) +#define I386_HAVE_ATOMIC64 +#endif + +#if defined(__i386__) || defined(__amd64__) || defined(__arm__) +/* No spurious failures from fcmpset. */ +#define STRONG_FCMPSET +#endif + +#if !defined(__LP64__) && !defined(__mips_n32) && \ + !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \ + !defined(HAS_EMULATED_ATOMIC64) +extern void atomic_add_64(volatile uint64_t *target, int64_t delta); +extern void atomic_dec_64(volatile uint64_t *target); +extern uint64_t atomic_swap_64(volatile uint64_t *a, uint64_t value); +extern uint64_t atomic_load_64(volatile uint64_t *a); +extern uint64_t atomic_add_64_nv(volatile uint64_t *target, int64_t delta); +extern uint64_t atomic_cas_64(volatile uint64_t *target, uint64_t cmp, + uint64_t newval); +#endif + +#define membar_producer atomic_thread_fence_rel + +static __inline uint32_t +atomic_add_32_nv(volatile uint32_t *target, int32_t delta) +{ + return (atomic_fetchadd_32(target, delta) + delta); +} + +static __inline uint_t +atomic_add_int_nv(volatile uint_t *target, int delta) +{ + return (atomic_add_32_nv(target, delta)); +} + +static __inline void +atomic_inc_32(volatile uint32_t *target) +{ + atomic_add_32(target, 1); +} + +static __inline 
uint32_t +atomic_inc_32_nv(volatile uint32_t *target) +{ + return (atomic_add_32_nv(target, 1)); +} + +static __inline void +atomic_dec_32(volatile uint32_t *target) +{ + atomic_subtract_32(target, 1); +} + +static __inline uint32_t +atomic_dec_32_nv(volatile uint32_t *target) +{ + return (atomic_add_32_nv(target, -1)); +} + +#ifndef __sparc64__ +static inline uint32_t +atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval) +{ +#ifdef STRONG_FCMPSET + (void) atomic_fcmpset_32(target, &cmp, newval); +#else + uint32_t expected = cmp; + + do { + if (atomic_fcmpset_32(target, &cmp, newval)) + break; + } while (cmp == expected); +#endif + return (cmp); +} +#endif + +#if defined(__LP64__) || defined(__mips_n32) || \ + defined(ARM_HAVE_ATOMIC64) || defined(I386_HAVE_ATOMIC64) || \ + defined(HAS_EMULATED_ATOMIC64) +static __inline void +atomic_dec_64(volatile uint64_t *target) +{ + atomic_subtract_64(target, 1); +} + +static inline uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + return (atomic_fetchadd_64(target, delta) + delta); +} + +#ifndef __sparc64__ +static inline uint64_t +atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval) +{ +#ifdef STRONG_FCMPSET + (void) atomic_fcmpset_64(target, &cmp, newval); +#else + uint64_t expected = cmp; + + do { + if (atomic_fcmpset_64(target, &cmp, newval)) + break; + } while (cmp == expected); +#endif + return (cmp); +} +#endif +#endif + +static __inline void +atomic_inc_64(volatile uint64_t *target) +{ + atomic_add_64(target, 1); +} + +static __inline uint64_t +atomic_inc_64_nv(volatile uint64_t *target) +{ + return (atomic_add_64_nv(target, 1)); +} + +static __inline uint64_t +atomic_dec_64_nv(volatile uint64_t *target) +{ + return (atomic_add_64_nv(target, -1)); +} + +#if !defined(COMPAT_32BIT) && defined(__LP64__) +static __inline void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_64((volatile uint64_t *)target, + 
(uint64_t)cmp, (uint64_t)newval)); +} +#else +static __inline void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_32((volatile uint32_t *)target, + (uint32_t)cmp, (uint32_t)newval)); +} +#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */ + +#else /* _STANDALONE */ +/* + * sometimes atomic_add_64 is defined, sometimes not, but the + * following is always right for the boot loader. + */ +#undef atomic_add_64 +#define atomic_add_64(ptr, val) (*(ptr) += (val)) +#endif /* !_STANDALONE */ + +#endif /* !_OPENSOLARIS_SYS_ATOMIC_H_ */ diff --git a/include/os/freebsd/spl/sys/byteorder.h b/include/os/freebsd/spl/sys/byteorder.h new file mode 100644 index 0000000000..0b3d01eb37 --- /dev/null +++ b/include/os/freebsd/spl/sys/byteorder.h @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _OPENSOLARIS_SYS_BYTEORDER_H_ +#define _OPENSOLARIS_SYS_BYTEORDER_H_ + +#include + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#if BYTE_ORDER == _BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#if !defined(_STANDALONE) +#if BYTE_ORDER == _BIG_ENDIAN +#define htonll(x) BMASK_64(x) +#define ntohll(x) BMASK_64(x) +#else /* BYTE_ORDER == _LITTLE_ENDIAN */ +#ifndef __LP64__ +static __inline__ uint64_t +htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#else /* !__LP64__ */ +#define htonll(x) BSWAP_64(x) +#define ntohll(x) 
BSWAP_64(x) +#endif /* __LP64__ */ +#endif /* BYTE_ORDER */ +#endif /* _STANDALONE */ + +#define BE_IN32(xa) htonl(*((uint32_t *)(void *)(xa))) + +#endif /* _OPENSOLARIS_SYS_BYTEORDER_H_ */ diff --git a/include/os/freebsd/spl/sys/callb.h b/include/os/freebsd/spl/sys/callb.h new file mode 100644 index 0000000000..cc67b0263c --- /dev/null +++ b/include/os/freebsd/spl/sys/callb.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CALLB_H +#define _SYS_CALLB_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * definitions of callback classes (c_class) + * + * Callbacks belong in the same class if (1) their callback routines + * do the same kind of processing (ideally, using the same callback function) + * and (2) they can/should be executed at the same time in a cpr + * suspend/resume operation. + * + * Note: The DAEMON class, in particular, is for stopping kernel threads + * and nothing else. The CALLB_* macros below should be used to deal + * with kernel threads, and the callback function should be callb_generic_cpr. 
+ * Another idiosyncrasy of the DAEMON class is that if a suspend operation + * fails, some of the callback functions may be called with the RESUME + * code which were never called with SUSPEND. Not a problem currently, + * but see bug 4201851. + */ +#define CB_CL_CPR_DAEMON 0 +#define CB_CL_CPR_VM 1 +#define CB_CL_CPR_CALLOUT 2 +#define CB_CL_CPR_OBP 3 +#define CB_CL_CPR_FB 4 +#define CB_CL_PANIC 5 +#define CB_CL_CPR_RPC 6 +#define CB_CL_CPR_PROMPRINTF 7 +#define CB_CL_UADMIN 8 +#define CB_CL_CPR_PM 9 +#define CB_CL_HALT 10 +#define CB_CL_CPR_DMA 11 +#define CB_CL_CPR_POST_USER 12 +#define CB_CL_UADMIN_PRE_VFS 13 +#define CB_CL_MDBOOT CB_CL_UADMIN +#define CB_CL_ENTER_DEBUGGER 14 +#define CB_CL_CPR_POST_KERNEL 15 +#define CB_CL_CPU_DEEP_IDLE 16 +#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */ + +/* + * CB_CL_CPR_DAEMON class specific definitions are given below: + */ + +/* + * code for CPR callb_execute_class + */ +#define CB_CODE_CPR_CHKPT 0 +#define CB_CODE_CPR_RESUME 1 + +typedef void * callb_id_t; +/* + * Per kernel thread structure for CPR daemon callbacks. + * Must be protected by either an existing lock in the daemon or + * a new lock created for such a purpose. + */ +typedef struct callb_cpr { + kmutex_t *cc_lockp; /* lock to protect this struct */ + char cc_events; /* various events for CPR */ + callb_id_t cc_id; /* callb id address */ + kcondvar_t cc_callb_cv; /* cv for callback waiting */ + kcondvar_t cc_stop_cv; /* cv to checkpoint block */ +} callb_cpr_t; + +/* + * cc_events definitions + */ +#define CALLB_CPR_START 1 /* a checkpoint request's started */ +#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */ +#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */ + +/* + * Used when checking that all kernel threads are stopped. 
+ */ +#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */ +#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */ +#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */ + /* due to pwr mgmt of disks, make -- */ + /* big enough for worst spinup time */ + +/* + * + * CALLB_CPR_INIT macro is used by kernel threads to add their entry to + * the callback table and perform other initialization. It automatically + * adds the thread as being in the callback class CB_CL_CPR_DAEMON. + * + * cp - ptr to the callb_cpr_t structure for this kernel thread + * + * lockp - pointer to mutex protecting the callb_cpr_t struct + * + * func - pointer to the callback function for this kernel thread. + * It has the prototype boolean_t (void *arg, int code) + * where: arg - ptr to the callb_cpr_t structure + * code - not used for this type of callback + * returns: B_TRUE if successful; B_FALSE if unsuccessful. + * + * name - a string giving the name of the kernel thread + * + * Note: lockp is the lock to protect the callb_cpr_t (cp) structure + * later on. No lock held is needed for this initialization. + */ +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + strlcpy(curthread->td_name, (name), \ + sizeof (curthread->td_name)); \ + bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \ + (cp)->cc_lockp = lockp; \ + (cp)->cc_id = callb_add(func, (void *)(cp), \ + CB_CL_CPR_DAEMON, name); \ + cv_init(&(cp)->cc_callb_cv, NULL, CV_DEFAULT, NULL); \ + cv_init(&(cp)->cc_stop_cv, NULL, CV_DEFAULT, NULL); \ + } + +#ifndef __lock_lint +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); +#else +#define CALLB_CPR_ASSERT(cp) +#endif +/* + * Some threads (like the idle threads) do not adhere to the callback + * protocol and are always considered safe. Such threads must never exit. + * They register their presence by calling this macro during their + * initialization. 
+ * + * Args: + * t - thread pointer of the client kernel thread + * name - a string giving the name of the kernel thread + */ +#define CALLB_CPR_INIT_SAFE(t, name) { \ + (void) callb_add_thread(callb_generic_cpr_safe, \ + (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \ + name, t); \ + } +/* + * The lock to protect cp's content must be held before + * calling the following two macros. + * + * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END + * is safe for checkpoint/resume. + */ +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + } +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp) \ + while ((cp)->cc_events & CALLB_CPR_START) \ + cv_wait(&(cp)->cc_stop_cv, lockp); \ + (cp)->cc_events &= ~CALLB_CPR_SAFE; \ + } +/* + * cv_destroy is nop right now but may be needed in the future. + */ +#define CALLB_CPR_EXIT(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + mutex_exit((cp)->cc_lockp); \ + (void) callb_delete((cp)->cc_id); \ + cv_destroy(&(cp)->cc_callb_cv); \ + cv_destroy(&(cp)->cc_stop_cv); \ + } + +extern callb_cpr_t callb_cprinfo_safe; +extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *); +extern callb_id_t callb_add_thread(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); +extern int callb_delete(callb_id_t); +extern void callb_execute(callb_id_t, int); +extern void *callb_execute_class(int, int); +extern boolean_t callb_generic_cpr(void *, int); +extern boolean_t callb_generic_cpr_safe(void *, int); +extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *); +extern void callb_lock_table(void); +extern void callb_unlock_table(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CALLB_H */ diff --git a/include/os/freebsd/spl/sys/ccompat.h 
b/include/os/freebsd/spl/sys/ccompat.h new file mode 100644 index 0000000000..59abe921db --- /dev/null +++ b/include/os/freebsd/spl/sys/ccompat.h @@ -0,0 +1,153 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_CCOMPAT_H +#define _SYS_CCOMPAT_H + +#if __FreeBSD_version < 1300051 +#define vm_page_valid(m) (m)->valid = VM_PAGE_BITS_ALL +#define vm_page_do_sunbusy(m) +#define vm_page_none_valid(m) ((m)->valid == 0) +#else +#define vm_page_do_sunbusy(m) vm_page_sunbusy(m) +#endif + +#if __FreeBSD_version < 1300074 +#define VOP_UNLOCK1(x) VOP_UNLOCK(x, 0) +#else +#define VOP_UNLOCK1(x) VOP_UNLOCK(x) +#endif + +#if __FreeBSD_version < 1300064 +#define VN_IS_DOOMED(vp) ((vp)->v_iflag & VI_DOOMED) +#endif + +#if __FreeBSD_version < 1300068 +#define VFS_VOP_VECTOR_REGISTER(x) +#endif + +#if __FreeBSD_version >= 1300076 +#define getnewvnode_reserve_() getnewvnode_reserve() +#else +#define getnewvnode_reserve_() getnewvnode_reserve(1) +#endif + +#if __FreeBSD_version < 1300102 +#define ASSERT_VOP_IN_SEQC(zp) +#define MNTK_FPLOOKUP 0 +#define vn_seqc_write_begin(vp) +#define vn_seqc_write_end(vp) + +#ifndef VFS_SMR_DECLARE +#define VFS_SMR_DECLARE +#endif +#ifndef VFS_SMR_ZONE_SET +#define VFS_SMR_ZONE_SET(zone) +#endif +#endif + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +typedef struct { + volatile int counter; +} atomic_t; + + /* BEGIN CSTYLED */ +#define hlist_for_each(p, head) \ + for (p = (head)->first; p; p = (p)->next) + +#define hlist_entry(ptr, type, field) container_of(ptr, type, field) + +#define container_of(ptr, type, member) \ +({ \ + const __typeof(((type *)0)->member) *__p = (ptr); \ + (type *)((uintptr_t)__p - offsetof(type, member)); \ +}) + /* END CSTYLED */ + +static inline void +hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + n->next = h->first; + if (h->first != NULL) + h->first->pprev = &n->next; + WRITE_ONCE(h->first, n); + n->pprev = &h->first; +} + +static inline void +hlist_del(struct hlist_node *n) +{ + WRITE_ONCE(*(n->pprev), n->next); + if (n->next != NULL) + n->next->pprev = n->pprev; +} + /* BEGIN CSTYLED */ +#define 
READ_ONCE(x) ({ \ + __typeof(x) __var = ({ \ + barrier(); \ + ACCESS_ONCE(x); \ + }); \ + barrier(); \ + __var; \ +}) + +#define HLIST_HEAD_INIT { } +#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT +#define INIT_HLIST_HEAD(head) (head)->first = NULL + +#define INIT_HLIST_NODE(node) \ + do { \ + (node)->next = NULL; \ + (node)->pprev = NULL; \ + } while (0) + +/* END CSTYLED */ +static inline int +atomic_read(const atomic_t *v) +{ + return (READ_ONCE(v->counter)); +} + +static inline int +atomic_inc(atomic_t *v) +{ + return (atomic_fetchadd_int(&v->counter, 1) + 1); +} + +static inline int +atomic_dec(atomic_t *v) +{ + return (atomic_fetchadd_int(&v->counter, -1) - 1); +} +#endif diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h new file mode 100644 index 0000000000..23e6379834 --- /dev/null +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -0,0 +1,193 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_CCOMPILE_H +#define _SYS_CCOMPILE_H + +/* + * This file contains definitions designed to enable different compilers + * to be used harmoniously on Solaris systems. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(INVARIANTS) && !defined(ZFS_DEBUG) +#define ZFS_DEBUG +#undef NDEBUG +#endif + +#define EXPORT_SYMBOL(x) +#define MODULE_AUTHOR(s) +#define MODULE_DESCRIPTION(s) +#define MODULE_LICENSE(s) +#define module_param(a, b, c) +#define module_param_call(a, b, c, d, e) +#define module_param_named(a, b, c, d) +#define MODULE_PARM_DESC(a, b) +#define asm __asm +#ifdef ZFS_DEBUG +#undef NDEBUG +#endif +#if !defined(ZFS_DEBUG) && !defined(NDEBUG) +#define NDEBUG +#endif + +#ifndef EINTEGRITY +#define EINTEGRITY 97 /* EINTEGRITY is new in 13 */ +#endif + +/* + * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD + * equivalents. This gives us more useful error messages from strerror(3). + */ +#define ECKSUM EINTEGRITY +#define EFRAGS ENOSPC + +/* Similar for ENOTACTIVE */ +#define ENOTACTIVE ECANCELED + +#define EREMOTEIO EREMOTE +#define ECHRNG ENXIO +#define ETIME ETIMEDOUT + +#ifndef LOCORE +#ifndef HAVE_RPC_TYPES +typedef int bool_t; +typedef int enum_t; +#endif +#endif + +#ifndef __cplusplus +#define __init +#define __exit +#endif + +#if defined(_KERNEL) || defined(_STANDALONE) +#define param_set_charp(a, b) (0) +#define ATTR_UID AT_UID +#define ATTR_GID AT_GID +#define ATTR_MODE AT_MODE +#define ATTR_XVATTR AT_XVATTR +#define ATTR_CTIME AT_CTIME +#define ATTR_MTIME AT_MTIME +#define ATTR_ATIME AT_ATIME +#if defined(_STANDALONE) +#define vmem_free kmem_free +#define vmem_zalloc kmem_zalloc +#define vmem_alloc kmem_zalloc +#else +#define vmem_free zfs_kmem_free +#define vmem_zalloc(size, flags) zfs_kmem_alloc(size, flags | M_ZERO) +#define vmem_alloc zfs_kmem_alloc +#endif +#define MUTEX_NOLOCKDEP 0 +#define RW_NOLOCKDEP 0 + +#else +#define FALSE 0 +#define TRUE 1 + /* + * XXX We really need to consolidate 
on standard + * error codes in the common code + */ +#define ENOSTR ENOTCONN +#define ENODATA EINVAL + + +#define __BSD_VISIBLE 1 +#ifndef IN_BASE +#define __POSIX_VISIBLE 201808 +#define __XSI_VISIBLE 1000 +#endif +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#define mmap64 mmap +/* Note: this file can be used on linux/macOS when bootstrapping tools. */ +#if defined(__FreeBSD__) +#define open64 open +#define pwrite64 pwrite +#define ftruncate64 ftruncate +#define lseek64 lseek +#define pread64 pread +#define stat64 stat +#define lstat64 lstat +#define statfs64 statfs +#define readdir64 readdir +#define dirent64 dirent +#endif +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. 
For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + ((((type)(x) - 1) | ((type)(align) - 1)) + 1) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define RLIM64_INFINITY RLIM_INFINITY +#ifndef HAVE_ERESTART +#define ERESTART EAGAIN +#endif +#define ABS(a) ((a) < 0 ? -(a) : (a)) + +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CCOMPILE_H */ diff --git a/include/os/freebsd/spl/sys/cmn_err.h b/include/os/freebsd/spl/sys/cmn_err.h new file mode 100644 index 0000000000..ddc2f0049e --- /dev/null +++ b/include/os/freebsd/spl/sys/cmn_err.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CMN_ERR_H +#define _SYS_CMN_ERR_H + +#if !defined(_ASM) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Common error handling severity levels */ + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +#ifndef _ASM + +extern void cmn_err(int, const char *, ...) + __attribute__((format(printf, 2, 3))); + +extern void vzcmn_err(zoneid_t, int, const char *, __va_list) + __attribute__((format(printf, 3, 0))); + +extern void vcmn_err(int, const char *, __va_list) + __attribute__((format(printf, 2, 0))); + +extern void zcmn_err(zoneid_t, int, const char *, ...) + __attribute__((format(printf, 3, 4))); + +extern void vzprintf(zoneid_t, const char *, __va_list) + __attribute__((format(printf, 2, 0))); + +extern void zprintf(zoneid_t, const char *, ...) + __attribute__((format(printf, 2, 3))); + +extern void vuprintf(const char *, __va_list) + __attribute__((format(printf, 1, 0))); + +extern void panic(const char *, ...) 
+ __attribute__((format(printf, 1, 2))); + +#endif /* !_ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CMN_ERR_H */ diff --git a/include/os/freebsd/spl/sys/condvar.h b/include/os/freebsd/spl/sys/condvar.h new file mode 100644 index 0000000000..9b1893bcb8 --- /dev/null +++ b/include/os/freebsd/spl/sys/condvar.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * Copyright (c) 2013 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_CONDVAR_H_ +#define _OPENSOLARIS_SYS_CONDVAR_H_ + +#include +#include + +#include +#include +#include +#include + +/* + * cv_timedwait() is similar to cv_wait() except that it additionally expects + * a timeout value specified in ticks. When woken by cv_signal() or + * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is + * returned. + * + * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks + * interruptibly and can be woken by a signal (EINTR, ERESTART). When + * this occurs 0 is returned. + * + * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait() + * and cv_timedwait_sig() which should be used when waiting for outstanding + * IO to complete. They are responsible for updating the iowait accounting + * when this is supported by the platform. + * + * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution + * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout + * to be specified as a hrtime_t allowing for timeouts of less than a tick. + * + * N.B. The return values differ slightly from the illumos implementation + * which returns the time remaining, instead of 1, when woken. They both + * return -1 on timeout. Consumers which need to know the time remaining + * are responsible for tracking it themselves. 
+ */ + +static __inline sbintime_t +zfs_nstosbt(int64_t _ns) +{ + sbintime_t sb = 0; + +#ifdef KASSERT + KASSERT(_ns >= 0, ("Negative values illegal for nstosbt: %jd", _ns)); +#endif + if (_ns >= SBT_1S) { + sb = (_ns / 1000000000) * SBT_1S; + _ns = _ns % 1000000000; + } + /* 9223372037 = ceil(2^63 / 1000000000) */ + sb += ((_ns * 9223372037ull) + 0x7fffffff) >> 31; + return (sb); +} + + +typedef struct cv kcondvar_t; +#define CALLOUT_FLAG_ABSOLUTE C_ABSOLUTE + +typedef enum { + CV_DEFAULT, + CV_DRIVER +} kcv_type_t; + +#define zfs_cv_init(cv, name, type, arg) do { \ + const char *_name; \ + ASSERT((type) == CV_DEFAULT); \ + for (_name = #cv; *_name != '\0'; _name++) { \ + if (*_name >= 'a' && *_name <= 'z') \ + break; \ + } \ + if (*_name == '\0') \ + _name = #cv; \ + cv_init((cv), _name); \ +} while (0) +#define cv_init(cv, name, type, arg) zfs_cv_init(cv, name, type, arg) + + +static inline int +cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + + return (_cv_wait_sig(cvp, &(mp)->lock_object) == 0); +} + +static inline int +cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) +{ + int rc; + + timo -= ddi_get_lbolt(); + if (timo <= 0) + return (-1); + rc = _cv_timedwait_sbt((cvp), &(mp)->lock_object, \ + tick_sbt * (timo), 0, C_HARDCLOCK); + if (rc == EWOULDBLOCK) + return (-1); + return (1); +} + +static inline int +cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) +{ + int rc; + + timo -= ddi_get_lbolt(); + if (timo <= 0) + return (-1); + rc = _cv_timedwait_sig_sbt(cvp, &(mp)->lock_object, \ + tick_sbt * (timo), 0, C_HARDCLOCK); + if (rc == EWOULDBLOCK) + return (-1); + if (rc == EINTR || rc == ERESTART) + return (0); + + return (1); +} + +#define cv_timedwait_io cv_timedwait +#define cv_timedwait_idle cv_timedwait +#define cv_timedwait_sig_io cv_timedwait_sig +#define cv_wait_io cv_wait +#define cv_wait_io_sig cv_wait_sig +#define cv_wait_idle cv_wait +#define cv_timedwait_io_hires cv_timedwait_hires +#define cv_timedwait_idle_hires 
cv_timedwait_hires + +static inline int +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + hrtime_t hrtime; + int rc; + + ASSERT(tim >= res); + + hrtime = gethrtime(); + if (flag == 0) + tim += hrtime; + + if (hrtime >= tim) + return (-1); + rc = cv_timedwait_sbt(cvp, mp, zfs_nstosbt(tim), + zfs_nstosbt(res), C_ABSOLUTE); + + if (rc == EWOULDBLOCK) + return (-1); + + KASSERT(rc == 0, ("unexpected rc value %d", rc)); + return (1); +} + +static inline int +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + sbintime_t sbt; + hrtime_t hrtime; + int rc; + + ASSERT(tim >= res); + + hrtime = gethrtime(); + if (flag == 0) + tim += hrtime; + + if (hrtime >= tim) + return (-1); + + sbt = zfs_nstosbt(tim); + rc = cv_timedwait_sig_sbt(cvp, mp, sbt, zfs_nstosbt(res), C_ABSOLUTE); + + switch (rc) { + case EWOULDBLOCK: + return (-1); + case EINTR: + case ERESTART: + return (0); + default: + KASSERT(rc == 0, ("unexpected rc value %d", rc)); + return (1); + } +} + +#endif /* _OPENSOLARIS_SYS_CONDVAR_H_ */ diff --git a/include/os/freebsd/spl/sys/cred.h b/include/os/freebsd/spl/sys/cred.h new file mode 100644 index 0000000000..86f79011d6 --- /dev/null +++ b/include/os/freebsd/spl/sys/cred.h @@ -0,0 +1,188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _SYS_CRED_H +#define _SYS_CRED_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The credential is an opaque kernel private data structure defined in + * <sys/cred_impl.h>. + */ + +typedef struct ucred cred_t; + +#define CRED() curthread->td_ucred +#define kcred (thread0.td_ucred) + +#define KUID_TO_SUID(x) (x) +#define KGID_TO_SGID(x) (x) +#define crgetuid(cred) ((cred)->cr_uid) +#define crgetruid(cred) ((cred)->cr_ruid) +#define crgetgid(cred) ((cred)->cr_gid) +#define crgetgroups(cred) ((cred)->cr_groups) +#define crgetngroups(cred) ((cred)->cr_ngroups) +#define crgetsid(cred, i) (NULL) + +struct proc; /* cred.h is included in proc.h */ +struct prcred; +struct ksid; +struct ksidlist; +struct credklpd; +struct credgrp; + +struct auditinfo_addr; /* cred.h is included in audit.h */ + +extern int ngroups_max; +/* + * kcred is used when you need all privileges. 
+ */ + +extern void cred_init(void); +extern void crfree(cred_t *); +extern cred_t *cralloc(void); /* all but ref uninitialized */ +extern cred_t *cralloc_ksid(void); /* cralloc() + ksid alloc'ed */ +extern cred_t *crget(void); /* initialized */ +extern void crcopy_to(cred_t *, cred_t *); +extern cred_t *crdup(cred_t *); +extern void crdup_to(cred_t *, cred_t *); +extern cred_t *crgetcred(void); +extern void crset(struct proc *, cred_t *); +extern void crset_zone_privall(cred_t *); +extern int supgroupmember(gid_t, const cred_t *); +extern int hasprocperm(const cred_t *, const cred_t *); +extern int prochasprocperm(struct proc *, struct proc *, const cred_t *); +extern int crcmp(const cred_t *, const cred_t *); +extern cred_t *zone_kcred(void); + +extern gid_t crgetrgid(const cred_t *); +extern gid_t crgetsgid(const cred_t *); + +#define crgetzoneid(cr) ((cr)->cr_prison->pr_id) +extern projid_t crgetprojid(const cred_t *); + +extern cred_t *crgetmapped(const cred_t *); + + +extern const struct auditinfo_addr *crgetauinfo(const cred_t *); +extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); + +extern uint_t crgetref(const cred_t *); + +extern const gid_t *crgetggroups(const struct credgrp *); + + +/* + * Sets real, effective and/or saved uid/gid; + * -1 argument accepted as "no change". + */ +extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t); +extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); + +/* + * Sets real, effective and saved uids/gids all to the same + * values. Both values must be non-negative and <= MAXUID + */ +extern int crsetugid(cred_t *, uid_t, gid_t); + +/* + * Functions to handle the supplemental group list. + */ +extern struct credgrp *crgrpcopyin(int, gid_t *); +extern void crgrprele(struct credgrp *); +extern void crsetcredgrp(cred_t *, struct credgrp *); + +/* + * Private interface for setting zone association of credential. 
+ */ +struct zone; +extern void crsetzone(cred_t *, struct zone *); +extern struct zone *crgetzone(const cred_t *); + +/* + * Private interface for setting project id in credential. + */ +extern void crsetprojid(cred_t *, projid_t); + +/* + * Private interface for nfs. + */ +extern cred_t *crnetadjust(cred_t *); + +/* + * Private interface for procfs. + */ +extern void cred2prcred(const cred_t *, struct prcred *); + +/* + * Private interfaces for Rampart Trusted Solaris. + */ +struct ts_label_s; +extern struct ts_label_s *crgetlabel(const cred_t *); +extern boolean_t crisremote(const cred_t *); + +/* + * Private interfaces for ephemeral uids. + */ +#define VALID_UID(id, zn) \ + ((id) <= MAXUID || valid_ephemeral_uid((zn), (id))) + +#define VALID_GID(id, zn) \ + ((id) <= MAXUID || valid_ephemeral_gid((zn), (id))) + +extern boolean_t valid_ephemeral_uid(struct zone *, uid_t); +extern boolean_t valid_ephemeral_gid(struct zone *, gid_t); + +extern int eph_uid_alloc(struct zone *, int, uid_t *, int); +extern int eph_gid_alloc(struct zone *, int, gid_t *, int); + +extern void crsetsid(cred_t *, struct ksid *, int); +extern void crsetsidlist(cred_t *, struct ksidlist *); + +extern struct ksidlist *crgetsidlist(const cred_t *); + +extern int crsetpriv(cred_t *, ...); + +extern struct credklpd *crgetcrklpd(const cred_t *); +extern void crsetcrklpd(cred_t *, struct credklpd *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRED_H */ diff --git a/include/os/freebsd/spl/sys/ctype.h b/include/os/freebsd/spl/sys/ctype.h new file mode 100644 index 0000000000..f225858072 --- /dev/null +++ b/include/os/freebsd/spl/sys/ctype.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _SPL_SYS_CTYPE_H_ +#define _SPL_SYS_CTYPE_H_ +#include_next + +#define isalnum(ch) (isalpha(ch) || isdigit(ch)) +#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f) +#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) +/* BEGIN CSTYLED */ +#define ispunct(C) \ + (((C) >= 0x21 && (C) <= 0x2F) || \ + ((C) >= 0x3A && (C) <= 0x40) || \ + ((C) >= 0x5B && (C) <= 0x60) || \ + ((C) >= 0x7B && (C) <= 0x7E)) +/* END CSTYLED */ + +#endif diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h new file mode 100644 index 0000000000..1f820bc334 --- /dev/null +++ b/include/os/freebsd/spl/sys/debug.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Available Solaris debug functions. All of the ASSERT() macros will be + * compiled out when NDEBUG is defined, this is the default behavior for + * the SPL. To enable assertions use the --enable-debug with configure. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERT3B() - Assert boolean X OP Y is true, if not panic. + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. 
+ * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3B() - Verify boolean X OP Y is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + + +/* + * Common DEBUG functionality. + */ +int spl_panic(const char *file, const char *func, int line, + const char *fmt, ...); +void spl_dumpstack(void); + +#ifndef expect +#define expect(expr, value) (__builtin_expect((expr), (value))) +#endif +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + +/* BEGIN CSTYLED */ +#define PANIC(fmt, a...) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "%s", "VERIFY(" #cond ") failed\n")) + +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ + } while (0) + +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t 
_verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ + } while (0) + +#define VERIFY0(RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ + } while (0) +#define CTASSERT_GLOBAL(x) CTASSERT(x) + +/* + * Debugging disabled (--disable-debug) + */ +#ifdef NDEBUG + +#define ASSERT(x) ((void)0) +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) + +/* + * Debugging enabled (--enable-debug) + */ +#else + +#define ASSERT3B VERIFY3B +#define ASSERT3S VERIFY3S +#define ASSERT3U VERIFY3U +#define ASSERT3P VERIFY3P +#define ASSERT0 VERIFY0 +#define ASSERT VERIFY +#define IMPLY(A, B) \ + ((void)(likely((!(A)) || (B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") implies (" #B ")"))) +#define EQUIV(A, B) \ + ((void)(likely(!!(A) == !!(B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") is equivalent to (" #B ")"))) +/* END CSTYLED */ + +#endif /* NDEBUG */ + 
+#endif /* _SPL_DEBUG_H */ diff --git a/include/os/freebsd/spl/sys/dirent.h b/include/os/freebsd/spl/sys/dirent.h new file mode 100644 index 0000000000..2403766a42 --- /dev/null +++ b/include/os/freebsd/spl/sys/dirent.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_DIRENT_H_ +#define _OPENSOLARIS_SYS_DIRENT_H_ + +#include + +#include_next + +typedef struct dirent dirent64_t; +typedef ino_t ino64_t; + +#define dirent64 dirent + +#define d_ino d_fileno + +#define DIRENT64_RECLEN(len) _GENERIC_DIRLEN(len) + +#endif /* !_OPENSOLARIS_SYS_DIRENT_H_ */ diff --git a/include/os/freebsd/spl/sys/disp.h b/include/os/freebsd/spl/sys/disp.h new file mode 100644 index 0000000000..2be1b76e43 --- /dev/null +++ b/include/os/freebsd/spl/sys/disp.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2013 Andriy Gapon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_DISP_H_ +#define _OPENSOLARIS_SYS_DISP_H_ + +#include <sys/proc.h> + +#define kpreempt(x) kern_yield(PRI_USER) /* argument x is unused */ + +#endif /* _OPENSOLARIS_SYS_DISP_H_ */ diff --git a/include/os/freebsd/spl/sys/dkio.h b/include/os/freebsd/spl/sys/dkio.h new file mode 100644 index 0000000000..aed54ba508 --- /dev/null +++ b/include/os/freebsd/spl/sys/dkio.h @@ -0,0 +1,494 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _OPENSOLARIS_SYS_DKIO_H_ +#define _OPENSOLARIS_SYS_DKIO_H_ + +#include /* Needed for NDKMAP define */ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_SUNOS_VTOC_16) +#define NDKMAP 16 /* # of logical partitions */ +#define DK_LABEL_LOC 1 /* location of disk label */ +#elif defined(_SUNOS_VTOC_8) +#define NDKMAP 8 /* # of logical partitions */ +#define DK_LABEL_LOC 0 /* location of disk label */ +#else +#error "No VTOC format defined." +#endif + +/* + * Structures and definitions for disk io control commands + */ + +/* + * Structures used as data by ioctl calls.
+ */ + +#define DK_DEVLEN 16 /* device name max length, including */ + /* unit # & NULL (ie - "xyc1") */ + +/* + * Used for controller info + */ +struct dk_cinfo { + char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */ + ushort_t dki_ctype; /* controller type */ + ushort_t dki_flags; /* flags */ + ushort_t dki_cnum; /* controller number */ + uint_t dki_addr; /* controller address */ + uint_t dki_space; /* controller bus type */ + uint_t dki_prio; /* interrupt priority */ + uint_t dki_vec; /* interrupt vector */ + char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */ + uint_t dki_unit; /* unit number */ + ushort_t dki_partition; /* partition number */ + ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */ +}; + +/* + * Controller types + */ +#define DKC_UNKNOWN 0 +#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */ +#define DKC_WDC2880 2 +#define DKC_XXX_0 3 /* unassigned */ +#define DKC_XXX_1 4 /* unassigned */ +#define DKC_DSD5215 5 +#define DKC_ACB4000 7 +#define DKC_MD21 8 +#define DKC_XXX_2 9 /* unassigned */ +#define DKC_NCRFLOPPY 10 +#define DKC_SMSFLOPPY 12 +#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */ +#define DKC_INTEL82072 14 /* native floppy chip */ +#define DKC_MD 16 /* meta-disk (virtual-disk) driver */ +#define DKC_INTEL82077 19 /* 82077 floppy disk controller */ +#define DKC_DIRECT 20 /* Intel direct attached device i.e. 
IDE */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ +#define DKC_VBD 23 /* virtual block device */ + +/* + * Sun reserves up through 1023 + */ + +#define DKC_CUSTOMER_BASE 1024 + +/* + * Flags + */ +#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */ +#define DKI_MAPTRK 0x02 /* controller does track mapping */ +#define DKI_FMTTRK 0x04 /* formats only full track at a time */ +#define DKI_FMTVOL 0x08 /* formats only full volume at a time */ +#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */ +#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */ +#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */ + +/* + * partition headers: section 1 + * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I)) + */ +struct dk_map { + uint64_t dkl_cylno; /* starting cylinder */ + uint64_t dkl_nblk; /* number of blocks; if == 0, */ + /* partition is undefined */ +}; + +/* + * Used for all partitions + */ +struct dk_allmap { + struct dk_map dka_map[NDKMAP]; +}; + +#if defined(_SYSCALL32) +struct dk_allmap32 { + struct dk_map32 dka_map[NDKMAP]; +}; +#endif /* _SYSCALL32 */ + +/* + * Definition of a disk's geometry + */ +struct dk_geom { + unsigned short dkg_ncyl; /* # of data cylinders */ + unsigned short dkg_acyl; /* # of alternate cylinders */ + unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */ + unsigned short dkg_nhead; /* # of heads */ + unsigned short dkg_obs1; /* obsolete */ + unsigned short dkg_nsect; /* # of data sectors per track */ + unsigned short dkg_intrlv; /* interleave factor */ + unsigned short dkg_obs2; /* obsolete */ + unsigned short dkg_obs3; /* obsolete */ + unsigned short dkg_apc; /* alternates per cyl (SCSI only) */ + unsigned short dkg_rpm; /* revolutions per minute */ + unsigned short dkg_pcyl; /* # of physical cylinders */ + unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */ + unsigned 
short dkg_read_reinstruct; /* # sectors to skip, reads */ + unsigned short dkg_extra[7]; /* for compatible expansion */ +}; + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkg_bhead dkg_obs1 /* used to be head offset */ +#define dkg_gap1 dkg_obs2 /* used to be gap1 */ +#define dkg_gap2 dkg_obs3 /* used to be gap2 */ + +/* + * Disk io control commands + * Warning: some other ioctls with the DIOC prefix exist elsewhere. + * The Generic DKIOC numbers are from 0 - 50. + * The Floppy Driver uses 51 - 100. + * The Hard Disk (except SCSI) 101 - 106. (these are obsolete) + * The CDROM Driver 151 - 200. + * The USCSI ioctl 201 - 250. + */ +#define DKIOC (0x04 << 8) + +/* + * The following ioctls are generic in nature and need to be + * supported as appropriate by all disk drivers + */ +#define DKIOCGGEOM (DKIOC|1) /* Get geometry */ +#define DKIOCINFO (DKIOC|3) /* Get info */ +#define DKIOCEJECT (DKIOC|6) /* Generic 'eject' */ +#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ +#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ + +/* + * Disk Cache Controls. These ioctls should be supported by + * all disk drivers. + * + * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl + * argument, but it should be passed as NULL to allow for future + * reinterpretation. From user-mode, this ioctl request is synchronous. + * + * When invoked from within the kernel, the arg can be NULL to indicate + * a synchronous request or can be the address of a struct dk_callback + * to request an asynchronous callback when the flush request is complete. + * In this case, the flag to the ioctl must include FKIOCTL and the + * dkc_callback field of the pointed to struct must be non-null or the + * request is made synchronously. + * + * In the callback case: if the ioctl returns 0, a callback WILL be performed. + * If the ioctl returns non-zero, a callback will NOT be performed. 
+ * NOTE: In some cases, the callback may be done BEFORE the ioctl call + * returns. The caller's locking strategy should be prepared for this case. + */ +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; + +/* bit flag definitions for dkc_flag */ +#define FLUSH_VOLATILE 0x1 /* Bit 0: if set, only flush */ + /* volatile cache; otherwise, flush */ + /* volatile and non-volatile cache */ + +#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */ + /* enablement status */ +#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */ + +/* + * The following ioctls are used by Sun drivers to communicate + * with their associated format routines. Support of these ioctls + * is not required of foreign drivers + */ +#define DKIOCSGEOM (DKIOC|2) /* Set geometry */ +#define DKIOCSAPART (DKIOC|4) /* Set all partitions */ +#define DKIOCGAPART (DKIOC|5) /* Get all partitions */ +#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */ +#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */ + +/* + * The following ioctl's are removable media support + */ +#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */ +#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */ +#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */ +#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */ + + +/* + * ioctl for hotpluggable devices + */ +#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ + +/* + * Ioctl to force driver to re-read the alternate partition and rebuild + * the internal defect map. + */ +#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */ +#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */ + +/* + * Used by applications to get disk defect information from IDE + * drives. 
+ */ +#ifdef _SYSCALL32 +struct defect_header32 { + int head; + caddr32_t buffer; +}; +#endif /* _SYSCALL32 */ + +struct defect_header { + int head; + caddr_t buffer; +}; + +#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ + +/* + * Used by applications to get partition or slice information + */ +#ifdef _SYSCALL32 +struct part_info32 { + uint32_t p_start; + int p_length; +}; +#endif /* _SYSCALL32 */ + +struct part_info { + uint64_t p_start; + int p_length; +}; + +/* The following ioctls are for Optical Memory Device */ +#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ +#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ + +/* + * This state enum is the argument passed to the DKIOCSTATE ioctl. + */ +enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; + +#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */ + +/* + * ioctls to read/write mboot info. + */ +#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */ +#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */ + +/* + * ioctl to get the device temperature. + */ +#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ + +/* + * Used for providing the temperature. + */ + +struct dk_temperature { + uint_t dkt_flags; /* Flags */ + short dkt_cur_temp; /* Current disk temperature */ + short dkt_ref_temp; /* reference disk temperature */ +}; + +#define DKT_BYPASS_PM 0x1 +#define DKT_INVALID_TEMP 0xFFFF + + +/* + * Media types or profiles known + */ +#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */ + + +/* + * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to + * maintain compatibility with SFF8090. The following define the + * optical media type. 
+ */ +#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */ +#define DK_MO_ERASABLE 0x03 /* MO Erasable */ +#define DK_MO_WRITEONCE 0x04 /* MO Write once */ +#define DK_AS_MO 0x05 /* AS MO */ +#define DK_CDROM 0x08 /* CDROM */ +#define DK_CDR 0x09 /* CD-R */ +#define DK_CDRW 0x0A /* CD-RW */ +#define DK_DVDROM 0x10 /* DVD-ROM */ +#define DK_DVDR 0x11 /* DVD-R */ +#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */ + +/* + * Media types for other rewritable magnetic media + */ +#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */ +#define DK_FLOPPY 0x10002 /* Floppy media */ +#define DK_ZIP 0x10003 /* IOMEGA ZIP media */ +#define DK_JAZ 0x10004 /* IOMEGA JAZ media */ + +#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */ +#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */ + +#define DKIOCPARTITION (DKIOC|9) /* Get partition info */ + +/* + * Ioctls to get/set volume capabilities related to Logical Volume Managers. + * They include the ability to get/set capabilities and to issue a read to a + * specific underlying device of a replicated device. 
+ */ + +#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */ +#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ +#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ + +typedef uint_t volcapinfo_t; + +typedef uint_t volcapset_t; + +#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */ +#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */ + +typedef struct volcap { + volcapinfo_t vc_info; /* Capabilities available */ + volcapset_t vc_set; /* Capabilities set */ +} volcap_t; + +#define VOL_SIDENAME 256 + +typedef struct vol_directed_rd { + int vdr_flags; + offset_t vdr_offset; + size_t vdr_nbytes; + size_t vdr_bytesread; + void *vdr_data; + int vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd_t; + +#define DKV_SIDE_INIT (-1) +#define DKV_DMR_NEXT_SIDE 0x00000001 +#define DKV_DMR_DONE 0x00000002 +#define DKV_DMR_ERROR 0x00000004 +#define DKV_DMR_SUCCESS 0x00000008 +#define DKV_DMR_SHORT 0x00000010 + +#ifdef _MULTI_DATAMODEL +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct vol_directed_rd32 { + int32_t vdr_flags; + offset_t vdr_offset; /* 64-bit element on 32-bit alignment */ + size32_t vdr_nbytes; + size32_t vdr_bytesread; + caddr32_t vdr_data; + int32_t vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd32_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif +#endif /* _MULTI_DATAMODEL */ + +/* + * The ioctl is used to fetch disk's device type, vendor ID, + * model number/product ID, firmware revision and serial number together. + * + * Currently there are two device types - DKD_ATA_TYPE which means the + * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE + * which means the disk is driven by sd/scsi hba driver. 
+ */ +#define DKIOC_GETDISKID (DKIOC|46) + +/* These two labels are for dkd_dtype of dk_disk_id_t */ +#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */ +#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */ + +#define DKD_ATA_MODEL 40 /* model number length */ +#define DKD_ATA_FWVER 8 /* firmware revision length */ +#define DKD_ATA_SERIAL 20 /* serial number length */ + +#define DKD_SCSI_VENDOR 8 /* vendor ID length */ +#define DKD_SCSI_PRODUCT 16 /* product ID length */ +#define DKD_SCSI_REVLEVEL 4 /* revision level length */ +#define DKD_SCSI_SERIAL 12 /* serial number length */ + +/* + * The argument type for DKIOC_GETDISKID ioctl. + */ +typedef struct dk_disk_id { + uint_t dkd_dtype; + union { + struct { + char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */ + char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */ + char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */ + } ata_disk_id; + struct { + char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */ + char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */ + char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */ + char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */ + } scsi_disk_id; + } disk_id; +} dk_disk_id_t; + +/* + * The ioctl is used to update the firmware of device. 
+ */ +#define DKIOC_UPDATEFW (DKIOC|47) + +/* The argument type for DKIOC_UPDATEFW ioctl */ +typedef struct dk_updatefw { + caddr_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_t; + +#ifdef _SYSCALL32 +typedef struct dk_updatefw_32 { + caddr32_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_32_t; +#endif /* _SYSCALL32 */ + +/* + * firmware update type - temporary or permanent use + */ +#define FW_TYPE_TEMP 0x0 /* temporary use */ +#define FW_TYPE_PERM 0x1 /* permanent use */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _OPENSOLARIS_SYS_DKIO_H_ */ diff --git a/include/os/freebsd/spl/sys/extdirent.h b/include/os/freebsd/spl/sys/extdirent.h new file mode 100644 index 0000000000..b22e8e8563 --- /dev/null +++ b/include/os/freebsd/spl/sys/extdirent.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_EXTDIRENT_H +#define _SYS_EXTDIRENT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/dirent.h> + +/* + * Extended file-system independent directory entry. This style of + * dirent provides additional informational flag bits for each + * directory entry. This dirent will be returned instead of the + * standard dirent if a VOP_READDIR() requests dirent flags via + * V_RDDIR_ENTFLAGS, and if the file system supports the flags. + */ +typedef struct edirent { + ino64_t ed_ino; /* "inode number" of entry */ + off64_t ed_off; /* offset of disk directory entry */ + uint32_t ed_eflags; /* per-entry flags */ + unsigned short ed_reclen; /* length of this record */ + char ed_name[1]; /* name of file */ +} edirent_t; + +#define EDIRENT_RECLEN(namelen) \ + ((offsetof(edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~ 7) +#define EDIRENT_NAMELEN(reclen) \ + ((reclen) - (offsetof(edirent_t, ed_name[0]))) + +/* + * Extended entry flags + * Extended entries include a bitfield of extra information + * regarding that entry. + */ +#define ED_CASE_CONFLICT 0x10 /* Ignoring case, entry is not unique */ + +/* + * Extended flags accessor function + */ +#define ED_CASE_CONFLICTS(x) ((x)->ed_eflags & ED_CASE_CONFLICT) +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EXTDIRENT_H */ diff --git a/include/os/freebsd/spl/sys/fcntl.h b/include/os/freebsd/spl/sys/fcntl.h new file mode 100644 index 0000000000..4301d6e151 --- /dev/null +++ b/include/os/freebsd/spl/sys/fcntl.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021 iXsystems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SPL_SYS_FCNTL_H_ +#define _SPL_SYS_FCNTL_H_ + +#include_next <sys/fcntl.h> +/* Defined as 0, so setting these flags has no effect. */ +#define O_LARGEFILE 0 +#define O_RSYNC 0 + +#ifndef O_DSYNC +#define O_DSYNC 0 /* fallback when the system header lacks it */ +#endif + +#endif /* _SPL_SYS_FCNTL_H_ */ diff --git a/include/os/freebsd/spl/sys/file.h b/include/os/freebsd/spl/sys/file.h new file mode 100644 index 0000000000..51e59b1133 --- /dev/null +++ b/include/os/freebsd/spl/sys/file.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_FILE_H_ +#define _OPENSOLARIS_SYS_FILE_H_ + +#include <sys/refcount.h> +#include_next <sys/file.h> + +#define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ + +typedef struct file file_t; + +#include <sys/capsicum.h> +/* Return the file for fd if fget() succeeds (returns 0), else NULL. */ +static __inline file_t * +getf_caps(int fd, cap_rights_t *rightsp) +{ + struct file *fp; + + if (fget(curthread, fd, rightsp, &fp) == 0) + return (fp); + return (NULL); +} + +#endif /* !_OPENSOLARIS_SYS_FILE_H_ */ diff --git a/include/os/freebsd/spl/sys/freebsd_rwlock.h b/include/os/freebsd/spl/sys/freebsd_rwlock.h new file mode 100644 index 0000000000..b760f8cf23 --- /dev/null +++ b/include/os/freebsd/spl/sys/freebsd_rwlock.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_FREEBSD_RWLOCK_H_ +#define _OPENSOLARIS_SYS_FREEBSD_RWLOCK_H_ + +#include_next <sys/rwlock.h> + +#endif /* !_OPENSOLARIS_SYS_FREEBSD_RWLOCK_H_ */ diff --git a/include/os/freebsd/spl/sys/idmap.h b/include/os/freebsd/spl/sys/idmap.h new file mode 100644 index 0000000000..39eeb905c7 --- /dev/null +++ b/include/os/freebsd/spl/sys/idmap.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_IDMAP_H +#define _SYS_IDMAP_H + + +/* Idmap status codes */ +#define IDMAP_SUCCESS 0 +#define IDMAP_NEXT 1 +#define IDMAP_ERR_OTHER -10000 +#define IDMAP_ERR_INTERNAL -9999 +#define IDMAP_ERR_MEMORY -9998 +#define IDMAP_ERR_NORESULT -9997 +#define IDMAP_ERR_NOTUSER -9996 +#define IDMAP_ERR_NOTGROUP -9995 +#define IDMAP_ERR_NOTSUPPORTED -9994 +#define IDMAP_ERR_W2U_NAMERULE -9993 +#define IDMAP_ERR_U2W_NAMERULE -9992 +#define IDMAP_ERR_CACHE -9991 +#define IDMAP_ERR_DB -9990 +#define IDMAP_ERR_ARG -9989 +#define IDMAP_ERR_SID -9988 +#define IDMAP_ERR_IDTYPE -9987 +#define IDMAP_ERR_RPC_HANDLE -9986 +#define IDMAP_ERR_RPC -9985 +#define IDMAP_ERR_CLIENT_HANDLE -9984 +#define IDMAP_ERR_BUSY -9983 +#define IDMAP_ERR_PERMISSION_DENIED -9982 +#define IDMAP_ERR_NOMAPPING -9981 +#define IDMAP_ERR_NEW_ID_ALLOC_REQD -9980 +#define IDMAP_ERR_DOMAIN -9979 +#define IDMAP_ERR_SECURITY -9978 +#define IDMAP_ERR_NOTFOUND -9977 +#define IDMAP_ERR_DOMAIN_NOTFOUND -9976 +#define IDMAP_ERR_UPDATE_NOTALLOWED -9975 +#define IDMAP_ERR_CFG -9974 +#define IDMAP_ERR_CFG_CHANGE -9973 +#define IDMAP_ERR_NOTMAPPED_WELLKNOWN -9972 +#define IDMAP_ERR_RETRIABLE_NET_ERR -9971 +#define IDMAP_ERR_W2U_NAMERULE_CONFLICT -9970 +#define IDMAP_ERR_U2W_NAMERULE_CONFLICT -9969 +#define IDMAP_ERR_BAD_UTF8 -9968 +#define IDMAP_ERR_NONE_GENERATED -9967 +#define IDMAP_ERR_PROP_UNKNOWN -9966 +#define IDMAP_ERR_NS_LDAP_OP_FAILED -9965 +#define IDMAP_ERR_NS_LDAP_PARTIAL -9964 +#define IDMAP_ERR_NS_LDAP_CFG -9963 +#define IDMAP_ERR_NS_LDAP_BAD_WINNAME -9962 +#define IDMAP_ERR_NO_ACTIVEDIRECTORY -9961 + +/* Reserved GIDs for some well-known SIDs */ +#define 
IDMAP_WK_LOCAL_SYSTEM_GID 2147483648U /* 0x80000000 */ +#define IDMAP_WK_CREATOR_GROUP_GID 2147483649U +#define IDMAP_WK__MAX_GID 2147483649U + +/* Reserved UIDs for some well-known SIDs */ +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U +#define IDMAP_WK__MAX_UID 2147483648U + +/* Reserved SIDs */ +#define IDMAP_WK_CREATOR_SID_AUTHORITY "S-1-3" + +/* + * Max door RPC size for ID mapping (can't be too large relative to the + * default user-land thread stack size, since clnt_door_call() + * alloca()s). See libidmap:idmap_init(). + */ +#define IDMAP_MAX_DOOR_RPC (256 * 1024) + +#define IDMAP_SENTINEL_PID UINT32_MAX +#define IDMAP_ID_IS_EPHEMERAL(pid) \ + (((pid) > INT32_MAX) && ((pid) != IDMAP_SENTINEL_PID)) + +#endif /* _SYS_IDMAP_H */ diff --git a/include/os/freebsd/spl/sys/inttypes.h b/include/os/freebsd/spl/sys/inttypes.h new file mode 100644 index 0000000000..651685d304 --- /dev/null +++ b/include/os/freebsd/spl/sys/inttypes.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/isa_defs.h b/include/os/freebsd/spl/sys/isa_defs.h new file mode 100644 index 0000000000..399d510b5f --- /dev/null +++ b/include/os/freebsd/spl/sys/isa_defs.h @@ -0,0 +1,712 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H +#include + +/* + * This header file serves to group a set of well known defines and to + * set these for each instruction set architecture. These defines may + * be divided into two groups; characteristics of the processor and + * implementation choices for Solaris on a processor. + * + * Processor Characteristics: + * + * _LITTLE_ENDIAN / _BIG_ENDIAN: + * The natural byte order of the processor. A pointer to an int points + * to the least/most significant byte of that int. + * + * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD: + * The processor specific direction of stack growth. A push onto the + * stack increases/decreases the stack pointer, so it stores data at + * successively higher/lower addresses. (Stackless machines ignored + * without regrets). + * + * _LONG_LONG_HTOL / _LONG_LONG_LTOH: + * A pointer to a long long points to the most/least significant long + * within that long long. + * + * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH: + * The C compiler assigns bit fields from the high/low to the low/high end + * of an int (most to least significant vs. least to most significant). + * + * _IEEE_754: + * The processor (or supported implementations of the processor) + * supports the ieee-754 floating point standard. No other floating + * point standards are supported (or significant). Any other supported + * floating point formats are expected to be cased on the ISA processor + * symbol. + * + * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED: + * The C Compiler implements objects of type `char' as `unsigned' or + * `signed' respectively. 
This is really an implementation choice of + * the compiler writer, but it is specified in the ABI and tends to + * be uniform across compilers for an instruction set architecture. + * Hence, it has the properties of a processor characteristic. + * + * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT / + * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT / + * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT: + * The ABI defines alignment requirements of each of the primitive + * object types. Some, if not all, may be hardware requirements as + * well. The values are expressed in "byte-alignment" units. + * + * _MAX_ALIGNMENT: + * The most stringent alignment requirement as specified by the ABI. + * Equal to the maximum of all the above _XXX_ALIGNMENT values. + * + * _ALIGNMENT_REQUIRED: + * True or false (1 or 0) whether or not the hardware requires the ABI + * alignment. + * + * _LONG_LONG_ALIGNMENT_32 + * The 32-bit ABI supported by a 64-bit kernel may have different + * alignment requirements for primitive object types. The value of this + * identifier is expressed in "byte-alignment" units. + * + * _HAVE_CPUID_INSN + * This indicates that the architecture supports the 'cpuid' + * instruction as defined by Intel. (Intel allows other vendors + * to extend the instruction for their own purposes.) + * + * + * Implementation Choices: + * + * _ILP32 / _LP64: + * This specifies the compiler data type implementation as specified in + * the relevant ABI. The choice between these is strongly influenced + * by the underlying hardware, but is not absolutely tied to it. + * Currently only two data type models are supported: + * + * _ILP32: + * Int/Long/Pointer are 32 bits. This is the historical UNIX + * and Solaris implementation. Due to its historical standing, + * this is the default case. + * + * _LP64: + * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen + * implementation for 64-bit ABIs such as SPARC V9. 
+ * + * _I32LPx: + * A compilation environment where 'int' is 32-bit, and + * longs and pointers are simply the same size. + * + * In all cases, Char is 8 bits and Short is 16 bits. + * + * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16: + * This specifies the form of the disk VTOC (or label): + * + * _SUNOS_VTOC_8: + * This is a VTOC form which is upwardly compatible with the + * SunOS 4.x disk label and allows 8 partitions per disk. + * + * _SUNOS_VTOC_16: + * In this format the incore vtoc image matches the ondisk + * version. It allows 16 slices per disk, and is not + * compatible with the SunOS 4.x disk label. + * + * Note that these are not the only two VTOC forms possible and + * additional forms may be added. One possible form would be the + * SVr4 VTOC form. The symbol for that is reserved now, although + * it is not implemented. + * + * _SVR4_VTOC_16: + * This VTOC form is compatible with the System V Release 4 + * VTOC (as implemented on the SVr4 Intel and 3b ports) with + * 16 partitions per disk. + * + * + * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR + * This describes the type of addresses used by system DMA: + * + * _DMA_USES_PHYSADDR: + * This type of DMA, used in the x86 implementation, + * requires physical addresses for DMA buffers. The 24-bit + * addresses used by some legacy boards are the source of the + * "low-memory" (<16MB) requirement for some devices using DMA. + * + * _DMA_USES_VIRTADDR: + * This method of DMA allows the use of virtual addresses for + * DMA transfers. + * + * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT + * This indicates the presence/absence of an fdisk table. + * + * _FIRMWARE_NEEDS_FDISK + * The fdisk table is required by system firmware. If present, + * it allows a disk to be subdivided into multiple fdisk + * partitions, each of which is equivalent to a separate, + * virtual disk. This enables the co-existence of multiple + * operating systems on a shared hard disk. 
+ * + * _NO_FDISK_PRESENT + * If the fdisk table is absent, it is assumed that the entire + * media is allocated for a single operating system. + * + * _HAVE_TEM_FIRMWARE + * Defined if this architecture has the (fallback) option of + * using prom_* calls for doing I/O if a suitable kernel driver + * is not available to do it. + * + * _DONT_USE_1275_GENERIC_NAMES + * Controls whether or not device tree node names should + * comply with the IEEE 1275 "Generic Names" Recommended + * Practice. With _DONT_USE_1275_GENERIC_NAMES, device-specific + * names identifying the particular device will be used. + * + * __i386_COMPAT + * This indicates whether the i386 ABI is supported as a *non-native* + * mode for the platform. When this symbol is defined: + * - 32-bit xstat-style system calls are enabled + * - 32-bit xmknod-style system calls are enabled + * - 32-bit system calls use i386 sizes -and- alignments + * + * Note that this is NOT defined for the i386 native environment! + * + * __x86 + * This is ONLY a synonym for defined(__i386) || defined(__amd64) + * which is useful only insofar as these two architectures share + * common attributes. Analogous to __sparc. + * + * _PSM_MODULES + * This indicates whether or not the implementation uses PSM + * modules for processor support, reading /etc/mach from inside + * the kernel to extract a list. + * + * _RTC_CONFIG + * This indicates whether or not the implementation uses /etc/rtc_config + * to configure the real-time clock in the kernel. + * + * _UNIX_KRTLD + * This indicates that the implementation uses a dynamically + * linked unix + krtld to form the core kernel image at boot + * time, or (in the absence of this symbol) a prelinked kernel image. + * + * _OBP + * This indicates the firmware interface is OBP. + * + * _SOFT_HOSTID + * This indicates that the implementation obtains the hostid + * from the file /etc/hostid, rather than from hardware. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The following set of definitions characterize Solaris on AMD's + * 64-bit systems. + */ +#if defined(__x86_64) || defined(__amd64) + +#if !defined(__amd64) +#define __amd64 /* preferred guard */ +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Different alignment constraints for the i386 ABI in compatibility mode + */ +#define _LONG_LONG_ALIGNMENT_32 4 + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define __i386_COMPAT +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The feature test macro __i386 is generic for all processors implementing + * the Intel 386 instruction set or a superset of it. Specifically, this + * includes all members of the 386, 486, and Pentium family of processors. 
+ */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__aarch64__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate 
"implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__riscv) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__arm__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + 
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__mips__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#if defined(__mips_n64) +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _INT_ALIGNMENT +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#else +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__powerpc__) + +#if defined(__BIG_ENDIAN__) +#define _BIT_FIELDS_HTOL +#else +#define _BIT_FIELDS_LTOH +#endif + +#if !defined(__powerpc) +#define __powerpc +#endif + +#if defined(__powerpc64__) +#define _LONG_LONG_ALIGNMENT 8 +#define _MULTI_DATAMODEL +#else +#define _LONG_LONG_ALIGNMENT 4 +#endif +#define _LONG_LONG_ALIGNMENT_32 4 +#define _ALIGNMENT_REQUIRED 1 + +#define _SUNOS_VTOC_16 1 + +/* + * The following set of definitions characterize the Solaris on SPARC systems. + * + * The symbol __sparc indicates any of the SPARC family of processor + * architectures. This includes SPARC V7, SPARC V8 and SPARC V9. + * + * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined + * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough + * to SPARC V8 for the former to be subsumed into the latter definition.) + * + * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined + * by Version 9 of the SPARC Architecture Manual. + * + * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only + * relevant when the symbol __sparc is defined. + */ +/* + * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added + * to support backwards builds. This workaround should be removed in s10_71. + */ +#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__) +#if !defined(__sparc) +#define __sparc +#endif + +/* + * You can be 32-bit or 64-bit, but not both at the same time. + */ +#if defined(__sparcv8) && defined(__sparcv9) +#error "SPARC Versions 8 and 9 are mutually exclusive choices" +#endif + +/* + * Existing compilers do not set __sparcv8. 
Years will transpire before + * the compilers can be depended on to set the feature test macro. In + * the interim, we'll set it here on the basis of historical behaviour; + * if you haven't asked for SPARC V9, then you must've meant SPARC V8. + */ +#if !defined(__sparcv9) && !defined(__sparcv8) +#define __sparcv8 +#endif + +/* + * Define the appropriate "processor characteristics" shared between + * all Solaris on SPARC systems. + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_HTOL +#define _BIT_FIELDS_HTOL +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Define the appropriate "implementation choices" shared between versions. + */ +#define _SUNOS_VTOC_8 +#define _DMA_USES_VIRTADDR +#define _NO_FDISK_PRESENT +#define _HAVE_TEM_FIRMWARE +#define _OBP + +/* + * The following set of definitions characterize the implementation of + * 32-bit Solaris on SPARC V8 systems. + */ +#if defined(__sparcv8) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 8 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#define _ILP32 +#if !defined(_I32LPx) +#define _I32LPx +#endif + +/* + * The following set of definitions characterize the implementation of + * 64-bit Solaris on SPARC V9 systems. 
+ */ +#elif defined(__sparcv9) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL + +#else +#error "unknown SPARC version" +#endif + +/* + * #error is strictly ansi-C, but works as well as anything for K&R systems. + */ +#else +#error "ISA not supported" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#if BYTE_ORDER == _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN +#elif BYTE_ORDER == _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN +#else +#error "unknown byte order" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/include/os/freebsd/spl/sys/kidmap.h b/include/os/freebsd/spl/sys/kidmap.h new file mode 100644 index 0000000000..dc0cf5988a --- /dev/null +++ b/include/os/freebsd/spl/sys/kidmap.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_KIDMAP_H_ +#define _OPENSOLARIS_SYS_KIDMAP_H_ + +#include + +typedef int32_t idmap_stat; +typedef void idmap_get_handle_t; + +#define kidmap_get_create() (NULL) +#define kidmap_get_destroy(hdl) do { } while (0) +#define kidmap_get_mappings(hdl) (NULL) + +#endif /* _OPENSOLARIS_SYS_KIDMAP_H_ */ diff --git a/include/os/freebsd/spl/sys/kmem.h b/include/os/freebsd/spl/sys/kmem.h new file mode 100644 index 0000000000..dc3b4f5d78 --- /dev/null +++ b/include/os/freebsd/spl/sys/kmem.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_KMEM_H_ +#define _OPENSOLARIS_SYS_KMEM_H_ + +#ifdef _KERNEL +#include +#include +#include +#include + +#include +#include +#include + +MALLOC_DECLARE(M_SOLARIS); + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +#define KM_SLEEP M_WAITOK +#define KM_PUSHPAGE M_WAITOK +#define KM_NOSLEEP M_NOWAIT +#define KM_NORMALPRI 0 +#define KMC_NODEBUG UMA_ZONE_NODUMP + +typedef struct vmem vmem_t; + +extern char *kmem_asprintf(const char *, ...); +extern char *kmem_vasprintf(const char *fmt, va_list ap); + +typedef struct kmem_cache { + char kc_name[32]; +#if !defined(KMEM_DEBUG) + uma_zone_t kc_zone; +#else + size_t kc_size; +#endif + int (*kc_constructor)(void *, void *, int); + void (*kc_destructor)(void *, void *); + void *kc_private; +} kmem_cache_t; + +extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); +extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); + +void *zfs_kmem_alloc(size_t size, int kmflags); +void zfs_kmem_free(void *buf, size_t size); +uint64_t kmem_size(void); +kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t 
align, + int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), + void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); +void kmem_cache_destroy(kmem_cache_t *cache); +void *kmem_cache_alloc(kmem_cache_t *cache, int flags); +void kmem_cache_free(kmem_cache_t *cache, void *buf); +boolean_t kmem_cache_reap_active(void); +void kmem_cache_reap_soon(kmem_cache_t *); +void kmem_reap(void); +int kmem_debugging(void); +void *calloc(size_t n, size_t s); + + +#define kmem_cache_reap_now kmem_cache_reap_soon +#define freemem vm_free_count() +#define minfree vm_cnt.v_free_min +#define kmem_alloc(size, kmflags) zfs_kmem_alloc((size), (kmflags)) +#define kmem_zalloc(size, kmflags) \ + zfs_kmem_alloc((size), (kmflags) | M_ZERO) +#define kmem_free(buf, size) zfs_kmem_free((buf), (size)) + +#endif /* _KERNEL */ + +#ifdef _STANDALONE +/* + * At the moment, we just need it for the type. We redirect the alloc/free + * routines to the usual Free and Malloc in that environment. + */ +typedef int kmem_cache_t; +#endif /* _STANDALONE */ + +#endif /* _OPENSOLARIS_SYS_KMEM_H_ */ diff --git a/include/os/freebsd/spl/sys/kmem_cache.h b/include/os/freebsd/spl/sys/kmem_cache.h new file mode 100644 index 0000000000..9eec3b4585 --- /dev/null +++ b/include/os/freebsd/spl/sys/kmem_cache.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _SPL_KMEM_CACHE_H +#define _SPL_KMEM_CACHE_H + +#ifdef _KERNEL +#include + +/* kmem move callback return values */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES = 0, /* Object moved */ + KMEM_CBRC_NO = 1, /* Object not moved */ + KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ + KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ + KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ +} kmem_cbrc_t; + +extern void spl_kmem_cache_set_move(kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); + +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) + +#endif /* _KERNEL */ + +#endif diff --git a/include/os/freebsd/spl/sys/kstat.h b/include/os/freebsd/spl/sys/kstat.h new file mode 100644 index 0000000000..947dfee623 --- /dev/null +++ b/include/os/freebsd/spl/sys/kstat.h @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H + +#include +#ifndef _STANDALONE +#include +#endif +struct list_head {}; +#include +#include + +#define KSTAT_STRLEN 255 +#define KSTAT_RAW_MAX (128*1024) + +/* + * For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 5 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_INVALID 0x20 +#define KSTAT_FLAG_LONGSTRINGS 0x40 +#define KSTAT_FLAG_NO_HEADERS 0x80 + +#define 
KS_MAGIC 0x9d9d9d9d + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat_s; +typedef struct kstat_s kstat_t; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ + +struct seq_file { + char *sf_buf; + size_t sf_size; +}; + +void seq_printf(struct seq_file *m, const char *fmt, ...); + + +typedef struct kstat_module { + char ksm_name[KSTAT_STRLEN+1]; /* module name */ + struct list_head ksm_module_list; /* module linkage */ + struct list_head ksm_kstat_list; /* list of kstat entries */ + struct proc_dir_entry *ksm_proc; /* proc entry */ +} kstat_module_t; + +typedef struct kstat_raw_ops { + int (*headers)(char *buf, size_t size); + int (*seq_headers)(struct seq_file *); + int (*data)(char *buf, size_t size, void *data); + void *(*addr)(kstat_t *ksp, loff_t index); +} kstat_raw_ops_t; + +struct kstat_s { + int ks_magic; /* magic value */ + kid_t ks_kid; /* unique kstat ID */ + hrtime_t ks_crtime; /* creation time */ + hrtime_t ks_snaptime; /* last access time */ + char ks_module[KSTAT_STRLEN+1]; /* provider module name */ + int ks_instance; /* provider module instance */ + char ks_name[KSTAT_STRLEN+1]; /* kstat name */ + char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + uchar_t ks_type; /* kstat data type */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of data records */ + size_t ks_data_size; /* size of kstat data section */ + kstat_update_t *ks_update; /* dynamic updates */ + void *ks_private; /* private data */ + void *ks_private1; /* private data */ + kmutex_t ks_private_lock; /* kstat private data lock */ + kmutex_t *ks_lock; /* kstat data lock */ + struct list_head ks_list; /* kstat linkage */ + kstat_module_t *ks_owner; /* kstat module linkage */ + kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ + char *ks_raw_buf; /* buf used for raw ops */ + size_t ks_raw_bufsize; /* size of raw ops 
buffer */ +#ifndef _STANDALONE + struct sysctl_ctx_list ks_sysctl_ctx; + struct sysctl_oid *ks_sysctl_root; +#endif /* _STANDALONE */ +}; + +typedef struct kstat_named_s { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* 128-bit int */ + int32_t i32; /* 32-bit signed int */ + uint32_t ui32; /* 32-bit unsigned int */ + int64_t i64; /* 64-bit signed int */ + uint64_t ui64; /* 64-bit unsigned int */ + long l; /* native signed long */ + ulong_t ul; /* native unsigned long */ + struct { + union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; + } value; +} kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait len*time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + 
hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +int spl_kstat_init(void); +void spl_kstat_fini(void); + +extern void __kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern void __kstat_set_seq_raw_ops(kstat_t *ksp, + int (*headers)(struct seq_file *), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + + +extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, + const char *ks_name, const char *ks_class, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags); + +extern void __kstat_install(kstat_t *ksp); +extern void __kstat_delete(kstat_t *ksp); + +#define kstat_set_seq_raw_ops(k, h, d, a) \ + __kstat_set_seq_raw_ops(k, h, d, a) +#define kstat_set_raw_ops(k, h, d, a) \ + __kstat_set_raw_ops(k, h, d, a) +#ifndef _STANDALONE +#define kstat_create(m, i, n, c, t, s, f) \ + __kstat_create(m, i, n, c, t, s, f) + +#define kstat_install(k) __kstat_install(k) +#define kstat_delete(k) __kstat_delete(k) +#else +#define kstat_create(m, i, n, c, t, s, f) ((kstat_t *)0) +#define kstat_install(k) +#define kstat_delete(k) +#endif + +#endif /* _SPL_KSTAT_H */ diff --git a/include/os/freebsd/spl/sys/list.h b/include/os/freebsd/spl/sys/list.h new file mode 100644 index 0000000000..6db92ed429 --- /dev/null +++ b/include/os/freebsd/spl/sys/list.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_H +#define _SYS_LIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct list_node list_node_t; +typedef struct list list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void *list_remove_head(list_t *); +void *list_remove_tail(list_t *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); +int list_is_empty(list_t *); + +void list_link_init(list_node_t *); +void list_link_replace(list_node_t *, list_node_t *); + +int list_link_active(list_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_H */ diff --git a/lib/libspl/include/sys/signal.h b/include/os/freebsd/spl/sys/list_impl.h similarity index 68% rename from lib/libspl/include/sys/signal.h rename to include/os/freebsd/spl/sys/list_impl.h index df9221a694..a6614f9a38 100644 --- a/lib/libspl/include/sys/signal.h +++ b/include/os/freebsd/spl/sys/list_impl.h @@ -20,16 +20,32 @@ * CDDL HEADER END */ /* - * Copyright 2017 Zettabyte Software, LLC. All rights reserved. + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -/* - * Compiling against musl correctly points out that including sys/signal.h is - * disallowed by the Single UNIX Specification when building in userspace, so - * we implement a dummy header to redirect the include to the proper header. - */ -#ifndef _LIBSPL_SYS_SIGNAL_H -#define _LIBSPL_SYS_SIGNAL_H -#include -#endif /* _LIBSPL_SYS_SIGNAL_H */ +#ifndef _SYS_LIST_IMPL_H +#define _SYS_LIST_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/* Link element: a next and a prev pointer to other list_node structs. */ +struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +}; +/* List header: two size_t fields and an embedded list_node, list_head. */ +struct list { + size_t list_size; + size_t list_offset; + struct list_node list_head; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/lock.h b/include/os/freebsd/spl/sys/lock.h new file mode 100644 index 0000000000..7d5dc26abc --- /dev/null +++ b/include/os/freebsd/spl/sys/lock.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_LOCK_H_ +#define _OPENSOLARIS_SYS_LOCK_H_ + +#include_next +/* Mask / expected value that rw_init() in rwlock.h KASSERTs lo_flags against. */ +#define LO_ALLMASK (LO_INITIALIZED | LO_WITNESS | LO_QUIET | \ + LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE | \ + LO_DUPOK | LO_CLASSMASK | LO_NOPROFILE) +#define LO_EXPECTED (LO_INITIALIZED | LO_WITNESS | LO_RECURSABLE | \ + LO_SLEEPABLE | LO_UPGRADABLE | LO_DUPOK | (2 << LO_CLASSSHIFT)) + +#endif /* _OPENSOLARIS_SYS_LOCK_H_ */ diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h new file mode 100644 index 0000000000..3481507d2c --- /dev/null +++ b/include/os/freebsd/spl/sys/misc.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_MISC_H_ +#define _OPENSOLARIS_SYS_MISC_H_ + +#include +#include + +#define MAXUID UID_MAX + +#define _ACL_ACLENT_ENABLED 0x1 +#define _ACL_ACE_ENABLED 0x2 +/* The three _FIO* codes are consecutive values starting at INT_MIN. */ +#define _FIOFFS (INT_MIN) +#define _FIOGDIO (INT_MIN+1) +#define _FIOSDIO (INT_MIN+2) +/* F_SEEK_DATA / F_SEEK_HOLE are aliases for FIOSEEKDATA / FIOSEEKHOLE. */ +#define F_SEEK_DATA FIOSEEKDATA +#define F_SEEK_HOLE FIOSEEKHOLE +/* version is a 32-byte inline array; the other members are char pointers. */ +struct opensolaris_utsname { + char *sysname; + char *nodename; + char *release; + char version[32]; + char *machine; +}; + +extern char hw_serial[11]; +/* Both task_io_account_*() macros expand to nothing. */ +#define task_io_account_read(n) +#define task_io_account_write(n) + +#endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h new file mode 100644 index 0000000000..5695abee7b --- /dev/null +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_MOD_H +#define _SPL_MOD_H + +#include +/* The module metadata, export and module_param macros all expand to nothing. */ +#define ZFS_MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) + +#define EXPORT_SYMBOL(x) +#define module_param(a, b, c) +#define MODULE_PARM_DESC(a, b) +/* ZMOD_* permissions are aliases for CTLFLAG_RWTUN / CTLFLAG_RDTUN. */ +#define ZMOD_RW CTLFLAG_RWTUN +#define ZMOD_RD CTLFLAG_RDTUN +/* ZFS_MODULE_PARAM: declares _vfs_<scope_prefix>, then SYSCTL_<type>() on &<name_prefix><name>. */ +/* BEGIN CSTYLED */ +#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) \ + SYSCTL_DECL(_vfs_ ## scope_prefix); \ + SYSCTL_##type(_vfs_ ## scope_prefix, OID_AUTO, name, perm, &name_prefix ## name, 0, desc) + +#define ZFS_MODULE_PARAM_ARGS SYSCTL_HANDLER_ARGS +/* args is a comma list: its first item is OR'ed into perm, the rest become further SYSCTL_PROC() arguments. */ +#define ZFS_MODULE_PARAM_CALL_IMPL(parent, name, perm, args, desc) \ + SYSCTL_DECL(parent); \ + SYSCTL_PROC(parent, OID_AUTO, name, perm | args, desc) +/* Builds args by expanding <func>_args(<name_prefix><name>); the fifth parameter (_) is unused. */ +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, func, _, perm, desc) \ + ZFS_MODULE_PARAM_CALL_IMPL(_vfs_ ## scope_prefix, name, perm, func ## _args(name_prefix ## name), desc) +
+#define ZFS_MODULE_VIRTUAL_PARAM_CALL ZFS_MODULE_PARAM_CALL +/* <func>_args(var): the comma list ZFS_MODULE_PARAM_CALL splices into SYSCTL_PROC() after perm. */ +#define param_set_arc_long_args(var) \ + CTLTYPE_ULONG, &var, 0, param_set_arc_long, "LU" + +#define param_set_arc_min_args(var) \ + CTLTYPE_ULONG, &var, 0, param_set_arc_min, "LU" + +#define param_set_arc_max_args(var) \ + CTLTYPE_ULONG, &var, 0, param_set_arc_max, "LU" + +#define param_set_arc_int_args(var) \ + CTLTYPE_INT, &var, 0, param_set_arc_int, "I" +/* The deadman_* lists (and fletcher_4 below) ignore var and pass NULL in its place. */ +#define param_set_deadman_failmode_args(var) \ + CTLTYPE_STRING, NULL, 0, param_set_deadman_failmode, "A" + +#define param_set_deadman_synctime_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_deadman_synctime, "LU" + +#define param_set_deadman_ziotime_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_deadman_ziotime, "LU" + +#define param_set_multihost_interval_args(var) \ + CTLTYPE_ULONG, &var, 0, param_set_multihost_interval, "LU" + +#define param_set_slop_shift_args(var) \ + CTLTYPE_INT, &var, 0, param_set_slop_shift, "I" + +#define param_set_min_auto_ashift_args(var) \ + CTLTYPE_U64, &var, 0, param_set_min_auto_ashift, "QU" + +#define param_set_max_auto_ashift_args(var) \ + CTLTYPE_U64, &var, 0, param_set_max_auto_ashift, "QU" + +#define fletcher_4_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" +/* module_init(fn): static wrap_<fn>() calls fn(), discarding any result; registered via SYSINIT at SI_SUB_LAST. */ +#include +#define module_init(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSINIT(zfs_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) +/* module_init_early(fn): same wrapper, registered at SI_SUB_INT_CONFIG_HOOKS instead. */ +#define module_init_early(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSINIT(zfs_ ## fn, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST, wrap_ ## fn, NULL) +/* module_exit(fn): same wrapper, registered with SYSUNINIT at SI_SUB_LAST. */ +#define module_exit(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSUNINIT(zfs_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) +/* END CSTYLED */ + +#endif /* _SPL_MOD_H */ diff --git a/include/os/freebsd/spl/sys/mode.h b/include/os/freebsd/spl/sys/mode.h new file mode 100644 index 0000000000..651685d304 --- /dev/null +++
b/include/os/freebsd/spl/sys/mode.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/mount.h b/include/os/freebsd/spl/sys/mount.h new file mode 100644 index 0000000000..42614e4739 --- /dev/null +++ b/include/os/freebsd/spl/sys/mount.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_MOUNT_H_ +#define _OPENSOLARIS_SYS_MOUNT_H_ + +#include +#include_next +#ifdef BUILDING_ZFS +#include +#endif /* BUILDING_ZFS */ +#define MS_FORCE MNT_FORCE +#define MS_REMOUNT MNT_UPDATE +/* fid_t names struct fid; MS_FORCE / MS_REMOUNT above alias MNT_FORCE / MNT_UPDATE. */ +typedef struct fid fid_t; + +#endif /* !_OPENSOLARIS_SYS_MOUNT_H_ */ diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h new file mode 100644 index 0000000000..e757d12c15 --- /dev/null +++ b/include/os/freebsd/spl/sys/mutex.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_MUTEX_H_ +#define _OPENSOLARIS_SYS_MUTEX_H_ +/* kmutex_t is a struct sx; the mutex_* macros below wrap sx_* calls on it. */ +typedef struct sx kmutex_t; + +#include +#include +#include_next +#include_next +#include +#include + +typedef enum { + MUTEX_DEFAULT = 0 /* kernel default mutex */ +} kmutex_type_t; +/* MUTEX_NOT_HELD() is also true whenever panicstr is non-zero. */ +#define MUTEX_HELD(x) (mutex_owned(x)) +#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || panicstr) +/* SX_NOWITNESS is included unless OPENSOLARIS_WITNESS is defined. */ +#ifndef OPENSOLARIS_WITNESS +#define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS) +#else +#define MUTEX_FLAGS (SX_DUPOK | SX_NEW) +#endif +/* mutex_init(): type must be MUTEX_DEFAULT; the sx name is #lock from its first a-z character on (all of #lock if it has none); desc and arg are unused. */ +#define mutex_init(lock, desc, type, arg) do { \ + const char *_name; \ + ASSERT((type) == MUTEX_DEFAULT); \ + for (_name = #lock; *_name != '\0'; _name++) { \ + if (*_name >= 'a' && *_name <= 'z') \ + break; \ + } \ + if (*_name == '\0') \ + _name = #lock; \ + sx_init_flags((lock), _name, MUTEX_FLAGS); \ +} while (0) +#define mutex_destroy(lock) sx_destroy(lock) +#define mutex_enter(lock) sx_xlock(lock) +#define mutex_enter_nested(lock, type) sx_xlock(lock) +#define mutex_tryenter(lock) sx_try_xlock(lock) +#define mutex_exit(lock) sx_xunlock(lock) +#define mutex_owned(lock) sx_xlocked(lock) +#define mutex_owner(lock) sx_xholder(lock) +#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/freebsd/spl/sys/param.h b/include/os/freebsd/spl/sys/param.h new file mode 100644 index 0000000000..92724e332d --- /dev/null +++ b/include/os/freebsd/spl/sys/param.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2007 John Birrell + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _COMPAT_OPENSOLARIS_SYS_PARAM_H_ +#define _COMPAT_OPENSOLARIS_SYS_PARAM_H_ + +#include +#include_next +#define PAGESIZE PAGE_SIZE /* alias for PAGE_SIZE */ +#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) /* x is widened to uint64_t before the shift */ +#ifdef _KERNEL +#include +#include +#endif /* _KERNEL */ +#endif /* _COMPAT_OPENSOLARIS_SYS_PARAM_H_ */ diff --git a/include/os/freebsd/spl/sys/policy.h b/include/os/freebsd/spl/sys/policy.h new file mode 100644 index 0000000000..909ae3886e --- /dev/null +++ b/include/os/freebsd/spl/sys/policy.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $ $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_POLICY_H_ +#define _OPENSOLARIS_SYS_POLICY_H_ + +#include +#include +#include +struct mount; +struct vattr; +struct znode; + +int secpolicy_nfs(cred_t *cr); +int secpolicy_zfs(cred_t *crd); +int secpolicy_zfs_proc(cred_t *cr, proc_t *proc); +int secpolicy_sys_config(cred_t *cr, int checkonly); +int secpolicy_zinject(cred_t *cr); +int secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp); +int secpolicy_basic_link(vnode_t *vp, cred_t *cr); +int secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner); +int secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner); +int secpolicy_vnode_stky_modify(cred_t *cr); +int secpolicy_vnode_remove(vnode_t *vp, cred_t *cr); +int secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t accmode); +int secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t curmode, accmode_t wantmode); +int secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner); +int secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner); +int secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, 
cred_t *), void *node); +int secpolicy_vnode_create_gid(cred_t *cr); +int secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid); +int secpolicy_vnode_setid_retain(struct znode *zp, cred_t *cr, + boolean_t issuidroot); +void secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr); +int secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, cred_t *cr); +int secpolicy_fs_owner(struct mount *vfsp, cred_t *cr); +int secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp); +void secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp); +int secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr, + vtype_t vtype); +int secpolicy_smb(cred_t *cr); + +/* Before __FreeBSD_version 1300005, priv_check_cred() is passed an extra trailing 0. */ +#if __FreeBSD_version >= 1300005 +#define spl_priv_check_cred(a, b) priv_check_cred((a), (b)) +#else +#define spl_priv_check_cred(a, b) priv_check_cred((a), (b), 0) +#endif +#endif /* _OPENSOLARIS_SYS_POLICY_H_ */ diff --git a/include/os/freebsd/spl/sys/proc.h b/include/os/freebsd/spl/sys/proc.h new file mode 100644 index 0000000000..8583df509b --- /dev/null +++ b/include/os/freebsd/spl/sys/proc.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_PROC_H_ +#define _OPENSOLARIS_SYS_PROC_H_ + +#include +#include +#include_next +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#define CPU curcpu +#define minclsyspri PRIBIO +#define defclsyspri minclsyspri +#define maxclsyspri PVM +#define max_ncpus (mp_maxid + 1) +#define boot_max_ncpus (mp_maxid + 1) + +#define TS_RUN 0 + +#define p0 proc0 + +#define t_tid td_tid + +typedef short pri_t; +typedef struct thread _kthread; +typedef struct thread kthread_t; +typedef struct thread *kthread_id_t; +typedef struct proc proc_t; + +extern proc_t *system_proc; +/* Starts a thread running proc(arg) through kproc_kthread_add() and returns td (initialised to NULL; presumably still NULL on failure - confirm). */ +static __inline kthread_t * +do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, + size_t len, proc_t *pp, int state, pri_t pri, const char *name) +{ + kthread_t *td = NULL; + proc_t **ppp; + int error; + + /* + * Be sure there are no surprises.
+ */ + ASSERT(stk == NULL); + ASSERT(len == 0); + ASSERT(state == TS_RUN); + /* Hand kproc_kthread_add() &system_proc when pp is &p0, else the address of pp. */ + if (pp == &p0) + ppp = &system_proc; + else + ppp = &pp; + error = kproc_kthread_add(proc, arg, ppp, &td, RFSTOPPED, + stksize / PAGE_SIZE, "zfskern", "%s", name); + if (error == 0) { + thread_lock(td); + sched_prio(td, pri); + sched_add(td, SRQ_BORING); +#if __FreeBSD_version < 1300068 /* NOTE(review): no unlock on newer kernels - presumably sched_add() releases the lock; confirm */ + thread_unlock(td); +#endif + } + return (td); +} +/* thread_create() names the thread after the stringified proc expression; thread_create_named() takes the name explicitly. */ +#define thread_create_named(name, stk, stksize, proc, arg, len, \ + pp, state, pri) \ + do_thread_create(stk, stksize, proc, arg, len, pp, state, pri, name) +#define thread_create(stk, stksize, proc, arg, len, pp, state, pri) \ + do_thread_create(stk, stksize, proc, arg, len, pp, state, pri, #proc) +#define thread_exit() kthread_exit() + +int uread(proc_t *, void *, size_t, uintptr_t); +int uwrite(proc_t *, void *, size_t, uintptr_t); +/* True when p is curproc. */ +static inline boolean_t +zfs_proc_is_caller(proc_t *p) +{ + return (p == curproc); +} + +#endif /* _KERNEL */ +#endif /* _OPENSOLARIS_SYS_PROC_H_ */ diff --git a/include/os/freebsd/spl/sys/processor.h b/include/os/freebsd/spl/sys/processor.h new file mode 100644 index 0000000000..53149840f2 --- /dev/null +++ b/include/os/freebsd/spl/sys/processor.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + */ + +/* + * Copyright 2014 Garrett D'Amore + * + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PROCESSOR_H +#define _SYS_PROCESSOR_H + +#include +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Definitions for p_online, processor_info & lgrp system calls. + */ + +/* + * Type for an lgrpid + */ +typedef uint16_t lgrpid_t; + +/* + * Type for processor name (CPU number). + */ +typedef int processorid_t; +typedef int chipid_t; +/* getcpuid() expands to curcpu. */ +#define getcpuid() curcpu + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCESSOR_H */ diff --git a/include/os/freebsd/spl/sys/procfs_list.h b/include/os/freebsd/spl/sys/procfs_list.h new file mode 100644 index 0000000000..4bc603756e --- /dev/null +++ b/include/os/freebsd/spl/sys/procfs_list.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SPL_PROCFS_LIST_H +#define _SPL_PROCFS_LIST_H + +#ifndef _STANDALONE + +#include +#include + + +/* + * procfs list manipulation + */ + +typedef struct procfs_list procfs_list_t; +struct procfs_list { + void *pl_private; + void *pl_next_data; + kmutex_t pl_lock; + list_t pl_list; + uint64_t pl_next_id; + int (*pl_show)(struct seq_file *f, void *p); + int (*pl_show_header)(struct seq_file *f); + int (*pl_clear)(procfs_list_t *procfs_list); + size_t pl_node_offset; +}; +/* NOTE(review): presumably embedded in each list element at pl_node_offset - confirm against the implementation. */ +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; + +void procfs_list_install(const char *module, + const char *submodule, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); +void procfs_list_add(procfs_list_t *procfs_list, void *p); + +#else /* _STANDALONE: procfs_list_t is reduced to a plain int and no functions are declared */ +typedef int procfs_list_t; +#endif /* !_STANDALONE */ + +#endif /* _SPL_PROCFS_LIST_H */ diff --git a/include/os/freebsd/spl/sys/random.h b/include/os/freebsd/spl/sys/random.h new file mode 100644 index 0000000000..7583166e72 --- /dev/null +++ b/include/os/freebsd/spl/sys/random.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_RANDOM_H_ +#define _OPENSOLARIS_SYS_RANDOM_H_ + +#include_next +#if __FreeBSD_version >= 1300108 +#include +#endif +/* Hands p and s (narrowed to int) to arc4rand(); always returns 0. */ +static inline int +random_get_bytes(uint8_t *p, size_t s) +{ + arc4rand(p, (int)s, 0); + return (0); +} +/* Same body as random_get_bytes(): both make the identical arc4rand() call. */ +static inline int +random_get_pseudo_bytes(uint8_t *p, size_t s) +{ + arc4rand(p, (int)s, 0); + return (0); +} +/* Kernel builds at __FreeBSD_version >= 1300108 return prng32_bounded(range); otherwise 32 random bits are reduced with % range (range must be non-zero). */ +static inline uint32_t +random_in_range(uint32_t range) +{ +#if defined(_KERNEL) && __FreeBSD_version >= 1300108 + return (prng32_bounded(range)); +#else + uint32_t r; + + ASSERT(range != 0); + /* With range == 1 the only possible result is 0, so no random bytes are drawn. */ + if (range == 1) + return (0); + + (void) random_get_pseudo_bytes((uint8_t *)&r, sizeof (r)); + /* NOTE(review): % is slightly biased unless range divides 2^32. */ + return (r % range); +#endif +} + +#endif /* !_OPENSOLARIS_SYS_RANDOM_H_ */ diff --git a/include/os/freebsd/spl/sys/rwlock.h b/include/os/freebsd/spl/sys/rwlock.h new file mode 100644 index 0000000000..10107a9bee --- /dev/null +++ b/include/os/freebsd/spl/sys/rwlock.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_RWLOCK_H_ +#define _OPENSOLARIS_SYS_RWLOCK_H_ + +#include +#include +#include +#include + +typedef enum { + RW_DEFAULT = 4 /* kernel default rwlock */ +} krw_type_t; + + +typedef enum { + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 +} krw_t; + +typedef struct sx krwlock_t; + +#ifndef OPENSOLARIS_WITNESS +#define RW_FLAGS (SX_DUPOK | SX_NOWITNESS) +#else +#define RW_FLAGS (SX_DUPOK) +#endif + +#define RW_READ_HELD(x) (rw_read_held((x))) +#define RW_WRITE_HELD(x) (rw_write_held((x))) +#define RW_LOCK_HELD(x) (rw_lock_held((x))) +#define RW_ISWRITER(x) (rw_iswriter(x)) +/* BEGIN CSTYLED */ +#define rw_init(lock, desc, type, arg) do { \ + const char *_name; \ + ASSERT((type) == 0 || (type) == RW_DEFAULT); \ + KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \ + LO_EXPECTED, ("lock %s already initialized", #lock)); \ + bzero((lock), sizeof(struct sx)); \ + for (_name = #lock; *_name != '\0'; _name++) { \ + if (*_name >= 'a' && *_name <= 'z') \ + break; \ + } \ + if (*_name == '\0') \ + _name = #lock; \ + sx_init_flags((lock), _name, RW_FLAGS); \ +} while (0) +#define rw_destroy(lock) sx_destroy(lock) +#define rw_enter(lock, how) do { \ + if ((how) == RW_READER) \ + sx_slock(lock); \ + else /* if ((how) == RW_WRITER) */ \ + sx_xlock(lock); \ + } while (0) + +#define rw_tryenter(lock, how) \ + ((how) == RW_READER ? 
sx_try_slock(lock) : sx_try_xlock(lock)) +#define rw_exit(lock) sx_unlock(lock) +#define rw_downgrade(lock) sx_downgrade(lock) +#define rw_tryupgrade(lock) sx_try_upgrade(lock) +#define rw_read_held(lock) \ + ((lock)->sx_lock != SX_LOCK_UNLOCKED && \ + ((lock)->sx_lock & SX_LOCK_SHARED)) +#define rw_write_held(lock) sx_xlocked(lock) +#define rw_lock_held(lock) (rw_read_held(lock) || rw_write_held(lock)) +#define rw_iswriter(lock) sx_xlocked(lock) +#define rw_owner(lock) sx_xholder(lock) + +/* END CSTYLED */ +#endif /* _OPENSOLARIS_SYS_RWLOCK_H_ */ diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h new file mode 100644 index 0000000000..496fc58d7c --- /dev/null +++ b/include/os/freebsd/spl/sys/sdt.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SDT_H_ +#define _OPENSOLARIS_SYS_SDT_H_ + +#include_next +/* BEGIN CSTYLED */ +#ifdef KDTRACE_HOOKS +SDT_PROBE_DECLARE(sdt, , , set__error); +/* Probe fires only if its id is non-zero; err is then evaluated twice. */ +#define SET_ERROR(err) \ + ((sdt_sdt___set__error->id ? \ + (*sdt_probe_func)(sdt_sdt___set__error->id, \ + (uintptr_t)(err), 0, 0, 0, 0) : 0), (err)) +#else +#define SET_ERROR(err) (err) +#endif +/* END CSTYLED */ +#endif /* _OPENSOLARIS_SYS_SDT_H_ */ diff --git a/include/os/freebsd/spl/sys/sid.h b/include/os/freebsd/spl/sys/sid.h new file mode 100644 index 0000000000..d3fab8b247 --- /dev/null +++ b/include/os/freebsd/spl/sys/sid.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SID_H_ +#define _OPENSOLARIS_SYS_SID_H_ +#include +#include + +typedef struct ksiddomain { + char *kd_name; /* Domain part of SID */ + uint_t kd_len; +} ksiddomain_t; +typedef void ksid_t; + +static __inline ksiddomain_t * +ksid_lookupdomain(const char *domain) +{ + ksiddomain_t *kd; + size_t len; + + len = strlen(domain) + 1; + kd = kmem_alloc(sizeof (*kd), KM_SLEEP); + kd->kd_len = (uint_t)len; + kd->kd_name = kmem_alloc(len, KM_SLEEP); + strcpy(kd->kd_name, domain); + return (kd); +} + +static __inline void +ksiddomain_rele(ksiddomain_t *kd) +{ + + kmem_free(kd->kd_name, kd->kd_len); + kmem_free(kd, sizeof (*kd)); +} + +static __inline uint_t +ksid_getid(ksid_t *ks) +{ + + panic("%s has been unexpectedly called", __func__); +} + +static __inline const char * +ksid_getdomain(ksid_t *ks) +{ + + panic("%s has been unexpectedly called", __func__); +} + +static __inline uint_t +ksid_getrid(ksid_t *ks) +{ + + panic("%s has been unexpectedly called", __func__); +} + +#define kidmap_getsidbyuid(zone, uid, sid_prefix, rid) (1) +#define kidmap_getsidbygid(zone, gid, sid_prefix, rid) (1) + +#endif /* _OPENSOLARIS_SYS_SID_H_ */ diff --git a/include/os/freebsd/spl/sys/sig.h b/include/os/freebsd/spl/sys/sig.h new file mode 100644 index 0000000000..a4d440d383 --- /dev/null +++ b/include/os/freebsd/spl/sys/sig.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2008 Pawel Jakub Dawidek + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SIG_H_ +#define _OPENSOLARIS_SYS_SIG_H_ + +#ifndef _STANDALONE + +#include_next +#include +#include +#include +#include +#include +#include + +#define FORREAL 0 +#define JUSTLOOKING 1 + +static __inline int +issig(int why) +{ + struct thread *td = curthread; + struct proc *p; + int sig; + + ASSERT(why == FORREAL || why == JUSTLOOKING); + if (SIGPENDING(td)) { + if (why == JUSTLOOKING) + return (1); + p = td->td_proc; + PROC_LOCK(p); + mtx_lock(&p->p_sigacts->ps_mtx); + sig = cursig(td); + mtx_unlock(&p->p_sigacts->ps_mtx); + PROC_UNLOCK(p); + if (sig != 0) + return (1); + } + return (0); +} + +#endif /* !_STANDALONE */ + +#endif /* _OPENSOLARIS_SYS_SIG_H_ */ diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h new file mode 100644 index 0000000000..53503e8389 --- /dev/null +++ b/include/os/freebsd/spl/sys/simd.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _FREEBSD_SIMD_H +#define _FREEBSD_SIMD_H +#if defined(__amd64__) || defined(__i386__) +#include +#else + +#define kfpu_allowed() 0 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) +#endif +#endif diff --git a/include/os/freebsd/spl/sys/simd_x86.h b/include/os/freebsd/spl/sys/simd_x86.h new file mode 100644 index 0000000000..480bfd2897 --- /dev/null +++ b/include/os/freebsd/spl/sys/simd_x86.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include + +#include +#include +#include + +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) +#define kfpu_allowed() 1 +#define kfpu_initialize(tsk) do {} while (0) +/* kfpu_begin()/kfpu_end() each expand to one statement (do/while (0)). */ +#define kfpu_begin() do { \ + if (__predict_false(!is_fpu_kern_thread(0))) \ + fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);\ +} while (0) + +#define kfpu_end() do { \ + if (__predict_false(curpcb->pcb_flags & PCB_FPUNOSAVE)) \ + fpu_kern_leave(curthread, NULL); \ +} while (0) + +/* + * Check if OS supports AVX and AVX2 by checking XCR0 + * Only call this function if CPUID indicates that AVX feature is + * supported by the CPU, otherwise it might be an illegal instruction. 
+ */ +static inline uint64_t +xgetbv(uint32_t index) +{ + uint32_t eax, edx; + /* xgetbv - instruction byte code */ + __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (index)); + + return ((((uint64_t)edx)<<32) | (uint64_t)eax); +} + + +/* + * Detect register set support + */ +static inline boolean_t +__simd_state_enabled(const uint64_t state) +{ + boolean_t has_osxsave; + uint64_t xcr0; + + has_osxsave = !!(cpu_feature2 & CPUID2_OSXSAVE); + + if (!has_osxsave) + return (B_FALSE); + + xcr0 = xgetbv(0); + return ((xcr0 & state) == state); +} + +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + + +/* + * Check if SSE instruction set is available + */ +static inline boolean_t +zfs_sse_available(void) +{ + return (!!(cpu_feature & CPUID_SSE)); +} + +/* + * Check if SSE2 instruction set is available + */ +static inline boolean_t +zfs_sse2_available(void) +{ + return (!!(cpu_feature & CPUID_SSE2)); +} + +/* + * Check if SSE3 instruction set is available + */ +static inline boolean_t +zfs_sse3_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSE3)); +} + +/* + * Check if SSSE3 instruction set is available + */ +static inline boolean_t +zfs_ssse3_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSSE3)); +} + +/* + * Check if SSE4.1 instruction set is available + */ +static inline boolean_t +zfs_sse4_1_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSE41)); +} + +/* + * Check if SSE4.2 instruction set is available + */ +static inline boolean_t +zfs_sse4_2_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSE42)); +} + +/* + * Check if AVX instruction set is available + */ +static inline boolean_t +zfs_avx_available(void) +{ + boolean_t has_avx; + + has_avx = !!(cpu_feature2 & CPUID2_AVX); + + return (has_avx && __ymm_enabled()); +} + +/* + * Check 
if AVX2 instruction set is available + */ +static inline boolean_t +zfs_avx2_available(void) +{ + boolean_t has_avx2; + + has_avx2 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX2); + + return (has_avx2 && __ymm_enabled()); +} + +/* + * AVX-512 family of instruction sets: + * + * AVX512F Foundation + * AVX512CD Conflict Detection Instructions + * AVX512ER Exponential and Reciprocal Instructions + * AVX512PF Prefetch Instructions + * + * AVX512BW Byte and Word Instructions + * AVX512DQ Double-word and Quadword Instructions + * AVX512VL Vector Length Extensions + * + * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) + * AVX512VBMI Vector Byte Manipulation Instructions + */ + + +/* Check if AVX512F instruction set is available */ +static inline boolean_t +zfs_avx512f_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512CD instruction set is available */ +static inline boolean_t +zfs_avx512cd_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512CD); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512ER instruction set is available (F and ER feature bits) */ +static inline boolean_t +zfs_avx512er_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512ER); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512PF instruction set is available */ +static inline boolean_t +zfs_avx512pf_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512PF); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512BW instruction set is available */ +static inline boolean_t +zfs_avx512bw_available(void) +{ + boolean_t has_avx512 = B_FALSE; + + 
has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512BW); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512DQ instruction set is available */ +static inline boolean_t +zfs_avx512dq_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512DQ); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VL instruction set is available */ +static inline boolean_t +zfs_avx512vl_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512IFMA instruction set is available */ +static inline boolean_t +zfs_avx512ifma_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512IFMA); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VBMI instruction set is available (not the scalar BMI1) */ +static inline boolean_t +zfs_avx512vbmi_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature2 & CPUID_STDEXT2_AVX512VBMI); + + return (has_avx512 && __zmm_enabled()); +} diff --git a/include/os/freebsd/spl/sys/spl_condvar.h b/include/os/freebsd/spl/sys/spl_condvar.h new file mode 100644 index 0000000000..7405f647d5 --- /dev/null +++ b/include/os/freebsd/spl/sys/spl_condvar.h @@ -0,0 +1,81 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2000 Jake Burkholder . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_SYS_CONDVAR_H_ +#define _SPL_SYS_CONDVAR_H_ + +#ifndef LOCORE +#include + +struct lock_object; +struct thread; + +TAILQ_HEAD(cv_waitq, thread); + +/* + * Condition variable. The waiters count is protected by the mutex that + * protects the condition; that is, the mutex that is passed to cv_wait*() + * and is held across calls to cv_signal() and cv_broadcast(). It is an + * optimization to avoid looking up the sleep queue if there are no waiters. 
+ */ +struct cv { + const char *cv_description; + int cv_waiters; +}; + +void cv_init(struct cv *cvp, const char *desc); +void cv_destroy(struct cv *cvp); + +void _cv_wait(struct cv *cvp, struct lock_object *lock); +void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock); +int _cv_wait_sig(struct cv *cvp, struct lock_object *lock); +int _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, + sbintime_t sbt, sbintime_t pr, int flags); +int _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock, + sbintime_t sbt, sbintime_t pr, int flags); + +void cv_signal(struct cv *cvp); +void cv_broadcastpri(struct cv *cvp, int pri); + +#define cv_wait(cvp, lock) \ + _cv_wait((cvp), &(lock)->lock_object) +#define cv_wait_unlock(cvp, lock) \ + _cv_wait_unlock((cvp), &(lock)->lock_object) +#define cv_timedwait_sbt(cvp, lock, sbt, pr, flags) \ + _cv_timedwait_sbt((cvp), &(lock)->lock_object, (sbt), (pr), (flags)) +#define cv_timedwait_sig_sbt(cvp, lock, sbt, pr, flags) \ + _cv_timedwait_sig_sbt((cvp), &(lock)->lock_object, (sbt), (pr), (flags)) + +#define cv_broadcast(cvp) cv_broadcastpri(cvp, 0) + +#define cv_wmesg(cvp) ((cvp)->cv_description) + +#endif /* !LOCORE */ +#endif /* _SPL_SYS_CONDVAR_H_ */ diff --git a/include/os/freebsd/spl/sys/string.h b/include/os/freebsd/spl/sys/string.h new file mode 100644 index 0000000000..859b40285a --- /dev/null +++ b/include/os/freebsd/spl/sys/string.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_STRING_H_ +#define _OPENSOLARIS_SYS_STRING_H_ + +#include + +char *strpbrk(const char *, const char *); +void strident_canon(char *, size_t); +void kmem_strfree(char *); +char *kmem_strdup(const char *s); + +#endif /* _OPENSOLARIS_SYS_STRING_H_ */ diff --git a/include/os/freebsd/spl/sys/strings.h b/include/os/freebsd/spl/sys/strings.h new file mode 100644 index 0000000000..651685d304 --- /dev/null +++ b/include/os/freebsd/spl/sys/strings.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/sunddi.h b/include/os/freebsd/spl/sys/sunddi.h new file mode 100644 index 0000000000..bfbc3e10a1 --- /dev/null +++ b/include/os/freebsd/spl/sys/sunddi.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . 
+ * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SUNDDI_H +#define _SPL_SUNDDI_H + +#include +#include +#include +#include +#ifdef BUILDING_ZFS +#include +#endif + +typedef int ddi_devid_t; + +#define DDI_DEV_T_NONE ((dev_t)-1) +#define DDI_DEV_T_ANY ((dev_t)-2) +#define DI_MAJOR_T_UNKNOWN ((major_t)0) + +#define DDI_PROP_DONTPASS 0x0001 +#define DDI_PROP_CANSLEEP 0x0002 + +#define DDI_SUCCESS 0 +#define DDI_FAILURE (-1) +/* DDI property stubs: lookup only stores NULL through x5; the rest no-op. */ +#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*(x5) = NULL) +#define ddi_prop_free(x) ((void)0) +#define ddi_root_node() ((void)0) + +extern int ddi_strtoul(const char *, char **, int, unsigned long *); +extern int ddi_strtol(const char *, char **, int, long *); +extern int ddi_strtoull(const char *, char **, int, unsigned long long *); +extern int ddi_strtoll(const char *, char **, int, long long *); + +extern int ddi_copyin(const void *from, void *to, size_t len, int flags); +extern int ddi_copyout(const void *from, void *to, size_t len, int flags); +extern void ddi_sysevent_init(void); + + +int ddi_soft_state_init(void **statep, size_t size, size_t nitems); +void ddi_soft_state_fini(void **statep); + +void *ddi_get_soft_state(void *state, int item); +int ddi_soft_state_zalloc(void *state, int item); +void ddi_soft_state_free(void *state, int item); + +#endif /* _SPL_SUNDDI_H */ diff --git 
a/include/os/freebsd/spl/sys/sysmacros.h b/include/os/freebsd/spl/sys/sysmacros.h new file mode 100644 index 0000000000..7e3ab89155 --- /dev/null +++ b/include/os/freebsd/spl/sys/sysmacros.h @@ -0,0 +1,410 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSMACROS_H +#define _SYS_SYSMACROS_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Some macros for units conversion + */ +/* + * Disk blocks (sectors) and bytes. + */ +#define dtob(DD) ((DD) << DEV_BSHIFT) +#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) +#define btodt(BB) ((BB) >> DEV_BSHIFT) +#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef SIGNOF +#define SIGNOF(a) ((a) < 0 ? 
-1 : (a) > 0) +#endif +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#ifdef _STANDALONE +#define boot_ncpus 1 +#else /* _STANDALONE */ +#define boot_ncpus mp_ncpus +#endif /* _STANDALONE */ +#define kpreempt_disable() critical_enter() +#define kpreempt_enable() critical_exit() +#define CPU_SEQID curcpu +#define CPU_SEQID_UNSTABLE curcpu +#define is_system_labeled() 0 +/* + * Convert a single byte to/from binary-coded decimal (BCD). + */ +extern unsigned char byte_to_bcd[256]; +extern unsigned char bcd_to_byte[256]; + +#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff] +#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff] + +/* + * WARNING: The device number macros defined here should not be used by device + * drivers or user software. Device drivers should use the device functions + * defined in the DDI/DKI interface (see also ddi.h). Application software + * should make use of the library routines available in makedev(3). A set of + * new device macros are provided to operate on the expanded device number + * format supported in SVR4. Macro versions of the DDI device functions are + * provided for use by kernel proper routines only. Macro routines bmajor(), + * major(), minor(), emajor(), eminor(), and makedev() will be removed or + * their definitions changed at the next major release following SVR4. + */ + +#define O_BITSMAJOR 7 /* # of SVR3 major device bits */ +#define O_BITSMINOR 8 /* # of SVR3 minor device bits */ +#define O_MAXMAJ 0x7f /* SVR3 max major value */ +#define O_MAXMIN 0xff /* SVR3 max minor value */ + + +#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */ +#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */ +#define L_MAXMAJ32 0x3fff /* SVR4 max major value */ +#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers. 
*/ + /* For 3b2 hardware devices the minor is */ + /* restricted to 256 (0-255) */ + +#ifdef _LP64 +#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */ +#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */ +#define L_MAXMAJ 0xfffffffful /* max major value */ +#define L_MAXMIN 0xfffffffful /* max minor value */ +#else +#define L_BITSMAJOR L_BITSMAJOR32 +#define L_BITSMINOR L_BITSMINOR32 +#define L_MAXMAJ L_MAXMAJ32 +#define L_MAXMIN L_MAXMIN32 +#endif + +/* + * These are versions of the kernel routines for compressing and + * expanding long device numbers that don't return errors. + */ +#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR) + +#define DEVCMPL(x) (x) +#define DEVEXPL(x) (x) + +#else + +#define DEVCMPL(x) \ + (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \ + ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \ + ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32))) + +#define DEVEXPL(x) \ + (((x) == NODEV32) ? NODEV : \ + makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32)) + +#endif /* L_BITSMAJOR32 ... */ + +/* convert to old (SVR3.2) dev format */ + +#define cmpdev(x) \ + (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \ + ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \ + ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN))) + +/* convert to new (SVR4) dev format */ + +#define expdev(x) \ + (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \ + ((x) & O_MAXMIN)) + +/* + * Macro for checking power of 2 address alignment. + */ +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) + +/* + * Macros for counting and rounding. + */ +#define howmany(x, y) (((x)+((y)-1))/(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) + +/* + * Macro to determine if value is a power of 2 + */ +#define ISP2(x) (((x) & ((x) - 1)) == 0) + +/* + * Macros for various sorts of alignment and rounding. The "align" must + * be a power of 2. 
Oftentimes it is a block, sector, or page.
+ */
+
+/*
+ * return x rounded down to an align boundary
+ * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+
+/*
+ * return x % (mod) align
+ * eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+
+/*
+ * return how much space is left in this block (but if it's perfectly
+ * aligned, return 0).
+ * eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+
+/*
+ * return x rounded up to an align boundary
+ * eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
+
+/*
+ * return the address just past the end of the block that x is in
+ * eg, P2END(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, P2END(0x5600, 0x100) == 0x5700 (0x57*align)
+ */
+#define P2END(x, align) (-(~(x) & -(align)))
+
+/*
+ * return x rounded up to the next phase (offset) within align.
+ * phase should be < align.
+ * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase)
+ * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase)
+ */
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
+
+/*
+ * return TRUE if adding len to off would cause it to cross an align
+ * boundary (off is evaluated twice).
+ * eg, P2BOUNDARY(0x1234, 0xe0, 0x100) == TRUE (0x1234 + 0xe0 == 0x1314)
+ * eg, P2BOUNDARY(0x1234, 0x50, 0x100) == FALSE (0x1234 + 0x50 == 0x1284)
+ */
+#define P2BOUNDARY(off, len, align) \
+	(((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
+/*
+ * Return TRUE if they have the same highest bit set.
+ * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000)
+ * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000)
+ */
+#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y)))
+
+/*
+ * Typed versions of the P2* macros. These macros should be used to ensure
+ * that the result is correctly calculated based on the data type of (x),
+ * which is passed in as the last argument, regardless of the data
+ * type of the alignment. For example, if (x) is of type uint64_t,
+ * and we want to round it up to a page boundary using "PAGESIZE" as
+ * the alignment, we can do either
+ *	P2ROUNDUP(x, (uint64_t)PAGESIZE)
+ * or
+ *	P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
+ */
+#define P2ALIGN_TYPED(x, align, type) \
+	((type)(x) & -(type)(align))
+#define P2PHASE_TYPED(x, align, type) \
+	((type)(x) & ((type)(align) - 1))
+#define P2NPHASE_TYPED(x, align, type) \
+	(-(type)(x) & ((type)(align) - 1))
+#define P2ROUNDUP_TYPED(x, align, type) \
+	(-(-(type)(x) & -(type)(align)))
+#define P2END_TYPED(x, align, type) \
+	(-(~(type)(x) & -(type)(align)))
+#define P2PHASEUP_TYPED(x, align, phase, type) \
+	((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
+#define P2CROSS_TYPED(x, y, align, type) \
+	(((type)(x) ^ (type)(y)) > (type)(align) - 1)
+#define P2SAMEHIGHBIT_TYPED(x, y, type) \
+	(((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
+
+/*
+ * Macros to increment/decrement a variable while holding a mutex. mutex and
+ * var must be pointers. Each expands to a bare comma expression.
+ */
+#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex)
+#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex)
+
+/*
+ * Macros to declare bitfields - the order in the parameter list is
+ * Low to High - that is, declare bit 0 first. We only support 8-bit bitfields
+ * because if a field crosses a byte boundary it's not likely to be meaningful
+ * without reassembly in its nonnative endianness.
+ */
+#if defined(_BIT_FIELDS_LTOH) /* fields are emitted in the order given */
+#define DECL_BITFIELD2(_a, _b) \
+	uint8_t _a, _b
+#define DECL_BITFIELD3(_a, _b, _c) \
+	uint8_t _a, _b, _c
+#define DECL_BITFIELD4(_a, _b, _c, _d) \
+	uint8_t _a, _b, _c, _d
+#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \
+	uint8_t _a, _b, _c, _d, _e
+#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \
+	uint8_t _a, _b, _c, _d, _e, _f
+#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \
+	uint8_t _a, _b, _c, _d, _e, _f, _g
+#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \
+	uint8_t _a, _b, _c, _d, _e, _f, _g, _h
+#elif defined(_BIT_FIELDS_HTOL) /* same fields, emitted in reverse order */
+#define DECL_BITFIELD2(_a, _b) \
+	uint8_t _b, _a
+#define DECL_BITFIELD3(_a, _b, _c) \
+	uint8_t _c, _b, _a
+#define DECL_BITFIELD4(_a, _b, _c, _d) \
+	uint8_t _d, _c, _b, _a
+#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \
+	uint8_t _e, _d, _c, _b, _a
+#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \
+	uint8_t _f, _e, _d, _c, _b, _a
+#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \
+	uint8_t _g, _f, _e, _d, _c, _b, _a
+#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \
+	uint8_t _h, _g, _f, _e, _d, _c, _b, _a
+#else
+#error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
+#endif /* _BIT_FIELDS_LTOH */
+
+#if !defined(_KMEMUSER) && !defined(offsetof)
+
+/* avoid any possibility of clashing with <stddef.h> version */
+
+#define offsetof(type, field) __offsetof(type, field)
+#endif
+
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+static __inline int
+highbit(ulong_t i)
+{
+#if defined(HAVE_INLINE_FLSL)
+	return (flsl(i)); /* used when HAVE_INLINE_FLSL is defined */
+#else
+	int h = 1; /* 1-based index of the highest set bit located so far */
+
+	if (i == 0)
+		return (0); /* no bit set */
+#ifdef _LP64
+	if (i & 0xffffffff00000000ul) { /* a bit is set in the upper 32 bits */
+		h += 32; i >>= 32;
+	}
+#endif
+	if (i & 0xffff0000) { /* halve the search window each step */
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) { /* final step: bit 1 versus bit 0 */
+		h += 1;
+	}
+	return (h);
+#endif
+}
+
+/*
+ * Find highest one bit set in a 64-bit value.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+static __inline int
+highbit64(uint64_t i)
+{
+#if defined(HAVE_INLINE_FLSLL)
+	return (flsll(i)); /* used when HAVE_INLINE_FLSLL is defined */
+#else
+	int h = 1; /* 1-based index of the highest set bit located so far */
+
+	if (i == 0)
+		return (0); /* no bit set */
+	if (i & 0xffffffff00000000ULL) { /* a bit is set in the upper 32 bits */
+		h += 32; i >>= 32;
+	}
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) { /* final step: bit 1 versus bit 0 */
+		h += 1;
+	}
+	return (h);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSMACROS_H */
diff --git a/include/os/freebsd/spl/sys/systeminfo.h b/include/os/freebsd/spl/sys/systeminfo.h
new file mode 100644
index 0000000000..4028cd7cc6
--- /dev/null
+++ b/include/os/freebsd/spl/sys/systeminfo.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SYSTEMINFO_H_ +#define _SYS_SYSTEMINFO_H_ + +#define HW_HOSTID_LEN 11 + +#endif /* !_SYS_SYSTEMINFO_H_ */ diff --git a/include/os/freebsd/spl/sys/systm.h b/include/os/freebsd/spl/sys/systm.h new file mode 100644 index 0000000000..98ee955752 --- /dev/null +++ b/include/os/freebsd/spl/sys/systm.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SYSTM_H_ +#define _OPENSOLARIS_SYS_SYSTM_H_ + +#include +#include_next + +#include + +#define PAGESIZE PAGE_SIZE +#define PAGEOFFSET (PAGESIZE - 1) +#define PAGEMASK (~PAGEOFFSET) + +#define delay(x) pause("soldelay", (x)) + +#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h new file mode 100644 index 0000000000..3040549e04 --- /dev/null +++ b/include/os/freebsd/spl/sys/taskq.h @@ -0,0 +1,124 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TASKQ_H
+#define _SYS_TASKQ_H
+
+#ifdef _KERNEL
+
+#include /* NOTE(review): header names are missing on these #include lines */
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TASKQ_NAMELEN 31
+
+typedef struct taskq {
+	struct taskqueue *tq_queue; /* the only member: a struct taskqueue pointer */
+} taskq_t;
+
+typedef uintptr_t taskqid_t;
+typedef void (task_func_t)(void *); /* task callback: one opaque argument, no result */
+
+typedef struct taskq_ent {
+	struct task tqent_task;
+	struct timeout_task tqent_timeout_task;
+	task_func_t *tqent_func;
+	void *tqent_arg;
+	taskqid_t tqent_id;
+	CK_LIST_ENTRY(taskq_ent) tqent_hash;
+	uint8_t tqent_type;
+	uint8_t tqent_registered;
+	uint8_t tqent_cancelled;
+	volatile uint32_t tqent_rc;
+} taskq_ent_t;
+
+/*
+ * Public flags for taskq_create(): bit range 0-15
+ */
+#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */
+#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
+#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
+#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */
+#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */
+
+/*
+ * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
+ * KM_SLEEP/KM_NOSLEEP.
+ */
+#define TQ_SLEEP 0x00 /* Can block for memory */
+#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */
+#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
+#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */
+#define TQ_FRONT 0x08 /* Put task at the front of the queue */
+
+#define TASKQID_INVALID ((taskqid_t)0) /* id value 0 denotes "no task" */
+
+#define taskq_init_ent(x) /* expands to nothing */
+extern taskq_t *system_taskq;
+/* Global dynamic task queue for long delay */
+extern taskq_t *system_delay_taskq;
+
+extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *,
+    uint_t, clock_t);
+extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+    taskq_ent_t *);
+extern int taskq_empty_ent(taskq_ent_t *);
+taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
+taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
+    struct proc *, uint_t);
+taskq_t *taskq_create_sysdc(const char *, int, int, int,
+    struct proc *, uint_t, uint_t);
+void nulltask(void *);
+extern void taskq_destroy(taskq_t *);
+extern void taskq_wait_id(taskq_t *, taskqid_t);
+extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
+extern void taskq_wait(taskq_t *);
+extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_member(taskq_t *, kthread_t *);
+extern taskq_t *taskq_of_curthread(void);
+void taskq_suspend(taskq_t *);
+int taskq_suspended(taskq_t *);
+void taskq_resume(taskq_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _KERNEL */
+
+#ifdef _STANDALONE
+typedef int taskq_ent_t; /* plain int stand-in for _STANDALONE builds */
+#define taskq_init_ent(x) /* expands to nothing */
+#endif /* _STANDALONE */
+
+#endif /* _SYS_TASKQ_H */
diff --git a/include/os/freebsd/spl/sys/thread.h b/include/os/freebsd/spl/sys/thread.h
new file mode 100644
index 0000000000..4fb1a542f5
--- /dev/null
+++ b/include/os/freebsd/spl/sys/thread.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c)
2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_THREAD_H_ +#define _SPL_THREAD_H_ + +#define getcomm() curthread->td_name +#define getpid() curthread->td_tid +#endif diff --git a/include/os/freebsd/spl/sys/time.h b/include/os/freebsd/spl/sys/time.h new file mode 100644 index 0000000000..fbc679aacf --- /dev/null +++ b/include/os/freebsd/spl/sys/time.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_TIME_H_
+#define _OPENSOLARIS_SYS_TIME_H_
+#pragma once
+#include_next /* NOTE(review): header name missing here; presumably <sys/time.h> -- confirm */
+#include
+#ifndef _SYS_KERNEL_H_
+extern int hz;
+#endif
+
+#define SEC 1
+#define MILLISEC 1000UL
+#define MICROSEC 1000000UL
+#define NANOSEC 1000000000UL
+#define TIME_MAX LLONG_MAX
+
+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
+
+#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
+#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
+
+#define NSEC2SEC(n) ((n) / (NANOSEC / SEC))
+#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC))
+
+typedef longlong_t hrtime_t; /* nanosecond counts are carried in this type */
+
+#if defined(__i386__) || defined(__powerpc__)
+#define TIMESPEC_OVERFLOW(ts) \
+	((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX)
+#else
+#define TIMESPEC_OVERFLOW(ts) \
+	((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX)
+#endif
+
+#define SEC_TO_TICK(sec) ((sec) * hz)
+#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) /* divisor is truncated ns-per-tick */
+
+static __inline hrtime_t
+gethrtime(void)
+{
+	struct timespec ts;
+	hrtime_t nsec;
+
+	nanouptime(&ts); /* time source for this variant */
+	nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; /* flatten sec + nsec into ns */
+	return (nsec);
+}
+
+#define gethrestime_sec() (time_second)
+#define gethrestime(ts) getnanotime(ts)
+#define gethrtime_waitfree() gethrtime() /* plain alias of gethrtime() */
+
+extern int nsec_per_tick; /* nanoseconds per clock tick */
+
+#define ddi_get_lbolt64() /* scales getsbinuptime() by hz, shifting right 32 bits in two steps */ \
+	(int64_t)(((getsbinuptime() >> 16) * hz) >> 16)
+#define ddi_get_lbolt() (clock_t)ddi_get_lbolt64()
+
+#else /* NOTE(review): every inner #if above is closed, so this pairs with the include guard -- confirm intent */
+
+static __inline hrtime_t
+gethrtime(void)
+{
+	struct timespec ts;
+	clock_gettime(CLOCK_UPTIME, &ts); /* time source for this variant */
+	return (((u_int64_t)ts.tv_sec) * NANOSEC + ts.tv_nsec);
+}
+#endif /* !_OPENSOLARIS_SYS_TIME_H_ */
diff --git a/include/os/freebsd/spl/sys/timer.h b/include/os/freebsd/spl/sys/timer.h
new file mode 100644
index 0000000000..d4694bb7c0
--- /dev/null
+++ b/include/os/freebsd/spl/sys/timer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPL_TIMER_H_
+#define _SPL_TIMER_H_
+#define ddi_time_after(a, b) ((a) > (b)) /* plain comparison; no wraparound handling */
+#define ddi_time_after64(a, b) ((a) > (b)) /* identical expansion to ddi_time_after() */
+#define usleep_range(wakeup, wakeupepsilon) /* pause_sbt(): wakeup us as the time, window width as the precision */ \
+	pause_sbt("usleep_range", ustosbt(wakeup), \
+	ustosbt((wakeupepsilon) - (wakeup)), 0) /* args parenthesized so expression arguments subtract correctly */
+
+#define schedule() pause("schedule", 1) /* pause() with a timeout argument of 1 */
+#endif
diff --git a/include/os/freebsd/spl/sys/trace.h b/include/os/freebsd/spl/sys/trace.h
new file mode 100644
index 0000000000..d9639d27b6
--- /dev/null
+++ b/include/os/freebsd/spl/sys/trace.h
@@ -0,0 +1 @@
+/* keep me */
diff --git a/include/os/freebsd/spl/sys/trace_zfs.h b/include/os/freebsd/spl/sys/trace_zfs.h
new file mode 100644
index 0000000000..d9639d27b6
--- /dev/null
+++ b/include/os/freebsd/spl/sys/trace_zfs.h
@@ -0,0 +1 @@
+/* keep me */
diff --git a/include/os/freebsd/spl/sys/types.h b/include/os/freebsd/spl/sys/types.h
new file mode 100644
index 0000000000..ecb91fd1bb
--- /dev/null
+++ b/include/os/freebsd/spl/sys/types.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPL_SYS_TYPES_H_
+#define _SPL_SYS_TYPES_H_
+
+#pragma once
+/*
+ * This is a bag of dirty hacks to keep things compiling.
+ */
+#include_next /* NOTE(review): header name missing here; presumably <sys/types.h> -- confirm */
+
+#ifdef __ILP32__
+typedef __uint64_t u_longlong_t;
+typedef __int64_t longlong_t;
+#else
+typedef unsigned long long u_longlong_t;
+typedef long long longlong_t;
+#endif
+#include
+
+#define _CLOCK_T_DECLARED
+
+#include
+#include
+#include
+
+#define MAXNAMELEN 256
+
+
+
+typedef void zfs_kernel_param_t;
+
+typedef struct timespec timestruc_t; /* three aliases of struct timespec follow */
+typedef struct timespec timespec_t;
+typedef struct timespec inode_timespec_t;
+/* BEGIN CSTYLED */
+typedef u_int uint_t;
+typedef u_char uchar_t;
+typedef u_short ushort_t;
+typedef u_long ulong_t;
+typedef int minor_t;
+/* END CSTYLED */
+#ifndef _OFF64_T_DECLARED
+#define _OFF64_T_DECLARED
+typedef off_t off64_t;
+#endif
+typedef id_t taskid_t;
+typedef id_t projid_t;
+typedef id_t poolid_t;
+typedef uint_t zoneid_t;
+typedef id_t ctid_t;
+typedef mode_t o_mode_t;
+typedef uint64_t pgcnt_t;
+
+#define B_FALSE 0 /* macro form; the enum form lives in the #else branch below */
+#define B_TRUE 1
+
+typedef short index_t;
+typedef off_t offset_t;
+#ifndef _PTRDIFF_T_DECLARED
+typedef __ptrdiff_t ptrdiff_t; /* pointer difference */
+#define _PTRDIFF_T_DECLARED
+#endif
+typedef int64_t rlim64_t;
+typedef int major_t;
+
+#else /* NOTE(review): every inner #if above is closed, so this pairs with the include guard -- confirm intent */
+#ifdef NEED_SOLARIS_BOOLEAN
+#if defined(__XOPEN_OR_POSIX)
+typedef enum { _B_FALSE, _B_TRUE } boolean_t;
+#else
+typedef enum { B_FALSE, B_TRUE } boolean_t;
+#endif /* defined(__XOPEN_OR_POSIX) */
+#endif /* NEED_SOLARIS_BOOLEAN */
+
+typedef u_longlong_t u_offset_t;
+typedef u_longlong_t len_t;
+
+typedef longlong_t diskaddr_t;
+
+#include /* NOTE(review): header name is missing on this #include line */
+#endif /* !_SPL_SYS_TYPES_H_ */
diff --git a/include/os/freebsd/spl/sys/types32.h b/include/os/freebsd/spl/sys/types32.h
new file mode 100644
index 0000000000..907b667e5d
--- /dev/null
+++ b/include/os/freebsd/spl/sys/types32.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* _SPL_TYPES32_H */ diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h new file mode 100644 index 0000000000..b71f2f2e56 --- /dev/null +++ b/include/os/freebsd/spl/sys/uio.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_UIO_H_
+#define _OPENSOLARIS_SYS_UIO_H_
+
+#ifndef _STANDALONE
+
+#include_next /* NOTE(review): header name missing here; presumably <sys/uio.h> -- confirm */
+#include
+#include
+
+typedef struct iovec iovec_t;
+typedef enum uio_seg zfs_uio_seg_t;
+typedef enum uio_rw zfs_uio_rw_t;
+
+typedef struct zfs_uio {
+	struct uio *uio; /* the wrapped native uio; all accessors below go through it */
+} zfs_uio_t;
+
+#define GET_UIO_STRUCT(u) (u)->uio /* usable as an lvalue (see zfs_uio_init) */
+#define zfs_uio_segflg(u) GET_UIO_STRUCT(u)->uio_segflg
+#define zfs_uio_offset(u) GET_UIO_STRUCT(u)->uio_offset
+#define zfs_uio_resid(u) GET_UIO_STRUCT(u)->uio_resid
+#define zfs_uio_iovcnt(u) GET_UIO_STRUCT(u)->uio_iovcnt
+#define zfs_uio_iovlen(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_len
+#define zfs_uio_iovbase(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_base
+#define zfs_uio_td(u) GET_UIO_STRUCT(u)->uio_td
+#define zfs_uio_rw(u) GET_UIO_STRUCT(u)->uio_rw
+#define zfs_uio_fault_disable(u, set) /* expands to nothing */
+#define zfs_uio_prefaultpages(size, u) (0) /* always evaluates to 0 */
+
+static inline void
+zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
+{
+	zfs_uio_offset(uio) = off; /* overwrite uio_offset of the wrapped uio */
+}
+
+static inline void
+zfs_uio_advance(zfs_uio_t *uio, size_t size)
+{
+	zfs_uio_resid(uio) -= size; /* size fewer bytes remain */
+	zfs_uio_offset(uio) += size; /* and the offset moves forward by the same amount */
+}
+
+static __inline void
+zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s)
+{
+	GET_UIO_STRUCT(uio) = uio_s; /* stores the pointer only; nothing is copied */
+}
+
+int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio);
+
+#endif /* !_STANDALONE */
+
+#endif /* !_OPENSOLARIS_SYS_UIO_H_ */
diff --git a/include/os/freebsd/spl/sys/uuid.h b/include/os/freebsd/spl/sys/uuid.h
new file mode 100644
index 0000000000..26d46e8d62
--- /dev/null
+++ b/include/os/freebsd/spl/sys/uuid.h
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UUID_H +#define _SYS_UUID_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The copyright in this file is taken from the original Leach + * & Salz UUID specification, from which this implementation + * is derived. + */ + +/* + * Copyright (c) 1990- 1993, 1996 Open Software Foundation, Inc. + * Copyright (c) 1989 by Hewlett-Packard Company, Palo Alto, Ca. & + * Digital Equipment Corporation, Maynard, Mass. Copyright (c) 1998 + * Microsoft. To anyone who acknowledges that this file is provided + * "AS IS" without any express or implied warranty: permission to use, + * copy, modify, and distribute this file for any purpose is hereby + * granted without fee, provided that the above copyright notices and + * this notice appears in all source code copies, and that none of the + * names of Open Software Foundation, Inc., Hewlett-Packard Company, + * or Digital Equipment Corporation be used in advertising or + * publicity pertaining to distribution of the software without + * specific, written prior permission. Neither Open Software + * Foundation, Inc., Hewlett-Packard Company, Microsoft, nor Digital + * Equipment Corporation makes any representations about the + * suitability of this software for any purpose. 
+ */ + +#include +#include + +typedef struct { + uint8_t nodeID[6]; +} uuid_node_t; + +/* + * The uuid type used throughout when referencing uuids themselves + */ +typedef struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint8_t clock_seq_hi_and_reserved; + uint8_t clock_seq_low; + uint8_t node_addr[6]; +} uuid_t; + +#define UUID_PRINTABLE_STRING_LENGTH 37 + +/* + * Convert a uuid to/from little-endian format + */ +#define UUID_LE_CONVERT(dest, src) \ +{ \ + (dest) = (src); \ + (dest).time_low = LE_32((dest).time_low); \ + (dest).time_mid = LE_16((dest).time_mid); \ + (dest).time_hi_and_version = LE_16((dest).time_hi_and_version); \ +} + +static __inline int +uuid_is_null(const caddr_t uuid) +{ + return (0); +} +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UUID_H */ diff --git a/include/os/freebsd/spl/sys/vfs.h b/include/os/freebsd/spl/sys/vfs.h new file mode 100644 index 0000000000..22d57cc473 --- /dev/null +++ b/include/os/freebsd/spl/sys/vfs.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VFS_H_ +#define _OPENSOLARIS_SYS_VFS_H_ + +#include +#include +#include + +#define rootdir rootvnode + +struct thread; +struct vnode; +typedef struct mount vfs_t; + +typedef int umode_t; + +#define vfs_flag mnt_flag +#define vfs_data mnt_data +#define vfs_count mnt_ref +#define vfs_fsid mnt_stat.f_fsid +#define vfs_bsize mnt_stat.f_bsize +#define vfs_resource mnt_stat.f_mntfromname + +#define v_flag v_vflag +#define v_vfsp v_mount + +#define VFS_RDONLY MNT_RDONLY +#define VFS_NOSETUID MNT_NOSUID +#define VFS_NOEXEC MNT_NOEXEC + +#define VROOT VV_ROOT + +#define XU_NGROUPS 16 + +/* + * Structure defining a mount option for a filesystem. 
+ * option names are found in mntent.h + */ +typedef struct mntopt { + char *mo_name; /* option name */ + char **mo_cancel; /* list of options cancelled by this one */ + char *mo_arg; /* argument string for this option */ + int mo_flags; /* flags for this mount option */ + void *mo_data; /* filesystem specific data */ +} mntopt_t; + +/* + * Flags that apply to mount options + */ + +#define MO_SET 0x01 /* option is set */ +#define MO_NODISPLAY 0x02 /* option not listed in mnttab */ +#define MO_HASVALUE 0x04 /* option takes a value */ +#define MO_IGNORE 0x08 /* option ignored by parser */ +#define MO_DEFAULT MO_SET /* option is on by default */ +#define MO_TAG 0x10 /* flags a tag set by user program */ +#define MO_EMPTY 0x20 /* empty space in option table */ + +#define VFS_NOFORCEOPT 0x01 /* honor MO_IGNORE (don't set option) */ +#define VFS_DISPLAY 0x02 /* Turn off MO_NODISPLAY bit for opt */ +#define VFS_NODISPLAY 0x04 /* Turn on MO_NODISPLAY bit for opt */ +#define VFS_CREATEOPT 0x08 /* Create the opt if it's not there */ + +/* + * Structure holding mount option strings for the mounted file system. 
+ */ +typedef struct mntopts { + uint_t mo_count; /* number of entries in table */ + mntopt_t *mo_list; /* list of mount options */ +} mntopts_t; + +void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, + int flags __unused); +void vfs_clearmntopt(vfs_t *vfsp, const char *name); +int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp); +int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, + char *fspath, char *fspec, int fsflags); + +typedef uint64_t vfs_feature_t; + +#define VFSFT_XVATTR 0x100000001 /* Supports xvattr for attrs */ +#define VFSFT_CASEINSENSITIVE 0x100000002 /* Supports case-insensitive */ +#define VFSFT_NOCASESENSITIVE 0x100000004 /* NOT case-sensitive */ +#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */ +#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */ +#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */ +#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */ +#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */ +#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */ +#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200 + /* Support loaning /returning cache buffer */ + +#define vfs_set_feature(vfsp, feature) do { } while (0) +#define vfs_clear_feature(vfsp, feature) do { } while (0) +#define vfs_has_feature(vfsp, feature) (0) + +#include +#endif /* _OPENSOLARIS_SYS_VFS_H_ */ diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h new file mode 100644 index 0000000000..7b3830be8a --- /dev/null +++ b/include/os/freebsd/spl/sys/vm.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VM_H_ +#define _OPENSOLARIS_SYS_VM_H_ + +#include + +extern const int zfs_vm_pagerret_bad; +extern const int zfs_vm_pagerret_error; +extern const int zfs_vm_pagerret_ok; +extern const int zfs_vm_pagerput_sync; +extern const int zfs_vm_pagerput_inval; + +void zfs_vmobject_assert_wlocked(vm_object_t object); +void zfs_vmobject_wlock(vm_object_t object); +void zfs_vmobject_wunlock(vm_object_t object); + +#if __FreeBSD_version >= 1300081 +#define zfs_vmobject_assert_wlocked_12(x) +#define zfs_vmobject_wlock_12(x) +#define zfs_vmobject_wunlock_12(x) +#else +#define zfs_vmobject_assert_wlocked_12(x) \ + zfs_vmobject_assert_wlocked((x)) +#define zfs_vmobject_wlock_12(x) \ + zfs_vmobject_wlock(x) +#define zfs_vmobject_wunlock_12(x) \ + zfs_vmobject_wunlock(x) +#define vm_page_grab_unlocked(obj, idx, flags) \ + vm_page_grab((obj), (idx), (flags)) +#define vm_page_grab_valid_unlocked(m, obj, idx, flags) \ + vm_page_grab_valid((m), (obj), (idx), (flags)) +#endif +static inline caddr_t +zfs_map_page(vm_page_t pp, struct sf_buf **sfp) +{ + *sfp = sf_buf_alloc(pp, 0); + return ((caddr_t)sf_buf_kva(*sfp)); +} + +static inline void +zfs_unmap_page(struct sf_buf *sf) +{ + sf_buf_free(sf); +} + +#endif /* _OPENSOLARIS_SYS_VM_H_ */ diff --git a/include/os/freebsd/spl/sys/vmsystm.h b/include/os/freebsd/spl/sys/vmsystm.h new file mode 100644 index 0000000000..0db34bbe43 --- /dev/null +++ b/include/os/freebsd/spl/sys/vmsystm.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_VMSYSTM_H_ +#define _SPL_VMSYSTM_H_ + +#define xcopyout copyout + +#endif diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h new file mode 100644 index 0000000000..3bc8a18eeb --- /dev/null +++ b/include/os/freebsd/spl/sys/vnode.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VNODE_H_ +#define _OPENSOLARIS_SYS_VNODE_H_ + +struct vnode; +struct vattr; +struct xucred; + +typedef struct flock flock64_t; +typedef struct vnode vnode_t; +typedef struct vattr vattr_t; +typedef enum vtype vtype_t; + +#include +#include +#include_next +#include +enum symfollow { NO_FOLLOW = NOFOLLOW }; + +#define NOCRED ((struct ucred *)0) /* no credential available */ +#define F_FREESP 11 /* Free file space */ + +#include +#include +#ifndef IN_BASE +#include_next +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct vop_vector vnodeops_t; +#define VOP_FID VOP_VPTOFH +#define vop_fid vop_vptofh +#define vop_fid_args vop_vptofh_args +#define a_fid a_fhp + +#define rootvfs (rootvnode == NULL ? 
NULL : rootvnode->v_mount) + +#ifndef IN_BASE +static __inline int +vn_is_readonly(vnode_t *vp) +{ + return (vp->v_mount->mnt_flag & MNT_RDONLY); +} +#endif +#define vn_vfswlock(vp) (0) +#define vn_vfsunlock(vp) do { } while (0) +#define vn_ismntpt(vp) \ + ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) +#define vn_mountedvfs(vp) ((vp)->v_mountedhere) +#define vn_has_cached_data(vp) \ + ((vp)->v_object != NULL && \ + (vp)->v_object->resident_page_count > 0) + +static __inline void +vn_flush_cached_data(vnode_t *vp, boolean_t sync) +{ +#if __FreeBSD_version > 1300054 + if (vm_object_mightbedirty(vp->v_object)) { +#else + if (vp->v_object->flags & OBJ_MIGHTBEDIRTY) { +#endif + int flags = sync ? OBJPC_SYNC : 0; + zfs_vmobject_wlock(vp->v_object); + vm_object_page_clean(vp->v_object, 0, 0, flags); + zfs_vmobject_wunlock(vp->v_object); + } +} + +#define vn_exists(vp) do { } while (0) +#define vn_invalid(vp) do { } while (0) +#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) +#define vn_free(vp) do { } while (0) +#define vn_matchops(vp, vops) ((vp)->v_op == &(vops)) + +#define VN_HOLD(v) vref(v) +#define VN_RELE(v) vrele(v) +#define VN_URELE(v) vput(v) + +#define vnevent_create(vp, ct) do { } while (0) +#define vnevent_link(vp, ct) do { } while (0) +#define vnevent_remove(vp, dvp, name, ct) do { } while (0) +#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest_dir(vp, ct) do { } while (0) + +#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) +#define MANDLOCK(vp, mode) (0) + +/* + * We will use va_spare in place of Solaris' va_mask. + * This field is initialized in zfs_setattr(). + */ +#define va_mask va_spare +/* TODO: va_fileid is shorter than va_nodeid !!! */ +#define va_nodeid va_fileid +/* TODO: This field needs conversion! 
*/ +#define va_nblocks va_bytes +#define va_blksize va_blocksize +#define va_seq va_gen + +#define MAXOFFSET_T OFF_MAX +#define EXCL 0 + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FEXCL O_EXCL +#ifndef FDSYNC +#define FDSYNC FFSYNC +#endif +#define FRSYNC FFSYNC +#define FSYNC FFSYNC +#define FOFFMAX 0x00 +#define FIGNORECASE 0x00 + +/* + * Attributes of interest to the caller of setattr or getattr. + */ +#define AT_MODE 0x00002 +#define AT_UID 0x00004 +#define AT_GID 0x00008 +#define AT_FSID 0x00010 +#define AT_NODEID 0x00020 +#define AT_NLINK 0x00040 +#define AT_SIZE 0x00080 +#define AT_ATIME 0x00100 +#define AT_MTIME 0x00200 +#define AT_CTIME 0x00400 +#define AT_RDEV 0x00800 +#define AT_BLKSIZE 0x01000 +#define AT_NBLOCKS 0x02000 +/* 0x04000 */ /* unused */ +#define AT_SEQ 0x08000 +/* + * If AT_XVATTR is set then there are additional bits to process in + * the xvattr_t's attribute bitmap. If this is not set then the bitmap + * MUST be ignored. Note that this bit must be set/cleared explicitly. + * That is, setting AT_ALL will NOT set AT_XVATTR. 
+ */ +#define AT_XVATTR 0x10000 + +#define AT_ALL (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\ + AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\ + AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) + +#define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\ + AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV) + +#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME) + +#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|\ + AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) + +#ifndef IN_BASE +static __inline void +vattr_init_mask(vattr_t *vap) +{ + + vap->va_mask = 0; + + if (vap->va_uid != (uid_t)VNOVAL) + vap->va_mask |= AT_UID; + if (vap->va_gid != (gid_t)VNOVAL) + vap->va_mask |= AT_GID; + if (vap->va_size != (u_quad_t)VNOVAL) + vap->va_mask |= AT_SIZE; + if (vap->va_atime.tv_sec != VNOVAL) + vap->va_mask |= AT_ATIME; + if (vap->va_mtime.tv_sec != VNOVAL) + vap->va_mask |= AT_MTIME; + if (vap->va_mode != (uint16_t)VNOVAL) + vap->va_mask |= AT_MODE; + if (vap->va_flags != VNOVAL) + vap->va_mask |= AT_XVATTR; +} +#endif + +#define RLIM64_INFINITY 0 + +static __inline int +vn_rename(char *from, char *to, enum uio_seg seg) +{ + + ASSERT(seg == UIO_SYSSPACE); + + return (kern_renameat(curthread, AT_FDCWD, from, AT_FDCWD, to, seg)); +} + +#include + +#endif /* _OPENSOLARIS_SYS_VNODE_H_ */ diff --git a/include/os/freebsd/spl/sys/vnode_impl.h b/include/os/freebsd/spl/sys/vnode_impl.h new file mode 100644 index 0000000000..c82b1fc9ad --- /dev/null +++ b/include/os/freebsd/spl/sys/vnode_impl.h @@ -0,0 +1,268 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_VNODE_IMPL_H +#define _SYS_VNODE_IMPL_H + + +#define IS_DEVVP(vp) \ + ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) + +#define V_XATTRDIR 0x0000 /* attribute unnamed directory */ + +#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ + +/* + * The xvattr structure is really a variable length structure that + * is made up of: + * - The classic vattr_t (xva_vattr) + * - a 32 bit quantity (xva_mapsize) that specifies the size of the + * attribute bitmaps in 32 bit words. + * - A pointer to the returned attribute bitmap (needed because the + * previous element, the requested attribute bitmap) is variable length. + * - The requested attribute bitmap, which is an array of 32 bit words. + * Callers use the XVA_SET_REQ() macro to set the bits corresponding to + * the attributes that are being requested. + * - The returned attribute bitmap, which is an array of 32 bit words. 
+ * File systems that support optional attributes use the XVA_SET_RTN() + * macro to set the bits corresponding to the attributes that are being + * returned. + * - The xoptattr_t structure which contains the attribute values + * + * xva_mapsize determines how many words in the attribute bitmaps. + * Immediately following the attribute bitmaps is the xoptattr_t. + * xva_getxoptattr() is used to get the pointer to the xoptattr_t + * section. + */ + +#define XVA_MAPSIZE 3 /* Size of attr bitmaps */ +#define XVA_MAGIC 0x78766174 /* Magic # for verification */ + +/* + * The xvattr structure is an extensible structure which permits optional + * attributes to be requested/returned. File systems may or may not support + * optional attributes. They do so at their own discretion but if they do + * support optional attributes, they must register the VFSFT_XVATTR feature + * so that the optional attributes can be set/retrieved. + * + * The fields of the xvattr structure are: + * + * xva_vattr - The first element of an xvattr is a legacy vattr structure + * which includes the common attributes. If AT_XVATTR is set in the va_mask + * then the entire structure is treated as an xvattr. If AT_XVATTR is not + * set, then only the xva_vattr structure can be used. + * + * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. + * + * xva_mapsize - Size of requested and returned attribute bitmaps. + * + * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the + * size of the array before it, xva_reqattrmap[], could change which means + * the location of xva_rtnattrmap[] could change. This will allow unbundled + * file systems to find the location of xva_rtnattrmap[] when the sizes change. + * + * xva_reqattrmap[] - Array of requested attributes. Attributes are + * represented by a specific bit in a specific element of the attribute + * map array. Callers set the bits corresponding to the attributes + * that the caller wants to get/set. 
+ * + * xva_rtnattrmap[] - Array of attributes that the file system was able to + * process. Not all file systems support all optional attributes. This map + * informs the caller which attributes the underlying file system was able + * to set/get. (Same structure as the requested attributes array in terms + * of each attribute corresponding to specific bits and array elements.) + * + * xva_xoptattrs - Structure containing values of optional attributes. + * These values are only valid if the corresponding bits in xva_reqattrmap + * are set and the underlying file system supports those attributes. + */ + + + +/* + * Attribute bits used in the extensible attribute's (xva's) attribute + * bitmaps. Note that the bitmaps are made up of a variable length number + * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n" + * is the element in the bitmap (starting at 1). This convention is for + * the convenience of the maintainer to keep track of which element each + * attribute belongs to. + * + * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS + * MUST USE THE XAT_* DEFINES. 
+ */ +#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ +#define XAT0_CREATETIME 0x00000001 /* Create time of file */ +#define XAT0_ARCHIVE 0x00000002 /* Archive */ +#define XAT0_SYSTEM 0x00000004 /* System */ +#define XAT0_READONLY 0x00000008 /* Readonly */ +#define XAT0_HIDDEN 0x00000010 /* Hidden */ +#define XAT0_NOUNLINK 0x00000020 /* Nounlink */ +#define XAT0_IMMUTABLE 0x00000040 /* immutable */ +#define XAT0_APPENDONLY 0x00000080 /* appendonly */ +#define XAT0_NODUMP 0x00000100 /* nodump */ +#define XAT0_OPAQUE 0x00000200 /* opaque */ +#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ +#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ +#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ +#define XAT0_REPARSE 0x00002000 /* FS reparse point */ +#define XAT0_GEN 0x00004000 /* object generation number */ +#define XAT0_OFFLINE 0x00008000 /* offline */ +#define XAT0_SPARSE 0x00010000 /* sparse */ + +/* Support for XAT_* optional attributes */ +#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ +#define XVA_SHFT 32 /* Used to shift index */ + +/* + * Used to pry out the index and attribute bits from the XAT_* attributes + * defined below. Note that we're masking things down to 32 bits then + * casting to uint32_t. + */ +#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) +#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) + +/* + * The following defines present a "flat namespace" so that consumers don't + * need to keep track of which element belongs to which bitmap entry. 
+ * + * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER + */ +#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) +#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) +#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) +#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) +#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) +#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) +#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) +#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) +#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) +#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) +#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) +#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) +#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) +#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) +#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) +#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) +#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) + +/* + * The returned attribute map array (xva_rtnattrmap[]) is located past the + * requested attribute map array (xva_reqattrmap[]). Its location changes + * when the array sizes change. We use a separate pointer in a known location + * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is + * set in xva_init() + */ +#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) + +#define MODEMASK 07777 /* mode bits plus permission bits */ +#define PERMMASK 00777 /* permission bits */ + +/* + * VOP_ACCESS flags + */ +#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ + +/* + * Flags for vnode operations. 
+ */ +enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ +enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ + +/* + * Structure used by various vnode operations to determine + * the context (pid, host, identity) of a caller. + * + * The cc_caller_id is used to identify one or more callers who invoke + * operations, possibly on behalf of others. For example, the NFS + * server could have its own cc_caller_id which can be detected by + * vnode/vfs operations or (FEM) monitors on those operations. New + * caller IDs are generated by fs_new_caller_id(). + */ +typedef struct caller_context { + pid_t cc_pid; /* Process ID of the caller */ + int cc_sysid; /* System ID, used for remote calls */ + u_longlong_t cc_caller_id; /* Identifier for (set of) caller(s) */ + ulong_t cc_flags; +} caller_context_t; + +struct taskq; + +/* + * Flags for VOP_LOOKUP + * + * Defined in file.h, but also possible, FIGNORECASE and FSEARCH + * + */ +#define LOOKUP_DIR 0x01 /* want parent dir vp */ +#define LOOKUP_XATTR 0x02 /* look up extended attr dir */ +#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ +#define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ + +/* + * Flags for VOP_READDIR + */ +#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */ +#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */ + +/* + * Public vnode manipulation functions. + */ + +void vn_rele_async(struct vnode *vp, struct taskq *taskq); + +#define VN_RELE_ASYNC(vp, taskq) { \ + vn_rele_async(vp, taskq); \ +} + +/* + * Flags to VOP_SETATTR/VOP_GETATTR. 
+ */ +#define ATTR_UTIME 0x01 /* non-default utime(2) request */ +#define ATTR_EXEC 0x02 /* invocation from exec(2) */ +#define ATTR_COMM 0x04 /* yield common vp attributes */ +#define ATTR_HINT 0x08 /* information returned will be `hint' */ +#define ATTR_REAL 0x10 /* yield attributes of the real vp */ +#define ATTR_NOACLCHECK 0x20 /* Don't check ACL when checking permissions */ +#define ATTR_TRIGGER 0x40 /* Mount first if vnode is a trigger mount */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VNODE_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/wmsum.h b/include/os/freebsd/spl/sys/wmsum.h new file mode 100644 index 0000000000..9fdd1901b7 --- /dev/null +++ b/include/os/freebsd/spl/sys/wmsum.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * wmsum counters are a reduced version of aggsum counters, optimized for + * write-mostly scenarios. They do not provide optimized read functions, + * but instead allow much cheaper add function. The primary usage is + * infrequently read statistic counters, not requiring exact precision. + * + * The FreeBSD implementation is directly mapped into counter(9) KPI. 
+ */ + +#ifndef _SYS_WMSUM_H +#define _SYS_WMSUM_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define wmsum_t counter_u64_t + +static inline void +wmsum_init(wmsum_t *ws, uint64_t value) +{ + + *ws = counter_u64_alloc(M_WAITOK); + counter_u64_add(*ws, value); +} + +static inline void +wmsum_fini(wmsum_t *ws) +{ + + counter_u64_free(*ws); +} + +static inline uint64_t +wmsum_value(wmsum_t *ws) +{ + + return (counter_u64_fetch(*ws)); +} + +static inline void +wmsum_add(wmsum_t *ws, int64_t delta) +{ + + counter_u64_add(*ws, delta); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_WMSUM_H */ diff --git a/include/os/freebsd/spl/sys/zmod.h b/include/os/freebsd/spl/sys/zmod.h new file mode 100644 index 0000000000..c606b1db5f --- /dev/null +++ b/include/os/freebsd/spl/sys/zmod.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _ZMOD_H +#define _ZMOD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * zmod - RFC-1950-compatible decompression routines + * + * This file provides the public interfaces to zmod, an in-kernel RFC 1950 + * decompression library. More information about the implementation of these + * interfaces can be found in the usr/src/uts/common/zmod/ directory. + */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) + +extern int z_uncompress(void *, size_t *, const void *, size_t); +extern int z_compress(void *, size_t *, const void *, size_t); +extern int z_compress_level(void *, size_t *, const void *, size_t, int); +extern const char *z_strerror(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZMOD_H */ diff --git a/include/os/freebsd/spl/sys/zone.h b/include/os/freebsd/spl/sys/zone.h new file mode 100644 index 0000000000..dd088de836 --- /dev/null +++ b/include/os/freebsd/spl/sys/zone.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_ZONE_H_
+#define _OPENSOLARIS_SYS_ZONE_H_
+
+#include
+
+/*
+ * Macros to help with zone visibility restrictions.
+ */
+
+#define GLOBAL_ZONEID 0
+
+/*
+ * Is proc in the global zone?
+ */
+#define INGLOBALZONE(proc) (!jailed((proc)->p_ucred))
+
+/*
+ * Attach the given dataset to the given jail.
+ */
+extern int zone_dataset_attach(struct ucred *, const char *, int);
+
+/*
+ * Detach the given dataset from the given jail.
+ */
+extern int zone_dataset_detach(struct ucred *, const char *, int);
+
+/*
+ * Returns true if the named pool/dataset is visible in the current zone.
+ */
+extern int zone_dataset_visible(const char *, int *);
+
+/*
+ * Safely get the hostid of the specified zone (defaults to machine's hostid
+ * if the specified zone doesn't emulate a hostid). Passing NULL retrieves
+ * the global zone's (i.e., physical system's) hostid.
+ */ +extern uint32_t zone_get_hostid(void *); + +#endif /* !_OPENSOLARIS_SYS_ZONE_H_ */ diff --git a/include/os/freebsd/zfs/Makefile.am b/include/os/freebsd/zfs/Makefile.am new file mode 100644 index 0000000000..081839c48c --- /dev/null +++ b/include/os/freebsd/zfs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/include/os/freebsd/zfs/sys/Makefile.am b/include/os/freebsd/zfs/sys/Makefile.am new file mode 100644 index 0000000000..392bb4ae34 --- /dev/null +++ b/include/os/freebsd/zfs/sys/Makefile.am @@ -0,0 +1,15 @@ +KERNEL_H = \ + freebsd_crypto.h \ + sha2.h \ + vdev_os.h \ + zfs_bootenv_os.h \ + zfs_context_os.h \ + zfs_ctldir.h \ + zfs_dir.h \ + zfs_ioctl_compat.h \ + zfs_vfsops_os.h \ + zfs_vnops_os.h \ + zfs_znode_impl.h \ + zpl.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/zfs/sys/freebsd_crypto.h b/include/os/freebsd/zfs/sys/freebsd_crypto.h new file mode 100644 index 0000000000..e240f5b0dd --- /dev/null +++ b/include/os/freebsd/zfs/sys/freebsd_crypto.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Sean Eric Fagan + * Portions Copyright (c) 2005-2011 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Portions of this file were taken from GELI's implementation of hmac. + * + * $FreeBSD$ + */ + +#ifndef _ZFS_FREEBSD_CRYPTO_H +#define _ZFS_FREEBSD_CRYPTO_H + +#include +#include +#include +#include +#include + +#define SUN_CKM_AES_CCM "CKM_AES_CCM" +#define SUN_CKM_AES_GCM "CKM_AES_GCM" +#define SUN_CKM_SHA512_HMAC "CKM_SHA512_HMAC" + +#define CRYPTO_KEY_RAW 1 + +#define CRYPTO_BITS2BYTES(n) ((n) == 0 ? 0 : (((n) - 1) >> 3) + 1) +#define CRYPTO_BYTES2BITS(n) ((n) << 3) + +struct zio_crypt_info; + +typedef struct freebsd_crypt_session { + struct mtx fs_lock; + crypto_session_t fs_sid; + boolean_t fs_done; +} freebsd_crypt_session_t; + +/* + * Unused types to minimize code differences. + */ +typedef void *crypto_mechanism_t; +typedef void *crypto_ctx_template_t; +/* + * Unlike the ICP crypto_key type, this only + * supports (the equivalent of + * CRYPTO_KEY_RAW). + */ +typedef struct crypto_key { + int ck_format; /* Unused, but minimizes code diff */ + void *ck_data; + size_t ck_length; +} crypto_key_t; + +typedef struct hmac_ctx { + SHA512_CTX innerctx; + SHA512_CTX outerctx; +} *crypto_context_t; + +/* + * The only algorithm ZFS uses for hashing is SHA512_HMAC. 
+ */ +void crypto_mac(const crypto_key_t *key, const void *in_data, + size_t in_data_size, void *out_data, size_t out_data_size); +void crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *key); +void crypto_mac_update(struct hmac_ctx *ctx, const void *data, + size_t data_size); +void crypto_mac_final(struct hmac_ctx *ctx, void *out_data, + size_t out_data_size); + +int freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *, crypto_key_t *); +void freebsd_crypt_freesession(freebsd_crypt_session_t *sessp); + +int freebsd_crypt_uio(boolean_t, freebsd_crypt_session_t *, + struct zio_crypt_info *, zfs_uio_t *, crypto_key_t *, uint8_t *, + size_t, size_t); + +#endif /* _ZFS_FREEBSD_CRYPTO_H */ diff --git a/include/os/freebsd/zfs/sys/sha2.h b/include/os/freebsd/zfs/sys/sha2.h new file mode 100644 index 0000000000..e3923e4ca3 --- /dev/null +++ b/include/os/freebsd/zfs/sys/sha2.h @@ -0,0 +1,200 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2013 Saso Kiselkov. All rights reserved. 
*/ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#include /* for uint_* */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA2_HMAC_MIN_KEY_LEN 1 /* SHA2-HMAC min key length in bytes */ +#define SHA2_HMAC_MAX_KEY_LEN INT_MAX /* SHA2-HMAC max key length in bytes */ + +#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ +#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ +#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ + +/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ +#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ +#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ + +#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ +#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* + * SHA2 context. + * The contents of this structure are a private interface between the + * Init/Update/Final calls of the functions defined below. + * Callers must never attempt to read or write any of the fields + * in this structure directly. 
+ */
+
+#include
+#include
+#include
+#include
+typedef struct {
+	uint32_t algotype; /* Algorithm Type */
+	union {
+		SHA256_CTX SHA256_ctx;
+		SHA384_CTX SHA384_ctx;
+		SHA512_CTX SHA512_ctx;
+	};
+} SHA2_CTX;
+
+extern void SHA256Init(SHA256_CTX *);
+
+extern void SHA256Update(SHA256_CTX *, const void *, size_t);
+
+extern void SHA256Final(void *, SHA256_CTX *);
+
+extern void SHA384Init(SHA384_CTX *);
+
+extern void SHA384Update(SHA384_CTX *, const void *, size_t);
+
+extern void SHA384Final(void *, SHA384_CTX *);
+
+extern void SHA512Init(SHA512_CTX *);
+
+extern void SHA512Update(SHA512_CTX *, const void *, size_t);
+
+extern void SHA512Final(void *, SHA512_CTX *);
+
+/* SHA2Init/Update/Final dispatch on mech/algotype to the SHA*_ routines. */
+static inline void
+SHA2Init(uint64_t mech, SHA2_CTX *c)
+{
+	switch (mech) {
+	case SHA256:
+		SHA256_Init(&c->SHA256_ctx);
+		break;
+	case SHA384:
+		SHA384_Init(&c->SHA384_ctx);
+		break;
+	case SHA512:
+		SHA512_Init(&c->SHA512_ctx);
+		break;
+	case SHA512_256:	/* shares the SHA512_ctx union member */
+		SHA512_256_Init(&c->SHA512_ctx);
+		break;
+	default:	/* HMAC/GEN ids, SHA512_224 and unknown values land here */
+		panic("unknown mechanism %ju", (uintmax_t)mech);
+	}
+	c->algotype = (uint32_t)mech;	/* consulted by SHA2Update/SHA2Final */
+}
+
+static inline void
+SHA2Update(SHA2_CTX *c, const void *p, size_t s)
+{
+	switch (c->algotype) {	/* value recorded by SHA2Init() */
+	case SHA256:
+		SHA256_Update(&c->SHA256_ctx, p, s);
+		break;
+	case SHA384:
+		SHA384_Update(&c->SHA384_ctx, p, s);
+		break;
+	case SHA512:
+		SHA512_Update(&c->SHA512_ctx, p, s);
+		break;
+	case SHA512_256:
+		SHA512_256_Update(&c->SHA512_ctx, p, s);
+		break;
+	default:	/* algotype is uint32_t, so print it unsigned */
+		panic("unknown mechanism %u", c->algotype);
+	}
+}
+
+static inline void
+SHA2Final(void *p, SHA2_CTX *c)
+{
+	switch (c->algotype) {	/* value recorded by SHA2Init() */
+	case SHA256:
+		SHA256_Final(p, &c->SHA256_ctx);
+		break;
+	case SHA384:
+		SHA384_Final(p, &c->SHA384_ctx);
+		break;
+	case SHA512:
+		SHA512_Final(p, &c->SHA512_ctx);
+		break;
+	case SHA512_256:
+		SHA512_256_Final(p, &c->SHA512_ctx);
+		break;
+	default:	/* algotype is uint32_t, so print it unsigned */
+		panic("unknown mechanism %u", c->algotype);
+	}
+}
+
+#ifdef _SHA2_IMPL
+/*
+ * The following types/functions are all private to the
implementation + * of the SHA2 functions and must not be used by consumers of the interface + */ + +/* + * List of support mechanisms in this module. + * + * It is important to note that in the module, division or modulus calculations + * are used on the enumerated type to determine which mechanism is being used; + * therefore, changing the order or additional mechanisms should be done + * carefully + */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#endif /* _SHA2_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/include/linux/mod_compat.h b/include/os/freebsd/zfs/sys/vdev_os.h similarity index 69% rename from include/linux/mod_compat.h rename to include/os/freebsd/zfs/sys/vdev_os.h index 32aea4471b..59da954b90 100644 --- a/include/linux/mod_compat.h +++ b/include/os/freebsd/zfs/sys/vdev_os.h @@ -20,20 +20,14 @@ */ /* - * Copyright (C) 2016 Gvozden Neskovic . + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#ifndef _MOD_COMPAT_H -#define _MOD_COMPAT_H +#ifndef _SYS_VDEV_OS_H +#define _SYS_VDEV_OS_H -#include -#include +extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size); +extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, + uint64_t *count); -/* Grsecurity kernel API change */ -#ifdef MODULE_PARAM_CALL_CONST -typedef const struct kernel_param zfs_kernel_param_t; -#else -typedef struct kernel_param zfs_kernel_param_t; #endif - -#endif /* _MOD_COMPAT_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_bootenv_os.h b/include/os/freebsd/zfs/sys/zfs_bootenv_os.h new file mode 100644 index 0000000000..80c71a6c50 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_bootenv_os.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Toomas Soome + */ + +#ifndef _ZFS_BOOTENV_OS_H +#define _ZFS_BOOTENV_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define BOOTENV_OS BE_FREEBSD_VENDOR + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_BOOTENV_OS_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h new file mode 100644 index 0000000000..a32eb52c53 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#ifndef ZFS_CONTEXT_OS_H_
+#define ZFS_CONTEXT_OS_H_
+
+#include
+#include
+#include
+#include_next
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if KSTACK_PAGES * PAGE_SIZE >= 16384
+#define HAVE_LARGE_STACKS 1
+#endif
+
+#define cond_resched() kern_yield(PRI_USER)
+
+#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
+	(taskq_create(a, b, maxclsyspri, d, e, f))
+
+#define tsd_create(keyp, destructor) do { \
+	*(keyp) = osd_thread_register((destructor)); \
+	KASSERT(*(keyp) > 0, ("cannot register OSD")); \
+} while (0)
+
+#define tsd_destroy(keyp) osd_thread_deregister(*(keyp))
+#define tsd_get(key) osd_thread_get(curthread, (key))
+#define tsd_set(key, value) osd_thread_set(curthread, (key), (value))
+#define fm_panic panic
+/* ZFS_LOG: lvl & 0xff is the verbosity; lvl & 0x100 adds a kdb backtrace. */
+extern int zfs_debug_level;
+extern struct mtx zfs_debug_mtx;
+#define ZFS_LOG(lvl, ...) do { \
+	if (((lvl) & 0xff) <= zfs_debug_level) { \
+		mtx_lock(&zfs_debug_mtx); \
+		printf("%s:%u[%d]: ", \
+		    __func__, __LINE__, (lvl)); \
+		printf(__VA_ARGS__); \
+		printf("\n"); \
+		if ((lvl) & 0x100) \
+			kdb_backtrace(); \
+		mtx_unlock(&zfs_debug_mtx); \
+	} \
+} while (0)
+
+#define MSEC_TO_TICK(msec) (howmany((hrtime_t)(msec) * hz, MILLISEC))
+extern int hz;
+extern int tick;
+typedef int fstrans_cookie_t;
+#define spl_fstrans_mark() (0)	/* always yields a zero cookie */
+#define spl_fstrans_unmark(x) ((x) = 0)	/* just zeroes the cookie lvalue */
+#define signal_pending(x) SIGPENDING(x)
+#define current curthread
+#define thread_join(x)	/* expands to nothing */
+typedef struct opensolaris_utsname utsname_t;
+extern utsname_t *utsname(void);
+extern int spa_import_rootpool(const char *name, bool checkpointrewind);
+#endif /* ZFS_CONTEXT_OS_H_ */
diff --git a/include/os/freebsd/zfs/sys/zfs_ctldir.h b/include/os/freebsd/zfs/sys/zfs_ctldir.h
new file mode 100644
index 0000000000..da02863a78
--- /dev/null
+++ b/include/os/freebsd/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the
"License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +int zfsctl_root(zfsvfs_t *, int, vnode_t **); +void zfsctl_init(void); +void zfsctl_fini(void); +boolean_t zfsctl_is_node(vnode_t *); +int zfsctl_snapshot_unmount(const char *snapname, int flags); +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_dir.h b/include/os/freebsd/zfs/sys/zfs_dir.h new file mode 100644 index 0000000000..4197e1188c --- /dev/null +++ 
b/include/os/freebsd/zfs/sys/zfs_dir.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ + +extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); +extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, + 
boolean_t *); +extern int zfs_dirlook(znode_t *, const char *name, znode_t **); +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, zfs_acl_ids_t *); +extern void zfs_rmnode(znode_t *); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, znode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, znode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h b/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h new file mode 100644 index 0000000000..d36a6d2ce7 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h @@ -0,0 +1,159 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Xin Li . All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ZFS_IOCTL_COMPAT_H +#define _SYS_ZFS_IOCTL_COMPAT_H + +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Backwards ioctl compatibility + */ + +/* ioctl versions for vfs.zfs.version.ioctl */ +#define ZFS_IOCVER_UNDEF -1 +#define ZFS_IOCVER_NONE 0 +#define ZFS_IOCVER_DEADMAN 1 +#define ZFS_IOCVER_LZC 2 +#define ZFS_IOCVER_ZCMD 3 +#define ZFS_IOCVER_EDBP 4 +#define ZFS_IOCVER_RESUME 5 +#define ZFS_IOCVER_INLANES 6 +#define ZFS_IOCVER_PAD 7 +#define ZFS_IOCVER_LEGACY ZFS_IOCVER_PAD +#define ZFS_IOCVER_OZFS 15 + +/* compatibility conversion flag */ +#define ZFS_CMD_COMPAT_NONE 0 +#define ZFS_CMD_COMPAT_V15 1 +#define ZFS_CMD_COMPAT_V28 2 +#define ZFS_CMD_COMPAT_DEADMAN 3 +#define ZFS_CMD_COMPAT_LZC 4 +#define ZFS_CMD_COMPAT_ZCMD 5 +#define ZFS_CMD_COMPAT_EDBP 6 +#define ZFS_CMD_COMPAT_RESUME 7 +#define ZFS_CMD_COMPAT_INLANES 8 +#define ZFS_CMD_COMPAT_LEGACY 9 + +#define ZFS_IOC_COMPAT_PASS 254 +#define ZFS_IOC_COMPAT_FAIL 255 + +#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) + +typedef struct zfs_iocparm { + uint32_t zfs_ioctl_version; + uint64_t zfs_cmd; + uint64_t zfs_cmd_size; +} zfs_iocparm_t; + + +#define LEGACY_MAXPATHLEN 1024 +#define LEGACY_MAXNAMELEN 256 + +/* + * Note: this struct must have the same layout in 32-bit and 64-bit, so + * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit + * kernel. Therefore, we add padding to it so that no "hidden" padding + * is automatically added on 64-bit (but not on 32-bit). + */ +typedef struct zfs_cmd_legacy { + char zc_name[LEGACY_MAXPATHLEN]; /* pool|dataset name */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? 
*/ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[LEGACY_MAXPATHLEN * 2]; + char zc_string[LEGACY_MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad3[3]; + boolean_t zc_resumable; + uint32_t zc_pad4; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_legacy_t; + + +#ifdef _KERNEL +int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); +void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); +nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +#endif /* _KERNEL */ +int zfs_ioctl_legacy_to_ozfs(int request); +int zfs_ioctl_ozfs_to_legacy(int request); +void zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst); +void zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst); + +void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h new file mode 100644 index 0000000000..ccbbf4f732 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -0,0 +1,317 @@ +/* + * CDDL HEADER START + * + * The contents 
of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek .
+ * All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#if __FreeBSD_version >= 1300125
+#define TEARDOWN_RMS	/* z_teardown_lock becomes a struct rmslock */
+#endif
+
+#if __FreeBSD_version >= 1300109
+#define TEARDOWN_INACTIVE_RMS	/* z_teardown_inactive_lock becomes one too */
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef TEARDOWN_INACTIVE_RMS
+#include
+#endif
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef TEARDOWN_RMS
+typedef struct rmslock zfs_teardown_lock_t;
+#else
+#define zfs_teardown_lock_t rrmlock_t
+#endif
+
+#ifdef TEARDOWN_INACTIVE_RMS
+typedef struct rmslock zfs_teardown_inactive_lock_t;
+#else
+#define zfs_teardown_inactive_lock_t krwlock_t
+#endif
+
+typedef struct zfsvfs zfsvfs_t;
+struct znode;
+
+struct zfsvfs {
+	vfs_t *z_vfs; /* generic fs struct */
+	zfsvfs_t *z_parent; /* parent fs */
+	objset_t *z_os; /* objset reference */
+	uint64_t z_flags; /* super_block flags */
+	uint64_t z_root; /* id of root znode */
+	uint64_t z_unlinkedobj; /* id of unlinked zapobj */
+	uint64_t z_max_blksz; /* maximum block size for files */
+	uint64_t z_fuid_obj; /* fuid table object number */
+	uint64_t z_fuid_size; /* fuid table size */
+	avl_tree_t z_fuid_idx; /* fuid tree keyed by index */
+	avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */
+	krwlock_t z_fuid_lock; /* fuid lock */
+	boolean_t z_fuid_loaded; /* fuid tables are loaded */
+	boolean_t z_fuid_dirty; /* need to sync fuid table ? */
+	struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
+	zilog_t *z_log; /* intent log pointer */
+	uint_t z_acl_type; /* type of acl usable on this fs */
+	uint_t z_acl_mode; /* acl chmod/mode behavior */
+	uint_t z_acl_inherit; /* acl inheritance behavior */
+	zfs_case_t z_case; /* case-sense */
+	boolean_t z_utf8; /* utf8-only */
+	int z_norm; /* normalization flags */
+	boolean_t z_atime; /* enable atimes mount option */
+	boolean_t z_unmounted; /* unmounted */
+	zfs_teardown_lock_t z_teardown_lock;
+	zfs_teardown_inactive_lock_t z_teardown_inactive_lock;
+	list_t z_all_znodes; /* all vnodes in the fs */
+	uint64_t z_nr_znodes; /* number of znodes in the fs */
+	kmutex_t z_znodes_lock; /* lock for z_all_znodes */
+	struct zfsctl_root *z_ctldir; /* .zfs directory pointer */
+	boolean_t z_show_ctldir; /* expose .zfs in the root dir */
+	boolean_t z_issnap; /* true if this is a snapshot */
+	boolean_t z_use_fuids; /* version allows fuids */
+	boolean_t z_replay; /* set during ZIL replay */
+	boolean_t z_use_sa; /* version allows system attributes */
+	boolean_t z_xattr_sa; /* allow xattrs to be stored as SA */
+	boolean_t z_use_namecache; /* make use of FreeBSD name cache */
+	uint8_t z_xattr; /* xattr type in use */
+	uint64_t z_version; /* ZPL version */
+	uint64_t z_shares_dir; /* hidden shares dir */
+	dataset_kstats_t z_kstat; /* fs kstats */
+	kmutex_t z_lock;
+	uint64_t z_userquota_obj;
+	uint64_t z_groupquota_obj;
+	uint64_t z_userobjquota_obj;
+	uint64_t z_groupobjquota_obj;
+	uint64_t z_projectquota_obj;
+	uint64_t z_projectobjquota_obj;
+	uint64_t z_replay_eof; /* New end of file - replay only */
+	sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
+#define ZFS_OBJ_MTX_SZ 64
+	kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
+	struct task z_unlinked_drain_task;
+};
+/* z_teardown_lock wrappers: rms_* if TEARDOWN_RMS is set, otherwise rrm_*. */
+#ifdef TEARDOWN_RMS
+#define ZFS_TEARDOWN_INIT(zfsvfs) \
+	rms_init(&(zfsvfs)->z_teardown_lock, "zfs teardown")
+
+#define ZFS_TEARDOWN_DESTROY(zfsvfs) \
+	rms_destroy(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \
+	rms_try_rlock(&(zfsvfs)->z_teardown_lock)
+/* The rms_* calls take no tag, so the tag argument is dropped below. */
+#define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \
+	rms_rlock(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \
+	rms_runlock(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \
+	rms_wlock(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \
+	rms_wunlock(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \
+	rms_unlock(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_READ_HELD(zfsvfs) \
+	rms_rowned(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \
+	rms_wowned(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_HELD(zfsvfs) \
+	rms_owned_any(&(zfsvfs)->z_teardown_lock)
+#else
+#define ZFS_TEARDOWN_INIT(zfsvfs) \
+	rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE)
+
+#define ZFS_TEARDOWN_DESTROY(zfsvfs) \
+	rrm_destroy(&(zfsvfs)->z_teardown_lock)
+/* NOTE(review): rw_tryenter() on an rrmlock_t looks mismatched - confirm. */
+#define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \
+	rw_tryenter(&(zfsvfs)->z_teardown_lock, RW_READER)
+
+#define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \
+	rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag)
+
+#define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \
+	rrm_exit(&(zfsvfs)->z_teardown_lock, tag)
+
+#define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \
+	rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag)
+/* NOTE(review): 'tag' below is not a macro parameter; it binds at use site. */
+#define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \
+	rrm_exit(&(zfsvfs)->z_teardown_lock, tag)
+
+#define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \
+	rrm_exit(&(zfsvfs)->z_teardown_lock, tag)
+
+#define ZFS_TEARDOWN_READ_HELD(zfsvfs) \
+	RRM_READ_HELD(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \
+	RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock)
+
+#define ZFS_TEARDOWN_HELD(zfsvfs) \
+	RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock)
+#endif
+/* z_teardown_inactive_lock wrappers: rms_* or rw_*, matching its typedef. */
+#ifdef TEARDOWN_INACTIVE_RMS
+#define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \
+	rms_init(&(zfsvfs)->z_teardown_inactive_lock, "zfs teardown inactive")
+
+#define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \
+	rms_destroy(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \
+	rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \
+	rms_rlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \
+	rms_runlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \
+	rms_wlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \
+	rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \
+	rms_wowned(&(zfsvfs)->z_teardown_inactive_lock)
+#else
+#define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \
+	rw_init(&(zfsvfs)->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL)
+
+#define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \
+	rw_destroy(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \
+	rw_tryenter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER)
+
+#define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \
+	rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER)
+
+#define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \
+	rw_exit(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \
+	rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_WRITER)
+
+#define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \
+	rw_exit(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \
+	RW_WRITE_HELD(&(zfsvfs)->z_teardown_inactive_lock)
+#endif
+
+#define ZSB_XATTR 0x0001 /* Enable user xattrs */ +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. + * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes[**] currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + * + * [*] 20 bytes on FreeBSD to fit into the size of struct fid. + * [**] 2 bytes on FreeBSD for the above reason. 
+ */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; +extern int zfs_super_owner; + +extern void zfs_init(void); +extern void zfs_fini(void); + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp); +extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); +extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); +extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, + uint64_t *val, char *setpoint); +extern int zfs_busy(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_vnops_os.h b/include/os/freebsd/zfs/sys/zfs_vnops_os.h new file mode 100644 index 0000000000..bf5e03b24c --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_vnops_os.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_FS_ZFS_VNOPS_OS_H +#define _SYS_FS_ZFS_VNOPS_OS_H + +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); +int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size); +extern int zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags); +extern int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); +extern int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, + cred_t *cr, int flags); +extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); +extern int zfs_rename(znode_t *sdzp, const char *snm, znode_t *tdzp, + const char *tnm, cred_t *cr, int flags); +extern int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, + const char *link, znode_t **zpp, cred_t *cr, int flags); +extern int zfs_link(znode_t *tdzp, znode_t 
*sp, + const char *name, cred_t *cr, int flags); +extern int zfs_space(znode_t *zp, int cmd, struct flock *bfp, int flag, + offset_t offset, cred_t *cr); +extern int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, + cred_t *cr); +extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid); + +#endif diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h new file mode 100644 index 0000000000..4456046e6e --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -0,0 +1,187 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#ifndef _FREEBSD_ZFS_SYS_ZNODE_IMPL_H +#define _FREEBSD_ZFS_SYS_ZNODE_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#define ZNODE_OS_FIELDS \ + struct zfsvfs *z_zfsvfs; \ + vnode_t *z_vnode; \ + char *z_cached_symlink; \ + uint64_t z_uid; \ + uint64_t z_gid; \ + uint64_t z_gen; \ + uint64_t z_atime[2]; \ + uint64_t z_links; + +#define ZFS_LINK_MAX UINT64_MAX + +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +/* + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. 
+ */ + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define ZTOI(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((struct znode *)(VP)->v_data) +#define VTOZ_SMR(VP) ((znode_t *)vn_load_v_data_smr(VP)) +#define ITOZ(VP) ((struct znode *)(VP)->v_data) +#define zhold(zp) vhold(ZTOV((zp))) +#define zrele(zp) vrele(ZTOV((zp))) + +#define ZTOZSB(zp) ((zp)->z_zfsvfs) +#define ITOZSB(vp) (VTOZ(vp)->z_zfsvfs) +#define ZTOTYPE(zp) (ZTOV(zp)->v_type) +#define ZTOGID(zp) ((zp)->z_gid) +#define ZTOUID(zp) ((zp)->z_uid) +#define ZTONLNK(zp) ((zp)->z_links) +#define Z_ISBLK(type) ((type) == VBLK) +#define Z_ISCHR(type) ((type) == VCHR) +#define Z_ISLNK(type) ((type) == VLNK) +#define Z_ISDIR(type) ((type) == VDIR) + +#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp)) +#define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync) +#define zn_rlimit_fsize(zp, uio) \ + vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio)) + +/* Called on entry to each ZFS vnode and vfs operation */ +#define ZFS_ENTER(zfsvfs) \ + { \ + ZFS_TEARDOWN_ENTER_READ((zfsvfs), FTAG); \ + if (__predict_false((zfsvfs)->z_unmounted)) { \ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ + return (EIO); \ + } \ + } + +/* Must be called before exiting the vop */ +#define ZFS_EXIT(zfsvfs) ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG) + +/* Verifies the znode is valid */ +#define ZFS_VERIFY_ZP(zp) \ + if (__predict_false((zp)->z_sa_hdl == NULL)) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + 
mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) + +/* Encode ZFS stored time values from a struct timespec */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} + +/* Decode ZFS stored time values to a struct timespec */ +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE); + +extern void zfs_tstamp_update_setup_ext(struct znode *, + uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx); +extern void zfs_znode_free(struct znode *); + +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, + char *buf); +#ifdef __cplusplus +} +#endif + +#endif /* _FREEBSD_ZFS_SYS_ZNODE_IMPL_H */ diff --git a/include/os/freebsd/zfs/sys/zpl.h b/include/os/freebsd/zfs/sys/zpl.h new file mode 100644 index 0000000000..fb2b4e02d4 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zpl.h @@ -0,0 +1 @@ +/* Don't remove */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am new file mode 100644 index 0000000000..605a1fcb75 --- /dev/null +++ b/include/os/linux/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = kernel spl zfs diff --git a/include/os/linux/kernel/Makefile.am b/include/os/linux/kernel/Makefile.am new file mode 100644 index 0000000000..08b2f5fc5c --- /dev/null +++ b/include/os/linux/kernel/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = linux diff --git a/include/os/linux/kernel/linux/Makefile.am b/include/os/linux/kernel/linux/Makefile.am new file mode 100644 index 0000000000..6ff0df506d --- /dev/null +++ b/include/os/linux/kernel/linux/Makefile.am @@ -0,0 +1,22 @@ +KERNEL_H = \ + dcache_compat.h \ + xattr_compat.h \ + vfs_compat.h \ + blkdev_compat.h \ + 
utsname_compat.h \ + kmap_compat.h \ + percpu_compat.h \ + simd.h \ + simd_x86.h \ + simd_aarch64.h \ + simd_powerpc.h \ + mod_compat.h \ + page_compat.h \ + compiler_compat.h + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include/linux +kernel_HEADERS = $(KERNEL_H) +endif +endif diff --git a/include/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h similarity index 59% rename from include/linux/blkdev_compat.h rename to include/os/linux/kernel/linux/blkdev_compat.h index 084ea61ccc..9fa8884bb7 100644 --- a/include/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -30,15 +30,11 @@ #define _ZFS_BLKDEV_H #include -#include #include #include +#include #include /* for SECTOR_* */ -#ifndef HAVE_FMODE_T -typedef unsigned __bitwise__ fmode_t; -#endif /* HAVE_FMODE_T */ - #ifndef HAVE_BLK_QUEUE_FLAG_SET static inline void blk_queue_flag_set(unsigned int flag, struct request_queue *q) @@ -56,7 +52,7 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q) #endif /* - * 4.7 - 4.x API, + * 4.7 API, * The blk_queue_write_cache() interface has replaced blk_queue_flush() * interface. However, the new interface is GPL-only thus we implement * our own trivial wrapper when the GPL-only version is detected. @@ -66,9 +62,6 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q) * interface. However, while the old interface was available to all the * new one is GPL-only. Thus if the GPL-only version is detected we * implement our own trivial helper. - * - * 2.6.x - 2.6.35 - * Legacy blk_queue_ordered() interface. */ static inline void blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) @@ -92,78 +85,22 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) #elif defined(HAVE_BLK_QUEUE_FLUSH) blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? 
REQ_FUA : 0)); #else - blk_queue_ordered(q, QUEUE_ORDERED_DRAIN, NULL); +#error "Unsupported kernel" #endif } -/* - * Most of the blk_* macros were removed in 2.6.36. Ostensibly this was - * done to improve readability and allow easier grepping. However, from - * a portability stand point the macros are helpful. Therefore the needed - * macros are redefined here if they are missing from the kernel. - */ -#ifndef blk_fs_request -#define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) -#endif - -/* - * 2.6.34 API change, - * The blk_queue_max_hw_sectors() function replaces blk_queue_max_sectors(). - */ -#ifndef HAVE_BLK_QUEUE_MAX_HW_SECTORS -#define blk_queue_max_hw_sectors __blk_queue_max_hw_sectors -static inline void -__blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) -{ - blk_queue_max_sectors(q, max_hw_sectors); -} -#endif - -/* - * 2.6.34 API change, - * The blk_queue_max_segments() function consolidates - * blk_queue_max_hw_segments() and blk_queue_max_phys_segments(). 
- */ -#ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS -#define blk_queue_max_segments __blk_queue_max_segments -static inline void -__blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) -{ - blk_queue_max_phys_segments(q, max_segments); - blk_queue_max_hw_segments(q, max_segments); -} -#endif - static inline void blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { +#if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \ + !defined(HAVE_DISK_UPDATE_READAHEAD) #ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC q->backing_dev_info->ra_pages = ra_pages; #else q->backing_dev_info.ra_pages = ra_pages; #endif -} - -#ifndef HAVE_GET_DISK_AND_MODULE -static inline struct kobject * -get_disk_and_module(struct gendisk *disk) -{ - return (get_disk(disk)); -} #endif - -#ifndef HAVE_GET_DISK_RO -static inline int -get_disk_ro(struct gendisk *disk) -{ - int policy = 0; - - if (disk->part[0]) - policy = disk->part[0]->policy; - - return (policy); } -#endif /* HAVE_GET_DISK_RO */ #ifdef HAVE_BIO_BVEC_ITER #define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector @@ -183,10 +120,6 @@ typedef struct bvec_iter bvec_iterator_t; typedef int bvec_iterator_t; #endif -/* - * Portable helper for correctly setting the FAILFAST flags. The - * correct usage has changed 3 times from 2.6.12 to 2.6.38. - */ static inline void bio_set_flags_failfast(struct block_device *bdev, int *flags) { @@ -210,27 +143,13 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) #endif /* BLOCK_EXT_MAJOR */ #endif /* CONFIG_BUG */ -#if defined(HAVE_BIO_RW_FAILFAST_DTD) - /* BIO_RW_FAILFAST_* preferred interface from 2.6.28 - 2.6.35 */ - *flags |= ( - (1 << BIO_RW_FAILFAST_DEV) | - (1 << BIO_RW_FAILFAST_TRANSPORT) | - (1 << BIO_RW_FAILFAST_DRIVER)); -#elif defined(HAVE_REQ_FAILFAST_MASK) - /* - * REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx, - * the BIO_* and REQ_* flags were unified under REQ_* flags. 
- */ *flags |= REQ_FAILFAST_MASK; -#else -#error "Undefined block IO FAILFAST interface." -#endif } /* * Maximum disk label length, it may be undefined for some kernels. */ -#ifndef DISK_NAME_LEN +#if !defined(DISK_NAME_LEN) #define DISK_NAME_LEN 32 #endif /* DISK_NAME_LEN */ @@ -338,56 +257,58 @@ bio_set_bi_error(struct bio *bio, int error) #endif /* HAVE_1ARG_BIO_END_IO_T */ /* - * 2.6.38 - 2.6.x API, - * blkdev_get_by_path() - * blkdev_put() - * - * 2.6.28 - 2.6.37 API, - * open_bdev_exclusive() - * close_bdev_exclusive() - * - * 2.6.12 - 2.6.27 API, - * open_bdev_excl() - * close_bdev_excl() - * - * Used to exclusively open a block device from within the kernel. - */ -#if defined(HAVE_BLKDEV_GET_BY_PATH) -#define vdev_bdev_open(path, md, hld) blkdev_get_by_path(path, \ - (md) | FMODE_EXCL, hld) -#define vdev_bdev_close(bdev, md) blkdev_put(bdev, (md) | FMODE_EXCL) -#elif defined(HAVE_OPEN_BDEV_EXCLUSIVE) -#define vdev_bdev_open(path, md, hld) open_bdev_exclusive(path, md, hld) -#define vdev_bdev_close(bdev, md) close_bdev_exclusive(bdev, md) -#else -#define vdev_bdev_open(path, md, hld) open_bdev_excl(path, md, hld) -#define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) -#endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */ - -/* - * 4.1 - x.y.z API, + * 4.1 API, * 3.10.0 CentOS 7.x API, * blkdev_reread_part() * * For older kernels trigger a re-reading of the partition table by calling * check_disk_change() which calls flush_disk() to invalidate the device. + * + * For newer kernels (as of 5.10), bdev_check_media_change is used, in favor of + * check_disk_change(), with the modification that invalidation is no longer + * forced. 
*/ +#ifdef HAVE_CHECK_DISK_CHANGE +#define zfs_check_media_change(bdev) check_disk_change(bdev) #ifdef HAVE_BLKDEV_REREAD_PART #define vdev_bdev_reread_part(bdev) blkdev_reread_part(bdev) #else #define vdev_bdev_reread_part(bdev) check_disk_change(bdev) #endif /* HAVE_BLKDEV_REREAD_PART */ - -/* - * 2.6.22 API change - * The function invalidate_bdev() lost it's second argument because - * it was unused. - */ -#ifdef HAVE_1ARG_INVALIDATE_BDEV -#define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev) #else -#define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev, 1) -#endif /* HAVE_1ARG_INVALIDATE_BDEV */ +#ifdef HAVE_BDEV_CHECK_MEDIA_CHANGE +static inline int +zfs_check_media_change(struct block_device *bdev) +{ +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK + struct gendisk *gd = bdev->bd_disk; + const struct block_device_operations *bdo = gd->fops; +#endif + + if (!bdev_check_media_change(bdev)) + return (0); + +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK + /* + * Force revalidation, to mimic the old behavior of + * check_disk_change() + */ + if (bdo->revalidate_disk) + bdo->revalidate_disk(gd); +#endif + + return (0); +} +#define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) +#else +/* + * This is encountered if check_disk_change() and bdev_check_media_change() + * are not available in the kernel - likely due to an API change that needs + * to be chased down. + */ +#error "Unsupported kernel: no usable disk change check" +#endif /* HAVE_BDEV_CHECK_MEDIA_CHANGE */ +#endif /* HAVE_CHECK_DISK_CHANGE */ /* * 2.6.27 API change @@ -396,45 +317,43 @@ bio_set_bi_error(struct bio *bio, int error) * * 4.4.0-6.21 API change for Ubuntu * lookup_bdev() gained a second argument, FMODE_*, to check inode permissions. 
- */ -#ifdef HAVE_1ARG_LOOKUP_BDEV -#define vdev_lookup_bdev(path) lookup_bdev(path) -#else -#ifdef HAVE_2ARGS_LOOKUP_BDEV -#define vdev_lookup_bdev(path) lookup_bdev(path, 0) -#else -#define vdev_lookup_bdev(path) ERR_PTR(-ENOTSUP) -#endif /* HAVE_2ARGS_LOOKUP_BDEV */ -#endif /* HAVE_1ARG_LOOKUP_BDEV */ - -/* - * 2.6.30 API change - * To ensure good performance preferentially use the physical block size - * for proper alignment. The physical size is supposed to be the internal - * sector size used by the device. This is often 4096 byte for AF devices, - * while a smaller 512 byte logical size is supported for compatibility. * - * Unfortunately, many drives still misreport their physical sector size. - * For devices which are known to lie you may need to manually set this - * at pool creation time with 'zpool create -o ashift=12 ...'. - * - * When the physical block size interface isn't available, we fall back to - * the logical block size interface and then the older hard sector size. + * 5.11 API change + * Changed to take a dev_t argument which is set on success and return a + * non-zero error code on failure. 
*/ -#ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE -#define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) -#else -#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE -#define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) -#else -#define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) -#endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ -#endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */ +static inline int +vdev_lookup_bdev(const char *path, dev_t *dev) +{ +#if defined(HAVE_DEVT_LOOKUP_BDEV) + return (lookup_bdev(path, dev)); +#elif defined(HAVE_1ARG_LOOKUP_BDEV) + struct block_device *bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return (PTR_ERR(bdev)); + + *dev = bdev->bd_dev; + bdput(bdev); + + return (0); +#elif defined(HAVE_MODE_LOOKUP_BDEV) + struct block_device *bdev = lookup_bdev(path, FMODE_READ); + if (IS_ERR(bdev)) + return (PTR_ERR(bdev)); + + *dev = bdev->bd_dev; + bdput(bdev); + + return (0); +#else +#error "Unsupported kernel" +#endif +} -#ifndef HAVE_BIO_SET_OP_ATTRS /* * Kernels without bio_set_op_attrs use bi_rw for the bio flags. */ +#if !defined(HAVE_BIO_SET_OP_ATTRS) static inline void bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) { @@ -446,21 +365,15 @@ bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) * bio_set_flush - Set the appropriate flags in a bio to guarantee * data are on non-volatile media on completion. * - * 2.6.X - 2.6.36 API, - * WRITE_BARRIER - Tells the block layer to commit all previously submitted - * writes to stable storage before this one is started and that the current - * write is on stable storage upon completion. Also prevents reordering - * on both sides of the current operation. - * * 2.6.37 - 4.8 API, - * Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a + * Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a * replacement for WRITE_BARRIER to allow expressing richer semantics * to the block layer. It's up to the block layer to implement the * semantics correctly. 
Use the WRITE_FLUSH_FUA flag combination. * * 4.8 - 4.9 API, * REQ_FLUSH was renamed to REQ_PREFLUSH. For consistency with previous - * ZoL releases, prefer the WRITE_FLUSH_FUA flag set if it's available. + * OpenZFS releases, prefer the WRITE_FLUSH_FUA flag set if it's available. * * 4.10 API, * The read/write flags and their modifiers, including WRITE_FLUSH, @@ -471,19 +384,17 @@ bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) static inline void bio_set_flush(struct bio *bio) { -#if defined(REQ_PREFLUSH) /* >= 4.10 */ +#if defined(HAVE_REQ_PREFLUSH) /* >= 4.10 */ bio_set_op_attrs(bio, 0, REQ_PREFLUSH); #elif defined(WRITE_FLUSH_FUA) /* >= 2.6.37 and <= 4.9 */ bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA); -#elif defined(WRITE_BARRIER) /* < 2.6.37 */ - bio_set_op_attrs(bio, 0, WRITE_BARRIER); #else #error "Allowing the build will cause bio_set_flush requests to be ignored." #endif } /* - * 4.8 - 4.x API, + * 4.8 API, * REQ_OP_FLUSH * * 4.8-rc0 - 4.8-rc1, @@ -492,12 +403,6 @@ bio_set_flush(struct bio *bio) * 2.6.36 - 4.7 API, * REQ_FLUSH * - * 2.6.x - 2.6.35 API, - * HAVE_BIO_RW_BARRIER - * - * Used to determine if a cache flush has been requested. This check has - * been left intentionally broad in order to cover both a legacy flush - * and the new preflush behavior introduced in Linux 4.8. This is correct * in all cases but may have a performance impact for some kernels. It * has the advantage of minimizing kernel specific changes in the zvol code. 
* @@ -507,21 +412,19 @@ bio_is_flush(struct bio *bio) { #if defined(HAVE_REQ_OP_FLUSH) && defined(HAVE_BIO_BI_OPF) return ((bio_op(bio) == REQ_OP_FLUSH) || (bio->bi_opf & REQ_PREFLUSH)); -#elif defined(REQ_PREFLUSH) && defined(HAVE_BIO_BI_OPF) +#elif defined(HAVE_REQ_PREFLUSH) && defined(HAVE_BIO_BI_OPF) return (bio->bi_opf & REQ_PREFLUSH); -#elif defined(REQ_PREFLUSH) && !defined(HAVE_BIO_BI_OPF) +#elif defined(HAVE_REQ_PREFLUSH) && !defined(HAVE_BIO_BI_OPF) return (bio->bi_rw & REQ_PREFLUSH); -#elif defined(REQ_FLUSH) +#elif defined(HAVE_REQ_FLUSH) return (bio->bi_rw & REQ_FLUSH); -#elif defined(HAVE_BIO_RW_BARRIER) - return (bio->bi_rw & (1 << BIO_RW_BARRIER)); #else -#error "Allowing the build will cause flush requests to be ignored." +#error "Unsupported kernel" #endif } /* - * 4.8 - 4.x API, + * 4.8 API, * REQ_FUA flag moved to bio->bi_opf * * 2.6.x - 4.7 API, @@ -540,46 +443,33 @@ bio_is_fua(struct bio *bio) } /* - * 4.8 - 4.x API, + * 4.8 API, * REQ_OP_DISCARD * * 2.6.36 - 4.7 API, * REQ_DISCARD * - * 2.6.28 - 2.6.35 API, - * BIO_RW_DISCARD - * * In all cases the normal I/O path is used for discards. The only * difference is how the kernel tags individual I/Os as discards. - * - * Note that 2.6.32 era kernels provide both BIO_RW_DISCARD and REQ_DISCARD, - * where BIO_RW_DISCARD is the correct interface. Therefore, it is important - * that the HAVE_BIO_RW_DISCARD check occur before the REQ_DISCARD check. */ static inline boolean_t bio_is_discard(struct bio *bio) { #if defined(HAVE_REQ_OP_DISCARD) return (bio_op(bio) == REQ_OP_DISCARD); -#elif defined(HAVE_BIO_RW_DISCARD) - return (bio->bi_rw & (1 << BIO_RW_DISCARD)); -#elif defined(REQ_DISCARD) +#elif defined(HAVE_REQ_DISCARD) return (bio->bi_rw & REQ_DISCARD); #else -/* potentially triggering the DMU_MAX_ACCESS assertion. */ -#error "Allowing the build will cause discard requests to become writes." 
+#error "Unsupported kernel" #endif } /* - * 4.8 - 4.x API, + * 4.8 API, * REQ_OP_SECURE_ERASE * * 2.6.36 - 4.7 API, * REQ_SECURE - * - * 2.6.x - 2.6.35 API, - * Unsupported by kernel */ static inline boolean_t bio_is_secure_erase(struct bio *bio) @@ -598,33 +488,18 @@ bio_is_secure_erase(struct bio *bio) * Discard granularity and alignment restrictions may now be set. For * older kernels which do not support this it is safe to skip it. */ -#ifdef HAVE_DISCARD_GRANULARITY static inline void blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) { q->limits.discard_granularity = dg; } -#else -#define blk_queue_discard_granularity(x, dg) ((void)0) -#endif /* HAVE_DISCARD_GRANULARITY */ /* - * 2.6.32 - 4.x API, - * blk_queue_discard() - */ -#if !defined(HAVE_BLK_QUEUE_DISCARD) -#define blk_queue_discard(q) (0); -#endif - -/* - * 4.8 - 4.x API, + * 4.8 API, * blk_queue_secure_erase() * * 2.6.36 - 4.7 API, * blk_queue_secdiscard() - * - * 2.6.x - 2.6.35 API, - * Unsupported by kernel */ static inline int blk_queue_discard_secure(struct request_queue *q) @@ -638,14 +513,6 @@ blk_queue_discard_secure(struct request_queue *q) #endif } -/* - * Default Linux IO Scheduler, - * Setting the scheduler to noop will allow the Linux IO scheduler to - * still perform front and back merging, while leaving the request - * ordering and prioritization to the ZFS IO scheduler. - */ -#define VDEV_SCHEDULER "noop" - /* * A common holder for vdev_bdev_open() is used to relax the exclusive open * semantics slightly. 
Internal vdev disk callers may pass VDEV_HOLDER to @@ -655,26 +522,61 @@ blk_queue_discard_secure(struct request_queue *q) */ #define VDEV_HOLDER ((void *)0x2401de7) -static inline void -blk_generic_start_io_acct(struct request_queue *q, int rw, - unsigned long sectors, struct hd_struct *part) +static inline unsigned long +blk_generic_start_io_acct(struct request_queue *q __attribute__((unused)), + struct gendisk *disk __attribute__((unused)), + int rw __attribute__((unused)), struct bio *bio) { -#if defined(HAVE_GENERIC_IO_ACCT_3ARG) - generic_start_io_acct(rw, sectors, part); +#if defined(HAVE_DISK_IO_ACCT) + return (disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio))); +#elif defined(HAVE_BIO_IO_ACCT) + return (bio_start_io_acct(bio)); +#elif defined(HAVE_GENERIC_IO_ACCT_3ARG) + unsigned long start_time = jiffies; + generic_start_io_acct(rw, bio_sectors(bio), &disk->part0); + return (start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) - generic_start_io_acct(q, rw, sectors, part); + unsigned long start_time = jiffies; + generic_start_io_acct(q, rw, bio_sectors(bio), &disk->part0); + return (start_time); +#else + /* Unsupported */ + return (0); #endif } static inline void -blk_generic_end_io_acct(struct request_queue *q, int rw, - struct hd_struct *part, unsigned long start_time) +blk_generic_end_io_acct(struct request_queue *q __attribute__((unused)), + struct gendisk *disk __attribute__((unused)), + int rw __attribute__((unused)), struct bio *bio, unsigned long start_time) { -#if defined(HAVE_GENERIC_IO_ACCT_3ARG) - generic_end_io_acct(rw, part, start_time); +#if defined(HAVE_DISK_IO_ACCT) + disk_end_io_acct(disk, bio_op(bio), start_time); +#elif defined(HAVE_BIO_IO_ACCT) + bio_end_io_acct(bio, start_time); +#elif defined(HAVE_GENERIC_IO_ACCT_3ARG) + generic_end_io_acct(rw, &disk->part0, start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) - generic_end_io_acct(q, rw, part, start_time); + generic_end_io_acct(q, rw, &disk->part0, start_time); #endif } 
+#ifndef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +static inline struct request_queue * +blk_generic_alloc_queue(make_request_fn make_request, int node_id) +{ +#if defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN) + return (blk_alloc_queue(make_request, node_id)); +#elif defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH) + return (blk_alloc_queue_rh(make_request, node_id)); +#else + struct request_queue *q = blk_alloc_queue(GFP_KERNEL); + if (q != NULL) + blk_queue_make_request(q, make_request); + + return (q); +#endif +} +#endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + #endif /* _ZFS_BLKDEV_H */ diff --git a/include/linux/compiler_compat.h b/include/os/linux/kernel/linux/compiler_compat.h similarity index 86% rename from include/linux/compiler_compat.h rename to include/os/linux/kernel/linux/compiler_compat.h index 921d32f246..2c0704da2e 100644 --- a/include/linux/compiler_compat.h +++ b/include/os/linux/kernel/linux/compiler_compat.h @@ -28,6 +28,14 @@ #include +#if !defined(fallthrough) +#if defined(HAVE_IMPLICIT_FALLTHROUGH) +#define fallthrough __attribute__((__fallthrough__)) +#else +#define fallthrough ((void)0) +#endif +#endif + #if !defined(READ_ONCE) #define READ_ONCE(x) ACCESS_ONCE(x) #endif diff --git a/include/linux/dcache_compat.h b/include/os/linux/kernel/linux/dcache_compat.h similarity index 81% rename from include/linux/dcache_compat.h rename to include/os/linux/kernel/linux/dcache_compat.h index bdaa5db3e6..d0588a82e9 100644 --- a/include/linux/dcache_compat.h +++ b/include/os/linux/kernel/linux/dcache_compat.h @@ -41,24 +41,7 @@ * the dentry structure. To handle this we define an appropriate * dentry_operations_t typedef which can be used. 
*/ -#ifdef HAVE_CONST_DENTRY_OPERATIONS typedef const struct dentry_operations dentry_operations_t; -#else -typedef struct dentry_operations dentry_operations_t; -#endif - -/* - * 2.6.38 API change, - * Added d_set_d_op() helper function which sets some flags in - * dentry->d_flags based on which operations are defined. - */ -#ifndef HAVE_D_SET_D_OP -static inline void -d_set_d_op(struct dentry *dentry, dentry_operations_t *op) -{ - dentry->d_op = op; -} -#endif /* HAVE_D_SET_D_OP */ /* * 2.6.38 API addition, @@ -72,12 +55,10 @@ d_set_d_op(struct dentry *dentry, dentry_operations_t *op) static inline void d_clear_d_op(struct dentry *dentry) { -#ifdef HAVE_D_SET_D_OP dentry->d_op = NULL; dentry->d_flags &= ~( DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); -#endif /* HAVE_D_SET_D_OP */ } #endif /* _ZFS_DCACHE_H */ diff --git a/include/linux/kmap_compat.h b/include/os/linux/kernel/linux/kmap_compat.h similarity index 80% rename from include/linux/kmap_compat.h rename to include/os/linux/kernel/linux/kmap_compat.h index b9c7f5bcc9..42f463ab9a 100644 --- a/include/linux/kmap_compat.h +++ b/include/os/linux/kernel/linux/kmap_compat.h @@ -29,14 +29,9 @@ #include #include -#ifdef HAVE_1ARG_KMAP_ATOMIC /* 2.6.37 API change */ -#define zfs_kmap_atomic(page, km_type) kmap_atomic(page) -#define zfs_kunmap_atomic(addr, km_type) kunmap_atomic(addr) -#else -#define zfs_kmap_atomic(page, km_type) kmap_atomic(page, km_type) -#define zfs_kunmap_atomic(addr, km_type) kunmap_atomic(addr, km_type) -#endif +#define zfs_kmap_atomic(page) kmap_atomic(page) +#define zfs_kunmap_atomic(addr) kunmap_atomic(addr) /* 5.0 API change - no more 'type' argument for access_ok() */ #ifdef HAVE_ACCESS_OK_TYPE diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h new file mode 100644 index 0000000000..cc42c3f7c7 --- /dev/null +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -0,0 +1,167 @@ +/* + * CDDL HEADER START + * + * 
The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Neskovic . + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#ifndef _MOD_COMPAT_H +#define _MOD_COMPAT_H + +#include +#include + +/* Grsecurity kernel API change */ +#ifdef MODULE_PARAM_CALL_CONST +typedef const struct kernel_param zfs_kernel_param_t; +#else +typedef struct kernel_param zfs_kernel_param_t; +#endif + +#define ZMOD_RW 0644 +#define ZMOD_RD 0444 + +/* BEGIN CSTYLED */ +#define INT int +#define UINT uint +#define ULONG ulong +#define LONG long +#define STRING charp +/* END CSTYLED */ + +enum scope_prefix_types { + zfs, + zfs_arc, + zfs_condense, + zfs_dbuf, + zfs_dbuf_cache, + zfs_deadman, + zfs_dedup, + zfs_l2arc, + zfs_livelist, + zfs_livelist_condense, + zfs_lua, + zfs_metaslab, + zfs_mg, + zfs_multihost, + zfs_prefetch, + zfs_reconstruct, + zfs_recv, + zfs_send, + zfs_spa, + zfs_trim, + zfs_txg, + zfs_vdev, + zfs_vdev_cache, + zfs_vdev_file, + zfs_vdev_mirror, + zfs_vnops, + zfs_zevent, + zfs_zio, + zfs_zil +}; + +/* + * Declare a module parameter / sysctl node + * + * "scope_prefix" the part of the sysctl / sysfs tree the node resides under + * (currently a no-op on Linux) + * "name_prefix" the part of the 
variable name that will be excluded from the + * exported names on platforms with a hierarchical namespace + * "name" the part of the variable that will be exposed on platforms with a + * hierarchical namespace, or as name_prefix ## name on Linux + * "type" the variable type + * "perm" the permissions (read/write or read only) + * "desc" a brief description of the option + * + * Examples: + * ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, UINT, + * ZMOD_RW, "Rotating media load increment for non-seeking I/O's"); + * on FreeBSD: + * vfs.zfs.vdev.mirror.rotating_inc + * on Linux: + * zfs_vdev_mirror_rotating_inc + * + * ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, + * "Limit one prefetch call to this size"); + * on FreeBSD: + * vfs.zfs.dmu_prefetch_max + * on Linux: + * dmu_prefetch_max + */ +/* BEGIN CSTYLED */ +#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) \ + CTASSERT_GLOBAL((sizeof (scope_prefix) == sizeof (enum scope_prefix_types))); \ + module_param(name_prefix ## name, type, perm); \ + MODULE_PARM_DESC(name_prefix ## name, desc) +/* END CSTYLED */ + +/* + * Declare a module parameter / sysctl node + * + * "scope_prefix" the part of the sysctl / sysfs tree the node resides under + * (currently a no-op on Linux) + * "name_prefix" the part of the variable name that will be excluded from the + * exported names on platforms with a hierarchical namespace + * "name" the part of the variable that will be exposed on platforms with a + * hierarchical namespace, or as name_prefix ## name on Linux + * "setfunc" setter function + * "getfunc" getter function + * "perm" the permissions (read/write or read only) + * "desc" a brief description of the option + * + * Examples: + * ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, + * param_get_int, ZMOD_RW, "Reserved free space in pool"); + * on FreeBSD: + * vfs.zfs.spa_slop_shift + * on Linux: + * spa_slop_shift + */ +/* BEGIN CSTYLED */
+#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, getfunc, perm, desc) \ + CTASSERT_GLOBAL((sizeof (scope_prefix) == sizeof (enum scope_prefix_types))); \ + module_param_call(name_prefix ## name, setfunc, getfunc, &name_prefix ## name, perm); \ + MODULE_PARM_DESC(name_prefix ## name, desc) +/* END CSTYLED */ + +/* + * As above, but there is no variable with the name name_prefix ## name, + * so NULL is passed to module_param_call instead. + */ +/* BEGIN CSTYLED */ +#define ZFS_MODULE_VIRTUAL_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, getfunc, perm, desc) \ + CTASSERT_GLOBAL((sizeof (scope_prefix) == sizeof (enum scope_prefix_types))); \ + module_param_call(name_prefix ## name, setfunc, getfunc, NULL, perm); \ + MODULE_PARM_DESC(name_prefix ## name, desc) +/* END CSTYLED */ + +#define ZFS_MODULE_PARAM_ARGS const char *buf, zfs_kernel_param_t *kp + +#define ZFS_MODULE_DESCRIPTION(s) MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) MODULE_VERSION(s) + +#define module_init_early(fn) module_init(fn) + +#endif /* _MOD_COMPAT_H */ diff --git a/include/linux/page_compat.h b/include/os/linux/kernel/linux/page_compat.h similarity index 81% rename from include/linux/page_compat.h rename to include/os/linux/kernel/linux/page_compat.h index 95acb7d536..bd6cb398b0 100644 --- a/include/linux/page_compat.h +++ b/include/os/linux/kernel/linux/page_compat.h @@ -35,11 +35,6 @@ #else #define nr_inactive_file_pages() global_zone_page_state(NR_INACTIVE_FILE) #endif -#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE) -#define nr_slab_reclaimable_pages() global_node_page_state(NR_SLAB_RECLAIMABLE) -#else -#define nr_slab_reclaimable_pages() global_zone_page_state(NR_SLAB_RECLAIMABLE) -#endif #elif defined(ZFS_GLOBAL_NODE_PAGE_STATE) @@ -59,11 +54,6 @@ #else #define nr_inactive_file_pages() global_page_state(NR_INACTIVE_FILE) #endif -#if 
defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE) -#define nr_slab_reclaimable_pages() global_node_page_state(NR_SLAB_RECLAIMABLE) -#else -#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE) -#endif #else @@ -71,7 +61,6 @@ #define nr_file_pages() global_page_state(NR_FILE_PAGES) #define nr_inactive_anon_pages() global_page_state(NR_INACTIVE_ANON) #define nr_inactive_file_pages() global_page_state(NR_INACTIVE_FILE) -#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE) #endif /* ZFS_GLOBAL_ZONE_PAGE_STATE */ diff --git a/include/os/linux/kernel/linux/percpu_compat.h b/include/os/linux/kernel/linux/percpu_compat.h new file mode 100644 index 0000000000..e7a4242c46 --- /dev/null +++ b/include/os/linux/kernel/linux/percpu_compat.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#ifndef _ZFS_PERCPU_H +#define _ZFS_PERCPU_H + +#include + +/* + * 3.18 API change, + * percpu_counter_init() now must be passed a gfp mask which will be + * used for the dynamic allocation of the actual counter. 
+ */ +#ifdef HAVE_PERCPU_COUNTER_INIT_WITH_GFP +#define percpu_counter_init_common(counter, n, gfp) \ + percpu_counter_init(counter, n, gfp) +#else +#define percpu_counter_init_common(counter, n, gfp) \ + percpu_counter_init(counter, n) +#endif + +#endif /* _ZFS_PERCPU_H */ diff --git a/include/os/linux/kernel/linux/simd.h b/include/os/linux/kernel/linux/simd.h new file mode 100644 index 0000000000..4cde248e20 --- /dev/null +++ b/include/os/linux/kernel/linux/simd.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Lawrence Livermore National Security, LLC. 
+ */ + +#ifndef _LINUX_SIMD_H +#define _LINUX_SIMD_H + +#if defined(__x86) +#include + +#elif defined(__aarch64__) +#include + +#elif defined(__powerpc__) +#include +#else + +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#endif +#endif /* _LINUX_SIMD_H */ diff --git a/include/linux/simd_aarch64.h b/include/os/linux/kernel/linux/simd_aarch64.h similarity index 72% rename from include/linux/simd_aarch64.h rename to include/os/linux/kernel/linux/simd_aarch64.h index 155ef62055..50937e97ce 100644 --- a/include/linux/simd_aarch64.h +++ b/include/os/linux/kernel/linux/simd_aarch64.h @@ -26,37 +26,29 @@ * USER API: * * Kernel fpu methods: - * kfpu_begin() - * kfpu_end() + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() */ -#ifndef _SIMD_AARCH64_H -#define _SIMD_AARCH64_H +#ifndef _LINUX_SIMD_AARCH64_H +#define _LINUX_SIMD_AARCH64_H #include #if defined(__aarch64__) #include - -#if defined(_KERNEL) #include -#define kfpu_begin() \ -{ \ - kernel_neon_begin(); \ -} -#define kfpu_end() \ -{ \ - kernel_neon_end(); \ -} -#else -/* - * fpu dummy methods for userspace - */ -#define kfpu_begin() do {} while (0) -#define kfpu_end() do {} while (0) -#endif /* defined(_KERNEL) */ + +#define kfpu_allowed() 1 +#define kfpu_begin() kernel_neon_begin() +#define kfpu_end() kernel_neon_end() +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) #endif /* __aarch64__ */ -#endif /* _SIMD_AARCH64_H */ +#endif /* _LINUX_SIMD_AARCH64_H */ diff --git a/include/os/linux/kernel/linux/simd_powerpc.h b/include/os/linux/kernel/linux/simd_powerpc.h new file mode 100644 index 0000000000..108cef22f5 --- /dev/null +++ b/include/os/linux/kernel/linux/simd_powerpc.h @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Romain Dolbeau + * + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_altivec_available() + */ + +#ifndef _LINUX_SIMD_POWERPC_H +#define _LINUX_SIMD_POWERPC_H + +/* only for __powerpc__ */ +#if defined(__powerpc__) + +#include +#include +#include +#include +#include +#include + +#define kfpu_allowed() 1 +#define kfpu_begin() \ + { \ + preempt_disable(); \ + enable_kernel_altivec(); \ + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define kfpu_end() \ + { \ + disable_kernel_altivec(); \ + preempt_enable(); \ + } +#else +/* seems that before 4.5 no-one bothered disabling ... 
*/ +#define kfpu_end() preempt_enable() +#endif +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +/* + * Check if AltiVec instruction set is available + */ +static inline boolean_t +zfs_altivec_available(void) +{ + boolean_t res; + /* suggested by macallan at netbsd dot org */ +#if defined(__powerpc64__) + u64 msr; +#else + u32 msr; +#endif + kfpu_begin(); + __asm volatile("mfmsr %0" : "=r"(msr)); + /* + * 64 bits -> need to check bit 38 + * Power ISA Version 3.0B + * p944 + * 32 bits -> Need to check bit 6 + * AltiVec Technology Programming Environments Manual + * p49 (2-9) + * They are the same, as ppc counts 'backward' ... + */ + res = (msr & 0x2000000) != 0; + kfpu_end(); + return (res); +} +#endif /* defined(__powerpc__) */ + +#endif /* _LINUX_SIMD_POWERPC_H */ diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h new file mode 100644 index 0000000000..cdd3286d21 --- /dev/null +++ b/include/os/linux/kernel/linux/simd_x86.h @@ -0,0 +1,646 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Neskovic .
+ */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_sse_available() + * zfs_sse2_available() + * zfs_sse3_available() + * zfs_ssse3_available() + * zfs_sse4_1_available() + * zfs_sse4_2_available() + * + * zfs_avx_available() + * zfs_avx2_available() + * + * zfs_bmi1_available() + * zfs_bmi2_available() + * + * zfs_avx512f_available() + * zfs_avx512cd_available() + * zfs_avx512er_available() + * zfs_avx512pf_available() + * zfs_avx512bw_available() + * zfs_avx512dq_available() + * zfs_avx512vl_available() + * zfs_avx512ifma_available() + * zfs_avx512vbmi_available() + * + * NOTE(AVX-512VL): If using AVX-512 instructions with 128Bit registers + * also add zfs_avx512vl_available() to feature check. + */ + +#ifndef _LINUX_SIMD_X86_H +#define _LINUX_SIMD_X86_H + +/* only for __x86 */ +#if defined(__x86) + +#include +#include + +/* + * Disable the WARN_ON_FPU() macro to prevent additional dependencies + * when providing the kfpu_* functions. Relevant warnings are included + * as appropriate and are unconditionally enabled. + */ +#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU) +#undef CONFIG_X86_DEBUG_FPU +#endif + +#if defined(HAVE_KERNEL_FPU_API_HEADER) +#include +#include +#else +#include +#include +#endif + +/* + * The following cases are for kernels which export either the + * kernel_fpu_* or __kernel_fpu_* functions. 
+ */ +#if defined(KERNEL_EXPORTS_X86_FPU) + +#define kfpu_allowed() 1 +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#if defined(HAVE_UNDERSCORE_KERNEL_FPU) +#define kfpu_begin() \ +{ \ + preempt_disable(); \ + __kernel_fpu_begin(); \ +} +#define kfpu_end() \ +{ \ + __kernel_fpu_end(); \ + preempt_enable(); \ +} + +#elif defined(HAVE_KERNEL_FPU) +#define kfpu_begin() kernel_fpu_begin() +#define kfpu_end() kernel_fpu_end() + +#else +/* + * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then + * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined. + */ +#error "Unreachable kernel configuration" +#endif + +#else /* defined(KERNEL_EXPORTS_X86_FPU) */ + +/* + * When the kernel_fpu_* symbols are unavailable then provide our own + * versions which allow the FPU to be safely used. + */ +#if defined(HAVE_KERNEL_FPU_INTERNAL) + +#include + +extern union fpregs_state **zfs_kfpu_fpregs; + +/* + * Free (kfpu_fini) and allocate (kfpu_init) per-cpu buffers for FPU state. + */ +static inline void +kfpu_fini(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (zfs_kfpu_fpregs[cpu] != NULL) { + free_pages((unsigned long)zfs_kfpu_fpregs[cpu], + get_order(sizeof (union fpregs_state))); + } + } + + kfree(zfs_kfpu_fpregs); +} + +static inline int +kfpu_init(void) +{ + zfs_kfpu_fpregs = kzalloc(num_possible_cpus() * + sizeof (union fpregs_state *), GFP_KERNEL); + if (zfs_kfpu_fpregs == NULL) + return (-ENOMEM); + + /* + * The fxsave and xsave operations require 16-/64-byte alignment of + * the target memory. Since kmalloc() provides no alignment + * guarantee instead use alloc_pages_node().
+ */ + unsigned int order = get_order(sizeof (union fpregs_state)); + int cpu; + + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_ZERO, order); + if (page == NULL) { + kfpu_fini(); + return (-ENOMEM); + } + + zfs_kfpu_fpregs[cpu] = page_address(page); + } + + return (0); +} + +#define kfpu_allowed() 1 +#define ex_handler_fprestore ex_handler_default + +/* + * FPU save and restore instructions. + */ +#define __asm __asm__ __volatile__ +#define kfpu_fxsave(addr) __asm("fxsave %0" : "=m" (*(addr))) +#define kfpu_fxsaveq(addr) __asm("fxsaveq %0" : "=m" (*(addr))) +#define kfpu_fnsave(addr) __asm("fnsave %0; fwait" : "=m" (*(addr))) +#define kfpu_fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define kfpu_fxrstorq(addr) __asm("fxrstorq %0" : : "m" (*(addr))) +#define kfpu_frstor(addr) __asm("frstor %0" : : "m" (*(addr))) +#define kfpu_fxsr_clean(rval) __asm("fnclex; emms; fildl %P[addr]" \ + : : [addr] "m" (rval)); + +static inline void +kfpu_save_xsave(struct xregs_state *addr, uint64_t mask) +{ + uint32_t low, hi; + int err; + + low = mask; + hi = mask >> 32; + XSTATE_XSAVE(addr, low, hi, err); + WARN_ON_ONCE(err); +} + +static inline void +kfpu_save_fxsr(struct fxregs_state *addr) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kfpu_fxsave(addr); + else + kfpu_fxsaveq(addr); +} + +static inline void +kfpu_save_fsave(struct fregs_state *addr) +{ + kfpu_fnsave(addr); +} + +static inline void +kfpu_begin(void) +{ + /* + * Preemption and interrupts must be disabled for the critical + * region where the FPU state is being modified. + */ + preempt_disable(); + local_irq_disable(); + + /* + * The current FPU registers need to be preserved by kfpu_begin() + * and restored by kfpu_end(). They are stored in a dedicated + * per-cpu variable, not in the task struct, this allows any user + * FPU state to be correctly preserved and restored. 
+ */ + union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()]; + + if (static_cpu_has(X86_FEATURE_XSAVE)) { + kfpu_save_xsave(&state->xsave, ~0); + } else if (static_cpu_has(X86_FEATURE_FXSR)) { + kfpu_save_fxsr(&state->fxsave); + } else { + kfpu_save_fsave(&state->fsave); + } +} + +static inline void +kfpu_restore_xsave(struct xregs_state *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + XSTATE_XRESTORE(addr, low, hi); +} + +static inline void +kfpu_restore_fxsr(struct fxregs_state *addr) +{ + /* + * On AuthenticAMD K7 and K8 processors the fxrstor instruction only + * restores the _x87 FOP, FIP, and FDP registers when an exception + * is pending. Clean the _x87 state to force the restore. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) + kfpu_fxsr_clean(addr); + + if (IS_ENABLED(CONFIG_X86_32)) { + kfpu_fxrstor(addr); + } else { + kfpu_fxrstorq(addr); + } +} + +static inline void +kfpu_restore_fsave(struct fregs_state *addr) +{ + kfpu_frstor(addr); +} + +static inline void +kfpu_end(void) +{ + union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()]; + + if (static_cpu_has(X86_FEATURE_XSAVE)) { + kfpu_restore_xsave(&state->xsave, ~0); + } else if (static_cpu_has(X86_FEATURE_FXSR)) { + kfpu_restore_fxsr(&state->fxsave); + } else { + kfpu_restore_fsave(&state->fsave); + } + + local_irq_enable(); + preempt_enable(); +} + +#else + +/* + * FPU support is unavailable. + */ +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#endif /* defined(HAVE_KERNEL_FPU_INTERNAL) */ +#endif /* defined(KERNEL_EXPORTS_X86_FPU) */ + +/* + * Linux kernel provides an interface for CPU feature testing. 
+ */ + +/* + * Detect register set support + */ +static inline boolean_t +__simd_state_enabled(const uint64_t state) +{ + boolean_t has_osxsave; + uint64_t xcr0; + +#if defined(X86_FEATURE_OSXSAVE) + has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE); +#else + has_osxsave = B_FALSE; +#endif + if (!has_osxsave) + return (B_FALSE); + + xcr0 = xgetbv(0); + return ((xcr0 & state) == state); +} + +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + +/* + * Check if SSE instruction set is available + */ +static inline boolean_t +zfs_sse_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM)); +} + +/* + * Check if SSE2 instruction set is available + */ +static inline boolean_t +zfs_sse2_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM2)); +} + +/* + * Check if SSE3 instruction set is available + */ +static inline boolean_t +zfs_sse3_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM3)); +} + +/* + * Check if SSSE3 instruction set is available + */ +static inline boolean_t +zfs_ssse3_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_SSSE3)); +} + +/* + * Check if SSE4.1 instruction set is available + */ +static inline boolean_t +zfs_sse4_1_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM4_1)); +} + +/* + * Check if SSE4.2 instruction set is available + */ +static inline boolean_t +zfs_sse4_2_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM4_2)); +} + +/* + * Check if AVX instruction set is available + */ +static inline boolean_t +zfs_avx_available(void) +{ + return (boot_cpu_has(X86_FEATURE_AVX) && __ymm_enabled()); +} + +/* + * Check if AVX2 instruction set is available + */ +static inline boolean_t +zfs_avx2_available(void) +{ + return (boot_cpu_has(X86_FEATURE_AVX2) && __ymm_enabled()); +} + +/* + * Check if BMI1 instruction set is available + */ +static 
inline boolean_t +zfs_bmi1_available(void) +{ +#if defined(X86_FEATURE_BMI1) + return (!!boot_cpu_has(X86_FEATURE_BMI1)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if BMI2 instruction set is available + */ +static inline boolean_t +zfs_bmi2_available(void) +{ +#if defined(X86_FEATURE_BMI2) + return (!!boot_cpu_has(X86_FEATURE_BMI2)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if AES instruction set is available + */ +static inline boolean_t +zfs_aes_available(void) +{ +#if defined(X86_FEATURE_AES) + return (!!boot_cpu_has(X86_FEATURE_AES)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if PCLMULQDQ instruction set is available + */ +static inline boolean_t +zfs_pclmulqdq_available(void) +{ +#if defined(X86_FEATURE_PCLMULQDQ) + return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if MOVBE instruction is available + */ +static inline boolean_t +zfs_movbe_available(void) +{ +#if defined(X86_FEATURE_MOVBE) + return (!!boot_cpu_has(X86_FEATURE_MOVBE)); +#else + return (B_FALSE); +#endif +} + +/* + * AVX-512 family of instruction sets: + * + * AVX512F Foundation + * AVX512CD Conflict Detection Instructions + * AVX512ER Exponential and Reciprocal Instructions + * AVX512PF Prefetch Instructions + * + * AVX512BW Byte and Word Instructions + * AVX512DQ Double-word and Quadword Instructions + * AVX512VL Vector Length Extensions + * + * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) + * AVX512VBMI Vector Byte Manipulation Instructions + */ + +/* + * Check if AVX512F instruction set is available + */ +static inline boolean_t +zfs_avx512f_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512F) + has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512CD instruction set is available + */ +static inline boolean_t +zfs_avx512cd_available(void) +{ + boolean_t has_avx512 = B_FALSE; 
+ +#if defined(X86_FEATURE_AVX512CD) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512CD); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512ER instruction set is available + */ +static inline boolean_t +zfs_avx512er_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512ER) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512ER); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512PF instruction set is available + */ +static inline boolean_t +zfs_avx512pf_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512PF) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512PF); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512BW instruction set is available + */ +static inline boolean_t +zfs_avx512bw_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512BW) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512DQ instruction set is available + */ +static inline boolean_t +zfs_avx512dq_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512DQ) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512VL instruction set is available + */ +static inline boolean_t +zfs_avx512vl_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512VL) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512IFMA instruction set is available + */ +static inline boolean_t +zfs_avx512ifma_available(void) +{ + boolean_t has_avx512 
= B_FALSE; + +#if defined(X86_FEATURE_AVX512IFMA) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512IFMA); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512VBMI instruction set is available + */ +static inline boolean_t +zfs_avx512vbmi_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512VBMI) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VBMI); +#endif + return (has_avx512 && __zmm_enabled()); +} + +#endif /* defined(__x86) */ + +#endif /* _LINUX_SIMD_X86_H */ diff --git a/include/linux/utsname_compat.h b/include/os/linux/kernel/linux/utsname_compat.h similarity index 100% rename from include/linux/utsname_compat.h rename to include/os/linux/kernel/linux/utsname_compat.h diff --git a/include/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h similarity index 68% rename from include/linux/vfs_compat.h rename to include/os/linux/kernel/linux/vfs_compat.h index 04a2c2b879..91e908598f 100644 --- a/include/linux/vfs_compat.h +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -33,43 +33,6 @@ #include /* - * 2.6.28 API change, - * Added insert_inode_locked() helper function, prior to this most callers - * used insert_inode_hash(). The older method doesn't check for collisions - * in the inode_hashtable but it still acceptible for use. - */ -#ifndef HAVE_INSERT_INODE_LOCKED -static inline int -insert_inode_locked(struct inode *ip) -{ - insert_inode_hash(ip); - return (0); -} -#endif /* HAVE_INSERT_INODE_LOCKED */ - -/* - * 2.6.35 API change, - * Add truncate_setsize() if it is not exported by the Linux kernel. - * - * Truncate the inode and pages associated with the inode. The pages are - * unmapped and removed from cache. 
- */ -#ifndef HAVE_TRUNCATE_SETSIZE -static inline void -truncate_setsize(struct inode *ip, loff_t new) -{ - struct address_space *mapping = ip->i_mapping; - - i_size_write(ip, new); - - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); -} -#endif /* HAVE_TRUNCATE_SETSIZE */ - -/* - * 2.6.32 - 2.6.33, bdi_setup_and_register() is not available. * 2.6.34 - 3.19, bdi_setup_and_register() takes 3 arguments. * 4.0 - 4.11, bdi_setup_and_register() takes 2 arguments. * 4.12 - x.y, super_setup_bdi_name() new interface. @@ -142,45 +105,7 @@ zpl_bdi_destroy(struct super_block *sb) sb->s_bdi = NULL; } #else -extern atomic_long_t zfs_bdi_seq; - -static inline int -zpl_bdi_setup(struct super_block *sb, char *name) -{ - struct backing_dev_info *bdi; - int error; - - bdi = kmem_zalloc(sizeof (struct backing_dev_info), KM_SLEEP); - bdi->name = name; - bdi->capabilities = BDI_CAP_MAP_COPY; - - error = bdi_init(bdi); - if (error) { - kmem_free(bdi, sizeof (struct backing_dev_info)); - return (error); - } - - error = bdi_register(bdi, NULL, "%.28s-%ld", name, - atomic_long_inc_return(&zfs_bdi_seq)); - if (error) { - bdi_destroy(bdi); - kmem_free(bdi, sizeof (struct backing_dev_info)); - return (error); - } - - sb->s_bdi = bdi; - - return (0); -} -static inline void -zpl_bdi_destroy(struct super_block *sb) -{ - struct backing_dev_info *bdi = sb->s_bdi; - - bdi_destroy(bdi); - kmem_free(bdi, sizeof (struct backing_dev_info)); - sb->s_bdi = NULL; -} +#error "Unsupported kernel" #endif /* @@ -211,41 +136,6 @@ zpl_bdi_destroy(struct super_block *sb) #define SB_NOATIME MS_NOATIME #endif -/* - * 2.6.38 API change, - * LOOKUP_RCU flag introduced to distinguish rcu-walk from ref-walk cases. - */ -#ifndef LOOKUP_RCU -#define LOOKUP_RCU 0x0 -#endif /* LOOKUP_RCU */ - -/* - * 3.2-rc1 API change, - * Add set_nlink() if it is not exported by the Linux kernel. 
- * - * i_nlink is read-only in Linux 3.2, but it can be set directly in - * earlier kernels. - */ -#ifndef HAVE_SET_NLINK -static inline void -set_nlink(struct inode *inode, unsigned int nlink) -{ - inode->i_nlink = nlink; -} -#endif /* HAVE_SET_NLINK */ - -/* - * 3.3 API change, - * The VFS .create, .mkdir and .mknod callbacks were updated to take a - * umode_t type rather than an int. To cleanly handle both definitions - * the zpl_umode_t type is introduced and set accordingly. - */ -#ifdef HAVE_MKDIR_UMODE_T -typedef umode_t zpl_umode_t; -#else -typedef int zpl_umode_t; -#endif - /* * 3.5 API change, * The clear_inode() function replaces end_writeback() and introduces an @@ -256,16 +146,6 @@ typedef int zpl_umode_t; #define clear_inode(ip) end_writeback(ip) #endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */ -/* - * 3.6 API change, - * The sget() helper function now takes the mount flags as an argument. - */ -#ifdef HAVE_5ARG_SGET -#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd) -#else -#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd) -#endif /* HAVE_5ARG_SGET */ - #if defined(SEEK_HOLE) && defined(SEEK_DATA) && !defined(HAVE_LSEEK_EXECUTE) static inline loff_t lseek_execute( @@ -361,65 +241,22 @@ zpl_forget_cached_acl(struct inode *ip, int type) } #endif /* HAVE_SET_CACHED_ACL_USABLE */ +/* + * 3.1 API change, + * posix_acl_chmod() was added as the preferred interface. 
+ * + * 3.14 API change, + * posix_acl_chmod() was changed to __posix_acl_chmod() + */ #ifndef HAVE___POSIX_ACL_CHMOD #ifdef HAVE_POSIX_ACL_CHMOD #define __posix_acl_chmod(acl, gfp, mode) posix_acl_chmod(acl, gfp, mode) #define __posix_acl_create(acl, gfp, mode) posix_acl_create(acl, gfp, mode) #else -static inline int -__posix_acl_chmod(struct posix_acl **acl, int flags, umode_t umode) -{ - struct posix_acl *oldacl = *acl; - mode_t mode = umode; - int error; - - *acl = posix_acl_clone(*acl, flags); - zpl_posix_acl_release(oldacl); - - if (!(*acl)) - return (-ENOMEM); - - error = posix_acl_chmod_masq(*acl, mode); - if (error) { - zpl_posix_acl_release(*acl); - *acl = NULL; - } - - return (error); -} - -static inline int -__posix_acl_create(struct posix_acl **acl, int flags, umode_t *umodep) -{ - struct posix_acl *oldacl = *acl; - mode_t mode = *umodep; - int error; - - *acl = posix_acl_clone(*acl, flags); - zpl_posix_acl_release(oldacl); - - if (!(*acl)) - return (-ENOMEM); - - error = posix_acl_create_masq(*acl, &mode); - *umodep = mode; - - if (error < 0) { - zpl_posix_acl_release(*acl); - *acl = NULL; - } - - return (error); -} +#error "Unsupported kernel" #endif /* HAVE_POSIX_ACL_CHMOD */ #endif /* HAVE___POSIX_ACL_CHMOD */ -#ifdef HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T -typedef umode_t zpl_equivmode_t; -#else -typedef mode_t zpl_equivmode_t; -#endif /* HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T */ - /* * 4.8 API change, * posix_acl_valid() now must be passed a namespace, the namespace from @@ -433,16 +270,6 @@ typedef mode_t zpl_equivmode_t; #endif /* CONFIG_FS_POSIX_ACL */ -/* - * 2.6.38 API change, - * The is_owner_or_cap() function was renamed to inode_owner_or_capable(). 
- */ -#ifdef HAVE_INODE_OWNER_OR_CAPABLE -#define zpl_inode_owner_or_capable(ip) inode_owner_or_capable(ip) -#else -#define zpl_inode_owner_or_capable(ip) is_owner_or_cap(ip) -#endif /* HAVE_INODE_OWNER_OR_CAPABLE */ - /* * 3.19 API change * struct access f->f_dentry->d_inode was replaced by accessor function @@ -467,7 +294,6 @@ static inline struct dentry *file_dentry(const struct file *f) } #endif /* HAVE_FILE_DENTRY */ -#ifdef HAVE_KUID_HELPERS static inline uid_t zfs_uid_read_impl(struct inode *ip) { #ifdef HAVE_SUPER_USER_NS @@ -514,43 +340,11 @@ static inline void zfs_gid_write(struct inode *ip, gid_t gid) #endif } -#else -static inline uid_t zfs_uid_read(struct inode *ip) -{ - return (ip->i_uid); -} - -static inline gid_t zfs_gid_read(struct inode *ip) -{ - return (ip->i_gid); -} - -static inline void zfs_uid_write(struct inode *ip, uid_t uid) -{ - ip->i_uid = uid; -} - -static inline void zfs_gid_write(struct inode *ip, gid_t gid) -{ - ip->i_gid = gid; -} -#endif - -/* - * 2.6.38 API change - */ -#ifdef HAVE_FOLLOW_DOWN_ONE -#define zpl_follow_down_one(path) follow_down_one(path) -#define zpl_follow_up(path) follow_up(path) -#else -#define zpl_follow_down_one(path) follow_down(path) -#define zpl_follow_up(path) follow_up(path) -#endif - /* * 4.9 API change */ -#ifndef HAVE_SETATTR_PREPARE +#if !(defined(HAVE_SETATTR_PREPARE_NO_USERNS) || \ + defined(HAVE_SETATTR_PREPARE_USERNS)) static inline int setattr_prepare(struct dentry *dentry, struct iattr *ia) { @@ -596,6 +390,15 @@ func(const struct path *path, struct kstat *stat, u32 request_mask, \ { \ return (func##_impl(path, stat, request_mask, query_flags)); \ } +#elif defined(HAVE_USERNS_IOPS_GETATTR) +#define ZPL_GETATTR_WRAPPER(func) \ +static int \ +func(struct user_namespace *user_ns, const struct path *path, \ + struct kstat *stat, u32 request_mask, unsigned int query_flags) \ +{ \ + return (func##_impl(user_ns, path, stat, request_mask, \ + query_flags)); \ +} #else #error #endif @@ -643,4 +446,16 @@ 
zpl_is_32bit_api(void) #endif } +/* + * 5.12 API change + * To support id-mapped mounts, generic_fillattr() was modified to + * accept a new struct user_namespace* as its first arg. + */ +#ifdef HAVE_GENERIC_FILLATTR_USERNS +#define zpl_generic_fillattr(user_ns, ip, sp) \ + generic_fillattr(user_ns, ip, sp) +#else +#define zpl_generic_fillattr(user_ns, ip, sp) generic_fillattr(ip, sp) +#endif + #endif /* _ZFS_VFS_H */ diff --git a/include/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h similarity index 74% rename from include/linux/xattr_compat.h rename to include/os/linux/kernel/linux/xattr_compat.h index b1c4293077..54690727ea 100644 --- a/include/linux/xattr_compat.h +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -35,24 +35,7 @@ * appropriate xattr_handler_t typedef which can be used. This was * the preferred solution because it keeps the code clean and readable. */ -#ifdef HAVE_CONST_XATTR_HANDLER typedef const struct xattr_handler xattr_handler_t; -#else -typedef struct xattr_handler xattr_handler_t; -#endif - -/* - * 3.7 API change, - * Preferred XATTR_NAME_* definitions introduced, these are mapped to - * the previous definitions for older kernels. 
- */ -#ifndef XATTR_NAME_POSIX_ACL_DEFAULT -#define XATTR_NAME_POSIX_ACL_DEFAULT POSIX_ACL_XATTR_DEFAULT -#endif - -#ifndef XATTR_NAME_POSIX_ACL_ACCESS -#define XATTR_NAME_POSIX_ACL_ACCESS POSIX_ACL_XATTR_ACCESS -#endif /* * 4.5 API change, @@ -88,17 +71,8 @@ fn(const struct xattr_handler *handler, struct dentry *dentry, \ return (__ ## fn(dentry->d_inode, \ list, list_size, name, name_len)); \ } -/* - * 2.6.32 API - */ -#elif defined(HAVE_XATTR_LIST_INODE) -#define ZPL_XATTR_LIST_WRAPPER(fn) \ -static size_t \ -fn(struct inode *ip, char *list, size_t list_size, \ - const char *name, size_t name_len) \ -{ \ - return (__ ## fn(ip, list, list_size, name, name_len)); \ -} +#else +#error "Unsupported kernel" #endif /* @@ -141,24 +115,31 @@ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ } -/* - * 2.6.32 API - */ -#elif defined(HAVE_XATTR_GET_INODE) -#define ZPL_XATTR_GET_WRAPPER(fn) \ -static int \ -fn(struct inode *ip, const char *name, void *buffer, size_t size) \ -{ \ - return (__ ## fn(ip, name, buffer, size)); \ -} +#else +#error "Unsupported kernel" #endif +/* + * 5.12 API change, + * The xattr_handler->set() callback was changed to take the + * struct user_namespace* as the first arg, to support idmapped + * mounts. + */ +#if defined(HAVE_XATTR_SET_USERNS) +#define ZPL_XATTR_SET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct user_namespace *user_ns, \ + struct dentry *dentry, struct inode *inode, const char *name, \ + const void *buffer, size_t size, int flags) \ +{ \ + return (__ ## fn(inode, name, buffer, size, flags)); \ +} /* * 4.7 API change, * The xattr_handler->set() callback was changed to take a both dentry and * inode, because the dentry might not be attached to an inode yet. 
*/ -#if defined(HAVE_XATTR_SET_DENTRY_INODE) +#elif defined(HAVE_XATTR_SET_DENTRY_INODE) #define ZPL_XATTR_SET_WRAPPER(fn) \ static int \ fn(const struct xattr_handler *handler, struct dentry *dentry, \ @@ -194,33 +175,15 @@ fn(struct dentry *dentry, const char *name, const void *buffer, \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size, flags)); \ } -/* - * 2.6.32 API - */ -#elif defined(HAVE_XATTR_SET_INODE) -#define ZPL_XATTR_SET_WRAPPER(fn) \ -static int \ -fn(struct inode *ip, const char *name, const void *buffer, \ - size_t size, int flags) \ -{ \ - return (__ ## fn(ip, name, buffer, size, flags)); \ -} -#endif - -#ifdef HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY -#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ - security_inode_init_security(ip, dip, qstr, nm, val, len) #else -#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ - security_inode_init_security(ip, dip, nm, val, len) -#endif /* HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY */ +#error "Unsupported kernel" +#endif /* * Linux 3.7 API change. posix_acl_{from,to}_xattr gained the user_ns * parameter. All callers are expected to pass the &init_user_ns which * is available through the init credential (kcred). 
*/ -#ifdef HAVE_POSIX_ACL_FROM_XATTR_USERNS static inline struct posix_acl * zpl_acl_from_xattr(const void *value, int size) { @@ -233,19 +196,4 @@ zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) return (posix_acl_to_xattr(kcred->user_ns, acl, value, size)); } -#else - -static inline struct posix_acl * -zpl_acl_from_xattr(const void *value, int size) -{ - return (posix_acl_from_xattr(value, size)); -} - -static inline int -zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) -{ - return (posix_acl_to_xattr(acl, value, size)); -} -#endif /* HAVE_POSIX_ACL_FROM_XATTR_USERNS */ - #endif /* _ZFS_XATTR_H */ diff --git a/include/spl/Makefile.am b/include/os/linux/spl/Makefile.am similarity index 100% rename from include/spl/Makefile.am rename to include/os/linux/spl/Makefile.am diff --git a/include/spl/rpc/Makefile.am b/include/os/linux/spl/rpc/Makefile.am similarity index 76% rename from include/spl/rpc/Makefile.am rename to include/os/linux/spl/rpc/Makefile.am index 5110cc0f0c..13d804fce9 100644 --- a/include/spl/rpc/Makefile.am +++ b/include/os/linux/spl/rpc/Makefile.am @@ -1,5 +1,5 @@ KERNEL_H = \ - $(top_srcdir)/include/spl/rpc/xdr.h + xdr.h if CONFIG_KERNEL kerneldir = @prefix@/src/zfs-$(VERSION)/include/spl/rpc diff --git a/include/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h similarity index 98% rename from include/spl/rpc/xdr.h rename to include/os/linux/spl/rpc/xdr.h index 0b39b46cf6..c62080a117 100644 --- a/include/spl/rpc/xdr.h +++ b/include/os/linux/spl/rpc/xdr.h @@ -3,7 +3,6 @@ * Written by Ricardo Correia * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/os/linux/spl/sys/Makefile.am b/include/os/linux/spl/sys/Makefile.am new file mode 100644 index 0000000000..48c27f970f --- /dev/null +++ b/include/os/linux/spl/sys/Makefile.am @@ -0,0 +1,64 @@ +KERNEL_H = \ + acl.h \ + atomic.h \ + byteorder.h \ + callb.h \ + callo.h \ + cmn_err.h \ + condvar.h \ + cred.h \ + ctype.h \ + debug.h \ + disp.h \ + dkio.h \ + errno.h \ + fcntl.h \ + file.h \ + inttypes.h \ + isa_defs.h \ + kmem_cache.h \ + kmem.h \ + kstat.h \ + list.h \ + mod_os.h \ + mutex.h \ + param.h \ + processor.h \ + proc.h \ + procfs_list.h \ + random.h \ + rwlock.h \ + shrinker.h \ + sid.h \ + signal.h \ + simd.h \ + stat.h \ + strings.h \ + sunddi.h \ + sysmacros.h \ + systeminfo.h \ + taskq.h \ + thread.h \ + time.h \ + timer.h \ + trace.h \ + trace_spl.h \ + trace_taskq.h \ + tsd.h \ + types32.h \ + types.h \ + uio.h \ + user.h \ + vfs.h \ + vmem.h \ + vmsystm.h \ + vnode.h \ + wait.h \ + wmsum.h \ + zmod.h \ + zone.h + +if CONFIG_KERNEL +kerneldir = @prefix@/src/zfs-$(VERSION)/include/spl/sys +kernel_HEADERS = $(KERNEL_H) +endif diff --git a/include/spl/sys/acl.h b/include/os/linux/spl/sys/acl.h similarity index 98% rename from include/spl/sys/acl.h rename to include/os/linux/spl/sys/acl.h index 9fc79c025c..5a3d226c76 100644 --- a/include/spl/sys/acl.h +++ b/include/os/linux/spl/sys/acl.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/atomic.h b/include/os/linux/spl/sys/atomic.h similarity index 93% rename from include/spl/sys/atomic.h rename to include/os/linux/spl/sys/atomic.h index 51b5479235..8f7fa5aeda 100644 --- a/include/spl/sys/atomic.h +++ b/include/os/linux/spl/sys/atomic.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -49,6 +48,8 @@ #define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v)) #define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y) #define atomic_swap_32(v, x) atomic_xchg((atomic_t *)(v), x) +#define atomic_load_32(v) atomic_read((atomic_t *)(v)) +#define atomic_store_32(v, x) atomic_set((atomic_t *)(v), x) #define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v)) #define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v)) #define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v)) @@ -59,6 +60,8 @@ #define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v)) #define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y) #define atomic_swap_64(v, x) atomic64_xchg((atomic64_t *)(v), x) +#define atomic_load_64(v) atomic64_read((atomic64_t *)(v)) +#define atomic_store_64(v, x) atomic64_set((atomic64_t *)(v), x) #ifdef _LP64 static __inline__ void * diff --git a/include/spl/sys/byteorder.h b/include/os/linux/spl/sys/byteorder.h similarity index 91% rename from include/spl/sys/byteorder.h rename to include/os/linux/spl/sys/byteorder.h index 4777079961..bb5e173ce5 100644 --- a/include/spl/sys/byteorder.h +++ b/include/os/linux/spl/sys/byteorder.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. 
- * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -26,6 +25,15 @@ #define _SPL_BYTEORDER_H #include + +#if defined(__BIG_ENDIAN) && !defined(_ZFS_BIG_ENDIAN) +#define _ZFS_BIG_ENDIAN +#endif + +#if defined(__LITTLE_ENDIAN) && !defined(_ZFS_LITTLE_ENDIAN) +#define _ZFS_LITTLE_ENDIAN +#endif + #include #define BSWAP_8(x) ((x) & 0xff) @@ -49,7 +57,7 @@ #define BE_IN32(xa) \ (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) -#ifdef _BIG_ENDIAN +#ifdef _ZFS_BIG_ENDIAN static __inline__ uint64_t htonll(uint64_t n) { diff --git a/include/spl/sys/callb.h b/include/os/linux/spl/sys/callb.h similarity index 97% rename from include/spl/sys/callb.h rename to include/os/linux/spl/sys/callb.h index f1826bfd35..19ba41ff9e 100644 --- a/include/spl/sys/callb.h +++ b/include/os/linux/spl/sys/callb.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/callo.h b/include/os/linux/spl/sys/callo.h similarity index 97% rename from include/spl/sys/callo.h rename to include/os/linux/spl/sys/callo.h index c43ac92e7c..e93a15f7a0 100644 --- a/include/spl/sys/callo.h +++ b/include/os/linux/spl/sys/callo.h @@ -5,7 +5,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/cmn_err.h b/include/os/linux/spl/sys/cmn_err.h similarity index 78% rename from include/spl/sys/cmn_err.h rename to include/os/linux/spl/sys/cmn_err.h index be57358b0a..d2088371c6 100644 --- a/include/spl/sys/cmn_err.h +++ b/include/os/linux/spl/sys/cmn_err.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -25,7 +24,11 @@ #ifndef _SPL_CMN_ERR_H #define _SPL_CMN_ERR_H +#if defined(_KERNEL) && defined(HAVE_STANDALONE_LINUX_STDARG) +#include +#else #include +#endif #define CE_CONT 0 /* continuation */ #define CE_NOTE 1 /* notice */ @@ -33,9 +36,12 @@ #define CE_PANIC 3 /* panic */ #define CE_IGNORE 4 /* print nothing */ -extern void cmn_err(int, const char *, ...); -extern void vcmn_err(int, const char *, va_list); -extern void vpanic(const char *, va_list); +extern void cmn_err(int, const char *, ...) + __attribute__((format(printf, 2, 3))); +extern void vcmn_err(int, const char *, va_list) + __attribute__((format(printf, 2, 0))); +extern void vpanic(const char *, va_list) + __attribute__((format(printf, 1, 0))); #define fm_panic panic diff --git a/include/spl/sys/condvar.h b/include/os/linux/spl/sys/condvar.h similarity index 53% rename from include/spl/sys/condvar.h rename to include/os/linux/spl/sys/condvar.h index 28caea5718..ef405763ca 100644 --- a/include/spl/sys/condvar.h +++ b/include/os/linux/spl/sys/condvar.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -32,6 +31,32 @@ #include #include +/* + * cv_timedwait() is similar to cv_wait() except that it additionally expects + * a timeout value specified in ticks. When woken by cv_signal() or + * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is + * returned. + * + * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks + * interruptibly and can be woken by a signal (EINTR, ERESTART). When + * this occurs 0 is returned. + * + * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait() + * and cv_timedwait_sig() which should be used when waiting for outstanding + * IO to complete. They are responsible for updating the iowait accounting + * when this is supported by the platform. + * + * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution + * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout + * to be specified as a hrtime_t allowing for timeouts of less than a tick. + * + * N.B. The return values differ slightly from the illumos implementation + * which returns the time remaining, instead of 1, when woken. They both + * return -1 on timeout. Consumers which need to know the time remaining + * are responsible for tracking it themselves. + */ + + /* * The kcondvar_t struct is protected by mutex taken externally before * calling any of the wait/signal funs, and passed into the wait funs. 
@@ -54,13 +79,18 @@ extern void __cv_init(kcondvar_t *, char *, kcv_type_t, void *); extern void __cv_destroy(kcondvar_t *); extern void __cv_wait(kcondvar_t *, kmutex_t *); extern void __cv_wait_io(kcondvar_t *, kmutex_t *); -extern void __cv_wait_sig(kcondvar_t *, kmutex_t *); -extern clock_t __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t); -extern clock_t __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t); -extern clock_t __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); -extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, +extern void __cv_wait_idle(kcondvar_t *, kmutex_t *); +extern int __cv_wait_io_sig(kcondvar_t *, kmutex_t *); +extern int __cv_wait_sig(kcondvar_t *, kmutex_t *); +extern int __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t); +extern int __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t); +extern int __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); +extern int __cv_timedwait_idle(kcondvar_t *, kmutex_t *, clock_t); +extern int cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t res, int flag); -extern clock_t cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t, +extern int cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t, + hrtime_t res, int flag); +extern int cv_timedwait_idle_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t res, int flag); extern void __cv_signal(kcondvar_t *); extern void __cv_broadcast(kcondvar_t *c); @@ -69,13 +99,21 @@ extern void __cv_broadcast(kcondvar_t *c); #define cv_destroy(cvp) __cv_destroy(cvp) #define cv_wait(cvp, mp) __cv_wait(cvp, mp) #define cv_wait_io(cvp, mp) __cv_wait_io(cvp, mp) +#define cv_wait_idle(cvp, mp) __cv_wait_idle(cvp, mp) +#define cv_wait_io_sig(cvp, mp) __cv_wait_io_sig(cvp, mp) #define cv_wait_sig(cvp, mp) __cv_wait_sig(cvp, mp) -#define cv_wait_interruptible(cvp, mp) cv_wait_sig(cvp, mp) -#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t) -#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t) -#define 
cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t) -#define cv_timedwait_interruptible(cvp, mp, t) cv_timedwait_sig(cvp, mp, t) #define cv_signal(cvp) __cv_signal(cvp) #define cv_broadcast(cvp) __cv_broadcast(cvp) +/* + * NB: There is no way to reliably distinguish between having been signalled + * and having timed out on Linux. If the client code needs to reliably + * distinguish between the two it should use the hires variant. + */ +#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t) +#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t) +#define cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t) +#define cv_timedwait_idle(cvp, mp, t) __cv_timedwait_idle(cvp, mp, t) + + #endif /* _SPL_CONDVAR_H */ diff --git a/include/spl/sys/cred.h b/include/os/linux/spl/sys/cred.h similarity index 88% rename from include/spl/sys/cred.h rename to include/os/linux/spl/sys/cred.h index fd063399b7..9cc85deb5c 100644 --- a/include/spl/sys/cred.h +++ b/include/os/linux/spl/sys/cred.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -40,24 +39,12 @@ typedef struct cred cred_t; #define GROUP_AT(gi, i) ((gi)->gid[i]) #endif -#ifdef HAVE_KUIDGID_T - #define KUID_TO_SUID(x) (__kuid_val(x)) #define KGID_TO_SGID(x) (__kgid_val(x)) #define SUID_TO_KUID(x) (KUIDT_INIT(x)) #define SGID_TO_KGID(x) (KGIDT_INIT(x)) #define KGIDP_TO_SGIDP(x) (&(x)->val) -#else /* HAVE_KUIDGID_T */ - -#define KUID_TO_SUID(x) (x) -#define KGID_TO_SGID(x) (x) -#define SUID_TO_KUID(x) (x) -#define SGID_TO_KGID(x) (x) -#define KGIDP_TO_SGIDP(x) (x) - -#endif /* HAVE_KUIDGID_T */ - extern void crhold(cred_t *cr); extern void crfree(cred_t *cr); extern uid_t crgetuid(const cred_t *cr); diff --git a/include/spl/sys/ctype.h b/include/os/linux/spl/sys/ctype.h similarity index 95% rename from include/spl/sys/ctype.h rename to include/os/linux/spl/sys/ctype.h index 18beb1daa5..3513206004 100644 --- a/include/spl/sys/ctype.h +++ b/include/os/linux/spl/sys/ctype.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h similarity index 67% rename from include/spl/sys/debug.h rename to include/os/linux/spl/sys/debug.h index ecda6bcb89..dc6b85eebf 100644 --- a/include/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -30,7 +29,6 @@ * * PANIC() - Panic the node and print message. * ASSERT() - Assert X is true, if not panic. 
- * ASSERTV() - Wraps a variable declaration which is only used by ASSERT(). * ASSERT3B() - Assert boolean X OP Y is true, if not panic. * ASSERT3S() - Assert signed X OP Y is true, if not panic. * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. @@ -50,6 +48,12 @@ /* * Common DEBUG functionality. */ +#define __printflike(a, b) __printf(a, b) + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + int spl_panic(const char *file, const char *func, int line, const char *fmt, ...); void spl_dumpstack(void); @@ -58,63 +62,63 @@ void spl_dumpstack(void); #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) -#define VERIFY(cond) \ - (void) (unlikely(!(cond)) && \ +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "%s", "VERIFY(" #cond ") failed\n")) -#define VERIFY3B(LEFT, OP, RIGHT) do { \ - boolean_t _verify3_left = (boolean_t)(LEFT); \ - boolean_t _verify3_right = (boolean_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%d " #OP " %d)\n", \ - (boolean_t) (_verify3_left), \ - (boolean_t) (_verify3_right)); \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ } while (0) -#define VERIFY3S(LEFT, OP, RIGHT) do { \ - int64_t _verify3_left = (int64_t)(LEFT); \ - int64_t _verify3_right = (int64_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ 
"VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%lld " #OP " %lld)\n", \ - (long long) (_verify3_left), \ - (long long) (_verify3_right)); \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ } while (0) -#define VERIFY3U(LEFT, OP, RIGHT) do { \ - uint64_t _verify3_left = (uint64_t)(LEFT); \ - uint64_t _verify3_right = (uint64_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%llu " #OP " %llu)\n", \ - (unsigned long long) (_verify3_left), \ - (unsigned long long) (_verify3_right)); \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ } while (0) -#define VERIFY3P(LEFT, OP, RIGHT) do { \ - uintptr_t _verify3_left = (uintptr_t)(LEFT); \ - uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%px " #OP " %px)\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right)); \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ } while (0) -#define VERIFY0(RIGHT) do { \ - int64_t _verify3_left = (int64_t)(0); \ - int64_t _verify3_right = (int64_t)(RIGHT); \ - if (!(_verify3_left == _verify3_right)) \ +#define VERIFY0(RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if 
(unlikely(!(_verify3_left == _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(0 == " #RIGHT ") " \ - "failed (0 == %lld)\n", \ - (long long) (_verify3_right)); \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ } while (0) #define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) @@ -130,7 +134,6 @@ void spl_dumpstack(void); #ifdef NDEBUG #define ASSERT(x) ((void)0) -#define ASSERTV(x) #define ASSERT3B(x,y,z) ((void)0) #define ASSERT3S(x,y,z) ((void)0) #define ASSERT3U(x,y,z) ((void)0) @@ -150,13 +153,12 @@ void spl_dumpstack(void); #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT VERIFY -#define ASSERTV(x) x #define IMPLY(A, B) \ - ((void)(((!(A)) || (B)) || \ + ((void)(likely((!(A)) || (B)) || \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "(" #A ") implies (" #B ")"))) #define EQUIV(A, B) \ - ((void)((!!(A) == !!(B)) || \ + ((void)(likely(!!(A) == !!(B)) || \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "(" #A ") is equivalent to (" #B ")"))) /* END CSTYLED */ diff --git a/include/spl/sys/disp.h b/include/os/linux/spl/sys/disp.h similarity index 96% rename from include/spl/sys/disp.h rename to include/os/linux/spl/sys/disp.h index 413b623c81..e106d3c543 100644 --- a/include/spl/sys/disp.h +++ b/include/os/linux/spl/sys/disp.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/dkio.h b/include/os/linux/spl/sys/dkio.h similarity index 96% rename from include/spl/sys/dkio.h rename to include/os/linux/spl/sys/dkio.h index 49f166a9c4..a90b67d367 100644 --- a/include/spl/sys/dkio.h +++ b/include/os/linux/spl/sys/dkio.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/errno.h b/include/os/linux/spl/sys/errno.h similarity index 85% rename from include/spl/sys/errno.h rename to include/os/linux/spl/sys/errno.h index 6015b1a3e2..f6d9212a61 100644 --- a/include/spl/sys/errno.h +++ b/include/os/linux/spl/sys/errno.h @@ -44,4 +44,14 @@ #define ENOTSUP EOPNOTSUPP +/* + * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent + * graveyard) to indicate checksum errors and fragmentation. + */ +#define ECKSUM EBADE +#define EFRAGS EBADR + +/* Similar for ENOTACTIVE */ +#define ENOTACTIVE ENOANO + #endif /* _SYS_ERRNO_H */ diff --git a/include/spl/sys/fcntl.h b/include/os/linux/spl/sys/fcntl.h similarity index 96% rename from include/spl/sys/fcntl.h rename to include/os/linux/spl/sys/fcntl.h index 3faa5dad78..a87fdcac7f 100644 --- a/include/spl/sys/fcntl.h +++ b/include/os/linux/spl/sys/fcntl.h @@ -5,7 +5,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/file.h b/include/os/linux/spl/sys/file.h similarity index 97% rename from include/spl/sys/file.h rename to include/os/linux/spl/sys/file.h index 05dbc08142..e0bbd6d98c 100644 --- a/include/spl/sys/file.h +++ b/include/os/linux/spl/sys/file.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/inttypes.h b/include/os/linux/spl/sys/inttypes.h similarity index 95% rename from include/spl/sys/inttypes.h rename to include/os/linux/spl/sys/inttypes.h index 92e76206ba..c99973abd1 100644 --- a/include/spl/sys/inttypes.h +++ b/include/os/linux/spl/sys/inttypes.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/isa_defs.h b/include/os/linux/spl/sys/isa_defs.h similarity index 81% rename from include/spl/sys/isa_defs.h rename to include/os/linux/spl/sys/isa_defs.h index 1eb4002779..2207ee2025 100644 --- a/include/spl/sys/isa_defs.h +++ b/include/os/linux/spl/sys/isa_defs.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -40,9 +39,13 @@ #define __x86 #endif +#if defined(_ILP32) +/* x32-specific defines; careful to *not* define _LP64 here */ +#else #if !defined(_LP64) #define _LP64 #endif +#endif #define _ALIGNMENT_REQUIRED 1 @@ -113,9 +116,9 @@ #endif #if defined(__ARMEL__) || defined(__AARCH64EL__) -#define _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN #else -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN #endif /* @@ -145,7 +148,7 @@ #endif #endif -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN #define _SUNOS_VTOC_16 #define _ALIGNMENT_REQUIRED 1 @@ -161,7 +164,7 @@ #endif #endif -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN /* * Illumos doesn't define _ALIGNMENT_REQUIRED for s390, so default to 1 @@ -173,9 +176,9 @@ #elif defined(__mips__) #if defined(__MIPSEB__) -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN #elif defined(__MIPSEL__) -#define _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN #else #error MIPS no endian specified #endif @@ -192,10 +195,31 @@ */ #define _ALIGNMENT_REQUIRED 1 +/* + * RISC-V arch specific defines + * only RV64G (including atomic) LP64 is supported so far + */ +#elif defined(__riscv) && defined(_LP64) && _LP64 && \ + defined(__riscv_atomic) && __riscv_atomic + +#ifndef __riscv__ +#define __riscv__ +#endif + +#ifndef __rv64g__ +#define __rv64g__ +#endif + +#define _ZFS_LITTLE_ENDIAN + +#define _SUNOS_VTOC_16 + +#define _ALIGNMENT_REQUIRED 1 + #else /* * Currently supported: - * x86_64, i386, arm, powerpc, s390, sparc, and mips + * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, and RV64G */ #error "Unsupported ISA type" #endif @@ -218,20 +242,12 @@ #define HAVE_EFFICIENT_UNALIGNED_ACCESS #endif -#if defined(__LITTLE_ENDIAN) && !defined(_LITTLE_ENDIAN) -#define _LITTLE_ENDIAN __LITTLE_ENDIAN +#if defined(_ZFS_LITTLE_ENDIAN) && defined(_ZFS_BIG_ENDIAN) +#error "Both _ZFS_LITTLE_ENDIAN and _ZFS_BIG_ENDIAN are defined" 
#endif -#if defined(__BIG_ENDIAN) && !defined(_BIG_ENDIAN) -#define _BIG_ENDIAN __BIG_ENDIAN -#endif - -#if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -#error "Both _LITTLE_ENDIAN and _BIG_ENDIAN are defined" -#endif - -#if !defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -#error "Neither _LITTLE_ENDIAN or _BIG_ENDIAN are defined" +#if !defined(_ZFS_LITTLE_ENDIAN) && !defined(_ZFS_BIG_ENDIAN) +#error "Neither _ZFS_LITTLE_ENDIAN or _ZFS_BIG_ENDIAN are defined" #endif #endif /* _SPL_ISA_DEFS_H */ diff --git a/include/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h similarity index 90% rename from include/spl/sys/kmem.h rename to include/os/linux/spl/sys/kmem.h index 72d3a77653..a93e87df80 100644 --- a/include/spl/sys/kmem.h +++ b/include/os/linux/spl/sys/kmem.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -28,12 +27,14 @@ #include #include #include +#include +#include extern int kmem_debugging(void); extern char *kmem_vasprintf(const char *fmt, va_list ap); extern char *kmem_asprintf(const char *fmt, ...); -extern char *strdup(const char *str); -extern void strfree(char *str); +extern char *kmem_strdup(const char *str); +extern void kmem_strfree(char *str); /* * Memory allocation interfaces @@ -47,6 +48,7 @@ extern void strfree(char *str); #define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) static int spl_fstrans_check(void); +void *spl_kvmalloc(size_t size, gfp_t flags); /* * Convert a KM_* flags mask to its Linux GFP_* counterpart. 
The conversion @@ -141,6 +143,18 @@ __spl_pf_fstrans_check(void) return (current->flags & __SPL_PF_FSTRANS); } +/* + * Kernel compatibility for GFP flags + */ +/* < 4.13 */ +#ifndef __GFP_RETRY_MAYFAIL +#define __GFP_RETRY_MAYFAIL __GFP_REPEAT +#endif +/* < 4.4 */ +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + #ifdef HAVE_ATOMIC64_T #define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) #define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) @@ -169,6 +183,15 @@ extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); extern void spl_kmem_free(const void *ptr, size_t sz); +/* + * 5.8 API change, pgprot_t argument removed. + */ +#ifdef HAVE_VMALLOC_PAGE_KERNEL +#define spl_vmalloc(size, flags) __vmalloc(size, flags, PAGE_KERNEL) +#else +#define spl_vmalloc(size, flags) __vmalloc(size, flags) +#endif + /* * The following functions are only available for internal use. */ diff --git a/include/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h similarity index 82% rename from include/spl/sys/kmem_cache.h rename to include/os/linux/spl/sys/kmem_cache.h index 8fa14f67e7..48006ec5d2 100644 --- a/include/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -30,22 +29,15 @@ /* * Slab allocation interfaces. The SPL slab differs from the standard * Linux SLAB or SLUB primarily in that each cache may be backed by slabs - * allocated from the physical or virtal memory address space. The virtual + * allocated from the physical or virtual memory address space. The virtual * slabs allow for good behavior when allocation large objects of identical * size. 
This slab implementation also supports both constructors and * destructors which the Linux slab does not. */ -enum { - KMC_BIT_NOTOUCH = 0, /* Don't update ages */ +typedef enum kmc_bit { KMC_BIT_NODEBUG = 1, /* Default behavior */ - KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */ - KMC_BIT_NOHASH = 3, /* XXX: Unsupported */ - KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ - KMC_BIT_KMEM = 5, /* Use kmem cache */ - KMC_BIT_VMEM = 6, /* Use vmem cache */ - KMC_BIT_SLAB = 7, /* Use Linux slab cache */ - KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ - KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ + KMC_BIT_KVMEM = 7, /* Use kvmalloc linux allocator */ + KMC_BIT_SLAB = 8, /* Use Linux slab cache */ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ KMC_BIT_GROWING = 15, /* Growing in progress */ KMC_BIT_REAPING = 16, /* Reaping in progress */ @@ -53,7 +45,7 @@ enum { KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ KMC_BIT_MAX = 20, /* Proc handler helper bit */ -}; +} kmc_bit_t; /* kmem move callback return values */ typedef enum kmem_cbrc { @@ -64,16 +56,9 @@ typedef enum kmem_cbrc { KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ } kmem_cbrc_t; -#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) #define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) -#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) -#define KMC_NOHASH (1 << KMC_BIT_NOHASH) -#define KMC_QCACHE (1 << KMC_BIT_QCACHE) -#define KMC_KMEM (1 << KMC_BIT_KMEM) -#define KMC_VMEM (1 << KMC_BIT_VMEM) +#define KMC_KVMEM (1 << KMC_BIT_KVMEM) #define KMC_SLAB (1 << KMC_BIT_SLAB) -#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) -#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) #define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) #define KMC_GROWING (1 << KMC_BIT_GROWING) #define KMC_REAPING (1 << KMC_BIT_REAPING) @@ -85,12 +70,8 @@ typedef enum kmem_cbrc { #define KMC_REAP_CHUNK INT_MAX #define KMC_DEFAULT_SEEKS 1 -#define KMC_EXPIRE_AGE 0x1 /* Due to age */ -#define KMC_EXPIRE_MEM 0x2 /* 
Due to low memory */ - #define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ -extern unsigned int spl_kmem_cache_expire; extern struct list_head spl_kmem_cache_list; extern struct rw_semaphore spl_kmem_cache_sem; @@ -99,10 +80,7 @@ extern struct rw_semaphore spl_kmem_cache_sem; #define SKS_MAGIC 0x22222222 #define SKC_MAGIC 0x2c2c2c2c -#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ -#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ #define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ #define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ #ifdef _LP64 #define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */ @@ -125,7 +103,6 @@ extern struct rw_semaphore spl_kmem_cache_sem; typedef int (*spl_kmem_ctor_t)(void *, void *, int); typedef void (*spl_kmem_dtor_t)(void *, void *); -typedef void (*spl_kmem_reclaim_t)(void *); typedef struct spl_kmem_magazine { uint32_t skm_magic; /* Sanity magic */ @@ -133,7 +110,6 @@ typedef struct spl_kmem_magazine { uint32_t skm_size; /* Magazine size */ uint32_t skm_refill; /* Batch refill size */ struct spl_kmem_cache *skm_cache; /* Owned by cache */ - unsigned long skm_age; /* Last cache access */ unsigned int skm_cpu; /* Owned by cpu */ void *skm_objs[0]; /* Object pointers */ } spl_kmem_magazine_t; @@ -175,7 +151,6 @@ typedef struct spl_kmem_cache { uint32_t skc_mag_refill; /* Magazine refill count */ spl_kmem_ctor_t skc_ctor; /* Constructor */ spl_kmem_dtor_t skc_dtor; /* Destructor */ - spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ void *skc_private; /* Private data */ void *skc_vmp; /* Unused */ struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ @@ -184,8 +159,6 @@ typedef struct spl_kmem_cache { uint32_t skc_obj_align; /* Object alignment */ uint32_t skc_slab_objs; /* Objects per slab */ uint32_t skc_slab_size; /* Slab size */ - uint32_t skc_delay; /* Slab reclaim interval */ 
- uint32_t skc_reap; /* Slab reclaim count */ atomic_t skc_ref; /* Ref count callers */ taskqid_t skc_taskqid; /* Slab reclaim task */ struct list_head skc_list; /* List of caches linkage */ @@ -202,6 +175,7 @@ typedef struct spl_kmem_cache { uint64_t skc_slab_max; /* Slab max historic */ uint64_t skc_obj_total; /* Obj total current */ uint64_t skc_obj_alloc; /* Obj alloc current */ + struct percpu_counter skc_linux_alloc; /* Linux-backed Obj alloc */ uint64_t skc_obj_max; /* Obj max historic */ uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ uint64_t skc_obj_emergency; /* Obj emergency current */ @@ -211,15 +185,17 @@ typedef struct spl_kmem_cache { extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); + void *reclaim, void *priv, void *vmp, int flags); extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, kmem_cbrc_t (*)(void *, void *, size_t, void *)); extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); -extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); +extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc); extern void spl_kmem_reap(void); +extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); +extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); #define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) @@ -227,8 +203,7 @@ extern void spl_kmem_reap(void); #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) -#define 
kmem_cache_reap_now(skc) \ - spl_kmem_cache_reap_now(skc, skc->skc_reap) +#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) #define kmem_reap() spl_kmem_reap() /* diff --git a/include/spl/sys/kstat.h b/include/os/linux/spl/sys/kstat.h similarity index 96% rename from include/spl/sys/kstat.h rename to include/os/linux/spl/sys/kstat.h index 3ce4742488..928f707575 100644 --- a/include/spl/sys/kstat.h +++ b/include/os/linux/spl/sys/kstat.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -152,6 +151,12 @@ typedef struct kstat_named_s { #define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) #define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) +#ifdef HAVE_PROC_OPS_STRUCT +typedef struct proc_ops kstat_proc_op_t; +#else +typedef struct file_operations kstat_proc_op_t; +#endif + typedef struct kstat_intr { uint_t intrs[KSTAT_NUM_INTRS]; } kstat_intr_t; @@ -197,14 +202,10 @@ extern void kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, const char *name); extern void kstat_proc_entry_delete(kstat_proc_entry_t *kpep); extern void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, - const struct file_operations *file_ops, void *data); + const kstat_proc_op_t *file_ops, void *data); extern void __kstat_install(kstat_t *ksp); extern void __kstat_delete(kstat_t *ksp); -extern void kstat_waitq_enter(kstat_io_t *); -extern void kstat_waitq_exit(kstat_io_t *); -extern void kstat_runq_enter(kstat_io_t *); -extern void kstat_runq_exit(kstat_io_t *); #define kstat_set_raw_ops(k, h, d, a) \ __kstat_set_raw_ops(k, h, d, a) diff --git a/include/spl/sys/list.h b/include/os/linux/spl/sys/list.h similarity index 97% rename from include/spl/sys/list.h rename to include/os/linux/spl/sys/list.h index 
74b784e934..80300df15a 100644 --- a/include/spl/sys/list.h +++ b/include/os/linux/spl/sys/list.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -26,6 +25,7 @@ #define _SPL_LIST_H #include +#include #include /* @@ -184,7 +184,8 @@ list_prev(list_t *list, void *object) static inline int list_link_active(list_node_t *node) { - return (node->next != LIST_POISON1) && (node->prev != LIST_POISON2); + EQUIV(node->next == LIST_POISON1, node->prev == LIST_POISON2); + return (node->next != LIST_POISON1); } static inline void diff --git a/include/os/linux/spl/sys/mod_os.h b/include/os/linux/spl/sys/mod_os.h new file mode 100644 index 0000000000..bb43313d18 --- /dev/null +++ b/include/os/linux/spl/sys/mod_os.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ +#ifndef _SPL_MOD_H +#define _SPL_MOD_H +#include + +#endif /* SPL_MOD_H */ diff --git a/include/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h similarity index 98% rename from include/spl/sys/mutex.h rename to include/os/linux/spl/sys/mutex.h index ed0cd4932c..047607f826 100644 --- a/include/spl/sys/mutex.h +++ b/include/os/linux/spl/sys/mutex.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -26,6 +25,7 @@ #define _SPL_MUTEX_H #include +#include #include #include #include @@ -127,6 +127,8 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ }) /* END CSTYLED */ +#define NESTED_SINGLE 1 + #ifdef CONFIG_DEBUG_LOCK_ALLOC #define mutex_enter_nested(mp, subclass) \ { \ @@ -179,7 +181,4 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ /* NOTE: do not dereference mp after this point */ \ } -int spl_mutex_init(void); -void spl_mutex_fini(void); - #endif /* _SPL_MUTEX_H */ diff --git a/include/spl/sys/param.h b/include/os/linux/spl/sys/param.h similarity index 96% rename from include/spl/sys/param.h rename to include/os/linux/spl/sys/param.h index 4ef929151a..d8a12d5321 100644 --- a/include/spl/sys/param.h +++ b/include/os/linux/spl/sys/param.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/proc.h b/include/os/linux/spl/sys/proc.h similarity index 89% rename from include/spl/sys/proc.h rename to include/os/linux/spl/sys/proc.h index 05c44bca5d..fe4841407d 100644 --- a/include/spl/sys/proc.h +++ b/include/os/linux/spl/sys/proc.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. 
- * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -26,14 +25,17 @@ #define _SPL_PROC_H #include - -#ifndef HAVE_PDE_DATA -#define PDE_DATA(x) (PDE(x)->data) -#endif +#include extern struct proc_dir_entry *proc_spl_kstat; int spl_proc_init(void); void spl_proc_fini(void); +static inline boolean_t +zfs_proc_is_caller(struct task_struct *t) +{ + return (t->group_leader == current->group_leader); +} + #endif /* SPL_PROC_H */ diff --git a/include/spl/sys/processor.h b/include/os/linux/spl/sys/processor.h similarity index 96% rename from include/spl/sys/processor.h rename to include/os/linux/spl/sys/processor.h index a70101fa2f..5514f07c0b 100644 --- a/include/spl/sys/processor.h +++ b/include/os/linux/spl/sys/processor.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/procfs_list.h b/include/os/linux/spl/sys/procfs_list.h similarity index 98% rename from include/spl/sys/procfs_list.h rename to include/os/linux/spl/sys/procfs_list.h index eb1519c0ad..9bb437f55c 100644 --- a/include/spl/sys/procfs_list.h +++ b/include/os/linux/spl/sys/procfs_list.h @@ -57,6 +57,7 @@ typedef struct procfs_list_node { } procfs_list_node_t; void procfs_list_install(const char *module, + const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, diff --git a/include/spl/sys/random.h b/include/os/linux/spl/sys/random.h similarity index 85% rename from include/spl/sys/random.h rename to include/os/linux/spl/sys/random.h index 93e244f566..52e97e1ce0 100644 --- a/include/spl/sys/random.h +++ b/include/os/linux/spl/sys/random.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. 
- * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -37,4 +36,19 @@ random_get_bytes(uint8_t *ptr, size_t len) extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); +static __inline__ uint32_t +random_in_range(uint32_t range) +{ + uint32_t r; + + ASSERT(range != 0); + + if (range == 1) + return (0); + + (void) random_get_pseudo_bytes((uint8_t *)&r, sizeof (r)); + + return (r % range); +} + #endif /* _SPL_RANDOM_H */ diff --git a/include/spl/sys/rwlock.h b/include/os/linux/spl/sys/rwlock.h similarity index 54% rename from include/spl/sys/rwlock.h rename to include/os/linux/spl/sys/rwlock.h index 408defac20..ba7620a1f3 100644 --- a/include/spl/sys/rwlock.h +++ b/include/os/linux/spl/sys/rwlock.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -29,43 +28,6 @@ #include #include -/* Linux kernel compatibility */ -#if defined(CONFIG_PREEMPT_RT_FULL) -#define SPL_RWSEM_SINGLE_READER_VALUE (1) -#define SPL_RWSEM_SINGLE_WRITER_VALUE (0) -#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK) -#define SPL_RWSEM_SINGLE_READER_VALUE (1) -#define SPL_RWSEM_SINGLE_WRITER_VALUE (-1) -#elif defined(RWSEM_ACTIVE_MASK) -#define SPL_RWSEM_SINGLE_READER_VALUE (RWSEM_ACTIVE_READ_BIAS) -#define SPL_RWSEM_SINGLE_WRITER_VALUE (RWSEM_ACTIVE_WRITE_BIAS) -#endif - -/* Linux 3.16 changed activity to count for rwsem-spinlock */ -#if defined(CONFIG_PREEMPT_RT_FULL) -#define RWSEM_COUNT(sem) sem->read_depth -#elif defined(HAVE_RWSEM_ACTIVITY) -#define RWSEM_COUNT(sem) sem->activity -/* Linux 4.8 changed count to an atomic_long_t for !rwsem-spinlock */ -#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) -#define RWSEM_COUNT(sem) atomic_long_read(&(sem)->count) -#else -#define 
RWSEM_COUNT(sem) sem->count -#endif - -#if defined(RWSEM_SPINLOCK_IS_RAW) -#define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl) -#define spl_rwsem_unlock_irqrestore(lk, fl) \ - raw_spin_unlock_irqrestore(lk, fl) -#define spl_rwsem_trylock_irqsave(lk, fl) raw_spin_trylock_irqsave(lk, fl) -#else -#define spl_rwsem_lock_irqsave(lk, fl) spin_lock_irqsave(lk, fl) -#define spl_rwsem_unlock_irqrestore(lk, fl) spin_unlock_irqrestore(lk, fl) -#define spl_rwsem_trylock_irqsave(lk, fl) spin_trylock_irqsave(lk, fl) -#endif /* RWSEM_SPINLOCK_IS_RAW */ - -#define spl_rwsem_is_locked(rwsem) rwsem_is_locked(rwsem) - typedef enum { RW_DRIVER = 2, RW_DEFAULT = 4, @@ -78,15 +40,9 @@ typedef enum { RW_READER = 2 } krw_t; -/* - * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, rw_semaphore will have an owner - * field, so we don't need our own. - */ typedef struct { struct rw_semaphore rw_rwlock; -#ifndef CONFIG_RWSEM_SPIN_ON_OWNER kthread_t *rw_owner; -#endif #ifdef CONFIG_LOCKDEP krw_type_t rw_type; #endif /* CONFIG_LOCKDEP */ @@ -97,31 +53,19 @@ typedef struct { static inline void spl_rw_set_owner(krwlock_t *rwp) { -/* - * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, down_write, up_write, - * downgrade_write and __init_rwsem will set/clear owner for us. 
- */ -#ifndef CONFIG_RWSEM_SPIN_ON_OWNER rwp->rw_owner = current; -#endif } static inline void spl_rw_clear_owner(krwlock_t *rwp) { -#ifndef CONFIG_RWSEM_SPIN_ON_OWNER rwp->rw_owner = NULL; -#endif } static inline kthread_t * rw_owner(krwlock_t *rwp) { -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - return (SEM(rwp)->owner); -#else return (rwp->rw_owner); -#endif } #ifdef CONFIG_LOCKDEP @@ -148,6 +92,11 @@ spl_rw_lockdep_on_maybe(krwlock_t *rwp) \ #define spl_rw_lockdep_on_maybe(rwp) #endif /* CONFIG_LOCKDEP */ +static inline int +RW_LOCK_HELD(krwlock_t *rwp) +{ + return (rwsem_is_locked(SEM(rwp))); +} static inline int RW_WRITE_HELD(krwlock_t *rwp) @@ -155,55 +104,10 @@ RW_WRITE_HELD(krwlock_t *rwp) return (rw_owner(rwp) == current); } -static inline int -RW_LOCK_HELD(krwlock_t *rwp) -{ - return (spl_rwsem_is_locked(SEM(rwp))); -} - static inline int RW_READ_HELD(krwlock_t *rwp) { - if (!RW_LOCK_HELD(rwp)) - return (0); - - /* - * rw_semaphore cheat sheet: - * - * < 3.16: - * There's no rw_semaphore.owner, so use rwp.owner instead. - * If rwp.owner == NULL then it's a reader - * - * 3.16 - 4.7: - * rw_semaphore.owner added (https://lwn.net/Articles/596656/) - * and CONFIG_RWSEM_SPIN_ON_OWNER introduced. - * If rw_semaphore.owner == NULL then it's a reader - * - * 4.8 - 4.16.16: - * RWSEM_READER_OWNED added as an internal #define. - * (https://lore.kernel.org/patchwork/patch/678590/) - * If rw_semaphore.owner == 1 then it's a reader - * - * 4.16.17 - 4.19: - * RWSEM_OWNER_UNKNOWN introduced as ((struct task_struct *)-1L) - * (https://do-db2.lkml.org/lkml/2018/5/15/985) - * If rw_semaphore.owner == 1 then it's a reader. - * - * 4.20+: - * RWSEM_OWNER_UNKNOWN changed to ((struct task_struct *)-2L) - * (https://lkml.org/lkml/2018/9/6/986) - * If rw_semaphore.owner & 1 then it's a reader, and also the reader's - * task_struct may be embedded in rw_semaphore->owner. 
- */ -#if defined(CONFIG_RWSEM_SPIN_ON_OWNER) && defined(RWSEM_OWNER_UNKNOWN) - if (RWSEM_OWNER_UNKNOWN == (struct task_struct *)-2L) { - /* 4.20+ kernels with CONFIG_RWSEM_SPIN_ON_OWNER */ - return ((unsigned long) SEM(rwp)->owner & 1); - } -#endif - - /* < 4.20 kernel or !CONFIG_RWSEM_SPIN_ON_OWNER */ - return (rw_owner(rwp) == NULL || (unsigned long) rw_owner(rwp) == 1); + return (RW_LOCK_HELD(rwp) && rw_owner(rwp) == NULL); } /* @@ -228,6 +132,12 @@ RW_READ_HELD(krwlock_t *rwp) */ #define rw_destroy(rwp) ((void) 0) +/* + * Upgrading a rwsem from a reader to a writer is not supported by the + * Linux kernel. The lock must be dropped and reacquired as a writer. + */ +#define rw_tryupgrade(rwp) RW_WRITE_HELD(rwp) + #define rw_tryenter(rwp, rw) \ ({ \ int _rc_ = 0; \ @@ -285,25 +195,6 @@ RW_READ_HELD(krwlock_t *rwp) downgrade_write(SEM(rwp)); \ spl_rw_lockdep_on_maybe(rwp); \ }) - -#define rw_tryupgrade(rwp) \ -({ \ - int _rc_ = 0; \ - \ - if (RW_WRITE_HELD(rwp)) { \ - _rc_ = 1; \ - } else { \ - spl_rw_lockdep_off_maybe(rwp); \ - if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \ - spl_rw_set_owner(rwp); \ - spl_rw_lockdep_on_maybe(rwp); \ - } \ - _rc_; \ -}) /* END CSTYLED */ -int spl_rw_init(void); -void spl_rw_fini(void); -int rwsem_tryupgrade(struct rw_semaphore *rwsem); - #endif /* _SPL_RWLOCK_H */ diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h new file mode 100644 index 0000000000..e5b7a9c955 --- /dev/null +++ b/include/os/linux/spl/sys/shrinker.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SHRINKER_H +#define _SPL_SHRINKER_H + +#include +#include + +/* + * Due to frequent changes in the shrinker API the following + * compatibility wrappers should be used. They are as follows: + * + * SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost); + * + * SPL_SHRINKER_DECLARE is used to declare a shrinker with the name varname, + * which is passed to spl_register_shrinker()/spl_unregister_shrinker(). + * The countfunc returns the number of free-able objects. + * The scanfunc returns the number of objects that were freed. + * The callbacks can return SHRINK_STOP if further calls can't make any more + * progress. Note that a return value of SHRINK_EMPTY is currently not + * supported. + * + * Example: + * + * static unsigned long + * my_count(struct shrinker *shrink, struct shrink_control *sc) + * { + * ...calculate number of objects in the cache... + * + * return (number of objects in the cache); + * } + * + * static unsigned long + * my_scan(struct shrinker *shrink, struct shrink_control *sc) + * { + * ...scan objects in the cache and reclaim them... 
+ * } + * + * SPL_SHRINKER_DECLARE(my_shrinker, my_count, my_scan, DEFAULT_SEEKS); + * + * void my_init_func(void) { + * spl_register_shrinker(&my_shrinker); + * } + */ + +#define spl_register_shrinker(x) register_shrinker(x) +#define spl_unregister_shrinker(x) unregister_shrinker(x) + +/* + * Linux 3.0 to 3.11 Shrinker API Compatibility. + */ +#if defined(HAVE_SINGLE_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ +static int \ +__ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\ +{ \ + if (sc->nr_to_scan != 0) { \ + (void) scanfunc(shrink, sc); \ + } \ + return (countfunc(shrink, sc)); \ +} \ + \ +static struct shrinker varname = { \ + .shrink = __ ## varname ## _wrapper, \ + .seeks = seek_cost, \ +} + +#define SHRINK_STOP (-1) + +/* + * Linux 3.12 and later Shrinker API Compatibility. + */ +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ +static struct shrinker varname = { \ + .count_objects = countfunc, \ + .scan_objects = scanfunc, \ + .seeks = seek_cost, \ +} + +#else +/* + * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. + */ +#error "Unknown shrinker callback" +#endif + +#endif /* SPL_SHRINKER_H */ diff --git a/include/spl/sys/sid.h b/include/os/linux/spl/sys/sid.h similarity index 97% rename from include/spl/sys/sid.h rename to include/os/linux/spl/sys/sid.h index 731b62c47e..3cf27111b6 100644 --- a/include/spl/sys/sid.h +++ b/include/os/linux/spl/sys/sid.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/os/linux/spl/sys/signal.h b/include/os/linux/spl/sys/signal.h new file mode 100644 index 0000000000..6b538c8966 --- /dev/null +++ b/include/os/linux/spl/sys/signal.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SIGNAL_H +#define _SPL_SIGNAL_H + +#include + +#ifdef HAVE_SCHED_SIGNAL_HEADER +#include +#endif + +#define FORREAL 0 /* Usual side-effects */ +#define JUSTLOOKING 1 /* Don't stop the process */ + +extern int issig(int why); + +#endif /* SPL_SIGNAL_H */ diff --git a/include/os/linux/spl/sys/simd.h b/include/os/linux/spl/sys/simd.h new file mode 100644 index 0000000000..6fb84d3a52 --- /dev/null +++ b/include/os/linux/spl/sys/simd.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
+ * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SYS_SIMD_H +#define _SPL_SYS_SIMD_H + +#include +#include + +#endif /* _SPL_SYS_SIMD_H */ diff --git a/include/spl/sys/stat.h b/include/os/linux/spl/sys/stat.h similarity index 95% rename from include/spl/sys/stat.h rename to include/os/linux/spl/sys/stat.h index 83018e8944..5987849641 100644 --- a/include/spl/sys/stat.h +++ b/include/os/linux/spl/sys/stat.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/strings.h b/include/os/linux/spl/sys/strings.h similarity index 91% rename from include/spl/sys/strings.h rename to include/os/linux/spl/sys/strings.h index 8b810c9af2..48e417d146 100644 --- a/include/spl/sys/strings.h +++ b/include/os/linux/spl/sys/strings.h @@ -4,7 +4,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -28,8 +27,4 @@ #define bcopy(src, dest, size) memmove(dest, src, size) #define bcmp(src, dest, size) memcmp((src), (dest), (size_t)(size)) -#ifndef HAVE_KSTRTOUL -#define kstrtoul strict_strtoul -#endif - #endif /* _SPL_SYS_STRINGS_H */ diff --git a/include/spl/sys/sunddi.h b/include/os/linux/spl/sys/sunddi.h similarity index 97% rename from include/spl/sys/sunddi.h rename to include/os/linux/spl/sys/sunddi.h index 29a6fe00d1..8524ec9c30 100644 --- a/include/spl/sys/sunddi.h +++ b/include/os/linux/spl/sys/sunddi.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h similarity index 87% rename from include/spl/sys/sysmacros.h rename to include/os/linux/spl/sys/sysmacros.h index e11eaece5c..98d1ab1d7f 100644 --- a/include/spl/sys/sysmacros.h +++ b/include/os/linux/spl/sys/sysmacros.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -27,15 +26,13 @@ #include #include +#include #include #include #include #include #include -#ifdef HAVE_SCHED_RT_HEADER -#include -#endif #ifndef _KERNEL #define _KERNEL __KERNEL__ @@ -79,6 +76,7 @@ #define max_ncpus num_possible_cpus() #define boot_ncpus num_online_cpus() #define CPU_SEQID smp_processor_id() +#define CPU_SEQID_UNSTABLE raw_smp_processor_id() #define is_system_labeled() 0 #ifndef RLIM64_INFINITY @@ -114,32 +112,6 @@ #define PAGESHIFT PAGE_SHIFT #endif -/* Dtrace probes do not exist in the linux kernel */ -#ifdef DTRACE_PROBE -#undef DTRACE_PROBE -#endif /* DTRACE_PROBE */ -#define DTRACE_PROBE(a) ((void)0) - -#ifdef DTRACE_PROBE1 -#undef DTRACE_PROBE1 -#endif /* DTRACE_PROBE1 */ -#define DTRACE_PROBE1(a, b, c) ((void)0) - -#ifdef DTRACE_PROBE2 -#undef DTRACE_PROBE2 -#endif /* DTRACE_PROBE2 */ -#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) - -#ifdef DTRACE_PROBE3 -#undef DTRACE_PROBE3 -#endif /* DTRACE_PROBE3 */ -#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) - -#ifdef DTRACE_PROBE4 -#undef DTRACE_PROBE4 -#endif /* DTRACE_PROBE4 */ -#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) - /* Missing globals */ extern char spl_gitrev[64]; extern unsigned long spl_hostid; @@ -217,7 +189,14 @@ extern void spl_cleanup(void); #define P2SAMEHIGHBIT_TYPED(x, y, type) \ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) -#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) +#define SET_ERROR(err) \ + (__set_error(__FILE__, __func__, __LINE__, err), err) + +#include +#define qsort(base, num, size, cmp) \ + sort(base, num, size, cmp, NULL) + +#if !defined(_KMEMUSER) && !defined(offsetof) /* avoid any possibility of clashing with version */ diff --git a/include/spl/sys/systeminfo.h b/include/os/linux/spl/sys/systeminfo.h similarity index 96% rename from include/spl/sys/systeminfo.h rename to 
include/os/linux/spl/sys/systeminfo.h index 2255691580..d4037a0900 100644 --- a/include/spl/sys/systeminfo.h +++ b/include/os/linux/spl/sys/systeminfo.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h similarity index 95% rename from include/spl/sys/taskq.h rename to include/os/linux/spl/sys/taskq.h index 7353367a21..b50175a108 100644 --- a/include/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -85,6 +84,8 @@ typedef struct taskq { int tq_nthreads; /* # of existing threads */ int tq_nspawn; /* # of threads being spawned */ int tq_maxthreads; /* # of threads maximum */ + /* If PERCPU flag is set, percent of NCPUs to have as threads */ + int tq_cpu_pct; int tq_pri; /* priority */ int tq_minalloc; /* min taskq_ent_t pool size */ int tq_maxalloc; /* max taskq_ent_t pool size */ @@ -100,6 +101,9 @@ typedef struct taskq { spl_wait_queue_head_t tq_work_waitq; /* new work waitq */ spl_wait_queue_head_t tq_wait_waitq; /* wait waitq */ tq_lock_role_t tq_lock_class; /* class when taking tq_lock */ + /* list node for the cpu hotplug callback */ + struct hlist_node tq_hp_cb_node; + boolean_t tq_hp_support; } taskq_t; typedef struct taskq_ent { @@ -151,6 +155,7 @@ extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait(taskq_t *); extern int taskq_cancel_id(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); +extern taskq_t *taskq_of_curthread(void); #define taskq_create_proc(name, nthreads, 
pri, min, max, proc, flags) \ taskq_create(name, nthreads, pri, min, max, flags) diff --git a/include/spl/sys/thread.h b/include/os/linux/spl/sys/thread.h similarity index 82% rename from include/spl/sys/thread.h rename to include/os/linux/spl/sys/thread.h index 3762717da3..220742387b 100644 --- a/include/spl/sys/thread.h +++ b/include/os/linux/spl/sys/thread.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -45,6 +44,11 @@ typedef void (*thread_func_t)(void *); +#define thread_create_named(name, stk, stksize, func, arg, len, \ + pp, state, pri) \ + __thread_create(stk, stksize, (thread_func_t)func, \ + name, arg, len, pp, state, pri) + /* BEGIN CSTYLED */ #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ __thread_create(stk, stksize, (thread_func_t)func, \ @@ -66,4 +70,17 @@ extern struct task_struct *spl_kthread_create(int (*func)(void *), extern proc_t p0; +#ifdef HAVE_SIGINFO +typedef kernel_siginfo_t spl_kernel_siginfo_t; +#else +typedef siginfo_t spl_kernel_siginfo_t; +#endif + +#ifdef HAVE_SET_SPECIAL_STATE +#define spl_set_special_state(x) set_special_state((x)) +#else +#define spl_set_special_state(x) __set_current_state((x)) +#endif + + #endif /* _SPL_THREAD_H */ diff --git a/include/spl/sys/time.h b/include/os/linux/spl/sys/time.h similarity index 96% rename from include/spl/sys/time.h rename to include/os/linux/spl/sys/time.h index 312415b7bc..fec85f8b8d 100644 --- a/include/spl/sys/time.h +++ b/include/os/linux/spl/sys/time.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -85,7 +84,7 @@ gethrestime(inode_timespec_t *ts) #endif } -static inline time_t +static inline uint64_t gethrestime_sec(void) { #if defined(HAVE_INODE_TIMESPEC64_TIMES) @@ -105,8 +104,13 @@ gethrestime_sec(void) static inline hrtime_t gethrtime(void) { +#if defined(HAVE_KTIME_GET_RAW_TS64) + struct timespec64 ts; + ktime_get_raw_ts64(&ts); +#else struct timespec ts; getrawmonotonic(&ts); +#endif return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec); } diff --git a/include/spl/sys/timer.h b/include/os/linux/spl/sys/timer.h similarity index 89% rename from include/spl/sys/timer.h rename to include/os/linux/spl/sys/timer.h index 31d89d3b97..02c3c78934 100644 --- a/include/spl/sys/timer.h +++ b/include/os/linux/spl/sys/timer.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -53,20 +52,6 @@ #define delay(ticks) schedule_timeout_uninterruptible(ticks) -/* usleep_range() introduced in 2.6.36 */ -#ifndef HAVE_USLEEP_RANGE -static inline void -usleep_range(unsigned long min, unsigned long max) -{ - unsigned int min_ms = min / USEC_PER_MSEC; - - if (min >= MAX_UDELAY_MS) - msleep(min_ms); - else - udelay(min); -} -#endif /* HAVE_USLEEP_RANGE */ - #define SEC_TO_TICK(sec) ((sec) * HZ) #define MSEC_TO_TICK(ms) msecs_to_jiffies(ms) #define USEC_TO_TICK(us) usecs_to_jiffies(us) diff --git a/include/os/linux/spl/sys/trace.h b/include/os/linux/spl/sys/trace.h new file mode 100644 index 0000000000..b148ace6ab --- /dev/null +++ b/include/os/linux/spl/sys/trace.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the 
"License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) + +/* + * Calls to DTRACE_PROBE* are mapped to standard Linux kernel trace points + * when they are available(when HAVE_DECLARE_EVENT_CLASS is defined). The + * tracepoint event class definitions are found in the general tracing + * header file: include/sys/trace_*.h. See include/sys/trace_vdev.h for + * a good example. + * + * If tracepoints are not available, stub functions are generated which can + * be traced using kprobes. In this case, the DEFINE_DTRACE_PROBE* macros + * are used to provide the stub functions and also the prototypes for + * those functions. The mechanism to do this relies on DEFINE_DTRACE_PROBE + * macros defined in the general tracing headers(see trace_vdev.h) and + * CREATE_TRACE_POINTS being defined only in module/zfs/trace.c. When ZFS + * source files include the general tracing headers, e.g. + * module/zfs/vdev_removal.c including trace_vdev.h, DTRACE_PROBE calls + * are mapped to stub functions calls and prototypes for those calls are + * declared via DEFINE_DTRACE_PROBE*. Only module/zfs/trace.c defines + * CREATE_TRACE_POINTS. That is followed by includes of all the general + * tracing headers thereby defining all stub functions in one place via + * the DEFINE_DTRACE_PROBE macros. 
+ * + * When adding new DTRACE_PROBEs to zfs source, both a tracepoint event + * class definition and a DEFINE_DTRACE_PROBE definition are needed to + * avoid undefined function errors. + */ + +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#if !defined(_TRACE_ZFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ZFS_H + +#include +#include + +/* + * DTRACE_PROBE with 0 arguments is not currently available with + * tracepoint events + */ +#define DTRACE_PROBE(name) \ + ((void)0) + +#define DTRACE_PROBE1(name, t1, arg1) \ + trace_zfs_##name((arg1)) + +#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \ + trace_zfs_##name((arg1), (arg2)) + +#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \ + trace_zfs_##name((arg1), (arg2), (arg3)) + +#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \ + trace_zfs_##name((arg1), (arg2), (arg3), (arg4)) + +#endif /* _TRACE_ZFS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace +#include + +#else /* HAVE_DECLARE_EVENT_CLASS */ + +#define DTRACE_PROBE(name) \ + trace_zfs_##name() + +#define DTRACE_PROBE1(name, t1, arg1) \ + trace_zfs_##name((uintptr_t)(arg1)) + +#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \ + trace_zfs_##name((uintptr_t)(arg1), (uintptr_t)(arg2)) + +#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \ + trace_zfs_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \ + (uintptr_t)(arg3)) + +#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \ + trace_zfs_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \ + (uintptr_t)(arg3), (uintptr_t)(arg4)) + +#define PROTO_DTRACE_PROBE(name) \ + noinline void trace_zfs_##name(void) +#define PROTO_DTRACE_PROBE1(name) \ + noinline void trace_zfs_##name(uintptr_t) +#define PROTO_DTRACE_PROBE2(name) \ + noinline void trace_zfs_##name(uintptr_t, uintptr_t) +#define PROTO_DTRACE_PROBE3(name) \ + noinline void 
trace_zfs_##name(uintptr_t, uintptr_t, \ + uintptr_t) +#define PROTO_DTRACE_PROBE4(name) \ + noinline void trace_zfs_##name(uintptr_t, uintptr_t, \ + uintptr_t, uintptr_t) + +#if defined(CREATE_TRACE_POINTS) + +#define FUNC_DTRACE_PROBE(name) \ +PROTO_DTRACE_PROBE(name); \ +noinline void trace_zfs_##name(void) { } \ +EXPORT_SYMBOL(trace_zfs_##name) + +#define FUNC_DTRACE_PROBE1(name) \ +PROTO_DTRACE_PROBE1(name); \ +noinline void trace_zfs_##name(uintptr_t arg1) { } \ +EXPORT_SYMBOL(trace_zfs_##name) + +#define FUNC_DTRACE_PROBE2(name) \ +PROTO_DTRACE_PROBE2(name); \ +noinline void trace_zfs_##name(uintptr_t arg1, \ + uintptr_t arg2) { } \ +EXPORT_SYMBOL(trace_zfs_##name) + +#define FUNC_DTRACE_PROBE3(name) \ +PROTO_DTRACE_PROBE3(name); \ +noinline void trace_zfs_##name(uintptr_t arg1, \ + uintptr_t arg2, uintptr_t arg3) { } \ +EXPORT_SYMBOL(trace_zfs_##name) + +#define FUNC_DTRACE_PROBE4(name) \ +PROTO_DTRACE_PROBE4(name); \ +noinline void trace_zfs_##name(uintptr_t arg1, \ + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) { } \ +EXPORT_SYMBOL(trace_zfs_##name) + +#undef DEFINE_DTRACE_PROBE +#define DEFINE_DTRACE_PROBE(name) FUNC_DTRACE_PROBE(name) + +#undef DEFINE_DTRACE_PROBE1 +#define DEFINE_DTRACE_PROBE1(name) FUNC_DTRACE_PROBE1(name) + +#undef DEFINE_DTRACE_PROBE2 +#define DEFINE_DTRACE_PROBE2(name) FUNC_DTRACE_PROBE2(name) + +#undef DEFINE_DTRACE_PROBE3 +#define DEFINE_DTRACE_PROBE3(name) FUNC_DTRACE_PROBE3(name) + +#undef DEFINE_DTRACE_PROBE4 +#define DEFINE_DTRACE_PROBE4(name) FUNC_DTRACE_PROBE4(name) + +#else /* CREATE_TRACE_POINTS */ + +#define DEFINE_DTRACE_PROBE(name) PROTO_DTRACE_PROBE(name) +#define DEFINE_DTRACE_PROBE1(name) PROTO_DTRACE_PROBE1(name) +#define DEFINE_DTRACE_PROBE2(name) PROTO_DTRACE_PROBE2(name) +#define DEFINE_DTRACE_PROBE3(name) PROTO_DTRACE_PROBE3(name) +#define DEFINE_DTRACE_PROBE4(name) PROTO_DTRACE_PROBE4(name) + +#endif /* CREATE_TRACE_POINTS */ +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git 
a/include/os/linux/spl/sys/trace_spl.h b/include/os/linux/spl/sys/trace_spl.h new file mode 100644 index 0000000000..bffd91d912 --- /dev/null +++ b/include/os/linux/spl/sys/trace_spl.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _OS_LINUX_SPL_TRACE_H +#define _OS_LINUX_SPL_TRACE_H + +#include + +#include +#include + +#endif diff --git a/include/os/linux/spl/sys/trace_taskq.h b/include/os/linux/spl/sys/trace_taskq.h new file mode 100644 index 0000000000..dbbb3c4c79 --- /dev/null +++ b/include/os/linux/spl/sys/trace_taskq.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_taskq + +#if !defined(_TRACE_TASKQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASKQ_H + +#include +#include + +/* + * Generic support for single argument tracepoints of the form: + * + * DTRACE_PROBE1(..., + * taskq_ent_t *, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_taskq_ent_class, + TP_PROTO(taskq_ent_t *taskq_ent), + TP_ARGS(taskq_ent), + TP_STRUCT__entry( + __field(taskq_ent_t *, taskq_ent) + ), + TP_fast_assign( + __entry->taskq_ent = taskq_ent; + ), + TP_printk("taskq_ent %p", __entry->taskq_ent) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_TASKQ_EVENT(name) \ +DEFINE_EVENT(zfs_taskq_ent_class, name, \ + TP_PROTO(taskq_ent_t *taskq_ent), \ + TP_ARGS(taskq_ent)) +/* END CSTYLED */ +DEFINE_TASKQ_EVENT(zfs_taskq_ent__birth); +DEFINE_TASKQ_EVENT(zfs_taskq_ent__start); +DEFINE_TASKQ_EVENT(zfs_taskq_ent__finish); + +#endif /* _TRACE_TASKQ_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_taskq +#include + +#else + +/* + * When tracepoints are not available, a DEFINE_DTRACE_PROBE* macro is + * needed for each DTRACE_PROBE. These will be used to generate stub + * tracing functions and prototypes for those functions. See + * include/os/linux/spl/sys/trace.h. 
+ */ + +DEFINE_DTRACE_PROBE1(taskq_ent__birth); +DEFINE_DTRACE_PROBE1(taskq_ent__start); +DEFINE_DTRACE_PROBE1(taskq_ent__finish); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/spl/sys/tsd.h b/include/os/linux/spl/sys/tsd.h similarity index 96% rename from include/spl/sys/tsd.h rename to include/os/linux/spl/sys/tsd.h index 39a291bf3d..8cdb9e4ffe 100644 --- a/include/spl/sys/tsd.h +++ b/include/os/linux/spl/sys/tsd.h @@ -5,7 +5,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/types.h b/include/os/linux/spl/sys/types.h similarity index 97% rename from include/spl/sys/types.h rename to include/os/linux/spl/sys/types.h index 719a44646e..b44c945187 100644 --- a/include/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/types32.h b/include/os/linux/spl/sys/types32.h similarity index 96% rename from include/spl/sys/types32.h rename to include/os/linux/spl/sys/types32.h index c60ba8c970..cb62c75e5a 100644 --- a/include/spl/sys/types32.h +++ b/include/os/linux/spl/sys/types32.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h new file mode 100644 index 0000000000..66af2b0b53 --- /dev/null +++ b/include/os/linux/spl/sys/uio.h @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ + +#ifndef _SPL_UIO_H +#define _SPL_UIO_H + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct iovec iovec_t; + +typedef enum zfs_uio_rw { + UIO_READ = 0, + UIO_WRITE = 1, +} zfs_uio_rw_t; + +typedef enum zfs_uio_seg { + UIO_USERSPACE = 0, + UIO_SYSSPACE = 1, + UIO_BVEC = 2, +#if defined(HAVE_VFS_IOV_ITER) + UIO_ITER = 3, +#endif +} zfs_uio_seg_t; + +typedef struct zfs_uio { + union { + const struct iovec *uio_iov; + const struct bio_vec *uio_bvec; +#if defined(HAVE_VFS_IOV_ITER) + struct iov_iter *uio_iter; +#endif + }; + int uio_iovcnt; + offset_t uio_loffset; + zfs_uio_seg_t uio_segflg; + boolean_t uio_fault_disable; + uint16_t uio_fmode; + uint16_t uio_extflg; + ssize_t uio_resid; + size_t uio_skip; +} zfs_uio_t; + +#define zfs_uio_segflg(u) (u)->uio_segflg +#define zfs_uio_offset(u) (u)->uio_loffset +#define zfs_uio_resid(u) (u)->uio_resid +#define zfs_uio_iovcnt(u) (u)->uio_iovcnt +#define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len +#define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base +#define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set +#define zfs_uio_rlimit_fsize(z, u) (0) +#define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u)) + +extern int zfs_uio_prefaultpages(ssize_t, zfs_uio_t *); + +static inline void +zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) +{ + uio->uio_loffset = off; +} + +static inline void +zfs_uio_advance(zfs_uio_t *uio, size_t size) +{ + uio->uio_resid -= size; + uio->uio_loffset += size; +} + +static inline void +zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, + unsigned long nr_segs, offset_t offset, zfs_uio_seg_t seg, ssize_t resid, + size_t skip) +{ + ASSERT(seg == UIO_USERSPACE || seg == UIO_SYSSPACE); + + uio->uio_iov = iov; + uio->uio_iovcnt = nr_segs; + uio->uio_loffset = offset; + uio->uio_segflg = seg; + uio->uio_fault_disable = B_FALSE; + uio->uio_fmode = 0; + uio->uio_extflg = 0; + uio->uio_resid = resid; + 
uio->uio_skip = skip; +} + +static inline void +zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio) +{ + uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; + uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); + uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; + uio->uio_segflg = UIO_BVEC; + uio->uio_fault_disable = B_FALSE; + uio->uio_fmode = 0; + uio->uio_extflg = 0; + uio->uio_resid = BIO_BI_SIZE(bio); + uio->uio_skip = BIO_BI_SKIP(bio); +} + +#if defined(HAVE_VFS_IOV_ITER) +static inline void +zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, + ssize_t resid, size_t skip) +{ + uio->uio_iter = iter; + uio->uio_iovcnt = iter->nr_segs; + uio->uio_loffset = offset; + uio->uio_segflg = UIO_ITER; + uio->uio_fault_disable = B_FALSE; + uio->uio_fmode = 0; + uio->uio_extflg = 0; + uio->uio_resid = resid; + uio->uio_skip = skip; +} +#endif + +#endif /* SPL_UIO_H */ diff --git a/include/spl/sys/user.h b/include/os/linux/spl/sys/user.h similarity index 96% rename from include/spl/sys/user.h rename to include/os/linux/spl/sys/user.h index b12cb240e3..13a2edf5f6 100644 --- a/include/spl/sys/user.h +++ b/include/os/linux/spl/sys/user.h @@ -4,7 +4,6 @@ * Written by Richard Yao . * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/vfs.h b/include/os/linux/spl/sys/vfs.h similarity index 96% rename from include/spl/sys/vfs.h rename to include/os/linux/spl/sys/vfs.h index 0d5e1d51d7..488f1827ec 100644 --- a/include/spl/sys/vfs.h +++ b/include/os/linux/spl/sys/vfs.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/spl/sys/vmem.h b/include/os/linux/spl/sys/vmem.h similarity index 94% rename from include/spl/sys/vmem.h rename to include/os/linux/spl/sys/vmem.h index a9b12eeb96..e77af2a7a4 100644 --- a/include/spl/sys/vmem.h +++ b/include/os/linux/spl/sys/vmem.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -31,12 +30,6 @@ typedef struct vmem { } vmem_t; -extern vmem_t *heap_arena; -extern vmem_t *zio_alloc_arena; -extern vmem_t *zio_arena; - -extern size_t vmem_size(vmem_t *vmp, int typemask); - /* * Memory allocation interfaces */ @@ -97,7 +90,6 @@ extern size_t vmem_size(vmem_t *vmp, int typemask); #define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__) #define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) #define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) -#define vmem_qcache_reap(ptr) ((void)0) extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); diff --git a/include/spl/sys/vmsystm.h b/include/os/linux/spl/sys/vmsystm.h similarity index 91% rename from include/spl/sys/vmsystm.h rename to include/os/linux/spl/sys/vmsystm.h index 5807d960ad..b3f121ecf0 100644 --- a/include/spl/sys/vmsystm.h +++ b/include/os/linux/spl/sys/vmsystm.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -47,10 +46,6 @@ #define membar_producer() smp_wmb() #define physmem zfs_totalram_pages -#define freemem (nr_free_pages() + \ - global_page_state(NR_INACTIVE_FILE) + \ - global_page_state(NR_INACTIVE_ANON) + \ - global_page_state(NR_SLAB_RECLAIMABLE)) #define xcopyin(from, to, size) copy_from_user(to, from, size) #define xcopyout(from, to, size) copy_to_user(to, from, size) diff --git a/include/spl/sys/vnode.h b/include/os/linux/spl/sys/vnode.h similarity index 51% rename from include/spl/sys/vnode.h rename to include/os/linux/spl/sys/vnode.h index 71278b08c8..64c2706502 100644 --- a/include/spl/sys/vnode.h +++ b/include/os/linux/spl/sys/vnode.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -51,23 +50,14 @@ #define O_DSYNC O_SYNC #endif -#define FREAD 1 -#define FWRITE 2 -#define FCREAT O_CREAT -#define FTRUNC O_TRUNC -#define FOFFMAX O_LARGEFILE -#define FSYNC O_SYNC -#define FDSYNC O_DSYNC -#define FEXCL O_EXCL -#define FDIRECT O_DIRECT -#define FAPPEND O_APPEND - -#define FNODSYNC 0x10000 /* fsync pseudo flag */ -#define FNOFOLLOW 0x20000 /* don't follow symlinks */ - #define F_FREESP 11 /* Free file space */ +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +#define F_SEEK_DATA SEEK_DATA +#define F_SEEK_HOLE SEEK_HOLE +#endif + /* * The vnode AT_ flags are mapped to the Linux ATTR_* flags. * This allows them to be used safely with an iattr structure. 
@@ -102,23 +92,7 @@ #define CREATE_XATTR_DIR 0x04 #define ATTR_NOACLCHECK 0x20 -typedef enum vtype { - VNON = 0, - VREG = 1, - VDIR = 2, - VBLK = 3, - VCHR = 4, - VLNK = 5, - VFIFO = 6, - VDOOR = 7, - VPROC = 8, - VSOCK = 9, - VPORT = 10, - VBAD = 11 -} vtype_t; - typedef struct vattr { - enum vtype va_type; /* vnode type */ uint32_t va_mask; /* attribute bit-mask */ ushort_t va_mode; /* acc mode */ uid_t va_uid; /* owner uid */ @@ -133,71 +107,6 @@ typedef struct vattr { dev_t va_rdev; /* dev */ uint64_t va_nblocks; /* space used */ uint32_t va_blksize; /* block size */ - uint32_t va_seq; /* sequence */ struct dentry *va_dentry; /* dentry to wire */ } vattr_t; - -typedef struct vnode { - struct file *v_file; - kmutex_t v_lock; /* protects vnode fields */ - uint_t v_flag; /* vnode flags (see below) */ - uint_t v_count; /* reference count */ - void *v_data; /* private data for fs */ - struct vfs *v_vfsp; /* ptr to containing VFS */ - struct stdata *v_stream; /* associated stream */ - enum vtype v_type; /* vnode type */ - dev_t v_rdev; /* device (VCHR, VBLK) */ - gfp_t v_gfp_mask; /* original mapping gfp mask */ -} vnode_t; - -typedef struct vn_file { - int f_fd; /* linux fd for lookup */ - struct task_struct *f_task; /* linux task this fd belongs to */ - struct file *f_file; /* linux file struct */ - atomic_t f_ref; /* ref count */ - kmutex_t f_lock; /* struct lock */ - loff_t f_offset; /* offset */ - vnode_t *f_vnode; /* vnode */ - struct list_head f_list; /* list referenced file_t's */ -} file_t; - -extern vnode_t *vn_alloc(int flag); -void vn_free(vnode_t *vp); -extern vtype_t vn_mode_to_vtype(mode_t); -extern mode_t vn_vtype_to_mode(vtype_t); -extern int vn_open(const char *path, uio_seg_t seg, int flags, int mode, - vnode_t **vpp, int x1, void *x2); -extern int vn_openat(const char *path, uio_seg_t seg, int flags, int mode, - vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd); -extern int vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, - offset_t 
off, uio_seg_t seg, int x1, rlim64_t x2, - void *x3, ssize_t *residp); -extern int vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4); -extern int vn_seek(vnode_t *vp, offset_t o, offset_t *op, void *ct); - -extern int vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4); -extern int vn_fsync(vnode_t *vp, int flags, void *x3, void *x4); -extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, - offset_t offset, void *x6, void *x7); -extern file_t *vn_getf(int fd); -extern void vn_releasef(int fd); -extern void vn_areleasef(int fd, uf_info_t *fip); -extern int vn_set_pwd(const char *filename); - -int spl_vn_init(void); -void spl_vn_fini(void); - -#define VOP_CLOSE vn_close -#define VOP_SEEK vn_seek -#define VOP_GETATTR vn_getattr -#define VOP_FSYNC vn_fsync -#define VOP_SPACE vn_space -#define VOP_PUTPAGE(vp, o, s, f, x1, x2) ((void)0) -#define vn_is_readonly(vp) 0 -#define getf vn_getf -#define releasef vn_releasef -#define areleasef vn_areleasef - -extern vnode_t *rootdir; - #endif /* SPL_VNODE_H */ diff --git a/include/spl/sys/wait.h b/include/os/linux/spl/sys/wait.h similarity index 97% rename from include/spl/sys/wait.h rename to include/os/linux/spl/sys/wait.h index 5311ff8b97..65cd83e5ef 100644 --- a/include/spl/sys/wait.h +++ b/include/os/linux/spl/sys/wait.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/os/linux/spl/sys/wmsum.h b/include/os/linux/spl/sys/wmsum.h new file mode 100644 index 0000000000..0871bd6950 --- /dev/null +++ b/include/os/linux/spl/sys/wmsum.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * wmsum counters are a reduced version of aggsum counters, optimized for + * write-mostly scenarios. They do not provide optimized read functions, + * but instead allow a much cheaper add function. The primary usage is + * infrequently read statistic counters, not requiring exact precision. + * + * The Linux implementation is directly mapped into percpu_counter KPI. + */ + +#ifndef _SYS_WMSUM_H +#define _SYS_WMSUM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct percpu_counter wmsum_t; + +static inline void +wmsum_init(wmsum_t *ws, uint64_t value) +{ + +#ifdef HAVE_PERCPU_COUNTER_INIT_WITH_GFP + percpu_counter_init(ws, value, GFP_KERNEL); +#else + percpu_counter_init(ws, value); +#endif +} + +static inline void +wmsum_fini(wmsum_t *ws) +{ + + percpu_counter_destroy(ws); +} + +static inline uint64_t +wmsum_value(wmsum_t *ws) +{ + + return (percpu_counter_sum(ws)); +} + +static inline void +wmsum_add(wmsum_t *ws, int64_t delta) +{ + +#ifdef HAVE_PERCPU_COUNTER_ADD_BATCH + percpu_counter_add_batch(ws, delta, INT_MAX / 2); +#else + __percpu_counter_add(ws, delta, INT_MAX / 2); +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_WMSUM_H */ diff --git a/include/spl/sys/zmod.h b/include/os/linux/spl/sys/zmod.h similarity index 89% rename from include/spl/sys/zmod.h rename to include/os/linux/spl/sys/zmod.h index 95c1a3ed78..8d27b62f47 100644 --- a/include/spl/sys/zmod.h +++ b/include/os/linux/spl/sys/zmod.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -59,14 +58,6 @@ #include #include -#ifdef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE -#define spl_zlib_deflate_workspacesize(wb, ml) \ - zlib_deflate_workspacesize(wb, ml) -#else -#define spl_zlib_deflate_workspacesize(wb, ml) \ - zlib_deflate_workspacesize() -#endif /* HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ - extern int z_compress_level(void *dest, size_t *destLen, const void *source, size_t sourceLen, int level); extern int z_uncompress(void *dest, size_t *destLen, const void *source, diff --git a/include/spl/sys/zone.h b/include/os/linux/spl/sys/zone.h similarity index 96% rename from include/spl/sys/zone.h rename to include/os/linux/spl/sys/zone.h index b2efd13b8e..00e30f690c 100644 --- a/include/spl/sys/zone.h +++ b/include/os/linux/spl/sys/zone.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/include/os/linux/zfs/Makefile.am b/include/os/linux/zfs/Makefile.am new file mode 100644 index 0000000000..081839c48c --- /dev/null +++ b/include/os/linux/zfs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/include/os/linux/zfs/sys/Makefile.am b/include/os/linux/zfs/sys/Makefile.am new file mode 100644 index 0000000000..a075db476e --- /dev/null +++ b/include/os/linux/zfs/sys/Makefile.am @@ -0,0 +1,31 @@ +KERNEL_H = \ + policy.h \ + sha2.h \ + trace_acl.h \ + trace_arc.h \ + trace_common.h \ + trace_zfs.h \ + trace_dbgmsg.h \ + trace_dbuf.h \ + trace_dmu.h \ + trace_dnode.h \ + trace_multilist.h \ + trace_rrwlock.h \ + trace_txg.h \ + trace_vdev.h \ + trace_zil.h \ + trace_zio.h \ + trace_zrlock.h \ + zfs_bootenv_os.h \ + zfs_context_os.h \ + zfs_ctldir.h \ + zfs_dir.h \ + zfs_vfsops_os.h \ + zfs_vnops_os.h \ + zfs_znode_impl.h \ + zpl.h + +if CONFIG_KERNEL +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys +kernel_HEADERS = $(KERNEL_H) +endif diff --git a/include/sys/policy.h b/include/os/linux/zfs/sys/policy.h similarity index 91% rename from include/sys/policy.h rename to include/os/linux/zfs/sys/policy.h index 23d7d4db77..61afc37655 100644 --- a/include/sys/policy.h +++ b/include/os/linux/zfs/sys/policy.h @@ -35,6 +35,8 @@ #include #include +struct znode; + int secpolicy_nfs(const cred_t *); int secpolicy_sys_config(const cred_t *, boolean_t); int secpolicy_vnode_access2(const cred_t *, struct inode *, @@ -44,14 +46,15 @@ int secpolicy_vnode_chown(const cred_t *, uid_t); int secpolicy_vnode_create_gid(const cred_t *); int secpolicy_vnode_remove(const cred_t *); int secpolicy_vnode_setdac(const cred_t *, uid_t); -int secpolicy_vnode_setid_retain(const cred_t *, boolean_t); +int secpolicy_vnode_setid_retain(struct znode *, const cred_t *, boolean_t); int secpolicy_vnode_setids_setgids(const cred_t *, 
gid_t); int secpolicy_zinject(const cred_t *); int secpolicy_zfs(const cred_t *); +int secpolicy_zfs_proc(const cred_t *, proc_t *); void secpolicy_setid_clear(vattr_t *, cred_t *); int secpolicy_setid_setsticky_clear(struct inode *, vattr_t *, const vattr_t *, cred_t *); -int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); +int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, mode_t); int secpolicy_vnode_setattr(cred_t *, struct inode *, struct vattr *, const struct vattr *, int, int (void *, int, cred_t *), void *); int secpolicy_basic_link(const cred_t *); diff --git a/include/sys/sha2.h b/include/os/linux/zfs/sys/sha2.h similarity index 98% rename from include/sys/sha2.h rename to include/os/linux/zfs/sys/sha2.h index 9039835f18..4dd966b6ca 100644 --- a/include/sys/sha2.h +++ b/include/os/linux/zfs/sys/sha2.h @@ -27,11 +27,7 @@ #ifndef _SYS_SHA2_H #define _SYS_SHA2_H -#ifdef _KERNEL #include /* for uint_* */ -#else -#include -#endif #ifdef __cplusplus extern "C" { diff --git a/include/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h similarity index 93% rename from include/sys/trace_acl.h rename to include/os/linux/zfs/sys/trace_acl.h index 610bbe29c2..4707fc6f41 100644 --- a/include/sys/trace_acl.h +++ b/include/os/linux/zfs/sys/trace_acl.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -51,7 +52,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint8_t, z_unlinked) __field(uint8_t, z_atime_dirty) __field(uint8_t, z_zn_prefetch) - __field(uint8_t, z_moved) __field(uint_t, z_blksz) __field(uint_t, z_seq) __field(uint64_t, z_mapcnt) @@ -85,7 +85,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_unlinked = zn->z_unlinked; __entry->z_atime_dirty = zn->z_atime_dirty; __entry->z_zn_prefetch = zn->z_zn_prefetch; - __entry->z_moved = zn->z_moved; __entry->z_blksz = zn->z_blksz; __entry->z_seq = 
zn->z_seq; __entry->z_mapcnt = zn->z_mapcnt; @@ -115,7 +114,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->mask_matched = mask_matched; ), TP_printk("zn { id %llu unlinked %u atime_dirty %u " - "zn_prefetch %u moved %u blksz %u seq %u " + "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " "sync_cnt %u mode 0x%x is_sa %d " "is_mapped %d is_ctldir %d is_stale %d inode { " @@ -123,7 +122,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, "blkbits %u bytes %u mode 0x%x generation %x } } " "ace { type %u flags %u access_mask %u } mask_matched %u", __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty, - __entry->z_zn_prefetch, __entry->z_moved, __entry->z_blksz, + __entry->z_zn_prefetch, __entry->z_blksz, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped, @@ -153,4 +152,11 @@ DEFINE_ACE_EVENT(zfs_zfs__ace__allows); #define TRACE_INCLUDE_FILE trace_acl #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE3(zfs__ace__denies); +DEFINE_DTRACE_PROBE3(zfs__ace__allows); +DEFINE_DTRACE_PROBE(zfs__fastpath__execute__access__miss); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h similarity index 86% rename from include/sys/trace_arc.h rename to include/os/linux/zfs/sys/trace_arc.h index c40b58e32d..d3410bc07a 100644 --- a/include/sys/trace_arc.h +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -21,7 +21,8 @@ #include -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -79,7 +80,7 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; - __entry->hdr_l2_hits = 
ab->b_l1hdr.b_l2_hits; + __entry->hdr_l2_hits = ab->b_l2hdr.b_hits; __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " @@ -237,7 +238,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; - __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits; + __entry->hdr_l2_hits = hdr->b_l2hdr.b_hits; __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; @@ -353,6 +354,41 @@ DEFINE_EVENT(zfs_l2arc_evict_class, name, \ /* END CSTYLED */ DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict); +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * uint64_t, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class, + TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), + TP_ARGS(amount, arc_evict_count, aew_count), + TP_STRUCT__entry( + __field(uint64_t, amount) + __field(uint64_t, arc_evict_count) + __field(uint64_t, aew_count) + ), + TP_fast_assign( + __entry->amount = amount; + __entry->arc_evict_count = arc_evict_count; + __entry->aew_count = aew_count; + ), + TP_printk("amount %llu arc_evict_count %llu aew_count %llu", + __entry->amount, __entry->arc_evict_count, __entry->aew_count) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \ +DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \ + TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \ + TP_ARGS(amount, arc_evict_count, aew_count)) +/* END CSTYLED */ +DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction); + #endif /* _TRACE_ARC_H */ #undef TRACE_INCLUDE_PATH @@ -361,4 +397,23 @@ DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict); #define TRACE_INCLUDE_FILE trace_arc #include 
-#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE1(arc__hit); +DEFINE_DTRACE_PROBE1(arc__evict); +DEFINE_DTRACE_PROBE1(arc__delete); +DEFINE_DTRACE_PROBE1(new_state__mru); +DEFINE_DTRACE_PROBE1(new_state__mfu); +DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync); +DEFINE_DTRACE_PROBE1(arc__demand__hit__predictive__prefetch); +DEFINE_DTRACE_PROBE1(l2arc__hit); +DEFINE_DTRACE_PROBE1(l2arc__miss); +DEFINE_DTRACE_PROBE2(l2arc__read); +DEFINE_DTRACE_PROBE2(l2arc__write); +DEFINE_DTRACE_PROBE2(l2arc__iodone); +DEFINE_DTRACE_PROBE3(arc__wait__for__eviction); +DEFINE_DTRACE_PROBE4(arc__miss); +DEFINE_DTRACE_PROBE4(l2arc__evict); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h similarity index 100% rename from include/sys/trace_common.h rename to include/os/linux/zfs/sys/trace_common.h diff --git a/include/sys/trace_dbgmsg.h b/include/os/linux/zfs/sys/trace_dbgmsg.h similarity index 76% rename from include/sys/trace_dbgmsg.h rename to include/os/linux/zfs/sys/trace_dbgmsg.h index a4aab1e63f..513918d004 100644 --- a/include/sys/trace_dbgmsg.h +++ b/include/os/linux/zfs/sys/trace_dbgmsg.h @@ -19,10 +19,19 @@ * CDDL HEADER END */ -/* Do not include this file directly. Please use instead. 
*/ -#ifndef _SYS_TRACE_DBGMSG_INDIRECT -#error "trace_dbgmsg.h included directly" -#endif +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_dbgmsg + +#if !defined(_TRACE_DBGMSG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DBGMSG_H + +#include /* * This file defines tracepoint events for use by the dbgmsg(), @@ -63,3 +72,18 @@ DEFINE_EVENT(zfs_dprintf_class, name, \ TP_ARGS(msg)) /* END CSTYLED */ DEFINE_DPRINTF_EVENT(zfs_zfs__dprintf); + +#endif /* _TRACE_DBGMSG_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_dbgmsg +#include + +#else + +DEFINE_DTRACE_PROBE1(zfs__dprintf); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_dbuf.h b/include/os/linux/zfs/sys/trace_dbuf.h similarity index 85% rename from include/sys/trace_dbuf.h rename to include/os/linux/zfs/sys/trace_dbuf.h index e97b611377..bd7d791a46 100644 --- a/include/sys/trace_dbuf.h +++ b/include/os/linux/zfs/sys/trace_dbuf.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -106,6 +107,14 @@ DECLARE_EVENT_CLASS(zfs_dbuf_class, TP_fast_assign(DBUF_TP_FAST_ASSIGN), TP_printk("%s", __get_str(msg)) ); + +DECLARE_EVENT_CLASS(zfs_dbuf_state_class, + TP_PROTO(dmu_buf_impl_t *db, const char *why), + TP_ARGS(db, why), + TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), + TP_fast_assign(DBUF_TP_FAST_ASSIGN), + TP_printk("%s", __get_str(msg)) +); /* END CSTYLED */ /* BEGIN CSTYLED */ @@ -116,6 +125,14 @@ DEFINE_EVENT(zfs_dbuf_class, name, \ /* END CSTYLED */ DEFINE_DBUF_EVENT(zfs_blocked__read); +/* BEGIN CSTYLED */ +#define DEFINE_DBUF_STATE_EVENT(name) \ +DEFINE_EVENT(zfs_dbuf_state_class, name, \ + TP_PROTO(dmu_buf_impl_t *db, 
const char *why), \ + TP_ARGS(db, why)) +/* END CSTYLED */ +DEFINE_DBUF_STATE_EVENT(zfs_dbuf__state_change); + /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_dbuf_evict_one_class, TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), @@ -142,4 +159,11 @@ DEFINE_DBUF_EVICT_ONE_EVENT(zfs_dbuf__evict__one); #define TRACE_INCLUDE_FILE trace_dbuf #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE2(blocked__read); +DEFINE_DTRACE_PROBE2(dbuf__evict__one); +DEFINE_DTRACE_PROBE2(dbuf__state_change); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_dmu.h b/include/os/linux/zfs/sys/trace_dmu.h similarity index 95% rename from include/sys/trace_dmu.h rename to include/os/linux/zfs/sys/trace_dmu.h index 24e57f5146..3c64a370f8 100644 --- a/include/sys/trace_dmu.h +++ b/include/os/linux/zfs/sys/trace_dmu.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -126,4 +127,10 @@ DEFINE_FREE_LONG_RANGE_EVENT(zfs_free__long__range); #define TRACE_INCLUDE_FILE trace_dmu #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE3(delay__mintime); +DEFINE_DTRACE_PROBE3(free__long__range); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_dnode.h b/include/os/linux/zfs/sys/trace_dnode.h similarity index 96% rename from include/sys/trace_dnode.h rename to include/os/linux/zfs/sys/trace_dnode.h index 7196a497d5..27ad6cba16 100644 --- a/include/sys/trace_dnode.h +++ b/include/os/linux/zfs/sys/trace_dnode.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -120,4 +121,9 @@ DEFINE_DNODE_MOVE_EVENT(zfs_dnode__move); #define 
TRACE_INCLUDE_FILE trace_dnode #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE3(dnode__move); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_multilist.h b/include/os/linux/zfs/sys/trace_multilist.h similarity index 92% rename from include/sys/trace_multilist.h rename to include/os/linux/zfs/sys/trace_multilist.h index ed0b38a3f3..fe68d5296f 100644 --- a/include/sys/trace_multilist.h +++ b/include/os/linux/zfs/sys/trace_multilist.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -79,4 +80,10 @@ DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove); #define TRACE_INCLUDE_FILE trace_multilist #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE3(multilist__insert); +DEFINE_DTRACE_PROBE3(multilist__remove); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_rrwlock.h b/include/os/linux/zfs/sys/trace_rrwlock.h new file mode 100644 index 0000000000..4c74d62573 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_rrwlock.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#else + +DEFINE_DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +DEFINE_DTRACE_PROBE(zfs__rrwfastpath__exitmiss); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_txg.h b/include/os/linux/zfs/sys/trace_txg.h similarity index 85% rename from include/sys/trace_txg.h rename to include/os/linux/zfs/sys/trace_txg.h index f85c3f9ef7..23d5d358bc 100644 --- a/include/sys/trace_txg.h +++ b/include/os/linux/zfs/sys/trace_txg.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -75,4 +76,14 @@ DEFINE_TXG_EVENT(zfs_txg__quiesced); #define TRACE_INCLUDE_FILE trace_txg #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE2(dsl_pool_sync__done); +DEFINE_DTRACE_PROBE2(txg__quiescing); +DEFINE_DTRACE_PROBE2(txg__opened); +DEFINE_DTRACE_PROBE2(txg__syncing); +DEFINE_DTRACE_PROBE2(txg__synced); +DEFINE_DTRACE_PROBE2(txg__quiesced); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_vdev.h b/include/os/linux/zfs/sys/trace_vdev.h similarity index 82% rename from include/sys/trace_vdev.h rename to include/os/linux/zfs/sys/trace_vdev.h index d7af44c253..50711446ff 100644 --- a/include/sys/trace_vdev.h +++ b/include/os/linux/zfs/sys/trace_vdev.h @@ -19,7 +19,14 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +/* + * If tracepoints are available define dtrace_probe events for vdev + * related probes. 
Definitions in include/os/linux/spl/sys/trace.h + * will map DTRACE_PROBE* calls to tracepoints. + */ #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -116,4 +123,18 @@ DEFINE_REMOVE_FREE_EVENT_TXG(zfs_remove__free__inflight); #define TRACE_INCLUDE_FILE trace_vdev #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +/* + * When tracepoints are not available, a DEFINE_DTRACE_PROBE* macro is + * needed for each DTRACE_PROBE. These will be used to generate stub + * tracing functions and prototypes for those functions. See + * include/os/linux/spl/sys/trace.h. + */ + +DEFINE_DTRACE_PROBE3(remove__free__synced); +DEFINE_DTRACE_PROBE3(remove__free__unvisited); +DEFINE_DTRACE_PROBE4(remove__free__inflight); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_zfs.h b/include/os/linux/zfs/sys/trace_zfs.h new file mode 100644 index 0000000000..0e19f8d186 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_zfs.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _OS_LINUX_ZFS_TRACE_H +#define _OS_LINUX_ZFS_TRACE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif diff --git a/include/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h similarity index 96% rename from include/sys/trace_zil.h rename to include/os/linux/zfs/sys/trace_zil.h index ff16c8686c..526846e664 100644 --- a/include/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -218,4 +219,11 @@ DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); #define TRACE_INCLUDE_FILE trace_zil #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE2(zil__process__commit__itx); +DEFINE_DTRACE_PROBE2(zil__process__normal__itx); +DEFINE_DTRACE_PROBE2(zil__commit__io__error); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_zio.h b/include/os/linux/zfs/sys/trace_zio.h similarity index 90% rename from include/sys/trace_zio.h rename to include/os/linux/zfs/sys/trace_zio.h index af589b9dfa..8655e245c0 100644 --- a/include/sys/trace_zio.h +++ b/include/os/linux/zfs/sys/trace_zio.h @@ -21,7 +21,8 @@ #include -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -86,4 +87,11 @@ 
TRACE_EVENT(zfs_zio__delay__skip, #define TRACE_INCLUDE_FILE trace_zio #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE2(zio__delay__miss); +DEFINE_DTRACE_PROBE3(zio__delay__hit); +DEFINE_DTRACE_PROBE1(zio__delay__skip); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/sys/trace_zrlock.h b/include/os/linux/zfs/sys/trace_zrlock.h similarity index 93% rename from include/sys/trace_zrlock.h rename to include/os/linux/zfs/sys/trace_zrlock.h index fa330f2c19..23f9577ba1 100644 --- a/include/sys/trace_zrlock.h +++ b/include/os/linux/zfs/sys/trace_zrlock.h @@ -19,7 +19,8 @@ * CDDL HEADER END */ -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs @@ -85,4 +86,9 @@ DEFINE_ZRLOCK_EVENT(zfs_zrlock__reentry); #define TRACE_INCLUDE_FILE trace_zrlock #include -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ +#else + +DEFINE_DTRACE_PROBE3(zrlock__reentry); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/zfs_bootenv_os.h b/include/os/linux/zfs/sys/zfs_bootenv_os.h new file mode 100644 index 0000000000..7b2f083adc --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_bootenv_os.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2020 Toomas Soome + */ + +#ifndef _ZFS_BOOTENV_OS_H +#define _ZFS_BOOTENV_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define BOOTENV_OS BE_LINUX_VENDOR + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_BOOTENV_OS_H */ diff --git a/include/os/linux/zfs/sys/zfs_context_os.h b/include/os/linux/zfs/sys/zfs_context_os.h new file mode 100644 index 0000000000..9e42605582 --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_context_os.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef ZFS_CONTEXT_OS_H +#define ZFS_CONTEXT_OS_H + +#include +#include +#include +#include + +#if THREAD_SIZE >= 16384 +#define HAVE_LARGE_STACKS 1 +#endif + +#endif diff --git a/include/sys/zfs_ctldir.h b/include/os/linux/zfs/sys/zfs_ctldir.h similarity index 85% rename from include/sys/zfs_ctldir.h rename to include/os/linux/zfs/sys/zfs_ctldir.h index 51933bc4fe..beee34979b 100644 --- a/include/sys/zfs_ctldir.h +++ b/include/os/linux/zfs/sys/zfs_ctldir.h @@ -60,22 +60,22 @@ extern boolean_t zfsctl_is_snapdir(struct inode *ip); extern int zfsctl_fid(struct inode *ip, fid_t *fidp); /* zfsctl '.zfs' functions */ -extern int zfsctl_root_lookup(struct inode *dip, char *name, +extern int zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); /* zfsctl '.zfs/snapshot' functions */ -extern int zfsctl_snapdir_lookup(struct inode *dip, char *name, +extern int zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); -extern int zfsctl_snapdir_rename(struct inode *sdip, char *sname, - struct inode *tdip, char *tname, cred_t *cr, int flags); -extern int zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, - int flags); -extern int zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - struct inode **ipp, cred_t *cr, int flags); +extern int zfsctl_snapdir_rename(struct inode *sdip, const char *sname, + struct inode *tdip, const char *tname, cred_t *cr, int flags); +extern int zfsctl_snapdir_remove(struct inode *dip, const char *name, + cred_t *cr, int flags); +extern int zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, + vattr_t 
*vap, struct inode **ipp, cred_t *cr, int flags); extern int zfsctl_snapshot_mount(struct path *path, int flags); -extern int zfsctl_snapshot_unmount(char *snapname, int flags); +extern int zfsctl_snapshot_unmount(const char *snapname, int flags); extern int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay); extern int zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, diff --git a/include/sys/zfs_dir.h b/include/os/linux/zfs/sys/zfs_dir.h similarity index 92% rename from include/sys/zfs_dir.h rename to include/os/linux/zfs/sys/zfs_dir.h index bcd4ec2c1d..0f15e43452 100644 --- a/include/sys/zfs_dir.h +++ b/include/os/linux/zfs/sys/zfs_dir.h @@ -55,7 +55,7 @@ extern void zfs_dirent_unlock(zfs_dirlock_t *); extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, boolean_t *); -extern int zfs_dirlook(znode_t *, char *, struct inode **, int, int *, +extern int zfs_dirlook(znode_t *, char *, znode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, uint_t, znode_t **, zfs_acl_ids_t *); @@ -66,8 +66,8 @@ extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); extern void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs); extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); -extern int zfs_get_xattrdir(znode_t *, struct inode **, cred_t *, int); -extern int zfs_make_xattrdir(znode_t *, vattr_t *, struct inode **, cred_t *); +extern int zfs_get_xattrdir(znode_t *, znode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, znode_t **, cred_t *); #ifdef __cplusplus } diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h new file mode 100644 index 0000000000..8e03ae99a7 --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -0,0 +1,259 @@ +/* + * CDDL HEADER START + * + * 
The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfsvfs zfsvfs_t; +struct znode; + +/* + * This structure emulates the vfs_t from other platforms. Its purpose + * is to facilitate the handling of mount options and minimize structural + * differences between the platforms. 
+ */ +typedef struct vfs { + struct zfsvfs *vfs_data; + char *vfs_mntpoint; /* Primary mount point */ + uint64_t vfs_xattr; + boolean_t vfs_readonly; + boolean_t vfs_do_readonly; + boolean_t vfs_setuid; + boolean_t vfs_do_setuid; + boolean_t vfs_exec; + boolean_t vfs_do_exec; + boolean_t vfs_devices; + boolean_t vfs_do_devices; + boolean_t vfs_do_xattr; + boolean_t vfs_atime; + boolean_t vfs_do_atime; + boolean_t vfs_relatime; + boolean_t vfs_do_relatime; + boolean_t vfs_nbmand; + boolean_t vfs_do_nbmand; +} vfs_t; + +typedef struct zfs_mnt { + const char *mnt_osname; /* Objset name */ + char *mnt_data; /* Raw mount options */ +} zfs_mnt_t; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + struct super_block *z_sb; /* generic super_block */ + struct zfsvfs *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_flags; /* super_block flags */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + uint_t z_acl_type; /* type of ACL usable on this FS */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_relatime; /* enable relatime mount option */ + boolean_t z_unmounted; /* unmounted */ + rrmlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all znodes in the fs */ + uint64_t z_nr_znodes; /* number of znodes in the fs */ + unsigned long z_rollback_time; /* last online rollback time */ + unsigned long z_snap_defer_time; /* last snapshot unmount deferral */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + arc_prune_t *z_arc_prune; /* called by ARC to prune caches */ + struct inode *z_ctldir; /* .zfs directory inode */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_use_fuids; /* version allows fuids */ + boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allows system attributes */ + boolean_t z_xattr_sa; /* allow xattrs to be stored as SA */ + boolean_t z_draining; /* is true when drain is active */ + boolean_t z_drain_cancel; /* signal the unlinked drain to stop */ + uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + dataset_kstats_t z_kstat; /* fs kstats */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; + uint64_t z_userobjquota_obj; + uint64_t z_groupobjquota_obj; + uint64_t z_projectquota_obj; + uint64_t z_projectobjquota_obj; + uint64_t z_replay_eof; /* New end of file - replay only */ + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ + uint64_t z_hold_size; /* znode hold array size */ + avl_tree_t *z_hold_trees; /* znode hold trees */ + 
kmutex_t *z_hold_locks; /* znode hold locks */ + taskqid_t z_drain_task; /* task id for the unlink drain task */ +}; + +#define ZFS_TEARDOWN_INIT(zfsvfs) \ + rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE) + +#define ZFS_TEARDOWN_DESTROY(zfsvfs) \ + rrm_destroy(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ + rw_tryenter(&(zfsvfs)->z_teardown_lock, RW_READER) /* XXX: rw_ op on rrmlock_t? */ + +#define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ + rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag) + +#define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) /* XXX: tag not a parameter */ + +#define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ + RRM_READ_HELD(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ + RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_HELD(zfsvfs) \ + RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock) + +#define ZSB_XATTR 0x0001 /* Enable user xattrs */ + +/* + * Allow a maximum number of links. While ZFS does not internally limit + * this the inode->i_nlink member is defined as an unsigned int. To be + * safe we use 2^31-1 as the limit. + */ +#define ZFS_LINK_MAX ((1U << 31) - 1U) + +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. 
+ * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. 
+ */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern void zfs_init(void); +extern void zfs_fini(void); + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern void zfs_exit_fs(zfsvfs_t *zfsvfs); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, boolean_t readony, zfsvfs_t **zfvp); +extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); + +extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); +extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent); +extern void zfs_preumount(struct super_block *sb); +extern int zfs_umount(struct super_block *sb); +extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm); +extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp); +extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp); +extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, + int *objects); +extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, + uint64_t *val, char *setpoint); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h new file mode 100644 index 0000000000..47f91e4a6c --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common 
Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VNOPS_OS_H +#define _SYS_FS_ZFS_VNOPS_OS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr); +extern int zfs_close(struct inode *ip, int flag, cred_t *cr); +extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid); +extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, int flags, + cred_t *cr, int *direntflags, pathname_t *realpnp); +extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_tmpfile(struct inode *dip, vattr_t *vapzfs, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); +extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); +extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, + cred_t *cr, int flags); +extern int zfs_readdir(struct inode *ip, 
zpl_dir_context_t *ctx, cred_t *cr); +extern int zfs_getattr_fast(struct user_namespace *, struct inode *ip, + struct kstat *sp); +extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); +extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, + char *tnm, cred_t *cr, int flags); +extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, + char *link, znode_t **zpp, cred_t *cr, int flags); +extern int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr); +extern int zfs_link(znode_t *tdzp, znode_t *szp, + char *name, cred_t *cr, int flags); +extern void zfs_inactive(struct inode *ip); +extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr); +extern int zfs_fid(struct inode *ip, fid_t *fidp); +extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); +extern int zfs_putpage(struct inode *ip, struct page *pp, + struct writeback_control *wbc); +extern int zfs_dirty_inode(struct inode *ip, int flags); +extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, + size_t len, unsigned long vm_flags); +extern void zfs_zrele_async(znode_t *zp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VNOPS_OS_H */ diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h new file mode 100644 index 0000000000..de46fc8f2b --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -0,0 +1,183 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SYS_ZFS_ZNODE_IMPL_H +#define _SYS_ZFS_ZNODE_IMPL_H + +#ifndef _KERNEL +#error "no user serviceable parts within" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZNODE_OS_FIELDS \ + inode_timespec_t z_btime; /* creation/birth time (cached) */ \ + struct inode z_inode; + +/* + * Convert between znode pointers and inode pointers + */ +#define ZTOI(znode) (&((znode)->z_inode)) +#define ITOZ(inode) (container_of((inode), znode_t, z_inode)) +#define ZTOZSB(znode) ((zfsvfs_t *)(ZTOI(znode)->i_sb->s_fs_info)) +#define ITOZSB(inode) ((zfsvfs_t *)((inode)->i_sb->s_fs_info)) + +#define ZTOTYPE(zp) (ZTOI(zp)->i_mode) +#define ZTOGID(zp) (ZTOI(zp)->i_gid) +#define ZTOUID(zp) (ZTOI(zp)->i_uid) +#define ZTONLNK(zp) (ZTOI(zp)->i_nlink) + +#define Z_ISBLK(type) S_ISBLK(type) +#define Z_ISCHR(type) S_ISCHR(type) +#define Z_ISLNK(type) S_ISLNK(type) +#define Z_ISDEV(type) (S_ISCHR(type) || S_ISBLK(type) || S_ISFIFO(type)) +#define Z_ISDIR(type) S_ISDIR(type) + +#define zn_has_cached_data(zp) ((zp)->z_is_mapped) +#define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync) +#define zn_rlimit_fsize(zp, uio) (0) + +/* + * zhold() wraps igrab() on Linux, and igrab() may fail when the + * inode is in the process of being deleted. 
As zhold() must only be + * called when a ref already exists - so the inode cannot be + * mid-deletion - we VERIFY() this. + */ +#define zhold(zp) VERIFY3P(igrab(ZTOI((zp))), !=, NULL) +#define zrele(zp) iput(ZTOI((zp))) + +/* Called on entry to each ZFS inode and vfs operation. */ +#define ZFS_ENTER_ERROR(zfsvfs, error) \ +do { \ + ZFS_TEARDOWN_ENTER_READ(zfsvfs, FTAG); \ + if (unlikely((zfsvfs)->z_unmounted)) { \ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ + return (error); \ + } \ +} while (0) +#define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) +#define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, -EIO) + +/* Must be called before exiting the operation. */ +#define ZFS_EXIT(zfsvfs) \ +do { \ + zfs_exit_fs(zfsvfs); \ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ +} while (0) + +#define ZPL_EXIT(zfsvfs) \ +do { \ + rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ +} while (0) + +/* Verifies the znode is valid. */ +#define ZFS_VERIFY_ZP_ERROR(zp, error) \ +do { \ + if (unlikely((zp)->z_sa_hdl == NULL)) { \ + ZFS_EXIT(ZTOZSB(zp)); \ + return (error); \ + } \ +} while (0) +#define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) +#define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, -EIO) + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_MTX_SZ 64 +#define ZFS_OBJ_MTX_MAX (1024 * 1024) +#define ZFS_OBJ_HASH(zfsvfs, obj) ((obj) & ((zfsvfs->z_hold_size) - 1)) + +extern unsigned int zfs_object_mutex_size; + +/* + * Encode ZFS stored time values from a struct timespec / struct timespec64. + */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +do { \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} while (0) + +#if defined(HAVE_INODE_TIMESPEC64_TIMES) +/* + * Decode ZFS stored time values to a struct timespec64 + * 4.18 and newer kernels. 
+ */ +#define ZFS_TIME_DECODE(tp, stmp) \ +do { \ + (tp)->tv_sec = (time64_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} while (0) +#else +/* + * Decode ZFS stored time values to a struct timespec + * 4.17 and older kernels. + */ +#define ZFS_TIME_DECODE(tp, stmp) \ +do { \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} while (0) +#endif /* HAVE_INODE_TIMESPEC64_TIMES */ + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) + +struct znode; + +extern int zfs_sync(struct super_block *, int, cred_t *); +extern int zfs_inode_alloc(struct super_block *, struct inode **ip); +extern void zfs_inode_destroy(struct inode *); +extern void zfs_mark_inode_dirty(struct inode *); +extern boolean_t zfs_relatime_need_update(const struct inode *); + +#if defined(HAVE_UIO_RW) +extern caddr_t zfs_map_page(page_t *, enum seg_rw); +extern void zfs_unmap_page(page_t *, caddr_t); +#endif /* HAVE_UIO_RW */ + +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_ZNODE_IMPL_H */ diff --git a/include/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h similarity index 76% rename from include/sys/zpl.h rename to include/os/linux/zfs/sys/zpl.h index 2766269f31..ff86e027bb 100644 --- a/include/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -39,26 +39,13 @@ /* zpl_inode.c */ extern void zpl_vap_init(vattr_t *vap, struct inode *dir, - zpl_umode_t mode, cred_t *cr); + umode_t mode, cred_t *cr); extern const struct inode_operations zpl_inode_operations; extern const struct inode_operations zpl_dir_inode_operations; extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; extern dentry_operations_t zpl_dentry_operations; - -/* zpl_file.c */ -extern ssize_t zpl_read_common(struct inode *ip, const char *buf, - size_t len, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr); -extern ssize_t zpl_write_common(struct 
inode *ip, const char *buf, - size_t len, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr); -#if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) -extern long zpl_fallocate_common(struct inode *ip, int mode, - loff_t offset, loff_t len); -#endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */ - extern const struct address_space_operations zpl_address_space_operations; extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; @@ -75,20 +62,19 @@ extern ssize_t zpl_xattr_list(struct dentry *dentry, char *buf, size_t size); extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr); #if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) +#if defined(HAVE_SET_ACL_USERNS) +extern int zpl_set_acl(struct user_namespace *userns, struct inode *ip, + struct posix_acl *acl, int type); +#else extern int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type); +#endif /* HAVE_SET_ACL_USERNS */ +#endif /* HAVE_SET_ACL */ +#if defined(HAVE_GET_ACL_RCU) +extern struct posix_acl *zpl_get_acl(struct inode *ip, int type, bool rcu); +#elif defined(HAVE_GET_ACL) extern struct posix_acl *zpl_get_acl(struct inode *ip, int type); -#if !defined(HAVE_GET_ACL) -#if defined(HAVE_CHECK_ACL_WITH_FLAGS) -extern int zpl_check_acl(struct inode *inode, int mask, unsigned int flags); -#elif defined(HAVE_CHECK_ACL) -extern int zpl_check_acl(struct inode *inode, int mask); -#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA) -extern int zpl_permission(struct inode *ip, int mask, struct nameidata *nd); -#elif defined(HAVE_PERMISSION) -extern int zpl_permission(struct inode *ip, int mask); -#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */ -#endif /* HAVE_GET_ACL */ - +#endif extern int zpl_init_acl(struct inode *ip, struct inode *dir); extern int zpl_chmod_acl(struct inode *ip); #else @@ -113,11 +99,7 @@ extern const struct inode_operations zpl_ops_root; extern 
const struct file_operations zpl_fops_snapdir; extern const struct inode_operations zpl_ops_snapdir; -#ifdef HAVE_AUTOMOUNT extern const struct dentry_operations zpl_dops_snapdirs; -#else -extern const struct inode_operations zpl_ops_snapdirs; -#endif /* HAVE_AUTOMOUNT */ extern const struct file_operations zpl_fops_shares; extern const struct inode_operations zpl_ops_shares; @@ -188,13 +170,32 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) } #endif /* HAVE_VFS_ITERATE */ -/* - * Linux 4.18, inode times converted from timespec to timespec64. - */ -#if defined(HAVE_INODE_TIMESPEC64_TIMES) -#define zpl_inode_timespec_trunc(ts, gran) timespec64_trunc(ts, gran) +#if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) +#define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) +#elif defined(HAVE_INODE_TIMESPEC64_TIMES) +#define zpl_inode_timestamp_truncate(ts, ip) \ + timespec64_trunc(ts, (ip)->i_sb->s_time_gran) #else -#define zpl_inode_timespec_trunc(ts, gran) timespec_trunc(ts, gran) +#define zpl_inode_timestamp_truncate(ts, ip) \ + timespec_trunc(ts, (ip)->i_sb->s_time_gran) +#endif + +#if defined(HAVE_INODE_OWNER_OR_CAPABLE) +#define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ip) +#elif defined(HAVE_INODE_OWNER_OR_CAPABLE_IDMAPPED) +#define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ns, ip) +#else +#error "Unsupported kernel" +#endif + +#ifdef HAVE_SETATTR_PREPARE_USERNS +#define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(ns, dentry, ia) +#else +/* + * Use kernel-provided version, or our own from + * linux/vfs_compat.h + */ +#define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(dentry, ia) #endif #endif /* _SYS_ZPL_H */ diff --git a/include/spl/sys/Makefile.am b/include/spl/sys/Makefile.am deleted file mode 100644 index 3b5b2755a2..0000000000 --- a/include/spl/sys/Makefile.am +++ /dev/null @@ -1,61 +0,0 @@ -KERNEL_H = \ - $(top_srcdir)/include/spl/sys/acl.h \ - $(top_srcdir)/include/spl/sys/atomic.h \ - 
$(top_srcdir)/include/spl/sys/byteorder.h \ - $(top_srcdir)/include/spl/sys/callb.h \ - $(top_srcdir)/include/spl/sys/callo.h \ - $(top_srcdir)/include/spl/sys/cmn_err.h \ - $(top_srcdir)/include/spl/sys/condvar.h \ - $(top_srcdir)/include/spl/sys/console.h \ - $(top_srcdir)/include/spl/sys/cred.h \ - $(top_srcdir)/include/spl/sys/ctype.h \ - $(top_srcdir)/include/spl/sys/debug.h \ - $(top_srcdir)/include/spl/sys/disp.h \ - $(top_srcdir)/include/spl/sys/dkio.h \ - $(top_srcdir)/include/spl/sys/errno.h \ - $(top_srcdir)/include/spl/sys/fcntl.h \ - $(top_srcdir)/include/spl/sys/file.h \ - $(top_srcdir)/include/spl/sys/inttypes.h \ - $(top_srcdir)/include/spl/sys/isa_defs.h \ - $(top_srcdir)/include/spl/sys/kmem_cache.h \ - $(top_srcdir)/include/spl/sys/kmem.h \ - $(top_srcdir)/include/spl/sys/kobj.h \ - $(top_srcdir)/include/spl/sys/kstat.h \ - $(top_srcdir)/include/spl/sys/list.h \ - $(top_srcdir)/include/spl/sys/mode.h \ - $(top_srcdir)/include/spl/sys/mutex.h \ - $(top_srcdir)/include/spl/sys/param.h \ - $(top_srcdir)/include/spl/sys/processor.h \ - $(top_srcdir)/include/spl/sys/proc.h \ - $(top_srcdir)/include/spl/sys/procfs_list.h \ - $(top_srcdir)/include/spl/sys/random.h \ - $(top_srcdir)/include/spl/sys/rwlock.h \ - $(top_srcdir)/include/spl/sys/shrinker.h \ - $(top_srcdir)/include/spl/sys/sid.h \ - $(top_srcdir)/include/spl/sys/signal.h \ - $(top_srcdir)/include/spl/sys/stat.h \ - $(top_srcdir)/include/spl/sys/strings.h \ - $(top_srcdir)/include/spl/sys/sunddi.h \ - $(top_srcdir)/include/spl/sys/sysmacros.h \ - $(top_srcdir)/include/spl/sys/systeminfo.h \ - $(top_srcdir)/include/spl/sys/taskq.h \ - $(top_srcdir)/include/spl/sys/thread.h \ - $(top_srcdir)/include/spl/sys/time.h \ - $(top_srcdir)/include/spl/sys/timer.h \ - $(top_srcdir)/include/spl/sys/tsd.h \ - $(top_srcdir)/include/spl/sys/types32.h \ - $(top_srcdir)/include/spl/sys/types.h \ - $(top_srcdir)/include/spl/sys/uio.h \ - $(top_srcdir)/include/spl/sys/user.h \ - 
$(top_srcdir)/include/spl/sys/vfs.h \ - $(top_srcdir)/include/spl/sys/vmem.h \ - $(top_srcdir)/include/spl/sys/vmsystm.h \ - $(top_srcdir)/include/spl/sys/vnode.h \ - $(top_srcdir)/include/spl/sys/wait.h \ - $(top_srcdir)/include/spl/sys/zmod.h \ - $(top_srcdir)/include/spl/sys/zone.h - -if CONFIG_KERNEL -kerneldir = @prefix@/src/zfs-$(VERSION)/include/spl/sys -kernel_HEADERS = $(KERNEL_H) -endif diff --git a/include/spl/sys/console.h b/include/spl/sys/console.h deleted file mode 100644 index 3469cb762e..0000000000 --- a/include/spl/sys/console.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - */ - -#ifndef _SPL_CONSOLE_H -#define _SPL_CONSOLE_H - -void -console_vprintf(const char *fmt, va_list args) -{ - vprintk(fmt, args); -} - -void -console_printf(const char *fmt, ...) 
-{ - va_list args; - - va_start(args, fmt); - console_vprintf(fmt, args); - va_end(args); -} - -#endif /* _SPL_CONSOLE_H */ diff --git a/include/spl/sys/kobj.h b/include/spl/sys/kobj.h deleted file mode 100644 index 558ec39a80..0000000000 --- a/include/spl/sys/kobj.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - */ - -#ifndef _SPL_KOBJ_H -#define _SPL_KOBJ_H - -#include - -typedef struct _buf { - vnode_t *vp; -} _buf_t; - -typedef struct _buf buf_t; - -extern struct _buf *kobj_open_file(const char *name); -extern void kobj_close_file(struct _buf *file); -extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, - unsigned off); -extern int kobj_get_filesize(struct _buf *file, uint64_t *size); - -#endif /* SPL_KOBJ_H */ diff --git a/include/spl/sys/shrinker.h b/include/spl/sys/shrinker.h deleted file mode 100644 index 28c1fa78c4..0000000000 --- a/include/spl/sys/shrinker.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
- * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - */ - -#ifndef _SPL_SHRINKER_H -#define _SPL_SHRINKER_H - -#include -#include - -#if !defined(HAVE_SHRINK_CONTROL_STRUCT) -struct shrink_control { - gfp_t gfp_mask; - unsigned long nr_to_scan; -}; -#endif /* HAVE_SHRINK_CONTROL_STRUCT */ - -/* - * Due to frequent changes in the shrinker API the following - * compatibility wrappers should be used. They are as follows: - * - * SPL_SHRINKER_DECLARE is used to declare the shrinker which is - * passed to spl_register_shrinker()/spl_unregister_shrinker(). Use - * shrinker_name to set the shrinker variable name, shrinker_callback - * to set the callback function, and seek_cost to define the cost of - * reclaiming an object. - * - * SPL_SHRINKER_DECLARE(shrinker_name, shrinker_callback, seek_cost); - * - * SPL_SHRINKER_CALLBACK_FWD_DECLARE is used when a forward declaration - * of the shrinker callback function is required. Only the callback - * function needs to be passed. 
- * - * SPL_SHRINKER_CALLBACK_FWD_DECLARE(shrinker_callback); - * - * SPL_SHRINKER_CALLBACK_WRAPPER is used to declare the callback function - * which is registered with the shrinker. This function will call your - * custom shrinker which must use the following prototype. Notice the - * leading __'s, these must be appended to the callback_function name. - * - * int __shrinker_callback(struct shrinker *, struct shrink_control *) - * SPL_SHRINKER_CALLBACK_WRAPPER(shrinker_callback);a - * - * - * Example: - * - * SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_shrinker_fn); - * SPL_SHRINKER_DECLARE(my_shrinker, my_shrinker_fn, 1); - * - * static int - * __my_shrinker_fn(struct shrinker *shrink, struct shrink_control *sc) - * { - * if (sc->nr_to_scan) { - * ...scan objects in the cache and reclaim them... - * } - * - * ...calculate number of objects in the cache... - * - * return (number of objects in the cache); - * } - * SPL_SHRINKER_CALLBACK_WRAPPER(my_shrinker_fn); - */ - -#define spl_register_shrinker(x) register_shrinker(x) -#define spl_unregister_shrinker(x) unregister_shrinker(x) - -/* - * Linux 2.6.23 - 2.6.34 Shrinker API Compatibility. - */ -#if defined(HAVE_2ARGS_OLD_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(s, x, y) \ -static struct shrinker s = { \ - .shrink = x, \ - .seeks = y \ -} - -#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -static int fn(int nr_to_scan, unsigned int gfp_mask) - -#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -static int \ -fn(int nr_to_scan, unsigned int gfp_mask) \ -{ \ - struct shrink_control sc; \ - \ - sc.nr_to_scan = nr_to_scan; \ - sc.gfp_mask = gfp_mask; \ - \ - return (__ ## fn(NULL, &sc)); \ -} - -/* - * Linux 2.6.35 to 2.6.39 Shrinker API Compatibility. 
- */ -#elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(s, x, y) \ -static struct shrinker s = { \ - .shrink = x, \ - .seeks = y \ -} - -#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -static int fn(struct shrinker *, int, unsigned int) - -#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -static int \ -fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ -{ \ - struct shrink_control sc; \ - \ - sc.nr_to_scan = nr_to_scan; \ - sc.gfp_mask = gfp_mask; \ - \ - return (__ ## fn(shrink, &sc)); \ -} - -/* - * Linux 3.0 to 3.11 Shrinker API Compatibility. - */ -#elif defined(HAVE_2ARGS_NEW_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(s, x, y) \ -static struct shrinker s = { \ - .shrink = x, \ - .seeks = y \ -} - -#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -static int fn(struct shrinker *, struct shrink_control *) - -#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -static int \ -fn(struct shrinker *shrink, struct shrink_control *sc) \ -{ \ - return (__ ## fn(shrink, sc)); \ -} - -/* - * Linux 3.12 and later Shrinker API Compatibility. - */ -#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(s, x, y) \ -static struct shrinker s = { \ - .count_objects = x ## _count_objects, \ - .scan_objects = x ## _scan_objects, \ - .seeks = y \ -} - -#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -static unsigned long fn ## _count_objects(struct shrinker *, \ - struct shrink_control *); \ -static unsigned long fn ## _scan_objects(struct shrinker *, \ - struct shrink_control *) - -#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -static unsigned long \ -fn ## _count_objects(struct shrinker *shrink, struct shrink_control *sc)\ -{ \ - int __ret__; \ - \ - sc->nr_to_scan = 0; \ - __ret__ = __ ## fn(NULL, sc); \ - \ - /* Errors may not be returned and must be converted to zeros */ \ - return ((__ret__ < 0) ? 
0 : __ret__); \ -} \ - \ -static unsigned long \ -fn ## _scan_objects(struct shrinker *shrink, struct shrink_control *sc) \ -{ \ - int __ret__; \ - \ - __ret__ = __ ## fn(NULL, sc); \ - return ((__ret__ < 0) ? SHRINK_STOP : __ret__); \ -} -#else -/* - * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. - */ -#error "Unknown shrinker callback" -#endif - -#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) -typedef unsigned long spl_shrinker_t; -#else -typedef int spl_shrinker_t; -#define SHRINK_STOP (-1) -#endif - -#endif /* SPL_SHRINKER_H */ diff --git a/include/spl/sys/signal.h b/include/spl/sys/signal.h deleted file mode 100644 index 36b8b5d985..0000000000 --- a/include/spl/sys/signal.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . 
- */ - -#ifndef _SPL_SIGNAL_H -#define _SPL_SIGNAL_H - -#include - -#ifdef HAVE_SCHED_SIGNAL_HEADER -#include -#endif - -#define FORREAL 0 /* Usual side-effects */ -#define JUSTLOOKING 1 /* Don't stop the process */ - -/* - * The "why" argument indicates the allowable side-effects of the call: - * - * FORREAL: Extract the next pending signal from p_sig into p_cursig; - * stop the process if a stop has been requested or if a traced signal - * is pending. - * - * JUSTLOOKING: Don't stop the process, just indicate whether or not - * a signal might be pending (FORREAL is needed to tell for sure). - */ -static __inline__ int -issig(int why) -{ - ASSERT(why == FORREAL || why == JUSTLOOKING); - - return (signal_pending(current)); -} - -#endif /* SPL_SIGNAL_H */ diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h deleted file mode 100644 index fac26079d7..0000000000 --- a/include/spl/sys/uio.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Copyright (c) 2015 by Chunwei Chen. All rights reserved. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . 
- */ - -#ifndef _SPL_UIO_H -#define _SPL_UIO_H - -#include -#include -#include -#include - -typedef struct iovec iovec_t; - -typedef enum uio_rw { - UIO_READ = 0, - UIO_WRITE = 1, -} uio_rw_t; - -typedef enum uio_seg { - UIO_USERSPACE = 0, - UIO_SYSSPACE = 1, - UIO_USERISPACE = 2, - UIO_BVEC = 3, -} uio_seg_t; - -typedef struct uio { - union { - const struct iovec *uio_iov; - const struct bio_vec *uio_bvec; - }; - int uio_iovcnt; - offset_t uio_loffset; - uio_seg_t uio_segflg; - boolean_t uio_fault_disable; - uint16_t uio_fmode; - uint16_t uio_extflg; - offset_t uio_limit; - ssize_t uio_resid; - size_t uio_skip; -} uio_t; - -typedef struct aio_req { - uio_t *aio_uio; - void *aio_private; -} aio_req_t; - -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY, -} xuio_type_t; - - -#define UIOA_IOV_MAX 16 - -typedef struct uioa_page_s { - int uioa_pfncnt; - void **uioa_ppp; - caddr_t uioa_base; - size_t uioa_len; -} uioa_page_t; - -typedef struct xuio { - uio_t xu_uio; - enum xuio_type xu_type; - union { - struct { - uint32_t xu_a_state; - ssize_t xu_a_mbytes; - uioa_page_t *xu_a_lcur; - void **xu_a_lppp; - void *xu_a_hwst[4]; - uioa_page_t xu_a_locked[UIOA_IOV_MAX]; - } xu_aio; - - struct { - int xu_zc_rw; - void *xu_zc_priv; - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - -#endif /* SPL_UIO_H */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 31ffdfb4a7..54573fbe1b 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -1,158 +1,151 @@ -SUBDIRS = fm fs crypto lua sysevent +SUBDIRS = fm fs crypto lua sysevent zstd COMMON_H = \ - $(top_srcdir)/include/sys/abd.h \ - $(top_srcdir)/include/sys/aggsum.h \ - $(top_srcdir)/include/sys/arc.h \ - $(top_srcdir)/include/sys/arc_impl.h \ - $(top_srcdir)/include/sys/avl.h \ - $(top_srcdir)/include/sys/avl_impl.h \ - $(top_srcdir)/include/sys/blkptr.h \ - 
$(top_srcdir)/include/sys/bplist.h \ - $(top_srcdir)/include/sys/bpobj.h \ - $(top_srcdir)/include/sys/bptree.h \ - $(top_srcdir)/include/sys/bqueue.h \ - $(top_srcdir)/include/sys/cityhash.h \ - $(top_srcdir)/include/sys/spa_checkpoint.h \ - $(top_srcdir)/include/sys/dataset_kstats.h \ - $(top_srcdir)/include/sys/dbuf.h \ - $(top_srcdir)/include/sys/ddt.h \ - $(top_srcdir)/include/sys/dmu.h \ - $(top_srcdir)/include/sys/dmu_impl.h \ - $(top_srcdir)/include/sys/dmu_objset.h \ - $(top_srcdir)/include/sys/dmu_recv.h \ - $(top_srcdir)/include/sys/dmu_send.h \ - $(top_srcdir)/include/sys/dmu_traverse.h \ - $(top_srcdir)/include/sys/dmu_tx.h \ - $(top_srcdir)/include/sys/dmu_zfetch.h \ - $(top_srcdir)/include/sys/dnode.h \ - $(top_srcdir)/include/sys/dsl_bookmark.h \ - $(top_srcdir)/include/sys/dsl_dataset.h \ - $(top_srcdir)/include/sys/dsl_deadlist.h \ - $(top_srcdir)/include/sys/dsl_deleg.h \ - $(top_srcdir)/include/sys/dsl_destroy.h \ - $(top_srcdir)/include/sys/dsl_dir.h \ - $(top_srcdir)/include/sys/dsl_crypt.h \ - $(top_srcdir)/include/sys/dsl_pool.h \ - $(top_srcdir)/include/sys/dsl_prop.h \ - $(top_srcdir)/include/sys/dsl_scan.h \ - $(top_srcdir)/include/sys/dsl_synctask.h \ - $(top_srcdir)/include/sys/dsl_userhold.h \ - $(top_srcdir)/include/sys/edonr.h \ - $(top_srcdir)/include/sys/efi_partition.h \ - $(top_srcdir)/include/sys/frame.h \ - $(top_srcdir)/include/sys/hkdf.h \ - $(top_srcdir)/include/sys/metaslab.h \ - $(top_srcdir)/include/sys/metaslab_impl.h \ - $(top_srcdir)/include/sys/mmp.h \ - $(top_srcdir)/include/sys/mntent.h \ - $(top_srcdir)/include/sys/multilist.h \ - $(top_srcdir)/include/sys/note.h \ - $(top_srcdir)/include/sys/nvpair.h \ - $(top_srcdir)/include/sys/nvpair_impl.h \ - $(top_srcdir)/include/sys/pathname.h \ - $(top_srcdir)/include/sys/policy.h \ - $(top_srcdir)/include/sys/range_tree.h \ - $(top_srcdir)/include/sys/refcount.h \ - $(top_srcdir)/include/sys/rrwlock.h \ - $(top_srcdir)/include/sys/sa.h \ - 
$(top_srcdir)/include/sys/sa_impl.h \ - $(top_srcdir)/include/sys/sdt.h \ - $(top_srcdir)/include/sys/sha2.h \ - $(top_srcdir)/include/sys/skein.h \ - $(top_srcdir)/include/sys/spa_boot.h \ - $(top_srcdir)/include/sys/space_map.h \ - $(top_srcdir)/include/sys/space_reftree.h \ - $(top_srcdir)/include/sys/spa.h \ - $(top_srcdir)/include/sys/spa_impl.h \ - $(top_srcdir)/include/sys/spa_checksum.h \ - $(top_srcdir)/include/sys/sysevent.h \ - $(top_srcdir)/include/sys/trace.h \ - $(top_srcdir)/include/sys/trace_acl.h \ - $(top_srcdir)/include/sys/trace_arc.h \ - $(top_srcdir)/include/sys/trace_common.h \ - $(top_srcdir)/include/sys/trace_dbgmsg.h \ - $(top_srcdir)/include/sys/trace_dbuf.h \ - $(top_srcdir)/include/sys/trace_dmu.h \ - $(top_srcdir)/include/sys/trace_dnode.h \ - $(top_srcdir)/include/sys/trace_multilist.h \ - $(top_srcdir)/include/sys/trace_txg.h \ - $(top_srcdir)/include/sys/trace_vdev.h \ - $(top_srcdir)/include/sys/trace_zil.h \ - $(top_srcdir)/include/sys/trace_zio.h \ - $(top_srcdir)/include/sys/trace_zrlock.h \ - $(top_srcdir)/include/sys/txg.h \ - $(top_srcdir)/include/sys/txg_impl.h \ - $(top_srcdir)/include/sys/u8_textprep_data.h \ - $(top_srcdir)/include/sys/u8_textprep.h \ - $(top_srcdir)/include/sys/uberblock.h \ - $(top_srcdir)/include/sys/uberblock_impl.h \ - $(top_srcdir)/include/sys/uio_impl.h \ - $(top_srcdir)/include/sys/unique.h \ - $(top_srcdir)/include/sys/uuid.h \ - $(top_srcdir)/include/sys/vdev_disk.h \ - $(top_srcdir)/include/sys/vdev_file.h \ - $(top_srcdir)/include/sys/vdev.h \ - $(top_srcdir)/include/sys/vdev_impl.h \ - $(top_srcdir)/include/sys/vdev_indirect_births.h \ - $(top_srcdir)/include/sys/vdev_indirect_mapping.h \ - $(top_srcdir)/include/sys/vdev_initialize.h \ - $(top_srcdir)/include/sys/vdev_raidz.h \ - $(top_srcdir)/include/sys/vdev_raidz_impl.h \ - $(top_srcdir)/include/sys/vdev_removal.h \ - $(top_srcdir)/include/sys/vdev_trim.h \ - $(top_srcdir)/include/sys/xvattr.h \ - $(top_srcdir)/include/sys/zap.h \ - 
$(top_srcdir)/include/sys/zap_impl.h \ - $(top_srcdir)/include/sys/zap_leaf.h \ - $(top_srcdir)/include/sys/zcp.h \ - $(top_srcdir)/include/sys/zcp_global.h \ - $(top_srcdir)/include/sys/zcp_iter.h \ - $(top_srcdir)/include/sys/zcp_prop.h \ - $(top_srcdir)/include/sys/zfeature.h \ - $(top_srcdir)/include/sys/zfs_acl.h \ - $(top_srcdir)/include/sys/zfs_context.h \ - $(top_srcdir)/include/sys/zfs_ctldir.h \ - $(top_srcdir)/include/sys/zfs_debug.h \ - $(top_srcdir)/include/sys/zfs_delay.h \ - $(top_srcdir)/include/sys/zfs_dir.h \ - $(top_srcdir)/include/sys/zfs_fuid.h \ - $(top_srcdir)/include/sys/zfs_project.h \ - $(top_srcdir)/include/sys/zfs_ratelimit.h \ - $(top_srcdir)/include/sys/zfs_rlock.h \ - $(top_srcdir)/include/sys/zfs_sa.h \ - $(top_srcdir)/include/sys/zfs_stat.h \ - $(top_srcdir)/include/sys/zfs_sysfs.h \ - $(top_srcdir)/include/sys/zfs_vfsops.h \ - $(top_srcdir)/include/sys/zfs_vnops.h \ - $(top_srcdir)/include/sys/zfs_znode.h \ - $(top_srcdir)/include/sys/zil.h \ - $(top_srcdir)/include/sys/zil_impl.h \ - $(top_srcdir)/include/sys/zio_checksum.h \ - $(top_srcdir)/include/sys/zio_compress.h \ - $(top_srcdir)/include/sys/zio_crypt.h \ - $(top_srcdir)/include/sys/zio.h \ - $(top_srcdir)/include/sys/zio_impl.h \ - $(top_srcdir)/include/sys/zio_priority.h \ - $(top_srcdir)/include/sys/zrlock.h \ - $(top_srcdir)/include/sys/zthr.h + abd.h \ + abd_impl.h \ + aggsum.h \ + arc.h \ + arc_impl.h \ + avl.h \ + avl_impl.h \ + bitops.h \ + blkptr.h \ + bplist.h \ + bpobj.h \ + bptree.h \ + btree.h \ + bqueue.h \ + dataset_kstats.h \ + dbuf.h \ + ddt.h \ + dmu.h \ + dmu_impl.h \ + dmu_objset.h \ + dmu_recv.h \ + dmu_redact.h \ + dmu_send.h \ + dmu_traverse.h \ + dmu_tx.h \ + dmu_zfetch.h \ + dnode.h \ + dsl_bookmark.h \ + dsl_dataset.h \ + dsl_deadlist.h \ + dsl_deleg.h \ + dsl_destroy.h \ + dsl_dir.h \ + dsl_crypt.h \ + dsl_pool.h \ + dsl_prop.h \ + dsl_scan.h \ + dsl_synctask.h \ + dsl_userhold.h \ + edonr.h \ + efi_partition.h \ + frame.h \ + hkdf.h \ + metaslab.h 
\ + metaslab_impl.h \ + mmp.h \ + mntent.h \ + mod.h \ + multilist.h \ + nvpair.h \ + nvpair_impl.h \ + objlist.h \ + pathname.h \ + qat.h \ + range_tree.h \ + rrwlock.h \ + sa.h \ + sa_impl.h \ + skein.h \ + spa_boot.h \ + spa_checkpoint.h \ + spa_log_spacemap.h \ + space_map.h \ + space_reftree.h \ + spa.h \ + spa_impl.h \ + spa_checksum.h \ + sysevent.h \ + txg.h \ + txg_impl.h \ + u8_textprep_data.h \ + u8_textprep.h \ + uberblock.h \ + uberblock_impl.h \ + uio_impl.h \ + unique.h \ + uuid.h \ + vdev_disk.h \ + vdev_file.h \ + vdev.h \ + vdev_draid.h \ + vdev_impl.h \ + vdev_indirect_births.h \ + vdev_indirect_mapping.h \ + vdev_initialize.h \ + vdev_raidz.h \ + vdev_raidz_impl.h \ + vdev_rebuild.h \ + vdev_removal.h \ + vdev_trim.h \ + xvattr.h \ + zap.h \ + zap_impl.h \ + zap_leaf.h \ + zcp.h \ + zcp_global.h \ + zcp_iter.h \ + zcp_prop.h \ + zcp_set.h \ + zfeature.h \ + zfs_acl.h \ + zfs_bootenv.h \ + zfs_context.h \ + zfs_debug.h \ + zfs_delay.h \ + zfs_file.h \ + zfs_fuid.h \ + zfs_project.h \ + zfs_quota.h \ + zfs_racct.h \ + zfs_ratelimit.h \ + zfs_refcount.h \ + zfs_rlock.h \ + zfs_sa.h \ + zfs_stat.h \ + zfs_sysfs.h \ + zfs_vfsops.h \ + zfs_vnops.h \ + zfs_znode.h \ + zil.h \ + zil_impl.h \ + zio_checksum.h \ + zio_compress.h \ + zio_crypt.h \ + zio.h \ + zio_impl.h \ + zio_priority.h \ + zrlock.h \ + zthr.h KERNEL_H = \ - $(top_srcdir)/include/sys/zfs_ioctl.h \ - $(top_srcdir)/include/sys/zfs_onexit.h \ - ${top_srcdir}/include/sys/zpl.h \ - $(top_srcdir)/include/sys/zvol.h - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + zfs_ioctl.h \ + zfs_ioctl_impl.h \ + zfs_onexit.h \ + zvol.h \ + zvol_impl.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys -libzfs_HEADERS = $(COMMON_H) $(USER_H) +libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL +if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys kernel_HEADERS = $(COMMON_H) $(KERNEL_H) endif +endif diff --git a/include/sys/abd.h b/include/sys/abd.h index 3d9fdbf102..5c6bd0c271 
100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. */ #ifndef _ABD_H @@ -28,66 +28,78 @@ #include #include -#include -#ifdef _KERNEL -#include -#include +#include #include -#endif #ifdef __cplusplus extern "C" { #endif typedef enum abd_flags { - ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ - ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ - ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ - ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ - ABD_FLAG_MULTI_CHUNK = 1 << 4 /* pages split over multiple chunks */ + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ + ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ + ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ + ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ + ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ + ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */ } abd_flags_t; typedef struct abd { abd_flags_t abd_flags; uint_t abd_size; /* excludes scattered abd_offset */ + list_node_t abd_gang_link; +#ifdef ZFS_DEBUG struct abd *abd_parent; zfs_refcount_t abd_children; +#endif + kmutex_t abd_mtx; union { struct abd_scatter { uint_t abd_offset; +#if defined(__FreeBSD__) && defined(_KERNEL) + void *abd_chunks[1]; /* actually variable-length */ +#else uint_t abd_nents; struct scatterlist *abd_sgl; +#endif } abd_scatter; struct abd_linear { void *abd_buf; + struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ } abd_linear; + 
struct abd_gang { + list_t abd_gang_chain; + } abd_gang; } abd_u; } abd_t; -typedef int abd_iter_func_t(void *buf, size_t len, void *private); -typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private); +typedef int abd_iter_func_t(void *buf, size_t len, void *priv); +typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); extern int zfs_abd_scatter_enabled; -static inline boolean_t -abd_is_linear(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); -} - /* * Allocations and deallocations */ abd_t *abd_alloc(size_t, boolean_t); abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_gang(void); abd_t *abd_alloc_for_io(size_t, boolean_t); abd_t *abd_alloc_sametype(abd_t *, size_t); +boolean_t abd_size_alloc_linear(size_t); +void abd_gang_add(abd_t *, abd_t *, boolean_t); void abd_free(abd_t *); abd_t *abd_get_offset(abd_t *, size_t); abd_t *abd_get_offset_size(abd_t *, size_t, size_t); +abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t); +abd_t *abd_get_zeros(size_t); abd_t *abd_get_from_buf(void *, size_t); -void abd_put(abd_t *); +void abd_cache_reap_now(void); /* * Conversion to and from a normal buffer @@ -114,12 +126,7 @@ void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); int abd_cmp(abd_t *, abd_t *); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); - -#if defined(_KERNEL) -unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, - size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); -#endif +void abd_verify(abd_t *); void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ssize_t csize, ssize_t dsize, const unsigned parity, @@ -164,13 +171,49 @@ abd_zero(abd_t *abd, size_t size) abd_zero_off(abd, 0, size); } +/* + * ABD type check functions + */ +static inline boolean_t +abd_is_linear(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR) ? 
B_TRUE : B_FALSE); +} + +static inline boolean_t +abd_is_linear_page(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) ? B_TRUE : B_FALSE); +} + +static inline boolean_t +abd_is_gang(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_GANG) ? B_TRUE : B_FALSE); +} + +static inline uint_t +abd_get_size(abd_t *abd) +{ + return (abd->abd_size); +} + /* * Module lifecycle + * Defined in each specific OS's abd_os.c */ void abd_init(void); void abd_fini(void); +/* + * Linux ABD bio functions + */ +#if defined(__linux__) && defined(_KERNEL) +unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h new file mode 100644 index 0000000000..e96f1edfc8 --- /dev/null +++ b/include/sys/abd_impl.h @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. 
+ */ + +#ifndef _ABD_IMPL_H +#define _ABD_IMPL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_stats_op { + ABDSTAT_INCR, /* Increase abdstat values */ + ABDSTAT_DECR /* Decrease abdstat values */ +} abd_stats_op_t; + +struct scatterlist; /* forward declaration */ + +struct abd_iter { + /* public interface */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ + + /* private */ + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; + size_t iter_offset; /* offset in current sg/abd_buf, */ + /* abd_offset included */ + struct scatterlist *iter_sg; /* current sg */ +}; + +extern abd_t *abd_zero_scatter; + +abd_t *abd_gang_get_offset(abd_t *, size_t *); +abd_t *abd_alloc_struct(size_t); +void abd_free_struct(abd_t *); + +/* + * OS specific functions + */ + +abd_t *abd_alloc_struct_impl(size_t); +abd_t *abd_get_offset_scatter(abd_t *, abd_t *, size_t, size_t); +void abd_free_struct_impl(abd_t *); +void abd_alloc_chunks(abd_t *, size_t); +void abd_free_chunks(abd_t *); +void abd_update_scatter_stats(abd_t *, abd_stats_op_t); +void abd_update_linear_stats(abd_t *, abd_stats_op_t); +void abd_verify_scatter(abd_t *); +void abd_free_linear_page(abd_t *); +/* OS specific abd_iter functions */ +void abd_iter_init(struct abd_iter *, abd_t *); +boolean_t abd_iter_at_end(struct abd_iter *); +void abd_iter_advance(struct abd_iter *, size_t); +void abd_iter_map(struct abd_iter *); +void abd_iter_unmap(struct abd_iter *); + +/* + * Helper macros + */ +#define ABDSTAT_INCR(stat, val) \ + wmsum_add(&abd_sums.stat, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) +#define ABD_GANG(abd) (abd->abd_u.abd_gang) + +#if defined(_KERNEL) +#if defined(__FreeBSD__) +#define 
abd_enter_critical(flags) critical_enter() +#define abd_exit_critical(flags) critical_exit() +#else +#define abd_enter_critical(flags) local_irq_save(flags) +#define abd_exit_critical(flags) local_irq_restore(flags) +#endif +#else /* !_KERNEL */ +#define abd_enter_critical(flags) ((void)0) +#define abd_exit_critical(flags) ((void)0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_H */ diff --git a/include/sys/aggsum.h b/include/sys/aggsum.h index caa08d7738..65800058cb 100644 --- a/include/sys/aggsum.h +++ b/include/sys/aggsum.h @@ -39,15 +39,16 @@ struct aggsum_bucket { typedef struct aggsum { kmutex_t as_lock; int64_t as_lower_bound; - int64_t as_upper_bound; - uint64_t as_numbuckets; - aggsum_bucket_t *as_buckets; + uint64_t as_upper_bound; + aggsum_bucket_t *as_buckets ____cacheline_aligned; + uint_t as_numbuckets; + uint_t as_bucketshift; } aggsum_t; void aggsum_init(aggsum_t *, uint64_t); void aggsum_fini(aggsum_t *); int64_t aggsum_lower_bound(aggsum_t *); -int64_t aggsum_upper_bound(aggsum_t *); +uint64_t aggsum_upper_bound(aggsum_t *); int aggsum_compare(aggsum_t *, uint64_t); uint64_t aggsum_value(aggsum_t *); void aggsum_add(aggsum_t *, int64_t); diff --git a/include/sys/arc.h b/include/sys/arc.h index dc2fd03647..afbe65bb1c 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -22,6 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. */ #ifndef _SYS_ARC_H @@ -36,23 +38,30 @@ extern "C" { #include #include #include -#include +#include /* * Used by arc_flush() to inform arc_evict_state() that it should evict * all available buffers from the arc state being passed in. 
*/ -#define ARC_EVICT_ALL -1ULL +#define ARC_EVICT_ALL UINT64_MAX + +/* + * ZFS gets very unhappy when the maximum ARC size is smaller than the maximum + * block size and a larger block is written. To leave some safety margin, we + * limit the minimum for zfs_arc_max to the maximium transaction size. + */ +#define MIN_ARC_MAX DMU_MAX_ACCESS #define HDR_SET_LSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define HDR_SET_PSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) #define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) @@ -70,9 +79,9 @@ typedef struct arc_prune arc_prune_t; * parameter will be NULL. */ typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, - const blkptr_t *bp, arc_buf_t *buf, void *private); -typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); -typedef void arc_prune_func_t(int64_t bytes, void *private); + const blkptr_t *bp, arc_buf_t *buf, void *priv); +typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); +typedef void arc_prune_func_t(int64_t bytes, void *priv); /* Shared module parameters */ extern int zfs_arc_average_blocksize; @@ -146,6 +155,17 @@ typedef enum arc_flags ARC_FLAG_COMPRESSED_ARC = 1 << 20, ARC_FLAG_SHARED_DATA = 1 << 21, + /* + * Fail this arc_read() (with ENOENT) if the data is not already present + * in cache. + */ + ARC_FLAG_CACHED_ONLY = 1 << 22, + + /* + * Don't instantiate an arc_buf_t for arc_read_done. 
+ */ + ARC_FLAG_NO_BUF = 1 << 23, + /* * The arc buffer's compression mode is stored in the top 7 bits of the * flags field, so these dummy flags are included so that MDB can @@ -187,7 +207,7 @@ typedef enum arc_buf_contents { } arc_buf_contents_t; /* - * The following breakdows of arc_size exist for kstat only. + * The following breakdowns of arc_size exist for kstat only. */ typedef enum arc_space_type { ARC_SPACE_DATA, @@ -197,6 +217,7 @@ typedef enum arc_space_type { ARC_SPACE_DBUF, ARC_SPACE_DNODE, ARC_SPACE_BONUS, + ARC_SPACE_ABD_CHUNK_WASTE, ARC_SPACE_NUMTYPES } arc_space_type_t; @@ -245,18 +266,20 @@ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size); arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag, - uint64_t psize, uint64_t lsize, enum zio_compress compression_type); + uint64_t psize, uint64_t lsize, enum zio_compress compression_type, + uint8_t complevel); arc_buf_t *arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type); + enum zio_compress compression_type, uint8_t complevel); +uint8_t arc_get_complevel(arc_buf_t *buf); arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type); + enum zio_compress compression_type, uint8_t complevel); arc_buf_t *arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type); + enum zio_compress compression_type, uint8_t complevel); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void 
arc_buf_destroy(arc_buf_t *buf, void *tag); @@ -274,16 +297,16 @@ int arc_referenced(arc_buf_t *buf); #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_read_done_func_t *done, void *private, zio_priority_t priority, + arc_read_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, arc_write_done_func_t *physdone, arc_write_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, + void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); -arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private); +arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); @@ -291,7 +314,10 @@ void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); +uint64_t arc_all_memory(void); +uint64_t arc_default_max(uint64_t min, uint64_t allmem); uint64_t arc_target_bytes(void); +void arc_set_limits(uint64_t); void arc_init(void); void arc_fini(void); @@ -302,10 +328,14 @@ void arc_fini(void); void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, + uint64_t check); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); +void l2arc_spa_rebuild_start(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index cd42c0c01a..3c5af9d863 100644 --- 
a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -20,9 +20,10 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, Delphix. All rights reserved. + * Copyright (c) 2013, Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. */ #ifndef _SYS_ARC_IMPL_H @@ -30,6 +31,9 @@ #include #include +#include +#include +#include #ifdef __cplusplus extern "C" { @@ -39,7 +43,7 @@ extern "C" { * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache + * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_l2c_only - exists in L2ARC but not other states @@ -71,20 +75,20 @@ typedef struct arc_state { /* * list of evictable buffers */ - multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; + multilist_t arcs_list[ARC_BUFC_NUMTYPES]; + /* + * supports the "dbufs" kstat + */ + arc_state_type_t arcs_state; /* * total amount of evictable data in this state */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 
*/ zfs_refcount_t arcs_size; - /* - * supports the "dbufs" kstat - */ - arc_state_type_t arcs_state; } arc_state_t; typedef struct arc_callback arc_callback_t; @@ -96,6 +100,7 @@ struct arc_callback { boolean_t acb_encrypted; boolean_t acb_compressed; boolean_t acb_noauth; + boolean_t acb_nobuf; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; @@ -148,24 +153,22 @@ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; - arc_buf_t *b_buf; - uint32_t b_bufcnt; - /* for waiting on writes to complete */ + /* for waiting on reads to complete */ kcondvar_t b_cv; uint8_t b_byteswap; - /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; - /* updated atomically */ + /* protected by hash lock */ clock_t b_arc_access; uint32_t b_mru_hits; uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; - uint32_t b_l2_hits; + uint32_t b_bufcnt; + arc_buf_t *b_buf; /* self protecting */ zfs_refcount_t b_refcnt; @@ -174,6 +177,240 @@ typedef struct l1arc_buf_hdr { abd_t *b_pabd; } l1arc_buf_hdr_t; +typedef enum l2arc_dev_hdr_flags_t { + L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ +} l2arc_dev_hdr_flags_t; + +/* + * Pointer used in persistent L2ARC (for pointing to log blocks). + */ +typedef struct l2arc_log_blkptr { + /* + * Offset of log block within the device, in bytes + */ + uint64_t lbp_daddr; + /* + * Aligned payload size (in bytes) of the log block + */ + uint64_t lbp_payload_asize; + /* + * Offset in bytes of the first buffer in the payload + */ + uint64_t lbp_payload_start; + /* + * lbp_prop has the following format: + * * logical size (in bytes) + * * aligned (after compression) size (in bytes) + * * compression algorithm (we always LZ4-compress l2arc logs) + * * checksum algorithm (used for lbp_cksum) + */ + uint64_t lbp_prop; + zio_cksum_t lbp_cksum; /* checksum of log */ +} l2arc_log_blkptr_t; + +/* + * The persistent L2ARC device header. 
+ * Byte order of magic determines whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_dev_hdr_phys { + uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ + uint64_t dh_version; /* Persistent L2ARC version */ + + /* + * Global L2ARC device state and metadata. + */ + uint64_t dh_spa_guid; + uint64_t dh_vdev_guid; + uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ + uint64_t dh_evict; /* evicted offset in bytes */ + uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ + /* + * Used in zdb.c for determining if a log block is valid, in the same + * way that l2arc_rebuild() does. + */ + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ + /* + * Start of log block chain. [0] -> newest log, [1] -> one older (used + * for initiating prefetch). + */ + l2arc_log_blkptr_t dh_start_lbps[2]; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + /* + * Mirrors of vdev_trim_action_time and vdev_trim_state, used to + * display when the cache device was fully trimmed for the last + * time. + */ + uint64_t dh_trim_action_time; + uint64_t dh_trim_state; + const uint64_t dh_pad[30]; /* pad to 512 bytes */ + zio_eck_t dh_tail; +} l2arc_dev_hdr_phys_t; +CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); + +/* + * A single ARC buffer header entry in a l2arc_log_blk_phys_t. 
+ */ +typedef struct l2arc_log_ent_phys { + dva_t le_dva; /* dva of buffer */ + uint64_t le_birth; /* birth txg of buffer */ + /* + * le_prop has the following format: + * * logical size (in bytes) + * * physical (compressed) size (in bytes) + * * compression algorithm + * * object type (used to restore arc_buf_contents_t) + * * protected status (used for encryption) + * * prefetch status (used in l2arc_read_done()) + */ + uint64_t le_prop; + uint64_t le_daddr; /* buf location on l2dev */ + uint64_t le_complevel; + /* + * We pad the size of each entry to a power of 2 so that the size of + * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, + * because of the L2BLK_SET_*SIZE macros. + */ + const uint64_t le_pad[2]; /* pad to 64 bytes */ +} l2arc_log_ent_phys_t; + +#define L2ARC_LOG_BLK_MAX_ENTRIES (1022) + +/* + * A log block of up to 1022 ARC buffer log entries, chained into the + * persistent L2ARC metadata linked list. Byte order of magic determines + * whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_log_blk_phys { + uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ + /* + * There are 2 chains (headed by dh_start_lbps[2]), and this field + * points back to the previous block in this chain. We alternate + * which chain we append to, so they are time-wise and offset-wise + * interleaved, but that is an optimization rather than for + * correctness. + */ + l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ + /* + * Pad header section to 128 bytes + */ + uint64_t lb_pad[7]; + /* Payload */ + l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; +} l2arc_log_blk_phys_t; /* 64K total */ + +/* + * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with + * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
+ */ +CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), + 1ULL << SPA_MINBLOCKSHIFT)); +CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE); +CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE); + +/* + * These structures hold in-flight abd buffers for log blocks as they're being + * written to the L2ARC device. + */ +typedef struct l2arc_lb_abd_buf { + abd_t *abd; + list_node_t node; +} l2arc_lb_abd_buf_t; + +/* + * These structures hold pointers to log blocks present on the L2ARC device. + */ +typedef struct l2arc_lb_ptr_buf { + l2arc_log_blkptr_t *lb_ptr; + list_node_t node; +} l2arc_lb_ptr_buf_t; + +/* Macros for setting fields in le_prop and lbp_prop */ +#define L2BLK_GET_LSIZE(field) \ + BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_LSIZE(field, x) \ + BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_PSIZE(field) \ + BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_PSIZE(field, x) \ + BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_COMPRESS(field) \ + BF64_GET((field), 32, SPA_COMPRESSBITS) +#define L2BLK_SET_COMPRESS(field, x) \ + BF64_SET((field), 32, SPA_COMPRESSBITS, x) +#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) +#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) +#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) +#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) +#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) +#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) +#define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) +#define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) + +#define PTR_SWAP(x, y) \ + do { \ + void *tmp = (x);\ + x = y; \ + y = tmp; \ + } while (0) + +#define 
L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ +#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ + zfs_refcount_t l2ad_alloc; /* allocated bytes */ + /* + * Persistence-related stuff + */ + l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ + uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ + l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ + int l2ad_log_ent_idx; /* index into cur log blk */ + /* Number of bytes in current log block's payload */ + uint64_t l2ad_log_blk_payload_asize; + /* + * Offset (in bytes) of the first buffer in current log block's + * payload. + */ + uint64_t l2ad_log_blk_payload_start; + /* Flag indicating whether a rebuild is scheduled or is going on */ + boolean_t l2ad_rebuild; + boolean_t l2ad_rebuild_cancel; + boolean_t l2ad_rebuild_began; + uint64_t l2ad_log_entries; /* entries per log blk */ + uint64_t l2ad_evict; /* evicted offset in bytes */ + /* List of pointers to log blocks present in the L2ARC device */ + list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; + boolean_t l2ad_trim_all; /* TRIM whole device */ +} l2arc_dev_t; + /* * Encrypted blocks will need to be stored encrypted on the L2ARC * disk as they appear in the main pool. 
In order for this to work we @@ -204,32 +441,20 @@ typedef struct arc_buf_hdr_crypt { uint8_t b_mac[ZIO_DATA_MAC_LEN]; } arc_buf_hdr_crypt_t; -typedef struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ - uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_start; /* first addr on device */ - uint64_t l2ad_end; /* last addr on device */ - boolean_t l2ad_first; /* first sweep through */ - boolean_t l2ad_writing; /* currently writing */ - kmutex_t l2ad_mtx; /* lock for buffer list */ - list_t l2ad_buflist; /* buffer list */ - list_node_t l2ad_node; /* device list node */ - zfs_refcount_t l2ad_alloc; /* allocated bytes */ -} l2arc_dev_t; - typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ uint32_t b_hits; - + arc_state_type_t b_arcs_state; list_node_t b_l2node; } l2arc_buf_hdr_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ + /* in-flight list of log blocks */ + list_t l2wcb_abd_list; } l2arc_write_callback_t; struct arc_buf_hdr { @@ -238,6 +463,9 @@ struct arc_buf_hdr { uint64_t b_birth; arc_buf_contents_t b_type; + uint8_t b_complevel; + uint8_t b_reserved1; /* used for 4 byte alignment */ + uint16_t b_reserved2; /* used for 4 byte alignment */ arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; @@ -278,6 +506,513 @@ struct arc_buf_hdr { */ arc_buf_hdr_crypt_t b_crypt_hdr; }; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + 
kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ + kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped when updating the access state due to the + * header having already been released after acquiring the hash lock. + */ + kstat_named_t arcstat_access_skip; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ + kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach its target amount. + */ + kstat_named_t arcstat_evict_not_enough; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_eligible_mfu; + kstat_named_t arcstat_evict_l2_eligible_mru; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pabd. 
If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; + /* + * Number of bytes consumed by internal ARC structures necessary + * for tracking purposes; these structures are not actually + * backed by ARC buffers. This includes arc_buf_hdr_t structures + * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only + * caches), and arc_buf_t structures (allocated via arc_buf_t + * cache). + */ + kstat_named_t arcstat_hdr_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_DATA. This is generally consumed by buffers backing + * on disk user data (e.g. plain file contents). + */ + kstat_named_t arcstat_data_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_METADATA. This is generally consumed by buffers + * backing on disk data that is used for internal ZFS + * structures (e.g. ZAP, dnode, indirect blocks, etc). + */ + kstat_named_t arcstat_metadata_size; + /* + * Number of bytes consumed by dmu_buf_impl_t objects. + */ + kstat_named_t arcstat_dbuf_size; + /* + * Number of bytes consumed by dnode_t objects. + */ + kstat_named_t arcstat_dnode_size; + /* + * Number of bytes consumed by bonus buffers. + */ + kstat_named_t arcstat_bonus_size; +#if defined(COMPAT_FREEBSD11) + /* + * Sum of the previous three counters, provided for compatibility. + */ + kstat_named_t arcstat_other_size; +#endif + + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_anon state. 
This includes *all* buffers in the arc_anon + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + */ + kstat_named_t arcstat_anon_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_anon_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_anon_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mru state. This includes *all* buffers in the arc_mru + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + */ + kstat_named_t arcstat_mru_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_mru_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_mru_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mru_ghost state. The key thing to note + * here, is the fact that this size doesn't actually indicate + * RAM consumption. The ghost lists only consist of headers and + * don't actually have ARC buffers linked off of these headers. 
+ * Thus, *if* the headers had associated ARC buffers, these + * buffers *would have* consumed this number of bytes. + */ + kstat_named_t arcstat_mru_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. + */ + kstat_named_t arcstat_mru_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + */ + kstat_named_t arcstat_mru_ghost_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mfu state. This includes *all* buffers in the arc_mfu + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + */ + kstat_named_t arcstat_mfu_size; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu + * state. + */ + kstat_named_t arcstat_mfu_evictable_data; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_METADATA, and reside in the + * arc_mfu state. + */ + kstat_named_t arcstat_mfu_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mfu_ghost state. See the comment above + * arcstat_mru_ghost_size for more details. + */ + kstat_named_t arcstat_mfu_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. + */ + kstat_named_t arcstat_mfu_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+ */ + kstat_named_t arcstat_mfu_ghost_evictable_metadata; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + /* + * Allocated size (in bytes) of L2ARC cached buffers by ARC state. + */ + kstat_named_t arcstat_l2_prefetch_asize; + kstat_named_t arcstat_l2_mru_asize; + kstat_named_t arcstat_l2_mfu_asize; + /* + * Allocated size (in bytes) of L2ARC cached buffers by buffer content + * type. + */ + kstat_named_t arcstat_l2_bufc_data_asize; + kstat_named_t arcstat_l2_bufc_metadata_asize; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_lock_retry; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_lsize; + kstat_named_t arcstat_l2_psize; + kstat_named_t arcstat_l2_hdr_size; + /* + * Number of L2ARC log blocks written. These are used for restoring the + * L2ARC. Updated during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_log_blk_writes; + /* + * Moving average of the aligned size of the L2ARC log blocks, in + * bytes. Updated during L2ARC rebuild and during writing of L2ARC + * log blocks. + */ + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; + /* + * Moving average of the aligned size of L2ARC restored data, in bytes, + * to the aligned size of their metadata in L2ARC, in bytes. 
+ * Updated during L2ARC rebuild and during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_data_to_meta_ratio; + /* + * Number of times the L2ARC rebuild was successful for an L2ARC device. + */ + kstat_named_t arcstat_l2_rebuild_success; + /* + * Number of times the L2ARC rebuild failed because the device header + * was in an unsupported format or corrupted. + */ + kstat_named_t arcstat_l2_rebuild_abort_unsupported; + /* + * Number of times the L2ARC rebuild failed because of IO errors + * while reading a log block. + */ + kstat_named_t arcstat_l2_rebuild_abort_io_errors; + /* + * Number of times the L2ARC rebuild failed because of IO errors when + * reading the device header. + */ + kstat_named_t arcstat_l2_rebuild_abort_dh_errors; + /* + * Number of L2ARC log blocks which failed to be restored due to + * checksum errors. + */ + kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; + /* + * Number of times the L2ARC rebuild was aborted due to low system + * memory. + */ + kstat_named_t arcstat_l2_rebuild_abort_lowmem; + /* Logical size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; + /* + * Number of L2ARC log entries (buffers) that were successfully + * restored in ARC. + */ + kstat_named_t arcstat_l2_rebuild_bufs; + /* + * Number of L2ARC log entries (buffers) already cached in ARC. These + * were not restored again. + */ + kstat_named_t arcstat_l2_rebuild_bufs_precached; + /* + * Number of L2ARC log blocks that were restored successfully. Each + * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. 
+ */ + kstat_named_t arcstat_l2_rebuild_log_blks; + kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_memory_direct_count; + kstat_named_t arcstat_memory_indirect_count; + kstat_named_t arcstat_memory_all_bytes; + kstat_named_t arcstat_memory_free_bytes; + kstat_named_t arcstat_memory_available_bytes; + kstat_named_t arcstat_no_grow; + kstat_named_t arcstat_tempreserve; + kstat_named_t arcstat_loaned_bytes; + kstat_named_t arcstat_prune; + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_dnode_limit; + kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_async_upgrade_sync; + kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; + kstat_named_t arcstat_need_free; + kstat_named_t arcstat_sys_free; + kstat_named_t arcstat_raw_size; + kstat_named_t arcstat_cached_only_in_progress; + kstat_named_t arcstat_abd_chunk_waste_size; +} arc_stats_t; + +typedef struct arc_sums { + wmsum_t arcstat_hits; + wmsum_t arcstat_misses; + wmsum_t arcstat_demand_data_hits; + wmsum_t arcstat_demand_data_misses; + wmsum_t arcstat_demand_metadata_hits; + wmsum_t arcstat_demand_metadata_misses; + wmsum_t arcstat_prefetch_data_hits; + wmsum_t arcstat_prefetch_data_misses; + wmsum_t arcstat_prefetch_metadata_hits; + wmsum_t arcstat_prefetch_metadata_misses; + wmsum_t arcstat_mru_hits; + wmsum_t arcstat_mru_ghost_hits; + wmsum_t arcstat_mfu_hits; + wmsum_t arcstat_mfu_ghost_hits; + wmsum_t arcstat_deleted; + wmsum_t arcstat_mutex_miss; + wmsum_t arcstat_access_skip; + wmsum_t arcstat_evict_skip; + wmsum_t arcstat_evict_not_enough; + wmsum_t arcstat_evict_l2_cached; + wmsum_t arcstat_evict_l2_eligible; + wmsum_t arcstat_evict_l2_eligible_mfu; + wmsum_t arcstat_evict_l2_eligible_mru; + wmsum_t arcstat_evict_l2_ineligible; + wmsum_t arcstat_evict_l2_skip; + wmsum_t arcstat_hash_collisions; + wmsum_t arcstat_hash_chains; + aggsum_t arcstat_size; 
+ wmsum_t arcstat_compressed_size; + wmsum_t arcstat_uncompressed_size; + wmsum_t arcstat_overhead_size; + wmsum_t arcstat_hdr_size; + wmsum_t arcstat_data_size; + wmsum_t arcstat_metadata_size; + wmsum_t arcstat_dbuf_size; + aggsum_t arcstat_dnode_size; + wmsum_t arcstat_bonus_size; + wmsum_t arcstat_l2_hits; + wmsum_t arcstat_l2_misses; + wmsum_t arcstat_l2_prefetch_asize; + wmsum_t arcstat_l2_mru_asize; + wmsum_t arcstat_l2_mfu_asize; + wmsum_t arcstat_l2_bufc_data_asize; + wmsum_t arcstat_l2_bufc_metadata_asize; + wmsum_t arcstat_l2_feeds; + wmsum_t arcstat_l2_rw_clash; + wmsum_t arcstat_l2_read_bytes; + wmsum_t arcstat_l2_write_bytes; + wmsum_t arcstat_l2_writes_sent; + wmsum_t arcstat_l2_writes_done; + wmsum_t arcstat_l2_writes_error; + wmsum_t arcstat_l2_writes_lock_retry; + wmsum_t arcstat_l2_evict_lock_retry; + wmsum_t arcstat_l2_evict_reading; + wmsum_t arcstat_l2_evict_l1cached; + wmsum_t arcstat_l2_free_on_write; + wmsum_t arcstat_l2_abort_lowmem; + wmsum_t arcstat_l2_cksum_bad; + wmsum_t arcstat_l2_io_error; + wmsum_t arcstat_l2_lsize; + wmsum_t arcstat_l2_psize; + aggsum_t arcstat_l2_hdr_size; + wmsum_t arcstat_l2_log_blk_writes; + wmsum_t arcstat_l2_log_blk_asize; + wmsum_t arcstat_l2_log_blk_count; + wmsum_t arcstat_l2_rebuild_success; + wmsum_t arcstat_l2_rebuild_abort_unsupported; + wmsum_t arcstat_l2_rebuild_abort_io_errors; + wmsum_t arcstat_l2_rebuild_abort_dh_errors; + wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors; + wmsum_t arcstat_l2_rebuild_abort_lowmem; + wmsum_t arcstat_l2_rebuild_size; + wmsum_t arcstat_l2_rebuild_asize; + wmsum_t arcstat_l2_rebuild_bufs; + wmsum_t arcstat_l2_rebuild_bufs_precached; + wmsum_t arcstat_l2_rebuild_log_blks; + wmsum_t arcstat_memory_throttle_count; + wmsum_t arcstat_memory_direct_count; + wmsum_t arcstat_memory_indirect_count; + wmsum_t arcstat_prune; + aggsum_t arcstat_meta_used; + wmsum_t arcstat_async_upgrade_sync; + wmsum_t arcstat_demand_hit_predictive_prefetch; + wmsum_t 
arcstat_demand_hit_prescient_prefetch; + wmsum_t arcstat_raw_size; + wmsum_t arcstat_cached_only_in_progress; + wmsum_t arcstat_abd_chunk_waste_size; +} arc_sums_t; + +typedef struct arc_evict_waiter { + list_node_t aew_node; + kcondvar_t aew_cv; + uint64_t aew_count; +} arc_evict_waiter_t; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + wmsum_add(&arc_sums.stat, (val)) + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ + +#define arc_anon (&ARC_anon) +#define arc_mru (&ARC_mru) +#define arc_mru_ghost (&ARC_mru_ghost) +#define arc_mfu (&ARC_mfu) +#define arc_mfu_ghost (&ARC_mfu_ghost) +#define arc_l2c_only (&ARC_l2c_only) + +extern taskq_t *arc_prune_taskq; +extern arc_stats_t arc_stats; +extern arc_sums_t arc_sums; +extern hrtime_t arc_growtime; +extern boolean_t arc_warm; +extern int arc_grow_retry; +extern int arc_no_grow_shift; +extern int arc_shrink_shift; +extern kmutex_t arc_prune_mtx; +extern list_t arc_prune_list; +extern arc_state_t ARC_mfu; +extern arc_state_t ARC_mru; +extern uint_t zfs_arc_pc_percent; +extern int arc_lotsfree_percent; +extern unsigned long zfs_arc_min; +extern unsigned long zfs_arc_max; + +extern void arc_reduce_target_size(int64_t to_free); +extern boolean_t arc_reclaim_needed(void); +extern void arc_kmem_reap_soon(void); +extern void arc_wait_for_eviction(uint64_t, boolean_t); + +extern void arc_lowmem_init(void); +extern void arc_lowmem_fini(void); +extern void arc_prune_async(int64_t); +extern int arc_memory_throttle(spa_t 
*spa, uint64_t reserve, uint64_t txg); +extern uint64_t arc_free_memory(void); +extern int64_t arc_available_memory(void); +extern void arc_tuning_update(boolean_t); +extern void arc_register_hotplug(void); +extern void arc_unregister_hotplug(void); + +extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS); +extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); +extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); +extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); + +/* used in zdb.c */ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); + +/* used in vdev_trim.c */ +void l2arc_dev_hdr_update(l2arc_dev_t *dev); +l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); + #ifdef __cplusplus } #endif diff --git a/include/sys/avl.h b/include/sys/avl.h index 206b539fab..20e88f2a6b 100644 --- a/include/sys/avl.h +++ b/include/sys/avl.h @@ -28,7 +28,7 @@ */ #ifndef _AVL_H -#define _AVL_H +#define _AVL_H extern __attribute__((visibility("default"))) /* * This is a private header file. Applications should not directly include @@ -97,7 +97,7 @@ extern "C" { * * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes. * Note that once you use avl_destroy_nodes(), you can no longer - * use any routine except avl_destroy_nodes() and avl_destoy(). + * use any routine except avl_destroy_nodes() and avl_destroy(). * * 4. Use avl_destroy() to destroy the AVL tree itself. * @@ -108,9 +108,9 @@ extern "C" { /* * AVL comparator helpers */ -#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) -#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) -#define AVL_PCMP(a, b) \ +#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define TREE_PCMP(a, b) \ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) /* @@ -144,7 +144,7 @@ typedef uintptr_t avl_index_t; * user data structure which must contain a field of type avl_node_t. 
* * Also assume the user data structures looks like: - * stuct my_type { + * struct my_type { * ... * avl_node_t my_link; * ... @@ -160,7 +160,7 @@ typedef uintptr_t avl_index_t; * size - the value of sizeof(struct my_type) * offset - the value of OFFSETOF(struct my_type, my_link) */ -extern void avl_create(avl_tree_t *tree, +_AVL_H void avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), size_t size, size_t offset); @@ -172,7 +172,7 @@ extern void avl_create(avl_tree_t *tree, * node - node that has the value being looked for * where - position for use with avl_nearest() or avl_insert(), may be NULL */ -extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); +_AVL_H void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); /* * Insert a node into the tree. @@ -180,7 +180,7 @@ extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); * node - the node to insert * where - position as returned from avl_find() */ -extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); +_AVL_H void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); /* * Insert "new_data" in "tree" in the given "direction" either after @@ -193,7 +193,7 @@ extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); * here - existing node in "tree" * direction - either AVL_AFTER or AVL_BEFORE the data "here". */ -extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, +_AVL_H void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, int direction); @@ -202,8 +202,8 @@ extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, * if the tree is empty. 
* */ -extern void *avl_first(avl_tree_t *tree); -extern void *avl_last(avl_tree_t *tree); +_AVL_H void *avl_first(avl_tree_t *tree); +_AVL_H void *avl_last(avl_tree_t *tree); /* @@ -239,7 +239,7 @@ extern void *avl_last(avl_tree_t *tree); * else * less = avl_nearest(tree, where, AVL_BEFORE); */ -extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); +_AVL_H void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); /* @@ -249,7 +249,7 @@ extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); * * node - the node to add */ -extern void avl_add(avl_tree_t *tree, void *node); +_AVL_H void avl_add(avl_tree_t *tree, void *node); /* @@ -257,22 +257,33 @@ extern void avl_add(avl_tree_t *tree, void *node); * * node - the node to remove */ -extern void avl_remove(avl_tree_t *tree, void *node); +_AVL_H void avl_remove(avl_tree_t *tree, void *node); + +/* + * Reinsert a node only if its order has changed relative to its nearest + * neighbors. To optimize performance avl_update_lt() checks only the previous + * node and avl_update_gt() checks only the next node. Use avl_update_lt() and + * avl_update_gt() only if you know the direction in which the order of the + * node may change. + */ +_AVL_H boolean_t avl_update(avl_tree_t *, void *); +_AVL_H boolean_t avl_update_lt(avl_tree_t *, void *); +_AVL_H boolean_t avl_update_gt(avl_tree_t *, void *); /* * Swaps the contents of the two trees. */ -extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); +_AVL_H void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); /* * Return the number of nodes in the tree */ -extern ulong_t avl_numnodes(avl_tree_t *tree); +_AVL_H ulong_t avl_numnodes(avl_tree_t *tree); /* * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise. */ -extern boolean_t avl_is_empty(avl_tree_t *tree); +_AVL_H boolean_t avl_is_empty(avl_tree_t *tree); /* * Used to destroy any remaining nodes in a tree. 
The cookie argument should @@ -295,7 +306,7 @@ extern boolean_t avl_is_empty(avl_tree_t *tree); * free(node); * avl_destroy(tree); */ -extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); +_AVL_H void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); /* @@ -303,7 +314,7 @@ extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); * * tree - the empty tree to destroy */ -extern void avl_destroy(avl_tree_t *tree); +_AVL_H void avl_destroy(avl_tree_t *tree); diff --git a/include/sys/avl_impl.h b/include/sys/avl_impl.h index fddf76906d..c464a62a1c 100644 --- a/include/sys/avl_impl.h +++ b/include/sys/avl_impl.h @@ -25,8 +25,7 @@ */ #ifndef _AVL_IMPL_H -#define _AVL_IMPL_H - +#define _AVL_IMPL_H extern __attribute__((visibility("default"))) /* @@ -148,14 +147,16 @@ struct avl_tree { int (*avl_compar)(const void *, const void *); size_t avl_offset; /* offsetof(type, avl_link_t field) */ ulong_t avl_numnodes; /* number of nodes in the tree */ - size_t avl_size; /* sizeof user type struct */ +#ifndef _KERNEL + size_t avl_pad; /* For backwards ABI compatibility. */ +#endif }; /* * This will only by used via AVL_NEXT() or AVL_PREV() */ -extern void *avl_walk(struct avl_tree *, void *, int); +_AVL_IMPL_H void *avl_walk(struct avl_tree *, void *, int); #ifdef __cplusplus } diff --git a/include/sys/bitops.h b/include/sys/bitops.h new file mode 100644 index 0000000000..69d07d7655 --- /dev/null +++ b/include/sys/bitops.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + */ + +#ifndef _SYS_BITOPS_H +#define _SYS_BITOPS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. 
+ */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1U << (len)); \ + ASSERT3U(low + len, <=, 32); \ + (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ +} while (0) + +#define BF64_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1ULL << (len)); \ + ASSERT3U(low + len, <=, 64); \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ +} while (0) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +/* + * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in + * the case where val is a constant. We can't fix ASSERT because it's used as + * an expression in several places in the kernel. + */ +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +} while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +} while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITOPS_H */ diff --git a/include/sys/bplist.h b/include/sys/bplist.h index 471be9047e..f8deaf8437 100644 --- a/include/sys/bplist.h +++ b/include/sys/bplist.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. 
All rights reserved. */ #ifndef _SYS_BPLIST_H @@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl); void bplist_append(bplist_t *bpl, const blkptr_t *bp); void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx); +void bplist_clear(bplist_t *bpl); #ifdef __cplusplus } diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index d425e239f6..16e403526c 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_BPOBJ_H @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -48,10 +49,12 @@ typedef struct bpobj_phys { uint64_t bpo_uncomp; uint64_t bpo_subobjs; uint64_t bpo_num_subobjs; + uint64_t bpo_num_freed; } bpobj_phys_t; #define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) #define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t)) typedef struct bpobj { kmutex_t bpo_lock; @@ -60,12 +63,14 @@ typedef struct bpobj { int bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; + uint8_t bpo_havefreed; bpobj_phys_t *bpo_phys; dmu_buf_t *bpo_dbuf; dmu_buf_t *bpo_cached_dbuf; } bpobj_t; -typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); @@ -77,10 +82,13 @@ void bpobj_close(bpobj_t *bpo); boolean_t bpobj_is_open(const bpobj_t *bpo); int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); -int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *); +int 
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, + void *arg, int64_t start); void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); -void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); @@ -88,6 +96,9 @@ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); boolean_t bpobj_is_empty(bpobj_t *bpo); +int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); + #ifdef __cplusplus } #endif diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h index 63722df1bb..797aecd791 100644 --- a/include/sys/bqueue.h +++ b/include/sys/bqueue.h @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2018 by Delphix. All rights reserved. */ #ifndef _BQUEUE_H @@ -32,6 +32,7 @@ typedef struct bqueue { kcondvar_t bq_pop_cv; uint64_t bq_size; uint64_t bq_maxsize; + uint64_t bq_fill_fraction; size_t bq_node_offset; } bqueue_t; @@ -41,9 +42,10 @@ typedef struct bqueue_node { } bqueue_node_t; -int bqueue_init(bqueue_t *, uint64_t, size_t); +int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); void bqueue_destroy(bqueue_t *); void bqueue_enqueue(bqueue_t *, void *, uint64_t); +void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); void *bqueue_dequeue(bqueue_t *); boolean_t bqueue_empty(bqueue_t *); diff --git a/include/sys/btree.h b/include/sys/btree.h new file mode 100644 index 0000000000..3b53476c7c --- /dev/null +++ b/include/sys/btree.h @@ -0,0 +1,243 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#ifndef _BTREE_H +#define _BTREE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * This file defines the interface for a B-Tree implementation for ZFS. The + * tree can be used to store arbitrary sortable data types with low overhead + * and good operation performance. In addition the tree intelligently + * optimizes bulk in-order insertions to improve memory use and performance. + * + * Note that for all B-Tree functions, the values returned are pointers to the + * internal copies of the data in the tree. The internal data can only be + * safely mutated if the changes cannot change the ordering of the element + * with respect to any other elements in the tree. + * + * The major drawback of the B-Tree is that any returned elements or indexes + * are only valid until a side-effectful operation occurs, since these can + * result in reallocation or relocation of data. Side effectful operations are + * defined as insertion, removal, and zfs_btree_destroy_nodes. + * + * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core + * nodes have an array of children pointing to other nodes, and an array of + * elements that act as separators between the elements of the subtrees rooted + * at its children. Leaf nodes only contain data elements, and form the bottom + * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the + * elements in the core nodes are not copies of or references to leaf node + * elements. Each element occurs only once in the tree, no matter what kind + * of node it is in. 
+ * + * The tree's height is the same throughout, unlike many other forms of search + * tree. Each node (except for the root) must be between half minus one and + * completely full of elements (and children) at all times. Any operation that + * would put the node outside of that range results in a rebalancing operation + * (taking, merging, or splitting). + * + * This tree was implemented using descriptions from Wikipedia's articles on + * B-Trees and B+ Trees. + */ + +/* + * Decreasing these values results in smaller memmove operations, but more of + * them, and increased memory overhead. Increasing these values results in + * higher variance in operation time, and reduces memory overhead. + */ +#define BTREE_CORE_ELEMS 128 +#define BTREE_LEAF_SIZE 4096 + +extern kmem_cache_t *zfs_btree_leaf_cache; + +typedef struct zfs_btree_hdr { + struct zfs_btree_core *bth_parent; + boolean_t bth_core; + /* + * For both leaf and core nodes, represents the number of elements in + * the node. For core nodes, they will have bth_count + 1 children. + */ + uint32_t bth_count; +} zfs_btree_hdr_t; + +typedef struct zfs_btree_core { + zfs_btree_hdr_t btc_hdr; + zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1]; + uint8_t btc_elems[]; +} zfs_btree_core_t; + +typedef struct zfs_btree_leaf { + zfs_btree_hdr_t btl_hdr; + uint8_t btl_elems[]; +} zfs_btree_leaf_t; + +typedef struct zfs_btree_index { + zfs_btree_hdr_t *bti_node; + uint64_t bti_offset; + /* + * True if the location is before the listed offset, false if it's at + * the listed offset. + */ + boolean_t bti_before; +} zfs_btree_index_t; + +typedef struct btree { + zfs_btree_hdr_t *bt_root; + int64_t bt_height; + size_t bt_elem_size; + uint64_t bt_num_elems; + uint64_t bt_num_nodes; + zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading + int (*bt_compar) (const void *, const void *); +} zfs_btree_t; + +/* + * Allocate and deallocate caches for btree nodes.
+ */ +void zfs_btree_init(void); +void zfs_btree_fini(void); + +/* + * Initialize a B-Tree. Arguments are: + * + * tree - the tree to be initialized + * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 + * -1 for <, 0 for ==, and +1 for > + * size - the value of sizeof(struct my_type) + */ +void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), + size_t); + +/* + * Find a node with a matching value in the tree. Returns the matching node + * found. If not found, it returns NULL and then if "where" is not NULL it sets + * "where" for use with zfs_btree_add_idx() or zfs_btree_nearest(). + * + * node - node that has the value being looked for + * where - position for use with zfs_btree_nearest() or zfs_btree_add_idx(), + * may be NULL + */ +void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *); + +/* + * Insert a node into the tree. + * + * node - the node to insert + * where - position as returned from zfs_btree_find() + */ +void zfs_btree_add_idx(zfs_btree_t *, const void *, const zfs_btree_index_t *); + +/* + * Return the first or last valued node in the tree. Will return NULL if the + * tree is empty. The index can be NULL if the location of the first or last + * element isn't required. + */ +void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *); +void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Return the next or previous valued node in the tree. The second index can + * safely be NULL, if the location of the next or previous value isn't + * required. + */ +void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); +void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); + +/* + * Get a value from a tree and an index. + */ +void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Add a single value to the tree. The value must not compare equal to any + * other node already in the tree.
Note that the value will be copied out, not + * inserted directly. It is safe to free or destroy the value once this + * function returns. + */ +void zfs_btree_add(zfs_btree_t *, const void *); + +/* + * Remove a single value from the tree. The value must be in the tree. The + * pointer passed in may be a pointer into a tree-controlled buffer, but it + * need not be. + */ +void zfs_btree_remove(zfs_btree_t *, const void *); + +/* + * Remove the value at the given location from the tree. + */ +void zfs_btree_remove_idx(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Return the number of nodes in the tree + */ +ulong_t zfs_btree_numnodes(zfs_btree_t *); + +/* + * Used to destroy any remaining nodes in a tree. The cookie argument should + * be initialized to NULL before the first call. Returns a node that has been + * removed from the tree and may be free()'d. Returns NULL when the tree is + * empty. + * + * Once you call zfs_btree_destroy_nodes(), you can only continue calling it + * and finally zfs_btree_destroy(). No other B-Tree routines will be valid. + * + * cookie - an index used to save state between calls to + * zfs_btree_destroy_nodes() + * + * EXAMPLE: + * zfs_btree_t *tree; + * struct my_data *node; + * zfs_btree_index_t *cookie; + * + * cookie = NULL; + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) + * data_destroy(node); + * zfs_btree_destroy(tree); + */ +void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **); + +/* + * Destroys all nodes in the tree quickly. This doesn't give the caller an + * opportunity to iterate over each node and do its own cleanup; for that, use + * zfs_btree_destroy_nodes(). + */ +void zfs_btree_clear(zfs_btree_t *); + +/* + * Final destroy of a B-Tree. Arguments are: + * + * tree - the empty tree to destroy + */ +void zfs_btree_destroy(zfs_btree_t *tree); + +/* Runs a variety of self-checks on the btree to verify integrity.
*/ +void zfs_btree_verify(zfs_btree_t *tree); + +#ifdef __cplusplus +} +#endif + +#endif /* _BTREE_H */ diff --git a/include/sys/crypto/Makefile.am b/include/sys/crypto/Makefile.am index 7f8156b8f4..eb31f6a457 100644 --- a/include/sys/crypto/Makefile.am +++ b/include/sys/crypto/Makefile.am @@ -1,20 +1,16 @@ COMMON_H = \ - $(top_srcdir)/include/sys/crypto/api.h \ - $(top_srcdir)/include/sys/crypto/common.h \ - $(top_srcdir)/include/sys/crypto/icp.h - -KERNEL_H = - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + api.h \ + common.h \ + icp.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys/crypto -libzfs_HEADERS = $(COMMON_H) $(USER_H) +libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL +if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/crypto -kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +kernel_HEADERS = $(COMMON_H) +endif endif diff --git a/include/sys/crypto/api.h b/include/sys/crypto/api.h index 7c3c465513..8aecfeaff0 100644 --- a/include/sys/crypto/api.h +++ b/include/sys/crypto/api.h @@ -58,7 +58,7 @@ typedef struct { */ #define CRYPTO_MECH_INVALID ((uint64_t)-1) -extern crypto_mech_type_t crypto_mech2id(crypto_mech_name_t name); +extern crypto_mech_type_t crypto_mech2id(char *name); /* * Create and destroy context templates. 
diff --git a/include/sys/crypto/common.h b/include/sys/crypto/common.h index a4f9d9848c..9a239225cd 100644 --- a/include/sys/crypto/common.h +++ b/include/sys/crypto/common.h @@ -244,7 +244,7 @@ typedef struct crypto_data { iovec_t cdu_raw; /* Pointer and length */ /* uio scatter-gather format */ - uio_t *cdu_uio; + zfs_uio_t *cdu_uio; } cdu; /* Crypto Data Union */ } crypto_data_t; diff --git a/include/sys/dataset_kstats.h b/include/sys/dataset_kstats.h index 667d1b85fa..b165b98576 100644 --- a/include/sys/dataset_kstats.h +++ b/include/sys/dataset_kstats.h @@ -27,18 +27,18 @@ #ifndef _SYS_DATASET_KSTATS_H #define _SYS_DATASET_KSTATS_H -#include +#include #include #include -typedef struct dataset_aggsum_stats_t { - aggsum_t das_writes; - aggsum_t das_nwritten; - aggsum_t das_reads; - aggsum_t das_nread; - aggsum_t das_nunlinks; - aggsum_t das_nunlinked; -} dataset_aggsum_stats_t; +typedef struct dataset_sum_stats_t { + wmsum_t dss_writes; + wmsum_t dss_nwritten; + wmsum_t dss_reads; + wmsum_t dss_nread; + wmsum_t dss_nunlinks; + wmsum_t dss_nunlinked; +} dataset_sum_stats_t; typedef struct dataset_kstat_values { kstat_named_t dkv_ds_name; @@ -59,7 +59,7 @@ typedef struct dataset_kstat_values { } dataset_kstat_values_t; typedef struct dataset_kstats { - dataset_aggsum_stats_t dk_aggsums; + dataset_sum_stats_t dk_sums; kstat_t *dk_kstats; } dataset_kstats_t; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index eea9e265b0..89422659d0 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -108,6 +108,12 @@ typedef enum override_states { DR_OVERRIDDEN } override_states_t; +typedef enum db_lock_type { + DLT_NONE, + DLT_PARENT, + DLT_OBJSET +} db_lock_type_t; + typedef struct dbuf_dirty_record { /* link on our parents dirty list */ list_node_t dr_dirty_node; @@ -121,8 +127,18 @@ typedef struct dbuf_dirty_record { /* pointer back to our dbuf */ struct dmu_buf_impl *dr_dbuf; - /* pointer to next dirty record */ - struct dbuf_dirty_record *dr_next; + /* list link for dbuf dirty records */ + list_node_t dr_dbuf_node; + + /* + * The dnode we are part of. Note that the dnode can not be moved or + * evicted due to the hold that's added by dnode_setdirty() or + * dmu_objset_sync_dnodes(), and released by dnode_rele_task() or + * userquota_updates_task(). This hold is necessary for + * dirty_lightweight_leaf-type dirty records, which don't have a hold + * on a dbuf. + */ + dnode_t *dr_dnode; /* pointer to parent dirty record */ struct dbuf_dirty_record *dr_parent; @@ -165,6 +181,17 @@ typedef struct dbuf_dirty_record { uint8_t dr_iv[ZIO_DATA_IV_LEN]; uint8_t dr_mac[ZIO_DATA_MAC_LEN]; } dl; + struct dirty_lightweight_leaf { + /* + * This dirty record refers to a leaf (level=0) + * block, whose dbuf has not been instantiated for + * performance reasons. + */ + uint64_t dr_blkid; + abd_t *dr_abd; + zio_prop_t dr_props; + enum zio_flag dr_flags; + } dll; } dt; } dbuf_dirty_record_t; @@ -200,6 +227,13 @@ typedef struct dmu_buf_impl { */ struct dmu_buf_impl *db_hash_next; + /* + * Our link on the owner dnodes's dn_dbufs list. + * Protected by its dn_dbufs_mtx. Should be on the same cache line + * as db_level and db_blkid for the best avl_add() performance. 
+ */ + avl_node_t db_link; + /* our block number */ uint64_t db_blkid; @@ -217,6 +251,22 @@ typedef struct dmu_buf_impl { */ uint8_t db_level; + /* + * Protects db_buf's contents if they contain an indirect block or data + * block of the meta-dnode. We use this lock to protect the structure of + * the block tree. This means that when modifying this dbuf's data, we + * grab its rwlock. When modifying its parent's data (including the + * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering + * for this lock is: + * 1) dn_struct_rwlock + * 2) db_rwlock + * We don't currently grab multiple dbufs' db_rwlocks at once. + */ + krwlock_t db_rwlock; + + /* buffer holding our data */ + arc_buf_t *db_buf; + /* db_mtx protects the members below */ kmutex_t db_mtx; @@ -232,20 +282,11 @@ typedef struct dmu_buf_impl { */ zfs_refcount_t db_holds; - /* buffer holding our data */ - arc_buf_t *db_buf; - kcondvar_t db_changed; dbuf_dirty_record_t *db_data_pending; - /* pointer to most recent dirty record for this buffer */ - dbuf_dirty_record_t *db_last_dirty; - - /* - * Our link on the owner dnodes's dn_dbufs list. - * Protected by its dn_dbufs_mtx. - */ - avl_node_t db_link; + /* List of dirty records for the buffer sorted newest to oldest. 
*/ + list_t db_dirty_records; /* Link in dbuf_cache or dbuf_metadata_cache */ multilist_node_t db_cache_link; @@ -281,14 +322,16 @@ typedef struct dmu_buf_impl { } dmu_buf_impl_t; /* Note: the dbuf hash table is exposed only for the mdb module */ -#define DBUF_MUTEXES 8192 +#define DBUF_MUTEXES 2048 #define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) typedef struct dbuf_hash_table { uint64_t hash_table_mask; dmu_buf_impl_t **hash_table; - kmutex_t hash_mutexes[DBUF_MUTEXES]; + kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned; } dbuf_hash_table_t; +typedef void (*dbuf_prefetch_fn)(void *, boolean_t); + uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level, const uint64_t offset); @@ -304,7 +347,10 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp); -void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, +int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, + void *arg); +int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); @@ -324,18 +370,24 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, + dmu_tx_t *tx); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); +int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, + const struct zio_prop *zp, enum 
zio_flag flags, dmu_tx_t *tx); + +void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx); void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); void dbuf_release_bp(dmu_buf_impl_t *db); - -boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); +db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag); +void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag); void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); @@ -345,6 +397,9 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); void dbuf_stats_init(dbuf_hash_table_t *hash); void dbuf_stats_destroy(void); +int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, + blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift); + #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) #define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) @@ -356,6 +411,29 @@ void dbuf_fini(void); boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); +static inline dbuf_dirty_record_t * +dbuf_find_dirty_lte(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr; + + for (dr = list_head(&db->db_dirty_records); + dr != NULL && dr->dr_txg > txg; + dr = list_next(&db->db_dirty_records, dr)) + continue; + return (dr); +} + +static inline dbuf_dirty_record_t * +dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr; + + dr = dbuf_find_dirty_lte(db, txg); + if (dr && dr->dr_txg == txg) + return (dr); + return (NULL); +} + #define DBUF_GET_BUFC_TYPE(_db) \ (dbuf_is_metadata(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) @@ -387,7 +465,7 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); char __db_buf[32]; \ uint64_t __db_obj = (dbuf)->db.db_object; \ if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ + (void) strlcpy(__db_buf, "mdn", sizeof (__db_buf)); \ else \ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ (u_longlong_t)__db_obj); \ @@ -396,7 +474,7 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); __db_buf, (dbuf)->db_level, \ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ @@ -405,7 +483,7 @@ _NOTE(CONSTCOND) } while (0) dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define DBUF_VERIFY(db) dbuf_verify(db) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index fb1445d8d4..25be6f56dd 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -103,6 +103,10 @@ typedef struct ddt_phys { uint64_t ddp_phys_birth; } ddt_phys_t; +/* + * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, + * we maintain the ability to free existing dedup-ditto blocks. 
+ */ enum ddt_phys_type { DDT_PHYS_DITTO = 0, DDT_PHYS_SINGLE = 1, @@ -175,18 +179,18 @@ typedef struct ddt_ops { int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; -#define DDT_NAMELEN 80 +#define DDT_NAMELEN 107 extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, - enum ddt_class class, char *name); + enum ddt_class clazz, char *name); extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, - enum ddt_class class, uint64_t *walk, ddt_entry_t *dde); + enum ddt_class clazz, uint64_t *walk, ddt_entry_t *dde); extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, - enum ddt_class class, uint64_t *count); + enum ddt_class clazz, uint64_t *count); extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, - enum ddt_class class, dmu_object_info_t *); + enum ddt_class clazz, dmu_object_info_t *); extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, - enum ddt_class class); + enum ddt_class clazz); extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg); @@ -216,10 +220,6 @@ extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); extern uint64_t ddt_get_dedup_dspace(spa_t *spa); extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); -extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, - ddt_phys_t *ddp_willref); -extern int ddt_ditto_copies_present(ddt_entry_t *dde); - extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); @@ -246,7 +246,7 @@ extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, - enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx); + enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx); extern const ddt_ops_t ddt_zap_ops; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 
88c8361717..942ab9b108 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. @@ -49,6 +49,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -141,9 +142,6 @@ typedef enum dmu_object_byteswap { #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) -#define DMU_OT_IS_ZIL(ot) \ - ((ot) == DMU_OT_INTENT_LOG) - /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) @@ -336,13 +334,11 @@ int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); -int dmu_objset_snapshot_tmp(const char *, const char *, int); -int dmu_objset_find(char *name, int func(const char *, void *), void *arg, +int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); -int dmu_objset_remap_indirects(const char *fsname); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ @@ -383,6 +379,8 @@ typedef struct dmu_buf { #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" +#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" 
+#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. The range of object numbers @@ -465,7 +463,7 @@ int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, /* * Set the data blocksize for an object. * - * The object cannot have any blocks allcated beyond the first. If + * The object cannot have any blocks allocated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. @@ -498,12 +496,11 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); - -int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg); - void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); +void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. @@ -562,9 +559,13 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); +int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags); - +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, + uint64_t length, boolean_t read, void *tag, int *numbufsp, + dmu_buf_t ***dbpp, uint32_t flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. 
@@ -666,7 +667,8 @@ typedef struct dmu_buf_user { /*ARGSUSED*/ static inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, - dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp) + dmu_buf_evict_func_t *evict_func_async, + dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { ASSERT(dbu->dbu_evict_func_sync == NULL); ASSERT(dbu->dbu_evict_func_async == NULL); @@ -775,7 +777,6 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); -void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); @@ -845,15 +846,14 @@ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL -#include -int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); -int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); -int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); -int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, +int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); +int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); +int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size); +int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); -int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, +int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); -int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, +int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); #endif struct 
arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); @@ -863,20 +863,6 @@ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf -void dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, - dmu_buf_t *handle, dmu_tx_t *tx); -#ifdef HAVE_UIO_ZEROCOPY -int dmu_xuio_init(struct xuio *uio, int niov); -void dmu_xuio_fini(struct xuio *uio); -int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, - size_t n); -int dmu_xuio_cnt(struct xuio *uio); -struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); -void dmu_xuio_clear(struct xuio *uio, int i); -#endif /* HAVE_UIO_ZEROCOPY */ -void xuio_stat_wbuf_copied(void); -void xuio_stat_wbuf_nocopy(void); - extern int zfs_prefetch_disable; extern int zfs_max_recordsize; @@ -937,7 +923,7 @@ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about - * the size. This is specifically optimized for zfs_getattr(). + * the size. 
*/ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); @@ -951,6 +937,7 @@ typedef struct dmu_objset_stats { dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; + uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; @@ -1004,18 +991,26 @@ extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); +extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); -extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, +extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); -typedef int objset_used_cb_t(dmu_object_type_t bonustype, - void *bonus, uint64_t *userp, uint64_t *groupp, uint64_t *projectp); +typedef struct zfs_file_info { + uint64_t zfi_user; + uint64_t zfi_group; + uint64_t zfi_project; + uint64_t zfi_generation; +} zfs_file_info_t; + +typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, + struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, - objset_used_cb_t *cb); + file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); @@ -1042,7 +1037,7 @@ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; - struct locked_range *zgd_lr; + struct zfs_locked_range *zgd_lr; void *zgd_private; } zgd_t; @@ -1068,7 +1063,7 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void 
*arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, - struct vnode *vp, offset_t *offp); + zfs_file_t *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 5e1901da4a..def4aadba1 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -24,7 +24,7 @@ */ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -164,6 +164,7 @@ extern "C" { * dn_dirty_txg * dd_assigned_tx * dn_notxholds + * dn_nodnholds * dn_dirtyctx * dn_dirtyctx_firstset * (dn_phys copy fields?) @@ -236,46 +237,13 @@ extern "C" { struct objset; struct dmu_pool; -typedef struct dmu_xuio { - int next; - int cnt; - struct arc_buf **bufs; - iovec_t *iovp; -} dmu_xuio_t; - -/* - * The list of data whose inclusion in a send stream can be pending from - * one call to backup_cb to another. Multiple calls to dump_free() and - * dump_freeobjects() can be aggregated into a single DRR_FREE or - * DRR_FREEOBJECTS replay record. 
- */ -typedef enum { - PENDING_NONE, - PENDING_FREE, - PENDING_FREEOBJECTS -} dmu_pendop_t; - -typedef struct dmu_sendarg { - list_node_t dsa_link; - dmu_replay_record_t *dsa_drr; - vnode_t *dsa_vp; - int dsa_outfd; - proc_t *dsa_proc; - offset_t *dsa_off; - objset_t *dsa_os; - zio_cksum_t dsa_zc; - uint64_t dsa_toguid; - uint64_t dsa_fromtxg; - int dsa_err; - dmu_pendop_t dsa_pending_op; - uint64_t dsa_featureflags; - uint64_t dsa_last_data_object; - uint64_t dsa_last_data_offset; - uint64_t dsa_resume_object; - uint64_t dsa_resume_offset; - boolean_t dsa_sent_begin; - boolean_t dsa_sent_end; -} dmu_sendarg_t; +typedef struct dmu_sendstatus { + list_node_t dss_link; + int dss_outfd; + proc_t *dss_proc; + offset_t *dss_off; + uint64_t dss_blocks; /* blocks visited during the sending process */ +} dmu_sendstatus_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index c0650bcde9..e89ee64ea6 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -118,6 +118,7 @@ struct objset { uint64_t os_dnodesize; /* default dnode size for new objects */ enum zio_checksum os_checksum; enum zio_compress os_compress; + uint8_t os_complevel; uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; @@ -126,7 +127,7 @@ struct objset { zfs_cache_type_t os_secondary_cache; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; - int os_recordsize; + uint64_t os_recordsize; /* * The next four values are used as a cache of whatever's on disk, and * are initialized the first time these properties are queried. 
Before @@ -152,7 +153,7 @@ struct objset { /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ zil_header_t os_zil_header; - multilist_t *os_synced_dnodes; + multilist_t os_synced_dnodes; uint64_t os_flags; uint64_t os_freed_dnodes; boolean_t os_rescan_dnodes; @@ -171,7 +172,7 @@ struct objset { /* Protected by os_lock */ kmutex_t os_lock; - multilist_t *os_dirty_dnodes[TXG_SIZE]; + multilist_t os_dirty_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; @@ -241,10 +242,10 @@ objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_t **osp); void dmu_objset_evict(objset_t *os); -void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); +void dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx); void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); boolean_t dmu_objset_userused_enabled(objset_t *os); -int dmu_objset_userspace_upgrade(objset_t *os); +void dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); boolean_t dmu_objset_userobjused_enabled(objset_t *os); boolean_t dmu_objset_userobjspace_upgradable(objset_t *os); @@ -254,6 +255,8 @@ boolean_t dmu_objset_projectquota_enabled(objset_t *os); boolean_t dmu_objset_projectquota_present(objset_t *os); boolean_t dmu_objset_projectquota_upgradable(objset_t *os); void dmu_objset_id_quota_upgrade(objset_t *os); +int dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, + const void *data, zfs_file_info_t *zfi); int dmu_fsname(const char *snapname, char *buf); diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index ffa89249d3..7188b2a022 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. 
All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -33,6 +33,7 @@ #include #include #include +#include extern const char *recv_clone_name; @@ -44,28 +45,43 @@ typedef struct dmu_recv_cookie { const char *drc_tosnap; boolean_t drc_newfs; boolean_t drc_byteswap; + uint64_t drc_featureflags; boolean_t drc_force; boolean_t drc_resumable; + boolean_t drc_should_save; boolean_t drc_raw; boolean_t drc_clone; boolean_t drc_spill; - struct avl_tree *drc_guid_to_ds_map; nvlist_t *drc_keynvl; - zio_cksum_t drc_cksum; uint64_t drc_fromsnapobj; - uint64_t drc_newsnapobj; uint64_t drc_ivset_guid; void *drc_owner; cred_t *drc_cred; + proc_t *drc_proc; + nvlist_t *drc_begin_nvl; + + objset_t *drc_os; + zfs_file_t *drc_fp; /* The file to read the stream from */ + uint64_t drc_voff; /* The current offset in the stream */ + uint64_t drc_bytes_read; + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. + */ + struct receive_record_arg *drc_rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *drc_next_rrd; + zio_cksum_t drc_cksum; + zio_cksum_t drc_prev_cksum; + /* Sorted list of objects not to issue prefetches for. 
*/ + objlist_t *drc_ignore_objlist; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, - struct dmu_replay_record *drr_begin, boolean_t force, boolean_t resumable, - nvlist_t *localprops, nvlist_t *hidden_args, char *origin, - dmu_recv_cookie_t *drc); -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, - int cleanup_fd, uint64_t *action_handlep); -int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); -boolean_t dmu_objset_is_receiving(objset_t *os); +int dmu_recv_begin(char *, char *, dmu_replay_record_t *, + boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *, + dmu_recv_cookie_t *, zfs_file_t *, offset_t *); +int dmu_recv_stream(dmu_recv_cookie_t *, offset_t *); +int dmu_recv_end(dmu_recv_cookie_t *, void *); +boolean_t dmu_objset_is_receiving(objset_t *); #endif /* _DMU_RECV_H */ diff --git a/include/sys/dmu_redact.h b/include/sys/dmu_redact.h new file mode 100644 index 0000000000..85f4b05228 --- /dev/null +++ b/include/sys/dmu_redact.h @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
+ */ +#ifndef _DMU_REDACT_H_ +#define _DMU_REDACT_H_ + +#include +#include + +#define REDACT_BLOCK_MAX_COUNT (1ULL << 48) + +static inline uint64_t +redact_block_get_size(redact_block_phys_t *rbp) +{ + return (BF64_GET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, + 0)); +} + +static inline void +redact_block_set_size(redact_block_phys_t *rbp, uint64_t size) +{ + /* cppcheck-suppress syntaxError */ + BF64_SET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, 0, size); +} + +static inline uint64_t +redact_block_get_count(redact_block_phys_t *rbp) +{ + return (BF64_GET_SB((rbp)->rbp_size_count, 0, 48, 0, 1)); +} + +static inline void +redact_block_set_count(redact_block_phys_t *rbp, uint64_t count) +{ + /* cppcheck-suppress syntaxError */ + BF64_SET_SB((rbp)->rbp_size_count, 0, 48, 0, 1, count); +} + +int dmu_redact_snap(const char *, nvlist_t *, const char *); +#endif /* _DMU_REDACT_H_ */ diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index 2e4d54b4ff..d150f816c9 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
*/ @@ -31,23 +31,41 @@ #include #include +#include #include +#include +#include + +#define BEGINNV_REDACT_SNAPS "redact_snaps" +#define BEGINNV_REDACT_FROM_SNAPS "redact_from_snaps" +#define BEGINNV_RESUME_OBJECT "resume_object" +#define BEGINNV_RESUME_OFFSET "resume_offset" struct vnode; struct dsl_dataset; struct drr_begin; struct avl_tree; struct dmu_replay_record; - -int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, struct vnode *vp, offset_t *off); -int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, - boolean_t stream_compressed, uint64_t *sizep); -int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, - boolean_t stream_compressed, uint64_t *sizep); +struct dmu_send_outparams; +int +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, + boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff, + const char *redactbook, int outfd, offset_t *off, + struct dmu_send_outparams *dsop); +int dmu_send_estimate_fast(struct dsl_dataset *ds, struct dsl_dataset *fromds, + zfs_bookmark_phys_t *frombook, boolean_t stream_compressed, + boolean_t saved, uint64_t *sizep); int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - boolean_t rawok, int outfd, struct vnode *vp, offset_t *off); + boolean_t rawok, boolean_t savedok, int outfd, offset_t *off, + struct dmu_send_outparams *dso); + +typedef int (*dmu_send_outfunc_t)(objset_t *os, void *buf, int len, void *arg); +typedef struct dmu_send_outparams { + dmu_send_outfunc_t dso_outfunc; + void *dso_arg; + boolean_t dso_dryrun; +} dmu_send_outparams_t; #endif /* _DMU_SEND_H */ diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h index 8ceef5cf13..d76bfe3c9a 100644 
--- a/include/sys/dmu_traverse.h +++ b/include/sys/dmu_traverse.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -71,6 +71,20 @@ int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +/* + * Note that this calculation cannot overflow with the current maximum indirect + * block size (128k). If that maximum is increased to 1M, however, this + * calculation can overflow, and handling would need to be added to ensure + * continued correctness. + */ +static inline uint64_t +bp_span_in_blocks(uint8_t indblkshift, uint64_t level) +{ + unsigned int shift = level * (indblkshift - SPA_BLKPTRSHIFT); + ASSERT3U(shift, <, 64); + return (1ULL << shift); +} + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 36d205e950..71a9ac7ca7 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -32,7 +32,7 @@ #include #include #include -#include +#include #ifdef __cplusplus extern "C" { @@ -124,6 +124,7 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; + kstat_named_t dmu_tx_wrlog_over_max; kstat_named_t dmu_tx_dirty_frees_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index 8125d07062..4c220b0c79 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 
*/ #ifndef _DMU_ZFETCH_H @@ -40,33 +40,47 @@ extern unsigned long zfetch_array_rd_sz; struct dnode; /* so we can reference dnode */ +typedef struct zfetch { + kmutex_t zf_lock; /* protects zfetch structure */ + list_t zf_stream; /* list of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ + int zf_numstreams; /* number of zstream_t's */ +} zfetch_t; + typedef struct zstream { uint64_t zs_blkid; /* expect next access at this blkid */ - uint64_t zs_pf_blkid; /* next block to prefetch */ + uint64_t zs_pf_blkid1; /* first block to prefetch */ + uint64_t zs_pf_blkid; /* block to prefetch up to */ /* * We will next prefetch the L1 indirect block of this level-0 * block id. */ - uint64_t zs_ipf_blkid; + uint64_t zs_ipf_blkid1; /* first block to prefetch */ + uint64_t zs_ipf_blkid; /* block to prefetch up to */ - kmutex_t zs_lock; /* protects stream */ - hrtime_t zs_atime; /* time last prefetch issued */ list_node_t zs_node; /* link for zf_stream */ + hrtime_t zs_atime; /* time last prefetch issued */ + zfetch_t *zs_fetch; /* parent fetch */ + boolean_t zs_missed; /* stream saw cache misses */ + zfs_refcount_t zs_callers; /* number of pending callers */ + /* + * Number of stream references: dnode, callers and pending blocks. + * The stream memory is freed when the number returns to zero. 
+ */ + zfs_refcount_t zs_refs; } zstream_t; -typedef struct zfetch { - krwlock_t zf_rwlock; /* protects zfetch structure */ - list_t zf_stream; /* list of zstream_t's */ - struct dnode *zf_dnode; /* dnode that owns this zfetch */ -} zfetch_t; - void zfetch_init(void); void zfetch_fini(void); void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t); +zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, + boolean_t); +void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, + boolean_t); #ifdef __cplusplus diff --git a/include/sys/dnode.h b/include/sys/dnode.h index c60258bbc7..3f5fcc958c 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -46,6 +46,7 @@ extern "C" { */ #define DNODE_MUST_BE_ALLOCATED 1 #define DNODE_MUST_BE_FREE 2 +#define DNODE_DRY_RUN 4 /* * dnode_next_offset() flags. @@ -170,7 +171,7 @@ enum dnode_dirtycontext { * example, reading 32 dnodes from a 16k dnode block and all of the spill * blocks could issue 33 separate reads. Now suppose those dnodes have size * 1024 and therefore don't need spill blocks. Then the worst case number - * of blocks read is reduced to from 33 to two--one per dnode block. + * of blocks read is reduced from 33 to two--one per dnode block. * * ZFS-on-Linux systems that make heavy use of extended attributes benefit * from this feature. In particular, ZFS-on-Linux supports the xattr=sa @@ -231,8 +232,8 @@ typedef struct dnode_phys { * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This * allows us to protect any fields that might be added here in the * future. In either case, developers will want to check - * zio_crypt_init_uios_dnode() to ensure the new field is being - * protected properly. 
+ * zio_crypt_init_uios_dnode() and zio_crypt_do_dnode_hmac_updates() + * to ensure the new field is being protected and updated properly. */ uint64_t dn_pad3[4]; @@ -331,8 +332,9 @@ struct dnode { uint64_t dn_assigned_txg; uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ kcondvar_t dn_notxholds; + kcondvar_t dn_nodnholds; enum dnode_dirtycontext dn_dirtyctx; - uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ + void *dn_dirtyctx_firstset; /* dbg: contents meaningless */ /* protected by own devices */ zfs_refcount_t dn_tx_holds; @@ -371,6 +373,13 @@ struct dnode { struct zfetch dn_zfetch; }; +/* + * Since AVL already has embedded element counter, use dn_dbufs_count + * only for dbufs not counted there (bonus buffers) and just add them. + */ +#define DN_DBUFS_COUNT(dn) ((dn)->dn_dbufs_count + \ + avl_numnodes(&(dn)->dn_dbufs)) + /* * We use this (otherwise unused) bit to indicate if the value of * dn_next_maxblkid[txgoff] is valid to use in dnode_sync(). @@ -415,7 +424,10 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); +int dnode_try_claim(objset_t *os, uint64_t object, int slots); +boolean_t dnode_is_dirty(dnode_t *dn); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); @@ -440,7 +452,6 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); void dnode_free_interior_slots(dnode_t *dn); -boolean_t dnode_needs_remap(const dnode_t *dn); #define DNODE_IS_DIRTY(_dn) \ ((_dn)->dn_dirty_txg >= 
spa_syncing_txg((_dn)->dn_objset->os_spa)) @@ -532,11 +543,6 @@ typedef struct dnode_stats { * a range of dnode slots which would overflow the dnode_phys_t. */ kstat_named_t dnode_hold_free_overflow; - /* - * Number of times a dnode_hold(...) was attempted on a dnode - * which had already been unlinked in an earlier txg. - */ - kstat_named_t dnode_hold_free_txg; /* * Number of times dnode_free_interior_slots() needed to retry * acquiring a slot zrl lock due to contention. @@ -595,14 +601,14 @@ extern dnode_stats_t dnode_stats; char __db_buf[32]; \ uint64_t __db_obj = (dn)->dn_object; \ if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ + (void) strlcpy(__db_buf, "mdn", sizeof (__db_buf)); \ else \ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ (u_longlong_t)__db_obj);\ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ __db_buf, __VA_ARGS__); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define DNODE_VERIFY(dn) dnode_verify(dn) #define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h index 3cdad74414..70f4813449 100644 --- a/include/sys/dsl_bookmark.h +++ b/include/sys/dsl_bookmark.h @@ -13,22 +13,21 @@ * CDDL HEADER END */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_BOOKMARK_H #define _SYS_DSL_BOOKMARK_H #include +#include #include +#include #ifdef __cplusplus extern "C" { #endif -struct dsl_pool; -struct dsl_dataset; - /* * On disk zap object. 
*/ @@ -37,9 +36,11 @@ typedef struct zfs_bookmark_phys { uint64_t zbm_creation_txg; /* birth transaction group */ uint64_t zbm_creation_time; /* bookmark creation time */ - /* the following fields are reserved for redacted send / recv */ + /* fields used for redacted send / recv */ uint64_t zbm_redaction_obj; /* redaction list object */ uint64_t zbm_flags; /* ZBM_FLAG_* */ + + /* fields used for bookmark written size */ uint64_t zbm_referenced_bytes_refd; uint64_t zbm_compressed_bytes_refd; uint64_t zbm_uncompressed_bytes_refd; @@ -55,12 +56,99 @@ typedef struct zfs_bookmark_phys { #define BOOKMARK_PHYS_SIZE_V1 (3 * sizeof (uint64_t)) #define BOOKMARK_PHYS_SIZE_V2 (12 * sizeof (uint64_t)) +typedef enum zbm_flags { + ZBM_FLAG_HAS_FBN = (1 << 0), + ZBM_FLAG_SNAPSHOT_EXISTS = (1 << 1), +} zbm_flags_t; + +typedef struct redaction_list_phys { + uint64_t rlp_last_object; + uint64_t rlp_last_blkid; + uint64_t rlp_num_entries; + uint64_t rlp_num_snaps; + uint64_t rlp_snaps[]; /* variable length */ +} redaction_list_phys_t; + +typedef struct redaction_list { + dmu_buf_user_t rl_dbu; + redaction_list_phys_t *rl_phys; + dmu_buf_t *rl_dbuf; + uint64_t rl_object; + zfs_refcount_t rl_longholds; + objset_t *rl_mos; +} redaction_list_t; + +/* node in ds_bookmarks */ +typedef struct dsl_bookmark_node { + char *dbn_name; /* free with strfree() */ + kmutex_t dbn_lock; /* protects dirty/phys in block_killed */ + boolean_t dbn_dirty; /* in currently syncing txg */ + zfs_bookmark_phys_t dbn_phys; + avl_node_t dbn_node; +} dsl_bookmark_node_t; + +typedef struct redact_block_phys { + uint64_t rbp_object; + uint64_t rbp_blkid; + /* + * The top 16 bits of this field represent the block size in sectors of + * the blocks in question; the bottom 48 bits are used to store the + * number of consecutive blocks that are in the redaction list. They + * should be accessed using the inline functions below. 
+ */ + uint64_t rbp_size_count; + uint64_t rbp_padding; +} redact_block_phys_t; + +typedef int (*rl_traverse_callback_t)(redact_block_phys_t *, void *); + + +typedef struct dsl_bookmark_create_arg { + nvlist_t *dbca_bmarks; + nvlist_t *dbca_errors; +} dsl_bookmark_create_arg_t; + +typedef struct dsl_bookmark_create_redacted_arg { + const char *dbcra_bmark; + const char *dbcra_snap; + redaction_list_t **dbcra_rl; + uint64_t dbcra_numsnaps; + uint64_t *dbcra_snaps; + void *dbcra_tag; +} dsl_bookmark_create_redacted_arg_t; + int dsl_bookmark_create(nvlist_t *, nvlist_t *); +int dsl_bookmark_create_nvl_validate(nvlist_t *); +int dsl_bookmark_create_check(void *arg, dmu_tx_t *tx); +void dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx); +int dsl_bookmark_create_redacted(const char *, const char *, uint64_t, + uint64_t *, void *, redaction_list_t **); int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *); int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *); +int dsl_get_bookmark_props(const char *, const char *, nvlist_t *); int dsl_bookmark_destroy(nvlist_t *, nvlist_t *); int dsl_bookmark_lookup(struct dsl_pool *, const char *, struct dsl_dataset *, zfs_bookmark_phys_t *); +int dsl_bookmark_lookup_impl(dsl_dataset_t *, const char *, + zfs_bookmark_phys_t *); +int dsl_redaction_list_hold_obj(struct dsl_pool *, uint64_t, void *, + redaction_list_t **); +void dsl_redaction_list_rele(redaction_list_t *, void *); +void dsl_redaction_list_long_hold(struct dsl_pool *, redaction_list_t *, + void *); +void dsl_redaction_list_long_rele(redaction_list_t *, void *); +boolean_t dsl_redaction_list_long_held(redaction_list_t *); +int dsl_bookmark_init_ds(dsl_dataset_t *); +void dsl_bookmark_fini_ds(dsl_dataset_t *); +boolean_t dsl_bookmark_ds_destroyed(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_snapshotted(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_block_killed(dsl_dataset_t *, const blkptr_t *, dmu_tx_t *); +void dsl_bookmark_sync_done(dsl_dataset_t 
*, dmu_tx_t *); +void dsl_bookmark_node_add(dsl_dataset_t *, dsl_bookmark_node_t *, dmu_tx_t *); +uint64_t dsl_bookmark_latest_txg(dsl_dataset_t *); +int dsl_redaction_list_traverse(redaction_list_t *, zbookmark_phys_t *, + rl_traverse_callback_t, void *); +void dsl_bookmark_next_changed(dsl_dataset_t *, dsl_dataset_t *, dmu_tx_t *); #ifdef __cplusplus } diff --git a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h index c2c0a548a4..835720c878 100644 --- a/include/sys/dsl_crypt.h +++ b/include/sys/dsl_crypt.h @@ -189,7 +189,7 @@ void key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag); int spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag, dsl_crypto_key_t **dck_out); -int dsl_crypto_populate_key_nvlist(struct dsl_dataset *ds, +int dsl_crypto_populate_key_nvlist(struct objset *os, uint64_t from_ivset_guid, nvlist_t **nvl_out); int dsl_crypto_recv_raw_key_check(struct dsl_dataset *ds, nvlist_t *nvl, dmu_tx_t *tx); @@ -209,7 +209,6 @@ void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, dmu_tx_t *tx); -int dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd); uint64_t dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx); void dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index c464c70bd2..3c9199b861 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include @@ -45,11 +45,13 @@ extern "C" { #endif +extern int zfs_allow_redacted_dataset_mount; struct dsl_dataset; struct dsl_dir; struct dsl_pool; struct dsl_crypto_params; struct dsl_key_mapping; +struct zfs_bookmark_phys; #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ @@ -114,6 +116,13 @@ struct 
dsl_key_mapping; */ #define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" +/* + * We were receiving an incremental from a redaction bookmark, and these are the + * guids of its snapshots. + */ +#define DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS \ + "com.delphix:resume_redact_book_snaps" + /* * This field is set to the ivset guid for encrypted snapshots. This is used * for validating raw receives. @@ -176,7 +185,8 @@ typedef struct dsl_dataset { /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; - uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */ + uint64_t ds_bookmarks_obj; /* DMU_OTN_ZAP_METADATA */ + avl_tree_t ds_bookmarks; /* dsl_bookmark_node_t */ /* has internal locking: */ dsl_deadlist_t ds_deadlist; @@ -263,7 +273,7 @@ typedef struct dsl_dataset { static inline dsl_dataset_phys_t * dsl_dataset_phys(dsl_dataset_t *ds) { - return (ds->ds_dbuf->db_data); + return ((dsl_dataset_phys_t *)ds->ds_dbuf->db_data); } typedef struct dsl_dataset_promote_arg { @@ -274,6 +284,7 @@ typedef struct dsl_dataset_promote_arg { uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; nvlist_t *err_ds; cred_t *cr; + proc_t *proc; } dsl_dataset_promote_arg_t; typedef struct dsl_dataset_rollback_arg { @@ -288,6 +299,7 @@ typedef struct dsl_dataset_snapshot_arg { nvlist_t *ddsa_props; nvlist_t *ddsa_errors; cred_t *ddsa_cr; + proc_t *ddsa_proc; } dsl_dataset_snapshot_arg_t; /* @@ -304,6 +316,7 @@ typedef struct dsl_dataset_snapshot_arg { /* flags for holding the dataset */ typedef enum ds_hold_flags { + DS_HOLD_FLAG_NONE = 0 << 0, DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */ } ds_hold_flags_t; @@ -314,23 +327,27 @@ int dsl_dataset_hold_flags(struct dsl_pool *dp, const char *name, boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, void *tag); int dsl_dataset_create_key_mapping(dsl_dataset_t *ds); -int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, - 
dsl_dataset_t **); int dsl_dataset_hold_obj_flags(struct dsl_pool *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **); void dsl_dataset_remove_key_mapping(dsl_dataset_t *ds); -void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **); void dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); int dsl_dataset_own(struct dsl_pool *dp, const char *name, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_own_force(struct dsl_pool *dp, const char *name, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_own_obj_force(struct dsl_pool *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); void dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override); int dsl_dataset_namelen(dsl_dataset_t *ds); boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); -boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, struct dsl_crypto_params *, dmu_tx_t *); @@ -387,9 +404,11 @@ uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds); uint64_t dsl_get_referenced(dsl_dataset_t *ds); uint64_t dsl_get_numclones(dsl_dataset_t *ds); uint64_t dsl_get_inconsistent(dsl_dataset_t *ds); +uint64_t dsl_get_redacted(dsl_dataset_t *ds); uint64_t dsl_get_available(dsl_dataset_t *ds); int dsl_get_written(dsl_dataset_t *ds, uint64_t *written); int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap); +void dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval); int 
dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, char *source); @@ -401,8 +420,10 @@ void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); -int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, +int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *newds, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int dsl_dataset_space_written_bookmark(struct zfs_bookmark_phys *bmp, + dsl_dataset_t *newds, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); @@ -415,6 +436,8 @@ int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, uint64_t quota); int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, uint64_t reservation); +int dsl_dataset_set_compression(const char *dsname, zprop_source_t source, + uint64_t compression); boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, uint64_t earlier_txg); @@ -427,7 +450,7 @@ int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx); int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc); void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx); @@ -463,6 +486,9 @@ boolean_t dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f); boolean_t dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f, uint64_t *outlength, uint64_t **outp); +void dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, + uint64_t 
num_redact_snaps, dmu_tx_t *tx); + #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ @@ -471,7 +497,7 @@ boolean_t dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) #else #define dprintf_ds(dd, fmt, ...) #endif diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h index 08f38233d7..64358bb5fc 100644 --- a/include/sys/dsl_deadlist.h +++ b/include/sys/dsl_deadlist.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_DEADLIST_H @@ -28,12 +28,14 @@ #include #include +#include #ifdef __cplusplus extern "C" { #endif struct dmu_buf; +struct dsl_pool; struct dsl_dataset; typedef struct dsl_deadlist_phys { @@ -46,8 +48,10 @@ typedef struct dsl_deadlist_phys { typedef struct dsl_deadlist { objset_t *dl_os; uint64_t dl_object; - avl_tree_t dl_tree; + avl_tree_t dl_tree; /* contains dsl_deadlist_entry_t */ + avl_tree_t dl_cache; /* contains dsl_deadlist_cache_entry_t */ boolean_t dl_havetree; + boolean_t dl_havecache; struct dmu_buf *dl_dbuf; dsl_deadlist_phys_t *dl_phys; kmutex_t dl_lock; @@ -57,19 +61,49 @@ typedef struct dsl_deadlist { boolean_t dl_oldfmt; } dsl_deadlist_t; +typedef struct dsl_deadlist_cache_entry { + avl_node_t dlce_node; + uint64_t dlce_mintxg; + uint64_t dlce_bpobj; + uint64_t dlce_bytes; + uint64_t dlce_comp; + uint64_t dlce_uncomp; +} dsl_deadlist_cache_entry_t; + typedef struct dsl_deadlist_entry { avl_node_t dle_node; uint64_t dle_mintxg; bpobj_t dle_bpobj; } dsl_deadlist_entry_t; +typedef struct livelist_condense_entry { + struct dsl_dataset *ds; + dsl_deadlist_entry_t *first; + dsl_deadlist_entry_t *next; + boolean_t syncing; + boolean_t cancelled; +} 
livelist_condense_entry_t; + +extern unsigned long zfs_livelist_max_entries; +extern int zfs_livelist_min_percent_shared; + +typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); + void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); void dsl_deadlist_close(dsl_deadlist_t *dl); +void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg); uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); -void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); +void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, + boolean_t free, dmu_tx_t *tx); +int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, +dmu_tx_t *tx); +dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl); +dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl); uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, uint64_t mrs_obj, dmu_tx_t *tx); void dsl_deadlist_space(dsl_deadlist_t *dl, @@ -81,6 +115,11 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx); boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); +int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free, + zthr_t *t, uint64_t *size); +void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx); +void dsl_deadlist_discard_tree(dsl_deadlist_t *dl); #ifdef __cplusplus } diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h index bb28014ac3..7f46233a88 100644 --- 
a/include/sys/dsl_deleg.h +++ b/include/sys/dsl_deleg.h @@ -61,7 +61,6 @@ extern "C" { #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" -#define ZFS_DELEG_PERM_REMAP "remap" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" #define ZFS_DELEG_PERM_PROJECTUSED "projectused" diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h index ae3ca0cfbd..208d75bacf 100644 --- a/include/sys/dsl_destroy.h +++ b/include/sys/dsl_destroy.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -33,6 +33,7 @@ extern "C" { struct nvlist; struct dsl_dataset; +struct dsl_pool; struct dmu_tx; int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, @@ -45,6 +46,7 @@ int dsl_destroy_inconsistent(const char *, void *); int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t); void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *, boolean_t, struct dmu_tx *); +void dsl_dir_remove_clones_key(dsl_dir_t *, uint64_t, dmu_tx_t *); typedef struct dsl_destroy_snapshot_arg { const char *ddsa_name; diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 067bcfb6af..993e443544 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -29,18 +29,20 @@ #define _SYS_DSL_DIR_H #include +#include #include #include -#include +#include #include #include +#include #ifdef __cplusplus extern "C" { #endif struct dsl_dataset; - +struct zthr; /* * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. * They should be of the format :. @@ -49,7 +51,7 @@ struct dsl_dataset; #define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" #define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" -#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" +#define DD_FIELD_LIVELIST "com.delphix:livelist" typedef enum dd_used { DD_USED_HEAD, @@ -115,6 +117,15 @@ struct dsl_dir { /* amount of space we expect to write; == amount of dirty data */ int64_t dd_space_towrite[TXG_SIZE]; + dsl_deadlist_t dd_livelist; + bplist_t dd_pending_frees; + bplist_t dd_pending_allocs; + + kmutex_t dd_activity_lock; + kcondvar_t dd_activity_cv; + boolean_t dd_activity_cancelled; + uint64_t dd_activity_waiters; + /* protected by dd_lock; keep at end of struct for better locality */ char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; }; @@ -154,7 +165,6 @@ void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly); void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); -int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); @@ -164,18 +174,20 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); +void dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, + int64_t compressed, int64_t uncompressed, int64_t 
tonew, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation); int dsl_dir_activate_fs_ss_limit(const char *); int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, - cred_t *); + cred_t *, proc_t *); void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); -int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t); int dsl_dir_rename(const char *oldname, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, - uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *, proc_t *); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); @@ -185,6 +197,12 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); +void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); +void dsl_dir_livelist_close(dsl_dir_t *dd); +void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total); +int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, + boolean_t *waited); +void dsl_dir_cancel_waiters(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" @@ -200,7 +218,7 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) #else #define dprintf_dd(dd, fmt, ...) 
#endif diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 63ba3509a5..44900f8ceb 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -54,9 +55,11 @@ struct dsl_pool; struct dmu_tx; struct dsl_scan; struct dsl_crypto_params; +struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; +extern unsigned long zfs_wrlog_data_max; extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; @@ -95,7 +98,7 @@ typedef struct dsl_pool { struct dsl_dir *dp_leak_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; - struct taskq *dp_iput_taskq; + struct taskq *dp_zrele_taskq; struct taskq *dp_unlinked_drain_taskq; /* No lock needed - sync context only */ @@ -118,6 +121,9 @@ typedef struct dsl_pool { uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; + aggsum_t dp_wrlog_pertxg[TXG_SIZE]; + aggsum_t dp_wrlog_total; + /* * Time of most recently scheduled (furthest in the future) * wakeup for delayed transactions. 
@@ -157,6 +163,8 @@ int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); +void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); +boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); @@ -176,7 +184,7 @@ void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); boolean_t dsl_pool_config_held(dsl_pool_t *dp); boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp); -taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp); +taskq_t *dsl_pool_zrele_taskq(dsl_pool_t *dp); taskq_t *dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp); int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, diff --git a/include/sys/dsl_prop.h b/include/sys/dsl_prop.h index 62ef0ba67a..fba8f908dc 100644 --- a/include/sys/dsl_prop.h +++ b/include/sys/dsl_prop.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _SYS_DSL_PROP_H @@ -61,6 +62,12 @@ typedef struct dsl_props_arg { zprop_source_t pa_source; } dsl_props_arg_t; +typedef struct dsl_props_set_arg { + const char *dpsa_dsname; + zprop_source_t dpsa_source; + nvlist_t *dpsa_props; +} dsl_props_set_arg_t; + void dsl_prop_init(dsl_dir_t *dd); void dsl_prop_fini(dsl_dir_t *dd); int dsl_prop_register(struct dsl_dataset *ds, const char *propname, @@ -85,6 +92,8 @@ int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot); +int dsl_props_set_check(void *arg, dmu_tx_t *tx); +void dsl_props_set_sync(void *arg, dmu_tx_t *tx); void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, nvlist_t *props, dmu_tx_t *tx); void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 345d2754fb..fb1f1d65ba 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. */ #ifndef _SYS_DSL_SCAN_H @@ -42,6 +42,8 @@ struct dsl_dataset; struct dsl_pool; struct dmu_tx; +extern int zfs_scan_suspend_progress; + /* * All members of this structure must be uint64_t, for byteswap * purposes. 
@@ -138,6 +140,7 @@ typedef struct dsl_scan { /* per txg statistics */ uint64_t scn_visited_this_txg; /* total bps visited this txg */ + uint64_t scn_dedup_frees_this_txg; /* dedup bps freed this txg */ uint64_t scn_holes_this_txg; uint64_t scn_lt_min_this_txg; uint64_t scn_gt_max_this_txg; @@ -160,14 +163,18 @@ typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; void scan_init(void); void scan_fini(void); int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +int dsl_scan_setup_check(void *, dmu_tx_t *); +void dsl_scan_setup_sync(void *, dmu_tx_t *); void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); -void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); +void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx); diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h index da6c7a40da..5a5b306419 100644 --- a/include/sys/dsl_synctask.h +++ b/include/sys/dsl_synctask.h @@ -37,13 +37,15 @@ struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); +typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *); typedef enum zfs_space_check { /* - * Normal space check: if there is less than 3.2% free space, - * the operation will fail. Operations which are logically - * creating things should use this (e.g. "zfs create", "zfs snapshot"). - * User writes (via the ZPL / ZVOL) also fail at this point. 
+ * Normal space check: if there is less than 3.2% free space (bounded + * by spa_max_slop), the operation will fail. Operations which are + * logically creating things should use this (e.g. "zfs create", "zfs + * snapshot"). User writes (via the ZPL / ZVOL) also fail at this + * point. */ ZFS_SPACE_CHECK_NORMAL, @@ -111,11 +113,13 @@ void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *); int dsl_sync_task(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, void *, int, zfs_space_check_t); void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); + void *, dmu_tx_t *); int dsl_early_sync_task(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, void *, int, zfs_space_check_t); void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); + void *, dmu_tx_t *); +int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, + dsl_sigfunc_t *, void *, int, zfs_space_check_t); #ifdef __cplusplus } diff --git a/include/sys/efi_partition.h b/include/sys/efi_partition.h index 684b3e588a..cda2c98e5d 100644 --- a/include/sys/efi_partition.h +++ b/include/sys/efi_partition.h @@ -24,7 +24,7 @@ */ #ifndef _SYS_EFI_PARTITION_H -#define _SYS_EFI_PARTITION_H +#define _SYS_EFI_PARTITION_H extern __attribute__((visibility("default"))) #include @@ -297,11 +297,11 @@ typedef struct efi_gpe { * checksums, and perform any necessary byte-swapping to the on-disk * format. 
*/ -/* Solaris library abstraction for EFI partitons */ +/* Solaris library abstraction for EFI partitions */ typedef struct dk_part { diskaddr_t p_start; /* starting LBA */ diskaddr_t p_size; /* size in blocks */ - struct uuid p_guid; /* partion type GUID */ + struct uuid p_guid; /* partition type GUID */ ushort_t p_tag; /* converted to part'n type GUID */ ushort_t p_flag; /* attributes */ char p_name[EFI_PART_NAME_LEN]; /* partition name */ @@ -363,15 +363,15 @@ struct partition64 { #endif #ifndef _KERNEL -extern int efi_alloc_and_init(int, uint32_t, struct dk_gpt **); -extern int efi_alloc_and_read(int, struct dk_gpt **); -extern int efi_write(int, struct dk_gpt *); -extern int efi_rescan(int); -extern void efi_free(struct dk_gpt *); -extern int efi_type(int); -extern void efi_err_check(struct dk_gpt *); -extern int efi_auto_sense(int fd, struct dk_gpt **); -extern int efi_use_whole_disk(int fd); +_SYS_EFI_PARTITION_H int efi_debug; +_SYS_EFI_PARTITION_H int efi_alloc_and_init(int, uint32_t, struct dk_gpt **); +_SYS_EFI_PARTITION_H int efi_alloc_and_read(int, struct dk_gpt **); +_SYS_EFI_PARTITION_H int efi_write(int, struct dk_gpt *); +_SYS_EFI_PARTITION_H int efi_rescan(int); +_SYS_EFI_PARTITION_H void efi_free(struct dk_gpt *); +_SYS_EFI_PARTITION_H int efi_type(int); +_SYS_EFI_PARTITION_H void efi_err_check(struct dk_gpt *); +_SYS_EFI_PARTITION_H int efi_use_whole_disk(int fd); #endif #ifdef __cplusplus diff --git a/include/sys/fm/Makefile.am b/include/sys/fm/Makefile.am index 8bca5d8468..7c6c3d49b6 100644 --- a/include/sys/fm/Makefile.am +++ b/include/sys/fm/Makefile.am @@ -1,21 +1,17 @@ SUBDIRS = fs COMMON_H = \ - $(top_srcdir)/include/sys/fm/protocol.h \ - $(top_srcdir)/include/sys/fm/util.h - -KERNEL_H = - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + protocol.h \ + util.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys/fm -libzfs_HEADERS = $(COMMON_H) $(USER_H) +libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL +if BUILD_LINUX 
kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/fm -kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +kernel_HEADERS = $(COMMON_H) +endif endif diff --git a/include/sys/fm/fs/Makefile.am b/include/sys/fm/fs/Makefile.am index fdc9eb5455..a662753a9e 100644 --- a/include/sys/fm/fs/Makefile.am +++ b/include/sys/fm/fs/Makefile.am @@ -1,18 +1,14 @@ COMMON_H = \ - $(top_srcdir)/include/sys/fm/fs/zfs.h - -KERNEL_H = - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + zfs.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys/fm/fs -libzfs_HEADERS = $(COMMON_H) $(USER_H) +libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL +if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/fm/fs -kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +kernel_HEADERS = $(COMMON_H) +endif endif diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 9bfb123c76..cd080c8ee6 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2020 by Delphix. All rights reserved. 
+ */ + #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H @@ -88,6 +92,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY "zio_priority" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" @@ -105,6 +110,10 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" +#define FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME "snapshot_name" +#define FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME "device_name" +#define FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME "raw_name" +#define FM_EREPORT_PAYLOAD_ZFS_VOLUME "volume" #define FM_EREPORT_FAILMODE_WAIT "wait" #define FM_EREPORT_FAILMODE_CONTINUE "continue" @@ -114,6 +123,11 @@ extern "C" { #define FM_RESOURCE_AUTOREPLACE "autoreplace" #define FM_RESOURCE_STATECHANGE "statechange" +#define FM_RESOURCE_ZFS_SNAPSHOT_MOUNT "snapshot_mount" +#define FM_RESOURCE_ZFS_SNAPSHOT_UNMOUNT "snapshot_unmount" +#define FM_RESOURCE_ZVOL_CREATE_SYMLINK "zvol_create" +#define FM_RESOURCE_ZVOL_REMOVE_SYMLINK "zvol_remove" + #ifdef __cplusplus } #endif diff --git a/include/sys/fm/util.h b/include/sys/fm/util.h index ff54b05bb6..5fb6d1d607 100644 --- a/include/sys/fm/util.h +++ b/include/sys/fm/util.h @@ -31,6 +31,7 @@ extern "C" { #endif #include +#include /* * Shared user/kernel definitions for class length, error channel name, @@ -92,18 +93,20 @@ typedef struct zfs_zevent { extern void fm_init(void); extern void fm_fini(void); -extern void fm_nvprint(nvlist_t *); extern void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector); extern int zfs_zevent_post(nvlist_t *, nvlist_t *, zevent_cb_t *); extern void 
zfs_zevent_drain_all(int *); -extern int zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **); -extern void zfs_zevent_fd_rele(int); +extern zfs_file_t *zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **); +extern void zfs_zevent_fd_rele(zfs_file_t *); extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *); extern int zfs_zevent_wait(zfs_zevent_t *); extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t); extern void zfs_zevent_init(zfs_zevent_t **); extern void zfs_zevent_destroy(zfs_zevent_t *); +extern void zfs_zevent_track_duplicate(void); +extern void zfs_ereport_init(void); +extern void zfs_ereport_fini(void); #else static inline void fm_init(void) { } diff --git a/include/sys/frame.h b/include/sys/frame.h index 2865dbb57d..caae851421 100644 --- a/include/sys/frame.h +++ b/include/sys/frame.h @@ -23,8 +23,13 @@ extern "C" { #endif -#if defined(__KERNEL__) && defined(HAVE_STACK_FRAME_NON_STANDARD) +#if defined(__KERNEL__) && defined(HAVE_KERNEL_OBJTOOL) && \ + defined(HAVE_STACK_FRAME_NON_STANDARD) +#if defined(HAVE_KERNEL_OBJTOOL_HEADER) +#include +#else #include +#endif #else #define STACK_FRAME_NON_STANDARD(func) #endif diff --git a/include/sys/fs/Makefile.am b/include/sys/fs/Makefile.am index 0859b9f670..6a93053c8e 100644 --- a/include/sys/fs/Makefile.am +++ b/include/sys/fs/Makefile.am @@ -1,18 +1,14 @@ COMMON_H = \ - $(top_srcdir)/include/sys/fs/zfs.h - -KERNEL_H = - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + zfs.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys/fs -libzfs_HEADERS = $(COMMON_H) $(USER_H) +libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL +if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/fs -kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +kernel_HEADERS = $(COMMON_H) +endif endif diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 3bcefdbfd7..2af11fc719 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,18 +21,18 @@ /* * Copyright (c) 2005, 2010, Oracle 
and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. + * Portions Copyright 2010 Robert Milkowski + * Copyright (c) 2021, Colm Buckley */ -/* Portions Copyright 2010 Robert Milkowski */ - #ifndef _SYS_FS_ZFS_H -#define _SYS_FS_ZFS_H +#define _SYS_FS_ZFS_H extern __attribute__((visibility("default"))) #include #include @@ -115,7 +115,7 @@ typedef enum { ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, - ZFS_PROP_PRIVATE, /* not exposed to user, temporary */ + ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ @@ -181,9 +181,11 @@ typedef enum { ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, - ZFS_PROP_REMAPTXG, /* not exposed to the user */ + ZFS_PROP_REMAPTXG, /* obsolete - no longer used */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_PROP_IVSET_GUID, /* not exposed to the user */ + ZFS_PROP_REDACTED, + ZFS_PROP_REDACT_SNAPS, ZFS_NUM_PROPS } zfs_prop_t; @@ -203,13 +205,12 @@ typedef enum { ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; -extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; +_SYS_FS_ZFS_H const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected - * by the change. If you make any changes to this list, be sure to update - * the property table in module/zcommon/zpool_prop.c. + * by the change. Properties must be registered in zfs_prop_init(). 
*/ typedef enum { ZPOOL_PROP_INVAL = -1, @@ -245,10 +246,11 @@ typedef enum { ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, + ZPOOL_PROP_COMPATIBILITY, ZPOOL_NUM_PROPS } zpool_prop_t; -/* Small enough to not hog a whole line of printout in zpool(1M). */ +/* Small enough to not hog a whole line of printout in zpool(8). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_VALUE "value" @@ -299,38 +301,41 @@ typedef int (*zprop_func)(int, void *); /* * Dataset property functions shared between libzfs and kernel. */ -const char *zfs_prop_default_string(zfs_prop_t); -uint64_t zfs_prop_default_numeric(zfs_prop_t); -boolean_t zfs_prop_readonly(zfs_prop_t); -boolean_t zfs_prop_visible(zfs_prop_t prop); -boolean_t zfs_prop_inheritable(zfs_prop_t); -boolean_t zfs_prop_setonce(zfs_prop_t); -boolean_t zfs_prop_encryption_key_param(zfs_prop_t); -boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); -const char *zfs_prop_to_name(zfs_prop_t); -zfs_prop_t zfs_name_to_prop(const char *); -boolean_t zfs_prop_user(const char *); -boolean_t zfs_prop_userquota(const char *); -boolean_t zfs_prop_written(const char *); -int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); -int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); -uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); -boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); +_SYS_FS_ZFS_H const char *zfs_prop_default_string(zfs_prop_t); +_SYS_FS_ZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); +_SYS_FS_ZFS_H boolean_t zfs_prop_readonly(zfs_prop_t); +_SYS_FS_ZFS_H boolean_t zfs_prop_visible(zfs_prop_t prop); +_SYS_FS_ZFS_H boolean_t zfs_prop_inheritable(zfs_prop_t); +_SYS_FS_ZFS_H boolean_t zfs_prop_setonce(zfs_prop_t); +_SYS_FS_ZFS_H boolean_t zfs_prop_encryption_key_param(zfs_prop_t); +_SYS_FS_ZFS_H boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); +_SYS_FS_ZFS_H const char *zfs_prop_to_name(zfs_prop_t); +_SYS_FS_ZFS_H zfs_prop_t 
zfs_name_to_prop(const char *); +_SYS_FS_ZFS_H boolean_t zfs_prop_user(const char *); +_SYS_FS_ZFS_H boolean_t zfs_prop_userquota(const char *); +_SYS_FS_ZFS_H boolean_t zfs_prop_written(const char *); +_SYS_FS_ZFS_H int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); +_SYS_FS_ZFS_H int zfs_prop_string_to_index(zfs_prop_t, const char *, + uint64_t *); +_SYS_FS_ZFS_H uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); +_SYS_FS_ZFS_H boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. */ -zpool_prop_t zpool_name_to_prop(const char *); -const char *zpool_prop_to_name(zpool_prop_t); -const char *zpool_prop_default_string(zpool_prop_t); -uint64_t zpool_prop_default_numeric(zpool_prop_t); -boolean_t zpool_prop_readonly(zpool_prop_t); -boolean_t zpool_prop_setonce(zpool_prop_t); -boolean_t zpool_prop_feature(const char *); -boolean_t zpool_prop_unsupported(const char *); -int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); -int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); -uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); +_SYS_FS_ZFS_H zpool_prop_t zpool_name_to_prop(const char *); +_SYS_FS_ZFS_H const char *zpool_prop_to_name(zpool_prop_t); +_SYS_FS_ZFS_H const char *zpool_prop_default_string(zpool_prop_t); +_SYS_FS_ZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); +_SYS_FS_ZFS_H boolean_t zpool_prop_readonly(zpool_prop_t); +_SYS_FS_ZFS_H boolean_t zpool_prop_setonce(zpool_prop_t); +_SYS_FS_ZFS_H boolean_t zpool_prop_feature(const char *); +_SYS_FS_ZFS_H boolean_t zpool_prop_unsupported(const char *); +_SYS_FS_ZFS_H int zpool_prop_index_to_string(zpool_prop_t, uint64_t, + const char **); +_SYS_FS_ZFS_H int zpool_prop_string_to_index(zpool_prop_t, const char *, + uint64_t *); +_SYS_FS_ZFS_H uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. 
@@ -572,6 +577,11 @@ typedef enum zfs_key_location { #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 +/* Persistent L2ARC version */ +#define L2ARC_PERSISTENT_VERSION_1 1ULL +#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 +#define L2ARC_PERSISTENT_VERSION_STRING "1" + /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ @@ -590,8 +600,8 @@ typedef struct zpool_load_policy { /* * The following are configuration names used in the nvlist describing a pool's - * configuration. New on-disk names should be prefixed with ":" - * (e.g. "org.open-zfs:") to avoid conflicting names being developed + * configuration. New on-disk names should be prefixed with "<reversed-DNS>:" + * (e.g. "org.openzfs:") to avoid conflicting names being developed * independently. */ #define ZPOOL_CONFIG_VERSION "version" @@ -611,6 +621,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" @@ -632,6 +643,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" #define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" +#define ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE "vdev_rebuild_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" @@ -640,6 +652,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" #define
ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" +#define ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE "vdev_rebuild_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" @@ -652,6 +665,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" #define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" +#define ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO "vdev_rebuild_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" @@ -660,12 +674,14 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" #define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" #define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" +#define ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO "vdev_ind_rebuild_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" +#define ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO "vdev_agg_rebuild_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" @@ -698,6 +714,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" +#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ @@ -724,6 +741,8 @@ typedef struct zpool_load_policy { #define 
ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ +#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" +#define ZPOOL_CONFIG_COMPATIBILITY "compatibility" /* * The persistent vdev state is stored as separate values rather than a single @@ -749,10 +768,17 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" +/* dRAID configuration */ +#define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" +#define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" +#define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" + #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -762,6 +788,12 @@ typedef struct zpool_load_policy { #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_DRAID_MAXPARITY 3 +#define VDEV_DRAID_MIN_CHILDREN 2 +#define VDEV_DRAID_MAX_CHILDREN UINT8_MAX + /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
*/ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" @@ -769,6 +801,11 @@ typedef struct zpool_load_policy { "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" +#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ + "com.delphix:ms_unflushed_phys_txgs" + +#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ + "org.openzfs:vdev_rebuild" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" @@ -816,7 +853,20 @@ typedef struct zpool_load_policy { * The location of the pool configuration repository, shared between kernel and * userland. */ +#define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache" #define ZPOOL_CACHE "/etc/zfs/zpool.cache" +/* + * Settings for zpool compatibility features files + */ +#define ZPOOL_SYSCONF_COMPAT_D SYSCONFDIR "/zfs/compatibility.d" +#define ZPOOL_DATA_COMPAT_D PKGDATADIR "/compatibility.d" +#define ZPOOL_COMPAT_MAXSIZE 16384 + +/* + * Hard-wired compatibility settings + */ +#define ZPOOL_COMPAT_LEGACY "legacy" +#define ZPOOL_COMPAT_OFF "off" /* * vdev states are ordered from least to most healthy. 
@@ -860,6 +910,7 @@ typedef enum vdev_aux { VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ + VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ } vdev_aux_t; /* @@ -954,7 +1005,7 @@ typedef struct pool_scan_stat { /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ - uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */ + uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ @@ -983,11 +1034,26 @@ typedef enum dsl_scan_state { DSS_NUM_STATES } dsl_scan_state_t; +typedef struct vdev_rebuild_stat { + uint64_t vrs_state; /* vdev_rebuild_state_t */ + uint64_t vrs_start_time; /* time_t */ + uint64_t vrs_end_time; /* time_t */ + uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ + uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ + uint64_t vrs_bytes_issued; /* read bytes issued */ + uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrs_bytes_est; /* total bytes to scan */ + uint64_t vrs_errors; /* scanning errors */ + uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ + uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ + uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ +} vdev_rebuild_stat_t; + /* - * Errata described by http://zfsonlinux.org/msg/ZFS-8000-ER. The ordering - * of this enum must be maintained to ensure the errata identifiers map to - * the correct documentation. New errata may only be appended to the list - * and must contain corresponding documentation at the above link. + * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER. 
+ * The ordering of this enum must be maintained to ensure the errata identifiers + * map to the correct documentation. New errata may only be appended to the + * list and must contain corresponding documentation at the above link. */ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, @@ -1028,7 +1094,7 @@ typedef struct vdev_stat { uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ - uint64_t vs_initialize_state; /* vdev_initialzing_state_t */ + uint64_t vs_initialize_state; /* vdev_initializing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ @@ -1039,8 +1105,18 @@ typedef struct vdev_stat { uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ + uint64_t vs_rebuild_processed; /* bytes rebuilt */ + uint64_t vs_configured_ashift; /* TLV vdev_ashift */ + uint64_t vs_logical_ashift; /* vdev_logical_ashift */ + uint64_t vs_physical_ashift; /* vdev_physical_ashift */ } vdev_stat_t; +/* BEGIN CSTYLED */ +#define VDEV_STAT_VALID(field, uint64_t_field_count) \ + ((uint64_t_field_count * sizeof (uint64_t)) >= \ + (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) +/* END CSTYLED */ + /* * Extended stats * @@ -1141,12 +1217,11 @@ typedef struct ddt_histogram { #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" -#define ZFS_SHARETAB "/etc/dfs/sharetab" #define ZFS_SUPER_MAGIC 0x2fc12fc1 /* general zvol path */ -#define ZVOL_DIR "/dev" +#define ZVOL_DIR "/dev/zvol/" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 @@ -1155,7 +1230,7 @@ typedef struct ddt_histogram { #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" -#define ZVOL_DEFAULT_BLOCKSIZE 8192 +#define ZVOL_DEFAULT_BLOCKSIZE 
16384 typedef enum { VDEV_INITIALIZE_NONE, @@ -1173,6 +1248,13 @@ typedef enum { VDEV_TRIM_COMPLETE, } vdev_trim_state_t; +typedef enum { + VDEV_REBUILD_NONE, + VDEV_REBUILD_ACTIVE, + VDEV_REBUILD_CANCELED, + VDEV_REBUILD_COMPLETE, +} vdev_rebuild_state_t; + /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl @@ -1187,9 +1269,13 @@ typedef enum { */ typedef enum zfs_ioc { /* - * illumos - 81/128 numbers reserved. + * Core features - 81/128 numbers reserved. */ +#ifdef __FreeBSD__ + ZFS_IOC_FIRST = 0, +#else ZFS_IOC_FIRST = ('Z' << 8), +#endif ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ @@ -1272,20 +1358,23 @@ typedef enum zfs_ioc { ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ ZFS_IOC_POOL_TRIM, /* 0x5a50 */ + ZFS_IOC_REDACT, /* 0x5a51 */ + ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ + ZFS_IOC_WAIT, /* 0x5a53 */ + ZFS_IOC_WAIT_FS, /* 0x5a54 */ /* - * Linux - 3/64 numbers reserved. + * Per-platform (Optional) - 8/128 numbers reserved. */ - ZFS_IOC_LINUX = ('Z' << 8) + 0x80, - ZFS_IOC_EVENTS_NEXT, /* 0x5a81 */ - ZFS_IOC_EVENTS_CLEAR, /* 0x5a82 */ - ZFS_IOC_EVENTS_SEEK, /* 0x5a83 */ - - /* - * FreeBSD - 1/64 numbers reserved. - */ - ZFS_IOC_FREEBSD = ('Z' << 8) + 0xC0, - + ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, + ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ + ZFS_IOC_EVENTS_CLEAR, /* 0x82 (Linux) */ + ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ + ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ + ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ + ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ + ZFS_IOC_SET_BOOTENV, /* 0x87 */ + ZFS_IOC_GET_BOOTENV, /* 0x88 */ ZFS_IOC_LAST } zfs_ioc_t; @@ -1303,6 +1392,8 @@ typedef enum zfs_ioc { * not described precisely by generic errno codes. * * These numbers should not change over time. New entries should be appended. 
+ * + * (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py) */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, @@ -1318,6 +1409,14 @@ typedef enum { ZFS_ERR_FROM_IVSET_GUID_MISSING, ZFS_ERR_FROM_IVSET_GUID_MISMATCH, ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, + ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE, + ZFS_ERR_EXPORT_IN_PROGRESS, + ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, + ZFS_ERR_STREAM_TRUNCATED, + ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, + ZFS_ERR_RESILVER_IN_PROGRESS, + ZFS_ERR_REBUILD_IN_PROGRESS, + ZFS_ERR_BADPROP, } zfs_errno_t; /* @@ -1333,6 +1432,23 @@ typedef enum { SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; +typedef enum { + ZPOOL_WAIT_CKPT_DISCARD, + ZPOOL_WAIT_FREE, + ZPOOL_WAIT_INITIALIZE, + ZPOOL_WAIT_REPLACE, + ZPOOL_WAIT_REMOVE, + ZPOOL_WAIT_RESILVER, + ZPOOL_WAIT_SCRUB, + ZPOOL_WAIT_TRIM, + ZPOOL_WAIT_NUM_ACTIVITIES +} zpool_wait_activity_t; + +typedef enum { + ZFS_WAIT_DELETEQ, + ZFS_WAIT_NUM_ACTIVITIES +} zfs_wait_activity_t; + /* * Bookmark name values. */ @@ -1359,9 +1475,11 @@ typedef enum { #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" +#define ZPOOL_HIST_OUTPUT_SIZE "out_size" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" +#define ZPOOL_HIST_ELAPSED_NS "elapsed_ns" /* * Special nvlist name that will not have its args recorded in the pool's @@ -1383,6 +1501,19 @@ typedef enum { #define ZPOOL_TRIM_RATE "trim_rate" #define ZPOOL_TRIM_SECURE "trim_secure" +/* + * The following are names used when invoking ZFS_IOC_WAIT. + */ +#define ZPOOL_WAIT_ACTIVITY "wait_activity" +#define ZPOOL_WAIT_TAG "wait_tag" +#define ZPOOL_WAIT_WAITED "wait_waited" + +/* + * The following are names used when invoking ZFS_IOC_WAIT_FS.
+ */ +#define ZFS_WAIT_ACTIVITY "wait_activity" +#define ZFS_WAIT_WAITED "wait_waited" + /* * Flags for ZFS_IOC_VDEV_SET_STATE */ @@ -1429,7 +1560,12 @@ typedef enum { * given payloads: * * ESC_ZFS_RESILVER_START - * ESC_ZFS_RESILVER_END + * ESC_ZFS_RESILVER_FINISH + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING + * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * @@ -1483,6 +1619,48 @@ typedef enum { #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" +#define ZFS_EV_RESILVER_TYPE "resilver_type" + + +/* + * We currently support block sizes from 512 bytes to 16MB. + * The benefits of larger blocks, and thus larger IO, need to be weighed + * against the cost of COWing a giant block to modify one byte, and the + * large latency of reading or writing a large block. + * + * Note that although blocks up to 16MB are supported, the recordsize + * property can not be set larger than zfs_max_recordsize (default 1MB). + * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * + * Note that although the LSIZE field of the blkptr_t can store sizes up + * to 32MB, the dnode's dn_datablkszsec can only store sizes up to + * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. 
+ */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_OLD_MAXBLOCKSHIFT 17 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + + +/* supported encryption algorithms */ +enum zio_encrypt { + ZIO_CRYPT_INHERIT = 0, + ZIO_CRYPT_ON, + ZIO_CRYPT_OFF, + ZIO_CRYPT_AES_128_CCM, + ZIO_CRYPT_AES_192_CCM, + ZIO_CRYPT_AES_256_CCM, + ZIO_CRYPT_AES_128_GCM, + ZIO_CRYPT_AES_192_GCM, + ZIO_CRYPT_AES_256_GCM, + ZIO_CRYPT_FUNCTIONS +}; + +#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM +#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF + #ifdef __cplusplus } diff --git a/include/sys/lua/Makefile.am b/include/sys/lua/Makefile.am index 5f224dcb16..8b4dafaa8c 100644 --- a/include/sys/lua/Makefile.am +++ b/include/sys/lua/Makefile.am @@ -1,21 +1,17 @@ COMMON_H = \ - $(top_srcdir)/include/sys/lua/lua.h \ - $(top_srcdir)/include/sys/lua/luaconf.h \ - $(top_srcdir)/include/sys/lua/lualib.h \ - $(top_srcdir)/include/sys/lua/lauxlib.h - -KERNEL_H = - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + lua.h \ + luaconf.h \ + lualib.h \ + lauxlib.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys/lua -libzfs_HEADERS = $(COMMON_H) $(USER_H) +libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL +if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/lua -kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +kernel_HEADERS = $(COMMON_H) +endif endif diff --git a/include/sys/lua/luaconf.h b/include/sys/lua/luaconf.h index 302c57a8c4..83202d71c2 100644 --- a/include/sys/lua/luaconf.h +++ b/include/sys/lua/luaconf.h @@ -15,6 +15,7 @@ extern ssize_t lcompat_sprintf(char *, size_t size, const char *, ...); extern int64_t lcompat_strtoll(const char *, char **); extern int64_t lcompat_pow(int64_t, int64_t); +extern int lcompat_hashnum(int64_t); /* ** ================================================================== @@ -367,11 +368,7 @@ extern int64_t 
lcompat_pow(int64_t, int64_t); @@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system. ** CHANGE it if it uses too much C-stack space. */ -#ifdef __linux__ #define LUAL_BUFFERSIZE 512 -#else -#define LUAL_BUFFERSIZE 1024 -#endif /* @@ -495,7 +492,7 @@ extern int64_t lcompat_pow(int64_t, int64_t); ** a single double value, using NaN values to represent non-number ** values. The trick only works on 32-bit machines (ints and pointers ** are 32-bit values) with numbers represented as IEEE 754-2008 doubles -** with conventional endianess (12345678 or 87654321), in CPUs that do +** with conventional endianness (12345678 or 87654321), in CPUs that do ** not produce signaling NaN values (all NaNs are quiet). */ diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 2790d06c71..ecff65f13d 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -49,15 +49,23 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); +void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *); +void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *); +uint64_t metaslab_unflushed_txg(metaslab_t *); +uint64_t metaslab_estimated_condensed_size(metaslab_t *); +int metaslab_sort_by_flushed(const void *, const void *); +uint64_t metaslab_unflushed_changes_memused(metaslab_t *); + int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); +boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); uint64_t metaslab_allocated_space(metaslab_t *); void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); -uint64_t metaslab_block_maxsize(metaslab_t *); +uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags @@ -70,6 +78,7 @@ uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 #define 
METASLAB_FASTWRITE 0x40 +#define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, @@ -87,8 +96,8 @@ void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); -void metaslab_alloc_trace_init(void); -void metaslab_alloc_trace_fini(void); +void metaslab_stat_init(void); +void metaslab_stat_fini(void); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); @@ -101,12 +110,15 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *); boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); - +void metaslab_class_evict_old(metaslab_class_t *, uint64_t); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); +void metaslab_space_update(vdev_t *, metaslab_class_t *, + int64_t, int64_t, int64_t); + metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); @@ -121,7 +133,13 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); -void metaslab_enable(metaslab_t *, boolean_t); +void metaslab_enable(metaslab_t *, boolean_t, boolean_t); +void metaslab_set_selected_txg(metaslab_t *, uint64_t); + +extern int metaslab_debug_load; + +range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, + metaslab_t *msp, uint64_t *start, uint64_t *shift); #ifdef 
__cplusplus } diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index ca1104c148..adf4c03a20 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -135,6 +136,29 @@ typedef enum trace_alloc_type { #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) +/* + * Per-allocator data structure. + */ +typedef struct metaslab_class_allocator { + metaslab_group_t *mca_rotor; + uint64_t mca_aliquot; + + /* + * The allocation throttle works on a reservation system. Whenever + * an asynchronous zio wants to perform an allocation it must + * first reserve the number of blocks that it wants to allocate. + * If there aren't sufficient slots available for the pending zio + * then that I/O is throttled until more slots free up. The current + * number of reserved allocations is maintained by the mca_alloc_slots + * refcount. The mca_alloc_max_slots value determines the maximum + * number of allocations that the system allows. Gang blocks are + * allowed to reserve slots even if we've reached the maximum + * number of allocations allowed. + */ + uint64_t mca_alloc_max_slots; + zfs_refcount_t mca_alloc_slots; +} ____cacheline_aligned metaslab_class_allocator_t; + /* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines @@ -144,7 +168,7 @@ typedef enum trace_alloc_type { * When a block allocation is requested from the SPA it is associated with a * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging * to the class can be used to satisfy that request. 
Allocations are done - * by traversing the metaslab groups that are linked off of the mc_rotor field. + * by traversing the metaslab groups that are linked off of the mca_rotor field. * This rotor points to the next metaslab group where allocations will be * attempted. Allocating a block is a 3 step process -- select the metaslab * group, select the metaslab, and then allocate the block. The metaslab @@ -155,9 +179,7 @@ typedef enum trace_alloc_type { struct metaslab_class { kmutex_t mc_lock; spa_t *mc_spa; - metaslab_group_t *mc_rotor; metaslab_ops_t *mc_ops; - uint64_t mc_aliquot; /* * Track the number of metaslab groups that have been initialized @@ -172,21 +194,6 @@ struct metaslab_class { */ boolean_t mc_alloc_throttle_enabled; - /* - * The allocation throttle works on a reservation system. Whenever - * an asynchronous zio wants to perform an allocation it must - * first reserve the number of blocks that it wants to allocate. - * If there aren't sufficient slots available for the pending zio - * then that I/O is throttled until more slots free up. The current - * number of reserved allocations is maintained by the mc_alloc_slots - * refcount. The mc_alloc_max_slots value determines the maximum - * number of allocations that the system allows. Gang blocks are - * allowed to reserve slots even if we've reached the maximum - * number of allocations allowed. - */ - uint64_t *mc_alloc_max_slots; - zfs_refcount_t *mc_alloc_slots; - uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ @@ -194,8 +201,26 @@ struct metaslab_class { uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + /* + * List of all loaded metaslabs in the class, sorted in order of most + * recent use. + */ + multilist_t mc_metaslab_txg_list; + + metaslab_class_allocator_t mc_allocator[]; }; +/* + * Per-allocator data structure. 
+ */ +typedef struct metaslab_group_allocator { + uint64_t mga_cur_max_alloc_queue_depth; + zfs_refcount_t mga_alloc_queue_depth; + metaslab_t *mga_primary; + metaslab_t *mga_secondary; +} metaslab_group_allocator_t; + /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) * of a top-level vdev. They are linked together to form a circular linked @@ -207,8 +232,6 @@ struct metaslab_class { */ struct metaslab_group { kmutex_t mg_lock; - metaslab_t **mg_primaries; - metaslab_t **mg_secondaries; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ @@ -243,7 +266,7 @@ struct metaslab_group { * * Each allocator in each metaslab group has a current queue depth * (mg_alloc_queue_depth[allocator]) and a current max queue depth - * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group + * (mga_cur_max_alloc_queue_depth[allocator]), and each metaslab group * has an absolute max queue depth (mg_max_alloc_queue_depth). We * add IOs to an allocator until the mg_alloc_queue_depth for that * allocator hits the cur_max. Every time an IO completes for a given @@ -256,9 +279,7 @@ struct metaslab_group { * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; - uint64_t *mg_cur_max_alloc_queue_depth; - zfs_refcount_t *mg_alloc_queue_depth; - int mg_allocators; + /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out @@ -276,6 +297,9 @@ struct metaslab_group { boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; + + int mg_allocators; + metaslab_group_allocator_t mg_allocator[]; }; /* @@ -357,7 +381,7 @@ struct metaslab { * write to metaslab data on-disk (i.e flushing entries to * the metaslab's space map). It helps coordinate readers of * the metaslab's space map [see spa_vdev_remove_thread()] - * with writers [see metaslab_sync()]. 
+ * with writers [see metaslab_sync() or metaslab_flush()]. * * Note that metaslab_load(), even though a reader, uses * a completely different mechanism to deal with the reading @@ -378,6 +402,7 @@ struct metaslab { range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; + uint64_t ms_allocating_total; /* * The following range trees are accessed only from syncing context. @@ -401,7 +426,6 @@ struct metaslab { boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; - uint64_t ms_condense_checked_txg; /* * The number of consumers which have disabled the metaslab. @@ -414,6 +438,8 @@ struct metaslab { */ boolean_t ms_loaded; boolean_t ms_loading; + kcondvar_t ms_flush_cv; + boolean_t ms_flushing; /* * The following histograms count entries that are in the @@ -474,6 +500,13 @@ struct metaslab { * stay cached. */ uint64_t ms_selected_txg; + /* + * ms_load/unload_time can be used for performance monitoring + * (e.g. by dtrace or mdb). + */ + hrtime_t ms_load_time; /* time last loaded */ + hrtime_t ms_unload_time; /* time last unloaded */ + hrtime_t ms_selected_time; /* time last allocated from */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ @@ -493,12 +526,33 @@ struct metaslab { * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. 
*/ - avl_tree_t ms_allocatable_by_size; + zfs_btree_t ms_allocatable_by_size; + zfs_btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ + /* + * Node in metaslab class's selected txg list + */ + multilist_node_t ms_class_txg_node; + + /* + * Allocs and frees that are committed to the vdev log spacemap but + * not yet to this metaslab's spacemap. + */ + range_tree_t *ms_unflushed_allocs; + range_tree_t *ms_unflushed_frees; + + /* + * We have flushed entries up to but not including this TXG. In + * other words, all changes from this TXG and onward should not + * be in this metaslab's space map and must be read from the + * log space maps. + */ + uint64_t ms_unflushed_txg; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; @@ -506,6 +560,11 @@ struct metaslab { boolean_t ms_new; }; +typedef struct metaslab_unflushed_phys { + /* on-disk counterpart of ms_unflushed_txg */ + uint64_t msp_unflushed_txg; +} metaslab_unflushed_phys_t; + #ifdef __cplusplus } #endif diff --git a/include/sys/mmp.h b/include/sys/mmp.h index 527e3323b4..ce9c4496a0 100644 --- a/include/sys/mmp.h +++ b/include/sys/mmp.h @@ -63,6 +63,7 @@ extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub); extern void mmp_signal_all_threads(void); /* Global tuning */ +extern int param_set_multihost_interval(ZFS_MODULE_PARAM_ARGS); extern ulong_t zfs_multihost_interval; extern uint_t zfs_multihost_fail_intervals; extern uint_t zfs_multihost_import_intervals; diff --git a/include/sys/mntent.h b/include/sys/mntent.h index fac751b462..8d578f67b8 100644 --- a/include/sys/mntent.h +++ b/include/sys/mntent.h @@ -29,6 +29,8 @@ #ifndef _SYS_MNTENT_H #define _SYS_MNTENT_H +#define MNTMAXSTR 128 + #define MNTTYPE_ZFS 
"zfs" /* ZFS file system */ #define MOUNT_SUCCESS 0x00 /* Success */ @@ -71,8 +73,15 @@ #define MNTOPT_STRICTATIME "strictatime" /* strict access time updates */ #define MNTOPT_NOSTRICTATIME "nostrictatime" /* No strict access time updates */ #define MNTOPT_LAZYTIME "lazytime" /* Defer access time writing */ +#ifdef __linux__ #define MNTOPT_SETUID "suid" /* Both setuid and devices allowed */ #define MNTOPT_NOSETUID "nosuid" /* Neither setuid nor devices allowed */ +#elif defined(__FreeBSD__) +#define MNTOPT_SETUID "setuid" /* Set uid allowed */ +#define MNTOPT_NOSETUID "nosetuid" /* Set uid not allowed */ +#else +#error "unknown OS" +#endif #define MNTOPT_OWNER "owner" /* allow owner mount */ #define MNTOPT_NOOWNER "noowner" /* do not allow owner mount */ #define MNTOPT_REMOUNT "remount" /* change mount options */ diff --git a/include/spl/sys/mode.h b/include/sys/mod.h similarity index 77% rename from include/spl/sys/mode.h rename to include/sys/mod.h index 02802d0d4c..a5a73ed0ee 100644 --- a/include/spl/sys/mode.h +++ b/include/sys/mod.h @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -21,12 +20,21 @@ * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . 
*/ +#ifndef _SYS_MOD_H +#define _SYS_MOD_H -#ifndef _SPL_MODE_H -#define _SPL_MODE_H +#ifdef _KERNEL +#include +#else +/* + * Exported symbols + */ +#define EXPORT_SYMBOL(x) -#define IFTOVT(mode) vn_mode_to_vtype(mode) -#define VTTOIF(vtype) vn_vtype_to_mode(vtype) -#define MAKEIMODE(T, M) (VTTOIF(T) | ((M) & ~S_IFMT)) +#define ZFS_MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) +#endif -#endif /* SPL_MODE_H */ +#endif /* SYS_MOD_H */ diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 4395406859..26f37c37ab 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -71,8 +71,9 @@ struct multilist { multilist_sublist_index_func_t *ml_index_func; }; +void multilist_create(multilist_t *, size_t, size_t, + multilist_sublist_index_func_t *); void multilist_destroy(multilist_t *); -multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *); void multilist_insert(multilist_t *, void *); void multilist_remove(multilist_t *, void *); @@ -89,6 +90,8 @@ void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); +int multilist_sublist_is_empty(multilist_sublist_t *); +int multilist_sublist_is_empty_idx(multilist_t *, unsigned int); void *multilist_sublist_head(multilist_sublist_t *); void *multilist_sublist_tail(multilist_sublist_t *); diff --git a/include/sys/note.h b/include/sys/note.h deleted file mode 100644 index 33b5476686..0000000000 --- a/include/sys/note.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 1994 by Sun Microsystems, Inc. - */ - -/* - * sys/note.h: interface for annotating source with info for tools - * - * This is the underlying interface; NOTE (/usr/include/note.h) is the - * preferred interface, but all exported header files should include this - * file directly and use _NOTE so as not to take "NOTE" from the user's - * namespace. For consistency, *all* kernel source should use _NOTE. - * - * By default, annotations expand to nothing. This file implements - * that. Tools using annotations will interpose a different version - * of this file that will expand annotations as needed. 
- */ - -#ifndef _SYS_NOTE_H -#define _SYS_NOTE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef _NOTE -#define _NOTE(s) -#endif - -#define NOTE(s) _NOTE(s) - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_NOTE_H */ diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index e8567933d2..76d383a3c6 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -24,7 +24,7 @@ */ #ifndef _SYS_NVPAIR_H -#define _SYS_NVPAIR_H +#define _SYS_NVPAIR_H extern __attribute__((visibility("default"))) #include #include @@ -62,7 +62,7 @@ typedef enum { DATA_TYPE_UINT8, DATA_TYPE_BOOLEAN_ARRAY, DATA_TYPE_INT8_ARRAY, -#if !defined(_KERNEL) +#if !defined(_KERNEL) && !defined(_STANDALONE) DATA_TYPE_UINT8_ARRAY, DATA_TYPE_DOUBLE #else @@ -135,221 +135,270 @@ struct nv_alloc_ops { void (*nv_ao_reset)(nv_alloc_t *); }; -extern const nv_alloc_ops_t *nv_fixed_ops; -extern nv_alloc_t *nv_alloc_nosleep; +_SYS_NVPAIR_H const nv_alloc_ops_t *nv_fixed_ops; +_SYS_NVPAIR_H nv_alloc_t *nv_alloc_nosleep; #if defined(_KERNEL) -extern nv_alloc_t *nv_alloc_sleep; -extern nv_alloc_t *nv_alloc_pushpage; +_SYS_NVPAIR_H nv_alloc_t *nv_alloc_sleep; +_SYS_NVPAIR_H nv_alloc_t *nv_alloc_pushpage; #endif -int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...); -void nv_alloc_reset(nv_alloc_t *); -void nv_alloc_fini(nv_alloc_t *); +_SYS_NVPAIR_H int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, + /* args */ ...); +_SYS_NVPAIR_H void nv_alloc_reset(nv_alloc_t *); +_SYS_NVPAIR_H void nv_alloc_fini(nv_alloc_t *); /* list management */ -int nvlist_alloc(nvlist_t **, uint_t, int); -void nvlist_free(nvlist_t *); -int nvlist_size(nvlist_t *, size_t *, int); -int nvlist_pack(nvlist_t *, char **, size_t *, int, int); -int nvlist_unpack(char *, size_t, nvlist_t **, int); -int nvlist_dup(nvlist_t *, nvlist_t **, int); -int nvlist_merge(nvlist_t *, nvlist_t *, int); +_SYS_NVPAIR_H int nvlist_alloc(nvlist_t **, uint_t, int); +_SYS_NVPAIR_H void nvlist_free(nvlist_t *); +_SYS_NVPAIR_H 
int nvlist_size(nvlist_t *, size_t *, int); +_SYS_NVPAIR_H int nvlist_pack(nvlist_t *, char **, size_t *, int, int); +_SYS_NVPAIR_H int nvlist_unpack(char *, size_t, nvlist_t **, int); +_SYS_NVPAIR_H int nvlist_dup(nvlist_t *, nvlist_t **, int); +_SYS_NVPAIR_H int nvlist_merge(nvlist_t *, nvlist_t *, int); -uint_t nvlist_nvflag(nvlist_t *); +_SYS_NVPAIR_H uint_t nvlist_nvflag(nvlist_t *); -int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *); -int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *); -int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *); -int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *); -nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *); +_SYS_NVPAIR_H int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *); +_SYS_NVPAIR_H int nvlist_xpack(nvlist_t *, char **, size_t *, int, + nv_alloc_t *); +_SYS_NVPAIR_H int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *); +_SYS_NVPAIR_H int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *); +_SYS_NVPAIR_H nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *); -int nvlist_add_nvpair(nvlist_t *, nvpair_t *); -int nvlist_add_boolean(nvlist_t *, const char *); -int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); -int nvlist_add_byte(nvlist_t *, const char *, uchar_t); -int nvlist_add_int8(nvlist_t *, const char *, int8_t); -int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); -int nvlist_add_int16(nvlist_t *, const char *, int16_t); -int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); -int nvlist_add_int32(nvlist_t *, const char *, int32_t); -int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); -int nvlist_add_int64(nvlist_t *, const char *, int64_t); -int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); -int nvlist_add_string(nvlist_t *, const char *, const char *); -int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); -int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); -int nvlist_add_byte_array(nvlist_t *, const char 
*, uchar_t *, uint_t); -int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); -int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); -int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); -int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); -int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); -int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); -int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); -int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); -int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t); -int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); -int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t); -#if !defined(_KERNEL) -int nvlist_add_double(nvlist_t *, const char *, double); +_SYS_NVPAIR_H int nvlist_add_nvpair(nvlist_t *, nvpair_t *); +_SYS_NVPAIR_H int nvlist_add_boolean(nvlist_t *, const char *); +_SYS_NVPAIR_H int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +_SYS_NVPAIR_H int nvlist_add_byte(nvlist_t *, const char *, uchar_t); +_SYS_NVPAIR_H int nvlist_add_int8(nvlist_t *, const char *, int8_t); +_SYS_NVPAIR_H int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); +_SYS_NVPAIR_H int nvlist_add_int16(nvlist_t *, const char *, int16_t); +_SYS_NVPAIR_H int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); +_SYS_NVPAIR_H int nvlist_add_int32(nvlist_t *, const char *, int32_t); +_SYS_NVPAIR_H int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); +_SYS_NVPAIR_H int nvlist_add_int64(nvlist_t *, const char *, int64_t); +_SYS_NVPAIR_H int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); +_SYS_NVPAIR_H int nvlist_add_string(nvlist_t *, const char *, const char *); +_SYS_NVPAIR_H int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +_SYS_NVPAIR_H int nvlist_add_boolean_array(nvlist_t *, const char *, 
+ boolean_t *, uint_t); +_SYS_NVPAIR_H int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, + uint_t); +_SYS_NVPAIR_H int nvlist_add_string_array(nvlist_t *, const char *, + char * const *, uint_t); +_SYS_NVPAIR_H int nvlist_add_nvlist_array(nvlist_t *, const char *, + nvlist_t **, uint_t); +_SYS_NVPAIR_H int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t); +#if !defined(_KERNEL) && !defined(_STANDALONE) +_SYS_NVPAIR_H int nvlist_add_double(nvlist_t *, const char *, double); #endif -int nvlist_remove(nvlist_t *, const char *, data_type_t); -int nvlist_remove_all(nvlist_t *, const char *); -int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); +_SYS_NVPAIR_H int nvlist_remove(nvlist_t *, const char *, data_type_t); +_SYS_NVPAIR_H int nvlist_remove_all(nvlist_t *, const char *); +_SYS_NVPAIR_H int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); -int nvlist_lookup_boolean(nvlist_t *, const char *); -int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); -int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *); -int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *); -int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *); -int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *); -int nvlist_lookup_uint16(nvlist_t 
*, const char *, uint16_t *); -int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *); -int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *); -int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *); -int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *); -int nvlist_lookup_string(nvlist_t *, const char *, char **); -int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); -int nvlist_lookup_boolean_array(nvlist_t *, const char *, +_SYS_NVPAIR_H int nvlist_lookup_boolean(nvlist_t *, const char *); +_SYS_NVPAIR_H int nvlist_lookup_boolean_value(nvlist_t *, const char *, + boolean_t *); +_SYS_NVPAIR_H int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *); +_SYS_NVPAIR_H int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *); +_SYS_NVPAIR_H int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *); +_SYS_NVPAIR_H int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *); +_SYS_NVPAIR_H int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *); +_SYS_NVPAIR_H int nvlist_lookup_string(nvlist_t *, const char *, char **); +_SYS_NVPAIR_H int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); +_SYS_NVPAIR_H int nvlist_lookup_boolean_array(nvlist_t *, const char *, boolean_t **, uint_t *); -int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *); -int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *); -int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *); -int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *); -int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *); -int 
nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *); -int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *); -int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *); -int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *); -int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *); -int nvlist_lookup_nvlist_array(nvlist_t *, const char *, +_SYS_NVPAIR_H int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, + uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, + uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint8_array(nvlist_t *, const char *, + uint8_t **, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_int16_array(nvlist_t *, const char *, + int16_t **, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint16_array(nvlist_t *, const char *, + uint16_t **, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_int32_array(nvlist_t *, const char *, + int32_t **, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint32_array(nvlist_t *, const char *, + uint32_t **, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_int64_array(nvlist_t *, const char *, + int64_t **, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_uint64_array(nvlist_t *, const char *, + uint64_t **, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_string_array(nvlist_t *, const char *, + char ***, uint_t *); +_SYS_NVPAIR_H int nvlist_lookup_nvlist_array(nvlist_t *, const char *, nvlist_t ***, uint_t *); -int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); -int nvlist_lookup_pairs(nvlist_t *, int, ...); -#if !defined(_KERNEL) -int nvlist_lookup_double(nvlist_t *, const char *, double *); +_SYS_NVPAIR_H int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); +_SYS_NVPAIR_H int nvlist_lookup_pairs(nvlist_t *, int, ...); +#if !defined(_KERNEL) && !defined(_STANDALONE) +_SYS_NVPAIR_H int nvlist_lookup_double(nvlist_t *, const char *, double *); #endif -int 
nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); -int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, - int *, char **); -boolean_t nvlist_exists(nvlist_t *, const char *); -boolean_t nvlist_empty(nvlist_t *); +_SYS_NVPAIR_H int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); +_SYS_NVPAIR_H int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, + nvpair_t **, int *, char **); +_SYS_NVPAIR_H boolean_t nvlist_exists(nvlist_t *, const char *); +_SYS_NVPAIR_H boolean_t nvlist_empty(nvlist_t *); /* processing nvpair */ -nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); -nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); -char *nvpair_name(nvpair_t *); -data_type_t nvpair_type(nvpair_t *); -int nvpair_type_is_array(nvpair_t *); -int nvpair_value_boolean_value(nvpair_t *, boolean_t *); -int nvpair_value_byte(nvpair_t *, uchar_t *); -int nvpair_value_int8(nvpair_t *, int8_t *); -int nvpair_value_uint8(nvpair_t *, uint8_t *); -int nvpair_value_int16(nvpair_t *, int16_t *); -int nvpair_value_uint16(nvpair_t *, uint16_t *); -int nvpair_value_int32(nvpair_t *, int32_t *); -int nvpair_value_uint32(nvpair_t *, uint32_t *); -int nvpair_value_int64(nvpair_t *, int64_t *); -int nvpair_value_uint64(nvpair_t *, uint64_t *); -int nvpair_value_string(nvpair_t *, char **); -int nvpair_value_nvlist(nvpair_t *, nvlist_t **); -int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); -int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); -int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); -int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); -int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); -int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); -int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); -int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); -int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); -int 
nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); -int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); -int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); -int nvpair_value_hrtime(nvpair_t *, hrtime_t *); -#if !defined(_KERNEL) -int nvpair_value_double(nvpair_t *, double *); +_SYS_NVPAIR_H nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); +_SYS_NVPAIR_H nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); +_SYS_NVPAIR_H char *nvpair_name(nvpair_t *); +_SYS_NVPAIR_H data_type_t nvpair_type(nvpair_t *); +_SYS_NVPAIR_H int nvpair_type_is_array(nvpair_t *); +_SYS_NVPAIR_H int nvpair_value_boolean_value(nvpair_t *, boolean_t *); +_SYS_NVPAIR_H int nvpair_value_byte(nvpair_t *, uchar_t *); +_SYS_NVPAIR_H int nvpair_value_int8(nvpair_t *, int8_t *); +_SYS_NVPAIR_H int nvpair_value_uint8(nvpair_t *, uint8_t *); +_SYS_NVPAIR_H int nvpair_value_int16(nvpair_t *, int16_t *); +_SYS_NVPAIR_H int nvpair_value_uint16(nvpair_t *, uint16_t *); +_SYS_NVPAIR_H int nvpair_value_int32(nvpair_t *, int32_t *); +_SYS_NVPAIR_H int nvpair_value_uint32(nvpair_t *, uint32_t *); +_SYS_NVPAIR_H int nvpair_value_int64(nvpair_t *, int64_t *); +_SYS_NVPAIR_H int nvpair_value_uint64(nvpair_t *, uint64_t *); +_SYS_NVPAIR_H int nvpair_value_string(nvpair_t *, char **); +_SYS_NVPAIR_H int nvpair_value_nvlist(nvpair_t *, nvlist_t **); +_SYS_NVPAIR_H int nvpair_value_boolean_array(nvpair_t *, boolean_t **, + uint_t *); +_SYS_NVPAIR_H int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t 
*); +_SYS_NVPAIR_H int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); +_SYS_NVPAIR_H int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); +_SYS_NVPAIR_H int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); +_SYS_NVPAIR_H int nvpair_value_hrtime(nvpair_t *, hrtime_t *); +#if !defined(_KERNEL) && !defined(_STANDALONE) +_SYS_NVPAIR_H int nvpair_value_double(nvpair_t *, double *); #endif -nvlist_t *fnvlist_alloc(void); -void fnvlist_free(nvlist_t *); -size_t fnvlist_size(nvlist_t *); -char *fnvlist_pack(nvlist_t *, size_t *); -void fnvlist_pack_free(char *, size_t); -nvlist_t *fnvlist_unpack(char *, size_t); -nvlist_t *fnvlist_dup(nvlist_t *); -void fnvlist_merge(nvlist_t *, nvlist_t *); -size_t fnvlist_num_pairs(nvlist_t *); +_SYS_NVPAIR_H nvlist_t *fnvlist_alloc(void); +_SYS_NVPAIR_H void fnvlist_free(nvlist_t *); +_SYS_NVPAIR_H size_t fnvlist_size(nvlist_t *); +_SYS_NVPAIR_H char *fnvlist_pack(nvlist_t *, size_t *); +_SYS_NVPAIR_H void fnvlist_pack_free(char *, size_t); +_SYS_NVPAIR_H nvlist_t *fnvlist_unpack(char *, size_t); +_SYS_NVPAIR_H nvlist_t *fnvlist_dup(nvlist_t *); +_SYS_NVPAIR_H void fnvlist_merge(nvlist_t *, nvlist_t *); +_SYS_NVPAIR_H size_t fnvlist_num_pairs(nvlist_t *); -void fnvlist_add_boolean(nvlist_t *, const char *); -void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); -void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); -void fnvlist_add_int8(nvlist_t *, const char *, int8_t); -void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); -void fnvlist_add_int16(nvlist_t *, const char *, int16_t); -void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); -void fnvlist_add_int32(nvlist_t *, const char *, int32_t); -void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); -void fnvlist_add_int64(nvlist_t *, const char *, int64_t); -void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); -void 
fnvlist_add_string(nvlist_t *, const char *, const char *); -void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); -void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); -void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); -void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); -void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); -void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); -void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); -void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); -void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); -void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); -void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); -void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); -void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); -void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); +_SYS_NVPAIR_H void fnvlist_add_boolean(nvlist_t *, const char *); +_SYS_NVPAIR_H void fnvlist_add_boolean_value(nvlist_t *, const char *, + boolean_t); +_SYS_NVPAIR_H void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); +_SYS_NVPAIR_H void fnvlist_add_int8(nvlist_t *, const char *, int8_t); +_SYS_NVPAIR_H void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); +_SYS_NVPAIR_H void fnvlist_add_int16(nvlist_t *, const char *, int16_t); +_SYS_NVPAIR_H void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); +_SYS_NVPAIR_H void fnvlist_add_int32(nvlist_t *, const char *, int32_t); +_SYS_NVPAIR_H void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); +_SYS_NVPAIR_H void fnvlist_add_int64(nvlist_t *, const char *, int64_t); +_SYS_NVPAIR_H void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); +_SYS_NVPAIR_H void fnvlist_add_string(nvlist_t *, const char *, const char 
*); +_SYS_NVPAIR_H void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +_SYS_NVPAIR_H void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); +_SYS_NVPAIR_H void fnvlist_add_boolean_array(nvlist_t *, const char *, + boolean_t *, uint_t); +_SYS_NVPAIR_H void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, + uint_t); +_SYS_NVPAIR_H void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, + uint_t); +_SYS_NVPAIR_H void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, + uint_t); +_SYS_NVPAIR_H void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, + uint_t); +_SYS_NVPAIR_H void fnvlist_add_uint16_array(nvlist_t *, const char *, + uint16_t *, uint_t); +_SYS_NVPAIR_H void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, + uint_t); +_SYS_NVPAIR_H void fnvlist_add_uint32_array(nvlist_t *, const char *, + uint32_t *, uint_t); +_SYS_NVPAIR_H void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, + uint_t); +_SYS_NVPAIR_H void fnvlist_add_uint64_array(nvlist_t *, const char *, + uint64_t *, uint_t); +_SYS_NVPAIR_H void fnvlist_add_string_array(nvlist_t *, const char *, + char * const *, uint_t); +_SYS_NVPAIR_H void fnvlist_add_nvlist_array(nvlist_t *, const char *, + nvlist_t **, uint_t); -void fnvlist_remove(nvlist_t *, const char *); -void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); +_SYS_NVPAIR_H void fnvlist_remove(nvlist_t *, const char *); +_SYS_NVPAIR_H void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); -nvpair_t *fnvlist_lookup_nvpair(nvlist_t *, const char *); -boolean_t fnvlist_lookup_boolean(nvlist_t *, const char *); -boolean_t fnvlist_lookup_boolean_value(nvlist_t *, const char *); -uchar_t fnvlist_lookup_byte(nvlist_t *, const char *); -int8_t fnvlist_lookup_int8(nvlist_t *, const char *); -int16_t fnvlist_lookup_int16(nvlist_t *, const char *); -int32_t fnvlist_lookup_int32(nvlist_t *, const char *); -int64_t fnvlist_lookup_int64(nvlist_t *, const char *); -uint8_t 
fnvlist_lookup_uint8(nvlist_t *, const char *); -uint16_t fnvlist_lookup_uint16(nvlist_t *, const char *); -uint32_t fnvlist_lookup_uint32(nvlist_t *, const char *); -uint64_t fnvlist_lookup_uint64(nvlist_t *, const char *); -char *fnvlist_lookup_string(nvlist_t *, const char *); -nvlist_t *fnvlist_lookup_nvlist(nvlist_t *, const char *); -boolean_t *fnvlist_lookup_boolean_array(nvlist_t *, const char *, uint_t *); -uchar_t *fnvlist_lookup_byte_array(nvlist_t *, const char *, uint_t *); -int8_t *fnvlist_lookup_int8_array(nvlist_t *, const char *, uint_t *); -uint8_t *fnvlist_lookup_uint8_array(nvlist_t *, const char *, uint_t *); -int16_t *fnvlist_lookup_int16_array(nvlist_t *, const char *, uint_t *); -uint16_t *fnvlist_lookup_uint16_array(nvlist_t *, const char *, uint_t *); -int32_t *fnvlist_lookup_int32_array(nvlist_t *, const char *, uint_t *); -uint32_t *fnvlist_lookup_uint32_array(nvlist_t *, const char *, uint_t *); -int64_t *fnvlist_lookup_int64_array(nvlist_t *, const char *, uint_t *); -uint64_t *fnvlist_lookup_uint64_array(nvlist_t *, const char *, uint_t *); +_SYS_NVPAIR_H nvpair_t *fnvlist_lookup_nvpair(nvlist_t *, const char *); +_SYS_NVPAIR_H boolean_t fnvlist_lookup_boolean(nvlist_t *, const char *); +_SYS_NVPAIR_H boolean_t fnvlist_lookup_boolean_value(nvlist_t *, const char *); +_SYS_NVPAIR_H uchar_t fnvlist_lookup_byte(nvlist_t *, const char *); +_SYS_NVPAIR_H int8_t fnvlist_lookup_int8(nvlist_t *, const char *); +_SYS_NVPAIR_H int16_t fnvlist_lookup_int16(nvlist_t *, const char *); +_SYS_NVPAIR_H int32_t fnvlist_lookup_int32(nvlist_t *, const char *); +_SYS_NVPAIR_H int64_t fnvlist_lookup_int64(nvlist_t *, const char *); +_SYS_NVPAIR_H uint8_t fnvlist_lookup_uint8(nvlist_t *, const char *); +_SYS_NVPAIR_H uint16_t fnvlist_lookup_uint16(nvlist_t *, const char *); +_SYS_NVPAIR_H uint32_t fnvlist_lookup_uint32(nvlist_t *, const char *); +_SYS_NVPAIR_H uint64_t fnvlist_lookup_uint64(nvlist_t *, const char *); +_SYS_NVPAIR_H char 
*fnvlist_lookup_string(nvlist_t *, const char *); +_SYS_NVPAIR_H nvlist_t *fnvlist_lookup_nvlist(nvlist_t *, const char *); +_SYS_NVPAIR_H boolean_t *fnvlist_lookup_boolean_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H uchar_t *fnvlist_lookup_byte_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H int8_t *fnvlist_lookup_int8_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H uint8_t *fnvlist_lookup_uint8_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H int16_t *fnvlist_lookup_int16_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H uint16_t *fnvlist_lookup_uint16_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H int32_t *fnvlist_lookup_int32_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H uint32_t *fnvlist_lookup_uint32_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H int64_t *fnvlist_lookup_int64_array(nvlist_t *, const char *, + uint_t *); +_SYS_NVPAIR_H uint64_t *fnvlist_lookup_uint64_array(nvlist_t *, const char *, + uint_t *); -boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); -uchar_t fnvpair_value_byte(nvpair_t *nvp); -int8_t fnvpair_value_int8(nvpair_t *nvp); -int16_t fnvpair_value_int16(nvpair_t *nvp); -int32_t fnvpair_value_int32(nvpair_t *nvp); -int64_t fnvpair_value_int64(nvpair_t *nvp); -uint8_t fnvpair_value_uint8(nvpair_t *nvp); -uint16_t fnvpair_value_uint16(nvpair_t *nvp); -uint32_t fnvpair_value_uint32(nvpair_t *nvp); -uint64_t fnvpair_value_uint64(nvpair_t *nvp); -char *fnvpair_value_string(nvpair_t *nvp); -nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); +_SYS_NVPAIR_H boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); +_SYS_NVPAIR_H uchar_t fnvpair_value_byte(nvpair_t *nvp); +_SYS_NVPAIR_H int8_t fnvpair_value_int8(nvpair_t *nvp); +_SYS_NVPAIR_H int16_t fnvpair_value_int16(nvpair_t *nvp); +_SYS_NVPAIR_H int32_t fnvpair_value_int32(nvpair_t *nvp); +_SYS_NVPAIR_H int64_t fnvpair_value_int64(nvpair_t *nvp); +_SYS_NVPAIR_H uint8_t fnvpair_value_uint8(nvpair_t 
*nvp); +_SYS_NVPAIR_H uint16_t fnvpair_value_uint16(nvpair_t *nvp); +_SYS_NVPAIR_H uint32_t fnvpair_value_uint32(nvpair_t *nvp); +_SYS_NVPAIR_H uint64_t fnvpair_value_uint64(nvpair_t *nvp); +_SYS_NVPAIR_H char *fnvpair_value_string(nvpair_t *nvp); +_SYS_NVPAIR_H nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); #ifdef __cplusplus } diff --git a/include/sys/objlist.h b/include/sys/objlist.h new file mode 100644 index 0000000000..a124a61fdc --- /dev/null +++ b/include/sys/objlist.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _OBJLIST_H +#define _OBJLIST_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct objlist_node { + list_node_t on_node; + uint64_t on_object; +} objlist_node_t; + +typedef struct objlist { + list_t ol_list; /* List of struct objnode. */ + /* + * Last object looked up. Used to assert that objects are being looked + * up in ascending order. 
+ */ + uint64_t ol_last_lookup; +} objlist_t; + +objlist_t *objlist_create(void); +void objlist_destroy(objlist_t *); +boolean_t objlist_exists(objlist_t *, uint64_t); +void objlist_insert(objlist_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _OBJLIST_H */ diff --git a/include/sys/pathname.h b/include/sys/pathname.h index 5db69b1784..52f21316c2 100644 --- a/include/sys/pathname.h +++ b/include/sys/pathname.h @@ -54,8 +54,6 @@ extern "C" { */ typedef struct pathname { char *pn_buf; /* underlying storage */ - char *pn_path; /* remaining pathname */ - size_t pn_pathlen; /* remaining length */ size_t pn_bufsize; /* total size of pn_buf */ } pathname_t; diff --git a/module/zfs/qat.h b/include/sys/qat.h similarity index 96% rename from module/zfs/qat.h rename to include/sys/qat.h index 9014c03148..9ae8eb1735 100644 --- a/module/zfs/qat.h +++ b/include/sys/qat.h @@ -40,11 +40,6 @@ typedef enum qat_encrypt_dir { #include "dc/cpa_dc.h" #include "lac/cpa_cy_sym.h" -/* - * Timeout - no response from hardware after 0.5 seconds - */ -#define QAT_TIMEOUT_MS 500 - /* * The minimal and maximal buffer size which are not restricted * in the QAT hardware, but with the input buffer size between 4KB @@ -85,7 +80,7 @@ typedef struct qat_stats { * Number of fails in the QAT compression / decompression engine. * Note: when a QAT error happens, it doesn't necessarily indicate a * critical hardware issue. Sometimes it is because the output buffer - * is not big enough. The compression job will be transfered to the + * is not big enough. The compression job will be transferred to the * gzip software implementation so the functionality of ZFS is not * impacted. */ @@ -118,7 +113,7 @@ typedef struct qat_stats { /* * Number of fails in the QAT encryption / decryption engine. * Note: when a QAT error happens, it doesn't necessarily indicate a - * critical hardware issue. The encryption job will be transfered + * critical hardware issue. 
The encryption job will be transferred * to the software implementation so the functionality of ZFS is * not impacted. */ @@ -135,7 +130,7 @@ typedef struct qat_stats { /* * Number of fails in the QAT checksum engine. * Note: when a QAT error happens, it doesn't necessarily indicate a - * critical hardware issue. The checksum job will be transfered to the + * critical hardware issue. The checksum job will be transferred to the * software implementation so the functionality of ZFS is not impacted. */ kstat_named_t cksum_fails; diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index ae1a0c323d..fef3d4d7bd 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -24,13 +24,13 @@ */ /* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H #define _SYS_RANGE_TREE_H -#include +#include #include #ifdef __cplusplus @@ -41,20 +41,35 @@ extern "C" { typedef struct range_tree_ops range_tree_ops_t; +typedef enum range_seg_type { + RANGE_SEG32, + RANGE_SEG64, + RANGE_SEG_GAP, + RANGE_SEG_NUM_TYPES, +} range_seg_type_t; + /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct range_tree { - avl_tree_t rt_root; /* offset-ordered segment AVL tree */ + zfs_btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ - uint64_t rt_gap; /* allowable inter-segment gap */ + range_seg_type_t rt_type; /* type of range_seg_t in use */ + /* + * All data that is stored in the range tree must have a start higher + * than or equal to rt_start, and all sizes and offsets must be + * multiples of 1 << rt_shift. 
+ */ + uint8_t rt_shift; + uint64_t rt_start; range_tree_ops_t *rt_ops; - /* rt_avl_compare should only be set if rt_arg is an AVL tree */ + /* rt_btree_compare should only be set if rt_arg is a b-tree */ void *rt_arg; - int (*rt_avl_compare)(const void *, const void *); + int (*rt_btree_compare)(const void *, const void *); + uint64_t rt_gap; /* allowable inter-segment gap */ /* * The rt_histogram maintains a histogram of ranges. Each bucket, @@ -64,37 +79,221 @@ typedef struct range_tree { uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; } range_tree_t; -typedef struct range_seg { - avl_node_t rs_node; /* AVL node */ - avl_node_t rs_pp_node; /* AVL picker-private node */ +typedef struct range_seg32 { + uint32_t rs_start; /* starting offset of this segment */ + uint32_t rs_end; /* ending offset (non-inclusive) */ +} range_seg32_t; + +/* + * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may + * require 64-bit integers for ranges. + */ +typedef struct range_seg64 { + uint64_t rs_start; /* starting offset of this segment */ + uint64_t rs_end; /* ending offset (non-inclusive) */ +} range_seg64_t; + +typedef struct range_seg_gap { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_t; +} range_seg_gap_t; + +/* + * This type needs to be the largest of the range segs, since it will be stack + * allocated and then cast the actual type to do tree operations. + */ +typedef range_seg_gap_t range_seg_max_t; + +/* + * This is just for clarity of code purposes, so we can make it clear that a + * pointer is to a range seg of some type; when we need to do the actual math, + * we'll figure out the real type. 
+ */ +typedef void range_seg_t; struct range_tree_ops { void (*rtop_create)(range_tree_t *rt, void *arg); void (*rtop_destroy)(range_tree_t *rt, void *arg); - void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); + void (*rtop_add)(range_tree_t *rt, void *rs, void *arg); + void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg); void (*rtop_vacate)(range_tree_t *rt, void *arg); }; +static inline uint64_t +rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((const range_seg32_t *)rs)->rs_start); + case RANGE_SEG64: + return (((const range_seg64_t *)rs)->rs_start); + case RANGE_SEG_GAP: + return (((const range_seg_gap_t *)rs)->rs_start); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((const range_seg32_t *)rs)->rs_end); + case RANGE_SEG64: + return (((const range_seg64_t *)rs)->rs_end); + case RANGE_SEG_GAP: + return (((const range_seg_gap_t *)rs)->rs_end); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: { + const range_seg32_t *r32 = (const range_seg32_t *)rs; + return (r32->rs_end - r32->rs_start); + } + case RANGE_SEG64: { + const range_seg64_t *r64 = (const range_seg64_t *)rs; + return (r64->rs_end - r64->rs_start); + } + case RANGE_SEG_GAP: + return (((const range_seg_gap_t *)rs)->rs_fill); + default: + VERIFY(0); + return (0); + } + +} + +static inline uint64_t +rs_get_start(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + 
rt->rt_start); +} + +static inline uint64_t +rs_get_end(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_fill(const range_seg_t *rs, const range_tree_t *rt) +{ + return (rs_get_fill_raw(rs, rt) << rt->rt_shift); +} + +static inline void +rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(start, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_start = (uint32_t)start; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_start = start; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_start = start; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(end, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_end = (uint32_t)end; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_end = end; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_end = end; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + /* fall through */ + case RANGE_SEG64: + ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs, + rt)); + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_fill = fill; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(start, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift)); + rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + 
ASSERT3U(end, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift)); + rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); + rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); +} + typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); -void range_tree_init(void); -void range_tree_fini(void); -range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, + range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, const void *), uint64_t gap); +range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); +boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); -range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); +uint64_t range_tree_numsegs(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); @@ -112,12 +311,17 @@ void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); void range_tree_walk(range_tree_t *rt, 
range_tree_func_t *func, void *arg); range_seg_t *range_tree_first(range_tree_t *rt); -void rt_avl_create(range_tree_t *rt, void *arg); -void rt_avl_destroy(range_tree_t *rt, void *arg); -void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_vacate(range_tree_t *rt, void *arg); -extern struct range_tree_ops rt_avl_ops; +void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto); +void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto); + +void rt_btree_create(range_tree_t *rt, void *arg); +void rt_btree_destroy(range_tree_t *rt, void *arg); +void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_vacate(range_tree_t *rt, void *arg); +extern range_tree_ops_t rt_btree_ops; #ifdef __cplusplus } diff --git a/include/sys/rrwlock.h b/include/sys/rrwlock.h index e1c1756cf2..8d296ef28f 100644 --- a/include/sys/rrwlock.h +++ b/include/sys/rrwlock.h @@ -37,7 +37,9 @@ extern "C" { #include #include -#include +#include + +extern uint_t rrw_tsd_key; /* * A reader-writer lock implementation that allows re-entrant reads, but diff --git a/include/sys/sa.h b/include/sys/sa.h index 50b9062216..98eb8f9cd7 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -51,7 +51,7 @@ typedef uint16_t sa_attr_type_t; typedef struct sa_attr_reg { char *sa_name; /* attribute name */ uint16_t sa_length; - sa_bswap_type_t sa_byteswap; /* bswap functon enum */ + sa_bswap_type_t sa_byteswap; /* bswap function enum */ sa_attr_type_t sa_attr; /* filled in during registration */ } sa_attr_reg_t; @@ -158,7 +158,7 @@ void sa_handle_lock(sa_handle_t *); void sa_handle_unlock(sa_handle_t *); #ifdef _KERNEL -int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); +int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, zfs_uio_t *); 
int sa_add_projid(sa_handle_t *, dmu_tx_t *, uint64_t); #endif diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index 7eddd8750f..fa10aff8a3 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -28,7 +28,7 @@ #define _SYS_SA_IMPL_H #include -#include +#include #include /* diff --git a/include/sys/sdt.h b/include/sys/sdt.h deleted file mode 100644 index 9704072cb7..0000000000 --- a/include/sys/sdt.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_SDT_H -#define _SYS_SDT_H - -#ifndef _KERNEL - -#define ZFS_PROBE(a) ((void) 0) -#define ZFS_PROBE1(a, c) ((void) 0) -#define ZFS_PROBE2(a, c, e) ((void) 0) -#define ZFS_PROBE3(a, c, e, g) ((void) 0) -#define ZFS_PROBE4(a, c, e, g, i) ((void) 0) - -#endif /* _KERNEL */ - -/* - * The set-error SDT probe is extra static, in that we declare its fake - * function literally, rather than with the DTRACE_PROBE1() macro. 
This is - * necessary so that SET_ERROR() can evaluate to a value, which wouldn't - * be possible if it required multiple statements (to declare the function - * and then call it). - * - * SET_ERROR() uses the comma operator so that it can be used without much - * additional code. For example, "return (EINVAL);" becomes - * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated - * twice, so it should not have side effects (e.g. something like: - * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). - */ -extern void __set_error(const char *file, const char *func, int line, int err); -#undef SET_ERROR -#define SET_ERROR(err) \ - (__set_error(__FILE__, __func__, __LINE__, err), err) - -#endif /* _SYS_SDT_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 23434edbc7..2ae467877d 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,14 +20,16 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. */ #ifndef _SYS_SPA_H @@ -42,6 +44,8 @@ #include #include #include +#include +#include #ifdef __cplusplus extern "C" { @@ -62,70 +66,12 @@ typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; +struct bpobj; +struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; -/* - * General-purpose 32-bit and 64-bit bitfield encodings. 
- */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1U << (len)); \ - ASSERT3U(low + len, <=, 32); \ - (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ -_NOTE(CONSTCOND) } while (0) - -#define BF64_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1ULL << (len)); \ - ASSERT3U(low + len, <=, 64); \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ -_NOTE(CONSTCOND) } while (0) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) -#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) - -/* - * We currently support block sizes from 512 bytes to 16MB. - * The benefits of larger blocks, and thus larger IO, need to be weighed - * against the cost of COWing a giant block to modify one byte, and the - * large latency of reading or writing a large block. - * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. 
- * - * Note that although the LSIZE field of the blkptr_t can store sizes up - * to 32MB, the dnode's dn_datablkszsec can only store sizes up to - * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_OLD_MAXBLOCKSHIFT 17 -#define SPA_MAXBLOCKSHIFT 24 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done @@ -155,6 +101,7 @@ _NOTE(CONSTCOND) } while (0) #define SPA_COMPRESSBITS 7 #define SPA_VDEVBITS 24 +#define SPA_COMPRESSMASK ((1U << SPA_COMPRESSBITS) - 1) /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). @@ -381,7 +328,7 @@ typedef struct zio_cksum_salt { #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ @@ -389,7 +336,7 @@ _NOTE(CONSTCOND) } while (0) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ @@ -397,12 +344,13 @@ _NOTE(CONSTCOND) } while (0) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) +} while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, - BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ - NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED + BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. 
*/ + BP_EMBEDDED_TYPE_REDACTED, + NUM_BP_EMBEDDED_TYPES } bp_embedded_type_t; #define BPE_NUM_WORDS 14 @@ -436,6 +384,12 @@ typedef struct blkptr { /* * Macros to get and set fields in a bp or DVA. */ + +/* + * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for + * this gang DVA including its children BP's. The space allocated at this + * DVA's vdev/offset is vdev_gang_header_asize(vdev). + */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ @@ -465,7 +419,7 @@ typedef struct blkptr { ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ @@ -474,7 +428,7 @@ _NOTE(CONSTCOND) } while (0) ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define BP_GET_COMPRESS(bp) \ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) @@ -490,7 +444,7 @@ _NOTE(CONSTCOND) } while (0) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ -_NOTE(CONSTCOND) } while (0) +} while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) @@ -524,6 +478,9 @@ _NOTE(CONSTCOND) } while (0) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) +#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) +#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) + #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? 
(bp)->blk_phys_birth : (bp)->blk_birth) @@ -602,6 +559,14 @@ _NOTE(CONSTCOND) } while (0) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) +#define BP_SET_REDACTED(bp) \ +{ \ + BP_SET_EMBEDDED(bp, B_TRUE); \ + BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED); \ +} +#define BP_IS_REDACTED(bp) \ + (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED) + /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) @@ -623,7 +588,7 @@ _NOTE(CONSTCOND) } while (0) ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } -#ifdef _BIG_ENDIAN +#ifdef _ZFS_BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) @@ -638,6 +603,7 @@ _NOTE(CONSTCOND) } while (0) * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ + #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ @@ -678,6 +644,13 @@ _NOTE(CONSTCOND) } while (0) (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ + } else if (BP_IS_REDACTED(bp)) { \ + len += func(buf + len, size - len, \ + "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ @@ -738,14 +711,26 @@ typedef enum spa_import_type { SPA_IMPORT_ASSEMBLE } spa_import_type_t; +typedef enum spa_mode { + SPA_MODE_UNINIT = 0, + SPA_MODE_READ = 1, + SPA_MODE_WRITE = 2, +} spa_mode_t; + /* * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes + * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. 
*/ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ - SPA_AUTOTRIM_ON + SPA_AUTOTRIM_ON, +#ifdef IN_FREEBSD_BASE + SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, +#else + SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, +#endif } spa_autotrim_t; /* @@ -754,6 +739,7 @@ typedef enum { typedef enum trim_type { TRIM_TYPE_MANUAL = 0, TRIM_TYPE_AUTO = 1, + TRIM_TYPE_SIMPLE = 2 } trim_type_t; /* state manipulation functions */ @@ -767,20 +753,23 @@ extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); -extern int spa_destroy(char *pool); +extern int spa_destroy(const char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, +extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); -extern int spa_reset(char *pool); +extern int spa_reset(const char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); +extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); +extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 @@ -793,17 +782,14 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 - -/* - * Controls the behavior of spa_vdev_remove(). 
- */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 +#define SPA_ASYNC_L2CACHE_REBUILD 0x800 +#define SPA_ASYNC_L2CACHE_TRIM 0x1000 +#define SPA_ASYNC_REBUILD_DONE 0x2000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); + int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); @@ -858,6 +844,9 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); +extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, + vdev_t *parent, uint_t id, int atype); + /* * Miscellaneous SPA routines in spa_misc.c @@ -893,7 +882,7 @@ typedef struct spa_history_kstat { uint64_t count; uint64_t size; kstat_t *kstat; - void *private; + void *priv; list_t list; } spa_history_kstat_t; @@ -906,7 +895,6 @@ typedef struct spa_stats { spa_history_list_t read_history; spa_history_list_t txg_history; spa_history_kstat_t tx_assign_histogram; - spa_history_kstat_t io_history; spa_history_list_t mmp_history; spa_history_kstat_t state; /* pool state */ spa_history_kstat_t iostats; @@ -942,6 +930,12 @@ typedef struct spa_iostats { kstat_named_t autotrim_bytes_skipped; kstat_named_t autotrim_extents_failed; kstat_named_t autotrim_bytes_failed; + kstat_named_t simple_trim_extents_written; + kstat_named_t simple_trim_bytes_written; + kstat_named_t simple_trim_extents_skipped; + kstat_named_t simple_trim_bytes_skipped; + kstat_named_t simple_trim_extents_failed; + kstat_named_t simple_trim_bytes_failed; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); @@ -976,12 +970,13 @@ extern int spa_import_progress_set_state(uint64_t pool_guid, /* Pool configuration locks */ extern int 
spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_exit(spa_t *spa, int locks, void *tag); +extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); @@ -1036,6 +1031,7 @@ extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); +extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, @@ -1053,14 +1049,17 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); /* Miscellaneous support routines */ -extern void spa_load_failed(spa_t *spa, const char *fmt, ...); -extern void spa_load_note(spa_t *spa, const char *fmt, ...); +extern void spa_load_failed(spa_t *spa, const char *fmt, ...) + __attribute__((format(printf, 2, 3))); +extern void spa_load_note(spa_t *spa, const char *fmt, ...) 
+ __attribute__((format(printf, 2, 3))); extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); @@ -1068,7 +1067,6 @@ extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); -extern uint64_t spa_get_random(uint64_t range); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); @@ -1091,7 +1089,6 @@ extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); -extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, @@ -1103,11 +1100,13 @@ extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); +extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); -extern unsigned long spa_get_hostid(void); +extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); +extern boolean_t spa_livelist_delete_check(spa_t *spa); -extern int spa_mode(spa_t *spa); +extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; @@ -1120,22 +1119,23 @@ extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation, 
dmu_tx_t *tx); extern void spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); + dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, - dmu_tx_t *tx, const char *fmt, ...); + dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); + dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); -extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t length); -extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd, +extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); +extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, zio_t *zio); +extern void zfs_ereport_taskq_fini(void); +extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); @@ -1157,7 +1157,7 @@ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ -extern void spa_init(int flags); +extern void spa_init(spa_mode_t mode); extern void spa_fini(void); extern void spa_boot_init(void); @@ -1170,6 +1170,22 @@ extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); +extern void zfs_ereport_zvol_post(const char *subclass, const char *name, + 
const char *device_name, const char *raw_name); + +/* waiting for pool activities to complete */ +extern int spa_wait(const char *pool, zpool_wait_activity_t activity, + boolean_t *waited); +extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity, + uint64_t tag, boolean_t *waited); +extern void spa_notify_waiters(spa_t *spa); +extern void spa_wake_waiters(spa_t *spa); + +/* module param call functions */ +int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS); +int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); +int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); +int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ @@ -1179,12 +1195,12 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) #else #define dprintf_bp(bp, fmt, ...) #endif -extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ +extern spa_mode_t spa_mode_global; extern int zfs_deadman_enabled; extern unsigned long zfs_deadman_synctime_ms; extern unsigned long zfs_deadman_ziotime_ms; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 66032d9aad..9714bbce9c 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. 
@@ -34,7 +34,9 @@ #include #include +#include #include +#include #include #include #include @@ -42,18 +44,24 @@ #include #include #include -#include +#include #include #include #include #include #include +#include #include #ifdef __cplusplus extern "C" { #endif +typedef struct spa_alloc { + kmutex_t spaa_lock; + avl_tree_t spaa_tree; +} ____cacheline_aligned spa_alloc_t; + typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; @@ -138,9 +146,9 @@ typedef struct spa_config_lock { kmutex_t scl_lock; kthread_t *scl_writer; int scl_write_wanted; + int scl_count; kcondvar_t scl_cv; - zfs_refcount_t scl_count; -} spa_config_lock_t; +} ____cacheline_aligned spa_config_lock_t; typedef struct spa_config_dirent { list_node_t scd_link; @@ -214,13 +222,16 @@ struct spa { spa_load_state_t spa_load_state; /* current load operation */ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ boolean_t spa_trust_config; /* do we trust vdev tree? */ + boolean_t spa_is_splitting; /* in the middle of a split? */ spa_config_source_t spa_config_source; /* where config comes from? 
*/ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ + boolean_t spa_is_exporting; /* true while exporting pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ + metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ @@ -235,21 +246,20 @@ struct spa { kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ - int spa_min_ashift; /* of vdevs in normal class */ - int spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_ashift; /* of vdevs in normal class */ + uint64_t spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ /* - * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are - * stored in spa_alloc_count. There is one tree and one lock for each - * allocator, to help improve allocation performance in write-heavy - * workloads. + * spa_allocs is an array, whose length is stored in spa_alloc_count. + * There is one tree and one lock for each allocator, to help improve + * allocation performance in write-heavy workloads. 
*/ - kmutex_t *spa_alloc_locks; - avl_tree_t *spa_alloc_trees; + spa_alloc_t *spa_allocs; int spa_alloc_count; spa_aux_vdev_t spa_spares; /* hot spares */ @@ -269,7 +279,9 @@ struct spa { boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ - uint64_t spa_load_verify_ios; /* in-flight verification IOs */ + + /* in-flight verification bytes */ + uint64_t spa_load_verify_bytes; kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ @@ -307,6 +319,19 @@ struct spa { spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; + space_map_t *spa_syncing_log_sm; /* current log space map */ + avl_tree_t spa_sm_logs_by_txg; + kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ + avl_tree_t spa_metaslabs_by_flushed; + spa_unflushed_stats_t spa_unflushed_stats; + list_t spa_log_summary; + uint64_t spa_log_flushall_txg; + + zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ + zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ + uint64_t spa_livelists_to_delete; /* set of livelists to free */ + livelist_condense_entry_t spa_to_condense; /* next to condense */ + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ @@ -344,13 +369,13 @@ struct spa { uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ - int spa_mode; /* FREAD | FWRITE */ + spa_mode_t spa_mode; /* SPA_MODE_{READ|WRITE} */ + boolean_t spa_read_spacemaps; /* spacemaps available if ro */ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* 
in-core DDTs */ uint64_t spa_ddt_stat_object; /* DDT statistics */ uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ - uint64_t spa_dedup_ditto; /* dedup ditto threshold */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ @@ -358,7 +383,7 @@ struct spa { kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ spa_proc_state_t spa_proc_state; /* see definition */ proc_t *spa_proc; /* "zpool-poolname" process */ - uint64_t spa_did; /* if procp != p0, did of t1 */ + uintptr_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ @@ -394,6 +419,16 @@ struct spa { mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ uint64_t spa_leaf_list_gen; /* track leaf_list changes */ + uint32_t spa_hostid; /* cached system hostid */ + + /* synchronization for threads in spa_wait */ + kmutex_t spa_activities_lock; + kcondvar_t spa_activities_cv; + kcondvar_t spa_waiters_cv; + int spa_waiters; /* number of waiting threads */ + boolean_t spa_waiters_cancel; /* waiters should return */ + + char *spa_compatibility; /* compatibility file(s) */ /* * spa_refcount & spa_config_lock must be the last elements @@ -408,7 +443,8 @@ struct spa { }; extern char *spa_config_path; - +extern char *zfs_deadman_failmode; +extern int spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, @@ -418,7 +454,10 @@ extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name); extern void spa_event_post(sysevent_t *ev); - +extern int 
param_set_deadman_failmode_common(const char *val); +extern void spa_set_deadman_synctime(hrtime_t ns); +extern void spa_set_deadman_ziotime(hrtime_t ns); +extern const char *spa_history_zone(void); #ifdef __cplusplus } diff --git a/include/sys/spa_log_spacemap.h b/include/sys/spa_log_spacemap.h new file mode 100644 index 0000000000..b2ed77fac3 --- /dev/null +++ b/include/sys/spa_log_spacemap.h @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_SPA_LOG_SPACEMAP_H +#define _SYS_SPA_LOG_SPACEMAP_H + +#include + +typedef struct log_summary_entry { + uint64_t lse_start; /* start TXG */ + uint64_t lse_mscount; /* # of metaslabs needed to be flushed */ + uint64_t lse_blkcount; /* blocks held by this entry */ + list_node_t lse_node; +} log_summary_entry_t; + +typedef struct spa_unflushed_stats { + /* used for memory heuristic */ + uint64_t sus_memused; /* current memory used for unflushed trees */ + + /* used for block heuristic */ + uint64_t sus_blocklimit; /* max # of log blocks allowed */ + uint64_t sus_nblocks; /* # of blocks in log space maps currently */ +} spa_unflushed_stats_t; + +typedef struct spa_log_sm { + uint64_t sls_sm_obj; /* space map object ID */ + uint64_t sls_txg; /* txg logged on the space map */ + uint64_t sls_nblocks; /* number of blocks in this log */ + uint64_t sls_mscount; /* # of metaslabs flushed in the log's txg */ + avl_node_t sls_node; /* node in spa_sm_logs_by_txg */ +} spa_log_sm_t; + +int spa_ld_log_spacemaps(spa_t *); + +void spa_generate_syncing_log_sm(spa_t *, dmu_tx_t *); +void spa_flush_metaslabs(spa_t *, dmu_tx_t *); +void spa_sync_close_syncing_log_sm(spa_t *); + +void spa_cleanup_old_sm_logs(spa_t *, dmu_tx_t *); + +uint64_t spa_log_sm_blocklimit(spa_t *); +void spa_log_sm_set_blocklimit(spa_t *); +uint64_t spa_log_sm_nblocks(spa_t *); +uint64_t spa_log_sm_memused(spa_t *); + +void spa_log_sm_decrement_mscount(spa_t *, uint64_t); +void spa_log_sm_increment_current_mscount(spa_t *); + +void spa_log_summary_add_flushed_metaslab(spa_t *); +void spa_log_summary_decrement_mscount(spa_t *, uint64_t); +void spa_log_summary_decrement_blkcount(spa_t *, uint64_t); + +boolean_t spa_flush_all_logs_requested(spa_t *); + +extern int zfs_keep_log_spacemaps_at_export; + +#endif /* _SYS_SPA_LOG_SPACEMAP_H */ diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 7731a352f1..cb81e710bd 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ 
-24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -72,6 +72,11 @@ typedef struct space_map_phys { * bucket, smp_histogram[i], contains the number of free regions * whose size is: * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) + * + * Note that, if log space map feature is enabled, histograms of + * space maps that belong to metaslabs will take into account any + * unflushed changes for their metaslabs, even though the actual + * space map doesn't have entries for these changes. */ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; } space_map_phys_t; @@ -143,6 +148,15 @@ typedef struct space_map_entry { uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ uint64_t sme_run; /* max is 2^36; units of sm_shift */ + + /* + * The following fields are not part of the actual space map entry + * on-disk and they are populated with the values from the debug + * entry most recently visited starting from the beginning to the + * end of the space map. 
+ */ + uint64_t sme_txg; + uint64_t sme_sync_pass; } space_map_entry_t; #define SM_NO_VDEVID (1 << SPA_VDEVBITS) @@ -209,6 +223,8 @@ void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, uint64_t space_map_object(space_map_t *sm); int64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); +uint64_t space_map_entries(space_map_t *sm, range_tree_t *rt); +uint64_t space_map_nblocks(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx); diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h index 249b15be67..ca9d41dc13 100644 --- a/include/sys/space_reftree.h +++ b/include/sys/space_reftree.h @@ -31,7 +31,7 @@ #define _SYS_SPACE_REFTREE_H #include - +#include #ifdef __cplusplus extern "C" { #endif diff --git a/include/sys/sysevent/Makefile.am b/include/sys/sysevent/Makefile.am index e9af2684f1..64e5376395 100644 --- a/include/sys/sysevent/Makefile.am +++ b/include/sys/sysevent/Makefile.am @@ -1,19 +1,15 @@ COMMON_H = \ - $(top_srcdir)/include/sys/sysevent/eventdefs.h \ - $(top_srcdir)/include/sys/sysevent/dev.h - -KERNEL_H = - -USER_H = - -EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + eventdefs.h \ + dev.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys/sysevent -libzfs_HEADERS = $(COMMON_H) $(USER_H) +libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL +if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/sysevent -kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +kernel_HEADERS = $(COMMON_H) +endif endif diff --git a/include/sys/trace.h b/include/sys/trace.h deleted file mode 100644 index f32ba529ec..0000000000 --- a/include/sys/trace.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM zfs - -#if !defined(_TRACE_ZFS_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_ZFS_H - -#include -#include - -/* - * The sys/trace_dbgmsg.h header defines tracepoint events for - * dprintf(), dbgmsg(), and SET_ERROR(). - */ -#define _SYS_TRACE_DBGMSG_INDIRECT -#include -#undef _SYS_TRACE_DBGMSG_INDIRECT - -/* - * Redefine the DTRACE_PROBE* functions to use Linux tracepoints - */ -#undef DTRACE_PROBE1 -#define DTRACE_PROBE1(name, t1, arg1) \ - trace_zfs_##name((arg1)) - -#undef DTRACE_PROBE2 -#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \ - trace_zfs_##name((arg1), (arg2)) - -#undef DTRACE_PROBE3 -#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \ - trace_zfs_##name((arg1), (arg2), (arg3)) - -#undef DTRACE_PROBE4 -#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \ - trace_zfs_##name((arg1), (arg2), (arg3), (arg4)) - -#endif /* _TRACE_ZFS_H */ - -#undef TRACE_INCLUDE_PATH -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH sys -#define TRACE_INCLUDE_FILE trace -#include - -#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/txg.h b/include/sys/txg.h index 760d5208bf..f38f0006c0 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -41,6 +41,7 @@ extern "C" { #define TXG_MASK (TXG_SIZE - 1) /* 
mask for size */ #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +#define TXG_UNKNOWN 0 /* Number of txgs worth of frees we defer adding to in-core spacemaps */ #define TXG_DEFER_SIZE 2 @@ -77,7 +78,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); +extern void txg_kick(struct dsl_pool *dp, uint64_t txg); /* * Wait until the given transaction group has finished syncing. @@ -87,6 +88,11 @@ extern void txg_kick(struct dsl_pool *dp); */ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); +/* + * Wait as above. Returns true if the thread was signaled while waiting. + */ +extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); + /* * Wait until the given transaction group, or one after it, is * the open transaction group. Try to make this happen as soon diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h index 4e05214919..047d51b94c 100644 --- a/include/sys/txg_impl.h +++ b/include/sys/txg_impl.h @@ -43,7 +43,7 @@ extern "C" { * the number of active transaction holds (tc_count). As transactions * are assigned into a transaction group the appropriate tc_count is * incremented to indicate that there are pending changes that have yet - * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement + * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement * the tc_count. A transaction group is not considered quiesced until all * tx_cpu structures have reached a tc_count of zero. * @@ -78,7 +78,7 @@ struct tx_cpu { /* * The tx_state structure maintains the state information about the different - * stages of the pool's transcation groups. A per pool tx_state structure + * stages of the pool's transaction groups. A per pool tx_state structure * is used to track this information. 
The tx_state structure also points to * an array of tx_cpu structures (described above). Although the tx_sync_lock * is used to protect the members of this structure, it is not used to diff --git a/include/sys/u8_textprep.h b/include/sys/u8_textprep.h index f8b5bed6e4..09ab13af26 100644 --- a/include/sys/u8_textprep.h +++ b/include/sys/u8_textprep.h @@ -101,7 +101,7 @@ extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int); #define U8_ILLEGAL_CHAR (-1) #define U8_OUT_OF_RANGE_CHAR (-2) -extern int u8_validate(char *, size_t, char **, int, int *); +extern int u8_validate(const char *, size_t, char **, int, int *); extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *); extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t, int *); diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index cfef0b95db..cde3ef4048 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -41,9 +41,28 @@ #include -extern int uiomove(void *, size_t, enum uio_rw, uio_t *); -extern int uio_prefaultpages(ssize_t, uio_t *); -extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); -extern void uioskip(uio_t *, size_t); +extern int zfs_uiomove(void *, size_t, zfs_uio_rw_t, zfs_uio_t *); +extern int zfs_uiocopy(void *, size_t, zfs_uio_rw_t, zfs_uio_t *, size_t *); +extern void zfs_uioskip(zfs_uio_t *, size_t); + +static inline void +zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) +{ + *base = zfs_uio_iovbase(uio, idx); + *len = zfs_uio_iovlen(uio, idx); +} + +static inline offset_t +zfs_uio_index_at_offset(zfs_uio_t *uio, offset_t off, uint_t *vec_idx) +{ + *vec_idx = 0; + while (*vec_idx < zfs_uio_iovcnt(uio) && + off >= zfs_uio_iovlen(uio, *vec_idx)) { + off -= zfs_uio_iovlen(uio, *vec_idx); + (*vec_idx)++; + } + + return (off); +} #endif /* _SYS_UIO_IMPL_H */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 67ca0d1161..0a81713a44 100644 --- 
a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -21,8 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Datto Inc. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -32,6 +33,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -48,10 +50,14 @@ typedef enum vdev_dtl_type { extern int zfs_nocacheflush; -extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); +typedef boolean_t vdev_open_children_func_t(vdev_t *vd); + +extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) + __attribute__((format(printf, 2, 3))); extern void vdev_dbgmsg_print_tree(vdev_t *, int); extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); +extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *); extern int vdev_validate(vdev_t *); extern int vdev_copy_path_strict(vdev_t *, vdev_t *); extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); @@ -70,9 +76,12 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); -extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); +extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, - int scrub_done); + boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); @@ -85,6 +94,7 @@ extern void 
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size); extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, uint64_t offset, uint64_t size, dmu_tx_t *tx); +extern boolean_t vdev_replace_in_progress(vdev_t *vdev); extern void vdev_hold(vdev_t *); extern void vdev_rele(vdev_t *); @@ -95,10 +105,19 @@ extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, char *tag); -extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, - range_seg_t *physical_rs); + +typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs); + +extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs); +extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs); +extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); + +extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc); + extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); @@ -115,6 +134,15 @@ extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); +/* + * Return the amount of space allocated for a gang block header. 
+ */ +static inline uint64_t +vdev_gang_header_asize(vdev_t *vd) +{ + return (vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); +} + extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, @@ -151,7 +179,8 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); -extern void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd); +extern void vdev_defer_resilver(vdev_t *vd); +extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx); typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, @@ -175,7 +204,9 @@ extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t - offset, uint64_t size, zio_done_func_t *done, void *private, int flags); + offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); +extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); +extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h index 908f5f3263..a7e19fbf0c 100644 --- a/include/sys/vdev_disk.h +++ b/include/sys/vdev_disk.h @@ -42,13 +42,5 @@ #ifdef _KERNEL #include - -typedef struct vdev_disk { - ddi_devid_t vd_devid; - char *vd_minor; - struct block_device *vd_bdev; - krwlock_t vd_lock; -} vdev_disk_t; - #endif /* _KERNEL */ #endif /* _SYS_VDEV_DISK_H */ diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h new file mode 100644 index 0000000000..52ce4ba161 --- /dev/null +++ b/include/sys/vdev_draid.h @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The 
contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_DRAID_H +#define _SYS_VDEV_DRAID_H + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Constants required to generate and use dRAID permutations. + */ +#define VDEV_DRAID_SEED 0xd7a1d5eed +#define VDEV_DRAID_MAX_MAPS 254 +#define VDEV_DRAID_ROWSHIFT SPA_MAXBLOCKSHIFT +#define VDEV_DRAID_ROWHEIGHT (1ULL << VDEV_DRAID_ROWSHIFT) +#define VDEV_DRAID_REFLOW_RESERVE (2 * VDEV_DRAID_ROWHEIGHT) + +/* + * dRAID permutation map. + */ +typedef struct draid_map { + uint64_t dm_children; /* # of permutation columns */ + uint64_t dm_nperms; /* # of permutation rows */ + uint64_t dm_seed; /* dRAID map seed */ + uint64_t dm_checksum; /* Checksum of generated map */ + uint8_t *dm_perms; /* base permutation array */ +} draid_map_t; + +/* + * dRAID configuration. + */ +typedef struct vdev_draid_config { + /* + * Values read from the dRAID nvlist configuration. 
+ */ + uint64_t vdc_ndata; /* # of data devices in group */ + uint64_t vdc_nparity; /* # of parity devices in group */ + uint64_t vdc_nspares; /* # of distributed spares */ + uint64_t vdc_children; /* # of children */ + uint64_t vdc_ngroups; /* # groups per slice */ + + /* + * Immutable derived constants. + */ + uint8_t *vdc_perms; /* permutation array */ + uint64_t vdc_nperms; /* # of permutations */ + uint64_t vdc_groupwidth; /* = data + parity */ + uint64_t vdc_ndisks; /* = children - spares */ + uint64_t vdc_groupsz; /* = groupwidth * VDEV_DRAID_ROWHEIGHT */ + uint64_t vdc_devslicesz; /* = (groupsz * groups) / ndisks */ +} vdev_draid_config_t; + +/* + * Functions for handling dRAID permutation maps. + */ +extern uint64_t vdev_draid_rand(uint64_t *); +extern int vdev_draid_lookup_map(uint64_t, const draid_map_t **); +extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **); + +/* + * General dRAID support functions. + */ +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); +extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); +extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); +extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); + +/* Functions for dRAID distributed spares. 
*/ +extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DRAID_H */ diff --git a/include/sys/vdev_file.h b/include/sys/vdev_file.h index 9a398c5839..1514a44fca 100644 --- a/include/sys/vdev_file.h +++ b/include/sys/vdev_file.h @@ -34,7 +34,7 @@ extern "C" { #endif typedef struct vdev_file { - vnode_t *vf_vnode; + zfs_file_t *vf_file; } vdev_file_t; extern void vdev_file_init(void); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index f6f7bbb4b2..3cfde40a77 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. 
*/ @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -67,14 +68,19 @@ extern uint32_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ +typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); +typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, - uint64_t *ashift); + uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); +typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); -typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); @@ -86,13 +92,24 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, - range_seg_t *res); +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, + range_seg64_t *physical, range_seg64_t *remain); +typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, + uint64_t size, uint64_t max_segment); +typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, + uint64_t *sizep); +typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); +typedef uint64_t vdev_nparity_func_t(vdev_t *vd); +typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { + vdev_init_func_t *vdev_op_init; + vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t 
*vdev_op_close; vdev_asize_func_t *vdev_op_asize; + vdev_min_asize_func_t *vdev_op_min_asize; + vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; @@ -100,11 +117,12 @@ typedef const struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; - /* - * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. - * Used when initializing vdevs. Isn't used by leaf ops. - */ vdev_xlation_func_t *vdev_op_xlate; + vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; + vdev_metaslab_init_func_t *vdev_op_metaslab_init; + vdev_config_generate_func_t *vdev_op_config_generate; + vdev_nparity_func_t *vdev_op_nparity; + vdev_ndisks_func_t *vdev_op_ndisks; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -147,6 +165,9 @@ struct vdev_queue { avl_tree_t vq_write_offset_tree; avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; + zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_ia_active; /* Active interactive I/Os. */ + uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ @@ -215,13 +236,30 @@ struct vdev { uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ + + /* + * Logical block alignment shift + * + * The smallest sized/aligned I/O supported by the device. + */ + uint64_t vdev_logical_ashift; + /* + * Physical block alignment shift + * + * The device supports logical I/Os with vdev_logical_ashift + * size/alignment, but optimum performance will be achieved by + * aligning/sizing requests to vdev_physical_ashift. Smaller + * requests may be inflated or incur device level read-modify-write + * operations. 
+ * + * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). + */ + uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ - vnode_t *vdev_name_vp; /* vnode for pathname */ - vnode_t *vdev_devid_vp; /* vnode for devid */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ @@ -231,8 +269,11 @@ struct vdev { boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ boolean_t vdev_nonrot; /* true if solid state */ + int vdev_load_error; /* error on last load */ int vdev_open_error; /* error on last open */ + int vdev_validate_error; /* error on last validate */ kthread_t *vdev_open_thread; /* thread opening children */ + kthread_t *vdev_validate_thread; /* thread validating children */ uint64_t vdev_crtxg; /* txg when top-level was added */ /* @@ -242,6 +283,7 @@ struct vdev { uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ + metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ @@ -274,7 +316,7 @@ struct vdev { range_tree_t *vdev_initialize_tree; /* valid while initializing */ uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; - time_t vdev_initialize_action_time; /* start and end time */ + uint64_t vdev_initialize_action_time; /* start and end time */ /* TRIM related */ boolean_t vdev_trim_exit_wanted; @@ -295,15 +337,25 @@ struct vdev { uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */ uint64_t vdev_trim_partial; /* requested 
partial TRIM */ uint64_t vdev_trim_secure; /* requested secure TRIM */ - time_t vdev_trim_action_time; /* start and end time */ + uint64_t vdev_trim_action_time; /* start and end time */ - /* for limiting outstanding I/Os (initialize and TRIM) */ + /* Rebuild related */ + boolean_t vdev_rebuilding; + boolean_t vdev_rebuild_exit_wanted; + boolean_t vdev_rebuild_cancel_wanted; + boolean_t vdev_rebuild_reset_wanted; + kmutex_t vdev_rebuild_lock; + kcondvar_t vdev_rebuild_cv; + kthread_t *vdev_rebuild_thread; + vdev_rebuild_t vdev_rebuild_config; + + /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; - uint64_t vdev_trim_inflight[2]; + uint64_t vdev_trim_inflight[3]; /* * Values stored in the config for an indirect or removing vdev. @@ -360,7 +412,7 @@ struct vdev { uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ + uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ @@ -406,17 +458,16 @@ struct vdev { kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ /* - * We rate limit ZIO delay and ZIO checksum events, since they + * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. 
*/ zfs_ratelimit_t vdev_delay_rl; + zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_checksum_rl; }; -#define VDEV_RAIDZ_MAXPARITY 3 - #define VDEV_PAD_SIZE (8 << 10) -/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +/* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -443,12 +494,41 @@ typedef struct vdev_phys { zio_eck_t vp_zbt; } vdev_phys_t; +typedef enum vbe_vers { + /* + * The bootenv file is stored as ascii text in the envblock. + * It is used by the GRUB bootloader used on Linux to store the + * contents of the grubenv file. The file is stored as raw ASCII, + * and is protected by an embedded checksum. By default, GRUB will + * check if the boot filesystem supports storing the environment data + * in a special location, and if so, will invoke filesystem specific + * logic to retrieve it. This can be overridden by a variable, should + * the user so desire. + */ + VB_RAW = 0, + + /* + * The bootenv file is converted to an nvlist and then packed into the + * envblock. 
+ */ + VB_NVLIST = 1 +} vbe_vers_t; + +typedef struct vdev_boot_envblock { + uint64_t vbe_version; + char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - + sizeof (zio_eck_t)]; + zio_eck_t vbe_zbt; +} vdev_boot_envblock_t; + +CTASSERT_GLOBAL(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE); + typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ - char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ + vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ -} vdev_label_t; /* 256K total */ +} vdev_label_t; /* 256K total */ /* * vdev_dirty() flags @@ -471,6 +551,9 @@ typedef struct vdev_label { #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS +#define VDEV_OFFSET_IS_LABEL(vd, off) \ + (((off) < VDEV_LABEL_START_SIZE) || \ + ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 @@ -516,6 +599,8 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; @@ -526,16 +611,20 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, - range_seg_t *out); +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); +extern uint64_t vdev_get_min_alloc(vdev_t *vd); +extern uint64_t vdev_get_nparity(vdev_t *vd); +extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables */ -extern 
int vdev_standard_sm_blksz; +extern int zfs_vdev_standard_sm_blksz; /* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; @@ -552,6 +641,15 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); +void vdev_metaslab_group_create(vdev_t *vd); + +/* + * Vdev ashift optimization tunables + */ +extern uint64_t zfs_vdev_min_auto_ashift; +extern uint64_t zfs_vdev_max_auto_ashift; +int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); +int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); #ifdef __cplusplus } diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 2ce32469d4..ee597eb0db 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -32,6 +32,7 @@ extern "C" { #endif struct zio; +struct raidz_row; struct raidz_map; #if !defined(_KERNEL) struct kernel_param {}; @@ -43,20 +44,30 @@ struct kernel_param {}; struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); -int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_child_done(zio_t *); +void vdev_raidz_io_done(zio_t *); + +extern const zio_vsd_ops_t vdev_raidz_vsd_ops; /* * vdev_raidz_math interface */ void vdev_raidz_math_init(void); void vdev_raidz_math_fini(void); -struct raidz_impl_ops *vdev_raidz_math_get_ops(void); -int vdev_raidz_math_generate(struct raidz_map *); -int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *, - const int); +const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); +int vdev_raidz_math_generate(struct raidz_map *, struct raidz_row *); +int 
vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, + const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz { + int vd_logical_width; + int vd_nparity; +} vdev_raidz_t; + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 0799ed19df..908723da0c 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -90,7 +91,7 @@ typedef boolean_t (*will_work_f)(void); typedef void (*init_impl_f)(void); typedef void (*fini_impl_f)(void); -#define RAIDZ_IMPL_NAME_MAX (16) +#define RAIDZ_IMPL_NAME_MAX (20) typedef struct raidz_impl_ops { init_impl_f init; @@ -105,34 +106,48 @@ typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ + abd_t rc_abdstruct; /* rc_abd probably points here */ abd_t *rc_abd; /* I/O data */ - void *rc_gdata; /* used to store the "good" version */ + abd_t *rc_orig_data; /* pre-reconstruction */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore; /* need to restore from orig_data? 
*/ + uint8_t rc_force_repair; /* Write good data to this column */ + uint8_t rc_allow_repair; /* Allow repair I/O to this column */ } raidz_col_t; +typedef struct raidz_row { + uint64_t rr_cols; /* Regular column count */ + uint64_t rr_scols; /* Count including skipped columns */ + uint64_t rr_bigcols; /* Remainder data column count */ + uint64_t rr_missingdata; /* Count of missing data devices */ + uint64_t rr_missingparity; /* Count of missing parity devices */ + uint64_t rr_firstdatacol; /* First data column/parity count */ + abd_t *rr_abd_empty; /* dRAID empty sector buffer */ + int rr_nempty; /* empty sectors included in parity */ +#ifdef ZFS_DEBUG + uint64_t rr_offset; /* Logical offset for *_io_verify() */ + uint64_t rr_size; /* Physical size for *_io_verify() */ +#endif + raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ +} raidz_row_t; + typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ - uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ - raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + boolean_t rm_ecksuminjected; /* checksum error was injected */ + int rm_nrows; /* Regular row count */ + int rm_nskip; /* RAIDZ sectors skipped for padding */ + int rm_skipstart; /* Column index of padding start */ + 
const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) extern const raidz_impl_ops_t vdev_raidz_scalar_impl; +extern boolean_t raidz_will_scalar_work(void); + #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ extern const raidz_impl_ops_t vdev_raidz_sse2_impl; #endif @@ -152,20 +167,24 @@ extern const raidz_impl_ops_t vdev_raidz_avx512bw_impl; extern const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl; extern const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl; #endif +#if defined(__powerpc__) +extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; +#endif /* * Commonly used raidz_map helpers * * raidz_parity Returns parity of the RAIDZ block * raidz_ncols Returns number of columns the block spans - * raidz_nbigcols Returns number of big columns columns + * Note, all rows have the same number of columns. + * raidz_nbigcols Returns number of big columns * raidz_col_p Returns pointer to a column * raidz_col_size Returns size of a column * raidz_big_size Returns size of big columns * raidz_short_size Returns size of short columns */ -#define raidz_parity(rm) ((rm)->rm_firstdatacol) -#define raidz_ncols(rm) ((rm)->rm_cols) +#define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol) +#define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols) #define raidz_nbigcols(rm) ((rm)->rm_bigcols) #define raidz_col_p(rm, c) ((rm)->rm_col + (c)) #define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size) @@ -180,10 +199,10 @@ extern const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl; */ #define _RAIDZ_GEN_WRAP(code, impl) \ static void \ -impl ## _gen_ ## code(void *rmp) \ +impl ## _gen_ ## code(void *rrp) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - raidz_generate_## code ## _impl(rm); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + raidz_generate_## code ## _impl(rr); \ } /* @@ -194,10 +213,10 @@ impl ## _gen_ ## code(void *rmp) \ */ #define 
_RAIDZ_REC_WRAP(code, impl) \ static int \ -impl ## _rec_ ## code(void *rmp, const int *tgtidx) \ +impl ## _rec_ ## code(void *rrp, const int *tgtidx) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \ } /* diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h new file mode 100644 index 0000000000..b59fbe1539 --- /dev/null +++ b/include/sys/vdev_rebuild.h @@ -0,0 +1,101 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_REBUILD_H +#define _SYS_VDEV_REBUILD_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Number of entries in the physical vdev_rebuild_phys structure. This + * state is stored per top-level as VDEV_ZAP_TOP_VDEV_REBUILD_PHYS. + */ +#define REBUILD_PHYS_ENTRIES 12 + +/* + * On-disk rebuild configuration and state. When adding new fields they + * must be added to the end of the structure. 
+ */ +typedef struct vdev_rebuild_phys { + uint64_t vrp_rebuild_state; /* vdev_rebuild_state_t */ + uint64_t vrp_last_offset; /* last rebuilt offset */ + uint64_t vrp_min_txg; /* minimum missing txg */ + uint64_t vrp_max_txg; /* maximum missing txg */ + uint64_t vrp_start_time; /* start time */ + uint64_t vrp_end_time; /* end time */ + uint64_t vrp_scan_time_ms; /* total run time in ms */ + uint64_t vrp_bytes_scanned; /* alloc bytes scanned */ + uint64_t vrp_bytes_issued; /* read bytes issued */ + uint64_t vrp_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrp_bytes_est; /* total bytes to scan */ + uint64_t vrp_errors; /* errors during rebuild */ +} vdev_rebuild_phys_t; + +/* + * The vdev_rebuild_t describes the current state and how a top-level vdev + * should be rebuilt. The core elements are the top-vdev, the metaslab being + * rebuilt, range tree containing the allocated extents and the on-disk state. + */ +typedef struct vdev_rebuild { + vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ + metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ + range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + kmutex_t vr_io_lock; /* inflight IO lock */ + kcondvar_t vr_io_cv; /* inflight IO cv */ + + /* In-core state and progress */ + uint64_t vr_scan_offset[TXG_SIZE]; + uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + uint64_t vr_bytes_inflight_max; /* maximum bytes inflight */ + uint64_t vr_bytes_inflight; /* current bytes inflight */ + + /* Per-rebuild pass statistics for calculating bandwidth */ + uint64_t vr_pass_start_time; + uint64_t vr_pass_bytes_scanned; + uint64_t vr_pass_bytes_issued; + + /* On-disk state updated by vdev_rebuild_zap_update_sync() */ + vdev_rebuild_phys_t vr_rebuild_phys; +} vdev_rebuild_t; + +boolean_t vdev_rebuild_active(vdev_t *); + +int vdev_rebuild_load(vdev_t *); +void vdev_rebuild(vdev_t *); +void vdev_rebuild_stop_wait(vdev_t *); +void vdev_rebuild_stop_all(spa_t *); +void vdev_rebuild_restart(spa_t *); +void 
vdev_rebuild_clear_sync(void *, dmu_tx_t *); +int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REBUILD_H */ diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index 3962237afd..e3bab0658d 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_REMOVAL_H @@ -81,13 +81,13 @@ extern void spa_vdev_condense_suspend(spa_t *); extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t); extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); -extern void svr_sync(spa_t *spa, dmu_tx_t *tx); +extern void svr_sync(spa_t *, dmu_tx_t *); extern void spa_vdev_remove_suspend(spa_t *); extern int spa_vdev_remove_cancel(spa_t *); -extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); +extern void spa_vdev_removal_destroy(spa_vdev_removal_t *); +extern uint64_t spa_remove_max_segment(spa_t *); extern int vdev_removal_max_span; -extern int zfs_remove_max_segment; #ifdef __cplusplus } diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h index 1e54017665..16f4be2a41 100644 --- a/include/sys/vdev_trim.h +++ b/include/sys/vdev_trim.h @@ -44,6 +44,8 @@ extern void vdev_autotrim(spa_t *spa); extern void vdev_autotrim_stop_all(spa_t *spa); extern void vdev_autotrim_stop_wait(vdev_t *vd); extern void vdev_autotrim_restart(spa_t *spa); +extern int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size); +extern void vdev_trim_l2arc(spa_t *spa); #ifdef __cplusplus } diff --git a/include/sys/zap.h b/include/sys/zap.h index ab13652d8c..b19b464387 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. 
All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ @@ -350,6 +350,7 @@ typedef struct zap_cursor { uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; + boolean_t zc_prefetch; } zap_cursor_t; typedef struct { @@ -375,7 +376,9 @@ typedef struct { * Initialize a zap cursor, pointing to the "first" attribute of the * zapobj. You must _fini the cursor when you are done with it. */ -void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); +void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); +void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, + uint64_t zapobj); void zap_cursor_fini(zap_cursor_t *zc); /* diff --git a/include/sys/zcp.h b/include/sys/zcp.h index b9c8ef0069..d7b1dfaa2e 100644 --- a/include/sys/zcp.h +++ b/include/sys/zcp.h @@ -52,6 +52,12 @@ typedef struct zcp_cleanup_handler { list_node_t zch_node; } zcp_cleanup_handler_t; +typedef struct zcp_alloc_arg { + boolean_t aa_must_succeed; + int64_t aa_alloc_remaining; + int64_t aa_alloc_limit; +} zcp_alloc_arg_t; + typedef struct zcp_run_info { dsl_pool_t *zri_pool; @@ -69,6 +75,7 @@ typedef struct zcp_run_info { * rather than the 'current' thread's. */ cred_t *zri_cred; + proc_t *zri_proc; /* * The tx in which this channel program is running. @@ -93,6 +100,11 @@ typedef struct zcp_run_info { */ boolean_t zri_timed_out; + /* + * Channel program was canceled by user + */ + boolean_t zri_canceled; + /* * Boolean indicating whether or not we are running in syncing * context. @@ -104,6 +116,34 @@ typedef struct zcp_run_info { * triggered in the event of a fatal error. */ list_t zri_cleanup_handlers; + + /* + * The Lua state context of our channel program. + */ + lua_State *zri_state; + + /* + * Lua memory allocator arguments. + */ + zcp_alloc_arg_t *zri_allocargs; + + /* + * Contains output values from zcp script or error string. 
+ */ + nvlist_t *zri_outnvl; + + /* + * The keys of this nvlist are datasets which may be zvols and may need + * to have device minor nodes created. This information is passed from + * syncing context (where the zvol is created) to open context (where we + * create the minor nodes). + */ + nvlist_t *zri_new_zvols; + + /* + * The errno number returned to caller of zcp_eval(). + */ + int zri_result; } zcp_run_info_t; zcp_run_info_t *zcp_run_info(lua_State *); @@ -118,7 +158,7 @@ typedef struct zcp_arg { /* * The name of this argument. For keyword arguments this is the name * functions will use to set the argument. For positional arguments - * the name has no programatic meaning, but will appear in error + * the name has no programmatic meaning, but will appear in error * messages and help output. */ const char *za_name; diff --git a/include/sys/zcp_set.h b/include/sys/zcp_set.h new file mode 100644 index 0000000000..b7428d6fc0 --- /dev/null +++ b/include/sys/zcp_set.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#ifndef _SYS_ZCP_SET_H +#define _SYS_ZCP_SET_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zcp_set_prop_arg { + lua_State *state; + const char *dsname; + const char *prop; + const char *val; +} zcp_set_prop_arg_t; + +int zcp_set_prop_check(void *arg, dmu_tx_t *tx); +void zcp_set_prop_sync(void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_SET_H */ diff --git a/include/sys/zfs_acl.h b/include/sys/zfs_acl.h index 6d3db50416..010686a912 100644 --- a/include/sys/zfs_acl.h +++ b/include/sys/zfs_acl.h @@ -62,7 +62,7 @@ struct znode_phys; /* * All ACEs have a common hdr. For * owner@, group@, and everyone@ this is all - * thats needed. + * that's needed. */ typedef struct zfs_ace_hdr { uint16_t z_type; @@ -220,7 +220,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); -void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); +int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); diff --git a/include/sys/zfs_bootenv.h b/include/sys/zfs_bootenv.h new file mode 100644 index 0000000000..7af0a57dd0 --- /dev/null +++ b/include/sys/zfs_bootenv.h @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2020 Toomas Soome + */ + +#ifndef _ZFS_BOOTENV_H +#define _ZFS_BOOTENV_H + +/* + * Define macros for label bootenv nvlist pair keys. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define BOOTENV_VERSION "version" + +#define BE_ILLUMOS_VENDOR "illumos" +#define BE_FREEBSD_VENDOR "freebsd" +#define BE_GRUB_VENDOR "grub" +#define BE_LINUX_VENDOR "linux" + +#include + +#define GRUB_ENVMAP BE_GRUB_VENDOR ":" "envmap" + +#define FREEBSD_BOOTONCE BE_FREEBSD_VENDOR ":" "bootonce" +#define FREEBSD_BOOTONCE_USED BE_FREEBSD_VENDOR ":" "bootonce-used" +#define FREEBSD_NVSTORE BE_FREEBSD_VENDOR ":" "nvstore" +#define ILLUMOS_BOOTONCE BE_ILLUMOS_VENDOR ":" "bootonce" +#define ILLUMOS_BOOTONCE_USED BE_ILLUMOS_VENDOR ":" "bootonce-used" +#define ILLUMOS_NVSTORE BE_ILLUMOS_VENDOR ":" "nvstore" + +#define OS_BOOTONCE BOOTENV_OS ":" "bootonce" +#define OS_BOOTONCE_USED BOOTENV_OS ":" "bootonce-used" +#define OS_NVSTORE BOOTENV_OS ":" "nvstore" + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_BOOTENV_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index e3fa2e61bd..b1df9f3f38 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -21,16 +21,26 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H -#ifdef __KERNEL__ +#ifdef __cplusplus +extern "C" { +#endif -#include +/* + * This code compiles in three different contexts. When __KERNEL__ is defined, + * the code uses "unix-like" kernel interfaces. 
When _STANDALONE is defined, the + * code is running in a reduced capacity environment of the boot loader which is + * generally a subset of both POSIX and kernel interfaces (with a few unique + * interfaces too). When neither are defined, it's in a userland POSIX or + * similar environment. + */ +#if defined(__KERNEL__) || defined(_STANDALONE) #include #include #include @@ -42,17 +52,14 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include -#include #include #include #include @@ -63,10 +70,10 @@ #include #include #include -#include -#include - -#else /* _KERNEL */ +#include +#include +#include +#else /* _KERNEL || _STANDALONE */ #define _SYS_MUTEX_H #define _SYS_RWLOCK_H @@ -88,7 +95,6 @@ #include #include #include -#include #include #include #include @@ -97,17 +103,15 @@ #include #include #include -#include #include #include #include -#include #include #include #include +#include #include #include -#include #include #include #include @@ -115,6 +119,9 @@ #include #include #include +#include + +#include /* * Stack @@ -122,6 +129,7 @@ #define noinline __attribute__((noinline)) #define likely(x) __builtin_expect((x), 1) +#define unlikely(x) __builtin_expect((x), 0) /* * Debugging @@ -150,8 +158,6 @@ extern void vpanic(const char *, va_list) __NORETURN; #define fm_panic panic -extern int aok; - /* * DTrace SDT probes have different signatures in userland than they do in * the kernel. 
If they're being used in kernel code, re-define them out of @@ -171,33 +177,39 @@ extern int aok; #ifdef DTRACE_PROBE #undef DTRACE_PROBE #endif /* DTRACE_PROBE */ -#define DTRACE_PROBE(a) \ - ZFS_PROBE0(#a) +#define DTRACE_PROBE(a) #ifdef DTRACE_PROBE1 #undef DTRACE_PROBE1 #endif /* DTRACE_PROBE1 */ -#define DTRACE_PROBE1(a, b, c) \ - ZFS_PROBE1(#a, (unsigned long)c) +#define DTRACE_PROBE1(a, b, c) #ifdef DTRACE_PROBE2 #undef DTRACE_PROBE2 #endif /* DTRACE_PROBE2 */ -#define DTRACE_PROBE2(a, b, c, d, e) \ - ZFS_PROBE2(#a, (unsigned long)c, (unsigned long)e) +#define DTRACE_PROBE2(a, b, c, d, e) #ifdef DTRACE_PROBE3 #undef DTRACE_PROBE3 #endif /* DTRACE_PROBE3 */ -#define DTRACE_PROBE3(a, b, c, d, e, f, g) \ - ZFS_PROBE3(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g) +#define DTRACE_PROBE3(a, b, c, d, e, f, g) #ifdef DTRACE_PROBE4 #undef DTRACE_PROBE4 #endif /* DTRACE_PROBE4 */ -#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) \ - ZFS_PROBE4(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g, \ - (unsigned long)i) +#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) + +/* + * Tunables. + */ +typedef struct zfs_kernel_param { + const char *name; /* unused stub */ +} zfs_kernel_param_t; + +#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) +#define ZFS_MODULE_PARAM_ARGS void +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ + getfunc, perm, desc) /* * Threads. 
@@ -211,6 +223,9 @@ typedef pthread_t kthread_t; #define kpreempt(x) yield() #define getcomm() "unknown" +#define thread_create_named(name, stk, stksize, func, arg, len, \ + pp, state, pri) \ + zk_thread_create(func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(func, arg, stksize, state) #define thread_exit() pthread_exit(NULL) @@ -257,6 +272,8 @@ extern void mutex_enter(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); +#define NESTED_SINGLE 1 +#define mutex_enter_nested(mp, class) mutex_enter(mp) /* * RW locks */ @@ -305,49 +322,42 @@ typedef pthread_cond_t kcondvar_t; extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); -extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); -extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, +extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); +extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); +extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag); extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); #define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at) +#define cv_timedwait_idle(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at) -#define cv_wait_sig(cv, mp) cv_wait(cv, mp) #define cv_wait_io(cv, mp) cv_wait(cv, mp) +#define cv_wait_idle(cv, mp) cv_wait(cv, mp) +#define cv_wait_io_sig(cv, mp) cv_wait_sig(cv, mp) #define cv_timedwait_sig_hires(cv, mp, t, r, f) \ cv_timedwait_hires(cv, mp, t, r, f) +#define cv_timedwait_idle_hires(cv, mp, t, r, f) \ + cv_timedwait_hires(cv, mp, t, r, f) /* * Thread-specific data */ #define tsd_get(k) pthread_getspecific(k) #define tsd_set(k, v) pthread_setspecific(k, v) -#define 
tsd_create(kp, d) pthread_key_create(kp, d) -#define tsd_destroy(kp) /* nothing */ - -/* - * Thread-specific data - */ -#define tsd_get(k) pthread_getspecific(k) -#define tsd_set(k, v) pthread_setspecific(k, v) -#define tsd_create(kp, d) pthread_key_create(kp, d) +#define tsd_create(kp, d) pthread_key_create((pthread_key_t *)kp, d) #define tsd_destroy(kp) /* nothing */ +#ifdef __FreeBSD__ +typedef off_t loff_t; +#endif /* * kstat creation, installation and deletion */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, ulong_t, uchar_t); -extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); -extern void kstat_waitq_enter(kstat_io_t *); -extern void kstat_waitq_exit(kstat_io_t *); -extern void kstat_runq_enter(kstat_io_t *); -extern void kstat_runq_exit(kstat_io_t *); -extern void kstat_waitq_to_runq(kstat_io_t *); -extern void kstat_runq_back_to_waitq(kstat_io_t *); extern void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), @@ -357,9 +367,6 @@ extern void kstat_set_raw_ops(kstat_t *ksp, * procfs list manipulation */ -struct seq_file { }; -void seq_printf(struct seq_file *m, const char *fmt, ...); - typedef struct procfs_list { void *pl_private; kmutex_t pl_lock; @@ -368,12 +375,17 @@ typedef struct procfs_list { size_t pl_node_offset; } procfs_list_t; +#ifndef __cplusplus +struct seq_file { }; +void seq_printf(struct seq_file *m, const char *fmt, ...); + typedef struct procfs_list_node { list_node_t pln_link; uint64_t pln_id; } procfs_list_node_t; void procfs_list_install(const char *module, + const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, @@ -384,6 +396,7 @@ void procfs_list_install(const char *module, void procfs_list_uninstall(procfs_list_t *procfs_list); void procfs_list_destroy(procfs_list_t *procfs_list); void 
procfs_list_add(procfs_list_t *procfs_list, void *p); +#endif /* * Kernel memory @@ -393,8 +406,7 @@ void procfs_list_add(procfs_list_t *procfs_list, void *p); #define KM_NOSLEEP UMEM_DEFAULT #define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */ #define KMC_NODEBUG UMC_NODEBUG -#define KMC_KMEM 0x0 -#define KMC_VMEM 0x0 +#define KMC_KVMEM 0x0 #define kmem_alloc(_s, _f) umem_alloc(_s, _f) #define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) #define kmem_free(_b, _s) umem_free(_b, _s) @@ -409,12 +421,9 @@ void procfs_list_add(procfs_list_t *procfs_list, void *p); #define kmem_debugging() 0 #define kmem_cache_reap_now(_c) umem_cache_reap_now(_c); #define kmem_cache_set_move(_c, _cb) /* nothing */ -#define vmem_qcache_reap(_v) /* nothing */ #define POINTER_INVALIDATE(_pp) /* nothing */ #define POINTER_IS_VALID(_p) 0 -extern vmem_t *zio_arena; - typedef umem_cache_t kmem_cache_t; typedef enum kmem_cbrc { @@ -496,6 +505,7 @@ extern void taskq_wait(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); +extern taskq_t *taskq_of_curthread(void); extern int taskq_cancel_id(taskq_t *, taskqid_t); extern void system_taskq_init(void); extern void system_taskq_fini(void); @@ -503,16 +513,6 @@ extern void system_taskq_fini(void); #define XVA_MAPSIZE 3 #define XVA_MAGIC 0x78766174 -/* - * vnodes - */ -typedef struct vnode { - uint64_t v_size; - int v_fd; - char *v_path; - int v_dump_fd; -} vnode_t; - extern char *vn_dumpdir; #define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ @@ -561,7 +561,6 @@ typedef struct vsecattr { size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ } vsecattr_t; -#define AT_TYPE 0x00001 #define AT_MODE 0x00002 #define AT_UID 0x00004 #define AT_GID 0x00008 @@ -581,42 +580,7 @@ typedef struct vsecattr { #define CRCREAT 0 #define F_FREESP 11 - -extern int fop_getattr(vnode_t *vp, vattr_t *vap); - -#define VOP_CLOSE(vp, f, c, o, 
cr, ct) vn_close(vp) -#define VOP_PUTPAGE(vp, of, sz, fl, cr, ct) 0 -#define VOP_GETATTR(vp, vap, fl, cr, ct) fop_getattr((vp), (vap)); - -#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd) - -#if defined(HAVE_FILE_FALLOCATE) && \ - defined(FALLOC_FL_PUNCH_HOLE) && \ - defined(FALLOC_FL_KEEP_SIZE) -#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) \ - fallocate((vp)->v_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, \ - (flck)->l_start, (flck)->l_len) -#else -#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) (0) -#endif - -#define VN_RELE(vp) vn_close(vp) - -extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp, - int x2, int x3); -extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp, - int x2, int x3, vnode_t *vp, int fd); -extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, - offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp); -extern void vn_close(vnode_t *vp); - -#define vn_remove(path, x1, x2) remove(path) -#define vn_rename(from, to, seg) rename((from), (to)) -#define vn_is_readonly(vp) B_FALSE - -extern vnode_t *rootdir; - -#include /* for FREAD, FWRITE, etc */ +#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ /* * Random stuff @@ -638,9 +602,9 @@ extern vnode_t *rootdir; extern void delay(clock_t ticks); #define SEC_TO_TICK(sec) ((sec) * hz) -#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz)) -#define USEC_TO_TICK(usec) ((usec) / (MICROSEC / hz)) -#define NSEC_TO_TICK(usec) ((usec) / (NANOSEC / hz)) +#define MSEC_TO_TICK(msec) (howmany((hrtime_t)(msec) * hz, MILLISEC)) +#define USEC_TO_TICK(usec) (howmany((hrtime_t)(usec) * hz, MICROSEC)) +#define NSEC_TO_TICK(nsec) (howmany((hrtime_t)(nsec) * hz, NANOSEC)) #define max_ncpus 64 #define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) @@ -653,6 +617,7 @@ extern void delay(clock_t ticks); #define defclsyspri 0 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) +#define CPU_SEQID_UNSTABLE CPU_SEQID 
#define kcred NULL #define CRED() NULL @@ -663,22 +628,37 @@ extern void delay(clock_t ticks); #define NN_NUMBUF_SZ (6) extern uint64_t physmem; -extern char *random_path; -extern char *urandom_path; +extern const char *random_path; +extern const char *urandom_path; extern int highbit64(uint64_t i); extern int lowbit64(uint64_t i); extern int random_get_bytes(uint8_t *ptr, size_t len); extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); -extern void kernel_init(int); +static __inline__ uint32_t +random_in_range(uint32_t range) +{ + uint32_t r; + + ASSERT(range != 0); + + if (range == 1) + return (0); + + (void) random_get_pseudo_bytes((uint8_t *)&r, sizeof (r)); + + return (r % range); +} + +extern void kernel_init(int mode); extern void kernel_fini(void); extern void random_init(void); extern void random_fini(void); struct spa; extern void show_pool_stats(struct spa *); -extern int set_global_var(char *arg); +extern int set_global_var(char const *arg); typedef struct callb_cpr { kmutex_t *cc_lockp; @@ -707,7 +687,8 @@ extern uint32_t zone_get_hostid(void *zonep); extern char *kmem_vasprintf(const char *fmt, va_list adx); extern char *kmem_asprintf(const char *fmt, ...); -#define strfree(str) kmem_free((str), strlen(str) + 1) +#define kmem_strfree(str) kmem_free((str), strlen(str) + 1) +#define kmem_strdup(s) strdup(s) /* * Hostname information @@ -747,16 +728,12 @@ typedef struct ace_object { #define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 #define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 -extern struct _buf *kobj_open_file(char *name); -extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, - unsigned off); -extern void kobj_close_file(struct _buf *file); -extern int kobj_get_filesize(struct _buf *file, uint64_t *size); extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); 
extern int secpolicy_zfs(const cred_t *cr); +extern int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc); extern zoneid_t getzoneid(void); /* SID stuff */ @@ -789,7 +766,17 @@ extern void spl_fstrans_unmark(fstrans_cookie_t); extern int __spl_pf_fstrans_check(void); extern int kmem_cache_reap_active(void); -#define ____cacheline_aligned -#endif /* _KERNEL */ +/* + * Kernel modules + */ +#define __init +#define __exit + +#endif /* _KERNEL || _STANDALONE */ + +#ifdef __cplusplus +}; +#endif + #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 7968a01cd4..7b103510dd 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -55,10 +55,13 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_SET_ERROR (1 << 9) #define ZFS_DEBUG_INDIRECT_REMAP (1 << 10) #define ZFS_DEBUG_TRIM (1 << 11) +#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) +#define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) +extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); extern void __dprintf(boolean_t dprint, const char *file, const char *func, - int line, const char *fmt, ...); + int line, const char *fmt, ...) __attribute__((format(printf, 5, 6))); /* * Some general principles for using zfs_dbgmsg(): diff --git a/include/sys/zfs_file.h b/include/sys/zfs_file.h new file mode 100644 index 0000000000..02cd1a6f04 --- /dev/null +++ b/include/sys/zfs_file.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZFS_FILE_H +#define _SYS_ZFS_FILE_H + +#include + +#ifndef _KERNEL +typedef struct zfs_file { + int f_fd; + int f_dump_fd; +} zfs_file_t; +#elif defined(__linux__) || defined(__FreeBSD__) +typedef struct file zfs_file_t; +#else +#error "unknown OS" +#endif + +typedef struct zfs_file_attr { + uint64_t zfa_size; /* file size */ + mode_t zfa_mode; /* file type */ +} zfs_file_attr_t; + +int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fp); +void zfs_file_close(zfs_file_t *fp); + +int zfs_file_write(zfs_file_t *fp, const void *buf, size_t len, ssize_t *resid); +int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t len, loff_t off, + ssize_t *resid); +int zfs_file_read(zfs_file_t *fp, void *buf, size_t len, ssize_t *resid); +int zfs_file_pread(zfs_file_t *fp, void *buf, size_t len, loff_t off, + ssize_t *resid); + +int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence); +int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr); +int zfs_file_fsync(zfs_file_t *fp, int flags); +int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len); +loff_t zfs_file_off(zfs_file_t *fp); +int zfs_file_unlink(const char *); + +zfs_file_t *zfs_file_get(int fd); +void zfs_file_put(zfs_file_t *fp); +void *zfs_file_private(zfs_file_t *fp); + +#endif /* _SYS_ZFS_FILE_H */ diff --git a/include/sys/zfs_fuid.h 
b/include/sys/zfs_fuid.h index 5c56f7fccc..b5b37db294 100644 --- a/include/sys/zfs_fuid.h +++ b/include/sys/zfs_fuid.h @@ -116,6 +116,8 @@ extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, char **retdomain, boolean_t addok); extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +extern int zfs_id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, size_t len, boolean_t addok); #endif char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index a883c33585..4fb15636ec 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. 
*/ @@ -67,7 +67,8 @@ extern "C" { * Property values for acltype */ #define ZFS_ACLTYPE_OFF 0 -#define ZFS_ACLTYPE_POSIXACL 1 +#define ZFS_ACLTYPE_POSIX 1 +#define ZFS_ACLTYPE_NFSV4 2 /* * Field manipulation macros for the drr_versioninfo field of the @@ -101,22 +102,39 @@ typedef enum drr_headertype { /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) -/* flag #21 is reserved for the redacted send/receive feature */ +#define DMU_BACKUP_FEATURE_REDACTED (1 << 21) #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) #define DMU_BACKUP_FEATURE_RAW (1 << 24) -/* flag #25 is reserved for the ZSTD compression feature */ +#define DMU_BACKUP_FEATURE_ZSTD (1 << 25) #define DMU_BACKUP_FEATURE_HOLDS (1 << 26) +/* + * The SWITCH_TO_LARGE_BLOCKS feature indicates that we can receive + * incremental LARGE_BLOCKS streams (those with WRITE records of >128KB) even + * if the previous send did not use LARGE_BLOCKS, and thus its large blocks + * were split into multiple 128KB WRITE records. (See + * flush_write_batch_impl() and receive_object()). Older software that does + * not support this flag may encounter a bug when switching to large blocks, + * which causes files to incorrectly be zeroed. + * + * This flag is currently not set on any send streams. In the future, we + * intend for incremental send streams of snapshots that have large blocks to + * use LARGE_BLOCKS by default, and these streams will also have the + * SWITCH_TO_LARGE_BLOCKS feature set. This ensures that streams from the + * default use of "zfs send" won't encounter the bug mentioned above. 
+ */ +#define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) /* * Mask of all supported backup features */ -#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ - DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ +#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ - DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS) + DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ + DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ + DMU_BACKUP_FEATURE_ZSTD) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -208,25 +226,29 @@ typedef enum dmu_send_resume_token_version { /* * zfs ioctl command structure */ + +/* Header is used in C++ so can't forward declare untagged struct */ +struct drr_begin { + uint64_t drr_magic; + uint64_t drr_versioninfo; /* was drr_version */ + uint64_t drr_creation_time; + dmu_objset_type_t drr_type; + uint32_t drr_flags; + uint64_t drr_toguid; + uint64_t drr_fromguid; + char drr_toname[MAXNAMELEN]; +}; + typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, - DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, + DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, DRR_REDACT, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { - struct drr_begin { - uint64_t drr_magic; - uint64_t drr_versioninfo; /* was drr_version */ - uint64_t drr_creation_time; - dmu_objset_type_t drr_type; - uint32_t drr_flags; - uint64_t drr_toguid; - uint64_t drr_fromguid; - char drr_toname[MAXNAMELEN]; - } drr_begin; + struct drr_begin drr_begin; struct drr_end { zio_cksum_t drr_checksum; uint64_t drr_toguid; @@ -337,9 +359,15 @@ typedef struct dmu_replay_record 
{ uint8_t drr_flags; uint8_t drr_pad[3]; } drr_object_range; + struct drr_redact { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + } drr_redact; /* - * Nore: drr_checksum is overlaid with all record types + * Note: drr_checksum is overlaid with all record types * except DRR_BEGIN. Therefore its (non-pad) members * must not overlap with members from the other structs. * We accomplish this by putting its members at the very @@ -486,6 +514,7 @@ typedef struct zfs_cmd { uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; + uint64_t zc_zoneid; } zfs_cmd_t; typedef struct zfs_useracct { @@ -496,7 +525,6 @@ typedef struct zfs_useracct { } zfs_useracct_t; #define ZFSDEV_MAX_MINOR (1 << 16) -#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 @@ -532,15 +560,16 @@ enum zfsdev_state_type { */ typedef struct zfsdev_state { struct zfsdev_state *zs_next; /* next zfsdev_state_t link */ - struct file *zs_file; /* associated file struct */ minor_t zs_minor; /* made up minor number */ void *zs_onexit; /* onexit data */ void *zs_zevent; /* zevent data */ } zfsdev_state_t; extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which); -extern int zfsdev_getminor(struct file *filp, minor_t *minorp); -extern minor_t zfsdev_minor_alloc(void); +extern int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp); + +extern uint_t zfs_fsyncer_key; +extern uint_t zfs_allow_log_key; #endif /* _KERNEL */ diff --git a/include/sys/zfs_ioctl_impl.h b/include/sys/zfs_ioctl_impl.h new file mode 100644 index 0000000000..3db67ae984 --- /dev/null +++ b/include/sys/zfs_ioctl_impl.h @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#ifndef _ZFS_IOCTL_IMPL_H_ +#define _ZFS_IOCTL_IMPL_H_ + +extern kmutex_t zfsdev_state_lock; +extern zfsdev_state_t *zfsdev_state_list; +extern unsigned long zfs_max_nvlist_src_size; + +typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); +typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); + +typedef enum { + POOL_CHECK_NONE = 1 << 0, + POOL_CHECK_SUSPENDED = 1 << 1, + POOL_CHECK_READONLY = 1 << 2, +} zfs_ioc_poolcheck_t; + +typedef enum { + NO_NAME, + POOL_NAME, + DATASET_NAME, + ENTITY_NAME +} zfs_ioc_namecheck_t; + +/* + * IOC Keys are used to document and validate user->kernel interface inputs. + * See zfs_keys_recv_new for an example declaration. Any key name that is not + * listed will be rejected as input. + * + * The keyname 'optional' is always allowed, and must be an nvlist if present. + * Arguments which older kernels can safely ignore can be placed under the + * "optional" key. 
+ * + * When adding new keys to an existing ioc for new functionality, consider: + * - adding an entry into zfs_sysfs.c zfs_features[] list + * - updating the libzfs_input_check.c test utility + * + * Note: in the ZK_WILDCARDLIST case, the name serves as documentation + * for the expected name (bookmark, snapshot, property, etc) but there + * is no validation in the preflight zfs_check_input_nvpairs() check. + */ +typedef enum { + ZK_OPTIONAL = 1 << 0, /* pair is optional */ + ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */ +} ioc_key_flag_t; + +typedef struct zfs_ioc_key { + const char *zkey_name; + data_type_t zkey_type; + ioc_key_flag_t zkey_flags; +} zfs_ioc_key_t; + +int zfs_secpolicy_config(zfs_cmd_t *, nvlist_t *, cred_t *); + +void zfs_ioctl_register_dataset_nolog(zfs_ioc_t, zfs_ioc_legacy_func_t *, + zfs_secpolicy_func_t *, zfs_ioc_poolcheck_t); + +void zfs_ioctl_register(const char *, zfs_ioc_t, zfs_ioc_func_t *, + zfs_secpolicy_func_t *, zfs_ioc_namecheck_t, zfs_ioc_poolcheck_t, + boolean_t, boolean_t, const zfs_ioc_key_t *, size_t); + +uint64_t zfs_max_nvlist_src_size_os(void); +void zfs_ioctl_update_mount_cache(const char *dsname); +void zfs_ioctl_init_os(void); + +boolean_t zfs_vfs_held(zfsvfs_t *); +int zfs_vfs_ref(zfsvfs_t **); +void zfs_vfs_rele(zfsvfs_t *); + +long zfsdev_ioctl_common(uint_t, zfs_cmd_t *, int); +int zfsdev_attach(void); +void zfsdev_detach(void); +void zfsdev_private_set_state(void *, zfsdev_state_t *); +zfsdev_state_t *zfsdev_private_get_state(void *); +int zfsdev_state_init(void *); +void zfsdev_state_destroy(void *); +int zfs_kmod_init(void); +void zfs_kmod_fini(void); + +#endif diff --git a/include/sys/zfs_onexit.h b/include/sys/zfs_onexit.h index 4982bd4d0a..fd3030e3ac 100644 --- a/include/sys/zfs_onexit.h +++ b/include/sys/zfs_onexit.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020 by Delphix. All rights reserved. 
*/ #ifndef _SYS_ZFS_ONEXIT_H @@ -50,14 +51,10 @@ extern void zfs_onexit_destroy(zfs_onexit_t *zo); #endif -extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); -extern void zfs_onexit_fd_rele(int fd); +extern zfs_file_t *zfs_onexit_fd_hold(int fd, minor_t *minorp); +extern void zfs_onexit_fd_rele(zfs_file_t *); extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle); -extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, - boolean_t fire); -extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, - void **data); #ifdef __cplusplus } diff --git a/include/sys/zfs_project.h b/include/sys/zfs_project.h index 52d5204a69..81a2389052 100644 --- a/include/sys/zfs_project.h +++ b/include/sys/zfs_project.h @@ -32,7 +32,7 @@ #endif #endif -#include +#include #ifdef FS_PROJINHERIT_FL #define ZFS_PROJINHERIT_FL FS_PROJINHERIT_FL diff --git a/include/sys/zfs_quota.h b/include/sys/zfs_quota.h new file mode 100644 index 0000000000..b215b8dd00 --- /dev/null +++ b/include/sys/zfs_quota.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZFS_QUOTA_H +#define _SYS_ZFS_QUOTA_H + +#include +#include + +struct zfsvfs; +struct zfs_file_info_t; + +extern int zpl_get_file_info(dmu_object_type_t, + const void *, struct zfs_file_info *); + +extern int zfs_userspace_one(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t *); +extern int zfs_userspace_many(struct zfsvfs *, zfs_userquota_prop_t, + uint64_t *, void *, uint64_t *); +extern int zfs_set_userquota(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t); + +extern boolean_t zfs_id_overobjquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overblockquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overquota(struct zfsvfs *, uint64_t, uint64_t); + +#endif diff --git a/include/sys/zfs_racct.h b/include/sys/zfs_racct.h new file mode 100644 index 0000000000..cfcdd336ea --- /dev/null +++ b/include/sys/zfs_racct.h @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Portions Copyright 2021 iXsystems, Inc. + */ + +#ifndef _SYS_ZFS_RACCT_H +#define _SYS_ZFS_RACCT_H + +#include + +/* + * Platform-dependent resource accounting hooks + */ +void zfs_racct_read(uint64_t size, uint64_t iops); +void zfs_racct_write(uint64_t size, uint64_t iops); + +#endif /* _SYS_ZFS_RACCT_H */ diff --git a/include/sys/refcount.h b/include/sys/zfs_refcount.h similarity index 74% rename from include/sys/refcount.h rename to include/sys/zfs_refcount.h index e982faeba0..2f59ebb32b 100644 --- a/include/sys/refcount.h +++ b/include/sys/zfs_refcount.h @@ -23,8 +23,8 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ -#ifndef _SYS_REFCOUNT_H -#define _SYS_REFCOUNT_H +#ifndef _SYS_ZFS_REFCOUNT_H +#define _SYS_ZFS_REFCOUNT_H #include #include @@ -44,7 +44,7 @@ extern "C" { #ifdef ZFS_DEBUG typedef struct reference { list_node_t ref_link; - void *ref_holder; + const void *ref_holder; uint64_t ref_number; uint8_t *ref_removed; } reference_t; @@ -70,16 +70,25 @@ void zfs_refcount_destroy(zfs_refcount_t *); void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t); int zfs_refcount_is_zero(zfs_refcount_t *); int64_t zfs_refcount_count(zfs_refcount_t *); -int64_t zfs_refcount_add(zfs_refcount_t *, void *); -int64_t zfs_refcount_remove(zfs_refcount_t *, void *); -int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, void *); -int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, void *); +int64_t zfs_refcount_add(zfs_refcount_t *, const void *); +int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); +/* + * Note that (add|remove)_many add/remove one reference with "number" N, + * _not_ make N references with "number" 1, which is what vanilla + * zfs_refcount_(add|remove) would do if 
called N times. + * + * Attempting to remove a reference with number N when none exists is a + * panic on debug kernels with reference_tracking enabled. + */ +int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); +int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); -void zfs_refcount_transfer_ownership(zfs_refcount_t *, void *, void *); +void zfs_refcount_transfer_ownership(zfs_refcount_t *, const void *, + const void *); void zfs_refcount_transfer_ownership_many(zfs_refcount_t *, uint64_t, - void *, void *); -boolean_t zfs_refcount_held(zfs_refcount_t *, void *); -boolean_t zfs_refcount_not_held(zfs_refcount_t *, void *); + const void *, const void *); +boolean_t zfs_refcount_held(zfs_refcount_t *, const void *); +boolean_t zfs_refcount_not_held(zfs_refcount_t *, const void *); void zfs_refcount_init(void); void zfs_refcount_fini(void); @@ -95,8 +104,8 @@ typedef struct refcount { #define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0) #define zfs_refcount_destroy(rc) ((rc)->rc_count = 0) #define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0) -#define zfs_refcount_is_zero(rc) ((rc)->rc_count == 0) -#define zfs_refcount_count(rc) ((rc)->rc_count) +#define zfs_refcount_is_zero(rc) (zfs_refcount_count(rc) == 0) +#define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count) #define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) #define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) #define zfs_refcount_add_many(rc, number, holder) \ @@ -104,13 +113,13 @@ typedef struct refcount { #define zfs_refcount_remove_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, -number) #define zfs_refcount_transfer(dst, src) { \ - uint64_t __tmp = (src)->rc_count; \ + uint64_t __tmp = zfs_refcount_count(src); \ atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } #define 
zfs_refcount_transfer_ownership(rc, ch, nh) ((void)0) #define zfs_refcount_transfer_ownership_many(rc, nr, ch, nh) ((void)0) -#define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0) +#define zfs_refcount_held(rc, holder) (zfs_refcount_count(rc) > 0) #define zfs_refcount_not_held(rc, holder) (B_TRUE) #define zfs_refcount_init() diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h index 05b080843d..2302abb373 100644 --- a/include/sys/zfs_rlock.h +++ b/include/sys/zfs_rlock.h @@ -39,40 +39,42 @@ typedef enum { RL_READER, RL_WRITER, RL_APPEND -} rangelock_type_t; +} zfs_rangelock_type_t; -struct locked_range; +struct zfs_locked_range; -typedef void (rangelock_cb_t)(struct locked_range *, void *); +typedef void (zfs_rangelock_cb_t)(struct zfs_locked_range *, void *); -typedef struct rangelock { +typedef struct zfs_rangelock { avl_tree_t rl_tree; /* contains locked_range_t */ kmutex_t rl_lock; - rangelock_cb_t *rl_cb; + zfs_rangelock_cb_t *rl_cb; void *rl_arg; -} rangelock_t; +} zfs_rangelock_t; -typedef struct locked_range { - rangelock_t *lr_rangelock; /* rangelock that this lock applies to */ +typedef struct zfs_locked_range { + zfs_rangelock_t *lr_rangelock; /* rangelock that this lock applies to */ avl_node_t lr_node; /* avl node link */ uint64_t lr_offset; /* file range offset */ uint64_t lr_length; /* file range length */ uint_t lr_count; /* range reference count in tree */ - rangelock_type_t lr_type; /* range type */ + zfs_rangelock_type_t lr_type; /* range type */ kcondvar_t lr_write_cv; /* cv for waiting writers */ kcondvar_t lr_read_cv; /* cv for waiting readers */ uint8_t lr_proxy; /* acting for original range */ uint8_t lr_write_wanted; /* writer wants to lock this range */ uint8_t lr_read_wanted; /* reader wants to lock this range */ -} locked_range_t; +} zfs_locked_range_t; -void rangelock_init(rangelock_t *, rangelock_cb_t *, void *); -void rangelock_fini(rangelock_t *); +void zfs_rangelock_init(zfs_rangelock_t *, zfs_rangelock_cb_t *, void 
*); +void zfs_rangelock_fini(zfs_rangelock_t *); -locked_range_t *rangelock_enter(rangelock_t *, - uint64_t, uint64_t, rangelock_type_t); -void rangelock_exit(locked_range_t *); -void rangelock_reduce(locked_range_t *, uint64_t, uint64_t); +zfs_locked_range_t *zfs_rangelock_enter(zfs_rangelock_t *, + uint64_t, uint64_t, zfs_rangelock_type_t); +zfs_locked_range_t *zfs_rangelock_tryenter(zfs_rangelock_t *, + uint64_t, uint64_t, zfs_rangelock_type_t); +void zfs_rangelock_exit(zfs_locked_range_t *); +void zfs_rangelock_reduce(zfs_locked_range_t *, uint64_t, uint64_t); #ifdef __cplusplus } diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h index 4e6d28638e..1ca7ced331 100644 --- a/include/sys/zfs_sa.h +++ b/include/sys/zfs_sa.h @@ -134,7 +134,7 @@ typedef struct znode_phys { #define DXATTR_MAX_ENTRY_SIZE (32768) #define DXATTR_MAX_SA_SIZE (SPA_OLD_MAXBLOCKSIZE >> 1) -int zfs_sa_readlink(struct znode *, uio_t *); +int zfs_sa_readlink(struct znode *, zfs_uio_t *); void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); diff --git a/include/sys/zfs_sysfs.h b/include/sys/zfs_sysfs.h index 925d7ad542..14ba61fc4b 100644 --- a/include/sys/zfs_sysfs.h +++ b/include/sys/zfs_sysfs.h @@ -23,7 +23,7 @@ */ #ifndef _SYS_ZFS_SYSFS_H -#define _SYS_ZFS_SYSFS_H +#define _SYS_ZFS_SYSFS_H extern __attribute__((visibility("default"))) #ifdef _KERNEL @@ -35,7 +35,7 @@ void zfs_sysfs_fini(void); #define zfs_sysfs_init() #define zfs_sysfs_fini() -boolean_t zfs_mod_supported(const char *, const char *); +_SYS_ZFS_SYSFS_H boolean_t zfs_mod_supported(const char *, const char *); #endif #define ZFS_SYSFS_POOL_PROPERTIES "properties.pool" diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h index 42f534f5db..a438c86f0a 100644 --- a/include/sys/zfs_vfsops.h +++ b/include/sys/zfs_vfsops.h @@ -18,215 +18,18 @@ * * CDDL HEADER END */ + /* - * 
Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Portions Copyright 2020 iXsystems, Inc. */ -#ifndef _SYS_FS_ZFS_VFSOPS_H -#define _SYS_FS_ZFS_VFSOPS_H +#ifndef _SYS_ZFS_VFSOPS_H +#define _SYS_ZFS_VFSOPS_H -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { +#ifdef _KERNEL +#include #endif -typedef struct zfsvfs zfsvfs_t; -struct znode; +extern void zfsvfs_update_fromname(const char *, const char *); -/* - * This structure emulates the vfs_t from other platforms. It's purpose - * is to faciliate the handling of mount options and minimize structural - * differences between the platforms. - */ -typedef struct vfs { - struct zfsvfs *vfs_data; - char *vfs_mntpoint; /* Primary mount point */ - uint64_t vfs_xattr; - boolean_t vfs_readonly; - boolean_t vfs_do_readonly; - boolean_t vfs_setuid; - boolean_t vfs_do_setuid; - boolean_t vfs_exec; - boolean_t vfs_do_exec; - boolean_t vfs_devices; - boolean_t vfs_do_devices; - boolean_t vfs_do_xattr; - boolean_t vfs_atime; - boolean_t vfs_do_atime; - boolean_t vfs_relatime; - boolean_t vfs_do_relatime; - boolean_t vfs_nbmand; - boolean_t vfs_do_nbmand; -} vfs_t; - -typedef struct zfs_mnt { - const char *mnt_osname; /* Objset name */ - char *mnt_data; /* Raw mount options */ -} zfs_mnt_t; - -struct zfsvfs { - vfs_t *z_vfs; /* generic fs struct */ - struct super_block *z_sb; /* generic super_block */ - struct zfsvfs *z_parent; /* parent fs */ - objset_t *z_os; /* objset reference */ - uint64_t z_flags; /* super_block flags */ - uint64_t z_root; /* id of root znode */ - uint64_t z_unlinkedobj; /* id of unlinked zapobj */ - uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_fuid_obj; /* fuid table object number */ - uint64_t z_fuid_size; /* fuid table size */ - avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ - avl_tree_t 
z_fuid_domain; /* fuid tree keyed by domain */ - krwlock_t z_fuid_lock; /* fuid lock */ - boolean_t z_fuid_loaded; /* fuid tables are loaded */ - boolean_t z_fuid_dirty; /* need to sync fuid table ? */ - struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ - zilog_t *z_log; /* intent log pointer */ - uint_t z_acl_inherit; /* acl inheritance behavior */ - uint_t z_acl_type; /* type of ACL usable on this FS */ - zfs_case_t z_case; /* case-sense */ - boolean_t z_utf8; /* utf8-only */ - int z_norm; /* normalization flags */ - boolean_t z_relatime; /* enable relatime mount option */ - boolean_t z_unmounted; /* unmounted */ - rrmlock_t z_teardown_lock; - krwlock_t z_teardown_inactive_lock; - list_t z_all_znodes; /* all znodes in the fs */ - uint64_t z_nr_znodes; /* number of znodes in the fs */ - unsigned long z_rollback_time; /* last online rollback time */ - unsigned long z_snap_defer_time; /* last snapshot unmount deferal */ - kmutex_t z_znodes_lock; /* lock for z_all_znodes */ - arc_prune_t *z_arc_prune; /* called by ARC to prune caches */ - struct inode *z_ctldir; /* .zfs directory inode */ - boolean_t z_show_ctldir; /* expose .zfs in the root dir */ - boolean_t z_issnap; /* true if this is a snapshot */ - boolean_t z_vscan; /* virus scan on/off */ - boolean_t z_use_fuids; /* version allows fuids */ - boolean_t z_replay; /* set during ZIL replay */ - boolean_t z_use_sa; /* version allow system attributes */ - boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ - boolean_t z_draining; /* is true when drain is active */ - boolean_t z_drain_cancel; /* signal the unlinked drain to stop */ - uint64_t z_version; /* ZPL version */ - uint64_t z_shares_dir; /* hidden shares dir */ - dataset_kstats_t z_kstat; /* fs kstats */ - kmutex_t z_lock; - uint64_t z_userquota_obj; - uint64_t z_groupquota_obj; - uint64_t z_userobjquota_obj; - uint64_t z_groupobjquota_obj; - uint64_t z_projectquota_obj; - uint64_t z_projectobjquota_obj; - uint64_t z_replay_eof; /* New 
end of file - replay only */ - sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ - uint64_t z_hold_size; /* znode hold array size */ - avl_tree_t *z_hold_trees; /* znode hold trees */ - kmutex_t *z_hold_locks; /* znode hold locks */ - taskqid_t z_drain_task; /* task id for the unlink drain task */ -}; - -#define ZSB_XATTR 0x0001 /* Enable user xattrs */ - -/* - * Allow a maximum number of links. While ZFS does not internally limit - * this the inode->i_nlink member is defined as an unsigned int. To be - * safe we use 2^31-1 as the limit. - */ -#define ZFS_LINK_MAX ((1U << 31) - 1U) - -/* - * Normal filesystems (those not under .zfs/snapshot) have a total - * file ID size limited to 12 bytes (including the length field) due to - * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical - * reasons, this same limit is being imposed by the Solaris NFSv3 implementation - * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It - * is not possible to expand beyond 12 bytes without abandoning support - * of NFSv2. - * - * For normal filesystems, we partition up the available space as follows: - * 2 bytes fid length (required) - * 6 bytes object number (48 bits) - * 4 bytes generation number (32 bits) - * - * We reserve only 48 bits for the object number, as this is the limit - * currently defined and imposed by the DMU. - */ -typedef struct zfid_short { - uint16_t zf_len; - uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ -} zfid_short_t; - -/* - * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes - * (including the length field). This makes files under .zfs/snapshot - * accessible by NFSv3 and NFSv4, but not NFSv2. 
- * - * For files under .zfs/snapshot, we partition up the available space - * as follows: - * 2 bytes fid length (required) - * 6 bytes object number (48 bits) - * 4 bytes generation number (32 bits) - * 6 bytes objset id (48 bits) - * 4 bytes currently just zero (32 bits) - * - * We reserve only 48 bits for the object number and objset id, as these are - * the limits currently defined and imposed by the DMU. - */ -typedef struct zfid_long { - zfid_short_t z_fid; - uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ -} zfid_long_t; - -#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) -#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) - -extern uint_t zfs_fsyncer_key; - -extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); -extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); -extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valuep); -extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); -extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota); -extern boolean_t zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); -extern boolean_t zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); -extern boolean_t zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); -extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); -extern int zfsvfs_create(const char *name, boolean_t readony, zfsvfs_t **zfvp); -extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); -extern void zfsvfs_free(zfsvfs_t *zfsvfs); -extern int zfs_check_global_label(const char *dsname, const char *hexsl); - -extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); -extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, 
int silent); -extern void zfs_preumount(struct super_block *sb); -extern int zfs_umount(struct super_block *sb); -extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm); -extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp); -extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp); -extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, - int *objects); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_VFSOPS_H */ +#endif /* _SYS_ZFS_VFSOPS_H */ diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 767cba10da..18259f0dc9 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -24,66 +24,32 @@ #ifndef _SYS_FS_ZFS_VNOPS_H #define _SYS_FS_ZFS_VNOPS_H +#include -#include -#include -#include -#include -#include -#include -#include +extern int zfs_fsync(znode_t *, int, cred_t *); +extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); +extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); +extern int zfs_holey(znode_t *, ulong_t, loff_t *); +extern int zfs_access(znode_t *, int, int, cred_t *); + +extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); +extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); + +extern int mappedread(znode_t *, int, zfs_uio_t *); +extern int mappedread_sf(znode_t *, int, zfs_uio_t *); +extern void update_pages(znode_t *, int64_t, int, objset_t *); + +/* + * Platform code that asynchronously drops zp's inode / vnode_t. + * + * Asynchronous dropping ensures that the caller will never drop the + * last reference on an inode / vnode_t in the current context. + * Doing so while holding open a tx could result in a deadlock if + * the platform calls into filesystem again in the implementation + * of inode / vnode_t dropping (e.g. call from iput_final()). 
+ */ +extern void zfs_zrele_async(znode_t *zp); + +extern zil_get_data_t zfs_get_data; -#ifdef __cplusplus -extern "C" { #endif - -extern int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr); -extern int zfs_close(struct inode *ip, int flag, cred_t *cr); -extern int zfs_holey(struct inode *ip, int cmd, loff_t *off); -extern int zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); -extern int zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); -extern int zfs_access(struct inode *ip, int mode, int flag, cred_t *cr); -extern int zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, - int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); -extern int zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); -extern int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); -extern int zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags); -extern int zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - struct inode **ipp, cred_t *cr, int flags, vsecattr_t *vsecp); -extern int zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, - cred_t *cr, int flags); -extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); -extern int zfs_fsync(struct inode *ip, int syncflag, cred_t *cr); -extern int zfs_getattr(struct inode *ip, vattr_t *vap, int flag, cred_t *cr); -extern int zfs_getattr_fast(struct inode *ip, struct kstat *sp); -extern int zfs_setattr(struct inode *ip, vattr_t *vap, int flag, cred_t *cr); -extern int zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, - char *tnm, cred_t *cr, int flags); -extern int zfs_symlink(struct inode *dip, char *name, vattr_t *vap, - char *link, struct inode **ipp, cred_t *cr, int flags); -extern int zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr); -extern int zfs_link(struct 
inode *tdip, struct inode *sip, - char *name, cred_t *cr, int flags); -extern void zfs_inactive(struct inode *ip); -extern int zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, - offset_t offset, cred_t *cr); -extern int zfs_fid(struct inode *ip, fid_t *fidp); -extern int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, - cred_t *cr); -extern int zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, - cred_t *cr); -extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); -extern int zfs_putpage(struct inode *ip, struct page *pp, - struct writeback_control *wbc); -extern int zfs_dirty_inode(struct inode *ip, int flags); -extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, - size_t len, unsigned long vm_flags); -extern void zfs_iput_async(struct inode *ip); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_VNOPS_H */ diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index d4a3ea7693..1bf25a77d3 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -27,18 +27,6 @@ #ifndef _SYS_FS_ZFS_ZNODE_H #define _SYS_FS_ZFS_ZNODE_H -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif #include #include #include @@ -169,12 +157,16 @@ extern "C" { #define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) +extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); + +#ifdef _KERNEL +#include + /* * Directory entry locks control access to directory entries. * They are used to protect creates, deletes, and renames. * Each directory znode has a mutex and a list of locked names. 
*/ -#ifdef _KERNEL typedef struct zfs_dirlock { char *dl_name; /* directory entry being locked */ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ @@ -191,11 +183,15 @@ typedef struct znode { krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ - rangelock_t z_rangelock; /* file range locks */ - uint8_t z_unlinked; /* file has been unlinked */ - uint8_t z_atime_dirty; /* atime needs to be synced */ - uint8_t z_zn_prefetch; /* Prefetch znodes? */ - uint8_t z_moved; /* Has this znode been moved? */ + zfs_rangelock_t z_rangelock; /* file range locks */ + boolean_t z_unlinked; /* file has been unlinked */ + boolean_t z_atime_dirty; /* atime needs to be synced */ + boolean_t z_zn_prefetch; /* Prefetch znodes? */ + boolean_t z_is_sa; /* are we native sa? */ + boolean_t z_is_mapped; /* are we mmap'ed */ + boolean_t z_is_ctldir; /* are we .zfs entry */ + boolean_t z_is_stale; /* are we stale due to rollback? */ + boolean_t z_suspended; /* extra ref from a suspend? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ @@ -212,11 +208,12 @@ typedef struct znode { uint64_t z_projid; /* project ID */ list_node_t z_link_node; /* all znodes in fs link */ sa_handle_t *z_sa_hdl; /* handle to sa data */ - boolean_t z_is_sa; /* are we native sa? */ - boolean_t z_is_mapped; /* are we mmap'ed */ - boolean_t z_is_ctldir; /* are we .zfs entry */ - boolean_t z_is_stale; /* are we stale due to rollback? */ - struct inode z_inode; /* generic vfs inode */ + + /* + * Platform specific field, defined by each platform and only + * accessible from platform specific code. + */ + ZNODE_OS_FIELDS; } znode_t; typedef struct znode_hold { @@ -233,102 +230,6 @@ zfs_inherit_projid(znode_t *dzp) ZFS_DEFAULT_PROJID); } -/* - * Range locking rules - * -------------------- - * 1. 
When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole - * file range needs to be locked as RL_WRITER. Only then can the pages be - * freed etc and zp_size reset. zp_size must be set within range lock. - * 2. For writes and punching holes (zfs_write & zfs_space) just the range - * being written or freed needs to be locked as RL_WRITER. - * Multiple writes at the end of the file must coordinate zp_size updates - * to ensure data isn't lost. A compare and swap loop is currently used - * to ensure the file size is at least the offset last written. - * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being - * read needs to be locked as RL_READER. A check against zp_size can then - * be made for reading beyond end of file. - */ - -/* - * Convert between znode pointers and inode pointers - */ -#define ZTOI(znode) (&((znode)->z_inode)) -#define ITOZ(inode) (container_of((inode), znode_t, z_inode)) -#define ZTOZSB(znode) ((zfsvfs_t *)(ZTOI(znode)->i_sb->s_fs_info)) -#define ITOZSB(inode) ((zfsvfs_t *)((inode)->i_sb->s_fs_info)) - -#define S_ISDEV(mode) (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) - -/* Called on entry to each ZFS inode and vfs operation. */ -#define ZFS_ENTER_ERROR(zfsvfs, error) \ -do { \ - rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ - if ((zfsvfs)->z_unmounted) { \ - ZFS_EXIT(zfsvfs); \ - return (error); \ - } \ -} while (0) -#define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) -#define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, -EIO) - -/* Must be called before exiting the operation. */ -#define ZFS_EXIT(zfsvfs) \ -do { \ - rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ -} while (0) -#define ZPL_EXIT(zfsvfs) ZFS_EXIT(zfsvfs) - -/* Verifies the znode is valid. 
*/ -#define ZFS_VERIFY_ZP_ERROR(zp, error) \ -do { \ - if ((zp)->z_sa_hdl == NULL) { \ - ZFS_EXIT(ZTOZSB(zp)); \ - return (error); \ - } \ -} while (0) -#define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) -#define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, -EIO) - -/* - * Macros for dealing with dmu_buf_hold - */ -#define ZFS_OBJ_MTX_SZ 64 -#define ZFS_OBJ_MTX_MAX (1024 * 1024) -#define ZFS_OBJ_HASH(zfsvfs, obj) ((obj) & ((zfsvfs->z_hold_size) - 1)) - -extern unsigned int zfs_object_mutex_size; - -/* - * Encode ZFS stored time values from a struct timespec / struct timespec64. - */ -#define ZFS_TIME_ENCODE(tp, stmp) \ -do { \ - (stmp)[0] = (uint64_t)(tp)->tv_sec; \ - (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ -} while (0) - -#if defined(HAVE_INODE_TIMESPEC64_TIMES) -/* - * Decode ZFS stored time values to a struct timespec64 - * 4.18 and newer kernels. - */ -#define ZFS_TIME_DECODE(tp, stmp) \ -do { \ - (tp)->tv_sec = (time64_t)(stmp)[0]; \ - (tp)->tv_nsec = (long)(stmp)[1]; \ -} while (0) -#else -/* - * Decode ZFS stored time values to a struct timespec - * 4.17 and older kernels. 
- */ -#define ZFS_TIME_DECODE(tp, stmp) \ -do { \ - (tp)->tv_sec = (time_t)(stmp)[0]; \ - (tp)->tv_nsec = (long)(stmp)[1]; \ -} while (0) -#endif /* HAVE_INODE_TIMESPEC64_TIMES */ - /* * Timestamp defines */ @@ -353,32 +254,27 @@ extern void zfs_zinactive(znode_t *); extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_remove_op_tables(void); extern int zfs_create_op_tables(void); -extern int zfs_sync(struct super_block *, int, cred_t *); extern dev_t zfs_cmpldev(uint64_t); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); extern void zfs_znode_dmu_fini(znode_t *); -extern int zfs_inode_alloc(struct super_block *, struct inode **ip); -extern void zfs_inode_destroy(struct inode *); -extern void zfs_inode_update(znode_t *); -extern void zfs_mark_inode_dirty(struct inode *); -extern boolean_t zfs_relatime_need_update(const struct inode *); extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, - vattr_t *vap); + znode_t *dzp, znode_t *zp, const char *name, vsecattr_t *, + zfs_fuid_info_t *, vattr_t *vap); extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, vattr_t *vap); extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid); + znode_t *dzp, const char *name, uint64_t foid, boolean_t unlinked); #define ZFS_NO_OBJECT 0 /* no object id */ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name); + znode_t *dzp, znode_t *zp, const char *name); extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, char *link); + znode_t *dzp, znode_t *zp, const char *name, const char *link); extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, 
uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, int ioflag, zil_callback_t callback, void *callback_data); @@ -391,19 +287,9 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); -#if defined(HAVE_UIO_RW) -extern caddr_t zfs_map_page(page_t *, enum seg_rw); -extern void zfs_unmap_page(page_t *, caddr_t); -#endif /* HAVE_UIO_RW */ - -extern zil_get_data_t zfs_get_data; -extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; -extern int zfsfstype; - -#endif /* _KERNEL */ - -extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); +extern void zfs_znode_update_vfs(struct znode *); +#endif #ifdef __cplusplus } #endif diff --git a/include/sys/zil.h b/include/sys/zil.h index fb7b38a066..cefbccb32f 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -80,7 +80,7 @@ typedef struct zil_header { * Log blocks are chained together. Originally they were chained at the * end of the block. For performance reasons the chain was moved to the * beginning of the block which allows writes for only the data being used. - * The older position is supported for backwards compatability. + * The older position is supported for backwards compatibility. * * The zio_eck_t contains a zec_cksum which for the intent log is * the sequence number of this log block. A seq of 0 is invalid. 
@@ -373,7 +373,7 @@ typedef struct { * - the write occupies only one block * WR_COPIED: * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), then we allocate a larger + * transaction (O_SYNC or O_DSYNC), then we allocate a larger * log record here for the data and copy the data in. * WR_NEED_COPY: * Otherwise we don't allocate a buffer, and *if* we need to @@ -399,6 +399,7 @@ typedef struct itx { void *itx_callback_data; /* User data for the callback */ size_t itx_size; /* allocated itx structure size */ uint64_t itx_oid; /* object id */ + uint64_t itx_gen; /* gen number for zfs_get_data */ lr_t itx_lr; /* common part of log record */ /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; @@ -421,7 +422,7 @@ typedef struct zil_stats { /* * Number of transactions (reads, writes, renames, etc.) - * that have been commited. + * that have been committed. */ kstat_named_t zil_itx_count; @@ -462,12 +463,12 @@ extern zil_stats_t zil_stats; #define ZIL_STAT_BUMP(stat) \ ZIL_STAT_INCR(stat, 1); -typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, +typedef int zil_parse_blk_func_t(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t txg); -typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, +typedef int zil_parse_lr_func_t(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap); -typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, +typedef int zil_get_data_t(void *arg, uint64_t arg2, lr_write_t *lr, char *dbuf, struct lwb *lwb, zio_t *zio); extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, @@ -493,8 +494,10 @@ extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); extern void zil_itx_destroy(itx_t *itx); extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); +extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); extern void 
zil_commit(zilog_t *zilog, uint64_t oid); extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); +extern void zil_remove_async(zilog_t *zilog, uint64_t oid); extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, @@ -515,6 +518,9 @@ extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); +extern uint64_t zil_max_copied_data(zilog_t *zilog); +extern uint64_t zil_max_log_data(zilog_t *zilog); + extern int zil_replay_disable; #ifdef __cplusplus diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 174fef3341..d2f4018653 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -209,6 +209,13 @@ struct zilog { uint_t zl_prev_rotor; /* rotor for zl_prev[] */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ + /* + * Max block size for this ZIL. Note that this can not be changed + * while the ZIL is in use because consumers (ZPL/zvol) need to take + * this into account when deciding between WR_COPIED and WR_NEED_COPY + * (see zil_max_copied_data()). + */ + uint64_t zl_max_block_size; }; typedef struct zil_bp_node { @@ -216,26 +223,6 @@ typedef struct zil_bp_node { avl_node_t zn_node; } zil_bp_node_t; -/* - * Maximum amount of write data that can be put into single log block. - */ -#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \ - sizeof (lr_write_t)) - -/* - * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). 
- */ -#define ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8) - -/* - * Maximum amount of write data for WR_COPIED. Fall back to WR_NEED_COPY - * as more space efficient if we can't fit at least two log records into - * maximum sized log block. - */ -#define ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \ - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t)) - #ifdef __cplusplus } #endif diff --git a/include/sys/zio.h b/include/sys/zio.h index e69bf92080..b3589e9b03 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -22,10 +22,13 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019-2020, Michael Niewöhner */ #ifndef _ZIO_H @@ -85,7 +88,9 @@ enum zio_checksum { ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, +#if !defined(__FreeBSD__) ZIO_CHECKSUM_EDONR, +#endif ZIO_CHECKSUM_FUNCTIONS }; @@ -99,27 +104,9 @@ enum zio_checksum { #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL -#define ZIO_CHECKSUM_VERIFY (1 << 8) +#define ZIO_CHECKSUM_VERIFY (1U << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 -#define ZIO_DEDUPDITTO_MIN 100 - -/* supported encryption algorithms */ -enum zio_encrypt { - ZIO_CRYPT_INHERIT = 0, - ZIO_CRYPT_ON, - ZIO_CRYPT_OFF, - ZIO_CRYPT_AES_128_CCM, - ZIO_CRYPT_AES_192_CCM, - ZIO_CRYPT_AES_256_CCM, - ZIO_CRYPT_AES_128_GCM, - ZIO_CRYPT_AES_192_GCM, - ZIO_CRYPT_AES_256_GCM, - ZIO_CRYPT_FUNCTIONS -}; - -#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_CCM -#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 @@ -155,9 +142,18 @@ enum zio_encrypt { (compress) == ZIO_COMPRESS_GZIP_8 || \ (compress) == ZIO_COMPRESS_GZIP_9 || \ (compress) == ZIO_COMPRESS_ZLE || \ + (compress) == ZIO_COMPRESS_ZSTD || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) + +#define ZIO_COMPRESS_ALGO(x) (x & SPA_COMPRESSMASK) +#define ZIO_COMPRESS_LEVEL(x) ((x & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS) +#define ZIO_COMPRESS_RAW(type, level) (type | ((level) << SPA_COMPRESSBITS)) + +#define ZIO_COMPLEVEL_ZSTD(level) \ + ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level) + #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 @@ -173,27 +169,27 @@ enum zio_flag { * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ - ZIO_FLAG_DONT_AGGREGATE = 1 << 0, - ZIO_FLAG_IO_REPAIR = 1 << 1, - ZIO_FLAG_SELF_HEAL = 1 << 2, - ZIO_FLAG_RESILVER = 1 << 3, - ZIO_FLAG_SCRUB = 1 << 4, - ZIO_FLAG_SCAN_THREAD = 1 << 5, - ZIO_FLAG_PHYSICAL = 1 << 6, 
+ ZIO_FLAG_DONT_AGGREGATE = 1U << 0, + ZIO_FLAG_IO_REPAIR = 1U << 1, + ZIO_FLAG_SELF_HEAL = 1U << 2, + ZIO_FLAG_RESILVER = 1U << 3, + ZIO_FLAG_SCRUB = 1U << 4, + ZIO_FLAG_SCAN_THREAD = 1U << 5, + ZIO_FLAG_PHYSICAL = 1U << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ - ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1 << 8, - ZIO_FLAG_CONFIG_WRITER = 1 << 9, - ZIO_FLAG_DONT_RETRY = 1 << 10, - ZIO_FLAG_DONT_CACHE = 1 << 11, - ZIO_FLAG_NODATA = 1 << 12, - ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, - ZIO_FLAG_IO_ALLOCATING = 1 << 14, + ZIO_FLAG_CANFAIL = 1U << 7, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1U << 8, + ZIO_FLAG_CONFIG_WRITER = 1U << 9, + ZIO_FLAG_DONT_RETRY = 1U << 10, + ZIO_FLAG_DONT_CACHE = 1U << 11, + ZIO_FLAG_NODATA = 1U << 12, + ZIO_FLAG_INDUCE_DAMAGE = 1U << 13, + ZIO_FLAG_IO_ALLOCATING = 1U << 14, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -201,29 +197,29 @@ enum zio_flag { /* * Flags inherited by vdev children. */ - ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 16, - ZIO_FLAG_TRYHARD = 1 << 17, - ZIO_FLAG_OPTIONAL = 1 << 18, + ZIO_FLAG_IO_RETRY = 1U << 15, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1U << 16, + ZIO_FLAG_TRYHARD = 1U << 17, + ZIO_FLAG_OPTIONAL = 1U << 18, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. 
*/ - ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 20, - ZIO_FLAG_IO_BYPASS = 1 << 21, - ZIO_FLAG_IO_REWRITE = 1 << 22, - ZIO_FLAG_RAW_COMPRESS = 1 << 23, - ZIO_FLAG_RAW_ENCRYPT = 1 << 24, - ZIO_FLAG_GANG_CHILD = 1 << 25, - ZIO_FLAG_DDT_CHILD = 1 << 26, - ZIO_FLAG_GODFATHER = 1 << 27, - ZIO_FLAG_NOPWRITE = 1 << 28, - ZIO_FLAG_REEXECUTED = 1 << 29, - ZIO_FLAG_DELEGATED = 1 << 30, - ZIO_FLAG_FASTWRITE = 1 << 31, + ZIO_FLAG_DONT_QUEUE = 1U << 19, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1U << 20, + ZIO_FLAG_IO_BYPASS = 1U << 21, + ZIO_FLAG_IO_REWRITE = 1U << 22, + ZIO_FLAG_RAW_COMPRESS = 1U << 23, + ZIO_FLAG_RAW_ENCRYPT = 1U << 24, + ZIO_FLAG_GANG_CHILD = 1U << 25, + ZIO_FLAG_DDT_CHILD = 1U << 26, + ZIO_FLAG_GODFATHER = 1U << 27, + ZIO_FLAG_NOPWRITE = 1U << 28, + ZIO_FLAG_REEXECUTED = 1U << 29, + ZIO_FLAG_DELEGATED = 1U << 30, + ZIO_FLAG_FASTWRITE = 1U << 31, }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -241,8 +237,8 @@ enum zio_flag { (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) -#define ZIO_CHILD_BIT(x) (1 << (x)) -#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) +#define ZIO_CHILD_BIT(x) (1U << (x)) +#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x))) enum zio_child { ZIO_CHILD_VDEV = 0, @@ -266,18 +262,9 @@ enum zio_wait_type { ZIO_WAIT_TYPES }; -/* - * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent - * graveyard) to indicate checksum errors and fragmentation. 
- */ -#define ECKSUM EBADE -#define EFRAGS EBADR - -/* Similar for ENOACTIVE */ -#define ENOTACTIVE ENOANO - typedef void zio_done_func_t(zio_t *zio); +extern int zio_exclude_metadata; extern int zio_dva_throttle_enabled; extern const char *zio_type_name[ZIO_TYPES]; @@ -337,6 +324,7 @@ struct zbookmark_phys { typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; + uint8_t zp_complevel; dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; @@ -367,6 +355,7 @@ struct zio_cksum_report { nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; @@ -376,14 +365,8 @@ struct zio_cksum_report { struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; -typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, - void *arg); - -zio_vsd_cksum_report_f zio_vsd_default_cksum_report; - typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; - zio_vsd_cksum_report_f *vsd_cksum_report; } zio_vsd_ops_t; typedef struct zio_gang_node { @@ -421,7 +404,7 @@ typedef zio_t *zio_pipe_stage_t(zio_t *zio); * only apply to ZIO_TYPE_TRIM zios are distinct from io_flags. 
*/ enum trim_flag { - ZIO_TRIM_SECURE = 1 << 0, + ZIO_TRIM_SECURE = 1U << 0, }; typedef struct zio_alloc_list { @@ -511,6 +494,7 @@ struct zio { zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; + void *io_bio; kmutex_t io_lock; kcondvar_t io_cv; int io_allocator; @@ -523,27 +507,33 @@ struct zio { taskq_ent_t io_tqent; }; +enum blk_verify_flag { + BLK_VERIFY_ONLY, + BLK_VERIFY_LOG, + BLK_VERIFY_HALT +}; + extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, - zio_done_func_t *done, void *private, enum zio_flag flags); + zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *private, enum zio_flag flags); + zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private, + struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, enum zio_flag flags, + void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - struct abd *data, uint64_t size, zio_done_func_t *done, void *private, + struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -553,23 +543,23 @@ extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t 
*pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_done_func_t *done, void *private, enum zio_flag flags); + zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, enum zio_flag flags); + zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - zio_done_func_t *done, void *private, zio_priority_t priority, + zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, enum trim_flag trim_flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, - zio_done_func_t *done, void *private, zio_priority_t priority, + zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, - zio_done_func_t *done, void *private, zio_priority_t priority, + zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, @@ -582,8 +572,8 @@ extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); -extern void zio_execute(zio_t *zio); -extern void zio_interrupt(zio_t *zio); +extern void zio_execute(void *zio); +extern void zio_interrupt(void *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, char *tag); @@ -607,11 +597,11 @@ extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *private); + zio_done_func_t *done, void 
*priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); @@ -628,11 +618,16 @@ extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); +extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, + uint8_t child, uint8_t parent); extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); +extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, + boolean_t config_held, enum blk_verify_flag blk_verify); + /* * Initial setup and teardown. 
*/ @@ -663,9 +658,9 @@ extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ -extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, +extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, - uint64_t length, void *arg, struct zio_bad_cksum *info); + uint64_t length, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); @@ -677,6 +672,10 @@ extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, uint64_t length, const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info); +void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr); +extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, + const char *name); + /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 45abd3bd31..9a73a62622 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -25,7 +25,7 @@ */ #ifndef _SYS_ZIO_CHECKSUM_H -#define _SYS_ZIO_CHECKSUM_H +#define _SYS_ZIO_CHECKSUM_H extern __attribute__((visibility("default"))) #include #include @@ -101,7 +101,8 @@ typedef struct zio_bad_cksum { uint8_t zbc_has_cksum; /* expected/actual valid */ } zio_bad_cksum_t; -extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; +_SYS_ZIO_CHECKSUM_H zio_checksum_info_t + zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. 
@@ -122,7 +123,7 @@ extern zio_checksum_t abd_checksum_edonr_byteswap; extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; -extern zio_abd_checksum_func_t fletcher_4_abd_ops; +_SYS_ZIO_CHECKSUM_H zio_abd_checksum_func_t fletcher_4_abd_ops; extern zio_checksum_t abd_fletcher_4_native; extern zio_checksum_t abd_fletcher_4_byteswap; diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 1642823d3d..4a22ad2a27 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. * Use is subject to license terms. * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ @@ -51,15 +53,86 @@ enum zio_compress { ZIO_COMPRESS_GZIP_9, ZIO_COMPRESS_ZLE, ZIO_COMPRESS_LZ4, + ZIO_COMPRESS_ZSTD, ZIO_COMPRESS_FUNCTIONS }; +/* Compression algorithms that have levels */ +#define ZIO_COMPRESS_HASLEVEL(compress) ((compress == ZIO_COMPRESS_ZSTD || \ + (compress >= ZIO_COMPRESS_GZIP_1 && \ + compress <= ZIO_COMPRESS_GZIP_9))) + +#define ZIO_COMPLEVEL_INHERIT 0 +#define ZIO_COMPLEVEL_DEFAULT 255 + +enum zio_zstd_levels { + ZIO_ZSTD_LEVEL_INHERIT = 0, + ZIO_ZSTD_LEVEL_1, +#define ZIO_ZSTD_LEVEL_MIN ZIO_ZSTD_LEVEL_1 + ZIO_ZSTD_LEVEL_2, + ZIO_ZSTD_LEVEL_3, +#define ZIO_ZSTD_LEVEL_DEFAULT ZIO_ZSTD_LEVEL_3 + ZIO_ZSTD_LEVEL_4, + ZIO_ZSTD_LEVEL_5, + ZIO_ZSTD_LEVEL_6, + ZIO_ZSTD_LEVEL_7, + ZIO_ZSTD_LEVEL_8, + ZIO_ZSTD_LEVEL_9, + ZIO_ZSTD_LEVEL_10, + ZIO_ZSTD_LEVEL_11, + ZIO_ZSTD_LEVEL_12, + ZIO_ZSTD_LEVEL_13, + ZIO_ZSTD_LEVEL_14, + ZIO_ZSTD_LEVEL_15, + ZIO_ZSTD_LEVEL_16, + ZIO_ZSTD_LEVEL_17, + ZIO_ZSTD_LEVEL_18, + ZIO_ZSTD_LEVEL_19, +#define ZIO_ZSTD_LEVEL_MAX ZIO_ZSTD_LEVEL_19 + ZIO_ZSTD_LEVEL_RESERVE = 101, /* Leave room for new positive levels */ + ZIO_ZSTD_LEVEL_FAST, /* Fast levels are negative */ + ZIO_ZSTD_LEVEL_FAST_1, +#define 
ZIO_ZSTD_LEVEL_FAST_DEFAULT ZIO_ZSTD_LEVEL_FAST_1 + ZIO_ZSTD_LEVEL_FAST_2, + ZIO_ZSTD_LEVEL_FAST_3, + ZIO_ZSTD_LEVEL_FAST_4, + ZIO_ZSTD_LEVEL_FAST_5, + ZIO_ZSTD_LEVEL_FAST_6, + ZIO_ZSTD_LEVEL_FAST_7, + ZIO_ZSTD_LEVEL_FAST_8, + ZIO_ZSTD_LEVEL_FAST_9, + ZIO_ZSTD_LEVEL_FAST_10, + ZIO_ZSTD_LEVEL_FAST_20, + ZIO_ZSTD_LEVEL_FAST_30, + ZIO_ZSTD_LEVEL_FAST_40, + ZIO_ZSTD_LEVEL_FAST_50, + ZIO_ZSTD_LEVEL_FAST_60, + ZIO_ZSTD_LEVEL_FAST_70, + ZIO_ZSTD_LEVEL_FAST_80, + ZIO_ZSTD_LEVEL_FAST_90, + ZIO_ZSTD_LEVEL_FAST_100, + ZIO_ZSTD_LEVEL_FAST_500, + ZIO_ZSTD_LEVEL_FAST_1000, +#define ZIO_ZSTD_LEVEL_FAST_MAX ZIO_ZSTD_LEVEL_FAST_1000 + ZIO_ZSTD_LEVEL_AUTO = 251, /* Reserved for future use */ + ZIO_ZSTD_LEVEL_LEVELS +}; + +/* Forward Declaration to avoid visibility problems */ +struct zio_prop; + /* Common signature for all zio compress functions. */ typedef size_t zio_compress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); /* Common signature for all zio decompress functions. */ typedef int zio_decompress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); +/* Common signature for all zio decompress and get level functions. */ +typedef int zio_decompresslevel_func_t(void *src, void *dst, + size_t s_len, size_t d_len, uint8_t *level); +/* Common signature for all zio get-compression-level functions. */ +typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level); + /* * Common signature for all zio decompress functions using an ABD as input. 
@@ -76,6 +149,7 @@ typedef const struct zio_compress_info { int ci_level; zio_compress_func_t *ci_compress; zio_decompress_func_t *ci_decompress; + zio_decompresslevel_func_t *ci_decompress_level; } zio_compress_info_t; extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; @@ -105,17 +179,17 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); -extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len, - int level); + /* * Compress and decompress data if necessary. */ extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len); + size_t s_len, uint8_t level); extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len); + size_t s_len, size_t d_len, uint8_t *level); extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len); + size_t s_len, size_t d_len, uint8_t *level); +extern int zio_compress_to_feature(enum zio_compress comp); #ifdef __cplusplus } diff --git a/include/sys/zio_crypt.h b/include/sys/zio_crypt.h index d54e2fe192..d7a63d6958 100644 --- a/include/sys/zio_crypt.h +++ b/include/sys/zio_crypt.h @@ -21,8 +21,12 @@ #define _SYS_ZIO_CRYPT_H #include -#include +#include +#if defined(__FreeBSD__) && defined(_KERNEL) +#include +#else #include +#endif /* __FreeBSD__ */ #include #include #include @@ -47,15 +51,22 @@ typedef enum zio_crypt_type { /* table of supported crypto algorithms, modes and keylengths. */ typedef struct zio_crypt_info { /* mechanism name, needed by ICP */ +#if defined(__FreeBSD__) && defined(_KERNEL) + /* + * I've deliberately used a different name here, to catch + * ICP-using code. 
+ */ + const char *ci_algname; +#else crypto_mech_name_t ci_mechname; - +#endif /* cipher mode type (GCM, CCM) */ zio_crypt_type_t ci_crypt_type; /* length of the encryption key */ size_t ci_keylen; - /* human-readable name of the encryption alforithm */ + /* human-readable name of the encryption algorithm */ char *ci_name; } zio_crypt_info_t; @@ -78,7 +89,7 @@ typedef struct zio_crypt_key { /* buffer for hmac key */ uint8_t zk_hmac_keydata[SHA512_HMAC_KEYLEN]; - /* buffer for currrent encryption key derived from master key */ + /* buffer for current encryption key derived from master key */ uint8_t zk_current_keydata[MASTER_KEY_MAX_LEN]; /* current 64 bit salt for deriving an encryption key */ @@ -90,8 +101,13 @@ typedef struct zio_crypt_key { /* illumos crypto api current encryption key */ crypto_key_t zk_current_key; +#if defined(__FreeBSD__) && defined(_KERNEL) + /* Session for current encryption key. Must always be set */ + freebsd_crypt_session_t zk_session; +#else /* template of current encryption key for illumos crypto api */ crypto_ctx_template_t zk_current_tmpl; +#endif /* illumos crypto api current hmac key */ crypto_key_t zk_hmac_key; @@ -99,7 +115,7 @@ typedef struct zio_crypt_key { /* template of hmac key for illumos crypto api */ crypto_ctx_template_t zk_hmac_tmpl; - /* lock for changing the salt and dependant values */ + /* lock for changing the salt and dependent values */ krwlock_t zk_salt_lock; } zio_crypt_key_t; diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index fbbe06eb04..4c99857165 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -73,9 +73,9 @@ extern "C" { * the supported transformations: * * Compression: - * ZFS supports three different flavors of compression -- gzip, lzjb, and - * zle. Compression occurs as part of the write pipeline and is performed - * in the ZIO_STAGE_WRITE_BP_INIT stage. + * ZFS supports five different flavors of compression -- gzip, lzjb, lz4, zle, + * and zstd. 
Compression occurs as part of the write pipeline and is + * performed in the ZIO_STAGE_WRITE_BP_INIT stage. * * Dedup: * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and @@ -87,7 +87,7 @@ extern "C" { * * NOP Write: * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage - * and is added to an existing write pipeline if a crypographically + * and is added to an existing write pipeline if a cryptographically * secure checksum (i.e. SHA256) is enabled and compression is turned on. * The NOP write stage will compare the checksums of the current data * on-disk (level-0 blocks only) and the data that is currently being written. diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index 0b422904ec..2d8e7fc36b 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -31,6 +31,7 @@ typedef enum zio_priority { ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ + ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. 
free) */ } zio_priority_t; diff --git a/include/sys/zstd/Makefile.am b/include/sys/zstd/Makefile.am new file mode 100644 index 0000000000..16666fe633 --- /dev/null +++ b/include/sys/zstd/Makefile.am @@ -0,0 +1,18 @@ +COMMON_H = \ + $(top_srcdir)/include/sys/zstd/zstd.h + +KERNEL_H = + +USER_H = + +EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs/sys/zstd +libzfs_HEADERS = $(COMMON_H) $(USER_H) +endif + +if CONFIG_KERNEL +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/zstd +kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +endif diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h new file mode 100644 index 0000000000..ca32a74645 --- /dev/null +++ b/include/sys/zstd/zstd.h @@ -0,0 +1,229 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2018, Klara Inc. + * Copyright (c) 2016-2018, Allan Jude + * Copyright (c) 2018-2020, Sebastian Gottschall + * Copyright (c) 2019-2020, Michael Niewöhner + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. + */ + +#ifndef _ZFS_ZSTD_H +#define _ZFS_ZSTD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * ZSTD block header + * NOTE: all fields in this header are in big endian order. + */ +typedef struct zfs_zstd_header { + /* Compressed size of data */ + uint32_t c_len; + + /* + * Version and compression level + * We used to use a union to reference compression level + * and version easily, but as it turns out, relying on the + * ordering of bitfields is not remotely portable. + * So now we have get/set functions in zfs_zstd.c for + * manipulating this in just the right way forever. + */ + uint32_t raw_version_level; + char data[]; +} zfs_zstdhdr_t; + +/* + * Simple struct to pass the data from raw_version_level around. 
+ */ +typedef struct zfs_zstd_meta { + uint8_t level; + uint32_t version; +} zfs_zstdmeta_t; + +/* + * kstat helper macros + */ +#define ZSTDSTAT(stat) (zstd_stats.stat.value.ui64) +#define ZSTDSTAT_ADD(stat, val) \ + atomic_add_64(&zstd_stats.stat.value.ui64, (val)) +#define ZSTDSTAT_SUB(stat, val) \ + atomic_sub_64(&zstd_stats.stat.value.ui64, (val)) +#define ZSTDSTAT_BUMP(stat) ZSTDSTAT_ADD(stat, 1) + +/* (de)init for user space / kernel emulation */ +int zstd_init(void); +void zstd_fini(void); + +size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int level); +int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level); +int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, + size_t d_len, uint8_t *level); +int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n); +void zfs_zstd_cache_reap_now(void); + +/* + * So, the reason we have all these complicated set/get functions is that + * originally, in the zstd "header" we wrote out to disk, we used a 32-bit + * bitfield to store the "level" (8 bits) and "version" (24 bits). + * + * Unfortunately, bitfields make few promises about how they're arranged in + * memory... + * + * By way of example, if we were using version 1.4.5 and level 3, it'd be + * level = 0x03, version = 10405/0x0028A5, which gets broken into Vhigh = 0x00, + * Vmid = 0x28, Vlow = 0xA5. We include these positions below to help follow + * which data winds up where. 
+ * + * As a consequence, we wound up with little-endian platforms with a layout + * like this in memory: + * + * 0 8 16 24 32 + * +-------+-------+-------+-------+ + * | Vlow | Vmid | Vhigh | level | + * +-------+-------+-------+-------+ + * =A5 =28 =00 =03 + * + * ...and then, after being run through BE_32(), serializing this out to + * disk: + * + * 0 8 16 24 32 + * +-------+-------+-------+-------+ + * | level | Vhigh | Vmid | Vlow | + * +-------+-------+-------+-------+ + * =03 =00 =28 =A5 + * + * while on big-endian systems, since BE_32() is a noop there, both in + * memory and on disk, we wind up with: + * + * 0 8 16 24 32 + * +-------+-------+-------+-------+ + * | Vhigh | Vmid | Vlow | level | + * +-------+-------+-------+-------+ + * =00 =28 =A5 =03 + * + * (Vhigh is always 0 until version exceeds 6.55.35. Vmid and Vlow are the + * other two bytes of the "version" data.) + * + * So now we use the BF32_SET macros to get consistent behavior (the + * ondisk LE encoding, since x86 currently rules the world) across + * platforms, but the "get" behavior requires that we check each of the + * bytes in the aforementioned former-bitfield for 0x00, and from there, + * we can know which possible layout we're dealing with. (Only the two + * that have been observed in the wild are illustrated above, but handlers + * for all 4 positions of 0x00 are implemented.) 
+ */ + +static inline void +zfs_get_hdrmeta(const zfs_zstdhdr_t *blob, zfs_zstdmeta_t *res) +{ + uint32_t raw = blob->raw_version_level; + uint8_t findme = 0xff; + int shift; + for (shift = 0; shift < 4; shift++) { + findme = BF32_GET(raw, 8*shift, 8); + if (findme == 0) + break; + } + switch (shift) { + case 0: + res->level = BF32_GET(raw, 24, 8); + res->version = BSWAP_32(raw); + res->version = BF32_GET(res->version, 8, 24); + break; + case 1: + res->level = BF32_GET(raw, 0, 8); + res->version = BSWAP_32(raw); + res->version = BF32_GET(res->version, 0, 24); + break; + case 2: + res->level = BF32_GET(raw, 24, 8); + res->version = BF32_GET(raw, 0, 24); + break; + case 3: + res->level = BF32_GET(raw, 0, 8); + res->version = BF32_GET(raw, 8, 24); + break; + default: + res->level = 0; + res->version = 0; + break; + } +} + +static inline uint8_t +zfs_get_hdrlevel(const zfs_zstdhdr_t *blob) +{ + uint8_t level = 0; + zfs_zstdmeta_t res; + zfs_get_hdrmeta(blob, &res); + level = res.level; + return (level); +} + +static inline uint32_t +zfs_get_hdrversion(const zfs_zstdhdr_t *blob) +{ + uint32_t version = 0; + zfs_zstdmeta_t res; + zfs_get_hdrmeta(blob, &res); + version = res.version; + return (version); + +} + +static inline void +zfs_set_hdrversion(zfs_zstdhdr_t *blob, uint32_t version) +{ + /* cppcheck-suppress syntaxError */ + BF32_SET(blob->raw_version_level, 0, 24, version); +} + +static inline void +zfs_set_hdrlevel(zfs_zstdhdr_t *blob, uint8_t level) +{ + /* cppcheck-suppress syntaxError */ + BF32_SET(blob->raw_version_level, 24, 8, level); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_ZSTD_H */ diff --git a/include/sys/zthr.h b/include/sys/zthr.h index 33c218ec4c..19be89eeeb 100644 --- a/include/sys/zthr.h +++ b/include/sys/zthr.h @@ -24,16 +24,20 @@ typedef struct zthr zthr_t; typedef void (zthr_func_t)(void *, zthr_t *); typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *); -extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc, - zthr_func_t *func, 
void *arg); -extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc, - zthr_func_t *func, void *arg, hrtime_t nano_wait); +extern zthr_t *zthr_create(const char *zthr_name, + zthr_checkfunc_t checkfunc, zthr_func_t *func, void *arg, + pri_t pri); +extern zthr_t *zthr_create_timer(const char *zthr_name, + zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg, + hrtime_t nano_wait, pri_t pri); extern void zthr_destroy(zthr_t *t); extern void zthr_wakeup(zthr_t *t); extern void zthr_cancel(zthr_t *t); extern void zthr_resume(zthr_t *t); +extern void zthr_wait_cycle_done(zthr_t *t); extern boolean_t zthr_iscancelled(zthr_t *t); +extern boolean_t zthr_has_waiters(zthr_t *t); #endif /* _SYS_ZTHR_H */ diff --git a/include/sys/zvol.h b/include/sys/zvol.h index e8b084762a..a0f1800130 100644 --- a/include/sys/zvol.h +++ b/include/sys/zvol.h @@ -35,28 +35,30 @@ #define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \ DEV_BSHIFT - 1)) - 1) -extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async); -extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async); -extern void zvol_rename_minors(spa_t *spa, const char *oldname, - const char *newname, boolean_t async); +extern void zvol_create_minor(const char *); +extern void zvol_create_minors_recursive(const char *); +extern void zvol_remove_minors(spa_t *, const char *, boolean_t); +extern void zvol_rename_minors(spa_t *, const char *, const char *, boolean_t); #ifdef _KERNEL -typedef struct zvol_state zvol_state_t; +struct zvol_state; +typedef struct zvol_state zvol_state_handle_t; -extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); -extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize); -extern int zvol_get_stats(objset_t *os, nvlist_t *nv); +extern int zvol_check_volsize(uint64_t, uint64_t); +extern int zvol_check_volblocksize(const char *, uint64_t); +extern int zvol_get_stats(objset_t *, nvlist_t *); extern boolean_t 
zvol_is_zvol(const char *); -extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); +extern void zvol_create_cb(objset_t *, void *, cred_t *, dmu_tx_t *); extern int zvol_set_volsize(const char *, uint64_t); -extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t); extern int zvol_set_volmode(const char *, zprop_source_t, uint64_t); -extern zvol_state_t *zvol_suspend(const char *); -extern int zvol_resume(zvol_state_t *); -extern void *zvol_tag(zvol_state_t *); +extern zvol_state_handle_t *zvol_suspend(const char *); +extern int zvol_resume(zvol_state_handle_t *); +extern void *zvol_tag(zvol_state_handle_t *); extern int zvol_init(void); extern void zvol_fini(void); +extern int zvol_busy(void); + #endif /* _KERNEL */ #endif /* _SYS_ZVOL_H */ diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h new file mode 100644 index 0000000000..89fe598004 --- /dev/null +++ b/include/sys/zvol_impl.h @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZVOL_IMPL_H +#define _SYS_ZVOL_IMPL_H + +#include + +#define ZVOL_RDONLY 0x1 +/* + * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which + * specifies whether or not the zvol _can_ be written to) + */ +#define ZVOL_WRITTEN_TO 0x2 + +#define ZVOL_DUMPIFIED 0x4 + +#define ZVOL_EXCL 0x8 + +/* + * The in-core state of each volume. + */ +typedef struct zvol_state { + char zv_name[MAXNAMELEN]; /* name */ + uint64_t zv_volsize; /* advertised space */ + uint64_t zv_volblocksize; /* volume block size */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_flags; /* ZVOL_* flags */ + uint32_t zv_open_count; /* open counts */ + uint32_t zv_changed; /* disk changed */ + uint32_t zv_volmode; /* volmode */ + zilog_t *zv_zilog; /* ZIL handle */ + zfs_rangelock_t zv_rangelock; /* for range locking */ + dnode_t *zv_dn; /* dnode hold */ + dataset_kstats_t zv_kstat; /* zvol kstats */ + list_node_t zv_next; /* next zvol_state_t linkage */ + uint64_t zv_hash; /* name hash */ + struct hlist_node zv_hlink; /* hash link */ + kmutex_t zv_state_lock; /* protects zvol_state_t */ + atomic_t zv_suspend_ref; /* refcount for suspend */ + krwlock_t zv_suspend_lock; /* suspend lock */ + struct zvol_state_os *zv_zso; /* private platform state */ +} zvol_state_t; + + +extern list_t zvol_state_list; +extern krwlock_t zvol_state_lock; +#define ZVOL_HT_SIZE 1024 +extern struct hlist_head *zvol_htable; +#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) +extern zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE]; + +extern unsigned int zvol_volmode; +extern unsigned int zvol_inhibit_dev; + +/* + * platform independent functions exported to platform code + */ +zvol_state_t *zvol_find_by_name_hash(const char *name, 
+ uint64_t hash, int mode); +int zvol_first_open(zvol_state_t *zv, boolean_t readonly); +uint64_t zvol_name_hash(const char *name); +void zvol_remove_minors_impl(const char *name); +void zvol_last_close(zvol_state_t *zv); +void zvol_insert(zvol_state_t *zv); +void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, + uint64_t len, boolean_t sync); +void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, + uint64_t size, int sync); +int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, + struct lwb *lwb, zio_t *zio); +int zvol_init_impl(void); +void zvol_fini_impl(void); +void zvol_wait_close(zvol_state_t *zv); + +/* + * platform dependent functions exported to platform independent code + */ +typedef struct zvol_platform_ops { + void (*zv_free)(zvol_state_t *); + void (*zv_rename_minor)(zvol_state_t *, const char *); + int (*zv_create_minor)(const char *); + int (*zv_update_volsize)(zvol_state_t *, uint64_t); + boolean_t (*zv_is_zvol)(const char *); + void (*zv_clear_private)(zvol_state_t *); + void (*zv_set_disk_ro)(zvol_state_t *, int flags); + void (*zv_set_capacity)(zvol_state_t *, uint64_t capacity); +} zvol_platform_ops_t; + +void zvol_register_ops(const zvol_platform_ops_t *ops); + +#endif diff --git a/include/thread_pool.h b/include/thread_pool.h index 57266f11c5..43090c3c66 100644 --- a/include/thread_pool.h +++ b/include/thread_pool.h @@ -25,7 +25,7 @@ */ #ifndef _THREAD_POOL_H_ -#define _THREAD_POOL_H_ +#define _THREAD_POOL_H_ extern __attribute__((visibility("default"))) #include #include @@ -37,33 +37,17 @@ extern "C" { typedef struct tpool tpool_t; /* opaque thread pool descriptor */ -#if defined(__STDC__) - -extern tpool_t *tpool_create(uint_t min_threads, uint_t max_threads, +_THREAD_POOL_H_ tpool_t *tpool_create(uint_t min_threads, uint_t max_threads, uint_t linger, pthread_attr_t *attr); -extern int tpool_dispatch(tpool_t *tpool, +_THREAD_POOL_H_ int tpool_dispatch(tpool_t *tpool, void (*func)(void *), void 
*arg); -extern void tpool_destroy(tpool_t *tpool); -extern void tpool_abandon(tpool_t *tpool); -extern void tpool_wait(tpool_t *tpool); -extern void tpool_suspend(tpool_t *tpool); -extern int tpool_suspended(tpool_t *tpool); -extern void tpool_resume(tpool_t *tpool); -extern int tpool_member(tpool_t *tpool); - -#else /* Non ANSI */ - -extern tpool_t *tpool_create(); -extern int tpool_dispatch(); -extern void tpool_destroy(); -extern void tpool_abandon(); -extern void tpool_wait(); -extern void tpool_suspend(); -extern int tpool_suspended(); -extern void tpool_resume(); -extern int tpool_member(); - -#endif /* __STDC__ */ +_THREAD_POOL_H_ void tpool_destroy(tpool_t *tpool); +_THREAD_POOL_H_ void tpool_abandon(tpool_t *tpool); +_THREAD_POOL_H_ void tpool_wait(tpool_t *tpool); +_THREAD_POOL_H_ void tpool_suspend(tpool_t *tpool); +_THREAD_POOL_H_ int tpool_suspended(tpool_t *tpool); +_THREAD_POOL_H_ void tpool_resume(tpool_t *tpool); +_THREAD_POOL_H_ int tpool_member(tpool_t *tpool); #ifdef __cplusplus } diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 4e75fe71e3..874cbd9ff7 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -27,7 +27,7 @@ */ #ifndef _ZFEATURE_COMMON_H -#define _ZFEATURE_COMMON_H +#define _ZFEATURE_COMMON_H extern __attribute__((visibility("default"))) #include #include @@ -67,6 +67,14 @@ typedef enum spa_feature { SPA_FEATURE_ALLOCATION_CLASSES, SPA_FEATURE_RESILVER_DEFER, SPA_FEATURE_BOOKMARK_V2, + SPA_FEATURE_REDACTION_BOOKMARKS, + SPA_FEATURE_REDACTED_DATASETS, + SPA_FEATURE_BOOKMARK_WRITTEN, + SPA_FEATURE_LOG_SPACEMAP, + SPA_FEATURE_LIVELIST, + SPA_FEATURE_DEVICE_REBUILD, + SPA_FEATURE_ZSTD_COMPRESS, + SPA_FEATURE_DRAID, SPA_FEATURES } spa_feature_t; @@ -108,16 +116,17 @@ typedef int (zfeature_func_t)(zfeature_info_t *, void *); #define ZFS_FEATURE_DEBUG -extern zfeature_info_t spa_feature_table[SPA_FEATURES]; +_ZFEATURE_COMMON_H zfeature_info_t spa_feature_table[SPA_FEATURES]; +_ZFEATURE_COMMON_H 
boolean_t zfeature_checks_disable; -extern boolean_t zfeature_is_valid_guid(const char *); +_ZFEATURE_COMMON_H boolean_t zfeature_is_valid_guid(const char *); -extern boolean_t zfeature_is_supported(const char *); -extern int zfeature_lookup_guid(const char *, spa_feature_t *); -extern int zfeature_lookup_name(const char *, spa_feature_t *); -extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); +_ZFEATURE_COMMON_H boolean_t zfeature_is_supported(const char *); +_ZFEATURE_COMMON_H int zfeature_lookup_guid(const char *, spa_feature_t *); +_ZFEATURE_COMMON_H int zfeature_lookup_name(const char *, spa_feature_t *); +_ZFEATURE_COMMON_H boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); -extern void zpool_feature_init(void); +_ZFEATURE_COMMON_H void zpool_feature_init(void); #ifdef __cplusplus } diff --git a/include/zfs_comutil.h b/include/zfs_comutil.h index 1360d6e1c1..3e4716224a 100644 --- a/include/zfs_comutil.h +++ b/include/zfs_comutil.h @@ -24,7 +24,7 @@ */ #ifndef _ZFS_COMUTIL_H -#define _ZFS_COMUTIL_H +#define _ZFS_COMUTIL_H extern __attribute__((visibility("default"))) #include #include @@ -33,16 +33,18 @@ extern "C" { #endif -extern boolean_t zfs_allocatable_devs(nvlist_t *); -extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *); +_ZFS_COMUTIL_H boolean_t zfs_allocatable_devs(nvlist_t *); +_ZFS_COMUTIL_H boolean_t zfs_special_devs(nvlist_t *, char *); +_ZFS_COMUTIL_H void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *); -extern int zfs_zpl_version_map(int spa_version); -extern int zfs_spa_version_map(int zpl_version); +_ZFS_COMUTIL_H int zfs_zpl_version_map(int spa_version); +_ZFS_COMUTIL_H int zfs_spa_version_map(int zpl_version); -extern boolean_t zfs_dataset_name_hidden(const char *); +_ZFS_COMUTIL_H boolean_t zfs_dataset_name_hidden(const char *); #define ZFS_NUM_LEGACY_HISTORY_EVENTS 41 -extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; +_ZFS_COMUTIL_H const char * + 
zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; #ifdef __cplusplus } diff --git a/include/zfs_deleg.h b/include/zfs_deleg.h index 32d66980e5..1ae08850fb 100644 --- a/include/zfs_deleg.h +++ b/include/zfs_deleg.h @@ -25,7 +25,7 @@ */ #ifndef _ZFS_DELEG_H -#define _ZFS_DELEG_H +#define _ZFS_DELEG_H extern __attribute__((visibility("default"))) #include @@ -77,7 +77,6 @@ typedef enum { ZFS_DELEG_NOTE_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJQUOTA, - ZFS_DELEG_NOTE_REMAP, ZFS_DELEG_NOTE_NONE } zfs_deleg_note_t; @@ -86,12 +85,12 @@ typedef struct zfs_deleg_perm_tab { zfs_deleg_note_t z_note; } zfs_deleg_perm_tab_t; -extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[]; +_ZFS_DELEG_H zfs_deleg_perm_tab_t zfs_deleg_perm_tab[]; -int zfs_deleg_verify_nvlist(nvlist_t *nvlist); -void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, +_ZFS_DELEG_H int zfs_deleg_verify_nvlist(nvlist_t *nvlist); +_ZFS_DELEG_H void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, char checkflag, void *data); -const char *zfs_deleg_canonicalize_perm(const char *perm); +_ZFS_DELEG_H const char *zfs_deleg_canonicalize_perm(const char *perm); #ifdef __cplusplus } diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index 5c7a61c562..bb356c59ac 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -27,7 +27,7 @@ */ #ifndef _ZFS_FLETCHER_H -#define _ZFS_FLETCHER_H +#define _ZFS_FLETCHER_H extern __attribute__((visibility("default"))) #include #include @@ -48,19 +48,24 @@ extern "C" { * checksum method is added. This method will ignore last (size % 4) bytes of * the data buffer. 
*/ -void fletcher_init(zio_cksum_t *); -void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); -int fletcher_2_incremental_native(void *, size_t, void *); -int fletcher_2_incremental_byteswap(void *, size_t, void *); -void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); -int fletcher_4_incremental_native(void *, size_t, void *); -int fletcher_4_incremental_byteswap(void *, size_t, void *); -int fletcher_4_impl_set(const char *selector); -void fletcher_4_init(void); -void fletcher_4_fini(void); +_ZFS_FLETCHER_H void fletcher_init(zio_cksum_t *); +_ZFS_FLETCHER_H void fletcher_2_native(const void *, uint64_t, const void *, + zio_cksum_t *); +_ZFS_FLETCHER_H void fletcher_2_byteswap(const void *, uint64_t, const void *, + zio_cksum_t *); +_ZFS_FLETCHER_H void fletcher_4_native(const void *, uint64_t, const void *, + zio_cksum_t *); +_ZFS_FLETCHER_H int fletcher_2_incremental_native(void *, size_t, void *); +_ZFS_FLETCHER_H int fletcher_2_incremental_byteswap(void *, size_t, void *); +_ZFS_FLETCHER_H void fletcher_4_native_varsize(const void *, uint64_t, + zio_cksum_t *); +_ZFS_FLETCHER_H void fletcher_4_byteswap(const void *, uint64_t, const void *, + zio_cksum_t *); +_ZFS_FLETCHER_H int fletcher_4_incremental_native(void *, size_t, void *); +_ZFS_FLETCHER_H int fletcher_4_incremental_byteswap(void *, size_t, void *); +_ZFS_FLETCHER_H int fletcher_4_impl_set(const char *selector); +_ZFS_FLETCHER_H void fletcher_4_init(void); +_ZFS_FLETCHER_H void fletcher_4_fini(void); @@ -124,27 +129,31 @@ typedef struct fletcher_4_func { const char *name; } fletcher_4_ops_t; -extern const fletcher_4_ops_t fletcher_4_superscalar_ops; -extern const fletcher_4_ops_t fletcher_4_superscalar4_ops; +_ZFS_FLETCHER_H 
const fletcher_4_ops_t fletcher_4_superscalar_ops; +_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops; #if defined(HAVE_SSE2) -extern const fletcher_4_ops_t fletcher_4_sse2_ops; +_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_sse2_ops; #endif #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) -extern const fletcher_4_ops_t fletcher_4_ssse3_ops; +_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_ssse3_ops; #endif #if defined(HAVE_AVX) && defined(HAVE_AVX2) -extern const fletcher_4_ops_t fletcher_4_avx2_ops; +_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx2_ops; #endif #if defined(__x86_64) && defined(HAVE_AVX512F) -extern const fletcher_4_ops_t fletcher_4_avx512f_ops; +_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx512f_ops; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512BW) +_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx512bw_ops; #endif #if defined(__aarch64__) -extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops; +_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_aarch64_neon_ops; #endif #ifdef __cplusplus diff --git a/include/zfs_namecheck.h b/include/zfs_namecheck.h index 527db92b0c..4739b065c5 100644 --- a/include/zfs_namecheck.h +++ b/include/zfs_namecheck.h @@ -27,7 +27,7 @@ */ #ifndef _ZFS_NAMECHECK_H -#define _ZFS_NAMECHECK_H +#define _ZFS_NAMECHECK_H extern __attribute__((visibility("default"))) #ifdef __cplusplus extern "C" { @@ -43,21 +43,30 @@ typedef enum { NAME_ERR_RESERVED, /* entire name is reserved */ NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ NAME_ERR_TOOLONG, /* name is too long */ + NAME_ERR_SELF_REF, /* reserved self path name ('.') */ + NAME_ERR_PARENT_REF, /* reserved parent path name ('..') */ NAME_ERR_NO_AT, /* permission set is missing '@' */ + NAME_ERR_NO_POUND, /* permission set is missing '#' */ } namecheck_err_t; #define ZFS_PERMSET_MAXLEN 64 -extern int zfs_max_dataset_nesting; +_ZFS_NAMECHECK_H int zfs_max_dataset_nesting; -int get_dataset_depth(const char *); -int 
pool_namecheck(const char *, namecheck_err_t *, char *); -int entity_namecheck(const char *, namecheck_err_t *, char *); -int dataset_namecheck(const char *, namecheck_err_t *, char *); -int dataset_nestcheck(const char *); -int mountpoint_namecheck(const char *, namecheck_err_t *); -int zfs_component_namecheck(const char *, namecheck_err_t *, char *); -int permset_namecheck(const char *, namecheck_err_t *, char *); +_ZFS_NAMECHECK_H int get_dataset_depth(const char *); +_ZFS_NAMECHECK_H int pool_namecheck(const char *, namecheck_err_t *, char *); +_ZFS_NAMECHECK_H int entity_namecheck(const char *, namecheck_err_t *, char *); +_ZFS_NAMECHECK_H int dataset_namecheck(const char *, namecheck_err_t *, char *); +_ZFS_NAMECHECK_H int snapshot_namecheck(const char *, namecheck_err_t *, + char *); +_ZFS_NAMECHECK_H int bookmark_namecheck(const char *, namecheck_err_t *, + char *); +_ZFS_NAMECHECK_H int dataset_nestcheck(const char *); +_ZFS_NAMECHECK_H int mountpoint_namecheck(const char *, namecheck_err_t *); +_ZFS_NAMECHECK_H int zfs_component_namecheck(const char *, namecheck_err_t *, + char *); +_ZFS_NAMECHECK_H int permset_namecheck(const char *, namecheck_err_t *, + char *); #ifdef __cplusplus } diff --git a/include/zfs_prop.h b/include/zfs_prop.h index 89b6a20243..91b5032e70 100644 --- a/include/zfs_prop.h +++ b/include/zfs_prop.h @@ -24,7 +24,7 @@ */ #ifndef _ZFS_PROP_H -#define _ZFS_PROP_H +#define _ZFS_PROP_H extern __attribute__((visibility("default"))) #include #include @@ -87,44 +87,46 @@ typedef struct { /* * zfs dataset property functions */ -void zfs_prop_init(void); -zprop_type_t zfs_prop_get_type(zfs_prop_t); -boolean_t zfs_prop_delegatable(zfs_prop_t prop); -zprop_desc_t *zfs_prop_get_table(void); +_ZFS_PROP_H void zfs_prop_init(void); +_ZFS_PROP_H zprop_type_t zfs_prop_get_type(zfs_prop_t); +_ZFS_PROP_H boolean_t zfs_prop_delegatable(zfs_prop_t prop); +_ZFS_PROP_H zprop_desc_t *zfs_prop_get_table(void); /* * zpool property functions */ -void 
zpool_prop_init(void); -zprop_type_t zpool_prop_get_type(zpool_prop_t); -zprop_desc_t *zpool_prop_get_table(void); +_ZFS_PROP_H void zpool_prop_init(void); +_ZFS_PROP_H zprop_type_t zpool_prop_get_type(zpool_prop_t); +_ZFS_PROP_H zprop_desc_t *zpool_prop_get_table(void); /* * Common routines to initialize property tables */ -void zprop_register_impl(int, const char *, zprop_type_t, uint64_t, +_ZFS_PROP_H void zprop_register_impl(int, const char *, zprop_type_t, uint64_t, const char *, zprop_attr_t, int, const char *, const char *, boolean_t, boolean_t, const zprop_index_t *); -void zprop_register_string(int, const char *, const char *, +_ZFS_PROP_H void zprop_register_string(int, const char *, const char *, zprop_attr_t attr, int, const char *, const char *); -void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int, - const char *, const char *); -void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int, - const char *, const char *, const zprop_index_t *); -void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t, - int, const char *); +_ZFS_PROP_H void zprop_register_number(int, const char *, uint64_t, + zprop_attr_t, int, const char *, const char *); +_ZFS_PROP_H void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, + int, const char *, const char *, const zprop_index_t *); +_ZFS_PROP_H void zprop_register_hidden(int, const char *, zprop_type_t, + zprop_attr_t, int, const char *); /* * Common routines for zfs and zpool property management */ -int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); -int zprop_name_to_prop(const char *, zfs_type_t); -int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); -int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); -uint64_t zprop_random_value(int, uint64_t, zfs_type_t); -const char *zprop_values(int, zfs_type_t); -size_t zprop_width(int, boolean_t *, zfs_type_t); -boolean_t zprop_valid_for_type(int, 
zfs_type_t, boolean_t); +_ZFS_PROP_H int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, + zfs_type_t); +_ZFS_PROP_H int zprop_name_to_prop(const char *, zfs_type_t); +_ZFS_PROP_H int zprop_string_to_index(int, const char *, uint64_t *, + zfs_type_t); +_ZFS_PROP_H int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); +_ZFS_PROP_H uint64_t zprop_random_value(int, uint64_t, zfs_type_t); +_ZFS_PROP_H const char *zprop_values(int, zfs_type_t); +_ZFS_PROP_H size_t zprop_width(int, boolean_t *, zfs_type_t); +_ZFS_PROP_H boolean_t zprop_valid_for_type(int, zfs_type_t, boolean_t); #ifdef __cplusplus } diff --git a/lib/Makefile.am b/lib/Makefile.am index 8dff773df4..f07975cc03 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -1,7 +1,82 @@ +# +# Shown below is a simplified dependency graph of the OpenZFS provided +# libraries. Administrative commands (`zfs`, `zpool`, etc) interface with +# the kernel modules using the `libzfs.so` and `libzfs_core.so` libraries. +# These libraries provide a stable ABI across OpenZFS point releases. +# +# The `libzpool.so` library is a user space build of the DMU and SPA layers +# used to implement debugging tools (zdb) and code validation tools (ztest). +# These library interfaces are subject to change at any time. 
+# +# +# CMDS: zhack/ztest/zdb/ zfs/zpool/zed/ +# raidz_{test,bench} zinject/zstream +# | | +# LIBS: | | libzfsbootenv* +# | | | +# | | | +# libzpool libzfs* ----------------+ +# | | | \ / | | | +# libicp --/ | | \ / | | \------- libshare +# | | \ / | | +# libzstd ---/ | \ / | \--------- libuutil +# | \ / \ | | +# libunicode --/ \ / \ | | +# \ / \ | | +# libzutil libzfs_core* | | +# | | | | \ | | | | +# | | | | | | | | | +# | | | | | | | | | +# libtpool -------------/ | | | \---- libnvpair* | | | +# | | | | | | +# libefi -----------------/ | \------ libavl* --------/ | +# | | | +# \-------- libspl ----+------/ +# +# * - A stable ABI is provided for these libraries +# +# # NB: GNU Automake Manual, Chapter 8.3.5: Libtool Convenience Libraries -# These six libraries are intermediary build components. -SUBDIRS = libavl libefi libicp libshare libspl libtpool libzutil libunicode +# These nine libraries are intermediary build components. +# +SUBDIRS = libavl libicp libshare libspl libtpool libzstd +CPPCHECKDIRS = libavl libicp libnvpair libshare libspl libtpool libunicode +CPPCHECKDIRS += libuutil libzfs libzfs_core libzfsbootenv libzpool libzutil -# These four libraries, which are installed as the final build product, -# incorporate the six convenience libraries given above. -SUBDIRS += libuutil libnvpair libzpool libzfs_core libzfs +if BUILD_LINUX +SUBDIRS += libefi +CPPCHECKDIRS += libefi +endif + +# libnvpair is installed as part of the final build product +# libzutil depends on it, so it must be compiled before libzutil +SUBDIRS += libnvpair + +# libzutil depends on libefi if present +SUBDIRS += libzutil libunicode + +# These five libraries, which are installed as the final build product, +# incorporate the eight convenience libraries given above. +DISTLIBS = libuutil libzfs_core libzfs libzpool libzfsbootenv +SUBDIRS += $(DISTLIBS) +DISTLIBS += libnvpair + +# An ABI is stored for each of these libraries. 
Note that libzpool.so +# is only linked against by ztest and zdb and no stable ABI is provided. +ABILIBS = libnvpair libuutil libzfs_core libzfs libzfsbootenv + +PHONY = checkabi storeabi cppcheck +checkabi: $(ABILIBS) + set -e ; for dir in $(ABILIBS) ; do \ + $(MAKE) -C $$dir checkabi ; \ + done + +storeabi: $(ABILIBS) + set -e ; for dir in $(ABILIBS) ; do \ + $(MAKE) -C $$dir storeabi ; \ + done + +cppcheck: $(CPPCHECKDIRS) + set -e ; for dir in $(CPPCHECKDIRS) ; do \ + $(MAKE) -C $$dir cppcheck ; \ + done diff --git a/lib/libavl/Makefile.am b/lib/libavl/Makefile.am index 82b30bd80f..3166febd02 100644 --- a/lib/libavl/Makefile.am +++ b/lib/libavl/Makefile.am @@ -4,20 +4,14 @@ VPATH = $(top_srcdir)/module/avl/ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CFLAGS += -fvisibility=hidden noinst_LTLIBRARIES = libavl.la -USER_C = - KERNEL_C = \ avl.c nodist_libavl_la_SOURCES = \ - $(USER_C) \ $(KERNEL_C) -EXTRA_DIST = $(USER_C) +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libefi/Makefile.am b/lib/libefi/Makefile.am index 9f69e46014..580319a314 100644 --- a/lib/libefi/Makefile.am +++ b/lib/libefi/Makefile.am @@ -1,16 +1,15 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CFLAGS += $(LIBUUID_CFLAGS) $(ZLIB_CFLAGS) +AM_CFLAGS += -fvisibility=hidden noinst_LTLIBRARIES = libefi.la USER_C = \ rdwr_efi.c -nodist_libefi_la_SOURCES = $(USER_C) +libefi_la_SOURCES = $(USER_C) -libefi_la_LIBADD = $(LIBUUID) +libefi_la_LIBADD = $(LIBUUID_LIBS) $(ZLIB_LIBS) -EXTRA_DIST = $(USER_C) +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index 1d8f631c83..fd243e230e 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -42,7 +42,9 @@ #include #include #include +#include #include +#include 
static struct uuid_to_ptag { struct uuid uuid; @@ -138,40 +140,6 @@ static struct uuid_to_ptag { { EFI_FREEDESKTOP_BOOT } }; -/* - * Default vtoc information for non-SVr4 partitions - */ -struct dk_map2 default_vtoc_map[NDKMAP] = { - { V_ROOT, 0 }, /* a - 0 */ - { V_SWAP, V_UNMNT }, /* b - 1 */ - { V_BACKUP, V_UNMNT }, /* c - 2 */ - { V_UNASSIGNED, 0 }, /* d - 3 */ - { V_UNASSIGNED, 0 }, /* e - 4 */ - { V_UNASSIGNED, 0 }, /* f - 5 */ - { V_USR, 0 }, /* g - 6 */ - { V_UNASSIGNED, 0 }, /* h - 7 */ - -#if defined(_SUNOS_VTOC_16) - -#if defined(i386) || defined(__amd64) || defined(__arm) || \ - defined(__powerpc) || defined(__sparc) || defined(__s390__) || \ - defined(__mips__) - { V_BOOT, V_UNMNT }, /* i - 8 */ - { V_ALTSCTR, 0 }, /* j - 9 */ - -#else -#error No VTOC format defined. -#endif /* defined(i386) */ - - { V_UNASSIGNED, 0 }, /* k - 10 */ - { V_UNASSIGNED, 0 }, /* l - 11 */ - { V_UNASSIGNED, 0 }, /* m - 12 */ - { V_UNASSIGNED, 0 }, /* n - 13 */ - { V_UNASSIGNED, 0 }, /* o - 14 */ - { V_UNASSIGNED, 0 }, /* p - 15 */ -#endif /* defined(_SUNOS_VTOC_16) */ -}; - int efi_debug = 0; static int efi_read(int, struct dk_gpt *); @@ -208,36 +176,44 @@ read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) return (0); } +/* + * Return back the device name associated with the file descriptor. The + * caller is responsible for freeing the memory associated with the + * returned string. + */ +static char * +efi_get_devname(int fd) +{ + char path[32]; + + /* + * The libefi API only provides the open fd and not the file path. + * To handle this realpath(3) is used to resolve the block device + * name from /proc/self/fd/. 
+ */ + (void) snprintf(path, sizeof (path), "/proc/self/fd/%d", fd); + return (realpath(path, NULL)); +} + static int efi_get_info(int fd, struct dk_cinfo *dki_info) { - char *path; char *dev_path; int rval = 0; memset(dki_info, 0, sizeof (*dki_info)); - path = calloc(1, PATH_MAX); - if (path == NULL) - goto error; - /* * The simplest way to get the partition number under linux is - * to parse it out of the /dev/ block device name. + * to parse it out of the /dev/ block device name. * The kernel creates this using the partition number when it * populates /dev/ so it may be trusted. The tricky bit here is * that the naming convention is based on the block device type. * So we need to take this in to account when parsing out the - * partition information. Another issue is that the libefi API - * API only provides the open fd and not the file path. To handle - * this realpath(3) is used to resolve the block device name from - * /proc/self/fd/. Aside from the partition number we collect + * partition information. Aside from the partition number we collect * some additional device info. 
*/ - (void) sprintf(path, "/proc/self/fd/%d", fd); - dev_path = realpath(path, NULL); - free(path); - + dev_path = efi_get_devname(fd); if (dev_path == NULL) goto error; @@ -398,10 +374,11 @@ efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc) length = sizeof (struct dk_gpt) + sizeof (struct dk_part) * (nparts - 1); - if ((*vtoc = calloc(1, length)) == NULL) + vptr = calloc(1, length); + if (vptr == NULL) return (-1); - vptr = *vtoc; + *vtoc = vptr; vptr->efi_version = EFI_VERSION_CURRENT; vptr->efi_lbasize = lbsize; @@ -430,30 +407,33 @@ efi_alloc_and_read(int fd, struct dk_gpt **vtoc) int rval; uint32_t nparts; int length; + struct dk_gpt *vptr; /* figure out the number of entries that would fit into 16K */ nparts = EFI_MIN_ARRAY_SIZE / sizeof (efi_gpe_t); length = (int) sizeof (struct dk_gpt) + (int) sizeof (struct dk_part) * (nparts - 1); - if ((*vtoc = calloc(1, length)) == NULL) + vptr = calloc(1, length); + + if (vptr == NULL) return (VT_ERROR); - (*vtoc)->efi_nparts = nparts; - rval = efi_read(fd, *vtoc); + vptr->efi_nparts = nparts; + rval = efi_read(fd, vptr); - if ((rval == VT_EINVAL) && (*vtoc)->efi_nparts > nparts) { + if ((rval == VT_EINVAL) && vptr->efi_nparts > nparts) { void *tmp; length = (int) sizeof (struct dk_gpt) + - (int) sizeof (struct dk_part) * - ((*vtoc)->efi_nparts - 1); - nparts = (*vtoc)->efi_nparts; - if ((tmp = realloc(*vtoc, length)) == NULL) { - free (*vtoc); + (int) sizeof (struct dk_part) * (vptr->efi_nparts - 1); + nparts = vptr->efi_nparts; + if ((tmp = realloc(vptr, length)) == NULL) { + /* cppcheck-suppress doubleFree */ + free(vptr); *vtoc = NULL; return (VT_ERROR); } else { - *vtoc = tmp; - rval = efi_read(fd, *vtoc); + vptr = tmp; + rval = efi_read(fd, vptr); } } @@ -462,8 +442,10 @@ efi_alloc_and_read(int fd, struct dk_gpt **vtoc) (void) fprintf(stderr, "read of EFI table failed, rval=%d\n", rval); } - free (*vtoc); + free(vptr); *vtoc = NULL; + } else { + *vtoc = vptr; } return (rval); @@ -1102,18 +1084,49 
@@ check_input(struct dk_gpt *vtoc) return (0); } +static int +call_blkpg_ioctl(int fd, int command, diskaddr_t start, + diskaddr_t size, uint_t pno) +{ + struct blkpg_ioctl_arg ioctl_arg; + struct blkpg_partition linux_part; + memset(&linux_part, 0, sizeof (linux_part)); + + char *path = efi_get_devname(fd); + if (path == NULL) { + (void) fprintf(stderr, "failed to retrieve device name\n"); + return (VT_EINVAL); + } + + linux_part.start = start; + linux_part.length = size; + linux_part.pno = pno; + snprintf(linux_part.devname, BLKPG_DEVNAMELTH - 1, "%s%u", path, pno); + linux_part.devname[BLKPG_DEVNAMELTH - 1] = '\0'; + free(path); + + ioctl_arg.op = command; + ioctl_arg.flags = 0; + ioctl_arg.datalen = sizeof (struct blkpg_partition); + ioctl_arg.data = &linux_part; + + return (ioctl(fd, BLKPG, &ioctl_arg)); +} + /* * add all the unallocated space to the current label */ int efi_use_whole_disk(int fd) { - struct dk_gpt *efi_label = NULL; - int rval; - int i; - uint_t resv_index = 0, data_index = 0; - diskaddr_t resv_start = 0, data_start = 0; - diskaddr_t difference; + struct dk_gpt *efi_label = NULL; + int rval; + int i; + uint_t resv_index = 0, data_index = 0; + diskaddr_t resv_start = 0, data_start = 0; + diskaddr_t data_size, limit, difference; + boolean_t sync_needed = B_FALSE; + uint_t nblocks; rval = efi_alloc_and_read(fd, &efi_label); if (rval < 0) { @@ -1122,23 +1135,6 @@ efi_use_whole_disk(int fd) return (rval); } - /* - * If alter_lba is 1, we are using the backup label. - * Since we can locate the backup label by disk capacity, - * there must be no unallocated space. - */ - if ((efi_label->efi_altern_lba == 1) || (efi_label->efi_altern_lba - >= efi_label->efi_last_lba)) { - if (efi_debug) { - (void) fprintf(stderr, - "efi_use_whole_disk: requested space not found\n"); - } - efi_free(efi_label); - return (VT_ENOSPC); - } - - difference = efi_label->efi_last_lba - efi_label->efi_altern_lba; - /* * Find the last physically non-zero partition. 
* This should be the reserved partition. @@ -1150,11 +1146,69 @@ efi_use_whole_disk(int fd) } } + /* + * Find the last physically non-zero partition before that. + * This is the data partition. + */ + for (i = 0; i < resv_index; i ++) { + if (data_start < efi_label->efi_parts[i].p_start) { + data_start = efi_label->efi_parts[i].p_start; + data_index = i; + } + } + data_size = efi_label->efi_parts[data_index].p_size; + + /* + * See the "efi_alloc_and_init" function for more information + * about where this "nblocks" value comes from. + */ + nblocks = efi_label->efi_first_u_lba - 1; + + /* + * Determine if the EFI label is out of sync. We check that: + * + * 1. the data partition ends at the limit we set, and + * 2. the reserved partition starts at the limit we set. + * + * If either of these conditions is not met, then we need to + * resync the EFI label. + * + * The limit is the last usable LBA, determined by the last LBA + * and the first usable LBA fields on the EFI label of the disk + * (see the lines directly above). Additionally, we factor in + * EFI_MIN_RESV_SIZE (per its use in "zpool_label_disk") and + * P2ALIGN it to ensure the partition boundaries are aligned + * (for performance reasons). The alignment should match the + * alignment used by the "zpool_label_disk" function. + */ + limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE, + PARTITION_END_ALIGNMENT); + if (data_start + data_size != limit || resv_start != limit) + sync_needed = B_TRUE; + + if (efi_debug && sync_needed) + (void) fprintf(stderr, "efi_use_whole_disk: sync needed\n"); + + /* + * If alter_lba is 1, we are using the backup label. + * Since we can locate the backup label by disk capacity, + * there must be no unallocated space. 
+ */ + if ((efi_label->efi_altern_lba == 1) || (efi_label->efi_altern_lba + >= efi_label->efi_last_lba && !sync_needed)) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: requested space not found\n"); + } + efi_free(efi_label); + return (VT_ENOSPC); + } + /* * Verify that we've found the reserved partition by checking * that it looks the way it did when we created it in zpool_label_disk. * If we've found the incorrect partition, then we know that this - * device was reformatted and no longer is soley used by ZFS. + * device was reformatted and no longer is solely used by ZFS. */ if ((efi_label->efi_parts[resv_index].p_size != EFI_MIN_RESV_SIZE) || (efi_label->efi_parts[resv_index].p_tag != V_RESERVED) || @@ -1167,17 +1221,36 @@ efi_use_whole_disk(int fd) return (VT_ENOSPC); } - /* - * Find the last physically non-zero partition before that. - * This is the data partition. - */ - for (i = 0; i < resv_index; i ++) { - if (data_start < efi_label->efi_parts[i].p_start) { - data_start = efi_label->efi_parts[i].p_start; - data_index = i; + if (data_start + data_size != resv_start) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: " + "data_start (%lli) + " + "data_size (%lli) != " + "resv_start (%lli)\n", + data_start, data_size, resv_start); } + + return (VT_EINVAL); } + if (limit < resv_start) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: " + "limit (%lli) < resv_start (%lli)\n", + limit, resv_start); + } + + return (VT_EINVAL); + } + + difference = limit - resv_start; + + if (efi_debug) + (void) fprintf(stderr, + "efi_use_whole_disk: difference is %lli\n", difference); + /* * Move the reserved partition. 
There is currently no data in * here except fabricated devids (which get generated via @@ -1185,23 +1258,76 @@ efi_use_whole_disk(int fd) */ efi_label->efi_parts[data_index].p_size += difference; efi_label->efi_parts[resv_index].p_start += difference; - efi_label->efi_last_u_lba += difference; + efi_label->efi_last_u_lba = efi_label->efi_last_lba - nblocks; - rval = efi_write(fd, efi_label); - if (rval < 0) { - if (efi_debug) { - (void) fprintf(stderr, - "efi_use_whole_disk:fail to write label, rval=%d\n", - rval); - } - efi_free(efi_label); - return (rval); + /* + * Rescanning the partition table in the kernel can result + * in the device links to be removed (see comment in vdev_disk_open). + * If BLKPG_RESIZE_PARTITION is available, then we can resize + * the partition table online and avoid having to remove the device + * links used by the pool. This provides a very deterministic + * approach to resizing devices and does not require any + * loops waiting for devices to reappear. + */ +#ifdef BLKPG_RESIZE_PARTITION + /* + * Delete the reserved partition since we're about to expand + * the data partition and it would overlap with the reserved + * partition. + * NOTE: The starting index for the ioctl is 1 while for the + * EFI partitions it's 0. For that reason we have to add one + * whenever we make an ioctl call. + */ + rval = call_blkpg_ioctl(fd, BLKPG_DEL_PARTITION, 0, 0, resv_index + 1); + if (rval != 0) + goto out; + + /* + * Expand the data partition + */ + rval = call_blkpg_ioctl(fd, BLKPG_RESIZE_PARTITION, + efi_label->efi_parts[data_index].p_start * efi_label->efi_lbasize, + efi_label->efi_parts[data_index].p_size * efi_label->efi_lbasize, + data_index + 1); + if (rval != 0) { + (void) fprintf(stderr, "Unable to resize data " + "partition: %d\n", rval); + /* + * Since we failed to resize, we need to reset the start + * of the reserve partition and re-create it. 
+ */ + efi_label->efi_parts[resv_index].p_start -= difference; } - efi_free(efi_label); - return (0); -} + /* + * Re-add the reserved partition. If we've expanded the data partition + * then we'll move the reserve partition to the end of the data + * partition. Otherwise, we'll recreate the partition in its original + * location. Note that we do this as best-effort and ignore any + * errors that may arise here. This will ensure that we finish writing + * the EFI label. + */ + (void) call_blkpg_ioctl(fd, BLKPG_ADD_PARTITION, + efi_label->efi_parts[resv_index].p_start * efi_label->efi_lbasize, + efi_label->efi_parts[resv_index].p_size * efi_label->efi_lbasize, + resv_index + 1); +#endif + /* + * We're now ready to write the EFI label. + */ + if (rval == 0) { + rval = efi_write(fd, efi_label); + if (rval < 0 && efi_debug) { + (void) fprintf(stderr, "efi_use_whole_disk:fail " + "to write label, rval=%d\n", rval); + } + } + +out: + efi_free(efi_label); + return (rval); +} /* * write EFI label and backup label @@ -1222,7 +1348,7 @@ efi_write(int fd, struct dk_gpt *vtoc) if ((rval = efi_get_info(fd, &dki_info)) != 0) return (rval); - /* check if we are dealing wih a metadevice */ + /* check if we are dealing with a metadevice */ if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && (strncmp(dki_info.dki_dname, "md", 3) == 0)) { md_flag = 1; @@ -1532,57 +1658,3 @@ efi_err_check(struct dk_gpt *vtoc) "no reserved partition found\n"); } } - -/* - * We need to get information necessary to construct a *new* efi - * label type - */ -int -efi_auto_sense(int fd, struct dk_gpt **vtoc) -{ - - int i; - - /* - * Now build the default partition table - */ - if (efi_alloc_and_init(fd, EFI_NUMPAR, vtoc) != 0) { - if (efi_debug) { - (void) fprintf(stderr, "efi_alloc_and_init failed.\n"); - } - return (-1); - } - - for (i = 0; i < MIN((*vtoc)->efi_nparts, V_NUMPAR); i++) { - (*vtoc)->efi_parts[i].p_tag = default_vtoc_map[i].p_tag; - (*vtoc)->efi_parts[i].p_flag = 
default_vtoc_map[i].p_flag; - (*vtoc)->efi_parts[i].p_start = 0; - (*vtoc)->efi_parts[i].p_size = 0; - } - /* - * Make constants first - * and variable partitions later - */ - - /* root partition - s0 128 MB */ - (*vtoc)->efi_parts[0].p_start = 34; - (*vtoc)->efi_parts[0].p_size = 262144; - - /* partition - s1 128 MB */ - (*vtoc)->efi_parts[1].p_start = 262178; - (*vtoc)->efi_parts[1].p_size = 262144; - - /* partition -s2 is NOT the Backup disk */ - (*vtoc)->efi_parts[2].p_tag = V_UNASSIGNED; - - /* partition -s6 /usr partition - HOG */ - (*vtoc)->efi_parts[6].p_start = 524322; - (*vtoc)->efi_parts[6].p_size = (*vtoc)->efi_last_u_lba - 524322 - - (1024 * 16); - - /* efi reserved partition - s9 16K */ - (*vtoc)->efi_parts[8].p_start = (*vtoc)->efi_last_u_lba - (1024 * 16); - (*vtoc)->efi_parts[8].p_size = (1024 * 16); - (*vtoc)->efi_parts[8].p_tag = V_RESERVED; - return (0); -} diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index e9f22cd707..0b87a988c0 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -7,38 +7,24 @@ VPATH = \ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/module/icp/include \ - -I$(top_srcdir)/lib/libspl/include - noinst_LTLIBRARIES = libicp.la -if TARGET_ASM_X86_64 +if TARGET_CPU_X86_64 ASM_SOURCES_C = asm-x86_64/aes/aeskey.c ASM_SOURCES_AS = \ asm-x86_64/aes/aes_amd64.S \ asm-x86_64/aes/aes_aesni.S \ asm-x86_64/modes/gcm_pclmulqdq.S \ + asm-x86_64/modes/aesni-gcm-x86_64.S \ + asm-x86_64/modes/ghash-x86_64.S \ asm-x86_64/sha1/sha1-x86_64.S \ asm-x86_64/sha2/sha256_impl.S \ asm-x86_64/sha2/sha512_impl.S -endif - -if TARGET_ASM_I386 +else ASM_SOURCES_C = ASM_SOURCES_AS = endif -if TARGET_ASM_GENERIC -ASM_SOURCES_C = -ASM_SOURCES_AS = -endif - -USER_C = - -USER_ASM = - KERNEL_C = \ spi/kcf_spi.c \ api/kcf_ctxops.c \ @@ -83,9 +69,7 @@ KERNEL_C = \ KERNEL_ASM = $(ASM_SOURCES_AS) nodist_libicp_la_SOURCES 
= \ - $(USER_C) \ - $(USER_ASM) \ $(KERNEL_C) \ $(KERNEL_ASM) -libicp_la_LIBADD = -lrt +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libnvpair/Makefile.am b/lib/libnvpair/Makefile.am index 6d59d7bfc6..7b9ebebe79 100644 --- a/lib/libnvpair/Makefile.am +++ b/lib/libnvpair/Makefile.am @@ -7,13 +7,12 @@ VPATH = \ # Includes kernel code, generate warnings for large stack frames # and required CFLAGS for libtirpc AM_CFLAGS += $(FRAME_LARGER_THAN) $(LIBTIRPC_CFLAGS) - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CFLAGS += -fvisibility=hidden lib_LTLIBRARIES = libnvpair.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ libnvpair.c \ libnvpair_json.c \ @@ -24,11 +23,26 @@ KERNEL_C = \ nvpair.c \ fnvpair.c +dist_libnvpair_la_SOURCES = \ + $(USER_C) + nodist_libnvpair_la_SOURCES = \ - $(USER_C) \ $(KERNEL_C) -libnvpair_la_LIBADD = $(LIBTIRPC_LIBS) -libnvpair_la_LDFLAGS = -version-info 1:1:0 +libnvpair_la_LIBADD = \ + $(abs_top_builddir)/lib/libspl/libspl_assert.la -EXTRA_DIST = $(USER_C) +libnvpair_la_LIBADD += $(LIBTIRPC_LIBS) $(LTLIBINTL) + +libnvpair_la_LDFLAGS = + +if !ASAN_ENABLED +libnvpair_la_LDFLAGS += -Wl,-z,defs +endif + +libnvpair_la_LDFLAGS += -version-info 3:0:0 + +include $(top_srcdir)/config/CppCheck.am + +# Library ABI +EXTRA_DIST = libnvpair.abi libnvpair.suppr diff --git a/lib/libnvpair/libnvpair.abi b/lib/libnvpair/libnvpair.abi new file mode 100644 index 0000000000..9c27d178dc --- /dev/null +++ b/lib/libnvpair/libnvpair.abi @@ -0,0 +1,2778 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libnvpair/libnvpair.c b/lib/libnvpair/libnvpair.c index 2e9ea1c174..fd43a44c1c 100644 --- a/lib/libnvpair/libnvpair.c +++ b/lib/libnvpair/libnvpair.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include "libnvpair.h" @@ -191,9 +190,9 @@ static int \ nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ nvlist_t *nvl, const char *name, vtype value) \ { \ + (void) private; \ + (void) nvl; \ FILE *fp = pctl->nvprt_fp; \ - NOTE(ARGUNUSED(private)) \ - NOTE(ARGUNUSED(nvl)) \ indent(pctl, 1); \ (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ (void) fprintf(fp, vfmt, (ptype)value); \ @@ -224,10 +223,10 @@ static int \ nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \ { \ + (void) private; \ + (void) nvl; \ FILE *fp = pctl->nvprt_fp; \ uint_t i; \ - 
NOTE(ARGUNUSED(private)) \ - NOTE(ARGUNUSED(nvl)) \ for (i = 0; i < count; i++) { \ if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \ indent(pctl, 1); \ diff --git a/lib/libnvpair/libnvpair.suppr b/lib/libnvpair/libnvpair.suppr new file mode 100644 index 0000000000..f4db8a49e4 --- /dev/null +++ b/lib/libnvpair/libnvpair.suppr @@ -0,0 +1,2 @@ +[suppress_type] + name = FILE* diff --git a/lib/libnvpair/libnvpair_json.c b/lib/libnvpair/libnvpair_json.c index 0b403f1af3..15b6f4afaf 100644 --- a/lib/libnvpair/libnvpair_json.c +++ b/lib/libnvpair/libnvpair_json.c @@ -54,6 +54,13 @@ nvlist_print_json_string(FILE *fp, const char *input) FPRINTF(fp, "\""); while ((sz = mbrtowc(&c, input, MB_CUR_MAX, &mbr)) > 0) { + if (sz == (size_t)-1 || sz == (size_t)-2) { + /* + * We last read an invalid multibyte character sequence, + * so return an error. + */ + return (-1); + } switch (c) { case '"': FPRINTF(fp, "\\\""); @@ -97,14 +104,6 @@ nvlist_print_json_string(FILE *fp, const char *input) input += sz; } - if (sz == (size_t)-1 || sz == (size_t)-2) { - /* - * We last read an invalid multibyte character sequence, - * so return an error. 
- */ - return (-1); - } - FPRINTF(fp, "\""); return (0); } @@ -303,7 +302,7 @@ nvlist_print_json(FILE *fp, nvlist_t *nvl) for (i = 0; i < valsz; i++) { if (i > 0) FPRINTF(fp, ","); - FPRINTF(fp, "%hd", val[i]); + FPRINTF(fp, "%hhd", val[i]); } FPRINTF(fp, "]"); break; diff --git a/lib/libshare/Makefile.am b/lib/libshare/Makefile.am index 462e333ffc..dff3e5382d 100644 --- a/lib/libshare/Makefile.am +++ b/lib/libshare/Makefile.am @@ -1,8 +1,8 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +DEFAULT_INCLUDES += -I$(srcdir) + +AM_CFLAGS += -fvisibility=hidden noinst_LTLIBRARIES = libshare.la @@ -11,9 +11,20 @@ USER_C = \ libshare.c \ nfs.c \ nfs.h \ - smb.c \ smb.h -nodist_libshare_la_SOURCES = $(USER_C) +if BUILD_LINUX +USER_C += \ + os/linux/nfs.c \ + os/linux/smb.c +endif -EXTRA_DIST = $(USER_C) +if BUILD_FREEBSD +USER_C += \ + os/freebsd/nfs.c \ + os/freebsd/smb.c +endif + +libshare_la_SOURCES = $(USER_C) + +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libshare/libshare.c b/lib/libshare/libshare.c index 0965911cf0..a228645fbf 100644 --- a/lib/libshare/libshare.c +++ b/lib/libshare/libshare.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2018, 2020 by Delphix. All rights reserved. 
*/ #include @@ -29,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -38,21 +40,9 @@ #include "nfs.h" #include "smb.h" -static sa_share_impl_t find_share(sa_handle_impl_t handle, - const char *sharepath); -static sa_share_impl_t alloc_share(const char *sharepath); +static sa_share_impl_t alloc_share(const char *zfsname, const char *path); static void free_share(sa_share_impl_t share); -static void parse_sharetab(sa_handle_impl_t impl_handle); -static int process_share(sa_handle_impl_t impl_handle, - sa_share_impl_t impl_share, char *pathname, char *resource, - char *fstype, char *options, char *description, - char *dataset, boolean_t from_sharetab); -static void update_sharetab(sa_handle_impl_t impl_handle); - -static int update_zfs_share(sa_share_impl_t impl_handle, const char *proto); -static int update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto); - static int fstypes_count; static sa_fstype_t *fstypes; @@ -78,28 +68,6 @@ register_fstype(const char *name, const sa_share_ops_t *ops) return (fstype); } -sa_handle_t -sa_init(int init_service) -{ - sa_handle_impl_t impl_handle; - - impl_handle = calloc(1, sizeof (struct sa_handle_impl)); - - if (impl_handle == NULL) - return (NULL); - - impl_handle->zfs_libhandle = libzfs_init(); - - if (impl_handle->zfs_libhandle != NULL) { - libzfs_print_on_error(impl_handle->zfs_libhandle, B_TRUE); - } - - parse_sharetab(impl_handle); - update_zfs_shares(impl_handle, NULL); - - return ((sa_handle_t)impl_handle); -} - __attribute__((constructor)) static void libshare_init(void) { @@ -107,448 +75,101 @@ libshare_init(void) libshare_smb_init(); } -static void -parse_sharetab(sa_handle_impl_t impl_handle) +int +sa_enable_share(const char *zfsname, const char *mountpoint, + const char *shareopts, char *protocol) { - FILE *fp; - char line[512]; - char *eol, *pathname, *resource, *fstype, *options, *description; - - fp = fopen(ZFS_SHARETAB, "r"); - - if (fp == NULL) - return; - - while 
(fgets(line, sizeof (line), fp) != NULL) { - eol = line + strlen(line) - 1; - - while (eol >= line) { - if (*eol != '\r' && *eol != '\n') - break; - - *eol = '\0'; - eol--; - } - - pathname = line; - - if ((resource = strchr(pathname, '\t')) == NULL) - continue; - - *resource = '\0'; - resource++; - - if ((fstype = strchr(resource, '\t')) == NULL) - continue; - - *fstype = '\0'; - fstype++; - - if ((options = strchr(fstype, '\t')) == NULL) - continue; - - *options = '\0'; - options++; - - if ((description = strchr(fstype, '\t')) != NULL) { - *description = '\0'; - description++; - } - - if (strcmp(resource, "-") == 0) - resource = NULL; - - (void) process_share(impl_handle, NULL, pathname, resource, - fstype, options, description, NULL, B_TRUE); - } - - fclose(fp); -} - -static void -update_sharetab(sa_handle_impl_t impl_handle) -{ - sa_share_impl_t impl_share; - int temp_fd; - FILE *temp_fp; - char tempfile[] = ZFS_SHARETAB".XXXXXX"; - sa_fstype_t *fstype; - const char *resource; - - if (mkdir("/etc/dfs", 0755) < 0 && errno != EEXIST) { - return; - } - - temp_fd = mkstemp(tempfile); - - if (temp_fd < 0) - return; - - temp_fp = fdopen(temp_fd, "w"); - - if (temp_fp == NULL) - return; - - impl_share = impl_handle->shares; - while (impl_share != NULL) { - fstype = fstypes; - while (fstype != NULL) { - if (FSINFO(impl_share, fstype)->active && - FSINFO(impl_share, fstype)->shareopts != NULL) { - resource = FSINFO(impl_share, fstype)->resource; - - if (resource == NULL) - resource = "-"; - - fprintf(temp_fp, "%s\t%s\t%s\t%s\n", - impl_share->sharepath, resource, - fstype->name, - FSINFO(impl_share, fstype)->shareopts); - } - - fstype = fstype->next; - } - - impl_share = impl_share->next; - } - - fflush(temp_fp); - fsync(temp_fd); - fclose(temp_fp); - - (void) rename(tempfile, ZFS_SHARETAB); -} - -typedef struct update_cookie_s { - sa_handle_impl_t handle; - const char *proto; -} update_cookie_t; - -static int -update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) -{ - 
update_cookie_t *udata = (update_cookie_t *)pcookie; - char mountpoint[ZFS_MAXPROPLEN]; - char shareopts[ZFS_MAXPROPLEN]; - char *dataset; - zfs_type_t type = zfs_get_type(zhp); - - if (type == ZFS_TYPE_FILESYSTEM && - zfs_iter_filesystems(zhp, update_zfs_shares_cb, pcookie) != 0) { - zfs_close(zhp); - return (1); - } - - if (type != ZFS_TYPE_FILESYSTEM) { - zfs_close(zhp); - return (0); - } - - if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, - sizeof (mountpoint), NULL, NULL, 0, B_FALSE) != 0) { - zfs_close(zhp); - return (0); - } - - dataset = (char *)zfs_get_name(zhp); - - if (dataset == NULL) { - zfs_close(zhp); - return (0); - } - - if (!zfs_is_mounted(zhp, NULL)) { - zfs_close(zhp); - return (0); - } - - if ((udata->proto == NULL || strcmp(udata->proto, "nfs") == 0) && - zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, - sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0 && - strcmp(shareopts, "off") != 0) { - (void) process_share(udata->handle, NULL, mountpoint, NULL, - "nfs", shareopts, NULL, dataset, B_FALSE); - } - - if ((udata->proto == NULL || strcmp(udata->proto, "smb") == 0) && - zfs_prop_get(zhp, ZFS_PROP_SHARESMB, shareopts, - sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0 && - strcmp(shareopts, "off") != 0) { - (void) process_share(udata->handle, NULL, mountpoint, NULL, - "smb", shareopts, NULL, dataset, B_FALSE); - } - - zfs_close(zhp); - - return (0); -} - -static int -update_zfs_share(sa_share_impl_t impl_share, const char *proto) -{ - sa_handle_impl_t impl_handle = impl_share->handle; - zfs_handle_t *zhp; - update_cookie_t udata; - - if (impl_handle->zfs_libhandle == NULL) - return (SA_SYSTEM_ERR); - - assert(impl_share->dataset != NULL); - - zhp = zfs_open(impl_share->handle->zfs_libhandle, impl_share->dataset, - ZFS_TYPE_FILESYSTEM); - - if (zhp == NULL) - return (SA_SYSTEM_ERR); - - udata.handle = impl_handle; - udata.proto = proto; - (void) update_zfs_shares_cb(zhp, &udata); - - return (SA_OK); -} - -static int 
-update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto) -{ - update_cookie_t udata; - - if (impl_handle->zfs_libhandle == NULL) - return (SA_SYSTEM_ERR); - - udata.handle = impl_handle; - udata.proto = proto; - (void) zfs_iter_root(impl_handle->zfs_libhandle, update_zfs_shares_cb, - &udata); - - return (SA_OK); -} - -static int -process_share(sa_handle_impl_t impl_handle, sa_share_impl_t impl_share, - char *pathname, char *resource, char *proto, - char *options, char *description, char *dataset, - boolean_t from_sharetab) -{ - struct stat statbuf; - int rc; - char *resource_dup = NULL, *dataset_dup = NULL; - boolean_t new_share; + int rc, ret = SA_OK; + boolean_t found_protocol = B_FALSE; sa_fstype_t *fstype; - new_share = B_FALSE; - + sa_share_impl_t impl_share = alloc_share(zfsname, mountpoint); if (impl_share == NULL) - impl_share = find_share(impl_handle, pathname); - - if (impl_share == NULL) { - if (lstat(pathname, &statbuf) != 0 || - !S_ISDIR(statbuf.st_mode)) - return (SA_BAD_PATH); - - impl_share = alloc_share(pathname); - - if (impl_share == NULL) { - rc = SA_NO_MEMORY; - goto err; - } - - new_share = B_TRUE; - } - - if (dataset != NULL) { - dataset_dup = strdup(dataset); - - if (dataset_dup == NULL) { - rc = SA_NO_MEMORY; - goto err; - } - } - - free(impl_share->dataset); - impl_share->dataset = dataset_dup; - - rc = SA_INVALID_PROTOCOL; + return (SA_NO_MEMORY); fstype = fstypes; while (fstype != NULL) { - if (strcmp(fstype->name, proto) == 0) { - if (resource != NULL) { - resource_dup = strdup(resource); - - if (resource_dup == NULL) { - rc = SA_NO_MEMORY; - goto err; - } - } - - free(FSINFO(impl_share, fstype)->resource); - FSINFO(impl_share, fstype)->resource = resource_dup; + if (strcmp(fstype->name, protocol) == 0) { rc = fstype->ops->update_shareopts(impl_share, - resource, options); + shareopts); + if (rc != SA_OK) + break; - if (rc == SA_OK && from_sharetab) - FSINFO(impl_share, fstype)->active = B_TRUE; + rc = 
fstype->ops->enable_share(impl_share); + if (rc != SA_OK) + ret = rc; - break; + found_protocol = B_TRUE; } fstype = fstype->next; } + free_share(impl_share); - if (rc != SA_OK) - goto err; + return (found_protocol ? ret : SA_INVALID_PROTOCOL); +} - if (new_share) { - impl_share->handle = impl_handle; +int +sa_disable_share(const char *mountpoint, char *protocol) +{ + int rc, ret = SA_OK; + boolean_t found_protocol = B_FALSE; + sa_fstype_t *fstype; - impl_share->next = impl_handle->shares; - impl_handle->shares = impl_share; + sa_share_impl_t impl_share = alloc_share(NULL, mountpoint); + if (impl_share == NULL) + return (SA_NO_MEMORY); + fstype = fstypes; + while (fstype != NULL) { + if (strcmp(fstype->name, protocol) == 0) { + + rc = fstype->ops->disable_share(impl_share); + if (rc != SA_OK) + ret = rc; + + found_protocol = B_TRUE; + } + + fstype = fstype->next; } + free_share(impl_share); -err: - if (rc != SA_OK) { - if (new_share) - free_share(impl_share); + return (found_protocol ? ret : SA_INVALID_PROTOCOL); +} + +boolean_t +sa_is_shared(const char *mountpoint, char *protocol) +{ + sa_fstype_t *fstype; + boolean_t ret = B_FALSE; + + /* guid value is not used */ + sa_share_impl_t impl_share = alloc_share(NULL, mountpoint); + if (impl_share == NULL) + return (B_FALSE); + + fstype = fstypes; + while (fstype != NULL) { + if (strcmp(fstype->name, protocol) == 0) { + ret = fstype->ops->is_shared(impl_share); + } + fstype = fstype->next; } - - return (rc); + free_share(impl_share); + return (ret); } void -sa_fini(sa_handle_t handle) +sa_commit_shares(const char *protocol) { - sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; - sa_share_impl_t impl_share, next; - sa_share_impl_t *pcurr; - - if (impl_handle == NULL) - return; - - /* - * clean up shares which don't have a non-NULL dataset property, - * which means they're in sharetab but we couldn't find their - * ZFS dataset. 
- */ - pcurr = &(impl_handle->shares); - impl_share = *pcurr; - while (impl_share != NULL) { - next = impl_share->next; - - if (impl_share->dataset == NULL) { - /* remove item from the linked list */ - *pcurr = next; - - sa_disable_share(impl_share, NULL); - - free_share(impl_share); - } else { - pcurr = &(impl_share->next); - } - - impl_share = next; - } - - update_sharetab(impl_handle); - - if (impl_handle->zfs_libhandle != NULL) - libzfs_fini(impl_handle->zfs_libhandle); - - impl_share = impl_handle->shares; - while (impl_share != NULL) { - next = impl_share->next; - free_share(impl_share); - impl_share = next; - } - - free(impl_handle); -} - -static sa_share_impl_t -find_share(sa_handle_impl_t impl_handle, const char *sharepath) -{ - sa_share_impl_t impl_share; - - impl_share = impl_handle->shares; - while (impl_share != NULL) { - if (strcmp(impl_share->sharepath, sharepath) == 0) { - break; - } - - impl_share = impl_share->next; - } - - return (impl_share); -} - -sa_share_t -sa_find_share(sa_handle_t handle, char *sharepath) -{ - return ((sa_share_t)find_share((sa_handle_impl_t)handle, sharepath)); -} - -int -sa_enable_share(sa_share_t share, char *protocol) -{ - sa_share_impl_t impl_share = (sa_share_impl_t)share; - int rc, ret = SA_OK; - boolean_t found_protocol = B_FALSE; - sa_fstype_t *fstype; - - fstype = fstypes; + sa_fstype_t *fstype = fstypes; while (fstype != NULL) { - if (protocol == NULL || strcmp(fstype->name, protocol) == 0) { - update_zfs_share(impl_share, fstype->name); - - rc = fstype->ops->enable_share(impl_share); - - if (rc != SA_OK) - ret = rc; - else - FSINFO(impl_share, fstype)->active = B_TRUE; - - found_protocol = B_TRUE; - } - + if (strcmp(fstype->name, protocol) == 0) + fstype->ops->commit_shares(); fstype = fstype->next; } - - update_sharetab(impl_share->handle); - - return (found_protocol ? 
ret : SA_INVALID_PROTOCOL); -} - -int -sa_disable_share(sa_share_t share, char *protocol) -{ - sa_share_impl_t impl_share = (sa_share_impl_t)share; - int rc, ret = SA_OK; - boolean_t found_protocol = B_FALSE; - sa_fstype_t *fstype; - - fstype = fstypes; - while (fstype != NULL) { - if (protocol == NULL || strcmp(fstype->name, protocol) == 0) { - rc = fstype->ops->disable_share(impl_share); - - if (rc == SA_OK) { - fstype->ops->clear_shareopts(impl_share); - - FSINFO(impl_share, fstype)->active = B_FALSE; - } else - ret = rc; - - found_protocol = B_TRUE; - } - - fstype = fstype->next; - } - - update_sharetab(impl_share->handle); - - return (found_protocol ? ret : SA_INVALID_PROTOCOL); } /* @@ -674,7 +295,7 @@ sa_errorstr(int err) } int -sa_parse_legacy_options(sa_group_t group, char *options, char *proto) +sa_validate_shareopts(char *options, char *proto) { sa_fstype_t *fstype; @@ -691,25 +312,8 @@ sa_parse_legacy_options(sa_group_t group, char *options, char *proto) return (SA_INVALID_PROTOCOL); } -boolean_t -sa_needs_refresh(sa_handle_t handle) -{ - return (B_TRUE); -} - -libzfs_handle_t * -sa_get_zfs_handle(sa_handle_t handle) -{ - sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; - - if (impl_handle == NULL) - return (NULL); - - return (impl_handle->zfs_libhandle); -} - static sa_share_impl_t -alloc_share(const char *sharepath) +alloc_share(const char *zfsname, const char *mountpoint) { sa_share_impl_t impl_share; @@ -718,17 +322,24 @@ alloc_share(const char *sharepath) if (impl_share == NULL) return (NULL); - impl_share->sharepath = strdup(sharepath); - - if (impl_share->sharepath == NULL) { + if (mountpoint != NULL && + ((impl_share->sa_mountpoint = strdup(mountpoint)) == NULL)) { free(impl_share); return (NULL); } - impl_share->fsinfo = calloc(fstypes_count, sizeof (sa_share_fsinfo_t)); + if (zfsname != NULL && + ((impl_share->sa_zfsname = strdup(zfsname)) == NULL)) { + free(impl_share->sa_mountpoint); + free(impl_share); + return (NULL); + } - if 
(impl_share->fsinfo == NULL) { - free(impl_share->sharepath); + impl_share->sa_fsinfo = calloc(fstypes_count, + sizeof (sa_share_fsinfo_t)); + if (impl_share->sa_fsinfo == NULL) { + free(impl_share->sa_mountpoint); + free(impl_share->sa_zfsname); free(impl_share); return (NULL); } @@ -744,34 +355,11 @@ free_share(sa_share_impl_t impl_share) fstype = fstypes; while (fstype != NULL) { fstype->ops->clear_shareopts(impl_share); - - free(FSINFO(impl_share, fstype)->resource); - fstype = fstype->next; } - free(impl_share->sharepath); - free(impl_share->dataset); - free(impl_share->fsinfo); + free(impl_share->sa_mountpoint); + free(impl_share->sa_zfsname); + free(impl_share->sa_fsinfo); free(impl_share); } - -int -sa_zfs_process_share(sa_handle_t handle, sa_group_t group, sa_share_t share, - char *mountpoint, char *proto, zprop_source_t source, char *shareopts, - char *sourcestr, char *dataset) -{ - sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; - sa_share_impl_t impl_share = (sa_share_impl_t)share; - - return (process_share(impl_handle, impl_share, mountpoint, NULL, - proto, shareopts, NULL, dataset, B_FALSE)); -} - -void -sa_update_sharetab_ts(sa_handle_t handle) -{ - sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; - - update_sharetab(impl_handle); -} diff --git a/lib/libshare/libshare_impl.h b/lib/libshare/libshare_impl.h index 18d619b107..63a6907539 100644 --- a/lib/libshare/libshare_impl.h +++ b/lib/libshare/libshare_impl.h @@ -22,36 +22,34 @@ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. 
*/ - -struct sa_handle_impl; +#ifndef _LIBSPL_LIBSHARE_IMPL_H +#define _LIBSPL_LIBSHARE_IMPL_H typedef struct sa_share_fsinfo { - boolean_t active; - char *resource; char *shareopts; } sa_share_fsinfo_t; typedef struct sa_share_impl { - struct sa_share_impl *next; + char *sa_mountpoint; + char *sa_zfsname; - struct sa_handle_impl *handle; - - char *sharepath; - char *dataset; - - sa_share_fsinfo_t *fsinfo; /* per-fstype information */ + sa_share_fsinfo_t *sa_fsinfo; /* per-fstype information */ } *sa_share_impl_t; -#define FSINFO(impl_share, fstype) (&(impl_share->fsinfo[fstype->fsinfo_index])) +#define FSINFO(impl_share, fstype) \ + (&(impl_share->sa_fsinfo[fstype->fsinfo_index])) typedef struct sa_share_ops { int (*enable_share)(sa_share_impl_t share); int (*disable_share)(sa_share_impl_t share); + boolean_t (*is_shared)(sa_share_impl_t share); int (*validate_shareopts)(const char *shareopts); int (*update_shareopts)(sa_share_impl_t impl_share, - const char *resource, const char *shareopts); + const char *shareopts); void (*clear_shareopts)(sa_share_impl_t impl_share); + int (*commit_shares)(void); } sa_share_ops_t; typedef struct sa_fstype { @@ -62,9 +60,6 @@ typedef struct sa_fstype { int fsinfo_index; } sa_fstype_t; -typedef struct sa_handle_impl { - libzfs_handle_t *zfs_libhandle; - sa_share_impl_t shares; -} *sa_handle_impl_t; - sa_fstype_t *register_fstype(const char *name, const sa_share_ops_t *ops); + +#endif /* _LIBSPL_LIBSHARE_IMPL_H */ diff --git a/lib/libshare/nfs.c b/lib/libshare/nfs.c index 5c8976e15a..e339ebc81f 100644 --- a/lib/libshare/nfs.c +++ b/lib/libshare/nfs.c @@ -19,728 +19,139 @@ * CDDL HEADER END */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Gunnar Beutner - * Copyright (c) 2012 Cyril Plisko. All rights reserved. 
- */ -#include -#include -#include +#include +#include +#include #include +#include #include -#include -#include -#include #include -#include "libshare_impl.h" +#include "nfs.h" -static boolean_t nfs_available(void); -static sa_fstype_t *nfs_fstype; +static int nfs_lock_fd = -1; + /* - * nfs_exportfs_temp_fd refers to a temporary copy of the output - * from exportfs -v. - */ -static int nfs_exportfs_temp_fd = -1; - -typedef int (*nfs_shareopt_callback_t)(const char *opt, const char *value, - void *cookie); - -typedef int (*nfs_host_callback_t)(const char *sharepath, const char *host, - const char *security, const char *access, void *cookie); - -/* - * Invokes the specified callback function for each Solaris share option - * listed in the specified string. + * nfs_exports_[lock|unlock] are used to guard against conconcurrent + * updates to the exports file. Each protocol is responsible for + * providing the necessary locking to ensure consistency. */ static int -foreach_nfs_shareopt(const char *shareopts, - nfs_shareopt_callback_t callback, void *cookie) +nfs_exports_lock(const char *name) { - char *shareopts_dup, *opt, *cur, *value; - int was_nul, rc; + int err; - if (shareopts == NULL) - return (SA_OK); - - shareopts_dup = strdup(shareopts); - - if (shareopts_dup == NULL) - return (SA_NO_MEMORY); - - opt = shareopts_dup; - was_nul = 0; - - while (1) { - cur = opt; - - while (*cur != ',' && *cur != '\0') - cur++; - - if (*cur == '\0') - was_nul = 1; - - *cur = '\0'; - - if (cur > opt) { - value = strchr(opt, '='); - - if (value != NULL) { - *value = '\0'; - value++; - } - - rc = callback(opt, value, cookie); - - if (rc != SA_OK) { - free(shareopts_dup); - return (rc); - } - } - - opt = cur + 1; - - if (was_nul) - break; + nfs_lock_fd = open(name, O_RDWR | O_CREAT | O_CLOEXEC, 0600); + if (nfs_lock_fd == -1) { + err = errno; + fprintf(stderr, "failed to lock %s: %s\n", name, strerror(err)); + return (err); } - free(shareopts_dup); + if (flock(nfs_lock_fd, LOCK_EX) 
!= 0) { + err = errno; + fprintf(stderr, "failed to lock %s: %s\n", name, strerror(err)); + (void) close(nfs_lock_fd); + nfs_lock_fd = -1; + return (err); + } return (0); } -typedef struct nfs_host_cookie_s { - nfs_host_callback_t callback; - const char *sharepath; - void *cookie; - const char *security; -} nfs_host_cookie_t; - -/* - * Helper function for foreach_nfs_host. This function checks whether the - * current share option is a host specification and invokes a callback - * function with information about the host. - */ -static int -foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) -{ - int rc; - const char *access; - char *host_dup, *host, *next; - nfs_host_cookie_t *udata = (nfs_host_cookie_t *)pcookie; - -#ifdef DEBUG - fprintf(stderr, "foreach_nfs_host_cb: key=%s, value=%s\n", opt, value); -#endif - - if (strcmp(opt, "sec") == 0) - udata->security = value; - - if (strcmp(opt, "rw") == 0 || strcmp(opt, "ro") == 0) { - if (value == NULL) - value = "*"; - - access = opt; - - host_dup = strdup(value); - - if (host_dup == NULL) - return (SA_NO_MEMORY); - - host = host_dup; - - do { - next = strchr(host, ':'); - if (next != NULL) { - *next = '\0'; - next++; - } - - rc = udata->callback(udata->sharepath, host, - udata->security, access, udata->cookie); - - if (rc != SA_OK) { - free(host_dup); - - return (rc); - } - - host = next; - } while (host != NULL); - - free(host_dup); - } - - return (SA_OK); -} - -/* - * Invokes a callback function for all NFS hosts that are set for a share. 
- */ -static int -foreach_nfs_host(sa_share_impl_t impl_share, nfs_host_callback_t callback, - void *cookie) -{ - nfs_host_cookie_t udata; - char *shareopts; - - udata.callback = callback; - udata.sharepath = impl_share->sharepath; - udata.cookie = cookie; - udata.security = "sys"; - - shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; - - return foreach_nfs_shareopt(shareopts, foreach_nfs_host_cb, - &udata); -} - -/* - * Converts a Solaris NFS host specification to its Linux equivalent. - */ -static int -get_linux_hostspec(const char *solaris_hostspec, char **plinux_hostspec) -{ - /* - * For now we just support CIDR masks (e.g. @192.168.0.0/16) and host - * wildcards (e.g. *.example.org). - */ - if (solaris_hostspec[0] == '@') { - /* - * Solaris host specifier, e.g. @192.168.0.0/16; we just need - * to skip the @ in this case - */ - *plinux_hostspec = strdup(solaris_hostspec + 1); - } else { - *plinux_hostspec = strdup(solaris_hostspec); - } - - if (*plinux_hostspec == NULL) { - return (SA_NO_MEMORY); - } - - return (SA_OK); -} - -/* - * Used internally by nfs_enable_share to enable sharing for a single host. 
- */ -static int -nfs_enable_share_one(const char *sharepath, const char *host, - const char *security, const char *access, void *pcookie) -{ - int rc; - char *linuxhost, *hostpath, *opts; - const char *linux_opts = (const char *)pcookie; - char *argv[6]; - - /* exportfs -i -o sec=XX,rX, : */ - - rc = get_linux_hostspec(host, &linuxhost); - - if (rc < 0) - exit(1); - - hostpath = malloc(strlen(linuxhost) + 1 + strlen(sharepath) + 1); - - if (hostpath == NULL) { - free(linuxhost); - - exit(1); - } - - sprintf(hostpath, "%s:%s", linuxhost, sharepath); - - free(linuxhost); - - if (linux_opts == NULL) - linux_opts = ""; - - opts = malloc(4 + strlen(security) + 4 + strlen(linux_opts) + 1); - - if (opts == NULL) - exit(1); - - sprintf(opts, "sec=%s,%s,%s", security, access, linux_opts); - -#ifdef DEBUG - fprintf(stderr, "sharing %s with opts %s\n", hostpath, opts); -#endif - - argv[0] = "/usr/sbin/exportfs"; - argv[1] = "-i"; - argv[2] = "-o"; - argv[3] = opts; - argv[4] = hostpath; - argv[5] = NULL; - - rc = libzfs_run_process(argv[0], argv, 0); - - free(hostpath); - free(opts); - - if (rc < 0) - return (SA_SYSTEM_ERR); - else - return (SA_OK); -} - -/* - * Adds a Linux share option to an array of NFS options. - */ -static int -add_linux_shareopt(char **plinux_opts, const char *key, const char *value) -{ - size_t len = 0; - char *new_linux_opts; - - if (*plinux_opts != NULL) - len = strlen(*plinux_opts); - - new_linux_opts = realloc(*plinux_opts, len + 1 + strlen(key) + - (value ? 1 + strlen(value) : 0) + 1); - - if (new_linux_opts == NULL) - return (SA_NO_MEMORY); - - new_linux_opts[len] = '\0'; - - if (len > 0) - strcat(new_linux_opts, ","); - - strcat(new_linux_opts, key); - - if (value != NULL) { - strcat(new_linux_opts, "="); - strcat(new_linux_opts, value); - } - - *plinux_opts = new_linux_opts; - - return (SA_OK); -} - -/* - * Validates and converts a single Solaris share option to its Linux - * equivalent. 
- */ -static int -get_linux_shareopts_cb(const char *key, const char *value, void *cookie) -{ - char **plinux_opts = (char **)cookie; - - /* host-specific options, these are taken care of elsewhere */ - if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0 || - strcmp(key, "sec") == 0) - return (SA_OK); - - if (strcmp(key, "anon") == 0) - key = "anonuid"; - - if (strcmp(key, "root_mapping") == 0) { - (void) add_linux_shareopt(plinux_opts, "root_squash", NULL); - key = "anonuid"; - } - - if (strcmp(key, "nosub") == 0) - key = "subtree_check"; - - if (strcmp(key, "insecure") != 0 && strcmp(key, "secure") != 0 && - strcmp(key, "async") != 0 && strcmp(key, "sync") != 0 && - strcmp(key, "no_wdelay") != 0 && strcmp(key, "wdelay") != 0 && - strcmp(key, "nohide") != 0 && strcmp(key, "hide") != 0 && - strcmp(key, "crossmnt") != 0 && - strcmp(key, "no_subtree_check") != 0 && - strcmp(key, "subtree_check") != 0 && - strcmp(key, "insecure_locks") != 0 && - strcmp(key, "secure_locks") != 0 && - strcmp(key, "no_auth_nlm") != 0 && strcmp(key, "auth_nlm") != 0 && - strcmp(key, "no_acl") != 0 && strcmp(key, "mountpoint") != 0 && - strcmp(key, "mp") != 0 && strcmp(key, "fsuid") != 0 && - strcmp(key, "refer") != 0 && strcmp(key, "replicas") != 0 && - strcmp(key, "root_squash") != 0 && - strcmp(key, "no_root_squash") != 0 && - strcmp(key, "all_squash") != 0 && - strcmp(key, "no_all_squash") != 0 && strcmp(key, "fsid") != 0 && - strcmp(key, "anonuid") != 0 && strcmp(key, "anongid") != 0) { - return (SA_SYNTAX_ERR); - } - - (void) add_linux_shareopt(plinux_opts, key, value); - - return (SA_OK); -} - -/* - * Takes a string containing Solaris share options (e.g. "sync,no_acl") and - * converts them to a NULL-terminated array of Linux NFS options. 
- */ -static int -get_linux_shareopts(const char *shareopts, char **plinux_opts) -{ - int rc; - - assert(plinux_opts != NULL); - - *plinux_opts = NULL; - - /* default options for Solaris shares */ - (void) add_linux_shareopt(plinux_opts, "no_subtree_check", NULL); - (void) add_linux_shareopt(plinux_opts, "no_root_squash", NULL); - (void) add_linux_shareopt(plinux_opts, "mountpoint", NULL); - - rc = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, - plinux_opts); - - if (rc != SA_OK) { - free(*plinux_opts); - *plinux_opts = NULL; - } - - return (rc); -} - -/* - * Enables NFS sharing for the specified share. - */ -static int -nfs_enable_share(sa_share_impl_t impl_share) -{ - char *shareopts, *linux_opts; - int rc; - - if (!nfs_available()) { - return (SA_SYSTEM_ERR); - } - - shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; - - if (shareopts == NULL) - return (SA_OK); - - rc = get_linux_shareopts(shareopts, &linux_opts); - - if (rc != SA_OK) - return (rc); - - rc = foreach_nfs_host(impl_share, nfs_enable_share_one, linux_opts); - - free(linux_opts); - - return (rc); -} - -/* - * Used internally by nfs_disable_share to disable sharing for a single host. 
- */ -static int -nfs_disable_share_one(const char *sharepath, const char *host, - const char *security, const char *access, void *cookie) -{ - int rc; - char *linuxhost, *hostpath; - char *argv[4]; - - rc = get_linux_hostspec(host, &linuxhost); - - if (rc < 0) - exit(1); - - hostpath = malloc(strlen(linuxhost) + 1 + strlen(sharepath) + 1); - - if (hostpath == NULL) { - free(linuxhost); - exit(1); - } - - sprintf(hostpath, "%s:%s", linuxhost, sharepath); - - free(linuxhost); - -#ifdef DEBUG - fprintf(stderr, "unsharing %s\n", hostpath); -#endif - - argv[0] = "/usr/sbin/exportfs"; - argv[1] = "-u"; - argv[2] = hostpath; - argv[3] = NULL; - - rc = libzfs_run_process(argv[0], argv, 0); - - free(hostpath); - - if (rc < 0) - return (SA_SYSTEM_ERR); - else - return (SA_OK); -} - -/* - * Disables NFS sharing for the specified share. - */ -static int -nfs_disable_share(sa_share_impl_t impl_share) -{ - if (!nfs_available()) { - /* - * The share can't possibly be active, so nothing - * needs to be done to disable it. - */ - return (SA_OK); - } - - return (foreach_nfs_host(impl_share, nfs_disable_share_one, NULL)); -} - -/* - * Checks whether the specified NFS share options are syntactically correct. - */ -static int -nfs_validate_shareopts(const char *shareopts) -{ - char *linux_opts; - int rc; - - rc = get_linux_shareopts(shareopts, &linux_opts); - - if (rc != SA_OK) - return (rc); - - free(linux_opts); - - return (SA_OK); -} - -/* - * Checks whether a share is currently active. 
- */ -static boolean_t -nfs_is_share_active(sa_share_impl_t impl_share) -{ - int fd; - char line[512]; - char *tab, *cur; - FILE *nfs_exportfs_temp_fp; - - if (!nfs_available()) - return (B_FALSE); - - if ((fd = dup(nfs_exportfs_temp_fd)) == -1) - return (B_FALSE); - - nfs_exportfs_temp_fp = fdopen(fd, "r"); - - if (nfs_exportfs_temp_fp == NULL) - return (B_FALSE); - - if (fseek(nfs_exportfs_temp_fp, 0, SEEK_SET) < 0) { - fclose(nfs_exportfs_temp_fp); - return (B_FALSE); - } - - while (fgets(line, sizeof (line), nfs_exportfs_temp_fp) != NULL) { - /* - * exportfs uses separate lines for the share path - * and the export options when the share path is longer - * than a certain amount of characters; this ignores - * the option lines - */ - if (line[0] == '\t') - continue; - - tab = strchr(line, '\t'); - - if (tab != NULL) { - *tab = '\0'; - cur = tab - 1; - } else { - /* - * there's no tab character, which means the - * NFS options are on a separate line; we just - * need to remove the new-line character - * at the end of the line - */ - cur = line + strlen(line) - 1; - } - - /* remove trailing spaces and new-line characters */ - while (cur >= line && (*cur == ' ' || *cur == '\n')) - *cur-- = '\0'; - - if (strcmp(line, impl_share->sharepath) == 0) { - fclose(nfs_exportfs_temp_fp); - return (B_TRUE); - } - } - - fclose(nfs_exportfs_temp_fp); - - return (B_FALSE); -} - -/* - * Called to update a share's options. A share's options might be out of - * date if the share was loaded from disk (i.e. /etc/dfs/sharetab) and the - * "sharenfs" dataset property has changed in the meantime. This function - * also takes care of re-enabling the share if necessary. 
- */ -static int -nfs_update_shareopts(sa_share_impl_t impl_share, const char *resource, - const char *shareopts) -{ - char *shareopts_dup; - boolean_t needs_reshare = B_FALSE; - char *old_shareopts; - - FSINFO(impl_share, nfs_fstype)->active = - nfs_is_share_active(impl_share); - - old_shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; - - if (strcmp(shareopts, "on") == 0) - shareopts = "rw,crossmnt"; - - if (FSINFO(impl_share, nfs_fstype)->active && old_shareopts != NULL && - strcmp(old_shareopts, shareopts) != 0) { - needs_reshare = B_TRUE; - nfs_disable_share(impl_share); - } - - shareopts_dup = strdup(shareopts); - - if (shareopts_dup == NULL) - return (SA_NO_MEMORY); - - if (old_shareopts != NULL) - free(old_shareopts); - - FSINFO(impl_share, nfs_fstype)->shareopts = shareopts_dup; - - if (needs_reshare) - nfs_enable_share(impl_share); - - return (SA_OK); -} - -/* - * Clears a share's NFS options. Used by libshare to - * clean up shares that are about to be free()'d. - */ static void -nfs_clear_shareopts(sa_share_impl_t impl_share) +nfs_exports_unlock(const char *name) { - free(FSINFO(impl_share, nfs_fstype)->shareopts); - FSINFO(impl_share, nfs_fstype)->shareopts = NULL; + verify(nfs_lock_fd > 0); + + if (flock(nfs_lock_fd, LOCK_UN) != 0) { + fprintf(stderr, "failed to unlock %s: %s\n", + name, strerror(errno)); + } + + (void) close(nfs_lock_fd); + nfs_lock_fd = -1; } -static const sa_share_ops_t nfs_shareops = { - .enable_share = nfs_enable_share, - .disable_share = nfs_disable_share, +static char * +nfs_init_tmpfile(const char *prefix, const char *mdir) +{ + char *tmpfile = NULL; + struct stat sb; - .validate_shareopts = nfs_validate_shareopts, - .update_shareopts = nfs_update_shareopts, - .clear_shareopts = nfs_clear_shareopts, -}; + if (mdir != NULL && + stat(mdir, &sb) < 0 && + mkdir(mdir, 0755) < 0) { + fprintf(stderr, "failed to create %s: %s\n", + mdir, strerror(errno)); + return (NULL); + } + + if (asprintf(&tmpfile, "%s.XXXXXXXX", prefix) == 
-1) { + fprintf(stderr, "Unable to allocate temporary file\n"); + return (NULL); + } + + int fd = mkostemp(tmpfile, O_CLOEXEC); + if (fd == -1) { + fprintf(stderr, "Unable to create temporary file: %s", + strerror(errno)); + free(tmpfile); + return (NULL); + } + close(fd); + return (tmpfile); +} -/* - * nfs_check_exportfs() checks that the exportfs command runs - * and also maintains a temporary copy of the output from - * exportfs -v. - * To update this temporary copy simply call this function again. - * - * TODO : Use /var/lib/nfs/etab instead of our private copy. - * But must implement locking to prevent concurrent access. - * - * TODO : The temporary file descriptor is never closed since - * there is no libshare_nfs_fini() function. - */ static int -nfs_check_exportfs(void) +nfs_fini_tmpfile(const char *exports, char *tmpfile) { - pid_t pid; - int rc, status; - static char nfs_exportfs_tempfile[] = "/tmp/exportfs.XXXXXX"; - - /* - * Close any existing temporary copies of output from exportfs. - * We have already called unlink() so file will be deleted. 
- */ - if (nfs_exportfs_temp_fd >= 0) - close(nfs_exportfs_temp_fd); - - nfs_exportfs_temp_fd = mkstemp(nfs_exportfs_tempfile); - - if (nfs_exportfs_temp_fd < 0) - return (SA_SYSTEM_ERR); - - unlink(nfs_exportfs_tempfile); - - (void) fcntl(nfs_exportfs_temp_fd, F_SETFD, FD_CLOEXEC); - - pid = fork(); - - if (pid < 0) { - (void) close(nfs_exportfs_temp_fd); - nfs_exportfs_temp_fd = -1; + if (rename(tmpfile, exports) == -1) { + fprintf(stderr, "Unable to rename %s: %s\n", tmpfile, + strerror(errno)); + unlink(tmpfile); + free(tmpfile); return (SA_SYSTEM_ERR); } + free(tmpfile); + return (SA_OK); +} - if (pid > 0) { - while ((rc = waitpid(pid, &status, 0)) <= 0 && - errno == EINTR) { } +int +nfs_toggle_share(const char *lockfile, const char *exports, + const char *expdir, sa_share_impl_t impl_share, + int(*cbk)(sa_share_impl_t impl_share, char *filename)) +{ + int error; + char *filename; - if (rc <= 0) { - (void) close(nfs_exportfs_temp_fd); - nfs_exportfs_temp_fd = -1; - return (SA_SYSTEM_ERR); - } + if ((filename = nfs_init_tmpfile(exports, expdir)) == NULL) + return (SA_SYSTEM_ERR); - if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - (void) close(nfs_exportfs_temp_fd); - nfs_exportfs_temp_fd = -1; - return (SA_CONFIG_ERR); - } - - return (SA_OK); + error = nfs_exports_lock(lockfile); + if (error != 0) { + unlink(filename); + free(filename); + return (error); } - /* child */ + error = nfs_copy_entries(filename, impl_share->sa_mountpoint); + if (error != SA_OK) + goto fullerr; - /* exportfs -v */ + error = cbk(impl_share, filename); + if (error != SA_OK) + goto fullerr; - if (dup2(nfs_exportfs_temp_fd, STDOUT_FILENO) < 0) - exit(1); + error = nfs_fini_tmpfile(exports, filename); + nfs_exports_unlock(lockfile); + return (error); - rc = execlp("/usr/sbin/exportfs", "exportfs", "-v", NULL); - - if (rc < 0) { - exit(1); - } - - exit(0); -} - -/* - * Provides a convenient wrapper for determining nfs availability - */ -static boolean_t -nfs_available(void) -{ - if 
(nfs_exportfs_temp_fd == -1) - (void) nfs_check_exportfs(); - - return ((nfs_exportfs_temp_fd != -1) ? B_TRUE : B_FALSE); -} - -/* - * Initializes the NFS functionality of libshare. - */ -void -libshare_nfs_init(void) -{ - nfs_fstype = register_fstype("nfs", &nfs_shareops); +fullerr: + unlink(filename); + free(filename); + nfs_exports_unlock(lockfile); + return (error); } diff --git a/lib/libshare/nfs.h b/lib/libshare/nfs.h index b9ea6ee2f8..4dbcdf5985 100644 --- a/lib/libshare/nfs.h +++ b/lib/libshare/nfs.h @@ -24,4 +24,13 @@ * Copyright (c) 2011 Gunnar Beutner */ +#include "libshare_impl.h" + +#define FILE_HEADER "# !!! DO NOT EDIT THIS FILE MANUALLY !!!\n\n" + void libshare_nfs_init(void); + +int nfs_copy_entries(char *filename, const char *mountpoint); +int nfs_toggle_share(const char *lockfile, const char *exports, + const char *expdir, sa_share_impl_t impl_share, + int(*cbk)(sa_share_impl_t impl_share, char *filename)); diff --git a/lib/libshare/os/freebsd/nfs.c b/lib/libshare/os/freebsd/nfs.c new file mode 100644 index 0000000000..0041bc228b --- /dev/null +++ b/lib/libshare/os/freebsd/nfs.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "libshare_impl.h" +#include "nfs.h" + +#define _PATH_MOUNTDPID "/var/run/mountd.pid" +#define OPTSSIZE 1024 +#define MAXLINESIZE (PATH_MAX + OPTSSIZE) +#define ZFS_EXPORTS_FILE "/etc/zfs/exports" +#define ZFS_EXPORTS_LOCK ZFS_EXPORTS_FILE".lock" + +static sa_fstype_t *nfs_fstype; + +/* + * Read one line from a file. Skip comments, empty lines and a line with a + * mountpoint specified in the 'skip' argument. + * + * NOTE: This function returns a static buffer and thus is not thread-safe. + */ +static char * +zgetline(FILE *fd, const char *skip) +{ + static char line[MAXLINESIZE]; + size_t len, skiplen = 0; + char *s, last; + + if (skip != NULL) + skiplen = strlen(skip); + for (;;) { + s = fgets(line, sizeof (line), fd); + if (s == NULL) + return (NULL); + /* Skip empty lines and comments. */ + if (line[0] == '\n' || line[0] == '#') + continue; + len = strlen(line); + if (line[len - 1] == '\n') + line[len - 1] = '\0'; + last = line[skiplen]; + /* Skip the given mountpoint. 
*/ + if (skip != NULL && strncmp(skip, line, skiplen) == 0 && + (last == '\t' || last == ' ' || last == '\0')) { + continue; + } + break; + } + return (line); +} + +/* + * This function translate options to a format acceptable by exports(5), eg. + * + * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 \ + * zfs.freebsd.org 69.147.83.54 + * + * Accepted input formats: + * + * ro,network=192.168.0.0,mask=255.255.255.0,maproot=0,zfs.freebsd.org + * ro network=192.168.0.0 mask=255.255.255.0 maproot=0 zfs.freebsd.org + * -ro,-network=192.168.0.0,-mask=255.255.255.0,-maproot=0,zfs.freebsd.org + * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 \ + * zfs.freebsd.org + * + * Recognized keywords: + * + * ro, maproot, mapall, mask, network, sec, alldirs, public, webnfs, + * index, quiet + * + * NOTE: This function returns a static buffer and thus is not thread-safe. + */ +static char * +translate_opts(const char *shareopts) +{ + static const char *known_opts[] = { "ro", "maproot", "mapall", "mask", + "network", "sec", "alldirs", "public", "webnfs", "index", "quiet", + NULL }; + static char newopts[OPTSSIZE]; + char oldopts[OPTSSIZE]; + char *o, *s = NULL; + unsigned int i; + size_t len; + + strlcpy(oldopts, shareopts, sizeof (oldopts)); + newopts[0] = '\0'; + s = oldopts; + while ((o = strsep(&s, "-, ")) != NULL) { + if (o[0] == '\0') + continue; + for (i = 0; known_opts[i] != NULL; i++) { + len = strlen(known_opts[i]); + if (strncmp(known_opts[i], o, len) == 0 && + (o[len] == '\0' || o[len] == '=')) { + strlcat(newopts, "-", sizeof (newopts)); + break; + } + } + strlcat(newopts, o, sizeof (newopts)); + strlcat(newopts, " ", sizeof (newopts)); + } + return (newopts); +} + +/* + * This function copies all entries from the exports file to "filename", + * omitting any entries for the specified mountpoint. 
+ */ +int +nfs_copy_entries(char *filename, const char *mountpoint) +{ + int error = SA_OK; + char *line; + + FILE *oldfp = fopen(ZFS_EXPORTS_FILE, "re"); + FILE *newfp = fopen(filename, "w+e"); + if (newfp == NULL) { + fprintf(stderr, "failed to open %s file: %s", filename, + strerror(errno)); + fclose(oldfp); + return (SA_SYSTEM_ERR); + } + fputs(FILE_HEADER, newfp); + + /* + * The ZFS_EXPORTS_FILE may not exist yet. If that's the + * case then just write out the new file. + */ + if (oldfp != NULL) { + while ((line = zgetline(oldfp, mountpoint)) != NULL) + fprintf(newfp, "%s\n", line); + if (ferror(oldfp) != 0) { + error = ferror(oldfp); + } + if (fclose(oldfp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + error = error != 0 ? error : SA_SYSTEM_ERR; + } + } + + if (error == 0 && ferror(newfp) != 0) { + error = ferror(newfp); + } + + if (fclose(newfp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + error = error != 0 ? 
error : SA_SYSTEM_ERR; + } + return (error); +} + +static int +nfs_enable_share_impl(sa_share_impl_t impl_share, char *filename) +{ + FILE *fp = fopen(filename, "a+e"); + if (fp == NULL) { + fprintf(stderr, "failed to open %s file: %s", filename, + strerror(errno)); + return (SA_SYSTEM_ERR); + } + + char *shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; + if (strcmp(shareopts, "on") == 0) + shareopts = ""; + + if (fprintf(fp, "%s\t%s\n", impl_share->sa_mountpoint, + translate_opts(shareopts)) < 0) { + fprintf(stderr, "failed to write to %s\n", filename); + fclose(fp); + return (SA_SYSTEM_ERR); + } + + if (fclose(fp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + return (SA_SYSTEM_ERR); + } + + return (SA_OK); +} + +static int +nfs_enable_share(sa_share_impl_t impl_share) +{ + return (nfs_toggle_share( + ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, NULL, impl_share, + nfs_enable_share_impl)); +} + +static int +nfs_disable_share_impl(sa_share_impl_t impl_share, char *filename) +{ + return (SA_OK); +} + +static int +nfs_disable_share(sa_share_impl_t impl_share) +{ + return (nfs_toggle_share( + ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, NULL, impl_share, + nfs_disable_share_impl)); +} + +static boolean_t +nfs_is_shared(sa_share_impl_t impl_share) +{ + char *s, last, line[MAXLINESIZE]; + size_t len; + char *mntpoint = impl_share->sa_mountpoint; + size_t mntlen = strlen(mntpoint); + + FILE *fp = fopen(ZFS_EXPORTS_FILE, "re"); + if (fp == NULL) + return (B_FALSE); + + for (;;) { + s = fgets(line, sizeof (line), fp); + if (s == NULL) + return (B_FALSE); + /* Skip empty lines and comments. */ + if (line[0] == '\n' || line[0] == '#') + continue; + len = strlen(line); + if (line[len - 1] == '\n') + line[len - 1] = '\0'; + last = line[mntlen]; + /* Skip the given mountpoint. 
*/ + if (strncmp(mntpoint, line, mntlen) == 0 && + (last == '\t' || last == ' ' || last == '\0')) { + fclose(fp); + return (B_TRUE); + } + } + fclose(fp); + return (B_FALSE); +} + +static int +nfs_validate_shareopts(const char *shareopts) +{ + return (SA_OK); +} + +static int +nfs_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = (char *)shareopts; + return (SA_OK); +} + +static void +nfs_clear_shareopts(sa_share_impl_t impl_share) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = NULL; +} + +/* + * Commit the shares by restarting mountd. + */ +static int +nfs_commit_shares(void) +{ + struct pidfh *pfh; + pid_t mountdpid; + + pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &mountdpid); + if (pfh != NULL) { + /* Mountd is not running. */ + pidfile_remove(pfh); + return (SA_OK); + } + if (errno != EEXIST) { + /* Cannot open pidfile for some reason. */ + return (SA_SYSTEM_ERR); + } + /* We have mountd(8) PID in mountdpid variable. */ + kill(mountdpid, SIGHUP); + return (SA_OK); +} + +static const sa_share_ops_t nfs_shareops = { + .enable_share = nfs_enable_share, + .disable_share = nfs_disable_share, + .is_shared = nfs_is_shared, + + .validate_shareopts = nfs_validate_shareopts, + .update_shareopts = nfs_update_shareopts, + .clear_shareopts = nfs_clear_shareopts, + .commit_shares = nfs_commit_shares, +}; + +/* + * Initializes the NFS functionality of libshare. + */ +void +libshare_nfs_init(void) +{ + nfs_fstype = register_fstype("nfs", &nfs_shareops); +} diff --git a/lib/libshare/os/freebsd/smb.c b/lib/libshare/os/freebsd/smb.c new file mode 100644 index 0000000000..5b606ab969 --- /dev/null +++ b/lib/libshare/os/freebsd/smb.c @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libshare_impl.h" +#include "smb.h" + +static sa_fstype_t *smb_fstype; + +/* + * Enables SMB sharing for the specified share. + */ +static int +smb_enable_share(sa_share_impl_t impl_share) +{ + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (SA_NOT_SUPPORTED); +} +/* + * Disables SMB sharing for the specified share. + */ +static int +smb_disable_share(sa_share_impl_t impl_share) +{ + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (SA_NOT_SUPPORTED); +} + +/* + * Checks whether the specified SMB share options are syntactically correct. + */ +static int +smb_validate_shareopts(const char *shareopts) +{ + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (SA_NOT_SUPPORTED); +} + +/* + * Checks whether a share is currently active. + */ +static boolean_t +smb_is_share_active(sa_share_impl_t impl_share) +{ + return (B_FALSE); +} + +/* + * Called to update a share's options. A share's options might be out of + * date if the share was loaded from disk and the "sharesmb" dataset + * property has changed in the meantime. This function also takes care + * of re-enabling the share if necessary. 
+ */ +static int +smb_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) +{ + return (SA_OK); +} + +static int +smb_update_shares(void) +{ + /* Not implemented */ + return (0); +} +/* + * Clears a share's SMB options. Used by libshare to + * clean up shares that are about to be free()'d. + */ +static void +smb_clear_shareopts(sa_share_impl_t impl_share) +{ + FSINFO(impl_share, smb_fstype)->shareopts = NULL; +} + +static const sa_share_ops_t smb_shareops = { + .enable_share = smb_enable_share, + .disable_share = smb_disable_share, + .is_shared = smb_is_share_active, + + .validate_shareopts = smb_validate_shareopts, + .update_shareopts = smb_update_shareopts, + .clear_shareopts = smb_clear_shareopts, + .commit_shares = smb_update_shares, +}; + +/* + * Initializes the SMB functionality of libshare. + */ +void +libshare_smb_init(void) +{ + smb_fstype = register_fstype("smb", &smb_shareops); +} diff --git a/lib/libshare/os/linux/nfs.c b/lib/libshare/os/linux/nfs.c new file mode 100644 index 0000000000..4f754aabd3 --- /dev/null +++ b/lib/libshare/os/linux/nfs.c @@ -0,0 +1,643 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2012 Cyril Plisko. All rights reserved. + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libshare_impl.h" +#include "nfs.h" + +#define ZFS_EXPORTS_DIR "/etc/exports.d" +#define ZFS_EXPORTS_FILE ZFS_EXPORTS_DIR"/zfs.exports" +#define ZFS_EXPORTS_LOCK ZFS_EXPORTS_FILE".lock" + +static sa_fstype_t *nfs_fstype; + +typedef int (*nfs_shareopt_callback_t)(const char *opt, const char *value, + void *cookie); + +typedef int (*nfs_host_callback_t)(const char *sharepath, const char *filename, + const char *host, const char *security, const char *access, void *cookie); + +/* + * Invokes the specified callback function for each Solaris share option + * listed in the specified string. 
+ */ +static int +foreach_nfs_shareopt(const char *shareopts, + nfs_shareopt_callback_t callback, void *cookie) +{ + char *shareopts_dup, *opt, *cur, *value; + int was_nul, error; + + if (shareopts == NULL) + return (SA_OK); + + if (strcmp(shareopts, "on") == 0) + shareopts = "rw,crossmnt"; + + shareopts_dup = strdup(shareopts); + + + if (shareopts_dup == NULL) + return (SA_NO_MEMORY); + + opt = shareopts_dup; + was_nul = 0; + + while (1) { + cur = opt; + + while (*cur != ',' && *cur != '\0') + cur++; + + if (*cur == '\0') + was_nul = 1; + + *cur = '\0'; + + if (cur > opt) { + value = strchr(opt, '='); + + if (value != NULL) { + *value = '\0'; + value++; + } + + error = callback(opt, value, cookie); + + if (error != SA_OK) { + free(shareopts_dup); + return (error); + } + } + + opt = cur + 1; + + if (was_nul) + break; + } + + free(shareopts_dup); + + return (SA_OK); +} + +typedef struct nfs_host_cookie_s { + nfs_host_callback_t callback; + const char *sharepath; + void *cookie; + const char *filename; + const char *security; +} nfs_host_cookie_t; + +/* + * Helper function for foreach_nfs_host. This function checks whether the + * current share option is a host specification and invokes a callback + * function with information about the host. 
+ */ +static int +foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) +{ + int error; + const char *access; + char *host_dup, *host, *next, *v6Literal; + nfs_host_cookie_t *udata = (nfs_host_cookie_t *)pcookie; + int cidr_len; + +#ifdef DEBUG + fprintf(stderr, "foreach_nfs_host_cb: key=%s, value=%s\n", opt, value); +#endif + + if (strcmp(opt, "sec") == 0) + udata->security = value; + + if (strcmp(opt, "rw") == 0 || strcmp(opt, "ro") == 0) { + if (value == NULL) + value = "*"; + + access = opt; + + host_dup = strdup(value); + + if (host_dup == NULL) + return (SA_NO_MEMORY); + + host = host_dup; + + do { + if (*host == '[') { + host++; + v6Literal = strchr(host, ']'); + if (v6Literal == NULL) { + free(host_dup); + return (SA_SYNTAX_ERR); + } + if (v6Literal[1] == '\0') { + *v6Literal = '\0'; + next = NULL; + } else if (v6Literal[1] == '/') { + next = strchr(v6Literal + 2, ':'); + if (next == NULL) { + cidr_len = + strlen(v6Literal + 1); + memmove(v6Literal, + v6Literal + 1, + cidr_len); + v6Literal[cidr_len] = '\0'; + } else { + cidr_len = next - v6Literal - 1; + memmove(v6Literal, + v6Literal + 1, + cidr_len); + v6Literal[cidr_len] = '\0'; + next++; + } + } else if (v6Literal[1] == ':') { + *v6Literal = '\0'; + next = v6Literal + 2; + } else { + free(host_dup); + return (SA_SYNTAX_ERR); + } + } else { + next = strchr(host, ':'); + if (next != NULL) { + *next = '\0'; + next++; + } + } + + error = udata->callback(udata->filename, + udata->sharepath, host, udata->security, + access, udata->cookie); + + if (error != SA_OK) { + free(host_dup); + + return (error); + } + + host = next; + } while (host != NULL); + + free(host_dup); + } + + return (SA_OK); +} + +/* + * Invokes a callback function for all NFS hosts that are set for a share. 
+ */ +static int +foreach_nfs_host(sa_share_impl_t impl_share, char *filename, + nfs_host_callback_t callback, void *cookie) +{ + nfs_host_cookie_t udata; + char *shareopts; + + udata.callback = callback; + udata.sharepath = impl_share->sa_mountpoint; + udata.cookie = cookie; + udata.filename = filename; + udata.security = "sys"; + + shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; + + return (foreach_nfs_shareopt(shareopts, foreach_nfs_host_cb, + &udata)); +} + +/* + * Converts a Solaris NFS host specification to its Linux equivalent. + */ +static int +get_linux_hostspec(const char *solaris_hostspec, char **plinux_hostspec) +{ + /* + * For now we just support CIDR masks (e.g. @192.168.0.0/16) and host + * wildcards (e.g. *.example.org). + */ + if (solaris_hostspec[0] == '@') { + /* + * Solaris host specifier, e.g. @192.168.0.0/16; we just need + * to skip the @ in this case + */ + *plinux_hostspec = strdup(solaris_hostspec + 1); + } else { + *plinux_hostspec = strdup(solaris_hostspec); + } + + if (*plinux_hostspec == NULL) { + return (SA_NO_MEMORY); + } + + return (SA_OK); +} + +/* + * Adds a Linux share option to an array of NFS options. + */ +static int +add_linux_shareopt(char **plinux_opts, const char *key, const char *value) +{ + size_t len = 0; + char *new_linux_opts; + + if (*plinux_opts != NULL) + len = strlen(*plinux_opts); + + new_linux_opts = realloc(*plinux_opts, len + 1 + strlen(key) + + (value ? 1 + strlen(value) : 0) + 1); + + if (new_linux_opts == NULL) + return (SA_NO_MEMORY); + + new_linux_opts[len] = '\0'; + + if (len > 0) + strcat(new_linux_opts, ","); + + strcat(new_linux_opts, key); + + if (value != NULL) { + strcat(new_linux_opts, "="); + strcat(new_linux_opts, value); + } + + *plinux_opts = new_linux_opts; + + return (SA_OK); +} + +/* + * Validates and converts a single Solaris share option to its Linux + * equivalent. 
+ */
+static int
+get_linux_shareopts_cb(const char *key, const char *value, void *cookie)
+{
+	/* Options that are passed through to the Linux exports entry. */
+	static const char *const valid_keys[] = {
+		"insecure", "secure", "async", "sync",
+		"no_wdelay", "wdelay", "nohide", "hide",
+		"crossmnt", "no_subtree_check", "subtree_check",
+		"insecure_locks", "secure_locks",
+		"no_auth_nlm", "auth_nlm", "no_acl",
+		"mountpoint", "mp", "fsuid", "refer", "replicas",
+		"root_squash", "no_root_squash",
+		"all_squash", "no_all_squash",
+		"fsid", "anonuid", "anongid",
+	};
+	char **plinux_opts = (char **)cookie;
+	int error;
+	size_t i;
+
+	/* host-specific options, these are taken care of elsewhere */
+	if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0 ||
+	    strcmp(key, "sec") == 0)
+		return (SA_OK);
+
+	/* Map Solaris option names onto their Linux counterparts. */
+	if (strcmp(key, "anon") == 0)
+		key = "anonuid";
+
+	if (strcmp(key, "root_mapping") == 0) {
+		/* Do not lose an allocation failure. */
+		error = add_linux_shareopt(plinux_opts, "root_squash", NULL);
+		if (error != SA_OK)
+			return (error);
+		key = "anonuid";
+	}
+
+	if (strcmp(key, "nosub") == 0)
+		key = "subtree_check";
+
+	for (i = 0; i < sizeof (valid_keys) / sizeof (valid_keys[0]); i++) {
+		if (strcmp(key, valid_keys[i]) == 0)
+			return (add_linux_shareopt(plinux_opts, key, value));
+	}
+
+	/* Anything not listed above is rejected. */
+	return (SA_SYNTAX_ERR);
+}
+
+/*
+ * Takes a string containing Solaris share options (e.g. "sync,no_acl") and
+ * converts them to a NULL-terminated array of Linux NFS options.
+ */ +static int +get_linux_shareopts(const char *shareopts, char **plinux_opts) +{ + int error; + + assert(plinux_opts != NULL); + + *plinux_opts = NULL; + + /* no_subtree_check - Default as of nfs-utils v1.1.0 */ + (void) add_linux_shareopt(plinux_opts, "no_subtree_check", NULL); + + /* mountpoint - Restrict exports to ZFS mountpoints */ + (void) add_linux_shareopt(plinux_opts, "mountpoint", NULL); + + error = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, + plinux_opts); + + if (error != SA_OK) { + free(*plinux_opts); + *plinux_opts = NULL; + } + + return (error); +} + +/* + * This function populates an entry into /etc/exports.d/zfs.exports. + * This file is consumed by the linux nfs server so that zfs shares are + * automatically exported upon boot or whenever the nfs server restarts. + */ +static int +nfs_add_entry(const char *filename, const char *sharepath, + const char *host, const char *security, const char *access_opts, + void *pcookie) +{ + int error; + char *linuxhost; + const char *linux_opts = (const char *)pcookie; + + error = get_linux_hostspec(host, &linuxhost); + if (error != SA_OK) + return (error); + + if (linux_opts == NULL) + linux_opts = ""; + + FILE *fp = fopen(filename, "a+e"); + if (fp == NULL) { + fprintf(stderr, "failed to open %s file: %s", filename, + strerror(errno)); + free(linuxhost); + return (SA_SYSTEM_ERR); + } + + if (fprintf(fp, "%s %s(sec=%s,%s,%s)\n", sharepath, linuxhost, + security, access_opts, linux_opts) < 0) { + fprintf(stderr, "failed to write to %s\n", filename); + free(linuxhost); + fclose(fp); + return (SA_SYSTEM_ERR); + } + + free(linuxhost); + if (fclose(fp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + return (SA_SYSTEM_ERR); + } + return (SA_OK); +} + +/* + * This function copies all entries from the exports file to "filename", + * omitting any entries for the specified mountpoint. 
+ */
+int
+nfs_copy_entries(char *filename, const char *mountpoint)
+{
+	char *buf = NULL;
+	size_t buflen = 0;
+	size_t mountpoint_len = strlen(mountpoint);
+	int error = SA_OK;
+
+	FILE *oldfp = fopen(ZFS_EXPORTS_FILE, "re");
+	FILE *newfp = fopen(filename, "w+e");
+	if (newfp == NULL) {
+		fprintf(stderr, "failed to open %s file: %s\n", filename,
+		    strerror(errno));
+		/* The exports file may not exist; never fclose(NULL). */
+		if (oldfp != NULL)
+			(void) fclose(oldfp);
+		return (SA_SYSTEM_ERR);
+	}
+	fputs(FILE_HEADER, newfp);
+
+	/*
+	 * The ZFS_EXPORTS_FILE may not exist yet. If that's the
+	 * case then just write out the new file.
+	 */
+	if (oldfp != NULL) {
+		while (getline(&buf, &buflen, oldfp) != -1) {
+			char *space;
+
+			/* Blank lines and comments are not carried over. */
+			if (buf[0] == '\n' || buf[0] == '#')
+				continue;
+
+			/*
+			 * Drop the line when the text before its first
+			 * space is exactly the mountpoint.
+			 */
+			space = strchr(buf, ' ');
+			if (space != NULL &&
+			    (size_t)(space - buf) == mountpoint_len &&
+			    strncmp(mountpoint, buf, mountpoint_len) == 0)
+				continue;
+
+			fputs(buf, newfp);
+		}
+
+		/* Report stream errors as an SA_* code, not a raw flag. */
+		if (ferror(oldfp) != 0)
+			error = SA_SYSTEM_ERR;
+
+		if (fclose(oldfp) != 0) {
+			fprintf(stderr, "Unable to close file %s: %s\n",
+			    ZFS_EXPORTS_FILE, strerror(errno));
+			error = SA_SYSTEM_ERR;
+		}
+	}
+
+	if (error == SA_OK && ferror(newfp) != 0)
+		error = SA_SYSTEM_ERR;
+
+	free(buf);
+	if (fclose(newfp) != 0) {
+		fprintf(stderr, "Unable to close file %s: %s\n",
+		    filename, strerror(errno));
+		error = SA_SYSTEM_ERR;
+	}
+	return (error);
+}
+
+/*
+ * Enables NFS sharing for the specified share.
+ */ +static int +nfs_enable_share_impl(sa_share_impl_t impl_share, char *filename) +{ + char *shareopts, *linux_opts; + int error; + + shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; + error = get_linux_shareopts(shareopts, &linux_opts); + if (error != SA_OK) + return (error); + + error = foreach_nfs_host(impl_share, filename, nfs_add_entry, + linux_opts); + free(linux_opts); + return (error); +} + +static int +nfs_enable_share(sa_share_impl_t impl_share) +{ + return (nfs_toggle_share( + ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, ZFS_EXPORTS_DIR, impl_share, + nfs_enable_share_impl)); +} + +/* + * Disables NFS sharing for the specified share. + */ +static int +nfs_disable_share_impl(sa_share_impl_t impl_share, char *filename) +{ + return (SA_OK); +} + +static int +nfs_disable_share(sa_share_impl_t impl_share) +{ + return (nfs_toggle_share( + ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, ZFS_EXPORTS_DIR, impl_share, + nfs_disable_share_impl)); +} + +static boolean_t +nfs_is_shared(sa_share_impl_t impl_share) +{ + size_t buflen = 0; + char *buf = NULL; + + FILE *fp = fopen(ZFS_EXPORTS_FILE, "re"); + if (fp == NULL) { + return (B_FALSE); + } + while ((getline(&buf, &buflen, fp)) != -1) { + char *space = NULL; + + if ((space = strchr(buf, ' ')) != NULL) { + int mountpoint_len = strlen(impl_share->sa_mountpoint); + + if (space - buf == mountpoint_len && + strncmp(impl_share->sa_mountpoint, buf, + mountpoint_len) == 0) { + fclose(fp); + free(buf); + return (B_TRUE); + } + } + } + free(buf); + fclose(fp); + return (B_FALSE); +} + +/* + * Checks whether the specified NFS share options are syntactically correct. 
+ */ +static int +nfs_validate_shareopts(const char *shareopts) +{ + char *linux_opts; + int error; + + error = get_linux_shareopts(shareopts, &linux_opts); + + if (error != SA_OK) + return (error); + + free(linux_opts); + return (SA_OK); +} + +static int +nfs_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = (char *)shareopts; + return (SA_OK); +} + +/* + * Clears a share's NFS options. Used by libshare to + * clean up shares that are about to be free()'d. + */ +static void +nfs_clear_shareopts(sa_share_impl_t impl_share) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = NULL; +} + +static int +nfs_commit_shares(void) +{ + char *argv[] = { + "/usr/sbin/exportfs", + "-ra", + NULL + }; + + return (libzfs_run_process(argv[0], argv, 0)); +} + +static const sa_share_ops_t nfs_shareops = { + .enable_share = nfs_enable_share, + .disable_share = nfs_disable_share, + .is_shared = nfs_is_shared, + + .validate_shareopts = nfs_validate_shareopts, + .update_shareopts = nfs_update_shareopts, + .clear_shareopts = nfs_clear_shareopts, + .commit_shares = nfs_commit_shares, +}; + +/* + * Initializes the NFS functionality of libshare. + */ +void +libshare_nfs_init(void) +{ + nfs_fstype = register_fstype("nfs", &nfs_shareops); +} diff --git a/lib/libshare/smb.c b/lib/libshare/os/linux/smb.c similarity index 90% rename from lib/libshare/smb.c rename to lib/libshare/os/linux/smb.c index 4c2045dfdb..9b18848e09 100644 --- a/lib/libshare/smb.c +++ b/lib/libshare/os/linux/smb.c @@ -23,13 +23,14 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011,2012 Turbo Fredriksson , based on nfs.c * by Gunnar Beutner + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. * * This is an addition to the zfs device driver to add, modify and remove SMB * shares using the 'net share' command that comes with Samba. 
* * TESTING * Make sure that samba listens to 'localhost' (127.0.0.1) and that the options - * 'usershare max shares' and 'usershare owner only' have been rewied/set + * 'usershare max shares' and 'usershare owner only' have been reviewed/set * accordingly (see zfs(8) for information). * * Once configuration in samba have been done, test that this @@ -65,6 +66,10 @@ static boolean_t smb_available(void); static sa_fstype_t *smb_fstype; +smb_share_t *smb_shares; +static int smb_disable_share(sa_share_impl_t impl_share); +static boolean_t smb_is_share_active(sa_share_impl_t impl_share); + /* * Retrieve the list of SMB shares. */ @@ -102,7 +107,7 @@ smb_retrieve_shares(void) if (!S_ISREG(eStat.st_mode)) continue; - if ((share_file_fp = fopen(file_path, "r")) == NULL) { + if ((share_file_fp = fopen(file_path, "re")) == NULL) { rc = SA_SYSTEM_ERR; goto out; } @@ -273,6 +278,9 @@ smb_enable_share(sa_share_impl_t impl_share) if (!smb_available()) return (SA_SYSTEM_ERR); + if (smb_is_share_active(impl_share)) + smb_disable_share(impl_share); + shareopts = FSINFO(impl_share, smb_fstype)->shareopts; if (shareopts == NULL) /* on/off */ return (SA_SYSTEM_ERR); @@ -281,8 +289,8 @@ smb_enable_share(sa_share_impl_t impl_share) return (SA_OK); /* Magic: Enable (i.e., 'create new') share */ - return (smb_enable_share_one(impl_share->dataset, - impl_share->sharepath)); + return (smb_enable_share_one(impl_share->sa_zfsname, + impl_share->sa_mountpoint)); } /* @@ -300,7 +308,7 @@ smb_disable_share_one(const char *sharename) argv[2] = NET_CMD_ARG_HOST; argv[3] = (char *)"usershare"; argv[4] = (char *)"delete"; - argv[5] = strdup(sharename); + argv[5] = (char *)sharename; argv[6] = NULL; rc = libzfs_run_process(argv[0], argv, 0); @@ -327,7 +335,7 @@ smb_disable_share(sa_share_impl_t impl_share) } while (shares != NULL) { - if (strcmp(impl_share->sharepath, shares->path) == 0) + if (strcmp(impl_share->sa_mountpoint, shares->path) == 0) return (smb_disable_share_one(shares->name)); shares 
= shares->next; @@ -364,7 +372,7 @@ smb_is_share_active(sa_share_impl_t impl_share) smb_retrieve_shares(); while (iter != NULL) { - if (strcmp(impl_share->sharepath, iter->path) == 0) + if (strcmp(impl_share->sa_mountpoint, iter->path) == 0) return (B_TRUE); iter = iter->next; @@ -380,43 +388,22 @@ smb_is_share_active(sa_share_impl_t impl_share) * of re-enabling the share if necessary. */ static int -smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, - const char *shareopts) +smb_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) { - char *shareopts_dup; - boolean_t needs_reshare = B_FALSE; - char *old_shareopts; - if (!impl_share) return (SA_SYSTEM_ERR); - FSINFO(impl_share, smb_fstype)->active = - smb_is_share_active(impl_share); - - old_shareopts = FSINFO(impl_share, smb_fstype)->shareopts; - - if (FSINFO(impl_share, smb_fstype)->active && old_shareopts != NULL && - strcmp(old_shareopts, shareopts) != 0) { - needs_reshare = B_TRUE; - smb_disable_share(impl_share); - } - - shareopts_dup = strdup(shareopts); - - if (shareopts_dup == NULL) - return (SA_NO_MEMORY); - - if (old_shareopts != NULL) - free(old_shareopts); - - FSINFO(impl_share, smb_fstype)->shareopts = shareopts_dup; - - if (needs_reshare) - smb_enable_share(impl_share); - + FSINFO(impl_share, smb_fstype)->shareopts = (char *)shareopts; return (SA_OK); } +static int +smb_update_shares(void) +{ + /* Not implemented */ + return (0); +} + /* * Clears a share's SMB options. Used by libshare to * clean up shares that are about to be free()'d. 
@@ -424,17 +411,18 @@ smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, static void smb_clear_shareopts(sa_share_impl_t impl_share) { - free(FSINFO(impl_share, smb_fstype)->shareopts); FSINFO(impl_share, smb_fstype)->shareopts = NULL; } static const sa_share_ops_t smb_shareops = { .enable_share = smb_enable_share, .disable_share = smb_disable_share, + .is_shared = smb_is_share_active, .validate_shareopts = smb_validate_shareopts, .update_shareopts = smb_update_shareopts, .clear_shareopts = smb_clear_shareopts, + .commit_shares = smb_update_shares, }; /* diff --git a/lib/libshare/smb.h b/lib/libshare/smb.h index 7a0c0fd162..8ea44677f9 100644 --- a/lib/libshare/smb.h +++ b/lib/libshare/smb.h @@ -44,6 +44,6 @@ typedef struct smb_share_s { struct smb_share_s *next; } smb_share_t; -smb_share_t *smb_shares; +extern smb_share_t *smb_shares; void libshare_smb_init(void); diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index cd74676dd2..8457df6dcd 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -1,42 +1,47 @@ include $(top_srcdir)/config/Rules.am -VPATH = \ - $(top_srcdir)/lib/libspl \ - $(top_srcdir)/lib/libspl/$(TARGET_ASM_DIR) +SUBDIRS = include -SUBDIRS = include $(TARGET_ASM_DIR) -DIST_SUBDIRS = include asm-generic asm-i386 asm-x86_64 +noinst_LTLIBRARIES = libspl_assert.la libspl.la -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/lib/libspl/include - -AM_CFLAGS += $(LIBTIRPC_CFLAGS) - -AM_CCASFLAGS = \ - $(CFLAGS) - -noinst_LTLIBRARIES = libspl.la +libspl_assert_la_SOURCES = \ + assert.c USER_C = \ + libspl_impl.h \ + atomic.c \ getexecname.c \ - gethostid.c \ - getmntany.c \ list.c \ mkdirp.c \ page.c \ strlcat.c \ strlcpy.c \ timestamp.c \ - zone.c \ include/sys/list.h \ include/sys/list_impl.h -USER_ASM = atomic.S +if BUILD_LINUX +USER_C += \ + os/linux/getexecname.c \ + os/linux/gethostid.c \ + os/linux/getmntany.c \ + os/linux/zone.c +endif -nodist_libspl_la_SOURCES = \ - $(USER_C) \ - $(USER_ASM) +if BUILD_FREEBSD 
+USER_C += \ + os/freebsd/getexecname.c \ + os/freebsd/gethostid.c \ + os/freebsd/getmntany.c \ + os/freebsd/mnttab.c \ + os/freebsd/zone.c +endif -libspl_la_LIBADD = -lrt $(LIBTIRPC_LIBS) +libspl_la_SOURCES = $(USER_C) -EXTRA_DIST = $(USER_C) +libspl_la_LIBADD = \ + libspl_assert.la + +libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) + +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libspl/asm-generic/Makefile.am b/lib/libspl/asm-generic/Makefile.am deleted file mode 100644 index 17fe501fa1..0000000000 --- a/lib/libspl/asm-generic/Makefile.am +++ /dev/null @@ -1,18 +0,0 @@ -include $(top_srcdir)/config/Rules.am - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/lib/libspl/include - -atomic_SOURCE = atomic.c -atomic_ASM = atomic.S - -COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ - $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -fPIC -EXTRA_DIST = $(atomic_SOURCE) - -# Generates assembly to simplify inclusion in ../Makefile.am -all-am: - $(COMPILE) -c -S $(atomic_SOURCE) -o $(atomic_ASM) - -clean-generic: - $(RM) $(atomic_ASM) diff --git a/lib/libspl/asm-i386/Makefile.am b/lib/libspl/asm-i386/Makefile.am deleted file mode 100644 index e1126102f7..0000000000 --- a/lib/libspl/asm-i386/Makefile.am +++ /dev/null @@ -1 +0,0 @@ -noinst_HEADERS = atomic.S diff --git a/lib/libspl/asm-i386/atomic.S b/lib/libspl/asm-i386/atomic.S deleted file mode 100644 index 3086d55439..0000000000 --- a/lib/libspl/asm-i386/atomic.S +++ /dev/null @@ -1,836 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - .ident "%Z%%M% %I% %E% SMI" - - .file "%M%" - -#define _ASM -#include - - ENTRY(atomic_inc_8) - ALTENTRY(atomic_inc_uchar) - movl 4(%esp), %eax - lock - incb (%eax) - ret - SET_SIZE(atomic_inc_uchar) - SET_SIZE(atomic_inc_8) - - ENTRY(atomic_inc_16) - ALTENTRY(atomic_inc_ushort) - movl 4(%esp), %eax - lock - incw (%eax) - ret - SET_SIZE(atomic_inc_ushort) - SET_SIZE(atomic_inc_16) - - ENTRY(atomic_inc_32) - ALTENTRY(atomic_inc_uint) - ALTENTRY(atomic_inc_ulong) - movl 4(%esp), %eax - lock - incl (%eax) - ret - SET_SIZE(atomic_inc_ulong) - SET_SIZE(atomic_inc_uint) - SET_SIZE(atomic_inc_32) - - ENTRY(atomic_inc_8_nv) - ALTENTRY(atomic_inc_uchar_nv) - movl 4(%esp), %edx - movb (%edx), %al -1: - leal 1(%eax), %ecx - lock - cmpxchgb %cl, (%edx) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_inc_uchar_nv) - SET_SIZE(atomic_inc_8_nv) - - ENTRY(atomic_inc_16_nv) - ALTENTRY(atomic_inc_ushort_nv) - movl 4(%esp), %edx - movw (%edx), %ax -1: - leal 1(%eax), %ecx - lock - cmpxchgw %cx, (%edx) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_inc_ushort_nv) - SET_SIZE(atomic_inc_16_nv) - - ENTRY(atomic_inc_32_nv) - ALTENTRY(atomic_inc_uint_nv) - ALTENTRY(atomic_inc_ulong_nv) - movl 4(%esp), %edx - movl (%edx), %eax -1: - leal 1(%eax), %ecx - lock - cmpxchgl %ecx, (%edx) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_inc_ulong_nv) - SET_SIZE(atomic_inc_uint_nv) - SET_SIZE(atomic_inc_32_nv) - - /* - * NOTE: If atomic_inc_64 and atomic_inc_64_nv are ever - * separated, you need to 
also edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_inc_64_nv. - */ - ENTRY(atomic_inc_64) - ALTENTRY(atomic_inc_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi - movl (%edi), %eax - movl 4(%edi), %edx -1: - xorl %ebx, %ebx - xorl %ecx, %ecx - incl %ebx - addl %eax, %ebx - adcl %edx, %ecx - lock - cmpxchg8b (%edi) - jne 1b - movl %ebx, %eax - movl %ecx, %edx - popl %ebx - popl %edi - ret - SET_SIZE(atomic_inc_64_nv) - SET_SIZE(atomic_inc_64) - - ENTRY(atomic_dec_8) - ALTENTRY(atomic_dec_uchar) - movl 4(%esp), %eax - lock - decb (%eax) - ret - SET_SIZE(atomic_dec_uchar) - SET_SIZE(atomic_dec_8) - - ENTRY(atomic_dec_16) - ALTENTRY(atomic_dec_ushort) - movl 4(%esp), %eax - lock - decw (%eax) - ret - SET_SIZE(atomic_dec_ushort) - SET_SIZE(atomic_dec_16) - - ENTRY(atomic_dec_32) - ALTENTRY(atomic_dec_uint) - ALTENTRY(atomic_dec_ulong) - movl 4(%esp), %eax - lock - decl (%eax) - ret - SET_SIZE(atomic_dec_ulong) - SET_SIZE(atomic_dec_uint) - SET_SIZE(atomic_dec_32) - - ENTRY(atomic_dec_8_nv) - ALTENTRY(atomic_dec_uchar_nv) - movl 4(%esp), %edx - movb (%edx), %al -1: - leal -1(%eax), %ecx - lock - cmpxchgb %cl, (%edx) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_dec_uchar_nv) - SET_SIZE(atomic_dec_8_nv) - - ENTRY(atomic_dec_16_nv) - ALTENTRY(atomic_dec_ushort_nv) - movl 4(%esp), %edx - movw (%edx), %ax -1: - leal -1(%eax), %ecx - lock - cmpxchgw %cx, (%edx) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_dec_ushort_nv) - SET_SIZE(atomic_dec_16_nv) - - ENTRY(atomic_dec_32_nv) - ALTENTRY(atomic_dec_uint_nv) - ALTENTRY(atomic_dec_ulong_nv) - movl 4(%esp), %edx - movl (%edx), %eax -1: - leal -1(%eax), %ecx - lock - cmpxchgl %ecx, (%edx) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_dec_ulong_nv) - SET_SIZE(atomic_dec_uint_nv) - SET_SIZE(atomic_dec_32_nv) - - /* - * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and 
remove the NODYNSORT attribute - * from atomic_dec_64_nv. - */ - ENTRY(atomic_dec_64) - ALTENTRY(atomic_dec_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi - movl (%edi), %eax - movl 4(%edi), %edx -1: - xorl %ebx, %ebx - xorl %ecx, %ecx - not %ecx - not %ebx - addl %eax, %ebx - adcl %edx, %ecx - lock - cmpxchg8b (%edi) - jne 1b - movl %ebx, %eax - movl %ecx, %edx - popl %ebx - popl %edi - ret - SET_SIZE(atomic_dec_64_nv) - SET_SIZE(atomic_dec_64) - - ENTRY(atomic_add_8) - ALTENTRY(atomic_add_char) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - addb %cl, (%eax) - ret - SET_SIZE(atomic_add_char) - SET_SIZE(atomic_add_8) - - ENTRY(atomic_add_16) - ALTENTRY(atomic_add_short) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - addw %cx, (%eax) - ret - SET_SIZE(atomic_add_short) - SET_SIZE(atomic_add_16) - - ENTRY(atomic_add_32) - ALTENTRY(atomic_add_int) - ALTENTRY(atomic_add_ptr) - ALTENTRY(atomic_add_long) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - addl %ecx, (%eax) - ret - SET_SIZE(atomic_add_long) - SET_SIZE(atomic_add_ptr) - SET_SIZE(atomic_add_int) - SET_SIZE(atomic_add_32) - - ENTRY(atomic_sub_8) - ALTENTRY(atomic_sub_char) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - subb %cl, (%eax) - ret - SET_SIZE(atomic_sub_char) - SET_SIZE(atomic_sub_8) - - ENTRY(atomic_sub_16) - ALTENTRY(atomic_sub_short) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - subw %cx, (%eax) - ret - SET_SIZE(atomic_sub_short) - SET_SIZE(atomic_sub_16) - - ENTRY(atomic_sub_32) - ALTENTRY(atomic_sub_int) - ALTENTRY(atomic_sub_ptr) - ALTENTRY(atomic_sub_long) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - subl %ecx, (%eax) - ret - SET_SIZE(atomic_sub_long) - SET_SIZE(atomic_sub_ptr) - SET_SIZE(atomic_sub_int) - SET_SIZE(atomic_sub_32) - - ENTRY(atomic_or_8) - ALTENTRY(atomic_or_uchar) - movl 4(%esp), %eax - movb 8(%esp), %cl - lock - orb %cl, (%eax) - ret - SET_SIZE(atomic_or_uchar) - SET_SIZE(atomic_or_8) - - ENTRY(atomic_or_16) - ALTENTRY(atomic_or_ushort) - movl 
4(%esp), %eax - movw 8(%esp), %cx - lock - orw %cx, (%eax) - ret - SET_SIZE(atomic_or_ushort) - SET_SIZE(atomic_or_16) - - ENTRY(atomic_or_32) - ALTENTRY(atomic_or_uint) - ALTENTRY(atomic_or_ulong) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - orl %ecx, (%eax) - ret - SET_SIZE(atomic_or_ulong) - SET_SIZE(atomic_or_uint) - SET_SIZE(atomic_or_32) - - ENTRY(atomic_and_8) - ALTENTRY(atomic_and_uchar) - movl 4(%esp), %eax - movb 8(%esp), %cl - lock - andb %cl, (%eax) - ret - SET_SIZE(atomic_and_uchar) - SET_SIZE(atomic_and_8) - - ENTRY(atomic_and_16) - ALTENTRY(atomic_and_ushort) - movl 4(%esp), %eax - movw 8(%esp), %cx - lock - andw %cx, (%eax) - ret - SET_SIZE(atomic_and_ushort) - SET_SIZE(atomic_and_16) - - ENTRY(atomic_and_32) - ALTENTRY(atomic_and_uint) - ALTENTRY(atomic_and_ulong) - movl 4(%esp), %eax - movl 8(%esp), %ecx - lock - andl %ecx, (%eax) - ret - SET_SIZE(atomic_and_ulong) - SET_SIZE(atomic_and_uint) - SET_SIZE(atomic_and_32) - - ENTRY(atomic_add_8_nv) - ALTENTRY(atomic_add_char_nv) - movl 4(%esp), %edx - movb (%edx), %al -1: - movl 8(%esp), %ecx - addb %al, %cl - lock - cmpxchgb %cl, (%edx) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_add_char_nv) - SET_SIZE(atomic_add_8_nv) - - ENTRY(atomic_add_16_nv) - ALTENTRY(atomic_add_short_nv) - movl 4(%esp), %edx - movw (%edx), %ax -1: - movl 8(%esp), %ecx - addw %ax, %cx - lock - cmpxchgw %cx, (%edx) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_add_short_nv) - SET_SIZE(atomic_add_16_nv) - - ENTRY(atomic_add_32_nv) - ALTENTRY(atomic_add_int_nv) - ALTENTRY(atomic_add_ptr_nv) - ALTENTRY(atomic_add_long_nv) - movl 4(%esp), %edx - movl (%edx), %eax -1: - movl 8(%esp), %ecx - addl %eax, %ecx - lock - cmpxchgl %ecx, (%edx) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_add_long_nv) - SET_SIZE(atomic_add_ptr_nv) - SET_SIZE(atomic_add_int_nv) - SET_SIZE(atomic_add_32_nv) - - ENTRY(atomic_sub_8_nv) - ALTENTRY(atomic_sub_char_nv) - movl 4(%esp), %edx - movb (%edx), %al -1: - movl 8(%esp), %ecx - 
subb %al, %cl - lock - cmpxchgb %cl, (%edx) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_sub_char_nv) - SET_SIZE(atomic_sub_8_nv) - - ENTRY(atomic_sub_16_nv) - ALTENTRY(atomic_sub_short_nv) - movl 4(%esp), %edx - movw (%edx), %ax -1: - movl 8(%esp), %ecx - subw %ax, %cx - lock - cmpxchgw %cx, (%edx) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_sub_short_nv) - SET_SIZE(atomic_sub_16_nv) - - ENTRY(atomic_sub_32_nv) - ALTENTRY(atomic_sub_int_nv) - ALTENTRY(atomic_sub_ptr_nv) - ALTENTRY(atomic_sub_long_nv) - movl 4(%esp), %edx - movl (%edx), %eax -1: - movl 8(%esp), %ecx - subl %eax, %ecx - lock - cmpxchgl %ecx, (%edx) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_sub_long_nv) - SET_SIZE(atomic_sub_ptr_nv) - SET_SIZE(atomic_sub_int_nv) - SET_SIZE(atomic_sub_32_nv) - - /* - * NOTE: If atomic_add_64 and atomic_add_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_add_64_nv. - */ - ENTRY(atomic_add_64) - ALTENTRY(atomic_add_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi - movl (%edi), %eax - movl 4(%edi), %edx -1: - movl 16(%esp), %ebx - movl 20(%esp), %ecx - addl %eax, %ebx - adcl %edx, %ecx - lock - cmpxchg8b (%edi) - jne 1b - movl %ebx, %eax - movl %ecx, %edx - popl %ebx - popl %edi - ret - SET_SIZE(atomic_add_64_nv) - SET_SIZE(atomic_add_64) - - ENTRY(atomic_sub_64) - ALTENTRY(atomic_sub_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi - movl (%edi), %eax - movl 4(%edi), %edx -1: - movl 16(%esp), %ebx - movl 20(%esp), %ecx - subl %eax, %ebx - sbbl %edx, %ecx - lock - cmpxchg8b (%edi) - jne 1b - movl %ebx, %eax - movl %ecx, %edx - popl %ebx - popl %edi - ret - SET_SIZE(atomic_sub_64_nv) - SET_SIZE(atomic_sub_64) - - ENTRY(atomic_or_8_nv) - ALTENTRY(atomic_or_uchar_nv) - movl 4(%esp), %edx - movb (%edx), %al -1: - movl 8(%esp), %ecx - orb %al, %cl - lock - cmpxchgb %cl, (%edx) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_or_uchar_nv) 
- SET_SIZE(atomic_or_8_nv) - - ENTRY(atomic_or_16_nv) - ALTENTRY(atomic_or_ushort_nv) - movl 4(%esp), %edx - movw (%edx), %ax -1: - movl 8(%esp), %ecx - orw %ax, %cx - lock - cmpxchgw %cx, (%edx) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_or_ushort_nv) - SET_SIZE(atomic_or_16_nv) - - ENTRY(atomic_or_32_nv) - ALTENTRY(atomic_or_uint_nv) - ALTENTRY(atomic_or_ulong_nv) - movl 4(%esp), %edx - movl (%edx), %eax -1: - movl 8(%esp), %ecx - orl %eax, %ecx - lock - cmpxchgl %ecx, (%edx) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_or_ulong_nv) - SET_SIZE(atomic_or_uint_nv) - SET_SIZE(atomic_or_32_nv) - - /* - * NOTE: If atomic_or_64 and atomic_or_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_or_64_nv. - */ - ENTRY(atomic_or_64) - ALTENTRY(atomic_or_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi - movl (%edi), %eax - movl 4(%edi), %edx -1: - movl 16(%esp), %ebx - movl 20(%esp), %ecx - orl %eax, %ebx - orl %edx, %ecx - lock - cmpxchg8b (%edi) - jne 1b - movl %ebx, %eax - movl %ecx, %edx - popl %ebx - popl %edi - ret - SET_SIZE(atomic_or_64_nv) - SET_SIZE(atomic_or_64) - - ENTRY(atomic_and_8_nv) - ALTENTRY(atomic_and_uchar_nv) - movl 4(%esp), %edx - movb (%edx), %al -1: - movl 8(%esp), %ecx - andb %al, %cl - lock - cmpxchgb %cl, (%edx) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_and_uchar_nv) - SET_SIZE(atomic_and_8_nv) - - ENTRY(atomic_and_16_nv) - ALTENTRY(atomic_and_ushort_nv) - movl 4(%esp), %edx - movw (%edx), %ax -1: - movl 8(%esp), %ecx - andw %ax, %cx - lock - cmpxchgw %cx, (%edx) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_and_ushort_nv) - SET_SIZE(atomic_and_16_nv) - - ENTRY(atomic_and_32_nv) - ALTENTRY(atomic_and_uint_nv) - ALTENTRY(atomic_and_ulong_nv) - movl 4(%esp), %edx - movl (%edx), %eax -1: - movl 8(%esp), %ecx - andl %eax, %ecx - lock - cmpxchgl %ecx, (%edx) - jne 1b - movl %ecx, %eax - ret - 
SET_SIZE(atomic_and_ulong_nv) - SET_SIZE(atomic_and_uint_nv) - SET_SIZE(atomic_and_32_nv) - - /* - * NOTE: If atomic_and_64 and atomic_and_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_and_64_nv. - */ - ENTRY(atomic_and_64) - ALTENTRY(atomic_and_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi - movl (%edi), %eax - movl 4(%edi), %edx -1: - movl 16(%esp), %ebx - movl 20(%esp), %ecx - andl %eax, %ebx - andl %edx, %ecx - lock - cmpxchg8b (%edi) - jne 1b - movl %ebx, %eax - movl %ecx, %edx - popl %ebx - popl %edi - ret - SET_SIZE(atomic_and_64_nv) - SET_SIZE(atomic_and_64) - - ENTRY(atomic_cas_8) - ALTENTRY(atomic_cas_uchar) - movl 4(%esp), %edx - movzbl 8(%esp), %eax - movb 12(%esp), %cl - lock - cmpxchgb %cl, (%edx) - ret - SET_SIZE(atomic_cas_uchar) - SET_SIZE(atomic_cas_8) - - ENTRY(atomic_cas_16) - ALTENTRY(atomic_cas_ushort) - movl 4(%esp), %edx - movzwl 8(%esp), %eax - movw 12(%esp), %cx - lock - cmpxchgw %cx, (%edx) - ret - SET_SIZE(atomic_cas_ushort) - SET_SIZE(atomic_cas_16) - - ENTRY(atomic_cas_32) - ALTENTRY(atomic_cas_uint) - ALTENTRY(atomic_cas_ulong) - ALTENTRY(atomic_cas_ptr) - movl 4(%esp), %edx - movl 8(%esp), %eax - movl 12(%esp), %ecx - lock - cmpxchgl %ecx, (%edx) - ret - SET_SIZE(atomic_cas_ptr) - SET_SIZE(atomic_cas_ulong) - SET_SIZE(atomic_cas_uint) - SET_SIZE(atomic_cas_32) - - ENTRY(atomic_cas_64) - pushl %ebx - pushl %esi - movl 12(%esp), %esi - movl 16(%esp), %eax - movl 20(%esp), %edx - movl 24(%esp), %ebx - movl 28(%esp), %ecx - lock - cmpxchg8b (%esi) - popl %esi - popl %ebx - ret - SET_SIZE(atomic_cas_64) - - ENTRY(atomic_swap_8) - ALTENTRY(atomic_swap_uchar) - movl 4(%esp), %edx - movzbl 8(%esp), %eax - lock - xchgb %al, (%edx) - ret - SET_SIZE(atomic_swap_uchar) - SET_SIZE(atomic_swap_8) - - ENTRY(atomic_swap_16) - ALTENTRY(atomic_swap_ushort) - movl 4(%esp), %edx - movzwl 8(%esp), %eax - lock - xchgw %ax, (%edx) - ret - 
SET_SIZE(atomic_swap_ushort) - SET_SIZE(atomic_swap_16) - - ENTRY(atomic_swap_32) - ALTENTRY(atomic_swap_uint) - ALTENTRY(atomic_swap_ptr) - ALTENTRY(atomic_swap_ulong) - movl 4(%esp), %edx - movl 8(%esp), %eax - lock - xchgl %eax, (%edx) - ret - SET_SIZE(atomic_swap_ulong) - SET_SIZE(atomic_swap_ptr) - SET_SIZE(atomic_swap_uint) - SET_SIZE(atomic_swap_32) - - ENTRY(atomic_swap_64) - pushl %esi - pushl %ebx - movl 12(%esp), %esi - movl 16(%esp), %ebx - movl 20(%esp), %ecx - movl (%esi), %eax - movl 4(%esi), %edx -1: - lock - cmpxchg8b (%esi) - jne 1b - popl %ebx - popl %esi - ret - SET_SIZE(atomic_swap_64) - - ENTRY(atomic_set_long_excl) - movl 4(%esp), %edx - movl 8(%esp), %ecx - xorl %eax, %eax - lock - btsl %ecx, (%edx) - jnc 1f - decl %eax -1: - ret - SET_SIZE(atomic_set_long_excl) - - ENTRY(atomic_clear_long_excl) - movl 4(%esp), %edx - movl 8(%esp), %ecx - xorl %eax, %eax - lock - btrl %ecx, (%edx) - jc 1f - decl %eax -1: - ret - SET_SIZE(atomic_clear_long_excl) - - /* - * NOTE: membar_enter, membar_exit, membar_producer, and - * membar_consumer are all identical routines. We define them - * separately, instead of using ALTENTRY definitions to alias them - * together, so that DTrace and debuggers will see a unique address - * for them, allowing more accurate tracing. 
- */ - - - ENTRY(membar_enter) - lock - xorl $0, (%esp) - ret - SET_SIZE(membar_enter) - - ENTRY(membar_exit) - lock - xorl $0, (%esp) - ret - SET_SIZE(membar_exit) - - ENTRY(membar_producer) - lock - xorl $0, (%esp) - ret - SET_SIZE(membar_producer) - - ENTRY(membar_consumer) - lock - xorl $0, (%esp) - ret - SET_SIZE(membar_consumer) - -#ifdef __ELF__ -.section .note.GNU-stack,"",%progbits -#endif diff --git a/lib/libspl/asm-x86_64/Makefile.am b/lib/libspl/asm-x86_64/Makefile.am deleted file mode 100644 index e1126102f7..0000000000 --- a/lib/libspl/asm-x86_64/Makefile.am +++ /dev/null @@ -1 +0,0 @@ -noinst_HEADERS = atomic.S diff --git a/lib/libspl/asm-x86_64/atomic.S b/lib/libspl/asm-x86_64/atomic.S deleted file mode 100644 index 49c9b2ad15..0000000000 --- a/lib/libspl/asm-x86_64/atomic.S +++ /dev/null @@ -1,687 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - - .ident "%Z%%M% %I% %E% SMI" - - .file "%M%" - -#define _ASM -#include - - ENTRY(atomic_inc_8) - ALTENTRY(atomic_inc_uchar) - lock - incb (%rdi) - ret - SET_SIZE(atomic_inc_uchar) - SET_SIZE(atomic_inc_8) - - ENTRY(atomic_inc_16) - ALTENTRY(atomic_inc_ushort) - lock - incw (%rdi) - ret - SET_SIZE(atomic_inc_ushort) - SET_SIZE(atomic_inc_16) - - ENTRY(atomic_inc_32) - ALTENTRY(atomic_inc_uint) - lock - incl (%rdi) - ret - SET_SIZE(atomic_inc_uint) - SET_SIZE(atomic_inc_32) - - ENTRY(atomic_inc_64) - ALTENTRY(atomic_inc_ulong) - lock - incq (%rdi) - ret - SET_SIZE(atomic_inc_ulong) - SET_SIZE(atomic_inc_64) - - ENTRY(atomic_inc_8_nv) - ALTENTRY(atomic_inc_uchar_nv) - movb (%rdi), %al -1: - leaq 1(%rax), %rcx - lock - cmpxchgb %cl, (%rdi) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_inc_uchar_nv) - SET_SIZE(atomic_inc_8_nv) - - ENTRY(atomic_inc_16_nv) - ALTENTRY(atomic_inc_ushort_nv) - movw (%rdi), %ax -1: - leaq 1(%rax), %rcx - lock - cmpxchgw %cx, (%rdi) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_inc_ushort_nv) - SET_SIZE(atomic_inc_16_nv) - - ENTRY(atomic_inc_32_nv) - ALTENTRY(atomic_inc_uint_nv) - movl (%rdi), %eax -1: - leaq 1(%rax), %rcx - lock - cmpxchgl %ecx, (%rdi) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_inc_uint_nv) - SET_SIZE(atomic_inc_32_nv) - - ENTRY(atomic_inc_64_nv) - ALTENTRY(atomic_inc_ulong_nv) - movq (%rdi), %rax -1: - leaq 1(%rax), %rcx - lock - cmpxchgq %rcx, (%rdi) - jne 1b - movq %rcx, %rax - ret - SET_SIZE(atomic_inc_ulong_nv) - SET_SIZE(atomic_inc_64_nv) - - ENTRY(atomic_dec_8) - ALTENTRY(atomic_dec_uchar) - lock - decb (%rdi) - ret - SET_SIZE(atomic_dec_uchar) - SET_SIZE(atomic_dec_8) - - ENTRY(atomic_dec_16) - ALTENTRY(atomic_dec_ushort) - lock - decw (%rdi) - ret - SET_SIZE(atomic_dec_ushort) - SET_SIZE(atomic_dec_16) - - ENTRY(atomic_dec_32) - ALTENTRY(atomic_dec_uint) - lock - decl (%rdi) - ret - SET_SIZE(atomic_dec_uint) - SET_SIZE(atomic_dec_32) - - ENTRY(atomic_dec_64) - ALTENTRY(atomic_dec_ulong) - 
lock - decq (%rdi) - ret - SET_SIZE(atomic_dec_ulong) - SET_SIZE(atomic_dec_64) - - ENTRY(atomic_dec_8_nv) - ALTENTRY(atomic_dec_uchar_nv) - movb (%rdi), %al -1: - leaq -1(%rax), %rcx - lock - cmpxchgb %cl, (%rdi) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_dec_uchar_nv) - SET_SIZE(atomic_dec_8_nv) - - ENTRY(atomic_dec_16_nv) - ALTENTRY(atomic_dec_ushort_nv) - movw (%rdi), %ax -1: - leaq -1(%rax), %rcx - lock - cmpxchgw %cx, (%rdi) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_dec_ushort_nv) - SET_SIZE(atomic_dec_16_nv) - - ENTRY(atomic_dec_32_nv) - ALTENTRY(atomic_dec_uint_nv) - movl (%rdi), %eax -1: - leaq -1(%rax), %rcx - lock - cmpxchgl %ecx, (%rdi) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_dec_uint_nv) - SET_SIZE(atomic_dec_32_nv) - - ENTRY(atomic_dec_64_nv) - ALTENTRY(atomic_dec_ulong_nv) - movq (%rdi), %rax -1: - leaq -1(%rax), %rcx - lock - cmpxchgq %rcx, (%rdi) - jne 1b - movq %rcx, %rax - ret - SET_SIZE(atomic_dec_ulong_nv) - SET_SIZE(atomic_dec_64_nv) - - ENTRY(atomic_add_8) - ALTENTRY(atomic_add_char) - lock - addb %sil, (%rdi) - ret - SET_SIZE(atomic_add_char) - SET_SIZE(atomic_add_8) - - ENTRY(atomic_add_16) - ALTENTRY(atomic_add_short) - lock - addw %si, (%rdi) - ret - SET_SIZE(atomic_add_short) - SET_SIZE(atomic_add_16) - - ENTRY(atomic_add_32) - ALTENTRY(atomic_add_int) - lock - addl %esi, (%rdi) - ret - SET_SIZE(atomic_add_int) - SET_SIZE(atomic_add_32) - - ENTRY(atomic_add_64) - ALTENTRY(atomic_add_ptr) - ALTENTRY(atomic_add_long) - lock - addq %rsi, (%rdi) - ret - SET_SIZE(atomic_add_long) - SET_SIZE(atomic_add_ptr) - SET_SIZE(atomic_add_64) - - ENTRY(atomic_sub_8) - ALTENTRY(atomic_sub_char) - lock - subb %sil, (%rdi) - ret - SET_SIZE(atomic_sub_char) - SET_SIZE(atomic_sub_8) - - ENTRY(atomic_sub_16) - ALTENTRY(atomic_sub_short) - lock - subw %si, (%rdi) - ret - SET_SIZE(atomic_sub_short) - SET_SIZE(atomic_sub_16) - - ENTRY(atomic_sub_32) - ALTENTRY(atomic_sub_int) - lock - subl %esi, (%rdi) - ret - 
SET_SIZE(atomic_sub_int) - SET_SIZE(atomic_sub_32) - - ENTRY(atomic_sub_64) - ALTENTRY(atomic_sub_ptr) - ALTENTRY(atomic_sub_long) - lock - subq %rsi, (%rdi) - ret - SET_SIZE(atomic_sub_long) - SET_SIZE(atomic_sub_ptr) - SET_SIZE(atomic_sub_64) - - ENTRY(atomic_or_8) - ALTENTRY(atomic_or_uchar) - lock - orb %sil, (%rdi) - ret - SET_SIZE(atomic_or_uchar) - SET_SIZE(atomic_or_8) - - ENTRY(atomic_or_16) - ALTENTRY(atomic_or_ushort) - lock - orw %si, (%rdi) - ret - SET_SIZE(atomic_or_ushort) - SET_SIZE(atomic_or_16) - - ENTRY(atomic_or_32) - ALTENTRY(atomic_or_uint) - lock - orl %esi, (%rdi) - ret - SET_SIZE(atomic_or_uint) - SET_SIZE(atomic_or_32) - - ENTRY(atomic_or_64) - ALTENTRY(atomic_or_ulong) - lock - orq %rsi, (%rdi) - ret - SET_SIZE(atomic_or_ulong) - SET_SIZE(atomic_or_64) - - ENTRY(atomic_and_8) - ALTENTRY(atomic_and_uchar) - lock - andb %sil, (%rdi) - ret - SET_SIZE(atomic_and_uchar) - SET_SIZE(atomic_and_8) - - ENTRY(atomic_and_16) - ALTENTRY(atomic_and_ushort) - lock - andw %si, (%rdi) - ret - SET_SIZE(atomic_and_ushort) - SET_SIZE(atomic_and_16) - - ENTRY(atomic_and_32) - ALTENTRY(atomic_and_uint) - lock - andl %esi, (%rdi) - ret - SET_SIZE(atomic_and_uint) - SET_SIZE(atomic_and_32) - - ENTRY(atomic_and_64) - ALTENTRY(atomic_and_ulong) - lock - andq %rsi, (%rdi) - ret - SET_SIZE(atomic_and_ulong) - SET_SIZE(atomic_and_64) - - ENTRY(atomic_add_8_nv) - ALTENTRY(atomic_add_char_nv) - movb (%rdi), %al -1: - movb %sil, %cl - addb %al, %cl - lock - cmpxchgb %cl, (%rdi) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_add_char_nv) - SET_SIZE(atomic_add_8_nv) - - ENTRY(atomic_add_16_nv) - ALTENTRY(atomic_add_short_nv) - movw (%rdi), %ax -1: - movw %si, %cx - addw %ax, %cx - lock - cmpxchgw %cx, (%rdi) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_add_short_nv) - SET_SIZE(atomic_add_16_nv) - - ENTRY(atomic_add_32_nv) - ALTENTRY(atomic_add_int_nv) - movl (%rdi), %eax -1: - movl %esi, %ecx - addl %eax, %ecx - lock - cmpxchgl %ecx, (%rdi) - jne 1b - movl 
%ecx, %eax - ret - SET_SIZE(atomic_add_int_nv) - SET_SIZE(atomic_add_32_nv) - - ENTRY(atomic_add_64_nv) - ALTENTRY(atomic_add_ptr_nv) - ALTENTRY(atomic_add_long_nv) - movq (%rdi), %rax -1: - movq %rsi, %rcx - addq %rax, %rcx - lock - cmpxchgq %rcx, (%rdi) - jne 1b - movq %rcx, %rax - ret - SET_SIZE(atomic_add_long_nv) - SET_SIZE(atomic_add_ptr_nv) - SET_SIZE(atomic_add_64_nv) - - ENTRY(atomic_sub_8_nv) - ALTENTRY(atomic_sub_char_nv) - movb (%rdi), %al -1: - movb %sil, %cl - subb %al, %cl - lock - cmpxchgb %cl, (%rdi) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_sub_char_nv) - SET_SIZE(atomic_sub_8_nv) - - ENTRY(atomic_sub_16_nv) - ALTENTRY(atomic_sub_short_nv) - movw (%rdi), %ax -1: - movw %si, %cx - subw %ax, %cx - lock - cmpxchgw %cx, (%rdi) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_sub_short_nv) - SET_SIZE(atomic_sub_16_nv) - - ENTRY(atomic_sub_32_nv) - ALTENTRY(atomic_sub_int_nv) - movl (%rdi), %eax -1: - movl %esi, %ecx - subl %eax, %ecx - lock - cmpxchgl %ecx, (%rdi) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_sub_int_nv) - SET_SIZE(atomic_sub_32_nv) - - ENTRY(atomic_sub_64_nv) - ALTENTRY(atomic_sub_ptr_nv) - ALTENTRY(atomic_sub_long_nv) - movq (%rdi), %rax -1: - movq %rsi, %rcx - subq %rax, %rcx - lock - cmpxchgq %rcx, (%rdi) - jne 1b - movq %rcx, %rax - ret - SET_SIZE(atomic_sub_long_nv) - SET_SIZE(atomic_sub_ptr_nv) - SET_SIZE(atomic_sub_64_nv) - - ENTRY(atomic_and_8_nv) - ALTENTRY(atomic_and_uchar_nv) - movb (%rdi), %al -1: - movb %sil, %cl - andb %al, %cl - lock - cmpxchgb %cl, (%rdi) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_and_uchar_nv) - SET_SIZE(atomic_and_8_nv) - - ENTRY(atomic_and_16_nv) - ALTENTRY(atomic_and_ushort_nv) - movw (%rdi), %ax -1: - movw %si, %cx - andw %ax, %cx - lock - cmpxchgw %cx, (%rdi) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_and_ushort_nv) - SET_SIZE(atomic_and_16_nv) - - ENTRY(atomic_and_32_nv) - ALTENTRY(atomic_and_uint_nv) - movl (%rdi), %eax -1: - movl %esi, %ecx - andl %eax, %ecx 
- lock - cmpxchgl %ecx, (%rdi) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_and_uint_nv) - SET_SIZE(atomic_and_32_nv) - - ENTRY(atomic_and_64_nv) - ALTENTRY(atomic_and_ulong_nv) - movq (%rdi), %rax -1: - movq %rsi, %rcx - andq %rax, %rcx - lock - cmpxchgq %rcx, (%rdi) - jne 1b - movq %rcx, %rax - ret - SET_SIZE(atomic_and_ulong_nv) - SET_SIZE(atomic_and_64_nv) - - ENTRY(atomic_or_8_nv) - ALTENTRY(atomic_or_uchar_nv) - movb (%rdi), %al -1: - movb %sil, %cl - orb %al, %cl - lock - cmpxchgb %cl, (%rdi) - jne 1b - movzbl %cl, %eax - ret - SET_SIZE(atomic_and_uchar_nv) - SET_SIZE(atomic_and_8_nv) - - ENTRY(atomic_or_16_nv) - ALTENTRY(atomic_or_ushort_nv) - movw (%rdi), %ax -1: - movw %si, %cx - orw %ax, %cx - lock - cmpxchgw %cx, (%rdi) - jne 1b - movzwl %cx, %eax - ret - SET_SIZE(atomic_or_ushort_nv) - SET_SIZE(atomic_or_16_nv) - - ENTRY(atomic_or_32_nv) - ALTENTRY(atomic_or_uint_nv) - movl (%rdi), %eax -1: - movl %esi, %ecx - orl %eax, %ecx - lock - cmpxchgl %ecx, (%rdi) - jne 1b - movl %ecx, %eax - ret - SET_SIZE(atomic_or_uint_nv) - SET_SIZE(atomic_or_32_nv) - - ENTRY(atomic_or_64_nv) - ALTENTRY(atomic_or_ulong_nv) - movq (%rdi), %rax -1: - movq %rsi, %rcx - orq %rax, %rcx - lock - cmpxchgq %rcx, (%rdi) - jne 1b - movq %rcx, %rax - ret - SET_SIZE(atomic_or_ulong_nv) - SET_SIZE(atomic_or_64_nv) - - ENTRY(atomic_cas_8) - ALTENTRY(atomic_cas_uchar) - movzbl %sil, %eax - lock - cmpxchgb %dl, (%rdi) - ret - SET_SIZE(atomic_cas_uchar) - SET_SIZE(atomic_cas_8) - - ENTRY(atomic_cas_16) - ALTENTRY(atomic_cas_ushort) - movzwl %si, %eax - lock - cmpxchgw %dx, (%rdi) - ret - SET_SIZE(atomic_cas_ushort) - SET_SIZE(atomic_cas_16) - - ENTRY(atomic_cas_32) - ALTENTRY(atomic_cas_uint) - movl %esi, %eax - lock - cmpxchgl %edx, (%rdi) - ret - SET_SIZE(atomic_cas_uint) - SET_SIZE(atomic_cas_32) - - ENTRY(atomic_cas_64) - ALTENTRY(atomic_cas_ulong) - ALTENTRY(atomic_cas_ptr) - movq %rsi, %rax - lock - cmpxchgq %rdx, (%rdi) - ret - SET_SIZE(atomic_cas_ptr) - 
SET_SIZE(atomic_cas_ulong) - SET_SIZE(atomic_cas_64) - - ENTRY(atomic_swap_8) - ALTENTRY(atomic_swap_uchar) - movzbl %sil, %eax - lock - xchgb %al, (%rdi) - ret - SET_SIZE(atomic_swap_uchar) - SET_SIZE(atomic_swap_8) - - ENTRY(atomic_swap_16) - ALTENTRY(atomic_swap_ushort) - movzwl %si, %eax - lock - xchgw %ax, (%rdi) - ret - SET_SIZE(atomic_swap_ushort) - SET_SIZE(atomic_swap_16) - - ENTRY(atomic_swap_32) - ALTENTRY(atomic_swap_uint) - movl %esi, %eax - lock - xchgl %eax, (%rdi) - ret - SET_SIZE(atomic_swap_uint) - SET_SIZE(atomic_swap_32) - - ENTRY(atomic_swap_64) - ALTENTRY(atomic_swap_ulong) - ALTENTRY(atomic_swap_ptr) - movq %rsi, %rax - lock - xchgq %rax, (%rdi) - ret - SET_SIZE(atomic_swap_ptr) - SET_SIZE(atomic_swap_ulong) - SET_SIZE(atomic_swap_64) - - ENTRY(atomic_set_long_excl) - xorl %eax, %eax - lock - btsq %rsi, (%rdi) - jnc 1f - decl %eax -1: - ret - SET_SIZE(atomic_set_long_excl) - - ENTRY(atomic_clear_long_excl) - xorl %eax, %eax - lock - btrq %rsi, (%rdi) - jc 1f - decl %eax -1: - ret - SET_SIZE(atomic_clear_long_excl) - - /* - * NOTE: membar_enter, and membar_exit are identical routines. - * We define them separately, instead of using an ALTENTRY - * definitions to alias them together, so that DTrace and - * debuggers will see a unique address for them, allowing - * more accurate tracing. - */ - - ENTRY(membar_enter) - mfence - ret - SET_SIZE(membar_enter) - - ENTRY(membar_exit) - mfence - ret - SET_SIZE(membar_exit) - - ENTRY(membar_producer) - sfence - ret - SET_SIZE(membar_producer) - - ENTRY(membar_consumer) - lfence - ret - SET_SIZE(membar_consumer) - -#ifdef __ELF__ -.section .note.GNU-stack,"",%progbits -#endif diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c new file mode 100644 index 0000000000..8e4333976f --- /dev/null +++ b/lib/libspl/assert.c @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include + +int libspl_assert_ok = 0; + +/* printf version of libspl_assert */ +void +libspl_assertf(const char *file, const char *func, int line, + const char *format, ...) +{ + va_list args; + + va_start(args, format); + vfprintf(stderr, format, args); + fprintf(stderr, "\n"); + fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func); + va_end(args); + if (libspl_assert_ok) { + return; + } + abort(); +} diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/atomic.c similarity index 53% rename from lib/libspl/asm-generic/atomic.c rename to lib/libspl/atomic.c index d0023b1828..4717d818ce 100644 --- a/lib/libspl/asm-generic/atomic.c +++ b/lib/libspl/atomic.c @@ -25,30 +25,17 @@ */ #include -#include -#include /* - * All operations are implemented by serializing them through a global - * pthread mutex. This provides a correct generic implementation. - * However all supported architectures are encouraged to provide a - * native implementation is assembly for performance reasons. 
- */ -pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER; - -/* - * Theses are the void returning variants + * These are the void returning variants */ /* BEGIN CSTYLED */ #define ATOMIC_INC(name, type) \ void atomic_inc_##name(volatile type *target) \ { \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - (*target)++; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + (void) __atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST); \ } -ATOMIC_INC(long, unsigned long) ATOMIC_INC(8, uint8_t) ATOMIC_INC(uchar, uchar_t) ATOMIC_INC(16, uint16_t) @@ -62,12 +49,9 @@ ATOMIC_INC(64, uint64_t) #define ATOMIC_DEC(name, type) \ void atomic_dec_##name(volatile type *target) \ { \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - (*target)--; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + (void) __atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST); \ } -ATOMIC_DEC(long, unsigned long) ATOMIC_DEC(8, uint8_t) ATOMIC_DEC(uchar, uchar_t) ATOMIC_DEC(16, uint16_t) @@ -81,9 +65,7 @@ ATOMIC_DEC(64, uint64_t) #define ATOMIC_ADD(name, type1, type2) \ void atomic_add_##name(volatile type1 *target, type2 bits) \ { \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - *target += bits; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + (void) __atomic_add_fetch(target, bits, __ATOMIC_SEQ_CST); \ } ATOMIC_ADD(8, uint8_t, int8_t) @@ -98,18 +80,14 @@ ATOMIC_ADD(64, uint64_t, int64_t) void atomic_add_ptr(volatile void *target, ssize_t bits) { - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - *(caddr_t *)target += bits; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + (void) __atomic_add_fetch((void **)target, bits, __ATOMIC_SEQ_CST); } #define ATOMIC_SUB(name, type1, type2) \ void atomic_sub_##name(volatile type1 *target, type2 bits) \ { \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - *target -= bits; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + (void) __atomic_sub_fetch(target, bits, __ATOMIC_SEQ_CST); \ } 
ATOMIC_SUB(8, uint8_t, int8_t) @@ -124,18 +102,14 @@ ATOMIC_SUB(64, uint64_t, int64_t) void atomic_sub_ptr(volatile void *target, ssize_t bits) { - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - *(caddr_t *)target -= bits; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + (void) __atomic_sub_fetch((void **)target, bits, __ATOMIC_SEQ_CST); } #define ATOMIC_OR(name, type) \ void atomic_or_##name(volatile type *target, type bits) \ { \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - *target |= bits; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + (void) __atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST); \ } ATOMIC_OR(8, uint8_t) @@ -151,9 +125,7 @@ ATOMIC_OR(64, uint64_t) #define ATOMIC_AND(name, type) \ void atomic_and_##name(volatile type *target, type bits) \ { \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - *target &= bits; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + (void) __atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST); \ } ATOMIC_AND(8, uint8_t) @@ -173,14 +145,9 @@ ATOMIC_AND(64, uint64_t) #define ATOMIC_INC_NV(name, type) \ type atomic_inc_##name##_nv(volatile type *target) \ { \ - type rc; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - rc = (++(*target)); \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - return (rc); \ + return (__atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST)); \ } -ATOMIC_INC_NV(long, unsigned long) ATOMIC_INC_NV(8, uint8_t) ATOMIC_INC_NV(uchar, uchar_t) ATOMIC_INC_NV(16, uint16_t) @@ -194,14 +161,9 @@ ATOMIC_INC_NV(64, uint64_t) #define ATOMIC_DEC_NV(name, type) \ type atomic_dec_##name##_nv(volatile type *target) \ { \ - type rc; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - rc = (--(*target)); \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - return (rc); \ + return (__atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST)); \ } -ATOMIC_DEC_NV(long, unsigned long) ATOMIC_DEC_NV(8, uint8_t) ATOMIC_DEC_NV(uchar, uchar_t) 
ATOMIC_DEC_NV(16, uint16_t) @@ -213,13 +175,9 @@ ATOMIC_DEC_NV(64, uint64_t) #define ATOMIC_ADD_NV(name, type1, type2) \ - type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits)\ + type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits) \ { \ - type1 rc; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - rc = (*target += bits); \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - return (rc); \ + return (__atomic_add_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } ATOMIC_ADD_NV(8, uint8_t, int8_t) @@ -234,24 +192,14 @@ ATOMIC_ADD_NV(64, uint64_t, int64_t) void * atomic_add_ptr_nv(volatile void *target, ssize_t bits) { - void *ptr; - - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - ptr = (*(caddr_t *)target += bits); - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - - return (ptr); + return (__atomic_add_fetch((void **)target, bits, __ATOMIC_SEQ_CST)); } #define ATOMIC_SUB_NV(name, type1, type2) \ - type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\ + type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits) \ { \ - type1 rc; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - rc = (*target -= bits); \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - return (rc); \ + return (__atomic_sub_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } ATOMIC_SUB_NV(8, uint8_t, int8_t) @@ -266,27 +214,16 @@ ATOMIC_SUB_NV(64, uint64_t, int64_t) void * atomic_sub_ptr_nv(volatile void *target, ssize_t bits) { - void *ptr; - - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - ptr = (*(caddr_t *)target -= bits); - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - - return (ptr); + return (__atomic_sub_fetch((void **)target, bits, __ATOMIC_SEQ_CST)); } #define ATOMIC_OR_NV(name, type) \ type atomic_or_##name##_nv(volatile type *target, type bits) \ { \ - type rc; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - rc = (*target |= bits); \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - 
return (rc); \ + return (__atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } -ATOMIC_OR_NV(long, unsigned long) ATOMIC_OR_NV(8, uint8_t) ATOMIC_OR_NV(uchar, uchar_t) ATOMIC_OR_NV(16, uint16_t) @@ -300,14 +237,9 @@ ATOMIC_OR_NV(64, uint64_t) #define ATOMIC_AND_NV(name, type) \ type atomic_and_##name##_nv(volatile type *target, type bits) \ { \ - type rc; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - rc = (*target &= bits); \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - return (rc); \ + return (__atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } -ATOMIC_AND_NV(long, unsigned long) ATOMIC_AND_NV(8, uint8_t) ATOMIC_AND_NV(uchar, uchar_t) ATOMIC_AND_NV(16, uint16_t) @@ -319,19 +251,21 @@ ATOMIC_AND_NV(64, uint64_t) /* - * If *arg1 == arg2, set *arg1 = arg3; return old value + * If *tgt == exp, set *tgt = des; return old value + * + * This may not look right on the first pass (or the sixteenth), but, + * from https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html: + * > If they are not equal, the operation is a read + * > and the current contents of *ptr are written into *expected. + * And, in the converse case, exp is already *target by definition. 
*/ #define ATOMIC_CAS(name, type) \ - type atomic_cas_##name(volatile type *target, type arg1, type arg2) \ + type atomic_cas_##name(volatile type *target, type exp, type des) \ { \ - type old; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - old = *target; \ - if (old == arg1) \ - *target = arg2; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - return (old); \ + __atomic_compare_exchange_n(target, &exp, des, B_FALSE, \ + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \ + return (exp); \ } ATOMIC_CAS(8, uint8_t) @@ -344,17 +278,12 @@ ATOMIC_CAS(ulong, ulong_t) ATOMIC_CAS(64, uint64_t) void * -atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) +atomic_cas_ptr(volatile void *target, void *exp, void *des) { - void *old; - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - old = *(void **)target; - if (old == arg1) - *(void **)target = arg2; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - - return (old); + __atomic_compare_exchange_n((void **)target, &exp, des, B_FALSE, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + return (exp); } @@ -365,12 +294,7 @@ atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) #define ATOMIC_SWAP(name, type) \ type atomic_swap_##name(volatile type *target, type bits) \ { \ - type old; \ - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ - old = *target; \ - *target = bits; \ - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ - return (old); \ + return (__atomic_exchange_n(target, bits, __ATOMIC_SEQ_CST)); \ } ATOMIC_SWAP(8, uint8_t) @@ -386,115 +310,59 @@ ATOMIC_SWAP(64, uint64_t) void * atomic_swap_ptr(volatile void *target, void *bits) { - void *old; - - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - old = *(void **)target; - *(void **)target = bits; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - - return (old); + return (__atomic_exchange_n((void **)target, bits, __ATOMIC_SEQ_CST)); } +#ifndef _LP64 +uint64_t +atomic_load_64(volatile uint64_t *target) +{ + return 
(__atomic_load_n(target, __ATOMIC_RELAXED)); +} + +void +atomic_store_64(volatile uint64_t *target, uint64_t bits) +{ + return (__atomic_store_n(target, bits, __ATOMIC_RELAXED)); +} +#endif int atomic_set_long_excl(volatile ulong_t *target, uint_t value) { - ulong_t bit; - - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - bit = (1UL << value); - if ((*target & bit) != 0) { - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - return (-1); - } - *target |= bit; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - - return (0); + ulong_t bit = 1UL << value; + ulong_t old = __atomic_fetch_or(target, bit, __ATOMIC_SEQ_CST); + return ((old & bit) ? -1 : 0); } int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) { - ulong_t bit; - - VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); - bit = (1UL << value); - if ((*target & bit) != 0) { - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - return (-1); - } - *target &= ~bit; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - - return (0); + ulong_t bit = 1UL << value; + ulong_t old = __atomic_fetch_and(target, ~bit, __ATOMIC_SEQ_CST); + return ((old & bit) ? 0 : -1); } void membar_enter(void) { - /* XXX - Implement me */ + __atomic_thread_fence(__ATOMIC_SEQ_CST); } void membar_exit(void) { - /* XXX - Implement me */ + __atomic_thread_fence(__ATOMIC_SEQ_CST); } void membar_producer(void) { - /* XXX - Implement me */ + __atomic_thread_fence(__ATOMIC_RELEASE); } void membar_consumer(void) { - /* XXX - Implement me */ -} - -/* Legacy kernel interfaces; they will go away (eventually). 
*/ - -uint8_t -cas8(uint8_t *target, uint8_t arg1, uint8_t arg2) -{ - return (atomic_cas_8(target, arg1, arg2)); -} - -uint32_t -cas32(uint32_t *target, uint32_t arg1, uint32_t arg2) -{ - return (atomic_cas_32(target, arg1, arg2)); -} - -uint64_t -cas64(uint64_t *target, uint64_t arg1, uint64_t arg2) -{ - return (atomic_cas_64(target, arg1, arg2)); -} - -ulong_t -caslong(ulong_t *target, ulong_t arg1, ulong_t arg2) -{ - return (atomic_cas_ulong(target, arg1, arg2)); -} - -void * -casptr(void *target, void *arg1, void *arg2) -{ - return (atomic_cas_ptr(target, arg1, arg2)); -} - -void -atomic_and_long(ulong_t *target, ulong_t bits) -{ - return (atomic_and_ulong(target, bits)); -} - -void -atomic_or_long(ulong_t *target, ulong_t bits) -{ - return (atomic_or_ulong(target, bits)); + __atomic_thread_fence(__ATOMIC_ACQUIRE); } diff --git a/lib/libspl/getexecname.c b/lib/libspl/getexecname.c index c21a110ad5..dca7162034 100644 --- a/lib/libspl/getexecname.c +++ b/lib/libspl/getexecname.c @@ -25,32 +25,33 @@ */ +#include +#include +#include #include #include -#include -#include +#include "libspl_impl.h" + const char * getexecname(void) { static char execname[PATH_MAX + 1] = ""; static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; - char *ptr = NULL; + + char *ptr = execname; ssize_t rc; (void) pthread_mutex_lock(&mtx); if (strlen(execname) == 0) { - rc = readlink("/proc/self/exe", - execname, sizeof (execname) - 1); + rc = getexecname_impl(execname); if (rc == -1) { execname[0] = '\0'; + ptr = NULL; } else { execname[rc] = '\0'; - ptr = execname; } - } else { - ptr = execname; } (void) pthread_mutex_unlock(&mtx); diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am index 842a8fbb39..9ca08b2bc0 100644 --- a/lib/libspl/include/Makefile.am +++ b/lib/libspl/include/Makefile.am @@ -1,23 +1,22 @@ -SUBDIRS = ia32 rpc sys util +SUBDIRS = ia32 rpc sys util os libspldir = $(includedir)/libspl libspl_HEADERS = \ - $(top_srcdir)/lib/libspl/include/assert.h \ 
- $(top_srcdir)/lib/libspl/include/atomic.h \ - $(top_srcdir)/lib/libspl/include/devid.h \ - $(top_srcdir)/lib/libspl/include/libdevinfo.h \ - $(top_srcdir)/lib/libspl/include/libgen.h \ - $(top_srcdir)/lib/libspl/include/libshare.h \ - $(top_srcdir)/lib/libspl/include/limits.h \ - $(top_srcdir)/lib/libspl/include/locale.h \ - $(top_srcdir)/lib/libspl/include/statcommon.h \ - $(top_srcdir)/lib/libspl/include/stdio.h \ - $(top_srcdir)/lib/libspl/include/stdlib.h \ - $(top_srcdir)/lib/libspl/include/string.h \ - $(top_srcdir)/lib/libspl/include/stropts.h \ - $(top_srcdir)/lib/libspl/include/thread.h \ - $(top_srcdir)/lib/libspl/include/tzfile.h \ - $(top_srcdir)/lib/libspl/include/ucred.h \ - $(top_srcdir)/lib/libspl/include/umem.h \ - $(top_srcdir)/lib/libspl/include/unistd.h \ - $(top_srcdir)/lib/libspl/include/zone.h + assert.h \ + atomic.h \ + libdevinfo.h \ + libgen.h \ + libshare.h \ + limits.h \ + locale.h \ + statcommon.h \ + stdio.h \ + stdlib.h \ + string.h \ + stropts.h \ + thread.h \ + tzfile.h \ + ucred.h \ + umem.h \ + unistd.h \ + zone.h diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index f615fbdfe7..84dbccdc4a 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -33,26 +33,18 @@ #include #include +/* Set to non-zero to avoid abort()ing on an assertion failure */ +extern int libspl_assert_ok; + +/* printf version of libspl_assert */ +extern void libspl_assertf(const char *file, const char *func, int line, + const char *format, ...); + static inline int libspl_assert(const char *buf, const char *file, const char *func, int line) { - fprintf(stderr, "%s\n", buf); - fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func); - abort(); -} - -/* printf version of libspl_assert */ -static inline void -libspl_assertf(const char *file, const char *func, int line, char *format, ...) 
-{ - va_list args; - - va_start(args, format); - vfprintf(stderr, format, args); - fprintf(stderr, "\n"); - fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func); - va_end(args); - abort(); + libspl_assertf(file, func, line, "%s", buf); + return (0); } #ifdef verify @@ -135,7 +127,6 @@ do { \ #define ASSERT0(x) ((void)0) #define ASSERT(x) ((void)0) #define assert(x) ((void)0) -#define ASSERTV(x) #define IMPLY(A, B) ((void)0) #define EQUIV(A, B) ((void)0) #else @@ -146,7 +137,6 @@ do { \ #define ASSERT0 VERIFY0 #define ASSERT VERIFY #define assert VERIFY -#define ASSERTV(x) x #define IMPLY(A, B) \ ((void)(((!(A)) || (B)) || \ libspl_assert("(" #A ") implies (" #B ")", \ diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h index 7072a11bdb..8dd1d654a4 100644 --- a/lib/libspl/include/atomic.h +++ b/lib/libspl/include/atomic.h @@ -79,7 +79,7 @@ extern void atomic_add_64(volatile uint64_t *, int64_t); #endif /* - * Substract delta from target + * Subtract delta from target */ extern void atomic_sub_8(volatile uint8_t *, int8_t); extern void atomic_sub_char(volatile uchar_t *, signed char); @@ -173,7 +173,7 @@ extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t); #endif /* - * Substract delta from target + * Subtract delta from target */ extern uint8_t atomic_sub_8_nv(volatile uint8_t *, int8_t); extern uchar_t atomic_sub_char_nv(volatile uchar_t *, signed char); @@ -245,6 +245,49 @@ extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t); extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t); #endif +/* + * Atomically read variable. 
+ */ +#define atomic_load_char(p) (*(volatile uchar_t *)(p)) +#define atomic_load_short(p) (*(volatile ushort_t *)(p)) +#define atomic_load_int(p) (*(volatile uint_t *)(p)) +#define atomic_load_long(p) (*(volatile ulong_t *)(p)) +#define atomic_load_ptr(p) (*(volatile __typeof(*p) *)(p)) +#define atomic_load_8(p) (*(volatile uint8_t *)(p)) +#define atomic_load_16(p) (*(volatile uint16_t *)(p)) +#define atomic_load_32(p) (*(volatile uint32_t *)(p)) +#ifdef _LP64 +#define atomic_load_64(p) (*(volatile uint64_t *)(p)) +#elif defined(_INT64_TYPE) +extern uint64_t atomic_load_64(volatile uint64_t *); +#endif + +/* + * Atomically write variable. + */ +#define atomic_store_char(p, v) \ + (*(volatile uchar_t *)(p) = (uchar_t)(v)) +#define atomic_store_short(p, v) \ + (*(volatile ushort_t *)(p) = (ushort_t)(v)) +#define atomic_store_int(p, v) \ + (*(volatile uint_t *)(p) = (uint_t)(v)) +#define atomic_store_long(p, v) \ + (*(volatile ulong_t *)(p) = (ulong_t)(v)) +#define atomic_store_ptr(p, v) \ + (*(volatile __typeof(*p) *)(p) = (v)) +#define atomic_store_8(p, v) \ + (*(volatile uint8_t *)(p) = (uint8_t)(v)) +#define atomic_store_16(p, v) \ + (*(volatile uint16_t *)(p) = (uint16_t)(v)) +#define atomic_store_32(p, v) \ + (*(volatile uint32_t *)(p) = (uint32_t)(v)) +#ifdef _LP64 +#define atomic_store_64(p, v) \ + (*(volatile uint64_t *)(p) = (uint64_t)(v)) +#elif defined(_INT64_TYPE) +extern void atomic_store_64(volatile uint64_t *, uint64_t); +#endif + /* * Perform an exclusive atomic bit set/clear on a target. 
* Returns 0 if bit was successfully set/cleared, or -1 diff --git a/lib/libspl/include/ia32/sys/Makefile.am b/lib/libspl/include/ia32/sys/Makefile.am index c8136ee2ad..683288460c 100644 --- a/lib/libspl/include/ia32/sys/Makefile.am +++ b/lib/libspl/include/ia32/sys/Makefile.am @@ -1,3 +1,3 @@ libspldir = $(includedir)/libspl/ia32/sys libspl_HEADERS = \ - $(top_srcdir)/lib/libspl/include/ia32/sys/asm_linkage.h + asm_linkage.h diff --git a/lib/libspl/include/libshare.h b/lib/libspl/include/libshare.h index 4016ff0314..5d06b163a3 100644 --- a/lib/libspl/include/libshare.h +++ b/lib/libspl/include/libshare.h @@ -22,13 +22,10 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. */ #ifndef _LIBSPL_LIBSHARE_H -#define _LIBSPL_LIBSHARE_H - -typedef void *sa_handle_t; /* opaque handle to access core functions */ -typedef void *sa_group_t; -typedef void *sa_share_t; +#define _LIBSPL_LIBSHARE_H extern __attribute__((visibility("default"))) /* API Initialization */ #define SA_INIT_SHARE_API 0x0001 /* init share specific interface */ @@ -74,23 +71,16 @@ typedef void *sa_share_t; #define SA_SHARE_EXISTS 33 /* path or file is already shared */ /* initialization */ -extern sa_handle_t sa_init(int); -extern void sa_fini(sa_handle_t); -extern char *sa_errorstr(int); +_LIBSPL_LIBSHARE_H char *sa_errorstr(int); /* share control */ -extern sa_share_t sa_find_share(sa_handle_t, char *); -extern int sa_enable_share(sa_group_t, char *); -extern int sa_disable_share(sa_share_t, char *); +_LIBSPL_LIBSHARE_H int sa_enable_share(const char *, const char *, const char *, + char *); +_LIBSPL_LIBSHARE_H int sa_disable_share(const char *, char *); +_LIBSPL_LIBSHARE_H boolean_t sa_is_shared(const char *, char *); +_LIBSPL_LIBSHARE_H void sa_commit_shares(const char *); /* protocol specific interfaces */ -extern int sa_parse_legacy_options(sa_group_t, char *, char *); - -/* ZFS functions 
*/ -extern boolean_t sa_needs_refresh(sa_handle_t handle); -libzfs_handle_t *sa_get_zfs_handle(sa_handle_t handle); -extern int sa_zfs_process_share(sa_handle_t handle, sa_group_t group, - sa_share_t share, char *mountpoint, char *proto, zprop_source_t source, - char *shareopts, char *sourcestr, char *dataset); +_LIBSPL_LIBSHARE_H int sa_validate_shareopts(char *, char *); #endif /* _LIBSPL_LIBSHARE_H */ diff --git a/lib/libspl/include/limits.h b/lib/libspl/include/limits.h index 1a42cfec46..5d996eb846 100644 --- a/lib/libspl/include/limits.h +++ b/lib/libspl/include/limits.h @@ -25,16 +25,21 @@ */ #include_next +#include #ifndef _LIBSPL_LIMITS_H #define _LIBSPL_LIMITS_H +#ifndef DBL_DIG #define DBL_DIG 15 #define DBL_MAX 1.7976931348623157081452E+308 #define DBL_MIN 2.2250738585072013830903E-308 +#endif +#ifndef FLT_DIG #define FLT_DIG 6 #define FLT_MAX 3.4028234663852885981170E+38F #define FLT_MIN 1.1754943508222875079688E-38F +#endif #endif /* _LIBSPL_LIMITS_H */ diff --git a/lib/libspl/include/os/Makefile.am b/lib/libspl/include/os/Makefile.am new file mode 100644 index 0000000000..7b362e02ad --- /dev/null +++ b/lib/libspl/include/os/Makefile.am @@ -0,0 +1,7 @@ +if BUILD_FREEBSD +SUBDIRS = freebsd +endif + +if BUILD_LINUX +SUBDIRS = linux +endif diff --git a/lib/libspl/include/os/freebsd/Makefile.am b/lib/libspl/include/os/freebsd/Makefile.am new file mode 100644 index 0000000000..f06325ee3e --- /dev/null +++ b/lib/libspl/include/os/freebsd/Makefile.am @@ -0,0 +1,5 @@ +SUBDIRS = sys + +libspldir = $(includedir)/libspl +libspl_HEADERS = \ + fcntl.h diff --git a/lib/libspl/include/os/freebsd/fcntl.h b/lib/libspl/include/os/freebsd/fcntl.h new file mode 100644 index 0000000000..26d571ad89 --- /dev/null +++ b/lib/libspl/include/os/freebsd/fcntl.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021 iXsystems, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _LIBSPL_FCNTL_H_ +#define _LIBSPL_FCNTL_H_ + +#include_next + +#include + +#endif /* _LIBSPL_FCNTL_H_ */ diff --git a/lib/libspl/include/os/freebsd/sys/Makefile.am b/lib/libspl/include/os/freebsd/sys/Makefile.am new file mode 100644 index 0000000000..7a85460807 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/Makefile.am @@ -0,0 +1,12 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + byteorder.h \ + fcntl.h \ + file.h \ + mnttab.h \ + mount.h \ + param.h \ + stat.h \ + sysmacros.h \ + vfs.h \ + zfs_context_os.h diff --git a/lib/libspl/include/os/freebsd/sys/byteorder.h b/lib/libspl/include/os/freebsd/sys/byteorder.h new file mode 100644 index 0000000000..cd692d3616 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/byteorder.h @@ -0,0 +1,192 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_BYTEORDER_H +#define _SYS_BYTEORDER_H + +#include +#include +#include +#include + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * macros for conversion between host and (internet) network byte order + */ +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#ifdef _ZFS_BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#ifdef _ZFS_BIG_ENDIAN +static __inline__ uint64_t +htonll(uint64_t n) +{ + return (n); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return (n); +} +#else +static __inline__ uint64_t 
+htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#endif + +/* + * Macros to read unaligned values from a specific byte order to + * native byte order + */ + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#define BE_IN64(xa) \ + (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa)+4)) + +#define LE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define LE_IN16(xa) \ + (((uint16_t)LE_IN8((uint8_t *)(xa) + 1) << 8) | LE_IN8(xa)) + +#define LE_IN32(xa) \ + (((uint32_t)LE_IN16((uint8_t *)(xa) + 2) << 16) | LE_IN16(xa)) + +#define LE_IN64(xa) \ + (((uint64_t)LE_IN32((uint8_t *)(xa) + 4) << 32) | LE_IN32(xa)) + +/* + * Macros to write unaligned values from native byte order to a specific byte + * order. 
+ */ + +#define BE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define BE_OUT16(xa, yv) \ + BE_OUT8((uint8_t *)(xa) + 1, yv); \ + BE_OUT8((uint8_t *)(xa), (yv) >> 8); + +#define BE_OUT32(xa, yv) \ + BE_OUT16((uint8_t *)(xa) + 2, yv); \ + BE_OUT16((uint8_t *)(xa), (yv) >> 16); + +#define BE_OUT64(xa, yv) \ + BE_OUT32((uint8_t *)(xa) + 4, yv); \ + BE_OUT32((uint8_t *)(xa), (yv) >> 32); + +#define LE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define LE_OUT16(xa, yv) \ + LE_OUT8((uint8_t *)(xa), yv); \ + LE_OUT8((uint8_t *)(xa) + 1, (yv) >> 8); + +#define LE_OUT32(xa, yv) \ + LE_OUT16((uint8_t *)(xa), yv); \ + LE_OUT16((uint8_t *)(xa) + 2, (yv) >> 16); + +#define LE_OUT64(xa, yv) \ + LE_OUT32((uint8_t *)(xa), yv); \ + LE_OUT32((uint8_t *)(xa) + 4, (yv) >> 32); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BYTEORDER_H */ diff --git a/lib/libspl/include/os/freebsd/sys/fcntl.h b/lib/libspl/include/os/freebsd/sys/fcntl.h new file mode 100644 index 0000000000..c8a37a1938 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/fcntl.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021 iXsystems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _LIBSPL_SYS_FCNTL_H_ +#define _LIBSPL_SYS_FCNTL_H_ + +#include_next + +#define O_LARGEFILE 0 +#define O_RSYNC 0 + +#ifndef O_DSYNC +#define O_DSYNC 0 +#endif + +#endif /* _LIBSPL_SYS_FCNTL_H_ */ diff --git a/lib/libspl/include/sys/file.h b/lib/libspl/include/os/freebsd/sys/file.h similarity index 92% rename from lib/libspl/include/sys/file.h rename to lib/libspl/include/os/freebsd/sys/file.h index e0752ac25c..27fd2888f3 100644 --- a/lib/libspl/include/sys/file.h +++ b/lib/libspl/include/os/freebsd/sys/file.h @@ -29,15 +29,8 @@ #include_next -#include - -#define FREAD 1 -#define FWRITE 2 -// #define FAPPEND 8 - #define FCREAT O_CREAT #define FTRUNC O_TRUNC -#define FOFFMAX O_LARGEFILE #define FSYNC O_SYNC #define FDSYNC O_DSYNC #define FEXCL O_EXCL diff --git a/lib/libspl/include/os/freebsd/sys/mnttab.h b/lib/libspl/include/os/freebsd/sys/mnttab.h new file mode 100644 index 0000000000..c08349bdf9 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/mnttab.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2006 Ricardo Correia */ + +#ifndef _SYS_MNTTAB_H +#define _SYS_MNTTAB_H + +#include +#include + +#ifdef MNTTAB +#undef MNTTAB +#endif /* MNTTAB */ + +#include +#include +#define MNTTAB _PATH_DEVZERO +#define MS_NOMNTTAB 0x0 +#define MS_RDONLY 0x1 +#define umount2(p, f) unmount(p, f) +#define MNT_LINE_MAX 4108 + +#define MNT_TOOLONG 1 /* entry exceeds MNT_LINE_MAX */ +#define MNT_TOOMANY 2 /* too many fields in line */ +#define MNT_TOOFEW 3 /* too few fields in line */ + +struct mnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; +}; + +/* + * NOTE: fields in extmnttab should match struct mnttab till new fields + * are encountered, this allows hasmntopt to work properly when its arg is + * a pointer to an extmnttab struct cast to a mnttab struct pointer. 
+ */ + +struct extmnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; + uint_t mnt_major; + uint_t mnt_minor; +}; + +struct stat64; +struct statfs; + +extern int getmntany(FILE *fp, struct mnttab *mp, struct mnttab *mpref); +extern int _sol_getmntent(FILE *fp, struct mnttab *mp); +extern int getextmntent(const char *path, struct extmnttab *entry, + struct stat64 *statbuf); +extern void statfs2mnttab(struct statfs *sfs, struct mnttab *mp); +char *hasmntopt(struct mnttab *mnt, char *opt); +int getmntent(FILE *fp, struct mnttab *mp); + +#endif diff --git a/lib/libspl/include/os/freebsd/sys/mount.h b/lib/libspl/include/os/freebsd/sys/mount.h new file mode 100644 index 0000000000..e995185712 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/mount.h @@ -0,0 +1,104 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#ifndef _LIBSPL_SYS_MOUNT_H +#define _LIBSPL_SYS_MOUNT_H + +#undef _SYS_MOUNT_H_ +#include_next + +#include +#include +#include + +#if !defined(BLKGETSIZE64) +#define BLKGETSIZE64 DIOCGMEDIASIZE +#endif + +/* + * Some old glibc headers don't correctly define MS_DIRSYNC and + * instead use the enum name S_WRITE. When using these older + * headers define MS_DIRSYNC to be S_WRITE. + */ +#if !defined(MS_DIRSYNC) +#define MS_DIRSYNC S_WRITE +#endif + +/* + * Some old glibc headers don't correctly define MS_POSIXACL and + * instead leave it undefined. When using these older headers define + * MS_POSIXACL to the reserved value of (1<<16). + */ +#if !defined(MS_POSIXACL) +#define MS_POSIXACL (1<<16) +#endif + +#define MS_NOSUID MNT_NOSUID +#define MS_NOEXEC MNT_NOEXEC +#define MS_NODEV 0 +#define S_WRITE 0 +#define MS_BIND 0 +#define MS_REMOUNT 0 +#define MS_SYNCHRONOUS MNT_SYNCHRONOUS + +#define MS_USERS (MS_NOEXEC|MS_NOSUID|MS_NODEV) +#define MS_OWNER (MS_NOSUID|MS_NODEV) +#define MS_GROUP (MS_NOSUID|MS_NODEV) +#define MS_COMMENT 0 + +/* + * Older glibc headers did not define all the available + * umount2(2) flags. Both MNT_FORCE and MNT_DETACH are supported in the + * kernel back to 2.4.11 so we define them correctly if they are missing. + */ +#ifdef MNT_FORCE +#define MS_FORCE MNT_FORCE +#else +#define MS_FORCE 0x00000001 +#endif /* MNT_FORCE */ + +#ifdef MNT_DETACH +#define MS_DETACH MNT_DETACH +#else +#define MS_DETACH 0x00000002 +#endif /* MNT_DETACH */ + +/* + * Overlay mount is default in Linux, but for solaris/zfs + * compatibility, MS_OVERLAY is defined to explicitly have the user + * provide a flag (-O) to mount over a non empty directory. + */ +#define MS_OVERLAY 0x00000004 + +/* + * MS_CRYPT indicates that encryption keys should be loaded if they are not + * already available. This is not defined in glibc, but it is never seen by + * the kernel so it will not cause any problems. 
+ */ +#define MS_CRYPT 0x00000008 + +#endif /* _LIBSPL_SYS_MOUNT_H */ diff --git a/lib/libspl/include/os/freebsd/sys/param.h b/lib/libspl/include/os/freebsd/sys/param.h new file mode 100644 index 0000000000..cb5260ea3d --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/param.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_PARAM_H +#define _LIBSPL_SYS_PARAM_H + +#include_next +#include + +/* + * File system parameters and macros. + * + * The file system is made out of blocks of at most MAXBSIZE units, + * with smaller units (fragments) only in the last direct block. + * MAXBSIZE primarily determines the size of buffers in the buffer + * pool. It may be made larger without any effect on existing + * file systems; however making it smaller may make some file + * systems unmountable. + * + * Note that the blocked devices are assumed to have DEV_BSIZE + * "sectors" and that fragments must be some multiple of this size. 
+ */ +#define MAXNAMELEN 256 + +#define UID_NOACCESS 60002 /* user ID no access */ + +#define MAXUID UINT32_MAX /* max user id */ +#define MAXPROJID MAXUID /* max project id */ + +#ifdef PAGESIZE +#undef PAGESIZE +#endif /* PAGESIZE */ + +extern size_t spl_pagesize(void); +#define PAGESIZE (spl_pagesize()) + +extern int execvpe(const char *name, char * const argv[], char * const envp[]); + +#endif diff --git a/lib/libspl/include/os/freebsd/sys/stat.h b/lib/libspl/include/os/freebsd/sys/stat.h new file mode 100644 index 0000000000..38c684d62a --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/stat.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBSPL_SYS_STAT_H +#define _LIBSPL_SYS_STAT_H + +#include_next + +/* Note: this file can be used on linux/macOS when bootstrapping tools. 
*/ + +#if defined(__FreeBSD__) +#include /* for BLKGETSIZE64 */ + +#define stat64 stat + +#define MAXOFFSET_T OFF_MAX + +#ifndef _KERNEL +#include + +static __inline int +fstat64(int fd, struct stat *sb) +{ + int ret; + + ret = fstat(fd, sb); + if (ret == 0) { + if (S_ISCHR(sb->st_mode)) + (void) ioctl(fd, DIOCGMEDIASIZE, &sb->st_size); + } + return (ret); +} +#endif + +/* + * Emulate Solaris' behavior of returning the block device size in fstat64(). + */ +static inline int +fstat64_blk(int fd, struct stat64 *st) +{ + if (fstat64(fd, st) == -1) + return (-1); + + /* In Linux we need to use an ioctl to get the size of a block device */ + if (S_ISBLK(st->st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &st->st_size) != 0) + return (-1); + } + + return (0); +} +#endif /* defined(__FreeBSD__) */ + +/* + * Only Intel-based Macs have a separate stat64; Arm-based Macs are like + * FreeBSD and have a full 64-bit stat from the start. + */ +#if defined(__APPLE__) && !(defined(__i386__) || defined(__x86_64__)) +#define stat64 stat +#define fstat64 fstat +#endif + +#endif /* _LIBSPL_SYS_STAT_H */ diff --git a/lib/libspl/include/os/freebsd/sys/sysmacros.h b/lib/libspl/include/os/freebsd/sys/sysmacros.h new file mode 100644 index 0000000000..d9639d27b6 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/sysmacros.h @@ -0,0 +1 @@ +/* keep me */ diff --git a/lib/libspl/include/os/freebsd/sys/vfs.h b/lib/libspl/include/os/freebsd/sys/vfs.h new file mode 100644 index 0000000000..55eb3c23b2 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/vfs.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef ZFS_SYS_VFS_H_ +#define ZFS_SYS_VFS_H_ + +#include_next + +int fsshare(const char *, const char *, const char *); +int fsunshare(const char *, const char *); + +#endif /* !ZFS_SYS_VFS_H_ */ diff --git a/lib/libspl/include/os/freebsd/sys/zfs_context_os.h b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h new file mode 100644 index 0000000000..b9bf487c2a --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef ZFS_CONTEXT_OS_H_ +#define ZFS_CONTEXT_OS_H_ + +#define HAVE_LARGE_STACKS 1 +#define ZFS_EXPORTS_PATH "/etc/zfs/exports" + +#endif diff --git a/lib/libspl/include/os/linux/Makefile.am b/lib/libspl/include/os/linux/Makefile.am new file mode 100644 index 0000000000..081839c48c --- /dev/null +++ b/lib/libspl/include/os/linux/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/lib/libspl/include/os/linux/sys/Makefile.am b/lib/libspl/include/os/linux/sys/Makefile.am new file mode 100644 index 0000000000..1ec07a76d3 --- /dev/null +++ b/lib/libspl/include/os/linux/sys/Makefile.am @@ -0,0 +1,10 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + byteorder.h \ + errno.h \ + mnttab.h \ + mount.h \ + param.h \ + stat.h \ + sysmacros.h \ + zfs_context_os.h diff --git a/lib/libspl/include/sys/byteorder.h b/lib/libspl/include/os/linux/sys/byteorder.h similarity index 97% rename from lib/libspl/include/sys/byteorder.h rename to 
lib/libspl/include/os/linux/sys/byteorder.h index 72d40b1643..d5ee3e26f5 100644 --- a/lib/libspl/include/sys/byteorder.h +++ b/lib/libspl/include/os/linux/sys/byteorder.h @@ -40,16 +40,14 @@ #ifndef _SYS_BYTEORDER_H #define _SYS_BYTEORDER_H - - -#include -#include - #if defined(__GNUC__) && defined(_ASM_INLINES) && \ (defined(__i386) || defined(__amd64)) #include #endif +#include +#include + #ifdef __cplusplus extern "C" { #endif @@ -58,7 +56,7 @@ extern "C" { * macros for conversion between host and (internet) network byte order */ -#if defined(_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) +#if defined(_ZFS_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) /* big-endian */ #define ntohl(x) (x) #define ntohs(x) (x) @@ -108,7 +106,7 @@ extern in_port_t ntohs(in_port_t); /* * Macros to convert from a specific byte order to/from native byte order */ -#ifdef _BIG_ENDIAN +#ifdef _ZFS_BIG_ENDIAN #define BE_8(x) BMASK_8(x) #define BE_16(x) BMASK_16(x) #define BE_32(x) BMASK_32(x) @@ -128,7 +126,7 @@ extern in_port_t ntohs(in_port_t); #define BE_64(x) BSWAP_64(x) #endif -#ifdef _BIG_ENDIAN +#ifdef _ZFS_BIG_ENDIAN static __inline__ uint64_t htonll(uint64_t n) { diff --git a/lib/libspl/include/sys/errno.h b/lib/libspl/include/os/linux/sys/errno.h similarity index 84% rename from lib/libspl/include/sys/errno.h rename to lib/libspl/include/os/linux/sys/errno.h index e8bfbe3538..30d20ab895 100644 --- a/lib/libspl/include/sys/errno.h +++ b/lib/libspl/include/os/linux/sys/errno.h @@ -31,5 +31,16 @@ */ #ifndef _LIBSPL_SYS_ERRNO_H #define _LIBSPL_SYS_ERRNO_H + #include +/* + * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent + * graveyard) to indicate checksum errors and fragmentation. 
+ */ +#define ECKSUM EBADE +#define EFRAGS EBADR + +/* Similar for ENOTACTIVE */ +#define ENOTACTIVE ENOANO + #endif /* _LIBSPL_SYS_ERRNO_H */ diff --git a/lib/libspl/include/sys/mnttab.h b/lib/libspl/include/os/linux/sys/mnttab.h similarity index 93% rename from lib/libspl/include/sys/mnttab.h rename to lib/libspl/include/os/linux/sys/mnttab.h index 026a8fa7be..1957293d5c 100644 --- a/lib/libspl/include/sys/mnttab.h +++ b/lib/libspl/include/os/linux/sys/mnttab.h @@ -32,6 +32,7 @@ #include #include +#include #include #ifdef MNTTAB @@ -39,7 +40,7 @@ #endif /* MNTTAB */ #define MNTTAB "/proc/self/mounts" -#define MNT_LINE_MAX 4096 +#define MNT_LINE_MAX 4108 #define MNT_TOOLONG 1 /* entry exceeds MNT_LINE_MAX */ #define MNT_TOOMANY 2 /* too many fields in line */ @@ -67,10 +68,12 @@ struct extmnttab { uint_t mnt_minor; }; +struct statfs; + extern int getmntany(FILE *fp, struct mnttab *mp, struct mnttab *mpref); extern int _sol_getmntent(FILE *fp, struct mnttab *mp); -extern int getextmntent(FILE *fp, struct extmnttab *mp, int len); - +extern int getextmntent(const char *path, struct extmnttab *mp, + struct stat64 *statbuf); static inline char *_sol_hasmntopt(struct mnttab *mnt, char *opt) { struct mntent mnt_new; diff --git a/lib/libspl/include/sys/mount.h b/lib/libspl/include/os/linux/sys/mount.h similarity index 100% rename from lib/libspl/include/sys/mount.h rename to lib/libspl/include/os/linux/sys/mount.h diff --git a/lib/libspl/include/sys/param.h b/lib/libspl/include/os/linux/sys/param.h similarity index 96% rename from lib/libspl/include/sys/param.h rename to lib/libspl/include/os/linux/sys/param.h index c22d508f9b..26335187fd 100644 --- a/lib/libspl/include/sys/param.h +++ b/lib/libspl/include/os/linux/sys/param.h @@ -37,7 +37,7 @@ * with smaller units (fragments) only in the last direct block. * MAXBSIZE primarily determines the size of buffers in the buffer * pool. 
It may be made larger without any effect on existing - * file systems; however making it smaller make make some file + * file systems; however making it smaller may make some file * systems unmountable. * * Note that the blocked devices are assumed to have DEV_BSIZE diff --git a/lib/libspl/include/sys/stat.h b/lib/libspl/include/os/linux/sys/stat.h similarity index 100% rename from lib/libspl/include/sys/stat.h rename to lib/libspl/include/os/linux/sys/stat.h diff --git a/lib/libspl/include/sys/sysmacros.h b/lib/libspl/include/os/linux/sys/sysmacros.h similarity index 99% rename from lib/libspl/include/sys/sysmacros.h rename to lib/libspl/include/os/linux/sys/sysmacros.h index 22fcb04b94..31f347c6fd 100644 --- a/lib/libspl/include/sys/sysmacros.h +++ b/lib/libspl/include/os/linux/sys/sysmacros.h @@ -98,6 +98,4 @@ #define offsetof(s, m) ((size_t)(&(((s *)0)->m))) #endif -#define _NOTE(x) - #endif /* _LIBSPL_SYS_SYSMACROS_H */ diff --git a/lib/libspl/include/sys/bitmap.h b/lib/libspl/include/os/linux/sys/zfs_context_os.h similarity index 83% rename from lib/libspl/include/sys/bitmap.h rename to lib/libspl/include/os/linux/sys/zfs_context_os.h index 95122ab8b4..81ced52077 100644 --- a/lib/libspl/include/sys/bitmap.h +++ b/lib/libspl/include/os/linux/sys/zfs_context_os.h @@ -19,12 +19,10 @@ * * CDDL HEADER END */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ -#ifndef _LIBSPL_SYS_BITMAP_H -#define _LIBSPL_SYS_BITMAP_H +#ifndef ZFS_CONTEXT_OS_H +#define ZFS_CONTEXT_OS_H + +#define HAVE_LARGE_STACKS 1 #endif diff --git a/lib/libspl/include/rpc/Makefile.am b/lib/libspl/include/rpc/Makefile.am index 78ee5a29ef..7fe1d7fea4 100644 --- a/lib/libspl/include/rpc/Makefile.am +++ b/lib/libspl/include/rpc/Makefile.am @@ -1,3 +1,3 @@ libspldir = $(includedir)/libspl/rpc libspl_HEADERS = \ - $(top_srcdir)/lib/libspl/include/rpc/xdr.h + xdr.h diff --git a/lib/libspl/include/rpc/xdr.h b/lib/libspl/include/rpc/xdr.h index 27e4395c73..51d71f693b 100644 --- a/lib/libspl/include/rpc/xdr.h +++ b/lib/libspl/include/rpc/xdr.h @@ -40,10 +40,13 @@ #define XDR_GET_BYTES_AVAIL 1 -typedef struct xdr_bytesrec { +#ifndef HAVE_XDR_BYTESREC +struct xdr_bytesrec { bool_t xc_is_last_record; size_t xc_num_avail; -} xdr_bytesrec_t; +}; +#endif +typedef struct xdr_bytesrec xdr_bytesrec_t; /* * This functionality is not required and is disabled in user space. diff --git a/lib/libspl/include/sys/Makefile.am b/lib/libspl/include/sys/Makefile.am index e7af317e0c..6816a01253 100644 --- a/lib/libspl/include/sys/Makefile.am +++ b/lib/libspl/include/sys/Makefile.am @@ -2,52 +2,47 @@ SUBDIRS = dktp libspldir = $(includedir)/libspl/sys libspl_HEADERS = \ - $(top_srcdir)/lib/libspl/include/sys/acl.h \ - $(top_srcdir)/lib/libspl/include/sys/acl_impl.h \ - $(top_srcdir)/lib/libspl/include/sys/bitmap.h \ - $(top_srcdir)/lib/libspl/include/sys/byteorder.h \ - $(top_srcdir)/lib/libspl/include/sys/callb.h \ - $(top_srcdir)/lib/libspl/include/sys/cmn_err.h \ - $(top_srcdir)/lib/libspl/include/sys/cred.h \ - $(top_srcdir)/lib/libspl/include/sys/debug.h \ - $(top_srcdir)/lib/libspl/include/sys/dkio.h \ - $(top_srcdir)/lib/libspl/include/sys/dklabel.h \ - $(top_srcdir)/lib/libspl/include/sys/errno.h \ - $(top_srcdir)/lib/libspl/include/sys/feature_tests.h \ - $(top_srcdir)/lib/libspl/include/sys/file.h \ - $(top_srcdir)/lib/libspl/include/sys/int_limits.h \ - 
$(top_srcdir)/lib/libspl/include/sys/int_types.h \ - $(top_srcdir)/lib/libspl/include/sys/inttypes.h \ - $(top_srcdir)/lib/libspl/include/sys/isa_defs.h \ - $(top_srcdir)/lib/libspl/include/sys/kmem.h \ - $(top_srcdir)/lib/libspl/include/sys/kstat.h \ - $(top_srcdir)/lib/libspl/include/sys/list.h \ - $(top_srcdir)/lib/libspl/include/sys/list_impl.h \ - $(top_srcdir)/lib/libspl/include/sys/mhd.h \ - $(top_srcdir)/lib/libspl/include/sys/mkdev.h \ - $(top_srcdir)/lib/libspl/include/sys/mnttab.h \ - $(top_srcdir)/lib/libspl/include/sys/mount.h \ - $(top_srcdir)/lib/libspl/include/sys/param.h \ - $(top_srcdir)/lib/libspl/include/sys/policy.h \ - $(top_srcdir)/lib/libspl/include/sys/poll.h \ - $(top_srcdir)/lib/libspl/include/sys/priv.h \ - $(top_srcdir)/lib/libspl/include/sys/processor.h \ - $(top_srcdir)/lib/libspl/include/sys/signal.h \ - $(top_srcdir)/lib/libspl/include/sys/stack.h \ - $(top_srcdir)/lib/libspl/include/sys/stat.h \ - $(top_srcdir)/lib/libspl/include/sys/stdtypes.h \ - $(top_srcdir)/lib/libspl/include/sys/strings.h \ - $(top_srcdir)/lib/libspl/include/sys/stropts.h \ - $(top_srcdir)/lib/libspl/include/sys/sunddi.h \ - $(top_srcdir)/lib/libspl/include/sys/sysmacros.h \ - $(top_srcdir)/lib/libspl/include/sys/systeminfo.h \ - $(top_srcdir)/lib/libspl/include/sys/time.h \ - $(top_srcdir)/lib/libspl/include/sys/types32.h \ - $(top_srcdir)/lib/libspl/include/sys/types.h \ - $(top_srcdir)/lib/libspl/include/sys/tzfile.h \ - $(top_srcdir)/lib/libspl/include/sys/uio.h \ - $(top_srcdir)/lib/libspl/include/sys/va_list.h \ - $(top_srcdir)/lib/libspl/include/sys/varargs.h \ - $(top_srcdir)/lib/libspl/include/sys/vnode.h \ - $(top_srcdir)/lib/libspl/include/sys/vtoc.h \ - $(top_srcdir)/lib/libspl/include/sys/zone.h + acl.h \ + acl_impl.h \ + callb.h \ + cmn_err.h \ + cred.h \ + debug.h \ + dkio.h \ + dklabel.h \ + feature_tests.h \ + int_limits.h \ + int_types.h \ + inttypes.h \ + isa_defs.h \ + kmem.h \ + kstat.h \ + list.h \ + list_impl.h \ + mhd.h \ + mkdev.h \ + 
policy.h \ + poll.h \ + priv.h \ + processor.h \ + sha2.h \ + simd.h \ + stack.h \ + stdtypes.h \ + strings.h \ + stropts.h \ + sunddi.h \ + systeminfo.h \ + time.h \ + trace_spl.h \ + trace_zfs.h \ + types32.h \ + types.h \ + tzfile.h \ + uio.h \ + va_list.h \ + varargs.h \ + vnode.h \ + vtoc.h \ + wmsum.h \ + zone.h diff --git a/lib/libspl/include/sys/acl.h b/lib/libspl/include/sys/acl.h index e6df864f85..31168421b0 100644 --- a/lib/libspl/include/sys/acl.h +++ b/lib/libspl/include/sys/acl.h @@ -19,8 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2014 Garrett D'Amore + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 RackTop Systems. */ #ifndef _SYS_ACL_H @@ -75,23 +79,24 @@ typedef struct acl_info acl_t; /* * The following are defined for ace_t. */ -#define ACE_READ_DATA 0x00000001 -#define ACE_LIST_DIRECTORY 0x00000001 -#define ACE_WRITE_DATA 0x00000002 -#define ACE_ADD_FILE 0x00000002 -#define ACE_APPEND_DATA 0x00000004 -#define ACE_ADD_SUBDIRECTORY 0x00000004 -#define ACE_READ_NAMED_ATTRS 0x00000008 -#define ACE_WRITE_NAMED_ATTRS 0x00000010 -#define ACE_EXECUTE 0x00000020 -#define ACE_DELETE_CHILD 0x00000040 -#define ACE_READ_ATTRIBUTES 0x00000080 -#define ACE_WRITE_ATTRIBUTES 0x00000100 -#define ACE_DELETE 0x00010000 -#define ACE_READ_ACL 0x00020000 -#define ACE_WRITE_ACL 0x00040000 -#define ACE_WRITE_OWNER 0x00080000 -#define ACE_SYNCHRONIZE 0x00100000 +#define ACE_READ_DATA 0x00000001 /* file: read data */ +#define ACE_LIST_DIRECTORY 0x00000001 /* dir: list files */ +#define ACE_WRITE_DATA 0x00000002 /* file: write data */ +#define ACE_ADD_FILE 0x00000002 /* dir: create file */ +#define ACE_APPEND_DATA 0x00000004 /* file: append data */ +#define ACE_ADD_SUBDIRECTORY 0x00000004 /* dir: create subdir */ +#define ACE_READ_NAMED_ATTRS 0x00000008 /* FILE_READ_EA */ 
+#define ACE_WRITE_NAMED_ATTRS 0x00000010 /* FILE_WRITE_EA */ +#define ACE_EXECUTE 0x00000020 /* file: execute */ +#define ACE_TRAVERSE 0x00000020 /* dir: lookup name */ +#define ACE_DELETE_CHILD 0x00000040 /* dir: unlink child */ +#define ACE_READ_ATTRIBUTES 0x00000080 /* (all) stat, etc. */ +#define ACE_WRITE_ATTRIBUTES 0x00000100 /* (all) utimes, etc. */ +#define ACE_DELETE 0x00010000 /* (all) unlink self */ +#define ACE_READ_ACL 0x00020000 /* (all) getsecattr */ +#define ACE_WRITE_ACL 0x00040000 /* (all) setsecattr */ +#define ACE_WRITE_OWNER 0x00080000 /* (all) chown */ +#define ACE_SYNCHRONIZE 0x00100000 /* (all) */ #define ACE_FILE_INHERIT_ACE 0x0001 #define ACE_DIRECTORY_INHERIT_ACE 0x0002 @@ -116,8 +121,6 @@ typedef struct acl_info acl_t; #define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ ACL_DEFAULTED) -#ifdef _KERNEL - /* * These are only applicable in a CIFS context. */ @@ -137,6 +140,8 @@ typedef struct acl_info acl_t; #define ACE_ALL_TYPES 0x001F +#if defined(_KERNEL) + typedef struct ace_object { uid_t a_who; /* uid or gid */ uint32_t a_access_mask; /* read,write,... 
*/ @@ -154,6 +159,21 @@ typedef struct ace_object { ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ ACE_WRITE_OWNER|ACE_SYNCHRONIZE) +#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + +#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ + ACE_READ_NAMED_ATTRS) + +#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS) + +#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE) + /* * The following flags are supported by both NFSv4 ACLs and ace_t. */ @@ -217,6 +237,7 @@ typedef struct ace_object { #define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */ #define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */ #define ACL_NORESOLVE 0x4 /* don't do name service lookups */ +#define ACL_SID_FMT 0x8 /* use usersid/groupsid when appropriate */ /* * Legacy aclcheck errors for aclent_t ACLs @@ -272,13 +293,8 @@ extern int cmp2acls(void *, void *); #endif /* !defined(_KERNEL) */ -#if defined(__STDC__) extern int acl(const char *path, int cmd, int cnt, void *buf); extern int facl(int fd, int cmd, int cnt, void *buf); -#else /* !__STDC__ */ -extern int acl(); -extern int facl(); -#endif /* defined(__STDC__) */ #ifdef __cplusplus } diff --git a/lib/libspl/include/sys/debug.h b/lib/libspl/include/sys/debug.h index fde4a01207..af18da9480 100644 --- a/lib/libspl/include/sys/debug.h +++ b/lib/libspl/include/sys/debug.h @@ -29,4 +29,12 @@ #include +#ifndef __printflike +#define __printflike(x, y) __attribute__((__format__(__printf__, x, y))) +#endif + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + 
#endif diff --git a/lib/libspl/include/sys/dkio.h b/lib/libspl/include/sys/dkio.h index 2e6b9a1a9d..f3c641f669 100644 --- a/lib/libspl/include/sys/dkio.h +++ b/lib/libspl/include/sys/dkio.h @@ -59,7 +59,6 @@ struct dk_cinfo { uint_t dki_vec; /* interrupt vector */ char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */ uint_t dki_unit; /* unit number */ - uint_t dki_slave; /* slave number */ ushort_t dki_partition; /* partition number */ ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */ }; diff --git a/lib/libspl/include/sys/dklabel.h b/lib/libspl/include/sys/dklabel.h index 95faf2bb4a..8c2ca06c0c 100644 --- a/lib/libspl/include/sys/dklabel.h +++ b/lib/libspl/include/sys/dklabel.h @@ -31,7 +31,6 @@ #include #include -#include #ifdef __cplusplus extern "C" { diff --git a/lib/libspl/include/sys/dktp/Makefile.am b/lib/libspl/include/sys/dktp/Makefile.am index 9887675c46..4ad3695d8a 100644 --- a/lib/libspl/include/sys/dktp/Makefile.am +++ b/lib/libspl/include/sys/dktp/Makefile.am @@ -1,4 +1,4 @@ libspldir = $(includedir)/libspl/sys/dktp libspl_HEADERS = \ - $(top_srcdir)/lib/libspl/include/sys/dktp/fdisk.h + fdisk.h diff --git a/lib/libspl/include/sys/feature_tests.h b/lib/libspl/include/sys/feature_tests.h index 1a68b75f0c..a36fd7b8cf 100644 --- a/lib/libspl/include/sys/feature_tests.h +++ b/lib/libspl/include/sys/feature_tests.h @@ -27,6 +27,15 @@ #ifndef _SYS_FEATURE_TESTS_H #define _SYS_FEATURE_TESTS_H -#define __NORETURN __attribute__((__noreturn__)) +#define ____cacheline_aligned +#define __NORETURN __attribute__((__noreturn__)) + +#if !defined(fallthrough) +#if defined(HAVE_IMPLICIT_FALLTHROUGH) +#define fallthrough __attribute__((__fallthrough__)) +#else +#define fallthrough ((void)0) +#endif +#endif #endif diff --git a/lib/libspl/include/sys/isa_defs.h b/lib/libspl/include/sys/isa_defs.h index 7a90e077e8..8c0932f576 100644 --- a/lib/libspl/include/sys/isa_defs.h +++ b/lib/libspl/include/sys/isa_defs.h @@ -46,12 +46,16 @@ extern "C" { #define 
__x86 #endif +#if defined(_ILP32) +/* x32-specific defines; careful to *not* define _LP64 here */ +#else #if !defined(_LP64) #define _LP64 #endif +#endif -#if !defined(_LITTLE_ENDIAN) -#define _LITTLE_ENDIAN +#if !defined(_ZFS_LITTLE_ENDIAN) +#define _ZFS_LITTLE_ENDIAN #endif #define _SUNOS_VTOC_16 @@ -72,8 +76,8 @@ extern "C" { #define _ILP32 #endif -#if !defined(_LITTLE_ENDIAN) -#define _LITTLE_ENDIAN +#if !defined(_ZFS_LITTLE_ENDIAN) +#define _ZFS_LITTLE_ENDIAN #endif #define _SUNOS_VTOC_16 @@ -103,6 +107,24 @@ extern "C" { #define _SUNOS_VTOC_16 #define HAVE_EFFICIENT_UNALIGNED_ACCESS +#if defined(__BYTE_ORDER) +#if defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN +#define _ZFS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN +#endif +#elif defined(_BYTE_ORDER) +#if defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN +#endif +#elif defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN) +#define _ZFS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) +#define _ZFS_LITTLE_ENDIAN +#endif + /* arm arch specific defines */ #elif defined(__arm) || defined(__arm__) || defined(__aarch64__) @@ -125,9 +147,9 @@ extern "C" { #endif #if defined(__ARMEL__) || defined(__AARCH64EL__) -#define _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN #else -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN #endif #define _SUNOS_VTOC_16 @@ -147,7 +169,7 @@ extern "C" { #define __sparc__ #endif -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN #define _SUNOS_VTOC_16 #if defined(__arch64__) @@ -172,30 +194,49 @@ extern "C" { #endif #endif -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN #define _SUNOS_VTOC_16 /* MIPS arch specific defines */ #elif defined(__mips__) #if defined(__MIPSEB__) -#define _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN #elif defined(__MIPSEL__) -#define _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN #else 
#error MIPS no endian specified #endif -#ifndef _LP64 +#if !defined(_LP64) && !defined(_ILP32) #define _ILP32 #endif #define _SUNOS_VTOC_16 +/* + * RISC-V arch specific defines + * only RV64G (including atomic) LP64 is supported yet + */ +#elif defined(__riscv) && defined(_LP64) && _LP64 && \ + defined(__riscv_atomic) && __riscv_atomic + +#ifndef __riscv__ +#define __riscv__ +#endif + +#ifndef __rv64g__ +#define __rv64g__ +#endif + +#define _ZFS_LITTLE_ENDIAN + +#define _SUNOS_VTOC_16 + #else /* * Currently supported: - * x86_64, i386, arm, powerpc, s390, sparc, and mips + * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, and RV64G */ #error "Unsupported ISA type" #endif @@ -208,12 +249,12 @@ extern "C" { #error "Neither _ILP32 or _LP64 are defined" #endif -#if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -#error "Both _LITTLE_ENDIAN and _BIG_ENDIAN are defined" +#if defined(_ZFS_LITTLE_ENDIAN) && defined(_ZFS_BIG_ENDIAN) +#error "Both _ZFS_LITTLE_ENDIAN and _ZFS_BIG_ENDIAN are defined" #endif -#if !defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -#error "Neither _LITTLE_ENDIAN nor _BIG_ENDIAN are defined" +#if !defined(_ZFS_LITTLE_ENDIAN) && !defined(_ZFS_BIG_ENDIAN) +#error "Neither _ZFS_LITTLE_ENDIAN nor _ZFS_BIG_ENDIAN are defined" #endif #ifdef __cplusplus diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h index 9bd0d949d5..f73fb92eb7 100644 --- a/lib/libspl/include/sys/kstat.h +++ b/lib/libspl/include/sys/kstat.h @@ -82,7 +82,7 @@ typedef struct kstat { void *ks_data; /* kstat type-specific data */ uint_t ks_ndata; /* # of type-specific data records */ size_t ks_data_size; /* total size of kstat data section */ - hrtime_t ks_snaptime; /* time of last data shapshot */ + hrtime_t ks_snaptime; /* time of last data snapshot */ /* * Fields relevant to kernel only */ @@ -796,12 +796,6 @@ extern void kstat_delete_byname(const char *, int, const char *); extern void kstat_delete_byname_zone(const char *, int, const char *, zoneid_t); 
extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); extern void kstat_timer_init(kstat_timer_t *, const char *); -extern void kstat_waitq_enter(kstat_io_t *); -extern void kstat_waitq_exit(kstat_io_t *); -extern void kstat_runq_enter(kstat_io_t *); -extern void kstat_runq_exit(kstat_io_t *); -extern void kstat_waitq_to_runq(kstat_io_t *); -extern void kstat_runq_back_to_waitq(kstat_io_t *); extern void kstat_timer_start(kstat_timer_t *); extern void kstat_timer_stop(kstat_timer_t *); diff --git a/lib/libspl/include/sys/list_impl.h b/lib/libspl/include/sys/list_impl.h index a6614f9a38..b5655b972c 100644 --- a/lib/libspl/include/sys/list_impl.h +++ b/lib/libspl/include/sys/list_impl.h @@ -34,8 +34,8 @@ extern "C" { #endif struct list_node { - struct list_node *list_next; - struct list_node *list_prev; + struct list_node *next; + struct list_node *prev; }; struct list { diff --git a/lib/libspl/include/sys/sha2.h b/lib/libspl/include/sys/sha2.h new file mode 100644 index 0000000000..e2f66d225e --- /dev/null +++ b/lib/libspl/include/sys/sha2.h @@ -0,0 +1,151 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ +/* Copyright 2013 Saso Kiselkov. All rights reserved. */ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA2_HMAC_MIN_KEY_LEN 1 /* SHA2-HMAC min key length in bytes */ +#define SHA2_HMAC_MAX_KEY_LEN INT_MAX /* SHA2-HMAC max key length in bytes */ + +#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ +#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ +#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ + +/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ +#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ +#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ + +#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ +#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* + * SHA2 context. + * The contents of this structure are a private interface between the + * Init/Update/Final calls of the functions defined below. + * Callers must never attempt to read or write any of the fields + * in this structure directly. 
+ */ +typedef struct { + uint32_t algotype; /* Algorithm Type */ + + /* state (ABCDEFGH) */ + union { + uint32_t s32[8]; /* for SHA256 */ + uint64_t s64[8]; /* for SHA384/512 */ + } state; + /* number of bits */ + union { + uint32_t c32[2]; /* for SHA256 , modulo 2^64 */ + uint64_t c64[2]; /* for SHA384/512, modulo 2^128 */ + } count; + union { + uint8_t buf8[128]; /* undigested input */ + uint32_t buf32[32]; /* realigned input */ + uint64_t buf64[16]; /* realigned input */ + } buf_un; +} SHA2_CTX; + +typedef SHA2_CTX SHA256_CTX; +typedef SHA2_CTX SHA384_CTX; +typedef SHA2_CTX SHA512_CTX; + +extern void SHA256Init(SHA256_CTX *); + +extern void SHA256Update(SHA256_CTX *, const void *, size_t); + +extern void SHA256Final(void *, SHA256_CTX *); + +extern void SHA384Init(SHA384_CTX *); + +extern void SHA384Update(SHA384_CTX *, const void *, size_t); + +extern void SHA384Final(void *, SHA384_CTX *); + +extern void SHA512Init(SHA512_CTX *); + +extern void SHA512Update(SHA512_CTX *, const void *, size_t); + +extern void SHA512Final(void *, SHA512_CTX *); + +extern void SHA2Init(uint64_t mech, SHA2_CTX *); + +extern void SHA2Update(SHA2_CTX *, const void *, size_t); + +extern void SHA2Final(void *, SHA2_CTX *); + +#ifdef _SHA2_IMPL +/* + * The following types/functions are all private to the implementation + * of the SHA2 functions and must not be used by consumers of the interface + */ + +/* + * List of support mechanisms in this module. 
+ * + * It is important to note that in the module, division or modulus calculations + * are used on the enumerated type to determine which mechanism is being used; + * therefore, changing the order or additional mechanisms should be done + * carefully + */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#endif /* _SHA2_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h new file mode 100644 index 0000000000..dceedb698f --- /dev/null +++ b/lib/libspl/include/sys/simd.h @@ -0,0 +1,502 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_SIMD_H +#define _LIBSPL_SYS_SIMD_H + +#include +#include + +#if defined(__x86) +#include + +#define kfpu_allowed() 1 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +/* + * CPUID feature tests for user-space. + * + * x86 registers used implicitly by CPUID + */ +typedef enum cpuid_regs { + EAX = 0, + EBX, + ECX, + EDX, + CPUID_REG_CNT = 4 +} cpuid_regs_t; + +/* + * List of instruction sets identified by CPUID + */ +typedef enum cpuid_inst_sets { + SSE = 0, + SSE2, + SSE3, + SSSE3, + SSE4_1, + SSE4_2, + OSXSAVE, + AVX, + AVX2, + BMI1, + BMI2, + AVX512F, + AVX512CD, + AVX512DQ, + AVX512BW, + AVX512IFMA, + AVX512VBMI, + AVX512PF, + AVX512ER, + AVX512VL, + AES, + PCLMULQDQ, + MOVBE +} cpuid_inst_sets_t; + +/* + * Instruction set descriptor. 
+ */ +typedef struct cpuid_feature_desc { + uint32_t leaf; /* CPUID leaf */ + uint32_t subleaf; /* CPUID sub-leaf */ + uint32_t flag; /* bit mask of the feature */ + cpuid_regs_t reg; /* which CPUID return register to test */ +} cpuid_feature_desc_t; + +#define _AVX512F_BIT (1U << 16) +#define _AVX512CD_BIT (_AVX512F_BIT | (1U << 28)) +#define _AVX512DQ_BIT (_AVX512F_BIT | (1U << 17)) +#define _AVX512BW_BIT (_AVX512F_BIT | (1U << 30)) +#define _AVX512IFMA_BIT (_AVX512F_BIT | (1U << 21)) +#define _AVX512VBMI_BIT (1U << 1) /* AVX512F_BIT is on another leaf */ +#define _AVX512PF_BIT (_AVX512F_BIT | (1U << 26)) +#define _AVX512ER_BIT (_AVX512F_BIT | (1U << 27)) +#define _AVX512VL_BIT (1U << 31) /* if used also check other levels */ +#define _AES_BIT (1U << 25) +#define _PCLMULQDQ_BIT (1U << 1) +#define _MOVBE_BIT (1U << 22) + +/* + * Descriptions of supported instruction sets (leaf, subleaf, flag, reg) + */ +static const cpuid_feature_desc_t cpuid_features[] = { + [SSE] = {1U, 0U, 1U << 25, EDX }, + [SSE2] = {1U, 0U, 1U << 26, EDX }, + [SSE3] = {1U, 0U, 1U << 0, ECX }, + [SSSE3] = {1U, 0U, 1U << 9, ECX }, + [SSE4_1] = {1U, 0U, 1U << 19, ECX }, + [SSE4_2] = {1U, 0U, 1U << 20, ECX }, + [OSXSAVE] = {1U, 0U, 1U << 27, ECX }, + [AVX] = {1U, 0U, 1U << 28, ECX }, + [AVX2] = {7U, 0U, 1U << 5, EBX }, + [BMI1] = {7U, 0U, 1U << 3, EBX }, + [BMI2] = {7U, 0U, 1U << 8, EBX }, + [AVX512F] = {7U, 0U, _AVX512F_BIT, EBX }, + [AVX512CD] = {7U, 0U, _AVX512CD_BIT, EBX }, + [AVX512DQ] = {7U, 0U, _AVX512DQ_BIT, EBX }, + [AVX512BW] = {7U, 0U, _AVX512BW_BIT, EBX }, + [AVX512IFMA] = {7U, 0U, _AVX512IFMA_BIT, EBX }, + [AVX512VBMI] = {7U, 0U, _AVX512VBMI_BIT, ECX }, + [AVX512PF] = {7U, 0U, _AVX512PF_BIT, EBX }, + [AVX512ER] = {7U, 0U, _AVX512ER_BIT, EBX }, + [AVX512VL] = {7U, 0U, _AVX512VL_BIT, EBX }, + [AES] = {1U, 0U, _AES_BIT, ECX }, + [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, + [MOVBE] = {1U, 0U, _MOVBE_BIT, ECX }, +}; + +/* + * Check if OS supports AVX and AVX2 by checking XCR0 + * Only call this function if CPUID 
indicates that AVX feature is + * supported by the CPU, otherwise it might be an illegal instruction. + */ +static inline uint64_t +xgetbv(uint32_t index) +{ + uint32_t eax, edx; + /* xgetbv - instruction byte code */ + __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (index)); + + return ((((uint64_t)edx)<<32) | (uint64_t)eax); +} + +/* + * Check if CPU supports a feature + */ +static inline boolean_t +__cpuid_check_feature(const cpuid_feature_desc_t *desc) +{ + uint32_t r[CPUID_REG_CNT]; + + if (__get_cpuid_max(0, NULL) >= desc->leaf) { + /* + * __cpuid_count is needed to properly check + * for AVX2. It is a macro, so return parameters + * are passed by value. + */ + __cpuid_count(desc->leaf, desc->subleaf, + r[EAX], r[EBX], r[ECX], r[EDX]); + return ((r[desc->reg] & desc->flag) == desc->flag); + } + return (B_FALSE); +} + +#define CPUID_FEATURE_CHECK(name, id) \ +static inline boolean_t \ +__cpuid_has_ ## name(void) \ +{ \ + return (__cpuid_check_feature(&cpuid_features[id])); \ +} + +/* + * Define functions for user-space CPUID features testing + */ +CPUID_FEATURE_CHECK(sse, SSE); +CPUID_FEATURE_CHECK(sse2, SSE2); +CPUID_FEATURE_CHECK(sse3, SSE3); +CPUID_FEATURE_CHECK(ssse3, SSSE3); +CPUID_FEATURE_CHECK(sse4_1, SSE4_1); +CPUID_FEATURE_CHECK(sse4_2, SSE4_2); +CPUID_FEATURE_CHECK(avx, AVX); +CPUID_FEATURE_CHECK(avx2, AVX2); +CPUID_FEATURE_CHECK(osxsave, OSXSAVE); +CPUID_FEATURE_CHECK(bmi1, BMI1); +CPUID_FEATURE_CHECK(bmi2, BMI2); +CPUID_FEATURE_CHECK(avx512f, AVX512F); +CPUID_FEATURE_CHECK(avx512cd, AVX512CD); +CPUID_FEATURE_CHECK(avx512dq, AVX512DQ); +CPUID_FEATURE_CHECK(avx512bw, AVX512BW); +CPUID_FEATURE_CHECK(avx512ifma, AVX512IFMA); +CPUID_FEATURE_CHECK(avx512vbmi, AVX512VBMI); +CPUID_FEATURE_CHECK(avx512pf, AVX512PF); +CPUID_FEATURE_CHECK(avx512er, AVX512ER); +CPUID_FEATURE_CHECK(avx512vl, AVX512VL); +CPUID_FEATURE_CHECK(aes, AES); +CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); +CPUID_FEATURE_CHECK(movbe, MOVBE); + 
+/*
+ * Detect register set support
+ */
+static inline boolean_t
+__simd_state_enabled(const uint64_t state)
+{
+	boolean_t has_osxsave;
+	uint64_t xcr0;
+
+	has_osxsave = __cpuid_has_osxsave();
+	if (!has_osxsave)
+		return (B_FALSE);
+
+	xcr0 = xgetbv(0);
+	return ((xcr0 & state) == state);
+}
+
+#define	_XSTATE_SSE_AVX		(0x2 | 0x4)
+#define	_XSTATE_AVX512		(0xE0 | _XSTATE_SSE_AVX)
+
+#define	__ymm_enabled()		__simd_state_enabled(_XSTATE_SSE_AVX)
+#define	__zmm_enabled()		__simd_state_enabled(_XSTATE_AVX512)
+
+/*
+ * Check if SSE instruction set is available
+ */
+static inline boolean_t
+zfs_sse_available(void)
+{
+	return (__cpuid_has_sse());
+}
+
+/*
+ * Check if SSE2 instruction set is available
+ */
+static inline boolean_t
+zfs_sse2_available(void)
+{
+	return (__cpuid_has_sse2());
+}
+
+/*
+ * Check if SSE3 instruction set is available
+ */
+static inline boolean_t
+zfs_sse3_available(void)
+{
+	return (__cpuid_has_sse3());
+}
+
+/*
+ * Check if SSSE3 instruction set is available
+ */
+static inline boolean_t
+zfs_ssse3_available(void)
+{
+	return (__cpuid_has_ssse3());
+}
+
+/*
+ * Check if SSE4.1 instruction set is available
+ */
+static inline boolean_t
+zfs_sse4_1_available(void)
+{
+	return (__cpuid_has_sse4_1());
+}
+
+/*
+ * Check if SSE4.2 instruction set is available
+ */
+static inline boolean_t
+zfs_sse4_2_available(void)
+{
+	return (__cpuid_has_sse4_2());
+}
+
+/*
+ * Check if AVX instruction set is available
+ */
+static inline boolean_t
+zfs_avx_available(void)
+{
+	return (__cpuid_has_avx() && __ymm_enabled());
+}
+
+/*
+ * Check if AVX2 instruction set is available
+ */
+static inline boolean_t
+zfs_avx2_available(void)
+{
+	return (__cpuid_has_avx2() && __ymm_enabled());
+}
+
+/*
+ * Check if BMI1 instruction set is available
+ */
+static inline boolean_t
+zfs_bmi1_available(void)
+{
+	return (__cpuid_has_bmi1());
+}
+
+/*
+ * Check if BMI2 instruction set is available
+ */
+static inline boolean_t
+zfs_bmi2_available(void)
+{
+	return (__cpuid_has_bmi2());
+}
+
+/*
+ * Check if AES instruction set is available
+ */
+static inline boolean_t
+zfs_aes_available(void)
+{
+	return (__cpuid_has_aes());
+}
+
+/*
+ * Check if PCLMULQDQ instruction set is available
+ */
+static inline boolean_t
+zfs_pclmulqdq_available(void)
+{
+	return (__cpuid_has_pclmulqdq());
+}
+
+/*
+ * Check if MOVBE instruction is available
+ */
+static inline boolean_t
+zfs_movbe_available(void)
+{
+	return (__cpuid_has_movbe());
+}
+
+/*
+ * AVX-512 family of instruction sets:
+ *
+ * AVX512F	Foundation
+ * AVX512CD	Conflict Detection Instructions
+ * AVX512ER	Exponential and Reciprocal Instructions
+ * AVX512PF	Prefetch Instructions
+ *
+ * AVX512BW	Byte and Word Instructions
+ * AVX512DQ	Double-word and Quadword Instructions
+ * AVX512VL	Vector Length Extensions
+ *
+ * AVX512IFMA	Integer Fused Multiply Add (Not supported by kernel 4.4)
+ * AVX512VBMI	Vector Byte Manipulation Instructions
+ */
+
+/*
+ * Check if AVX512F instruction set is available
+ */
+static inline boolean_t
+zfs_avx512f_available(void)
+{
+	return (__cpuid_has_avx512f() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512CD instruction set is available
+ */
+static inline boolean_t
+zfs_avx512cd_available(void)
+{
+	return (__cpuid_has_avx512cd() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512ER instruction set is available
+ */
+static inline boolean_t
+zfs_avx512er_available(void)
+{
+	return (__cpuid_has_avx512er() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512PF instruction set is available
+ */
+static inline boolean_t
+zfs_avx512pf_available(void)
+{
+	return (__cpuid_has_avx512pf() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512BW instruction set is available
+ */
+static inline boolean_t
+zfs_avx512bw_available(void)
+{
+	return (__cpuid_has_avx512bw() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512DQ instruction set is available
+ */
+static inline boolean_t
+zfs_avx512dq_available(void)
+{
+	return (__cpuid_has_avx512dq() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512VL instruction set is available
+ */
+static inline boolean_t
+zfs_avx512vl_available(void)
+{
+	return (__cpuid_has_avx512vl() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512IFMA instruction set is available
+ */
+static inline boolean_t
+zfs_avx512ifma_available(void)
+{
+	return (__cpuid_has_avx512ifma() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512VBMI instruction set is available
+ */
+static inline boolean_t
+zfs_avx512vbmi_available(void)
+{
+	return (__cpuid_has_avx512f() && __cpuid_has_avx512vbmi() &&
+	    __zmm_enabled());
+}
+
+#elif defined(__aarch64__)
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+#elif defined(__powerpc__)
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+/*
+ * Check if AltiVec instruction set is available
+ * No easy way beyond 'altivec works' :-(
+ */
+#include <signal.h>
+#include <setjmp.h>
+
+#if defined(__ALTIVEC__) && !defined(__FreeBSD__)
+static jmp_buf env;
+static void sigillhandler(int x)
+{
+	longjmp(env, 1);
+}
+#endif
+
+static inline boolean_t
+zfs_altivec_available(void)
+{
+	boolean_t has_altivec = B_FALSE;
+#if defined(__ALTIVEC__) && !defined(__FreeBSD__)
+	sighandler_t savesig;
+	savesig = signal(SIGILL, sigillhandler);
+	if (setjmp(env)) {
+		signal(SIGILL, savesig);
+		has_altivec = B_FALSE;
+	} else {
+		__asm__ __volatile__("vor 0,0,0\n" : : : "v0");
+		signal(SIGILL, savesig);
+		has_altivec = B_TRUE;
+	}
+#endif
+	return (has_altivec);
+}
+#else
+
+#define	kfpu_allowed()		0
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+#endif
+
+#endif /* _LIBSPL_SYS_SIMD_H */
diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h
index 291f2190a2..c9f6165047 100644
--- a/lib/libspl/include/sys/time.h
+++
b/lib/libspl/include/sys/time.h @@ -88,7 +88,7 @@ gethrestime(inode_timespec_t *ts) ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC; } -static inline time_t +static inline uint64_t gethrestime_sec(void) { struct timeval tv; diff --git a/lib/libspl/include/sys/trace_spl.h b/lib/libspl/include/sys/trace_spl.h new file mode 100644 index 0000000000..b80d288f73 --- /dev/null +++ b/lib/libspl/include/sys/trace_spl.h @@ -0,0 +1,24 @@ +/* Here to keep the libspl build happy */ + +#ifndef _LIBSPL_SPL_TRACE_H +#define _LIBSPL_SPL_TRACE_H + +/* + * The set-error SDT probe is extra static, in that we declare its fake + * function literally, rather than with the DTRACE_PROBE1() macro. This is + * necessary so that SET_ERROR() can evaluate to a value, which wouldn't + * be possible if it required multiple statements (to declare the function + * and then call it). + * + * SET_ERROR() uses the comma operator so that it can be used without much + * additional code. For example, "return (EINVAL);" becomes + * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated + * twice, so it should not have side effects (e.g. something like: + * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). + */ +#undef SET_ERROR +#define SET_ERROR(err) \ + (__set_error(__FILE__, __func__, __LINE__, err), err) + + +#endif diff --git a/lib/libspl/include/sys/trace_zfs.h b/lib/libspl/include/sys/trace_zfs.h new file mode 100644 index 0000000000..87ed5ad3c3 --- /dev/null +++ b/lib/libspl/include/sys/trace_zfs.h @@ -0,0 +1,24 @@ +/* Here to keep the libspl build happy */ + +#ifndef _LIBSPL_ZFS_TRACE_H +#define _LIBSPL_ZFS_TRACE_H + +/* + * The set-error SDT probe is extra static, in that we declare its fake + * function literally, rather than with the DTRACE_PROBE1() macro. This is + * necessary so that SET_ERROR() can evaluate to a value, which wouldn't + * be possible if it required multiple statements (to declare the function + * and then call it). 
+ * + * SET_ERROR() uses the comma operator so that it can be used without much + * additional code. For example, "return (EINVAL);" becomes + * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated + * twice, so it should not have side effects (e.g. something like: + * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). + */ +#undef SET_ERROR +#define SET_ERROR(err) \ + (__set_error(__FILE__, __func__, __LINE__, err), err) + + +#endif diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index 97e8412ef7..81ade54b54 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -40,69 +40,73 @@ #ifndef _LIBSPL_SYS_UIO_H #define _LIBSPL_SYS_UIO_H +#include #include_next +#ifdef __APPLE__ +#include +#endif + +#include typedef struct iovec iovec_t; -typedef enum uio_rw { +#if defined(__linux__) || defined(__APPLE__) +typedef enum zfs_uio_rw { UIO_READ = 0, UIO_WRITE = 1, -} uio_rw_t; +} zfs_uio_rw_t; -typedef enum uio_seg { +typedef enum zfs_uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, - UIO_USERISPACE = 2, -} uio_seg_t; +} zfs_uio_seg_t; -typedef struct uio { +#elif defined(__FreeBSD__) +typedef enum uio_seg zfs_uio_seg_t; +#endif + +typedef struct zfs_uio { struct iovec *uio_iov; /* pointer to array of iovecs */ int uio_iovcnt; /* number of iovecs */ offset_t uio_loffset; /* file offset */ - uio_seg_t uio_segflg; /* address space (kernel or user) */ + zfs_uio_seg_t uio_segflg; /* address space (kernel or user) */ uint16_t uio_fmode; /* file mode flags */ uint16_t uio_extflg; /* extended flags */ - offset_t uio_limit; /* u-limit (maximum byte offset) */ ssize_t uio_resid; /* residual count */ -} uio_t; +} zfs_uio_t; -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY, -} xuio_type_t; +#define zfs_uio_segflg(uio) (uio)->uio_segflg +#define zfs_uio_offset(uio) (uio)->uio_loffset +#define zfs_uio_resid(uio) (uio)->uio_resid +#define zfs_uio_iovcnt(uio) (uio)->uio_iovcnt +#define 
zfs_uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len +#define zfs_uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base -#define UIOA_IOV_MAX 16 +static inline void +zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) +{ + *base = zfs_uio_iovbase(uio, idx); + *len = zfs_uio_iovlen(uio, idx); +} -typedef struct uioa_page_s { /* locked uio_iov state */ - int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */ - void **uioa_ppp; /* page_t or pfn_t arrary */ - caddr_t uioa_base; /* address base */ - size_t uioa_len; /* span length */ -} uioa_page_t; +static inline void +zfs_uio_advance(zfs_uio_t *uio, size_t size) +{ + uio->uio_resid -= size; + uio->uio_loffset += size; +} -typedef struct xuio { - uio_t xu_uio; /* embedded UIO structure */ +static inline offset_t +zfs_uio_index_at_offset(zfs_uio_t *uio, offset_t off, uint_t *vec_idx) +{ + *vec_idx = 0; + while (*vec_idx < (uint_t)zfs_uio_iovcnt(uio) && + off >= (offset_t)zfs_uio_iovlen(uio, *vec_idx)) { + off -= zfs_uio_iovlen(uio, *vec_idx); + (*vec_idx)++; + } - /* Extended uio fields */ - enum xuio_type xu_type; /* uio type */ - union { - struct { - uint32_t xu_a_state; /* state of async i/o */ - ssize_t xu_a_mbytes; /* bytes moved */ - uioa_page_t *xu_a_lcur; /* uioa_locked[] pointer */ - void **xu_a_lppp; /* lcur->uioa_pppp[] pointer */ - void *xu_a_hwst[4]; /* opaque hardware state */ - uioa_page_t xu_a_locked[UIOA_IOV_MAX]; - } xu_aio; - - struct { - int xu_zc_rw; /* read or write buffer */ - void *xu_zc_priv; /* fs specific */ - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + return (off); +} #endif /* _SYS_UIO_H */ diff --git a/lib/libspl/include/sys/vtoc.h b/lib/libspl/include/sys/vtoc.h index 22a652b74b..5d8448b628 100644 --- a/lib/libspl/include/sys/vtoc.h +++ b/lib/libspl/include/sys/vtoc.h @@ -51,7 +51,7 @@ extern "C" { * v_sanity returned as VTOC_SANE * if Disk Label was sane * 
v_sectorsz returned as 512 - * v_reserved [all] retunred as zero + * v_reserved [all] returned as zero * timestamp [all] returned as zero * * See dklabel.h, read_vtoc(), and write_vtoc(). diff --git a/lib/libspl/include/sys/wmsum.h b/lib/libspl/include/sys/wmsum.h new file mode 100644 index 0000000000..0679af73ce --- /dev/null +++ b/lib/libspl/include/sys/wmsum.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * wmsum counters are a reduced version of aggsum counters, optimized for + * write-mostly scenarios. They do not provide optimized read functions, + * but instead allow much cheaper add function. The primary usage is + * infrequently read statistic counters, not requiring exact precision. + * + * In user-space due to lack of better implementation mapped to aggsum. 
+ */ + +#ifndef _SYS_WMSUM_H +#define _SYS_WMSUM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define wmsum_t aggsum_t + +static inline void +wmsum_init(wmsum_t *ws, uint64_t value) +{ + + aggsum_init(ws, value); +} + +static inline void +wmsum_fini(wmsum_t *ws) +{ + + aggsum_fini(ws); +} + +static inline uint64_t +wmsum_value(wmsum_t *ws) +{ + + return (aggsum_value(ws)); +} + +static inline void +wmsum_add(wmsum_t *ws, int64_t delta) +{ + + aggsum_add(ws, delta); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_WMSUM_H */ diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h index 59dc931442..65f12595e6 100644 --- a/lib/libspl/include/umem.h +++ b/lib/libspl/include/umem.h @@ -36,6 +36,7 @@ * * https://labs.omniti.com/trac/portableumem */ +#include #include #include @@ -56,10 +57,7 @@ typedef void vmem_t; /* * Flags for umem_cache_create() */ -#define UMC_NOTOUCH 0x00010000 #define UMC_NODEBUG 0x00020000 -#define UMC_NOMAGAZINE 0x00040000 -#define UMC_NOHASH 0x00080000 #define UMEM_CACHE_NAMELEN 31 @@ -80,6 +78,11 @@ typedef struct umem_cache { int cache_cflags; } umem_cache_t; +/* Prototypes for functions to provide defaults for umem envvars */ +const char *_umem_debug_init(void); +const char *_umem_options_init(void); +const char *_umem_logging_init(void); + static inline void * umem_alloc(size_t size, int flags) { @@ -126,13 +129,13 @@ umem_zalloc(size_t size, int flags) } static inline void -umem_free(void *ptr, size_t size) +umem_free(void *ptr, size_t size __maybe_unused) { free(ptr); } static inline void -umem_nofail_callback(umem_nofail_callback_t *cb) +umem_nofail_callback(umem_nofail_callback_t *cb __maybe_unused) {} static inline umem_cache_t * @@ -145,7 +148,7 @@ umem_cache_create( { umem_cache_t *cp; - cp = umem_alloc(sizeof (umem_cache_t), UMEM_DEFAULT); + cp = (umem_cache_t *)umem_alloc(sizeof (umem_cache_t), UMEM_DEFAULT); if (cp) { strlcpy(cp->cache_name, name, UMEM_CACHE_NAMELEN); cp->cache_bufsize = bufsize; @@ 
-194,7 +197,7 @@ umem_cache_free(umem_cache_t *cp, void *ptr) } static inline void -umem_cache_reap_now(umem_cache_t *cp) +umem_cache_reap_now(umem_cache_t *cp __maybe_unused) { } diff --git a/lib/libspl/include/util/Makefile.am b/lib/libspl/include/util/Makefile.am index 060e143a89..ab553bc803 100644 --- a/lib/libspl/include/util/Makefile.am +++ b/lib/libspl/include/util/Makefile.am @@ -1,3 +1,3 @@ libspldir = $(includedir)/libspl libspl_HEADERS = \ - $(top_srcdir)/lib/libspl/include/util/sscanf.h + sscanf.h diff --git a/lib/libspl/include/zone.h b/lib/libspl/include/zone.h index b4a6deb40c..b0ac2d9bc6 100644 --- a/lib/libspl/include/zone.h +++ b/lib/libspl/include/zone.h @@ -26,25 +26,16 @@ #ifndef _LIBSPL_ZONE_H #define _LIBSPL_ZONE_H - - #include #include -#include #ifdef __cplusplus extern "C" { #endif #define GLOBAL_ZONEID 0 -#define GLOBAL_ZONEID_NAME "global" -/* - * Functions for mapping between id and name for active zones. - */ extern zoneid_t getzoneid(void); -extern zoneid_t getzoneidbyname(const char *); -extern ssize_t getzonenamebyid(zoneid_t, char *, size_t); #ifdef __cplusplus } diff --git a/lib/libspl/include/devid.h b/lib/libspl/libspl_impl.h similarity index 82% rename from lib/libspl/include/devid.h rename to lib/libspl/libspl_impl.h index 8e483281a4..cda56e64c9 100644 --- a/lib/libspl/include/devid.h +++ b/lib/libspl/libspl_impl.h @@ -19,14 +19,6 @@ * * CDDL HEADER END */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ -#ifndef _LIBSPL_DEVID_H -#define _LIBSPL_DEVID_H -#include - -#endif +extern ssize_t getexecname_impl(char *execname); diff --git a/lib/libspl/list.c b/lib/libspl/list.c index b29dc8a873..0f2f3731b2 100644 --- a/lib/libspl/list.c +++ b/lib/libspl/list.c @@ -35,28 +35,28 @@ #define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) #define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) -#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) +#define list_empty(a) ((a)->list_head.next == &(a)->list_head) #define list_insert_after_node(list, node, object) { \ list_node_t *lnew = list_d2l(list, object); \ - lnew->list_prev = (node); \ - lnew->list_next = (node)->list_next; \ - (node)->list_next->list_prev = lnew; \ - (node)->list_next = lnew; \ + lnew->prev = (node); \ + lnew->next = (node)->next; \ + (node)->next->prev = lnew; \ + (node)->next = lnew; \ } #define list_insert_before_node(list, node, object) { \ list_node_t *lnew = list_d2l(list, object); \ - lnew->list_next = (node); \ - lnew->list_prev = (node)->list_prev; \ - (node)->list_prev->list_next = lnew; \ - (node)->list_prev = lnew; \ + lnew->next = (node); \ + lnew->prev = (node)->prev; \ + (node)->prev->next = lnew; \ + (node)->prev = lnew; \ } #define list_remove_node(node) \ - (node)->list_prev->list_next = (node)->list_next; \ - (node)->list_next->list_prev = (node)->list_prev; \ - (node)->list_next = (node)->list_prev = NULL + (node)->prev->next = (node)->next; \ + (node)->next->prev = (node)->prev; \ + (node)->next = (node)->prev = NULL void list_create(list_t *list, size_t size, size_t offset) @@ -67,8 +67,7 @@ list_create(list_t *list, size_t size, size_t offset) list->list_size = size; list->list_offset = offset; - list->list_head.list_next = list->list_head.list_prev = - &list->list_head; + list->list_head.next = list->list_head.prev = &list->list_head; } void @@ -77,10 +76,10 @@ list_destroy(list_t *list) list_node_t *node = 
&list->list_head; ASSERT(list); - ASSERT(list->list_head.list_next == node); - ASSERT(list->list_head.list_prev == node); + ASSERT(list->list_head.next == node); + ASSERT(list->list_head.prev == node); - node->list_next = node->list_prev = NULL; + node->next = node->prev = NULL; } void @@ -124,14 +123,14 @@ list_remove(list_t *list, void *object) { list_node_t *lold = list_d2l(list, object); ASSERT(!list_empty(list)); - ASSERT(lold->list_next != NULL); + ASSERT(lold->next != NULL); list_remove_node(lold); } void * list_remove_head(list_t *list) { - list_node_t *head = list->list_head.list_next; + list_node_t *head = list->list_head.next; if (head == &list->list_head) return (NULL); list_remove_node(head); @@ -141,7 +140,7 @@ list_remove_head(list_t *list) void * list_remove_tail(list_t *list) { - list_node_t *tail = list->list_head.list_prev; + list_node_t *tail = list->list_head.prev; if (tail == &list->list_head) return (NULL); list_remove_node(tail); @@ -153,7 +152,7 @@ list_head(list_t *list) { if (list_empty(list)) return (NULL); - return (list_object(list, list->list_head.list_next)); + return (list_object(list, list->list_head.next)); } void * @@ -161,7 +160,7 @@ list_tail(list_t *list) { if (list_empty(list)) return (NULL); - return (list_object(list, list->list_head.list_prev)); + return (list_object(list, list->list_head.prev)); } void * @@ -169,8 +168,8 @@ list_next(list_t *list, void *object) { list_node_t *node = list_d2l(list, object); - if (node->list_next != &list->list_head) - return (list_object(list, node->list_next)); + if (node->next != &list->list_head) + return (list_object(list, node->next)); return (NULL); } @@ -180,8 +179,8 @@ list_prev(list_t *list, void *object) { list_node_t *node = list_d2l(list, object); - if (node->list_prev != &list->list_head) - return (list_object(list, node->list_prev)); + if (node->prev != &list->list_head) + return (list_object(list, node->prev)); return (NULL); } @@ -201,13 +200,13 @@ list_move_tail(list_t 
*dst, list_t *src) if (list_empty(src)) return; - dstnode->list_prev->list_next = srcnode->list_next; - srcnode->list_next->list_prev = dstnode->list_prev; - dstnode->list_prev = srcnode->list_prev; - srcnode->list_prev->list_next = dstnode; + dstnode->prev->next = srcnode->next; + srcnode->next->prev = dstnode->prev; + dstnode->prev = srcnode->prev; + srcnode->prev->next = dstnode; /* empty src list */ - srcnode->list_next = srcnode->list_prev = srcnode; + srcnode->next = srcnode->prev = srcnode; } void @@ -216,24 +215,25 @@ list_link_replace(list_node_t *lold, list_node_t *lnew) ASSERT(list_link_active(lold)); ASSERT(!list_link_active(lnew)); - lnew->list_next = lold->list_next; - lnew->list_prev = lold->list_prev; - lold->list_prev->list_next = lnew; - lold->list_next->list_prev = lnew; - lold->list_next = lold->list_prev = NULL; + lnew->next = lold->next; + lnew->prev = lold->prev; + lold->prev->next = lnew; + lold->next->prev = lnew; + lold->next = lold->prev = NULL; } void list_link_init(list_node_t *ln) { - ln->list_next = NULL; - ln->list_prev = NULL; + ln->next = NULL; + ln->prev = NULL; } int list_link_active(list_node_t *ln) { - return (ln->list_next != NULL); + EQUIV(ln->next == NULL, ln->prev == NULL); + return (ln->next != NULL); } int diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c index 5417417520..fce2c1c82e 100644 --- a/lib/libspl/mkdirp.c +++ b/lib/libspl/mkdirp.c @@ -128,7 +128,7 @@ mkdirp(const char *d, mode_t mode) * caller, or NULL is returned on error. * * The caller should handle error reporting based upon the - * returned vlaue, and should free the returned value, + * returned value, and should free the returned value, * when appropriate. 
*/ diff --git a/lib/libspl/os/freebsd/getexecname.c b/lib/libspl/os/freebsd/getexecname.c new file mode 100644 index 0000000000..256b28c1b7 --- /dev/null +++ b/lib/libspl/os/freebsd/getexecname.c @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include "../../libspl_impl.h" + +__attribute__((visibility("hidden"))) ssize_t +getexecname_impl(char *execname) +{ + size_t len = PATH_MAX; + int name[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; + + if (sysctl(name, nitems(name), execname, &len, NULL, 0) != 0) + return (-1); + + return (len); +} diff --git a/lib/libspl/os/freebsd/gethostid.c b/lib/libspl/os/freebsd/gethostid.c new file mode 100644 index 0000000000..7bd567fe61 --- /dev/null +++ b/lib/libspl/os/freebsd/gethostid.c @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include + +unsigned long +get_system_hostid(void) +{ + return (gethostid()); +} diff --git a/lib/libuutil/uu_open.c b/lib/libspl/os/freebsd/getmntany.c similarity index 53% rename from lib/libuutil/uu_open.c rename to lib/libspl/os/freebsd/getmntany.c index cf5c5450b8..0ef24059e8 100644 --- a/lib/libuutil/uu_open.c +++ b/lib/libspl/os/freebsd/getmntany.c @@ -20,51 +20,44 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Ricardo Correia. All rights reserved. * Use is subject to license terms. 
*/ +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ - -#include "libuutil_common.h" - -#include - -#include -#include -#include #include +#include +#include +#include +#include +#include +#include #include -#ifdef _LP64 -#define TMPPATHFMT "%s/uu%ld" -#else /* _LP64 */ -#define TMPPATHFMT "%s/uu%lld" -#endif /* _LP64 */ - -/*ARGSUSED*/ int -uu_open_tmp(const char *dir, uint_t uflags) +getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) { - int f; - char *fname = uu_zalloc(PATH_MAX); + struct statfs sfs; - if (fname == NULL) + if (strlen(path) >= MAXPATHLEN) { + (void) fprintf(stderr, "invalid object; pathname too long\n"); return (-1); - - for (;;) { - (void) snprintf(fname, PATH_MAX, "%s/uu%lld", dir, gethrtime()); - - f = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600); - - if (f >= 0 || errno != EEXIST) - break; } - if (f >= 0) - (void) unlink(fname); + if (stat64(path, statbuf) != 0) { + (void) fprintf(stderr, "cannot open '%s': %s\n", + path, strerror(errno)); + return (-1); + } - uu_free(fname); - - return (f); + if (statfs(path, &sfs) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + return (-1); + } + statfs2mnttab(&sfs, (struct mnttab *)entry); + return (0); } diff --git a/lib/libspl/os/freebsd/mnttab.c b/lib/libspl/os/freebsd/mnttab.c new file mode 100644 index 0000000000..bd3e3e4e3e --- /dev/null +++ b/lib/libspl/os/freebsd/mnttab.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file implements Solaris compatible getmntany() and hasmntopt() + * functions. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/mount.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static char *
+mntopt(char **p)
+{
+	char *cp = *p;
+	char *retstr;
+
+	while (*cp && isspace((unsigned char)*cp))
+		cp++;
+
+	retstr = cp;
+	while (*cp && *cp != ',')
+		cp++;
+
+	if (*cp) {
+		*cp = '\0';
+		cp++;
+	}
+
+	*p = cp;
+	return (retstr);
+}
+
+char *
+hasmntopt(struct mnttab *mnt, char *opt)
+{
+	char tmpopts[MNT_LINE_MAX];
+	char *f, *opts = tmpopts;
+
+	if (mnt->mnt_mntopts == NULL)
+		return (NULL);
+	(void) strlcpy(opts, mnt->mnt_mntopts, sizeof (tmpopts));
+	f = mntopt(&opts);
+	for (; *f; f = mntopt(&opts)) {
+		if (strncmp(opt, f, strlen(opt)) == 0)
+			return (f - tmpopts + mnt->mnt_mntopts);
+	}
+	return (NULL);
+}
+
+static void
+optadd(char *mntopts, size_t size, const char *opt)
+{
+
+	if (mntopts[0] != '\0')
+		strlcat(mntopts, ",", size);
+	strlcat(mntopts, opt, size);
+}
+
+void
+statfs2mnttab(struct statfs *sfs, struct mnttab *mp)
+{
+	static char mntopts[MNTMAXSTR];
+	long flags;
+
+	mntopts[0] = '\0';
+
+	flags = sfs->f_flags;
+#define	OPTADD(opt)	optadd(mntopts, sizeof (mntopts), (opt))
+	if (flags & MNT_RDONLY)
+		OPTADD(MNTOPT_RO);
+	else
+		OPTADD(MNTOPT_RW);
+	if (flags & MNT_NOSUID)
+		OPTADD(MNTOPT_NOSETUID);
+	else
+		OPTADD(MNTOPT_SETUID);
+	if (flags & MNT_UPDATE)
+		OPTADD(MNTOPT_REMOUNT);
+	if (flags & MNT_NOATIME)
+		OPTADD(MNTOPT_NOATIME);
+	else
+		OPTADD(MNTOPT_ATIME);
+	OPTADD(MNTOPT_NOXATTR);
+	if (flags & MNT_NOEXEC)
+		OPTADD(MNTOPT_NOEXEC);
+	else
+		OPTADD(MNTOPT_EXEC);
+#undef	OPTADD
+	mp->mnt_special = strdup(sfs->f_mntfromname);
+	mp->mnt_mountp = strdup(sfs->f_mntonname);
+	mp->mnt_fstype = strdup(sfs->f_fstypename);
+	mp->mnt_mntopts = strdup(mntopts);
+}
+
+static struct statfs *gsfs = NULL;
+static int allfs = 0;
+
+static int
+statfs_init(void)
+{
+	struct statfs *sfs;
+	int error;
+
+	if (gsfs != NULL) {
+		free(gsfs);
+		gsfs = NULL;
+	}
+	allfs = getfsstat(NULL, 0, MNT_NOWAIT);
+	if (allfs == -1)
+		goto fail;
+	gsfs = malloc(sizeof (gsfs[0]) * allfs * 2);
+	if (gsfs == NULL)
+		goto fail;
+	allfs = getfsstat(gsfs, (long)(sizeof (gsfs[0]) * allfs * 2),
+	    MNT_NOWAIT);
+	if (allfs == -1)
+		goto fail;
+	sfs = realloc(gsfs, allfs * sizeof (gsfs[0]));
+	if (sfs != NULL)
+		gsfs = sfs;
+	return (0);
+fail:
+	error = errno;
+	if (gsfs != NULL)
+		free(gsfs);
+	gsfs = NULL;
+	allfs = 0;
+	return (error);
+}
+
+int
+getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp)
+{
+	/* fd is unused; the mount table is read via getfsstat(2). */
+	int i, error;
+
+	error = statfs_init();
+	if (error != 0)
+		return (error);
+
+	for (i = 0; i < allfs; i++) {
+		if (mrefp->mnt_special != NULL &&
+		    strcmp(mrefp->mnt_special, gsfs[i].f_mntfromname) != 0) {
+			continue;
+		}
+		if (mrefp->mnt_mountp != NULL &&
+		    strcmp(mrefp->mnt_mountp, gsfs[i].f_mntonname) != 0) {
+			continue;
+		}
+		if (mrefp->mnt_fstype != NULL &&
+		    strcmp(mrefp->mnt_fstype, gsfs[i].f_fstypename) != 0) {
+			continue;
+		}
+		statfs2mnttab(&gsfs[i], mgetp);
+		return (0);
+	}
+	return (-1);
+}
+
+int
+getmntent(FILE *fp, struct mnttab *mp)
+{
+	/* The file offset of fp doubles as the index of the next entry. */
+	int error, nfs;
+
+	nfs = (int)lseek(fileno(fp), 0, SEEK_CUR);
+	if (nfs == -1)
+		return (errno);
+	/* If nfs is 0, we want to refresh our cache. */
+	if (nfs == 0 || gsfs == NULL) {
+		error = statfs_init();
+		if (error != 0)
+			return (error);
+	}
+	if (nfs >= allfs)
+		return (-1);
+	statfs2mnttab(&gsfs[nfs], mp);
+	if (lseek(fileno(fp), 1, SEEK_CUR) == -1)
+		return (errno);
+	return (0);
+}
diff --git a/lib/libspl/os/freebsd/zone.c b/lib/libspl/os/freebsd/zone.c
new file mode 100644
index 0000000000..c07cb0532e
--- /dev/null
+++ b/lib/libspl/os/freebsd/zone.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include + +zoneid_t +getzoneid(void) +{ + size_t size; + int jailid; + + /* Information that we are in jail or not is enough for our needs. */ + size = sizeof (jailid); + if (sysctlbyname("security.jail.jailed", &jailid, &size, NULL, 0) == -1) + assert(!"No security.jail.jailed sysctl!"); + return ((zoneid_t)jailid); +} diff --git a/lib/libspl/os/linux/getexecname.c b/lib/libspl/os/linux/getexecname.c new file mode 100644 index 0000000000..a640556bcb --- /dev/null +++ b/lib/libspl/os/linux/getexecname.c @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include "../../libspl_impl.h" + +__attribute__((visibility("hidden"))) ssize_t +getexecname_impl(char *execname) +{ + return (readlink("/proc/self/exe", execname, PATH_MAX)); +} diff --git a/lib/libspl/gethostid.c b/lib/libspl/os/linux/gethostid.c similarity index 63% rename from lib/libspl/gethostid.c rename to lib/libspl/os/linux/gethostid.c index 1eb93f4411..c04b7fd3ee 100644 --- a/lib/libspl/gethostid.c +++ b/lib/libspl/os/linux/gethostid.c @@ -40,47 +40,40 @@ get_spl_hostid(void) * Allow the hostid to be subverted for testing. */ env = getenv("ZFS_HOSTID"); - if (env) { - hostid = strtoull(env, NULL, 0); - return (hostid & HOSTID_MASK); - } + if (env) + return (strtoull(env, NULL, 0)); - f = fopen("/sys/module/spl/parameters/spl_hostid", "r"); + f = fopen("/proc/sys/kernel/spl/hostid", "re"); if (!f) return (0); - if (fscanf(f, "%lu", &hostid) != 1) + if (fscanf(f, "%lx", &hostid) != 1) hostid = 0; fclose(f); - return (hostid & HOSTID_MASK); + return (hostid); } unsigned long get_system_hostid(void) { - unsigned long system_hostid = get_spl_hostid(); + unsigned long hostid = get_spl_hostid(); + /* - * We do not use the library call gethostid() because - * it generates a hostid value that the kernel is - * unaware of, if the spl_hostid module parameter has not - * been set and there is no system hostid file (e.g. 
- * /etc/hostid). The kernel and userspace must agree. + * We do not use gethostid(3) because it can return a bogus ID, + * depending on the libc and /etc/hostid presence, + * and the kernel and userspace must agree. * See comments above hostid_read() in the SPL. */ - if (system_hostid == 0) { - int fd, rc; - unsigned long hostid; - int hostid_size = 4; /* 4 bytes regardless of arch */ - - fd = open("/etc/hostid", O_RDONLY); + if (hostid == 0) { + int fd = open("/etc/hostid", O_RDONLY | O_CLOEXEC); if (fd >= 0) { - rc = read(fd, &hostid, hostid_size); - if (rc > 0) - system_hostid = (hostid & HOSTID_MASK); - close(fd); + if (read(fd, &hostid, 4) < 0) + hostid = 0; + (void) close(fd); } } - return (system_hostid); + + return (hostid & HOSTID_MASK); } diff --git a/lib/libspl/getmntany.c b/lib/libspl/os/linux/getmntany.c similarity index 62% rename from lib/libspl/getmntany.c rename to lib/libspl/os/linux/getmntany.c index 43e523e4a5..d458b28ad3 100644 --- a/lib/libspl/getmntany.c +++ b/lib/libspl/os/linux/getmntany.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -40,7 +41,7 @@ #define BUFSIZE (MNT_LINE_MAX + 2) -__thread char buf[BUFSIZE]; +static __thread char buf[BUFSIZE]; #define DIFF(xx) ( \ (mrefp->xx != NULL) && \ @@ -81,8 +82,8 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) return (MNT_TOOLONG); } -int -getextmntent(FILE *fp, struct extmnttab *mp, int len) +static int +getextmntent_impl(FILE *fp, struct extmnttab *mp, int len) { int ret; struct stat64 st; @@ -100,3 +101,62 @@ getextmntent(FILE *fp, struct extmnttab *mp, int len) return (ret); } + +int +getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) +{ + struct stat64 st; + FILE *fp; + int match; + + if (strlen(path) >= MAXPATHLEN) { + (void) fprintf(stderr, "invalid object; pathname too long\n"); + return (-1); + } + + /* + * Search for the path in /proc/self/mounts. 
Rather than looking for the + * specific path, which can be fooled by non-standard paths (i.e. ".." + * or "//"), we stat() the path and search for the corresponding + * (major,minor) device pair. + */ + if (stat64(path, statbuf) != 0) { + (void) fprintf(stderr, "cannot open '%s': %s\n", + path, strerror(errno)); + return (-1); + } + + + if ((fp = fopen(MNTTAB, "re")) == NULL) { + (void) fprintf(stderr, "cannot open %s\n", MNTTAB); + return (-1); + } + + /* + * Search for the given (major,minor) pair in the mount table. + */ + + match = 0; + while (getextmntent_impl(fp, entry, sizeof (*entry)) == 0) { + if (makedev(entry->mnt_major, entry->mnt_minor) == + statbuf->st_dev) { + match = 1; + break; + } + } + (void) fclose(fp); + + if (!match) { + (void) fprintf(stderr, "cannot find mountpoint for '%s'\n", + path); + return (-1); + } + + if (stat64(entry->mnt_mountp, &st) != 0) { + entry->mnt_major = 0; + entry->mnt_minor = 0; + return (-1); + } + + return (0); +} diff --git a/lib/libspl/zone.c b/lib/libspl/os/linux/zone.c similarity index 65% rename from lib/libspl/zone.c rename to lib/libspl/os/linux/zone.c index 5ca93b224d..a71c4e0b27 100644 --- a/lib/libspl/zone.c +++ b/lib/libspl/os/linux/zone.c @@ -24,40 +24,9 @@ */ #include -#include -#include zoneid_t getzoneid() { return (GLOBAL_ZONEID); } - -zoneid_t -getzoneidbyname(const char *name) -{ - if (name == NULL) - return (GLOBAL_ZONEID); - - if (strcmp(name, GLOBAL_ZONEID_NAME) == 0) - return (GLOBAL_ZONEID); - - return (EINVAL); -} - -ssize_t -getzonenamebyid(zoneid_t id, char *buf, size_t buflen) -{ - if (id != GLOBAL_ZONEID) - return (EINVAL); - - ssize_t ret = strlen(GLOBAL_ZONEID_NAME) + 1; - - if (buf == NULL || buflen == 0) - return (ret); - - strncpy(buf, GLOBAL_ZONEID_NAME, buflen); - buf[buflen - 1] = '\0'; - - return (ret); -} diff --git a/lib/libspl/page.c b/lib/libspl/page.c index 06d9fcfa05..5b0d3f2e57 100644 --- a/lib/libspl/page.c +++ b/lib/libspl/page.c @@ -21,8 +21,9 @@ */ #include +#include 
-size_t pagesize = 0; +static size_t pagesize = 0; size_t spl_pagesize(void) diff --git a/lib/libspl/timestamp.c b/lib/libspl/timestamp.c index eab15f3f13..22ecb39407 100644 --- a/lib/libspl/timestamp.c +++ b/lib/libspl/timestamp.c @@ -51,7 +51,7 @@ print_timestamp(uint_t timestamp_fmt) fmt = nl_langinfo(_DATE_FMT); if (timestamp_fmt == UDATE) { - (void) printf("%ld\n", t); + (void) printf("%lld\n", (longlong_t)t); } else if (timestamp_fmt == DDATE) { char dstr[64]; int len; diff --git a/lib/libtpool/Makefile.am b/lib/libtpool/Makefile.am index 586eec2ec9..40fd137b43 100644 --- a/lib/libtpool/Makefile.am +++ b/lib/libtpool/Makefile.am @@ -1,8 +1,6 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CFLAGS += -fvisibility=hidden noinst_LTLIBRARIES = libtpool.la @@ -10,11 +8,6 @@ USER_C = \ thread_pool.c \ thread_pool_impl.h -nodist_libtpool_la_SOURCES = $(USER_C) +libtpool_la_SOURCES = $(USER_C) -libtpool_la_LIBADD = \ - $(top_builddir)/lib/libspl/libspl.la - -libtpool_la_LDFLAGS = -pthread - -EXTRA_DIST = $(USER_C) +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libtpool/thread_pool.c b/lib/libtpool/thread_pool.c index a43fdd9cd6..892beeffa5 100644 --- a/lib/libtpool/thread_pool.c +++ b/lib/libtpool/thread_pool.c @@ -134,7 +134,7 @@ tpool_worker(void *arg) /* * This is the worker's main loop. - * It will only be left if a timeout or an error has occured. + * It will only be left if a timeout or an error has occurred. */ active.tpa_tid = pthread_self(); for (;;) { @@ -597,56 +597,3 @@ tpool_member(tpool_t *tpool) pthread_mutex_unlock(&tpool->tp_mutex); return (0); } - -void -postfork1_child_tpool(void) -{ - pthread_t my_tid = pthread_self(); - tpool_t *tpool; - tpool_job_t *job; - - /* - * All of the thread pool workers are gone, except possibly - * for the current thread, if it is a thread pool worker thread. - * Retain the thread pools, but make them all empty. 
Whatever - * jobs were queued or running belong to the parent process. - */ -top: - if ((tpool = thread_pools) == NULL) - return; - - do { - tpool_active_t *activep; - - (void) pthread_mutex_init(&tpool->tp_mutex, NULL); - (void) pthread_cond_init(&tpool->tp_busycv, NULL); - (void) pthread_cond_init(&tpool->tp_workcv, NULL); - (void) pthread_cond_init(&tpool->tp_waitcv, NULL); - for (job = tpool->tp_head; job; job = tpool->tp_head) { - tpool->tp_head = job->tpj_next; - free(job); - } - tpool->tp_tail = NULL; - tpool->tp_njobs = 0; - for (activep = tpool->tp_active; activep; - activep = activep->tpa_next) { - if (activep->tpa_tid == my_tid) { - activep->tpa_next = NULL; - break; - } - } - tpool->tp_idle = 0; - tpool->tp_current = 0; - if ((tpool->tp_active = activep) != NULL) - tpool->tp_current = 1; - tpool->tp_flags &= ~TP_WAIT; - if (tpool->tp_flags & (TP_DESTROY | TP_ABANDON)) { - tpool->tp_flags &= ~TP_DESTROY; - tpool->tp_flags |= TP_ABANDON; - if (tpool->tp_current == 0) { - delete_pool(tpool); - goto top; /* start over */ - } - } - } while ((tpool = tpool->tp_forw) != thread_pools); -} diff --git a/lib/libunicode/Makefile.am b/lib/libunicode/Makefile.am index 0a4734c037..b82975f68e 100644 --- a/lib/libunicode/Makefile.am +++ b/lib/libunicode/Makefile.am @@ -5,20 +5,13 @@ VPATH = $(top_srcdir)/module/unicode # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - noinst_LTLIBRARIES = libunicode.la -USER_C = - KERNEL_C = \ u8_textprep.c \ uconv.c nodist_libunicode_la_SOURCES = \ - $(USER_C) \ $(KERNEL_C) -EXTRA_DIST = $(USER_C) +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libuutil/Makefile.am b/lib/libuutil/Makefile.am index 09eef792a2..5a911f85f7 100644 --- a/lib/libuutil/Makefile.am +++ b/lib/libuutil/Makefile.am @@ -1,28 +1,35 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - 
-I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - lib_LTLIBRARIES = libuutil.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ uu_alloc.c \ uu_avl.c \ - uu_dprintf.c \ uu_ident.c \ uu_list.c \ uu_misc.c \ - uu_open.c \ uu_pname.c \ uu_string.c -nodist_libuutil_la_SOURCES = $(USER_C) +libuutil_la_SOURCES = $(USER_C) libuutil_la_LIBADD = \ - $(top_builddir)/lib/libavl/libavl.la \ - $(top_builddir)/lib/libspl/libspl.la + $(abs_top_builddir)/lib/libavl/libavl.la \ + $(abs_top_builddir)/lib/libspl/libspl.la -libuutil_la_LDFLAGS = -pthread -version-info 1:1:0 +libuutil_la_LIBADD += $(LTLIBINTL) -EXTRA_DIST = $(USER_C) +libuutil_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libuutil_la_LDFLAGS += -Wl,-z,defs +endif + +libuutil_la_LDFLAGS += -version-info 3:0:0 + +include $(top_srcdir)/config/CppCheck.am + +# Library ABI +EXTRA_DIST = libuutil.abi libuutil.suppr diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi new file mode 100644 index 0000000000..c7ab5672ea --- /dev/null +++ b/lib/libuutil/libuutil.abi @@ -0,0 +1,1841 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libuutil/libuutil.suppr b/lib/libuutil/libuutil.suppr new file mode 100644 index 0000000000..f4db8a49e4 --- /dev/null +++ b/lib/libuutil/libuutil.suppr @@ -0,0 +1,2 @@ +[suppress_type] + name = FILE* diff --git a/lib/libuutil/uu_avl.c b/lib/libuutil/uu_avl.c index 040008883a..53def0e073 100644 --- a/lib/libuutil/uu_avl.c +++ b/lib/libuutil/uu_avl.c @@ -128,6 +128,7 @@ uu_avl_pool_destroy(uu_avl_pool_t *pp) pp->uap_next->uap_prev = pp->uap_prev; pp->uap_prev->uap_next = pp->uap_next; (void) pthread_mutex_unlock(&uu_apool_list_lock); + (void) pthread_mutex_destroy(&pp->uap_lock); pp->uap_prev = NULL; pp->uap_next = NULL; uu_free(pp); diff --git a/lib/libuutil/uu_dprintf.c b/lib/libuutil/uu_dprintf.c deleted file mode 100644 index 6958057b29..0000000000 --- a/lib/libuutil/uu_dprintf.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - - -#include "libuutil_common.h" - -#include -#include -#include -#include -#include -#include -#include - -#define FACILITY_FMT "%s (%s): " - -#if !defined(TEXT_DOMAIN) -#define TEXT_DOMAIN "SYS_TEST" -#endif - -static const char * -strseverity(uu_dprintf_severity_t severity) -{ - switch (severity) { - case UU_DPRINTF_SILENT: - return (dgettext(TEXT_DOMAIN, "silent")); - case UU_DPRINTF_FATAL: - return (dgettext(TEXT_DOMAIN, "FATAL")); - case UU_DPRINTF_WARNING: - return (dgettext(TEXT_DOMAIN, "WARNING")); - case UU_DPRINTF_NOTICE: - return (dgettext(TEXT_DOMAIN, "note")); - case UU_DPRINTF_INFO: - return (dgettext(TEXT_DOMAIN, "info")); - case UU_DPRINTF_DEBUG: - return (dgettext(TEXT_DOMAIN, "debug")); - default: - return (dgettext(TEXT_DOMAIN, "unspecified")); - } -} - -uu_dprintf_t * -uu_dprintf_create(const char *name, uu_dprintf_severity_t severity, - uint_t flags) -{ - uu_dprintf_t *D; - - if (name != NULL && - uu_check_name(name, UU_NAME_DOMAIN) == -1) { - uu_set_error(UU_ERROR_INVALID_ARGUMENT); - return (NULL); - } - - if ((D = uu_zalloc(sizeof (uu_dprintf_t))) == NULL) - return (NULL); - - if (name != NULL) { - D->uud_name = strdup(name); - if (D->uud_name == NULL) { - uu_free(D); - return (NULL); - } - } else { - D->uud_name = NULL; - } - - D->uud_severity = severity; - D->uud_flags = flags; - - return (D); -} - -/*PRINTFLIKE3*/ -void -uu_dprintf(uu_dprintf_t 
*D, uu_dprintf_severity_t severity, - const char *format, ...) -{ - va_list alist; - - /* XXX Assert that severity is not UU_DPRINTF_SILENT. */ - - if (severity > D->uud_severity) - return; - - (void) fprintf(stderr, FACILITY_FMT, D->uud_name, - strseverity(severity)); - - va_start(alist, format); - (void) vfprintf(stderr, format, alist); - va_end(alist); -} - -void -uu_dprintf_destroy(uu_dprintf_t *D) -{ - if (D->uud_name) - free(D->uud_name); - - uu_free(D); -} - -const char * -uu_dprintf_getname(uu_dprintf_t *D) -{ - return (D->uud_name); -} diff --git a/lib/libuutil/uu_misc.c b/lib/libuutil/uu_misc.c index b10afd8ead..a8478ace9a 100644 --- a/lib/libuutil/uu_misc.c +++ b/lib/libuutil/uu_misc.c @@ -252,30 +252,3 @@ uu_init(void) _uu_main_thread = 1; (void) pthread_atfork(uu_lockup, uu_release, uu_release_child); } - -/* - * Dump a block of memory in hex+ascii, for debugging - */ -void -uu_dump(FILE *out, const char *prefix, const void *buf, size_t len) -{ - const unsigned char *p = buf; - int i; - - for (i = 0; i < len; i += 16) { - int j; - - (void) fprintf(out, "%s", prefix); - for (j = 0; j < 16 && i + j < len; j++) { - (void) fprintf(out, "%2.2x ", p[i + j]); - } - for (; j < 16; j++) { - (void) fprintf(out, " "); - } - for (j = 0; j < 16 && i + j < len; j++) { - (void) fprintf(out, "%c", - isprint(p[i + j]) ? 
p[i + j] : '.'); - } - (void) fprintf(out, "\n"); - } -} diff --git a/lib/libuutil/uu_pname.c b/lib/libuutil/uu_pname.c index a6a0f22661..28c4a8a9cf 100644 --- a/lib/libuutil/uu_pname.c +++ b/lib/libuutil/uu_pname.c @@ -38,9 +38,6 @@ #include #include -static const char PNAME_FMT[] = "%s: "; -static const char ERRNO_FMT[] = ": %s\n"; - static const char *pname; static void @@ -85,16 +82,16 @@ uu_alt_exit(int profile) } } -static void +static __attribute__((format(printf, 2, 0))) void uu_warn_internal(int err, const char *format, va_list alist) { if (pname != NULL) - (void) fprintf(stderr, PNAME_FMT, pname); + (void) fprintf(stderr, "%s: ", pname); (void) vfprintf(stderr, format, alist); if (strrchr(format, '\n') == NULL) - (void) fprintf(stderr, ERRNO_FMT, strerror(err)); + (void) fprintf(stderr, ": %s\n", strerror(err)); } void @@ -103,7 +100,6 @@ uu_vwarn(const char *format, va_list alist) uu_warn_internal(errno, format, alist); } -/*PRINTFLIKE1*/ void uu_warn(const char *format, ...) { @@ -113,7 +109,7 @@ uu_warn(const char *format, ...) va_end(alist); } -static void +static __attribute__((format(printf, 2, 0))) __NORETURN void uu_die_internal(int status, const char *format, va_list alist) { uu_warn_internal(errno, format, alist); @@ -137,7 +133,6 @@ uu_vdie(const char *format, va_list alist) uu_die_internal(UU_EXIT_FATAL, format, alist); } -/*PRINTFLIKE1*/ void uu_die(const char *format, ...) { @@ -153,7 +148,6 @@ uu_vxdie(int status, const char *format, va_list alist) uu_die_internal(status, format, alist); } -/*PRINTFLIKE2*/ void uu_xdie(int status, const char *format, ...) 
{ diff --git a/lib/libuutil/uu_string.c b/lib/libuutil/uu_string.c index 66afba05e8..67024c3b50 100644 --- a/lib/libuutil/uu_string.c +++ b/lib/libuutil/uu_string.c @@ -29,8 +29,6 @@ #include #include -#include -#include #include #include "libuutil.h" diff --git a/lib/libzfs/.gitignore b/lib/libzfs/.gitignore index d719bc1ad9..9336a5c00b 100644 --- a/lib/libzfs/.gitignore +++ b/lib/libzfs/.gitignore @@ -1,2 +1 @@ /libzfs.pc -/libzfs_core.pc diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 421970413d..e23f7c162a 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -7,18 +7,17 @@ VPATH = \ # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) +AM_CFLAGS += $(LIBCRYPTO_CFLAGS) $(ZLIB_CFLAGS) +AM_CFLAGS += -fvisibility=hidden -libzfs_pcdir = $(datarootdir)/pkgconfig -libzfs_pc_DATA = libzfs.pc libzfs_core.pc - -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/module/icp/include \ - -I$(top_srcdir)/lib/libspl/include +pkgconfig_DATA = libzfs.pc lib_LTLIBRARIES = libzfs.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ + libzfs_impl.h \ libzfs_changelist.c \ libzfs_config.c \ libzfs_crypto.c \ @@ -32,8 +31,24 @@ USER_C = \ libzfs_status.c \ libzfs_util.c + +if BUILD_FREEBSD +USER_C += \ + os/freebsd/libzfs_compat.c \ + os/freebsd/libzfs_zmount.c +endif + +if BUILD_LINUX +USER_C += \ + os/linux/libzfs_mount_os.c \ + os/linux/libzfs_pool_os.c \ + os/linux/libzfs_sendrecv_os.c \ + os/linux/libzfs_util_os.c +endif + KERNEL_C = \ algs/sha2/sha2.c \ + cityhash.c \ zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ @@ -46,25 +61,40 @@ KERNEL_C = \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ - zfs_uio.c \ zpool_prop.c \ zprop_common.c +dist_libzfs_la_SOURCES = \ + $(USER_C) + nodist_libzfs_la_SOURCES = \ - $(USER_C) \ $(KERNEL_C) libzfs_la_LIBADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libshare/libshare.la \ - 
$(top_builddir)/lib/libuutil/libuutil.la \ - $(top_builddir)/lib/libzfs_core/libzfs_core.la \ - $(top_builddir)/lib/libzutil/libzutil.la + $(abs_top_builddir)/lib/libshare/libshare.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libzutil/libzutil.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la -libzfs_la_LIBADD += -lm $(LIBSSL) -libzfs_la_LDFLAGS = -version-info 2:0:0 +libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) -EXTRA_DIST = $(libzfs_pc_DATA) $(USER_C) +libzfs_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libzfs_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libzfs_la_LIBADD += -lutil -lgeom +endif + +libzfs_la_LDFLAGS += -version-info 5:0:1 + +include $(top_srcdir)/config/CppCheck.am + +# Library ABI +EXTRA_DIST = libzfs.abi libzfs.suppr # Licensing data EXTRA_DIST += THIRDPARTYLICENSE.openssl THIRDPARTYLICENSE.openssl.descrip diff --git a/lib/libzfs/THIRDPARTYLICENSE.openssl b/lib/libzfs/THIRDPARTYLICENSE.openssl index a2c4adcbe6..92c9e196a3 100644 --- a/lib/libzfs/THIRDPARTYLICENSE.openssl +++ b/lib/libzfs/THIRDPARTYLICENSE.openssl @@ -101,7 +101,7 @@ * must display the following acknowledgement: * "This product includes cryptographic software written by * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library + * The word 'cryptographic' can be left out if the routines from the library * being used are not cryptographic related :-). * 4. 
If you include any Windows specific code (or a derivative thereof) from * the apps directory (application code) you must include an acknowledgement: diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi new file mode 100644 index 0000000000..ab6d27e913 --- /dev/null +++ b/lib/libzfs/libzfs.abi @@ -0,0 +1,5637 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzfs/libzfs.pc.in b/lib/libzfs/libzfs.pc.in index 0e83f7a64b..afe5635ae6 100644 --- a/lib/libzfs/libzfs.pc.in +++ b/lib/libzfs/libzfs.pc.in @@ -6,7 +6,9 @@ includedir=@includedir@ Name: libzfs Description: LibZFS library Version: @VERSION@ -URL: http://zfsonlinux.org +URL: https://github.com/openzfs/zfs Requires: libzfs_core +Requires.private: @LIBCRYPTO_PC@ @ZLIB_PC@ Cflags: 
-I${includedir}/libzfs -I${includedir}/libspl -Libs: -L${libdir} -lzfs +Libs: -L${libdir} -lzfs -lnvpair +Libs.private: -luutil -lm -pthread diff --git a/lib/libzfs/libzfs.suppr b/lib/libzfs/libzfs.suppr new file mode 100644 index 0000000000..d55b5b7281 --- /dev/null +++ b/lib/libzfs/libzfs.suppr @@ -0,0 +1,13 @@ +[suppress_type] + name = FILE* + +[suppress_type] + type_kind = typedef + name = SHA256_CTX + +[suppress_type] + type_kind = typedef + name = SHA2_CTX + +[suppress_variable] + name = zfs_deleg_perm_tab diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 3101febc16..4d90a511f6 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -24,7 +24,7 @@ * Use is subject to license terms. * * Portions Copyright 2007 Ramprakash Jelari - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. */ @@ -98,6 +98,7 @@ changelist_prefix(prop_changelist_t *clp) prop_changenode_t *cn; uu_avl_walk_t *walk; int ret = 0; + boolean_t commit_smb_shares = B_FALSE; if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && clp->cl_prop != ZFS_PROP_SHARESMB) @@ -127,6 +128,8 @@ changelist_prefix(prop_changelist_t *clp) */ switch (clp->cl_prop) { case ZFS_PROP_MOUNTPOINT: + if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) + break; if (zfs_unmount(cn->cn_handle, NULL, clp->cl_mflags) != 0) { ret = -1; @@ -135,6 +138,7 @@ changelist_prefix(prop_changelist_t *clp) break; case ZFS_PROP_SHARESMB: (void) zfs_unshare_smb(cn->cn_handle, NULL); + commit_smb_shares = B_TRUE; break; default: @@ -143,6 +147,8 @@ changelist_prefix(prop_changelist_t *clp) } } + if (commit_smb_shares) + zfs_commit_smb_shares(); uu_avl_walk_end(walk); if (ret == -1) @@ -167,7 +173,8 @@ changelist_postfix(prop_changelist_t *clp) uu_avl_walk_t *walk; char shareopts[ZFS_MAXPROPLEN]; int errors = 0; - libzfs_handle_t *hdl; + boolean_t commit_smb_shares 
= B_FALSE; + boolean_t commit_nfs_shares = B_FALSE; /* * If we're changing the mountpoint, attempt to destroy the underlying @@ -179,21 +186,10 @@ changelist_postfix(prop_changelist_t *clp) if ((cn = uu_avl_last(clp->cl_tree)) == NULL) return (0); - if (clp->cl_prop == ZFS_PROP_MOUNTPOINT) + if (clp->cl_prop == ZFS_PROP_MOUNTPOINT && + !(clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)) remove_mountpoint(cn->cn_handle); - /* - * It is possible that the changelist_prefix() used libshare - * to unshare some entries. Since libshare caches data, an - * attempt to reshare during postfix can fail unless libshare - * is uninitialized here so that it will reinitialize later. - */ - if (cn->cn_handle != NULL) { - hdl = cn->cn_handle->zfs_hdl; - assert(hdl != NULL); - zfs_uninit_libshare(hdl); - } - /* * We walk the datasets in reverse, because we want to mount any parent * datasets before mounting the children. We walk all datasets even if @@ -242,7 +238,8 @@ changelist_postfix(prop_changelist_t *clp) needs_key = (zfs_prop_get_int(cn->cn_handle, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE); - mounted = zfs_is_mounted(cn->cn_handle, NULL); + mounted = (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) || + zfs_is_mounted(cn->cn_handle, NULL); if (!mounted && !needs_key && (cn->cn_mounted || ((sharenfs || sharesmb || clp->cl_waslegacy) && @@ -260,16 +257,25 @@ changelist_postfix(prop_changelist_t *clp) * if the filesystem is currently shared, so that we can * adopt any new options. 
*/ - if (sharenfs && mounted) + if (sharenfs && mounted) { errors += zfs_share_nfs(cn->cn_handle); - else if (cn->cn_shared || clp->cl_waslegacy) + commit_nfs_shares = B_TRUE; + } else if (cn->cn_shared || clp->cl_waslegacy) { errors += zfs_unshare_nfs(cn->cn_handle, NULL); - if (sharesmb && mounted) + commit_nfs_shares = B_TRUE; + } + if (sharesmb && mounted) { errors += zfs_share_smb(cn->cn_handle); - else if (cn->cn_shared || clp->cl_waslegacy) + commit_smb_shares = B_TRUE; + } else if (cn->cn_shared || clp->cl_waslegacy) { errors += zfs_unshare_smb(cn->cn_handle, NULL); + commit_smb_shares = B_TRUE; + } } - + if (commit_nfs_shares) + zfs_commit_nfs_shares(); + if (commit_smb_shares) + zfs_commit_smb_shares(); uu_avl_walk_end(walk); return (errors ? -1 : 0); @@ -278,7 +284,7 @@ changelist_postfix(prop_changelist_t *clp) /* * Is this "dataset" a child of "parent"? */ -boolean_t +static boolean_t isa_child_of(const char *dataset, const char *parent) { int len; @@ -357,6 +363,7 @@ changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) ret = -1; } + zfs_commit_proto(proto); uu_avl_walk_end(walk); return (ret); @@ -475,9 +482,10 @@ change_one(zfs_handle_t *zhp, void *data) prop_changelist_t *clp = data; char property[ZFS_MAXPROPLEN]; char where[64]; - prop_changenode_t *cn; + prop_changenode_t *cn = NULL; zprop_source_t sourcetype = ZPROP_SRC_NONE; zprop_source_t share_sourcetype = ZPROP_SRC_NONE; + int ret = 0; /* * We only want to unmount/unshare those filesystems that may inherit @@ -493,8 +501,7 @@ change_one(zfs_handle_t *zhp, void *data) zfs_prop_get(zhp, clp->cl_prop, property, sizeof (property), &sourcetype, where, sizeof (where), B_FALSE) != 0) { - zfs_close(zhp); - return (0); + goto out; } /* @@ -506,8 +513,7 @@ change_one(zfs_handle_t *zhp, void *data) zfs_prop_get(zhp, clp->cl_shareprop, property, sizeof (property), &share_sourcetype, where, sizeof (where), B_FALSE) != 0) { - zfs_close(zhp); - return (0); + goto out; } if 
(clp->cl_alldependents || clp->cl_allchildren || @@ -518,8 +524,8 @@ change_one(zfs_handle_t *zhp, void *data) share_sourcetype == ZPROP_SRC_INHERITED))) { if ((cn = zfs_alloc(zfs_get_handle(zhp), sizeof (prop_changenode_t))) == NULL) { - zfs_close(zhp); - return (-1); + ret = -1; + goto out; } cn->cn_handle = zhp; @@ -541,16 +547,23 @@ change_one(zfs_handle_t *zhp, void *data) uu_avl_insert(clp->cl_tree, cn, idx); } else { free(cn); - zfs_close(zhp); + cn = NULL; } if (!clp->cl_alldependents) - return (zfs_iter_children(zhp, change_one, data)); - } else { - zfs_close(zhp); + ret = zfs_iter_children(zhp, change_one, data); + + /* + * If we added the handle to the changelist, we will re-use it + * later so return without closing it. + */ + if (cn != NULL) + return (ret); } - return (0); +out: + zfs_close(zhp); + return (ret); } static int diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c index 67379d0721..a3ecc4a327 100644 --- a/lib/libzfs/libzfs_config.c +++ b/lib/libzfs/libzfs_config.c @@ -133,7 +133,7 @@ namespace_reload(libzfs_handle_t *hdl) for (;;) { zc.zc_cookie = hdl->libzfs_ns_gen; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_CONFIGS, &zc) != 0) { + if (zfs_ioctl(hdl, ZFS_IOC_POOL_CONFIGS, &zc) != 0) { switch (errno) { case EEXIST: /* @@ -279,7 +279,7 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) return (-1); for (;;) { - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_POOL_STATS, + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, &zc) == 0) { /* * The real error is returned in the zc_cookie field. diff --git a/lib/libzfs/libzfs_crypto.c b/lib/libzfs/libzfs_crypto.c index 3318a6bd2e..644dd26859 100644 --- a/lib/libzfs/libzfs_crypto.c +++ b/lib/libzfs/libzfs_crypto.c @@ -15,6 +15,7 @@ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. + * Copyright 2020 Joyent, Inc. 
*/ #include @@ -25,6 +26,16 @@ #include #include #include +#if LIBFETCH_DYNAMIC +#include +#endif +#if LIBFETCH_IS_FETCH +#include +#include +#include +#elif LIBFETCH_IS_LIBCURL +#include +#endif #include #include "libzfs_impl.h" #include "zfeature_common.h" @@ -50,25 +61,31 @@ * technically ok if the salt is known to the attacker). */ -typedef enum key_locator { - KEY_LOCATOR_NONE, - KEY_LOCATOR_PROMPT, - KEY_LOCATOR_URI -} key_locator_t; - #define MIN_PASSPHRASE_LEN 8 #define MAX_PASSPHRASE_LEN 512 #define MAX_KEY_PROMPT_ATTEMPTS 3 static int caught_interrupt; +static int get_key_material_file(libzfs_handle_t *, const char *, const char *, + zfs_keyformat_t, boolean_t, uint8_t **, size_t *); +static int get_key_material_https(libzfs_handle_t *, const char *, const char *, + zfs_keyformat_t, boolean_t, uint8_t **, size_t *); + +static zfs_uri_handler_t uri_handlers[] = { + { "file", get_key_material_file }, + { "https", get_key_material_https }, + { "http", get_key_material_https }, + { NULL, NULL } +}; + static int pkcs11_get_urandom(uint8_t *buf, size_t bytes) { int rand; ssize_t bytes_read = 0; - rand = open("/dev/urandom", O_RDONLY); + rand = open("/dev/urandom", O_RDONLY | O_CLOEXEC); if (rand < 0) return (rand); @@ -85,15 +102,49 @@ pkcs11_get_urandom(uint8_t *buf, size_t bytes) return (bytes_read); } -static zfs_keylocation_t -zfs_prop_parse_keylocation(const char *str) +static int +zfs_prop_parse_keylocation(libzfs_handle_t *restrict hdl, const char *str, + zfs_keylocation_t *restrict locp, char **restrict schemep) { - if (strcmp("prompt", str) == 0) - return (ZFS_KEYLOCATION_PROMPT); - else if (strlen(str) > 8 && strncmp("file:///", str, 8) == 0) - return (ZFS_KEYLOCATION_URI); + *locp = ZFS_KEYLOCATION_NONE; + *schemep = NULL; - return (ZFS_KEYLOCATION_NONE); + if (strcmp("prompt", str) == 0) { + *locp = ZFS_KEYLOCATION_PROMPT; + return (0); + } + + regmatch_t pmatch[2]; + + if (regexec(&hdl->libzfs_urire, str, ARRAY_SIZE(pmatch), + pmatch, 0) == 0) { + 
size_t scheme_len; + + if (pmatch[1].rm_so == -1) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid URI")); + return (EINVAL); + } + + scheme_len = pmatch[1].rm_eo - pmatch[1].rm_so; + + *schemep = calloc(1, scheme_len + 1); + if (*schemep == NULL) { + int ret = errno; + + errno = 0; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid URI")); + return (ret); + } + + (void) memcpy(*schemep, str + pmatch[1].rm_so, scheme_len); + *locp = ZFS_KEYLOCATION_URI; + return (0); + } + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid keylocation")); + return (EINVAL); } static int @@ -146,62 +197,235 @@ get_format_prompt_string(zfs_keyformat_t format) } } +/* do basic validation of the key material */ static int -get_key_material_raw(FILE *fd, const char *fsname, zfs_keyformat_t keyformat, - boolean_t again, boolean_t newkey, uint8_t **buf, size_t *len_out) +validate_key(libzfs_handle_t *hdl, zfs_keyformat_t keyformat, + const char *key, size_t keylen) { - int ret = 0, bytes; + switch (keyformat) { + case ZFS_KEYFORMAT_RAW: + /* verify the key length is correct */ + if (keylen < WRAPPING_KEY_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Raw key too short (expected %u)."), + WRAPPING_KEY_LEN); + return (EINVAL); + } + + if (keylen > WRAPPING_KEY_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Raw key too long (expected %u)."), + WRAPPING_KEY_LEN); + return (EINVAL); + } + break; + case ZFS_KEYFORMAT_HEX: + /* verify the key length is correct */ + if (keylen < WRAPPING_KEY_LEN * 2) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Hex key too short (expected %u)."), + WRAPPING_KEY_LEN * 2); + return (EINVAL); + } + + if (keylen > WRAPPING_KEY_LEN * 2) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Hex key too long (expected %u)."), + WRAPPING_KEY_LEN * 2); + return (EINVAL); + } + + /* check for invalid hex digits */ + for (size_t i = 0; i < WRAPPING_KEY_LEN * 2; i++) { + if (!isxdigit(key[i])) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid hex 
character detected.")); + return (EINVAL); + } + } + break; + case ZFS_KEYFORMAT_PASSPHRASE: + /* verify the length is within bounds */ + if (keylen > MAX_PASSPHRASE_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Passphrase too long (max %u)."), + MAX_PASSPHRASE_LEN); + return (EINVAL); + } + + if (keylen < MIN_PASSPHRASE_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Passphrase too short (min %u)."), + MIN_PASSPHRASE_LEN); + return (EINVAL); + } + break; + default: + /* can't happen, checked above */ + break; + } + + return (0); +} + +static int +libzfs_getpassphrase(zfs_keyformat_t keyformat, boolean_t is_reenter, + boolean_t new_key, const char *fsname, + char **restrict res, size_t *restrict reslen) +{ + FILE *f = stdin; size_t buflen = 0; + ssize_t bytes; + int ret = 0; struct termios old_term, new_term; struct sigaction act, osigint, osigtstp; - *len_out = 0; + *res = NULL; + *reslen = 0; - if (isatty(fileno(fd))) { - /* - * handle SIGINT and ignore SIGSTP. This is necessary to - * restore the state of the terminal. - */ - caught_interrupt = 0; - act.sa_flags = 0; - (void) sigemptyset(&act.sa_mask); - act.sa_handler = catch_signal; + /* + * handle SIGINT and ignore SIGSTP. This is necessary to + * restore the state of the terminal. + */ + caught_interrupt = 0; + act.sa_flags = 0; + (void) sigemptyset(&act.sa_mask); + act.sa_handler = catch_signal; - (void) sigaction(SIGINT, &act, &osigint); - act.sa_handler = SIG_IGN; - (void) sigaction(SIGTSTP, &act, &osigtstp); + (void) sigaction(SIGINT, &act, &osigint); + act.sa_handler = SIG_IGN; + (void) sigaction(SIGTSTP, &act, &osigtstp); - /* prompt for the key */ - if (fsname != NULL) { - (void) printf("%s %s%s for '%s': ", - (again) ? "Re-enter" : "Enter", - (newkey) ? "new " : "", - get_format_prompt_string(keyformat), fsname); - } else { - (void) printf("%s %s%s: ", - (again) ? "Re-enter" : "Enter", - (newkey) ? 
"new " : "", - get_format_prompt_string(keyformat)); + (void) printf("%s %s%s", + is_reenter ? "Re-enter" : "Enter", + new_key ? "new " : "", + get_format_prompt_string(keyformat)); + if (fsname != NULL) + (void) printf(" for '%s'", fsname); + (void) fputc(':', stdout); + (void) fflush(stdout); - } - (void) fflush(stdout); + /* disable the terminal echo for key input */ + (void) tcgetattr(fileno(f), &old_term); - /* disable the terminal echo for key input */ - (void) tcgetattr(fileno(fd), &old_term); + new_term = old_term; + new_term.c_lflag &= ~(ECHO | ECHOE | ECHOK | ECHONL); - new_term = old_term; - new_term.c_lflag &= ~(ECHO | ECHOE | ECHOK | ECHONL); - - ret = tcsetattr(fileno(fd), TCSAFLUSH, &new_term); - if (ret != 0) { - ret = errno; - errno = 0; - goto out; - } + ret = tcsetattr(fileno(f), TCSAFLUSH, &new_term); + if (ret != 0) { + ret = errno; + errno = 0; + goto out; } + bytes = getline(res, &buflen, f); + if (bytes < 0) { + ret = errno; + errno = 0; + goto out; + } + + /* trim the ending newline if it exists */ + if (bytes > 0 && (*res)[bytes - 1] == '\n') { + (*res)[bytes - 1] = '\0'; + bytes--; + } + + *reslen = bytes; + +out: + /* reset the terminal */ + (void) tcsetattr(fileno(f), TCSAFLUSH, &old_term); + (void) sigaction(SIGINT, &osigint, NULL); + (void) sigaction(SIGTSTP, &osigtstp, NULL); + + /* if we caught a signal, re-throw it now */ + if (caught_interrupt != 0) + (void) kill(getpid(), caught_interrupt); + + /* print the newline that was not echo'd */ + (void) printf("\n"); + + return (ret); +} + +static int +get_key_interactive(libzfs_handle_t *restrict hdl, const char *fsname, + zfs_keyformat_t keyformat, boolean_t confirm_key, boolean_t newkey, + uint8_t **restrict outbuf, size_t *restrict len_out) +{ + char *buf = NULL, *buf2 = NULL; + size_t buflen = 0, buf2len = 0; + int ret = 0; + + ASSERT(isatty(fileno(stdin))); + + /* raw keys cannot be entered on the terminal */ + if (keyformat == ZFS_KEYFORMAT_RAW) { + ret = EINVAL; + 
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Cannot enter raw keys on the terminal")); + goto out; + } + + /* prompt for the key */ + if ((ret = libzfs_getpassphrase(keyformat, B_FALSE, newkey, fsname, + &buf, &buflen)) != 0) { + free(buf); + buf = NULL; + buflen = 0; + goto out; + } + + if (!confirm_key) + goto out; + + if ((ret = validate_key(hdl, keyformat, buf, buflen)) != 0) { + free(buf); + return (ret); + } + + ret = libzfs_getpassphrase(keyformat, B_TRUE, newkey, fsname, &buf2, + &buf2len); + if (ret != 0) { + free(buf); + free(buf2); + buf = buf2 = NULL; + buflen = buf2len = 0; + goto out; + } + + if (buflen != buf2len || strcmp(buf, buf2) != 0) { + free(buf); + buf = NULL; + buflen = 0; + + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Provided keys do not match.")); + } + + free(buf2); + +out: + *outbuf = (uint8_t *)buf; + *len_out = buflen; + return (ret); +} + +static int +get_key_material_raw(FILE *fd, zfs_keyformat_t keyformat, + uint8_t **buf, size_t *len_out) +{ + int ret = 0; + size_t buflen = 0; + + *len_out = 0; + /* read the key material */ if (keyformat != ZFS_KEYFORMAT_RAW) { + ssize_t bytes; + bytes = getline((char **)buf, &buflen, fd); if (bytes < 0) { ret = errno; @@ -210,25 +434,29 @@ get_key_material_raw(FILE *fd, const char *fsname, zfs_keyformat_t keyformat, } /* trim the ending newline if it exists */ - if ((*buf)[bytes - 1] == '\n') { + if (bytes > 0 && (*buf)[bytes - 1] == '\n') { (*buf)[bytes - 1] = '\0'; bytes--; } + + *len_out = bytes; } else { + size_t n; + /* * Raw keys may have newline characters in them and so can't * use getline(). Here we attempt to read 33 bytes so that we * can properly check the key length (the file should only have * 32 bytes). 
*/ - *buf = malloc((WRAPPING_KEY_LEN + 1) * sizeof (char)); + *buf = malloc((WRAPPING_KEY_LEN + 1) * sizeof (uint8_t)); if (*buf == NULL) { ret = ENOMEM; goto out; } - bytes = fread(*buf, 1, WRAPPING_KEY_LEN + 1, fd); - if (bytes < 0) { + n = fread(*buf, 1, WRAPPING_KEY_LEN + 1, fd); + if (n == 0 || ferror(fd)) { /* size errors are handled by the calling function */ free(*buf); *buf = NULL; @@ -236,28 +464,209 @@ get_key_material_raw(FILE *fd, const char *fsname, zfs_keyformat_t keyformat, errno = 0; goto out; } + + *len_out = n; } - - *len_out = bytes; - out: - if (isatty(fileno(fd))) { - /* reset the teminal */ - (void) tcsetattr(fileno(fd), TCSAFLUSH, &old_term); - (void) sigaction(SIGINT, &osigint, NULL); - (void) sigaction(SIGTSTP, &osigtstp, NULL); + return (ret); +} - /* if we caught a signal, re-throw it now */ - if (caught_interrupt != 0) { - (void) kill(getpid(), caught_interrupt); - } +static int +get_key_material_file(libzfs_handle_t *hdl, const char *uri, + const char *fsname, zfs_keyformat_t keyformat, boolean_t newkey, + uint8_t **restrict buf, size_t *restrict len_out) +{ + FILE *f = NULL; + int ret = 0; - /* print the newline that was not echo'd */ - printf("\n"); + if (strlen(uri) < 7) + return (EINVAL); + + if ((f = fopen(uri + 7, "re")) == NULL) { + ret = errno; + errno = 0; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to open key material file: %s"), strerror(ret)); + return (ret); } + ret = get_key_material_raw(f, keyformat, buf, len_out); + + (void) fclose(f); + return (ret); +} +static int +get_key_material_https(libzfs_handle_t *hdl, const char *uri, + const char *fsname, zfs_keyformat_t keyformat, boolean_t newkey, + uint8_t **restrict buf, size_t *restrict len_out) +{ + int ret = 0; + FILE *key = NULL; + boolean_t is_http = strncmp(uri, "http:", strlen("http:")) == 0; + + if (strlen(uri) < (is_http ? 
7 : 8)) { + ret = EINVAL; + goto end; + } + +#if LIBFETCH_DYNAMIC +#define LOAD_FUNCTION(func) \ + __typeof__(func) *func = dlsym(hdl->libfetch, #func); + + if (hdl->libfetch == NULL) + hdl->libfetch = dlopen(LIBFETCH_SONAME, RTLD_LAZY); + + if (hdl->libfetch == NULL) { + hdl->libfetch = (void *)-1; + char *err = dlerror(); + if (err) + hdl->libfetch_load_error = strdup(err); + } + + if (hdl->libfetch == (void *)-1) { + ret = ENOSYS; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Couldn't load %s: %s"), + LIBFETCH_SONAME, hdl->libfetch_load_error ?: "(?)"); + goto end; + } + + boolean_t ok; +#if LIBFETCH_IS_FETCH + LOAD_FUNCTION(fetchGetURL); + char *fetchLastErrString = dlsym(hdl->libfetch, "fetchLastErrString"); + + ok = fetchGetURL && fetchLastErrString; +#elif LIBFETCH_IS_LIBCURL + LOAD_FUNCTION(curl_easy_init); + LOAD_FUNCTION(curl_easy_setopt); + LOAD_FUNCTION(curl_easy_perform); + LOAD_FUNCTION(curl_easy_cleanup); + LOAD_FUNCTION(curl_easy_strerror); + LOAD_FUNCTION(curl_easy_getinfo); + + ok = curl_easy_init && curl_easy_setopt && curl_easy_perform && + curl_easy_cleanup && curl_easy_strerror && curl_easy_getinfo; +#endif + if (!ok) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "keylocation=%s back-end %s missing symbols."), + is_http ? 
"http://" : "https://", LIBFETCH_SONAME); + ret = ENOSYS; + goto end; + } +#endif + +#if LIBFETCH_IS_FETCH + key = fetchGetURL(uri, ""); + if (key == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Couldn't GET %s: %s"), + uri, fetchLastErrString); + ret = ENETDOWN; + } +#elif LIBFETCH_IS_LIBCURL + CURL *curl = curl_easy_init(); + if (curl == NULL) { + ret = ENOTSUP; + goto end; + } + + int kfd = -1; +#ifdef O_TMPFILE + kfd = open(getenv("TMPDIR") ?: "/tmp", + O_RDWR | O_TMPFILE | O_EXCL | O_CLOEXEC, 0600); + if (kfd != -1) + goto kfdok; +#endif + + char *path; + if (asprintf(&path, + "%s/libzfs-XXXXXXXX.https", getenv("TMPDIR") ?: "/tmp") == -1) { + ret = ENOMEM; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s"), + strerror(ret)); + goto end; + } + + kfd = mkostemps(path, strlen(".https"), O_CLOEXEC); + if (kfd == -1) { + ret = errno; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Couldn't create temporary file %s: %s"), + path, strerror(ret)); + free(path); + goto end; + } + (void) unlink(path); + free(path); + +kfdok: + if ((key = fdopen(kfd, "r+")) == NULL) { + ret = errno; + free(path); + (void) close(kfd); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Couldn't reopen temporary file: %s"), strerror(ret)); + goto end; + } + + char errbuf[CURL_ERROR_SIZE] = ""; + char *cainfo = getenv("SSL_CA_CERT_FILE"); /* matches fetch(3) */ + char *capath = getenv("SSL_CA_CERT_PATH"); /* matches fetch(3) */ + char *clcert = getenv("SSL_CLIENT_CERT_FILE"); /* matches fetch(3) */ + char *clkey = getenv("SSL_CLIENT_KEY_FILE"); /* matches fetch(3) */ + (void) curl_easy_setopt(curl, CURLOPT_URL, uri); + (void) curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + (void) curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, 30000L); + (void) curl_easy_setopt(curl, CURLOPT_WRITEDATA, key); + (void) curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errbuf); + if (cainfo != NULL) + (void) curl_easy_setopt(curl, CURLOPT_CAINFO, cainfo); + if (capath != NULL) + (void) curl_easy_setopt(curl, 
CURLOPT_CAPATH, capath); + if (clcert != NULL) + (void) curl_easy_setopt(curl, CURLOPT_SSLCERT, clcert); + if (clkey != NULL) + (void) curl_easy_setopt(curl, CURLOPT_SSLKEY, clkey); + + CURLcode res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to connect to %s: %s"), + uri, strlen(errbuf) ? errbuf : curl_easy_strerror(res)); + ret = ENETDOWN; + } else { + long resp = 200; + (void) curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp); + + if (resp < 200 || resp >= 300) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Couldn't GET %s: %ld"), + uri, resp); + ret = ENOENT; + } else + rewind(key); + } + + curl_easy_cleanup(curl); +#else + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "No keylocation=%s back-end."), is_http ? "http://" : "https://"); + ret = ENOSYS; +#endif + +end: + if (ret == 0) + ret = get_key_material_raw(key, keyformat, buf, len_out); + + if (key != NULL) + fclose(key); + + return (ret); } /* @@ -271,41 +680,58 @@ get_key_material(libzfs_handle_t *hdl, boolean_t do_verify, boolean_t newkey, zfs_keyformat_t keyformat, char *keylocation, const char *fsname, uint8_t **km_out, size_t *kmlen_out, boolean_t *can_retry_out) { - int ret, i; + int ret; zfs_keylocation_t keyloc = ZFS_KEYLOCATION_NONE; - FILE *fd = NULL; - uint8_t *km = NULL, *km2 = NULL; - size_t kmlen, kmlen2; + uint8_t *km = NULL; + size_t kmlen = 0; + char *uri_scheme = NULL; + zfs_uri_handler_t *handler = NULL; boolean_t can_retry = B_FALSE; /* verify and parse the keylocation */ - keyloc = zfs_prop_parse_keylocation(keylocation); + ret = zfs_prop_parse_keylocation(hdl, keylocation, &keyloc, + &uri_scheme); + if (ret != 0) + goto error; /* open the appropriate file descriptor */ switch (keyloc) { case ZFS_KEYLOCATION_PROMPT: - fd = stdin; - if (isatty(fileno(fd))) { - can_retry = B_TRUE; - - /* raw keys cannot be entered on the terminal */ - if (keyformat == ZFS_KEYFORMAT_RAW) { - ret = EINVAL; - zfs_error_aux(hdl, 
dgettext(TEXT_DOMAIN, - "Cannot enter raw keys on the terminal")); - goto error; - } + if (isatty(fileno(stdin))) { + can_retry = keyformat != ZFS_KEYFORMAT_RAW; + ret = get_key_interactive(hdl, fsname, keyformat, + do_verify, newkey, &km, &kmlen); + } else { + /* fetch the key material into the buffer */ + ret = get_key_material_raw(stdin, keyformat, &km, + &kmlen); } + + if (ret != 0) + goto error; + break; case ZFS_KEYLOCATION_URI: - fd = fopen(&keylocation[7], "r"); - if (!fd) { - ret = errno; - errno = 0; + ret = ENOTSUP; + + for (handler = uri_handlers; handler->zuh_scheme != NULL; + handler++) { + if (strcmp(handler->zuh_scheme, uri_scheme) != 0) + continue; + + if ((ret = handler->zuh_handler(hdl, keylocation, + fsname, keyformat, newkey, &km, &kmlen)) != 0) + goto error; + + break; + } + + if (ret == ENOTSUP) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Failed to open key material file")); + "URI scheme is not supported")); goto error; } + break; default: ret = EINVAL; @@ -314,126 +740,27 @@ get_key_material(libzfs_handle_t *hdl, boolean_t do_verify, boolean_t newkey, goto error; } - /* fetch the key material into the buffer */ - ret = get_key_material_raw(fd, fsname, keyformat, B_FALSE, newkey, - &km, &kmlen); - if (ret != 0) + if ((ret = validate_key(hdl, keyformat, (const char *)km, kmlen)) != 0) goto error; - /* do basic validation of the key material */ - switch (keyformat) { - case ZFS_KEYFORMAT_RAW: - /* verify the key length is correct */ - if (kmlen < WRAPPING_KEY_LEN) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Raw key too short (expected %u)."), - WRAPPING_KEY_LEN); - goto error; - } - - if (kmlen > WRAPPING_KEY_LEN) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Raw key too long (expected %u)."), - WRAPPING_KEY_LEN); - goto error; - } - break; - case ZFS_KEYFORMAT_HEX: - /* verify the key length is correct */ - if (kmlen < WRAPPING_KEY_LEN * 2) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - 
"Hex key too short (expected %u)."), - WRAPPING_KEY_LEN * 2); - goto error; - } - - if (kmlen > WRAPPING_KEY_LEN * 2) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Hex key too long (expected %u)."), - WRAPPING_KEY_LEN * 2); - goto error; - } - - /* check for invalid hex digits */ - for (i = 0; i < WRAPPING_KEY_LEN * 2; i++) { - if (!isxdigit((char)km[i])) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Invalid hex character detected.")); - goto error; - } - } - break; - case ZFS_KEYFORMAT_PASSPHRASE: - /* verify the length is within bounds */ - if (kmlen > MAX_PASSPHRASE_LEN) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Passphrase too long (max %u)."), - MAX_PASSPHRASE_LEN); - goto error; - } - - if (kmlen < MIN_PASSPHRASE_LEN) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Passphrase too short (min %u)."), - MIN_PASSPHRASE_LEN); - goto error; - } - break; - default: - /* can't happen, checked above */ - break; - } - - if (do_verify && isatty(fileno(fd))) { - ret = get_key_material_raw(fd, fsname, keyformat, B_TRUE, - newkey, &km2, &kmlen2); - if (ret != 0) - goto error; - - if (kmlen2 != kmlen || - (memcmp((char *)km, (char *)km2, kmlen) != 0)) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Provided keys do not match.")); - goto error; - } - } - - if (fd != stdin) - fclose(fd); - - if (km2 != NULL) - free(km2); - *km_out = km; *kmlen_out = kmlen; if (can_retry_out != NULL) *can_retry_out = can_retry; + free(uri_scheme); return (0); error: - if (km != NULL) - free(km); - - if (km2 != NULL) - free(km2); - - if (fd != NULL && fd != stdin) - fclose(fd); + free(km); *km_out = NULL; *kmlen_out = 0; + if (can_retry_out != NULL) *can_retry_out = can_retry; + free(uri_scheme); return (ret); } @@ -740,14 +1067,6 @@ zfs_crypto_create(libzfs_handle_t *hdl, char *parent_name, nvlist_t *props, pcrypt = ZIO_CRYPT_OFF; } - /* Check for encryption being explicitly truned off */ - if (crypt == 
ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Invalid encryption value. Dataset must be encrypted.")); - goto out; - } - /* Get the inherited encryption property if we don't have it locally */ if (!local_crypt) crypt = pcrypt; @@ -821,7 +1140,7 @@ zfs_crypto_create(libzfs_handle_t *hdl, char *parent_name, nvlist_t *props, } ret = populate_create_encryption_params_nvlists(hdl, NULL, - B_FALSE, keyformat, keylocation, props, &wkeydata, + B_TRUE, keyformat, keylocation, props, &wkeydata, &wkeylen); if (ret != 0) goto out; @@ -849,10 +1168,7 @@ int zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp, char *parent_name, nvlist_t *props) { - int ret; char errbuf[1024]; - zfs_handle_t *pzhp = NULL; - uint64_t pcrypt, ocrypt; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Encryption clone error")); @@ -865,40 +1181,12 @@ zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp, nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)) || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)) || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS))) { - ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Encryption properties must inherit from origin dataset.")); - goto out; + return (EINVAL); } - /* get a reference to parent dataset, should never be NULL */ - pzhp = make_dataset_handle(hdl, parent_name); - if (pzhp == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Failed to lookup parent.")); - return (ENOENT); - } - - /* Lookup parent's crypt */ - pcrypt = zfs_prop_get_int(pzhp, ZFS_PROP_ENCRYPTION); - ocrypt = zfs_prop_get_int(origin_zhp, ZFS_PROP_ENCRYPTION); - - /* all children of encrypted parents must be encrypted */ - if (pcrypt != ZIO_CRYPT_OFF && ocrypt == ZIO_CRYPT_OFF) { - ret = EINVAL; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Cannot create unencrypted clone as a child " - "of encrypted parent.")); - goto out; - } - - 
zfs_close(pzhp); return (0); - -out: - if (pzhp != NULL) - zfs_close(pzhp); - return (ret); } typedef struct loadkeys_cbdata { @@ -1360,7 +1648,7 @@ zfs_crypto_rewrap(zfs_handle_t *zhp, nvlist_t *raw_props, boolean_t inheritkey) if (is_encroot) { /* - * If this is already an ecryption root, just keep + * If this is already an encryption root, just keep * any properties not set by the user. */ if (keyformat == ZFS_KEYFORMAT_NONE) { diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 93af50b99c..5836587d27 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2013 Martin Matuska. All rights reserved. @@ -32,6 +32,7 @@ * Copyright 2017-2018 RackTop Systems. * Copyright (c) 2019 Datto Inc. 
* Copyright (c) 2019, loli10K + * Copyright (c) 2021 Matt Fiddaman */ #include @@ -48,7 +49,6 @@ #include #include #include -#include #include #ifdef HAVE_IDMAP #include @@ -66,7 +66,6 @@ #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" -#include "libzfs.h" #include "zfs_deleg.h" static int userquota_propname_decode(const char *propname, boolean_t zoned, @@ -119,8 +118,7 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "missing '@' delimiter in snapshot name, " - "did you mean to use -r?")); + "missing '@' delimiter in snapshot name")); return (0); } @@ -134,8 +132,7 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "missing '#' delimiter in bookmark name, " - "did you mean to use -r?")); + "missing '#' delimiter in bookmark name")); return (0); } @@ -197,6 +194,16 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, "reserved disk name")); break; + case NAME_ERR_SELF_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "self reference, '.' is found in name")); + break; + + case NAME_ERR_PARENT_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent reference, '..' 
is found in name")); + break; + default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); @@ -324,7 +331,7 @@ get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); - while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) { + while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, zc) != 0) { return (-1); @@ -352,7 +359,7 @@ get_recvd_props_ioctl(zfs_handle_t *zhp) (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { + while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { return (-1); @@ -587,7 +594,6 @@ zfs_bookmark_exists(const char *path) int err; boolean_t rv; - (void) strlcpy(fsname, path, sizeof (fsname)); pound = strchr(fsname, '#'); if (pound == NULL) @@ -787,7 +793,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); - return (AVL_ISIGN(rv)); + return (TREE_ISIGN(rv)); } void @@ -799,16 +805,16 @@ libzfs_mnttab_init(libzfs_handle_t *hdl) sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); } -int +static int libzfs_mnttab_update(libzfs_handle_t *hdl) { + FILE *mnttab; struct mnttab entry; - /* Reopen MNTTAB to prevent reading stale data from open file */ - if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) + if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); - while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + while (getmntent(mnttab, &entry) == 0) { mnttab_node_t *mtn; avl_index_t where; @@ -834,6 +840,7 @@ libzfs_mnttab_update(libzfs_handle_t *hdl) avl_add(&hdl->libzfs_mnttab_cache, mtn); } + (void) fclose(mnttab); return (0); } @@ -865,6 +872,7 @@ int libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, struct mnttab *entry) { + FILE *mnttab; 
mnttab_node_t find; mnttab_node_t *mtn; int ret = ENOENT; @@ -875,16 +883,14 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, if (avl_numnodes(&hdl->libzfs_mnttab_cache)) libzfs_mnttab_fini(hdl); - /* Reopen MNTTAB to prevent reading stale data from open file */ - if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) + if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); srch.mnt_special = (char *)fsname; srch.mnt_fstype = MNTTYPE_ZFS; - if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) - return (0); - else - return (ENOENT); + ret = getmntany(mnttab, entry, &srch) ? ENOENT : 0; + (void) fclose(mnttab); + return (ret); } pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); @@ -924,10 +930,15 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, * Another thread may have already added this entry * via libzfs_mnttab_update. If so we should skip it. */ - if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) + if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) { + free(mtn->mtn_mt.mnt_special); + free(mtn->mtn_mt.mnt_mountp); + free(mtn->mtn_mt.mnt_fstype); + free(mtn->mtn_mt.mnt_mntopts); free(mtn); - else + } else { avl_add(&hdl->libzfs_mnttab_cache, mtn); + } } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } @@ -1222,12 +1233,19 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: + { + int maxbs = SPA_OLD_MAXBLOCKSIZE; + char buf[64]; + if (zpool_hdl != NULL) { char state[64] = ""; + maxbs = zpool_get_prop_int(zpool_hdl, + ZPOOL_PROP_MAXBLOCKSIZE, NULL); + /* * Issue a warning but do not fail so that - * tests for setable properties succeed. + * tests for settable properties succeed. 
*/ if (zpool_prop_get_feature(zpool_hdl, "feature@allocation_classes", state, @@ -1240,15 +1258,17 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, } if (intval != 0 && (intval < SPA_MINBLOCKSIZE || - intval > SPA_OLD_MAXBLOCKSIZE || !ISP2(intval))) { + intval > maxbs || !ISP2(intval))) { + zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid '%s=%d' property: must be zero or " - "a power of 2 from 512B to 128K"), propname, - intval); + "invalid '%s=%llu' property: must be zero " + "or a power of 2 from 512B to %s"), + propname, (unsigned long long)intval, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; + } case ZFS_PROP_MLSLABEL: { @@ -1343,10 +1363,9 @@ badlabel: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } + fallthrough; } - /*FALLTHRU*/ - case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* @@ -1411,49 +1430,14 @@ badlabel: else proto = PROTO_NFS; - /* - * Must be an valid sharing protocol - * option string so init the libshare - * in order to enable the parser and - * then parse the options. We use the - * control API since we don't care about - * the current configuration and don't - * want the overhead of loading it - * until we actually do something. - */ - - if (zfs_init_libshare(hdl, - SA_INIT_CONTROL_API) != SA_OK) { - /* - * An error occurred so we can't do - * anything - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' cannot be set: problem " - "in share initialization"), - propname); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - goto error; - } - if (zfs_parse_options(strval, proto) != SA_OK) { - /* - * There was an error in parsing so - * deal with it by issuing an error - * message and leaving after - * uninitializing the the libshare - * interface. 
- */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set to invalid " "options"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - zfs_uninit_libshare(hdl); goto error; } - zfs_uninit_libshare(hdl); } break; @@ -1578,6 +1562,9 @@ badlabel: * * If normalization was chosen, but rejecting non-UTF8 names * was explicitly not chosen, it is an error. + * + * If utf8only was turned off, but the parent has normalization, + * turn off normalization. */ if (chosen_normal > 0 && chosen_utf < 0) { if (nvlist_add_uint64(ret, @@ -1591,6 +1578,12 @@ badlabel: zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; + } else if (chosen_normal < 0 && chosen_utf == 0) { + if (nvlist_add_uint64(ret, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), 0) != 0) { + (void) no_memory(hdl); + goto error; + } } return (ret); @@ -1599,7 +1592,7 @@ error: return (NULL); } -int +static int zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t old_volsize; @@ -1608,6 +1601,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) uint64_t new_reservation; zfs_prop_t resv_prop; nvlist_t *props; + zpool_handle_t *zph = zpool_handle(zhp); /* * If this is an existing volume, and someone is setting the volsize, @@ -1622,7 +1616,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); - if ((zvol_volsize_to_reservation(old_volsize, props) != + if ((zvol_volsize_to_reservation(zph, old_volsize, props) != old_reservation) || nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) { fnvlist_free(props); @@ -1633,7 +1627,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) fnvlist_free(props); return (-1); } - new_reservation = zvol_volsize_to_reservation(new_volsize, props); + new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); fnvlist_free(props); if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), 
@@ -1646,7 +1640,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) /* * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after - * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinal value. + * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value. * Return codes must match zfs_add_synthetic_resv(). */ static int @@ -1688,7 +1682,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); } - resvsize = zvol_volsize_to_reservation(volsize, props); + resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, + props); fnvlist_free(props); (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); @@ -1699,114 +1694,6 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) return (1); } -void -zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, - char *errbuf) -{ - switch (err) { - - case ENOSPC: - /* - * For quotas and reservations, ENOSPC indicates - * something different; setting a quota or reservation - * doesn't use any disk space. 
- */ - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_REFQUOTA: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is less than current used or " - "reserved space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFRESERVATION: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is greater than available space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - default: - (void) zfs_standard_error(hdl, err, errbuf); - break; - } - break; - - case EBUSY: - (void) zfs_standard_error(hdl, EBUSY, errbuf); - break; - - case EROFS: - (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); - break; - - case E2BIG: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property value too long")); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - break; - - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool and or dataset must be upgraded to set this " - "property or value")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - - case ERANGE: - if (prop == ZFS_PROP_COMPRESSION || - prop == ZFS_PROP_DNODESIZE || - prop == ZFS_PROP_RECORDSIZE) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property setting is not allowed on " - "bootable datasets")); - (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); - } else if (prop == ZFS_PROP_CHECKSUM || - prop == ZFS_PROP_DEDUP) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property setting is not allowed on " - "root pools")); - (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); - } else { - (void) zfs_standard_error(hdl, err, errbuf); - } - break; - - case EINVAL: - if (prop == ZPROP_INVAL) { - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - } else { - (void) zfs_standard_error(hdl, err, errbuf); - } - break; - - case EACCES: - if (prop == ZFS_PROP_KEYLOCATION) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "keylocation may only be set on encryption roots")); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - } else { - (void) 
zfs_standard_error(hdl, err, errbuf); - } - break; - - case EOVERFLOW: - /* - * This platform can't address a volume this big. - */ -#ifdef _ILP32 - if (prop == ZFS_PROP_VOLSIZE) { - (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); - break; - } -#endif - /* FALLTHROUGH */ - default: - (void) zfs_standard_error(hdl, err, errbuf); - } -} - static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { @@ -2074,6 +1961,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) return (zfs_standard_error(hdl, errno, errbuf)); + (void) get_stats(zhp); return (0); } @@ -2326,7 +2214,9 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case ZFS_PROP_EXEC: case ZFS_PROP_READONLY: case ZFS_PROP_SETUID: +#ifndef __FreeBSD__ case ZFS_PROP_XATTR: +#endif case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); @@ -2398,6 +2288,10 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, *val = zhp->zfs_dmustats.dds_inconsistent; break; + case ZFS_PROP_REDACTED: + *val = zhp->zfs_dmustats.dds_redacted; + break; + default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: @@ -2501,7 +2395,7 @@ get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) nvpair_t *pair; value = zfs_get_clones_nvl(zhp); - if (value == NULL) + if (value == NULL || nvlist_empty(value)) return (-1); propbuf[0] = '\0'; @@ -2522,7 +2416,7 @@ struct get_clones_arg { char buf[ZFS_MAX_DATASET_NAME_LEN]; }; -int +static int get_clones_cb(zfs_handle_t *zhp, void *arg) { struct get_clones_arg *gca = arg; @@ -2610,6 +2504,37 @@ zfs_get_clones_nvl(zfs_handle_t *zhp) return (value); } +static int +get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) +{ + nvlist_t *value; + uint64_t *snaps; + uint_t nsnaps; + + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) + return (-1); + if 
(nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, + &nsnaps) != 0) + return (-1); + if (nsnaps == 0) { + /* There's no redaction snapshots; pass a special value back */ + (void) snprintf(propbuf, proplen, "none"); + return (0); + } + propbuf[0] = '\0'; + for (int i = 0; i < nsnaps; i++) { + char buf[128]; + if (propbuf[0] != '\0') + (void) strlcat(propbuf, ",", proplen); + (void) snprintf(buf, sizeof (buf), "%llu", + (u_longlong_t)snaps[i]); + (void) strlcat(propbuf, buf, proplen); + } + + return (0); +} + /* * Accepts a property and value and checks that the value * matches the one found by the channel program. If they are @@ -2804,6 +2729,11 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, zcp_check(zhp, prop, 0, str); break; + case ZFS_PROP_REDACT_SNAPS: + if (get_rsnaps_string(zhp, propbuf, proplen) != 0) + return (-1); + break; + case ZFS_PROP_CLONES: if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); @@ -2846,16 +2776,15 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, return (-1); /* - * If limit is UINT64_MAX, we translate this into 'none' (unless - * literal is set), and indicate that it's the default value. - * Otherwise, we print the number nicely and indicate that it's - * set locally. + * If limit is UINT64_MAX, we translate this into 'none', and + * indicate that it's the default value. Otherwise, we print + * the number nicely and indicate that it's set locally. 
*/ - if (literal) { + if (val == UINT64_MAX) { + (void) strlcpy(propbuf, "none", proplen); + } else if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); - } else if (val == UINT64_MAX) { - (void) strlcpy(propbuf, "none", proplen); } else { zfs_nicenum(val, propbuf, proplen); } @@ -2971,11 +2900,12 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, case ZFS_PROP_GUID: case ZFS_PROP_CREATETXG: case ZFS_PROP_OBJSETID: + case ZFS_PROP_PBKDF2_ITERS: /* * These properties are stored as numbers, but they are - * identifiers. + * identifiers or counters. * We don't want them to be pretty printed, because pretty - * printing mangles the ID into a truncated and useless value. + * printing truncates their values making them useless. */ if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); @@ -3064,7 +2994,7 @@ zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) return (val); } -int +static int zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) { char buf[64]; @@ -3272,7 +3202,7 @@ zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, if (err) return (err); - err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc); + err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc); if (err) return (err); @@ -3323,6 +3253,9 @@ zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, return (0); } +/* + * propname must start with "written@" or "written#". 
+ */ int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) @@ -3333,8 +3266,10 @@ zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - snapname = strchr(propname, '@') + 1; - if (strchr(snapname, '@')) { + assert(zfs_prop_written(propname)); + snapname = propname + strlen("written@"); + if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { + /* full snapshot or bookmark name specified */ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); } else { /* snapname is the short name, append it to zhp's fsname */ @@ -3345,11 +3280,10 @@ zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, cp = strchr(zc.zc_value, '@'); if (cp != NULL) *cp = '\0'; - (void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value)); - (void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value)); + (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); } - err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc); + err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc); if (err) return (err); @@ -3406,6 +3340,16 @@ zfs_get_type(const zfs_handle_t *zhp) return (zhp->zfs_type); } +/* + * Returns the type of the given zfs handle, + * or, if a snapshot, the type of the snapshotted dataset. + */ +zfs_type_t +zfs_get_underlying_type(const zfs_handle_t *zhp) +{ + return (zhp->zfs_head_type); +} + /* * Is one dataset name a child dataset of another? 
* @@ -3486,7 +3430,7 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, slash = parent + strlen(parent); (void) strncpy(zc.zc_name, parent, slash - parent); zc.zc_name[slash - parent] = '\0'; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 && + if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 && errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); @@ -3628,6 +3572,7 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) zfs_close(h); } + zfs_commit_all_shares(); return (0); @@ -3837,8 +3782,8 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, if (type == ZFS_TYPE_VOLUME) return (zfs_error(hdl, EZFS_VOLTOOBIG, errbuf)); + fallthrough; #endif - /* FALLTHROUGH */ default: return (zfs_standard_error(hdl, errno, errbuf)); } @@ -3947,10 +3892,13 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) int zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) { - int ret; nvlist_t *errlist = NULL; nvpair_t *pair; + int ret = zfs_destroy_snaps_nvl_os(hdl, snaps); + if (ret != 0) + return (ret); + ret = lzc_destroy_snaps(snaps, defer, &errlist); if (ret == 0) { @@ -4107,6 +4055,16 @@ zfs_promote(zfs_handle_t *zhp) if (ret != 0) { switch (ret) { + case EACCES: + /* + * Promoting encrypted dataset outside its + * encryption root. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot promote dataset outside its " + "encryption root")); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + case EEXIST: /* There is a conflicting snapshot name. 
*/ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -4147,36 +4105,6 @@ zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) return (rv); } -int -zfs_remap_indirects(libzfs_handle_t *hdl, const char *fs) -{ - int err; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot remap dataset '%s'"), fs); - - err = lzc_remap(fs); - - if (err != 0) { - switch (err) { - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case EINVAL: - (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - default: - (void) zfs_standard_error(hdl, err, errbuf); - break; - } - } - - return (err); -} - /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. @@ -4463,14 +4391,14 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) * Renames the given dataset. */ int -zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - boolean_t force_unmount) +zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags) { int ret = 0; zfs_cmd_t zc = {"\0"}; char *delim; prop_changelist_t *cl = NULL; char parent[ZFS_MAX_DATASET_NAME_LEN]; + char property[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; @@ -4522,7 +4450,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { - if (recursive) { + if (flags.recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "recursive rename must be a snapshot")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); @@ -4563,8 +4491,19 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, return (zfs_error(hdl, EZFS_ZONED, errbuf)); } - if (recursive) { - zfs_handle_t *zhrp; + /* + * Avoid unmounting file systems with mountpoint property set to + * 'legacy' or 'none' even if -u option is not given. 
+ */ + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && + !flags.recursive && !flags.nounmount && + zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, + sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + flags.nounmount = B_TRUE; + } + if (flags.recursive) { char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { ret = -1; @@ -4572,7 +4511,8 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, } delim = strchr(parentname, '@'); *delim = '\0'; - zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); + zfs_handle_t *zhrp = zfs_open(zhp->zfs_hdl, parentname, + ZFS_TYPE_DATASET); free(parentname); if (zhrp == NULL) { ret = -1; @@ -4581,8 +4521,9 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, zfs_close(zhrp); } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, + flags.nounmount ? CL_GATHER_DONT_UNMOUNT : CL_GATHER_ITER_MOUNTED, - force_unmount ? MS_FORCE : 0)) == NULL) + flags.forceunmount ? 
MS_FORCE : 0)) == NULL) return (-1); if (changelist_haszonedchild(cl)) { @@ -4606,7 +4547,8 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - zc.zc_cookie = recursive; + zc.zc_cookie = !!flags.recursive; + zc.zc_cookie |= (!!flags.nounmount) << 1; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { /* @@ -4616,22 +4558,15 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zc.zc_name); - if (recursive && errno == EEXIST) { + if (flags.recursive && errno == EEXIST) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "a child dataset already has a snapshot " "with the new name")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else if (errno == EACCES) { - if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) == - ZIO_CRYPT_OFF) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot rename an unencrypted dataset to " - "be a decendent of an encrypted one")); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot move encryption child outside of " - "its encryption root")); - } + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot move encrypted child outside of " + "its encryption root")); (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); } else { (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); @@ -4920,8 +4855,6 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, zc.zc_nvlist_dst_size = sizeof (buf); if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { - char errbuf[1024]; - if ((errno == ENOTSUP && (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || @@ -4933,10 +4866,9 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, type == ZFS_PROP_PROJECTQUOTA))) break; - (void) snprintf(errbuf, sizeof (errbuf), + return (zfs_standard_error_fmt(hdl, errno, 
dgettext(TEXT_DOMAIN, - "cannot get used/quota for %s"), zc.zc_name); - return (zfs_standard_error_fmt(hdl, errno, errbuf)); + "cannot get used/quota for %s"), zc.zc_name)); } if (zc.zc_nvlist_dst_size == 0) break; @@ -5165,7 +5097,7 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: - (void) zfs_standard_error_fmt(hdl, errno, errbuf); + (void) zfs_standard_error(hdl, errno, errbuf); } } @@ -5184,7 +5116,7 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: - (void) zfs_standard_error_fmt(hdl, + (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } @@ -5219,7 +5151,7 @@ tryagain: (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) { + if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); @@ -5241,17 +5173,16 @@ tryagain: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: - err = zfs_standard_error_fmt(hdl, errno, errbuf); + err = zfs_standard_error(hdl, errno, errbuf); break; } } else { /* success */ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); if (rc) { - (void) snprintf(errbuf, sizeof (errbuf), dgettext( + err = zfs_standard_error_fmt(hdl, rc, dgettext( TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); - err = zfs_standard_error_fmt(hdl, rc, errbuf); } } @@ -5304,7 +5235,7 @@ zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: - err = zfs_standard_error_fmt(hdl, errno, errbuf); + err = zfs_standard_error(hdl, errno, errbuf); break; } } @@ -5341,7 +5272,7 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: - err = zfs_standard_error_fmt(hdl, errno, 
errbuf); + err = zfs_standard_error(hdl, errno, errbuf); break; } } @@ -5350,12 +5281,231 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) } /* - * Convert the zvol's volume size to an appropriate reservation. + * The theory of raidz space accounting + * + * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block + * will "reference" 128KB, even though it allocates more than that, to store the + * parity information (and perhaps skip sectors). This concept of the + * "referenced" (and other DMU space accounting) being lower than the allocated + * space by a constant factor is called "raidz deflation." + * + * As mentioned above, the constant factor for raidz deflation assumes a 128KB + * block size. However, zvols typically have a much smaller block size (default + * 8KB). These smaller blocks may require proportionally much more parity + * information (and perhaps skip sectors). In this case, the change to the + * "referenced" property may be much more than the logical block size. + * + * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written + * as follows. + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D8 | D16 | D24 | + * | P1 | D1 | D9 | D17 | D25 | + * | P2 | D2 | D10 | D18 | D26 | + * | P3 | D3 | D11 | D19 | D27 | + * | P4 | D4 | D12 | D20 | D28 | + * | P5 | D5 | D13 | D21 | D29 | + * | P6 | D6 | D14 | D22 | D30 | + * | P7 | D7 | D15 | D23 | D31 | + * +-------+-------+-------+-------+-------+ + * + * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data + * sectors. The dataset's referenced will increase by 128k and the pool's + * allocated and free properties will be adjusted by 160k. + * + * A 4k block written to the same raidz vdev will require two 4k sectors. The + * blank cells represent unallocated space. 
+ * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | | | | + * +-------+-------+-------+-------+-------+ + * + * Above, notice that the 4k block required one sector for parity and another + * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated + * and free properties will be adjusted by 8k. The dataset will not be charged + * 8k. Rather, it will be charged a value that is scaled according to the + * overhead of the 128k block on the same vdev. This 8k allocation will be + * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as + * calculated in the 128k block example above. + * + * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That + * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 + * allocations are a multiple of 3 sectors, and raidz3 allocations are a + * multiple of of 4 sectors. When a block does not fill the required number of + * sectors, skip blocks (sectors) are used. + * + * An 8k block being written to a raidz vdev may be written as follows: + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D1 | S0 | | + * +-------+-------+-------+-------+-------+ + * + * In order to maintain the nparity+1 allocation size, a skip block (S0) was + * added. For this 8k block, the pool's allocated and free properties are + * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / + * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in + * the 128k block example above. + * + * The situation is slightly different for dRAID since the minimum allocation + * size is the full group width. 
The same 8K block above would be written as + * follows in a dRAID group: + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D1 | S0 | S1 | + * +-------+-------+-------+-------+-------+ + * + * Compression may lead to a variety of block sizes being written for the same + * volume or file. There is no clear way to reserve just the amount of space + * that will be required, so the worst case (no compression) is assumed. + * Note that metadata blocks will typically be compressed, so the reservation + * size returned by zvol_volsize_to_reservation() will generally be slightly + * larger than the maximum that the volume can reference. + */ + +/* + * Derived from function of same name in module/zfs/vdev_raidz.c. Returns the + * amount of space (in bytes) that will be allocated for the specified block + * size. Note that the "referenced" space accounted will be less than this, but + * not necessarily equal to "blksize", due to RAIDZ deflation. + */ +static uint64_t +vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, + uint64_t blksize) +{ + uint64_t asize, ndata; + + ASSERT3U(ndisks, >, nparity); + ndata = ndisks - nparity; + asize = ((blksize - 1) >> ashift) + 1; + asize += nparity * ((asize + ndata - 1) / ndata); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +/* + * Derived from function of same name in module/zfs/vdev_draid.c. Returns the + * amount of space (in bytes) that will be allocated for the specified block + * size. 
+ */ +static uint64_t +vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, + uint64_t blksize) +{ + ASSERT3U(ndisks, >, nparity); + uint64_t ndata = ndisks - nparity; + uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1; + uint64_t asize = (rows * ndisks) << ashift; + + return (asize); +} + +/* + * Determine how much space will be allocated if it lands on the most space- + * inefficient top-level vdev. Returns the size in bytes required to store one + * copy of the volume data. See theory comment above. + */ +static uint64_t +volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) +{ + nvlist_t *config, *tree, **vdevs; + uint_t nvdevs; + uint64_t ret = 0; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || + nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &vdevs, &nvdevs) != 0) { + return (nblocks * blksize); + } + + for (int v = 0; v < nvdevs; v++) { + char *type; + uint64_t nparity, ashift, asize, tsize; + uint64_t volsize; + + if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, + &type) != 0) + continue; + + if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 && + strcmp(type, VDEV_TYPE_DRAID) != 0) + continue; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_NPARITY, &nparity) != 0) + continue; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_ASHIFT, &ashift) != 0) + continue; + + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + nvlist_t **disks; + uint_t ndisks; + + if (nvlist_lookup_nvlist_array(vdevs[v], + ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) + continue; + + /* allocation size for the "typical" 128k block */ + tsize = vdev_raidz_asize(ndisks, nparity, ashift, + SPA_OLD_MAXBLOCKSIZE); + + /* allocation size for the blksize block */ + asize = vdev_raidz_asize(ndisks, nparity, ashift, + blksize); + } else { + uint64_t ndata; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0) + continue; + + /* allocation size for 
the "typical" 128k block */ + tsize = vdev_draid_asize(ndata + nparity, nparity, + ashift, SPA_OLD_MAXBLOCKSIZE); + + /* allocation size for the blksize block */ + asize = vdev_draid_asize(ndata + nparity, nparity, + ashift, blksize); + } + + /* + * Scale this size down as a ratio of 128k / tsize. + * See theory statement above. + */ + volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; + if (volsize > ret) { + ret = volsize; + } + } + + if (ret == 0) { + ret = nblocks * blksize; + } + + return (ret); +} + +/* + * Convert the zvol's volume size to an appropriate reservation. See theory + * comment above. + * * Note: If this routine is updated, it is necessary to update the ZFS test - * suite's shell version in reservation.kshlib. + * suite's shell version in reservation.shlib. */ uint64_t -zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) +zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, + nvlist_t *props) { uint64_t numdb; uint64_t nblocks, volblocksize; @@ -5371,7 +5521,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; - nblocks = volsize/volblocksize; + + nblocks = volsize / volblocksize; + /* + * Metadata defaults to using 128k blocks, not volblocksize blocks. For + * this reason, only the data blocks are scaled based on vdev config. + */ + volsize = volsize_from_vdevs(zph, nblocks, volblocksize); + /* start with metadnode L0-L6 */ numdb = 7; /* calculate number of indirects */ @@ -5391,3 +5548,31 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) volsize += numdb; return (volsize); } + +/* + * Wait for the given activity and return the status of the wait (whether or not + * any waiting was done) in the 'waited' parameter. Non-existent fses are + * reported via the 'missing' parameter, rather than by printing an error + * message. 
This is convenient when this function is called in a loop over a + * long period of time (as it is, for example, by zfs's wait cmd). In that + * scenario, a fs being exported or destroyed should be considered a normal + * event, so we don't want to print an error when we find that the fs doesn't + * exist. + */ +int +zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, + boolean_t *missing, boolean_t *waited) +{ + int error = lzc_wait_fs(zhp->zfs_name, activity, waited); + *missing = (error == ENOENT); + if (*missing) + return (0); + + if (error != 0) { + (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, + dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), + zhp->zfs_name); + } + + return (error); +} diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c index 1b5c44b047..d46e23a2fc 100644 --- a/lib/libzfs/libzfs_diff.c +++ b/lib/libzfs/libzfs_diff.c @@ -48,7 +48,6 @@ #include "libzfs_impl.h" #define ZDIFF_SNAPDIR "/.zfs/snapshot/" -#define ZDIFF_SHARESDIR "/.zfs/shares/" #define ZDIFF_PREFIX "zfs-diff-%d" #define ZDIFF_ADDED '+' @@ -56,26 +55,6 @@ #define ZDIFF_REMOVED '-' #define ZDIFF_RENAMED 'R' -typedef struct differ_info { - zfs_handle_t *zhp; - char *fromsnap; - char *frommnt; - char *tosnap; - char *tomnt; - char *ds; - char *dsmnt; - char *tmpsnap; - char errbuf[1024]; - boolean_t isclone; - boolean_t scripted; - boolean_t classify; - boolean_t timestamped; - uint64_t shares; - int zerr; - int cleanupfd; - int outputfd; - int datafd; -} differ_info_t; /* * Given a {dsname, object id}, get the object path @@ -91,7 +70,7 @@ get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, zc.zc_obj = obj; errno = 0; - error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc); + error = zfs_ioctl(di->zhp->zfs_hdl, ZFS_IOC_OBJ_TO_STATS, &zc); di->zerr = errno; /* we can get stats even if we failed to get a path */ @@ -264,6 +243,7 @@ write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) struct zfs_stat fsb, 
tsb; mode_t fmode, tmode; char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN]; + boolean_t already_logged = B_FALSE; int fobjerr, tobjerr; int change; @@ -275,22 +255,36 @@ write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) * we get ENOENT, then the object just didn't exist in that * snapshot. If we get ENOTSUP, then we tried to get * info on a non-ZPL object, which we don't care about anyway. + * For any other error we print a warning which includes the + * errno and continue. */ + fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname, MAXPATHLEN, &fsb); - if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) - return (-1); + if (fobjerr && di->zerr != ENOTSUP && di->zerr != ENOENT) { + zfs_error_aux(di->zhp->zfs_hdl, "%s", strerror(di->zerr)); + zfs_error(di->zhp->zfs_hdl, di->zerr, di->errbuf); + /* + * Let's not print an error for the same object more than + * once if it happens in both snapshots + */ + already_logged = B_TRUE; + } tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname, MAXPATHLEN, &tsb); - if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) - return (-1); + if (tobjerr && di->zerr != ENOTSUP && di->zerr != ENOENT) { + if (!already_logged) { + zfs_error_aux(di->zhp->zfs_hdl, + "%s", strerror(di->zerr)); + zfs_error(di->zhp->zfs_hdl, di->zerr, di->errbuf); + } + } /* * Unallocated object sharing the same meta dnode block */ if (fobjerr && tobjerr) { - ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP); di->zerr = 0; return (0); } @@ -365,12 +359,11 @@ describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf, { struct zfs_stat sb; - if (get_stats_for_obj(di, di->fromsnap, object, namebuf, - maxlen, &sb) != 0) { - return (-1); - } + (void) get_stats_for_obj(di, di->fromsnap, object, namebuf, + maxlen, &sb); + /* Don't print if in the delete queue on from side */ - if (di->zerr == ESTALE) { + if (di->zerr == ESTALE || di->zerr == ENOENT) { di->zerr = 0; return (0); } @@ -394,7 +387,7 @@ 
write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) while (zc.zc_obj < dr->ddr_last) { int err; - err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc); + err = zfs_ioctl(lhdl, ZFS_IOC_NEXT_OBJ, &zc); if (err == 0) { if (zc.zc_obj == di->shares) { zc.zc_obj++; @@ -405,8 +398,6 @@ write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) } err = describe_free(fp, di, zc.zc_obj, fobjname, MAXPATHLEN); - if (err) - break; } else if (errno == ESRCH) { break; } else { @@ -487,25 +478,6 @@ differ(void *arg) return ((void *)0); } -static int -find_shares_object(differ_info_t *di) -{ - char fullpath[MAXPATHLEN]; - struct stat64 sb = { 0 }; - - (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); - (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); - - if (stat64(fullpath, &sb) != 0) { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); - return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); - } - - di->shares = (uint64_t)sb.st_ino; - return (0); -} - static int make_temp_snapshot(differ_info_t *di) { @@ -517,7 +489,7 @@ make_temp_snapshot(differ_info_t *di) (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name)); zc.zc_cleanup_fd = di->cleanupfd; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { + if (zfs_ioctl(hdl, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { int err = errno; if (err == EPERM) { (void) snprintf(di->errbuf, sizeof (di->errbuf), @@ -737,7 +709,7 @@ setup_differ_info(zfs_handle_t *zhp, const char *fromsnap, { di->zhp = zhp; - di->cleanupfd = open(ZFS_DEV, O_RDWR); + di->cleanupfd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); VERIFY(di->cleanupfd >= 0); if (get_snapshot_names(di, fromsnap, tosnap) != 0) @@ -771,8 +743,8 @@ zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, return (-1); } - if (pipe(pipefd)) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + if (pipe2(pipefd, O_CLOEXEC)) { + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); 
teardown_differ_info(&di); return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); } @@ -785,7 +757,7 @@ zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, di.datafd = pipefd[0]; if (pthread_create(&tid, NULL, differ, &di)) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); (void) close(pipefd[0]); (void) close(pipefd[1]); teardown_differ_info(&di); @@ -798,7 +770,7 @@ zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1); zc.zc_cookie = pipefd[1]; - iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc); + iocerr = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DIFF, &zc); if (iocerr != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Unable to obtain diffs")); @@ -811,14 +783,14 @@ zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "\n Not an earlier snapshot from the same fs")); } else if (errno != EPIPE || di.zerr == 0) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); } (void) close(pipefd[1]); (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); teardown_differ_info(&di); if (di.zerr != 0 && di.zerr != EPIPE) { - zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(di.zerr)); return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); } else { return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf)); @@ -829,7 +801,7 @@ zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, (void) pthread_join(tid, NULL); if (di.zerr != 0) { - zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(di.zerr)); return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); } teardown_differ_info(&di); diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h new file mode 100644 index 0000000000..b1cf4f825f --- 
/dev/null +++ b/lib/libzfs/libzfs_impl.h @@ -0,0 +1,267 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2018 Datto Inc. + * Copyright 2020 Joyent, Inc. + */ + +#ifndef _LIBZFS_IMPL_H +#define _LIBZFS_IMPL_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct libzfs_handle { + int libzfs_error; + int libzfs_fd; + zpool_handle_t *libzfs_pool_handles; + uu_avl_pool_t *libzfs_ns_avlpool; + uu_avl_t *libzfs_ns_avl; + uint64_t libzfs_ns_gen; + int libzfs_desc_active; + char libzfs_action[1024]; + char libzfs_desc[1024]; + int libzfs_printerr; + boolean_t libzfs_mnttab_enable; + /* + * We need a lock to handle the case where parallel mount + * threads are populating the mnttab cache simultaneously. The + * lock only protects the integrity of the avl tree, and does + * not protect the contents of the mnttab entries themselves. 
+ */ + pthread_mutex_t libzfs_mnttab_cache_lock; + avl_tree_t libzfs_mnttab_cache; + int libzfs_pool_iter; + boolean_t libzfs_prop_debug; + regex_t libzfs_urire; + uint64_t libzfs_max_nvlist; + void *libfetch; + char *libfetch_load_error; +}; + +struct zfs_handle { + libzfs_handle_t *zfs_hdl; + zpool_handle_t *zpool_hdl; + char zfs_name[ZFS_MAX_DATASET_NAME_LEN]; + zfs_type_t zfs_type; /* type including snapshot */ + zfs_type_t zfs_head_type; /* type excluding snapshot */ + dmu_objset_stats_t zfs_dmustats; + nvlist_t *zfs_props; + nvlist_t *zfs_user_props; + nvlist_t *zfs_recvd_props; + boolean_t zfs_mntcheck; + char *zfs_mntopts; + uint8_t *zfs_props_table; +}; + +/* + * This is different from checking zfs_type, because it will also catch + * snapshots of volumes. + */ +#define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) + +struct zpool_handle { + libzfs_handle_t *zpool_hdl; + zpool_handle_t *zpool_next; + char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; + int zpool_state; + size_t zpool_config_size; + nvlist_t *zpool_config; + nvlist_t *zpool_old_config; + nvlist_t *zpool_props; + diskaddr_t zpool_start_block; +}; + +typedef enum { + PROTO_NFS = 0, + PROTO_SMB = 1, + PROTO_END = 2 +} zfs_share_proto_t; + +/* + * The following can be used as a bitmask and any new values + * added must preserve that capability. + */ +typedef enum { + SHARED_NOT_SHARED = 0x0, + SHARED_NFS = 0x2, + SHARED_SMB = 0x4 +} zfs_share_type_t; + +typedef int (*zfs_uri_handler_fn_t)(struct libzfs_handle *, const char *, + const char *, zfs_keyformat_t, boolean_t, uint8_t **, size_t *); + +typedef struct zfs_uri_handler { + const char *zuh_scheme; + zfs_uri_handler_fn_t zuh_handler; +} zfs_uri_handler_t; + +#define CONFIG_BUF_MINSIZE 262144 + +extern int zfs_error(libzfs_handle_t *, int, const char *); +extern int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...) + __attribute__((format(printf, 3, 4))); +extern void zfs_error_aux(libzfs_handle_t *, const char *, ...) 
+ __attribute__((format(printf, 2, 3))); +extern void *zfs_alloc(libzfs_handle_t *, size_t); +extern void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); +extern char *zfs_asprintf(libzfs_handle_t *, const char *, ...) + __attribute__((format(printf, 2, 3))); +extern char *zfs_strdup(libzfs_handle_t *, const char *); +extern int no_memory(libzfs_handle_t *); + +extern int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...) + __attribute__((format(printf, 3, 4))); +extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); +extern int zpool_standard_error(libzfs_handle_t *, int, const char *); +extern int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...) + __attribute__((format(printf, 3, 4))); + +extern zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); +extern zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); + +extern int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, + nvlist_t *, char **, uint64_t *, const char *); +extern int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, + zfs_type_t type); + +/* + * Use this changelist_gather() flag to force attempting mounts + * on each change node regardless of whether or not it is currently + * mounted. + */ +#define CL_GATHER_MOUNT_ALWAYS 1 +/* + * changelist_gather() flag to force it to iterate on mounted datasets only + */ +#define CL_GATHER_ITER_MOUNTED 2 +/* + * Use this changelist_gather() flag to prevent unmounting of file systems. 
+ */ +#define CL_GATHER_DONT_UNMOUNT 4 + +typedef struct prop_changelist prop_changelist_t; + +extern int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); +extern int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); +extern int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); +extern int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); +extern int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); +extern void zcmd_free_nvlists(zfs_cmd_t *); + +extern int changelist_prefix(prop_changelist_t *); +extern int changelist_postfix(prop_changelist_t *); +extern void changelist_rename(prop_changelist_t *, const char *, const char *); +extern void changelist_remove(prop_changelist_t *, const char *); +extern void changelist_free(prop_changelist_t *); +extern prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, + int); +extern int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); +extern int changelist_haszonedchild(prop_changelist_t *); + +extern void remove_mountpoint(zfs_handle_t *); +extern int create_parents(libzfs_handle_t *, char *, int); + +extern zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); +extern zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *, + nvlist_t *props); + +extern int zpool_open_silent(libzfs_handle_t *, const char *, + zpool_handle_t **); + +extern boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); + +extern int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, + boolean_t modifying); + +extern void namespace_clear(libzfs_handle_t *); + +extern int zfs_parse_options(char *, zfs_share_proto_t); + +typedef struct { + zfs_prop_t p_prop; + char *p_name; + int p_share_err; + int p_unshare_err; +} proto_table_t; + +typedef struct differ_info { + zfs_handle_t *zhp; + char *fromsnap; + char *frommnt; + char *tosnap; + char *tomnt; + char *ds; + char *dsmnt; + char *tmpsnap; + char 
errbuf[1024]; + boolean_t isclone; + boolean_t scripted; + boolean_t classify; + boolean_t timestamped; + uint64_t shares; + int zerr; + int cleanupfd; + int outputfd; + int datafd; +} differ_info_t; + +extern proto_table_t proto_table[PROTO_END]; + +extern int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, + int flags); +extern int do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags); +extern int zfs_mount_delegation_check(void); +extern int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto); +extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *); +extern int unshare_one(libzfs_handle_t *hdl, const char *name, + const char *mountpoint, zfs_share_proto_t proto); +extern boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, + zprop_source_t *source, int flags); +extern zfs_share_type_t is_shared(const char *mountpoint, + zfs_share_proto_t proto); +extern int libzfs_load_module(void); +extern int zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, + const char *msg); +extern int find_shares_object(differ_info_t *di); +extern void libzfs_set_pipe_max(int infd); +extern void zfs_commit_proto(zfs_share_proto_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_IMPL_H */ diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 3d7a0bf12a..0d375b3551 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -26,7 +26,6 @@ * Copyright (c) 2016, Intel Corporation. */ -#include #include #include #include @@ -37,8 +36,9 @@ #include #include #include -#include +#include "libzfs_impl.h" #include +#include /* * Returns true if the named pool matches the given GUID. 
@@ -77,14 +77,14 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) return (NULL); - dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4); + dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 32); if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) { zcmd_free_nvlists(&zc); return (NULL); } - while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT, + while ((err = zfs_ioctl(hdl, ZFS_IOC_POOL_TRYIMPORT, &zc)) != 0 && errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); @@ -112,7 +112,6 @@ refresh_config_libzfs(void *handle, nvlist_t *tryconfig) return (refresh_config((libzfs_handle_t *)handle, tryconfig)); } - static int pool_active_libzfs(void *handle, const char *name, uint64_t guid, boolean_t *isactive) @@ -147,8 +146,10 @@ zpool_clear_label(int fd) struct stat64 statbuf; int l; vdev_label_t *label; + l2arc_dev_hdr_phys_t *l2dhdr; uint64_t size; - int labels_cleared = 0; + int labels_cleared = 0, header_cleared = 0; + boolean_t clear_l2arc_header = B_FALSE; if (fstat64_blk(fd, &statbuf) == -1) return (0); @@ -158,8 +159,13 @@ zpool_clear_label(int fd) if ((label = calloc(1, sizeof (vdev_label_t))) == NULL) return (-1); + if ((l2dhdr = calloc(1, sizeof (l2arc_dev_hdr_phys_t))) == NULL) { + free(label); + return (-1); + } + for (l = 0; l < VDEV_LABELS; l++) { - uint64_t state, guid; + uint64_t state, guid, l2cache; nvlist_t *config; if (pread64(fd, label, sizeof (vdev_label_t), @@ -186,6 +192,15 @@ zpool_clear_label(int fd) continue; } + /* If the device is a cache device clear the header. */ + if (!clear_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + clear_l2arc_header = B_TRUE; + } + } + nvlist_free(config); /* @@ -203,7 +218,17 @@ zpool_clear_label(int fd) } } + /* Clear the L2ARC header. 
*/ + if (clear_l2arc_header) { + memset(l2dhdr, 0, sizeof (l2arc_dev_hdr_phys_t)); + if (pwrite64(fd, l2dhdr, sizeof (l2arc_dev_hdr_phys_t), + VDEV_LABEL_START_SIZE) == sizeof (l2arc_dev_hdr_phys_t)) { + header_cleared++; + } + } + free(label); + free(l2dhdr); if (labels_cleared == 0) return (-1); diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c index 5e9a1ecae7..3c537be794 100644 --- a/lib/libzfs/libzfs_iter.c +++ b/lib/libzfs/libzfs_iter.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019 Datto Inc. */ @@ -38,7 +38,7 @@ #include "libzfs_impl.h" -int +static int zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data) { nvlist_t *nvl = zfs_get_clones_nvl(zhp); @@ -69,7 +69,7 @@ zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) orig_cookie = zc->zc_cookie; top: (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); - rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + rc = zfs_ioctl(zhp->zfs_hdl, arg, zc); if (rc == -1) { switch (errno) { @@ -212,10 +212,12 @@ zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data) /* Setup the requested properties nvlist. 
*/ props = fnvlist_alloc(); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID)); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG)); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATION)); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_IVSET_GUID)); + for (zfs_prop_t p = 0; p < ZFS_NUM_PROPS; p++) { + if (zfs_prop_valid_for_type(p, ZFS_TYPE_BOOKMARK, B_FALSE)) { + fnvlist_add_boolean(props, zfs_prop_to_name(p)); + } + } + fnvlist_add_boolean(props, "redact_complete"); if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0) goto out; @@ -300,7 +302,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg) lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - return (AVL_CMP(lcreate, rcreate)); + return (TREE_CMP(lcreate, rcreate)); } int @@ -563,7 +565,7 @@ zfs_iter_mounted(zfs_handle_t *zhp, zfs_iter_f func, void *data) FILE *mnttab; int err = 0; - if ((mnttab = fopen(MNTTAB, "r")) == NULL) + if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); while (err == 0 && getmntent(mnttab, &entry) == 0) { @@ -573,8 +575,11 @@ zfs_iter_mounted(zfs_handle_t *zhp, zfs_iter_f func, void *data) /* Ignore datasets not within the provided dataset */ if (strncmp(entry.mnt_special, zhp->zfs_name, namelen) != 0 || - (entry.mnt_special[namelen] != '/' && - entry.mnt_special[namelen] != '@')) + entry.mnt_special[namelen] != '/') + continue; + + /* Skip snapshot of any child dataset */ + if (strchr(entry.mnt_special, '@') != NULL) continue; if ((mtab_zhp = zfs_open(zhp->zfs_hdl, entry.mnt_special, diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 649c232aa3..b0279d8fbc 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2017 by Delphix. 
All rights reserved. + * Copyright (c) 2014, 2021 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright 2017 RackTop Systems. * Copyright (c) 2018 Datto Inc. @@ -37,6 +37,7 @@ * * zfs_is_mounted() * zfs_mount() + * zfs_mount_at() * zfs_unmount() * zfs_unmountall() * @@ -94,95 +95,34 @@ static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ static void zfs_mount_task(void *); -static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); -zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, +static zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, zfs_share_proto_t); /* * The share protocols table must be in the same order as the zfs_share_proto_t * enum in libzfs_impl.h */ -typedef struct { - zfs_prop_t p_prop; - char *p_name; - int p_share_err; - int p_unshare_err; -} proto_table_t; - proto_table_t proto_table[PROTO_END] = { {ZFS_PROP_SHARENFS, "nfs", EZFS_SHARENFSFAILED, EZFS_UNSHARENFSFAILED}, {ZFS_PROP_SHARESMB, "smb", EZFS_SHARESMBFAILED, EZFS_UNSHARESMBFAILED}, }; -zfs_share_proto_t nfs_only[] = { +static zfs_share_proto_t nfs_only[] = { PROTO_NFS, PROTO_END }; -zfs_share_proto_t smb_only[] = { +static zfs_share_proto_t smb_only[] = { PROTO_SMB, PROTO_END }; -zfs_share_proto_t share_all_proto[] = { +static zfs_share_proto_t share_all_proto[] = { PROTO_NFS, PROTO_SMB, PROTO_END }; -/* - * Search the sharetab for the given mountpoint and protocol, returning - * a zfs_share_type_t value. 
- */ -static zfs_share_type_t -is_shared(libzfs_handle_t *hdl, const char *mountpoint, zfs_share_proto_t proto) -{ - char buf[MAXPATHLEN], *tab; - char *ptr; - if (hdl->libzfs_sharetab == NULL) - return (SHARED_NOT_SHARED); - - /* Reopen ZFS_SHARETAB to prevent reading stale data from open file */ - if (freopen(ZFS_SHARETAB, "r", hdl->libzfs_sharetab) == NULL) - return (SHARED_NOT_SHARED); - - (void) fseek(hdl->libzfs_sharetab, 0, SEEK_SET); - - while (fgets(buf, sizeof (buf), hdl->libzfs_sharetab) != NULL) { - - /* the mountpoint is the first entry on each line */ - if ((tab = strchr(buf, '\t')) == NULL) - continue; - - *tab = '\0'; - if (strcmp(buf, mountpoint) == 0) { - /* - * the protocol field is the third field - * skip over second field - */ - ptr = ++tab; - if ((tab = strchr(ptr, '\t')) == NULL) - continue; - ptr = ++tab; - if ((tab = strchr(ptr, '\t')) == NULL) - continue; - *tab = '\0'; - if (strcmp(ptr, - proto_table[proto].p_name) == 0) { - switch (proto) { - case PROTO_NFS: - return (SHARED_NFS); - case PROTO_SMB: - return (SHARED_SMB); - default: - return (0); - } - } - } - } - - return (SHARED_NOT_SHARED); -} static boolean_t dir_is_empty_stat(const char *dirname) @@ -300,13 +240,30 @@ zfs_is_mounted(zfs_handle_t *zhp, char **where) return (is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where)); } +/* + * Checks any higher order concerns about whether the given dataset is + * mountable, false otherwise. zfs_is_mountable_internal specifically assumes + * that the caller has verified the sanity of mounting the dataset at + * mountpoint to the extent the caller wants. + */ +static boolean_t +zfs_is_mountable_internal(zfs_handle_t *zhp, const char *mountpoint) +{ + + if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) && + getzoneid() == GLOBAL_ZONEID) + return (B_FALSE); + + return (B_TRUE); +} + /* * Returns true if the given dataset is mountable, false otherwise. Returns the * mountpoint in 'buf'. 
*/ -static boolean_t +boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, - zprop_source_t *source) + zprop_source_t *source, int flags) { char sourceloc[MAXNAMELEN]; zprop_source_t sourcetype; @@ -325,8 +282,10 @@ zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_OFF) return (B_FALSE); - if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) && - getzoneid() == GLOBAL_ZONEID) + if (!zfs_is_mountable_internal(zhp, buf)) + return (B_FALSE); + + if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) return (B_FALSE); if (source) @@ -352,68 +311,6 @@ zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, * http://www.kernel.org/pub/linux/utils/util-linux/libmount-docs/index.html */ -static int -do_mount(const char *src, const char *mntpt, char *opts) -{ - char *argv[9] = { - "/bin/mount", - "--no-canonicalize", - "-t", MNTTYPE_ZFS, - "-o", opts, - (char *)src, - (char *)mntpt, - (char *)NULL }; - int rc; - - /* Return only the most critical mount error */ - rc = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); - if (rc) { - if (rc & MOUNT_FILEIO) - return (EIO); - if (rc & MOUNT_USER) - return (EINTR); - if (rc & MOUNT_SOFTWARE) - return (EPIPE); - if (rc & MOUNT_BUSY) - return (EBUSY); - if (rc & MOUNT_SYSERR) - return (EAGAIN); - if (rc & MOUNT_USAGE) - return (EINVAL); - - return (ENXIO); /* Generic error */ - } - - return (0); -} - -static int -do_unmount(const char *mntpt, int flags) -{ - char force_opt[] = "-f"; - char lazy_opt[] = "-l"; - char *argv[7] = { - "/bin/umount", - "-t", MNTTYPE_ZFS, - NULL, NULL, NULL, NULL }; - int rc, count = 3; - - if (flags & MS_FORCE) { - argv[count] = force_opt; - count++; - } - - if (flags & MS_DETACH) { - argv[count] = lazy_opt; - count++; - } - - argv[count] = (char *)mntpt; - rc = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); - - return (rc ? 
EINVAL : 0); -} - static int zfs_add_option(zfs_handle_t *zhp, char *options, int len, zfs_prop_t prop, char *on, char *off) @@ -466,16 +363,31 @@ zfs_add_options(zfs_handle_t *zhp, char *options, int len) return (error); } +int +zfs_mount(zfs_handle_t *zhp, const char *options, int flags) +{ + char mountpoint[ZFS_MAXPROPLEN]; + + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, + flags)) + return (0); + + return (zfs_mount_at(zhp, options, flags, mountpoint)); +} + /* * Mount the given filesystem. */ int -zfs_mount(zfs_handle_t *zhp, const char *options, int flags) +zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags, + const char *mountpoint) { struct stat buf; - char mountpoint[ZFS_MAXPROPLEN]; char mntopts[MNT_LINE_MAX]; char overlay[ZFS_MAXPROPLEN]; + char prop_encroot[MAXNAMELEN]; + boolean_t is_encroot; + zfs_handle_t *encroot_hp = zhp; libzfs_handle_t *hdl = zhp->zfs_hdl; uint64_t keystatus; int remount = 0, rc; @@ -489,15 +401,16 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) if (strstr(mntopts, MNTOPT_REMOUNT) != NULL) remount = 1; + /* Potentially duplicates some checks if invoked by zfs_mount(). */ + if (!zfs_is_mountable_internal(zhp, mountpoint)) + return (0); + /* * If the pool is imported read-only then all mounts must be read-only */ if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL)) (void) strlcat(mntopts, "," MNTOPT_RO, sizeof (mntopts)); - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) - return (0); - /* * Append default mount options which apply to the mount point. 
* This is done because under Linux (unlike Solaris) multiple mount @@ -533,7 +446,27 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) */ if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) { if (flags & MS_CRYPT) { - rc = zfs_crypto_load_key(zhp, B_FALSE, NULL); + rc = zfs_crypto_get_encryption_root(zhp, + &is_encroot, prop_encroot); + if (rc) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to get encryption root for " + "'%s'."), zfs_get_name(zhp)); + return (rc); + } + + if (!is_encroot) { + encroot_hp = zfs_open(hdl, prop_encroot, + ZFS_TYPE_DATASET); + if (encroot_hp == NULL) + return (hdl->libzfs_error); + } + + rc = zfs_crypto_load_key(encroot_hp, + B_FALSE, NULL); + + if (!is_encroot) + zfs_close(encroot_hp); if (rc) return (rc); } else { @@ -556,7 +489,8 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) if (lstat(mountpoint, &buf) != 0) { if (mkdirp(mountpoint, 0755) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "failed to create mountpoint")); + "failed to create mountpoint: %s"), + strerror(errno)); return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); @@ -564,8 +498,8 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) } /* - * Overlay mounts are disabled by default but may be enabled - * via the 'overlay' property or the 'zfs mount -O' option. + * Overlay mounts are enabled by default but may be disabled + * via the 'overlay' property. The -O flag remains for compatibility. */ if (!(flags & MS_OVERLAY)) { if (zfs_prop_get(zhp, ZFS_PROP_OVERLAY, overlay, @@ -579,7 +513,7 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) /* * Determine if the mountpoint is empty. If so, refuse to perform the * mount. 
We don't perform this check if 'remount' is - * specified or if overlay option(-O) is given + * specified or if overlay option (-O) is given */ if ((flags & MS_OVERLAY) == 0 && !remount && !dir_is_empty(mountpoint)) { @@ -590,7 +524,7 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) } /* perform the mount */ - rc = do_mount(zfs_get_name(zhp), mountpoint, mntopts); + rc = do_mount(zhp, mountpoint, mntopts, flags); if (rc) { /* * Generic errors are nasty, but there are just way too many @@ -604,19 +538,17 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Insufficient privileges")); } else if (rc == ENOTSUP) { - char buf[256]; int spa_version; VERIFY(zfs_spa_version(zhp, &spa_version) == 0); - (void) snprintf(buf, sizeof (buf), - dgettext(TEXT_DOMAIN, "Can't mount a version %lld " + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Can't mount a version %llu " "file system on a version %d pool. Pool must be" " upgraded to mount this file system."), (u_longlong_t)zfs_prop_get_int(zhp, ZFS_PROP_VERSION), spa_version); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf)); } else { - zfs_error_aux(hdl, strerror(rc)); + zfs_error_aux(hdl, "%s", strerror(rc)); } return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), @@ -636,13 +568,34 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) * Unmount a single filesystem. 
*/ static int -unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) +unmount_one(zfs_handle_t *zhp, const char *mountpoint, int flags) { int error; - error = do_unmount(mountpoint, flags); + error = do_unmount(zhp, mountpoint, flags); if (error != 0) { - return (zfs_error_fmt(hdl, EZFS_UMOUNTFAILED, + int libzfs_err; + + switch (error) { + case EBUSY: + libzfs_err = EZFS_BUSY; + break; + case EIO: + libzfs_err = EZFS_IO; + break; + case ENOENT: + libzfs_err = EZFS_NOENT; + break; + case ENOMEM: + libzfs_err = EZFS_NOMEM; + break; + case EPERM: + libzfs_err = EZFS_PERM; + break; + default: + libzfs_err = EZFS_UMOUNTFAILED; + } + return (zfs_error_fmt(zhp->zfs_hdl, libzfs_err, dgettext(TEXT_DOMAIN, "cannot unmount '%s'"), mountpoint)); } @@ -659,6 +612,7 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; char *mntpt = NULL; + boolean_t encroot, unmounted = B_FALSE; /* check to see if we need to unmount the filesystem */ if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && @@ -681,16 +635,45 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) free(mntpt); return (-1); } + zfs_commit_all_shares(); - if (unmount_one(hdl, mntpt, flags) != 0) { + if (unmount_one(zhp, mntpt, flags) != 0) { free(mntpt); (void) zfs_shareall(zhp); + zfs_commit_all_shares(); return (-1); } + libzfs_mnttab_remove(hdl, zhp->zfs_name); free(mntpt); + unmounted = B_TRUE; } + /* + * If the MS_CRYPT flag is provided we must ensure we attempt to + * unload the dataset's key regardless of whether we did any work + * to unmount it. We only do this for encryption roots. 
+ */ + if ((flags & MS_CRYPT) != 0 && + zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { + zfs_refresh_properties(zhp); + + if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0 && + unmounted) { + (void) zfs_mount(zhp, NULL, 0); + return (-1); + } + + if (encroot && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == + ZFS_KEYSTATUS_AVAILABLE && + zfs_crypto_unload_key(zhp) != 0) { + (void) zfs_mount(zhp, NULL, 0); + return (-1); + } + } + + zpool_disable_volume_os(zhp->zfs_name); + return (0); } @@ -706,7 +689,7 @@ zfs_unmountall(zfs_handle_t *zhp, int flags) int ret; clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, - CL_GATHER_ITER_MOUNTED, 0); + CL_GATHER_ITER_MOUNTED, flags); if (clp == NULL) return (-1); @@ -732,6 +715,94 @@ zfs_is_shared(zfs_handle_t *zhp) return (rc ? B_TRUE : B_FALSE); } +/* + * Unshare a filesystem by mountpoint. + */ +int +unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, + zfs_share_proto_t proto) +{ + int err; + + err = sa_disable_share(mountpoint, proto_table[proto].p_name); + if (err != SA_OK) { + return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, + dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), + name, sa_errorstr(err))); + } + return (0); +} + +/* + * Query libshare for the given mountpoint and protocol, returning + * a zfs_share_type_t value. + */ +zfs_share_type_t +is_shared(const char *mountpoint, zfs_share_proto_t proto) +{ + if (sa_is_shared(mountpoint, proto_table[proto].p_name)) { + switch (proto) { + case PROTO_NFS: + return (SHARED_NFS); + case PROTO_SMB: + return (SHARED_SMB); + default: + return (SHARED_NOT_SHARED); + } + } + return (SHARED_NOT_SHARED); +} + +/* + * Share the given filesystem according to the options in the specified + * protocol specific properties (sharenfs, sharesmb). We rely + * on "libshare" to do the dirty work for us. 
+ */ +int +zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) +{ + char mountpoint[ZFS_MAXPROPLEN]; + char shareopts[ZFS_MAXPROPLEN]; + char sourcestr[ZFS_MAXPROPLEN]; + zfs_share_proto_t *curr_proto; + zprop_source_t sourcetype; + int err = 0; + + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, 0)) + return (0); + + for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { + /* + * Return success if there are no share options. + */ + if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop, + shareopts, sizeof (shareopts), &sourcetype, sourcestr, + ZFS_MAXPROPLEN, B_FALSE) != 0 || + strcmp(shareopts, "off") == 0) + continue; + + /* + * If the 'zoned' property is set, then zfs_is_mountable() + * will have already bailed out if we are in the global zone. + * But local zones cannot be NFS servers, so we ignore it for + * local zones as well. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) + continue; + + err = sa_enable_share(zfs_get_name(zhp), mountpoint, shareopts, + proto_table[*curr_proto].p_name); + if (err != SA_OK) { + return (zfs_error_fmt(zhp->zfs_hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s: %s'"), + zfs_get_name(zhp), sa_errorstr(err))); + } + + } + return (0); +} + int zfs_share(zfs_handle_t *zhp) { @@ -749,7 +820,7 @@ zfs_unshare(zfs_handle_t *zhp) /* * Check to see if the filesystem is currently shared. 
*/ -zfs_share_type_t +static zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *zhp, char **where, zfs_share_proto_t proto) { char *mountpoint; @@ -758,7 +829,7 @@ zfs_is_shared_proto(zfs_handle_t *zhp, char **where, zfs_share_proto_t proto) if (!zfs_is_mounted(zhp, &mountpoint)) return (SHARED_NOT_SHARED); - if ((rc = is_shared(zhp->zfs_hdl, mountpoint, proto)) + if ((rc = is_shared(mountpoint, proto)) != SHARED_NOT_SHARED) { if (where != NULL) *where = mountpoint; @@ -785,59 +856,6 @@ zfs_is_shared_smb(zfs_handle_t *zhp, char **where) PROTO_SMB) != SHARED_NOT_SHARED); } -/* - * zfs_init_libshare(zhandle, service) - * - * Initialize the libshare API if it hasn't already been initialized. - * In all cases it returns 0 if it succeeded and an error if not. The - * service value is which part(s) of the API to initialize and is a - * direct map to the libshare sa_init(service) interface. - */ -int -zfs_init_libshare(libzfs_handle_t *zhandle, int service) -{ - int ret = SA_OK; - - if (ret == SA_OK && zhandle->libzfs_shareflags & ZFSSHARE_MISS) { - /* - * We had a cache miss. Most likely it is a new ZFS - * dataset that was just created. We want to make sure - * so check timestamps to see if a different process - * has updated any of the configuration. If there was - * some non-ZFS change, we need to re-initialize the - * internal cache. - */ - zhandle->libzfs_shareflags &= ~ZFSSHARE_MISS; - if (sa_needs_refresh(zhandle->libzfs_sharehdl)) { - zfs_uninit_libshare(zhandle); - zhandle->libzfs_sharehdl = sa_init(service); - } - } - - if (ret == SA_OK && zhandle && zhandle->libzfs_sharehdl == NULL) - zhandle->libzfs_sharehdl = sa_init(service); - - if (ret == SA_OK && zhandle->libzfs_sharehdl == NULL) - ret = SA_NO_MEMORY; - - return (ret); -} - -/* - * zfs_uninit_libshare(zhandle) - * - * Uninitialize the libshare API if it hasn't already been - * uninitialized. It is OK to call multiple times. 
- */ -void -zfs_uninit_libshare(libzfs_handle_t *zhandle) -{ - if (zhandle != NULL && zhandle->libzfs_sharehdl != NULL) { - sa_fini(zhandle->libzfs_sharehdl); - zhandle->libzfs_sharehdl = NULL; - } -} - /* * zfs_parse_options(options, proto) * @@ -847,105 +865,46 @@ zfs_uninit_libshare(libzfs_handle_t *zhandle) int zfs_parse_options(char *options, zfs_share_proto_t proto) { - return (sa_parse_legacy_options(NULL, options, - proto_table[proto].p_name)); + return (sa_validate_shareopts(options, proto_table[proto].p_name)); } -/* - * Share the given filesystem according to the options in the specified - * protocol specific properties (sharenfs, sharesmb). We rely - * on "libshare" to do the dirty work for us. - */ -static int -zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) +void +zfs_commit_proto(zfs_share_proto_t *proto) { - char mountpoint[ZFS_MAXPROPLEN]; - char shareopts[ZFS_MAXPROPLEN]; - char sourcestr[ZFS_MAXPROPLEN]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - sa_share_t share; zfs_share_proto_t *curr_proto; - zprop_source_t sourcetype; - int ret; - - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) - return (0); - for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { - /* - * Return success if there are no share options. - */ - if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop, - shareopts, sizeof (shareopts), &sourcetype, sourcestr, - ZFS_MAXPROPLEN, B_FALSE) != 0 || - strcmp(shareopts, "off") == 0) - continue; - - ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API); - if (ret != SA_OK) { - (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, - dgettext(TEXT_DOMAIN, "cannot share '%s': %s"), - zfs_get_name(zhp), sa_errorstr(ret)); - return (-1); - } - - /* - * If the 'zoned' property is set, then zfs_is_mountable() - * will have already bailed out if we are in the global zone. - * But local zones cannot be NFS servers, so we ignore it for - * local zones as well. 
- */ - if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) - continue; - - share = sa_find_share(hdl->libzfs_sharehdl, mountpoint); - if (share == NULL) { - /* - * This may be a new file system that was just - * created so isn't in the internal cache - * (second time through). Rather than - * reloading the entire configuration, we can - * assume ZFS has done the checking and it is - * safe to add this to the internal - * configuration. - */ - if (sa_zfs_process_share(hdl->libzfs_sharehdl, - NULL, NULL, mountpoint, - proto_table[*curr_proto].p_name, sourcetype, - shareopts, sourcestr, zhp->zfs_name) != SA_OK) { - (void) zfs_error_fmt(hdl, - proto_table[*curr_proto].p_share_err, - dgettext(TEXT_DOMAIN, "cannot share '%s'"), - zfs_get_name(zhp)); - return (-1); - } - hdl->libzfs_shareflags |= ZFSSHARE_MISS; - share = sa_find_share(hdl->libzfs_sharehdl, - mountpoint); - } - if (share != NULL) { - int err; - err = sa_enable_share(share, - proto_table[*curr_proto].p_name); - if (err != SA_OK) { - (void) zfs_error_fmt(hdl, - proto_table[*curr_proto].p_share_err, - dgettext(TEXT_DOMAIN, "cannot share '%s'"), - zfs_get_name(zhp)); - return (-1); - } - } else { - (void) zfs_error_fmt(hdl, - proto_table[*curr_proto].p_share_err, - dgettext(TEXT_DOMAIN, "cannot share '%s'"), - zfs_get_name(zhp)); - return (-1); - } - + sa_commit_shares(proto_table[*curr_proto].p_name); } - return (0); } +void +zfs_commit_nfs_shares(void) +{ + zfs_commit_proto(nfs_only); +} + +void +zfs_commit_smb_shares(void) +{ + zfs_commit_proto(smb_only); +} + +void +zfs_commit_all_shares(void) +{ + zfs_commit_proto(share_all_proto); +} + +void +zfs_commit_shares(const char *proto) +{ + if (proto == NULL) + zfs_commit_proto(share_all_proto); + else if (strcmp(proto, "nfs") == 0) + zfs_commit_proto(nfs_only); + else if (strcmp(proto, "smb") == 0) + zfs_commit_proto(smb_only); +} int zfs_share_nfs(zfs_handle_t *zhp) @@ -965,50 +924,6 @@ zfs_shareall(zfs_handle_t *zhp) return (zfs_share_proto(zhp, share_all_proto)); } 
-/* - * Unshare a filesystem by mountpoint. - */ -static int -unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, - zfs_share_proto_t proto) -{ - sa_share_t share; - int err; - char *mntpt; - /* - * Mountpoint could get trashed if libshare calls getmntany - * which it does during API initialization, so strdup the - * value. - */ - mntpt = zfs_strdup(hdl, mountpoint); - - /* make sure libshare initialized */ - if ((err = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) { - free(mntpt); /* don't need the copy anymore */ - return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, - dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), - name, sa_errorstr(err))); - } - - share = sa_find_share(hdl->libzfs_sharehdl, mntpt); - free(mntpt); /* don't need the copy anymore */ - - if (share != NULL) { - err = sa_disable_share(share, proto_table[proto].p_name); - if (err != SA_OK) { - return (zfs_error_fmt(hdl, - proto_table[proto].p_unshare_err, - dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), - name, sa_errorstr(err))); - } - } else { - return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, - dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"), - name)); - } - return (0); -} - /* * Unshare the given filesystem. */ @@ -1034,12 +949,13 @@ zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { - if (is_shared(hdl, mntpt, *curr_proto) && - unshare_one(hdl, zhp->zfs_name, - mntpt, *curr_proto) != 0) { - if (mntpt != NULL) - free(mntpt); - return (-1); + if (is_shared(mntpt, *curr_proto)) { + if (unshare_one(hdl, zhp->zfs_name, + mntpt, *curr_proto) != 0) { + if (mntpt != NULL) + free(mntpt); + return (-1); + } } } } @@ -1064,7 +980,7 @@ zfs_unshare_smb(zfs_handle_t *zhp, const char *mountpoint) /* * Same as zfs_unmountall(), but for NFS and SMB unshares. 
*/ -int +static int zfs_unshareall_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) { prop_changelist_t *clp; @@ -1136,7 +1052,7 @@ remove_mountpoint(zfs_handle_t *zhp) zprop_source_t source; if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), - &source)) + &source, 0)) return; if (source == ZPROP_SRC_DEFAULT || @@ -1302,12 +1218,14 @@ mountpoint_cmp(const void *arga, const void *argb) } /* - * Return true if path2 is a child of path1. + * Return true if path2 is a child of path1 or path2 equals path1 or + * path1 is "/" (path2 is always a child of "/"). */ static boolean_t libzfs_path_contains(const char *path1, const char *path2) { - return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'); + return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 || + (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/')); } /* @@ -1535,7 +1453,6 @@ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, * Mount and share all datasets within the given pool. This assumes that no * datasets within the pool are currently mounted. 
*/ -#pragma weak zpool_mount_datasets = zpool_enable_datasets int zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) { @@ -1577,6 +1494,8 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) zfs_share_one, &ms, B_FALSE); if (ms.ms_mntstatus != 0) ret = ms.ms_mntstatus; + else + zfs_commit_all_shares(); out: for (int i = 0; i < cb.cb_used; i++) @@ -1586,17 +1505,20 @@ out: return (ret); } +struct sets_s { + char *mountpoint; + zfs_handle_t *dataset; +}; + static int mountpoint_compare(const void *a, const void *b) { - const char *mounta = *((char **)a); - const char *mountb = *((char **)b); + const struct sets_s *mounta = (struct sets_s *)a; + const struct sets_s *mountb = (struct sets_s *)b; - return (strcmp(mountb, mounta)); + return (strcmp(mountb->mountpoint, mounta->mountpoint)); } -/* alias for 2002/240 */ -#pragma weak zpool_unmount_datasets = zpool_disable_datasets /* * Unshare and unmount all datasets within the given pool. We don't want to * rely on traversing the DSL to discover the filesystems within the pool, @@ -1608,10 +1530,10 @@ int zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) { int used, alloc; + FILE *mnttab; struct mnttab entry; size_t namelen; - char **mountpoints = NULL; - zfs_handle_t **datasets = NULL; + struct sets_s *sets = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; int i; int ret = -1; @@ -1619,12 +1541,11 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) namelen = strlen(zhp->zpool_name); - /* Reopen MNTTAB to prevent reading stale data from open file */ - if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) + if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); used = alloc = 0; - while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + while (getmntent(mnttab, &entry) == 0) { /* * Ignore non-ZFS entries. 
*/ @@ -1647,35 +1568,27 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) */ if (used == alloc) { if (alloc == 0) { - if ((mountpoints = zfs_alloc(hdl, - 8 * sizeof (void *))) == NULL) - goto out; - if ((datasets = zfs_alloc(hdl, - 8 * sizeof (void *))) == NULL) + if ((sets = zfs_alloc(hdl, + 8 * sizeof (struct sets_s))) == NULL) goto out; alloc = 8; } else { void *ptr; - if ((ptr = zfs_realloc(hdl, mountpoints, - alloc * sizeof (void *), - alloc * 2 * sizeof (void *))) == NULL) + if ((ptr = zfs_realloc(hdl, sets, + alloc * sizeof (struct sets_s), + alloc * 2 * sizeof (struct sets_s))) + == NULL) goto out; - mountpoints = ptr; - - if ((ptr = zfs_realloc(hdl, datasets, - alloc * sizeof (void *), - alloc * 2 * sizeof (void *))) == NULL) - goto out; - datasets = ptr; + sets = ptr; alloc *= 2; } } - if ((mountpoints[used] = zfs_strdup(hdl, + if ((sets[used].mountpoint = zfs_strdup(hdl, entry.mnt_mountp)) == NULL) goto out; @@ -1684,7 +1597,8 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) * is only used to determine if we need to remove the underlying * mountpoint, so failure is not fatal. */ - datasets[used] = make_dataset_handle(hdl, entry.mnt_special); + sets[used].dataset = make_dataset_handle(hdl, + entry.mnt_special); used++; } @@ -1693,7 +1607,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) * At this point, we have the entire list of filesystems, so sort it by * mountpoint. */ - qsort(mountpoints, used, sizeof (char *), mountpoint_compare); + qsort(sets, used, sizeof (struct sets_s), mountpoint_compare); /* * Walk through and first unshare everything. 
@@ -1702,36 +1616,40 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) zfs_share_proto_t *curr_proto; for (curr_proto = share_all_proto; *curr_proto != PROTO_END; curr_proto++) { - if (is_shared(hdl, mountpoints[i], *curr_proto) && - unshare_one(hdl, mountpoints[i], - mountpoints[i], *curr_proto) != 0) + if (is_shared(sets[i].mountpoint, *curr_proto) && + unshare_one(hdl, sets[i].mountpoint, + sets[i].mountpoint, *curr_proto) != 0) goto out; } } + zfs_commit_all_shares(); /* * Now unmount everything, removing the underlying directories as * appropriate. */ for (i = 0; i < used; i++) { - if (unmount_one(hdl, mountpoints[i], flags) != 0) + if (unmount_one(sets[i].dataset, sets[i].mountpoint, + flags) != 0) goto out; } for (i = 0; i < used; i++) { - if (datasets[i]) - remove_mountpoint(datasets[i]); + if (sets[i].dataset) + remove_mountpoint(sets[i].dataset); } + zpool_disable_datasets_os(zhp, force); + ret = 0; out: + (void) fclose(mnttab); for (i = 0; i < used; i++) { - if (datasets[i]) - zfs_close(datasets[i]); - free(mountpoints[i]); + if (sets[i].dataset) + zfs_close(sets[i].dataset); + free(sets[i].mountpoint); } - free(datasets); - free(mountpoints); + free(sets); return (ret); } diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index a6e26ebcd4..8ed96275c4 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -22,16 +22,16 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. 
* Copyright (c) 2018, loli10K + * Copyright (c) 2021, Colm Buckley */ #include -#include #include #include #include @@ -42,11 +42,13 @@ #include #include #include -#include #include +#include #include +#include #include #include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -54,7 +56,6 @@ #include "zfs_comutil.h" #include "zfeature_common.h" -static int read_efi_label(nvlist_t *config, diskaddr_t *sb); static boolean_t zpool_vdev_is_interior(const char *name); typedef struct prop_flags { @@ -79,7 +80,7 @@ zpool_get_all_props(zpool_handle_t *zhp) if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); - while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { + while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); @@ -101,7 +102,7 @@ zpool_get_all_props(zpool_handle_t *zhp) return (0); } -static int +int zpool_props_refresh(zpool_handle_t *zhp) { nvlist_t *old_props; @@ -305,6 +306,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_COMMENT: + case ZPOOL_PROP_COMPATIBILITY: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, @@ -312,7 +314,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, len); break; } - /* FALLTHROUGH */ + fallthrough; default: (void) strlcpy(buf, "-", len); break; @@ -403,7 +405,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, (void) snprintf(buf, len, "-"); break; } - /* FALLTHROUGH */ + fallthrough; default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } @@ -432,7 +434,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, * Assuming bootfs is a valid dataset name. 
*/ static boolean_t -bootfs_name_valid(const char *pool, char *bootfs) +bootfs_name_valid(const char *pool, const char *bootfs) { int len = strlen(pool); if (bootfs[0] == '\0') @@ -448,17 +450,6 @@ bootfs_name_valid(const char *pool, char *bootfs) return (B_FALSE); } -boolean_t -zpool_is_bootable(zpool_handle_t *zhp) -{ - char bootfs[ZFS_MAX_DATASET_NAME_LEN]; - - return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs, - sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-", - sizeof (bootfs)) != 0); -} - - /* * Given an nvlist of zpool properties to be set, validate that they are * correct, and parse any numeric properties (index, boolean, etc) if they are @@ -476,6 +467,7 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, char *slash, *check; struct stat64 statbuf; zpool_handle_t *zhp; + char report[1024]; if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); @@ -495,7 +487,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid feature '%s'"), fname); + "feature '%s' unsupported by kernel"), + fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } @@ -570,8 +563,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, if (intval < version || !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' number %d is invalid."), - propname, intval); + "property '%s' number %llu is invalid."), + propname, (unsigned long long)intval); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } @@ -581,10 +574,11 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, if (intval != 0 && (intval < ASHIFT_MIN || intval > ASHIFT_MAX)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' number %d is invalid, only " - "values between %" PRId32 " and " - "%" PRId32 " are allowed."), - propname, intval, ASHIFT_MIN, ASHIFT_MAX); + "property 
'%s' number %llu is invalid, " + "only values between %" PRId32 " and %" + PRId32 " are allowed."), + propname, (unsigned long long)intval, + ASHIFT_MIN, ASHIFT_MAX); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } @@ -684,6 +678,20 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, *slash = '/'; break; + case ZPOOL_PROP_COMPATIBILITY: + switch (zpool_load_compat(strval, NULL, report, 1024)) { + case ZPOOL_COMPATIBILITY_OK: + case ZPOOL_COMPATIBILITY_WARNTOKEN: + break; + case ZPOOL_COMPATIBILITY_BADFILE: + case ZPOOL_COMPATIBILITY_BADTOKEN: + case ZPOOL_COMPATIBILITY_NOFILES: + zfs_error_aux(hdl, "%s", report); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + case ZPOOL_PROP_COMMENT: for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { @@ -722,20 +730,11 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, } break; case ZPOOL_PROP_DEDUPDITTO: - if (intval < ZIO_DEDUPDITTO_MIN && intval != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' value %d is invalid; only " - "values of 0 or >= %" PRId32 " are allowed " - "for this property."), - propname, intval, ZIO_DEDUPDITTO_MIN); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } + printf("Note: property '%s' no longer has " + "any effect\n", propname); break; default: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s'(%d) not defined"), propname, prop); break; } } @@ -806,7 +805,8 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) } int -zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) +zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp, + boolean_t literal) { libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; @@ -885,13 +885,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) } for (entry = *plp; entry != NULL; entry = entry->pl_next) { - - if (entry->pl_fixed) + if (entry->pl_fixed && !literal) continue; if 
(entry->pl_prop != ZPROP_INVAL && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), - NULL, B_FALSE) == 0) { + NULL, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } @@ -983,6 +982,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) @@ -1209,6 +1209,61 @@ zpool_has_special_vdev(nvlist_t *nvroot) return (B_FALSE); } +/* + * Check if vdev list contains a dRAID vdev + */ +static boolean_t +zpool_has_draid_vdev(nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (uint_t c = 0; c < children; c++) { + char *type; + + if (nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type) == 0 && + strcmp(type, VDEV_TYPE_DRAID) == 0) { + return (B_TRUE); + } + } + } + return (B_FALSE); +} + +/* + * Output a dRAID top-level vdev name in to the provided buffer. + */ +static char * +zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, + uint64_t spares, uint64_t children) +{ + snprintf(name, len, "%s%llu:%llud:%lluc:%llus", + VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, + (u_longlong_t)children, (u_longlong_t)spares); + + return (name); +} + +/* + * Return B_TRUE if the provided name is a dRAID spare name. + */ +boolean_t +zpool_is_draid_spare(const char *name) +{ + uint64_t spare_id, parity, vdev_id; + + if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", + (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, + (u_longlong_t *)&spare_id) == 3) { + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Create the named pool, using the provided vdev list. 
It is assumed * that the consumer has already validated the contents of the nvlist, so we @@ -1363,10 +1418,16 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, "one or more devices is out of space")); return (zfs_error(hdl, EZFS_BADDEV, msg)); - case ENOTBLK: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cache device must be a disk or disk slice")); - return (zfs_error(hdl, EZFS_BADDEV, msg)); + case EINVAL: + if (zpool_has_draid_vdev(nvroot) && + zfeature_lookup_name("draid", NULL) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID vdevs are unsupported by the " + "kernel")); + return (zfs_error(hdl, EZFS_BADDEV, msg)); + } else { + return (zpool_standard_error(hdl, errno, msg)); + } default: return (zpool_standard_error(hdl, errno, msg)); @@ -1523,15 +1584,25 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) break; case EINVAL: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid config; a pool with removing/removed " - "vdevs does not support adding raidz vdevs")); + + if (zpool_has_draid_vdev(nvroot) && + zfeature_lookup_name("draid", NULL) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID vdevs are unsupported by the " + "kernel")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid config; a pool with removing/" + "removed vdevs does not support adding " + "raidz or dRAID vdevs")); + } + (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EOVERFLOW: /* - * This occurrs when one of the devices is below + * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. 
@@ -1555,12 +1626,6 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) (void) zfs_error(hdl, EZFS_BADVERSION, msg); break; - case ENOTBLK: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cache device must be a disk or disk slice")); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - default: (void) zpool_standard_error(hdl, errno, msg); } @@ -1584,10 +1649,6 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, const char *log_str) { zfs_cmd_t zc = {"\0"}; - char msg[1024]; - - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot export '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; @@ -1602,11 +1663,13 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, "'%s' has an active shared spare which could be" " used by other pools once '%s' is exported."), zhp->zpool_name, zhp->zpool_name); - return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, - msg)); + return (zfs_error_fmt(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, + dgettext(TEXT_DOMAIN, "cannot export '%s'"), + zhp->zpool_name)); default: return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, - msg)); + dgettext(TEXT_DOMAIN, "cannot export '%s'"), + zhp->zpool_name)); } } @@ -2016,7 +2079,7 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, "the zgenhostid(8) command.\n")); } - (void) zfs_error_aux(hdl, aux); + (void) zfs_error_aux(hdl, "%s", aux); } (void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc); break; @@ -2164,11 +2227,10 @@ xlate_init_err(int err) * Begin, suspend, or cancel the initialization (initializing of all free * blocks) for the given vdevs in the given pool. 
*/ -int -zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, - nvlist_t *vds) +static int +zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds, boolean_t wait) { - char msg[1024]; int err; nvlist_t *vdev_guids = fnvlist_alloc(); @@ -2180,26 +2242,46 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, guids_to_paths, &vd_errlist); - if (err == 0) { - err = lzc_initialize(zhp->zpool_name, cmd_type, - vdev_guids, &errlist); - if (err == 0) { - fnvlist_free(vdev_guids); - fnvlist_free(guids_to_paths); - return (0); - } + if (err != 0) { + verify(vd_errlist != NULL); + goto list_errors; + } + err = lzc_initialize(zhp->zpool_name, cmd_type, + vdev_guids, &errlist); + + if (err != 0) { if (errlist != NULL) { vd_errlist = fnvlist_lookup_nvlist(errlist, ZPOOL_INITIALIZE_VDEVS); + goto list_errors; } - - (void) snprintf(msg, sizeof (msg), + (void) zpool_standard_error(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "operation failed")); - } else { - verify(vd_errlist != NULL); + goto out; } + if (wait) { + for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; + elem = nvlist_next_nvpair(vdev_guids, elem)) { + + uint64_t guid = fnvpair_value_uint64(elem); + + err = lzc_wait_tag(zhp->zpool_name, + ZPOOL_WAIT_INITIALIZE, guid, NULL); + if (err != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, + err, dgettext(TEXT_DOMAIN, "error " + "waiting for '%s' to initialize"), + nvpair_name(elem)); + + goto out; + } + } + } + goto out; + +list_errors: for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) { int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); @@ -2213,15 +2295,28 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, "cannot initialize '%s'", path); } +out: fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); - if (vd_errlist != NULL) { + if 
(vd_errlist != NULL) fnvlist_free(vd_errlist); - return (-1); - } - return (zpool_standard_error(zhp->zpool_hdl, err, msg)); + return (err == 0 ? 0 : -1); +} + +int +zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds) +{ + return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE)); +} + +int +zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds) +{ + return (zpool_initialize_impl(zhp, cmd_type, vds, B_TRUE)); } static int @@ -2243,47 +2338,50 @@ xlate_trim_err(int err) return (err); } -/* - * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for - * the given vdevs in the given pool. - */ -int -zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, - trimflags_t *trim_flags) +static int +zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids) { - char msg[1024]; int err; - - nvlist_t *vdev_guids = fnvlist_alloc(); - nvlist_t *guids_to_paths = fnvlist_alloc(); - nvlist_t *vd_errlist = NULL; - nvlist_t *errlist; nvpair_t *elem; - err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, - guids_to_paths, &vd_errlist); - if (err == 0) { - err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate, - trim_flags->secure, vdev_guids, &errlist); - if (err == 0) { - fnvlist_free(vdev_guids); - fnvlist_free(guids_to_paths); - return (0); - } + for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; + elem = nvlist_next_nvpair(vdev_guids, elem)) { - if (errlist != NULL) { - vd_errlist = fnvlist_lookup_nvlist(errlist, - ZPOOL_TRIM_VDEVS); - } + uint64_t guid = fnvpair_value_uint64(elem); - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "operation failed")); - } else { - verify(vd_errlist != NULL); + err = lzc_wait_tag(zhp->zpool_name, + ZPOOL_WAIT_TRIM, guid, NULL); + if (err != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, + err, dgettext(TEXT_DOMAIN, "error " + "waiting to trim '%s'"), nvpair_name(elem)); + + return (err); + } + } + 
return (0); +} + +/* + * Check errlist and report any errors, omitting ones which should be + * suppressed. Returns B_TRUE if any errors were reported. + */ +static boolean_t +check_trim_errs(zpool_handle_t *zhp, trimflags_t *trim_flags, + nvlist_t *guids_to_paths, nvlist_t *vds, nvlist_t *errlist) +{ + nvpair_t *elem; + boolean_t reported_errs = B_FALSE; + int num_vds = 0; + int num_suppressed_errs = 0; + + for (elem = nvlist_next_nvpair(vds, NULL); + elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { + num_vds++; } - for (elem = nvlist_next_nvpair(vd_errlist, NULL); - elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) { + for (elem = nvlist_next_nvpair(errlist, NULL); + elem != NULL; elem = nvlist_next_nvpair(errlist, elem)) { int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem)); char *path; @@ -2295,9 +2393,11 @@ zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, if (vd_error == EZFS_TRIM_NOTSUP && trim_flags->fullpool && !trim_flags->secure) { + num_suppressed_errs++; continue; } + reported_errs = B_TRUE; if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), &path) != 0) path = nvpair_name(elem); @@ -2306,15 +2406,72 @@ zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, "cannot trim '%s'", path); } - fnvlist_free(vdev_guids); - fnvlist_free(guids_to_paths); - - if (vd_errlist != NULL) { - fnvlist_free(vd_errlist); - return (-1); + if (num_suppressed_errs == num_vds) { + (void) zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, + "no devices in pool support trim operations")); + (void) (zfs_error(zhp->zpool_hdl, EZFS_TRIM_NOTSUP, + dgettext(TEXT_DOMAIN, "cannot trim"))); + reported_errs = B_TRUE; } - return (zpool_standard_error(zhp->zpool_hdl, err, msg)); + return (reported_errs); +} + +/* + * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for + * the given vdevs in the given pool. 
+ */ +int +zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, + trimflags_t *trim_flags) +{ + int err; + int retval = 0; + + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *guids_to_paths = fnvlist_alloc(); + nvlist_t *errlist = NULL; + + err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, + guids_to_paths, &errlist); + if (err != 0) { + check_trim_errs(zhp, trim_flags, guids_to_paths, vds, errlist); + retval = -1; + goto out; + } + + err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate, + trim_flags->secure, vdev_guids, &errlist); + if (err != 0) { + nvlist_t *vd_errlist; + if (errlist != NULL && nvlist_lookup_nvlist(errlist, + ZPOOL_TRIM_VDEVS, &vd_errlist) == 0) { + if (check_trim_errs(zhp, trim_flags, guids_to_paths, + vds, vd_errlist)) { + retval = -1; + goto out; + } + } else { + char msg[1024]; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "operation failed")); + zpool_standard_error(zhp->zpool_hdl, err, msg); + retval = -1; + goto out; + } + } + + + if (trim_flags->wait) + retval = zpool_trim_wait(zhp, vdev_guids); + +out: + if (errlist != NULL) + fnvlist_free(errlist); + fnvlist_free(vdev_guids); + fnvlist_free(guids_to_paths); + return (retval); } /* @@ -2375,7 +2532,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); - if (ps && ps->pss_func == POOL_SCAN_SCRUB) { + if (ps && ps->pss_func == POOL_SCAN_SCRUB && + ps->pss_state == DSS_SCANNING) { if (cmd == POOL_SCRUB_PAUSE) return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg)); else @@ -2490,6 +2648,36 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, errno = 0; vdev_id = strtoull(idx, &end, 10); + /* + * If we are looking for a raidz and a parity is + * specified, make sure it matches. 
+ */ + int rzlen = strlen(VDEV_TYPE_RAIDZ); + assert(rzlen == strlen(VDEV_TYPE_DRAID)); + int typlen = strlen(type); + if ((strncmp(type, VDEV_TYPE_RAIDZ, rzlen) == 0 || + strncmp(type, VDEV_TYPE_DRAID, rzlen) == 0) && + typlen != rzlen) { + uint64_t vdev_parity; + int parity = *(type + rzlen) - '0'; + + if (parity <= 0 || parity > 3 || + (typlen - rzlen) != 1) { + /* + * Nonsense parity specified, can + * never match + */ + free(type); + return (NULL); + } + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_NPARITY, &vdev_parity) == 0); + if ((int)vdev_parity != parity) { + free(type); + break; + } + } + free(type); if (errno != 0) return (NULL); @@ -2607,6 +2795,11 @@ zpool_vdev_is_interior(const char *name) VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); + + if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && + !zpool_is_draid_spare(name)) + return (B_TRUE); + return (B_FALSE); } @@ -2790,45 +2983,6 @@ zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) phypath_size)); } -/* - * If the device has being dynamically expanded then we need to relabel - * the disk to use the new unallocated space. - */ -static int -zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) -{ - int fd, error; - - if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "relabel '%s': unable to open device: %d"), path, errno); - return (zfs_error(hdl, EZFS_OPENFAILED, msg)); - } - - /* - * It's possible that we might encounter an error if the device - * does not have any unallocated space left. If so, we simply - * ignore that error and continue on. - * - * Also, we don't call efi_rescan() - that would just return EBUSY. - * The module will do it for us in vdev_disk_open(). - */ - error = efi_use_whole_disk(fd); - - /* Flush the buffers to disk and invalidate the page cache. 
*/ - (void) fsync(fd); - (void) ioctl(fd, BLKFLSBUF); - - (void) close(fd); - if (error && error != VT_ENOSPC) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "relabel '%s': unable to read disk capacity"), path); - return (zfs_error(hdl, EZFS_NOCAP, msg)); - } - - return (0); -} - /* * Convert a vdev path to a GUID. Returns GUID or 0 on error. * @@ -3022,7 +3176,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) zc.zc_cookie = VDEV_STATE_FAULTED; zc.zc_obj = aux; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { @@ -3057,7 +3211,7 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) zc.zc_cookie = VDEV_STATE_DEGRADED; zc.zc_obj = aux; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); @@ -3079,7 +3233,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_SPARE) == 0 && + if ((strcmp(type, VDEV_TYPE_SPARE) == 0 || + strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) && children == 2 && child[which] == tgt) return (B_TRUE); @@ -3096,8 +3251,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) * If 'replacing' is specified, the new disk will replace the old one. 
*/ int -zpool_vdev_attach(zpool_handle_t *zhp, - const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing) +zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, + const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild) { zfs_cmd_t zc = {"\0"}; char msg[1024]; @@ -3110,7 +3265,6 @@ zpool_vdev_attach(zpool_handle_t *zhp, uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; - boolean_t rootpool = zpool_is_bootable(zhp); if (replacing) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, @@ -3132,6 +3286,14 @@ zpool_vdev_attach(zpool_handle_t *zhp, verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); zc.zc_cookie = replacing; + zc.zc_simple = rebuild; + + if (rebuild && + zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "the loaded zfs module doesn't support device rebuilds")); + return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); + } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { @@ -3170,18 +3332,8 @@ zpool_vdev_attach(zpool_handle_t *zhp, zcmd_free_nvlists(&zc); - if (ret == 0) { - if (rootpool) { - /* - * XXX need a better way to prevent user from - * booting up a half-baked vdev. 
- */ - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " - "sure to wait until resilver is done " - "before rebooting.\n")); - } + if (ret == 0) return (0); - } switch (errno) { case ENOTSUP: @@ -3192,20 +3344,40 @@ zpool_vdev_attach(zpool_handle_t *zhp, uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); - if (islog) + if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); - else if (version >= SPA_VERSION_MULTI_REPLACE) + } else if (rebuild) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "only mirror and dRAID vdevs support " + "sequential reconstruction")); + } else if (zpool_is_draid_spare(new_disk)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spares can only replace child " + "devices in their parent's dRAID vdev")); + } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); - else + } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); + } } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "can only attach to mirrors and top-level " - "disks")); + char status[64] = {0}; + zpool_prop_get_feature(zhp, + "feature@device_rebuild", status, 63); + if (rebuild && + strncmp(status, ZFS_FEATURE_DISABLED, 64) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "device_rebuild feature must be enabled " + "in order to use sequential " + "reconstruction")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "can only attach to mirrors and top-level " + "disks")); + } } (void) zfs_error(hdl, EZFS_BADTARGET, msg); break; @@ -3364,7 +3536,7 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, nvlist_t *props, splitflags_t flags) { zfs_cmd_t zc = {"\0"}; - char msg[1024]; + char msg[1024], *bias; nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; nvlist_t **varray = NULL, *zc_props = NULL; uint_t c, children, 
newchildren, lastlog = 0, vcount, found = 0; @@ -3422,6 +3594,7 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; + boolean_t is_special = B_FALSE, is_dedup = B_FALSE; char *type; nvlist_t **mchild, *vdev; uint_t mchildren; @@ -3455,13 +3628,26 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, lastlog = 0; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + + if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { + vdev = child[c]; + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + continue; + } else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); goto out; } + if (nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { + if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) + is_special = B_TRUE; + else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) + is_dedup = B_TRUE; + } verify(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); @@ -3479,6 +3665,20 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; + + if (flags.dryrun != 0) { + if (is_dedup == B_TRUE) { + if (nvlist_add_string(varray[vcount - 1], + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) != 0) + goto out; + } else if (is_special == B_TRUE) { + if (nvlist_add_string(varray[vcount - 1], + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) != 0) + goto out; + } + } } /* did we find every disk the user specified? 
*/ @@ -3588,6 +3788,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); + if (zpool_is_draid_spare(path)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spares cannot be removed")); + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + } + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) @@ -3600,13 +3806,6 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) return (zfs_error(hdl, EZFS_BADVERSION, msg)); } - if (!islog && !avail_spare && !l2cache && zpool_is_bootable(zhp)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "root pool can not have removed devices, " - "because GRUB does not understand them")); - return (zfs_error(hdl, EINVAL, msg)); - } - zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) @@ -3792,7 +3991,7 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) zc.zc_guid = guid; zc.zc_cookie = ZPOOL_NO_REWIND; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0) + if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); @@ -3859,86 +4058,6 @@ zpool_sync_one(zpool_handle_t *zhp, void *data) return (0); } -#if defined(__sun__) || defined(__sun) -/* - * Convert from a devid string to a path. - */ -static char * -devid_to_path(char *devid_str) -{ - ddi_devid_t devid; - char *minor; - char *path; - devid_nmlist_t *list = NULL; - int ret; - - if (devid_str_decode(devid_str, &devid, &minor) != 0) - return (NULL); - - ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list); - - devid_str_free(minor); - devid_free(devid); - - if (ret != 0) - return (NULL); - - /* - * In a case the strdup() fails, we will just return NULL below. 
- */ - path = strdup(list[0].devname); - - devid_free_nmlist(list); - - return (path); -} - -/* - * Convert from a path to a devid string. - */ -static char * -path_to_devid(const char *path) -{ - int fd; - ddi_devid_t devid; - char *minor, *ret; - - if ((fd = open(path, O_RDONLY)) < 0) - return (NULL); - - minor = NULL; - ret = NULL; - if (devid_get(fd, &devid) == 0) { - if (devid_get_minor_name(fd, &minor) == 0) - ret = devid_str_encode(devid, minor); - if (minor != NULL) - devid_str_free(minor); - devid_free(devid); - } - (void) close(fd); - - return (ret); -} - -/* - * Issue the necessary ioctl() to update the stored path value for the vdev. We - * ignore any failure here, since a common case is for an unprivileged user to - * type 'zpool status', and we'll display the correct information anyway. - */ -static void -set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) -{ - zfs_cmd_t zc = {"\0"}; - - (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - (void) strncpy(zc.zc_value, path, sizeof (zc.zc_value)); - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &zc.zc_guid) == 0); - - (void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc); -} -#endif /* sun */ - #define PATH_BUF_LEN 64 /* @@ -3994,54 +4113,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { -#if defined(__sun__) || defined(__sun) - /* - * Live VDEV path updates to a kernel VDEV during a - * zpool_vdev_name lookup are not supported on Linux. - */ - char *devid; - vdev_stat_t *vs; - uint_t vsc; - - /* - * If the device is dead (faulted, offline, etc) then don't - * bother opening it. Otherwise we may be forcing the user to - * open a misbehaving device, which can have undesirable - * effects. 
- */ - if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) != 0 || - vs->vs_state >= VDEV_STATE_DEGRADED) && - zhp != NULL && - nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) { - /* - * Determine if the current path is correct. - */ - char *newdevid = path_to_devid(path); - - if (newdevid == NULL || - strcmp(devid, newdevid) != 0) { - char *newpath; - - if ((newpath = devid_to_path(devid)) != NULL) { - /* - * Update the path appropriately. - */ - set_path(zhp, nv, newpath); - if (nvlist_add_string(nv, - ZPOOL_CONFIG_PATH, newpath) == 0) - verify(nvlist_lookup_string(nv, - ZPOOL_CONFIG_PATH, - &path) == 0); - free(newpath); - } - } - - if (newdevid) - devid_str_free(newdevid); - } -#endif /* sun */ - if (name_flags & VDEV_NAME_FOLLOW_LINKS) { char *rp = realpath(path, NULL); if (rp) { @@ -4056,14 +4127,14 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, */ if ((strcmp(type, VDEV_TYPE_DISK) == 0) && !(name_flags & VDEV_NAME_PATH)) { - path = strrchr(path, '/'); - path++; + path = zfs_strip_path(path); } /* - * Remove the partition from the path it this is a whole disk. + * Remove the partition from the path if this is a whole disk. */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } @@ -4081,6 +4152,27 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, path = buf; } + /* + * If it's a dRAID device, we add parity, groups, and spares. 
+ */ + if (strcmp(path, VDEV_TYPE_DRAID) == 0) { + uint64_t ndata, nparity, nspares; + nvlist_t **child; + uint_t children; + + verify(nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_NPARITY, &nparity) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NSPARES, &nspares) == 0); + + path = zpool_draid_name(buf, sizeof (buf), ndata, + nparity, nspares, children); + } + /* * We identify each top-level vdev by using a * naming convention. @@ -4131,7 +4223,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) zc.zc_nvlist_dst_size = count; (void) strcpy(zc.zc_name, zhp->zpool_name); for (;;) { - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG, + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG, &zc) != 0) { free((void *)(uintptr_t)zc.zc_nvlist_dst); if (errno == ENOMEM) { @@ -4154,7 +4246,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) /* * Sort the resulting bookmarks. This is a little confusing due to the * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last - * to first, and 'zc_nvlist_dst_size' indicates the number of boomarks + * to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. 
*/ @@ -4228,7 +4320,7 @@ zfs_save_arguments(int argc, char **argv, char *string, int len) { int i; - (void) strlcpy(string, basename(argv[0]), len); + (void) strlcpy(string, zfs_basename(argv[0]), len); for (i = 1; i < argc; i++) { (void) strlcat(string, " ", len); (void) strlcat(string, argv[i], len); @@ -4246,7 +4338,7 @@ zpool_log_history(libzfs_handle_t *hdl, const char *message) fnvlist_add_string(args, "message", message); err = zcmd_write_src_nvlist(hdl, &zc, args); if (err == 0) - err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); + err = zfs_ioctl(hdl, ZFS_IOC_LOG_HISTORY, &zc); nvlist_free(args); zcmd_free_nvlists(&zc); return (err); @@ -4273,7 +4365,7 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) zc.zc_history_len = *len; zc.zc_history_offset = *off; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { + if (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { switch (errno) { case EPERM: return (zfs_error_fmt(hdl, EZFS_PERM, @@ -4305,33 +4397,37 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) * Retrieve the command history of a pool. 
*/ int -zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) +zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off, + boolean_t *eof) { char *buf; int buflen = 128 * 1024; - uint64_t off = 0; nvlist_t **records = NULL; uint_t numrecords = 0; int err, i; + uint64_t start = *off; buf = malloc(buflen); if (buf == NULL) return (ENOMEM); - do { + /* process about 1MB a time */ + while (*off - start < 1024 * 1024) { uint64_t bytes_read = buflen; uint64_t leftover; - if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0) + if ((err = get_history(zhp, buf, off, &bytes_read)) != 0) break; /* if nothing else was read in, we're at EOF, just return */ - if (!bytes_read) + if (!bytes_read) { + *eof = B_TRUE; break; + } if ((err = zpool_history_unpack(buf, bytes_read, &leftover, &records, &numrecords)) != 0) break; - off -= leftover; + *off -= leftover; if (leftover == bytes_read) { /* * no progress made, because buffer is not big enough @@ -4343,9 +4439,7 @@ zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) if (buf == NULL) return (ENOMEM); } - - /* CONSTCOND */ - } while (1); + } free(buf); @@ -4436,13 +4530,10 @@ int zpool_events_clear(libzfs_handle_t *hdl, int *count) { zfs_cmd_t zc = {"\0"}; - char msg[1024]; - - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot clear events")); if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0) - return (zpool_standard_error_fmt(hdl, errno, msg)); + return (zpool_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot clear events"))); if (count != NULL) *count = (int)zc.zc_cookie; /* # of events cleared */ @@ -4486,9 +4577,9 @@ zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) return (error); } -void -zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - char *pathname, size_t len) +static void +zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, + char *pathname, size_t len, boolean_t always_unmounted) { zfs_cmd_t zc = 
{"\0"}; boolean_t mounted = B_FALSE; @@ -4505,7 +4596,7 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, /* get the dataset's name */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_obj = dsobj; - if (ioctl(zhp->zpool_hdl->libzfs_fd, + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { /* just write out a path of two object numbers */ (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", @@ -4515,12 +4606,13 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); /* find out if the dataset is mounted */ - mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt); + mounted = !always_unmounted && is_mounted(zhp->zpool_hdl, dsname, + &mntpnt); /* get the corrupted object's path */ (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH, + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_OBJ_TO_PATH, &zc) == 0) { if (mounted) { (void) snprintf(pathname, len, "%s%s", mntpnt, @@ -4536,256 +4628,334 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, free(mntpnt); } -/* - * Read the EFI label from the config, if a label does not exist then - * pass back the error to the caller. If the caller has passed a non-NULL - * diskaddr argument then we set it to the starting address of the EFI - * partition. 
- */ -static int -read_efi_label(nvlist_t *config, diskaddr_t *sb) +void +zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, + char *pathname, size_t len) { - char *path; - int fd; - char diskname[MAXPATHLEN]; - int err = -1; - - if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) - return (err); - - (void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT, - strrchr(path, '/')); - if ((fd = open(diskname, O_RDONLY|O_DIRECT)) >= 0) { - struct dk_gpt *vtoc; - - if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { - if (sb != NULL) - *sb = vtoc->efi_parts[0].p_start; - efi_free(vtoc); - } - (void) close(fd); - } - return (err); + zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE); } -/* - * determine where a partition starts on a disk in the current - * configuration - */ -static diskaddr_t -find_start_block(nvlist_t *config) +void +zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, + char *pathname, size_t len) { - nvlist_t **child; - uint_t c, children; - diskaddr_t sb = MAXOFFSET_T; - uint64_t wholedisk; - - if (nvlist_lookup_nvlist_array(config, - ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { - if (nvlist_lookup_uint64(config, - ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk) != 0 || !wholedisk) { - return (MAXOFFSET_T); - } - if (read_efi_label(config, &sb) < 0) - sb = MAXOFFSET_T; - return (sb); - } - - for (c = 0; c < children; c++) { - sb = find_start_block(child[c]); - if (sb != MAXOFFSET_T) { - return (sb); - } - } - return (MAXOFFSET_T); + zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_TRUE); } - -static int -zpool_label_disk_check(char *path) -{ - struct dk_gpt *vtoc; - int fd, err; - - if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) - return (errno); - - if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { - (void) close(fd); - return (err); - } - - if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { - efi_free(vtoc); - (void) close(fd); - return (EIDRM); - } - - efi_free(vtoc); - (void) 
close(fd); - return (0); -} - /* - * Generate a unique partition name for the ZFS member. Partitions must - * have unique names to ensure udev will be able to create symlinks under - * /dev/disk/by-partlabel/ for all pool members. The partition names are - * of the form -. - */ -static void -zpool_label_name(char *label_name, int label_size) -{ - uint64_t id = 0; - int fd; - - fd = open("/dev/urandom", O_RDONLY); - if (fd >= 0) { - if (read(fd, &id, sizeof (id)) != sizeof (id)) - id = 0; - - close(fd); - } - - if (id == 0) - id = (((uint64_t)rand()) << 32) | (uint64_t)rand(); - - snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id); -} - -/* - * Label an individual disk. The name provided is the short name, - * stripped of any leading /dev path. + * Wait while the specified activity is in progress in the pool. */ int -zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) +zpool_wait(zpool_handle_t *zhp, zpool_wait_activity_t activity) { - char path[MAXPATHLEN]; - struct dk_gpt *vtoc; - int rval, fd; - size_t resv = EFI_MIN_RESV_SIZE; - uint64_t slice_size; - diskaddr_t start_block; - char errbuf[1024]; + boolean_t missing; - /* prepare an error message just in case */ - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); + int error = zpool_wait_status(zhp, activity, &missing, NULL); - if (zhp) { - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(zhp->zpool_config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - - if (zhp->zpool_start_block == 0) - start_block = find_start_block(nvroot); - else - start_block = zhp->zpool_start_block; - zhp->zpool_start_block = start_block; + if (missing) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, ENOENT, + dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), + zhp->zpool_name); + return (ENOENT); } else { - /* new pool */ - start_block = NEW_START_BLOCK; + return (error); } - - (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); - - if ((fd = 
open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) { - /* - * This shouldn't happen. We've long since verified that this - * is a valid device. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "label '%s': unable to open device: %d"), path, errno); - return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); - } - - if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { - /* - * The only way this can fail is if we run out of memory, or we - * were unable to read the disk's capacity - */ - if (errno == ENOMEM) - (void) no_memory(hdl); - - (void) close(fd); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "label '%s': unable to read disk capacity"), path); - - return (zfs_error(hdl, EZFS_NOCAP, errbuf)); - } - - slice_size = vtoc->efi_last_u_lba + 1; - slice_size -= EFI_MIN_RESV_SIZE; - if (start_block == MAXOFFSET_T) - start_block = NEW_START_BLOCK; - slice_size -= start_block; - slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT); - - vtoc->efi_parts[0].p_start = start_block; - vtoc->efi_parts[0].p_size = slice_size; - - /* - * Why we use V_USR: V_BACKUP confuses users, and is considered - * disposable by some EFI utilities (since EFI doesn't have a backup - * slice). V_UNASSIGNED is supposed to be used only for zero size - * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, - * etc. were all pretty specific. V_USR is as close to reality as we - * can get, in the absence of V_OTHER. - */ - vtoc->efi_parts[0].p_tag = V_USR; - zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN); - - vtoc->efi_parts[8].p_start = slice_size + start_block; - vtoc->efi_parts[8].p_size = resv; - vtoc->efi_parts[8].p_tag = V_RESERVED; - - rval = efi_write(fd, vtoc); - - /* Flush the buffers to disk and invalidate the page cache. */ - (void) fsync(fd); - (void) ioctl(fd, BLKFLSBUF); - - if (rval == 0) - rval = efi_rescan(fd); - - /* - * Some block drivers (like pcata) may not support EFI GPT labels. 
- * Print out a helpful error message directing the user to manually - * label the disk and give a specific slice. - */ - if (rval != 0) { - (void) close(fd); - efi_free(vtoc); - - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using " - "parted(8) and then provide a specific slice: %d"), rval); - return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); - } - - (void) close(fd); - efi_free(vtoc); - - (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); - (void) zfs_append_partition(path, MAXPATHLEN); - - /* Wait to udev to signal use the device has settled. */ - rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT); - if (rval) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " - "detect device partitions on '%s': %d"), path, rval); - return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); - } - - /* We can't be to paranoid. Read the label back and verify it. */ - (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); - rval = zpool_label_disk_check(path); - if (rval) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written " - "EFI label on '%s' is damaged. Ensure\nthis device " - "is not in in use, and is functioning properly: %d"), - path, rval); - return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); - } - - return (0); +} + +/* + * Wait for the given activity and return the status of the wait (whether or not + * any waiting was done) in the 'waited' parameter. Non-existent pools are + * reported via the 'missing' parameter, rather than by printing an error + * message. This is convenient when this function is called in a loop over a + * long period of time (as it is, for example, by zpool's wait cmd). In that + * scenario, a pool being exported or destroyed should be considered a normal + * event, so we don't want to print an error when we find that the pool doesn't + * exist. 
+ */ +int +zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, + boolean_t *missing, boolean_t *waited) +{ + int error = lzc_wait(zhp->zpool_name, activity, waited); + *missing = (error == ENOENT); + if (*missing) + return (0); + + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), + zhp->zpool_name); + } + + return (error); +} + +int +zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap) +{ + int error = lzc_set_bootenv(zhp->zpool_name, envmap); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error setting bootenv in pool '%s'"), zhp->zpool_name); + } + + return (error); +} + +int +zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp) +{ + nvlist_t *nvl; + int error; + + nvl = NULL; + error = lzc_get_bootenv(zhp->zpool_name, &nvl); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error getting bootenv in pool '%s'"), zhp->zpool_name); + } else { + *nvlp = nvl; + } + + return (error); +} + +/* + * Attempt to read and parse feature file(s) (from "compatibility" property). + * Files contain zpool feature names, comma or whitespace-separated. + * Comments (# character to next newline) are discarded. + * + * Arguments: + * compatibility : string containing feature filenames + * features : either NULL or pointer to array of boolean + * report : either NULL or pointer to string buffer + * rlen : length of "report" buffer + * + * compatibility is NULL (unset), "", "off", "legacy", or list of + * comma-separated filenames. filenames should either be absolute, + * or relative to: + * 1) ZPOOL_SYSCONF_COMPAT_D (eg: /etc/zfs/compatibility.d) or + * 2) ZPOOL_DATA_COMPAT_D (eg: /usr/share/zfs/compatibility.d). 
+ * (Unset), "" or "off" => enable all features + * "legacy" => disable all features + * + * Any feature names read from files which match unames in spa_feature_table + * will have the corresponding boolean set in the features array (if non-NULL). + * If more than one feature set specified, only features present in *all* of + * them will be set. + * + * "report" if not NULL will be populated with a suitable status message. + * + * Return values: + * ZPOOL_COMPATIBILITY_OK : files read and parsed ok + * ZPOOL_COMPATIBILITY_BADFILE : file too big or not a text file + * ZPOOL_COMPATIBILITY_BADTOKEN : SYSCONF file contains invalid feature name + * ZPOOL_COMPATIBILITY_WARNTOKEN : DATA file contains invalid feature name + * ZPOOL_COMPATIBILITY_NOFILES : no feature files found + */ +zpool_compat_status_t +zpool_load_compat(const char *compat, boolean_t *features, char *report, + size_t rlen) +{ + int sdirfd, ddirfd, featfd; + struct stat fs; + char *fc; + char *ps, *ls, *ws; + char *file, *line, *word; + + char l_compat[ZFS_MAXPROPLEN]; + + boolean_t ret_nofiles = B_TRUE; + boolean_t ret_badfile = B_FALSE; + boolean_t ret_badtoken = B_FALSE; + boolean_t ret_warntoken = B_FALSE; + + /* special cases (unset), "" and "off" => enable all features */ + if (compat == NULL || compat[0] == '\0' || + strcmp(compat, ZPOOL_COMPAT_OFF) == 0) { + if (features != NULL) + for (uint_t i = 0; i < SPA_FEATURES; i++) + features[i] = B_TRUE; + if (report != NULL) + strlcpy(report, gettext("all features enabled"), rlen); + return (ZPOOL_COMPATIBILITY_OK); + } + + /* Final special case "legacy" => disable all features */ + if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { + if (features != NULL) + for (uint_t i = 0; i < SPA_FEATURES; i++) + features[i] = B_FALSE; + if (report != NULL) + strlcpy(report, gettext("all features disabled"), rlen); + return (ZPOOL_COMPATIBILITY_OK); + } + + /* + * Start with all true; will be ANDed with results from each file + */ + if (features != NULL) + for (uint_t 
i = 0; i < SPA_FEATURES; i++) + features[i] = B_TRUE; + + char err_badfile[1024] = ""; + char err_badtoken[1024] = ""; + + /* + * We ignore errors from the directory open() + * as they're only needed if the filename is relative + * which will be checked during the openat(). + */ + +/* O_PATH safer than O_RDONLY if system allows it */ +#if defined(O_PATH) +#define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_PATH) +#else +#define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_RDONLY) +#endif + + sdirfd = open(ZPOOL_SYSCONF_COMPAT_D, ZC_DIR_FLAGS); + ddirfd = open(ZPOOL_DATA_COMPAT_D, ZC_DIR_FLAGS); + + (void) strlcpy(l_compat, compat, ZFS_MAXPROPLEN); + + for (file = strtok_r(l_compat, ",", &ps); + file != NULL; + file = strtok_r(NULL, ",", &ps)) { + + boolean_t l_features[SPA_FEATURES]; + + enum { Z_SYSCONF, Z_DATA } source; + + /* try sysconfdir first, then datadir */ + source = Z_SYSCONF; + if ((featfd = openat(sdirfd, file, O_RDONLY | O_CLOEXEC)) < 0) { + featfd = openat(ddirfd, file, O_RDONLY | O_CLOEXEC); + source = Z_DATA; + } + + /* File readable and correct size? */ + if (featfd < 0 || + fstat(featfd, &fs) < 0 || + fs.st_size < 1 || + fs.st_size > ZPOOL_COMPAT_MAXSIZE) { + (void) close(featfd); + strlcat(err_badfile, file, ZFS_MAXPROPLEN); + strlcat(err_badfile, " ", ZFS_MAXPROPLEN); + ret_badfile = B_TRUE; + continue; + } + +/* Prefault the file if system allows */ +#if defined(MAP_POPULATE) +#define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) +#elif defined(MAP_PREFAULT_READ) +#define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_PREFAULT_READ) +#else +#define ZC_MMAP_FLAGS (MAP_PRIVATE) +#endif + + /* private mmap() so we can strtok safely */ + fc = (char *)mmap(NULL, fs.st_size, PROT_READ | PROT_WRITE, + ZC_MMAP_FLAGS, featfd, 0); + (void) close(featfd); + + /* map ok, and last character == newline? 
*/ + if (fc == MAP_FAILED || fc[fs.st_size - 1] != '\n') { + (void) munmap((void *) fc, fs.st_size); + strlcat(err_badfile, file, ZFS_MAXPROPLEN); + strlcat(err_badfile, " ", ZFS_MAXPROPLEN); + ret_badfile = B_TRUE; + continue; + } + + ret_nofiles = B_FALSE; + + for (uint_t i = 0; i < SPA_FEATURES; i++) + l_features[i] = B_FALSE; + + /* replace final newline with NULL to ensure string ends */ + fc[fs.st_size - 1] = '\0'; + + for (line = strtok_r(fc, "\n", &ls); + line != NULL; + line = strtok_r(NULL, "\n", &ls)) { + /* discard comments */ + char *r = strchr(line, '#'); + if (r != NULL) + *r = '\0'; + + for (word = strtok_r(line, ", \t", &ws); + word != NULL; + word = strtok_r(NULL, ", \t", &ws)) { + /* Find matching feature name */ + uint_t f; + for (f = 0; f < SPA_FEATURES; f++) { + zfeature_info_t *fi = + &spa_feature_table[f]; + if (strcmp(word, fi->fi_uname) == 0) { + l_features[f] = B_TRUE; + break; + } + } + if (f < SPA_FEATURES) + continue; + + /* found an unrecognized word */ + /* lightly sanitize it */ + if (strlen(word) > 32) + word[32] = '\0'; + for (char *c = word; *c != '\0'; c++) + if (!isprint(*c)) + *c = '?'; + + strlcat(err_badtoken, word, ZFS_MAXPROPLEN); + strlcat(err_badtoken, " ", ZFS_MAXPROPLEN); + if (source == Z_SYSCONF) + ret_badtoken = B_TRUE; + else + ret_warntoken = B_TRUE; + } + } + (void) munmap((void *) fc, fs.st_size); + + if (features != NULL) + for (uint_t i = 0; i < SPA_FEATURES; i++) + features[i] &= l_features[i]; + } + (void) close(sdirfd); + (void) close(ddirfd); + + /* Return the most serious error */ + if (ret_badfile) { + if (report != NULL) + snprintf(report, rlen, gettext("could not read/" + "parse feature file(s): %s"), err_badfile); + return (ZPOOL_COMPATIBILITY_BADFILE); + } + if (ret_nofiles) { + if (report != NULL) + strlcpy(report, + gettext("no valid compatibility files specified"), + rlen); + return (ZPOOL_COMPATIBILITY_NOFILES); + } + if (ret_badtoken) { + if (report != NULL) + snprintf(report, rlen, 
gettext("invalid feature " + "name(s) in local compatibility files: %s"), + err_badtoken); + return (ZPOOL_COMPATIBILITY_BADTOKEN); + } + if (ret_warntoken) { + if (report != NULL) + snprintf(report, rlen, gettext("unrecognized feature " + "name(s) in distribution compatibility files: %s"), + err_badtoken); + return (ZPOOL_COMPATIBILITY_WARNTOKEN); + } + if (report != NULL) + strlcpy(report, gettext("compatibility set ok"), rlen); + return (ZPOOL_COMPATIBILITY_OK); } diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index f69a46430b..7460ffc413 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -61,6 +60,7 @@ #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" +#include #include #include #include @@ -68,135 +68,23 @@ #include #include -/* in libzfs_dataset.c */ -extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); - static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, - recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, - uint64_t *, const char *, nvlist_t *); + recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, + const char *, nvlist_t *); +static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, + uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, + uint64_t num_redact_snaps, char *name); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); -static const zio_cksum_t zero_cksum = { { 0 } }; - -typedef struct dedup_arg { 
- int inputfd; - int outputfd; - libzfs_handle_t *dedup_hdl; -} dedup_arg_t; - typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; + boolean_t pa_estimate; + int pa_verbosity; } progress_arg_t; -typedef struct dataref { - uint64_t ref_guid; - uint64_t ref_object; - uint64_t ref_offset; -} dataref_t; - -typedef struct dedup_entry { - struct dedup_entry *dde_next; - zio_cksum_t dde_chksum; - uint64_t dde_prop; - dataref_t dde_ref; -} dedup_entry_t; - -#define MAX_DDT_PHYSMEM_PERCENT 20 -#define SMALLEST_POSSIBLE_MAX_DDT_MB 128 - -typedef struct dedup_table { - dedup_entry_t **dedup_hash_array; - umem_cache_t *ddecache; - uint64_t max_ddt_size; /* max dedup table size in bytes */ - uint64_t cur_ddt_size; /* current dedup table size in bytes */ - uint64_t ddt_count; - int numhashbits; - boolean_t ddt_full; -} dedup_table_t; - -static int -high_order_bit(uint64_t n) -{ - int count; - - for (count = 0; n != 0; count++) - n >>= 1; - return (count); -} - -static size_t -ssread(void *buf, size_t len, FILE *stream) -{ - size_t outlen; - - if ((outlen = fread(buf, len, 1, stream)) == 0) - return (0); - - return (outlen); -} - -static void -ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp, - zio_cksum_t *cs, uint64_t prop, dataref_t *dr) -{ - dedup_entry_t *dde; - - if (ddt->cur_ddt_size >= ddt->max_ddt_size) { - if (ddt->ddt_full == B_FALSE) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Dedup table full. Deduplication will continue " - "with existing table entries")); - ddt->ddt_full = B_TRUE; - } - return; - } - - if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT)) - != NULL) { - assert(*ddepp == NULL); - dde->dde_next = NULL; - dde->dde_chksum = *cs; - dde->dde_prop = prop; - dde->dde_ref = *dr; - *ddepp = dde; - ddt->cur_ddt_size += sizeof (dedup_entry_t); - ddt->ddt_count++; - } -} - -/* - * Using the specified dedup table, do a lookup for an entry with - * the checksum cs. 
If found, return the block's reference info - * in *dr. Otherwise, insert a new entry in the dedup table, using - * the reference information specified by *dr. - * - * return value: true - entry was found - * false - entry was not found - */ -static boolean_t -ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, - uint64_t prop, dataref_t *dr) -{ - uint32_t hashcode; - dedup_entry_t **ddepp; - - hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits); - - for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL; - ddepp = &((*ddepp)->dde_next)) { - if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) && - (*ddepp)->dde_prop == prop) { - *dr = (*ddepp)->dde_ref; - return (B_TRUE); - } - } - ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr); - return (B_FALSE); -} - static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) @@ -222,274 +110,6 @@ dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, return (0); } -/* - * This function is started in a separate thread when the dedup option - * has been requested. The main send thread determines the list of - * snapshots to be included in the send stream and makes the ioctl calls - * for each one. But instead of having the ioctl send the output to the - * the output fd specified by the caller of zfs_send()), the - * ioctl is told to direct the output to a pipe, which is read by the - * alternate thread running THIS function. This function does the - * dedup'ing by: - * 1. building a dedup table (the DDT) - * 2. doing checksums on each data block and inserting a record in the DDT - * 3. looking for matching checksums, and - * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever - * a duplicate block is found. - * The output of this function then goes to the output fd requested - * by the caller of zfs_send(). 
- */ -static void * -cksummer(void *arg) -{ - dedup_arg_t *dda = arg; - char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE); - dmu_replay_record_t thedrr = { 0 }; - dmu_replay_record_t *drr = &thedrr; - FILE *ofp; - int outfd; - dedup_table_t ddt; - zio_cksum_t stream_cksum; - uint64_t numbuckets; - -#ifdef _ILP32 - ddt.max_ddt_size = SMALLEST_POSSIBLE_MAX_DDT_MB << 20; -#else - uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); - ddt.max_ddt_size = - MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100, - SMALLEST_POSSIBLE_MAX_DDT_MB << 20); -#endif - - numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t)); - - /* - * numbuckets must be a power of 2. Increase number to - * a power of 2 if necessary. - */ - if (!ISP2(numbuckets)) - numbuckets = 1ULL << high_order_bit(numbuckets); - - ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); - ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, - NULL, NULL, NULL, NULL, NULL, 0); - ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *); - ddt.numhashbits = high_order_bit(numbuckets) - 1; - ddt.ddt_full = B_FALSE; - - outfd = dda->outputfd; - ofp = fdopen(dda->inputfd, "r"); - while (ssread(drr, sizeof (*drr), ofp) != 0) { - - /* - * kernel filled in checksum, we are going to write same - * record, but need to regenerate checksum. 
- */ - if (drr->drr_type != DRR_BEGIN) { - bzero(&drr->drr_u.drr_checksum.drr_checksum, - sizeof (drr->drr_u.drr_checksum.drr_checksum)); - } - - switch (drr->drr_type) { - case DRR_BEGIN: - { - struct drr_begin *drrb = &drr->drr_u.drr_begin; - int fflags; - int sz = 0; - ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); - - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - - /* set the DEDUP feature flag for this stream */ - fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - fflags |= (DMU_BACKUP_FEATURE_DEDUP | - DMU_BACKUP_FEATURE_DEDUPPROPS); - DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); - - if (drr->drr_payloadlen != 0) { - sz = drr->drr_payloadlen; - - if (sz > SPA_MAXBLOCKSIZE) { - buf = zfs_realloc(dda->dedup_hdl, buf, - SPA_MAXBLOCKSIZE, sz); - } - (void) ssread(buf, sz, ofp); - if (ferror(stdin)) - perror("fread"); - } - if (dump_record(drr, buf, sz, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - case DRR_END: - { - struct drr_end *drre = &drr->drr_u.drr_end; - /* use the recalculated checksum */ - drre->drr_checksum = stream_cksum; - if (dump_record(drr, NULL, 0, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - case DRR_OBJECT: - { - struct drr_object *drro = &drr->drr_u.drr_object; - if (drro->drr_bonuslen > 0) { - (void) ssread(buf, - DRR_OBJECT_PAYLOAD_SIZE(drro), ofp); - } - if (dump_record(drr, buf, DRR_OBJECT_PAYLOAD_SIZE(drro), - &stream_cksum, outfd) != 0) - goto out; - break; - } - - case DRR_SPILL: - { - struct drr_spill *drrs = &drr->drr_u.drr_spill; - (void) ssread(buf, DRR_SPILL_PAYLOAD_SIZE(drrs), ofp); - if (dump_record(drr, buf, DRR_SPILL_PAYLOAD_SIZE(drrs), - &stream_cksum, outfd) != 0) - goto out; - break; - } - - case DRR_FREEOBJECTS: - { - if (dump_record(drr, NULL, 0, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - case DRR_WRITE: - { - struct drr_write *drrw = &drr->drr_u.drr_write; - dataref_t dataref; - uint64_t payload_size; - - payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); - (void) 
ssread(buf, payload_size, ofp); - - /* - * Use the existing checksum if it's dedup-capable, - * else calculate a SHA256 checksum for it. - */ - - if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, - zero_cksum) || - !DRR_IS_DEDUP_CAPABLE(drrw->drr_flags)) { - SHA2_CTX ctx; - zio_cksum_t tmpsha256; - - SHA2Init(SHA256, &ctx); - SHA2Update(&ctx, buf, payload_size); - SHA2Final(&tmpsha256, &ctx); - - drrw->drr_key.ddk_cksum.zc_word[0] = - BE_64(tmpsha256.zc_word[0]); - drrw->drr_key.ddk_cksum.zc_word[1] = - BE_64(tmpsha256.zc_word[1]); - drrw->drr_key.ddk_cksum.zc_word[2] = - BE_64(tmpsha256.zc_word[2]); - drrw->drr_key.ddk_cksum.zc_word[3] = - BE_64(tmpsha256.zc_word[3]); - drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256; - drrw->drr_flags |= DRR_CHECKSUM_DEDUP; - } - - dataref.ref_guid = drrw->drr_toguid; - dataref.ref_object = drrw->drr_object; - dataref.ref_offset = drrw->drr_offset; - - if (ddt_update(dda->dedup_hdl, &ddt, - &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, - &dataref)) { - dmu_replay_record_t wbr_drr = {0}; - struct drr_write_byref *wbr_drrr = - &wbr_drr.drr_u.drr_write_byref; - - /* block already present in stream */ - wbr_drr.drr_type = DRR_WRITE_BYREF; - - wbr_drrr->drr_object = drrw->drr_object; - wbr_drrr->drr_offset = drrw->drr_offset; - wbr_drrr->drr_length = drrw->drr_logical_size; - wbr_drrr->drr_toguid = drrw->drr_toguid; - wbr_drrr->drr_refguid = dataref.ref_guid; - wbr_drrr->drr_refobject = - dataref.ref_object; - wbr_drrr->drr_refoffset = - dataref.ref_offset; - - wbr_drrr->drr_checksumtype = - drrw->drr_checksumtype; - wbr_drrr->drr_flags = drrw->drr_flags; - wbr_drrr->drr_key.ddk_cksum = - drrw->drr_key.ddk_cksum; - wbr_drrr->drr_key.ddk_prop = - drrw->drr_key.ddk_prop; - - if (dump_record(&wbr_drr, NULL, 0, - &stream_cksum, outfd) != 0) - goto out; - } else { - /* block not previously seen */ - if (dump_record(drr, buf, payload_size, - &stream_cksum, outfd) != 0) - goto out; - } - break; - } - - case DRR_WRITE_EMBEDDED: - { - struct 
drr_write_embedded *drrwe = - &drr->drr_u.drr_write_embedded; - (void) ssread(buf, - P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); - if (dump_record(drr, buf, - P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), - &stream_cksum, outfd) != 0) - goto out; - break; - } - - case DRR_FREE: - { - if (dump_record(drr, NULL, 0, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - case DRR_OBJECT_RANGE: - { - if (dump_record(drr, NULL, 0, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - default: - (void) fprintf(stderr, "INVALID record type 0x%x\n", - drr->drr_type); - /* should never happen, so assert */ - assert(B_FALSE); - } - } -out: - umem_cache_destroy(ddt.ddecache); - free(ddt.dedup_hash_array); - free(buf); - (void) fclose(ofp); - - return (NULL); -} - /* * Routines for dealing with the AVL tree of fs-nvlists */ @@ -506,7 +126,7 @@ fsavl_compare(const void *arg1, const void *arg2) const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; - return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); + return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* @@ -565,15 +185,15 @@ fsavl_create(nvlist_t *fss) nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; - VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); - VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); + nvfs = fnvpair_value_nvlist(fselem); + snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; uint64_t guid; - VERIFY(0 == nvpair_value_uint64(snapelem, &guid)); + guid = fnvpair_value_uint64(snapelem); if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); @@ -627,6 +247,7 @@ typedef struct send_data { boolean_t raw; boolean_t doall; boolean_t replicate; + boolean_t skipmissing; boolean_t verbose; boolean_t backup; boolean_t seenfrom; @@ -690,7 +311,7 @@ send_iterate_snap(zfs_handle_t *zhp, void *arg) return (0); } - VERIFY(0 == 
nvlist_add_uint64(sd->parent_snaps, snapname, guid)); + fnvlist_add_uint64(sd->parent_snaps, snapname, guid); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. @@ -700,6 +321,15 @@ send_iterate_snap(zfs_handle_t *zhp, void *arg) } if (!sd->recursive) { + + /* + * To allow a doall stream to work properly + * with a NULL fromsnap + */ + if (sd->doall && sd->fromsnap == NULL && !sd->seenfrom) { + sd->seenfrom = B_TRUE; + } + if (!sd->seenfrom && isfromsnap) { sd->seenfrom = B_TRUE; zfs_close(zhp); @@ -715,16 +345,15 @@ send_iterate_snap(zfs_handle_t *zhp, void *arg) sd->seento = B_TRUE; } - VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); + nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); - VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); - nvlist_free(nv); + fnvlist_add_nvlist(sd->snapprops, snapname, nv); + fnvlist_free(nv); if (sd->holds) { nvlist_t *holds = fnvlist_alloc(); int err = lzc_get_holds(zhp->zfs_name, &holds); if (err == 0) { - VERIFY(0 == nvlist_add_nvlist(sd->snapholds, - snapname, holds)); + fnvlist_add_nvlist(sd->snapholds, snapname, holds); } fnvlist_free(holds); } @@ -799,14 +428,12 @@ send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; - verify(nvlist_lookup_string(propnv, - ZPROP_VALUE, &value) == 0); - VERIFY(0 == nvlist_add_string(nv, propname, value)); + value = fnvlist_lookup_string(propnv, ZPROP_VALUE); + fnvlist_add_string(nv, propname, value); } else { uint64_t value; - verify(nvlist_lookup_uint64(propnv, - ZPROP_VALUE, &value) == 0); - VERIFY(0 == nvlist_add_uint64(nv, propname, value)); + value = fnvlist_lookup_uint64(propnv, ZPROP_VALUE); + fnvlist_add_uint64(nv, propname, value); } } } @@ -871,7 +498,8 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) * - skip sending the current dataset if it was created later than * 
the parent tosnap * - return error if the current dataset was created earlier than - * the parent tosnap + * the parent tosnap, unless --skip-missing specified. Then + * just print a warning */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { @@ -880,13 +508,18 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } + } else if (sd->skipmissing) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: skipping dataset %s and its children:" + " snapshot %s does not exist\n"), + zhp->zfs_name, sd->tosnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); - rv = -1; + rv = EZFS_NOENT; } goto out; } @@ -951,8 +584,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) sd->parent_snaps = fnvlist_alloc(); sd->snapprops = fnvlist_alloc(); if (sd->holds) - VERIFY(0 == nvlist_alloc(&sd->snapholds, NV_UNIQUE_NAME, 0)); - + sd->snapholds = fnvlist_alloc(); /* * If this is a "doall" send, a replicate send or we're just trying @@ -989,6 +621,18 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) fnvlist_free(sd->snapprops); fnvlist_free(sd->snapholds); + /* Do not allow the size of the properties list to exceed the limit */ + if ((fnvlist_size(nvfs) + fnvlist_size(sd->fss)) > + zhp->zfs_hdl->libzfs_max_nvlist) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "warning: cannot send %s@%s: the size of the list of " + "snapshots and properties is too large to be received " + "successfully.\n" + "Select a smaller number of snapshots to send.\n"), + zhp->zfs_name, sd->tosnap); + rv = EZFS_NOSPC; + goto out; + } /* add this fs to nvlist */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); @@ -1012,8 +656,9 @@ out: static int gather_nvlist(libzfs_handle_t *hdl, 
const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t raw, boolean_t doall, - boolean_t replicate, boolean_t verbose, boolean_t backup, boolean_t holds, - boolean_t props, nvlist_t **nvlp, avl_tree_t **avlp) + boolean_t replicate, boolean_t skipmissing, boolean_t verbose, + boolean_t backup, boolean_t holds, boolean_t props, nvlist_t **nvlp, + avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; @@ -1023,7 +668,7 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, if (zhp == NULL) return (EZFS_BADTYPE); - VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); + sd.fss = fnvlist_alloc(); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; @@ -1031,13 +676,14 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, sd.raw = raw; sd.doall = doall; sd.replicate = replicate; + sd.skipmissing = skipmissing; sd.verbose = verbose; sd.backup = backup; sd.holds = holds; sd.props = props; if ((error = send_iterate_fs(zhp, &sd)) != 0) { - nvlist_free(sd.fss); + fnvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; @@ -1045,7 +691,7 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { - nvlist_free(sd.fss); + fnvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } @@ -1064,7 +710,7 @@ typedef struct send_dump_data { char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; - boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; + boolean_t dryrun, parsable, progress, embed_data, std_out; boolean_t large_block, compress, raw, holds; int outfd; boolean_t err; @@ -1076,6 +722,7 @@ typedef struct send_dump_data { nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; + int verbosity; uint64_t size; } send_dump_data_t; @@ -1121,7 +768,7 @@ zfs_send_space(zfs_handle_t 
*zhp, const char *snapname, const char *from, case EFAULT: case EROFS: case EINVAL: - zfs_error_aux(hdl, strerror(error)); + zfs_error_aux(hdl, "%s", strerror(error)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: @@ -1155,10 +802,9 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; - VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); + thisdbg = fnvlist_alloc(); if (fromsnap && fromsnap[0] != '\0') { - VERIFY(0 == nvlist_add_string(thisdbg, - "fromsnap", fromsnap)); + fnvlist_add_string(thisdbg, "fromsnap", fromsnap); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { @@ -1166,12 +812,11 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); - VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); + fnvlist_add_uint64(thisdbg, "error", errno); if (debugnv) { - VERIFY(0 == nvlist_add_nvlist(debugnv, - zhp->zfs_name, thisdbg)); + fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); } - nvlist_free(thisdbg); + fnvlist_free(thisdbg); switch (errno) { case EXDEV: @@ -1204,7 +849,8 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, case ERANGE: case EFAULT: case EROFS: - zfs_error_aux(hdl, strerror(errno)); + case EINVAL: + zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: @@ -1213,8 +859,8 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, } if (debugnv) - VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); - nvlist_free(thisdbg); + fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); + fnvlist_free(thisdbg); return (0); } @@ -1234,42 +880,76 @@ gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } +int +zfs_send_progress(zfs_handle_t *zhp, int fd, 
uint64_t *bytes_written, + uint64_t *blocks_visited) +{ + zfs_cmd_t zc = {"\0"}; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_cookie = fd; + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) + return (errno); + if (bytes_written != NULL) + *bytes_written = zc.zc_cookie; + if (blocks_visited != NULL) + *blocks_visited = zc.zc_objset_type; + return (0); +} + static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; - zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = pa->pa_zhp; - libzfs_handle_t *hdl = zhp->zfs_hdl; - unsigned long long bytes; + uint64_t bytes; + uint64_t blocks; char buf[16]; time_t t; struct tm *tm; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (!pa->pa_parsable) - (void) fprintf(stderr, "TIME SENT SNAPSHOT %s\n", - zhp->zfs_name); + boolean_t firstloop = B_TRUE; /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { + int err; (void) sleep(1); + if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, + &blocks)) != 0) { + if (err == EINTR || err == ENOENT) + return ((void *)0); + return ((void *)(uintptr_t)err); + } - zc.zc_cookie = pa->pa_fd; - if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) - return ((void *)-1); + if (firstloop && !pa->pa_parsable) { + (void) fprintf(stderr, + "TIME %s %sSNAPSHOT %s\n", + pa->pa_estimate ? "BYTES" : " SENT", + pa->pa_verbosity >= 2 ? 
" BLOCKS " : "", + zhp->zfs_name); + firstloop = B_FALSE; + } (void) time(&t); tm = localtime(&t); - bytes = zc.zc_cookie; - if (pa->pa_parsable) { + if (pa->pa_verbosity >= 2 && pa->pa_parsable) { + (void) fprintf(stderr, + "%02d:%02d:%02d\t%llu\t%llu\t%s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + (u_longlong_t)bytes, (u_longlong_t)blocks, + zhp->zfs_name); + } else if (pa->pa_verbosity >= 2) { + zfs_nicenum(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, + "%02d:%02d:%02d %5s %8llu %s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + buf, (u_longlong_t)blocks, zhp->zfs_name); + } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, - bytes, zhp->zfs_name); + (u_longlong_t)bytes, zhp->zfs_name); } else { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", @@ -1379,11 +1059,14 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); - VERIFY(0 == nvlist_lookup_nvlist(nvfs, - "snapprops", &snapprops)); - VERIFY(0 == nvlist_lookup_nvlist(snapprops, - thissnap, &snapprops)); - exclude = !nvlist_exists(snapprops, "is_clone_origin"); + if (nvfs != NULL) { + snapprops = fnvlist_lookup_nvlist(nvfs, + "snapprops"); + snapprops = fnvlist_lookup_nvlist(snapprops, + thissnap); + exclude = !nvlist_exists(snapprops, + "is_clone_origin"); + } } else { exclude = B_TRUE; } @@ -1411,7 +1094,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); - if (sdd->verbose) { + if (sdd->verbosity != 0) { uint64_t size = 0; char fromds[ZFS_MAX_DATASET_NAME_LEN]; @@ -1440,6 +1123,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = sdd->verbosity; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { @@ -1452,8 
+1137,18 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { + void *status = NULL; (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); + (void) pthread_join(tid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "progress thread exited nonzero")); + return (zfs_standard_error(zhp->zfs_hdl, error, + errbuf)); + } } } @@ -1474,7 +1169,7 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); - if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); @@ -1492,7 +1187,7 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); - if (ioctl(zhp->zfs_hdl->libzfs_fd, + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { missingfrom = B_TRUE; } @@ -1589,7 +1284,7 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg) nvlist_t *nvfs; uint64_t origin_guid = 0; - VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); + nvfs = fnvpair_value_nvlist(fspair); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; @@ -1597,12 +1292,12 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg) origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; - VERIFY(0 == nvlist_lookup_nvlist(origin_nv, - "snapprops", &snapprops)); - VERIFY(0 == nvlist_lookup_nvlist(snapprops, - snapname, &snapprops)); - VERIFY(0 == nvlist_add_boolean( - snapprops, "is_clone_origin")); + snapprops = fnvlist_lookup_nvlist(origin_nv, + "snapprops"); + snapprops = 
fnvlist_lookup_nvlist(snapprops, + snapname); + fnvlist_add_boolean(snapprops, + "is_clone_origin"); } } } @@ -1617,11 +1312,11 @@ again: uint64_t origin_guid = 0; uint64_t parent_guid = 0; - VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); + fslist = fnvpair_value_nvlist(fspair); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; - VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); + fsname = fnvlist_lookup_string(fslist, "name"); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); @@ -1653,7 +1348,7 @@ again: if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); - VERIFY(nvlist_add_boolean(fslist, "sent") == 0); + fnvlist_add_boolean(fslist, "sent"); progress = B_TRUE; zfs_close(zhp); if (err) @@ -1669,7 +1364,7 @@ again: fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; - VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); + fslist = fnvpair_value_nvlist(fspair); (void) nvlist_remove_all(fslist, "sent"); } @@ -1752,10 +1447,189 @@ zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) } return (nv); } +static enum lzc_send_flags +lzc_flags_from_sendflags(const sendflags_t *flags) +{ + enum lzc_send_flags lzc_flags = 0; + if (flags->largeblock) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (flags->embed_data) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags->compress) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; + if (flags->raw) + lzc_flags |= LZC_SEND_FLAG_RAW; + if (flags->saved) + lzc_flags |= LZC_SEND_FLAG_SAVED; + return (lzc_flags); +} -int -zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, - const char *resume_token) +static int +estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, + uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, + const char *redactbook, char *errbuf) +{ + uint64_t size; + FILE *fout = flags->dryrun ? 
stdout : stderr; + progress_arg_t pa = { 0 }; + int err = 0; + pthread_t ptid; + + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = fd; + pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_TRUE; + pa.pa_verbosity = flags->verbosity; + + err = pthread_create(&ptid, NULL, + send_progress_thread, &pa); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + } + + err = lzc_send_space_resume_redacted(zhp->zfs_name, from, + lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, + redactbook, fd, &size); + + if (flags->progress) { + void *status = NULL; + (void) pthread_cancel(ptid); + (void) pthread_join(ptid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "progress thread exited " + "nonzero")); + return (zfs_standard_error(zhp->zfs_hdl, error, + errbuf)); + } + } + + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + send_print_verbose(fout, zhp->zfs_name, from, size, + flags->parsable); + + if (flags->parsable) { + (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } + return (0); +} + +static boolean_t +redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, + const uint64_t *snaps2, uint64_t num_snaps2) +{ + if (num_snaps1 != num_snaps2) + return (B_FALSE); + for (int i = 0; i < num_snaps1; i++) { + if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) + 
return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Check that the list of redaction snapshots in the bookmark matches the send + * we're resuming, and return whether or not it's complete. + * + * Note that the caller needs to free the contents of *bookname with free() if + * this function returns successfully. + */ +static int +find_redact_book(libzfs_handle_t *hdl, const char *path, + const uint64_t *redact_snap_guids, int num_redact_snaps, + char **bookname) +{ + char errbuf[1024]; + int error = 0; + nvlist_t *props = fnvlist_alloc(); + nvlist_t *bmarks; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + fnvlist_add_boolean(props, "redact_complete"); + fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + error = lzc_get_bookmarks(path, props, &bmarks); + fnvlist_free(props); + if (error != 0) { + if (error == ESRCH) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "nonexistent redaction bookmark provided")); + } else if (error == ENOENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset to be sent no longer exists")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unknown error: %s"), strerror(error)); + } + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + nvpair_t *pair; + for (pair = nvlist_next_nvpair(bmarks, NULL); pair; + pair = nvlist_next_nvpair(bmarks, pair)) { + + nvlist_t *bmark = fnvpair_value_nvlist(pair); + nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + uint_t len = 0; + uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, + ZPROP_VALUE, &len); + if (redact_snaps_equal(redact_snap_guids, + num_redact_snaps, bmarksnaps, len)) { + break; + } + } + if (pair == NULL) { + fnvlist_free(bmarks); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no appropriate redaction bookmark exists")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + char *name = nvpair_name(pair); + nvlist_t *bmark = fnvpair_value_nvlist(pair); + 
nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); + boolean_t complete = fnvlist_lookup_boolean_value(vallist, + ZPROP_VALUE); + if (!complete) { + fnvlist_free(bmarks); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incomplete redaction bookmark provided")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + *bookname = strndup(name, ZFS_MAX_DATASET_NAME_LEN); + ASSERT3P(*bookname, !=, NULL); + fnvlist_free(bmarks); + return (0); +} + +static int +zfs_send_resume_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, + nvlist_t *resume_nvl) { char errbuf[1024]; char *toname; @@ -1765,21 +1639,15 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; enum lzc_send_flags lzc_flags = 0; - FILE *fout = (flags->verbose && flags->dryrun) ? stdout : stderr; + FILE *fout = (flags->verbosity > 0 && flags->dryrun) ? stdout : stderr; + uint64_t *redact_snap_guids = NULL; + int num_redact_snaps = 0; + char *redact_book = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); - nvlist_t *resume_nvl = - zfs_send_resume_token_to_nvlist(hdl, resume_token); - if (resume_nvl == NULL) { - /* - * zfs_error_aux has already been set by - * zfs_send_resume_token_to_nvlist - */ - return (zfs_error(hdl, EZFS_FAULT, errbuf)); - } - if (flags->verbose) { + if (flags->verbosity != 0) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); @@ -1805,19 +1673,27 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw || nvlist_exists(resume_nvl, "rawok")) lzc_flags |= LZC_SEND_FLAG_RAW; + if (flags->saved || nvlist_exists(resume_nvl, "savedok")) + lzc_flags |= LZC_SEND_FLAG_SAVED; - if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { - if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - 
"'%s' is no longer the same snapshot used in " - "the initial send"), toname); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' used in the initial send no longer exists"), - toname); + if (flags->saved) { + (void) strcpy(name, toname); + } else { + error = guid_to_name(hdl, toname, toguid, B_FALSE, name); + if (error != 0) { + if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is no longer the same snapshot " + "used in the initial send"), toname); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' used in the initial send no " + "longer exists"), toname); + } + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } - return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -1825,8 +1701,14 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } + if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps", + &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) { + num_redact_snaps = -1; + } + if (fromguid != 0) { - if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { + if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE, + redact_snap_guids, num_redact_snaps, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); @@ -1835,14 +1717,43 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, fromname = name; } - if (flags->verbose) { - uint64_t size = 0; - error = lzc_send_space(zhp->zfs_name, fromname, - lzc_flags, &size); - if (error == 0) - size = MAX(0, (int64_t)(size - bytes)); - send_print_verbose(fout, zhp->zfs_name, fromname, - size, flags->parsable); + redact_snap_guids = NULL; + + if (nvlist_lookup_uint64_array(resume_nvl, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids, + (uint_t 
*)&num_redact_snaps) == 0) { + char path[ZFS_MAX_DATASET_NAME_LEN]; + + (void) strlcpy(path, toname, sizeof (path)); + char *at = strchr(path, '@'); + ASSERT3P(at, !=, NULL); + + *at = '\0'; + + if ((error = find_redact_book(hdl, path, redact_snap_guids, + num_redact_snaps, &redact_book)) != 0) { + return (error); + } + } + + if (flags->verbosity != 0) { + /* + * Some of these may have come from the resume token, set them + * here for size estimate purposes. + */ + sendflags_t tmpflags = *flags; + if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK) + tmpflags.largeblock = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_COMPRESS) + tmpflags.compress = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA) + tmpflags.embed_data = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_RAW) + tmpflags.raw = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_SAVED) + tmpflags.saved = B_TRUE; + error = estimate_size(zhp, fromname, outfd, &tmpflags, + resumeobj, resumeoff, bytes, redact_book, errbuf); } if (!flags->dryrun) { @@ -1856,21 +1767,36 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = flags->verbosity; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { + if (redact_book != NULL) + free(redact_book); zfs_close(zhp); return (error); } } - error = lzc_send_resume(zhp->zfs_name, fromname, outfd, - lzc_flags, resumeobj, resumeoff); + error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, + lzc_flags, resumeobj, resumeoff, redact_book); + if (redact_book != NULL) + free(redact_book); if (flags->progress) { + void *status = NULL; (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); + (void) pthread_join(tid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "progress thread exited nonzero")); + 
return (zfs_standard_error(hdl, error, errbuf)); + } } char errbuf[1024]; @@ -1886,6 +1812,12 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + case ESRCH: + if (lzc_exists(zhp->zfs_name)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source could not be found")); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EXDEV: case ENOENT: @@ -1900,22 +1832,286 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, case ERANGE: case EFAULT: case EROFS: - zfs_error_aux(hdl, strerror(errno)); + zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } + } else { + if (redact_book != NULL) + free(redact_book); } - zfs_close(zhp); return (error); } +int +zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, + const char *resume_token) +{ + int ret; + char errbuf[1024]; + nvlist_t *resume_nvl; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); + if (resume_nvl == NULL) { + /* + * zfs_error_aux has already been set by + * zfs_send_resume_token_to_nvlist() + */ + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + + ret = zfs_send_resume_impl(hdl, flags, outfd, resume_nvl); + fnvlist_free(resume_nvl); + + return (ret); +} + +int +zfs_send_saved(zfs_handle_t *zhp, sendflags_t *flags, int outfd, + const char *resume_token) +{ + int ret; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *saved_nvl = NULL, *resume_nvl = NULL; + uint64_t saved_guid = 0, resume_guid = 0; + uint64_t obj = 0, off = 0, bytes = 0; + char token_buf[ZFS_MAXPROPLEN]; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "saved send failed")); + + ret = zfs_prop_get(zhp, 
ZFS_PROP_RECEIVE_RESUME_TOKEN, + token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); + if (ret != 0) + goto out; + + saved_nvl = zfs_send_resume_token_to_nvlist(hdl, token_buf); + if (saved_nvl == NULL) { + /* + * zfs_error_aux has already been set by + * zfs_send_resume_token_to_nvlist() + */ + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + /* + * If a resume token is provided we use the object and offset + * from that instead of the default, which starts from the + * beginning. + */ + if (resume_token != NULL) { + resume_nvl = zfs_send_resume_token_to_nvlist(hdl, + resume_token); + if (resume_nvl == NULL) { + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + if (nvlist_lookup_uint64(resume_nvl, "object", &obj) != 0 || + nvlist_lookup_uint64(resume_nvl, "offset", &off) != 0 || + nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || + nvlist_lookup_uint64(resume_nvl, "toguid", + &resume_guid) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "provided resume token is corrupt")); + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + if (nvlist_lookup_uint64(saved_nvl, "toguid", + &saved_guid)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset's resume token is corrupt")); + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + if (resume_guid != saved_guid) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "provided resume token does not match dataset")); + ret = zfs_error(hdl, EZFS_BADBACKUP, errbuf); + goto out; + } + } + + (void) nvlist_remove_all(saved_nvl, "object"); + fnvlist_add_uint64(saved_nvl, "object", obj); + + (void) nvlist_remove_all(saved_nvl, "offset"); + fnvlist_add_uint64(saved_nvl, "offset", off); + + (void) nvlist_remove_all(saved_nvl, "bytes"); + fnvlist_add_uint64(saved_nvl, "bytes", bytes); + + (void) nvlist_remove_all(saved_nvl, "toname"); + fnvlist_add_string(saved_nvl, "toname", zhp->zfs_name); + + ret = zfs_send_resume_impl(hdl, flags, outfd, saved_nvl); + +out: + 
fnvlist_free(saved_nvl); + fnvlist_free(resume_nvl); + return (ret); +} + /* - * Generate a send stream for the dataset identified by the argument zhp. + * This function informs the target system that the recursive send is complete. + * The record is also expected in the case of a send -p. + */ +static int +send_conclusion_record(int fd, zio_cksum_t *zc) +{ + dmu_replay_record_t drr = { 0 }; + drr.drr_type = DRR_END; + if (zc != NULL) + drr.drr_u.drr_end.drr_checksum = *zc; + if (write(fd, &drr, sizeof (drr)) == -1) { + return (errno); + } + return (0); +} + +/* + * This function is responsible for sending the records that contain the + * necessary information for the target system's libzfs to be able to set the + * properties of the filesystem being received, or to be able to prepare for + * a recursive receive. + * + * The "zhp" argument is the handle of the snapshot we are sending + * (the "tosnap"). The "from" argument is the short snapshot name (the part + * after the @) of the incremental source. 
+ */ +static int +send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, + boolean_t gather_props, boolean_t recursive, boolean_t verbose, + boolean_t dryrun, boolean_t raw, boolean_t replicate, boolean_t skipmissing, + boolean_t backup, boolean_t holds, boolean_t props, boolean_t doall, + nvlist_t **fssp, avl_tree_t **fsavlp) +{ + int err = 0; + char *packbuf = NULL; + size_t buflen = 0; + zio_cksum_t zc = { {0} }; + int featureflags = 0; + /* name of filesystem/volume that contains snapshot we are sending */ + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + /* short name of snap we are sending */ + char *tosnap = ""; + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, + ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + + if (holds) + featureflags |= DMU_BACKUP_FEATURE_HOLDS; + + (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); + char *at = strchr(tofs, '@'); + if (at != NULL) { + *at = '\0'; + tosnap = at + 1; + } + + if (gather_props) { + nvlist_t *hdrnv = fnvlist_alloc(); + nvlist_t *fss = NULL; + + if (from != NULL) + fnvlist_add_string(hdrnv, "fromsnap", from); + fnvlist_add_string(hdrnv, "tosnap", tosnap); + if (!recursive) + fnvlist_add_boolean(hdrnv, "not_recursive"); + + if (raw) { + fnvlist_add_boolean(hdrnv, "raw"); + } + + if ((err = gather_nvlist(zhp->zfs_hdl, tofs, + from, tosnap, recursive, raw, doall, replicate, skipmissing, + verbose, backup, holds, props, &fss, fsavlp)) != 0) { + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + /* + * Do not allow the size of the properties list to exceed + * the limit + */ + if ((fnvlist_size(fss) + fnvlist_size(hdrnv)) > + zhp->zfs_hdl->libzfs_max_nvlist) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "warning: cannot send '%s': " + "the size of the list of snapshots 
and properties " + "is too large to be received successfully.\n" + "Select a smaller number of snapshots to send.\n"), + zhp->zfs_name); + return (zfs_error(zhp->zfs_hdl, EZFS_NOSPC, + errbuf)); + } + fnvlist_add_nvlist(hdrnv, "fss", fss); + VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, + 0)); + if (fssp != NULL) { + *fssp = fss; + } else { + fnvlist_free(fss); + } + fnvlist_free(hdrnv); + } + + if (!dryrun) { + dmu_replay_record_t drr = { 0 }; + /* write first begin record */ + drr.drr_type = DRR_BEGIN; + drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. + drr_versioninfo, DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. + drr_versioninfo, featureflags); + if (snprintf(drr.drr_u.drr_begin.drr_toname, + sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, + tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + drr.drr_payloadlen = buflen; + + err = dump_record(&drr, packbuf, buflen, &zc, fd); + free(packbuf); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + err = send_conclusion_record(fd, &zc); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + } + return (0); +} + +/* + * Generate a send stream. The "zhp" argument is the filesystem/volume + * that contains the snapshot to send. The "fromsnap" argument is the + * short name (the part after the '@') of the snapshot that is the + * incremental source to send from (if non-NULL). The "tosnap" argument + * is the short name of the snapshot to send. * * The content of the send stream is the snapshot identified by * 'tosnap'. 
Incremental streams are requested in two ways: @@ -1942,10 +2138,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; - pthread_t tid = 0; - int pipefd[2]; - dedup_arg_t dda = { 0 }; - int featureflags = 0; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -1957,138 +2149,59 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } - if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { - uint64_t version; - version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - if (version >= ZPL_VERSION_SA) { - featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + if (fromsnap) { + char full_fromsnap_name[ZFS_MAX_DATASET_NAME_LEN]; + if (snprintf(full_fromsnap_name, sizeof (full_fromsnap_name), + "%s@%s", zhp->zfs_name, fromsnap) >= + sizeof (full_fromsnap_name)) { + err = EINVAL; + goto stderr_out; } - } - - if (flags->holds) - featureflags |= DMU_BACKUP_FEATURE_HOLDS; - - /* - * Start the dedup thread if this is a dedup stream. We do not bother - * doing this if this a raw send of an encrypted dataset with dedup off - * because normal encrypted blocks won't dedup. 
- */ - if (flags->dedup && !flags->dryrun && !(flags->raw && - zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && - zfs_prop_get_int(zhp, ZFS_PROP_DEDUP) == ZIO_CHECKSUM_OFF)) { - featureflags |= (DMU_BACKUP_FEATURE_DEDUP | - DMU_BACKUP_FEATURE_DEDUPPROPS); - if ((err = socketpair(AF_UNIX, SOCK_STREAM, 0, pipefd)) != 0) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); - return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, - errbuf)); - } - dda.outputfd = outfd; - dda.inputfd = pipefd[1]; - dda.dedup_hdl = zhp->zfs_hdl; - if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) { - (void) close(pipefd[0]); - (void) close(pipefd[1]); - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); - return (zfs_error(zhp->zfs_hdl, - EZFS_THREADCREATEFAILED, errbuf)); + zfs_handle_t *fromsnapn = zfs_open(zhp->zfs_hdl, + full_fromsnap_name, ZFS_TYPE_SNAPSHOT); + if (fromsnapn == NULL) { + err = -1; + goto err_out; } + zfs_close(fromsnapn); } if (flags->replicate || flags->doall || flags->props || flags->holds || flags->backup) { - dmu_replay_record_t drr = { 0 }; - char *packbuf = NULL; - size_t buflen = 0; - zio_cksum_t zc; - - ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); - - if (flags->replicate || flags->props || flags->backup || - flags->holds) { - nvlist_t *hdrnv; - - VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); - if (fromsnap) { - VERIFY(0 == nvlist_add_string(hdrnv, - "fromsnap", fromsnap)); - } - VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); - if (!flags->replicate) { - VERIFY(0 == nvlist_add_boolean(hdrnv, - "not_recursive")); - } - if (flags->raw) { - VERIFY(0 == nvlist_add_boolean(hdrnv, "raw")); - } - - err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, - fromsnap, tosnap, flags->replicate, flags->raw, - flags->doall, flags->replicate, flags->verbose, - flags->backup, flags->holds, flags->props, &fss, - &fsavl); - if (err) - goto err_out; - VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); - err = nvlist_pack(hdrnv, &packbuf, &buflen, - NV_ENCODE_XDR, 
0); - if (debugnvp) - *debugnvp = hdrnv; - else - nvlist_free(hdrnv); - if (err) - goto stderr_out; + char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; + if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), + "%s@%s", zhp->zfs_name, tosnap) >= + sizeof (full_tosnap_name)) { + err = EINVAL; + goto stderr_out; } - - if (!flags->dryrun) { - /* write first begin record */ - drr.drr_type = DRR_BEGIN; - drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. - drr_versioninfo, DMU_COMPOUNDSTREAM); - DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. - drr_versioninfo, featureflags); - if (snprintf(drr.drr_u.drr_begin.drr_toname, - sizeof (drr.drr_u.drr_begin.drr_toname), - "%s@%s", zhp->zfs_name, tosnap) >= - sizeof (drr.drr_u.drr_begin.drr_toname)) { - err = EINVAL; - goto stderr_out; - } - drr.drr_payloadlen = buflen; - - err = dump_record(&drr, packbuf, buflen, &zc, outfd); - free(packbuf); - if (err != 0) - goto stderr_out; - - /* write end record */ - bzero(&drr, sizeof (drr)); - drr.drr_type = DRR_END; - drr.drr_u.drr_end.drr_checksum = zc; - err = write(outfd, &drr, sizeof (drr)); - if (err == -1) { - err = errno; - goto stderr_out; - } - - err = 0; + zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, + full_tosnap_name, ZFS_TYPE_SNAPSHOT); + if (tosnap == NULL) { + err = -1; + goto err_out; } + err = send_prelim_records(tosnap, fromsnap, outfd, + flags->replicate || flags->props || flags->holds, + flags->replicate, flags->verbosity > 0, flags->dryrun, + flags->raw, flags->replicate, flags->skipmissing, + flags->backup, flags->holds, flags->props, flags->doall, + &fss, &fsavl); + zfs_close(tosnap); + if (err != 0) + goto err_out; } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; - if (tid != 0) - sdd.outfd = pipefd[0]; - else - sdd.outfd = outfd; + sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; - sdd.verbose = 
flags->verbose; + sdd.verbosity = flags->verbosity; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.dryrun = flags->dryrun; @@ -2101,7 +2214,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; - if (sdd.verbose && sdd.dryrun) + if (sdd.verbosity != 0 && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; @@ -2119,7 +2232,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); - sdd.cleanup_fd = open(ZFS_DEV, O_RDWR); + sdd.cleanup_fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; @@ -2130,7 +2243,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.snapholds = NULL; } - if (flags->verbose || sdd.snapholds != NULL) { + if (flags->verbosity != 0 || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, @@ -2142,7 +2255,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, if (err != 0) goto stderr_out; - if (flags->verbose) { + if (flags->verbosity != 0) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); @@ -2174,24 +2287,17 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } sdd.dryrun = B_FALSE; - sdd.verbose = B_FALSE; + sdd.verbosity = 0; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); - nvlist_free(fss); + fnvlist_free(fss); /* Ensure no snaps found is treated as an error. 
*/ if (err == 0 && !sdd.seento) err = ENOENT; - if (tid != 0) { - if (err != 0) - (void) pthread_cancel(tid); - (void) close(pipefd[0]); - (void) pthread_join(tid, NULL); - } - if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; @@ -2204,12 +2310,9 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, * there was some error, because it might not be totally * failed. */ - dmu_replay_record_t drr = { 0 }; - drr.drr_type = DRR_END; - if (write(outfd, &drr, sizeof (drr)) == -1) { - return (zfs_standard_error(zhp->zfs_hdl, - errno, errbuf)); - } + err = send_conclusion_record(outfd, NULL); + if (err != 0) + return (zfs_standard_error(zhp->zfs_hdl, err, errbuf)); } return (err || sdd.err); @@ -2218,56 +2321,233 @@ stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); - nvlist_free(fss); + fnvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); - if (tid != 0) { - (void) pthread_cancel(tid); - (void) close(pipefd[0]); - (void) pthread_join(tid, NULL); - } return (err); } -int -zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) +static zfs_handle_t * +name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) { - int err = 0; + char dirname[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); + char *c = strchr(dirname, '@'); + if (c != NULL) + *c = '\0'; + return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); +} + +/* + * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either + * an earlier snapshot in the same filesystem, or a snapshot before later's + * origin, or it's origin's origin, etc. + */ +static boolean_t +snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) +{ + boolean_t ret; + uint64_t later_txg = + (later->zfs_type == ZFS_TYPE_FILESYSTEM || + later->zfs_type == ZFS_TYPE_VOLUME ? 
+ UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); + uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); + + if (earlier_txg >= later_txg) + return (B_FALSE); + + zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, + earlier->zfs_name); + zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, + later->zfs_name); + + if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + return (B_TRUE); + } + + char clonename[ZFS_MAX_DATASET_NAME_LEN]; + if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, + ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + return (B_FALSE); + } + + zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, + ZFS_TYPE_DATASET); + uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); + + /* + * If "earlier" is exactly the origin, then + * snapshot_is_before(earlier, origin) will return false (because + * they're the same). + */ + if (origin_txg == earlier_txg && + strcmp(origin->zfs_name, earlier->zfs_name) == 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + zfs_close(origin); + return (B_TRUE); + } + zfs_close(earlier_dir); + zfs_close(later_dir); + + ret = snapshot_is_before(earlier, origin); + zfs_close(origin); + return (ret); +} + +/* + * The "zhp" argument is the handle of the dataset to send (typically a + * snapshot). The "from" argument is the full name of the snapshot or + * bookmark that is the incremental source. + */ +int +zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, + const char *redactbook) +{ + int err; libzfs_handle_t *hdl = zhp->zfs_hdl; - enum lzc_send_flags lzc_flags = 0; - FILE *fout = (flags.verbose && flags.dryrun) ? 
stdout : stderr; + char *name = zhp->zfs_name; + pthread_t ptid; + progress_arg_t pa = { 0 }; + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), name); - if (flags.largeblock) - lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (flags.embed_data) - lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; - if (flags.compress) - lzc_flags |= LZC_SEND_FLAG_COMPRESS; - if (flags.raw) - lzc_flags |= LZC_SEND_FLAG_RAW; + if (from != NULL && strchr(from, '@')) { + zfs_handle_t *from_zhp = zfs_open(hdl, from, + ZFS_TYPE_DATASET); + if (from_zhp == NULL) + return (-1); + if (!snapshot_is_before(from_zhp, zhp)) { + zfs_close(from_zhp); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + } + zfs_close(from_zhp); + } - if (flags.verbose) { - uint64_t size = 0; - err = lzc_send_space(zhp->zfs_name, from, lzc_flags, &size); - if (err == 0) { - send_print_verbose(fout, zhp->zfs_name, from, size, - flags.parsable); - } else { - (void) fprintf(stderr, "Cannot estimate send size: " - "%s\n", strerror(errno)); + if (redactbook != NULL) { + char bookname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *redact_snaps; + zfs_handle_t *book_zhp; + char *at, *pound; + int dsnamelen; + + pound = strchr(redactbook, '#'); + if (pound != NULL) + redactbook = pound + 1; + at = strchr(name, '@'); + if (at == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot do a redacted send to a filesystem")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + dsnamelen = at - name; + if (snprintf(bookname, sizeof (bookname), "%.*s#%s", + dsnamelen, name, redactbook) + >= sizeof (bookname)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid bookmark name")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + book_zhp = zfs_open(hdl, bookname, ZFS_TYPE_BOOKMARK); + if (book_zhp == NULL) + return (-1); + if (nvlist_lookup_nvlist(book_zhp->zfs_props, + 
zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), + &redact_snaps) != 0 || redact_snaps == NULL) { + zfs_close(book_zhp); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not a redaction bookmark")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + zfs_close(book_zhp); + } + + /* + * Send fs properties + */ + if (flags->props || flags->holds || flags->backup) { + /* + * Note: the header generated by send_prelim_records() + * assumes that the incremental source is in the same + * filesystem/volume as the target (which is a requirement + * when doing "zfs send -R"). But that isn't always the + * case here (e.g. send from snap in origin, or send from + * bookmark). We pass from=NULL, which will omit this + * information from the prelim records; it isn't used + * when receiving this type of stream. + */ + err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE, + flags->verbosity > 0, flags->dryrun, flags->raw, + flags->replicate, B_FALSE, flags->backup, flags->holds, + flags->props, flags->doall, NULL, NULL); + if (err != 0) + return (err); + } + + /* + * Perform size estimate if verbose was specified. + */ + if (flags->verbosity != 0) { + err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook, + errbuf); + if (err != 0) + return (err); + } + + if (flags->dryrun) + return (0); + + /* + * If progress reporting is requested, spawn a new thread to poll + * ZFS_IOC_SEND_PROGRESS at a regular interval. 
+ */ + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = fd; + pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = flags->verbosity; + + err = pthread_create(&ptid, NULL, + send_progress_thread, &pa); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); } } - if (flags.dryrun) - return (err); + err = lzc_send_redacted(name, from, fd, + lzc_flags_from_sendflags(flags), redactbook); - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "warning: cannot send '%s'"), zhp->zfs_name); + if (flags->progress) { + void *status = NULL; + if (err != 0) + (void) pthread_cancel(ptid); + (void) pthread_join(ptid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) + return (zfs_standard_error_fmt(hdl, error, + dgettext(TEXT_DOMAIN, + "progress thread exited nonzero"))); + } - err = lzc_send(zhp->zfs_name, from, fd, lzc_flags); + if (flags->props || flags->holds || flags->backup) { + /* Write the final end record. 
*/ + err = send_conclusion_record(fd, NULL); + if (err != 0) + return (zfs_standard_error(hdl, err, errbuf)); + } if (err != 0) { switch (errno) { case EXDEV: @@ -2277,7 +2557,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) case ENOENT: case ESRCH: - if (lzc_exists(zhp->zfs_name)) { + if (lzc_exists(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); @@ -2296,7 +2576,9 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: + case EFAULT: case EFBIG: + case EINVAL: case EIO: case ENOLINK: case ENOSPC: @@ -2304,9 +2586,8 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) case ENXIO: case EPIPE: case ERANGE: - case EFAULT: case EROFS: - zfs_error_aux(hdl, strerror(errno)); + zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: @@ -2328,8 +2609,6 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, int rv; int len = ilen; - assert(ilen <= SPA_MAXBLOCKSIZE); - do { rv = read(fd, cp, len); cp += rv; @@ -2363,6 +2642,12 @@ recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, if (buf == NULL) return (ENOMEM); + if (len > hdl->libzfs_max_nvlist) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nvlist too large")); + free(buf); + return (ENOMEM); + } + err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); @@ -2621,8 +2906,38 @@ typedef struct guid_to_name_data { boolean_t bookmark_ok; char *name; char *skip; + uint64_t *redact_snap_guids; + uint64_t num_redact_snaps; } guid_to_name_data_t; +static boolean_t +redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd) +{ + uint64_t *bmark_snaps; + uint_t bmark_num_snaps; + nvlist_t *nvl; + if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) + return (B_FALSE); + + nvl = fnvlist_lookup_nvlist(zhp->zfs_props, + 
zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE, + &bmark_num_snaps); + if (bmark_num_snaps != gtnd->num_redact_snaps) + return (B_FALSE); + int i = 0; + for (; i < bmark_num_snaps; i++) { + int j = 0; + for (; j < bmark_num_snaps; j++) { + if (bmark_snaps[i] == gtnd->redact_snap_guids[j]) + break; + } + if (j == bmark_num_snaps) + break; + } + return (i == bmark_num_snaps); +} + static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { @@ -2637,7 +2952,8 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) return (0); } - if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { + if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid && + (gtnd->num_redact_snaps == -1 || redact_snaps_match(zhp, gtnd))) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); @@ -2656,10 +2972,19 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. + * + * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with + * the specified number of redaction snapshots. If num_redact_snaps isn't 0 or + * -1, then redact_snap_guids will be an array of the guids of the snapshots the + * redaction bookmark was created with. If num_redact_snaps is -1, then we will + * attempt to find a snapshot or bookmark (if bookmark_ok is passed) with the + * given guid. Note that a redaction bookmark can be returned if + * num_redact_snaps == -1. 
*/ static int -guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, - boolean_t bookmark_ok, char *name) +guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, + uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, + uint64_t num_redact_snaps, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; @@ -2668,6 +2993,8 @@ guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; + gtnd.redact_snap_guids = redact_snap_guids; + gtnd.num_redact_snaps = num_redact_snaps; /* * Search progressively larger portions of the hierarchy, starting @@ -2706,6 +3033,14 @@ guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, return (ENOENT); } +static int +guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, + boolean_t bookmark_ok, char *name) +{ + return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL, + -1, name)); +} + /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. @@ -2727,14 +3062,14 @@ created_before(libzfs_handle_t *hdl, avl_tree_t *avl, return (1); nvfs = fsavl_find(avl, guid1, &snapname); - VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); + fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); - VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); + fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { @@ -2765,21 +3100,14 @@ created_before(libzfs_handle_t *hdl, avl_tree_t *avl, * sent datasets to their final locations in the dataset hierarchy. 
*/ static int -recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *destname, +recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, nvlist_t *stream_nv, avl_tree_t *stream_avl) { int err; nvpair_t *fselem = NULL; nvlist_t *stream_fss; - char *cp; - char top_zfs[ZFS_MAX_DATASET_NAME_LEN]; - (void) strcpy(top_zfs, destname); - cp = strrchr(top_zfs, '@'); - if (cp != NULL) - *cp = '\0'; - - VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", &stream_fss)); + stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { zfs_handle_t *zhp = NULL; @@ -2793,9 +3121,9 @@ recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *destname, char fsname[ZFS_MAX_DATASET_NAME_LEN]; keylocation[0] = '\0'; - VERIFY(0 == nvpair_value_nvlist(fselem, &stream_nvfs)); - VERIFY(0 == nvlist_lookup_nvlist(stream_nvfs, "snaps", &snaps)); - VERIFY(0 == nvlist_lookup_nvlist(stream_nvfs, "props", &props)); + stream_nvfs = fnvpair_value_nvlist(fselem); + snaps = fnvlist_lookup_nvlist(stream_nvfs, "snaps"); + props = fnvlist_lookup_nvlist(stream_nvfs, "props"); stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); /* find a snapshot from the stream that exists locally */ @@ -2803,8 +3131,8 @@ recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *destname, while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { uint64_t guid; - VERIFY(0 == nvpair_value_uint64(snapel, &guid)); - err = guid_to_name(hdl, destname, guid, B_FALSE, + guid = fnvpair_value_uint64(snapel); + err = guid_to_name(hdl, top_zfs, guid, B_FALSE, fsname); if (err == 0) break; @@ -2827,7 +3155,7 @@ recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *destname, is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); - /* we don't need to do anything for unencrypted filesystems */ + /* we don't need to do anything for unencrypted datasets */ if 
(crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; @@ -2848,9 +3176,8 @@ recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *destname, } } - VERIFY(0 == nvlist_lookup_string(props, - zfs_prop_to_name(ZFS_PROP_KEYLOCATION), - &stream_keylocation)); + stream_keylocation = fnvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); /* * Refresh the properties in case the call to @@ -2916,7 +3243,7 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, boolean_t needagain, progress, recursive; char *s1, *s2; - VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); + fromsnap = fnvlist_lookup_string(stream_nv, "fromsnap"); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); @@ -2927,10 +3254,10 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, again: needagain = progress = B_FALSE; - VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); + deleted = fnvlist_alloc(); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, - recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, + recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) return (error); @@ -2950,11 +3277,11 @@ again: nextfselem = nvlist_next_nvpair(local_nv, fselem); - VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); - VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); - VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); - VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", - &parent_fromsnap_guid)); + nvfs = fnvpair_value_nvlist(fselem); + snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); + fsname = fnvlist_lookup_string(nvfs, "name"); + parent_fromsnap_guid = fnvlist_lookup_uint64(nvfs, + "parentfromsnap"); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* @@ -2965,7 +3292,7 @@ again: snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; - VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); + 
thisguid = fnvpair_value_uint64(snapelem); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) @@ -2985,8 +3312,8 @@ again: origin_nvfs = fsavl_find(local_avl, originguid, NULL); - VERIFY(0 == nvlist_lookup_string(origin_nvfs, - "name", &origin_fsname)); + origin_fsname = fnvlist_lookup_string( + origin_nvfs, "name"); error = recv_promote(hdl, fsname, origin_fsname, flags); if (error == 0) @@ -2997,7 +3324,7 @@ again: break; case -1: fsavl_destroy(local_avl); - nvlist_free(local_nv); + fnvlist_free(local_nv); return (-1); } /* @@ -3017,7 +3344,7 @@ again: nextsnapelem = nvlist_next_nvpair(snaps, snapelem); - VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); + thisguid = fnvpair_value_uint64(snapelem); found = fsavl_find(stream_avl, thisguid, &stream_snapname); @@ -3111,10 +3438,9 @@ again: continue; } - VERIFY(0 == nvlist_lookup_string(stream_nvfs, - "name", &stream_fsname)); - VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, - "parentfromsnap", &stream_parent_fromsnap_guid)); + stream_fsname = fnvlist_lookup_string(stream_nvfs, "name"); + stream_parent_fromsnap_guid = fnvlist_lookup_uint64( + stream_nvfs, "parentfromsnap"); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); @@ -3163,8 +3489,7 @@ again: if (parent != NULL) { char *pname; - VERIFY(0 == nvlist_lookup_string(parent, "name", - &pname)); + pname = fnvlist_lookup_string(parent, "name"); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { @@ -3181,8 +3506,7 @@ again: strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { - VERIFY(0 == nvlist_add_boolean(renamed, - newname)); + fnvlist_add_boolean(renamed, newname); } if (error) @@ -3194,8 +3518,8 @@ again: doagain: fsavl_destroy(local_avl); - nvlist_free(local_nv); - nvlist_free(deleted); + fnvlist_free(local_nv); + fnvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ @@ -3210,8 +3534,7 @@ doagain: 
static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, - char **top_zfs, int cleanup_fd, uint64_t *action_handlep, - nvlist_t *cmdprops) + char **top_zfs, nvlist_t *cmdprops) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; @@ -3291,8 +3614,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; - VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", - &stream_fss)); + stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); @@ -3328,8 +3650,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, } if (!flags->dryrun && !flags->nomount) { - VERIFY(0 == nvlist_alloc(&renamed, - NV_UNIQUE_NAME, 0)); + renamed = fnvlist_alloc(); } softerr = recv_incremental_replication(hdl, tofs, flags, @@ -3345,7 +3666,8 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, - ZFS_PROP_MOUNTPOINT, 0, 0); + ZFS_PROP_MOUNTPOINT, 0, + flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp != NULL) { softerr |= @@ -3355,7 +3677,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, } } - nvlist_free(renamed); + fnvlist_free(renamed); } } @@ -3387,8 +3709,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, * recv_skip() and return 0). 
*/ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, - sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, - action_handlep, sendsnap, cmdprops); + sendfs, stream_nv, stream_avl, top_zfs, sendsnap, cmdprops); if (error == ENODATA) { error = 0; break; @@ -3405,14 +3726,14 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, stream_nv, stream_avl, NULL); } - if (raw && softerr == 0) { - softerr = recv_fix_encryption_hierarchy(hdl, destname, + if (raw && softerr == 0 && *top_zfs != NULL) { + softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, stream_nv, stream_avl); } out: fsavl_destroy(stream_avl); - nvlist_free(stream_nv); + fnvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) @@ -3438,10 +3759,11 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); + uint64_t payload_size; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive:")); + "cannot receive")); /* XXX would be great to use lseek if possible... */ drr = buf; @@ -3468,9 +3790,14 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); + drr->drr_u.drr_object.drr_raw_bonuslen = + BSWAP_32(drr->drr_u.drr_object. 
+ drr_raw_bonuslen); } - (void) recv_read(hdl, fd, buf, - P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8), + + payload_size = + DRR_OBJECT_PAYLOAD_SIZE(&drr->drr_u.drr_object); + (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; @@ -3483,8 +3810,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } - uint64_t payload_size = + payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); + assert(payload_size <= SPA_MAXBLOCKSIZE); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; @@ -3492,9 +3820,15 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); + drr->drr_u.drr_spill.drr_compressed_size = + BSWAP_64(drr->drr_u.drr_spill. + drr_compressed_size); } - (void) recv_read(hdl, fd, buf, - drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); + + payload_size = + DRR_SPILL_PAYLOAD_SIZE(&drr->drr_u.drr_spill); + (void) recv_read(hdl, fd, buf, payload_size, + B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { @@ -3506,6 +3840,7 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; + case DRR_OBJECT_RANGE: case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: @@ -3525,12 +3860,12 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, - boolean_t resumable) + boolean_t resumable, boolean_t checksum) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "checksum mismatch or incomplete stream")); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, (checksum ? 
+ "checksum mismatch" : "incomplete stream"))); if (!resumable) return; @@ -3623,35 +3958,40 @@ zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); + /* + * It turns out, if we don't normalize "aliased" names + * e.g. compress= against the "real" names (e.g. compression) + * here, then setting/excluding them does not work as + * intended. + * + * But since user-defined properties wouldn't have a valid + * mapping here, we do this conditional dance. + */ + const char *newname = name; + if (prop >= ZFS_PROP_TYPE) + newname = zfs_prop_to_name(prop); + /* "origin" is processed separately, don't handle it here */ if (prop == ZFS_PROP_ORIGIN) continue; - /* - * we're trying to override or exclude a property that does not - * make sense for this type of dataset, but we don't want to - * fail if the receive is recursive: this comes in handy when - * the send stream contains, for instance, a child ZVOL and - * we're trying to receive it with "-o atime=on" - */ - if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && - !zfs_prop_user(name)) { - if (recursive) - continue; + /* raw streams can't override encryption properties */ + if ((zfs_prop_encryption_key_param(prop) || + prop == ZFS_PROP_ENCRYPTION) && raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' does not apply to datasets of this " - "type"), name); + "encryption property '%s' cannot " + "be set or excluded for raw streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } - /* raw streams can't override encryption properties */ + /* incremental streams can only exclude encryption properties */ if ((zfs_prop_encryption_key_param(prop) || - prop == ZFS_PROP_ENCRYPTION) && (raw || !newfs)) { + prop == ZFS_PROP_ENCRYPTION) && !newfs && + nvpair_type(nvp) != DATA_TYPE_BOOLEAN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " - "be set or excluded for raw or incremental " - 
"streams."), name); + "be set for incremental streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } @@ -3663,16 +4003,29 @@ zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, * a property: this is done by forcing an explicit * inherit on the destination so the effective value is * not the one we received from the send stream. + */ + if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && + !zfs_prop_user(name)) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Warning: %s: property '%s' does not " + "apply to datasets of this type\n"), + fsname, name); + continue; + } + /* * We do this only if the property is not already * locally-set, in which case its value will take * priority over the received anyway. */ - if (nvlist_exists(origprops, name)) { + if (nvlist_exists(origprops, newname)) { nvlist_t *attrs; + char *source = NULL; - attrs = fnvlist_lookup_nvlist(origprops, name); - if (strcmp(fnvlist_lookup_string(attrs, - ZPROP_SOURCE), ZPROP_SOURCE_VAL_RECVD) != 0) + attrs = fnvlist_lookup_nvlist(origprops, + newname); + if (nvlist_lookup_string(attrs, + ZPROP_SOURCE, &source) == 0 && + strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } /* @@ -3682,13 +4035,32 @@ zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, */ if (!zfs_prop_inheritable(prop) && !zfs_prop_user(name) && /* can be inherited too */ - nvlist_exists(recvprops, name)) - fnvlist_remove(recvprops, name); + nvlist_exists(recvprops, newname)) + fnvlist_remove(recvprops, newname); else - fnvlist_add_nvpair(*oxprops, nvp); + fnvlist_add_boolean(*oxprops, newname); break; case DATA_TYPE_STRING: /* -o property=value */ - fnvlist_add_nvpair(oprops, nvp); + /* + * we're trying to override a property that does not + * make sense for this type of dataset, but we don't + * want to fail if the receive is recursive: this comes + * in handy when the send stream contains, for + * instance, a child ZVOL and we're trying to receive + * it with "-o atime=on" + */ + if 
(!zfs_prop_valid_for_type(prop, type, B_FALSE) && + !zfs_prop_user(name)) { + if (recursive) + continue; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' does not apply to datasets " + "of this type"), name); + ret = zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + fnvlist_add_string(oprops, newname, + fnvpair_value_string(nvp)); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -3749,8 +4121,8 @@ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, - avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, - uint64_t *action_handlep, const char *finalsnap, nvlist_t *cmdprops) + avl_tree_t *stream_avl, char **top_zfs, + const char *finalsnap, nvlist_t *cmdprops) { time_t begin_time; int ioctl_err, ioctl_errno, err; @@ -3759,7 +4131,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, char errbuf[1024]; const char *chopprefix; boolean_t newfs = B_FALSE; - boolean_t stream_wantsnewfs; + boolean_t stream_wantsnewfs, stream_resumingnewfs; boolean_t newprops = B_FALSE; uint64_t read_bytes = 0; uint64_t errflags = 0; @@ -3782,6 +4154,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, boolean_t toplevel = B_FALSE; boolean_t zoned = B_FALSE; boolean_t hastoken = B_FALSE; + boolean_t redacted; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; @@ -3808,7 +4181,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &rcvprops); if (err) { - VERIFY(0 == nvlist_alloc(&rcvprops, NV_UNIQUE_NAME, 0)); + rcvprops = fnvlist_alloc(); newprops = B_TRUE; } @@ -3829,22 +4202,22 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } if (flags->canmountoff) { - VERIFY(0 == nvlist_add_uint64(rcvprops, - zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); + 
fnvlist_add_uint64(rcvprops, + zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0); } else if (newprops) { /* nothing in rcvprops, eliminate it */ - nvlist_free(rcvprops); + fnvlist_free(rcvprops); rcvprops = NULL; newprops = B_FALSE; } if (0 == nvlist_lookup_nvlist(fs, "snapprops", &lookup)) { - VERIFY(0 == nvlist_lookup_nvlist(lookup, - snapname, &snapprops_nvlist)); + snapprops_nvlist = fnvlist_lookup_nvlist(lookup, + snapname); } if (holds) { if (0 == nvlist_lookup_nvlist(fs, "snapholds", &lookup)) { - VERIFY(0 == nvlist_lookup_nvlist(lookup, - snapname, &snapholds_nvlist)); + snapholds_nvlist = fnvlist_lookup_nvlist( + lookup, snapname); } } } @@ -3959,6 +4332,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) printf("found clone origin %s\n", origin); } + if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_DEDUP)) { + (void) fprintf(stderr, + gettext("ERROR: \"zfs receive\" no longer supports " + "deduplicated send streams. Use\n" + "the \"zstream redup\" command to convert this stream " + "to a regular,\n" + "non-deduplicated stream.\n")); + err = zfs_error(hdl, EZFS_NOTSUP, errbuf); + goto out; + } + boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; boolean_t raw = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & @@ -3967,6 +4352,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, DMU_BACKUP_FEATURE_EMBED_DATA; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; + stream_resumingnewfs = (drrb->drr_fromguid == 0 || + (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && resuming; if (stream_wantsnewfs) { /* @@ -3992,11 +4379,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } } else { /* - * if the fs does not exist, look for it based on the - * fromsnap GUID + * If the fs does not exist, look for it based on the + * fromsnap GUID. 
*/ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive incremental stream")); + if (resuming) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot receive resume stream")); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot receive incremental stream")); + } (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; @@ -4022,6 +4416,9 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; + redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_REDACTED; + if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp; @@ -4046,7 +4443,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } - if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, + if (zfs_ioctl(hdl, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" @@ -4064,7 +4461,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, goto out; } if (is_volume && - ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, + zfs_ioctl(hdl, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has children (eg. %s)\n" @@ -4093,7 +4490,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, /* * Raw sends can not be performed as an incremental on top - * of existing unencryppted datasets. zfs recv -F cant be + * of existing unencrypted datasets. zfs recv -F can't be * used to blow away an existing encrypted filesystem. 
This * is because it would require the dsl dir to point to the * new key (or lack of a key) and the old key at the same @@ -4124,9 +4521,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && - stream_wantsnewfs) { + (stream_wantsnewfs || stream_resumingnewfs)) { /* We can't do online recv in this case */ - clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, + flags->forceunmount ? MS_FORCE : 0); if (clp == NULL) { zfs_close(zhp); err = -1; @@ -4210,34 +4608,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, goto out; } - /* - * It is invalid to receive a properties stream that was - * unencrypted on the send side as a child of an encrypted - * parent. Technically there is nothing preventing this, but - * it would mean that the encryption=off property which is - * locally set on the send side would not be received correctly. - * We can infer encryption=off if the stream is not raw and - * properties were included since the send side will only ever - * send the encryption property in a raw nvlist header. This - * check will be avoided if the user specifically overrides - * the encryption property on the command line. 
- */ - if (!raw && rcvprops != NULL && - !nvlist_exists(cmdprops, - zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { - uint64_t crypt; - - crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); - - if (crypt != ZIO_CRYPT_OFF) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "parent '%s' must not be encrypted to " - "receive unenecrypted property"), name); - err = zfs_error(hdl, EZFS_BADPROP, errbuf); - zfs_close(zhp); - goto out; - } - } zfs_close(zhp); newfs = B_TRUE; @@ -4252,13 +4622,17 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) fflush(stdout); } - if (flags->dryrun) { - err = recv_skip(hdl, infd, flags->byteswap); - goto out; + /* + * If this is the top-level dataset, record it so we can use it + * for recursive operations later. + */ + if (top_zfs != NULL && + (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) { + toplevel = B_TRUE; + if (*top_zfs == NULL) + *top_zfs = zfs_strdup(hdl, name); } - if (top_zfs && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) - toplevel = B_TRUE; if (drrb->drr_type == DMU_OST_ZVOL) { type = ZFS_TYPE_VOLUME; } else if (drrb->drr_type == DMU_OST_ZFS) { @@ -4274,10 +4648,48 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) goto out; + /* + * When sending with properties (zfs send -p), the encryption property + * is not included because it is a SETONCE property and therefore + * treated as read only. However, we are always able to determine its + * value because raw sends will include it in the DRR_BDEGIN payload + * and non-raw sends with properties are not allowed for encrypted + * datasets. Therefore, if this is a non-raw properties stream, we can + * infer that the value should be ZIO_CRYPT_OFF and manually add that + * to the received properties. 
+ */ + if (stream_wantsnewfs && !raw && rcvprops != NULL && + !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { + if (oxprops == NULL) + oxprops = fnvlist_alloc(); + fnvlist_add_uint64(oxprops, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); + } + + if (flags->dryrun) { + void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); + + /* + * We have read the DRR_BEGIN record, but we have + * not yet read the payload. For non-dryrun sends + * this will be done by the kernel, so we must + * emulate that here, before attempting to read + * more records. + */ + err = recv_read(hdl, infd, buf, drr->drr_payloadlen, + flags->byteswap, NULL); + free(buf); + if (err != 0) + goto out; + + err = recv_skip(hdl, infd, flags->byteswap); + goto out; + } + err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, - raw, infd, drr_noswap, cleanup_fd, &read_bytes, &errflags, - action_handlep, &prop_errors); + raw, infd, drr_noswap, -1, &read_bytes, &errflags, + NULL, &prop_errors); ioctl_errno = ioctl_err; prop_errflags = errflags; @@ -4339,12 +4751,11 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); pair != NULL; pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { - VERIFY(0 == nvlist_add_string(holds, destsnap, - nvpair_name(pair))); + fnvlist_add_string(holds, destsnap, nvpair_name(pair)); } (void) lzc_hold(holds, cleanup_fd, &errors); - nvlist_free(snapholds_nvlist); - nvlist_free(holds); + fnvlist_free(snapholds_nvlist); + fnvlist_free(holds); } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { @@ -4364,12 +4775,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, */ *cp = '\0'; if (gather_nvlist(hdl, destsnap, NULL, NULL, B_FALSE, B_TRUE, - B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, - &local_nv, &local_avl) == 0) { + B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, + 
B_TRUE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); - nvlist_free(local_nv); + fnvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { @@ -4428,18 +4839,27 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, *cp = '@'; break; case EINVAL: - if (flags->resumable) + if (flags->resumable) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "kernel modules must be upgraded to " "receive this stream.")); - if (embedded && !raw) + } else if (embedded && !raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incompatible embedded data stream " "feature with encrypted receive.")); + } (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: - recv_ecksum_set_aux(hdl, destsnap, flags->resumable); + case ZFS_ERR_STREAM_TRUNCATED: + recv_ecksum_set_aux(hdl, destsnap, flags->resumable, + ioctl_err == ECKSUM); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental send stream requires -L " + "(--large-block), to match previous receive.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: @@ -4455,7 +4875,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, case ZFS_ERR_FROM_IVSET_GUID_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid missing. 
See errata %u at " - "http://zfsonlinux.org/msg/ZFS-8000-ER."), + "https://openzfs.github.io/openzfs-docs/msg/" + "ZFS-8000-ER."), ZPOOL_ERRATA_ZOL_8308_ENCRYPTION); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; @@ -4482,7 +4903,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; } - /* fallthru */ + fallthrough; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } @@ -4493,35 +4914,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ - cp = strchr(destsnap, '@'); - if (cp && (ioctl_err == 0 || !newfs)) { - zfs_handle_t *h; - - *cp = '\0'; - h = zfs_open(hdl, destsnap, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (h != NULL) { - if (h->zfs_type == ZFS_TYPE_VOLUME) { - *cp = '@'; - } else if (newfs || stream_avl) { - /* - * Track the first/top of hierarchy fs, - * for mounting and sharing later. 
- */ - if (top_zfs && *top_zfs == NULL) - *top_zfs = zfs_strdup(hdl, destsnap); - } - zfs_close(h); - } - *cp = '@'; - } - if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } + if ((newfs || stream_avl) && type == ZFS_TYPE_FILESYSTEM && !redacted) + flags->domount = B_TRUE; + if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), name); @@ -4548,25 +4949,25 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_nicebytes(bytes, buf1, sizeof (buf1)); zfs_nicebytes(bytes/delta, buf2, sizeof (buf1)); - (void) printf("received %s stream in %lu seconds (%s/sec)\n", - buf1, delta, buf2); + (void) printf("received %s stream in %lld seconds (%s/sec)\n", + buf1, (longlong_t)delta, buf2); } err = 0; out: if (prop_errors != NULL) - nvlist_free(prop_errors); + fnvlist_free(prop_errors); if (tmp_keylocation[0] != '\0') { - VERIFY(0 == nvlist_add_string(rcvprops, - zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation)); + fnvlist_add_string(rcvprops, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation); } if (newprops) - nvlist_free(rcvprops); + fnvlist_free(rcvprops); - nvlist_free(oxprops); - nvlist_free(origprops); + fnvlist_free(oxprops); + fnvlist_free(origprops); return (err); } @@ -4625,8 +5026,8 @@ zfs_receive_checkprops(libzfs_handle_t *hdl, nvlist_t *props, static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, - nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, - uint64_t *action_handlep, const char *finalsnap, nvlist_t *cmdprops) + nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, + const char *finalsnap, nvlist_t *cmdprops) { int err; dmu_replay_record_t drr, drr_noswap; @@ -4702,16 +5103,29 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, if 
(!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "stream has unsupported feature, feature flags = %lx"), - featureflags); + /* + * Let's be explicit about this one, since rather than + * being a new feature we can't know, it's an old + * feature we dropped. + */ + if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream has deprecated feature: dedup, try " + "'zstream redup [send in a file] | zfs recv " + "[...]'")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream has unsupported feature, feature flags = " + "%llx (unknown flags = %llx)"), + (u_longlong_t)featureflags, + (u_longlong_t)((featureflags) & + ~DMU_BACKUP_FEATURE_MASK)); + } return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } /* Holds feature is set once in the compound stream header. */ - boolean_t holds = (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_HOLDS); - if (holds) + if (featureflags & DMU_BACKUP_FEATURE_HOLDS) flags->holds = B_TRUE; if (strchr(drrb->drr_toname, '@') == NULL) { @@ -4738,12 +5152,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, - cleanup_fd, action_handlep, finalsnap, cmdprops)); + finalsnap, cmdprops)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, - &zcksum, top_zfs, cleanup_fd, action_handlep, cmdprops)); + &zcksum, top_zfs, cmdprops)); } } @@ -4760,8 +5174,6 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, { char *top_zfs = NULL; int err; - int cleanup_fd; - uint64_t action_handle = 0; struct stat sb; char *originsnap = NULL; @@ -4774,37 +5186,12 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, return (-2); } -#ifdef 
__linux__ -#ifndef F_SETPIPE_SZ -#define F_SETPIPE_SZ (F_SETLEASE + 7) -#endif /* F_SETPIPE_SZ */ - -#ifndef F_GETPIPE_SZ -#define F_GETPIPE_SZ (F_GETLEASE + 7) -#endif /* F_GETPIPE_SZ */ - /* * It is not uncommon for gigabytes to be processed in zfs receive. - * Speculatively increase the buffer size via Linux-specific fcntl() - * call. + * Speculatively increase the buffer size if supported by the platform. */ - if (S_ISFIFO(sb.st_mode)) { - FILE *procf = fopen("/proc/sys/fs/pipe-max-size", "r"); - - if (procf != NULL) { - unsigned long max_psize; - long cur_psize; - if (fscanf(procf, "%lu", &max_psize) > 0) { - cur_psize = fcntl(infd, F_GETPIPE_SZ); - if (cur_psize > 0 && - max_psize > (unsigned long) cur_psize) - (void) fcntl(infd, F_SETPIPE_SZ, - max_psize); - } - fclose(procf); - } - } -#endif /* __linux__ */ + if (S_ISFIFO(sb.st_mode)) + libzfs_set_pipe_max(infd); if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); @@ -4812,32 +5199,42 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, return (err); } - cleanup_fd = open(ZFS_DEV, O_RDWR); - VERIFY(cleanup_fd >= 0); - err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, - stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL, props); + stream_avl, &top_zfs, NULL, props); - VERIFY(0 == close(cleanup_fd)); - - if (err == 0 && !flags->nomount && top_zfs) { + if (err == 0 && !flags->nomount && flags->domount && top_zfs) { zfs_handle_t *zhp = NULL; prop_changelist_t *clp = NULL; - zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM); - if (zhp != NULL) { - clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, - CL_GATHER_MOUNT_ALWAYS, 0); - zfs_close(zhp); - if (clp != NULL) { - /* mount and share received datasets */ - err = changelist_postfix(clp); - changelist_free(clp); - } - } - if (zhp == NULL || clp == NULL || err) + zhp = zfs_open(hdl, top_zfs, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { err = -1; + goto out; + } else { + if 
(zhp->zfs_type == ZFS_TYPE_VOLUME) { + zfs_close(zhp); + goto out; + } + + clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, + CL_GATHER_MOUNT_ALWAYS, + flags->forceunmount ? MS_FORCE : 0); + zfs_close(zhp); + if (clp == NULL) { + err = -1; + goto out; + } + + /* mount and share received datasets */ + err = changelist_postfix(clp); + changelist_free(clp); + if (err != 0) + err = -1; + } } + +out: if (top_zfs) free(top_zfs); diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index ebf497db64..33d6e1bfdf 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2021, Colm Buckley */ /* @@ -43,6 +44,7 @@ #include #include +#include #include #include #include @@ -84,6 +86,10 @@ static char *zfs_msgid_table[] = { * ZPOOL_STATUS_RESILVERING * ZPOOL_STATUS_OFFLINE_DEV * ZPOOL_STATUS_REMOVED_DEV + * ZPOOL_STATUS_REBUILDING + * ZPOOL_STATUS_REBUILD_SCRUB + * ZPOOL_STATUS_COMPATIBILITY_ERR + * ZPOOL_STATUS_INCOMPATIBLE_FEAT * ZPOOL_STATUS_OK */ }; @@ -92,57 +98,69 @@ static char *zfs_msgid_table[] = { /* ARGSUSED */ static int -vdev_missing(uint64_t state, uint64_t aux, uint64_t errs) +vdev_missing(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN && - aux == VDEV_AUX_OPEN_FAILED); + return (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_OPEN_FAILED); } /* ARGSUSED */ static int -vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs) +vdev_faulted(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_FAULTED); + return (vs->vs_state == VDEV_STATE_FAULTED); } /* ARGSUSED */ static int -vdev_errors(uint64_t state, uint64_t aux, uint64_t errs) +vdev_errors(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_DEGRADED || errs != 0); + return (vs->vs_state == 
VDEV_STATE_DEGRADED || + vs->vs_read_errors != 0 || vs->vs_write_errors != 0 || + vs->vs_checksum_errors != 0); } /* ARGSUSED */ static int -vdev_broken(uint64_t state, uint64_t aux, uint64_t errs) +vdev_broken(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN); + return (vs->vs_state == VDEV_STATE_CANT_OPEN); } /* ARGSUSED */ static int -vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) +vdev_offlined(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_OFFLINE); + return (vs->vs_state == VDEV_STATE_OFFLINE); } /* ARGSUSED */ static int -vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +vdev_removed(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_REMOVED); + return (vs->vs_state == VDEV_STATE_REMOVED); +} + +static int +vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) +{ + if (getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") != NULL) + return (0); + + return (VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift); } /* * Detect if any leaf devices that have seen errors or could not be opened. */ static boolean_t -find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) +find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), + boolean_t ignore_replacing) { nvlist_t **child; vdev_stat_t *vs; - uint_t c, children; - char *type; + uint_t c, vsc, children; /* * Ignore problems within a 'replacing' vdev, since we're presumably in @@ -150,23 +168,25 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) * out again. We'll pick up the fact that a resilver is happening * later. 
*/ - verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - return (B_FALSE); + if (ignore_replacing == B_TRUE) { + char *type; + + verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, + &type) == 0); + if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + return (B_FALSE); + } if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func)) + if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } else { verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); + (uint64_t **)&vs, &vsc) == 0); - if (func(vs->vs_state, vs->vs_aux, - vs->vs_read_errors + - vs->vs_write_errors + - vs->vs_checksum_errors)) + if (func(vs, vsc) != 0) return (B_TRUE); } @@ -176,7 +196,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func)) + if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } @@ -195,13 +215,14 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) * - Check for any data errors * - Check for any faulted or missing devices in a replicated config * - Look for any devices showing errors - * - Check for any resilvering devices + * - Check for any resilvering or rebuilding devices * * There can obviously be multiple errors within a single pool, so this routine * only picks the most damaging of all the current errors to report. 
*/ static zpool_status_t -check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) +check_status(nvlist_t *config, boolean_t isimport, + zpool_errata_t *erratap, const char *compat) { nvlist_t *nvroot; vdev_stat_t *vs; @@ -233,6 +254,49 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) ps->pss_state == DSS_SCANNING) return (ZPOOL_STATUS_RESILVERING); + /* + * Currently rebuilding a vdev, check top-level vdevs. + */ + vdev_rebuild_stat_t *vrs = NULL; + nvlist_t **child; + uint_t c, i, children; + uint64_t rebuild_end_time = 0; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if ((nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) { + uint64_t state = vrs->vrs_state; + + if (state == VDEV_REBUILD_ACTIVE) { + return (ZPOOL_STATUS_REBUILDING); + } else if (state == VDEV_REBUILD_COMPLETE && + vrs->vrs_end_time > rebuild_end_time) { + rebuild_end_time = vrs->vrs_end_time; + } + } + } + + /* + * If we can determine when the last scrub was run, and it + * was before the last rebuild completed, then recommend + * that the pool be scrubbed to verify all checksums. When + * ps is NULL we can infer the pool has never been scrubbed. + */ + if (rebuild_end_time > 0) { + if (ps != NULL) { + if ((ps->pss_state == DSS_FINISHED && + ps->pss_func == POOL_SCAN_SCRUB && + rebuild_end_time > ps->pss_end_time) || + ps->pss_state == DSS_NONE) + return (ZPOOL_STATUS_REBUILD_SCRUB); + } else { + return (ZPOOL_STATUS_REBUILD_SCRUB); + } + } + } + /* * The multihost property is set and the pool may be active. */ @@ -317,15 +381,15 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) * Bad devices in non-replicated config. 
*/ if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_faulted)) + find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_missing)) + find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_broken)) + find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_NR); /* @@ -347,31 +411,37 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) /* * Missing devices in a replicated config. */ - if (find_vdev_problem(nvroot, vdev_faulted)) + if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); - if (find_vdev_problem(nvroot, vdev_missing)) + if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_R); - if (find_vdev_problem(nvroot, vdev_broken)) + if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_R); /* * Devices with errors */ - if (!isimport && find_vdev_problem(nvroot, vdev_errors)) + if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) return (ZPOOL_STATUS_FAILING_DEV); /* * Offlined devices */ - if (find_vdev_problem(nvroot, vdev_offlined)) + if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) return (ZPOOL_STATUS_OFFLINE_DEV); /* * Removed device */ - if (find_vdev_problem(nvroot, vdev_removed)) + if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) return (ZPOOL_STATUS_REMOVED_DEV); + /* + * Suboptimal, but usable, ashift configuration. + */ + if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) + return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); + /* * Informational errata available. 
*/ @@ -384,11 +454,17 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) /* * Outdated, but usable, version */ - if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) - return (ZPOOL_STATUS_VERSION_OLDER); + if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) { + /* "legacy" compatibility disables old version reporting */ + if (compat != NULL && strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) + return (ZPOOL_STATUS_OK); + else + return (ZPOOL_STATUS_VERSION_OLDER); + } /* - * Usable pool with disabled features + * Usable pool with disabled or superfluous features + * (superfluous = beyond what's requested by 'compatibility') */ if (version >= SPA_VERSION_FEATURES) { int i; @@ -405,10 +481,24 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) ZPOOL_CONFIG_FEATURE_STATS); } + /* check against all features, or limited set? */ + boolean_t c_features[SPA_FEATURES]; + + switch (zpool_load_compat(compat, c_features, NULL, 0)) { + case ZPOOL_COMPATIBILITY_OK: + case ZPOOL_COMPATIBILITY_WARNTOKEN: + break; + default: + return (ZPOOL_STATUS_COMPATIBILITY_ERR); + } for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; - if (!nvlist_exists(feat, fi->fi_guid)) + if (!fi->fi_zfs_mod_supported) + continue; + if (c_features[i] && !nvlist_exists(feat, fi->fi_guid)) return (ZPOOL_STATUS_FEAT_DISABLED); + if (!c_features[i] && nvlist_exists(feat, fi->fi_guid)) + return (ZPOOL_STATUS_INCOMPATIBLE_FEAT); } } @@ -418,7 +508,18 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) zpool_status_t zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata) { - zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata); + /* + * pass in the desired feature set, as + * it affects check for disabled features + */ + char compatibility[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compatibility, + ZFS_MAXPROPLEN, NULL, 
B_FALSE) != 0) + compatibility[0] = '\0'; + + zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata, + compatibility); + if (msgid != NULL) { if (ret >= NMSGID) *msgid = NULL; @@ -431,7 +532,7 @@ zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata) zpool_status_t zpool_import_status(nvlist_t *config, char **msgid, zpool_errata_t *errata) { - zpool_status_t ret = check_status(config, B_TRUE, errata); + zpool_status_t ret = check_status(config, B_TRUE, errata, NULL); if (ret >= NMSGID) *msgid = NULL; diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 19bb57ad43..c3c009ae3a 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -21,10 +21,14 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2020 Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2020 The FreeBSD Foundation + * + * Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. */ /* @@ -40,6 +44,9 @@ #include #include #include +#if LIBFETCH_DYNAMIC +#include +#endif #include #include #include @@ -54,7 +61,13 @@ #include "zfeature_common.h" #include #include -#include + +/* + * We only care about the scheme in order to match the scheme + * with the handler. Each handler should validate the full URI + * as necessary. 
+ */ +#define URI_REGEX "^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):" int libzfs_errno(libzfs_handle_t *hdl) @@ -62,31 +75,6 @@ libzfs_errno(libzfs_handle_t *hdl) return (hdl->libzfs_error); } -const char * -libzfs_error_init(int error) -{ - switch (error) { - case ENXIO: - return (dgettext(TEXT_DOMAIN, "The ZFS modules are not " - "loaded.\nTry running '/sbin/modprobe zfs' as root " - "to load them.\n")); - case ENOENT: - return (dgettext(TEXT_DOMAIN, "/dev/zfs and /proc/self/mounts " - "are required.\nTry running 'udevadm trigger' and 'mount " - "-t proc proc /proc' as root.\n")); - case ENOEXEC: - return (dgettext(TEXT_DOMAIN, "The ZFS modules cannot be " - "auto-loaded.\nTry running '/sbin/modprobe zfs' as " - "root to manually load them.\n")); - case EACCES: - return (dgettext(TEXT_DOMAIN, "Permission denied the " - "ZFS utilities must be run as root.\n")); - default: - return (dgettext(TEXT_DOMAIN, "Failed to initialize the " - "libzfs library.\n")); - } -} - const char * libzfs_error_action(libzfs_handle_t *hdl) { @@ -163,15 +151,15 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_MOUNTFAILED: return (dgettext(TEXT_DOMAIN, "mount failed")); case EZFS_UMOUNTFAILED: - return (dgettext(TEXT_DOMAIN, "umount failed")); + return (dgettext(TEXT_DOMAIN, "unmount failed")); case EZFS_UNSHARENFSFAILED: - return (dgettext(TEXT_DOMAIN, "unshare(1M) failed")); + return (dgettext(TEXT_DOMAIN, "NFS share removal failed")); case EZFS_SHARENFSFAILED: - return (dgettext(TEXT_DOMAIN, "share(1M) failed")); + return (dgettext(TEXT_DOMAIN, "NFS share creation failed")); case EZFS_UNSHARESMBFAILED: - return (dgettext(TEXT_DOMAIN, "smb remove share failed")); + return (dgettext(TEXT_DOMAIN, "SMB share removal failed")); case EZFS_SHARESMBFAILED: - return (dgettext(TEXT_DOMAIN, "smb add share failed")); + return (dgettext(TEXT_DOMAIN, "SMB share creation failed")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: @@ -303,6 +291,11 @@ 
libzfs_error_description(libzfs_handle_t *hdl) case EZFS_NO_RESILVER_DEFER: return (dgettext(TEXT_DOMAIN, "this action requires the " "resilver_defer feature")); + case EZFS_EXPORT_IN_PROGRESS: + return (dgettext(TEXT_DOMAIN, "pool export in progress")); + case EZFS_REBUILDING: + return (dgettext(TEXT_DOMAIN, "currently sequentially " + "resilvering")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -311,7 +304,6 @@ libzfs_error_description(libzfs_handle_t *hdl) } } -/*PRINTFLIKE2*/ void zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) { @@ -341,7 +333,8 @@ zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) if (hdl->libzfs_printerr) { if (error == EZFS_UNKNOWN) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " - "error: %s\n"), libzfs_error_description(hdl)); + "error: %s: %s\n"), hdl->libzfs_action, + libzfs_error_description(hdl)); abort(); } @@ -358,7 +351,6 @@ zfs_error(libzfs_handle_t *hdl, int error, const char *msg) return (zfs_error_fmt(hdl, error, "%s", msg)); } -/*PRINTFLIKE3*/ int zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { @@ -409,7 +401,6 @@ zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) return (zfs_standard_error_fmt(hdl, error, "%s", msg)); } -/*PRINTFLIKE3*/ int zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { @@ -468,6 +459,7 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; + case ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE: case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " @@ -487,8 +479,11 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
case ZFS_ERR_WRONG_PARENT: zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap); break; + case ZFS_ERR_BADPROP: + zfs_verror(hdl, EZFS_BADPROP, fmt, ap); + break; default: - zfs_error_aux(hdl, strerror(error)); + zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } @@ -497,13 +492,124 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) return (-1); } +void +zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, + char *errbuf) +{ + switch (err) { + + case ENOSPC: + /* + * For quotas and reservations, ENOSPC indicates + * something different; setting a quota or reservation + * doesn't use any disk space. + */ + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is less than current used or " + "reserved space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; + + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is greater than available space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; + + default: + (void) zfs_standard_error(hdl, err, errbuf); + break; + } + break; + + case EBUSY: + (void) zfs_standard_error(hdl, EBUSY, errbuf); + break; + + case EROFS: + (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); + break; + + case E2BIG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property value too long")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + break; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool and or dataset must be upgraded to set this " + "property or value")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + + case ERANGE: + if (prop == ZFS_PROP_COMPRESSION || + prop == ZFS_PROP_DNODESIZE || + prop == ZFS_PROP_RECORDSIZE) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "bootable datasets")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); + } else if (prop 
== ZFS_PROP_CHECKSUM || + prop == ZFS_PROP_DEDUP) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "root pools")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case EINVAL: + if (prop == ZPROP_INVAL) { + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case ZFS_ERR_BADPROP: + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + break; + + case EACCES: + if (prop == ZFS_PROP_KEYLOCATION) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "keylocation may only be set on encryption roots")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case EOVERFLOW: + /* + * This platform can't address a volume this big. + */ +#ifdef _ILP32 + if (prop == ZFS_PROP_VOLSIZE) { + (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); + break; + } +#endif + fallthrough; + default: + (void) zfs_standard_error(hdl, err, errbuf); + } +} + int zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zpool_standard_error_fmt(hdl, error, "%s", msg)); } -/*PRINTFLIKE3*/ int zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { @@ -598,6 +704,18 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_VDEV_TOO_BIG: zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); break; + case ZFS_ERR_EXPORT_IN_PROGRESS: + zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); + break; + case ZFS_ERR_RESILVER_IN_PROGRESS: + zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); + break; + case ZFS_ERR_REBUILD_IN_PROGRESS: + zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); + break; + case ZFS_ERR_BADPROP: + zfs_verror(hdl, EZFS_BADPROP, fmt, ap); + break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. 
A reboot may " @@ -615,7 +733,7 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; default: - zfs_error_aux(hdl, strerror(error)); + zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); } @@ -649,7 +767,6 @@ zfs_alloc(libzfs_handle_t *hdl, size_t size) /* * A safe form of asprintf() which will die if the allocation fails. */ -/*PRINTFLIKE2*/ char * zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) { @@ -663,8 +780,10 @@ zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) va_end(ap); - if (err < 0) + if (err < 0) { (void) no_memory(hdl); + ret = NULL; + } return (ret); } @@ -706,19 +825,6 @@ libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) hdl->libzfs_printerr = printerr; } -static int -libzfs_module_loaded(const char *module) -{ - const char path_prefix[] = "/sys/module/"; - char path[256]; - - memcpy(path, path_prefix, sizeof (path_prefix) - 1); - strcpy(path + sizeof (path_prefix) - 1, module); - - return (access(path, F_OK) == 0); -} - - /* * Read lines from an open file descriptor and store them in an array of * strings until EOF. 
lines[] will be allocated and populated with all the @@ -736,17 +842,13 @@ libzfs_read_stdout_from_fd(int fd, char **lines[]) size_t len = 0; char *line = NULL; char **tmp_lines = NULL, **tmp; - char *nl = NULL; - int rc; fp = fdopen(fd, "r"); - if (fp == NULL) + if (fp == NULL) { + close(fd); return (0); - while (1) { - rc = getline(&line, &len, fp); - if (rc == -1) - break; - + } + while (getline(&line, &len, fp) != -1) { tmp = realloc(tmp_lines, sizeof (*tmp_lines) * (lines_cnt + 1)); if (tmp == NULL) { /* Return the lines we were able to process */ @@ -754,13 +856,16 @@ libzfs_read_stdout_from_fd(int fd, char **lines[]) } tmp_lines = tmp; - /* Terminate newlines */ - if ((nl = strchr(line, '\n')) != NULL) - *nl = '\0'; - tmp_lines[lines_cnt] = line; - lines_cnt++; - line = NULL; + /* Remove newline if not EOF */ + if (line[strlen(line) - 1] == '\n') + line[strlen(line) - 1] = '\0'; + + tmp_lines[lines_cnt] = strdup(line); + if (tmp_lines[lines_cnt] == NULL) + break; + ++lines_cnt; } + free(line); fclose(fp); *lines = tmp_lines; return (lines_cnt); @@ -778,13 +883,13 @@ libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, * Setup a pipe between our child and parent process if we're * reading stdout. 
*/ - if ((lines != NULL) && pipe(link) == -1) - return (-ESTRPIPE); + if (lines != NULL && pipe2(link, O_NONBLOCK | O_CLOEXEC) == -1) + return (-EPIPE); - pid = vfork(); + pid = fork(); if (pid == 0) { /* Child process */ - devnull_fd = open("/dev/null", O_WRONLY); + devnull_fd = open("/dev/null", O_WRONLY | O_CLOEXEC); if (devnull_fd < 0) _exit(-1); @@ -794,15 +899,11 @@ libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, else if (lines != NULL) { /* Save the output to lines[] */ dup2(link[1], STDOUT_FILENO); - close(link[0]); - close(link[1]); } if (!(flags & STDERR_VERBOSE)) (void) dup2(devnull_fd, STDERR_FILENO); - close(devnull_fd); - if (flags & NO_DEFAULT_PATH) { if (env == NULL) execv(path, argv); @@ -821,7 +922,8 @@ libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, int status; while ((error = waitpid(pid, &status, 0)) == -1 && - errno == EINTR) { } + errno == EINTR) + ; if (error < 0 || !WIFEXITED(status)) return (-1); @@ -897,85 +999,14 @@ libzfs_envvar_is_set(char *envvar) return (0); } -/* - * Verify the required ZFS_DEV device is available and optionally attempt - * to load the ZFS modules. Under normal circumstances the modules - * should already have been loaded by some external mechanism. - * - * Environment variables: - * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules. 
- * - ZFS_MODULE_TIMEOUT="" - Seconds to wait for ZFS_DEV - */ -static int -libzfs_load_module(const char *module) -{ - char *argv[4] = {"/sbin/modprobe", "-q", (char *)module, (char *)0}; - char *load_str, *timeout_str; - long timeout = 10; /* seconds */ - long busy_timeout = 10; /* milliseconds */ - int load = 0, fd; - hrtime_t start; - - /* Optionally request module loading */ - if (!libzfs_module_loaded(module)) { - load_str = getenv("ZFS_MODULE_LOADING"); - if (load_str) { - if (!strncasecmp(load_str, "YES", strlen("YES")) || - !strncasecmp(load_str, "ON", strlen("ON"))) - load = 1; - else - load = 0; - } - - if (load) { - if (libzfs_run_process("/sbin/modprobe", argv, 0)) - return (ENOEXEC); - } - - if (!libzfs_module_loaded(module)) - return (ENXIO); - } - - /* - * Device creation by udev is asynchronous and waiting may be - * required. Busy wait for 10ms and then fall back to polling every - * 10ms for the allowed timeout (default 10s, max 10m). This is - * done to optimize for the common case where the device is - * immediately available and to avoid penalizing the possible - * case where udev is slow or unable to create the device. 
- */ - timeout_str = getenv("ZFS_MODULE_TIMEOUT"); - if (timeout_str) { - timeout = strtol(timeout_str, NULL, 0); - timeout = MAX(MIN(timeout, (10 * 60)), 0); /* 0 <= N <= 600 */ - } - - start = gethrtime(); - do { - fd = open(ZFS_DEV, O_RDWR); - if (fd >= 0) { - (void) close(fd); - return (0); - } else if (errno != ENOENT) { - return (errno); - } else if (NSEC2MSEC(gethrtime() - start) < busy_timeout) { - sched_yield(); - } else { - usleep(10 * MILLISEC); - } - } while (NSEC2MSEC(gethrtime() - start) < (timeout * MILLISEC)); - - return (ENOENT); -} - libzfs_handle_t * libzfs_init(void) { libzfs_handle_t *hdl; int error; + char *env; - error = libzfs_load_module(ZFS_DRIVER); - if (error) { + if ((error = libzfs_load_module()) != 0) { errno = error; return (NULL); } @@ -984,28 +1015,18 @@ libzfs_init(void) return (NULL); } - if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { + if (regcomp(&hdl->libzfs_urire, URI_REGEX, 0) != 0) { free(hdl); return (NULL); } -#ifdef HAVE_SETMNTENT - if ((hdl->libzfs_mnttab = setmntent(MNTTAB, "r")) == NULL) { -#else - if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) { -#endif - (void) close(hdl->libzfs_fd); + if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL|O_CLOEXEC)) < 0) { free(hdl); return (NULL); } - hdl->libzfs_sharetab = fopen(ZFS_SHARETAB, "r"); - if (libzfs_core_init() != 0) { (void) close(hdl->libzfs_fd); - (void) fclose(hdl->libzfs_mnttab); - if (hdl->libzfs_sharetab) - (void) fclose(hdl->libzfs_sharetab); free(hdl); return (NULL); } @@ -1019,6 +1040,17 @@ libzfs_init(void) if (getenv("ZFS_PROP_DEBUG") != NULL) { hdl->libzfs_prop_debug = B_TRUE; } + if ((env = getenv("ZFS_SENDRECV_MAX_NVLIST")) != NULL) { + if ((error = zfs_nicestrtonum(hdl, env, + &hdl->libzfs_max_nvlist))) { + errno = error; + (void) close(hdl->libzfs_fd); + free(hdl); + return (NULL); + } + } else { + hdl->libzfs_max_nvlist = (SPA_MAXBLOCKSIZE * 4); + } /* * For testing, remove some settable properties and features @@ -1043,20 +1075,17 @@ void 
libzfs_fini(libzfs_handle_t *hdl) { (void) close(hdl->libzfs_fd); - if (hdl->libzfs_mnttab) -#ifdef HAVE_SETMNTENT - (void) endmntent(hdl->libzfs_mnttab); -#else - (void) fclose(hdl->libzfs_mnttab); -#endif - if (hdl->libzfs_sharetab) - (void) fclose(hdl->libzfs_sharetab); - zfs_uninit_libshare(hdl); zpool_free_handles(hdl); namespace_clear(hdl); libzfs_mnttab_fini(hdl); libzfs_core_fini(); + regfree(&hdl->libzfs_urire); fletcher_4_fini(); +#if LIBFETCH_DYNAMIC + if (hdl->libfetch != (void *)-1 && hdl->libfetch != NULL) + (void) dlclose(hdl->libfetch); + free(hdl->libfetch_load_error); +#endif free(hdl); } @@ -1085,11 +1114,10 @@ zfs_get_pool_handle(const zfs_handle_t *zhp) * fs/vol/snap/bkmark name. */ zfs_handle_t * -zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) +zfs_path_to_zhandle(libzfs_handle_t *hdl, const char *path, zfs_type_t argtype) { struct stat64 statbuf; struct extmnttab entry; - int ret; if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { /* @@ -1098,24 +1126,8 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) return (zfs_open(hdl, path, argtype)); } - if (stat64(path, &statbuf) != 0) { - (void) fprintf(stderr, "%s: %s\n", path, strerror(errno)); + if (getextmntent(path, &entry, &statbuf) != 0) return (NULL); - } - - /* Reopen MNTTAB to prevent reading stale data from open file */ - if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) - return (NULL); - - while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) { - if (makedevice(entry.mnt_major, entry.mnt_minor) == - statbuf.st_dev) { - break; - } - } - if (ret != 0) { - return (NULL); - } if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), @@ -1134,7 +1146,7 @@ int zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) - len = 16 * 1024; + len = 256 * 1024; zc->zc_nvlist_dst_size = len; zc->zc_nvlist_dst = 
(uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); @@ -1222,12 +1234,6 @@ zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) return (0); } -int -zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) -{ - return (ioctl(hdl->libzfs_fd, request, zc)); -} - /* * ================================================================ * API shared by zfs and zpool property management @@ -1533,7 +1539,12 @@ zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) fval *= pow(2, shift); - if (fval > UINT64_MAX) { + /* + * UINT64_MAX is not exactly representable as a double. + * The closest representation is UINT64_MAX + 1, so we + * use a >= comparison instead of > for the bounds check. + */ + if (fval >= (double)UINT64_MAX) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); @@ -1878,7 +1889,7 @@ typedef struct expand_data { zfs_type_t type; } expand_data_t; -int +static int zprop_expand_list_cb(int prop, void *cb) { zprop_list_t *entry; @@ -1954,36 +1965,6 @@ zfs_version_userland(char *version, int len) (void) strlcpy(version, ZFS_META_ALIAS, len); } -/* - * Fill given version buffer with zfs kernel version read from ZFS_SYSFS_DIR - * Returns 0 on success, and -1 on error (with errno set) - */ -int -zfs_version_kernel(char *version, int len) -{ - int _errno; - int fd; - int rlen; - - if ((fd = open(ZFS_SYSFS_DIR "/version", O_RDONLY)) == -1) - return (-1); - - if ((rlen = read(fd, version, len)) == -1) { - version[0] = '\0'; - _errno = errno; - (void) close(fd); - errno = _errno; - return (-1); - } - - version[rlen-1] = '\0'; /* discard '\n' */ - - if (close(fd) == -1) - return (-1); - - return (0); -} - /* * Prints both zfs userland and kernel versions * Returns 0 on success, and -1 on error (with errno set) @@ -1994,16 +1975,109 @@ zfs_version_print(void) char zver_userland[128]; char zver_kernel[128]; + zfs_version_userland(zver_userland, sizeof (zver_userland)); + + (void) printf("%s\n", 
zver_userland); + if (zfs_version_kernel(zver_kernel, sizeof (zver_kernel)) == -1) { fprintf(stderr, "zfs_version_kernel() failed: %s\n", strerror(errno)); return (-1); } - zfs_version_userland(zver_userland, sizeof (zver_userland)); - - (void) printf("%s\n", zver_userland); (void) printf("zfs-kmod-%s\n", zver_kernel); return (0); } + +/* + * Return 1 if the user requested ANSI color output, and our terminal supports + * it. Return 0 for no color. + */ +static int +use_color(void) +{ + static int use_color = -1; + char *term; + + /* + * Optimization: + * + * For each zpool invocation, we do a single check to see if we should + * be using color or not, and cache that value for the lifetime of the + * the zpool command. That makes it cheap to call use_color() when + * we're printing with color. We assume that the settings are not going + * to change during the invocation of a zpool command (the user isn't + * going to change the ZFS_COLOR value while zpool is running, for + * example). + */ + if (use_color != -1) { + /* + * We've already figured out if we should be using color or + * not. Return the cached value. + */ + return (use_color); + } + + term = getenv("TERM"); + /* + * The user sets the ZFS_COLOR env var set to enable zpool ANSI color + * output. However if NO_COLOR is set (https://no-color.org/) then + * don't use it. Also, don't use color if terminal doesn't support + * it. + */ + if (libzfs_envvar_is_set("ZFS_COLOR") && + !libzfs_envvar_is_set("NO_COLOR") && + isatty(STDOUT_FILENO) && term && strcmp("dumb", term) != 0 && + strcmp("unknown", term) != 0) { + /* Color supported */ + use_color = 1; + } else { + use_color = 0; + } + + return (use_color); +} + +/* + * color_start() and color_end() are used for when you want to colorize a block + * of text. 
For example: + * + * color_start(ANSI_RED_FG) + * printf("hello"); + * printf("world"); + * color_end(); + */ +void +color_start(char *color) +{ + if (use_color()) + printf("%s", color); +} + +void +color_end(void) +{ + if (use_color()) + printf(ANSI_RESET); +} + +/* printf() with a color. If color is NULL, then do a normal printf. */ +int +printf_color(char *color, char *format, ...) +{ + va_list aptr; + int rc; + + if (color) + color_start(color); + + va_start(aptr, format); + rc = vprintf(format, aptr); + va_end(aptr); + + if (color) + color_end(); + + return (rc); +} diff --git a/lib/libzfs/os/freebsd/libzfs_compat.c b/lib/libzfs/os/freebsd/libzfs_compat.c new file mode 100644 index 0000000000..f143f9cb63 --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_compat.c @@ -0,0 +1,360 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. 
+ */ +#include "../../libzfs_impl.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef IN_BASE +#define ZFS_KMOD "zfs" +#else +#define ZFS_KMOD "openzfs" +#endif + +void +libzfs_set_pipe_max(int infd) +{ + /* FreeBSD automatically resizes */ +} + +static int +execvPe(const char *name, const char *path, char * const *argv, + char * const *envp) +{ + const char **memp; + size_t cnt, lp, ln; + int eacces, save_errno; + char buf[MAXPATHLEN]; + const char *bp, *np, *op, *p; + struct stat sb; + + eacces = 0; + + /* If it's an absolute or relative path name, it's easy. */ + if (strchr(name, '/')) { + bp = name; + op = NULL; + goto retry; + } + bp = buf; + + /* If it's an empty path name, fail in the usual POSIX way. */ + if (*name == '\0') { + errno = ENOENT; + return (-1); + } + + op = path; + ln = strlen(name); + while (op != NULL) { + np = strchrnul(op, ':'); + + /* + * It's a SHELL path -- double, leading and trailing colons + * mean the current directory. + */ + if (np == op) { + /* Empty component. */ + p = "."; + lp = 1; + } else { + /* Non-empty component. */ + p = op; + lp = np - op; + } + + /* Advance to the next component or terminate after this. */ + if (*np == '\0') + op = NULL; + else + op = np + 1; + + /* + * If the path is too long complain. This is a possible + * security issue; given a way to make the path too long + * the user may execute the wrong program. 
+ */ + if (lp + ln + 2 > sizeof (buf)) { + (void) write(STDERR_FILENO, "execvP: ", 8); + (void) write(STDERR_FILENO, p, lp); + (void) write(STDERR_FILENO, ": path too long\n", + 16); + continue; + } + bcopy(p, buf, lp); + buf[lp] = '/'; + bcopy(name, buf + lp + 1, ln); + buf[lp + ln + 1] = '\0'; + +retry: (void) execve(bp, argv, envp); + switch (errno) { + case E2BIG: + goto done; + case ELOOP: + case ENAMETOOLONG: + case ENOENT: + break; + case ENOEXEC: + for (cnt = 0; argv[cnt]; ++cnt) + ; + + /* + * cnt may be 0 above; always allocate at least + * 3 entries so that we can at least fit "sh", bp, and + * the NULL terminator. We can rely on cnt to take into + * account the NULL terminator in all other scenarios, + * as we drop argv[0]. + */ + memp = alloca(MAX(3, cnt + 2) * sizeof (char *)); + if (memp == NULL) { + /* errno = ENOMEM; XXX override ENOEXEC? */ + goto done; + } + if (cnt > 0) { + memp[0] = argv[0]; + memp[1] = bp; + bcopy(argv + 1, memp + 2, + cnt * sizeof (char *)); + } else { + memp[0] = "sh"; + memp[1] = bp; + memp[2] = NULL; + } + (void) execve(_PATH_BSHELL, + __DECONST(char **, memp), envp); + goto done; + case ENOMEM: + goto done; + case ENOTDIR: + break; + case ETXTBSY: + /* + * We used to retry here, but sh(1) doesn't. + */ + goto done; + default: + /* + * EACCES may be for an inaccessible directory or + * a non-executable file. Call stat() to decide + * which. This also handles ambiguities for EFAULT + * and EIO, and undocumented errors like ESTALE. + * We hope that the race for a stat() is unimportant. + */ + save_errno = errno; + if (stat(bp, &sb) != 0) + break; + if (save_errno == EACCES) { + eacces = 1; + continue; + } + errno = save_errno; + goto done; + } + } + if (eacces) + errno = EACCES; + else + errno = ENOENT; +done: + return (-1); +} + +int +execvpe(const char *name, char * const argv[], char * const envp[]) +{ + const char *path; + + /* Get the path we're searching. 
*/ + if ((path = getenv("PATH")) == NULL) + path = _PATH_DEFPATH; + + return (execvPe(name, path, argv, envp)); +} + +#define ERRBUFLEN 256 + +static __thread char errbuf[ERRBUFLEN]; + +const char * +libzfs_error_init(int error) +{ + char *msg = errbuf; + size_t len, msglen = ERRBUFLEN; + + if (modfind("zfs") < 0) { + len = snprintf(msg, msglen, dgettext(TEXT_DOMAIN, + "Failed to load %s module: "), ZFS_KMOD); + msg += len; + msglen -= len; + } + + (void) snprintf(msg, msglen, "%s", strerror(error)); + + return (errbuf); +} + +int +zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) +{ + return (lzc_ioctl_fd(hdl->libzfs_fd, request, zc)); +} + +/* + * Verify the required ZFS_DEV device is available and optionally attempt + * to load the ZFS modules. Under normal circumstances the modules + * should already have been loaded by some external mechanism. + */ +int +libzfs_load_module(void) +{ + /* + * XXX: kldfind(ZFS_KMOD) would be nice here, but we retain + * modfind("zfs") so out-of-base openzfs userland works with the + * in-base module. + */ + if (modfind("zfs") < 0) { + /* Not present in kernel, try loading it. */ + if (kldload(ZFS_KMOD) < 0 && errno != EEXIST) { + return (errno); + } + } + return (0); +} + +int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) +{ + return (0); +} + +int +zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) +{ + return (0); +} + +int +find_shares_object(differ_info_t *di) +{ + return (0); +} + +int +zfs_destroy_snaps_nvl_os(libzfs_handle_t *hdl, nvlist_t *snaps) +{ + return (0); +} + +/* + * Attach/detach the given filesystem to/from the given jail. 
+ */ +int +zfs_jail(zfs_handle_t *zhp, int jailid, int attach) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_cmd_t zc = {"\0"}; + char errbuf[1024]; + unsigned long cmd; + int ret; + + if (attach) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name); + } + + switch (zhp->zfs_type) { + case ZFS_TYPE_VOLUME: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volumes can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_SNAPSHOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_BOOKMARK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bookmarks can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_POOL: + case ZFS_TYPE_FILESYSTEM: + /* OK */ + ; + } + assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_zoneid = jailid; + + cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL; + if ((ret = zfs_ioctl(hdl, cmd, &zc)) != 0) + zfs_standard_error(hdl, errno, errbuf); + + return (ret); +} + +/* + * Set loader options for next boot. + */ +int +zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid, + const char *command) +{ + zfs_cmd_t zc = {"\0"}; + nvlist_t *args; + int error; + + args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid); + fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid); + fnvlist_add_string(args, "command", command); + error = zcmd_write_src_nvlist(hdl, &zc, args); + if (error == 0) + error = zfs_ioctl(hdl, ZFS_IOC_NEXTBOOT, &zc); + zcmd_free_nvlists(&zc); + nvlist_free(args); + return (error); +} + +/* + * Fill given version buffer with zfs kernel version. 
+ * Returns 0 on success, and -1 on error (with errno set) + */ +int +zfs_version_kernel(char *version, int len) +{ + size_t l = len; + + return (sysctlbyname("vfs.zfs.version.module", + version, &l, NULL, 0)); +} diff --git a/lib/libzfs/os/freebsd/libzfs_zmount.c b/lib/libzfs/os/freebsd/libzfs_zmount.c new file mode 100644 index 0000000000..12317fdde3 --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_zmount.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file implements Solaris compatible zmount() function. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "../../libzfs_impl.h"
+
+/*
+ * Release a vector built by build_iovec().  The option names stored at the
+ * even indices were strdup()ed and are freed here; the values at the odd
+ * indices are borrowed from the caller and are left untouched.
+ */
+static void
+free_iovec(struct iovec *iov, int iovlen)
+{
+	int i;
+
+	for (i = 0; i < iovlen; i += 2)
+		free(iov[i].iov_base);
+	free(iov);
+}
+
+/*
+ * Append one name/value pair to the nmount(2) vector.
+ *
+ * iov/iovlen	vector and its current element count; updated in place.
+ * name		option name, copied with strdup().
+ * val		option value, stored by reference (may be NULL).
+ * len		size of val, or (size_t)-1 to use strlen(val) + 1
+ *		(0 when val is NULL).
+ *
+ * On allocation failure everything built so far is released, *iov is set
+ * to NULL and *iovlen to -1; further calls are then no-ops.
+ */
+static void
+build_iovec(struct iovec **iov, int *iovlen, const char *name, void *val,
+    size_t len)
+{
+	struct iovec *grown;
+	char *key;
+	int i;
+
+	if (*iovlen < 0)
+		return;
+	i = *iovlen;
+
+	/* Keep the old array reachable if realloc() fails. */
+	grown = realloc(*iov, sizeof (**iov) * (i + 2));
+	if (grown != NULL)
+		*iov = grown;
+	key = (grown != NULL) ? strdup(name) : NULL;
+	if (key == NULL) {
+		free_iovec(*iov, i);
+		*iov = NULL;
+		*iovlen = -1;
+		return;
+	}
+
+	(*iov)[i].iov_base = key;
+	(*iov)[i].iov_len = strlen(name) + 1;
+	i++;
+	(*iov)[i].iov_base = val;
+	if (len == (size_t)-1) {
+		if (val != NULL)
+			len = strlen(val) + 1;
+		else
+			len = 0;
+	}
+	(*iov)[i].iov_len = (int)len;
+	*iovlen = ++i;
+}
+
+/*
+ * Mount `spec` on `dir` through nmount(2).
+ *
+ * "update" is requested when the option string contains MNTOPT_REMOUNT and
+ * "ro" when MS_RDONLY is set in mflag.  Every token of optptr (split on
+ * ',' and '/') is passed as a valueless option.  dataptr/datalen must be
+ * NULL/0 and fstype must be MNTTYPE_ZFS.
+ *
+ * Returns 0 on success, the errno from nmount() on failure, or ENOMEM when
+ * the option vector could not be allocated.
+ */
+static int
+do_mount_(const char *spec, const char *dir, int mflag, char *fstype,
+    char *dataptr, int datalen, char *optptr, int optlen)
+{
+	struct iovec *iov;
+	char *optstr, *p, *tofree;
+	int iovlen, rv, error;
+
+	assert(spec != NULL);
+	assert(dir != NULL);
+	assert(fstype != NULL);
+	assert(strcmp(fstype, MNTTYPE_ZFS) == 0);
+	assert(dataptr == NULL);
+	assert(datalen == 0);
+	assert(optptr != NULL);
+	assert(optlen > 0);
+
+	tofree = optstr = strdup(optptr);
+	assert(optstr != NULL);
+
+	iov = NULL;
+	iovlen = 0;
+	if (strstr(optstr, MNTOPT_REMOUNT) != NULL)
+		build_iovec(&iov, &iovlen, "update", NULL, 0);
+	if (mflag & MS_RDONLY)
+		build_iovec(&iov, &iovlen, "ro", NULL, 0);
+	build_iovec(&iov, &iovlen, "fstype", fstype, (size_t)-1);
+	build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, dir),
+	    (size_t)-1);
+	build_iovec(&iov, &iovlen, "from", __DECONST(char *, spec), (size_t)-1);
+	while ((p = strsep(&optstr, ",/")) != NULL)
+		build_iovec(&iov, &iovlen, p, NULL, (size_t)-1);
+
+	/* build_iovec() already released the vector if it ran out of memory. */
+	if (iovlen < 0) {
+		free(tofree);
+		return (ENOMEM);
+	}
+
+	rv = nmount(iov, iovlen, 0);
+	/* Capture errno before the cleanup below has a chance to change it. */
+	error = (rv < 0) ? errno : rv;
+
+	free_iovec(iov, iovlen);
+	free(tofree);
+	return (error);
+}
+
+/*
+ * Mount the dataset behind zhp on mntpt with the comma separated option
+ * string opts.  Returns 0 on success or an errno value.
+ */
+int
+do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags)
+{
+	/*
+	 * Pass the real size of the option string (including its NUL).
+	 * The previous sizeof (mntpt) was merely the size of a pointer.
+	 */
+	return (do_mount_(zfs_get_name(zhp), mntpt, flags, MNTTYPE_ZFS, NULL, 0,
+	    opts, opts != NULL ? (int)strlen(opts) + 1 : 0));
+}
+
+int
+do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags) +{ + if (unmount(mntpt, flags) < 0) + return (errno); + return (0); +} + +int +zfs_mount_delegation_check(void) +{ + return (0); +} + +/* Called from the tail end of zpool_disable_datasets() */ +void +zpool_disable_datasets_os(zpool_handle_t *zhp, boolean_t force) +{ +} + +/* Called from the tail end of zfs_unmount() */ +void +zpool_disable_volume_os(const char *name) +{ +} diff --git a/lib/libzfs/os/linux/libzfs_mount_os.c b/lib/libzfs/os/linux/libzfs_mount_os.c new file mode 100644 index 0000000000..29fea736b6 --- /dev/null +++ b/lib/libzfs/os/linux/libzfs_mount_os.c @@ -0,0 +1,425 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2021 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright 2017 RackTop Systems. + * Copyright (c) 2018 Datto Inc. + * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "../../libzfs_impl.h"
+#include
+
+#define ZS_COMMENT 0x00000000 /* comment: sets no zfs flag bit */
+#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */
+
+typedef struct option_map {
+	const char *name;	/* option name as written in the option string */
+	unsigned long mntmask;	/* MS_* bits OR'ed into the mount flags */
+	unsigned long zfsmask;	/* ZS_* bits OR'ed into the zfs flags */
+} option_map_t;
+
+static const option_map_t option_map[] = {
+	/* Canonicalized filesystem independent options from mount(8) */
+	{ MNTOPT_NOAUTO, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_DEFAULTS, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_NODEVICES, MS_NODEV, ZS_COMMENT },
+	{ MNTOPT_DEVICES, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_DIRSYNC, MS_DIRSYNC, ZS_COMMENT },
+	{ MNTOPT_NOEXEC, MS_NOEXEC, ZS_COMMENT },
+	{ MNTOPT_EXEC, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_GROUP, MS_GROUP, ZS_COMMENT },
+	{ MNTOPT_NETDEV, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_NOFAIL, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_NOSUID, MS_NOSUID, ZS_COMMENT },
+	{ MNTOPT_SUID, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_OWNER, MS_OWNER, ZS_COMMENT },
+	{ MNTOPT_REMOUNT, MS_REMOUNT, ZS_COMMENT },
+	{ MNTOPT_RO, MS_RDONLY, ZS_COMMENT },
+	{ MNTOPT_RW, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_SYNC, MS_SYNCHRONOUS, ZS_COMMENT },
+	{ MNTOPT_USER, MS_USERS, ZS_COMMENT },
+	{ MNTOPT_USERS, MS_USERS, ZS_COMMENT },
+	/* acl flags passed with util-linux-2.24 mount command */
+	{ MNTOPT_ACL, MS_POSIXACL, ZS_COMMENT },
+	{ MNTOPT_NOACL, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_POSIXACL, MS_POSIXACL, ZS_COMMENT },
+#ifdef MS_NOATIME
+	{ MNTOPT_NOATIME, MS_NOATIME, ZS_COMMENT },
+	{ MNTOPT_ATIME, MS_COMMENT, ZS_COMMENT },
+#endif
+#ifdef MS_NODIRATIME
+	{ MNTOPT_NODIRATIME, MS_NODIRATIME, ZS_COMMENT },
+	{ MNTOPT_DIRATIME, MS_COMMENT, ZS_COMMENT },
+#endif
+#ifdef MS_RELATIME
+	{ MNTOPT_RELATIME, MS_RELATIME, ZS_COMMENT },
+	{ MNTOPT_NORELATIME, MS_COMMENT, ZS_COMMENT },
+#endif
+#ifdef MS_STRICTATIME
+	{ MNTOPT_STRICTATIME, MS_STRICTATIME, ZS_COMMENT },
+	{ MNTOPT_NOSTRICTATIME, MS_COMMENT, ZS_COMMENT },
+#endif
+#ifdef MS_LAZYTIME
+	{ MNTOPT_LAZYTIME, MS_LAZYTIME, ZS_COMMENT },
+#endif
+	{ MNTOPT_CONTEXT, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_FSCONTEXT, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_COMMENT },
+#ifdef MS_I_VERSION
+	{ MNTOPT_IVERSION, MS_I_VERSION, ZS_COMMENT },
+#endif
+#ifdef MS_MANDLOCK
+	{ MNTOPT_NBMAND, MS_MANDLOCK, ZS_COMMENT },
+	{ MNTOPT_NONBMAND, MS_COMMENT, ZS_COMMENT },
+#endif
+	/* Valid options not found in mount(8) */
+	{ MNTOPT_BIND, MS_BIND, ZS_COMMENT },
+#ifdef MS_REC
+	{ MNTOPT_RBIND, MS_BIND|MS_REC, ZS_COMMENT },
+#endif
+	{ MNTOPT_COMMENT, MS_COMMENT, ZS_COMMENT },
+#ifdef MS_NOSUB
+	{ MNTOPT_NOSUB, MS_NOSUB, ZS_COMMENT },
+#endif
+#ifdef MS_SILENT
+	{ MNTOPT_QUIET, MS_SILENT, ZS_COMMENT },
+#endif
+	/* Custom zfs options */
+	{ MNTOPT_XATTR, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_NOXATTR, MS_COMMENT, ZS_COMMENT },
+	{ MNTOPT_ZFSUTIL, MS_COMMENT, ZS_ZFSUTIL },
+	{ NULL, 0, 0 } };	/* NULL name terminates the table */
+
+/*
+ * Break the mount option into a name/value pair. The name is
+ * validated against the option map and mount flags set accordingly.
+ */ +static int +parse_option(char *mntopt, unsigned long *mntflags, + unsigned long *zfsflags, int sloppy) +{ + const option_map_t *opt; + char *ptr, *name, *value = NULL; + int error = 0; + + name = strdup(mntopt); + if (name == NULL) + return (ENOMEM); + + for (ptr = name; ptr && *ptr; ptr++) { + if (*ptr == '=') { + *ptr = '\0'; + value = ptr+1; + VERIFY3P(value, !=, NULL); + break; + } + } + + for (opt = option_map; opt->name != NULL; opt++) { + if (strncmp(name, opt->name, strlen(name)) == 0) { + *mntflags |= opt->mntmask; + *zfsflags |= opt->zfsmask; + error = 0; + goto out; + } + } + + if (!sloppy) + error = ENOENT; +out: + /* If required further process on the value may be done here */ + free(name); + return (error); +} + +/* + * Translate the mount option string in to MS_* mount flags for the + * kernel vfs. When sloppy is non-zero unknown options will be ignored + * otherwise they are considered fatal are copied in to badopt. + */ +int +zfs_parse_mount_options(char *mntopts, unsigned long *mntflags, + unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt) +{ + int error = 0, quote = 0, flag = 0, count = 0; + char *ptr, *opt, *opts; + + opts = strdup(mntopts); + if (opts == NULL) + return (ENOMEM); + + *mntflags = 0; + opt = NULL; + + /* + * Scan through all mount options which must be comma delimited. + * We must be careful to notice regions which are double quoted + * and skip commas in these regions. Each option is then checked + * to determine if it is a known option. 
+ */ + for (ptr = opts; ptr && !flag; ptr++) { + if (opt == NULL) + opt = ptr; + + if (*ptr == '"') + quote = !quote; + + if (quote) + continue; + + if (*ptr == '\0') + flag = 1; + + if ((*ptr == ',') || (*ptr == '\0')) { + *ptr = '\0'; + + error = parse_option(opt, mntflags, zfsflags, sloppy); + if (error) { + strcpy(badopt, opt); + goto out; + + } + + if (!(*mntflags & MS_REMOUNT) && + !(*zfsflags & ZS_ZFSUTIL) && + mtabopt != NULL) { + if (count > 0) + strlcat(mtabopt, ",", MNT_LINE_MAX); + + strlcat(mtabopt, opt, MNT_LINE_MAX); + count++; + } + + opt = NULL; + } + } + +out: + free(opts); + return (error); +} + +static void +append_mntopt(const char *name, const char *val, char *mntopts, + char *mtabopt, boolean_t quote) +{ + char tmp[MNT_LINE_MAX]; + + snprintf(tmp, MNT_LINE_MAX, quote ? ",%s=\"%s\"" : ",%s=%s", name, val); + + if (mntopts) + strlcat(mntopts, tmp, MNT_LINE_MAX); + + if (mtabopt) + strlcat(mtabopt, tmp, MNT_LINE_MAX); +} + +static void +zfs_selinux_setcontext(zfs_handle_t *zhp, zfs_prop_t zpt, const char *name, + char *mntopts, char *mtabopt) +{ + char context[ZFS_MAXPROPLEN]; + + if (zfs_prop_get(zhp, zpt, context, sizeof (context), + NULL, NULL, 0, B_FALSE) == 0) { + if (strcmp(context, "none") != 0) + append_mntopt(name, context, mntopts, mtabopt, B_TRUE); + } +} + +void +zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, + char *mntopts, char *mtabopt) +{ + char prop[ZFS_MAXPROPLEN]; + + /* + * Checks to see if the ZFS_PROP_SELINUX_CONTEXT exists + * if it does, create a tmp variable in case it's needed + * checks to see if the selinux context is set to the default + * if it is, allow the setting of the other context properties + * this is needed because the 'context' property overrides others + * if it is not the default, set the 'context' property + */ + if (zfs_prop_get(zhp, ZFS_PROP_SELINUX_CONTEXT, prop, sizeof (prop), + NULL, NULL, 0, B_FALSE) == 0) { + if (strcmp(prop, "none") == 0) { + zfs_selinux_setcontext(zhp, 
ZFS_PROP_SELINUX_FSCONTEXT, + MNTOPT_FSCONTEXT, mntopts, mtabopt); + zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_DEFCONTEXT, + MNTOPT_DEFCONTEXT, mntopts, mtabopt); + zfs_selinux_setcontext(zhp, + ZFS_PROP_SELINUX_ROOTCONTEXT, MNTOPT_ROOTCONTEXT, + mntopts, mtabopt); + } else { + append_mntopt(MNTOPT_CONTEXT, prop, + mntopts, mtabopt, B_TRUE); + } + } + + /* A hint used to determine an auto-mounted snapshot mount point */ + append_mntopt(MNTOPT_MNTPOINT, mntpoint, mntopts, NULL, B_FALSE); +} + +/* + * By default the filesystem by preparing the mount options (i.e. parsing + * some flags from the "opts" parameter into the "flags" parameter) and then + * directly calling the system call mount(2). We don't need the mount utility + * or update /etc/mtab, because this is a symlink on all modern systems. + * + * If the environment variable ZFS_MOUNT_HELPER is set, we fall back to the + * previous behavior: + * The filesystem is mounted by invoking the system mount utility rather + * than by the system call mount(2). This ensures that the /etc/mtab + * file is correctly locked for the update. Performing our own locking + * and /etc/mtab update requires making an unsafe assumption about how + * the mount utility performs its locking. Unfortunately, this also means + * in the case of a mount failure we do not have the exact errno. We must + * make due with return value from the mount process. 
+ */ +int +do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) +{ + const char *src = zfs_get_name(zhp); + int error = 0; + + if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + char badopt[MNT_LINE_MAX] = {0}; + unsigned long mntflags = flags, zfsflags; + char myopts[MNT_LINE_MAX] = {0}; + + if (zfs_parse_mount_options(opts, &mntflags, + &zfsflags, 0, badopt, NULL)) { + return (EINVAL); + } + strlcat(myopts, opts, MNT_LINE_MAX); + zfs_adjust_mount_options(zhp, mntpt, myopts, NULL); + if (mount(src, mntpt, MNTTYPE_ZFS, mntflags, myopts)) { + return (errno); + } + } else { + char *argv[9] = { + "/bin/mount", + "--no-canonicalize", + "-t", MNTTYPE_ZFS, + "-o", opts, + (char *)src, + (char *)mntpt, + (char *)NULL }; + + /* Return only the most critical mount error */ + error = libzfs_run_process(argv[0], argv, + STDOUT_VERBOSE|STDERR_VERBOSE); + if (error) { + if (error & MOUNT_FILEIO) { + error = EIO; + } else if (error & MOUNT_USER) { + error = EINTR; + } else if (error & MOUNT_SOFTWARE) { + error = EPIPE; + } else if (error & MOUNT_BUSY) { + error = EBUSY; + } else if (error & MOUNT_SYSERR) { + error = EAGAIN; + } else if (error & MOUNT_USAGE) { + error = EINVAL; + } else + error = ENXIO; /* Generic error */ + } + } + + return (error); +} + +int +do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags) +{ + if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + int rv = umount2(mntpt, flags); + + return (rv < 0 ? errno : 0); + } + + char force_opt[] = "-f"; + char lazy_opt[] = "-l"; + char *argv[7] = { + "/bin/umount", + "-t", MNTTYPE_ZFS, + NULL, NULL, NULL, NULL }; + int rc, count = 3; + + if (flags & MS_FORCE) { + argv[count] = force_opt; + count++; + } + + if (flags & MS_DETACH) { + argv[count] = lazy_opt; + count++; + } + + argv[count] = (char *)mntpt; + rc = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); + + return (rc ? EINVAL : 0); +} + +int +zfs_mount_delegation_check(void) +{ + return ((geteuid() != 0) ? 
EACCES : 0); +} + +/* Called from the tail end of zpool_disable_datasets() */ +void +zpool_disable_datasets_os(zpool_handle_t *zhp, boolean_t force) +{ +} + +/* Called from the tail end of zfs_unmount() */ +void +zpool_disable_volume_os(const char *name) +{ +} diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c new file mode 100644 index 0000000000..90eb8db507 --- /dev/null +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -0,0 +1,342 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation. 
+ * Copyright (c) 2018, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "../../libzfs_impl.h" +#include "zfs_comutil.h" +#include "zfeature_common.h" + +/* + * If the device has being dynamically expanded then we need to relabel + * the disk to use the new unallocated space. + */ +int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) +{ + int fd, error; + + if ((fd = open(path, O_RDWR|O_DIRECT|O_CLOEXEC)) < 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to open device: %d"), path, errno); + return (zfs_error(hdl, EZFS_OPENFAILED, msg)); + } + + /* + * It's possible that we might encounter an error if the device + * does not have any unallocated space left. If so, we simply + * ignore that error and continue on. + */ + error = efi_use_whole_disk(fd); + + /* Flush the buffers to disk and invalidate the page cache. */ + (void) fsync(fd); + (void) ioctl(fd, BLKFLSBUF); + + (void) close(fd); + if (error && error != VT_ENOSPC) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to read disk capacity"), path); + return (zfs_error(hdl, EZFS_NOCAP, msg)); + } + return (0); +} + +/* + * Read the EFI label from the config, if a label does not exist then + * pass back the error to the caller. If the caller has passed a non-NULL + * diskaddr argument then we set it to the starting address of the EFI + * partition. 
+ */ +static int +read_efi_label(nvlist_t *config, diskaddr_t *sb) +{ + char *path; + int fd; + char diskname[MAXPATHLEN]; + int err = -1; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) + return (err); + + (void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT, + strrchr(path, '/')); + if ((fd = open(diskname, O_RDONLY|O_DIRECT|O_CLOEXEC)) >= 0) { + struct dk_gpt *vtoc; + + if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { + if (sb != NULL) + *sb = vtoc->efi_parts[0].p_start; + efi_free(vtoc); + } + (void) close(fd); + } + return (err); +} + +/* + * determine where a partition starts on a disk in the current + * configuration + */ +static diskaddr_t +find_start_block(nvlist_t *config) +{ + nvlist_t **child; + uint_t c, children; + diskaddr_t sb = MAXOFFSET_T; + uint64_t wholedisk; + + if (nvlist_lookup_nvlist_array(config, + ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) != 0 || !wholedisk) { + return (MAXOFFSET_T); + } + if (read_efi_label(config, &sb) < 0) + sb = MAXOFFSET_T; + return (sb); + } + + for (c = 0; c < children; c++) { + sb = find_start_block(child[c]); + if (sb != MAXOFFSET_T) { + return (sb); + } + } + return (MAXOFFSET_T); +} + +static int +zpool_label_disk_check(char *path) +{ + struct dk_gpt *vtoc; + int fd, err; + + if ((fd = open(path, O_RDONLY|O_DIRECT|O_CLOEXEC)) < 0) + return (errno); + + if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { + (void) close(fd); + return (err); + } + + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + return (EIDRM); + } + + efi_free(vtoc); + (void) close(fd); + return (0); +} + +/* + * Generate a unique partition name for the ZFS member. Partitions must + * have unique names to ensure udev will be able to create symlinks under + * /dev/disk/by-partlabel/ for all pool members. The partition names are + * of the form -. 
+ */
+static void
+zpool_label_name(char *label_name, int label_size)
+{
+	uint64_t id = 0;
+	int fd;
+
+	fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC);
+	if (fd >= 0) {
+		if (read(fd, &id, sizeof (id)) != sizeof (id))
+			id = 0;	/* short read: use the fallback below */
+
+		close(fd);
+	}
+
+	/* Fall back to rand() when /dev/urandom gave us nothing usable. */
+	if (id == 0)
+		id = (((uint64_t)rand()) << 32) | (uint64_t)rand();
+
+	snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id);
+}
+
+/*
+ * Label an individual disk. The name provided is the short name,
+ * stripped of any leading /dev path.
+ *
+ * Partition 0 receives the data slice and partition 8 the reserved
+ * slice.  After writing, the label is rescanned, the partition device is
+ * waited for, and the label is read back and verified.
+ *
+ * Returns 0 on success; on failure an auxiliary error is recorded on hdl
+ * and the result of zfs_error() is returned.
+ */
+int
+zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name)
+{
+	char path[MAXPATHLEN];
+	struct dk_gpt *vtoc;
+	int rval, fd;
+	size_t resv = EFI_MIN_RESV_SIZE;
+	uint64_t slice_size;
+	diskaddr_t start_block;
+	char errbuf[1024];
+
+	/* prepare an error message just in case */
+	(void) snprintf(errbuf, sizeof (errbuf),
+	    dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
+
+	if (zhp) {
+		nvlist_t *nvroot;
+
+		verify(nvlist_lookup_nvlist(zhp->zpool_config,
+		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+
+		/* Look the start block up once and cache it on the handle. */
+		if (zhp->zpool_start_block == 0)
+			start_block = find_start_block(nvroot);
+		else
+			start_block = zhp->zpool_start_block;
+		zhp->zpool_start_block = start_block;
+	} else {
+		/* new pool */
+		start_block = NEW_START_BLOCK;
+	}
+
+	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
+
+	if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL|O_CLOEXEC)) < 0) {
+		/*
+		 * This shouldn't happen. We've long since verified that this
+		 * is a valid device.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "label '%s': unable to open device: %d"), path, errno);
+		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
+	}
+
+	if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
+		/*
+		 * The only way this can fail is if we run out of memory, or we
+		 * were unable to read the disk's capacity
+		 */
+		if (errno == ENOMEM)
+			(void) no_memory(hdl);
+
+		(void) close(fd);
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "label '%s': unable to read disk capacity"), path);
+
+		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
+	}
+
+	/*
+	 * The data slice is what remains of the usable LBAs after taking
+	 * off the reserved area and the start offset, aligned down to
+	 * PARTITION_END_ALIGNMENT.
+	 */
+	slice_size = vtoc->efi_last_u_lba + 1;
+	slice_size -= EFI_MIN_RESV_SIZE;
+	if (start_block == MAXOFFSET_T)
+		start_block = NEW_START_BLOCK;
+	slice_size -= start_block;
+	slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT);
+
+	vtoc->efi_parts[0].p_start = start_block;
+	vtoc->efi_parts[0].p_size = slice_size;
+
+	/*
+	 * Why we use V_USR: V_BACKUP confuses users, and is considered
+	 * disposable by some EFI utilities (since EFI doesn't have a backup
+	 * slice). V_UNASSIGNED is supposed to be used only for zero size
+	 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT,
+	 * etc. were all pretty specific. V_USR is as close to reality as we
+	 * can get, in the absence of V_OTHER.
+	 */
+	vtoc->efi_parts[0].p_tag = V_USR;
+	zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN);
+
+	vtoc->efi_parts[8].p_start = slice_size + start_block;
+	vtoc->efi_parts[8].p_size = resv;
+	vtoc->efi_parts[8].p_tag = V_RESERVED;
+
+	rval = efi_write(fd, vtoc);
+
+	/* Flush the buffers to disk and invalidate the page cache. */
+	(void) fsync(fd);
+	(void) ioctl(fd, BLKFLSBUF);
+
+	if (rval == 0)
+		rval = efi_rescan(fd);
+
+	/*
+	 * Some block drivers (like pcata) may not support EFI GPT labels.
+	 * Print out a helpful error message directing the user to manually
+	 * label the disk and give a specific slice.
+	 */
+	if (rval != 0) {
+		(void) close(fd);
+		efi_free(vtoc);
+
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using "
+		    "parted(8) and then provide a specific slice: %d"), rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
+	}
+
+	(void) close(fd);
+	efi_free(vtoc);
+
+	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
+	(void) zfs_append_partition(path, MAXPATHLEN);
+
+	/* Wait for udev to signal that the device has settled. */
+	rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT);
+	if (rval) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
+		    "detect device partitions on '%s': %d"), path, rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
+	}
+
+	/* We can't be too paranoid. Read the label back and verify it. */
+	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
+	rval = zpool_label_disk_check(path);
+	if (rval) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written "
+		    "EFI label on '%s' is damaged. Ensure\nthis device "
+		    "is not in use, and is functioning properly: %d"),
+		    path, rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
+	}
+	return (0);
+}
diff --git a/lib/libzfs/os/linux/libzfs_sendrecv_os.c b/lib/libzfs/os/linux/libzfs_sendrecv_os.c
new file mode 100644
index 0000000000..593c38ec62
--- /dev/null
+++ b/lib/libzfs/os/linux/libzfs_sendrecv_os.c
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+
+#include
+
+#include "../../libzfs_impl.h"
+
+#ifndef F_SETPIPE_SZ
+#define	F_SETPIPE_SZ (F_SETLEASE + 7)
+#endif /* F_SETPIPE_SZ */
+
+#ifndef F_GETPIPE_SZ
+#define	F_GETPIPE_SZ (F_GETLEASE + 7)
+#endif /* F_GETPIPE_SZ */
+
+void
+libzfs_set_pipe_max(int infd)
+{
+	FILE *procf = fopen("/proc/sys/fs/pipe-max-size", "re");
+
+	if (procf != NULL) {
+		unsigned long max_psize;
+		long cur_psize;
+		if (fscanf(procf, "%lu", &max_psize) > 0) {
+			cur_psize = fcntl(infd, F_GETPIPE_SZ);
+			if (cur_psize > 0 &&
+			    max_psize > (unsigned long) cur_psize)
+				fcntl(infd, F_SETPIPE_SZ,
+				    max_psize);
+		}
+		fclose(procf);
+	}
+}
diff --git a/lib/libzfs/os/linux/libzfs_util_os.c b/lib/libzfs/os/linux/libzfs_util_os.c
new file mode 100644
index 0000000000..2ac31f1077
--- /dev/null
+++ b/lib/libzfs/os/linux/libzfs_util_os.c
@@ -0,0 +1,263 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "../../libzfs_impl.h"
+#include "zfs_prop.h"
+#include
+#include
+
+#define	ZDIFF_SHARESDIR		"/.zfs/shares/"
+
+/*
+ * Issue the given ZFS ioctl on the handle's libzfs_fd descriptor and
+ * return the ioctl(2) result unchanged.
+ */
+int
+zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc)
+{
+	return (ioctl(hdl->libzfs_fd, request, zc));
+}
+
+/*
+ * Translate an errno value into a localized, user-facing message.  Codes
+ * without a dedicated message get a generic initialization-failure text.
+ */
+const char *
+libzfs_error_init(int error)
+{
+	switch (error) {
+	case ENXIO:
+		return (dgettext(TEXT_DOMAIN, "The ZFS modules are not "
+		    "loaded.\nTry running '/sbin/modprobe zfs' as root "
+		    "to load them."));
+	case ENOENT:
+		return (dgettext(TEXT_DOMAIN, "/dev/zfs and /proc/self/mounts "
+		    "are required.\nTry running 'udevadm trigger' and 'mount "
+		    "-t proc proc /proc' as root."));
+	case ENOEXEC:
+		return (dgettext(TEXT_DOMAIN, "The ZFS modules cannot be "
+		    "auto-loaded.\nTry running '/sbin/modprobe zfs' as "
+		    "root to manually load them."));
+	case EACCES:
+		return (dgettext(TEXT_DOMAIN, "Permission denied the "
+		    "ZFS utilities must be run as root."));
+	default:
+		return (dgettext(TEXT_DOMAIN, "Failed to initialize the "
+		    "libzfs library."));
+	}
+}
+
+/*
+ * Return non-zero if /sys/module/<module> exists, zero otherwise
+ * (including when the resulting path would not fit in the local buffer).
+ */
+static int
+libzfs_module_loaded(const char *module)
+{
+	const char path_prefix[] = "/sys/module/";
+	char path[256];
+
+	/*
+	 * Bounded copies: a module name too long for path is reported as
+	 * not loaded instead of overrunning the buffer.
+	 */
+	(void) strlcpy(path, path_prefix, sizeof (path));
+	if (strlcat(path, module, sizeof (path)) >= sizeof (path))
+		return (0);
+
+	return (access(path, F_OK) == 0);
+}
+
+/*
+ * Verify the required ZFS_DEV device is available and optionally attempt
+ * to load the ZFS modules.  Under normal circumstances the modules
+ * should already have been loaded by some external mechanism.
+ *
+ * Environment variables:
+ * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules.
+ * - ZFS_MODULE_TIMEOUT="<seconds>"     - Seconds to wait for ZFS_DEV
+ *
+ * Returns 0 once ZFS_DEV can be opened, ENOEXEC if modprobe fails, ENXIO
+ * if the module is not loaded, ENOENT if ZFS_DEV does not appear before
+ * the timeout, or any other errno from open(2).
+ */
+static int
+libzfs_load_module_impl(const char *module)
+{
+	char *argv[4] = {"/sbin/modprobe", "-q", (char *)module, (char *)0};
+	char *load_str, *timeout_str;
+	long timeout = 10; /* seconds */
+	long busy_timeout = 10; /* milliseconds */
+	int load = 0, fd;
+	hrtime_t start;
+
+	/* Optionally request module loading */
+	if (!libzfs_module_loaded(module)) {
+		load_str = getenv("ZFS_MODULE_LOADING");
+		if (load_str) {
+			if (!strncasecmp(load_str, "YES", strlen("YES")) ||
+			    !strncasecmp(load_str, "ON", strlen("ON")))
+				load = 1;
+			else
+				load = 0;
+		}
+
+		if (load) {
+			if (libzfs_run_process("/sbin/modprobe", argv, 0))
+				return (ENOEXEC);
+		}
+
+		if (!libzfs_module_loaded(module))
+			return (ENXIO);
+	}
+
+	/*
+	 * Device creation by udev is asynchronous and waiting may be
+	 * required.  Busy wait for 10ms and then fall back to polling every
+	 * 10ms for the allowed timeout (default 10s, max 10m).  This is
+	 * done to optimize for the common case where the device is
+	 * immediately available and to avoid penalizing the possible
+	 * case where udev is slow or unable to create the device.
+	 */
+	timeout_str = getenv("ZFS_MODULE_TIMEOUT");
+	if (timeout_str) {
+		timeout = strtol(timeout_str, NULL, 0);
+		timeout = MAX(MIN(timeout, (10 * 60)), 0); /* 0 <= N <= 600 */
+	}
+
+	start = gethrtime();
+	do {
+		fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC);
+		if (fd >= 0) {
+			(void) close(fd);
+			return (0);
+		} else if (errno != ENOENT) {
+			return (errno);
+		} else if (NSEC2MSEC(gethrtime() - start) < busy_timeout) {
+			sched_yield();
+		} else {
+			usleep(10 * MILLISEC);
+		}
+	} while (NSEC2MSEC(gethrtime() - start) < (timeout * MILLISEC));
+
+	return (ENOENT);
+}
+
+/*
+ * Public entry point: run the availability check above for ZFS_DRIVER
+ * and return its result.
+ */
+int
+libzfs_load_module(void)
+{
+	return (libzfs_load_module_impl(ZFS_DRIVER));
+}
+
+/*
+ * Stat <di->dsmnt>/.zfs/shares/ and store its inode number in di->shares.
+ * Returns 0 on success; if the stat fails, an error message is written to
+ * di->errbuf and the result of zfs_error(..., EZFS_DIFF, ...) is returned.
+ */
+int
+find_shares_object(differ_info_t *di)
+{
+	char fullpath[MAXPATHLEN];
+	struct stat64 sb = { 0 };
+
+	(void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN);
+	(void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN);
+
+	if (stat64(fullpath, &sb) != 0) {
+		(void) snprintf(di->errbuf, sizeof (di->errbuf),
+		    dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath);
+		return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf));
+	}
+
+	di->shares = (uint64_t)sb.st_ino;
+	return (0);
+}
+
+/*
+ * Stub implementation: ignores both arguments and always returns 0.
+ */
+int
+zfs_destroy_snaps_nvl_os(libzfs_handle_t *hdl, nvlist_t *snaps)
+{
+	return (0);
+}
+
+/*
+ * Fill given version buffer with zfs kernel version read from ZFS_SYSFS_DIR.
+ * The last byte read (normally the trailing '\n') is replaced by the
+ * terminating NUL; an empty read yields an empty string.
+ * Returns 0 on success, and -1 on error (with errno set).
+ */
+int
+zfs_version_kernel(char *version, int len)
+{
+	int _errno;
+	int fd;
+	int rlen;
+
+	if ((fd = open(ZFS_SYSFS_DIR "/version", O_RDONLY | O_CLOEXEC)) == -1)
+		return (-1);
+
+	if ((rlen = read(fd, version, len)) == -1) {
+		version[0] = '\0';
+		_errno = errno;
+		(void) close(fd);
+		errno = _errno;
+		return (-1);
+	}
+
+	/*
+	 * Replace the last byte read with NUL.  An empty read has no such
+	 * byte, so terminate at the start instead of indexing version[-1].
+	 */
+	if (rlen > 0)
+		version[rlen-1] = '\0'; /* discard '\n' */
+	else if (len > 0)
+		version[0] = '\0';
+
+	if (close(fd) == -1)
+		return (-1);
+
+	return (0);
+}
diff --git a/lib/libzfs_core/.gitignore b/lib/libzfs_core/.gitignore
new file mode 100644
index 0000000000..c428d63690
--- /dev/null
+++ b/lib/libzfs_core/.gitignore
@@ -0,0 +1 @@
+/libzfs_core.pc diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am index 421b8b4bfb..64cb76f199 100644 --- a/lib/libzfs_core/Makefile.am +++ b/lib/libzfs_core/Makefile.am @@ -1,20 +1,53 @@ include $(top_srcdir)/config/Rules.am -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +pkgconfig_DATA = libzfs_core.pc + +AM_CFLAGS += -fvisibility=hidden lib_LTLIBRARIES = libzfs_core.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ libzfs_core.c -nodist_libzfs_core_la_SOURCES = $(USER_C) +if BUILD_LINUX +USER_C += \ + os/linux/libzfs_core_ioctl.c +endif + +if BUILD_FREEBSD +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs + +USER_C += \ + os/freebsd/libzfs_core_ioctl.c + +VPATH += $(top_srcdir)/module/os/freebsd/zfs + +nodist_libzfs_core_la_SOURCES = zfs_ioctl_compat.c +endif + +libzfs_core_la_SOURCES = $(USER_C) libzfs_core_la_LIBADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libuutil/libuutil.la + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libspl/libspl.la -libzfs_core_la_LDFLAGS = -version-info 1:0:0 +libzfs_core_la_LIBADD += $(LTLIBINTL) -EXTRA_DIST = $(USER_C) +libzfs_core_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libzfs_core_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libzfs_core_la_LIBADD += -lutil -lgeom +endif + +libzfs_core_la_LDFLAGS += -version-info 3:0:0 + +include $(top_srcdir)/config/CppCheck.am + +# Library ABI +EXTRA_DIST = libzfs_core.abi libzfs_core.suppr diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi new file mode 100644 index 0000000000..5bed6c8e0f --- /dev/null +++ b/lib/libzfs_core/libzfs_core.abi @@ -0,0 +1,2139 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 99fc84d046..cbe486d08b 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -20,11 +20,12 @@ */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. */ /* @@ -52,7 +53,7 @@ * * - Thin Layer. libzfs_core is a thin layer, marshaling arguments * to/from the kernel ioctls. There is generally a 1:1 correspondence - * between libzfs_core functions and ioctls to /dev/zfs. + * between libzfs_core functions and ioctls to ZFS_DEV. * * - Clear Atomicity. Because libzfs_core functions are generally 1:1 * with kernel ioctls, and kernel ioctls are general atomic, each @@ -84,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +97,7 @@ static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; static int g_refcount; #ifdef ZFS_DEBUG -static zfs_ioc_t fail_ioc_cmd; +static zfs_ioc_t fail_ioc_cmd = ZFS_IOC_LAST; static zfs_errno_t fail_ioc_err; static void @@ -116,7 +118,7 @@ libzfs_core_debug_ioc(void) * cannot checkpoint 'tank': the loaded zfs module does not support * this operation. A reboot may be required to enable this operation. 
*/ - if (fail_ioc_cmd == 0) { + if (fail_ioc_cmd == ZFS_IOC_LAST) { char *ioc_test = getenv("ZFS_IOC_TEST"); unsigned int ioc_num = 0, ioc_err = 0; @@ -135,7 +137,7 @@ libzfs_core_init(void) { (void) pthread_mutex_lock(&g_lock); if (g_refcount == 0) { - g_fd = open("/dev/zfs", O_RDWR); + g_fd = open(ZFS_DEV, O_RDWR|O_CLOEXEC); if (g_fd < 0) { (void) pthread_mutex_unlock(&g_lock); return (errno); @@ -156,8 +158,7 @@ libzfs_core_fini(void) (void) pthread_mutex_lock(&g_lock); ASSERT3S(g_refcount, >, 0); - if (g_refcount > 0) - g_refcount--; + g_refcount--; if (g_refcount == 0 && g_fd != -1) { (void) close(g_fd); @@ -208,7 +209,7 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, } } - while (ioctl(g_fd, ioc, &zc) != 0) { + while (lzc_ioctl_fd(g_fd, ioc, &zc) != 0) { /* * If ioctl exited with ENOMEM, we retry the ioctl after * increasing the size of the destination nvlist. @@ -291,13 +292,13 @@ lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) * The promote ioctl is still legacy, so we need to construct our * own zfs_cmd_t rather than using lzc_ioctl(). 
*/ - zfs_cmd_t zc = { "\0" }; + zfs_cmd_t zc = {"\0"}; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); - if (ioctl(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) { + if (lzc_ioctl_fd(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) { int error = errno; if (error == EEXIST && snapnamebuf != NULL) (void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen); @@ -306,30 +307,22 @@ lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) return (0); } -int -lzc_remap(const char *fsname) -{ - int error; - nvlist_t *args = fnvlist_alloc(); - error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL); - nvlist_free(args); - return (error); -} - int lzc_rename(const char *source, const char *target) { - zfs_cmd_t zc = { "\0" }; + zfs_cmd_t zc = {"\0"}; int error; + ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, source, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - error = ioctl(g_fd, ZFS_IOC_RENAME, &zc); + error = lzc_ioctl_fd(g_fd, ZFS_IOC_RENAME, &zc); if (error != 0) error = errno; return (error); } + int lzc_destroy(const char *fsname) { @@ -475,7 +468,7 @@ lzc_exists(const char *dataset) VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); + return (lzc_ioctl_fd(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); } /* @@ -499,7 +492,7 @@ lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) * The snapshots must all be in the same pool. * The value is the name of the hold (string type). * - * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL). + * If cleanup_fd is not -1, it must be the result of open(ZFS_DEV, O_EXCL). * In this case, when the cleanup_fd is closed (including on process * termination), the holds will be released. 
If the system is shut down * uncleanly, the holds will be released when the pool is next opened @@ -632,12 +625,42 @@ int lzc_send(const char *snapname, const char *from, int fd, enum lzc_send_flags flags) { - return (lzc_send_resume(snapname, from, fd, flags, 0, 0)); + return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, + NULL)); +} + +int +lzc_send_redacted(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, const char *redactbook) +{ + return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, + redactbook)); } int lzc_send_resume(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) +{ + return (lzc_send_resume_redacted(snapname, from, fd, flags, resumeobj, + resumeoff, NULL)); +} + +/* + * snapname: The name of the "tosnap", or the snapshot whose contents we are + * sending. + * from: The name of the "fromsnap", or the incremental source. + * fd: File descriptor to write the stream to. + * flags: flags that determine features to be used by the stream. + * resumeobj: Object to resume from, for resuming send + * resumeoff: Offset to resume from, for resuming send. + * redactnv: nvlist of string -> boolean(ignored) containing the names of all + * the snapshots that we should redact with respect to. + * redactbook: Name of the redaction bookmark to create. 
+ */ +int +lzc_send_resume_redacted(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, + const char *redactbook) { nvlist_t *args; int err; @@ -654,10 +677,15 @@ lzc_send_resume(const char *snapname, const char *from, int fd, fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_RAW) fnvlist_add_boolean(args, "rawok"); + if (flags & LZC_SEND_FLAG_SAVED) + fnvlist_add_boolean(args, "savedok"); if (resumeobj != 0 || resumeoff != 0) { fnvlist_add_uint64(args, "resume_object", resumeobj); fnvlist_add_uint64(args, "resume_offset", resumeoff); } + if (redactbook != NULL) + fnvlist_add_string(args, "redactbook", redactbook); + err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); nvlist_free(args); return (err); @@ -676,11 +704,13 @@ lzc_send_resume(const char *snapname, const char *from, int fd, * are traversed, looking for blocks with a birth time since the creation TXG of * the snapshot this bookmark was created from. This will result in * significantly more I/O and be less efficient than a send space estimation on - * an equivalent snapshot. + * an equivalent snapshot. This process is also used if redact_snaps is + * non-null. 
*/ int -lzc_send_space(const char *snapname, const char *from, - enum lzc_send_flags flags, uint64_t *spacep) +lzc_send_space_resume_redacted(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, + uint64_t resume_bytes, const char *redactbook, int fd, uint64_t *spacep) { nvlist_t *args; nvlist_t *result; @@ -697,6 +727,16 @@ lzc_send_space(const char *snapname, const char *from, fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_RAW) fnvlist_add_boolean(args, "rawok"); + if (resumeobj != 0 || resumeoff != 0) { + fnvlist_add_uint64(args, "resume_object", resumeobj); + fnvlist_add_uint64(args, "resume_offset", resumeoff); + fnvlist_add_uint64(args, "bytes", resume_bytes); + } + if (redactbook != NULL) + fnvlist_add_string(args, "redactbook", redactbook); + if (fd != -1) + fnvlist_add_int32(args, "fd", fd); + err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); nvlist_free(args); if (err == 0) @@ -705,6 +745,14 @@ lzc_send_space(const char *snapname, const char *from, return (err); } +int +lzc_send_space(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t *spacep) +{ + return (lzc_send_space_resume_redacted(snapname, from, flags, 0, 0, 0, + NULL, -1, spacep)); +} + static int recv_read(int fd, void *buf, int ilen) { @@ -736,14 +784,14 @@ static int recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, - const dmu_replay_record_t *begin_record, int cleanup_fd, - uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, - nvlist_t **errors) + const dmu_replay_record_t *begin_record, uint64_t *read_bytes, + uint64_t *errflags, nvlist_t **errors) { dmu_replay_record_t drr; char fsname[MAXPATHLEN]; char *atp; int error; + boolean_t payload = B_FALSE; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); @@ -774,13 
+822,13 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, return (error); } else { drr = *begin_record; + payload = (begin_record->drr_payloadlen != 0); } /* - * Raw receives, resumable receives, and receives that include a - * wrapping key all use the new interface. + * All receives with a payload should use the new interface. */ - if (resumable || raw || wkeydata != NULL) { + if (resumable || raw || wkeydata != NULL || payload) { nvlist_t *outnvl = NULL; nvlist_t *innvl = fnvlist_alloc(); @@ -820,12 +868,6 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, if (resumable) fnvlist_add_boolean(innvl, "resumable"); - if (cleanup_fd >= 0) - fnvlist_add_int32(innvl, "cleanup_fd", cleanup_fd); - - if (action_handle != NULL) - fnvlist_add_uint64(innvl, "action_handle", - *action_handle); error = lzc_ioctl(ZFS_IOC_RECV_NEW, fsname, innvl, &outnvl); @@ -837,10 +879,6 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, error = nvlist_lookup_uint64(outnvl, "error_flags", errflags); - if (error == 0 && action_handle != NULL) - error = nvlist_lookup_uint64(outnvl, "action_handle", - action_handle); - if (error == 0 && errors != NULL) { nvlist_t *nvl; error = nvlist_lookup_nvlist(outnvl, "errors", &nvl); @@ -883,17 +921,11 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, zc.zc_cleanup_fd = -1; zc.zc_action_handle = 0; - if (cleanup_fd >= 0) - zc.zc_cleanup_fd = cleanup_fd; - - if (action_handle != NULL) - zc.zc_action_handle = *action_handle; - zc.zc_nvlist_dst_size = 128 * 1024; zc.zc_nvlist_dst = (uint64_t)(uintptr_t) malloc(zc.zc_nvlist_dst_size); - error = ioctl(g_fd, ZFS_IOC_RECV, &zc); + error = lzc_ioctl_fd(g_fd, ZFS_IOC_RECV, &zc); if (error != 0) { error = errno; } else { @@ -903,9 +935,6 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, if (errflags != NULL) *errflags = zc.zc_obj; - if (action_handle != NULL) - *action_handle = 
zc.zc_action_handle; - if (errors != NULL) VERIFY0(nvlist_unpack( (void *)(uintptr_t)zc.zc_nvlist_dst, @@ -938,7 +967,7 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - B_FALSE, raw, fd, NULL, -1, NULL, NULL, NULL, NULL)); + B_FALSE, raw, fd, NULL, NULL, NULL, NULL)); } /* @@ -952,7 +981,7 @@ lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - B_TRUE, raw, fd, NULL, -1, NULL, NULL, NULL, NULL)); + B_TRUE, raw, fd, NULL, NULL, NULL, NULL)); } /* @@ -975,7 +1004,7 @@ lzc_receive_with_header(const char *snapname, nvlist_t *props, return (EINVAL); return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - resumable, raw, fd, begin_record, -1, NULL, NULL, NULL, NULL)); + resumable, raw, fd, begin_record, NULL, NULL, NULL)); } /* @@ -991,22 +1020,21 @@ lzc_receive_with_header(const char *snapname, nvlist_t *props, * The 'errflags' value will contain zprop_errflags_t flags which are * used to describe any failures. * - * The 'action_handle' is used to pass the handle for this guid/ds mapping. - * It should be set to zero on first call and will contain an updated handle - * on success, it should be passed in subsequent calls. + * The 'action_handle' and 'cleanup_fd' are no longer used, and are ignored. * * The 'errors' nvlist contains an entry for each unapplied received * property. Callers are responsible for freeing this nvlist. 
*/ -int lzc_receive_one(const char *snapname, nvlist_t *props, +int +lzc_receive_one(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - resumable, raw, input_fd, begin_record, cleanup_fd, read_bytes, - errflags, action_handle, errors)); + resumable, raw, input_fd, begin_record, + read_bytes, errflags, errors)); } /* @@ -1017,7 +1045,8 @@ int lzc_receive_one(const char *snapname, nvlist_t *props, * exclude ('zfs receive -x') properties. Callers are responsible for freeing * this nvlist */ -int lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, +int +lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, @@ -1025,8 +1054,8 @@ int lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, nvlist_t **errors) { return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, - force, resumable, raw, input_fd, begin_record, cleanup_fd, - read_bytes, errflags, action_handle, errors)); + force, resumable, raw, input_fd, begin_record, + read_bytes, errflags, errors)); } /* @@ -1080,11 +1109,13 @@ lzc_rollback_to(const char *fsname, const char *snapname) } /* - * Creates bookmarks. + * Creates new bookmarks from existing snapshot or bookmark. * - * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to - * the name of the snapshot (e.g. "pool/fs@snap"). All the bookmarks and - * snapshots must be in the same pool. + * The bookmarks nvlist maps from the full name of the new bookmark to + * the full name of the source snapshot or bookmark. 
+ * All the bookmarks and snapshots must be in the same pool. + * The new bookmarks names must be unique. + * => see function dsl_bookmark_create_nvl_validate * * The returned results nvlist will have an entry for each bookmark that failed. * The value will be the (int32) error code. @@ -1099,7 +1130,7 @@ lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; - /* determine the pool name */ + /* determine pool name from first bookmark */ elem = nvlist_next_nvpair(bookmarks, NULL); if (elem == NULL) return (0); @@ -1118,19 +1149,33 @@ lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) * parameter is an nvlist of property names (with no values) that will be * returned for each bookmark. * - * The following are valid properties on bookmarks, all of which are numbers - * (represented as uint64 in the nvlist) + * The following are valid properties on bookmarks, most of which are numbers + * (represented as uint64 in the nvlist), except redact_snaps, which is a + * uint64 array, and redact_complete, which is a boolean * * "guid" - globally unique identifier of the snapshot it refers to * "createtxg" - txg when the snapshot it refers to was created * "creation" - timestamp when the snapshot it refers to was created * "ivsetguid" - IVset guid for identifying encrypted snapshots + * "redact_snaps" - list of guids of the redaction snapshots for the specified + * bookmark. If the bookmark is not a redaction bookmark, the nvlist will + * not contain an entry for this value. If it is redacted with respect to + * no snapshots, it will contain value -> NULL uint64 array + * "redact_complete" - boolean value; true if the redaction bookmark is + * complete, false otherwise. * * The format of the returned nvlist as follows: * -> { * -> { * "value" -> uint64 * } + * ... 
+ * "redact_snaps" -> { + * "value" -> uint64 array + * } + * "redact_complete" -> { + * "value" -> boolean value + * } * } */ int @@ -1139,6 +1184,33 @@ lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks) return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks)); } +/* + * Get bookmark properties. + * + * Given a bookmark's full name, retrieve all properties for the bookmark. + * + * The format of the returned property list is as follows: + * { + * -> { + * "value" -> uint64 + * } + * ... + * "redact_snaps" -> { + * "value" -> uint64 array + * } + */ +int +lzc_get_bookmark_props(const char *bookmark, nvlist_t **props) +{ + int error; + + nvlist_t *innvl = fnvlist_alloc(); + error = lzc_ioctl(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, innvl, props); + fnvlist_free(innvl); + + return (error); +} + /* * Destroys bookmarks. * @@ -1479,3 +1551,92 @@ lzc_trim(const char *poolname, pool_trim_func_t cmd_type, uint64_t rate, return (error); } + +/* + * Create a redaction bookmark named bookname by redacting snapshot with respect + * to all the snapshots in snapnv. 
+ */ +int +lzc_redact(const char *snapshot, const char *bookname, nvlist_t *snapnv) +{ + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "bookname", bookname); + fnvlist_add_nvlist(args, "snapnv", snapnv); + int error = lzc_ioctl(ZFS_IOC_REDACT, snapshot, args, NULL); + fnvlist_free(args); + return (error); +} + +static int +wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, + uint64_t tag, boolean_t *waited) +{ + nvlist_t *args = fnvlist_alloc(); + nvlist_t *result = NULL; + + fnvlist_add_int32(args, ZPOOL_WAIT_ACTIVITY, activity); + if (use_tag) + fnvlist_add_uint64(args, ZPOOL_WAIT_TAG, tag); + + int error = lzc_ioctl(ZFS_IOC_WAIT, pool, args, &result); + + if (error == 0 && waited != NULL) + *waited = fnvlist_lookup_boolean_value(result, + ZPOOL_WAIT_WAITED); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + +int +lzc_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) +{ + return (wait_common(pool, activity, B_FALSE, 0, waited)); +} + +int +lzc_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, + boolean_t *waited) +{ + return (wait_common(pool, activity, B_TRUE, tag, waited)); +} + +int +lzc_wait_fs(const char *fs, zfs_wait_activity_t activity, boolean_t *waited) +{ + nvlist_t *args = fnvlist_alloc(); + nvlist_t *result = NULL; + + fnvlist_add_int32(args, ZFS_WAIT_ACTIVITY, activity); + + int error = lzc_ioctl(ZFS_IOC_WAIT_FS, fs, args, &result); + + if (error == 0 && waited != NULL) + *waited = fnvlist_lookup_boolean_value(result, + ZFS_WAIT_WAITED); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + +/* + * Set the bootenv contents for the given pool. + */ +int +lzc_set_bootenv(const char *pool, const nvlist_t *env) +{ + return (lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, (nvlist_t *)env, NULL)); +} + +/* + * Get the contents of the bootenv of the given pool. 
+ */
+int
+lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
+{
+	return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
+}
diff --git a/lib/libzfs/libzfs_core.pc.in b/lib/libzfs_core/libzfs_core.pc.in
similarity index 50%
rename from lib/libzfs/libzfs_core.pc.in
rename to lib/libzfs_core/libzfs_core.pc.in
index 2b6a86bfa4..bc9582ea33 100644
--- a/lib/libzfs/libzfs_core.pc.in
+++ b/lib/libzfs_core/libzfs_core.pc.in
@@ -6,6 +6,8 @@ includedir=@includedir@
 Name: libzfs_core
 Description: LibZFS core library
 Version: @VERSION@
-URL: http://zfsonlinux.org
+URL: https://github.com/openzfs/zfs
+Requires.private: @LIBBLKID_PC@ @LIBUUID_PC@ @LIBTIRPC_PC@ @ZLIB_PC@
 Cflags: -I${includedir}/libzfs -I${includedir}/libspl
-Libs: -L${libdir} -lzfs_core
+Libs: -L${libdir} -lzfs_core -lnvpair
+Libs.private: @LIBCLOCK_GETTIME@ @LIBUDEV_LIBS@ -lm -pthread
diff --git a/lib/libzfs_core/libzfs_core.suppr b/lib/libzfs_core/libzfs_core.suppr
new file mode 100644
index 0000000000..109d331cfd
--- /dev/null
+++ b/lib/libzfs_core/libzfs_core.suppr
@@ -0,0 +1,5 @@
+[suppress_type]
+	name = FILE*
+
+[suppress_type]
+	name = pthread_cond_t
diff --git a/lib/libzfs_core/os/freebsd/libzfs_core_ioctl.c b/lib/libzfs_core/os/freebsd/libzfs_core_ioctl.c
new file mode 100644
index 0000000000..b8394886d0
--- /dev/null
+++ b/lib/libzfs_core/os/freebsd/libzfs_core_ioctl.c
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int zfs_ioctl_version = ZFS_IOCVER_UNDEF;
+
+/*
+ * Get zfs_ioctl_version from the vfs.zfs.version.ioctl sysctl.  The result
+ * of sysctlbyname() is not checked; ver starts out as ZFS_IOCVER_NONE.
+ */
+static int
+get_zfs_ioctl_version(void)
+{
+	size_t ver_size;
+	int ver = ZFS_IOCVER_NONE;
+
+	ver_size = sizeof (ver);
+	sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0);
+
+	return (ver);
+}
+
+/*
+ * Wrap zc in a zfs_iocparm_t and issue the ioctl.  For
+ * ZFS_CMD_COMPAT_LEGACY the request and command are converted to the
+ * legacy layout first, and the result is converted back on success.
+ * Returns the ioctl(2) result, or -1 with errno set to ENOMEM if the
+ * legacy command buffer cannot be allocated.
+ */
+static int
+zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag)
+{
+	int newrequest, ret;
+	void *zc_c = NULL;
+	unsigned long ncmd;
+	zfs_iocparm_t zp;
+
+	switch (cflag) {
+	case ZFS_CMD_COMPAT_NONE:
+		ncmd = _IOWR('Z', request, zfs_iocparm_t);
+		zp.zfs_cmd = (uint64_t)(uintptr_t)zc;
+		zp.zfs_cmd_size = sizeof (zfs_cmd_t);
+		zp.zfs_ioctl_version = ZFS_IOCVER_OZFS;
+		break;
+	case ZFS_CMD_COMPAT_LEGACY:
+		newrequest = zfs_ioctl_ozfs_to_legacy(request);
+		ncmd = _IOWR('Z', newrequest, zfs_iocparm_t);
+		zc_c = malloc(sizeof (zfs_cmd_legacy_t));
+		if (zc_c == NULL) {
+			errno = ENOMEM;
+			return (-1);
+		}
+		zfs_cmd_ozfs_to_legacy(zc, zc_c);
+		zp.zfs_cmd = (uint64_t)(uintptr_t)zc_c;
+		zp.zfs_cmd_size = sizeof (zfs_cmd_legacy_t);
+		zp.zfs_ioctl_version = ZFS_IOCVER_LEGACY;
+		break;
+	default:
+		abort();
+		return (EINVAL);
+	}
+
+	ret = ioctl(fd, ncmd, &zp);
+	if (ret) {
+		if (zc_c)
+			free(zc_c);
+		return (ret);
+	}
+	if (zc_c) {
+		zfs_cmd_legacy_to_ozfs(zc_c, zc);
+		free(zc_c);
+	}
+	return (ret);
+}
+
+/*
+ * This is the FreeBSD version of ioctl, because Solaris' ioctl() updates
+ * zc_nvlist_dst_size even if an error is returned; on FreeBSD, if an
+ * error is returned, zc_nvlist_dst_size won't be updated.  A successful
+ * call that reports a larger zc_nvlist_dst_size than was passed in is
+ * therefore converted to -1 with errno set to ENOMEM.
+ */
+int
+lzc_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc)
+{
+	size_t oldsize;
+	int ret, cflag = ZFS_CMD_COMPAT_NONE;
+
+	if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
+		zfs_ioctl_version = get_zfs_ioctl_version();
+
+	switch (zfs_ioctl_version) {
+	case ZFS_IOCVER_LEGACY:
+		cflag = ZFS_CMD_COMPAT_LEGACY;
+		break;
+	case ZFS_IOCVER_OZFS:
+		cflag = ZFS_CMD_COMPAT_NONE;
+		break;
+	default:
+		errx(1, "unrecognized zfs ioctl version %d",
+		    zfs_ioctl_version);
+	}
+
+	oldsize = zc->zc_nvlist_dst_size;
+	ret = zcmd_ioctl_compat(fd, request, zc, cflag);
+
+	if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) {
+		ret = -1;
+		errno = ENOMEM;
+	}
+
+	return (ret);
+}
diff --git a/lib/libzfs_core/os/linux/libzfs_core_ioctl.c b/lib/libzfs_core/os/linux/libzfs_core_ioctl.c
new file mode 100644
index 0000000000..9b44a4e3be
--- /dev/null
+++ b/lib/libzfs_core/os/linux/libzfs_core_ioctl.c
@@ -0,0 +1,30 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include +#include + +int +lzc_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +{ + return (ioctl(fd, request, zc)); +} diff --git a/lib/libzfsbootenv/.gitignore b/lib/libzfsbootenv/.gitignore new file mode 100644 index 0000000000..3fea5c642d --- /dev/null +++ b/lib/libzfsbootenv/.gitignore @@ -0,0 +1 @@ +/libzfsbootenv.pc diff --git a/lib/libzfsbootenv/Makefile.am b/lib/libzfsbootenv/Makefile.am new file mode 100644 index 0000000000..0c454a5e03 --- /dev/null +++ b/lib/libzfsbootenv/Makefile.am @@ -0,0 +1,41 @@ +include $(top_srcdir)/config/Rules.am + +pkgconfig_DATA = libzfsbootenv.pc + +AM_CFLAGS += -fvisibility=hidden + +lib_LTLIBRARIES = libzfsbootenv.la + +include $(top_srcdir)/config/Abigail.am + +if BUILD_FREEBSD +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs +endif +if BUILD_LINUX +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/linux/zfs +endif + +USER_C = \ + lzbe_device.c \ + lzbe_pair.c \ + lzbe_util.c + +dist_libzfsbootenv_la_SOURCES = \ + $(USER_C) + +libzfsbootenv_la_LIBADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +libzfsbootenv_la_LDFLAGS = + +if !ASAN_ENABLED +libzfsbootenv_la_LDFLAGS += -Wl,-z,defs +endif + +libzfsbootenv_la_LDFLAGS += -version-info 1:0:0 + +include $(top_srcdir)/config/CppCheck.am + +# Library ABI +EXTRA_DIST = libzfsbootenv.abi libzfsbootenv.suppr diff --git a/lib/libzfsbootenv/libzfsbootenv.abi b/lib/libzfsbootenv/libzfsbootenv.abi new file mode 100644 index 0000000000..86ec25cf84 --- /dev/null +++ b/lib/libzfsbootenv/libzfsbootenv.abi @@ -0,0 +1,201 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzfsbootenv/libzfsbootenv.pc.in b/lib/libzfsbootenv/libzfsbootenv.pc.in new file mode 100644 index 0000000000..986286d9bc --- /dev/null +++ b/lib/libzfsbootenv/libzfsbootenv.pc.in @@ -0,0 +1,12 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: libzfsbootenv +Description: LibZFSBootENV library +Version: @VERSION@ +URL: https://github.com/openzfs/zfs +Requires: libzfs +Cflags: -I${includedir} +Libs: -L${libdir} -lzfsbootenv diff --git a/lib/libzfsbootenv/libzfsbootenv.suppr b/lib/libzfsbootenv/libzfsbootenv.suppr new file mode 100644 index 0000000000..f4db8a49e4 --- /dev/null +++ b/lib/libzfsbootenv/libzfsbootenv.suppr @@ -0,0 +1,2 @@ +[suppress_type] + name = FILE* diff --git a/lib/libzfsbootenv/lzbe_device.c b/lib/libzfsbootenv/lzbe_device.c new file mode 100644 index 0000000000..2d9c7b749e --- /dev/null +++ b/lib/libzfsbootenv/lzbe_device.c @@ -0,0 +1,163 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2020 Toomas Soome + */ + +#include +#include +#include +#include +#include +#include + +/* + * Store device name to zpool label bootenv area. + * This call will set bootenv version to VB_NVLIST, if bootenv currently + * does contain other version, then old data will be replaced. 
+ */ +int +lzbe_set_boot_device(const char *pool, lzbe_flags_t flag, const char *device) +{ + libzfs_handle_t *hdl; + zpool_handle_t *zphdl; + nvlist_t *nv; + char *descriptor; + uint64_t version; + int rv = -1; + + if (pool == NULL || *pool == '\0') + return (rv); + + if ((hdl = libzfs_init()) == NULL) + return (rv); + + zphdl = zpool_open(hdl, pool); + if (zphdl == NULL) { + libzfs_fini(hdl); + return (rv); + } + + switch (flag) { + case lzbe_add: + rv = zpool_get_bootenv(zphdl, &nv); + if (rv == 0) { + /* + * We got the nvlist, check for version. + * if version is missing or is not VB_NVLIST, + * create new list. + */ + rv = nvlist_lookup_uint64(nv, BOOTENV_VERSION, + &version); + if (rv == 0 && version == VB_NVLIST) + break; + + /* Drop this nvlist */ + fnvlist_free(nv); + } + fallthrough; + case lzbe_replace: + nv = fnvlist_alloc(); + break; + default: + return (rv); + } + + /* version is mandatory */ + fnvlist_add_uint64(nv, BOOTENV_VERSION, VB_NVLIST); + + /* + * If device name is empty, remove boot device configuration. + */ + if ((device == NULL || *device == '\0')) { + if (nvlist_exists(nv, OS_BOOTONCE)) + fnvlist_remove(nv, OS_BOOTONCE); + } else { + /* + * Use device name directly if it does start with + * prefix "zfs:". Otherwise, add prefix and suffix. + */ + if (strncmp(device, "zfs:", 4) == 0) { + fnvlist_add_string(nv, OS_BOOTONCE, device); + } else { + if (asprintf(&descriptor, "zfs:%s:", device) > 0) { + fnvlist_add_string(nv, OS_BOOTONCE, descriptor); + free(descriptor); + } else + rv = ENOMEM; + } + } + + rv = zpool_set_bootenv(zphdl, nv); + if (rv != 0) + fprintf(stderr, "%s\n", libzfs_error_description(hdl)); + + fnvlist_free(nv); + zpool_close(zphdl); + libzfs_fini(hdl); + return (rv); +} + +/* + * Return boot device name from bootenv, if set. 
+ */ +int +lzbe_get_boot_device(const char *pool, char **device) +{ + libzfs_handle_t *hdl; + zpool_handle_t *zphdl; + nvlist_t *nv; + char *val; + int rv = -1; + + if (pool == NULL || *pool == '\0' || device == NULL) + return (rv); + + if ((hdl = libzfs_init()) == NULL) + return (rv); + + zphdl = zpool_open(hdl, pool); + if (zphdl == NULL) { + libzfs_fini(hdl); + return (rv); + } + + rv = zpool_get_bootenv(zphdl, &nv); + if (rv == 0) { + rv = nvlist_lookup_string(nv, OS_BOOTONCE, &val); + if (rv == 0) { + /* + * zfs device descriptor is in form of "zfs:dataset:", + * we only do need dataset name. + */ + if (strncmp(val, "zfs:", 4) == 0) { + val += 4; + val = strdup(val); + if (val != NULL) { + size_t len = strlen(val); + + if (val[len - 1] == ':') + val[len - 1] = '\0'; + *device = val; + } else { + rv = ENOMEM; + } + } else { + rv = EINVAL; + } + } + nvlist_free(nv); + } + + zpool_close(zphdl); + libzfs_fini(hdl); + return (rv); +} diff --git a/lib/libzfsbootenv/lzbe_pair.c b/lib/libzfsbootenv/lzbe_pair.c new file mode 100644 index 0000000000..831355ba4b --- /dev/null +++ b/lib/libzfsbootenv/lzbe_pair.c @@ -0,0 +1,347 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2020 Toomas Soome + */ + +#include +#include +#include +#include +#include +#include + +/* + * Get or create nvlist. If key is not NULL, get nvlist from bootenv, + * otherwise return bootenv. 
+ */ +int +lzbe_nvlist_get(const char *pool, const char *key, void **ptr) +{ + libzfs_handle_t *hdl; + zpool_handle_t *zphdl; + nvlist_t *nv; + int rv = -1; + + if (pool == NULL || *pool == '\0') + return (rv); + + if ((hdl = libzfs_init()) == NULL) { + return (rv); + } + + zphdl = zpool_open(hdl, pool); + if (zphdl == NULL) { + libzfs_fini(hdl); + return (rv); + } + + rv = zpool_get_bootenv(zphdl, &nv); + if (rv == 0) { + nvlist_t *nvl, *dup; + + if (key != NULL) { + rv = nvlist_lookup_nvlist(nv, key, &nvl); + if (rv == 0) { + rv = nvlist_dup(nvl, &dup, 0); + nvlist_free(nv); + if (rv == 0) + nv = dup; + else + nv = NULL; + } else { + nvlist_free(nv); + rv = nvlist_alloc(&nv, NV_UNIQUE_NAME, 0); + } + } + *ptr = nv; + } + + zpool_close(zphdl); + libzfs_fini(hdl); + return (rv); +} + +int +lzbe_nvlist_set(const char *pool, const char *key, void *ptr) +{ + libzfs_handle_t *hdl; + zpool_handle_t *zphdl; + nvlist_t *nv; + uint64_t version; + int rv = -1; + + if (pool == NULL || *pool == '\0') + return (rv); + + if ((hdl = libzfs_init()) == NULL) { + return (rv); + } + + zphdl = zpool_open(hdl, pool); + if (zphdl == NULL) { + libzfs_fini(hdl); + return (rv); + } + + if (key != NULL) { + rv = zpool_get_bootenv(zphdl, &nv); + if (rv == 0) { + /* + * We got the nvlist, check for version. + * if version is missing or is not VB_NVLIST, + * create new list. 
+ */ + rv = nvlist_lookup_uint64(nv, BOOTENV_VERSION, + &version); + if (rv != 0 || version != VB_NVLIST) { + /* Drop this nvlist */ + fnvlist_free(nv); + /* Create and prepare new nvlist */ + nv = fnvlist_alloc(); + fnvlist_add_uint64(nv, BOOTENV_VERSION, + VB_NVLIST); + } + rv = nvlist_add_nvlist(nv, key, ptr); + if (rv == 0) + rv = zpool_set_bootenv(zphdl, nv); + nvlist_free(nv); + } + } else { + rv = zpool_set_bootenv(zphdl, ptr); + } + + zpool_close(zphdl); + libzfs_fini(hdl); + return (rv); +} + +/* + * free nvlist we got via lzbe_nvlist_get() + */ +void +lzbe_nvlist_free(void *ptr) +{ + nvlist_free(ptr); +} + +static const char *typenames[] = { + "DATA_TYPE_UNKNOWN", + "DATA_TYPE_BOOLEAN", + "DATA_TYPE_BYTE", + "DATA_TYPE_INT16", + "DATA_TYPE_UINT16", + "DATA_TYPE_INT32", + "DATA_TYPE_UINT32", + "DATA_TYPE_INT64", + "DATA_TYPE_UINT64", + "DATA_TYPE_STRING", + "DATA_TYPE_BYTE_ARRAY", + "DATA_TYPE_INT16_ARRAY", + "DATA_TYPE_UINT16_ARRAY", + "DATA_TYPE_INT32_ARRAY", + "DATA_TYPE_UINT32_ARRAY", + "DATA_TYPE_INT64_ARRAY", + "DATA_TYPE_UINT64_ARRAY", + "DATA_TYPE_STRING_ARRAY", + "DATA_TYPE_HRTIME", + "DATA_TYPE_NVLIST", + "DATA_TYPE_NVLIST_ARRAY", + "DATA_TYPE_BOOLEAN_VALUE", + "DATA_TYPE_INT8", + "DATA_TYPE_UINT8", + "DATA_TYPE_BOOLEAN_ARRAY", + "DATA_TYPE_INT8_ARRAY", + "DATA_TYPE_UINT8_ARRAY" +}; + +static int +nvpair_type_from_name(const char *name) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(typenames); i++) { + if (strcmp(name, typenames[i]) == 0) + return (i); + } + return (0); +} + +/* + * Add pair defined by key, type and value into nvlist. 
+ */ +int +lzbe_add_pair(void *ptr, const char *key, const char *type, void *value, + size_t size) +{ + nvlist_t *nv = ptr; + data_type_t dt; + int rv = 0; + + if (ptr == NULL || key == NULL || value == NULL) + return (rv); + + if (type == NULL) + type = "DATA_TYPE_STRING"; + dt = nvpair_type_from_name(type); + if (dt == DATA_TYPE_UNKNOWN) + return (EINVAL); + + switch (dt) { + case DATA_TYPE_BYTE: + if (size != sizeof (uint8_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_byte(nv, key, *(uint8_t *)value); + break; + + case DATA_TYPE_INT16: + if (size != sizeof (int16_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_int16(nv, key, *(int16_t *)value); + break; + + case DATA_TYPE_UINT16: + if (size != sizeof (uint16_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_uint16(nv, key, *(uint16_t *)value); + break; + + case DATA_TYPE_INT32: + if (size != sizeof (int32_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_int32(nv, key, *(int32_t *)value); + break; + + case DATA_TYPE_UINT32: + if (size != sizeof (uint32_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_uint32(nv, key, *(uint32_t *)value); + break; + + case DATA_TYPE_INT64: + if (size != sizeof (int64_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_int64(nv, key, *(int64_t *)value); + break; + + case DATA_TYPE_UINT64: + if (size != sizeof (uint64_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_uint64(nv, key, *(uint64_t *)value); + break; + + case DATA_TYPE_STRING: + rv = nvlist_add_string(nv, key, value); + break; + + case DATA_TYPE_BYTE_ARRAY: + rv = nvlist_add_byte_array(nv, key, value, size); + break; + + case DATA_TYPE_INT16_ARRAY: + rv = nvlist_add_int16_array(nv, key, value, size); + break; + + case DATA_TYPE_UINT16_ARRAY: + rv = nvlist_add_uint16_array(nv, key, value, size); + break; + + case DATA_TYPE_INT32_ARRAY: + rv = nvlist_add_int32_array(nv, key, value, size); + break; + + case DATA_TYPE_UINT32_ARRAY: + rv = nvlist_add_uint32_array(nv, key, value, size); + break; + + case 
DATA_TYPE_INT64_ARRAY: + rv = nvlist_add_int64_array(nv, key, value, size); + break; + + case DATA_TYPE_UINT64_ARRAY: + rv = nvlist_add_uint64_array(nv, key, value, size); + break; + + case DATA_TYPE_STRING_ARRAY: + rv = nvlist_add_string_array(nv, key, value, size); + break; + + case DATA_TYPE_NVLIST: + rv = nvlist_add_nvlist(nv, key, (nvlist_t *)value); + break; + + case DATA_TYPE_NVLIST_ARRAY: + rv = nvlist_add_nvlist_array(nv, key, value, size); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + if (size != sizeof (boolean_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_boolean_value(nv, key, *(boolean_t *)value); + break; + + case DATA_TYPE_INT8: + if (size != sizeof (int8_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_int8(nv, key, *(int8_t *)value); + break; + + case DATA_TYPE_UINT8: + if (size != sizeof (uint8_t)) { + rv = EINVAL; + break; + } + rv = nvlist_add_uint8(nv, key, *(uint8_t *)value); + break; + + case DATA_TYPE_BOOLEAN_ARRAY: + rv = nvlist_add_boolean_array(nv, key, value, size); + break; + + case DATA_TYPE_INT8_ARRAY: + rv = nvlist_add_int8_array(nv, key, value, size); + break; + + case DATA_TYPE_UINT8_ARRAY: + rv = nvlist_add_uint8_array(nv, key, value, size); + break; + + default: + return (ENOTSUP); + } + + return (rv); +} + +int +lzbe_remove_pair(void *ptr, const char *key) +{ + + return (nvlist_remove_all(ptr, key)); +} diff --git a/lib/libzfsbootenv/lzbe_util.c b/lib/libzfsbootenv/lzbe_util.c new file mode 100644 index 0000000000..35e9854958 --- /dev/null +++ b/lib/libzfsbootenv/lzbe_util.c @@ -0,0 +1,39 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ +/* + * Copyright 2020 Toomas Soome + */ + +#include +#include +#include +#include + +/* + * Output bootenv information. + */ +int +lzbe_bootenv_print(const char *pool, const char *nvlist, FILE *of) +{ + nvlist_t *nv; + int rv = -1; + + if (pool == NULL || *pool == '\0' || of == NULL) + return (rv); + + rv = lzbe_nvlist_get(pool, nvlist, (void **)&nv); + if (rv == 0) { + nvlist_print(of, nv); + nvlist_free(nv); + } + + return (rv); +} diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 91f47503a3..3cc0c2f2ec 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -4,19 +4,28 @@ VPATH = \ $(top_srcdir)/module/zfs \ $(top_srcdir)/module/zcommon \ $(top_srcdir)/module/lua \ + $(top_srcdir)/module/os/linux/zfs \ $(top_srcdir)/lib/libzpool +if BUILD_FREEBSD +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs +endif +if BUILD_LINUX +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/linux/zfs +endif + +# Unconditionally enable debugging for libzpool +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) # Includes kernel code generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) -AM_CFLAGS += -DLIB_ZPOOL_BUILD +AM_CFLAGS += $(ZLIB_CFLAGS) -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +AM_CFLAGS += -DLIB_ZPOOL_BUILD lib_LTLIBRARIES = libzpool.la @@ -38,16 +47,18 @@ KERNEL_C = \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ - zfs_uio.c \ zpool_prop.c \ zprop_common.c \ abd.c \ + abd_os.c \ aggsum.c \ arc.c \ + arc_os.c \ blkptr.c \ bplist.c \ bpobj.c \ bptree.c \ + btree.c \ bqueue.c \ cityhash.c \ dbuf.c \ @@ -59,6 +70,7 @@ KERNEL_C = \ dmu_object.c \ dmu_objset.c \ dmu_recv.c \ + dmu_redact.c \ dmu_send.c \ dmu_traverse.c \ dmu_tx.c \ @@ -86,6 +98,7 @@ KERNEL_C = \ metaslab.c \ mmp.c \ multilist.c \ + objlist.c \ pathname.c \ range_tree.c \ refcount.c \ 
@@ -99,6 +112,7 @@ KERNEL_C = \ spa_config.c \ spa_errlog.c \ spa_history.c \ + spa_log_spacemap.c \ spa_misc.c \ spa_stats.c \ space_map.c \ @@ -109,6 +123,8 @@ KERNEL_C = \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ @@ -128,6 +144,8 @@ KERNEL_C = \ vdev_raidz_math_scalar.c \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ + vdev_raidz_math_powerpc_altivec.c \ + vdev_rebuild.c \ vdev_removal.c \ vdev_root.c \ vdev_trim.c \ @@ -138,12 +156,14 @@ KERNEL_C = \ zcp_get.c \ zcp_global.c \ zcp_iter.c \ + zcp_set.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ zfs_debug.c \ zfs_fm.c \ zfs_fuid.c \ + zfs_racct.c \ zfs_sa.c \ zfs_znode.c \ zfs_ratelimit.c \ @@ -184,18 +204,37 @@ LUA_C = \ lvm.c \ lzio.c +dist_libzpool_la_SOURCES = \ + $(USER_C) + nodist_libzpool_la_SOURCES = \ - $(USER_C) \ $(KERNEL_C) \ $(LUA_C) libzpool_la_LIBADD = \ - $(top_builddir)/lib/libicp/libicp.la \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libunicode/libunicode.la \ - $(top_builddir)/lib/libzutil/libzutil.la + $(abs_top_builddir)/lib/libicp/libicp.la \ + $(abs_top_builddir)/lib/libunicode/libunicode.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libzstd/libzstd.la \ + $(abs_top_builddir)/lib/libzutil/libzutil.la -libzpool_la_LIBADD += $(ZLIB) -ldl -libzpool_la_LDFLAGS = -pthread -version-info 2:0:0 +libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm -EXTRA_DIST = $(USER_C) +libzpool_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libzpool_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libzpool_la_LIBADD += -lgeom +endif + +libzpool_la_LDFLAGS += -version-info 5:0:0 + +if TARGET_CPU_POWERPC +vdev_raidz_math_powerpc_altivec.$(OBJEXT): CFLAGS += -maltivec +vdev_raidz_math_powerpc_altivec.l$(OBJEXT): CFLAGS += -maltivec +endif + +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c 
index 0f39e0d72b..ef75706fa6 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -20,39 +20,41 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include #include +#include #include #include #include #include -#include -#include -#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include #include -#include +#include /* * Emulation of kernel services in userland. */ -int aok; uint64_t physmem; -vnode_t *rootdir = (vnode_t *)0xabcd1234; char hw_serial[HW_HOSTID_LEN]; struct utsname hw_utsname; -vmem_t *zio_arena = NULL; /* If set, all blocks read will be copied to the specified directory. */ char *vn_dumpdir = NULL; @@ -145,36 +147,6 @@ void kstat_delete(kstat_t *ksp) {} -/*ARGSUSED*/ -void -kstat_waitq_enter(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_waitq_exit(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_runq_enter(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_runq_exit(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_waitq_to_runq(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_runq_back_to_waitq(kstat_io_t *kiop) -{} - void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), @@ -339,7 +311,14 @@ cv_wait(kcondvar_t *cv, kmutex_t *mp) mp->m_owner = pthread_self(); } -clock_t +int +cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) +{ + cv_wait(cv, mp); + return (1); +} + +int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) { int error; @@ -373,7 +352,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) } /*ARGSUSED*/ -clock_t +int cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { @@ -436,6 +415,7 @@ seq_printf(struct seq_file *m, const char *fmt, ...) 
void procfs_list_install(const char *module, + const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, @@ -481,231 +461,6 @@ procfs_list_add(procfs_list_t *procfs_list, void *p) * vnode operations * ========================================================================= */ -/* - * Note: for the xxxat() versions of these functions, we assume that the - * starting vp is always rootdir (which is true for spa_directory.c, the only - * ZFS consumer of these interfaces). We assert this is true, and then emulate - * them by adding '/' in front of the path. - */ - -/*ARGSUSED*/ -int -vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) -{ - int fd = -1; - int dump_fd = -1; - vnode_t *vp; - int old_umask = 0; - char *realpath; - struct stat64 st; - int err; - - realpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); - - /* - * If we're accessing a real disk from userland, we need to use - * the character interface to avoid caching. This is particularly - * important if we're trying to look at a real in-kernel storage - * pool from userland, e.g. via zdb, because otherwise we won't - * see the changes occurring under the segmap cache. - * On the other hand, the stupid character device returns zero - * for its size. So -- gag -- we open the block device to get - * its size, and remember it for subsequent VOP_GETATTR(). 
- */ -#if defined(__sun__) || defined(__sun) - if (strncmp(path, "/dev/", 5) == 0) { -#else - if (0) { -#endif - char *dsk; - fd = open64(path, O_RDONLY); - if (fd == -1) { - err = errno; - free(realpath); - return (err); - } - if (fstat64(fd, &st) == -1) { - err = errno; - close(fd); - free(realpath); - return (err); - } - close(fd); - (void) sprintf(realpath, "%s", path); - dsk = strstr(path, "/dsk/"); - if (dsk != NULL) - (void) sprintf(realpath + (dsk - path) + 1, "r%s", - dsk + 1); - } else { - (void) sprintf(realpath, "%s", path); - if (!(flags & FCREAT) && stat64(realpath, &st) == -1) { - err = errno; - free(realpath); - return (err); - } - } - - if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) { -#ifdef __linux__ - flags |= O_DIRECT; -#endif - } - - if (flags & FCREAT) - old_umask = umask(0); - - /* - * The construct 'flags - FREAD' conveniently maps combinations of - * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. - */ - fd = open64(realpath, flags - FREAD, mode); - if (fd == -1) { - err = errno; - free(realpath); - return (err); - } - - if (flags & FCREAT) - (void) umask(old_umask); - - if (vn_dumpdir != NULL) { - char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); - (void) snprintf(dumppath, MAXPATHLEN, - "%s/%s", vn_dumpdir, basename(realpath)); - dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); - umem_free(dumppath, MAXPATHLEN); - if (dump_fd == -1) { - err = errno; - free(realpath); - close(fd); - return (err); - } - } else { - dump_fd = -1; - } - - free(realpath); - - if (fstat64_blk(fd, &st) == -1) { - err = errno; - close(fd); - if (dump_fd != -1) - close(dump_fd); - return (err); - } - - (void) fcntl(fd, F_SETFD, FD_CLOEXEC); - - *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); - - vp->v_fd = fd; - vp->v_size = st.st_size; - vp->v_path = spa_strdup(path); - vp->v_dump_fd = dump_fd; - - return (0); -} - -/*ARGSUSED*/ -int -vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, - int x3, vnode_t 
*startvp, int fd) -{ - char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); - int ret; - - ASSERT(startvp == rootdir); - (void) sprintf(realpath, "/%s", path); - - /* fd ignored for now, need if want to simulate nbmand support */ - ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); - - umem_free(realpath, strlen(path) + 2); - - return (ret); -} - -/*ARGSUSED*/ -int -vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, - int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) -{ - ssize_t rc, done = 0, split; - - if (uio == UIO_READ) { - rc = pread64(vp->v_fd, addr, len, offset); - if (vp->v_dump_fd != -1 && rc != -1) { - int status; - status = pwrite64(vp->v_dump_fd, addr, rc, offset); - ASSERT(status != -1); - } - } else { - /* - * To simulate partial disk writes, we split writes into two - * system calls so that the process can be killed in between. - */ - int sectors = len >> SPA_MINBLOCKSHIFT; - split = (sectors > 0 ? rand() % sectors : 0) << - SPA_MINBLOCKSHIFT; - rc = pwrite64(vp->v_fd, addr, split, offset); - if (rc != -1) { - done = rc; - rc = pwrite64(vp->v_fd, (char *)addr + split, - len - split, offset + split); - } - } - -#ifdef __linux__ - if (rc == -1 && errno == EINVAL) { - /* - * Under Linux, this most likely means an alignment issue - * (memory or disk) due to O_DIRECT, so we abort() in order to - * catch the offender. - */ - abort(); - } -#endif - if (rc == -1) - return (errno); - - done += rc; - - if (residp) - *residp = len - done; - else if (done != len) - return (EIO); - return (0); -} - -void -vn_close(vnode_t *vp) -{ - close(vp->v_fd); - if (vp->v_dump_fd != -1) - close(vp->v_dump_fd); - spa_strfree(vp->v_path); - umem_free(vp, sizeof (vnode_t)); -} - -/* - * At a minimum we need to update the size since vdev_reopen() - * will no longer call vn_openat(). 
- */ -int -fop_getattr(vnode_t *vp, vattr_t *vap) -{ - struct stat64 st; - int err; - - if (fstat64_blk(vp->v_fd, &st) == -1) { - err = errno; - close(vp->v_fd); - return (err); - } - - vap->va_size = st.st_size; - return (0); -} /* * ========================================================================= @@ -787,19 +542,10 @@ void __dprintf(boolean_t dprint, const char *file, const char *func, int line, const char *fmt, ...) { - const char *newfile; + /* Get rid of annoying "../common/" prefix to filename. */ + const char *newfile = zfs_basename(file); + va_list adx; - - /* - * Get rid of annoying "../common/" prefix to filename. - */ - newfile = strrchr(file, '/'); - if (newfile != NULL) { - newfile = newfile + 1; /* Get rid of leading / */ - } else { - newfile = file; - } - if (dprint) { /* dprintf messages are printed immediately */ @@ -888,7 +634,6 @@ vcmn_err(int ce, const char *fmt, va_list adx) } } -/*PRINTFLIKE2*/ void cmn_err(int ce, const char *fmt, ...) { @@ -899,60 +644,6 @@ cmn_err(int ce, const char *fmt, ...) 
va_end(adx); } -/* - * ========================================================================= - * kobj interfaces - * ========================================================================= - */ -struct _buf * -kobj_open_file(char *name) -{ - struct _buf *file; - vnode_t *vp; - - /* set vp as the _fd field of the file */ - if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, - -1) != 0) - return ((void *)-1UL); - - file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); - file->_fd = (intptr_t)vp; - return (file); -} - -int -kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - ssize_t resid = 0; - - if (vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, - UIO_SYSSPACE, 0, 0, 0, &resid) != 0) - return (-1); - - return (size - resid); -} - -void -kobj_close_file(struct _buf *file) -{ - vn_close((vnode_t *)file->_fd); - umem_free(file, sizeof (struct _buf)); -} - -int -kobj_get_filesize(struct _buf *file, uint64_t *size) -{ - struct stat64 st; - vnode_t *vp = (vnode_t *)file->_fd; - - if (fstat64(vp->v_fd, &st) == -1) { - vn_close(vp); - return (errno); - } - *size = st.st_size; - return (0); -} - /* * ========================================================================= * misc routines @@ -993,15 +684,15 @@ lowbit64(uint64_t i) return (__builtin_ffsll(i)); } -char *random_path = "/dev/random"; -char *urandom_path = "/dev/urandom"; +const char *random_path = "/dev/random"; +const char *urandom_path = "/dev/urandom"; static int random_fd = -1, urandom_fd = -1; void random_init(void) { - VERIFY((random_fd = open(random_path, O_RDONLY)) != -1); - VERIFY((urandom_fd = open(urandom_path, O_RDONLY)) != -1); + VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1); + VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1); } void @@ -1096,11 +787,11 @@ kernel_init(int mode) physmem = sysconf(_SC_PHYS_PAGES); - dprintf("physmem = %llu pages (%.2f GB)\n", physmem, + dprintf("physmem = %llu 
pages (%.2f GB)\n", (u_longlong_t)physmem, (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", - (mode & FWRITE) ? get_system_hostid() : 0); + (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0); random_init(); @@ -1109,7 +800,9 @@ kernel_init(int mode) system_taskq_init(); icp_init(); - spa_init(mode); + zstd_init(); + + spa_init((spa_mode_t)mode); fletcher_4_init(); @@ -1122,6 +815,8 @@ kernel_fini(void) fletcher_4_fini(); spa_fini(); + zstd_fini(); + icp_fini(); system_taskq_fini(); @@ -1182,6 +877,12 @@ secpolicy_zfs(const cred_t *cr) return (0); } +int +secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) +{ + return (0); +} + ksiddomain_t * ksid_lookupdomain(const char *dom) { @@ -1226,16 +927,16 @@ kmem_asprintf(const char *fmt, ...) } /* ARGSUSED */ -int +zfs_file_t * zfs_onexit_fd_hold(int fd, minor_t *minorp) { *minorp = 0; - return (0); + return (NULL); } /* ARGSUSED */ void -zfs_onexit_fd_rele(int fd) +zfs_onexit_fd_rele(zfs_file_t *fp) { } @@ -1247,20 +948,6 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, return (0); } -/* ARGSUSED */ -int -zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) -{ - return (0); -} - -/* ARGSUSED */ -int -zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) -{ - return (0); -} - fstrans_cookie_t spl_fstrans_mark(void) { @@ -1287,12 +974,12 @@ kmem_cache_reap_active(void) void *zvol_tag = "zvol_tag"; void -zvol_create_minors(spa_t *spa, const char *name, boolean_t async) +zvol_create_minor(const char *name) { } void -zvol_remove_minor(spa_t *spa, const char *name, boolean_t async) +zvol_create_minors_recursive(const char *name) { } @@ -1306,3 +993,384 @@ zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async) { } + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + 
* Returns 0 on success underlying error on failure. + */ +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + int fd = -1; + int dump_fd = -1; + int err; + int old_umask = 0; + zfs_file_t *fp; + struct stat64 st; + + if (!(flags & O_CREAT) && stat64(path, &st) == -1) + return (errno); + + if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) + flags |= O_DIRECT; + + if (flags & O_CREAT) + old_umask = umask(0); + + fd = open64(path, flags, mode); + if (fd == -1) + return (errno); + + if (flags & O_CREAT) + (void) umask(old_umask); + + if (vn_dumpdir != NULL) { + char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); + const char *inpath = zfs_basename(path); + + (void) snprintf(dumppath, MAXPATHLEN, + "%s/%s", vn_dumpdir, inpath); + dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); + umem_free(dumppath, MAXPATHLEN); + if (dump_fd == -1) { + err = errno; + close(fd); + return (err); + } + } else { + dump_fd = -1; + } + + (void) fcntl(fd, F_SETFD, FD_CLOEXEC); + + fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); + fp->f_fd = fd; + fp->f_dump_fd = dump_fd; + *fpp = fp; + + return (0); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + close(fp->f_fd); + if (fp->f_dump_fd != -1) + close(fp->f_dump_fd); + + umem_free(fp, sizeof (zfs_file_t)); +} + +/* + * Stateful write - use os internal file pointer to determine where to + * write and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + ssize_t rc; + + rc = write(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless write - os internal file pointer is not updated. 
+ * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * off - file offset to write to (only valid for seekable types) + * resid - pointer to count of unwritten bytes + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, + size_t count, loff_t pos, ssize_t *resid) +{ + ssize_t rc, split, done; + int sectors; + + /* + * To simulate partial disk writes, we split writes into two + * system calls so that the process can be killed in between. + * This is used by ztest to simulate realistic failure modes. + */ + sectors = count >> SPA_MINBLOCKSHIFT; + split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; + rc = pwrite64(fp->f_fd, buf, split, pos); + if (rc != -1) { + done = rc; + rc = pwrite64(fp->f_fd, (char *)buf + split, + count - split, pos + split); + } +#ifdef __linux__ + if (rc == -1 && errno == EINVAL) { + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order + * to catch the offender. + */ + abort(); + } +#endif + + if (rc < 0) + return (errno); + + done += rc; + + if (resid) { + *resid = count - done; + } else if (done != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateful read - use os internal file pointer to determine where to + * read and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to read + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + int rc; + + rc = read(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless read - os internal file pointer is not updated. 
+ * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to write + * off - file offset to read from (only valid for seekable types) + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + ssize_t rc; + + rc = pread64(fp->f_fd, buf, count, off); + if (rc < 0) { +#ifdef __linux__ + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order to + * catch the offender. + */ + if (errno == EINVAL) + abort(); +#endif + return (errno); + } + + if (fp->f_dump_fd != -1) { + int status; + + status = pwrite64(fp->f_dump_fd, buf, rc, off); + ASSERT(status != -1); + } + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * lseek - set / get file pointer + * + * fp - pointer to file (pipe, socket, etc) to read from + * offp - value to seek to, returns current value plus passed offset + * whence - see man pages for standard lseek whence values + * + * Returns 0 on success errno on failure (ESPIPE for non seekable types) + */ +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + loff_t rc; + + rc = lseek(fp->f_fd, *offp, whence); + if (rc < 0) + return (errno); + + *offp = rc; + + return (0); +} + +/* + * Get file attributes + * + * filp - file pointer + * zfattr - pointer to file attr structure + * + * Currently only used for fetching size and file mode + * + * Returns 0 on success or error code of underlying getattr call on failure. 
+ */ +int +zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) +{ + struct stat64 st; + + if (fstat64_blk(fp->f_fd, &st) == -1) + return (errno); + + zfattr->zfa_size = st.st_size; + zfattr->zfa_mode = st.st_mode; + + return (0); +} + +/* + * Sync file to disk + * + * filp - file pointer + * flags - O_SYNC and or O_DSYNC + * + * Returns 0 on success or error code of underlying sync call on failure. + */ +int +zfs_file_fsync(zfs_file_t *fp, int flags) +{ + int rc; + + rc = fsync(fp->f_fd); + if (rc < 0) + return (errno); + + return (0); +} + +/* + * fallocate - allocate or free space on disk + * + * fp - file pointer + * mode (non-standard options for hole punching etc) + * offset - offset to start allocating or freeing from + * len - length to free / allocate + * + * OPTIONAL + */ +int +zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) +{ +#ifdef __linux__ + return (fallocate(fp->f_fd, mode, offset, len)); +#else + return (EOPNOTSUPP); +#endif +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (lseek(fp->f_fd, SEEK_CUR, 0)); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success. + * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (remove(path)); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * + * Returns pointer to file struct or NULL. + * Unsupported in user space. + */ +zfs_file_t * +zfs_file_get(int fd) +{ + abort(); + + return (NULL); +} +/* + * Drop reference to file pointer + * + * fp - pointer to file struct + * + * Unsupported in user space. 
+ */ +void +zfs_file_put(zfs_file_t *fp) +{ + abort(); +} + +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ +} diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index ae67906fe0..456080f7f2 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -34,6 +34,8 @@ int taskq_now; taskq_t *system_taskq; taskq_t *system_delay_taskq; +static pthread_key_t taskq_tsd; + #define TASKQ_ACTIVE 0x00010000 static taskq_ent_t * @@ -213,6 +215,8 @@ taskq_thread(void *arg) taskq_ent_t *t; boolean_t prealloc; + VERIFY0(pthread_setspecific(taskq_tsd, tq)); + mutex_enter(&tq->tq_lock); while (tq->tq_flags & TASKQ_ACTIVE) { if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { @@ -343,6 +347,12 @@ taskq_member(taskq_t *tq, kthread_t *t) return (0); } +taskq_t * +taskq_of_curthread(void) +{ + return (pthread_getspecific(taskq_tsd)); +} + int taskq_cancel_id(taskq_t *tq, taskqid_t id) { @@ -352,6 +362,7 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id) void system_taskq_init(void) { + VERIFY0(pthread_key_create(&taskq_tsd, NULL)); system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); system_delay_taskq = taskq_create("delay_taskq", 4, maxclsyspri, 4, @@ -365,4 +376,5 @@ system_taskq_fini(void) system_taskq = NULL; /* defensive */ taskq_destroy(system_delay_taskq); system_delay_taskq = NULL; + VERIFY0(pthread_key_delete(taskq_tsd)); } diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c index ad05d2239a..a2bdfec1d1 100644 --- a/lib/libzpool/util.c +++ b/lib/libzpool/util.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include @@ -148,18 +148,54 @@ show_pool_stats(spa_t *spa) nvlist_free(config); } +/* *k_out must be freed by the caller */ +static int +set_global_var_parse_kv(const char *arg, char **k_out, u_longlong_t *v_out) +{ + int err; + VERIFY(arg); + char *d = strdup(arg); + + char *save = NULL; + char *k = strtok_r(d, "=", &save); + char *v_str = 
strtok_r(NULL, "=", &save); + char *follow = strtok_r(NULL, "=", &save); + if (k == NULL || v_str == NULL || follow != NULL) { + err = EINVAL; + goto err_free; + } + + u_longlong_t val = strtoull(v_str, NULL, 0); + if (val > UINT32_MAX) { + fprintf(stderr, "Value for global variable '%s' must " + "be a 32-bit unsigned integer, got '%s'\n", k, v_str); + err = EOVERFLOW; + goto err_free; + } + + *k_out = k; + *v_out = val; + return (0); + +err_free: + free(k); + + return (err); +} + /* * Sets given global variable in libzpool to given unsigned 32-bit value. * arg: "=" */ int -set_global_var(char *arg) +set_global_var(char const *arg) { void *zpoolhdl; - char *varname = arg, *varval; + char *varname; u_longlong_t val; + int ret; -#ifndef _LITTLE_ENDIAN +#ifndef _ZFS_LITTLE_ENDIAN /* * On big endian systems changing a 64-bit variable would set the high * 32 bits instead of the low 32 bits, which could cause unexpected @@ -167,19 +203,12 @@ set_global_var(char *arg) */ fprintf(stderr, "Setting global variables is only supported on " "little-endian systems\n"); - return (ENOTSUP); + ret = ENOTSUP; + goto out_ret; #endif - if (arg != NULL && (varval = strchr(arg, '=')) != NULL) { - *varval = '\0'; - varval++; - val = strtoull(varval, NULL, 0); - if (val > UINT32_MAX) { - fprintf(stderr, "Value for global variable '%s' must " - "be a 32-bit unsigned integer\n", varname); - return (EOVERFLOW); - } - } else { - return (EINVAL); + + if ((ret = set_global_var_parse_kv(arg, &varname, &val)) != 0) { + goto out_ret; } zpoolhdl = dlopen("libzpool.so", RTLD_LAZY); @@ -189,18 +218,25 @@ set_global_var(char *arg) if (var == NULL) { fprintf(stderr, "Global variable '%s' does not exist " "in libzpool.so\n", varname); - return (EINVAL); + ret = EINVAL; + goto out_dlclose; } *var = (uint32_t)val; - dlclose(zpoolhdl); } else { fprintf(stderr, "Failed to open libzpool.so to set global " "variable\n"); - return (EIO); + ret = EIO; + goto out_dlclose; } - return (0); + ret = 0; + 
+out_dlclose: + dlclose(zpoolhdl); + free(varname); +out_ret: + return (ret); } static nvlist_t * @@ -209,39 +245,96 @@ refresh_config(void *unused, nvlist_t *tryconfig) return (spa_tryimport(tryconfig)); } +#if defined(__FreeBSD__) + +#include +#include +#include + +static int +pool_active(void *unused, const char *name, uint64_t guid, boolean_t *isactive) +{ + zfs_iocparm_t zp; + zfs_cmd_t *zc = NULL; + zfs_cmd_legacy_t *zcl = NULL; + unsigned long request; + int ret; + + int fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); + if (fd < 0) + return (-1); + + /* + * Use ZFS_IOC_POOL_STATS to check if the pool is active. We want to + * avoid adding a dependency on libzfs_core solely for this ioctl(), + * therefore we manually craft the stats command. Note that the command + * ID is identical between the openzfs and legacy ioctl() formats. + */ + int ver = ZFS_IOCVER_NONE; + size_t ver_size = sizeof (ver); + + sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); + + switch (ver) { + case ZFS_IOCVER_OZFS: + zc = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); + + (void) strlcpy(zc->zc_name, name, sizeof (zc->zc_name)); + zp.zfs_cmd = (uint64_t)(uintptr_t)zc; + zp.zfs_cmd_size = sizeof (zfs_cmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_OZFS; + + request = _IOWR('Z', ZFS_IOC_POOL_STATS, zfs_iocparm_t); + ret = ioctl(fd, request, &zp); + + free((void *)(uintptr_t)zc->zc_nvlist_dst); + umem_free(zc, sizeof (zfs_cmd_t)); + + break; + case ZFS_IOCVER_LEGACY: + zcl = umem_zalloc(sizeof (zfs_cmd_legacy_t), UMEM_NOFAIL); + + (void) strlcpy(zcl->zc_name, name, sizeof (zcl->zc_name)); + zp.zfs_cmd = (uint64_t)(uintptr_t)zcl; + zp.zfs_cmd_size = sizeof (zfs_cmd_legacy_t); + zp.zfs_ioctl_version = ZFS_IOCVER_LEGACY; + + request = _IOWR('Z', ZFS_IOC_POOL_STATS, zfs_iocparm_t); + ret = ioctl(fd, request, &zp); + + free((void *)(uintptr_t)zcl->zc_nvlist_dst); + umem_free(zcl, sizeof (zfs_cmd_legacy_t)); + + break; + default: + fprintf(stderr, "unrecognized zfs ioctl version %d", ver); 
+ exit(1); + } + + (void) close(fd); + + *isactive = (ret == 0); + + return (0); +} +#else static int pool_active(void *unused, const char *name, uint64_t guid, boolean_t *isactive) { - zfs_cmd_t *zcp; - nvlist_t *innvl; - char *packed = NULL; - size_t size = 0; - int fd, ret; - - /* - * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active - */ - - fd = open("/dev/zfs", O_RDWR); + int fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); if (fd < 0) return (-1); - zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); - - innvl = fnvlist_alloc(); - fnvlist_add_boolean_value(innvl, "force", B_FALSE); - + /* + * Use ZFS_IOC_POOL_STATS to check if a pool is active. + */ + zfs_cmd_t *zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); (void) strlcpy(zcp->zc_name, name, sizeof (zcp->zc_name)); - packed = fnvlist_pack(innvl, &size); - zcp->zc_nvlist_src = (uint64_t)(uintptr_t)packed; - zcp->zc_nvlist_src_size = size; - ret = ioctl(fd, ZFS_IOC_POOL_SYNC, zcp); + int ret = ioctl(fd, ZFS_IOC_POOL_STATS, zcp); - fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zcp->zc_nvlist_dst); - nvlist_free(innvl); umem_free(zcp, sizeof (zfs_cmd_t)); (void) close(fd); @@ -250,6 +343,7 @@ pool_active(void *unused, const char *name, uint64_t guid, return (0); } +#endif const pool_config_ops_t libzpool_config_ops = { .pco_refresh_config = refresh_config, diff --git a/lib/libzstd/Makefile.am b/lib/libzstd/Makefile.am new file mode 100644 index 0000000000..c9ed7e2aaf --- /dev/null +++ b/lib/libzstd/Makefile.am @@ -0,0 +1,23 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = $(top_srcdir)/module/zstd + +# -fno-tree-vectorize is set for gcc in zstd/common/compiler.h +# Set it for other compilers, too. 
+AM_CFLAGS += -fno-tree-vectorize + +noinst_LTLIBRARIES = libzstd.la + +KERNEL_C = \ + lib/zstd.c \ + zfs_zstd.c + +nodist_libzstd_la_SOURCES = $(KERNEL_C) + +lib/zstd.$(OBJEXT): CFLAGS += -fno-tree-vectorize -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h -Wp,-w +lib/zstd.l$(OBJEXT): CFLAGS += -fno-tree-vectorize -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h -Wp,-w + +zfs_zstd.$(OBJEXT): CFLAGS += -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h +zfs_zstd.l$(OBJEXT): CFLAGS += -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h + +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am index 720b843ab9..b163250619 100644 --- a/lib/libzutil/Makefile.am +++ b/lib/libzutil/Makefile.am @@ -2,26 +2,46 @@ include $(top_srcdir)/config/Rules.am # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) +AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUDEV_CFLAGS) +AM_CFLAGS += -fvisibility=hidden -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include +DEFAULT_INCLUDES += -I$(srcdir) noinst_LTLIBRARIES = libzutil.la USER_C = \ zutil_device_path.c \ zutil_import.c \ + zutil_import.h \ zutil_nicenum.c \ zutil_pool.c -nodist_libzutil_la_SOURCES = $(USER_C) +if BUILD_LINUX +USER_C += \ + os/linux/zutil_device_path_os.c \ + os/linux/zutil_import_os.c +endif + +if BUILD_FREEBSD +USER_C += \ + os/freebsd/zutil_device_path_os.c \ + os/freebsd/zutil_import_os.c +endif + +libzutil_la_SOURCES = $(USER_C) libzutil_la_LIBADD = \ - $(top_builddir)/lib/libavl/libavl.la \ - $(top_builddir)/lib/libefi/libefi.la \ - $(top_builddir)/lib/libtpool/libtpool.la + $(abs_top_builddir)/lib/libavl/libavl.la \ + $(abs_top_builddir)/lib/libtpool/libtpool.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libspl/libspl.la -libzutil_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) +if BUILD_LINUX 
+libzutil_la_LIBADD += \ + $(abs_top_builddir)/lib/libefi/libefi.la \ + -lrt +endif -EXTRA_DIST = $(USER_C) +libzutil_la_LIBADD += -lm $(LIBBLKID_LIBS) $(LIBUDEV_LIBS) + +include $(top_srcdir)/config/CppCheck.am diff --git a/lib/libzutil/os/freebsd/zutil_device_path_os.c b/lib/libzutil/os/freebsd/zutil_device_path_os.c new file mode 100644 index 0000000000..71c9360052 --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_device_path_os.c @@ -0,0 +1,132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +/* + * We don't strip/append partitions on FreeBSD. + */ + +/* + * Note: The caller must free the returned string. + */ +char * +zfs_strip_partition(char *dev) +{ + return (strdup(dev)); +} + +int +zfs_append_partition(char *path, size_t max_len) +{ + return (strnlen(path, max_len)); +} + +/* + * Strip the path from a device name. + * On FreeBSD we only want to remove "/dev/" from the beginning of + * paths if present. 
+ */ +char * +zfs_strip_path(char *path) +{ + if (strncmp(path, _PATH_DEV, sizeof (_PATH_DEV) - 1) == 0) + return (path + sizeof (_PATH_DEV) - 1); + else + return (path); +} + +char * +zfs_get_underlying_path(const char *dev_name) +{ + + if (dev_name == NULL) + return (NULL); + + return (realpath(dev_name, NULL)); +} + +boolean_t +zfs_dev_is_whole_disk(const char *dev_name) +{ + int fd; + + fd = g_open(dev_name, 0); + if (fd >= 0) { + g_close(fd); + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when libudev determines it has been initialized, all of + * the device links have been verified to exist, and it has been allowed to + * settle. At this point the device the device can be accessed reliably. + * Depending on the complexity of the udev rules this process could take + * several seconds. + */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +} + +/* ARGSUSED */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + return (B_FALSE); +} diff --git a/lib/libzutil/os/freebsd/zutil_import_os.c b/lib/libzutil/os/freebsd/zutil_import_os.c new file mode 100644 index 0000000000..3da661f4c5 --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_import_os.c @@ -0,0 +1,254 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright 2016 Nexenta Systems, Inc. + */ + +/* + * Pool import support functions. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include "zutil_import.h" + +/* + * Update a leaf vdev's persistent device strings + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for matching device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. for vdev_disk_open) + * + * On FreeBSD we currently just strip devid and phys_path to avoid confusion. + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); +} + +/* + * Do not even look at these devices. + */ +static const char * const excluded_devs[] = { + "nfslock", + "sequencer", + "zfs", +}; +#define EXCLUDED_DIR "/dev/" +#define EXCLUDED_DIR_LEN 5 + +void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + size_t i; + int num_labels; + int fd; + off_t mediasize = 0; + + /* + * Do not even look at excluded devices. + */ + if (strncmp(rn->rn_name, EXCLUDED_DIR, EXCLUDED_DIR_LEN) == 0) { + char *name = rn->rn_name + EXCLUDED_DIR_LEN; + for (i = 0; i < nitems(excluded_devs); ++i) { + const char *excluded_name = excluded_devs[i]; + size_t len = strlen(excluded_name); + if (strncmp(name, excluded_name, len) == 0) { + return; + } + } + } + + /* + * O_NONBLOCK so we don't hang trying to open things like serial ports. + */ + if ((fd = open(rn->rn_name, O_RDONLY|O_NONBLOCK|O_CLOEXEC)) < 0) + return; + + /* + * Ignore failed stats. + */ + if (fstat64(fd, &statbuf) != 0) + goto out; + /* + * We only want regular files, character devs and block devs. 
+ */ + if (S_ISREG(statbuf.st_mode)) { + /* Check if this file is too small to hold a zpool. */ + if (statbuf.st_size < SPA_MINDEVSIZE) { + goto out; + } + } else if (S_ISCHR(statbuf.st_mode) || S_ISBLK(statbuf.st_mode)) { + /* Check if this device is too small to hold a zpool. */ + if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) != 0 || + mediasize < SPA_MINDEVSIZE) { + goto out; + } + } else { + goto out; + } + + if (zpool_read_label(fd, &config, &num_labels) != 0) + goto out; + if (num_labels == 0) { + nvlist_free(config); + goto out; + } + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + + /* TODO: Reuse labelpaths logic from Linux? */ +out: + (void) close(fd); +} + +static const char * const +zpool_default_import_path[] = { + "/dev" +}; + +const char * const * +zpool_default_search_paths(size_t *count) +{ + *count = nitems(zpool_default_import_path); + return (zpool_default_import_path); +} + +int +zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache) +{ + const char *oid = "vfs.zfs.vol.recursive"; + char *end, path[MAXPATHLEN]; + rdsk_node_t *slice; + struct gmesh mesh; + struct gclass *mp; + struct ggeom *gp; + struct gprovider *pp; + avl_index_t where; + int error, value; + size_t pathleft, size = sizeof (value); + boolean_t skip_zvols = B_FALSE; + + end = stpcpy(path, "/dev/"); + pathleft = &path[sizeof (path)] - end; + + error = geom_gettree(&mesh); + if (error != 0) + return (error); + + if (sysctlbyname(oid, &value, &size, NULL, 0) == 0 && value == 0) + skip_zvols = B_TRUE; + + *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + + LIST_FOREACH(mp, &mesh.lg_class, lg_class) { + if (skip_zvols && strcmp(mp->lg_name, "ZFS::ZVOL") == 0) + continue; + LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { + LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { + strlcpy(end, pp->lg_name, pathleft); + slice = 
zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = *slice_cache; + slice->rn_hdl = hdl; + slice->rn_labelpaths = B_FALSE; + slice->rn_order = IMPORT_ORDER_DEFAULT; + + pthread_mutex_lock(lock); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } + pthread_mutex_unlock(lock); + } + } + } + + geom_deletetree(&mesh); + + return (0); +} + +int +zfs_dev_flush(int fd __unused) +{ + return (0); +} + +void +update_vdevs_config_dev_sysfs_path(nvlist_t *config) +{ +} diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c new file mode 100644 index 0000000000..13f8bd0316 --- /dev/null +++ b/lib/libzutil/os/linux/zutil_device_path_os.c @@ -0,0 +1,678 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_LIBUDEV +#include +#endif + +#include + +/* + * Append partition suffix to an otherwise fully qualified device path. + * This is used to generate the name the full path as its stored in + * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length + * of 'path' will be returned on error a negative value is returned. + */ +int +zfs_append_partition(char *path, size_t max_len) +{ + int len = strlen(path); + + if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || + (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { + if (len + 6 >= max_len) + return (-1); + + (void) strcat(path, "-part1"); + len += 6; + } else { + if (len + 2 >= max_len) + return (-1); + + if (isdigit(path[len-1])) { + (void) strcat(path, "p1"); + len += 2; + } else { + (void) strcat(path, "1"); + len += 1; + } + } + + return (len); +} + +/* + * Remove partition suffix from a vdev path. Partition suffixes may take three + * forms: "-partX", "pX", or "X", where X is a string of digits. The second + * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The + * third case only occurs when preceded by a string matching the regular + * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. 
+ * + * caller must free the returned string + */ +char * +zfs_strip_partition(char *path) +{ + char *tmp = strdup(path); + char *part = NULL, *d = NULL; + if (!tmp) + return (NULL); + + if ((part = strstr(tmp, "-part")) && part != tmp) { + d = part + 5; + } else if ((part = strrchr(tmp, 'p')) && + part > tmp + 1 && isdigit(*(part-1))) { + d = part + 1; + } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && + tmp[1] == 'd') { + for (d = &tmp[2]; isalpha(*d); part = ++d) { } + } else if (strncmp("xvd", tmp, 3) == 0) { + for (d = &tmp[3]; isalpha(*d); part = ++d) { } + } + if (part && d && *d != '\0') { + for (; isdigit(*d); d++) { } + if (*d == '\0') + *part = '\0'; + } + + return (tmp); +} + +/* + * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname + * + * path: /dev/sda1 + * returns: /dev/sda + * + * Returned string must be freed. + */ +static char * +zfs_strip_partition_path(char *path) +{ + char *newpath = strdup(path); + char *sd_offset; + char *new_sd; + + if (!newpath) + return (NULL); + + /* Point to "sda1" part of "/dev/sda1" */ + sd_offset = strrchr(newpath, '/') + 1; + + /* Get our new name "sda" */ + new_sd = zfs_strip_partition(sd_offset); + if (!new_sd) { + free(newpath); + return (NULL); + } + + /* Paste the "sda" where "sda1" was */ + strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); + + /* Free temporary "sda" */ + free(new_sd); + + return (newpath); +} + +/* + * Strip the unwanted portion of a device path. + */ +char * +zfs_strip_path(char *path) +{ + return (strrchr(path, '/') + 1); +} + +/* + * Read the contents of a sysfs file into an allocated buffer and remove the + * last newline. + * + * This is useful for reading sysfs files that return a single string. Return + * an allocated string pointer on success, NULL otherwise. Returned buffer + * must be freed by the user. 
+ */ +static char * +zfs_read_sysfs_file(char *filepath) +{ + char buf[4096]; /* all sysfs files report 4k size */ + char *str = NULL; + + FILE *fp = fopen(filepath, "r"); + if (fp == NULL) { + return (NULL); + } + if (fgets(buf, sizeof (buf), fp) == buf) { + /* success */ + + /* Remove the last newline (if any) */ + size_t len = strlen(buf); + if (buf[len - 1] == '\n') { + buf[len - 1] = '\0'; + } + str = strdup(buf); + } + + fclose(fp); + + return (str); +} + +/* + * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to + * the drive (in /sys/bus/pci/slots). + * + * For example: + * dev: "nvme0n1" + * returns: "/sys/bus/pci/slots/0" + * + * 'dev' must be an NVMe device. + * + * Returned string must be freed. Returns NULL on error or no sysfs path. + */ +static char * +zfs_get_pci_slots_sys_path(const char *dev_name) +{ + DIR *dp = NULL; + struct dirent *ep; + char *address1 = NULL; + char *address2 = NULL; + char *path = NULL; + char buf[MAXPATHLEN]; + char *tmp; + + /* If they preface 'dev' with a path (like "/dev") then strip it off */ + tmp = strrchr(dev_name, '/'); + if (tmp != NULL) + dev_name = tmp + 1; /* +1 since we want the chr after '/' */ + + if (strncmp("nvme", dev_name, 4) != 0) + return (NULL); + + (void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address", + dev_name); + + address1 = zfs_read_sysfs_file(buf); + if (!address1) + return (NULL); + + /* + * /sys/block/nvme0n1/device/address format will + * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be + * "0000:01:00". Just NULL terminate at the '.' so they match. + */ + tmp = strrchr(address1, '.'); + if (tmp != NULL) + *tmp = '\0'; + + dp = opendir("/sys/bus/pci/slots/"); + if (dp == NULL) { + free(address1); + return (NULL); + } + + /* + * Look through all the /sys/bus/pci/slots/ subdirs + */ + while ((ep = readdir(dp))) { + /* + * We only care about directory names that are a single number. 
+		 * Sometimes there's other directories like
+		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
+		 */
+		if (!zfs_isnumber(ep->d_name))
+			continue;
+
+		(void) snprintf(buf, sizeof (buf),
+		    "/sys/bus/pci/slots/%s/address", ep->d_name);
+
+		address2 = zfs_read_sysfs_file(buf);
+		if (!address2)
+			continue;
+
+		if (strcmp(address1, address2) == 0) {
+			/* Addresses match, we're all done */
+			free(address2);
+			if (asprintf(&path, "/sys/bus/pci/slots/%s",
+			    ep->d_name) == -1) {
+				path = NULL; /* undefined after failure */
+				continue;
+			}
+			break;
+		}
+		free(address2);
+	}
+
+	closedir(dp);
+	free(address1);
+
+	return (path);
+}
+
+/*
+ * Given a dev name like "sda", return the full enclosure sysfs path to
+ * the disk. You can also pass in the name with "/dev" prepended
+ * to it (like /dev/sda). This works for both JBODs and NVMe PCI devices.
+ *
+ * For example, disk "sda" in enclosure slot 1:
+ *	dev_name:	"sda"
+ *	returns:	"/sys/class/enclosure/1:0:3:0/Slot 1"
+ *
+ * Or:
+ *
+ *	dev_name:	"nvme0n1"
+ *	returns:	"/sys/bus/pci/slots/0"
+ *
+ * 'dev' must be a non-devicemapper device.
+ *
+ * Returned string must be freed. Returns NULL on error.
+ */
+char *
+zfs_get_enclosure_sysfs_path(const char *dev_name)
+{
+	DIR *dp = NULL;
+	struct dirent *ep;
+	char buf[MAXPATHLEN];
+	char *tmp1 = NULL;
+	char *tmp2 = NULL;
+	char *tmp3 = NULL;
+	char *path = NULL;
+	size_t size;
+	int tmpsize;
+
+	if (dev_name == NULL)
+		return (NULL);
+
+	/* If they preface 'dev' with a path (like "/dev") then strip it off */
+	tmp1 = strrchr(dev_name, '/');
+	if (tmp1 != NULL)
+		dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */
+
+	tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name);
+	if (tmpsize == -1 || tmp1 == NULL) {
+		tmp1 = NULL;
+		goto end;
+	}
+
+	dp = opendir(tmp1);
+	if (dp == NULL)
+		goto end;
+
+	/*
+	 * Look through all sysfs entries in /sys/block/DEV/device for
+	 * the enclosure symlink.
+	 */
+	while ((ep = readdir(dp))) {
+		/* Ignore everything that's not our enclosure_device link */
+		if (strstr(ep->d_name, "enclosure_device") == NULL)
+			continue;
+
+		if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1) {
+			tmp2 = NULL;
+			break;
+		}
+
+		size = readlink(tmp2, buf, sizeof (buf));
+
+		/* Did readlink fail or crop the link name? */
+		if (size == -1 || size >= sizeof (buf))
+			break;
+
+		/*
+		 * We got a valid link. readlink() doesn't terminate strings
+		 * so we have to do it.
+		 */
+		buf[size] = '\0';
+
+		/*
+		 * Our link will look like:
+		 *
+		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
+		 *
+		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
+		 */
+		tmp3 = strstr(buf, "enclosure");
+		if (tmp3 == NULL)
+			break;
+
+		if (asprintf(&path, "/sys/class/%s", tmp3) == -1) {
+			/* If asprintf() fails, 'path' is undefined */
+			path = NULL;
+			break;
+		}
+
+		/* Stop at the first link so 'tmp2'/'path' aren't leaked */
+		break;
+	}
+
+end:
+	free(tmp2);
+	free(tmp1);
+
+	if (dp != NULL)
+		closedir(dp);
+
+	if (!path) {
+		/*
+		 * This particular disk isn't in a JBOD. It could be an NVMe
+		 * drive. If so, look up the NVMe device's path in
+		 * /sys/bus/pci/slots/. Within that directory is a 'attention'
+		 * file which controls the NVMe fault LED.
+		 */
+		path = zfs_get_pci_slots_sys_path(dev_name);
+	}
+
+	return (path);
+}
+
+/*
+ * Allocate and return the underlying device name for a device mapper device.
+ *
+ * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
+ * DM device (like /dev/disk/by-vdev/A0) are also allowed.
+ *
+ * If the DM device has multiple underlying devices (like with multipath
+ * DM devices), then favor underlying devices that have a symlink back to their
+ * enclosure device in sysfs. This will be useful for the
+ * zedlet scripts that toggle the fault LED.
+ *
+ * Returns an underlying device name, or NULL on error or no match. If dm_name
+ * is not a DM device then return NULL.
+ * + * NOTE: The returned name string must be *freed*. + */ +static char * +dm_get_underlying_path(const char *dm_name) +{ + DIR *dp = NULL; + struct dirent *ep; + char *realp; + char *tmp = NULL; + char *path = NULL; + char *dev_str; + int size; + char *first_path = NULL; + char *enclosure_path; + + if (dm_name == NULL) + return (NULL); + + /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ + realp = realpath(dm_name, NULL); + if (realp == NULL) + return (NULL); + + /* + * If they preface 'dev' with a path (like "/dev") then strip it off. + * We just want the 'dm-N' part. + */ + tmp = strrchr(realp, '/'); + if (tmp != NULL) + dev_str = tmp + 1; /* +1 since we want the chr after '/' */ + else + dev_str = tmp; + + if ((size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str)) == -1) { + tmp = NULL; + goto end; + } + + dp = opendir(tmp); + if (dp == NULL) + goto end; + + /* + * A device-mapper device can have multiple paths to it (multipath). + * Favor paths that have a symlink back to their enclosure device. + * We have to do this since some enclosures may only provide a symlink + * back for one underlying path to a disk and not the other. + * + * If no paths have links back to their enclosure, then just return the + * first path. + */ + while ((ep = readdir(dp))) { + if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ + if (!first_path) + first_path = strdup(ep->d_name); + + enclosure_path = + zfs_get_enclosure_sysfs_path(ep->d_name); + + if (!enclosure_path) + continue; + + if ((size = asprintf( + &path, "/dev/%s", ep->d_name)) == -1) + path = NULL; + free(enclosure_path); + break; + } + } + +end: + if (dp != NULL) + closedir(dp); + free(tmp); + free(realp); + + if (!path && first_path) { + /* + * None of the underlying paths had a link back to their + * enclosure devices. Throw up out hands and return the first + * underlying path. 
+ */ + if ((size = asprintf(&path, "/dev/%s", first_path)) == -1) + path = NULL; + } + + free(first_path); + return (path); +} + +/* + * Return B_TRUE if device is a device mapper or multipath device. + * Return B_FALSE if not. + */ +boolean_t +zfs_dev_is_dm(const char *dev_name) +{ + + char *tmp; + tmp = dm_get_underlying_path(dev_name); + if (tmp == NULL) + return (B_FALSE); + + free(tmp); + return (B_TRUE); +} + +/* + * By "whole disk" we mean an entire physical disk (something we can + * label, toggle the write cache on, etc.) as opposed to the full + * capacity of a pseudo-device such as lofi or did. We act as if we + * are labeling the disk, which should be a pretty good test of whether + * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if + * it isn't. + */ +boolean_t +zfs_dev_is_whole_disk(const char *dev_name) +{ + struct dk_gpt *label; + int fd; + + if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) + return (B_FALSE); + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { + (void) close(fd); + return (B_FALSE); + } + + efi_free(label); + (void) close(fd); + + return (B_TRUE); +} + +/* + * Lookup the underlying device for a device name + * + * Often you'll have a symlink to a device, a partition device, + * or a multipath device, and want to look up the underlying device. + * This function returns the underlying device name. If the device + * name is already the underlying device, then just return the same + * name. If the device is a DM device with multiple underlying devices + * then return the first one. + * + * For example: + * + * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda + * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 + * returns: /dev/sda + * + * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) + * dev_name: /dev/mapper/mpatha + * returns: /dev/sda (first device) + * + * 3. /dev/sda (already the underlying device) + * dev_name: /dev/sda + * returns: /dev/sda + * + * 4. 
/dev/dm-3 (mapped to /dev/sda) + * dev_name: /dev/dm-3 + * returns: /dev/sda + * + * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 + * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 + * returns: /dev/sdb + * + * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 + * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a + * returns: /dev/sda + * + * Returns underlying device name, or NULL on error or no match. + * + * NOTE: The returned name string must be *freed*. + */ +char * +zfs_get_underlying_path(const char *dev_name) +{ + char *name = NULL; + char *tmp; + + if (dev_name == NULL) + return (NULL); + + tmp = dm_get_underlying_path(dev_name); + + /* dev_name not a DM device, so just un-symlinkize it */ + if (tmp == NULL) + tmp = realpath(dev_name, NULL); + + if (tmp != NULL) { + name = zfs_strip_partition_path(tmp); + free(tmp); + } + + return (name); +} + + +#ifdef HAVE_LIBUDEV + +/* + * A disk is considered a multipath whole disk when: + * DEVNAME key value has "dm-" + * DM_NAME key value has "mpath" prefix + * DM_UUID key exists + * ID_PART_TABLE_TYPE key does not exist or is not gpt + */ +static boolean_t +udev_mpath_whole_disk(struct udev_device *dev) +{ + const char *devname, *type, *uuid; + + devname = udev_device_get_property_value(dev, "DEVNAME"); + type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + + if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && + ((type == NULL) || (strcmp(type, "gpt") != 0)) && + (uuid != NULL)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Check if a disk is effectively a multipath whole disk + */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname; + boolean_t wholedisk = B_FALSE; + + if (realpath(path, nodepath) == NULL) + return (B_FALSE); + sysname = 
strrchr(nodepath, '/') + 1;
+	if (strncmp(sysname, "dm-", 3) != 0)
+		return (B_FALSE);
+	if ((udev = udev_new()) == NULL)
+		return (B_FALSE);
+	if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
+	    sysname)) == NULL) {
+		udev_unref(udev); /* no device; release the udev context */
+		return (B_FALSE);
+	}
+
+	wholedisk = udev_mpath_whole_disk(dev);
+	udev_device_unref(dev);
+	udev_unref(udev);
+	return (wholedisk);
+}
+
+#else /* HAVE_LIBUDEV */
+
+/* ARGSUSED */
+boolean_t
+is_mpath_whole_disk(const char *path)
+{
+	return (B_FALSE);
+}
+
+#endif /* HAVE_LIBUDEV */
diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c
new file mode 100644
index 0000000000..ab692401d8
--- /dev/null
+++ b/lib/libzutil/os/linux/zutil_import_os.c
@@ -0,0 +1,892 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright 2015 RackTop Systems.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+/*
+ * Pool import support functions.
+ * + * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since + * these commands are expected to run in the global zone, we can assume + * that the devices are all readable when called. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "zutil_import.h" + +#ifdef HAVE_LIBUDEV +#include +#include +#endif +#include + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +/* + * Skip devices with well known prefixes: + * there can be side effects when opening devices which need to be avoided. + * + * hpet - High Precision Event Timer + * watchdog[N] - Watchdog must be closed in a special way. + */ +static boolean_t +should_skip_dev(const char *dev) +{ + return ((strcmp(dev, "watchdog") == 0) || + (strncmp(dev, "watchdog", 8) == 0 && isdigit(dev[8])) || + (strcmp(dev, "hpet") == 0)); +} + +int +zfs_dev_flush(int fd) +{ + return (ioctl(fd, BLKFLSBUF)); +} + +void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + libpc_handle_t *hdl = rn->rn_hdl; + struct stat64 statbuf; + nvlist_t *config; + uint64_t vdev_guid = 0; + int error; + int num_labels = 0; + int fd; + + if (should_skip_dev(zfs_basename(rn->rn_name))) + return; + + /* + * Ignore failed stats. 
We only want regular files and block devices. + * Ignore files that are too small to hold a zpool. + */ + if (stat64(rn->rn_name, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)) || + (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE)) + return; + + /* + * Preferentially open using O_DIRECT to bypass the block device + * cache which may be stale for multipath devices. An EINVAL errno + * indicates O_DIRECT is unsupported so fallback to just O_RDONLY. + */ + fd = open(rn->rn_name, O_RDONLY | O_DIRECT | O_CLOEXEC); + if ((fd < 0) && (errno == EINVAL)) + fd = open(rn->rn_name, O_RDONLY | O_CLOEXEC); + if ((fd < 0) && (errno == EACCES)) + hdl->lpc_open_access_error = B_TRUE; + if (fd < 0) + return; + + error = zpool_read_label(fd, &config, &num_labels); + if (error != 0) { + (void) close(fd); + return; + } + + if (num_labels == 0) { + (void) close(fd); + nvlist_free(config); + return; + } + + /* + * Check that the vdev is for the expected guid. Additional entries + * are speculatively added based on the paths stored in the labels. + * Entries with valid paths but incorrect guids must be removed. + */ + error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { + (void) close(fd); + nvlist_free(config); + return; + } + + (void) close(fd); + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + + /* + * Add additional entries for paths described by this label. + */ + if (rn->rn_labelpaths) { + char *path = NULL; + char *devid = NULL; + char *env = NULL; + rdsk_node_t *slice; + avl_index_t where; + int timeout; + int error; + + if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) + return; + + env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); + if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || + timeout < 0) { + timeout = DISK_LABEL_WAIT; + } + + /* + * Allow devlinks to stabilize so all paths are available. 
+ */ + zpool_label_disk_wait(rn->rn_name, timeout); + + if (path != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_1; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + + if (devid != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + error = asprintf(&slice->rn_name, "%s%s", + DEV_BYID_PATH, devid); + if (error == -1) { + free(slice); + return; + } + + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_2; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + } +} + +static const char * const +zpool_default_import_path[] = { + "/dev/disk/by-vdev", /* Custom rules, use first if they exist */ + "/dev/mapper", /* Use multipath devices before components */ + "/dev/disk/by-partlabel", /* Single unique entry set by user */ + "/dev/disk/by-partuuid", /* Generated partition uuid */ + "/dev/disk/by-label", /* Custom persistent labels */ + "/dev/disk/by-uuid", /* Single unique entry and persistent */ + "/dev/disk/by-id", /* May be multiple entries and persistent */ + "/dev/disk/by-path", /* Encodes physical location and persistent */ + "/dev" /* UNSAFE device names will change */ +}; + +const char * const * +zpool_default_search_paths(size_t *count) +{ + *count = 
ARRAY_SIZE(zpool_default_import_path);
+	return (zpool_default_import_path);
+}
+
+/*
+ * Given a full path to a device determine if that device appears in the
+ * import search path. If it does return the first match and store the
+ * index in the passed 'order' variable, otherwise return ENOENT.
+ */
+static int
+zfs_path_order(char *name, int *order)
+{
+	int i, error = ENOENT;
+	char *dir, *env, *envdup, *tmp = NULL;
+
+	env = getenv("ZPOOL_IMPORT_PATH");
+	if (env) {
+		envdup = strdup(env);	/* NULL on ENOMEM: loop is skipped */
+		for (dir = envdup ? strtok_r(envdup, ":", &tmp) : NULL, i = 0;
+		    dir != NULL;
+		    dir = strtok_r(NULL, ":", &tmp), i++) {
+			if (strncmp(name, dir, strlen(dir)) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+		}
+		free(envdup);
+	} else {
+		for (i = 0; i < ARRAY_SIZE(zpool_default_import_path); i++) {
+			if (strncmp(name, zpool_default_import_path[i],
+			    strlen(zpool_default_import_path[i])) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Use libblkid to quickly enumerate all known zfs devices.
+ */ +int +zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache) +{ + rdsk_node_t *slice; + blkid_cache cache; + blkid_dev_iterate iter; + blkid_dev dev; + avl_index_t where; + int error; + + *slice_cache = NULL; + + error = blkid_get_cache(&cache, NULL); + if (error != 0) + return (error); + + error = blkid_probe_all_new(cache); + if (error != 0) { + blkid_put_cache(cache); + return (error); + } + + iter = blkid_dev_iterate_begin(cache); + if (iter == NULL) { + blkid_put_cache(cache); + return (EINVAL); + } + + error = blkid_dev_set_search(iter, "TYPE", "zfs_member"); + if (error != 0) { + blkid_dev_iterate_end(iter); + blkid_put_cache(cache); + return (error); + } + + *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + + while (blkid_dev_next(iter, &dev) == 0) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev)); + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = *slice_cache; + slice->rn_hdl = hdl; + slice->rn_labelpaths = B_TRUE; + + error = zfs_path_order(slice->rn_name, &slice->rn_order); + if (error == 0) + slice->rn_order += IMPORT_ORDER_SCAN_OFFSET; + else + slice->rn_order = IMPORT_ORDER_DEFAULT; + + pthread_mutex_lock(lock); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } + pthread_mutex_unlock(lock); + } + + blkid_dev_iterate_end(iter); + blkid_put_cache(cache); + + return (0); +} + +/* + * Linux persistent device strings for vdev labels + * + * based on libudev for consistency with libudev disk add/remove events + */ + +typedef struct vdev_dev_strs { + char vds_devid[128]; + char vds_devphys[128]; +} vdev_dev_strs_t; + +#ifdef HAVE_LIBUDEV + +/* + * Obtain the persistent device id string (describes what) + * + * used by ZED vdev 
matching for auto-{online,expand,replace} + */ +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + struct udev_list_entry *entry; + const char *bus; + char devbyid[MAXPATHLEN]; + + /* The bus based by-id path is preferred */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + + if (bus == NULL) { + const char *dm_uuid; + + /* + * For multipath nodes use the persistent uuid based identifier + * + * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f + */ + dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (dm_uuid != NULL) { + (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); + return (0); + } + + /* + * For volumes use the persistent /dev/zvol/dataset identifier + */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + /* + * NVME 'by-id' symlinks are similar to bus case + */ + struct udev_device *parent; + + parent = udev_device_get_parent_with_subsystem_devtype(dev, + "nvme", NULL); + if (parent != NULL) + bus = "nvme"; /* continue with bus symlink search */ + else + return (ENODATA); + } + + /* + * locate the bus specific by-id link + */ + (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, devbyid, strlen(devbyid)) == 0) { + name += strlen(DEV_BYID_PATH); + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + return (ENODATA); +} + +/* + * Obtain the persistent physical location string (describes where) + * + * used by ZED vdev matching for auto-{online,expand,replace} + */ +int +zfs_device_get_physical(struct 
udev_device *dev, char *bufptr, size_t buflen) +{ + const char *physpath = NULL; + struct udev_list_entry *entry; + + /* + * Normal disks use ID_PATH for their physical path. + */ + physpath = udev_device_get_property_value(dev, "ID_PATH"); + if (physpath != NULL && strlen(physpath) > 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + /* + * Device mapper devices are virtual and don't have a physical + * path. For them we use ID_VDEV instead, which is setup via the + * /etc/vdev_id.conf file. ID_VDEV provides a persistent path + * to a virtual device. If you don't have vdev_id.conf setup, + * you cannot use multipath autoreplace with device mapper. + */ + physpath = udev_device_get_property_value(dev, "ID_VDEV"); + if (physpath != NULL && strlen(physpath) > 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + /* + * For ZFS volumes use the persistent /dev/zvol/dataset identifier + */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + physpath = udev_list_entry_get_name(entry); + if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + /* + * For all other devices fallback to using the by-uuid name. 
+ */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + physpath = udev_list_entry_get_name(entry); + if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + return (ENODATA); +} + +/* + * A disk is considered a multipath whole disk when: + * DEVNAME key value has "dm-" + * DM_NAME key value has "mpath" prefix + * DM_UUID key exists + * ID_PART_TABLE_TYPE key does not exist or is not gpt + */ +static boolean_t +udev_mpath_whole_disk(struct udev_device *dev) +{ + const char *devname, *type, *uuid; + + devname = udev_device_get_property_value(dev, "DEVNAME"); + type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + + if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && + ((type == NULL) || (strcmp(type, "gpt") != 0)) && + (uuid != NULL)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +udev_device_is_ready(struct udev_device *dev) +{ +#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED + return (udev_device_get_is_initialized(dev)); +#else + /* wait for DEVLINKS property to be initialized */ + return (udev_device_get_property_value(dev, "DEVLINKS") != NULL); +#endif +} + +#else + +/* ARGSUSED */ +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +/* ARGSUSED */ +int +zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +#endif /* HAVE_LIBUDEV */ + +/* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when libudev determines it has been initialized, all of + * the device links have been verified to exist, and it has been allowed to + * settle. At this point the device the device can be accessed reliably. 
+ * Depending on the complexity of the udev rules this process could take + * several seconds. + */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ +#ifdef HAVE_LIBUDEV + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname = NULL; + int ret = ENODEV; + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + + if ((udev = udev_new()) == NULL) + return (ENXIO); + + start = gethrtime(); + settle = 0; + + do { + if (sysname == NULL) { + if (realpath(path, nodepath) != NULL) { + sysname = strrchr(nodepath, '/') + 1; + } else { + (void) usleep(sleep_ms * MILLISEC); + continue; + } + } + + dev = udev_device_new_from_subsystem_sysname(udev, + "block", sysname); + if ((dev != NULL) && udev_device_is_ready(dev)) { + struct udev_list_entry *links, *link = NULL; + + ret = 0; + links = udev_device_get_devlinks_list_entry(dev); + + udev_list_entry_foreach(link, links) { + struct stat64 statbuf; + const char *name; + + name = udev_list_entry_get_name(link); + errno = 0; + if (stat64(name, &statbuf) == 0 && errno == 0) + continue; + + settle = 0; + ret = ENODEV; + break; + } + + if (ret == 0) { + if (settle == 0) { + settle = gethrtime(); + } else if (NSEC2MSEC(gethrtime() - settle) >= + settle_ms) { + udev_device_unref(dev); + break; + } + } + } + + udev_device_unref(dev); + (void) usleep(sleep_ms * MILLISEC); + + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + udev_unref(udev); + + return (ret); +#else + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +#endif 
/* HAVE_LIBUDEV */
+}
+
+/*
+ * Encode the persistent devices strings
+ * used for the vdev disk label
+ */
+static int
+encode_device_strings(const char *path, vdev_dev_strs_t *ds,
+    boolean_t wholedisk)
+{
+#ifdef HAVE_LIBUDEV
+	struct udev *udev;
+	struct udev_device *dev = NULL;
+	char nodepath[MAXPATHLEN];
+	char *sysname;
+	int ret = ENODEV;
+	hrtime_t start;
+
+	if ((udev = udev_new()) == NULL)
+		return (ENXIO);
+
+	/* resolve path to a runtime device node instance */
+	if (realpath(path, nodepath) == NULL)
+		goto no_dev;
+
+	sysname = strrchr(nodepath, '/') + 1;
+
+	/*
+	 * Wait up to 3 seconds for udev to set up the device node context
+	 */
+	start = gethrtime();
+	do {
+		dev = udev_device_new_from_subsystem_sysname(udev, "block",
+		    sysname);
+		if (dev == NULL)
+			goto no_dev;
+		if (udev_device_is_ready(dev))
+			break; /* udev ready */
+
+		udev_device_unref(dev);
+		dev = NULL;
+
+		if (NSEC2MSEC(gethrtime() - start) < 10)
+			(void) sched_yield(); /* yield/busy wait up to 10ms */
+		else
+			(void) usleep(10 * MILLISEC);
+
+	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
+
+	if (dev == NULL)
+		goto no_dev;
+
+	/*
+	 * Only whole disks require extra device strings ('dev' is still held)
+	 */
+	if (!wholedisk && !udev_mpath_whole_disk(dev))
+		goto no_dev_ref;
+
+	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
+	if (ret != 0)
+		goto no_dev_ref;
+
+	/* physical location string (optional) */
+	if (zfs_device_get_physical(dev, ds->vds_devphys,
+	    sizeof (ds->vds_devphys)) != 0) {
+		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
+	}
+
+no_dev_ref:
+	udev_device_unref(dev);
+no_dev:
+	udev_unref(udev);
+
+	return (ret);
+#else
+	return (ENOENT);
+#endif
+}
+
+/*
+ * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it
+ * in the nvlist * (if applicable).
Like: + * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + */ +static void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, char *path) +{ + char *upath, *spath; + + /* Add enclosure sysfs path (if disk is in an enclosure). */ + upath = zfs_get_underlying_path(path); + spath = zfs_get_enclosure_sysfs_path(upath); + + if (spath) { + nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, spath); + } else { + nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + } + + free(upath); + free(spath); +} + +/* + * This will get called for each leaf vdev. + */ +static int +sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data) +{ + char *path = NULL; + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return (1); + + /* Rescan our enclosure sysfs path for this vdev */ + update_vdev_config_dev_sysfs_path(nv, path); + return (0); +} + +/* + * Given an nvlist for our pool (with vdev tree), iterate over all the + * leaf vdevs and update their ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH. + */ +void +update_vdevs_config_dev_sysfs_path(nvlist_t *config) +{ + nvlist_t *nvroot = NULL; + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + for_each_vdev_in_nvlist(nvroot, sysfs_path_pool_vdev_iter_f, NULL); +} + +/* + * Update a leaf vdev's persistent device strings + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for matching device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. 
for vdev_disk_open) + * + * single device node example: + * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' + * + * multipath device node example: + * devid: 'dm-uuid-mpath-35000c5006304de3f' + * + * We also store the enclosure sysfs path for turning on enclosure LEDs + * (if applicable): + * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + vdev_dev_strs_t vds; + char *env, *type, *path; + uint64_t wholedisk = 0; + + /* + * For the benefit of legacy ZFS implementations, allow + * for opting out of devid strings in the vdev label. + * + * example use: + * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer + * + * explanation: + * Older OpenZFS implementations had issues when attempting to + * display pool config VDEV names if a "devid" NVP value is + * present in the pool's config. + * + * For example, a pool that originated on illumos platform would + * have a devid value in the config and "zpool status" would fail + * when listing the config. + * + * A pool can be stripped of any "devid" values on import or + * prevented from adding them on zpool create|add by setting + * ZFS_VDEV_DEVID_OPT_OUT. + */ + env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + return; + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || + strcmp(type, VDEV_TYPE_DISK) != 0) { + return; + } + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + + /* + * Update device string values in the config nvlist. 
+ */ + if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); + if (vds.vds_devphys[0] != '\0') { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vds.vds_devphys); + } + update_vdev_config_dev_sysfs_path(nv, path); + } else { + /* Clear out any stale entries. */ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + } +} diff --git a/lib/libzutil/zutil_device_path.c b/lib/libzutil/zutil_device_path.c index 1dc0d4d1d2..435c444b24 100644 --- a/lib/libzutil/zutil_device_path.c +++ b/lib/libzutil/zutil_device_path.c @@ -23,53 +23,28 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#include #include -#include -#include #include #include #include #include -#include #include -#ifdef HAVE_LIBUDEV -#include -#endif -/* - * Append partition suffix to an otherwise fully qualified device path. - * This is used to generate the name the full path as its stored in - * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length - * of 'path' will be returned on error a negative value is returned. - */ -int -zfs_append_partition(char *path, size_t max_len) +/* Substring from after the last slash, or the string itself if none */ +const char * +zfs_basename(const char *path) { - int len = strlen(path); + const char *bn = strrchr(path, '/'); + return (bn ? 
bn + 1 : path); +} - if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || - (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { - if (len + 6 >= max_len) - return (-1); - - (void) strcat(path, "-part1"); - len += 6; - } else { - if (len + 2 >= max_len) - return (-1); - - if (isdigit(path[len-1])) { - (void) strcat(path, "p1"); - len += 2; - } else { - (void) strcat(path, "1"); - len += 1; - } - } - - return (len); +/* Return index of last slash or -1 if none */ +ssize_t +zfs_dirnamelen(const char *path) +{ + const char *end = strrchr(path, '/'); + return (end ? end - path : -1); } /* @@ -82,18 +57,18 @@ int zfs_resolve_shortname(const char *name, char *path, size_t len) { int i, error = -1; - char *dir, *env, *envdup; + char *dir, *env, *envdup, *tmp = NULL; env = getenv("ZPOOL_IMPORT_PATH"); errno = ENOENT; if (env) { envdup = strdup(env); - dir = strtok(envdup, ":"); - while (dir && error) { + for (dir = strtok_r(envdup, ":", &tmp); + dir != NULL && error != 0; + dir = strtok_r(NULL, ":", &tmp)) { (void) snprintf(path, len, "%s/%s", dir, name); error = access(path, F_OK); - dir = strtok(NULL, ":"); } free(envdup); } else { @@ -123,21 +98,20 @@ static int zfs_strcmp_shortname(const char *name, const char *cmp_name, int wholedisk) { int path_len, cmp_len, i = 0, error = ENOENT; - char *dir, *env, *envdup = NULL; + char *dir, *env, *envdup = NULL, *tmp = NULL; char path_name[MAXPATHLEN]; - const char * const *zpool_default_import_path; + const char * const *zpool_default_import_path = NULL; size_t count; - zpool_default_import_path = zpool_default_search_paths(&count); - cmp_len = strlen(cmp_name); env = getenv("ZPOOL_IMPORT_PATH"); if (env) { envdup = strdup(env); - dir = strtok(envdup, ":"); + dir = strtok_r(envdup, ":", &tmp); } else { - dir = (char *)zpool_default_import_path[i]; + zpool_default_import_path = zpool_default_search_paths(&count); + dir = (char *)zpool_default_import_path[i]; } while (dir) { @@ -157,7 +131,7 @@ zfs_strcmp_shortname(const 
char *name, const char *cmp_name, int wholedisk) } if (env) { - dir = strtok(NULL, ":"); + dir = strtok_r(NULL, ":", &tmp); } else if (++i < count) { dir = (char *)zpool_default_import_path[i]; } else { @@ -182,18 +156,17 @@ zfs_strcmp_pathname(const char *name, const char *cmp, int wholedisk) int path_len, cmp_len; char path_name[MAXPATHLEN]; char cmp_name[MAXPATHLEN]; - char *dir, *dup; + char *dir, *tmp = NULL; - /* Strip redundant slashes if one exists due to ZPOOL_IMPORT_PATH */ - memset(cmp_name, 0, MAXPATHLEN); - dup = strdup(cmp); - dir = strtok(dup, "/"); - while (dir) { + /* Strip redundant slashes if they exist due to ZPOOL_IMPORT_PATH */ + cmp_name[0] = '\0'; + (void) strlcpy(path_name, cmp, sizeof (path_name)); + for (dir = strtok_r(path_name, "/", &tmp); + dir != NULL; + dir = strtok_r(NULL, "/", &tmp)) { strlcat(cmp_name, "/", sizeof (cmp_name)); strlcat(cmp_name, dir, sizeof (cmp_name)); - dir = strtok(NULL, "/"); } - free(dup); if (name[0] != '/') return (zfs_strcmp_shortname(name, cmp_name, wholedisk)); @@ -213,413 +186,3 @@ zfs_strcmp_pathname(const char *name, const char *cmp, int wholedisk) return (0); } - -/* - * Allocate and return the underlying device name for a device mapper device. - * If a device mapper device maps to multiple devices, return the first device. - * - * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a - * DM device (like /dev/disk/by-vdev/A0) are also allowed. - * - * Returns device name, or NULL on error or no match. If dm_name is not a DM - * device then return NULL. - * - * NOTE: The returned name string must be *freed*. 
- */ -static char * -dm_get_underlying_path(const char *dm_name) -{ - DIR *dp = NULL; - struct dirent *ep; - char *realp; - char *tmp = NULL; - char *path = NULL; - char *dev_str; - int size; - - if (dm_name == NULL) - return (NULL); - - /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ - realp = realpath(dm_name, NULL); - if (realp == NULL) - return (NULL); - - /* - * If they preface 'dev' with a path (like "/dev") then strip it off. - * We just want the 'dm-N' part. - */ - tmp = strrchr(realp, '/'); - if (tmp != NULL) - dev_str = tmp + 1; /* +1 since we want the chr after '/' */ - else - dev_str = tmp; - - size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str); - if (size == -1 || !tmp) - goto end; - - dp = opendir(tmp); - if (dp == NULL) - goto end; - - /* Return first sd* entry in /sys/block/dm-N/slaves/ */ - while ((ep = readdir(dp))) { - if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ - size = asprintf(&path, "/dev/%s", ep->d_name); - break; - } - } - -end: - if (dp != NULL) - closedir(dp); - free(tmp); - free(realp); - return (path); -} - -/* - * Return 1 if device is a device mapper or multipath device. - * Return 0 if not. - */ -int -zfs_dev_is_dm(const char *dev_name) -{ - - char *tmp; - tmp = dm_get_underlying_path(dev_name); - if (tmp == NULL) - return (0); - - free(tmp); - return (1); -} - -/* - * By "whole disk" we mean an entire physical disk (something we can - * label, toggle the write cache on, etc.) as opposed to the full - * capacity of a pseudo-device such as lofi or did. We act as if we - * are labeling the disk, which should be a pretty good test of whether - * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if - * it isn't. 
- */ -int -zfs_dev_is_whole_disk(const char *dev_name) -{ - struct dk_gpt *label; - int fd; - - if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0) - return (0); - - if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { - (void) close(fd); - return (0); - } - - efi_free(label); - (void) close(fd); - - return (1); -} - -/* - * Lookup the underlying device for a device name - * - * Often you'll have a symlink to a device, a partition device, - * or a multipath device, and want to look up the underlying device. - * This function returns the underlying device name. If the device - * name is already the underlying device, then just return the same - * name. If the device is a DM device with multiple underlying devices - * then return the first one. - * - * For example: - * - * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda - * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 - * returns: /dev/sda - * - * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) - * dev_name: /dev/mapper/mpatha - * returns: /dev/sda (first device) - * - * 3. /dev/sda (already the underlying device) - * dev_name: /dev/sda - * returns: /dev/sda - * - * 4. /dev/dm-3 (mapped to /dev/sda) - * dev_name: /dev/dm-3 - * returns: /dev/sda - * - * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 - * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 - * returns: /dev/sdb - * - * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 - * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a - * returns: /dev/sda - * - * Returns underlying device name, or NULL on error or no match. - * - * NOTE: The returned name string must be *freed*. 
- */ -char * -zfs_get_underlying_path(const char *dev_name) -{ - char *name = NULL; - char *tmp; - - if (dev_name == NULL) - return (NULL); - - tmp = dm_get_underlying_path(dev_name); - - /* dev_name not a DM device, so just un-symlinkize it */ - if (tmp == NULL) - tmp = realpath(dev_name, NULL); - - if (tmp != NULL) { - name = zfs_strip_partition_path(tmp); - free(tmp); - } - - return (name); -} - -/* - * Given a dev name like "sda", return the full enclosure sysfs path to - * the disk. You can also pass in the name with "/dev" prepended - * to it (like /dev/sda). - * - * For example, disk "sda" in enclosure slot 1: - * dev: "sda" - * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" - * - * 'dev' must be a non-devicemapper device. - * - * Returned string must be freed. - */ -char * -zfs_get_enclosure_sysfs_path(const char *dev_name) -{ - DIR *dp = NULL; - struct dirent *ep; - char buf[MAXPATHLEN]; - char *tmp1 = NULL; - char *tmp2 = NULL; - char *tmp3 = NULL; - char *path = NULL; - size_t size; - int tmpsize; - - if (dev_name == NULL) - return (NULL); - - /* If they preface 'dev' with a path (like "/dev") then strip it off */ - tmp1 = strrchr(dev_name, '/'); - if (tmp1 != NULL) - dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ - - tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); - if (tmpsize == -1 || tmp1 == NULL) { - tmp1 = NULL; - goto end; - } - - dp = opendir(tmp1); - if (dp == NULL) { - tmp1 = NULL; /* To make free() at the end a NOP */ - goto end; - } - - /* - * Look though all sysfs entries in /sys/block//device for - * the enclosure symlink. - */ - while ((ep = readdir(dp))) { - /* Ignore everything that's not our enclosure_device link */ - if (strstr(ep->d_name, "enclosure_device") == NULL) - continue; - - if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 || - tmp2 == NULL) - break; - - size = readlink(tmp2, buf, sizeof (buf)); - - /* Did readlink fail or crop the link name? 
*/ - if (size == -1 || size >= sizeof (buf)) { - free(tmp2); - tmp2 = NULL; /* To make free() at the end a NOP */ - break; - } - - /* - * We got a valid link. readlink() doesn't terminate strings - * so we have to do it. - */ - buf[size] = '\0'; - - /* - * Our link will look like: - * - * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" - * - * We want to grab the "enclosure/1:0:3:0/SLOT 1" part - */ - tmp3 = strstr(buf, "enclosure"); - if (tmp3 == NULL) - break; - - if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { - /* If asprintf() fails, 'path' is undefined */ - path = NULL; - break; - } - - if (path == NULL) - break; - } - -end: - free(tmp2); - free(tmp1); - - if (dp != NULL) - closedir(dp); - - return (path); -} - -/* - * Remove partition suffix from a vdev path. Partition suffixes may take three - * forms: "-partX", "pX", or "X", where X is a string of digits. The second - * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The - * third case only occurs when preceded by a string matching the regular - * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. 
- * - * caller must free the returned string - */ -char * -zfs_strip_partition(char *path) -{ - char *tmp = strdup(path); - char *part = NULL, *d = NULL; - if (!tmp) - return (NULL); - - if ((part = strstr(tmp, "-part")) && part != tmp) { - d = part + 5; - } else if ((part = strrchr(tmp, 'p')) && - part > tmp + 1 && isdigit(*(part-1))) { - d = part + 1; - } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && - tmp[1] == 'd') { - for (d = &tmp[2]; isalpha(*d); part = ++d) { } - } else if (strncmp("xvd", tmp, 3) == 0) { - for (d = &tmp[3]; isalpha(*d); part = ++d) { } - } - if (part && d && *d != '\0') { - for (; isdigit(*d); d++) { } - if (*d == '\0') - *part = '\0'; - } - - return (tmp); -} - -/* - * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname - * - * path: /dev/sda1 - * returns: /dev/sda - * - * Returned string must be freed. - */ -char * -zfs_strip_partition_path(char *path) -{ - char *newpath = strdup(path); - char *sd_offset; - char *new_sd; - - if (!newpath) - return (NULL); - - /* Point to "sda1" part of "/dev/sda1" */ - sd_offset = strrchr(newpath, '/') + 1; - - /* Get our new name "sda" */ - new_sd = zfs_strip_partition(sd_offset); - if (!new_sd) { - free(newpath); - return (NULL); - } - - /* Paste the "sda" where "sda1" was */ - strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); - - /* Free temporary "sda" */ - free(new_sd); - - return (newpath); -} - -#ifdef HAVE_LIBUDEV -/* - * A disk is considered a multipath whole disk when: - * DEVNAME key value has "dm-" - * DM_NAME key value has "mpath" prefix - * DM_UUID key exists - * ID_PART_TABLE_TYPE key does not exist or is not gpt - */ -static boolean_t -udev_mpath_whole_disk(struct udev_device *dev) -{ - const char *devname, *type, *uuid; - - devname = udev_device_get_property_value(dev, "DEVNAME"); - type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); - uuid = udev_device_get_property_value(dev, "DM_UUID"); - - if ((devname != NULL && strncmp(devname, 
"/dev/dm-", 8) == 0) && - ((type == NULL) || (strcmp(type, "gpt") != 0)) && - (uuid != NULL)) { - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * Check if a disk is effectively a multipath whole disk - */ -boolean_t -is_mpath_whole_disk(const char *path) -{ - struct udev *udev; - struct udev_device *dev = NULL; - char nodepath[MAXPATHLEN]; - char *sysname; - boolean_t wholedisk = B_FALSE; - - if (realpath(path, nodepath) == NULL) - return (B_FALSE); - sysname = strrchr(nodepath, '/') + 1; - if (strncmp(sysname, "dm-", 3) != 0) - return (B_FALSE); - if ((udev = udev_new()) == NULL) - return (B_FALSE); - if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", - sysname)) == NULL) { - udev_device_unref(dev); - return (B_FALSE); - } - - wholedisk = udev_mpath_whole_disk(dev); - - udev_device_unref(dev); - return (wholedisk); -} -#endif diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index e82744383d..9eb55aaf77 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2015 RackTop Systems. * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2021, Colm Buckley */ /* @@ -46,16 +47,12 @@ * using our derived config, and record the results. 
*/ +#include #include -#include #include #include #include #include -#ifdef HAVE_LIBUDEV -#include -#include -#endif #include #include #include @@ -65,36 +62,15 @@ #include #include #include -#include -#include #include #include #include -#define IMPORT_ORDER_PREFERRED_1 1 -#define IMPORT_ORDER_PREFERRED_2 2 -#define IMPORT_ORDER_SCAN_OFFSET 10 -#define IMPORT_ORDER_DEFAULT 100 -#define DEFAULT_IMPORT_PATH_SIZE 9 +#include "zutil_import.h" -#define EZFS_BADCACHE "invalid or missing cache file" -#define EZFS_BADPATH "must be an absolute path" -#define EZFS_NOMEM "out of memory" -#define EZFS_EACESS "some devices require root privileges" - -typedef struct libpc_handle { - boolean_t lpc_printerr; - boolean_t lpc_open_access_error; - boolean_t lpc_desc_active; - char lpc_desc[1024]; - const pool_config_ops_t *lpc_ops; - void *lpc_lib_handle; -} libpc_handle_t; - -/*PRINTFLIKE2*/ -static void -zfs_error_aux(libpc_handle_t *hdl, const char *fmt, ...) +static __attribute__((format(printf, 2, 3))) void +zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...) { va_list ap; @@ -107,7 +83,8 @@ zfs_error_aux(libpc_handle_t *hdl, const char *fmt, ...) } static void -zfs_verror(libpc_handle_t *hdl, const char *error, const char *fmt, va_list ap) +zutil_verror(libpc_handle_t *hdl, const char *error, const char *fmt, + va_list ap) { char action[1024]; @@ -126,15 +103,14 @@ zfs_verror(libpc_handle_t *hdl, const char *error, const char *fmt, va_list ap) } } -/*PRINTFLIKE3*/ -static int -zfs_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...) +static __attribute__((format(printf, 3, 4))) int +zutil_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); - zfs_verror(hdl, error, fmt, ap); + zutil_verror(hdl, error, fmt, ap); va_end(ap); @@ -142,36 +118,47 @@ zfs_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...) 
} static int -zfs_error(libpc_handle_t *hdl, const char *error, const char *msg) +zutil_error(libpc_handle_t *hdl, const char *error, const char *msg) { - return (zfs_error_fmt(hdl, error, "%s", msg)); + return (zutil_error_fmt(hdl, error, "%s", msg)); } static int -no_memory(libpc_handle_t *hdl) +zutil_no_memory(libpc_handle_t *hdl) { - zfs_error(hdl, EZFS_NOMEM, "internal error"); + zutil_error(hdl, EZFS_NOMEM, "internal error"); exit(1); } -static void * -zfs_alloc(libpc_handle_t *hdl, size_t size) +void * +zutil_alloc(libpc_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) - (void) no_memory(hdl); + (void) zutil_no_memory(hdl); return (data); } -static char * -zfs_strdup(libpc_handle_t *hdl, const char *str) +char * +zutil_strdup(libpc_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) - (void) no_memory(hdl); + (void) zutil_no_memory(hdl); + + return (ret); +} + +static char * +zutil_strndup(libpc_handle_t *hdl, const char *str, size_t n) +{ + char *ret; + + if ((ret = strndup(str, n)) == NULL) + (void) zutil_no_memory(hdl); return (ret); } @@ -210,472 +197,6 @@ typedef struct pool_list { name_entry_t *names; } pool_list_t; -#define ZVOL_ROOT "/dev/zvol" -#define DEV_BYID_PATH "/dev/disk/by-id/" - -/* - * Linux persistent device strings for vdev labels - * - * based on libudev for consistency with libudev disk add/remove events - */ - -typedef struct vdev_dev_strs { - char vds_devid[128]; - char vds_devphys[128]; -} vdev_dev_strs_t; - -#ifdef HAVE_LIBUDEV -/* - * Obtain the persistent device id string (describes what) - * - * used by ZED vdev matching for auto-{online,expand,replace} - */ -int -zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) -{ - struct udev_list_entry *entry; - const char *bus; - char devbyid[MAXPATHLEN]; - - /* The bus based by-id path is preferred */ - bus = udev_device_get_property_value(dev, "ID_BUS"); - - if (bus == NULL) { - const char *dm_uuid; - - /* - 
* For multipath nodes use the persistent uuid based identifier - * - * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f - */ - dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); - if (dm_uuid != NULL) { - (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); - return (0); - } - - /* - * For volumes use the persistent /dev/zvol/dataset identifier - */ - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - const char *name; - - name = udev_list_entry_get_name(entry); - if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { - (void) strlcpy(bufptr, name, buflen); - return (0); - } - entry = udev_list_entry_get_next(entry); - } - - /* - * NVME 'by-id' symlinks are similar to bus case - */ - struct udev_device *parent; - - parent = udev_device_get_parent_with_subsystem_devtype(dev, - "nvme", NULL); - if (parent != NULL) - bus = "nvme"; /* continue with bus symlink search */ - else - return (ENODATA); - } - - /* - * locate the bus specific by-id link - */ - (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - const char *name; - - name = udev_list_entry_get_name(entry); - if (strncmp(name, devbyid, strlen(devbyid)) == 0) { - name += strlen(DEV_BYID_PATH); - (void) strlcpy(bufptr, name, buflen); - return (0); - } - entry = udev_list_entry_get_next(entry); - } - - return (ENODATA); -} - -/* - * Obtain the persistent physical location string (describes where) - * - * used by ZED vdev matching for auto-{online,expand,replace} - */ -int -zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) -{ - const char *physpath = NULL; - struct udev_list_entry *entry; - - /* - * Normal disks use ID_PATH for their physical path. 
- */ - physpath = udev_device_get_property_value(dev, "ID_PATH"); - if (physpath != NULL && strlen(physpath) > 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); - } - - /* - * Device mapper devices are virtual and don't have a physical - * path. For them we use ID_VDEV instead, which is setup via the - * /etc/vdev_id.conf file. ID_VDEV provides a persistent path - * to a virtual device. If you don't have vdev_id.conf setup, - * you cannot use multipath autoreplace with device mapper. - */ - physpath = udev_device_get_property_value(dev, "ID_VDEV"); - if (physpath != NULL && strlen(physpath) > 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); - } - - /* - * For ZFS volumes use the persistent /dev/zvol/dataset identifier - */ - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - physpath = udev_list_entry_get_name(entry); - if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); - } - entry = udev_list_entry_get_next(entry); - } - - /* - * For all other devices fallback to using the by-uuid name. 
- */ - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - physpath = udev_list_entry_get_name(entry); - if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); - } - entry = udev_list_entry_get_next(entry); - } - - return (ENODATA); -} - -/* - * A disk is considered a multipath whole disk when: - * DEVNAME key value has "dm-" - * DM_NAME key value has "mpath" prefix - * DM_UUID key exists - * ID_PART_TABLE_TYPE key does not exist or is not gpt - */ -static boolean_t -udev_mpath_whole_disk(struct udev_device *dev) -{ - const char *devname, *type, *uuid; - - devname = udev_device_get_property_value(dev, "DEVNAME"); - type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); - uuid = udev_device_get_property_value(dev, "DM_UUID"); - - if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && - ((type == NULL) || (strcmp(type, "gpt") != 0)) && - (uuid != NULL)) { - return (B_TRUE); - } - - return (B_FALSE); -} - -static int -udev_device_is_ready(struct udev_device *dev) -{ -#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED - return (udev_device_get_is_initialized(dev)); -#else - /* wait for DEVLINKS property to be initialized */ - return (udev_device_get_property_value(dev, "DEVLINKS") != NULL); -#endif -} -#endif /* HAVE_LIBUDEV */ - -/* - * Wait up to timeout_ms for udev to set up the device node. The device is - * considered ready when libudev determines it has been initialized, all of - * the device links have been verified to exist, and it has been allowed to - * settle. At this point the device the device can be accessed reliably. - * Depending on the complexity of the udev rules this process could take - * several seconds. 
- */ -int -zpool_label_disk_wait(const char *path, int timeout_ms) -{ -#ifdef HAVE_LIBUDEV - struct udev *udev; - struct udev_device *dev = NULL; - char nodepath[MAXPATHLEN]; - char *sysname = NULL; - int ret = ENODEV; - int settle_ms = 50; - long sleep_ms = 10; - hrtime_t start, settle; - - if ((udev = udev_new()) == NULL) - return (ENXIO); - - start = gethrtime(); - settle = 0; - - do { - if (sysname == NULL) { - if (realpath(path, nodepath) != NULL) { - sysname = strrchr(nodepath, '/') + 1; - } else { - (void) usleep(sleep_ms * MILLISEC); - continue; - } - } - - dev = udev_device_new_from_subsystem_sysname(udev, - "block", sysname); - if ((dev != NULL) && udev_device_is_ready(dev)) { - struct udev_list_entry *links, *link = NULL; - - ret = 0; - links = udev_device_get_devlinks_list_entry(dev); - - udev_list_entry_foreach(link, links) { - struct stat64 statbuf; - const char *name; - - name = udev_list_entry_get_name(link); - errno = 0; - if (stat64(name, &statbuf) == 0 && errno == 0) - continue; - - settle = 0; - ret = ENODEV; - break; - } - - if (ret == 0) { - if (settle == 0) { - settle = gethrtime(); - } else if (NSEC2MSEC(gethrtime() - settle) >= - settle_ms) { - udev_device_unref(dev); - break; - } - } - } - - udev_device_unref(dev); - (void) usleep(sleep_ms * MILLISEC); - - } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); - - udev_unref(udev); - - return (ret); -#else - int settle_ms = 50; - long sleep_ms = 10; - hrtime_t start, settle; - struct stat64 statbuf; - - start = gethrtime(); - settle = 0; - - do { - errno = 0; - if ((stat64(path, &statbuf) == 0) && (errno == 0)) { - if (settle == 0) - settle = gethrtime(); - else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) - return (0); - } else if (errno != ENOENT) { - return (errno); - } - - usleep(sleep_ms * MILLISEC); - } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); - - return (ENODEV); -#endif /* HAVE_LIBUDEV */ -} - -/* - * Encode the persistent devices strings - * used for the vdev disk 
label - */ -static int -encode_device_strings(const char *path, vdev_dev_strs_t *ds, - boolean_t wholedisk) -{ -#ifdef HAVE_LIBUDEV - struct udev *udev; - struct udev_device *dev = NULL; - char nodepath[MAXPATHLEN]; - char *sysname; - int ret = ENODEV; - hrtime_t start; - - if ((udev = udev_new()) == NULL) - return (ENXIO); - - /* resolve path to a runtime device node instance */ - if (realpath(path, nodepath) == NULL) - goto no_dev; - - sysname = strrchr(nodepath, '/') + 1; - - /* - * Wait up to 3 seconds for udev to set up the device node context - */ - start = gethrtime(); - do { - dev = udev_device_new_from_subsystem_sysname(udev, "block", - sysname); - if (dev == NULL) - goto no_dev; - if (udev_device_is_ready(dev)) - break; /* udev ready */ - - udev_device_unref(dev); - dev = NULL; - - if (NSEC2MSEC(gethrtime() - start) < 10) - (void) sched_yield(); /* yield/busy wait up to 10ms */ - else - (void) usleep(10 * MILLISEC); - - } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC)); - - if (dev == NULL) - goto no_dev; - - /* - * Only whole disks require extra device strings - */ - if (!wholedisk && !udev_mpath_whole_disk(dev)) - goto no_dev; - - ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); - if (ret != 0) - goto no_dev_ref; - - /* physical location string (optional) */ - if (zfs_device_get_physical(dev, ds->vds_devphys, - sizeof (ds->vds_devphys)) != 0) { - ds->vds_devphys[0] = '\0'; /* empty string --> not available */ - } - -no_dev_ref: - udev_device_unref(dev); -no_dev: - udev_unref(udev); - - return (ret); -#else - return (ENOENT); -#endif -} - -/* - * Update a leaf vdev's persistent device strings (Linux only) - * - * - only applies for a dedicated leaf vdev (aka whole disk) - * - updated during pool create|add|attach|import - * - used for matching device matching during auto-{online,expand,replace} - * - stored in a leaf disk config label (i.e. alongside 'path' NVP) - * - these strings are currently not used in kernel (i.e. 
for vdev_disk_open) - * - * single device node example: - * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' - * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' - * - * multipath device node example: - * devid: 'dm-uuid-mpath-35000c5006304de3f' - * - * We also store the enclosure sysfs path for turning on enclosure LEDs - * (if applicable): - * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' - */ -void -update_vdev_config_dev_strs(nvlist_t *nv) -{ - vdev_dev_strs_t vds; - char *env, *type, *path; - uint64_t wholedisk = 0; - char *upath, *spath; - - /* - * For the benefit of legacy ZFS implementations, allow - * for opting out of devid strings in the vdev label. - * - * example use: - * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer - * - * explanation: - * Older ZFS on Linux implementations had issues when attempting to - * display pool config VDEV names if a "devid" NVP value is present - * in the pool's config. - * - * For example, a pool that originated on illumos platform would - * have a devid value in the config and "zpool status" would fail - * when listing the config. - * - * A pool can be stripped of any "devid" values on import or - * prevented from adding them on zpool create|add by setting - * ZFS_VDEV_DEVID_OPT_OUT. 
- */ - env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); - if (env && (strtoul(env, NULL, 0) > 0 || - !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); - return; - } - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || - strcmp(type, VDEV_TYPE_DISK) != 0) { - return; - } - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) - return; - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); - - /* - * Update device string values in config nvlist - */ - if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { - (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); - if (vds.vds_devphys[0] != '\0') { - (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, - vds.vds_devphys); - } - - /* Add enclosure sysfs path (if disk is in an enclosure) */ - upath = zfs_get_underlying_path(path); - spath = zfs_get_enclosure_sysfs_path(upath); - if (spath) - nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, - spath); - else - nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); - - free(upath); - free(spath); - } else { - /* clear out any stale entries */ - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); - } -} - /* * Go through and fix up any path and/or devid information for the given vdev * configuration. 
@@ -753,7 +274,6 @@ fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names) if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) return (-1); - /* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */ update_vdev_config_dev_strs(nv); return (0); @@ -781,10 +301,10 @@ add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, &state) == 0 && (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { - if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) + if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL) return (-1); - if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { + if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) { free(ne); return (-1); } @@ -826,7 +346,7 @@ add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, } if (pe == NULL) { - if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) { + if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) { return (-1); } pe->pe_guid = pool_guid; @@ -844,7 +364,7 @@ add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, } if (ve == NULL) { - if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { + if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { return (-1); } ve->ve_guid = top_guid; @@ -863,7 +383,7 @@ add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, } if (ce == NULL) { - if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) { + if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) { return (-1); } ce->ce_txg = txg; @@ -878,10 +398,10 @@ add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, * mappings so that we can fix up the configuration as necessary before * doing the import. 
*/ - if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) + if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL) return (-1); - if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { + if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) { free(ne); return (-1); } @@ -896,7 +416,7 @@ add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, } static int -pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid, +zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid, boolean_t *isactive) { ASSERT(hdl->lpc_ops->pco_pool_active != NULL); @@ -908,7 +428,7 @@ pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid, } static nvlist_t * -refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig) +zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig) { ASSERT(hdl->lpc_ops->pco_refresh_config != NULL); @@ -1041,12 +561,14 @@ get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, * pool guid * name * comment (if available) + * compatibility features (if available) * pool state * hostid (if available) * hostname (if available) */ uint64_t state, version; char *comment = NULL; + char *compatibility = NULL; version = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_VERSION); @@ -1066,6 +588,13 @@ get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, comment); + if (nvlist_lookup_string(tmp, + ZPOOL_CONFIG_COMPATIBILITY, + &compatibility) == 0) + fnvlist_add_string(config, + ZPOOL_CONFIG_COMPATIBILITY, + compatibility); + state = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_POOL_STATE); fnvlist_add_uint64(config, @@ -1096,7 +625,7 @@ get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, if (id >= children) { nvlist_t **newchild; - newchild = zfs_alloc(hdl, (id + 1) * + newchild = zutil_alloc(hdl, (id + 1) * sizeof (nvlist_t *)); if (newchild == NULL) goto nomem; @@ -1128,7 +657,7 @@ get_configs(libpc_handle_t *hdl, pool_list_t *pl, 
boolean_t active_ok, } else if (max_id > children) { nvlist_t **newchild; - newchild = zfs_alloc(hdl, (max_id) * + newchild = zutil_alloc(hdl, (max_id) * sizeof (nvlist_t *)); if (newchild == NULL) goto nomem; @@ -1266,7 +795,7 @@ get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); - if (pool_active(hdl, name, guid, &isactive) != 0) + if (zutil_pool_active(hdl, name, guid, &isactive) != 0) goto error; if (isactive) { @@ -1281,7 +810,7 @@ get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, goto nomem; } - if ((nvl = refresh_config(hdl, config)) == NULL) { + if ((nvl = zutil_refresh_config(hdl, config)) == NULL) { nvlist_free(config); config = NULL; continue; @@ -1346,7 +875,7 @@ add_pool: return (ret); nomem: - (void) no_memory(hdl); + (void) zutil_no_memory(hdl); error: nvlist_free(config); nvlist_free(ret); @@ -1369,16 +898,16 @@ label_offset(uint64_t size, int l) } /* - * Given a file descriptor, read the label information and return an nvlist - * describing the configuration, if there is one. The number of valid - * labels found will be returned in num_labels when non-NULL. + * The same description applies as to zpool_read_label below, + * except here we do it without aio, presumably because an aio call + * errored out in a way we think not using it could circumvent. 
*/ -int -zpool_read_label(int fd, nvlist_t **config, int *num_labels) +static int +zpool_read_label_slow(int fd, nvlist_t **config, int *num_labels) { struct stat64 statbuf; int l, count = 0; - vdev_label_t *label; + vdev_phys_t *label; nvlist_t *expected_config = NULL; uint64_t expected_guid = 0, size; int error; @@ -1395,13 +924,14 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) for (l = 0; l < VDEV_LABELS; l++) { uint64_t state, guid, txg; + off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE; - if (pread64(fd, label, sizeof (vdev_label_t), - label_offset(size, l)) != sizeof (vdev_label_t)) + if (pread64(fd, label, sizeof (vdev_phys_t), + offset) != sizeof (vdev_phys_t)) continue; - if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, - sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) + if (nvlist_unpack(label->vp_nvlist, + sizeof (label->vp_nvlist), config, 0) != 0) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, @@ -1444,18 +974,137 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) return (0); } -typedef struct rdsk_node { - char *rn_name; /* Full path to device */ - int rn_order; /* Preferred order (low to high) */ - int rn_num_labels; /* Number of valid labels */ - uint64_t rn_vdev_guid; /* Expected vdev guid when set */ - libpc_handle_t *rn_hdl; - nvlist_t *rn_config; /* Label config */ - avl_tree_t *rn_avl; - avl_node_t rn_node; - pthread_mutex_t *rn_lock; - boolean_t rn_labelpaths; -} rdsk_node_t; +/* + * Given a file descriptor, read the label information and return an nvlist + * describing the configuration, if there is one. The number of valid + * labels found will be returned in num_labels when non-NULL. 
+ */ +int +zpool_read_label(int fd, nvlist_t **config, int *num_labels) +{ + struct stat64 statbuf; + struct aiocb aiocbs[VDEV_LABELS]; + struct aiocb *aiocbps[VDEV_LABELS]; + vdev_phys_t *labels; + nvlist_t *expected_config = NULL; + uint64_t expected_guid = 0, size; + int error, l, count = 0; + + *config = NULL; + + if (fstat64_blk(fd, &statbuf) == -1) + return (0); + size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); + + error = posix_memalign((void **)&labels, PAGESIZE, + VDEV_LABELS * sizeof (*labels)); + if (error) + return (-1); + + memset(aiocbs, 0, sizeof (aiocbs)); + for (l = 0; l < VDEV_LABELS; l++) { + off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE; + + aiocbs[l].aio_fildes = fd; + aiocbs[l].aio_offset = offset; + aiocbs[l].aio_buf = &labels[l]; + aiocbs[l].aio_nbytes = sizeof (vdev_phys_t); + aiocbs[l].aio_lio_opcode = LIO_READ; + aiocbps[l] = &aiocbs[l]; + } + + if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) { + int saved_errno = errno; + boolean_t do_slow = B_FALSE; + error = -1; + + if (errno == EAGAIN || errno == EINTR || errno == EIO) { + /* + * A portion of the requests may have been submitted. + * Clean them up. + */ + for (l = 0; l < VDEV_LABELS; l++) { + errno = 0; + switch (aio_error(&aiocbs[l])) { + case EINVAL: + break; + case EINPROGRESS: + // This shouldn't be possible to + // encounter, die if we do. + ASSERT(B_FALSE); + fallthrough; + case EOPNOTSUPP: + case ENOSYS: + do_slow = B_TRUE; + fallthrough; + case 0: + default: + (void) aio_return(&aiocbs[l]); + } + } + } + if (do_slow) { + /* + * At least some IO involved access unsafe-for-AIO + * files. Let's try again, without AIO this time. 
+ */ + error = zpool_read_label_slow(fd, config, num_labels); + saved_errno = errno; + } + free(labels); + errno = saved_errno; + return (error); + } + + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t state, guid, txg; + + if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t)) + continue; + + if (nvlist_unpack(labels[l].vp_nvlist, + sizeof (labels[l].vp_nvlist), config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid == 0) { + nvlist_free(*config); + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(*config); + continue; + } + + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(*config); + continue; + } + + if (expected_guid) { + if (expected_guid == guid) + count++; + + nvlist_free(*config); + } else { + expected_config = *config; + expected_guid = guid; + count++; + } + } + + if (num_labels != NULL) + *num_labels = count; + + free(labels); + *config = expected_config; + + return (0); +} /* * Sorted by full path and then vdev guid to allow for multiple entries with @@ -1466,7 +1115,7 @@ typedef struct rdsk_node { * include overwritten pool labels, devices which are visible from multiple * hosts and multipath devices. 
*/ -static int +int slice_cache_compare(const void *arg1, const void *arg2) { const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; @@ -1475,25 +1124,11 @@ slice_cache_compare(const void *arg1, const void *arg2) uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; int rv; - rv = AVL_ISIGN(strcmp(nm1, nm2)); + rv = TREE_ISIGN(strcmp(nm1, nm2)); if (rv) return (rv); - return (AVL_CMP(guid1, guid2)); -} - -static boolean_t -is_watchdog_dev(char *dev) -{ - /* For 'watchdog' dev */ - if (strcmp(dev, "watchdog") == 0) - return (B_TRUE); - - /* For 'watchdog */ - if (strstr(dev, "watchdog") == dev && isdigit(dev[8])) - return (B_TRUE); - - return (B_FALSE); + return (TREE_CMP(guid1, guid2)); } static int @@ -1540,7 +1175,7 @@ label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid, * and store these strings as config_path and devid_path respectively. * The returned pointers are only valid as long as label remains valid. */ -static int +int label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid) { nvlist_t *nvroot; @@ -1559,158 +1194,6 @@ label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid) devid)); } -static void -zpool_open_func(void *arg) -{ - rdsk_node_t *rn = arg; - libpc_handle_t *hdl = rn->rn_hdl; - struct stat64 statbuf; - nvlist_t *config; - char *bname, *dupname; - uint64_t vdev_guid = 0; - int error; - int num_labels = 0; - int fd; - - /* - * Skip devices with well known prefixes there can be side effects - * when opening devices which need to be avoided. - * - * hpet - High Precision Event Timer - * watchdog - Watchdog must be closed in a special way. - */ - dupname = zfs_strdup(hdl, rn->rn_name); - bname = basename(dupname); - error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname)); - free(dupname); - if (error) - return; - - /* - * Ignore failed stats. We only want regular files and block devices. 
- */ - if (stat64(rn->rn_name, &statbuf) != 0 || - (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) - return; - - /* - * Preferentially open using O_DIRECT to bypass the block device - * cache which may be stale for multipath devices. An EINVAL errno - * indicates O_DIRECT is unsupported so fallback to just O_RDONLY. - */ - fd = open(rn->rn_name, O_RDONLY | O_DIRECT); - if ((fd < 0) && (errno == EINVAL)) - fd = open(rn->rn_name, O_RDONLY); - - if ((fd < 0) && (errno == EACCES)) - hdl->lpc_open_access_error = B_TRUE; - - if (fd < 0) - return; - - /* - * This file is too small to hold a zpool - */ - if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) { - (void) close(fd); - return; - } - - error = zpool_read_label(fd, &config, &num_labels); - if (error != 0) { - (void) close(fd); - return; - } - - if (num_labels == 0) { - (void) close(fd); - nvlist_free(config); - return; - } - - /* - * Check that the vdev is for the expected guid. Additional entries - * are speculatively added based on the paths stored in the labels. - * Entries with valid paths but incorrect guids must be removed. - */ - error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); - if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { - (void) close(fd); - nvlist_free(config); - return; - } - - (void) close(fd); - - rn->rn_config = config; - rn->rn_num_labels = num_labels; - - /* - * Add additional entries for paths described by this label. - */ - if (rn->rn_labelpaths) { - char *path = NULL; - char *devid = NULL; - rdsk_node_t *slice; - avl_index_t where; - int error; - - if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) - return; - - /* - * Allow devlinks to stabilize so all paths are available. 
- */ - zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT); - - if (path != NULL) { - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - slice->rn_name = zfs_strdup(hdl, path); - slice->rn_vdev_guid = vdev_guid; - slice->rn_avl = rn->rn_avl; - slice->rn_hdl = hdl; - slice->rn_order = IMPORT_ORDER_PREFERRED_1; - slice->rn_labelpaths = B_FALSE; - pthread_mutex_lock(rn->rn_lock); - if (avl_find(rn->rn_avl, slice, &where)) { - pthread_mutex_unlock(rn->rn_lock); - free(slice->rn_name); - free(slice); - } else { - avl_insert(rn->rn_avl, slice, where); - pthread_mutex_unlock(rn->rn_lock); - zpool_open_func(slice); - } - } - - if (devid != NULL) { - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - error = asprintf(&slice->rn_name, "%s%s", - DEV_BYID_PATH, devid); - if (error == -1) { - free(slice); - return; - } - - slice->rn_vdev_guid = vdev_guid; - slice->rn_avl = rn->rn_avl; - slice->rn_hdl = hdl; - slice->rn_order = IMPORT_ORDER_PREFERRED_2; - slice->rn_labelpaths = B_FALSE; - pthread_mutex_lock(rn->rn_lock); - if (avl_find(rn->rn_avl, slice, &where)) { - pthread_mutex_unlock(rn->rn_lock); - free(slice->rn_name); - free(slice); - } else { - avl_insert(rn->rn_avl, slice, where); - pthread_mutex_unlock(rn->rn_lock); - zpool_open_func(slice); - } - } - } -} - static void zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t *cache, const char *path, const char *name, int order) @@ -1718,7 +1201,7 @@ zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_index_t where; rdsk_node_t *slice; - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) { free(slice); return; @@ -1754,8 +1237,8 @@ zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock, if (error == ENOENT) return (0); - zfs_error_aux(hdl, strerror(error)); - (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext( + zutil_error_aux(hdl, "%s", 
strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir); return (error); } @@ -1763,18 +1246,30 @@ zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock, dirp = opendir(path); if (dirp == NULL) { error = errno; - zfs_error_aux(hdl, strerror(error)); - (void) zfs_error_fmt(hdl, EZFS_BADPATH, + zutil_error_aux(hdl, "%s", strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); return (error); } while ((dp = readdir64(dirp)) != NULL) { const char *name = dp->d_name; - if (name[0] == '.' && - (name[1] == 0 || (name[1] == '.' && name[2] == 0))) + if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) continue; + switch (dp->d_type) { + case DT_UNKNOWN: + case DT_BLK: + case DT_LNK: +#ifdef __FreeBSD__ + case DT_CHR: +#endif + case DT_REG: + break; + default: + continue; + } + zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order); } @@ -1789,20 +1284,22 @@ zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock, { int error = 0; char path[MAXPATHLEN]; - char *d, *b; - char *dpath, *name; + char *d = NULL; + ssize_t dl; + const char *dpath, *name; /* - * Seperate the directory part and last part of the - * path. We do this so that we can get the realpath of + * Separate the directory and the basename. + * We do this so that we can get the realpath of * the directory. We don't get the realpath on the * whole path because if it's a symlink, we want the * path of the symlink not where it points to. 
*/ - d = zfs_strdup(hdl, dir); - b = zfs_strdup(hdl, dir); - dpath = dirname(d); - name = basename(b); + name = zfs_basename(dir); + if ((dl = zfs_dirnamelen(dir)) == -1) + dpath = "."; + else + dpath = d = zutil_strndup(hdl, dir, dl); if (realpath(dpath, path) == NULL) { error = errno; @@ -1811,8 +1308,8 @@ zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock, goto out; } - zfs_error_aux(hdl, strerror(error)); - (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext( + zutil_error_aux(hdl, "%s", strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir); goto out; } @@ -1820,7 +1317,6 @@ zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock, zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order); out: - free(b); free(d); return (error); } @@ -1830,7 +1326,7 @@ out: */ static int zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock, - avl_tree_t **slice_cache, char **dir, int dirs) + avl_tree_t **slice_cache, const char * const *dir, size_t dirs) { avl_tree_t *cache; rdsk_node_t *slice; @@ -1838,7 +1334,7 @@ zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock, int i, error; *slice_cache = NULL; - cache = zfs_alloc(hdl, sizeof (avl_tree_t)); + cache = zutil_alloc(hdl, sizeof (avl_tree_t)); avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); @@ -1850,15 +1346,15 @@ zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock, if (error == ENOENT) continue; - zfs_error_aux(hdl, strerror(error)); - (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext( + zutil_error_aux(hdl, "%s", strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]); goto error; } /* * If dir[i] is a directory, we walk through it and add all - * the entry to the cache. If it's not a directory, we just + * the entries to the cache. 
If it's not a directory, we just * add it to the cache. */ if (S_ISDIR(sbuf.st_mode)) { @@ -1886,139 +1382,6 @@ error: return (error); } -static char * -zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = { - "/dev/disk/by-vdev", /* Custom rules, use first if they exist */ - "/dev/mapper", /* Use multipath devices before components */ - "/dev/disk/by-partlabel", /* Single unique entry set by user */ - "/dev/disk/by-partuuid", /* Generated partition uuid */ - "/dev/disk/by-label", /* Custom persistent labels */ - "/dev/disk/by-uuid", /* Single unique entry and persistent */ - "/dev/disk/by-id", /* May be multiple entries and persistent */ - "/dev/disk/by-path", /* Encodes physical location and persistent */ - "/dev" /* UNSAFE device names will change */ -}; - -const char * const * -zpool_default_search_paths(size_t *count) -{ - *count = DEFAULT_IMPORT_PATH_SIZE; - return ((const char * const *)zpool_default_import_path); -} - -/* - * Given a full path to a device determine if that device appears in the - * import search path. If it does return the first match and store the - * index in the passed 'order' variable, otherwise return an error. - */ -static int -zfs_path_order(char *name, int *order) -{ - int i = 0, error = ENOENT; - char *dir, *env, *envdup; - - env = getenv("ZPOOL_IMPORT_PATH"); - if (env) { - envdup = strdup(env); - dir = strtok(envdup, ":"); - while (dir) { - if (strncmp(name, dir, strlen(dir)) == 0) { - *order = i; - error = 0; - break; - } - dir = strtok(NULL, ":"); - i++; - } - free(envdup); - } else { - for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) { - if (strncmp(name, zpool_default_import_path[i], - strlen(zpool_default_import_path[i])) == 0) { - *order = i; - error = 0; - break; - } - } - } - - return (error); -} - -/* - * Use libblkid to quickly enumerate all known zfs devices. 
- */ -static int -zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, - avl_tree_t **slice_cache) -{ - rdsk_node_t *slice; - blkid_cache cache; - blkid_dev_iterate iter; - blkid_dev dev; - avl_index_t where; - int error; - - *slice_cache = NULL; - - error = blkid_get_cache(&cache, NULL); - if (error != 0) - return (error); - - error = blkid_probe_all_new(cache); - if (error != 0) { - blkid_put_cache(cache); - return (error); - } - - iter = blkid_dev_iterate_begin(cache); - if (iter == NULL) { - blkid_put_cache(cache); - return (EINVAL); - } - - error = blkid_dev_set_search(iter, "TYPE", "zfs_member"); - if (error != 0) { - blkid_dev_iterate_end(iter); - blkid_put_cache(cache); - return (error); - } - - *slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t)); - avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), - offsetof(rdsk_node_t, rn_node)); - - while (blkid_dev_next(iter, &dev) == 0) { - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev)); - slice->rn_vdev_guid = 0; - slice->rn_lock = lock; - slice->rn_avl = *slice_cache; - slice->rn_hdl = hdl; - slice->rn_labelpaths = B_TRUE; - - error = zfs_path_order(slice->rn_name, &slice->rn_order); - if (error == 0) - slice->rn_order += IMPORT_ORDER_SCAN_OFFSET; - else - slice->rn_order = IMPORT_ORDER_DEFAULT; - - pthread_mutex_lock(lock); - if (avl_find(*slice_cache, slice, &where)) { - free(slice->rn_name); - free(slice); - } else { - avl_insert(*slice_cache, slice, where); - } - pthread_mutex_unlock(lock); - } - - blkid_dev_iterate_end(iter); - blkid_put_cache(cache); - - return (0); -} - /* * Given a list of directories to search, find all pools stored on disk. This * includes partial pools which are not available to import. If no args are @@ -2027,7 +1390,8 @@ zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, * to import a specific pool. 
*/ static nvlist_t * -zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) +zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg, + pthread_mutex_t *lock, avl_tree_t *cache) { nvlist_t *ret = NULL; pool_list_t pools = { 0 }; @@ -2035,36 +1399,11 @@ zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; - pthread_mutex_t lock; - avl_tree_t *cache; rdsk_node_t *slice; void *cookie; tpool_t *t; verify(iarg->poolname == NULL || iarg->guid == 0); - pthread_mutex_init(&lock, NULL); - - /* - * Locate pool member vdevs using libblkid or by directory scanning. - * On success a newly allocated AVL tree which is populated with an - * entry for each discovered vdev will be returned as the cache. - * It's the callers responsibility to consume and destroy this tree. - */ - if (iarg->scan || iarg->paths != 0) { - int dirs = iarg->paths; - char **dir = iarg->path; - - if (dirs == 0) { - dir = zpool_default_import_path; - dirs = DEFAULT_IMPORT_PATH_SIZE; - } - - if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0) - return (NULL); - } else { - if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) - return (NULL); - } /* * Create a thread pool to parallelize the process of reading and @@ -2080,8 +1419,8 @@ zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) tpool_destroy(t); /* - * Process the cache filtering out any entries which are not - * for the specificed pool then adding matching label configs. + * Process the cache, filtering out any entries which are not + * for the specified pool then adding matching label configs. */ cookie = NULL; while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) { @@ -2128,7 +1467,8 @@ zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) * would prevent a zdb -e of active pools with * no cachefile. 
*/ - fd = open(slice->rn_name, O_RDONLY | O_EXCL); + fd = open(slice->rn_name, + O_RDONLY | O_EXCL | O_CLOEXEC); if (fd >= 0 || iarg->can_be_active) { if (fd >= 0) close(fd); @@ -2144,7 +1484,6 @@ zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) } avl_destroy(cache); free(cache); - pthread_mutex_destroy(&lock); ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy); @@ -2171,14 +1510,48 @@ zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) return (ret); } +/* + * Given a config, discover the paths for the devices which + * exist in the config. + */ +static int +discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv, + avl_tree_t *cache, pthread_mutex_t *lock) +{ + char *path = NULL; + ssize_t dl; + uint_t children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (int c = 0; c < children; c++) { + discover_cached_paths(hdl, child[c], cache, lock); + } + } + + /* + * Once we have the path, we need to add the directory to + * our directory cache. + */ + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { + if ((dl = zfs_dirnamelen(path)) == -1) + path = "."; + else + path[dl] = '\0'; + return (zpool_find_import_scan_dir(hdl, lock, cache, + path, 0)); + } + return (0); +} + /* * Given a cache file, return the contents as a list of importable pools. * poolname or guid (but not both) are provided by the caller when trying * to import a specific pool. 
*/ static nvlist_t * -zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, - const char *poolname, uint64_t guid) +zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) { char *buf; int fd; @@ -2190,24 +1563,24 @@ zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, uint64_t this_guid; boolean_t active; - verify(poolname == NULL || guid == 0); + verify(iarg->poolname == NULL || iarg->guid == 0); - if ((fd = open(cachefile, O_RDONLY)) < 0) { - zfs_error_aux(hdl, "%s", strerror(errno)); - (void) zfs_error(hdl, EZFS_BADCACHE, + if ((fd = open(iarg->cachefile, O_RDONLY | O_CLOEXEC)) < 0) { + zutil_error_aux(hdl, "%s", strerror(errno)); + (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to open cache file")); return (NULL); } if (fstat64(fd, &statbuf) != 0) { - zfs_error_aux(hdl, "%s", strerror(errno)); + zutil_error_aux(hdl, "%s", strerror(errno)); (void) close(fd); - (void) zfs_error(hdl, EZFS_BADCACHE, + (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to get size of cache file")); return (NULL); } - if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) { + if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) { (void) close(fd); return (NULL); } @@ -2215,7 +1588,7 @@ zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) close(fd); free(buf); - (void) zfs_error(hdl, EZFS_BADCACHE, + (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to read cache file contents")); return (NULL); @@ -2225,7 +1598,7 @@ zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) { free(buf); - (void) zfs_error(hdl, EZFS_BADCACHE, + (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "invalid or corrupt cache file contents")); return (NULL); @@ -2238,7 +1611,7 @@ zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, * 
state. */ if (nvlist_alloc(&pools, 0, 0) != 0) { - (void) no_memory(hdl); + (void) zutil_no_memory(hdl); nvlist_free(raw); return (NULL); } @@ -2248,14 +1621,14 @@ zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, src = fnvpair_value_nvlist(elem); name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); - if (poolname != NULL && strcmp(poolname, name) != 0) + if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0) continue; this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); - if (guid != 0 && guid != this_guid) + if (iarg->guid != 0 && iarg->guid != this_guid) continue; - if (pool_active(hdl, name, this_guid, &active) != 0) { + if (zutil_pool_active(hdl, name, this_guid, &active) != 0) { nvlist_free(raw); nvlist_free(pools); return (NULL); @@ -2264,22 +1637,84 @@ zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, if (active) continue; + if (iarg->scan) { + uint64_t saved_guid = iarg->guid; + const char *saved_poolname = iarg->poolname; + pthread_mutex_t lock; + + /* + * Create the device cache that will hold the + * devices we will scan based on the cachefile. + * This will get destroyed and freed by + * zpool_find_import_impl. + */ + avl_tree_t *cache = zutil_alloc(hdl, + sizeof (avl_tree_t)); + avl_create(cache, slice_cache_compare, + sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + nvlist_t *nvroot = fnvlist_lookup_nvlist(src, + ZPOOL_CONFIG_VDEV_TREE); + + /* + * We only want to find the pool with this_guid. + * We will reset these values back later. + */ + iarg->guid = this_guid; + iarg->poolname = NULL; + + /* + * We need to build up a cache of devices that exists + * in the paths pointed to by the cachefile. This allows + * us to preserve the device namespace that was + * originally specified by the user but also lets us + * scan devices in those directories in case they had + * been renamed. 
+ */ + pthread_mutex_init(&lock, NULL); + discover_cached_paths(hdl, nvroot, cache, &lock); + nvlist_t *nv = zpool_find_import_impl(hdl, iarg, + &lock, cache); + pthread_mutex_destroy(&lock); + + /* + * zpool_find_import_impl will return back + * a list of pools that it found based on the + * device cache. There should only be one pool + * since we're looking for a specific guid. + * We will use that pool to build up the final + * pool nvlist which is returned back to the + * caller. + */ + nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + fnvlist_add_nvlist(pools, nvpair_name(pair), + fnvpair_value_nvlist(pair)); + + VERIFY3P(nvlist_next_nvpair(nv, pair), ==, NULL); + + iarg->guid = saved_guid; + iarg->poolname = saved_poolname; + continue; + } + if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, - cachefile) != 0) { - (void) no_memory(hdl); + iarg->cachefile) != 0) { + (void) zutil_no_memory(hdl); nvlist_free(raw); nvlist_free(pools); return (NULL); } - if ((dst = refresh_config(hdl, src)) == NULL) { + update_vdevs_config_dev_sysfs_path(src); + + if ((dst = zutil_refresh_config(hdl, src)) == NULL) { nvlist_free(raw); nvlist_free(pools); return (NULL); } if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) { - (void) no_memory(hdl); + (void) zutil_no_memory(hdl); nvlist_free(dst); nvlist_free(raw); nvlist_free(pools); @@ -2287,11 +1722,51 @@ zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, } nvlist_free(dst); } - nvlist_free(raw); return (pools); } +static nvlist_t * +zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg) +{ + pthread_mutex_t lock; + avl_tree_t *cache; + nvlist_t *pools = NULL; + + verify(iarg->poolname == NULL || iarg->guid == 0); + pthread_mutex_init(&lock, NULL); + + /* + * Locate pool member vdevs by blkid or by directory scanning. + * On success a newly allocated AVL tree which is populated with an + * entry for each discovered vdev will be returned in the cache. 
+ * It's the caller's responsibility to consume and destroy this tree. + */ + if (iarg->scan || iarg->paths != 0) { + size_t dirs = iarg->paths; + const char * const *dir = (const char * const *)iarg->path; + + if (dirs == 0) + dir = zpool_default_search_paths(&dirs); + + if (zpool_find_import_scan(hdl, &lock, &cache, + dir, dirs) != 0) { + pthread_mutex_destroy(&lock); + return (NULL); + } + } else { + if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) { + pthread_mutex_destroy(&lock); + return (NULL); + } + } + + pools = zpool_find_import_impl(hdl, iarg, &lock, cache); + pthread_mutex_destroy(&lock); + return (pools); +} + + nvlist_t * zpool_search_import(void *hdl, importargs_t *import, const pool_config_ops_t *pco) @@ -2306,14 +1781,13 @@ zpool_search_import(void *hdl, importargs_t *import, verify(import->poolname == NULL || import->guid == 0); if (import->cachefile != NULL) - pools = zpool_find_import_cached(&handle, import->cachefile, - import->poolname, import->guid); + pools = zpool_find_import_cached(&handle, import); else - pools = zpool_find_import_impl(&handle, import); + pools = zpool_find_import(&handle, import); if ((pools == NULL || nvlist_empty(pools)) && handle.lpc_open_access_error && geteuid() != 0) { - (void) zfs_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN, + (void) zutil_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN, "no pools found")); } @@ -2343,17 +1817,14 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, nvlist_t *pools; nvlist_t *match = NULL; nvlist_t *config = NULL; - char *name = NULL, *sepp = NULL; - char sep = '\0'; + char *sepp = NULL; int count = 0; char *targetdup = strdup(target); *configp = NULL; - if ((sepp = strpbrk(targetdup, "/@")) != NULL) { - sep = *sepp; + if ((sepp = strpbrk(targetdup, "/@")) != NULL) *sepp = '\0'; - } pools = zpool_search_import(hdl, args, pco); @@ -2367,11 +1838,11 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, /* multiple matches found */ 
continue; } else { - match = config; - name = nvpair_name(elem); + match = fnvlist_dup(config); } } } + fnvlist_free(pools); } if (count == 0) { @@ -2381,6 +1852,7 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, if (count > 1) { free(targetdup); + fnvlist_free(match); return (EINVAL); } @@ -2389,3 +1861,69 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, return (0); } + +/* + * Internal function for iterating over the vdevs. + * + * For each vdev, func() will be called and will be passed 'zhp' (which is + * typically the zpool_handle_t cast as a void pointer), the vdev's nvlist, and + * a user-defined data pointer). + * + * The return values from all the func() calls will be OR'd together and + * returned. + */ +int +for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, + void *data) +{ + nvlist_t **child; + uint_t c, children; + int ret = 0; + int i; + char *type; + + const char *list[] = { + ZPOOL_CONFIG_SPARES, + ZPOOL_CONFIG_L2CACHE, + ZPOOL_CONFIG_CHILDREN + }; + + for (i = 0; i < ARRAY_SIZE(list); i++) { + if (nvlist_lookup_nvlist_array(nv, list[i], &child, + &children) == 0) { + for (c = 0; c < children; c++) { + uint64_t ishole = 0; + + (void) nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole); + + if (ishole) + continue; + + ret |= for_each_vdev_cb(zhp, child[c], + func, data); + } + } + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (ret); + + /* Don't run our function on root vdevs */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0) { + ret |= func(zhp, nv, data); + } + + return (ret); +} + +/* + * Given an ZPOOL_CONFIG_VDEV_TREE nvpair, iterate over all the vdevs, calling + * func() for each one. func() is passed the vdev's nvlist and an optional + * user-defined 'data' pointer. 
+ */ +int +for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data) +{ + return (for_each_vdev_cb(NULL, nvroot, func, data)); +} diff --git a/lib/libzutil/zutil_import.h b/lib/libzutil/zutil_import.h new file mode 100644 index 0000000000..0108eb45c5 --- /dev/null +++ b/lib/libzutil/zutil_import.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. 
+ */ +#ifndef _LIBZUTIL_ZUTIL_IMPORT_H_ +#define _LIBZUTIL_ZUTIL_IMPORT_H_ + +#define EZFS_BADCACHE "invalid or missing cache file" +#define EZFS_BADPATH "must be an absolute path" +#define EZFS_NOMEM "out of memory" +#define EZFS_EACESS "some devices require root privileges" + +#define IMPORT_ORDER_PREFERRED_1 1 +#define IMPORT_ORDER_PREFERRED_2 2 +#define IMPORT_ORDER_SCAN_OFFSET 10 +#define IMPORT_ORDER_DEFAULT 100 + +typedef struct libpc_handle { + boolean_t lpc_printerr; + boolean_t lpc_open_access_error; + boolean_t lpc_desc_active; + char lpc_desc[1024]; + const pool_config_ops_t *lpc_ops; + void *lpc_lib_handle; +} libpc_handle_t; + + +int label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, + char **devid); +int zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache); + +void * zutil_alloc(libpc_handle_t *hdl, size_t size); +char *zutil_strdup(libpc_handle_t *hdl, const char *str); + +typedef struct rdsk_node { + char *rn_name; /* Full path to device */ + int rn_order; /* Preferred order (low to high) */ + int rn_num_labels; /* Number of valid labels */ + uint64_t rn_vdev_guid; /* Expected vdev guid when set */ + libpc_handle_t *rn_hdl; + nvlist_t *rn_config; /* Label config */ + avl_tree_t *rn_avl; + avl_node_t rn_node; + pthread_mutex_t *rn_lock; + boolean_t rn_labelpaths; +} rdsk_node_t; + +int slice_cache_compare(const void *, const void *); + +void zpool_open_func(void *); + +#endif /* _LIBZUTIL_ZUTIL_IMPORT_H_ */ diff --git a/lib/libzutil/zutil_nicenum.c b/lib/libzutil/zutil_nicenum.c index 9a81011fca..4dcac1f855 100644 --- a/lib/libzutil/zutil_nicenum.c +++ b/lib/libzutil/zutil_nicenum.c @@ -23,9 +23,36 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +#include #include #include #include +#include + +/* + * Return B_TRUE if "str" is a number string, B_FALSE otherwise. + * Works for integer and floating point numbers. 
+ */ +boolean_t +zfs_isnumber(const char *str) +{ + if (!*str) + return (B_FALSE); + + for (; *str; str++) + if (!(isdigit(*str) || (*str == '.'))) + return (B_FALSE); + + /* + * Numbers should not end with a period ("." ".." or "5." are + * not valid) + */ + if (str[strlen(str) - 1] == '.') { + return (B_FALSE); + } + + return (B_TRUE); +} /* * Convert a number to an appropriately human-readable output. diff --git a/man/Makefile.am b/man/Makefile.am index 841cb9c4e6..8ab1b75724 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -1 +1,117 @@ -SUBDIRS = man1 man5 man8 +include $(top_srcdir)/config/Substfiles.am + +EXTRA_DIST += \ + man1/cstyle.1 + +dist_man_MANS = \ + man1/zhack.1 \ + man1/ztest.1 \ + man1/raidz_test.1 \ + man1/zvol_wait.1 \ + man1/arcstat.1 \ + \ + man5/vdev_id.conf.5 \ + \ + man4/spl.4 \ + man4/zfs.4 \ + \ + man7/zpool-features.7 \ + man7/zfsconcepts.7 \ + man7/zfsprops.7 \ + man7/zpoolconcepts.7 \ + man7/zpoolprops.7 \ + \ + man8/fsck.zfs.8 \ + man8/mount.zfs.8 \ + man8/vdev_id.8 \ + man8/zdb.8 \ + man8/zfs.8 \ + man8/zfs-allow.8 \ + man8/zfs-bookmark.8 \ + man8/zfs-change-key.8 \ + man8/zfs-clone.8 \ + man8/zfs-create.8 \ + man8/zfs-destroy.8 \ + man8/zfs-diff.8 \ + man8/zfs-get.8 \ + man8/zfs-groupspace.8 \ + man8/zfs-hold.8 \ + man8/zfs-inherit.8 \ + man8/zfs-jail.8 \ + man8/zfs-list.8 \ + man8/zfs-load-key.8 \ + man8/zfs-mount.8 \ + man8/zfs-program.8 \ + man8/zfs-project.8 \ + man8/zfs-projectspace.8 \ + man8/zfs-promote.8 \ + man8/zfs-receive.8 \ + man8/zfs-recv.8 \ + man8/zfs-redact.8 \ + man8/zfs-release.8 \ + man8/zfs-rename.8 \ + man8/zfs-rollback.8 \ + man8/zfs-send.8 \ + man8/zfs-set.8 \ + man8/zfs-share.8 \ + man8/zfs-snapshot.8 \ + man8/zfs-unallow.8 \ + man8/zfs-unjail.8 \ + man8/zfs-unload-key.8 \ + man8/zfs-unmount.8 \ + man8/zfs-upgrade.8 \ + man8/zfs-userspace.8 \ + man8/zfs-wait.8 \ + man8/zfs_ids_to_path.8 \ + man8/zgenhostid.8 \ + man8/zinject.8 \ + man8/zpool.8 \ + man8/zpool-add.8 \ + man8/zpool-attach.8 \ + 
man8/zpool-checkpoint.8 \ + man8/zpool-clear.8 \ + man8/zpool-create.8 \ + man8/zpool-destroy.8 \ + man8/zpool-detach.8 \ + man8/zpool-events.8 \ + man8/zpool-export.8 \ + man8/zpool-get.8 \ + man8/zpool-history.8 \ + man8/zpool-import.8 \ + man8/zpool-initialize.8 \ + man8/zpool-iostat.8 \ + man8/zpool-labelclear.8 \ + man8/zpool-list.8 \ + man8/zpool-offline.8 \ + man8/zpool-online.8 \ + man8/zpool-reguid.8 \ + man8/zpool-remove.8 \ + man8/zpool-reopen.8 \ + man8/zpool-replace.8 \ + man8/zpool-resilver.8 \ + man8/zpool-scrub.8 \ + man8/zpool-set.8 \ + man8/zpool-split.8 \ + man8/zpool-status.8 \ + man8/zpool-sync.8 \ + man8/zpool-trim.8 \ + man8/zpool-upgrade.8 \ + man8/zpool-wait.8 \ + man8/zstream.8 \ + man8/zstreamdump.8 \ + man8/zpool_influxdb.8 + +nodist_man_MANS = \ + man8/zed.8 \ + man8/zfs-mount-generator.8 + +SUBSTFILES += $(nodist_man_MANS) + + +if BUILD_LINUX +# The manual pager in most Linux distros defaults to "BSD" when .Os is blank, +# but leaving it blank makes things a lot easier on +# FreeBSD when OpenZFS is vendored in the base system. +install-data-hook: + cd $(DESTDIR)$(mandir) && $(SED) ${ac_inplace} -e 's/^\.Os$$/.Os OpenZFS/' $(dist_man_MANS) $(nodist_man_MANS) +endif diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am deleted file mode 100644 index bd78be1452..0000000000 --- a/man/man1/Makefile.am +++ /dev/null @@ -1,5 +0,0 @@ -dist_man_MANS = zhack.1 ztest.1 raidz_test.1 -EXTRA_DIST = cstyle.1 - -install-data-local: - $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man1" diff --git a/man/man1/arcstat.1 b/man/man1/arcstat.1 new file mode 100644 index 0000000000..a69cd8937b --- /dev/null +++ b/man/man1/arcstat.1 @@ -0,0 +1,184 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. 
+.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" Copyright 2014 Adam Stevko. All rights reserved. +.\" Copyright (c) 2015 by Delphix. All rights reserved. +.\" Copyright (c) 2020 by AJ Jordan. All rights reserved. +.\" +.Dd May 26, 2021 +.Dt ARCSTAT 1 +.Os +. +.Sh NAME +.Nm arcstat +.Nd report ZFS ARC and L2ARC statistics +.Sh SYNOPSIS +.Nm +.Op Fl havxp +.Op Fl f Ar field Ns Op , Ns Ar field Ns … +.Op Fl o Ar file +.Op Fl s Ar string +.Op Ar interval +.Op Ar count +. +.Sh DESCRIPTION +.Nm +prints various ZFS ARC and L2ARC statistics in vmstat-like fashion: +.Bl -tag -compact -offset Ds -width "l2asize" +.It Sy c +ARC target size +.It Sy dh% +Demand data hit percentage +.It Sy dm% +Demand data miss percentage +.It Sy mfu +MFU list hits per second +.It Sy mh% +Metadata hit percentage +.It Sy mm% +Metadata miss percentage +.It Sy mru +MRU list hits per second +.It Sy ph% +Prefetch hits percentage +.It Sy pm% +Prefetch miss percentage +.It Sy dhit +Demand data hits per second +.It Sy dmis +Demand data misses per second +.It Sy hit% +ARC hit percentage +.It Sy hits +ARC reads per second +.It Sy mfug +MFU ghost list hits per second +.It Sy mhit +Metadata hits per second +.It Sy miss +ARC misses per second +.It Sy mmis +Metadata misses per second +.It Sy mrug +MRU ghost list hits per second +.It Sy phit +Prefetch hits per second +.It Sy pmis +Prefetch misses per second +.It Sy read +Total ARC accesses per second +.It Sy time +Current time +.It Sy size +ARC size +.It Sy arcsz +Alias for +.Sy size +.It Sy dread +Demand data accesses per second +.It Sy eskip +evict_skip per second +.It Sy miss% +ARC miss percentage +.It Sy mread +Metadata accesses per second +.It Sy pread +Prefetch accesses per second +.It Sy l2hit% +L2ARC access hit percentage +.It Sy l2hits +L2ARC hits per second +.It Sy l2miss +L2ARC misses per second +.It Sy 
l2read +Total L2ARC accesses per second +.It Sy l2pref +L2ARC prefetch allocated size per second +.It Sy l2pref% +L2ARC prefetch allocated size percentage +.It Sy l2mfu +L2ARC MFU allocated size per second +.It Sy l2mfu% +L2ARC MFU allocated size percentage +.It Sy l2mru +L2ARC MRU allocated size per second +.It Sy l2mru% +L2ARC MRU allocated size percentage +.It Sy l2data +L2ARC data (buf content) allocated size per second +.It Sy l2data% +L2ARC data (buf content) allocated size percentage +.It Sy l2meta +L2ARC metadata (buf content) allocated size per second +.It Sy l2meta% +L2ARC metadata (buf content) allocated size percentage +.It Sy l2size +Size of the L2ARC +.It Sy mtxmis +mutex_miss per second +.It Sy l2bytes +Bytes read per second from the L2ARC +.It Sy l2miss% +L2ARC access miss percentage +.It Sy l2asize +Actual (compressed) size of the L2ARC +.It Sy grow +ARC grow disabled +.It Sy need +ARC reclaim needed +.It Sy free +The ARC's idea of how much free memory there is, which includes evictable memory in the page cache. +Since the ARC tries to keep +.Sy avail +above zero, +.Sy avail +is usually more instructive to observe than +.Sy free . +.It Sy avail +The ARC's idea of how much free memory is available to it, which is a bit less than +.Sy free . +May temporarily be negative, in which case the ARC will reduce the target size +.Sy c . +.El +. +.Sh OPTIONS +.Bl -tag -width "-v" +.It Fl a +Print all possible stats. +.It Fl f +Display only specific fields. +See +.Sx DESCRIPTION +for supported statistics. +.It Fl h +Display help message. +.It Fl o +Report statistics to a file instead of the standard output. +.It Fl p +Disable auto-scaling of numerical fields (for raw, machine-parsable values). +.It Fl s +Display data with a specified separator (default: 2 spaces). +.It Fl x +Print extended stats +.Pq same as Fl f Sy time , Ns Sy mfu , Ns Sy mru , Ns Sy mfug , Ns Sy mrug , Ns Sy eskip , Ns Sy mtxmis , Ns Sy dread , Ns Sy pread , Ns Sy read . 
+.It Fl v +Show field headers and definitions +.El +. +.Sh OPERANDS +The following operands are supported: +.Bl -tag -compact -offset Ds -width "interval" +.It Ar interval +Specify the sampling interval in seconds. +.It Ar count +Display only +.Ar count +reports. +.El diff --git a/man/man1/cstyle.1 b/man/man1/cstyle.1 index f2b637d4c3..f5f9ec78f8 100644 --- a/man/man1/cstyle.1 +++ b/man/man1/cstyle.1 @@ -20,148 +20,141 @@ .\" .\" CDDL HEADER END .\" -.TH cstyle 1 "28 March 2005" -.SH NAME -.I cstyle -\- check for some common stylistic errors in C source files -.SH SYNOPSIS -\fBcstyle [-chpvCP] [-o constructs] [file...]\fP -.LP -.SH DESCRIPTION -.IX "OS-Net build tools" "cstyle" "" "\fBcstyle\fP" -.LP -.I cstyle -inspects C source files (*.c and *.h) for common sylistic errors. It -attempts to check for the cstyle documented in -\fIhttp://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf\fP. +.Dd May 26, 2021 +.Dt CSTYLE 1 +.Os +. +.Sh NAME +.Nm cstyle +.Nd check for some common stylistic errors in C source files +.Sh SYNOPSIS +.Nm +.Op Fl chpvCP +.Op Fl o Ar construct Ns Op , Ns Ar construct Ns … +.Oo Ar file Oc Ns … +.Sh DESCRIPTION +.Nm +inspects C source files (*.c and *.h) for common stylistic errors. +It attempts to check for the cstyle documented in +.Lk http://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf . Note that there is much in that document that -.I cannot -be checked for; just because your code is \fBcstyle(1)\fP clean does not -mean that you've followed Sun's C style. \fICaveat emptor\fP. -.LP -.SH OPTIONS -.LP +.Em cannot +be checked for; just because your code is +.Nm Ns -clean +does not mean that you've followed Sun's C style. +.Em Caveat emptor . +. +.Sh OPTIONS The following options are supported: -.TP 4 -.B \-c -Check continuation line indentation inside of functions. Sun's C style +.Bl -tag -width "-c" +.It Fl c +Check continuation line indentation inside of functions. 
+Sun's C style states that all statements must be indented to an appropriate tab stop, -and any continuation lines after them must be indented \fIexactly\fP four -spaces from the start line. This option enables a series of checks -designed to find continuation line problems within functions only. The -checks have some limitations; see CONTINUATION CHECKING, below. -.LP -.TP 4 -.B \-h -Performs heuristic checks that are sometimes wrong. Not generally used. -.LP -.TP 4 -.B \-p -Performs some of the more picky checks. Includes ANSI #else and #endif -rules, and tries to detect spaces after casts. Used as part of the -putback checks. -.LP -.TP 4 -.B \-v -Verbose output; includes the text of the line of error, and, for -\fB-c\fP, the first statement in the current continuation block. -.LP -.TP 4 -.B \-C +and any continuation lines after them must be indented +.Em exactly +four spaces from the start line. +This option enables a series of checks designed to find +continuation line problems within functions only. +The checks have some limitations; see +.Sy CONTINUATION CHECKING , +below. +.It Fl h +Performs heuristic checks that are sometimes wrong. +Not generally used. +.It Fl p +Performs some of the more picky checks. +Includes ANSI +.Sy #else +and +.Sy #endif +rules, and tries to detect spaces after casts. +Used as part of the putback checks. +.It Fl v +Verbose output; includes the text of the line of error, and, for +.Fl c , +the first statement in the current continuation block. +.It Fl C Ignore errors in header comments (i.e. block comments starting in the -first column). Not generally used. -.LP -.TP 4 -.B \-P -Check for use of non-POSIX types. Historically, types like "u_int" and -"u_long" were used, but they are now deprecated in favor of the POSIX -types uint_t, ulong_t, etc. This detects any use of the deprecated -types. Used as part of the putback checks. -.LP -.TP 4 -.B \-o \fIconstructs\fP -Allow a comma-separated list of additional constructs. 
Available -constructs include: -.LP -.TP 10 -.B doxygen -Allow doxygen-style block comments (\fB/**\fP and \fB/*!\fP) -.LP -.TP 10 -.B splint -Allow splint-style lint comments (\fB/*@...@*/\fP) -.LP -.SH NOTES -.LP -The cstyle rule for the OS/Net consolidation is that all new files must -be \fB-pP\fP clean. For existing files, the following invocations are -run against both the old and new files: -.LP -.TP 4 -\fBcstyle file\fB -.LP -.TP 4 -\fBcstyle -p file\fB -.LP -.TP 4 -\fBcstyle -pP file\fB -.LP -If the old file gave no errors for one of the invocations, the new file -must also give no errors. This way, files can only become more clean. -.LP -.SH CONTINUATION CHECKING -.LP +first column). +Not generally used. +.It Fl P +Check for use of non-POSIX types. +Historically, types like +.Sy u_int +and +.Sy u_long +were used, but they are now deprecated in favor of the POSIX +types +.Sy uint_t , +.Sy ulong_t , +etc. +This detects any use of the deprecated types. +Used as part of the putback checks. +.It Fl o Ar construct Ns Op , Ns Ar construct Ns … +Available constructs include: +.Bl -tag -compact -width "doxygen" +.It Sy doxygen +Allow doxygen-style block comments +.Pq Sy /** No and Sy /*!\& . +.It Sy splint +Allow splint-style lint comments +.Pq Sy /*@ Ns ... Ns Sy @*/ . +.El +.El +. +.Sh CONTINUATION CHECKING The continuation checker is a reasonably simple state machine that knows something about how C is laid out, and can match parenthesis, etc. over -multiple lines. It does have some limitations: -.LP -.TP 4 -.B 1. +multiple lines. +It does have some limitations: +.Bl -enum +.It Preprocessor macros which cause unmatched parenthesis will confuse the -checker for that line. To fix this, you'll need to make sure that each -branch of the #if statement has balanced parenthesis. -.LP -.TP 4 -.B 2. -Some \fBcpp\fP macros do not require ;s after them. Any such macros -*must* be ALL_CAPS; any lower case letters will cause bad output. 
-.LP -The bad output will generally be corrected after the next \fB;\fP, -\fB{\fP, or \fB}\fP. -.LP -Some continuation error messages deserve some additional explanation -.LP -.TP 4 -.B -multiple statements continued over multiple lines -A multi-line statement which is not broken at statement -boundaries. For example: -.RS 4 -.HP 4 +checker for that line. +To fix this, you'll need to make sure that each branch of the +.Sy #if +statement has balanced parenthesis. +.It +Some +.Xr cpp 1 +macros do not require +.Sy ;\& Ns s after them. +Any such macros +.Em must +be ALL_CAPS; any lower case letters will cause bad output. +.Pp +The bad output will generally be corrected after the next +.Sy ;\& , { , No or Sy } . +.El +Some continuation error messages deserve some additional explanation: +.Bl -tag -width Ds +.It Sy multiple statements continued over multiple lines +A multi-line statement which is not broken at statement boundaries. +For example: +.Bd -literal -compact -offset Ds if (this_is_a_long_variable == another_variable) a = -.br -b + c; -.LP -Will trigger this error. Instead, do: -.HP 8 + b + c; +.Ed +.Pp +Will trigger this error. +Instead, do: +.Bd -literal -compact -offset Ds if (this_is_a_long_variable == another_variable) -.br -a = b + c; -.RE -.LP -.TP 4 -.B -empty if/for/while body not on its own line + a = b + c; +.Ed +.It Sy empty if/for/while body not on its own line For visibility, empty bodies for if, for, and while statements should be -on their own line. For example: -.RS 4 -.HP 4 +on their own line. +For example: +.Bd -literal -compact -offset Ds while (do_something(&x) == 0); -.LP -Will trigger this error. Instead, do: -.HP 8 +.Ed +.Pp +Will trigger this error. 
+Instead, do: +.Bd -literal -compact -offset Ds while (do_something(&x) == 0) -.br -; -.RE - + ; +.Ed +.El diff --git a/man/man1/raidz_test.1 b/man/man1/raidz_test.1 index 90d858d5bb..4283a4b527 100644 --- a/man/man1/raidz_test.1 +++ b/man/man1/raidz_test.1 @@ -1,4 +1,3 @@ -'\" t .\" .\" CDDL HEADER START .\" @@ -19,79 +18,84 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright (c) 2016 Gvozden Nešković. All rights reserved. .\" -.TH raidz_test 1 "2016" "ZFS on Linux" "User Commands" - -.SH NAME -\fBraidz_test\fR \- raidz implementation verification and bencmarking tool -.SH SYNOPSIS -.LP -.BI "raidz_test " -.SH DESCRIPTION -.LP -This manual page documents briefly the \fBraidz_test\fR command. -.LP -Purpose of this tool is to run all supported raidz implementation and verify -results of all methods. Tool also contains a parameter sweep option where all -parameters affecting RAIDZ block are verified (like ashift size, data offset, -data size, etc...). -The tool also supports a benchmarking mode using -B option. -.SH OPTION -.HP -.BI "\-h" "" -.IP +.Dd May 26, 2021 +.Dt RAIDZ_TEST 1 +.Os +. +.Sh NAME +.Nm raidz_test +.Nd raidz implementation verification and benchmarking tool +.Sh SYNOPSIS +.Nm +.Op Fl StBevTD +.Op Fl a Ar ashift +.Op Fl o Ar zio_off_shift +.Op Fl d Ar raidz_data_disks +.Op Fl s Ar zio_size_shift +.Op Fl r Ar reflow_offset +. +.Sh DESCRIPTION +The purpose of this tool is to run all supported raidz implementation and verify +the results of all methods. +It also contains a parameter sweep option where all +parameters affecting a RAIDZ block are verified (like ashift size, data offset, +data size, etc.). +The tool also supports a benchmarking mode using the +.Fl B +option. +. +.Sh OPTION +.Bl -tag -width "-B(enchmark)" +.It Fl h Print a help summary. -.HP -.BI "\-a" " ashift (default: 9)" -.IP +.It Fl a Ar ashift Pq default: Sy 9 Ashift value. -.HP -.BI "\-o" " zio_off_shift" " (default: 0)" -.IP -Zio offset for raidz block. 
Offset value is 1 << (zio_off_shift) -.HP -.BI "\-d" " raidz_data_disks" " (default: 8)" -.IP -Number of raidz data disks to use. Additional disks for parity will be used -during testing. -.HP -.BI "\-s" " zio_size_shift" " (default: 19)" -.IP -Size of data for raidz block. Size is 1 << (zio_size_shift). -.HP -.BI "\-S(weep)" -.IP -Sweep parameter space while verifying the raidz implementations. This option -will exhaust all most of valid values for -a -o -d -s options. Runtime using -this option will be long. -.HP -.BI "\-t(imeout)" -.IP -Wall time for sweep test in seconds. The actual runtime could be longer. -.HP -.BI "\-B(enchmark)" -.IP -This options starts the benchmark mode. All implementations are benchmarked -using increasing per disk data size. Results are given as throughput per disk, -measured in MiB/s. -.HP -.BI "\-v(erbose)" -.IP +.It Fl o Ar zio_off_shift Pq default: Sy 0 +ZIO offset for each raidz block. +The offset's value is +.Em 2^zio_off_shift . +.It Fl d Ar raidz_data_disks Pq default: Sy 8 +Number of raidz data disks to use. +Additional disks will be used for parity. +.It Fl s Ar zio_size_shift Pq default: Sy 19 +Size of data for raidz block. +The real size is +.Em 2^zio_size_shift . +.It Fl r Ar reflow_offset Pq default: Sy uint max +Set raidz expansion offset. +The expanded raidz map allocation function will +produce different map configurations depending on this value. +.It Fl S Ns Pq weep +Sweep parameter space while verifying the raidz implementations. +This option +will exhaust all most of valid values for the +.Fl aods +options. +Runtime using this option will be long. +.It Fl t Ns Pq imeout +Wall time for sweep test in seconds. +The actual runtime could be longer. +.It Fl B Ns Pq enchmark +All implementations are benchmarked using increasing per disk data size. +Results are given as throughput per disk, measured in MiB/s. +.It Fl e Ns Pq xpansion +Use expanded raidz map allocation function. +.It Fl v Ns Pq erbose Increase verbosity. 
-.HP -.BI "\-T(est the test)" -.IP -Debugging option. When this option is specified tool is supposed to fail -all tests. This is to check if tests would properly verify bit-exactness. -.HP -.BI "\-D(ebug)" -.IP -Debugging option. Specify to attach gdb when SIGSEGV or SIGABRT are received. -.HP - -.SH "SEE ALSO" -.BR "ztest (1)" -.SH "AUTHORS" -vdev_raidz, created for ZFS on Linux by Gvozden Nešković +.It Fl T Ns Pq est the test +Debugging option: fail all tests. +This is to check if tests would properly verify bit-exactness. +.It Fl D Ns Pq ebug +Debugging option: attach +.Xr gdb 1 +when +.Sy SIGSEGV +or +.Sy SIGABRT +are received. +.El +. +.Sh "SEE ALSO" +.Xr ztest 1 diff --git a/man/man1/zhack.1 b/man/man1/zhack.1 index 11d300b700..83046ee8f5 100644 --- a/man/man1/zhack.1 +++ b/man/man1/zhack.1 @@ -1,4 +1,3 @@ -'\" t .\" .\" CDDL HEADER START .\" @@ -19,63 +18,108 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH zhack 1 "2013 MAR 16" "ZFS on Linux" "User Commands" - -.SH NAME -zhack \- libzpool debugging tool -.SH DESCRIPTION +.\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS +.\" +.Dd May 26, 2021 +.Dt ZHACK 1 +.Os +. +.Sh NAME +.Nm zhack +.Nd libzpool debugging tool +.Sh DESCRIPTION This utility pokes configuration changes directly into a ZFS pool, which is dangerous and can cause data corruption. -.SH SYNOPSIS -.LP -.BI "zhack [\-c " "cachefile" "] [\-d " "dir" "] <" "subcommand" "> [" "arguments" "]" -.SH OPTIONS -.HP -.BI "\-c" " cachefile" -.IP -Read the \fIpool\fR configuration from the \fIcachefile\fR, which is -/etc/zfs/zpool.cache by default. -.HP -.BI "\-d" " dir" -.IP -Search for \fIpool\fR members in the \fIdir\fR path. Can be specified -more than once. -.SH SUBCOMMANDS -.LP -.BI "feature stat " "pool" -.IP +.Sh SYNOPSIS +.Bl -tag -width Ds +.It Xo +.Nm zhack +.Cm feature stat +.Ar pool +.Xc List feature flags. 
-.LP -.BI "feature enable [\-d " "description" "] [\-r] " "pool guid" -.IP -Add a new feature to \fIpool\fR that is uniquely identified by -\fIguid\fR, which is specified in the same form as a zfs(8) user -property. -.IP -The \fIdescription\fR is a short human readable explanation of the new +. +.It Xo +.Nm zhack +.Cm feature enable +.Op Fl d Ar description +.Op Fl r +.Ar pool +.Ar guid +.Xc +Add a new feature to +.Ar pool +that is uniquely identified by +.Ar guid , +which is specified in the same form as a +.Xr zfs 8 +user property. +.Pp +The +.Ar description +is a short human readable explanation of the new feature. +.Pp +The +.Fl r +flag indicates that +.Ar pool +can be safely opened in read-only mode by a system that does not understand the +.Ar guid feature. -.IP -The \fB\-r\fR switch indicates that \fIpool\fR can be safely opened -in read-only mode by a system that does not have the \fIguid\fR -feature. -.LP -.BI "feature ref [\-d|\-m] " "pool guid" -.IP -Increment the reference count of the \fIguid\fR feature in \fIpool\fR. -.IP -The \fB\-d\fR switch decrements the reference count of the \fIguid\fR -feature in \fIpool\fR. -.IP -The \fB\-m\fR switch indicates that the \fIguid\fR feature is now -required to read the pool MOS. -.SH EXAMPLES -.LP -.nf -# zhack feature stat tank - +. +.It Xo +.Nm zhack +.Cm feature ref +.Op Fl d Ns | Ns Fl m +.Ar pool +.Ar guid +.Xc +Increment the reference count of the +.Ar guid +feature in +.Ar pool . +.Pp +The +.Fl d +flag decrements the reference count of the +.Ar guid +feature in +.Ar pool +instead. +.Pp +The +.Fl m +flag indicates that the +.Ar guid +feature is now required to read the pool MOS. +.El +. +.Sh GLOBAL OPTIONS +The following can be passed to all +.Nm +invocations before any subcommand: +.Bl -tag -width "-d dir" +.It Fl c Ar cachefile +Read +.Ar pool +configuration from the +.Ar cachefile , +which is +.Pa /etc/zfs/zpool.cache +by default. +.It Fl d Ar dir +Search for +.Ar pool +members in +.Ar dir . 
+Can be specified more than once. +.El +. +.Sh EXAMPLES +.Bd -literal +.No # Nm zhack Cm feature stat Ar tank for_read_obj: org.illumos:lz4_compress = 0 for_write_obj: @@ -85,14 +129,12 @@ descriptions_obj: com.delphix:async_destroy = Destroy filesystems asynchronously. com.delphix:empty_bpobj = Snapshots use less space. org.illumos:lz4_compress = LZ4 compression algorithm support. -.LP -# zhack feature enable -d 'Predict future disk failures.' \\ - tank com.example:clairvoyance -.LP -# zhack feature ref tank com.example:clairvoyance -.SH AUTHORS -This man page was written by Darik Horn . -.SH SEE ALSO -.BR zfs (8), -.BR zpool-features (5), -.BR ztest (1) + +.No # Nm zhack Cm feature enable Fl d No 'Predict future disk failures.' Ar tank com.example:clairvoyance +.No # Nm zhack Cm feature ref Ar tank com.example:clairvoyance +.Ed +. +.Sh SEE ALSO +.Xr ztest 1 , +.Xr zpool-features 7 , +.Xr zfs 8 diff --git a/man/man1/ztest.1 b/man/man1/ztest.1 index b8cb0d45d9..fd1374a2f1 100644 --- a/man/man1/ztest.1 +++ b/man/man1/ztest.1 @@ -1,4 +1,3 @@ -'\" t .\" .\" CDDL HEADER START .\" @@ -19,161 +18,216 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright (c) 2009 Oracle and/or its affiliates. All rights reserved. .\" Copyright (c) 2009 Michael Gebetsroither . All rights .\" reserved. +.\" Copyright (c) 2017, Intel Corporation. .\" -.TH ztest 1 "2009 NOV 01" "ZFS on Linux" "User Commands" - -.SH NAME -\fBztest\fR \- was written by the ZFS Developers as a ZFS unit test. -.SH SYNOPSIS -.LP -.BI "ztest " -.SH DESCRIPTION -.LP -This manual page documents briefly the \fBztest\fR command. -.LP -\fBztest\fR was written by the ZFS Developers as a ZFS unit test. The -tool was developed in tandem with the ZFS functionality and was -executed nightly as one of the many regression test against the daily -build. As features were added to ZFS, unit tests were also added to -\fBztest\fR. In addition, a separate test development team wrote and +.Dd May 26, 2021 +.Dt ZTEST 1 +.Os +. 
+.Sh NAME +.Nm ztest +.Nd was written by the ZFS Developers as a ZFS unit test +.Sh SYNOPSIS +.Nm +.Op Fl VEG +.Op Fl v Ar vdevs +.Op Fl s Ar size_of_each_vdev +.Op Fl a Ar alignment_shift +.Op Fl m Ar mirror_copies +.Op Fl r Ar raidz_disks/draid_disks +.Op Fl R Ar raid_parity +.Op Fl K Ar raid_kind +.Op Fl D Ar draid_data +.Op Fl S Ar draid_spares +.Op Fl C Ar vdev_class_state +.Op Fl d Ar datasets +.Op Fl t Ar threads +.Op Fl g Ar gang_block_threshold +.Op Fl i Ar initialize_pool_i_times +.Op Fl k Ar kill_percentage +.Op Fl p Ar pool_name +.Op Fl T Ar time +.Op Fl z Ar zil_failure_rate +. +.Sh DESCRIPTION +.Nm +was written by the ZFS Developers as a ZFS unit test. +The tool was developed in tandem with the ZFS functionality and was +executed nightly as one of the many regression tests against the daily build. +As features were added to ZFS, unit tests were also added to +.Nm . +In addition, a separate test development team wrote and executed more functional and stress tests. -.LP -By default \fBztest\fR runs for ten minutes and uses block files -(stored in /tmp) to create pools rather than using physical disks. -Block files afford \fBztest\fR its flexibility to play around with +. +.Pp +By default +.Nm +runs for five minutes and uses block files +(stored in +.Pa /tmp ) +to create pools rather than using physical disks. +Block files afford +.Nm +its flexibility to play around with zpool components without requiring large hardware configurations. -However, storing the block files in /tmp may not work for you if you +However, storing the block files in +.Pa /tmp +may not work for you if you have a small tmp directory. -.LP -By default is non-verbose. This is why entering the command above will -result in \fBztest\fR quietly executing for 5 minutes. The -V option -can be used to increase the verbosity of the tool. Adding multiple -V -option is allowed and the more you add the more chatty \fBztest\fR +. +.Pp +By default ztest is non-verbose.
+This is why entering the command above will result in +.Nm +quietly executing for 5 minutes. +The +.Fl V +option can be used to increase the verbosity of the tool. +Adding multiple +.Fl V +options is allowed and the more you add the more chatty +.Nm becomes. -.LP -After the \fBztest\fR run completes, you should notice many ztest.* -files lying around. Once the run completes you can safely remove these -files. Note that you shouldn't remove these files during a run. You -can re-use these files in your next \fBztest\fR run by using the -E +. +.Pp +After the +.Nm +run completes, you should notice many +.Pa ztest.* +files lying around. +Once the run completes you can safely remove these files. +Note that you shouldn't remove these files during a run. +You can re-use these files in your next +.Nm +run by using the +.Fl E option. -.SH OPTIONS -.HP -.BI "\-?" "" -.IP +. +.Sh OPTIONS +.Bl -tag -width "-v v" +.It Fl h , \&? , -help Print a help summary. -.HP -.BI "\-v" " vdevs" " (default: 5) -.IP +.It Fl v , -vdevs Ns = (default: Sy 5 ) Number of vdevs. -.HP -.BI "\-s" " size_of_each_vdev" " (default: 64M)" -.IP +.It Fl s , -vdev-size Ns = (default: Sy 64M ) Size of each vdev. -.HP -.BI "\-a" " alignment_shift" " (default: 9) (use 0 for random)" -.IP -Used alignment in test. -.HP -.BI "\-m" " mirror_copies" " (default: 2)" -.IP +.It Fl a , -alignment-shift Ns = (default: Sy 9 ) No (use Sy 0 No for random) +Alignment shift used in test. +.It Fl m , -mirror-copies Ns = (default: Sy 2 ) Number of mirror copies. -.HP -.BI "\-r" " raidz_disks" " (default: 4)" -.IP -Number of raidz disks. -.HP -.BI "\-R" " raidz_parity" " (default: 1)" -.IP -Raidz parity. -.HP -.BI "\-d" " datasets" " (default: 7)" -.IP +.It Fl r , -raid-disks Ns = (default: Sy 4 No for raidz/ Ns Sy 16 No for draid) +Number of raidz/draid disks. +.It Fl R , -raid-parity Ns = (default: Sy 1 ) +Raid parity (raidz & draid). 
+.It Fl K , -raid-kind Ns = Ns Sy raidz Ns | Ns Sy draid Ns | Ns Sy random No (default: Sy random ) +The kind of RAID config to use. +With +.Sy random +the kind alternates between raidz and draid. +.It Fl D , -draid-data Ns = (default: Sy 4 ) +Number of data disks in a dRAID redundancy group. +.It Fl S , -draid-spares Ns = (default: Sy 1 ) +Number of dRAID distributed spare disks. +.It Fl d , -datasets Ns = (default: Sy 7 ) Number of datasets. -.HP -.BI "\-t" " threads" " (default: 23)" -.IP +.It Fl t , -threads Ns = (default: Sy 23 ) Number of threads. -.HP -.BI "\-g" " gang_block_threshold" " (default: 32K)" -.IP +.It Fl g , -gang-block-threshold Ns = (default: Sy 32K ) Gang block threshold. -.HP -.BI "\-i" " initialize_pool_i_times" " (default: 1)" -.IP -Number of pool initialisations. -.HP -.BI "\-k" " kill_percentage" " (default: 70%)" -.IP +.It Fl i , -init-count Ns = (default: Sy 1 ) +Number of pool initializations. +.It Fl k , -kill-percentage Ns = (default: Sy 70% ) Kill percentage. -.HP -.BI "\-p" " pool_name" " (default: ztest)" -.IP +.It Fl p , -pool-name Ns = (default: Sy ztest ) Pool name. -.HP -.BI "\-V(erbose)" -.IP -Verbose (use multiple times for ever more blather). -.HP -.BI "\-E(xisting)" -.IP +.It Fl f , -vdev-file-directory Ns = (default: Pa /tmp ) +File directory for vdev files. +.It Fl M , -multi-host +Multi-host; simulate pool imported on remote host. +.It Fl E , -use-existing-pool Use existing pool (use existing pool instead of creating new one). -.HP -.BI "\-T" " time" " (default: 300 sec)" -.IP +.It Fl T , -run-time Ns = (default: Sy 300 Ns s) Total test run time. -.HP -.BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs) -.IP -Injected failure rate. -.HP -.BI "\-G" -.IP -Dump zfs_dbgmsg buffer before exiting. -.SH "EXAMPLES" -.LP -To override /tmp as your location for block files, you can use the -f +.It Fl P , -pass-time Ns = (default: Sy 60 Ns s) +Time per pass. 
+.It Fl F , -freeze-loops Ns = (default: Sy 50 ) +Max loops in +.Fn spa_freeze . +.It Fl B , -alt-ztest Ns = +Alternate ztest path. +.It Fl C , -vdev-class-state Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy random No (default: Sy random ) +The vdev allocation class state. +.It Fl o , -option Ns = Ns Ar variable Ns = Ns Ar value +Set global +.Ar variable +to an unsigned 32-bit integer +.Ar value +(little-endian only). +.It Fl G , -dump-debug +Dump zfs_dbgmsg buffer before exiting due to an error. +.It Fl V , -verbose +Verbose (use multiple times for ever more verbosity). +.El +. +.Sh EXAMPLES +To override +.Pa /tmp +as your location for block files, you can use the +.Fl f option: -.IP -ztest -f / -.LP -To get an idea of what ztest is actually testing try this: -.IP -ztest -f / -VVV -.LP -Maybe you'd like to run ztest for longer? To do so simply use the -T +.Dl # ztest -f / +.Pp +To get an idea of what +.Nm +is actually testing try this: +.Dl # ztest -f / -VVV +.Pp +Maybe you'd like to run +.Nm ztest +for longer? To do so simply use the +.Fl T option and specify the runlength in seconds like so: -.IP -ztest -f / -V -T 120 - -.SH "ENVIRONMENT VARIABLES" -.TP -.B "ZFS_HOSTID=id" -Use \fBid\fR instead of the SPL hostid to identify this host. Intended for use -with ztest, but this environment variable will affect any utility which uses -libzpool, including \fBzpool(8)\fR. Since the kernel is unaware of this setting +.Dl # ztest -f / -V -T 120 +. +.Sh ENVIRONMENT VARIABLES +.Bl -tag -width "ZF" +.It Ev ZFS_HOSTID Ns = Ns Em id +Use +.Em id +instead of the SPL hostid to identify this host. +Intended for use with +.Nm , but this environment variable will affect any utility which uses +libzpool, including +.Xr zpool 8 . +Since the kernel is unaware of this setting, results with utilities other than ztest are undefined. -.TP -.B "ZFS_STACK_SIZE=stacksize" -Limit the default stack size to \fBstacksize\fR bytes for the purpose of -detecting and debugging kernel stack overflows. 
This value defaults to -\fB32K\fR which is double the default \fB16K\fR Linux kernel stack size. - +.It Ev ZFS_STACK_SIZE Ns = Ns Em stacksize +Limit the default stack size to +.Em stacksize +bytes for the purpose of +detecting and debugging kernel stack overflows. +This value defaults to +.Em 32K +which is double the default +.Em 16K +Linux kernel stack size. +.Pp In practice, setting the stack size slightly higher is needed because differences in stack usage between kernel and user space can lead to spurious -stack overflows (especially when debugging is enabled). The specified value +stack overflows (especially when debugging is enabled). +The specified value will be rounded up to a floor of PTHREAD_STACK_MIN which is the minimum stack required for a NULL procedure in user space. - -By default the stack size is limited to 256K. -.SH "SEE ALSO" -.BR "spl-module-parameters (5)" "," -.BR "zpool (1)" "," -.BR "zfs (1)" "," -.BR "zdb (1)" "," -.SH "AUTHOR" -This manual page was transvered to asciidoc by Michael Gebetsroither - from http://opensolaris.org/os/community/zfs/ztest/ +.Pp +By default the stack size is limited to +.Em 256K . +.El +. +.Sh SEE ALSO +.Xr zdb 1 , +.Xr zfs 1 , +.Xr zpool 1 , +.Xr spl 4 diff --git a/man/man1/zvol_wait.1 b/man/man1/zvol_wait.1 new file mode 100644 index 0000000000..0fb47ce734 --- /dev/null +++ b/man/man1/zvol_wait.1 @@ -0,0 +1,32 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.Dd May 27, 2021 +.Dt ZVOL_WAIT 1 +.Os +. +.Sh NAME +.Nm zvol_wait +.Nd wait for ZFS volume links to appear in /dev +.Sh SYNOPSIS +.Nm +. 
+.Sh DESCRIPTION +When a ZFS pool is imported, the volumes within it will appear as block devices. +As they're registered, +.Xr udev 7 +asynchronously creates symlinks under +.Pa /dev/zvol +using the volumes' names. +.Nm +will wait for all those symlinks to be created before exiting. +. +.Sh SEE ALSO +.Xr udev 7 diff --git a/man/man4/spl.4 b/man/man4/spl.4 new file mode 100644 index 0000000000..11cde14ae5 --- /dev/null +++ b/man/man4/spl.4 @@ -0,0 +1,195 @@ +.\" +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" Copyright 2013 Turbo Fredriksson . All rights reserved. +.\" +.Dd August 24, 2020 +.Dt SPL 4 +.Os +. +.Sh NAME +.Nm spl +.Nd parameters of the SPL kernel module +. +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Sy spl_kmem_cache_kmem_threads Ns = Ns Sy 4 Pq uint +The number of threads created for the spl_kmem_cache task queue. +This task queue is responsible for allocating new slabs +for use by the kmem caches. +For the majority of systems and workloads only a small number of threads are +required. +. +.It Sy spl_kmem_cache_reclaim Ns = Ns Sy 0 Pq uint +When this is set it prevents Linux from being able to rapidly reclaim all the +memory held by the kmem caches. 
+This may be useful in circumstances where it's preferable that Linux +reclaim memory from some other subsystem first. +Setting this will increase the likelihood of out of memory events on a memory +constrained system. +. +.It Sy spl_kmem_cache_obj_per_slab Ns = Ns Sy 8 Pq uint +The preferred number of objects per slab in the cache. +In general, a larger value will increase the cache's memory footprint +while decreasing the time required to perform an allocation. +Conversely, a smaller value will minimize the footprint +and improve cache reclaim time but individual allocations may take longer. +. +.It Sy spl_kmem_cache_max_size Ns = Ns Sy 32 Po 64-bit Pc or Sy 4 Po 32-bit Pc Pq uint +The maximum size of a kmem cache slab in MiB. +This effectively limits the maximum cache object size to +.Sy spl_kmem_cache_max_size Ns / Ns Sy spl_kmem_cache_obj_per_slab . +.Pp +Caches may not be created with +objects sized larger than this limit. +. +.It Sy spl_kmem_cache_slab_limit Ns = Ns Sy 16384 Pq uint +For small objects the Linux slab allocator should be used to make the most +efficient use of the memory. +However, large objects are not supported by +the Linux slab and therefore the SPL implementation is preferred. +This value is used to determine the cutoff between a small and large object. +.Pp +Objects of size +.Sy spl_kmem_cache_slab_limit +or smaller will be allocated using the Linux slab allocator, +large objects use the SPL allocator. +A cutoff of 16K was determined to be optimal for architectures using 4K pages. +. +.It Sy spl_kmem_alloc_warn Ns = Ns Sy 32768 Pq uint +As a general rule +.Fn kmem_alloc +allocations should be small, +preferably just a few pages, since they must be physically contiguous. +Therefore, a rate limited warning will be printed to the console for any +.Fn kmem_alloc +which exceeds a reasonable threshold. +.Pp +The default warning threshold is set to eight pages but capped at 32K to +accommodate systems using large pages.
+This value was selected to be small enough to ensure +the largest allocations are quickly noticed and fixed, +but large enough to avoid logging any warnings when an allocation size is +larger than optimal but not a serious concern. +Since this value is tunable, developers are encouraged to set it lower +when testing so any new largish allocations are quickly caught. +These warnings may be disabled by setting the threshold to zero. +. +.It Sy spl_kmem_alloc_max Ns = Ns Sy KMALLOC_MAX_SIZE Ns / Ns Sy 4 Pq uint +Large +.Fn kmem_alloc +allocations will fail if they exceed +.Sy KMALLOC_MAX_SIZE . +Allocations which are marginally smaller than this limit may succeed but +should still be avoided due to the expense of locating a contiguous range +of free pages. +Therefore, a maximum kmem size with reasonable safety margin of 4x is set. +.Fn kmem_alloc +allocations larger than this maximum will quickly fail. +.Fn vmem_alloc +allocations less than or equal to this value will use +.Fn kmalloc , +but shift to +.Fn vmalloc +when exceeding this value. +. +.It Sy spl_kmem_cache_magazine_size Ns = Ns Sy 0 Pq uint +Cache magazines are an optimization designed to minimize the cost of +allocating memory. +They do this by keeping a per-cpu cache of recently +freed objects, which can then be reallocated without taking a lock. +This can improve performance on highly contended caches. +However, because objects in magazines will prevent otherwise empty slabs +from being immediately released this may not be ideal for low memory machines. +.Pp +For this reason, +.Sy spl_kmem_cache_magazine_size +can be used to set a maximum magazine size. +When this value is set to 0 the magazine size will +be automatically determined based on the object size. +Otherwise magazines will be limited to 2-256 objects per magazine (i.e. per cpu). +Magazines may never be entirely disabled in this implementation. +.
+.It Sy spl_hostid Ns = Ns Sy 0 Pq ulong +The system hostid, when set this can be used to uniquely identify a system. +By default this value is set to zero which indicates the hostid is disabled. +It can be explicitly enabled by placing a unique non-zero value in +.Pa /etc/hostid . +. +.It Sy spl_hostid_path Ns = Ns Pa /etc/hostid Pq charp +The expected path to locate the system hostid when specified. +This value may be overridden for non-standard configurations. +. +.It Sy spl_panic_halt Ns = Ns Sy 0 Pq uint +Cause a kernel panic on assertion failures. +When not enabled, the thread is halted to facilitate further debugging. +.Pp +Set to a non-zero value to enable. +. +.It Sy spl_taskq_kick Ns = Ns Sy 0 Pq uint +Kick stuck taskq to spawn threads. +When writing a non-zero value to it, it will scan all the taskqs. +If any of them have a pending task more than 5 seconds old, +it will kick it to spawn more threads. +This can be used if you find a rare +deadlock occurs because one or more taskqs didn't spawn a thread when it should. +. +.It Sy spl_taskq_thread_bind Ns = Ns Sy 0 Pq int +Bind taskq threads to specific CPUs. +When enabled all taskq threads will be distributed evenly +across the available CPUs. +By default, this behavior is disabled to allow the Linux scheduler +the maximum flexibility to determine where a thread should run. +. +.It Sy spl_taskq_thread_dynamic Ns = Ns Sy 1 Pq int +Allow dynamic taskqs. +When enabled taskqs which set the +.Sy TASKQ_DYNAMIC +flag will by default create only a single thread. +New threads will be created on demand up to a maximum allowed number +to facilitate the completion of outstanding tasks. +Threads which are no longer needed will be promptly destroyed. +By default this behavior is enabled but it can be disabled to +aid performance analysis or troubleshooting. +. +.It Sy spl_taskq_thread_priority Ns = Ns Sy 1 Pq int +Allow newly created taskq threads to set a non-default scheduler priority. 
+When enabled, the priority specified when a taskq is created will be applied +to all threads created by that taskq. +When disabled all threads will use the default Linux kernel thread priority. +By default, this behavior is enabled. +. +.It Sy spl_taskq_thread_sequential Ns = Ns Sy 4 Pq int +The number of items a taskq worker thread must handle without interruption +before requesting a new worker thread be spawned. +This is used to control +how quickly taskqs ramp up the number of threads processing the queue. +Because Linux thread creation and destruction are relatively inexpensive a +small default value has been selected. +This means that normally threads will be created aggressively which is desirable. +Increasing this value will +result in a slower thread creation rate which may be preferable for some +configurations. +. +.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint +The maximum number of tasks per pending list in each taskq shown in +.Pa /proc/spl/taskq{,-all} . +Write +.Sy 0 +to turn off the limit. +The proc file will walk the lists with lock held, +reading it could cause a lock-up if the list grows too large +without limiting the output. +"(truncated)" will be shown if the list is larger than the limit. +.El diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 new file mode 100644 index 0000000000..a136690c76 --- /dev/null +++ b/man/man4/zfs.4 @@ -0,0 +1,2394 @@ +.\" +.\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. +.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. +.\" Copyright (c) 2019 Datto Inc. +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License.
When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.\" +.Dd June 1, 2021 +.Dt ZFS 4 +.Os +. +.Sh NAME +.Nm zfs +.Nd tuning of the ZFS kernel module +. +.Sh DESCRIPTION +The ZFS module supports these parameters: +.Bl -tag -width Ds +.It Sy dbuf_cache_max_bytes Ns = Ns Sy ULONG_MAX Ns B Pq ulong +Maximum size in bytes of the dbuf cache. +The target size is determined by the MIN versus +.No 1/2^ Ns Sy dbuf_cache_shift Pq 1/32nd +of the target ARC size. +The behavior of the dbuf cache and its associated settings +can be observed via the +.Pa /proc/spl/kstat/zfs/dbufstats +kstat. +. +.It Sy dbuf_metadata_cache_max_bytes Ns = Ns Sy ULONG_MAX Ns B Pq ulong +Maximum size in bytes of the metadata dbuf cache. +The target size is determined by the MIN versus +.No 1/2^ Ns Sy dbuf_metadata_cache_shift Pq 1/64th +of the target ARC size. +The behavior of the metadata dbuf cache and its associated settings +can be observed via the +.Pa /proc/spl/kstat/zfs/dbufstats +kstat. +. +.It Sy dbuf_cache_hiwater_pct Ns = Ns Sy 10 Ns % Pq uint +The percentage over +.Sy dbuf_cache_max_bytes +when dbufs must be evicted directly. +. +.It Sy dbuf_cache_lowater_pct Ns = Ns Sy 10 Ns % Pq uint +The percentage below +.Sy dbuf_cache_max_bytes +when the evict thread stops evicting dbufs. +. +.It Sy dbuf_cache_shift Ns = Ns Sy 5 Pq int +Set the size of the dbuf cache +.Pq Sy dbuf_cache_max_bytes +to a log2 fraction of the target ARC size. +. +.It Sy dbuf_metadata_cache_shift Ns = Ns Sy 6 Pq int +Set the size of the dbuf metadata cache +.Pq Sy dbuf_metadata_cache_max_bytes +to a log2 fraction of the target ARC size. +. 
+.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int +dnode slots allocated in a single operation as a power of 2. +The default value minimizes lock contention for the bulk operation performed. +. +.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128MB Pc Pq int +Limit the amount we can prefetch with one call to this amount in bytes. +This helps to limit the amount of memory that can be used by prefetching. +. +.It Sy ignore_hole_birth Pq int +Alias for +.Sy send_holes_without_birth_time . +. +.It Sy l2arc_feed_again Ns = Ns Sy 1 Ns | Ns 0 Pq int +Turbo L2ARC warm-up. +When the L2ARC is cold the fill interval will be set as fast as possible. +. +.It Sy l2arc_feed_min_ms Ns = Ns Sy 200 Pq ulong +Min feed interval in milliseconds. +Requires +.Sy l2arc_feed_again Ns = Ns Ar 1 +and only applicable in related situations. +. +.It Sy l2arc_feed_secs Ns = Ns Sy 1 Pq ulong +Seconds between L2ARC writing. +. +.It Sy l2arc_headroom Ns = Ns Sy 2 Pq ulong +How far through the ARC lists to search for L2ARC cacheable content, +expressed as a multiplier of +.Sy l2arc_write_max . +ARC persistence across reboots can be achieved with persistent L2ARC +by setting this parameter to +.Sy 0 , +allowing the full length of ARC lists to be searched for cacheable content. +. +.It Sy l2arc_headroom_boost Ns = Ns Sy 200 Ns % Pq ulong +Scales +.Sy l2arc_headroom +by this percentage when L2ARC contents are being successfully compressed +before writing. +A value of +.Sy 100 +disables this feature. +. +.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int +Controls whether only MFU metadata and data are cached from ARC into L2ARC. +This may be desired to avoid wasting space on L2ARC when reading/writing large +amounts of data that are not expected to be accessed more than once. +.Pp +The default is off, +meaning both MRU and MFU data and metadata are cached. +When turning off this feature, some MRU buffers will still be present +in ARC and eventually cached on L2ARC. 
+.No If Sy l2arc_noprefetch Ns = Ns Sy 0 , +some prefetched buffers will be cached to L2ARC, and those might later +transition to MRU, in which case the +.Sy l2arc_mru_asize No arcstat will not be Sy 0 . +.Pp +Regardless of +.Sy l2arc_noprefetch , +some MFU buffers might be evicted from ARC, +accessed later on as prefetches and transition to MRU as prefetches. +If accessed again they are counted as MRU and the +.Sy l2arc_mru_asize No arcstat will not be Sy 0 . +.Pp +The ARC status of L2ARC buffers when they were first cached in +L2ARC can be seen in the +.Sy l2arc_mru_asize , Sy l2arc_mfu_asize , No and Sy l2arc_prefetch_asize +arcstats when importing the pool or onlining a cache +device if persistent L2ARC is enabled. +.Pp +The +.Sy evict_l2_eligible_mru +arcstat does not take into account if this option is enabled as the information +provided by the +.Sy evict_l2_eligible_m[rf]u +arcstats can be used to decide if toggling this option is appropriate +for the current workload. +. +.It Sy l2arc_meta_percent Ns = Ns Sy 33 Ns % Pq int +Percent of ARC size allowed for L2ARC-only headers. +Since L2ARC buffers are not evicted on memory pressure, +too many headers on a system with an irrationally large L2ARC +can render it slow or unusable. +This parameter limits L2ARC writes and rebuilds to achieve the target. +. +.It Sy l2arc_trim_ahead Ns = Ns Sy 0 Ns % Pq ulong +Trims ahead of the current write size +.Pq Sy l2arc_write_max +on L2ARC devices by this percentage of write size if we have filled the device. +If set to +.Sy 100 +we TRIM twice the space required to accommodate upcoming writes. +A minimum of +.Sy 64MB +will be trimmed. +It also enables TRIM of the whole L2ARC device upon creation +or addition to an existing pool or if the header of the device is +invalid upon importing a pool or onlining a cache device. +A value of +.Sy 0 +disables TRIM on L2ARC altogether and is the default as it can put significant +stress on the underlying storage devices. 
+This will vary depending on how well the specific device handles these commands. +. +.It Sy l2arc_noprefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int +Do not write buffers to L2ARC if they were prefetched but not used by +applications. +In case there are prefetched buffers in L2ARC and this option +is later set, we do not read the prefetched buffers from L2ARC. +Unsetting this option is useful for caching sequential reads from the +disks to L2ARC and serving those reads from L2ARC later on. +This may be beneficial in case the L2ARC device is significantly faster +in sequential reads than the disks of the pool. +.Pp +Use +.Sy 1 +to disable and +.Sy 0 +to enable caching/reading prefetches to/from L2ARC. +. +.It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int +No reads during writes. +. +.It Sy l2arc_write_boost Ns = Ns Sy 8388608 Ns B Po 8MB Pc Pq ulong +Cold L2ARC devices will have +.Sy l2arc_write_max +increased by this amount while they remain cold. +. +.It Sy l2arc_write_max Ns = Ns Sy 8388608 Ns B Po 8MB Pc Pq ulong +Max write bytes per interval. +. +.It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Rebuild the L2ARC when importing a pool (persistent L2ARC). +This can be disabled if there are problems importing a pool +or attaching an L2ARC device (e.g. the L2ARC device is slow +in reading stored log metadata, or the metadata +has become somehow fragmented/unusable). +. +.It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1GB Pc Pq ulong +Minimum size of an L2ARC device required in order to write log blocks in it. +The log blocks are used upon importing the pool to rebuild the persistent L2ARC. +.Pp +For L2ARC devices less than 1GB, the amount of data +.Fn l2arc_evict +evicts is significant compared to the amount of restored L2ARC data. +In this case, do not write log blocks in L2ARC in order not to waste space. +. +.It Sy metaslab_aliquot Ns = Ns Sy 524288 Ns B Po 512kB Pc Pq ulong +Metaslab granularity, in bytes.
+This is roughly similar to what would be referred to as the "stripe size" +in traditional RAID arrays. +In normal operation, ZFS will try to write this amount of data +to a top-level vdev before moving on to the next one. +. +.It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable metaslab group biasing based on their vdevs' over- or under-utilization +relative to the pool. +. +.It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16MB + 1B Pc Pq ulong +Make some blocks above a certain size be gang blocks. +This option is used by the test suite to facilitate testing. +. +.It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +When attempting to log an output nvlist of an ioctl in the on-disk history, +the output will not be stored if it is larger than this size (in bytes). +This must be less than +.Sy DMU_MAX_ACCESS Pq 64MB . +This applies primarily to +.Fn zfs_ioc_channel_program Pq cf. Xr zfs-program 8 . +. +.It Sy zfs_keep_log_spacemaps_at_export Ns = Ns Sy 0 Ns | Ns 1 Pq int +Prevent log spacemaps from being destroyed during pool exports and destroys. +. +.It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable/disable segment-based metaslab selection. +. +.It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int +When using segment-based metaslab selection, continue allocating +from the active metaslab until this option's +worth of buckets have been exhausted. +. +.It Sy metaslab_debug_load Ns = Ns Sy 0 Ns | Ns 1 Pq int +Load all metaslabs during pool import. +. +.It Sy metaslab_debug_unload Ns = Ns Sy 0 Ns | Ns 1 Pq int +Prevent metaslabs from being unloaded. +. +.It Sy metaslab_fragmentation_factor_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable use of the fragmentation metric in computing metaslab weights. +. +.It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +Maximum distance to search forward from the last offset.
+Without this limit, fragmented pools can see +.Em >100`000 +iterations and +.Fn metaslab_block_picker +becomes the performance limiting factor on high-performance storage. +.Pp +With the default setting of +.Sy 16MB , +we typically see less than +.Em 500 +iterations, even with very fragmented +.Sy ashift Ns = Ns Sy 9 +pools. +The maximum number of iterations possible is +.Sy metaslab_df_max_search / 2^(ashift+1) . +With the default setting of +.Sy 16MB +this is +.Em 16*1024 Pq with Sy ashift Ns = Ns Sy 9 +or +.Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 . +. +.It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int +If not searching forward (due to +.Sy metaslab_df_max_search , metaslab_df_free_pct , +.No or Sy metaslab_df_alloc_threshold ) , +this tunable controls which segment is used. +If set, we will use the largest free segment. +If unset, we will use a segment of at least the requested size. +. +.It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1h Pc Pq ulong +When we unload a metaslab, we cache the size of the largest free chunk. +We use that cached size to determine whether or not to load a metaslab +for a given allocation. +As more frees accumulate in that metaslab while it's unloaded, +the cached max size becomes less and less accurate. +After a number of seconds controlled by this tunable, +we stop considering the cached max size and start +considering only the histogram instead. +. +.It Sy zfs_metaslab_mem_limit Ns = Ns Sy 25 Ns % Pq int +When we are loading a new metaslab, we check the amount of memory being used +to store metaslab range trees. +If it is over a threshold, we attempt to unload the least recently used metaslab +to prevent the system from clogging all of its memory with range trees. +This tunable sets the percentage of total system memory that is the threshold. +. +.It Sy zfs_metaslab_try_hard_before_gang Ns = Ns Sy 0 Ns | Ns 1 Pq int +.Bl -item -compact +.It +If unset, we will first try normal allocation. 
+.It +If that fails then we will do a gang allocation. +.It +If that fails then we will do a "try hard" gang allocation. +.It +If that fails then we will have a multi-layer gang block. +.El +.Pp +.Bl -item -compact +.It +If set, we will first try normal allocation. +.It +If that fails then we will do a "try hard" allocation. +.It +If that fails we will do a gang allocation. +.It +If that fails we will do a "try hard" gang allocation. +.It +If that fails then we will have a multi-layer gang block. +.El +. +.It Sy zfs_metaslab_find_max_tries Ns = Ns Sy 100 Pq int +When not trying hard, we only consider this number of the best metaslabs. +This improves performance, especially when there are many metaslabs per vdev +and the allocation can't actually be satisfied +(so we would otherwise iterate all metaslabs). +. +.It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq int +When a vdev is added, target this number of metaslabs per top-level vdev. +. +.It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512MB Pc Pq int +Default limit for metaslab size. +. +.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong +Maximum ashift used when optimizing for logical -> physical sector size on new +top-level vdevs. +. +.It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong +Minimum ashift used when creating new top-level vdevs. +. +.It Sy zfs_vdev_min_ms_count Ns = Ns Sy 16 Pq int +Minimum number of metaslabs to create in a top-level vdev. +. +.It Sy vdev_validate_skip Ns = Ns Sy 0 Ns | Ns 1 Pq int +Skip label validation steps during pool import. +Changing is not recommended unless you know what you're doing +and are recovering a damaged label. +. +.It Sy zfs_vdev_ms_count_limit Ns = Ns Sy 131072 Po 128k Pc Pq int +Practical upper limit of total metaslabs per top-level vdev. +. +.It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable metaslab group preloading. +. 
+.It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Give more weight to metaslabs with lower LBAs, +assuming they have greater bandwidth, +as is typically the case on a modern constant angular velocity disk drive. +. +.It Sy metaslab_unload_delay Ns = Ns Sy 32 Pq int +After a metaslab is used, we keep it loaded for this many TXGs, to attempt to +reduce unnecessary reloading. +Note that both this many TXGs and +.Sy metaslab_unload_delay_ms +milliseconds must pass before unloading will occur. +. +.It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10min Pc Pq int +After a metaslab is used, we keep it loaded for this many milliseconds, +to attempt to reduce unnecessary reloading. +Note, that both this many milliseconds and +.Sy metaslab_unload_delay +TXGs must pass before unloading will occur. +. +.It Sy reference_history Ns = Ns Sy 3 Pq int +Maximum reference holders being tracked when reference_tracking_enable is active. +. +.It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int +Track reference holders to +.Sy refcount_t +objects (debug builds only). +. +.It Sy send_holes_without_birth_time Ns = Ns Sy 1 Ns | Ns 0 Pq int +When set, the +.Sy hole_birth +optimization will not be used, and all holes will always be sent during a +.Nm zfs Cm send . +This is useful if you suspect your datasets are affected by a bug in +.Sy hole_birth . +. +.It Sy spa_config_path Ns = Ns Pa /etc/zfs/zpool.cache Pq charp +SPA config file. +. +.It Sy spa_asize_inflation Ns = Ns Sy 24 Pq int +Multiplication factor used to estimate actual disk consumption from the +size of data being written. +The default value is a worst case estimate, +but lower values may be valid for a given pool depending on its configuration. +Pool administrators who understand the factors involved +may wish to specify a more realistic inflation factor, +particularly if they operate close to quota or capacity limits. +. 
+.It Sy spa_load_print_vdev_tree Ns = Ns Sy 0 Ns | Ns 1 Pq int +Whether to print the vdev tree in the debugging message buffer during pool import. +. +.It Sy spa_load_verify_data Ns = Ns Sy 1 Ns | Ns 0 Pq int +Whether to traverse data blocks during an "extreme rewind" +.Pq Fl X +import. +.Pp +An extreme rewind import normally performs a full traversal of all +blocks in the pool for verification. +If this parameter is unset, the traversal skips non-metadata blocks. +It can be toggled once the +import has started to stop or start the traversal of non-metadata blocks. +. +.It Sy spa_load_verify_metadata Ns = Ns Sy 1 Ns | Ns 0 Pq int +Whether to traverse blocks during an "extreme rewind" +.Pq Fl X +pool import. +.Pp +An extreme rewind import normally performs a full traversal of all +blocks in the pool for verification. +If this parameter is unset, the traversal is not performed. +It can be toggled once the import has started to stop or start the traversal. +. +.It Sy spa_load_verify_shift Ns = Ns Sy 4 Po 1/16th Pc Pq int +Sets the maximum number of bytes to consume during pool import to the log2 +fraction of the target ARC size. +. +.It Sy spa_slop_shift Ns = Ns Sy 5 Po 1/32nd Pc Pq int +Normally, we don't allow the last +.Sy 3.2% Pq Sy 1/2^spa_slop_shift +of space in the pool to be consumed. +This ensures that we don't run the pool completely out of space, +due to unaccounted changes (e.g. to the MOS). +It also limits the worst-case time to allocate space. +If we have less than this amount of free space, +most ZPL operations (e.g. write, create) will return +.Sy ENOSPC . +. +.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq int +During top-level vdev removal, chunks of data are copied from the vdev +which may include free space in order to trade bandwidth for IOPS. +This parameter determines the maximum span of free space, in bytes, +which will be included as "unnecessary" data in a chunk of copied data. 
+.Pp +The default value here was chosen to align with +.Sy zfs_vdev_read_gap_limit , +which is a similar concept when doing +regular reads (but there's no reason it has to be the same). +. +.It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512B Pc Pq ulong +Logical ashift for file-based devices. +. +.It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512B Pc Pq ulong +Physical ashift for file-based devices. +. +.It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int +If set, when we start iterating over a ZAP object, +prefetch the entire object (all leaf blocks). +However, this is limited by +.Sy dmu_prefetch_max . +. +.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq ulong +If prefetching is enabled, disable prefetching for reads larger than this size. +. +.It Sy zfetch_max_distance Ns = Ns Sy 8388608 Ns B Po 8MB Pc Pq uint +Max bytes to prefetch per stream. +. +.It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64MB Pc Pq uint +Max bytes to prefetch indirects for per stream. +. +.It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint +Max number of streams per zfetch (prefetch streams per file). +. +.It Sy zfetch_min_sec_reap Ns = Ns Sy 2 Pq uint +Min time before an active prefetch stream can be reclaimed +. +.It Sy zfs_abd_scatter_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enables ARC from using scatter/gather lists and forces all allocations to be +linear in kernel memory. +Disabling can improve performance in some code paths +at the expense of fragmented kernel memory. +. +.It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER-1 Pq uint +Maximum number of consecutive memory pages allocated in a single block for +scatter/gather lists. +.Pp +The value of +.Sy MAX_ORDER +depends on kernel configuration. +. +.It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5kB Pc Pq uint +This is the minimum allocation size that will use scatter (page-based) ABDs. +Smaller allocations will use linear ABDs. +. 
+.It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq ulong +When the number of bytes consumed by dnodes in the ARC exceeds this number of +bytes, try to unpin some of it in response to demand for non-metadata. +This value acts as a ceiling to the amount of dnode metadata, and defaults to +.Sy 0 , +which indicates that a percent which is based on +.Sy zfs_arc_dnode_limit_percent +of the ARC meta buffers that may be used for dnodes. +.Pp +Also see +.Sy zfs_arc_meta_prune +which serves a similar purpose but is used +when the amount of metadata in the ARC exceeds +.Sy zfs_arc_meta_limit +rather than in response to overall demand for non-metadata. +. +.It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq ulong +Percentage that can be consumed by dnodes of ARC meta buffers. +.Pp +See also +.Sy zfs_arc_dnode_limit , +which serves a similar purpose but has a higher priority if nonzero. +. +.It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq ulong +Percentage of ARC dnodes to try to scan in response to demand for non-metadata +when the number of bytes consumed by dnodes exceeds +.Sy zfs_arc_dnode_limit . +. +.It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8kB Pc Pq int +The ARC's buffer hash table is sized based on the assumption of an average +block size of this value. +This works out to roughly 1MB of hash table per 1GB of physical memory +with 8-byte pointers. +For configurations with a known larger average block size, +this value can be increased to reduce the memory footprint. +. +.It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq int +When +.Fn arc_is_overflowing , +.Fn arc_get_data_impl +waits for this percent of the requested amount of data to be evicted. +For example, by default, for every +.Em 2kB +that's evicted, +.Em 1kB +of it may be "reused" by a new allocation. +Since this is above +.Sy 100 Ns % , +it ensures that progress is made towards getting +.Sy arc_size No under Sy arc_c . 
+Since this is finite, it ensures that allocations can still happen, +even during the potentially long time that +.Sy arc_size No is more than Sy arc_c . +. +.It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq int +Number of ARC headers to evict per sub-list before proceeding to another sub-list. +This batch-style operation prevents entire sub-lists from being evicted at once +but comes at a cost of additional unlocking and locking. +. +.It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq int +If set to a non zero value, it will replace the +.Sy arc_grow_retry +value with this value. +The +.Sy arc_grow_retry +.No value Pq default Sy 5 Ns s +is the number of seconds the ARC will wait before +trying to resume growth after a memory pressure event. +. +.It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int +Throttle I/O when free system memory drops below this percentage of total +system memory. +Setting this value to +.Sy 0 +will disable the throttle. +. +.It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq ulong +Max size of ARC in bytes. +If +.Sy 0 , +then the max size of ARC is determined by the amount of system memory installed. +Under Linux, half of system memory will be used as the limit. +Under +.Fx , +the larger of +.Sy all_system_memory - 1GB No and Sy 5/8 * all_system_memory +will be used as the limit. +This value must be at least +.Sy 67108864 Ns B Pq 64MB . +.Pp +This value can be changed dynamically, with some caveats. +It cannot be set back to +.Sy 0 +while running, and reducing it below the current ARC size will not cause +the ARC to shrink without memory pressure to induce shrinking. +. +.It Sy zfs_arc_meta_adjust_restarts Ns = Ns Sy 4096 Pq ulong +The number of restart passes to make while scanning the ARC attempting +to free buffers in order to stay below the +.Sy zfs_arc_meta_limit . +This value should not need to be tuned but is available to facilitate +performance analysis. +.
+.It Sy zfs_arc_meta_limit Ns = Ns Sy 0 Ns B Pq ulong +The maximum allowed size in bytes that metadata buffers are allowed to +consume in the ARC. +When this limit is reached, metadata buffers will be reclaimed, +even if the overall +.Sy arc_c_max +has not been reached. +It defaults to +.Sy 0 , +which indicates that a percentage based on +.Sy zfs_arc_meta_limit_percent +of the ARC may be used for metadata. +.Pp +This value may be changed dynamically, except that it must be set to an explicit value +.Pq cannot be set back to Sy 0 . +. +.It Sy zfs_arc_meta_limit_percent Ns = Ns Sy 75 Ns % Pq ulong +Percentage of ARC buffers that can be used for metadata. +.Pp +See also +.Sy zfs_arc_meta_limit , +which serves a similar purpose but has a higher priority if nonzero. +. +.It Sy zfs_arc_meta_min Ns = Ns Sy 0 Ns B Pq ulong +The minimum allowed size in bytes that metadata buffers may consume in +the ARC. +. +.It Sy zfs_arc_meta_prune Ns = Ns Sy 10000 Pq int +The number of dentries and inodes to be scanned looking for entries +which can be dropped. +This may be required when the ARC reaches the +.Sy zfs_arc_meta_limit +because dentries and inodes can pin buffers in the ARC. +Increasing this value will cause the dentry and inode caches +to be pruned more aggressively. +Setting this value to +.Sy 0 +will disable pruning the inode and dentry caches. +. +.It Sy zfs_arc_meta_strategy Ns = Ns Sy 1 Ns | Ns 0 Pq int +Define the strategy for ARC metadata buffer eviction (meta reclaim strategy): +.Bl -tag -compact -offset 4n -width "0 (META_ONLY)" +.It Sy 0 Pq META_ONLY +evict only the ARC metadata buffers +.It Sy 1 Pq BALANCED +additional data buffers may be evicted if required +to evict the required number of metadata buffers. +.El +. +.It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq ulong +Min size of ARC in bytes. +.No If set to Sy 0 , arc_c_min +will default to consuming the larger of +.Sy 32MB No or Sy all_system_memory/32 . +.
+.It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq int +Minimum time prefetched blocks are locked in the ARC. +. +.It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq int +Minimum time "prescient prefetched" blocks are locked in the ARC. +These blocks are meant to be prefetched fairly aggressively ahead of +the code that may use them. +. +.It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int +Number of missing top-level vdevs which will be allowed during +pool import (only in read-only mode). +. +.It Sy zfs_max_nvlist_src_size Ns = Ns Sy 0 Pq ulong +Maximum size in bytes allowed to be passed as +.Sy zc_nvlist_src_size +for ioctls on +.Pa /dev/zfs . +This prevents a user from causing the kernel to allocate +an excessive amount of memory. +When the limit is exceeded, the ioctl fails with +.Sy EINVAL +and a description of the error is sent to the +.Pa zfs-dbgmsg +log. +This parameter should not need to be touched under normal circumstances. +If +.Sy 0 , +equivalent to a quarter of the user-wired memory limit under +.Fx +and to +.Sy 134217728 Ns B Pq 128MB +under Linux. +. +.It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq int +To allow more fine-grained locking, each ARC state contains a series +of lists for both data and metadata objects. +Locking is performed at the level of these "sub-lists". +This parameter controls the number of sub-lists per ARC state, +and also applies to other uses of the multilist data structure. +.Pp +If +.Sy 0 , +equivalent to the greater of the number of online CPUs and +.Sy 4 . +. +.It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int +The ARC size is considered to be overflowing if it exceeds the current +ARC target size +.Pq Sy arc_c +by thresholds determined by this parameter. +Exceeding by +.Sy ( arc_c >> zfs_arc_overflow_shift ) * 0.5 +starts ARC reclamation process.
+If that appears insufficient, exceeding by +.Sy ( arc_c >> zfs_arc_overflow_shift ) * 1.5 +blocks new buffer allocation until the reclaim thread catches up. +Started reclamation process continues till ARC size returns below the +target size. +.Pp +The default value of +.Sy 8 +causes the ARC to start reclamation if it exceeds the target size by +.Em 0.2% +of the target size, and block allocations by +.Em 0.6% . +. +.It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq int +If nonzero, this will update +.Sy arc_p_min_shift Pq default Sy 4 +with the new value. +.Sy arc_p_min_shift No is used as a shift of Sy arc_c +when calculating the minimum +.Sy arc_p No size. +. +.It Sy zfs_arc_p_dampener_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int +Disable +.Sy arc_p +adapt dampener, which reduces the maximum single adjustment to +.Sy arc_p . +. +.It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq int +If nonzero, this will update +.Sy arc_shrink_shift Pq default Sy 7 +with the new value. +. +.It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint +Percent of pagecache to reclaim ARC to. +.Pp +This tunable allows the ZFS ARC to play more nicely +with the kernel's LRU pagecache. +It can guarantee that the ARC size won't collapse under scanning +pressure on the pagecache, yet still allows the ARC to be reclaimed down to +.Sy zfs_arc_min +if necessary. +This value is specified as percent of pagecache size (as measured by +.Sy NR_FILE_PAGES ) , +where that percent may exceed +.Sy 100 . +This +only operates during memory pressure/reclaim. +. +.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int +This is a limit on how many pages the ARC shrinker makes available for +eviction in response to one page allocation attempt. +Note that in practice, the kernel's shrinker can ask us to evict +up to about four times this for one allocation attempt.
+.Pp +The default limit of +.Sy 10000 Pq in practice, Em 160MB No per allocation attempt with 4kB pages +limits the amount of time spent attempting to reclaim ARC memory to +less than 100ms per allocation attempt, +even with a small average compressed block size of ~8kB. +.Pp +The parameter can be set to 0 (zero) to disable the limit, +and only applies on Linux. +. +.It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq ulong +The target number of bytes the ARC should leave as free memory on the system. +If zero, equivalent to the bigger of +.Sy 512kB No and Sy all_system_memory/64 . +. +.It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int +Disable pool import at module load by ignoring the cache file +.Pq Sy spa_config_path . +. +.It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint +Rate limit checksum events to this many per second. +Note that this should not be set below the ZED thresholds +(currently 10 checksums over 10 seconds) +or else the daemon may not trigger any action. +. +.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq int +This controls the amount of time that a ZIL block (lwb) will remain "open" +when it isn't "full", and it has a thread waiting for it to be committed to +stable storage. +The timeout is scaled based on a percentage of the last lwb +latency to avoid significantly impacting the latency of each individual +transaction record (itx). +. +.It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int +Vdev indirection layer (used for device removal) sleeps for this many +milliseconds during mapping generation. +Intended for use with the test suite to throttle vdev removal speed. +. +.It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq int +Minimum percent of obsolete bytes in vdev mapping required to attempt to condense +.Pq see Sy zfs_condense_indirect_vdevs_enable . +Intended for use with the test suite +to facilitate triggering condensing as needed. +. 
+.It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable condensing indirect vdev mappings. +When set, attempt to condense indirect vdev mappings +if the mapping uses more than +.Sy zfs_condense_min_mapping_bytes +bytes of memory and if the obsolete space map object uses more than +.Sy zfs_condense_max_obsolete_bytes +bytes on-disk. +The condensing process is an attempt to save memory by removing obsolete mappings. +. +.It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1GB Pc Pq ulong +Only attempt to condense indirect vdev mappings if the on-disk size +of the obsolete space map object is greater than this number of bytes +.Pq see Sy zfs_condense_indirect_vdevs_enable . +. +.It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq ulong +Minimum size vdev mapping to attempt to condense +.Pq see Sy zfs_condense_indirect_vdevs_enable . +. +.It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int +Internally ZFS keeps a small log to facilitate debugging. +The log is enabled by default, and can be disabled by unsetting this option. +The contents of the log can be accessed by reading +.Pa /proc/spl/kstat/zfs/dbgmsg . +Writing +.Sy 0 +to the file clears the log. +.Pp +This setting does not influence debug prints due to +.Sy zfs_flags . +. +.It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4MB Pc Pq int +Maximum size of the internal ZFS debug log. +. +.It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int +Historically used for controlling what reporting was available under +.Pa /proc/spl/kstat/zfs . +No effect. +. +.It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +When a pool sync operation takes longer than +.Sy zfs_deadman_synctime_ms , +or when an individual I/O operation takes longer than +.Sy zfs_deadman_ziotime_ms , +then the operation is considered to be "hung". +If +.Sy zfs_deadman_enabled +is set, then the deadman behavior is invoked as described by +.Sy zfs_deadman_failmode . 
+By default, the deadman is enabled and set to +.Sy wait +which results in "hung" I/Os only being logged. +The deadman is automatically disabled when a pool gets suspended. +. +.It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp +Controls the failure behavior when the deadman detects a "hung" I/O operation. +Valid values are: +.Bl -tag -compact -offset 4n -width "continue" +.It Sy wait +Wait for a "hung" operation to complete. +For each "hung" operation a "deadman" event will be posted +describing that operation. +.It Sy continue +Attempt to recover from a "hung" operation by re-dispatching it +to the I/O pipeline if possible. +.It Sy panic +Panic the system. +This can be used to facilitate automatic fail-over +to a properly configured fail-over partner. +.El +. +.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1min Pc Pq int +Check time in milliseconds. +This defines the frequency at which we check for hung I/O requests +and potentially invoke the +.Sy zfs_deadman_failmode +behavior. +. +.It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10min Pc Pq ulong +Interval in milliseconds after which the deadman is triggered and also +the interval after which a pool sync operation is considered to be "hung". +Once this limit is exceeded the deadman will be invoked every +.Sy zfs_deadman_checktime_ms +milliseconds until the pool sync completes. +. +.It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5min Pc Pq ulong +Interval in milliseconds after which the deadman is triggered and an +individual I/O operation is considered to be "hung". +As long as the operation remains "hung", +the deadman will be invoked every +.Sy zfs_deadman_checktime_ms +milliseconds until the operation completes. +. +.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int +Enable prefetching dedup-ed blocks which are going to be freed. +. 
+.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq int +Start to delay each transaction once there is this amount of dirty data, +expressed as a percentage of +.Sy zfs_dirty_data_max . +This value should be at least +.Sy zfs_vdev_async_write_active_max_dirty_percent . +.No See Sx ZFS TRANSACTION DELAY . +. +.It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int +This controls how quickly the transaction delay approaches infinity. +Larger values cause longer delays for a given amount of dirty data. +.Pp +For the smoothest delay, this value should be about 1 billion divided +by the maximum number of operations per second. +This will smoothly handle between ten times and a tenth of this number. +.No See Sx ZFS TRANSACTION DELAY . +.Pp +.Sy zfs_delay_scale * zfs_dirty_data_max Em must be smaller than Sy 2^64 . +. +.It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disables requirement for IVset GUIDs to be present and match when doing a raw +receive of encrypted datasets. +Intended for users whose pools were created with +OpenZFS pre-release versions and now have compatibility issues. +. +.It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong +Maximum number of uses of a single salt value before generating a new one for +encrypted datasets. +The default value is also the maximum. +. +.It Sy zfs_object_mutex_size Ns = Ns Sy 64 Pq uint +Size of the znode hashtable used for holds. +.Pp +Due to the need to hold locks on objects that may not exist yet, kernel mutexes +are not created per-object and instead a hashtable is used where collisions +will result in objects waiting when there is not actually contention on the +same object. +. +.It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int +Rate limit delay and deadman zevents (which report slow I/Os) to this many per +second. +. 
+.It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1GB Pc Pq ulong +Upper-bound limit for unflushed metadata changes to be held by the +log spacemap in memory, in bytes. +. +.It Sy zfs_unflushed_max_mem_ppm Ns = Ns Sy 1000 Ns ppm Po 0.1% Pc Pq ulong +Part of overall system memory that ZFS allows to be used +for unflushed metadata changes by the log spacemap, in millionths. +. +.It Sy zfs_unflushed_log_block_max Ns = Ns Sy 262144 Po 256k Pc Pq ulong +Describes the maximum number of log spacemap blocks allowed for each pool. +The default value means that the space in all the log spacemaps +can add up to no more than +.Sy 262144 +blocks (which means +.Em 32GB +of logical space before compression and ditto blocks, +assuming that blocksize is +.Em 128kB ) . +.Pp +This tunable is important because it involves a trade-off between import +time after an unclean export and the frequency of flushing metaslabs. +The higher this number is, the more log blocks we allow when the pool is +active which means that we flush metaslabs less often and thus decrease +the number of I/Os for spacemap updates per TXG. +At the same time though, that means that in the event of an unclean export, +there will be more log spacemap blocks for us to read, inducing overhead +in the import time of the pool. +The lower the number, the amount of flushing increases, destroying log +blocks quicker as they become obsolete faster, which leaves less blocks +to be read during import time after a crash. +.Pp +Each log spacemap block existing during pool import leads to approximately +one extra logical I/O issued. +This is the reason why this tunable is exposed in terms of blocks rather +than space used. +. +.It Sy zfs_unflushed_log_block_min Ns = Ns Sy 1000 Pq ulong +If the number of metaslabs is small and our incoming rate is high, +we could get into a situation that we are flushing all our metaslabs every TXG. +Thus we always allow at least this many log blocks. +. 
+.It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq ulong +Tunable used to determine the number of blocks that can be used for +the spacemap log, expressed as a percentage of the total number of +metaslabs in the pool. +. +.It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint +When enabled, files will not be asynchronously removed from the list of pending +unlinks and the space they consume will be leaked. +Once this option has been disabled and the dataset is remounted, +the pending unlinks will be processed and the freed space returned to the pool. +This option is used by the test suite. +. +.It Sy zfs_delete_blocks Ns = Ns Sy 20480 Pq ulong +This is used to define a large file for the purposes of deletion. +Files containing more than +.Sy zfs_delete_blocks +will be deleted asynchronously, while smaller files are deleted synchronously. +Decreasing this value will reduce the time spent in an +.Xr unlink 2 +system call, at the expense of a longer delay before the freed space is available. +. +.It Sy zfs_dirty_data_max Ns = Pq int +Determines the dirty space limit in bytes. +Once this limit is exceeded, new writes are halted until space frees up. +This parameter takes precedence over +.Sy zfs_dirty_data_max_percent . +.No See Sx ZFS TRANSACTION DELAY . +.Pp +Defaults to +.Sy physical_ram/10 , +capped at +.Sy zfs_dirty_data_max_max . +. +.It Sy zfs_dirty_data_max_max Ns = Pq int +Maximum allowable value of +.Sy zfs_dirty_data_max , +expressed in bytes. +This limit is only enforced at module load time, and will be ignored if +.Sy zfs_dirty_data_max +is later changed. +This parameter takes precedence over +.Sy zfs_dirty_data_max_max_percent . +.No See Sx ZFS TRANSACTION DELAY . +.Pp +Defaults to +.Sy physical_ram/4 . +. +.It Sy zfs_dirty_data_max_max_percent Ns = Ns Sy 25 Ns % Pq int +Maximum allowable value of +.Sy zfs_dirty_data_max , +expressed as a percentage of physical RAM.
+This limit is only enforced at module load time, and will be ignored if +.Sy zfs_dirty_data_max +is later changed. +The parameter +.Sy zfs_dirty_data_max_max +takes precedence over this one. +.No See Sx ZFS TRANSACTION DELAY . +. +.It Sy zfs_dirty_data_max_percent Ns = Ns Sy 10 Ns % Pq int +Determines the dirty space limit, expressed as a percentage of all memory. +Once this limit is exceeded, new writes are halted until space frees up. +The parameter +.Sy zfs_dirty_data_max +takes precedence over this one. +.No See Sx ZFS TRANSACTION DELAY . +.Pp +Subject to +.Sy zfs_dirty_data_max_max . +. +.It Sy zfs_dirty_data_sync_percent Ns = Ns Sy 20 Ns % Pq int +Start syncing out a transaction group if there's at least this much dirty data +.Pq as a percentage of Sy zfs_dirty_data_max . +This should be less than +.Sy zfs_vdev_async_write_active_min_dirty_percent . +. +.It Sy zfs_wrlog_data_max Ns = Pq int +The upper limit of write-transaction zil log data size in bytes. +Once it is reached, write operation is blocked, until log data is cleared out +after transaction group sync. Because of some overhead, it should be set +at least 2 times the size of +.Sy zfs_dirty_data_max +.No to prevent harming normal write throughput. +It also should be smaller than the size of the slog device if slog is present. +.Pp +Defaults to +.Sy zfs_dirty_data_max*2 +. +.It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint +Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be +preallocated for a file in order to guarantee that later writes will not +run out of space. +Instead, +.Xr fallocate 2 +space preallocation only checks that sufficient space is currently available +in the pool or the user's project quota allocation, +and then creates a sparse file of the requested size. +The requested space is multiplied by +.Sy zfs_fallocate_reserve_percent +to allow additional space for indirect blocks and other internal metadata. 
+Setting this to +.Sy 0 +disables support for +.Xr fallocate 2 +and causes it to return +.Sy EOPNOTSUPP . +. +.It Sy zfs_fletcher_4_impl Ns = Ns Sy fastest Pq string +Select a fletcher 4 implementation. +.Pp +Supported selectors are: +.Sy fastest , scalar , sse2 , ssse3 , avx2 , avx512f , avx512bw , +.No and Sy aarch64_neon . +All except +.Sy fastest No and Sy scalar +require instruction set extensions to be available, +and will only appear if ZFS detects that they are present at runtime. +If multiple implementations of fletcher 4 are available, the +.Sy fastest +will be chosen using a micro benchmark. +Selecting +.Sy scalar +results in the original CPU-based calculation being used. +Selecting any option other than +.Sy fastest No or Sy scalar +results in vector instructions +from the respective CPU instruction set being used. +. +.It Sy zfs_free_bpobj_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable/disable the processing of the free_bpobj object. +. +.It Sy zfs_async_block_max_blocks Ns = Ns Sy ULONG_MAX Po unlimited Pc Pq ulong +Maximum number of blocks freed in a single TXG. +. +.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq ulong +Maximum number of dedup blocks freed in a single TXG. +. +.It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Pq ulong +If nonzero, override record size calculation for +.Nm zfs Cm send +estimates. +. +.It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq int +Maximum asynchronous read I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_async_read_min_active Ns = Ns Sy 1 Pq int +Minimum asynchronous read I/O operation active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_async_write_active_max_dirty_percent Ns = Ns Sy 60 Ns % Pq int +When the pool has more than this much dirty data, use +.Sy zfs_vdev_async_write_max_active +to limit active async writes. +If the dirty data is between the minimum and maximum, +the active I/O limit is linearly interpolated.
+.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_async_write_active_min_dirty_percent Ns = Ns Sy 30 Ns % Pq int +When the pool has less than this much dirty data, use +.Sy zfs_vdev_async_write_min_active +to limit active async writes. +If the dirty data is between the minimum and maximum, +the active I/O limit is linearly +interpolated. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_async_write_max_active Ns = Ns Sy 30 Pq int +Maximum asynchronous write I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_async_write_min_active Ns = Ns Sy 2 Pq int +Minimum asynchronous write I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +.Pp +Lower values are associated with better latency on rotational media but poorer +resilver performance. +The default value of +.Sy 2 +was chosen as a compromise. +A value of +.Sy 3 +has been shown to improve resilver performance further at a cost of +further increasing latency. +. +.It Sy zfs_vdev_initializing_max_active Ns = Ns Sy 1 Pq int +Maximum initializing I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_initializing_min_active Ns = Ns Sy 1 Pq int +Minimum initializing I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_max_active Ns = Ns Sy 1000 Pq int +The maximum number of I/O operations active to each device. +Ideally, this will be at least the sum of each queue's +.Sy max_active . +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_rebuild_max_active Ns = Ns Sy 3 Pq int +Maximum sequential resilver I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_rebuild_min_active Ns = Ns Sy 1 Pq int +Minimum sequential resilver I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_removal_max_active Ns = Ns Sy 2 Pq int +Maximum removal I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. 
+.It Sy zfs_vdev_removal_min_active Ns = Ns Sy 1 Pq int +Minimum removal I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_scrub_max_active Ns = Ns Sy 2 Pq int +Maximum scrub I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_scrub_min_active Ns = Ns Sy 1 Pq int +Minimum scrub I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_sync_read_max_active Ns = Ns Sy 10 Pq int +Maximum synchronous read I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_sync_read_min_active Ns = Ns Sy 10 Pq int +Minimum synchronous read I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_sync_write_max_active Ns = Ns Sy 10 Pq int +Maximum synchronous write I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_sync_write_min_active Ns = Ns Sy 10 Pq int +Minimum synchronous write I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_trim_max_active Ns = Ns Sy 2 Pq int +Maximum trim/discard I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_trim_min_active Ns = Ns Sy 1 Pq int +Minimum trim/discard I/O operations active to each device. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_nia_delay Ns = Ns Sy 5 Pq int +For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), +the number of concurrently-active I/O operations is limited to +.Sy zfs_*_min_active , +unless the vdev is "idle". +When there are no interactive I/O operations active (synchronous or otherwise), +and +.Sy zfs_vdev_nia_delay +operations have completed since the last interactive operation, +then the vdev is considered to be "idle", +and the number of concurrently-active non-interactive operations is increased to +.Sy zfs_*_max_active . +.No See Sx ZFS I/O SCHEDULER . +. 
+.It Sy zfs_vdev_nia_credit Ns = Ns Sy 5 Pq int +Some HDDs tend to prioritize sequential I/O so strongly, that concurrent +random I/O latency reaches several seconds. +On some HDDs this happens even if sequential I/O operations +are submitted one at a time, and so setting +.Sy zfs_*_max_active Ns = Sy 1 +does not help. +To prevent non-interactive I/O, like scrub, +from monopolizing the device, no more than +.Sy zfs_vdev_nia_credit operations can be sent +while there are outstanding incomplete interactive operations. +This enforced wait ensures the HDD services the interactive I/O +within a reasonable amount of time. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_queue_depth_pct Ns = Ns Sy 1000 Ns % Pq int +Maximum number of queued allocations per top-level vdev expressed as +a percentage of +.Sy zfs_vdev_async_write_max_active , +which allows the system to detect devices that are more capable +of handling allocations and to allocate more blocks to those devices. +This allows for dynamic allocation distribution when devices are imbalanced, +as fuller devices will tend to be slower than empty devices. +.Pp +Also see +.Sy zio_dva_throttle_enabled . +. +.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int +Time before expiring +.Pa .zfs/snapshot . +. +.It Sy zfs_admin_snapshot Ns = Ns Sy 0 Ns | Ns 1 Pq int +Allow the creation, removal, or renaming of entries in the +.Sy .zfs/snapshot +directory to cause the creation, destruction, or renaming of snapshots. +When enabled, this functionality works both locally and over NFS exports +which have the +.Em no_root_squash +option set. +. +.It Sy zfs_flags Ns = Ns Sy 0 Pq int +Set additional debugging flags. +The following flags may be bitwise-ored together: +.TS +box; +lbz r l l . + Value Symbolic Name Description +_ + 1 ZFS_DEBUG_DPRINTF Enable dprintf entries in the debug log. +* 2 ZFS_DEBUG_DBUF_VERIFY Enable extra dbuf verifications. +* 4 ZFS_DEBUG_DNODE_VERIFY Enable extra dnode verifications. 
+ 8 ZFS_DEBUG_SNAPNAMES Enable snapshot name verification. + 16 ZFS_DEBUG_MODIFY Check for illegally modified ARC buffers. + 64 ZFS_DEBUG_ZIO_FREE Enable verification of block frees. + 128 ZFS_DEBUG_HISTOGRAM_VERIFY Enable extra spacemap histogram verifications. + 256 ZFS_DEBUG_METASLAB_VERIFY Verify space accounting on disk matches in-memory \fBrange_trees\fP. + 512 ZFS_DEBUG_SET_ERROR Enable \fBSET_ERROR\fP and dprintf entries in the debug log. + 1024 ZFS_DEBUG_INDIRECT_REMAP Verify split blocks created by device removal. + 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. + 4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log + and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing. +.TE +.Sy \& * No Requires debug build. +. +.It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int +If destroy encounters an +.Sy EIO +while reading metadata (e.g. indirect blocks), +space referenced by the missing metadata can not be freed. +Normally this causes the background destroy to become "stalled", +as it is unable to make forward progress. +While in this stalled state, all remaining space to free +from the error-encountering filesystem is "temporarily leaked". +Set this flag to cause it to ignore the +.Sy EIO , +permanently leak the space from indirect blocks that can not be read, +and continue to free everything else that it can. +.Pp +The default "stalling" behavior is useful if the storage partially +fails (i.e. some but not all I/O operations fail), and then later recovers. +In this case, we will be able to continue pool operations while it is +partially failed, and when it recovers, we can continue to free the +space, with no leaks. +Note, however, that this case is actually fairly rare. +.Pp +Typically pools either +.Bl -enum -compact -offset 4n -width "1." +.It +fail completely (but perhaps temporarily, +e.g. 
due to a top-level vdev going offline), or +.It +have localized, permanent errors (e.g. disk returns the wrong data +due to bit flip or firmware bug). +.El +In the former case, this setting does not matter because the +pool will be suspended and the sync thread will not be able to make +forward progress regardless. +In the latter, because the error is permanent, the best we can do +is leak the minimum amount of space, +which is what setting this flag will do. +It is therefore reasonable for this flag to normally be set, +but we chose the more conservative approach of not setting it, +so that there is no possibility of +leaking space in the "partial temporary" failure case. +. +.It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq int +During a +.Nm zfs Cm destroy +operation using the +.Sy async_destroy +feature, +a minimum of this much time will be spent working on freeing blocks per TXG. +. +.It Sy zfs_obsolete_min_time_ms Ns = Ns Sy 500 Ns ms Pq int +Similar to +.Sy zfs_free_min_time_ms , +but for cleanup of old indirection records for removed vdevs. +. +.It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq long +Largest data block to write to the ZIL. +Larger blocks will be treated as if the dataset being written to had the +.Sy logbias Ns = Ns Sy throughput +property set. +. +.It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq ulong +Pattern written to vdev free space by +.Xr zpool-initialize 8 . +. +.It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq ulong +Size of writes used by +.Xr zpool-initialize 8 . +This option is used by the test suite. +. +.It Sy zfs_livelist_max_entries Ns = Ns Sy 500000 Po 5*10^5 Pc Pq ulong +The threshold size (in block pointers) at which we create a new sub-livelist. +Larger sublists are more costly from a memory perspective but the fewer +sublists there are, the lower the cost of insertion. +. 
+.It Sy zfs_livelist_min_percent_shared Ns = Ns Sy 75 Ns % Pq int +If the amount of shared space between a snapshot and its clone drops below +this threshold, the clone turns off the livelist and reverts to the old +deletion method. +This is in place because livelists no longer give us a benefit +once a clone has been overwritten enough. +. +.It Sy zfs_livelist_condense_new_alloc Ns = Ns Sy 0 Pq int +Incremented each time an extra ALLOC blkptr is added to a livelist entry while +it is being condensed. +This option is used by the test suite to track race conditions. +. +.It Sy zfs_livelist_condense_sync_cancel Ns = Ns Sy 0 Pq int +Incremented each time livelist condensing is canceled while in +.Fn spa_livelist_condense_sync . +This option is used by the test suite to track race conditions. +. +.It Sy zfs_livelist_condense_sync_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int +When set, the livelist condense process pauses indefinitely before +executing the synctask - +.Fn spa_livelist_condense_sync . +This option is used by the test suite to trigger race conditions. +. +.It Sy zfs_livelist_condense_zthr_cancel Ns = Ns Sy 0 Pq int +Incremented each time livelist condensing is canceled while in +.Fn spa_livelist_condense_cb . +This option is used by the test suite to track race conditions. +. +.It Sy zfs_livelist_condense_zthr_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int +When set, the livelist condense process pauses indefinitely before +executing the open context condensing work in +.Fn spa_livelist_condense_cb . +This option is used by the test suite to trigger race conditions. +. +.It Sy zfs_lua_max_instrlimit Ns = Ns Sy 100000000 Po 10^8 Pc Pq ulong +The maximum execution time limit that can be set for a ZFS channel program, +specified as a number of Lua instructions. +. +.It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100MB Pc Pq ulong +The maximum memory limit that can be set for a ZFS channel program, specified +in bytes. +. 
+.It Sy zfs_max_dataset_nesting Ns = Ns Sy 50 Pq int +The maximum depth of nested datasets. +This value can be tuned temporarily to +fix existing datasets that exceed the predefined limit. +. +.It Sy zfs_max_log_walking Ns = Ns Sy 5 Pq ulong +The number of past TXGs that the flushing algorithm of the log spacemap +feature uses to estimate incoming log blocks. +. +.It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq ulong +Maximum number of rows allowed in the summary of the spacemap log. +. +.It Sy zfs_max_recordsize Ns = Ns Sy 1048576 Po 1MB Pc Pq int +We currently support block sizes from +.Em 512B No to Em 16MB . +The benefits of larger blocks, and thus larger I/O, +need to be weighed against the cost of COWing a giant block to modify one byte. +Additionally, very large blocks can have an impact on I/O latency, +and also potentially on the memory allocator. +Therefore, we do not allow the recordsize to be set larger than this tunable. +Larger blocks can be created by changing it, +and pools with larger blocks can always be imported and used, +regardless of this setting. +. +.It Sy zfs_allow_redacted_dataset_mount Ns = Ns Sy 0 Ns | Ns 1 Pq int +Allow datasets received with redacted send/receive to be mounted. +Normally disabled because these datasets may be missing key data. +. +.It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq ulong +Minimum number of metaslabs to flush per dirty TXG. +. +.It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq int +Allow metaslabs to keep their active state as long as their fragmentation +percentage is no more than this value. +An active metaslab that exceeds this threshold +will no longer keep its active status allowing better metaslabs to be selected. +. +.It Sy zfs_mg_fragmentation_threshold Ns = Ns Sy 95 Ns % Pq int +Metaslab groups are considered eligible for allocations if their +fragmentation metric (measured as a percentage) is less than or equal to +this value. 
+If a metaslab group exceeds this threshold then it will be +skipped unless all metaslab groups within the metaslab class have also +crossed this threshold. +. +.It Sy zfs_mg_noalloc_threshold Ns = Ns Sy 0 Ns % Pq int +Defines a threshold at which metaslab groups should be eligible for allocations. +The value is expressed as a percentage of free space +beyond which a metaslab group is always eligible for allocations. +If a metaslab group's free space is less than or equal to the +threshold, the allocator will avoid allocating to that group +unless all groups in the pool have reached the threshold. +Once all groups have reached the threshold, all groups are allowed to accept +allocations. +The default value of +.Sy 0 +disables the feature and causes all metaslab groups to be eligible for allocations. +.Pp +This parameter allows one to deal with pools having heavily imbalanced +vdevs such as would be the case when a new vdev has been added. +Setting the threshold to a non-zero percentage will stop allocations +from being made to vdevs that aren't filled to the specified percentage +and allow lesser filled vdevs to acquire more allocations than they +otherwise would under the old +.Sy zfs_mg_alloc_failures +facility. +. +.It Sy zfs_ddt_data_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int +If enabled, ZFS will place DDT data into the special allocation class. +. +.It Sy zfs_user_indirect_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int +If enabled, ZFS will place user data indirect blocks +into the special allocation class. +. +.It Sy zfs_multihost_history Ns = Ns Sy 0 Pq int +Historical statistics for this many latest multihost updates will be available in +.Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /multihost . +. +.It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq ulong +Used to control the frequency of multihost writes which are performed when the +.Sy multihost +pool property is on. 
+This is one of the factors used to determine the +length of the activity check during import. +.Pp +The multihost write period is +.Sy zfs_multihost_interval / leaf-vdevs . +On average a multihost write will be issued for each leaf vdev +every +.Sy zfs_multihost_interval +milliseconds. +In practice, the observed period can vary with the I/O load +and this observed value is the delay which is stored in the uberblock. +. +.It Sy zfs_multihost_import_intervals Ns = Ns Sy 20 Pq uint +Used to control the duration of the activity test on import. +Smaller values of +.Sy zfs_multihost_import_intervals +will reduce the import time but increase +the risk of failing to detect an active pool. +The total activity check time is never allowed to drop below one second. +.Pp +On import the activity check waits a minimum amount of time determined by +.Sy zfs_multihost_interval * zfs_multihost_import_intervals , +or the same product computed on the host which last had the pool imported, +whichever is greater. +The activity check time may be further extended if the value of MMP +delay found in the best uberblock indicates actual multihost updates happened +at longer intervals than +.Sy zfs_multihost_interval . +A minimum of +.Em 100ms +is enforced. +.Pp +.Sy 0 No is equivalent to Sy 1 . +. +.It Sy zfs_multihost_fail_intervals Ns = Ns Sy 10 Pq uint +Controls the behavior of the pool when multihost write failures or delays are +detected. +.Pp +When +.Sy 0 , +multihost write failures or delays are ignored. +The failures will still be reported to the ZED which depending on +its configuration may take action such as suspending the pool or offlining a +device. +.Pp +Otherwise, the pool will be suspended if +.Sy zfs_multihost_fail_intervals * zfs_multihost_interval +milliseconds pass without a successful MMP write. +This guarantees the activity test will see MMP writes if the pool is imported. 
+.Sy 1 No is equivalent to Sy 2 ; +this is necessary to prevent the pool from being suspended +due to normal, small I/O latency variations. +. +.It Sy zfs_no_scrub_io Ns = Ns Sy 0 Ns | Ns 1 Pq int +Set to disable scrub I/O. +This results in scrubs not actually scrubbing data and +simply doing a metadata crawl of the pool instead. +. +.It Sy zfs_no_scrub_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int +Set to disable block prefetching for scrubs. +. +.It Sy zfs_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disable cache flush operations on disks when writing. +Setting this will cause pool corruption on power loss +if a volatile out-of-order write cache is enabled. +. +.It Sy zfs_nopwrite_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Allow no-operation writes. +The occurrence of nopwrites will further depend on other pool properties +.Pq i.a. the checksumming and compression algorithms . +. +.It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 0 Ns | Ns 1 Pq int +Enable forcing TXG sync to find holes. +When enabled forces ZFS to act like prior versions when +.Sy SEEK_HOLE No or Sy SEEK_DATA +flags are used, which, when a dnode is dirty, +causes TXGs to be synced so that this data can be found. +. +.It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50MB Pc Pq int +The number of bytes which should be prefetched during a pool traversal, like +.Nm zfs Cm send +or other data crawling operations. +. +.It Sy zfs_traverse_indirect_prefetch_limit Ns = Ns Sy 32 Pq int +The number of blocks pointed by indirect (non-L0) block which should be +prefetched during a pool traversal, like +.Nm zfs Cm send +or other data crawling operations. +. +.It Sy zfs_per_txg_dirty_frees_percent Ns = Ns Sy 5 Ns % Pq ulong +Control percentage of dirtied indirect blocks from frees allowed into one TXG. +After this threshold is crossed, additional frees will wait until the next TXG. +.Sy 0 No disables this throttle. +. +.It Sy zfs_prefetch_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disable predictive prefetch. 
+Note that it leaves "prescient" prefetch (for e.g.\& +.Nm zfs Cm send ) +intact. +Unlike predictive prefetch, prescient prefetch never issues I/O +that ends up not being needed, so it can't hurt performance. +. +.It Sy zfs_qat_checksum_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disable QAT hardware acceleration for SHA256 checksums. +May be unset after the ZFS modules have been loaded to initialize the QAT +hardware as long as support is compiled in and the QAT driver is present. +. +.It Sy zfs_qat_compress_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disable QAT hardware acceleration for gzip compression. +May be unset after the ZFS modules have been loaded to initialize the QAT +hardware as long as support is compiled in and the QAT driver is present. +. +.It Sy zfs_qat_encrypt_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disable QAT hardware acceleration for AES-GCM encryption. +May be unset after the ZFS modules have been loaded to initialize the QAT +hardware as long as support is compiled in and the QAT driver is present. +. +.It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq long +Bytes to read per chunk. +. +.It Sy zfs_read_history Ns = Ns Sy 0 Pq int +Historical statistics for this many latest reads will be available in +.Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /reads . +. +.It Sy zfs_read_history_hits Ns = Ns Sy 0 Ns | Ns 1 Pq int +Include cache hits in read history. +. +.It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq ulong +Maximum read segment size to issue when sequentially resilvering a +top-level vdev. +. +.It Sy zfs_rebuild_scrub_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Automatically start a pool scrub when the last active sequential resilver +completes in order to verify the checksums of all blocks which have been +resilvered. +This is enabled by default and strongly recommended. +. 
+.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 33554432 Ns B Po 32MB Pc Pq ulong +Maximum amount of I/O that can be concurrently issued for a sequential +resilver per leaf device, given in bytes. +. +.It Sy zfs_reconstruct_indirect_combinations_max Ns = Ns Sy 4096 Pq int +If an indirect split block contains more than this many possible unique +combinations when being reconstructed, consider it too computationally +expensive to check them all. +Instead, try at most this many randomly selected +combinations each time the block is accessed. +This allows all segment copies to participate fairly +in the reconstruction when all combinations +cannot be checked and prevents repeated use of one bad copy. +. +.It Sy zfs_recover Ns = Ns Sy 0 Ns | Ns 1 Pq int +Set to attempt to recover from fatal errors. +This should only be used as a last resort, +as it typically results in leaked space, or worse. +. +.It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int +Ignore hard IO errors during device removal. +When set, if a device encounters a hard IO error during the removal process +the removal will not be cancelled. +This can result in a normally recoverable block becoming permanently damaged +and is hence not recommended. +This should only be used as a last resort when the +pool cannot be returned to a healthy state prior to removing the device. +. +.It Sy zfs_removal_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq int +This is used by the test suite so that it can ensure that certain actions +happen while in the middle of a removal. +. +.It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +The largest contiguous segment that we will attempt to allocate when removing +a device. +If there is a performance problem with attempting to allocate large blocks, +consider decreasing this. +The default value is also the maximum. +. 
+.It Sy zfs_resilver_disable_defer Ns = Ns Sy 0 Ns | Ns 1 Pq int +Ignore the +.Sy resilver_defer +feature, causing an operation that would start a resilver to +immediately restart the one in progress. +. +.It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3s Pc Pq int +Resilvers are processed by the sync thread. +While resilvering, it will spend at least this much time +working on a resilver between TXG flushes. +. +.It Sy zfs_scan_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int +If set, remove the DTL (dirty time list) upon completion of a pool scan (scrub), +even if there were unrepairable errors. +Intended to be used during pool repair or recovery to +stop resilvering when the pool is next imported. +. +.It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq int +Scrubs are processed by the sync thread. +While scrubbing, it will spend at least this much time +working on a scrub between TXG flushes. +. +.It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2h Pc Pq int +To preserve progress across reboots, the sequential scan algorithm periodically +needs to stop metadata scanning and issue all the verification I/O to disk. +The frequency of this flushing is determined by this tunable. +. +.It Sy zfs_scan_fill_weight Ns = Ns Sy 3 Pq int +This tunable affects how scrub and resilver I/O segments are ordered. +A higher number indicates that we care more about how filled in a segment is, +while a lower number indicates we care more about the size of the extent without +considering the gaps within a segment. +This value is only tunable upon module insertion. +Changing the value afterwards will have no effect on scrub or resilver performance. +. 
+.It Sy zfs_scan_issue_strategy Ns = Ns Sy 0 Pq int +Determines the order that data will be verified while scrubbing or resilvering: +.Bl -tag -compact -offset 4n -width "a" +.It Sy 1 +Data will be verified as sequentially as possible, given the +amount of memory reserved for scrubbing +.Pq see Sy zfs_scan_mem_lim_fact . +This may improve scrub performance if the pool's data is very fragmented. +.It Sy 2 +The largest mostly-contiguous chunk of found data will be verified first. +By deferring scrubbing of small segments, we may later find adjacent data +to coalesce and increase the segment size. +.It Sy 0 +.No Use strategy Sy 1 No during normal verification +.No and strategy Sy 2 No while taking a checkpoint. +.El +. +.It Sy zfs_scan_legacy Ns = Ns Sy 0 Ns | Ns 1 Pq int +If unset, indicates that scrubs and resilvers will gather metadata in +memory before issuing sequential I/O. +Otherwise indicates that the legacy algorithm will be used, +where I/O is initiated as soon as it is discovered. +Unsetting will not affect scrubs or resilvers that are already in progress. +. +.It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2MB Pc Pq int +Sets the largest gap in bytes between scrub/resilver I/O operations +that will still be considered sequential for sorting purposes. +Changing this value will not +affect scrubs or resilvers that are already in progress. +. +.It Sy zfs_scan_mem_lim_fact Ns = Ns Sy 20 Ns ^-1 Pq int +Maximum fraction of RAM used for I/O sorting by sequential scan algorithm. +This tunable determines the hard limit for I/O sorting memory usage. +When the hard limit is reached we stop scanning metadata and start issuing +data verification I/O. +This is done until we get below the soft limit. +. +.It Sy zfs_scan_mem_lim_soft_fact Ns = Ns Sy 20 Ns ^-1 Pq int +The fraction of the hard limit used to determine the soft limit for I/O sorting +by the sequential scan algorithm. +When we cross this limit from below no action is taken. 
+When we cross this limit from above it is because we are issuing verification I/O. +In this case (unless the metadata scan is done) we stop issuing verification I/O +and start scanning metadata again until we get to the hard limit. +. +.It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int +Enforce tight memory limits on pool scans when a sequential scan is in progress. +When disabled, the memory limit may be exceeded by fast disks. +. +.It Sy zfs_scan_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq int +Freezes a scrub/resilver in progress without actually pausing it. +Intended for testing/debugging. +. +.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4MB Pc Pq int +Maximum amount of data that can be concurrently issued at once for scrubs and +resilvers per leaf device, given in bytes. +. +.It Sy zfs_send_corrupt_data Ns = Ns Sy 0 Ns | Ns 1 Pq int +Allow sending of corrupt data (ignore read/checksum errors when sending). +. +.It Sy zfs_send_unmodified_spill_blocks Ns = Ns Sy 1 Ns | Ns 0 Pq int +Include unmodified spill blocks in the send stream. +Under certain circumstances, previous versions of ZFS could incorrectly +remove the spill block from an existing object. +Including unmodified copies of the spill blocks creates a backwards-compatible +stream which will recreate a spill block if it was incorrectly removed. +. +.It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^-1 Pq int +The fill fraction of the +.Nm zfs Cm send +internal queues. +The fill fraction controls the timing with which internal threads are woken up. +. +.It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +The maximum number of bytes allowed in +.Nm zfs Cm send Ns 's +internal queues. +. +.It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^-1 Pq int +The fill fraction of the +.Nm zfs Cm send +prefetch queue. +The fill fraction controls the timing with which internal threads are woken up. +. 
+.It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +The maximum number of bytes allowed that will be prefetched by +.Nm zfs Cm send . +This value must be at least twice the maximum block size in use. +. +.It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^-1 Pq int +The fill fraction of the +.Nm zfs Cm receive +queue. +The fill fraction controls the timing with which internal threads are woken up. +. +.It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +The maximum number of bytes allowed in the +.Nm zfs Cm receive +queue. +This value must be at least twice the maximum block size in use. +. +.It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +The maximum amount of data, in bytes, that +.Nm zfs Cm receive +will write in one DMU transaction. +This is the uncompressed size, even when receiving a compressed send stream. +This setting will not reduce the write size below a single block. +Capped at a maximum of +.Sy 32MB . +. +.It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq ulong +Setting this variable overrides the default logic for estimating block +sizes when doing a +.Nm zfs Cm send . +The default heuristic is that the average block size +will be the current recordsize. +Override this value if most data in your dataset is not of that size +and you require accurate zfs send size estimates. +. +.It Sy zfs_sync_pass_deferred_free Ns = Ns Sy 2 Pq int +Flushing of data to disk is done in passes. +Defer frees starting in this pass. +. +.It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +Maximum memory used for prefetching a checkpoint's space map on each +vdev while discarding the checkpoint. +. +.It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq int +Only allow small data blocks to be allocated on the special and dedup vdev +types when the available free space percentage on these vdevs exceeds this value. 
+This ensures reserved space is available for pool metadata as the +special vdevs approach capacity. +. +.It Sy zfs_sync_pass_dont_compress Ns = Ns Sy 8 Pq int +Starting in this sync pass, disable compression (including of metadata). +With the default setting, in practice, we don't have this many sync passes, +so this has no effect. +.Pp +The original intent was that disabling compression would help the sync passes +to converge. +However, in practice, disabling compression increases +the average number of sync passes; because when we turn compression off, +many blocks' size will change, and thus we have to re-allocate +(not overwrite) them. +It also increases the number of +.Em 128kB +allocations (e.g. for indirect blocks and spacemaps) +because these will not be compressed. +The +.Em 128kB +allocations are especially detrimental to performance +on highly fragmented systems, which may have very few free segments of this size, +and may need to load new metaslabs to satisfy these allocations. +. +.It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq int +Rewrite new block pointers starting in this pass. +. +.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int +This controls the number of threads used by +.Sy dp_sync_taskq . +The default value of +.Sy 75% +will create a maximum of one thread per CPU. +. +.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128MB Pc Pq uint +Maximum size of TRIM command. +Larger ranges will be split into chunks no larger than this value before issuing. +. +.It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq uint +Minimum size of TRIM commands. +TRIM ranges smaller than this will be skipped, +unless they're part of a larger range which was chunked. +This is done because it's common for these small TRIMs +to negatively impact overall performance. +. +.It Sy zfs_trim_metaslab_skip Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Skip uninitialized metaslabs during the TRIM process. 
+This option is useful for pools constructed from large thinly-provisioned devices +where TRIM operations are slow. +As a pool ages, an increasing fraction of the pool's metaslabs +will be initialized, progressively degrading the usefulness of this option. +This setting is stored when starting a manual TRIM and will +persist for the duration of the requested TRIM. +. +.It Sy zfs_trim_queue_limit Ns = Ns Sy 10 Pq uint +Maximum number of queued TRIMs outstanding per leaf vdev. +The number of concurrent TRIM commands issued to the device is controlled by +.Sy zfs_vdev_trim_min_active No and Sy zfs_vdev_trim_max_active . +. +.It Sy zfs_trim_txg_batch Ns = Ns Sy 32 Pq uint +The number of transaction groups' worth of frees which should be aggregated +before TRIM operations are issued to the device. +This setting represents a trade-off between issuing larger, +more efficient TRIM operations and the delay +before the recently trimmed space is available for use by the device. +.Pp +Increasing this value will allow frees to be aggregated for a longer time. +This will result in larger TRIM operations and potentially increased memory usage. +Decreasing this value will have the opposite effect. +The default of +.Sy 32 +was determined to be a reasonable compromise. +. +.It Sy zfs_txg_history Ns = Ns Sy 0 Pq int +Historical statistics for this many latest TXGs will be available in +.Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /TXGs . +. +.It Sy zfs_txg_timeout Ns = Ns Sy 5 Ns s Pq int +Flush dirty data to disk at least every this many seconds (maximum TXG duration). +. +.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq int +Allow TRIM I/Os to be aggregated. +This is normally not helpful because the extents to be trimmed +will have already been aggregated by the metaslab. +This option is provided for debugging and performance analysis. +. +.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +Max vdev I/O aggregation size. +. 
+.It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq int +Max vdev I/O aggregation size for non-rotating media. +. +.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64kB Pc Pq int +Shift size to inflate reads to. +. +.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16kB Pc Pq int +Inflate reads smaller than this value to meet the +.Sy zfs_vdev_cache_bshift +size +.Pq default Sy 64kB . +. +.It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq int +Total size of the per-disk cache in bytes. +.Pp +Currently this feature is disabled, as it has been found to not be helpful +for performance and in some cases harmful. +. +.It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member when an I/O operation +immediately follows its predecessor on rotational vdevs +for the purpose of making decisions based on load. +. +.It Sy zfs_vdev_mirror_rotating_seek_inc Ns = Ns Sy 5 Pq int +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member when an I/O operation +lacks locality as defined by +.Sy zfs_vdev_mirror_rotating_seek_offset . +Operations within this that are not immediately following the previous operation +are incremented by half. +. +.It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +The maximum distance for the last queued I/O operation in which +the balancing algorithm considers an operation to have locality. +.No See Sx ZFS I/O SCHEDULER . +. +.It Sy zfs_vdev_mirror_non_rotating_inc Ns = Ns Sy 0 Pq int +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member on non-rotational vdevs +when I/O operations do not immediately follow one another. +. 
+.It Sy zfs_vdev_mirror_non_rotating_seek_inc Ns = Ns Sy 1 Pq int +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member when an I/O operation lacks +locality as defined by the +.Sy zfs_vdev_mirror_rotating_seek_offset . +Operations within this that are not immediately following the previous operation +are incremented by half. +. +.It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq int +Aggregate read I/O operations if the on-disk gap between them is within this +threshold. +. +.It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4kB Pc Pq int +Aggregate write I/O operations if the on-disk gap between them is within this +threshold. +. +.It Sy zfs_vdev_raidz_impl Ns = Ns Sy fastest Pq string +Select the raidz parity implementation to use. +.Pp +Variants that don't depend on CPU-specific features +may be selected on module load, as they are supported on all systems. +The remaining options may only be set after the module is loaded, +as they are available only if the implementations are compiled in +and supported on the running system. +.Pp +Once the module is loaded, +.Pa /sys/module/zfs/parameters/zfs_vdev_raidz_impl +will show the available options, +with the currently selected one enclosed in square brackets. +.Pp +.TS +lb l l . +fastest selected by built-in benchmark +original original implementation +scalar scalar implementation +sse2 SSE2 instruction set 64-bit x86 +ssse3 SSSE3 instruction set 64-bit x86 +avx2 AVX2 instruction set 64-bit x86 +avx512f AVX512F instruction set 64-bit x86 +avx512bw AVX512F & AVX512BW instruction sets 64-bit x86 +aarch64_neon NEON Aarch64/64-bit ARMv8 +aarch64_neonx2 NEON with more unrolling Aarch64/64-bit ARMv8 +powerpc_altivec Altivec PowerPC +.TE +. +.It Sy zfs_vdev_scheduler Pq charp +.Sy DEPRECATED . +Prints warning to kernel log for compatibility. +. +.It Sy zfs_zevent_len_max Ns = Ns Sy 512 Pq int +Max event queue length. 
+Events in the queue can be viewed with +.Xr zpool-events 8 . +. +.It Sy zfs_zevent_retain_max Ns = Ns Sy 2000 Pq int +Maximum recent zevent records to retain for duplicate checking. +Setting this to +.Sy 0 +disables duplicate detection. +. +.It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15min Pc Pq int +Lifespan for a recent ereport that was retained for duplicate checking. +. +.It Sy zfs_zil_clean_taskq_maxalloc Ns = Ns Sy 1048576 Pq int +The maximum number of taskq entries that are allowed to be cached. +When this limit is exceeded transaction records (itxs) +will be cleaned synchronously. +. +.It Sy zfs_zil_clean_taskq_minalloc Ns = Ns Sy 1024 Pq int +The number of taskq entries that are pre-populated when the taskq is first +created and are immediately available for use. +. +.It Sy zfs_zil_clean_taskq_nthr_pct Ns = Ns Sy 100 Ns % Pq int +This controls the number of threads used by +.Sy dp_zil_clean_taskq . +The default value of +.Sy 100% +will create a maximum of one thread per cpu. +. +.It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq int +This sets the maximum block size used by the ZIL. +On very fragmented pools, lowering this +.Pq typically to Sy 36kB +can improve performance. +. +.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disable the cache flush commands that are normally sent to disk by +the ZIL after an LWB write has completed. +Setting this will cause ZIL corruption on power loss +if a volatile out-of-order write cache is enabled. +. +.It Sy zil_replay_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int +Disable intent logging replay. +Can be disabled for recovery from corrupted ZIL. +. +.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768kB Pc Pq ulong +Limit SLOG write size per commit executed with synchronous priority. +Any writes above that will be executed with lower (asynchronous) priority +to limit potential SLOG device abuse by single active ZIL writer. +. 
+.It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq int +Usually, one metaslab from each normal-class vdev is dedicated for use by +the ZIL to log synchronous writes. +However, if there are fewer than +.Sy zfs_embedded_slog_min_ms +metaslabs in the vdev, this functionality is disabled. +This ensures that we don't set aside an unreasonable amount of space for the ZIL. +. +.It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int +If non-zero, the zio deadman will produce debugging messages +.Pq see Sy zfs_dbgmsg_enable +for all zios, rather than only for leaf zios possessing a vdev. +This is meant to be used by developers to gain +diagnostic information for hang conditions which don't involve a mutex +or other locking primitive: typically conditions in which a thread in +the zio pipeline is looping indefinitely. +. +.It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30s Pc Pq int +When an I/O operation takes more than this much time to complete, +it's marked as slow. +Each slow operation causes a delay zevent. +Slow I/O counters can be seen with +.Nm zpool Cm status Fl s . +. +.It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Throttle block allocations in the I/O pipeline. +This allows for dynamic allocation distribution when devices are imbalanced. +When enabled, the maximum number of pending allocations per top-level vdev +is limited by +.Sy zfs_vdev_queue_depth_pct . +. +.It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int +Prioritize requeued I/O. +. +.It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint +Percentage of online CPUs which will run a worker thread for I/O. +These workers are responsible for I/O work such as compression and +checksum calculations. +Fractional number of CPUs will be rounded down. +.Pp +The default value of +.Sy 80% +was chosen to avoid using all CPUs which can result in +latency issues and inconsistent application performance, +especially when slower compression and/or checksumming is enabled. +. 
+.It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint +Number of worker threads per taskq. +Lower values improve I/O ordering and CPU utilization, +while higher reduces lock contention. +.Pp +If +.Sy 0 , +generate a system-dependent value close to 6 threads per taskq. +. +.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Do not create zvol device nodes. +This may slightly improve startup time on +systems with a very large number of zvols. +. +.It Sy zvol_major Ns = Ns Sy 230 Pq uint +Major number for zvol block devices. +. +.It Sy zvol_max_discard_blocks Ns = Ns Sy 16384 Pq ulong +Discard (TRIM) operations done on zvols will be done in batches of this +many blocks, where block size is determined by the +.Sy volblocksize +property of a zvol. +. +.It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq uint +When adding a zvol to the system, prefetch this many bytes +from the start and end of the volume. +Prefetching these regions of the volume is desirable, +because they are likely to be accessed immediately by +.Xr blkid 8 +or the kernel partitioner. +. +.It Sy zvol_request_sync Ns = Ns Sy 0 Ns | Ns 1 Pq uint +When processing I/O requests for a zvol, submit them synchronously. +This effectively limits the queue depth to +.Em 1 +for each I/O submitter. +When unset, requests are handled asynchronously by a thread pool. +The number of requests which can be handled concurrently is controlled by +.Sy zvol_threads . +. +.It Sy zvol_threads Ns = Ns Sy 32 Pq uint +Max number of threads which can handle zvol I/O requests concurrently. +. +.It Sy zvol_volmode Ns = Ns Sy 1 Pq uint +Defines zvol block devices behaviour when +.Sy volmode Ns = Ns Sy default : +.Bl -tag -compact -offset 4n -width "a" +.It Sy 1 +.No equivalent to Sy full +.It Sy 2 +.No equivalent to Sy dev +.It Sy 3 +.No equivalent to Sy none +.El +.El +. +.Sh ZFS I/O SCHEDULER +ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations. 
+The scheduler determines when and in what order those operations are issued. +The scheduler divides operations into five I/O classes, +prioritized in the following order: sync read, sync write, async read, +async write, and scrub/resilver. +Each queue defines the minimum and maximum number of concurrent operations +that may be issued to the device. +In addition, the device has an aggregate maximum, +.Sy zfs_vdev_max_active . +Note that the sum of the per-queue minima must not exceed the aggregate maximum. +If the sum of the per-queue maxima exceeds the aggregate maximum, +then the number of active operations may reach +.Sy zfs_vdev_max_active , +in which case no further operations will be issued, +regardless of whether all per-queue minima have been met. +.Pp +For many physical devices, throughput increases with the number of +concurrent operations, but latency typically suffers. +Furthermore, physical devices typically have a limit +at which more concurrent operations have no +effect on throughput or can actually cause it to decrease. +.Pp +The scheduler selects the next operation to issue by first looking for an +I/O class whose minimum has not been satisfied. +Once all are satisfied and the aggregate maximum has not been hit, +the scheduler looks for classes whose maximum has not been satisfied. +Iteration through the I/O classes is done in the order specified above. +No further operations are issued +if the aggregate maximum number of concurrent operations has been hit, +or if there are no operations queued for an I/O class that has not hit its maximum. +Every time an I/O operation is queued or an operation completes, +the scheduler looks for new operations to issue. +.Pp +In general, smaller +.Sy max_active Ns s +will lead to lower latency of synchronous operations. +Larger +.Sy max_active Ns s +may lead to higher overall throughput, depending on underlying storage. 
+.Pp +The ratio of the queues' +.Sy max_active Ns s +determines the balance of performance between reads, writes, and scrubs. +For example, increasing +.Sy zfs_vdev_scrub_max_active +will cause the scrub or resilver to complete more quickly, +but reads and writes to have higher latency and lower throughput. +.Pp +All I/O classes have a fixed maximum number of outstanding operations, +except for the async write class. +Asynchronous writes represent the data that is committed to stable storage +during the syncing stage for transaction groups. +Transaction groups enter the syncing state periodically, +so the number of queued async writes will quickly burst up +and then bleed down to zero. +Rather than servicing them as quickly as possible, +the I/O scheduler changes the maximum number of active async write operations +according to the amount of dirty data in the pool. +Since both throughput and latency typically increase with the number of +concurrent operations issued to physical devices, reducing the +burstiness in the number of concurrent operations also stabilizes the +response time of operations from other – and in particular synchronous – queues. +In broad strokes, the I/O scheduler will issue more concurrent operations +from the async write queue as there's more dirty data in the pool. +. 
+.Ss Async Writes +The number of concurrent operations issued for the async write I/O class +follows a piece-wise linear function defined by a few adjustable points: +.Bd -literal + | o---------| <-- \fBzfs_vdev_async_write_max_active\fP + ^ | /^ | + | | / | | +active | / | | + I/O | / | | +count | / | | + | / | | + |-------o | | <-- \fBzfs_vdev_async_write_min_active\fP + 0|_______^______|_________| + 0% | | 100% of \fBzfs_dirty_data_max\fP + | | + | `-- \fBzfs_vdev_async_write_active_max_dirty_percent\fP + `--------- \fBzfs_vdev_async_write_active_min_dirty_percent\fP +.Ed +.Pp +Until the amount of dirty data exceeds a minimum percentage of the dirty +data allowed in the pool, the I/O scheduler will limit the number of +concurrent operations to the minimum. +As that threshold is crossed, the number of concurrent operations issued +increases linearly to the maximum at the specified maximum percentage +of the dirty data allowed in the pool. +.Pp +Ideally, the amount of dirty data on a busy pool will stay in the sloped +part of the function between +.Sy zfs_vdev_async_write_active_min_dirty_percent +and +.Sy zfs_vdev_async_write_active_max_dirty_percent . +If it exceeds the maximum percentage, +this indicates that the rate of incoming data is +greater than the rate that the backend storage can handle. +In this case, we must further throttle incoming writes, +as described in the next section. +. +.Sh ZFS TRANSACTION DELAY +We delay transactions when we've determined that the backend storage +isn't able to accommodate the rate of incoming writes. +.Pp +If there is already a transaction waiting, we delay relative to when +that transaction will finish waiting. +This way the calculated delay time +is independent of the number of threads concurrently executing transactions. +.Pp +If we are the only waiter, wait relative to when the transaction started, +rather than the current time. +This credits the transaction for "time already served", +e.g. reading indirect blocks. 
+.Pp +The minimum time for a transaction to take is calculated as +.Dl min_time = min( Ns Sy zfs_delay_scale No * (dirty - min) / (max - dirty), 100ms) +.Pp +The delay has two degrees of freedom that can be adjusted via tunables. +The percentage of dirty data at which we start to delay is defined by +.Sy zfs_delay_min_dirty_percent . +This should typically be at or above +.Sy zfs_vdev_async_write_active_max_dirty_percent , +so that we only start to delay after writing at full speed +has failed to keep up with the incoming write rate. +The scale of the curve is defined by +.Sy zfs_delay_scale . +Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. +.Bd -literal +delay + 10ms +-------------------------------------------------------------*+ + | *| + 9ms + *+ + | *| + 8ms + *+ + | * | + 7ms + * + + | * | + 6ms + * + + | * | + 5ms + * + + | * | + 4ms + * + + | * | + 3ms + * + + | * | + 2ms + (midpoint) * + + | | ** | + 1ms + v *** + + | \fBzfs_delay_scale\fP ----------> ******** | + 0 +-------------------------------------*********----------------+ + 0% <- \fBzfs_dirty_data_max\fP -> 100% +.Ed +.Pp +Note, that since the delay is added to the outstanding time remaining on the +most recent transaction it's effectively the inverse of IOPS. +Here, the midpoint of +.Em 500us +translates to +.Em 2000 IOPS . +The shape of the curve +was chosen such that small changes in the amount of accumulated dirty data +in the first three quarters of the curve yield relatively small differences +in the amount of delay. 
+.Pp +The effects can be easier to understand when the amount of delay is +represented on a logarithmic scale: +.Bd -literal +delay +100ms +-------------------------------------------------------------++ + + + + | | + + *+ + 10ms + *+ + + ** + + | (midpoint) ** | + + | ** + + 1ms + v **** + + + \fBzfs_delay_scale\fP ----------> ***** + + | **** | + + **** + +100us + ** + + + * + + | * | + + * + + 10us + * + + + + + | | + + + + +--------------------------------------------------------------+ + 0% <- \fBzfs_dirty_data_max\fP -> 100% +.Ed +.Pp +Note here that only as the amount of dirty data approaches its limit does +the delay start to increase rapidly. +The goal of a properly tuned system should be to keep the amount of dirty data +out of that range by first ensuring that the appropriate limits are set +for the I/O scheduler to reach optimal throughput on the back-end storage, +and then by changing the value of +.Sy zfs_delay_scale +to increase the steepness of the curve. diff --git a/man/man5/Makefile.am b/man/man5/Makefile.am deleted file mode 100644 index 1c0683ee30..0000000000 --- a/man/man5/Makefile.am +++ /dev/null @@ -1,9 +0,0 @@ -dist_man_MANS = \ - vdev_id.conf.5 \ - zpool-features.5 \ - spl-module-parameters.5 \ - zfs-module-parameters.5 \ - zfs-events.5 - -install-data-local: - $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man5" diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 deleted file mode 100644 index 30d9fc7544..0000000000 --- a/man/man5/spl-module-parameters.5 +++ /dev/null @@ -1,357 +0,0 @@ -'\" te -.\" -.\" Copyright 2013 Turbo Fredriksson . All rights reserved. -.\" -.TH SPL-MODULE-PARAMETERS 5 "Oct 28, 2017" -.SH NAME -spl\-module\-parameters \- SPL module parameters -.SH DESCRIPTION -.sp -.LP -Description of the different parameters to the SPL module. 
- -.SS "Module parameters" -.sp -.LP - -.sp -.ne 2 -.na -\fBspl_kmem_cache_expire\fR (uint) -.ad -.RS 12n -Cache expiration is part of default Illumos cache behavior. The idea is -that objects in magazines which have not been recently accessed should be -returned to the slabs periodically. This is known as cache aging and -when enabled objects will be typically returned after 15 seconds. -.sp -On the other hand Linux slabs are designed to never move objects back to -the slabs unless there is memory pressure. This is possible because under -Linux the cache will be notified when memory is low and objects can be -released. -.sp -By default only the Linux method is enabled. It has been shown to improve -responsiveness on low memory systems and not negatively impact the performance -of systems with more memory. This policy may be changed by setting the -\fBspl_kmem_cache_expire\fR bit mask as follows, both policies may be enabled -concurrently. -.sp -0x01 - Aging (Illumos), 0x02 - Low memory (Linux) -.sp -Default value: \fB0x02\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_kmem_threads\fR (uint) -.ad -.RS 12n -The number of threads created for the spl_kmem_cache task queue. This task -queue is responsible for allocating new slabs for use by the kmem caches. -For the majority of systems and workloads only a small number of threads are -required. -.sp -Default value: \fB4\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_reclaim\fR (uint) -.ad -.RS 12n -When this is set it prevents Linux from being able to rapidly reclaim all the -memory held by the kmem caches. This may be useful in circumstances where -it's preferable that Linux reclaim memory from some other subsystem first. -Setting this will increase the likelihood out of memory events on a memory -constrained system. -.sp -Default value: \fB0\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_obj_per_slab\fR (uint) -.ad -.RS 12n -The preferred number of objects per slab in the cache. 
In general, a larger -value will increase the caches memory footprint while decreasing the time -required to perform an allocation. Conversely, a smaller value will minimize -the footprint and improve cache reclaim time but individual allocations may -take longer. -.sp -Default value: \fB8\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_obj_per_slab_min\fR (uint) -.ad -.RS 12n -The minimum number of objects allowed per slab. Normally slabs will contain -\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very -large objects it's desirable to only have a few, or even just one, object per -slab. -.sp -Default value: \fB1\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_max_size\fR (uint) -.ad -.RS 12n -The maximum size of a kmem cache slab in MiB. This effectively limits -the maximum cache object size to \fBspl_kmem_cache_max_size\fR / -\fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with -object sized larger than this limit. -.sp -Default value: \fB32 (64-bit) or 4 (32-bit)\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_slab_limit\fR (uint) -.ad -.RS 12n -For small objects the Linux slab allocator should be used to make the most -efficient use of the memory. However, large objects are not supported by -the Linux slab and therefore the SPL implementation is preferred. This -value is used to determine the cutoff between a small and large object. -.sp -Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated -using the Linux slab allocator, large objects use the SPL allocator. A -cutoff of 16K was determined to be optimal for architectures using 4K pages. -.sp -Default value: \fB16,384\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_kmem_limit\fR (uint) -.ad -.RS 12n -Depending on the size of a cache object it may be backed by kmalloc()'d -or vmalloc()'d memory. This is because the size of the required allocation -greatly impacts the best way to allocate the memory. 
-.sp -When objects are small and only a small number of memory pages need to be -allocated, ideally just one, then kmalloc() is very efficient. However, -when allocating multiple pages with kmalloc() it gets increasingly expensive -because the pages must be physically contiguous. -.sp -For this reason we shift to vmalloc() for slabs of large objects which -which removes the need for contiguous pages. We cannot use vmalloc() in -all cases because there is significant locking overhead involved. This -function takes a single global lock over the entire virtual address range -which serializes all allocations. Using slightly different allocation -functions for small and large objects allows us to handle a wide range of -object sizes. -.sp -The \fBspl_kmem_cache_kmem_limit\fR value is used to determine this cutoff -size. One quarter the PAGE_SIZE is used as the default value because -\fBspl_kmem_cache_obj_per_slab\fR defaults to 16. This means that at -most we will need to allocate four contiguous pages. -.sp -Default value: \fBPAGE_SIZE/4\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_alloc_warn\fR (uint) -.ad -.RS 12n -As a general rule kmem_alloc() allocations should be small, preferably -just a few pages since they must by physically contiguous. Therefore, a -rate limited warning will be printed to the console for any kmem_alloc() -which exceeds a reasonable threshold. -.sp -The default warning threshold is set to eight pages but capped at 32K to -accommodate systems using large pages. This value was selected to be small -enough to ensure the largest allocations are quickly noticed and fixed. -But large enough to avoid logging any warnings when a allocation size is -larger than optimal but not a serious concern. Since this value is tunable, -developers are encouraged to set it lower when testing so any new largish -allocations are quickly caught. These warnings may be disabled by setting -the threshold to zero. 
-.sp -Default value: \fB32,768\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_alloc_max\fR (uint) -.ad -.RS 12n -Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. -Allocations which are marginally smaller than this limit may succeed but -should still be avoided due to the expense of locating a contiguous range -of free pages. Therefore, a maximum kmem size with reasonable safely -margin of 4x is set. Kmem_alloc() allocations larger than this maximum -will quickly fail. Vmem_alloc() allocations less than or equal to this -value will use kmalloc(), but shift to vmalloc() when exceeding this value. -.sp -Default value: \fBKMALLOC_MAX_SIZE/4\fR -.RE - -.sp -.ne 2 -.na -\fBspl_kmem_cache_magazine_size\fR (uint) -.ad -.RS 12n -Cache magazines are an optimization designed to minimize the cost of -allocating memory. They do this by keeping a per-cpu cache of recently -freed objects, which can then be reallocated without taking a lock. This -can improve performance on highly contended caches. However, because -objects in magazines will prevent otherwise empty slabs from being -immediately released this may not be ideal for low memory machines. -.sp -For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a -maximum magazine size. When this value is set to 0 the magazine size will -be automatically determined based on the object size. Otherwise magazines -will be limited to 2-256 objects per magazine (i.e per cpu). Magazines -may never be entirely disabled in this implementation. -.sp -Default value: \fB0\fR -.RE - -.sp -.ne 2 -.na -\fBspl_hostid\fR (ulong) -.ad -.RS 12n -The system hostid, when set this can be used to uniquely identify a system. -By default this value is set to zero which indicates the hostid is disabled. -It can be explicitly enabled by placing a unique non-zero value in -\fB/etc/hostid/\fR. 
-.sp -Default value: \fB0\fR -.RE - -.sp -.ne 2 -.na -\fBspl_hostid_path\fR (charp) -.ad -.RS 12n -The expected path to locate the system hostid when specified. This value -may be overridden for non-standard configurations. -.sp -Default value: \fB/etc/hostid\fR -.RE - -.sp -.ne 2 -.na -\fBspl_panic_halt\fR (uint) -.ad -.RS 12n -Cause a kernel panic on assertion failures. When not enabled, the thread is -halted to facilitate further debugging. -.sp -Set to a non-zero value to enable. -.sp -Default value: \fB0\fR -.RE - -.sp -.ne 2 -.na -\fBspl_taskq_kick\fR (uint) -.ad -.RS 12n -Kick stuck taskq to spawn threads. When writing a non-zero value to it, it will -scan all the taskqs. If any of them have a pending task more than 5 seconds old, -it will kick it to spawn more threads. This can be used if you find a rare -deadlock occurs because one or more taskqs didn't spawn a thread when it should. -.sp -Default value: \fB0\fR -.RE - -.sp -.ne 2 -.na -\fBspl_taskq_thread_bind\fR (int) -.ad -.RS 12n -Bind taskq threads to specific CPUs. When enabled all taskq threads will -be distributed evenly over the available CPUs. By default, this behavior -is disabled to allow the Linux scheduler the maximum flexibility to determine -where a thread should run. -.sp -Default value: \fB0\fR -.RE - -.sp -.ne 2 -.na -\fBspl_taskq_thread_dynamic\fR (int) -.ad -.RS 12n -Allow dynamic taskqs. When enabled taskqs which set the TASKQ_DYNAMIC flag -will by default create only a single thread. New threads will be created on -demand up to a maximum allowed number to facilitate the completion of -outstanding tasks. Threads which are no longer needed will be promptly -destroyed. By default this behavior is enabled but it can be disabled to -aid performance analysis or troubleshooting. -.sp -Default value: \fB1\fR -.RE - -.sp -.ne 2 -.na -\fBspl_taskq_thread_priority\fR (int) -.ad -.RS 12n -Allow newly created taskq threads to set a non-default scheduler priority. 
-When enabled the priority specified when a taskq is created will be applied -to all threads created by that taskq. When disabled all threads will use -the default Linux kernel thread priority. By default, this behavior is -enabled. -.sp -Default value: \fB1\fR -.RE - -.sp -.ne 2 -.na -\fBspl_taskq_thread_sequential\fR (int) -.ad -.RS 12n -The number of items a taskq worker thread must handle without interruption -before requesting a new worker thread be spawned. This is used to control -how quickly taskqs ramp up the number of threads processing the queue. -Because Linux thread creation and destruction are relatively inexpensive a -small default value has been selected. This means that normally threads will -be created aggressively which is desirable. Increasing this value will -result in a slower thread creation rate which may be preferable for some -configurations. -.sp -Default value: \fB4\fR -.RE - -.sp -.ne 2 -.na -\fBspl_max_show_tasks\fR (uint) -.ad -.RS 12n -The maximum number of tasks per pending list in each taskq shown in -/proc/spl/{taskq,taskq-all}. Write 0 to turn off the limit. The proc file will -walk the lists with lock held, reading it could cause a lock up if the list -grow too large without limiting the output. "(truncated)" will be shown if the -list is larger than the limit. -.sp -Default value: \fB512\fR -.RE diff --git a/man/man5/vdev_id.conf.5 b/man/man5/vdev_id.conf.5 index 5b7fbf0cad..a2d38add4e 100644 --- a/man/man5/vdev_id.conf.5 +++ b/man/man5/vdev_id.conf.5 @@ -1,222 +1,249 @@ -.TH vdev_id.conf 5 -.SH NAME -vdev_id.conf \- Configuration file for vdev_id -.SH DESCRIPTION -.I vdev_id.conf +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. 
A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.Dd May 26, 2021 +.Dt VDEV_ID.CONF 5 +.Os +. +.Sh NAME +.Nm vdev_id.conf +.Nd configuration file for vdev_id(8) +.Sh DESCRIPTION +.Nm is the configuration file for -.BR vdev_id (8). +.Xr vdev_id 8 . It controls the default behavior of -.BR vdev_id (8) +.Xr vdev_id 8 while it is mapping a disk device name to an alias. -.PP +.Pp The -.I vdev_id.conf +.Nm file uses a simple format consisting of a keyword followed by one or -more values on a single line. Any line not beginning with a recognized -keyword is ignored. Comments may optionally begin with a hash -character. - +more values on a single line. +Any line not beginning with a recognized keyword is ignored. +Comments may optionally begin with a hash character. +.Pp The following keywords and values are used. -.TP -\fIalias\fR -Maps a device link in the /dev directory hierarchy to a new device -name. The udev rule defining the device link must have run prior to -.BR vdev_id (8). +.Bl -tag -width "-h" +.It Sy alias Ar name Ar devlink +Maps a device link in the +.Pa /dev +directory hierarchy to a new device name. +The udev rule defining the device link must have run prior to +.Xr vdev_id 8 . A defined alias takes precedence over a topology-derived name, but the -two naming methods can otherwise coexist. For example, one might name -drives in a JBOD with the sas_direct topology while naming an internal -L2ARC device with an alias. - -\fIname\fR - the name of the link to the device that will by created in -/dev/disk/by-vdev. - -\fIdevlink\fR - the name of the device link that has already been -defined by udev. This may be an absolute path or the base filename. - -.TP -\fIchannel\fR [pci_slot] +two naming methods can otherwise coexist. +For example, one might name drives in a JBOD with the +.Sy sas_direct +topology while naming an internal L2ARC device with an alias. 
+.Pp +.Ar name +is the name of the link to the device that will be created under +.Pa /dev/disk/by-vdev . +.Pp +.Ar devlink +is the name of the device link that has already been +defined by udev. +This may be an absolute path or the base filename. +. +.It Sy channel [ Ns Ar pci_slot ] Ar port Ar name Maps a physical path to a channel name (typically representing a single disk enclosure). - -.TP -\fIenclosure_symlinks\fR -Additionally create /dev/by-enclosure symlinks to the disk enclosure -sg devices using the naming scheme from from vdev_id.conf. -\fIenclosure_symlinks\fR is only allowed for sas_direct mode. -.TP -\fIenclosure_symlinks_prefix\fR -Specify the prefix for the enclosure symlinks in the form of: - -/dev/by-enclosure/- - -Defaults to "enc" if not specified. -.TP -\fIpci_slot\fR - specifies the PCI SLOT of the HBA -hosting the disk enclosure being mapped, as found in the output of -.BR lspci (8). -This argument is not used in sas_switch mode. - -\fIport\fR - specifies the numeric identifier of the HBA or SAS switch port -connected to the disk enclosure being mapped. - -\fIname\fR - specifies the name of the channel. - -.TP -\fIslot\fR [channel] +. +.It Sy slot Ar prefix Ar new Op Ar channel Maps a disk slot number as reported by the operating system to an -alternative slot number. If the \fIchannel\fR parameter is specified +alternative slot number.
+If the +.Ar channel +parameter is specified then the mapping is only applied to slots in the named channel, -otherwise the mapping is applied to all channels. The first-specified -\fIslot\fR rule that can match a slot takes precedence. Therefore a -channel-specific mapping for a given slot should generally appear before -a generic mapping for the same slot. In this way a custom mapping may -be applied to a particular channel and a default mapping applied to the -others. - -.TP -\fImultipath\fR +otherwise the mapping is applied to all channels. +The first-specified +.Ar slot +rule that can match a slot takes precedence. +Therefore a channel-specific mapping for a given slot should generally appear +before a generic mapping for the same slot. +In this way a custom mapping may be applied to a particular channel +and a default mapping applied to the others. +. +.It Sy multipath Sy yes Ns | Ns Sy no Specifies whether -.BR vdev_id (8) -will handle only dm-multipath devices. If set to "yes" then -.BR vdev_id (8) +.Xr vdev_id 8 +will handle only dm-multipath devices. +If set to +.Sy yes +then +.Xr vdev_id 8 will examine the first running component disk of a dm-multipath -device as listed by the -.BR multipath (8) -command to determine the physical path. -.TP -\fItopology\fR +device as provided by the driver command to determine the physical path. +. +.It Sy topology Sy sas_direct Ns | Ns Sy sas_switch Ns | Ns Sy scsi Identifies a physical topology that governs how physical paths are -mapped to channels. 
- -\fIsas_direct\fR - in this mode a channel is uniquely identified by -a PCI slot and a HBA port number - -\fIsas_switch\fR - in this mode a channel is uniquely identified by -a SAS switch port number - -.TP -\fIphys_per_port\fR +mapped to channels: +.Bl -tag -compact -width "sas_direct and scsi" +.It Sy sas_direct No and Sy scsi +channels are uniquely identified by a PCI slot and HBA port number +.It Sy sas_switch +channels are uniquely identified by a SAS switch port number +.El +. +.It Sy phys_per_port Ar num Specifies the number of PHY devices associated with a SAS HBA port or SAS switch port. -.BR vdev_id (8) +.Xr vdev_id 8 internally uses this value to determine which HBA or switch port a -device is connected to. The default is 4. - -.TP -\fIslot\fR +device is connected to. +The default is +.Sy 4 . +. +.It Sy slot Sy bay Ns | Ns Sy phy Ns | Ns Sy port Ns | Ns Sy id Ns | Ns Sy lun Ns | Ns Sy ses Specifies from which element of a SAS identifier the slot number is -taken. The default is bay. - -\fIbay\fR - read the slot number from the bay identifier. - -\fIphy\fR - read the slot number from the phy identifier. - -\fIport\fR - use the SAS port as the slot number. - -\fIid\fR - use the scsi id as the slot number. - -\fIlun\fR - use the scsi lun as the slot number. - -\fIses\fR - use the SCSI Enclosure Services (SES) enclosure device slot number, +taken. +The default is +.Sy bay : +.Bl -tag -compact -width "port" +.It Sy bay +read the slot number from the bay identifier. +.It Sy phy +read the slot number from the phy identifier. +.It Sy port +use the SAS port as the slot number. +.It Sy id +use the scsi id as the slot number. +.It Sy lun +use the scsi lun as the slot number. +.It Sy ses +use the SCSI Enclosure Services (SES) enclosure device slot number, as reported by -.BR sg_ses (8). -This is intended for use only on systems where \fIbay\fR is unsupported, -noting that \fIport\fR and \fIid\fR may be unstable across disk replacement. 
-.SH EXAMPLES -A non-multipath configuration with direct-attached SAS enclosures and an -arbitrary slot re-mapping. -.P -.nf - multipath no - topology sas_direct - phys_per_port 4 - slot bay - - # PCI_SLOT HBA PORT CHANNEL NAME - channel 85:00.0 1 A - channel 85:00.0 0 B - channel 86:00.0 1 C - channel 86:00.0 0 D - - # Custom mapping for Channel A - - # Linux Mapped - # Slot Slot Channel - slot 1 7 A - slot 2 10 A - slot 3 3 A - slot 4 6 A - - # Default mapping for B, C, and D - - slot 1 4 - slot 2 2 - slot 3 1 - slot 4 3 -.fi -.P -A SAS-switch topology. Note that the -.I channel -keyword takes only two arguments in this example. -.P -.nf - topology sas_switch - - # SWITCH PORT CHANNEL NAME - channel 1 A - channel 2 B - channel 3 C - channel 4 D -.fi -.P -A multipath configuration. Note that channel names have multiple -definitions - one per physical path. -.P -.nf - multipath yes - - # PCI_SLOT HBA PORT CHANNEL NAME - channel 85:00.0 1 A - channel 85:00.0 0 B - channel 86:00.0 1 A - channel 86:00.0 0 B -.fi -.P -A configuration with enclosure_symlinks enabled. -.P -.nf - multipath yes - enclosure_symlinks yes - - # PCI_ID HBA PORT CHANNEL NAME - channel 05:00.0 1 U - channel 05:00.0 0 L - channel 06:00.0 1 U - channel 06:00.0 0 L -.fi -In addition to the disks symlinks, this configuration will create: -.P -.nf - /dev/by-enclosure/enc-L0 - /dev/by-enclosure/enc-L1 - /dev/by-enclosure/enc-U0 - /dev/by-enclosure/enc-U1 -.fi -.P -A configuration using device link aliases. -.P -.nf - # by-vdev - # name fully qualified or base name of device link - alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca - alias d2 wwn-0x5000c5002def789e -.fi -.P - -.SH FILES -.TP -.I /etc/zfs/vdev_id.conf +.Xr sg_ses 8 . +Intended for use only on systems where +.Sy bay +is unsupported, +noting that +.Sy port +and +.Sy id +may be unstable across disk replacement. +.El +.El +. +.Sh FILES +.Bl -tag -width "-v v" +.It Pa /etc/zfs/vdev_id.conf The configuration file for -.BR vdev_id (8). 
-.SH SEE ALSO -.BR vdev_id (8) +.Xr vdev_id 8 . +.El +. +.Sh EXAMPLES +A non-multipath configuration with direct-attached SAS enclosures and an +arbitrary slot re-mapping: +.Bd -literal -compact -offset Ds +multipath no +topology sas_direct +phys_per_port 4 +slot bay + +# PCI_SLOT HBA PORT CHANNEL NAME +channel 85:00.0 1 A +channel 85:00.0 0 B +channel 86:00.0 1 C +channel 86:00.0 0 D + +# Custom mapping for Channel A + +# Linux Mapped +# Slot Slot Channel +slot 1 7 A +slot 2 10 A +slot 3 3 A +slot 4 6 A + +# Default mapping for B, C, and D + +slot 1 4 +slot 2 2 +slot 3 1 +slot 4 3 +.Ed +.Pp +A SAS-switch topology. +Note that the +.Ar channel +keyword takes only two arguments in this example: +.Bd -literal -compact -offset Ds +topology sas_switch + +# SWITCH PORT CHANNEL NAME +channel 1 A +channel 2 B +channel 3 C +channel 4 D +.Ed +.Pp +A multipath configuration. +Note that channel names have multiple definitions - one per physical path: +.Bd -literal -compact -offset Ds +multipath yes + +# PCI_SLOT HBA PORT CHANNEL NAME +channel 85:00.0 1 A +channel 85:00.0 0 B +channel 86:00.0 1 A +channel 86:00.0 0 B +.Ed +.Pp +A configuration with enclosure_symlinks enabled: +.Bd -literal -compact -offset Ds +multipath yes +enclosure_symlinks yes + +# PCI_ID HBA PORT CHANNEL NAME +channel 05:00.0 1 U +channel 05:00.0 0 L +channel 06:00.0 1 U +channel 06:00.0 0 L +.Ed +In addition to the disk symlinks, this configuration will create: +.Bd -literal -compact -offset Ds +/dev/by-enclosure/enc-L0 +/dev/by-enclosure/enc-L1 +/dev/by-enclosure/enc-U0 +/dev/by-enclosure/enc-U1 +.Ed +.Pp +A configuration using device link aliases: +.Bd -literal -compact -offset Ds +# by-vdev +# name fully qualified or base name of device link +alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca +alias d2 wwn-0x5000c5002def789e +.Ed +. 
+.Sh SEE ALSO +.Xr vdev_id 8 diff --git a/man/man5/zfs-events.5 b/man/man5/zfs-events.5 deleted file mode 100644 index 7e9bbedafd..0000000000 --- a/man/man5/zfs-events.5 +++ /dev/null @@ -1,965 +0,0 @@ -'\" te -.\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. -.\" Portions Copyright 2018 by Richard Elling -.\" The contents of this file are subject to the terms of the Common Development -.\" and Distribution License (the "License"). You may not use this file except -.\" in compliance with the License. You can obtain a copy of the license at -.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -.\" -.\" See the License for the specific language governing permissions and -.\" limitations under the License. When distributing Covered Code, include this -.\" CDDL HEADER in each file and include the License file at -.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this -.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your -.\" own identifying information: -.\" Portions Copyright [yyyy] [name of copyright owner] -.TH ZFS-EVENTS 5 "Oct 24, 2018" -.SH NAME -zfs\-events \- Events created by the ZFS filesystem. -.SH DESCRIPTION -.sp -.LP -Description of the different events generated by the ZFS stack. -.sp -Most of these don't have any description. The events generated by ZFS -have never been publicly documented. What is here is intended as a -starting point to provide documentation for all possible events. -.sp -To view all events created since the loading of the ZFS infrastructure -(i.e, "the module"), run -.P -.nf -\fBzpool events\fR -.fi -.P -to get a short list, and -.P -.nf -\fBzpool events -v\fR -.fi -.P -to get a full detail of the events and what information -is available about it. -.sp -This man page lists the different subclasses that are issued -in the case of an event. The full event name would be -\fIereport.fs.zfs.SUBCLASS\fR, but we only list the last -part here. 
- -.SS "EVENTS (SUBCLASS)" -.sp -.LP - -.sp -.ne 2 -.na -\fBchecksum\fR -.ad -.RS 12n -Issued when a checksum error has been detected. -.RE - -.sp -.ne 2 -.na -\fBio\fR -.ad -.RS 12n -Issued when there is an I/O error in a vdev in the pool. -.RE - -.sp -.ne 2 -.na -\fBdata\fR -.ad -.RS 12n -Issued when there have been data errors in the pool. -.RE - -.sp -.ne 2 -.na -\fBdeadman\fR -.ad -.RS 12n -Issued when an I/O is determined to be "hung", this can be caused by lost -completion events due to flaky hardware or drivers. See the -\fBzfs_deadman_failmode\fR module option description for additional -information regarding "hung" I/O detection and configuration. -.RE - -.sp -.ne 2 -.na -\fBdelay\fR -.ad -.RS 12n -Issued when a completed I/O exceeds the maximum allowed time specified -by the \fBzio_slow_io_ms\fR module option. This can be an indicator of -problems with the underlying storage device. The number of delay events is -ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter. -.RE - -.sp -.ne 2 -.na -\fBconfig.sync\fR -.ad -.RS 12n -Issued every time a vdev change have been done to the pool. -.RE - -.sp -.ne 2 -.na -\fBzpool\fR -.ad -.RS 12n -Issued when a pool cannot be imported. -.RE - -.sp -.ne 2 -.na -\fBzpool.destroy\fR -.ad -.RS 12n -Issued when a pool is destroyed. -.RE - -.sp -.ne 2 -.na -\fBzpool.export\fR -.ad -.RS 12n -Issued when a pool is exported. -.RE - -.sp -.ne 2 -.na -\fBzpool.import\fR -.ad -.RS 12n -Issued when a pool is imported. -.RE - -.sp -.ne 2 -.na -\fBzpool.reguid\fR -.ad -.RS 12n -Issued when a REGUID (new unique identifier for the pool have been regenerated) have been detected. -.RE - -.sp -.ne 2 -.na -\fBvdev.unknown\fR -.ad -.RS 12n -Issued when the vdev is unknown. Such as trying to clear device -errors on a vdev that have failed/been kicked from the system/pool -and is no longer available. 
-.RE - -.sp -.ne 2 -.na -\fBvdev.open_failed\fR -.ad -.RS 12n -Issued when a vdev could not be opened (because it didn't exist for example). -.RE - -.sp -.ne 2 -.na -\fBvdev.corrupt_data\fR -.ad -.RS 12n -Issued when corrupt data have been detected on a vdev. -.RE - -.sp -.ne 2 -.na -\fBvdev.no_replicas\fR -.ad -.RS 12n -Issued when there are no more replicas to sustain the pool. -This would lead to the pool being \fIDEGRADED\fR. -.RE - -.sp -.ne 2 -.na -\fBvdev.bad_guid_sum\fR -.ad -.RS 12n -Issued when a missing device in the pool have been detected. -.RE - -.sp -.ne 2 -.na -\fBvdev.too_small\fR -.ad -.RS 12n -Issued when the system (kernel) have removed a device, and ZFS -notices that the device isn't there any more. This is usually -followed by a \fBprobe_failure\fR event. -.RE - -.sp -.ne 2 -.na -\fBvdev.bad_label\fR -.ad -.RS 12n -Issued when the label is OK but invalid. -.RE - -.sp -.ne 2 -.na -\fBvdev.bad_ashift\fR -.ad -.RS 12n -Issued when the ashift alignment requirement has increased. -.RE - -.sp -.ne 2 -.na -\fBvdev.remove\fR -.ad -.RS 12n -Issued when a vdev is detached from a mirror (or a spare detached from a -vdev where it have been used to replace a failed drive - only works if -the original drive have been readded). -.RE - -.sp -.ne 2 -.na -\fBvdev.clear\fR -.ad -.RS 12n -Issued when clearing device errors in a pool. Such as running \fBzpool clear\fR -on a device in the pool. -.RE - -.sp -.ne 2 -.na -\fBvdev.check\fR -.ad -.RS 12n -Issued when a check to see if a given vdev could be opened is started. -.RE - -.sp -.ne 2 -.na -\fBvdev.spare\fR -.ad -.RS 12n -Issued when a spare have kicked in to replace a failed device. -.RE - -.sp -.ne 2 -.na -\fBvdev.autoexpand\fR -.ad -.RS 12n -Issued when a vdev can be automatically expanded. -.RE - -.sp -.ne 2 -.na -\fBio_failure\fR -.ad -.RS 12n -Issued when there is an I/O failure in a vdev in the pool. -.RE - -.sp -.ne 2 -.na -\fBprobe_failure\fR -.ad -.RS 12n -Issued when a probe fails on a vdev. 
This would occur if a vdev -have been kicked from the system outside of ZFS (such as the kernel -have removed the device). -.RE - -.sp -.ne 2 -.na -\fBlog_replay\fR -.ad -.RS 12n -Issued when the intent log cannot be replayed. The can occur in the case -of a missing or damaged log device. -.RE - -.sp -.ne 2 -.na -\fBresilver.start\fR -.ad -.RS 12n -Issued when a resilver is started. -.RE - -.sp -.ne 2 -.na -\fBresilver.finish\fR -.ad -.RS 12n -Issued when the running resilver have finished. -.RE - -.sp -.ne 2 -.na -\fBscrub.start\fR -.ad -.RS 12n -Issued when a scrub is started on a pool. -.RE - -.sp -.ne 2 -.na -\fBscrub.finish\fR -.ad -.RS 12n -Issued when a pool has finished scrubbing. -.RE - -.sp -.ne 2 -.na -\fBscrub.abort\fR -.ad -.RS 12n -Issued when a scrub is aborted on a pool. -.RE - -.sp -.ne 2 -.na -\fBscrub.resume\fR -.ad -.RS 12n -Issued when a scrub is resumed on a pool. -.RE - -.sp -.ne 2 -.na -\fBscrub.paused\fR -.ad -.RS 12n -Issued when a scrub is paused on a pool. -.RE - -.sp -.ne 2 -.na -\fBbootfs.vdev.attach\fR -.ad -.RS 12n -.RE - -.SS "PAYLOADS" -.sp -.LP -This is the payload (data, information) that accompanies an -event. -.sp -For -.BR zed (8), -these are set to uppercase and prefixed with \fBZEVENT_\fR. - -.sp -.ne 2 -.na -\fBpool\fR -.ad -.RS 12n -Pool name. -.RE - -.sp -.ne 2 -.na -\fBpool_failmode\fR -.ad -.RS 12n -Failmode - \fBwait\fR, \fBcontinue\fR or \fBpanic\fR. -See -.BR zpool (8) -(\fIfailmode\fR property) for more information. -.RE - -.sp -.ne 2 -.na -\fBpool_guid\fR -.ad -.RS 12n -The GUID of the pool. -.RE - -.sp -.ne 2 -.na -\fBpool_context\fR -.ad -.RS 12n -The load state for the pool (0=none, 1=open, 2=import, 3=tryimport, 4=recover -5=error). -.RE - -.sp -.ne 2 -.na -\fBvdev_guid\fR -.ad -.RS 12n -The GUID of the vdev in question (the vdev failing or operated upon with -\fBzpool clear\fR etc). -.RE - -.sp -.ne 2 -.na -\fBvdev_type\fR -.ad -.RS 12n -Type of vdev - \fBdisk\fR, \fBfile\fR, \fBmirror\fR etc. 
See -.BR zpool (8) -under \fBVirtual Devices\fR for more information on possible values. -.RE - -.sp -.ne 2 -.na -\fBvdev_path\fR -.ad -.RS 12n -Full path of the vdev, including any \fI-partX\fR. -.RE - -.sp -.ne 2 -.na -\fBvdev_devid\fR -.ad -.RS 12n -ID of vdev (if any). -.RE - -.sp -.ne 2 -.na -\fBvdev_fru\fR -.ad -.RS 12n -Physical FRU location. -.RE - -.sp -.ne 2 -.na -\fBvdev_state\fR -.ad -.RS 12n -State of vdev (0=uninitialized, 1=closed, 2=offline, 3=removed, 4=failed to open, 5=faulted, 6=degraded, 7=healthy). -.RE - -.sp -.ne 2 -.na -\fBvdev_ashift\fR -.ad -.RS 12n -The ashift value of the vdev. -.RE - -.sp -.ne 2 -.na -\fBvdev_complete_ts\fR -.ad -.RS 12n -The time the last I/O completed for the specified vdev. -.RE - -.sp -.ne 2 -.na -\fBvdev_delta_ts\fR -.ad -.RS 12n -The time since the last I/O completed for the specified vdev. -.RE - -.sp -.ne 2 -.na -\fBvdev_spare_paths\fR -.ad -.RS 12n -List of spares, including full path and any \fI-partX\fR. -.RE - -.sp -.ne 2 -.na -\fBvdev_spare_guids\fR -.ad -.RS 12n -GUID(s) of spares. -.RE - -.sp -.ne 2 -.na -\fBvdev_read_errors\fR -.ad -.RS 12n -How many read errors that have been detected on the vdev. -.RE - -.sp -.ne 2 -.na -\fBvdev_write_errors\fR -.ad -.RS 12n -How many write errors that have been detected on the vdev. -.RE - -.sp -.ne 2 -.na -\fBvdev_cksum_errors\fR -.ad -.RS 12n -How many checkum errors that have been detected on the vdev. -.RE - -.sp -.ne 2 -.na -\fBparent_guid\fR -.ad -.RS 12n -GUID of the vdev parent. -.RE - -.sp -.ne 2 -.na -\fBparent_type\fR -.ad -.RS 12n -Type of parent. See \fBvdev_type\fR. -.RE - -.sp -.ne 2 -.na -\fBparent_path\fR -.ad -.RS 12n -Path of the vdev parent (if any). -.RE - -.sp -.ne 2 -.na -\fBparent_devid\fR -.ad -.RS 12n -ID of the vdev parent (if any). -.RE - -.sp -.ne 2 -.na -\fBzio_objset\fR -.ad -.RS 12n -The object set number for a given I/O. -.RE - -.sp -.ne 2 -.na -\fBzio_object\fR -.ad -.RS 12n -The object number for a given I/O. 
-.RE - -.sp -.ne 2 -.na -\fBzio_level\fR -.ad -.RS 12n -The indirect level for the block. Level 0 is the lowest level and includes -data blocks. Values > 0 indicate metadata blocks at the appropriate level. -.RE - -.sp -.ne 2 -.na -\fBzio_blkid\fR -.ad -.RS 12n -The block ID for a given I/O. -.RE - -.sp -.ne 2 -.na -\fBzio_err\fR -.ad -.RS 12n -The errno for a failure when handling a given I/O. The errno is compatible -with \fBerrno\fR(3) with the value for EBADE (0x34) used to indicate ZFS -checksum error. -.RE - -.sp -.ne 2 -.na -\fBzio_offset\fR -.ad -.RS 12n -The offset in bytes of where to write the I/O for the specified vdev. -.RE - -.sp -.ne 2 -.na -\fBzio_size\fR -.ad -.RS 12n -The size in bytes of the I/O. -.RE - -.sp -.ne 2 -.na -\fBzio_flags\fR -.ad -.RS 12n -The current flags describing how the I/O should be handled. See the -\fBI/O FLAGS\fR section for the full list of I/O flags. -.RE - -.sp -.ne 2 -.na -\fBzio_stage\fR -.ad -.RS 12n -The current stage of the I/O in the pipeline. See the \fBI/O STAGES\fR -section for a full list of all the I/O stages. -.RE - -.sp -.ne 2 -.na -\fBzio_pipeline\fR -.ad -.RS 12n -The valid pipeline stages for the I/O. See the \fBI/O STAGES\fR section for a -full list of all the I/O stages. -.RE - -.sp -.ne 2 -.na -\fBzio_delay\fR -.ad -.RS 12n -The time elapsed (in nanoseconds) waiting for the block layer to complete the -I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is -therefore solely a measure of the block layer performance. -.RE - -.sp -.ne 2 -.na -\fBzio_timestamp\fR -.ad -.RS 12n -The time when a given I/O was submitted. -.RE - -.sp -.ne 2 -.na -\fBzio_delta\fR -.ad -.RS 12n -The time required to service a given I/O. -.RE - -.sp -.ne 2 -.na -\fBprev_state\fR -.ad -.RS 12n -The previous state of the vdev. -.RE - -.sp -.ne 2 -.na -\fBcksum_expected\fR -.ad -.RS 12n -The expected checksum value for the block. 
-.RE - -.sp -.ne 2 -.na -\fBcksum_actual\fR -.ad -.RS 12n -The actual checksum value for an errant block. -.RE - -.sp -.ne 2 -.na -\fBcksum_algorithm\fR -.ad -.RS 12n -Checksum algorithm used. See \fBzfs\fR(8) for more information on checksum -algorithms available. -.RE - -.sp -.ne 2 -.na -\fBcksum_byteswap\fR -.ad -.RS 12n -Whether or not the data is byteswapped. -.RE - -.sp -.ne 2 -.na -\fBbad_ranges\fR -.ad -.RS 12n -[start, end) pairs of corruption offsets. Offsets are always aligned on a -64-bit boundary, and can include some gaps of non-corruption. -(See \fBbad_ranges_min_gap\fR) -.RE - -.sp -.ne 2 -.na -\fBbad_ranges_min_gap\fR -.ad -.RS 12n -In order to bound the size of the \fBbad_ranges\fR array, gaps of non-corruption -less than or equal to \fBbad_ranges_min_gap\fR bytes have been merged with -adjacent corruption. Always at least 8 bytes, since corruption is detected -on a 64-bit word basis. -.RE - -.sp -.ne 2 -.na -\fBbad_range_sets\fR -.ad -.RS 12n -This array has one element per range in \fBbad_ranges\fR. Each element contains -the count of bits in that range which were clear in the good data and set -in the bad data. -.RE - -.sp -.ne 2 -.na -\fBbad_range_clears\fR -.ad -.RS 12n -This array has one element per range in \fBbad_ranges\fR. Each element contains -the count of bits for that range which were set in the good data and clear in -the bad data. -.RE - -.sp -.ne 2 -.na -\fBbad_set_bits\fR -.ad -.RS 12n -If this field exists, it is an array of: (bad data & ~(good data)); that is, -the bits set in the bad data which are cleared in the good data. Each element -corresponds a byte whose offset is in a range in \fBbad_ranges\fR, and the -array is ordered by offset. Thus, the first element is the first byte in the -first \fBbad_ranges\fR range, and the last element is the last byte in the last -\fBbad_ranges\fR range. 
-.RE - -.sp -.ne 2 -.na -\fBbad_cleared_bits\fR -.ad -.RS 12n -Like \fBbad_set_bits\fR, but contains: (good data & ~(bad data)); that is, -the bits set in the good data which are cleared in the bad data. -.RE - -.sp -.ne 2 -.na -\fBbad_set_histogram\fR -.ad -.RS 12n -If this field exists, it is an array of counters. Each entry counts bits set -in a particular bit of a big-endian uint64 type. The first entry counts bits -set in the high-order bit of the first byte, the 9th byte, etc, and the last -entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc. -This information is useful for observing a stuck bit in a parallel data path, -such as IDE or parallel SCSI. -.RE - -.sp -.ne 2 -.na -\fBbad_cleared_histogram\fR -.ad -.RS 12n -If this field exists, it is an array of counters. Each entry counts bit clears -in a particular bit of a big-endian uint64 type. The first entry counts bits -clears of the the high-order bit of the first byte, the 9th byte, etc, and the -last entry counts clears of the low-order bit of the 8th byte, the 16th byte, -etc. This information is useful for observing a stuck bit in a parallel data -path, such as IDE or parallel SCSI. -.RE - -.SS "I/O STAGES" -.sp -.LP -The ZFS I/O pipeline is comprised of various stages which are defined -below. The individual stages are used to construct these basic I/O -operations: Read, Write, Free, Claim, and Ioctl. These stages may be -set on an event to describe the life cycle of a given I/O. - -.TS -tab(:); -l l l . 
-Stage:Bit Mask:Operations -_:_:_ -ZIO_STAGE_OPEN:0x00000001:RWFCI - -ZIO_STAGE_READ_BP_INIT:0x00000002:R---- -ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W--- -ZIO_STAGE_FREE_BP_INIT:0x00000008:--F-- -ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF-- -ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W--- - -ZIO_STAGE_ENCRYPT:0x00000040:-W--- -ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- - -ZIO_STAGE_NOP_WRITE:0x00000100:-W--- - -ZIO_STAGE_DDT_READ_START:0x00000200:R---- -ZIO_STAGE_DDT_READ_DONE:0x00000400:R---- -ZIO_STAGE_DDT_WRITE:0x00000800:-W--- -ZIO_STAGE_DDT_FREE:0x00001000:--F-- - -ZIO_STAGE_GANG_ASSEMBLE:0x00002000:RWFC- -ZIO_STAGE_GANG_ISSUE:0x00004000:RWFC- - -ZIO_STAGE_DVA_THROTTLE:0x00008000:-W--- -ZIO_STAGE_DVA_ALLOCATE:0x00010000:-W--- -ZIO_STAGE_DVA_FREE:0x00020000:--F-- -ZIO_STAGE_DVA_CLAIM:0x00040000:---C- - -ZIO_STAGE_READY:0x00080000:RWFCI - -ZIO_STAGE_VDEV_IO_START:0x00100000:RW--I -ZIO_STAGE_VDEV_IO_DONE:0x00200000:RW--I -ZIO_STAGE_VDEV_IO_ASSESS:0x00400000:RW--I - -ZIO_STAGE_CHECKSUM_VERIFY:0x00800000:R---- - -ZIO_STAGE_DONE:0x01000000:RWFCI -.TE - -.SS "I/O FLAGS" -.sp -.LP -Every I/O in the pipeline contains a set of flags which describe its -function and are used to govern its behavior. These flags will be set -in an event as an \fBzio_flags\fR payload entry. - -.TS -tab(:); -l l . 
-Flag:Bit Mask -_:_ -ZIO_FLAG_DONT_AGGREGATE:0x00000001 -ZIO_FLAG_IO_REPAIR:0x00000002 -ZIO_FLAG_SELF_HEAL:0x00000004 -ZIO_FLAG_RESILVER:0x00000008 -ZIO_FLAG_SCRUB:0x00000010 -ZIO_FLAG_SCAN_THREAD:0x00000020 -ZIO_FLAG_PHYSICAL:0x00000040 - -ZIO_FLAG_CANFAIL:0x00000080 -ZIO_FLAG_SPECULATIVE:0x00000100 -ZIO_FLAG_CONFIG_WRITER:0x00000200 -ZIO_FLAG_DONT_RETRY:0x00000400 -ZIO_FLAG_DONT_CACHE:0x00000800 -ZIO_FLAG_NODATA:0x00001000 -ZIO_FLAG_INDUCE_DAMAGE:0x00002000 - -ZIO_FLAG_IO_ALLOCATING:0x00004000 -ZIO_FLAG_IO_RETRY:0x00008000 -ZIO_FLAG_PROBE:0x00010000 -ZIO_FLAG_TRYHARD:0x00020000 -ZIO_FLAG_OPTIONAL:0x00040000 - -ZIO_FLAG_DONT_QUEUE:0x00080000 -ZIO_FLAG_DONT_PROPAGATE:0x00100000 -ZIO_FLAG_IO_BYPASS:0x00200000 -ZIO_FLAG_IO_REWRITE:0x00400000 -ZIO_FLAG_RAW_COMPRESS:0x00800000 -ZIO_FLAG_RAW_ENCRYPT:0x01000000 - -ZIO_FLAG_GANG_CHILD:0x02000000 -ZIO_FLAG_DDT_CHILD:0x04000000 -ZIO_FLAG_GODFATHER:0x08000000 -ZIO_FLAG_NOPWRITE:0x10000000 -ZIO_FLAG_REEXECUTED:0x20000000 -ZIO_FLAG_DELEGATED:0x40000000 -ZIO_FLAG_FASTWRITE:0x80000000 -.TE diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 deleted file mode 100644 index 5bca12e06e..0000000000 --- a/man/man5/zfs-module-parameters.5 +++ /dev/null @@ -1,3222 +0,0 @@ -'\" te -.\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. -.\" Copyright (c) 2019 by Delphix. All rights reserved. -.\" Copyright (c) 2019 Datto Inc. -.\" The contents of this file are subject to the terms of the Common Development -.\" and Distribution License (the "License"). You may not use this file except -.\" in compliance with the License. You can obtain a copy of the license at -.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -.\" -.\" See the License for the specific language governing permissions and -.\" limitations under the License. When distributing Covered Code, include this -.\" CDDL HEADER in each file and include the License file at -.\" usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this -.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your -.\" own identifying information: -.\" Portions Copyright [yyyy] [name of copyright owner] -.TH ZFS-MODULE-PARAMETERS 5 "Feb 15, 2019" -.SH NAME -zfs\-module\-parameters \- ZFS module parameters -.SH DESCRIPTION -.sp -.LP -Description of the different parameters to the ZFS module. - -.SS "Module parameters" -.sp -.LP - -.sp -.ne 2 -.na -\fBdbuf_cache_max_bytes\fR (ulong) -.ad -.RS 12n -Maximum size in bytes of the dbuf cache. When \fB0\fR this value will default -to \fB1/2^dbuf_cache_shift\fR (1/32) of the target ARC size, otherwise the -provided value in bytes will be used. The behavior of the dbuf cache and its -associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR -kstat. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBdbuf_metadata_cache_max_bytes\fR (ulong) -.ad -.RS 12n -Maximum size in bytes of the metadata dbuf cache. When \fB0\fR this value will -default to \fB1/2^dbuf_cache_shift\fR (1/16) of the target ARC size, otherwise -the provided value in bytes will be used. The behavior of the metadata dbuf -cache and its associated settings can be observed via the -\fB/proc/spl/kstat/zfs/dbufstats\fR kstat. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBdbuf_cache_hiwater_pct\fR (uint) -.ad -.RS 12n -The percentage over \fBdbuf_cache_max_bytes\fR when dbufs must be evicted -directly. -.sp -Default value: \fB10\fR%. -.RE - -.sp -.ne 2 -.na -\fBdbuf_cache_lowater_pct\fR (uint) -.ad -.RS 12n -The percentage below \fBdbuf_cache_max_bytes\fR when the evict thread stops -evicting dbufs. -.sp -Default value: \fB10\fR%. -.RE - -.sp -.ne 2 -.na -\fBdbuf_cache_shift\fR (int) -.ad -.RS 12n -Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction -of the target arc size. -.sp -Default value: \fB5\fR. 
-.RE - -.sp -.ne 2 -.na -\fBdbuf_metadata_cache_shift\fR (int) -.ad -.RS 12n -Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR, -to a log2 fraction of the target arc size. -.sp -Default value: \fB6\fR. -.RE - -.sp -.ne 2 -.na -\fBignore_hole_birth\fR (int) -.ad -.RS 12n -This is an alias for \fBsend_holes_without_birth_time\fR. -.RE - -.sp -.ne 2 -.na -\fBl2arc_feed_again\fR (int) -.ad -.RS 12n -Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as -fast as possible. -.sp -Use \fB1\fR for yes (default) and \fB0\fR to disable. -.RE - -.sp -.ne 2 -.na -\fBl2arc_feed_min_ms\fR (ulong) -.ad -.RS 12n -Min feed interval in milliseconds. Requires \fBl2arc_feed_again=1\fR and only -applicable in related situations. -.sp -Default value: \fB200\fR. -.RE - -.sp -.ne 2 -.na -\fBl2arc_feed_secs\fR (ulong) -.ad -.RS 12n -Seconds between L2ARC writing -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBl2arc_headroom\fR (ulong) -.ad -.RS 12n -How far through the ARC lists to search for L2ARC cacheable content, expressed -as a multiplier of \fBl2arc_write_max\fR -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBl2arc_headroom_boost\fR (ulong) -.ad -.RS 12n -Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being -successfully compressed before writing. A value of 100 disables this feature. -.sp -Default value: \fB200\fR%. -.RE - -.sp -.ne 2 -.na -\fBl2arc_noprefetch\fR (int) -.ad -.RS 12n -Do not write buffers to L2ARC if they were prefetched but not used by -applications -.sp -Use \fB1\fR for yes (default) and \fB0\fR to disable. -.RE - -.sp -.ne 2 -.na -\fBl2arc_norw\fR (int) -.ad -.RS 12n -No reads during writes -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBl2arc_write_boost\fR (ulong) -.ad -.RS 12n -Cold L2ARC devices will have \fBl2arc_write_max\fR increased by this amount -while they remain cold. -.sp -Default value: \fB8,388,608\fR. 
-.RE - -.sp -.ne 2 -.na -\fBl2arc_write_max\fR (ulong) -.ad -.RS 12n -Max write bytes per interval -.sp -Default value: \fB8,388,608\fR. -.RE - -.sp -.ne 2 -.na -\fBmetaslab_aliquot\fR (ulong) -.ad -.RS 12n -Metaslab granularity, in bytes. This is roughly similar to what would be -referred to as the "stripe size" in traditional RAID arrays. In normal -operation, ZFS will try to write this amount of data to a top-level vdev -before moving on to the next one. -.sp -Default value: \fB524,288\fR. -.RE - -.sp -.ne 2 -.na -\fBmetaslab_bias_enabled\fR (int) -.ad -.RS 12n -Enable metaslab group biasing based on its vdev's over- or under-utilization -relative to the pool. -.sp -Use \fB1\fR for yes (default) and \fB0\fR for no. -.RE - -.sp -.ne 2 -.na -\fBmetaslab_force_ganging\fR (ulong) -.ad -.RS 12n -Make some blocks above a certain size be gang blocks. This option is used -by the test suite to facilitate testing. -.sp -Default value: \fB16,777,217\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_metaslab_segment_weight_enabled\fR (int) -.ad -.RS 12n -Enable/disable segment-based metaslab selection. -.sp -Use \fB1\fR for yes (default) and \fB0\fR for no. -.RE - -.sp -.ne 2 -.na -\fBzfs_metaslab_switch_threshold\fR (int) -.ad -.RS 12n -When using segment-based metaslab selection, continue allocating -from the active metaslab until \fBzfs_metaslab_switch_threshold\fR -worth of buckets have been exhausted. -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBmetaslab_debug_load\fR (int) -.ad -.RS 12n -Load all metaslabs during pool import. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBmetaslab_debug_unload\fR (int) -.ad -.RS 12n -Prevent metaslabs from being unloaded. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBmetaslab_fragmentation_factor_enabled\fR (int) -.ad -.RS 12n -Enable use of the fragmentation metric in computing metaslab weights. -.sp -Use \fB1\fR for yes (default) and \fB0\fR for no. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_default_ms_count\fR (int) -.ad -.RS 12n -When a vdev is added target this number of metaslabs per top-level vdev. -.sp -Default value: \fB200\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_min_ms_count\fR (int) -.ad -.RS 12n -Minimum number of metaslabs to create in a top-level vdev. -.sp -Default value: \fB16\fR. -.RE - -.sp -.ne 2 -.na -\fBvdev_ms_count_limit\fR (int) -.ad -.RS 12n -Practical upper limit of total metaslabs per top-level vdev. -.sp -Default value: \fB131,072\fR. -.RE - -.sp -.ne 2 -.na -\fBmetaslab_preload_enabled\fR (int) -.ad -.RS 12n -Enable metaslab group preloading. -.sp -Use \fB1\fR for yes (default) and \fB0\fR for no. -.RE - -.sp -.ne 2 -.na -\fBmetaslab_lba_weighting_enabled\fR (int) -.ad -.RS 12n -Give more weight to metaslabs with lower LBAs, assuming they have -greater bandwidth as is typically the case on a modern constant -angular velocity disk drive. -.sp -Use \fB1\fR for yes (default) and \fB0\fR for no. -.RE - -.sp -.ne 2 -.na -\fBsend_holes_without_birth_time\fR (int) -.ad -.RS 12n -When set, the hole_birth optimization will not be used, and all holes will -always be sent on zfs send. This is useful if you suspect your datasets are -affected by a bug in hole_birth. -.sp -Use \fB1\fR for on (default) and \fB0\fR for off. -.RE - -.sp -.ne 2 -.na -\fBspa_config_path\fR (charp) -.ad -.RS 12n -SPA config file -.sp -Default value: \fB/etc/zfs/zpool.cache\fR. -.RE - -.sp -.ne 2 -.na -\fBspa_asize_inflation\fR (int) -.ad -.RS 12n -Multiplication factor used to estimate actual disk consumption from the -size of data being written. The default value is a worst case estimate, -but lower values may be valid for a given pool depending on its -configuration. Pool administrators who understand the factors involved -may wish to specify a more realistic inflation factor, particularly if -they operate close to quota or capacity limits. -.sp -Default value: \fB24\fR. 
-.RE - -.sp -.ne 2 -.na -\fBspa_load_print_vdev_tree\fR (int) -.ad -.RS 12n -Whether to print the vdev tree in the debugging message buffer during pool import. -Use 0 to disable and 1 to enable. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBspa_load_verify_data\fR (int) -.ad -.RS 12n -Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR) -import. Use 0 to disable and 1 to enable. - -An extreme rewind import normally performs a full traversal of all -blocks in the pool for verification. If this parameter is set to 0, -the traversal skips non-metadata blocks. It can be toggled once the -import has started to stop or start the traversal of non-metadata blocks. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBspa_load_verify_metadata\fR (int) -.ad -.RS 12n -Whether to traverse blocks during an "extreme rewind" (\fB-X\fR) -pool import. Use 0 to disable and 1 to enable. - -An extreme rewind import normally performs a full traversal of all -blocks in the pool for verification. If this parameter is set to 0, -the traversal is not performed. It can be toggled once the import has -started to stop or start the traversal. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBspa_load_verify_maxinflight\fR (int) -.ad -.RS 12n -Maximum concurrent I/Os during the traversal performed during an "extreme -rewind" (\fB-X\fR) pool import. -.sp -Default value: \fB10000\fR. -.RE - -.sp -.ne 2 -.na -\fBspa_slop_shift\fR (int) -.ad -.RS 12n -Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space -in the pool to be consumed. This ensures that we don't run the pool -completely out of space, due to unaccounted changes (e.g. to the MOS). -It also limits the worst-case time to allocate space. If we have -less than this amount of free space, most ZPL operations (e.g. write, -create) will return ENOSPC. -.sp -Default value: \fB5\fR. 
-.RE - -.sp -.ne 2 -.na -\fBvdev_removal_max_span\fR (int) -.ad -.RS 12n -During top-level vdev removal, chunks of data are copied from the vdev -which may include free space in order to trade bandwidth for IOPS. -This parameter determines the maximum span of free space (in bytes) -which will be included as "unnecessary" data in a chunk of copied data. - -The default value here was chosen to align with -\fBzfs_vdev_read_gap_limit\fR, which is a similar concept when doing -regular reads (but there's no reason it has to be the same). -.sp -Default value: \fB32,768\fR. -.RE - -.sp -.ne 2 -.na -\fBzfetch_array_rd_sz\fR (ulong) -.ad -.RS 12n -If prefetching is enabled, disable prefetching for reads larger than this size. -.sp -Default value: \fB1,048,576\fR. -.RE - -.sp -.ne 2 -.na -\fBzfetch_max_distance\fR (uint) -.ad -.RS 12n -Max bytes to prefetch per stream (default 8MB). -.sp -Default value: \fB8,388,608\fR. -.RE - -.sp -.ne 2 -.na -\fBzfetch_max_streams\fR (uint) -.ad -.RS 12n -Max number of streams per zfetch (prefetch streams per file). -.sp -Default value: \fB8\fR. -.RE - -.sp -.ne 2 -.na -\fBzfetch_min_sec_reap\fR (uint) -.ad -.RS 12n -Min time before an active prefetch stream can be reclaimed -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_abd_scatter_min_size\fR (uint) -.ad -.RS 12n -This is the minimum allocation size that will use scatter (page-based) -ABD's. Smaller allocations will use linear ABD's. -.sp -Default value: \fB1536\fR (512B and 1KB allocations will be linear). -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_dnode_limit\fR (ulong) -.ad -.RS 12n -When the number of bytes consumed by dnodes in the ARC exceeds this number of -bytes, try to unpin some of it in response to demand for non-metadata. This -value acts as a ceiling to the amount of dnode metadata, and defaults to 0 which -indicates that a percent which is based on \fBzfs_arc_dnode_limit_percent\fR of -the ARC meta buffers that may be used for dnodes. 
- -See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used -when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather -than in response to overall demand for non-metadata. - -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_dnode_limit_percent\fR (ulong) -.ad -.RS 12n -Percentage that can be consumed by dnodes of ARC meta buffers. -.sp -See also \fBzfs_arc_dnode_limit\fR which serves a similar purpose but has a -higher priority if set to nonzero value. -.sp -Default value: \fB10\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_dnode_reduce_percent\fR (ulong) -.ad -.RS 12n -Percentage of ARC dnodes to try to scan in response to demand for non-metadata -when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR. - -.sp -Default value: \fB10\fR% of the number of dnodes in the ARC. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_average_blocksize\fR (int) -.ad -.RS 12n -The ARC's buffer hash table is sized based on the assumption of an average -block size of \fBzfs_arc_average_blocksize\fR (default 8K). This works out -to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers. -For configurations with a known larger average block size this value can be -increased to reduce the memory footprint. - -.sp -Default value: \fB8192\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_evict_batch_limit\fR (int) -.ad -.RS 12n -Number ARC headers to evict per sub-list before proceeding to another sub-list. -This batch-style operation prevents entire sub-lists from being evicted at once -but comes at a cost of additional unlocking and locking. -.sp -Default value: \fB10\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_grow_retry\fR (int) -.ad -.RS 12n -If set to a non zero value, it will replace the arc_grow_retry value with this value. -The arc_grow_retry value (default 5) is the number of seconds the ARC will wait before -trying to resume growth after a memory pressure event. -.sp -Default value: \fB0\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_arc_lotsfree_percent\fR (int) -.ad -.RS 12n -Throttle I/O when free system memory drops below this percentage of total -system memory. Setting this value to 0 will disable the throttle. -.sp -Default value: \fB10\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_max\fR (ulong) -.ad -.RS 12n -Max arc size of ARC in bytes. If set to 0 then it will consume 1/2 of system -RAM. This value must be at least 67108864 (64 megabytes). -.sp -This value can be changed dynamically with some caveats. It cannot be set back -to 0 while running and reducing it below the current ARC size will not cause -the ARC to shrink without memory pressure to induce shrinking. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_meta_adjust_restarts\fR (ulong) -.ad -.RS 12n -The number of restart passes to make while scanning the ARC attempting -the free buffers in order to stay below the \fBzfs_arc_meta_limit\fR. -This value should not need to be tuned but is available to facilitate -performance analysis. -.sp -Default value: \fB4096\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_meta_limit\fR (ulong) -.ad -.RS 12n -The maximum allowed size in bytes that meta data buffers are allowed to -consume in the ARC. When this limit is reached meta data buffers will -be reclaimed even if the overall arc_c_max has not been reached. This -value defaults to 0 which indicates that a percent which is based on -\fBzfs_arc_meta_limit_percent\fR of the ARC may be used for meta data. -.sp -This value my be changed dynamically except that it cannot be set back to 0 -for a specific percent of the ARC; it must be set to an explicit value. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_meta_limit_percent\fR (ulong) -.ad -.RS 12n -Percentage of ARC buffers that can be used for meta data. - -See also \fBzfs_arc_meta_limit\fR which serves a similar purpose but has a -higher priority if set to nonzero value. - -.sp -Default value: \fB75\fR%. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_arc_meta_min\fR (ulong) -.ad -.RS 12n -The minimum allowed size in bytes that meta data buffers may consume in -the ARC. This value defaults to 0 which disables a floor on the amount -of the ARC devoted meta data. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_meta_prune\fR (int) -.ad -.RS 12n -The number of dentries and inodes to be scanned looking for entries -which can be dropped. This may be required when the ARC reaches the -\fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers -in the ARC. Increasing this value will cause to dentry and inode caches -to be pruned more aggressively. Setting this value to 0 will disable -pruning the inode and dentry caches. -.sp -Default value: \fB10,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_meta_strategy\fR (int) -.ad -.RS 12n -Define the strategy for ARC meta data buffer eviction (meta reclaim strategy). -A value of 0 (META_ONLY) will evict only the ARC meta data buffers. -A value of 1 (BALANCED) indicates that additional data buffers may be evicted if -that is required to in order to evict the required number of meta data buffers. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_min\fR (ulong) -.ad -.RS 12n -Min arc size of ARC in bytes. If set to 0 then arc_c_min will default to -consuming the larger of 32M or 1/32 of total system memory. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_min_prefetch_ms\fR (int) -.ad -.RS 12n -Minimum time prefetched blocks are locked in the ARC, specified in ms. -A value of \fB0\fR will default to 1000 ms. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_min_prescient_prefetch_ms\fR (int) -.ad -.RS 12n -Minimum time "prescient prefetched" blocks are locked in the ARC, specified -in ms. These blocks are meant to be prefetched fairly aggresively ahead of -the code that may use them. A value of \fB0\fR will default to 6000 ms. -.sp -Default value: \fB0\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_max_missing_tvds\fR (int) -.ad -.RS 12n -Number of missing top-level vdevs which will be allowed during -pool import (only in read-only mode). -.sp -Default value: \fB0\fR -.RE - -.sp -.ne 2 -.na -\fBzfs_multilist_num_sublists\fR (int) -.ad -.RS 12n -To allow more fine-grained locking, each ARC state contains a series -of lists for both data and meta data objects. Locking is performed at -the level of these "sub-lists". This parameters controls the number of -sub-lists per ARC state, and also applies to other uses of the -multilist data structure. -.sp -Default value: \fB4\fR or the number of online CPUs, whichever is greater -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_overflow_shift\fR (int) -.ad -.RS 12n -The ARC size is considered to be overflowing if it exceeds the current -ARC target size (arc_c) by a threshold determined by this parameter. -The threshold is calculated as a fraction of arc_c using the formula -"arc_c >> \fBzfs_arc_overflow_shift\fR". - -The default value of 8 causes the ARC to be considered to be overflowing -if it exceeds the target size by 1/256th (0.3%) of the target size. - -When the ARC is overflowing, new buffer allocations are stalled until -the reclaim thread catches up and the overflow condition no longer exists. -.sp -Default value: \fB8\fR. -.RE - -.sp -.ne 2 -.na - -\fBzfs_arc_p_min_shift\fR (int) -.ad -.RS 12n -If set to a non zero value, this will update arc_p_min_shift (default 4) -with the new value. -arc_p_min_shift is used to shift of arc_c for calculating both min and max -max arc_p -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_p_dampener_disable\fR (int) -.ad -.RS 12n -Disable arc_p adapt dampener -.sp -Use \fB1\fR for yes (default) and \fB0\fR to disable. -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_shrink_shift\fR (int) -.ad -.RS 12n -If set to a non zero value, this will update arc_shrink_shift (default 7) -with the new value. -.sp -Default value: \fB0\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_arc_pc_percent\fR (uint) -.ad -.RS 12n -Percent of pagecache to reclaim arc to - -This tunable allows ZFS arc to play more nicely with the kernel's LRU -pagecache. It can guarantee that the arc size won't collapse under scanning -pressure on the pagecache, yet still allows arc to be reclaimed down to -zfs_arc_min if necessary. This value is specified as percent of pagecache -size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This -only operates during memory pressure/reclaim. -.sp -Default value: \fB0\fR% (disabled). -.RE - -.sp -.ne 2 -.na -\fBzfs_arc_sys_free\fR (ulong) -.ad -.RS 12n -The target number of bytes the ARC should leave as free memory on the system. -Defaults to the larger of 1/64 of physical memory or 512K. Setting this -option to a non-zero value will override the default. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_autoimport_disable\fR (int) -.ad -.RS 12n -Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR). -.sp -Use \fB1\fR for yes (default) and \fB0\fR for no. -.RE - -.sp -.ne 2 -.na -\fBzfs_checksums_per_second\fR (int) -.ad -.RS 12n -Rate limit checksum events to this many per second. Note that this should -not be set below the zed thresholds (currently 10 checksums over 10 sec) -or else zed may not trigger any action. -.sp -Default value: 20 -.RE - -.sp -.ne 2 -.na -\fBzfs_commit_timeout_pct\fR (int) -.ad -.RS 12n -This controls the amount of time that a ZIL block (lwb) will remain "open" -when it isn't "full", and it has a thread waiting for it to be committed to -stable storage. The timeout is scaled based on a percentage of the last lwb -latency to avoid significantly impacting the latency of each individual -transaction record (itx). -.sp -Default value: \fB5\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_condense_indirect_vdevs_enable\fR (int) -.ad -.RS 12n -Enable condensing indirect vdev mappings. 
When set to a non-zero value, -attempt to condense indirect vdev mappings if the mapping uses more than -\fBzfs_condense_min_mapping_bytes\fR bytes of memory and if the obsolete -space map object uses more than \fBzfs_condense_max_obsolete_bytes\fR -bytes on-disk. The condensing process is an attempt to save memory by -removing obsolete mappings. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_condense_max_obsolete_bytes\fR (ulong) -.ad -.RS 12n -Only attempt to condense indirect vdev mappings if the on-disk size -of the obsolete space map object is greater than this number of bytes -(see \fBfBzfs_condense_indirect_vdevs_enable\fR). -.sp -Default value: \fB1,073,741,824\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_condense_min_mapping_bytes\fR (ulong) -.ad -.RS 12n -Minimum size vdev mapping to attempt to condense (see -\fBzfs_condense_indirect_vdevs_enable\fR). -.sp -Default value: \fB131,072\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_dbgmsg_enable\fR (int) -.ad -.RS 12n -Internally ZFS keeps a small log to facilitate debugging. By default the log -is disabled, to enable it set this option to 1. The contents of the log can -be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file. Writing 0 to -this proc file clears the log. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_dbgmsg_maxsize\fR (int) -.ad -.RS 12n -The maximum size in bytes of the internal ZFS debug log. -.sp -Default value: \fB4M\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_dbuf_state_index\fR (int) -.ad -.RS 12n -This feature is currently unused. It is normally used for controlling what -reporting is available under /proc/spl/kstat/zfs. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_deadman_enabled\fR (int) -.ad -.RS 12n -When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR -milliseconds, or when an individual I/O takes longer than -\fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to -be "hung". 
If \fBzfs_deadman_enabled\fR is set then the deadman behavior is -invoked as described by the \fBzfs_deadman_failmode\fR module option. -By default the deadman is enabled and configured to \fBwait\fR which results -in "hung" I/Os only being logged. The deadman is automatically disabled -when a pool gets suspended. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_deadman_failmode\fR (charp) -.ad -.RS 12n -Controls the failure behavior when the deadman detects a "hung" I/O. Valid -values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR. -.sp -\fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a -"deadman" event will be posted describing that I/O. -.sp -\fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it -to the I/O pipeline if possible. -.sp -\fBpanic\fR - Panic the system. This can be used to facilitate an automatic -fail-over to a properly configured fail-over partner. -.sp -Default value: \fBwait\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_deadman_checktime_ms\fR (int) -.ad -.RS 12n -Check time in milliseconds. This defines the frequency at which we check -for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior. -.sp -Default value: \fB60,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_deadman_synctime_ms\fR (ulong) -.ad -.RS 12n -Interval in milliseconds after which the deadman is triggered and also -the interval after which a pool sync operation is considered to be "hung". -Once this limit is exceeded the deadman will be invoked every -\fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes. -.sp -Default value: \fB600,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_deadman_ziotime_ms\fR (ulong) -.ad -.RS 12n -Interval in milliseconds after which the deadman is triggered and an -individual I/O operation is considered to be "hung". As long as the I/O -remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR -milliseconds until the I/O completes. 
-.sp -Default value: \fB300,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_dedup_prefetch\fR (int) -.ad -.RS 12n -Enable prefetching dedup-ed blks -.sp -Use \fB1\fR for yes and \fB0\fR to disable (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_delay_min_dirty_percent\fR (int) -.ad -.RS 12n -Start to delay each transaction once there is this amount of dirty data, -expressed as a percentage of \fBzfs_dirty_data_max\fR. -This value should be >= zfs_vdev_async_write_active_max_dirty_percent. -See the section "ZFS TRANSACTION DELAY". -.sp -Default value: \fB60\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_delay_scale\fR (int) -.ad -.RS 12n -This controls how quickly the transaction delay approaches infinity. -Larger values cause longer delays for a given amount of dirty data. -.sp -For the smoothest delay, this value should be about 1 billion divided -by the maximum number of operations per second. This will smoothly -handle between 10x and 1/10th this number. -.sp -See the section "ZFS TRANSACTION DELAY". -.sp -Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. -.sp -Default value: \fB500,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_slow_io_events_per_second\fR (int) -.ad -.RS 12n -Rate limit delay zevents (which report slow I/Os) to this many per second. -.sp -Default value: 20 -.RE - -.sp -.ne 2 -.na -\fBzfs_unlink_suspend_progress\fR (uint) -.ad -.RS 12n -When enabled, files will not be asynchronously removed from the list of pending -unlinks and the space they consume will be leaked. Once this option has been -disabled and the dataset is remounted, the pending unlinks will be processed -and the freed space returned to the pool. -This option is used by the test suite to facilitate testing. -.sp -Uses \fB0\fR (default) to allow progress and \fB1\fR to pause progress. -.RE - -.sp -.ne 2 -.na -\fBzfs_delete_blocks\fR (ulong) -.ad -.RS 12n -This is the used to define a large file for the purposes of delete. 
Files -containing more than \fBzfs_delete_blocks\fR will be deleted asynchronously -while smaller files are deleted synchronously. Decreasing this value will -reduce the time spent in an unlink(2) system call at the expense of a longer -delay before the freed space is available. -.sp -Default value: \fB20,480\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_dirty_data_max\fR (int) -.ad -.RS 12n -Determines the dirty space limit in bytes. Once this limit is exceeded, new -writes are halted until space frees up. This parameter takes precedence -over \fBzfs_dirty_data_max_percent\fR. -See the section "ZFS TRANSACTION DELAY". -.sp -Default value: \fB10\fR% of physical RAM, capped at \fBzfs_dirty_data_max_max\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_dirty_data_max_max\fR (int) -.ad -.RS 12n -Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes. -This limit is only enforced at module load time, and will be ignored if -\fBzfs_dirty_data_max\fR is later changed. This parameter takes -precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section -"ZFS TRANSACTION DELAY". -.sp -Default value: \fB25\fR% of physical RAM. -.RE - -.sp -.ne 2 -.na -\fBzfs_dirty_data_max_max_percent\fR (int) -.ad -.RS 12n -Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a -percentage of physical RAM. This limit is only enforced at module load -time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. -The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this -one. See the section "ZFS TRANSACTION DELAY". -.sp -Default value: \fB25\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_dirty_data_max_percent\fR (int) -.ad -.RS 12n -Determines the dirty space limit, expressed as a percentage of all -memory. Once this limit is exceeded, new writes are halted until space frees -up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this -one. See the section "ZFS TRANSACTION DELAY". -.sp -Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_dirty_data_sync_percent\fR (int) -.ad -.RS 12n -Start syncing out a transaction group if there's at least this much dirty data -as a percentage of \fBzfs_dirty_data_max\fR. This should be less than -\fBzfs_vdev_async_write_active_min_dirty_percent\fR. -.sp -Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_fletcher_4_impl\fR (string) -.ad -.RS 12n -Select a fletcher 4 implementation. -.sp -Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR, -\fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR. -All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction -set extensions to be available and will only appear if ZFS detects that they are -present at runtime. If multiple implementations of fletcher 4 are available, -the \fBfastest\fR will be chosen using a micro benchmark. Selecting \fBscalar\fR -results in the original, CPU based calculation, being used. Selecting any option -other than \fBfastest\fR and \fBscalar\fR results in vector instructions from -the respective CPU instruction set being used. -.sp -Default value: \fBfastest\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_free_bpobj_enabled\fR (int) -.ad -.RS 12n -Enable/disable the processing of the free_bpobj object. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_async_block_max_blocks\fR (ulong) -.ad -.RS 12n -Maximum number of blocks freed in a single txg. -.sp -Default value: \fB100,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_override_estimate_recordsize\fR (ulong) -.ad -.RS 12n -Record size calculation override for zfs send estimates. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_async_read_max_active\fR (int) -.ad -.RS 12n -Maximum asynchronous read I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB3\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_async_read_min_active\fR (int) -.ad -.RS 12n -Minimum asynchronous read I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_async_write_active_max_dirty_percent\fR (int) -.ad -.RS 12n -When the pool has more than -\fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use -\fBzfs_vdev_async_write_max_active\fR to limit active async writes. If -the dirty data is between min and max, the active I/O limit is linearly -interpolated. See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB60\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_async_write_active_min_dirty_percent\fR (int) -.ad -.RS 12n -When the pool has less than -\fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use -\fBzfs_vdev_async_write_min_active\fR to limit active async writes. If -the dirty data is between min and max, the active I/O limit is linearly -interpolated. See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB30\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_async_write_max_active\fR (int) -.ad -.RS 12n -Maximum asynchronous write I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB10\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_async_write_min_active\fR (int) -.ad -.RS 12n -Minimum asynchronous write I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Lower values are associated with better latency on rotational media but poorer -resilver performance. The default value of 2 was chosen as a compromise. A -value of 3 has been shown to improve resilver performance further at a cost of -further increasing latency. -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_initializing_max_active\fR (int) -.ad -.RS 12n -Maximum initializing I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_initializing_min_active\fR (int) -.ad -.RS 12n -Minimum initializing I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_max_active\fR (int) -.ad -.RS 12n -The maximum number of I/Os active to each device. Ideally, this will be >= -the sum of each queue's max_active. It must be at least the sum of each -queue's min_active. See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_removal_max_active\fR (int) -.ad -.RS 12n -Maximum removal I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_removal_min_active\fR (int) -.ad -.RS 12n -Minimum removal I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_scrub_max_active\fR (int) -.ad -.RS 12n -Maximum scrub I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_scrub_min_active\fR (int) -.ad -.RS 12n -Minimum scrub I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_sync_read_max_active\fR (int) -.ad -.RS 12n -Maximum synchronous read I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB10\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_sync_read_min_active\fR (int) -.ad -.RS 12n -Minimum synchronous read I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB10\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_sync_write_max_active\fR (int) -.ad -.RS 12n -Maximum synchronous write I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB10\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_sync_write_min_active\fR (int) -.ad -.RS 12n -Minimum synchronous write I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB10\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_trim_max_active\fR (int) -.ad -.RS 12n -Maximum trim/discard I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_trim_min_active\fR (int) -.ad -.RS 12n -Minimum trim/discard I/Os active to each device. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_queue_depth_pct\fR (int) -.ad -.RS 12n -Maximum number of queued allocations per top-level vdev expressed as -a percentage of \fBzfs_vdev_async_write_max_active\fR which allows the -system to detect devices that are more capable of handling allocations -and to allocate more blocks to those devices. It allows for dynamic -allocation distribution when devices are imbalanced as fuller devices -will tend to be slower than empty devices. - -See also \fBzio_dva_throttle_enabled\fR. -.sp -Default value: \fB1000\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_expire_snapshot\fR (int) -.ad -.RS 12n -Seconds to expire .zfs/snapshot -.sp -Default value: \fB300\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_admin_snapshot\fR (int) -.ad -.RS 12n -Allow the creation, removal, or renaming of entries in the .zfs/snapshot -directory to cause the creation, destruction, or renaming of snapshots. -When enabled this functionality works both locally and over NFS exports -which have the 'no_root_squash' option set. This functionality is disabled -by default. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_flags\fR (int) -.ad -.RS 12n -Set additional debugging flags. The following flags may be bitwise-or'd -together. -.sp -.TS -box; -rB lB -lB lB -r l. -Value Symbolic Name - Description -_ -1 ZFS_DEBUG_DPRINTF - Enable dprintf entries in the debug log. 
-_ -2 ZFS_DEBUG_DBUF_VERIFY * - Enable extra dbuf verifications. -_ -4 ZFS_DEBUG_DNODE_VERIFY * - Enable extra dnode verifications. -_ -8 ZFS_DEBUG_SNAPNAMES - Enable snapshot name verification. -_ -16 ZFS_DEBUG_MODIFY - Check for illegally modified ARC buffers. -_ -64 ZFS_DEBUG_ZIO_FREE - Enable verification of block frees. -_ -128 ZFS_DEBUG_HISTOGRAM_VERIFY - Enable extra spacemap histogram verifications. -_ -256 ZFS_DEBUG_METASLAB_VERIFY - Verify space accounting on disk matches in-core range_trees. -_ -512 ZFS_DEBUG_SET_ERROR - Enable SET_ERROR and dprintf entries in the debug log. -_ -1024 ZFS_DEBUG_INDIRECT_REMAP - Verify split blocks created by device removal. -_ -2048 ZFS_DEBUG_TRIM - Verify TRIM ranges are always within the allocatable range tree. -.TE -.sp -* Requires debug build. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_free_leak_on_eio\fR (int) -.ad -.RS 12n -If destroy encounters an EIO while reading metadata (e.g. indirect -blocks), space referenced by the missing metadata can not be freed. -Normally this causes the background destroy to become "stalled", as -it is unable to make forward progress. While in this stalled state, -all remaining space to free from the error-encountering filesystem is -"temporarily leaked". Set this flag to cause it to ignore the EIO, -permanently leak the space from indirect blocks that can not be read, -and continue to free everything else that it can. - -The default, "stalling" behavior is useful if the storage partially -fails (i.e. some but not all i/os fail), and then later recovers. In -this case, we will be able to continue pool operations while it is -partially failed, and when it recovers, we can continue to free the -space, with no leaks. However, note that this case is actually -fairly rare. - -Typically pools either (a) fail completely (but perhaps temporarily, -e.g. a top-level vdev going offline), or (b) have localized, -permanent errors (e.g. 
disk returns the wrong data due to bit flip or -firmware bug). In case (a), this setting does not matter because the -pool will be suspended and the sync thread will not be able to make -forward progress regardless. In case (b), because the error is -permanent, the best we can do is leak the minimum amount of space, -which is what setting this flag will do. Therefore, it is reasonable -for this flag to normally be set, but we chose the more conservative -approach of not setting it, so that there is no possibility of -leaking space in the "partial temporary" failure case. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_free_min_time_ms\fR (int) -.ad -.RS 12n -During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a minimum -of this much time will be spent working on freeing blocks per txg. -.sp -Default value: \fB1,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_immediate_write_sz\fR (long) -.ad -.RS 12n -Largest data block to write to zil. Larger blocks will be treated as if the -dataset being written to had the property setting \fBlogbias=throughput\fR. -.sp -Default value: \fB32,768\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_initialize_value\fR (ulong) -.ad -.RS 12n -Pattern written to vdev free space by \fBzpool initialize\fR. -.sp -Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee). -.RE - -.sp -.ne 2 -.na -\fBzfs_lua_max_instrlimit\fR (ulong) -.ad -.RS 12n -The maximum execution time limit that can be set for a ZFS channel program, -specified as a number of Lua instructions. -.sp -Default value: \fB100,000,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_lua_max_memlimit\fR (ulong) -.ad -.RS 12n -The maximum memory limit that can be set for a ZFS channel program, specified -in bytes. -.sp -Default value: \fB104,857,600\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_max_dataset_nesting\fR (int) -.ad -.RS 12n -The maximum depth of nested datasets. This value can be tuned temporarily to -fix existing datasets that exceed the predefined limit. 
-.sp -Default value: \fB50\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_max_recordsize\fR (int) -.ad -.RS 12n -We currently support block sizes from 512 bytes to 16MB. The benefits of -larger blocks, and thus larger I/O, need to be weighed against the cost of -COWing a giant block to modify one byte. Additionally, very large blocks -can have an impact on i/o latency, and also potentially on the memory -allocator. Therefore, we do not allow the recordsize to be set larger than -zfs_max_recordsize (default 1MB). Larger blocks can be created by changing -this tunable, and pools with larger blocks can always be imported and used, -regardless of this setting. -.sp -Default value: \fB1,048,576\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_metaslab_fragmentation_threshold\fR (int) -.ad -.RS 12n -Allow metaslabs to keep their active state as long as their fragmentation -percentage is less than or equal to this value. An active metaslab that -exceeds this threshold will no longer keep its active status allowing -better metaslabs to be selected. -.sp -Default value: \fB70\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_mg_fragmentation_threshold\fR (int) -.ad -.RS 12n -Metaslab groups are considered eligible for allocations if their -fragmentation metric (measured as a percentage) is less than or equal to -this value. If a metaslab group exceeds this threshold then it will be -skipped unless all metaslab groups within the metaslab class have also -crossed this threshold. -.sp -Default value: \fB85\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_mg_noalloc_threshold\fR (int) -.ad -.RS 12n -Defines a threshold at which metaslab groups should be eligible for -allocations. The value is expressed as a percentage of free space -beyond which a metaslab group is always eligible for allocations. -If a metaslab group's free space is less than or equal to the -threshold, the allocator will avoid allocating to that group -unless all groups in the pool have reached the threshold. 
Once all -groups have reached the threshold, all groups are allowed to accept -allocations. The default value of 0 disables the feature and causes -all metaslab groups to be eligible for allocations. - -This parameter allows one to deal with pools having heavily imbalanced -vdevs such as would be the case when a new vdev has been added. -Setting the threshold to a non-zero percentage will stop allocations -from being made to vdevs that aren't filled to the specified percentage -and allow lesser filled vdevs to acquire more allocations than they -otherwise would under the old \fBzfs_mg_alloc_failures\fR facility. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_ddt_data_is_special\fR (int) -.ad -.RS 12n -If enabled, ZFS will place DDT data into the special allocation class. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_user_indirect_is_special\fR (int) -.ad -.RS 12n -If enabled, ZFS will place user data (both file and zvol) indirect blocks -into the special allocation class. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_multihost_history\fR (int) -.ad -.RS 12n -Historical statistics for the last N multihost updates will be available in -\fB/proc/spl/kstat/zfs//multihost\fR -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_multihost_interval\fR (ulong) -.ad -.RS 12n -Used to control the frequency of multihost writes which are performed when the -\fBmultihost\fR pool property is on. This is one factor used to determine the -length of the activity check during import. -.sp -The multihost write period is \fBzfs_multihost_interval / leaf-vdevs\fR -milliseconds. On average a multihost write will be issued for each leaf vdev -every \fBzfs_multihost_interval\fR milliseconds. In practice, the observed -period can vary with the I/O load and this observed value is the delay which is -stored in the uberblock. -.sp -Default value: \fB1000\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_multihost_import_intervals\fR (uint) -.ad -.RS 12n -Used to control the duration of the activity test on import. Smaller values of -\fBzfs_multihost_import_intervals\fR will reduce the import time but increase -the risk of failing to detect an active pool. The total activity check time is -never allowed to drop below one second. -.sp -On import the activity check waits a minimum amount of time determined by -\fBzfs_multihost_interval * zfs_multihost_import_intervals\fR, or the same -product computed on the host which last had the pool imported (whichever is -greater). The activity check time may be further extended if the value of mmp -delay found in the best uberblock indicates actual multihost updates happened -at longer intervals than \fBzfs_multihost_interval\fR. A minimum value of -\fB100ms\fR is enforced. -.sp -A value of 0 is ignored and treated as if it was set to 1. -.sp -Default value: \fB20\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_multihost_fail_intervals\fR (uint) -.ad -.RS 12n -Controls the behavior of the pool when multihost write failures or delays are -detected. -.sp -When \fBzfs_multihost_fail_intervals = 0\fR, multihost write failures or delays -are ignored. The failures will still be reported to the ZED which depending on -its configuration may take action such as suspending the pool or offlining a -device. - -.sp -When \fBzfs_multihost_fail_intervals > 0\fR, the pool will be suspended if -\fBzfs_multihost_fail_intervals * zfs_multihost_interval\fR milliseconds pass -without a successful mmp write. This guarantees the activity test will see -mmp writes if the pool is imported. A value of 1 is ignored and treated as -if it was set to 2. This is necessary to prevent the pool from being suspended -due to normal, small I/O latency variations. - -.sp -Default value: \fB10\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_no_scrub_io\fR (int) -.ad -.RS 12n -Set for no scrub I/O. 
This results in scrubs not actually scrubbing data and -simply doing a metadata crawl of the pool instead. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_no_scrub_prefetch\fR (int) -.ad -.RS 12n -Set to disable block prefetching for scrubs. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_nocacheflush\fR (int) -.ad -.RS 12n -Disable cache flush operations on disks when writing. Setting this will -cause pool corruption on power loss if a volatile out-of-order write cache -is enabled. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_nopwrite_enabled\fR (int) -.ad -.RS 12n -Enable NOP writes -.sp -Use \fB1\fR for yes (default) and \fB0\fR to disable. -.RE - -.sp -.ne 2 -.na -\fBzfs_dmu_offset_next_sync\fR (int) -.ad -.RS 12n -Enable forcing txg sync to find holes. When enabled forces ZFS to act -like prior versions when SEEK_HOLE or SEEK_DATA flags are used, which -when a dnode is dirty causes txg's to be synced so that this data can be -found. -.sp -Use \fB1\fR for yes and \fB0\fR to disable (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_pd_bytes_max\fR (int) -.ad -.RS 12n -The number of bytes which should be prefetched during a pool traversal -(eg: \fBzfs send\fR or other data crawling operations) -.sp -Default value: \fB52,428,800\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_per_txg_dirty_frees_percent \fR (ulong) -.ad -.RS 12n -Tunable to control percentage of dirtied indirect blocks from frees allowed -into one TXG. After this threshold is crossed, additional frees will wait until -the next TXG. -A value of zero will disable this throttle. -.sp -Default value: \fB5\fR, set to \fB0\fR to disable. -.RE - -.sp -.ne 2 -.na -\fBzfs_prefetch_disable\fR (int) -.ad -.RS 12n -This tunable disables predictive prefetch. Note that it leaves "prescient" -prefetch (e.g. prefetch for zfs send) intact. 
Unlike predictive prefetch, -prescient prefetch never issues i/os that end up not being needed, so it -can't hurt performance. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_qat_checksum_disable\fR (int) -.ad -.RS 12n -This tunable disables qat hardware acceleration for sha256 checksums. It -may be set after the zfs modules have been loaded to initialize the qat -hardware as long as support is compiled in and the qat driver is present. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_qat_compress_disable\fR (int) -.ad -.RS 12n -This tunable disables qat hardware acceleration for gzip compression. It -may be set after the zfs modules have been loaded to initialize the qat -hardware as long as support is compiled in and the qat driver is present. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_qat_encrypt_disable\fR (int) -.ad -.RS 12n -This tunable disables qat hardware acceleration for AES-GCM encryption. It -may be set after the zfs modules have been loaded to initialize the qat -hardware as long as support is compiled in and the qat driver is present. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_read_chunk_size\fR (long) -.ad -.RS 12n -Bytes to read per chunk -.sp -Default value: \fB1,048,576\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_read_history\fR (int) -.ad -.RS 12n -Historical statistics for the last N reads will be available in -\fB/proc/spl/kstat/zfs//reads\fR -.sp -Default value: \fB0\fR (no data is kept). -.RE - -.sp -.ne 2 -.na -\fBzfs_read_history_hits\fR (int) -.ad -.RS 12n -Include cache hits in read history -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). 
-.RE - -.sp -.ne 2 -.na -\fBzfs_reconstruct_indirect_combinations_max\fR (int) -.ad -.RS 12n -If an indirect split block contains more than this many possible unique -combinations when being reconstructed, consider it too computationally -expensive to check them all. Instead, try at most -\fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected -combinations each time the block is accessed. This allows all segment -copies to participate fairly in the reconstruction when all combinations -cannot be checked and prevents repeated use of one bad copy. -.sp -Default value: \fB4096\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_recover\fR (int) -.ad -.RS 12n -Set to attempt to recover from fatal errors. This should only be used as a -last resort, as it typically results in leaked space, or worse. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_removal_ignore_errors\fR (int) -.ad -.RS 12n -.sp -Ignore hard IO errors during device removal. When set, if a device encounters -a hard IO error during the removal process the removal will not be cancelled. -This can result in a normally recoverable block becoming permanently damaged -and is not recommended. This should only be used as a last resort when the -pool cannot be returned to a healthy state prior to removing the device. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_resilver_min_time_ms\fR (int) -.ad -.RS 12n -Resilvers are processed by the sync thread. While resilvering it will spend -at least this much time working on a resilver between txg flushes. -.sp -Default value: \fB3,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_ignore_errors\fR (int) -.ad -.RS 12n -If set to a nonzero value, remove the DTL (dirty time list) upon -completion of a pool scan (scrub) even if there were unrepairable -errors. It is intended to be used during pool repair or recovery to -stop resilvering when the pool is next imported. -.sp -Default value: \fB0\fR.
-.RE - -.sp -.ne 2 -.na -\fBzfs_scrub_min_time_ms\fR (int) -.ad -.RS 12n -Scrubs are processed by the sync thread. While scrubbing it will spend -at least this much time working on a scrub between txg flushes. -.sp -Default value: \fB1,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_checkpoint_intval\fR (int) -.ad -.RS 12n -To preserve progress across reboots the sequential scan algorithm periodically -needs to stop metadata scanning and issue all the verification I/Os to disk. -The frequency of this flushing is determined by the -\fBzfs_scan_checkpoint_intval\fR tunable. -.sp -Default value: \fB7200\fR seconds (every 2 hours). -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_fill_weight\fR (int) -.ad -.RS 12n -This tunable affects how scrub and resilver I/O segments are ordered. A higher -number indicates that we care more about how filled in a segment is, while a -lower number indicates we care more about the size of the extent without -considering the gaps within a segment. This value is only tunable upon module -insertion. Changing the value afterwards will have no effect on scrub or -resilver performance. -.sp -Default value: \fB3\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_issue_strategy\fR (int) -.ad -.RS 12n -Determines the order that data will be verified while scrubbing or resilvering. -If set to \fB1\fR, data will be verified as sequentially as possible, given the -amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This -may improve scrub performance if the pool's data is very fragmented. If set to -\fB2\fR, the largest mostly-contiguous chunk of found data will be verified -first. By deferring scrubbing of small segments, we may later find adjacent data -to coalesce and increase the segment size. If set to \fB0\fR, zfs will use -strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a -checkpoint. -.sp -Default value: \fB0\fR.
-.RE - -.sp -.ne 2 -.na -\fBzfs_scan_legacy\fR (int) -.ad -.RS 12n -A value of 0 indicates that scrubs and resilvers will gather metadata in -memory before issuing sequential I/O. A value of 1 indicates that the legacy -algorithm will be used where I/O is initiated as soon as it is discovered. -Changing this value to 0 will not affect scrubs or resilvers that are already -in progress. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_max_ext_gap\fR (int) -.ad -.RS 12n -Indicates the largest gap in bytes between scrub / resilver I/Os that will still -be considered sequential for sorting purposes. Changing this value will not -affect scrubs or resilvers that are already in progress. -.sp -Default value: \fB2097152 (2 MB)\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_mem_lim_fact\fR (int) -.ad -.RS 12n -Maximum fraction of RAM used for I/O sorting by sequential scan algorithm. -This tunable determines the hard limit for I/O sorting memory usage. -When the hard limit is reached we stop scanning metadata and start issuing -data verification I/O. This is done until we get below the soft limit. -.sp -Default value: \fB20\fR which is 5% of RAM (1/20). -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_mem_lim_soft_fact\fR (int) -.ad -.RS 12n -The fraction of the hard limit used to determine the soft limit for I/O sorting -by the sequential scan algorithm. When we cross this limit from below no action -is taken. When we cross this limit from above it is because we are issuing -verification I/O. In this case (unless the metadata scan is done) we stop -issuing verification I/O and start scanning metadata again until we get to the -hard limit. -.sp -Default value: \fB20\fR which is 5% of the hard limit (1/20). -.RE - -.sp -.ne 2 -.na -\fBzfs_scan_vdev_limit\fR (int) -.ad -.RS 12n -Maximum amount of data that can be concurrently issued at once for scrubs and -resilvers per leaf device, given in bytes. -.sp -Default value: \fB41943040\fR.
-.RE - -.sp -.ne 2 -.na -\fBzfs_send_corrupt_data\fR (int) -.ad -.RS 12n -Allow sending of corrupt data (ignore read/checksum errors when sending data) -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_send_unmodified_spill_blocks\fR (int) -.ad -.RS 12n -Include unmodified spill blocks in the send stream. Under certain circumstances -previous versions of ZFS could incorrectly remove the spill block from an -existing object. Including unmodified copies of the spill blocks creates a -backwards compatible stream which will recreate a spill block if it was -incorrectly removed. -.sp -Use \fB1\fR for yes (default) and \fB0\fR for no. -.RE - -.sp -.ne 2 -.na -\fBzfs_send_queue_length\fR (int) -.ad -.RS 12n -The maximum number of bytes allowed in the \fBzfs send\fR queue. This value -must be at least twice the maximum block size in use. -.sp -Default value: \fB16,777,216\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_recv_queue_length\fR (int) -.ad -.RS 12n -The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value -must be at least twice the maximum block size in use. -.sp -Default value: \fB16,777,216\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_sync_pass_deferred_free\fR (int) -.ad -.RS 12n -Flushing of data to disk is done in passes. Defer frees starting in this pass -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_spa_discard_memory_limit\fR (int) -.ad -.RS 12n -Maximum memory used for prefetching a checkpoint's space map on each -vdev while discarding the checkpoint. -.sp -Default value: \fB16,777,216\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_special_class_metadata_reserve_pct\fR (int) -.ad -.RS 12n -Only allow small data blocks to be allocated on the special and dedup vdev -types when the available free space percentage on these vdevs exceeds this -value. This ensures reserved space is available for pool meta data as the -special vdevs approach capacity. -.sp -Default value: \fB25\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_sync_pass_dont_compress\fR (int) -.ad -.RS 12n -Don't compress starting in this pass -.sp -Default value: \fB5\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_sync_pass_rewrite\fR (int) -.ad -.RS 12n -Rewrite new block pointers starting in this pass -.sp -Default value: \fB2\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_sync_taskq_batch_pct\fR (int) -.ad -.RS 12n -This controls the number of threads used by the dp_sync_taskq. The default -value of 75% will create a maximum of one thread per cpu. -.sp -Default value: \fB75\fR%. -.RE - -.sp -.ne 2 -.na -\fBzfs_trim_extent_bytes_max\fR (unsigned int) -.ad -.RS 12n -Maximum size of TRIM command. Ranges larger than this will be split in to -chunks no larger than \fBzfs_trim_extent_bytes_max\fR bytes before being -issued to the device. -.sp -Default value: \fB134,217,728\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_trim_extent_bytes_min\fR (unsigned int) -.ad -.RS 12n -Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped -unless they're part of a larger range which was broken in to chunks. This is -done because it's common for these small TRIMs to negatively impact overall -performance. This value can be set to 0 to TRIM all unallocated space. -.sp -Default value: \fB32,768\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_trim_metaslab_skip\fR (unsigned int) -.ad -.RS 12n -Skip uninitialized metaslabs during the TRIM process. This option is useful -for pools constructed from large thinly-provisioned devices where TRIM -operations are slow. As a pool ages an increasing fraction of the pools -metaslabs will be initialized progressively degrading the usefulness of -this option. This setting is stored when starting a manual TRIM and will -persist for the duration of the requested TRIM. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_trim_queue_limit\fR (unsigned int) -.ad -.RS 12n -Maximum number of queued TRIMs outstanding per leaf vdev. 
The number of -concurrent TRIM commands issued to the device is controlled by the -\fBzfs_vdev_trim_min_active\fR and \fBzfs_vdev_trim_max_active\fR module -options. -.sp -Default value: \fB10\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_trim_txg_batch\fR (unsigned int) -.ad -.RS 12n -The number of transaction groups worth of frees which should be aggregated -before TRIM operations are issued to the device. This setting represents a -trade-off between issuing larger, more efficient TRIM operations and the -delay before the recently trimmed space is available for use by the device. -.sp -Increasing this value will allow frees to be aggregated for a longer time. -This will result in larger TRIM operations and potentially increased memory -usage. Decreasing this value will have the opposite effect. The default -value of 32 was determined to be a reasonable compromise. -.sp -Default value: \fB32\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_txg_history\fR (int) -.ad -.RS 12n -Historical statistics for the last N txgs will be available in -\fB/proc/spl/kstat/zfs//txgs\fR -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_txg_timeout\fR (int) -.ad -.RS 12n -Flush dirty data to disk at least every N seconds (maximum txg duration) -.sp -Default value: \fB5\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_aggregate_trim\fR (int) -.ad -.RS 12n -Allow TRIM I/Os to be aggregated. This is normally not helpful because -the extents to be trimmed will have already been aggregated by the -metaslab. This option is provided for debugging and performance analysis. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_aggregation_limit\fR (int) -.ad -.RS 12n -Max vdev I/O aggregation size -.sp -Default value: \fB1,048,576\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_aggregation_limit_non_rotating\fR (int) -.ad -.RS 12n -Max vdev I/O aggregation size for non-rotating media -.sp -Default value: \fB131,072\fR.
-.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_cache_bshift\fR (int) -.ad -.RS 12n -Shift size to inflate reads to -.sp -Default value: \fB16\fR (effectively 65536). -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_cache_max\fR (int) -.ad -.RS 12n -Inflate reads smaller than this value to meet the \fBzfs_vdev_cache_bshift\fR -size (default 64k). -.sp -Default value: \fB16384\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_cache_size\fR (int) -.ad -.RS 12n -Total size of the per-disk cache in bytes. -.sp -Currently this feature is disabled as it has been found to not be helpful -for performance and in some cases harmful. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_mirror_rotating_inc\fR (int) -.ad -.RS 12n -A number by which the balancing algorithm increments the load calculation for -the purpose of selecting the least busy mirror member when an I/O immediately -follows its predecessor on rotational vdevs for the purpose of making decisions -based on load. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_mirror_rotating_seek_inc\fR (int) -.ad -.RS 12n -A number by which the balancing algorithm increments the load calculation for -the purpose of selecting the least busy mirror member when an I/O lacks -locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within -this that are not immediately following the previous I/O are incremented by -half. -.sp -Default value: \fB5\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_mirror_rotating_seek_offset\fR (int) -.ad -.RS 12n -The maximum distance for the last queued I/O in which the balancing algorithm -considers an I/O to have locality. -See the section "ZFS I/O SCHEDULER". -.sp -Default value: \fB1048576\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_mirror_non_rotating_inc\fR (int) -.ad -.RS 12n -A number by which the balancing algorithm increments the load calculation for -the purpose of selecting the least busy mirror member on non-rotational vdevs -when I/Os do not immediately follow one another.
-.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int) -.ad -.RS 12n -A number by which the balancing algorithm increments the load calculation for -the purpose of selecting the least busy mirror member when an I/O lacks -locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within -this that are not immediately following the previous I/O are incremented by -half. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_read_gap_limit\fR (int) -.ad -.RS 12n -Aggregate read I/O operations if the gap on-disk between them is within this -threshold. -.sp -Default value: \fB32,768\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_scheduler\fR (charp) -.ad -.RS 12n -Set the Linux I/O scheduler on whole disk vdevs to this scheduler. Valid options -are noop, cfq, bfq & deadline -.sp -Default value: \fBnoop\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_write_gap_limit\fR (int) -.ad -.RS 12n -Aggregate write I/O over gap -.sp -Default value: \fB4,096\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_vdev_raidz_impl\fR (string) -.ad -.RS 12n -Parameter for selecting raidz parity implementation to use. - -Options marked (always) below may be selected on module load as they are -supported on all systems. -The remaining options may only be set after the module is loaded, as they -are available only if the implementations are compiled in and supported -on the running system. - -Once the module is loaded, the content of -/sys/module/zfs/parameters/zfs_vdev_raidz_impl will show available options -with the currently selected one enclosed in []. 
-Possible options are: - fastest - (always) implementation selected using built-in benchmark - original - (always) original raidz implementation - scalar - (always) scalar raidz implementation - sse2 - implementation using SSE2 instruction set (64bit x86 only) - ssse3 - implementation using SSSE3 instruction set (64bit x86 only) - avx2 - implementation using AVX2 instruction set (64bit x86 only) - avx512f - implementation using AVX512F instruction set (64bit x86 only) - avx512bw - implementation using AVX512F & AVX512BW instruction sets (64bit x86 only) - aarch64_neon - implementation using NEON (Aarch64/64 bit ARMv8 only) - aarch64_neonx2 - implementation using NEON with more unrolling (Aarch64/64 bit ARMv8 only) -.sp -Default value: \fBfastest\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_zevent_cols\fR (int) -.ad -.RS 12n -When zevents are logged to the console use this as the word wrap width. -.sp -Default value: \fB80\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_zevent_console\fR (int) -.ad -.RS 12n -Log events to the console -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzfs_zevent_len_max\fR (int) -.ad -.RS 12n -Max event queue length. A value of 0 will result in a calculated value which -increases with the number of CPUs in the system (minimum 64 events). Events -in the queue can be viewed with the \fBzpool events\fR command. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_zil_clean_taskq_maxalloc\fR (int) -.ad -.RS 12n -The maximum number of taskq entries that are allowed to be cached. When this -limit is exceeded transaction records (itxs) will be cleaned synchronously. -.sp -Default value: \fB1048576\fR. -.RE - -.sp -.ne 2 -.na -\fBzfs_zil_clean_taskq_minalloc\fR (int) -.ad -.RS 12n -The number of taskq entries that are pre-populated when the taskq is first -created and are immediately available for use. -.sp -Default value: \fB1024\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzfs_zil_clean_taskq_nthr_pct\fR (int) -.ad -.RS 12n -This controls the number of threads used by the dp_zil_clean_taskq. The default -value of 100% will create a maximum of one thread per cpu. -.sp -Default value: \fB100\fR%. -.RE - -.sp -.ne 2 -.na -\fBzil_nocacheflush\fR (int) -.ad -.RS 12n -Disable the cache flush commands that are normally sent to the disk(s) by -the ZIL after an LWB write has completed. Setting this will cause ZIL -corruption on power loss if a volatile out-of-order write cache is enabled. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzil_replay_disable\fR (int) -.ad -.RS 12n -Disable intent logging replay. Can be disabled for recovery from corrupted -ZIL -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzil_slog_bulk\fR (ulong) -.ad -.RS 12n -Limit SLOG write size per commit executed with synchronous priority. -Any writes above that will be executed with lower (asynchronous) priority -to limit potential SLOG device abuse by single active ZIL writer. -.sp -Default value: \fB786,432\fR. -.RE - -.sp -.ne 2 -.na -\fBzio_deadman_log_all\fR (int) -.ad -.RS 12n -If non-zero, the zio deadman will produce debugging messages (see -\fBzfs_dbgmsg_enable\fR) for all zios, rather than only for leaf -zios possessing a vdev. This is meant to be used by developers to gain -diagnostic information for hang conditions which don't involve a mutex -or other locking primitive; typically conditions in which a thread in -the zio pipeline is looping indefinitely. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzio_decompress_fail_fraction\fR (int) -.ad -.RS 12n -If non-zero, this value represents the denominator of the probability that zfs -should induce a decompression failure. For instance, for a 5% decompression -failure rate, this value should be set to 20. -.sp -Default value: \fB0\fR. 
-.RE - -.sp -.ne 2 -.na -\fBzio_slow_io_ms\fR (int) -.ad -.RS 12n -When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to -complete it is marked as a slow I/O. Each slow I/O causes a delay zevent. Slow -I/O counters can be seen with "zpool status -s". - -.sp -Default value: \fB30,000\fR. -.RE - -.sp -.ne 2 -.na -\fBzio_dva_throttle_enabled\fR (int) -.ad -.RS 12n -Throttle block allocations in the I/O pipeline. This allows for -dynamic allocation distribution when devices are imbalanced. -When enabled, the maximum number of pending allocations per top-level vdev -is limited by \fBzfs_vdev_queue_depth_pct\fR. -.sp -Default value: \fB1\fR. -.RE - -.sp -.ne 2 -.na -\fBzio_requeue_io_start_cut_in_line\fR (int) -.ad -.RS 12n -Prioritize requeued I/O -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzio_taskq_batch_pct\fR (uint) -.ad -.RS 12n -Percentage of online CPUs (or CPU cores, etc) which will run a worker thread -for I/O. These workers are responsible for I/O work such as compression and -checksum calculations. Fractional number of CPUs will be rounded down. -.sp -The default value of 75 was chosen to avoid using all CPUs which can result in -latency issues and inconsistent application performance, especially when high -compression is enabled. -.sp -Default value: \fB75\fR. -.RE - -.sp -.ne 2 -.na -\fBzvol_inhibit_dev\fR (uint) -.ad -.RS 12n -Do not create zvol device nodes. This may slightly improve startup time on -systems with a very large number of zvols. -.sp -Use \fB1\fR for yes and \fB0\fR for no (default). -.RE - -.sp -.ne 2 -.na -\fBzvol_major\fR (uint) -.ad -.RS 12n -Major number for zvol block devices -.sp -Default value: \fB230\fR. -.RE - -.sp -.ne 2 -.na -\fBzvol_max_discard_blocks\fR (ulong) -.ad -.RS 12n -Discard (aka TRIM) operations done on zvols will be done in batches of this -many blocks, where block size is determined by the \fBvolblocksize\fR property -of a zvol. -.sp -Default value: \fB16,384\fR.
-.RE - -.sp -.ne 2 -.na -\fBzvol_prefetch_bytes\fR (uint) -.ad -.RS 12n -When adding a zvol to the system prefetch \fBzvol_prefetch_bytes\fR -from the start and end of the volume. Prefetching these regions -of the volume is desirable because they are likely to be accessed -immediately by \fBblkid(8)\fR or by the kernel scanning for a partition -table. -.sp -Default value: \fB131,072\fR. -.RE - -.sp -.ne 2 -.na -\fBzvol_request_sync\fR (uint) -.ad -.RS 12n -When processing I/O requests for a zvol submit them synchronously. This -effectively limits the queue depth to 1 for each I/O submitter. When set -to 0 requests are handled asynchronously by a thread pool. The number of -requests which can be handled concurrently is controlled by \fBzvol_threads\fR. -.sp -Default value: \fB0\fR. -.RE - -.sp -.ne 2 -.na -\fBzvol_threads\fR (uint) -.ad -.RS 12n -Max number of threads which can handle zvol I/O requests concurrently. -.sp -Default value: \fB32\fR. -.RE - -.sp -.ne 2 -.na -\fBzvol_volmode\fR (uint) -.ad -.RS 12n -Defines zvol block devices behaviour when \fBvolmode\fR is set to \fBdefault\fR. -Valid values are \fB1\fR (full), \fB2\fR (dev) and \fB3\fR (none). -.sp -Default value: \fB1\fR. -.RE - -.SH ZFS I/O SCHEDULER -ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os. -The I/O scheduler determines when and in what order those operations are -issued. The I/O scheduler divides operations into five I/O classes -prioritized in the following order: sync read, sync write, async read, -async write, and scrub/resilver. Each queue defines the minimum and -maximum number of concurrent operations that may be issued to the -device. In addition, the device has an aggregate maximum, -\fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums -must not exceed the aggregate maximum.
If the sum of the per-queue -maximums exceeds the aggregate maximum, then the number of active I/Os -may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will -be issued regardless of whether all per-queue minimums have been met. -.sp -For many physical devices, throughput increases with the number of -concurrent operations, but latency typically suffers. Further, physical -devices typically have a limit at which more concurrent operations have no -effect on throughput or can actually cause it to decrease. -.sp -The scheduler selects the next operation to issue by first looking for an -I/O class whose minimum has not been satisfied. Once all are satisfied and -the aggregate maximum has not been hit, the scheduler looks for classes -whose maximum has not been satisfied. Iteration through the I/O classes is -done in the order specified above. No further operations are issued if the -aggregate maximum number of concurrent operations has been hit or if there -are no operations queued for an I/O class that has not hit its maximum. -Every time an I/O is queued or an operation completes, the I/O scheduler -looks for new operations to issue. -.sp -In general, smaller max_active's will lead to lower latency of synchronous -operations. Larger max_active's may lead to higher overall throughput, -depending on underlying storage. -.sp -The ratio of the queues' max_actives determines the balance of performance -between reads, writes, and scrubs. E.g., increasing -\fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete -more quickly, but reads and writes to have higher latency and lower throughput. -.sp -All I/O classes have a fixed maximum number of outstanding operations -except for the async write class. Asynchronous writes represent the data -that is committed to stable storage during the syncing stage for -transaction groups. 
Transaction groups enter the syncing state -periodically so the number of queued async writes will quickly burst up -and then bleed down to zero. Rather than servicing them as quickly as -possible, the I/O scheduler changes the maximum number of active async -write I/Os according to the amount of dirty data in the pool. Since -both throughput and latency typically increase with the number of -concurrent operations issued to physical devices, reducing the -burstiness in the number of concurrent operations also stabilizes the -response time of operations from other -- and in particular synchronous --- queues. In broad strokes, the I/O scheduler will issue more -concurrent operations from the async write queue as there's more dirty -data in the pool. -.sp -Async Writes -.sp -The number of concurrent operations issued for the async write I/O class -follows a piece-wise linear function defined by a few adjustable points. -.nf - - | o---------| <-- zfs_vdev_async_write_max_active - ^ | /^ | - | | / | | -active | / | | - I/O | / | | -count | / | | - | / | | - |-------o | | <-- zfs_vdev_async_write_min_active - 0|_______^______|_________| - 0% | | 100% of zfs_dirty_data_max - | | - | `-- zfs_vdev_async_write_active_max_dirty_percent - `--------- zfs_vdev_async_write_active_min_dirty_percent - -.fi -Until the amount of dirty data exceeds a minimum percentage of the dirty -data allowed in the pool, the I/O scheduler will limit the number of -concurrent operations to the minimum. As that threshold is crossed, the -number of concurrent operations issued increases linearly to the maximum at -the specified maximum percentage of the dirty data allowed in the pool. -.sp -Ideally, the amount of dirty data on a busy pool will stay in the sloped -part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR -and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. 
If it exceeds the -maximum percentage, this indicates that the rate of incoming data is -greater than the rate that the backend storage can handle. In this case, we -must further throttle incoming writes, as described in the next section. - -.SH ZFS TRANSACTION DELAY -We delay transactions when we've determined that the backend storage -isn't able to accommodate the rate of incoming writes. -.sp -If there is already a transaction waiting, we delay relative to when -that transaction will finish waiting. This way the calculated delay time -is independent of the number of threads concurrently executing -transactions. -.sp -If we are the only waiter, wait relative to when the transaction -started, rather than the current time. This credits the transaction for -"time already served", e.g. reading indirect blocks. -.sp -The minimum time for a transaction to take is calculated as: -.nf - min_time = zfs_delay_scale * (dirty - min) / (max - dirty) - min_time is then capped at 100 milliseconds. -.fi -.sp -The delay has two degrees of freedom that can be adjusted via tunables. The -percentage of dirty data at which we start to delay is defined by -\fBzfs_delay_min_dirty_percent\fR. This should typically be at or above -\fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to -delay after writing at full speed has failed to keep up with the incoming write -rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking, -this variable determines the amount of delay at the midpoint of the curve. 
-.sp -.nf -delay - 10ms +-------------------------------------------------------------*+ - | *| - 9ms + *+ - | *| - 8ms + *+ - | * | - 7ms + * + - | * | - 6ms + * + - | * | - 5ms + * + - | * | - 4ms + * + - | * | - 3ms + * + - | * | - 2ms + (midpoint) * + - | | ** | - 1ms + v *** + - | zfs_delay_scale ----------> ******** | - 0 +-------------------------------------*********----------------+ - 0% <- zfs_dirty_data_max -> 100% -.fi -.sp -Note that since the delay is added to the outstanding time remaining on the -most recent transaction, the delay is effectively the inverse of IOPS. -Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve -was chosen such that small changes in the amount of accumulated dirty data -in the first 3/4 of the curve yield relatively small differences in the -amount of delay. -.sp -The effects can be easier to understand when the amount of delay is -represented on a log scale: -.sp -.nf -delay -100ms +-------------------------------------------------------------++ - + + - | | - + *+ - 10ms + *+ - + ** + - | (midpoint) ** | - + | ** + - 1ms + v **** + - + zfs_delay_scale ----------> ***** + - | **** | - + **** + -100us + ** + - + * + - | * | - + * + - 10us + * + - + + - | | - + + - +--------------------------------------------------------------+ - 0% <- zfs_dirty_data_max -> 100% -.fi -.sp -Note here that only as the amount of dirty data approaches its limit does -the delay start to increase rapidly. The goal of a properly tuned system -should be to keep the amount of dirty data out of that range by first -ensuring that the appropriate limits are set for the I/O scheduler to reach -optimal throughput on the backend storage, and then by changing the value -of \fBzfs_delay_scale\fR to increase the steepness of the curve. 
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 deleted file mode 100644 index 2534d3d20e..0000000000 --- a/man/man5/zpool-features.5 +++ /dev/null @@ -1,827 +0,0 @@ -'\" te -.\" Copyright (c) 2013, 2017 by Delphix. All rights reserved. -.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. -.\" The contents of this file are subject to the terms of the Common Development -.\" and Distribution License (the "License"). You may not use this file except -.\" in compliance with the License. You can obtain a copy of the license at -.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -.\" -.\" See the License for the specific language governing permissions and -.\" limitations under the License. When distributing Covered Code, include this -.\" CDDL HEADER in each file and include the License file at -.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this -.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your -.\" own identifying information: -.\" Portions Copyright [yyyy] [name of copyright owner] -.TH ZPOOL-FEATURES 5 "Jun 8, 2018" -.SH NAME -zpool\-features \- ZFS pool feature descriptions -.SH DESCRIPTION -.sp -.LP -ZFS pool on\-disk format versions are specified via "features" which replace -the old on\-disk format numbers (the last supported on\-disk format number is -28). To enable a feature on a pool use the \fBupgrade\fR subcommand of the -zpool(8) command, or set the \fBfeature@\fR\fIfeature_name\fR property -to \fBenabled\fR. -.sp -.LP -The pool format does not affect file system version compatibility or the ability -to send file systems between pools. -.sp -.LP -Since most features can be enabled independently of each other the on\-disk -format of the pool is specified by the set of all features marked as -\fBactive\fR on the pool. 
If the pool was created by another software version -this set may include unsupported features. -.SS "Identifying features" -.sp -.LP -Every feature has a GUID of the form \fIcom.example:feature_name\fR. The -reverse DNS name ensures that the feature's GUID is unique across all ZFS -implementations. When unsupported features are encountered on a pool they will -be identified by their GUIDs. Refer to the documentation for the ZFS -implementation that created the pool for information about those features. -.sp -.LP -Each supported feature also has a short name. By convention a feature's short -name is the portion of its GUID which follows the ':' (e.g. -\fIcom.example:feature_name\fR would have the short name \fIfeature_name\fR), -however a feature's short name may differ across ZFS implementations if -following the convention would result in name conflicts. -.SS "Feature states" -.sp -.LP -Features can be in one of three states: -.sp -.ne 2 -.na -\fBactive\fR -.ad -.RS 12n -This feature's on\-disk format changes are in effect on the pool. Support for -this feature is required to import the pool in read\-write mode. If this -feature is not read-only compatible, support is also required to import the pool -in read\-only mode (see "Read\-only compatibility"). -.RE - -.sp -.ne 2 -.na -\fBenabled\fR -.ad -.RS 12n -An administrator has marked this feature as enabled on the pool, but the -feature's on\-disk format changes have not been made yet. The pool can still be -imported by software that does not support this feature, but changes may be made -to the on\-disk format at any time which will move the feature to the -\fBactive\fR state. Some features may support returning to the \fBenabled\fR -state after becoming \fBactive\fR. See feature\-specific documentation for -details. 
-.RE - -.sp -.ne 2 -.na -\fBdisabled\fR -.ad -.RS 12n -This feature's on\-disk format changes have not been made and will not be made -unless an administrator moves the feature to the \fBenabled\fR state. Features -cannot be disabled once they have been enabled. -.RE - -.sp -.LP -The state of supported features is exposed through pool properties of the form -\fIfeature@short_name\fR. -.SS "Read\-only compatibility" -.sp -.LP -Some features may make on\-disk format changes that do not interfere with other -software's ability to read from the pool. These features are referred to as -"read\-only compatible". If all unsupported features on a pool are read\-only -compatible, the pool can be imported in read\-only mode by setting the -\fBreadonly\fR property during import (see zpool(8) for details on -importing pools). -.SS "Unsupported features" -.sp -.LP -For each unsupported feature enabled on an imported pool a pool property -named \fIunsupported@feature_name\fR will indicate why the import was allowed -despite the unsupported feature. Possible values for this property are: - -.sp -.ne 2 -.na -\fBinactive\fR -.ad -.RS 12n -The feature is in the \fBenabled\fR state and therefore the pool's on\-disk -format is still compatible with software that does not support this feature. -.RE - -.sp -.ne 2 -.na -\fBreadonly\fR -.ad -.RS 12n -The feature is read\-only compatible and the pool has been imported in -read\-only mode. -.RE - -.SS "Feature dependencies" -.sp -.LP -Some features depend on other features being enabled in order to function -properly. Enabling a feature will automatically enable any features it -depends on. -.SH FEATURES -.sp -.LP -The following features are supported on this system: - -.sp -.ne 2 -.na -\fBallocation_classes\fR -.ad -.RS 4n -.TS -l l . -GUID org.zfsonlinux:allocation_classes -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -This feature enables support for separate allocation classes. 
- -This feature becomes \fBactive\fR when a dedicated allocation class vdev -(dedup or special) is created with the \fBzpool create\fR or \fBzpool add\fR -subcommands. With device removal, it can be returned to the \fBenabled\fR -state if all the dedicated allocation class vdevs are removed. -.RE - -.sp -.ne 2 -.na -\fBasync_destroy\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:async_destroy -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -Destroying a file system requires traversing all of its data in order to -return its used space to the pool. Without \fBasync_destroy\fR the file system -is not fully removed until all space has been reclaimed. If the destroy -operation is interrupted by a reboot or power outage the next attempt to open -the pool will need to complete the destroy operation synchronously. - -When \fBasync_destroy\fR is enabled the file system's data will be reclaimed -by a background process, allowing the destroy operation to complete without -traversing the entire file system. The background process is able to resume -interrupted destroys after the pool has been opened, eliminating the need -to finish interrupted destroys as part of the open operation. The amount -of space remaining to be reclaimed by the background process is available -through the \fBfreeing\fR property. - -This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero. -.RE - -.sp -.ne 2 -.na -\fBbookmarks\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:bookmarks -READ\-ONLY COMPATIBLE yes -DEPENDENCIES extensible_dataset -.TE - -This feature enables use of the \fBzfs bookmark\fR subcommand. - -This feature is \fBactive\fR while any bookmarks exist in the pool. -All bookmarks in the pool can be listed by running -\fBzfs list -t bookmark -r \fIpoolname\fR\fR. -.RE - -.sp -.ne 2 -.na -\fBbookmark_v2\fR -.ad -.RS 4n -.TS -l l . 
-GUID com.datto:bookmark_v2 -READ\-ONLY COMPATIBLE no -DEPENDENCIES bookmark, extensible_dataset -.TE - -This feature enables the creation and management of larger bookmarks which are -needed for other features in ZFS. - -This feature becomes \fBactive\fR when a v2 bookmark is created and will be -returned to the \fBenabled\fR state when all v2 bookmarks are destroyed. -.RE - -.sp -.ne 2 -.na -\fBdevice_removal\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:device_removal -READ\-ONLY COMPATIBLE no -DEPENDENCIES none -.TE - -This feature enables the \fBzpool remove\fR subcommand to remove top-level -vdevs, evacuating them to reduce the total size of the pool. - -This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is used -on a top-level vdev, and will never return to being \fBenabled\fR. -.RE - -.sp -.ne 2 -.na -\fBedonr\fR -.ad -.RS 4n -.TS -l l . -GUID org.illumos:edonr -READ\-ONLY COMPATIBLE no -DEPENDENCIES extensible_dataset -.TE - -This feature enables the use of the Edon-R hash algorithm for checksum, -including for nopwrite (if compression is also enabled, an overwrite of -a block whose checksum matches the data being written will be ignored). -In an abundance of caution, Edon-R requires verification when used with -dedup: \fBzfs set dedup=edonr,verify\fR. See \fBzfs\fR(8). - -Edon-R is a very high-performance hash algorithm that was part -of the NIST SHA-3 competition. It provides extremely high hash -performance (over 350% faster than SHA-256), but was not selected -because of its unsuitability as a general purpose secure hash algorithm. -This implementation utilizes the new salted checksumming functionality -in ZFS, which means that the checksum is pre-seeded with a secret -256-bit random key (stored on the pool) before being fed the data block -to be checksummed. Thus the produced checksums are unique to a given -pool. 
- -When the \fBedonr\fR feature is set to \fBenabled\fR, the administrator -can turn on the \fBedonr\fR checksum on any dataset using the -\fBzfs set checksum=edonr\fR. See zfs(8). This feature becomes -\fBactive\fR once a \fBchecksum\fR property has been set to \fBedonr\fR, -and will return to being \fBenabled\fR once all filesystems that have -ever had their checksum set to \fBedonr\fR are destroyed. - -The \fBedonr\fR feature is not supported by GRUB and must not be used on -the pool if GRUB needs to access the pool (e.g. for /boot). -.RE - -.sp -.ne 2 -.na -\fBembedded_data\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:embedded_data -READ\-ONLY COMPATIBLE no -DEPENDENCIES none -.TE - -This feature improves the performance and compression ratio of -highly-compressible blocks. Blocks whose contents can compress to 112 bytes -or smaller can take advantage of this feature. - -When this feature is enabled, the contents of highly-compressible blocks are -stored in the block "pointer" itself (a misnomer in this case, as it contains -the compressed data, rather than a pointer to its location on disk). Thus -the space of the block (one sector, typically 512 bytes or 4KB) is saved, -and no additional i/o is needed to read and write the data block. - -This feature becomes \fBactive\fR as soon as it is enabled and will -never return to being \fBenabled\fR. -.RE - -.sp -.ne 2 -.na -\fBempty_bpobj\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:empty_bpobj -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -This feature increases the performance of creating and using a large -number of snapshots of a single filesystem or volume, and also reduces -the disk space required. - -When there are many snapshots, each snapshot uses many Block Pointer -Objects (bpobj's) to track blocks associated with that snapshot. -However, in common use cases, most of these bpobj's are empty. This -feature allows us to create each bpobj on-demand, thus eliminating the -empty bpobjs. 
- -This feature is \fBactive\fR while there are any filesystems, volumes, -or snapshots which were created after enabling this feature. -.RE - -.sp -.ne 2 -.na -\fBenabled_txg\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:enabled_txg -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -Once this feature is enabled ZFS records the transaction group number -in which new features are enabled. This has no user-visible impact, -but other features may depend on this feature. - -This feature becomes \fBactive\fR as soon as it is enabled and will -never return to being \fBenabled\fB. -.RE - -.sp -.ne 2 -.na -\fBencryption\fR -.ad -.RS 4n -.TS -l l . -GUID com.datto:encryption -READ\-ONLY COMPATIBLE no -DEPENDENCIES bookmark_v2, extensible_dataset -.TE - -This feature enables the creation and management of natively encrypted datasets. - -This feature becomes \fBactive\fR when an encrypted dataset is created and will -be returned to the \fBenabled\fR state when all datasets that use this feature -are destroyed. -.RE - -.sp -.ne 2 -.na -\fBextensible_dataset\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:extensible_dataset -READ\-ONLY COMPATIBLE no -DEPENDENCIES none -.TE - -This feature allows more flexible use of internal ZFS data structures, -and exists for other features to depend on. - -This feature will be \fBactive\fR when the first dependent feature uses it, -and will be returned to the \fBenabled\fR state when all datasets that use -this feature are destroyed. -.RE - -.sp -.ne 2 -.na -\fBfilesystem_limits\fR -.ad -.RS 4n -.TS -l l . -GUID com.joyent:filesystem_limits -READ\-ONLY COMPATIBLE yes -DEPENDENCIES extensible_dataset -.TE - -This feature enables filesystem and snapshot limits. These limits can be used -to control how many filesystems and/or snapshots can be created at the point in -the tree on which the limits are set. - -This feature is \fBactive\fR once either of the limit properties has been -set on a dataset. 
Once activated the feature is never deactivated. -.RE - -.sp -.ne 2 -.na -\fBhole_birth\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:hole_birth -READ\-ONLY COMPATIBLE no -DEPENDENCIES enabled_txg -.TE - -This feature has/had bugs, the result of which is that, if you do a -\fBzfs send -i\fR (or \fB-R\fR, since it uses \fB-i\fR) from an affected -dataset, the receiver will not see any checksum or other errors, but the -resulting destination snapshot will not match the source. Its use by -\fBzfs send -i\fR has been disabled by default. See the -\fBsend_holes_without_birth_time\fR module parameter in -zfs-module-parameters(5). - -This feature improves performance of incremental sends (\fBzfs send -i\fR) -and receives for objects with many holes. The most common case of -hole-filled objects is zvols. - -An incremental send stream from snapshot \fBA\fR to snapshot \fBB\fR -contains information about every block that changed between \fBA\fR and -\fBB\fR. Blocks which did not change between those snapshots can be -identified and omitted from the stream using a piece of metadata called -the 'block birth time', but birth times are not recorded for holes (blocks -filled only with zeroes). Since holes created after \fBA\fR cannot be -distinguished from holes created before \fBA\fR, information about every -hole in the entire filesystem or zvol is included in the send stream. - -For workloads where holes are rare this is not a problem. However, when -incrementally replicating filesystems or zvols with many holes (for -example a zvol formatted with another filesystem) a lot of time will -be spent sending and receiving unnecessary information about holes that -already exist on the receiving side. - -Once the \fBhole_birth\fR feature has been enabled the block birth times -of all new holes will be recorded. 
Incremental sends between snapshots -created after this feature is enabled will use this new metadata to avoid -sending information about holes that already exist on the receiving side. - -This feature becomes \fBactive\fR as soon as it is enabled and will -never return to being \fBenabled\fB. -.RE - -.sp -.ne 2 -.na -\fBlarge_blocks\fR -.ad -.RS 4n -.TS -l l . -GUID org.open-zfs:large_blocks -READ\-ONLY COMPATIBLE no -DEPENDENCIES extensible_dataset -.TE - -The \fBlarge_block\fR feature allows the record size on a dataset to be -set larger than 128KB. - -This feature becomes \fBactive\fR once a dataset contains a file with -a block size larger than 128KB, and will return to being \fBenabled\fR once all -filesystems that have ever had their recordsize larger than 128KB are destroyed. -.RE - -.sp -.ne 2 -.na -\fBlarge_dnode\fR -.ad -.RS 4n -.TS -l l . -GUID org.zfsonlinux:large_dnode -READ\-ONLY COMPATIBLE no -DEPENDENCIES extensible_dataset -.TE - -The \fBlarge_dnode\fR feature allows the size of dnodes in a dataset to be -set larger than 512B. - -This feature becomes \fBactive\fR once a dataset contains an object with -a dnode larger than 512B, which occurs as a result of setting the -\fBdnodesize\fR dataset property to a value other than \fBlegacy\fR. The -feature will return to being \fBenabled\fR once all filesystems that -have ever contained a dnode larger than 512B are destroyed. Large dnodes -allow more data to be stored in the bonus buffer, thus potentially -improving performance by avoiding the use of spill blocks. -.RE - -.sp -.ne 2 -.na -\fBlz4_compress\fR -.ad -.RS 4n -.TS -l l . -GUID org.illumos:lz4_compress -READ\-ONLY COMPATIBLE no -DEPENDENCIES none -.TE - -\fBlz4\fR is a high-performance real-time compression algorithm that -features significantly faster compression and decompression as well as a -higher compression ratio than the older \fBlzjb\fR compression. 
-Typically, \fBlz4\fR compression is approximately 50% faster on -compressible data and 200% faster on incompressible data than -\fBlzjb\fR. It is also approximately 80% faster on decompression, while -giving approximately 10% better compression ratio. - -When the \fBlz4_compress\fR feature is set to \fBenabled\fR, the -administrator can turn on \fBlz4\fR compression on any dataset on the -pool using the zfs(8) command. Please note that doing so will -immediately activate the \fBlz4_compress\fR feature on the underlying -pool using the zfs(8) command. Also, all newly written metadata -will be compressed with \fBlz4\fR algorithm. Since this feature is not -read-only compatible, this operation will render the pool unimportable -on systems without support for the \fBlz4_compress\fR feature. - -Booting off of \fBlz4\fR-compressed root pools is supported. - -This feature becomes \fBactive\fR as soon as it is enabled and will -never return to being \fBenabled\fB. -.RE - -.sp -.ne 2 -.na -\fBmulti_vdev_crash_dump\fR -.ad -.RS 4n -.TS -l l . -GUID com.joyent:multi_vdev_crash_dump -READ\-ONLY COMPATIBLE no -DEPENDENCIES none -.TE - -This feature allows a dump device to be configured with a pool comprised -of multiple vdevs. Those vdevs may be arranged in any mirrored or raidz -configuration. - -When the \fBmulti_vdev_crash_dump\fR feature is set to \fBenabled\fR, -the administrator can use the \fBdumpadm\fR(1M) command to configure a -dump device on a pool comprised of multiple vdevs. - -Under Linux this feature is registered for compatibility but not used. -New pools created under Linux will have the feature \fBenabled\fR but -will never transition to \fB\fBactive\fR. This functionality is not -required in order to support crash dumps under Linux. Existing pools -where this feature is \fB\fBactive\fR can be imported. -.RE - -.sp -.ne 2 -.na -\fBobsolete_counts\fR -.ad -.RS 4n -.TS -l l . 
-GUID com.delphix:obsolete_counts -READ\-ONLY COMPATIBLE yes -DEPENDENCIES device_removal -.TE - -This feature is an enhancement of device_removal, which will over time -reduce the memory used to track removed devices. When indirect blocks -are freed or remapped, we note that their part of the indirect mapping -is "obsolete", i.e. no longer needed. - -This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is -used on a top-level vdev, and will never return to being \fBenabled\fR. -.RE - -.sp -.ne 2 -.na -\fBproject_quota\fR -.ad -.RS 4n -.TS -l l . -GUID org.zfsonlinux:project_quota -READ\-ONLY COMPATIBLE yes -DEPENDENCIES extensible_dataset -.TE - -This feature allows administrators to account the spaces and objects usage -information against the project identifier (ID). - -The project ID is new object-based attribute. When upgrading an existing -filesystem, object without project ID attribute will be assigned a zero -project ID. After this feature is enabled, newly created object will inherit -its parent directory's project ID if the parent inherit flag is set (via -\fBchattr +/-P\fR or \fBzfs project [-s|-C]\fR). Otherwise, the new object's -project ID will be set as zero. An object's project ID can be changed at -anytime by the owner (or privileged user) via \fBchattr -p $prjid\fR or -\fBzfs project -p $prjid\fR. - -This feature will become \fBactive\fR as soon as it is enabled and will never -return to being \fBdisabled\fR. Each filesystem will be upgraded automatically -when remounted or when new file is created under that filesystem. The upgrade -can also be triggered on filesystems via `zfs set version=current `. -The upgrade process runs in the background and may take a while to complete -for the filesystems containing a large number of files. -.RE - -.sp -.ne 2 -.na -\fBresilver_defer\fR -.ad -.RS 4n -.TS -l l . 
-GUID com.datto:resilver_defer -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -This feature allows zfs to postpone new resilvers if an existing one is already -in progress. Without this feature, any new resilvers will cause the currently -running one to be immediately restarted from the beginning. - -This feature becomes \fBactive\fR once a resilver has been deferred, and -returns to being \fBenabled\fR when the deferred resilver begins. -.RE - -.sp -.ne 2 -.na -\fBsha512\fR -.ad -.RS 4n -.TS -l l . -GUID org.illumos:sha512 -READ\-ONLY COMPATIBLE no -DEPENDENCIES extensible_dataset -.TE - -This feature enables the use of the SHA-512/256 truncated hash algorithm -(FIPS 180-4) for checksum and dedup. The native 64-bit arithmetic of -SHA-512 provides an approximate 50% performance boost over SHA-256 on -64-bit hardware and is thus a good minimum-change replacement candidate -for systems where hash performance is important, but these systems -cannot for whatever reason utilize the faster \fBskein\fR and -\fBedonr\fR algorithms. - -When the \fBsha512\fR feature is set to \fBenabled\fR, the administrator -can turn on the \fBsha512\fR checksum on any dataset using -\fBzfs set checksum=sha512\fR. See zfs(8). This feature becomes -\fBactive\fR once a \fBchecksum\fR property has been set to \fBsha512\fR, -and will return to being \fBenabled\fR once all filesystems that have -ever had their checksum set to \fBsha512\fR are destroyed. - -The \fBsha512\fR feature is not supported by GRUB and must not be used on -the pool if GRUB needs to access the pool (e.g. for /boot). -.RE - -.sp -.ne 2 -.na -\fBskein\fR -.ad -.RS 4n -.TS -l l . -GUID org.illumos:skein -READ\-ONLY COMPATIBLE no -DEPENDENCIES extensible_dataset -.TE - -This feature enables the use of the Skein hash algorithm for checksum -and dedup. Skein is a high-performance secure hash algorithm that was a -finalist in the NIST SHA-3 competition. 
It provides a very high security -margin and high performance on 64-bit hardware (80% faster than -SHA-256). This implementation also utilizes the new salted checksumming -functionality in ZFS, which means that the checksum is pre-seeded with a -secret 256-bit random key (stored on the pool) before being fed the data -block to be checksummed. Thus the produced checksums are unique to a -given pool, preventing hash collision attacks on systems with dedup. - -When the \fBskein\fR feature is set to \fBenabled\fR, the administrator -can turn on the \fBskein\fR checksum on any dataset using -\fBzfs set checksum=skein\fR. See zfs(8). This feature becomes -\fBactive\fR once a \fBchecksum\fR property has been set to \fBskein\fR, -and will return to being \fBenabled\fR once all filesystems that have -ever had their checksum set to \fBskein\fR are destroyed. - -The \fBskein\fR feature is not supported by GRUB and must not be used on -the pool if GRUB needs to access the pool (e.g. for /boot). -.RE - -.sp -.ne 2 -.na -\fBspacemap_histogram\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:spacemap_histogram -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -This features allows ZFS to maintain more information about how free space -is organized within the pool. If this feature is \fBenabled\fR, ZFS will -set this feature to \fBactive\fR when a new space map object is created or -an existing space map is upgraded to the new format. Once the feature is -\fBactive\fR, it will remain in that state until the pool is destroyed. -.RE - -.sp -.ne 2 -.na -\fBspacemap_v2\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:spacemap_v2 -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -This feature enables the use of the new space map encoding which -consists of two words (instead of one) whenever it is advantageous. -The new encoding allows space maps to represent large regions of -space more efficiently on-disk while also increasing their maximum -addressable offset. 
- -This feature becomes \fBactive\fR once it is \fBenabled\fR, and never -returns back to being \fBenabled\fR. -.RE - -.sp -.ne 2 -.na -\fBuserobj_accounting\fR -.ad -.RS 4n -.TS -l l . -GUID org.zfsonlinux:userobj_accounting -READ\-ONLY COMPATIBLE yes -DEPENDENCIES extensible_dataset -.TE - -This feature allows administrators to account the object usage information -by user and group. - -This feature becomes \fBactive\fR as soon as it is enabled and will never -return to being \fBenabled\fR. Each filesystem will be upgraded automatically -when remounted, or when new files are created under that filesystem. -The upgrade can also be started manually on filesystems by running -`zfs set version=current `. The upgrade process runs in the background -and may take a while to complete for filesystems containing a large number of -files. -.RE - -.sp -.ne 2 -.na -\fBzpool_checkpoint\fR -.ad -.RS 4n -.TS -l l . -GUID com.delphix:zpool_checkpoint -READ\-ONLY COMPATIBLE yes -DEPENDENCIES none -.TE - -This feature enables the \fBzpool checkpoint\fR subcommand that can -checkpoint the state of the pool at the time it was issued and later -rewind back to it or discard it. - -This feature becomes \fBactive\fR when the \fBzpool checkpoint\fR subcommand -is used to checkpoint the pool. -The feature will only return back to being \fBenabled\fR when the pool -is rewound or the checkpoint has been discarded. -.RE - -.SH "SEE ALSO" -zpool(8) diff --git a/man/man7/zfsconcepts.7 b/man/man7/zfsconcepts.7 new file mode 100644 index 0000000000..f958035f72 --- /dev/null +++ b/man/man7/zfsconcepts.7 @@ -0,0 +1,206 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. 
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFSCONCEPTS 7 +.Os +. +.Sh NAME +.Nm zfsconcepts +.Nd overview of ZFS concepts +. +.Sh DESCRIPTION +.Ss ZFS File System Hierarchy +A ZFS storage pool is a logical collection of devices that provide space for +datasets. +A storage pool is also the root of the ZFS file system hierarchy. +.Pp +The root of the pool can be accessed as a file system, such as mounting and +unmounting, taking snapshots, and setting properties. +The physical storage characteristics, however, are managed by the +.Xr zpool 8 +command. +.Pp +See +.Xr zpool 8 +for more information on creating and administering pools. +.Ss Snapshots +A snapshot is a read-only copy of a file system or volume. +Snapshots can be created extremely quickly, and initially consume no additional +space within the pool. +As data within the active dataset changes, the snapshot consumes more data than +would otherwise be shared with the active dataset. 
+.Pp +Snapshots can have arbitrary names. +Snapshots of volumes can be cloned or rolled back; visibility is determined +by the +.Sy snapdev +property of the parent volume. +.Pp +File system snapshots can be accessed under the +.Pa .zfs/snapshot +directory in the root of the file system. +Snapshots are automatically mounted on demand and may be unmounted at regular +intervals. +The visibility of the +.Pa .zfs +directory can be controlled by the +.Sy snapdir +property. +.Ss Bookmarks +A bookmark is like a snapshot, a read-only copy of a file system or volume. +Bookmarks can be created extremely quickly, compared to snapshots, and they +consume no additional space within the pool. +Bookmarks can also have arbitrary names, much like snapshots. +.Pp +Unlike snapshots, bookmarks cannot be accessed through the filesystem in any way. +From a storage standpoint a bookmark just provides a way to reference +when a snapshot was created as a distinct object. +Bookmarks are initially tied to a snapshot, not the filesystem or volume, +and they will survive if the snapshot itself is destroyed. +Since they are very lightweight, there's little incentive to destroy them. +.Ss Clones +A clone is a writable volume or file system whose initial contents are the same +as another dataset. +As with snapshots, creating a clone is nearly instantaneous, and initially +consumes no additional space. +.Pp +Clones can only be created from a snapshot. +When a snapshot is cloned, it creates an implicit dependency between the parent +and child. +Even though the clone is created somewhere else in the dataset hierarchy, the +original snapshot cannot be destroyed as long as a clone exists. +The +.Sy origin +property exposes this dependency, and the +.Cm destroy +command lists any such dependencies, if they exist. +.Pp +The clone parent-child dependency relationship can be reversed by using the +.Cm promote +subcommand.
+This causes the +.Qq origin +file system to become a clone of the specified file system, which makes it +possible to destroy the file system that the clone was created from. +.Ss "Mount Points" +Creating a ZFS file system is a simple operation, so the number of file systems +per system is likely to be numerous. +To cope with this, ZFS automatically manages mounting and unmounting file +systems without the need to edit the +.Pa /etc/fstab +file. +All automatically managed file systems are mounted by ZFS at boot time. +.Pp +By default, file systems are mounted under +.Pa /path , +where +.Ar path +is the name of the file system in the ZFS namespace. +Directories are created and destroyed as needed. +.Pp +A file system can also have a mount point set in the +.Sy mountpoint +property. +This directory is created as needed, and ZFS automatically mounts the file +system when the +.Nm zfs Cm mount Fl a +command is invoked +.Po without editing +.Pa /etc/fstab +.Pc . +The +.Sy mountpoint +property can be inherited, so if +.Em pool/home +has a mount point of +.Pa /export/stuff , +then +.Em pool/home/user +automatically inherits a mount point of +.Pa /export/stuff/user . +.Pp +A file system +.Sy mountpoint +property of +.Sy none +prevents the file system from being mounted. +.Pp +If needed, ZFS file systems can also be managed with traditional tools +.Po +.Nm mount , +.Nm umount , +.Pa /etc/fstab +.Pc . +If a file system's mount point is set to +.Sy legacy , +ZFS makes no attempt to manage the file system, and the administrator is +responsible for mounting and unmounting the file system. +Because pools must +be imported before a legacy mount can succeed, administrators should ensure +that legacy mounts are only attempted after the zpool import process +finishes at boot time. +For example, on machines using systemd, the mount option +.Pp +.Nm x-systemd.requires=zfs-import.target +.Pp +will ensure that the zfs-import completes before systemd attempts mounting +the filesystem. 
+See +.Xr systemd.mount 5 +for details. +.Ss Deduplication +Deduplication is the process for removing redundant data at the block level, +reducing the total amount of data stored. +If a file system has the +.Sy dedup +property enabled, duplicate data blocks are removed synchronously. +The result +is that only unique data is stored and common components are shared among files. +.Pp +Deduplicating data is a very resource-intensive operation. +It is generally recommended that you have at least 1.25 GiB of RAM +per 1 TiB of storage when you enable deduplication. +Calculating the exact requirement depends heavily +on the type of data stored in the pool. +.Pp +Enabling deduplication on an improperly-designed system can result in +performance issues (slow IO and administrative operations). +It can potentially lead to problems importing a pool due to memory exhaustion. +Deduplication can consume significant processing power (CPU) and memory as well +as generate additional disk IO. +.Pp +Before creating a pool with deduplication enabled, ensure that you have planned +your hardware requirements appropriately and implemented appropriate recovery +practices, such as regular backups. +Consider using the +.Sy compression +property as a less resource-intensive alternative. diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 new file mode 100644 index 0000000000..78721f2df9 --- /dev/null +++ b/man/man7/zfsprops.7 @@ -0,0 +1,2072 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. 
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2011, Pawel Jakub Dawidek +.\" Copyright (c) 2012, Glen Barber +.\" Copyright (c) 2012, Bryan Drewery +.\" Copyright (c) 2013, Steven Hartland +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. +.\" Copyright (c) 2014, Xin LI +.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" Copyright (c) 2019, Kjeld Schouten-Lebbing +.\" +.Dd May 24, 2021 +.Dt ZFSPROPS 7 +.Os +. +.Sh NAME +.Nm zfsprops +.Nd native and user-defined properties of ZFS datasets +. +.Sh DESCRIPTION +Properties are divided into two types, native properties and user-defined +.Po or +.Qq user +.Pc +properties. +Native properties either export internal statistics or control ZFS behavior. +In addition, native properties are either editable or read-only. +User properties have no effect on ZFS behavior, but you can use them to annotate +datasets in a way that is meaningful in your environment. +For more information about user properties, see the +.Sx User Properties +section, below. +. 
+.Ss Native Properties +Every dataset has a set of properties that export statistics about the dataset +as well as control various behaviors. +Properties are inherited from the parent unless overridden by the child. +Some properties apply only to certain types of datasets +.Pq file systems, volumes, or snapshots . +.Pp +The values of numeric properties can be specified using human-readable suffixes +.Po for example, +.Sy k , +.Sy KB , +.Sy M , +.Sy Gb , +and so forth, up to +.Sy Z +for zettabyte +.Pc . +The following are all valid +.Pq and equal +specifications: +.Li 1536M, 1.5g, 1.50GB . +.Pp +The values of non-numeric properties are case sensitive and must be lowercase, +except for +.Sy mountpoint , +.Sy sharenfs , +and +.Sy sharesmb . +.Pp +The following native properties consist of read-only statistics about the +dataset. +These properties can be neither set, nor inherited. +Native properties apply to all dataset types unless otherwise noted. +.Bl -tag -width "usedbyrefreservation" +.It Sy available +The amount of space available to the dataset and all its children, assuming that +there is no other activity in the pool. +Because space is shared within a pool, availability can be limited by any number +of factors, including physical pool size, quotas, reservations, or other +datasets within the pool. +.Pp +This property can also be referred to by its shortened column name, +.Sy avail . +.It Sy compressratio +For non-snapshots, the compression ratio achieved for the +.Sy used +space of this dataset, expressed as a multiplier. +The +.Sy used +property includes descendant datasets, and, for clones, does not include the +space shared with the origin snapshot. +For snapshots, the +.Sy compressratio +is the same as the +.Sy refcompressratio +property. +Compression can be turned on by running: +.Nm zfs Cm set Sy compression Ns = Ns Sy on Ar dataset . +The default value is +.Sy off . +.It Sy createtxg +The transaction group (txg) in which the dataset was created. 
+Bookmarks have the same +.Sy createtxg +as the snapshot they are initially tied to. +This property is suitable for ordering a list of snapshots, +e.g. for incremental send and receive. +.It Sy creation +The time this dataset was created. +.It Sy clones +For snapshots, this property is a comma-separated list of filesystems or volumes +which are clones of this snapshot. +The clones' +.Sy origin +property is this snapshot. +If the +.Sy clones +property is not empty, then this snapshot can not be destroyed +.Po even with the +.Fl r +or +.Fl f +options +.Pc . +The roles of origin and clone can be swapped by promoting the clone with the +.Nm zfs Cm promote +command. +.It Sy defer_destroy +This property is +.Sy on +if the snapshot has been marked for deferred destroy by using the +.Nm zfs Cm destroy Fl d +command. +Otherwise, the property is +.Sy off . +.It Sy encryptionroot +For encrypted datasets, indicates where the dataset is currently inheriting its +encryption key from. +Loading or unloading a key for the +.Sy encryptionroot +will implicitly load / unload the key for any inheriting datasets (see +.Nm zfs Cm load-key +and +.Nm zfs Cm unload-key +for details). +Clones will always share an +encryption key with their origin. +See the +.Sx Encryption +section of +.Xr zfs-load-key 8 +for details. +.It Sy filesystem_count +The total number of filesystems and volumes that exist under this location in +the dataset tree. +This value is only available when a +.Sy filesystem_limit +has been set somewhere in the tree under which the dataset resides. +.It Sy keystatus +Indicates if an encryption key is currently loaded into ZFS. +The possible values are +.Sy none , +.Sy available , +and +.Sy unavailable . +See +.Nm zfs Cm load-key +and +.Nm zfs Cm unload-key . +.It Sy guid +The 64 bit GUID of this dataset or bookmark which does not change over its +entire lifetime. +When a snapshot is sent to another pool, the received snapshot has the same GUID. 
+Thus, the +.Sy guid +is suitable to identify a snapshot across pools. +.It Sy logicalreferenced +The amount of space that is +.Qq logically +accessible by this dataset. +See the +.Sy referenced +property. +The logical space ignores the effect of the +.Sy compression +and +.Sy copies +properties, giving a quantity closer to the amount of data that applications +see. +However, it does include space consumed by metadata. +.Pp +This property can also be referred to by its shortened column name, +.Sy lrefer . +.It Sy logicalused +The amount of space that is +.Qq logically +consumed by this dataset and all its descendents. +See the +.Sy used +property. +The logical space ignores the effect of the +.Sy compression +and +.Sy copies +properties, giving a quantity closer to the amount of data that applications +see. +However, it does include space consumed by metadata. +.Pp +This property can also be referred to by its shortened column name, +.Sy lused . +.It Sy mounted +For file systems, indicates whether the file system is currently mounted. +This property can be either +.Sy yes +or +.Sy no . +.It Sy objsetid +A unique identifier for this dataset within the pool. +Unlike the dataset's +.Sy guid , No the Sy objsetid +of a dataset is not transferred to other pools when the snapshot is copied +with a send/receive operation. +The +.Sy objsetid +can be reused (for a new dataset) after the dataset is deleted. +.It Sy origin +For cloned file systems or volumes, the snapshot from which the clone was +created. +See also the +.Sy clones +property. +.It Sy receive_resume_token +For filesystems or volumes which have saved partially-completed state from +.Nm zfs Cm receive Fl s , +this opaque token can be provided to +.Nm zfs Cm send Fl t +to resume and complete the +.Nm zfs Cm receive . +.It Sy redact_snaps +For bookmarks, this is the list of snapshot guids the bookmark contains a redaction +list for. 
+For snapshots, this is the list of snapshot guids the snapshot is redacted with +respect to. +.It Sy referenced +The amount of data that is accessible by this dataset, which may or may not be +shared with other datasets in the pool. +When a snapshot or clone is created, it initially references the same amount of +space as the file system or snapshot it was created from, since its contents are +identical. +.Pp +This property can also be referred to by its shortened column name, +.Sy refer . +.It Sy refcompressratio +The compression ratio achieved for the +.Sy referenced +space of this dataset, expressed as a multiplier. +See also the +.Sy compressratio +property. +.It Sy snapshot_count +The total number of snapshots that exist under this location in the dataset +tree. +This value is only available when a +.Sy snapshot_limit +has been set somewhere in the tree under which the dataset resides. +.It Sy type +The type of dataset: +.Sy filesystem , +.Sy volume , +.Sy snapshot , +or +.Sy bookmark . +.It Sy used +The amount of space consumed by this dataset and all its descendents. +This is the value that is checked against this dataset's quota and reservation. +The space used does not include this dataset's reservation, but does take into +account the reservations of any descendent datasets. +The amount of space that a dataset consumes from its parent, as well as the +amount of space that is freed if this dataset is recursively destroyed, is the +greater of its space used and its reservation. +.Pp +The used space of a snapshot +.Po see the +.Sx Snapshots +section of +.Xr zfsconcepts 7 +.Pc +is space that is referenced exclusively by this snapshot. +If this snapshot is destroyed, the amount of +.Sy used +space will be freed. +Space that is shared by multiple snapshots isn't accounted for in this metric. 
+When a snapshot is destroyed, space that was previously shared with this +snapshot can become unique to snapshots adjacent to it, thus changing the used +space of those snapshots. +The used space of the latest snapshot can also be affected by changes in the +file system. +Note that the +.Sy used +space of a snapshot is a subset of the +.Sy written +space of the snapshot. +.Pp +The amount of space used, available, or referenced does not take into account +pending changes. +Pending changes are generally accounted for within a few seconds. +Committing a change to a disk using +.Xr fsync 2 +or +.Sy O_SYNC +does not necessarily guarantee that the space usage information is updated +immediately. +.It Sy usedby* +The +.Sy usedby* +properties decompose the +.Sy used +property into the various reasons that space is used. +Specifically, +.Sy used No = +.Sy usedbychildren No + +.Sy usedbydataset No + +.Sy usedbyrefreservation No + +.Sy usedbysnapshots . +These properties are only available for datasets created on +.Nm zpool +.Qo version 13 Qc +pools. +.It Sy usedbychildren +The amount of space used by children of this dataset, which would be freed if +all the dataset's children were destroyed. +.It Sy usedbydataset +The amount of space used by this dataset itself, which would be freed if the +dataset were destroyed +.Po after first removing any +.Sy refreservation +and destroying any necessary snapshots or descendents +.Pc . +.It Sy usedbyrefreservation +The amount of space used by a +.Sy refreservation +set on this dataset, which would be freed if the +.Sy refreservation +were removed. +.It Sy usedbysnapshots +The amount of space consumed by snapshots of this dataset. +In particular, it is the amount of space that would be freed if all of this +dataset's snapshots were destroyed. +Note that this is not simply the sum of the snapshots' +.Sy used +properties because space can be shared by multiple snapshots.
+.It Sy userused Ns @ Ns Ar user +The amount of space consumed by the specified user in this dataset. +Space is charged to the owner of each file, as displayed by +.Nm ls Fl l . +The amount of space charged is displayed by +.Nm du No and Nm ls Fl s . +See the +.Nm zfs Cm userspace +command for more information. +.Pp +Unprivileged users can access only their own space usage. +The root user, or a user who has been granted the +.Sy userused +privilege with +.Nm zfs Cm allow , +can access everyone's usage. +.Pp +The +.Sy userused Ns @ Ns Ar ... +properties are not displayed by +.Nm zfs Cm get Sy all . +The user's name must be appended after the +.Sy @ +symbol, using one of the following forms: +.Bl -bullet -compact -offset 4n +.It +POSIX name +.Pq Qq joe +.It +POSIX numeric ID +.Pq Qq 789 +.It +SID name +.Pq Qq joe.smith@mydomain +.It +SID numeric ID +.Pq Qq S-1-123-456-789 +.El +.Pp +Files created on Linux always have POSIX owners. +.It Sy userobjused Ns @ Ns Ar user +The +.Sy userobjused +property is similar to +.Sy userused +but instead it counts the number of objects consumed by a user. +This property counts all objects allocated on behalf of the user, +it may differ from the results of system tools such as +.Nm df Fl i . +.Pp +When the property +.Sy xattr Ns = Ns Sy on +is set on a file system additional objects will be created per-file to store +extended attributes. +These additional objects are reflected in the +.Sy userobjused +value and are counted against the user's +.Sy userobjquota . +When a file system is configured to use +.Sy xattr Ns = Ns Sy sa +no additional internal objects are normally required. +.It Sy userrefs +This property is set to the number of user holds on this snapshot. +User holds are set by using the +.Nm zfs Cm hold +command. +.It Sy groupused Ns @ Ns Ar group +The amount of space consumed by the specified group in this dataset. +Space is charged to the group of each file, as displayed by +.Nm ls Fl l . 
+See the +.Sy userused Ns @ Ns Ar user +property for more information. +.Pp +Unprivileged users can only access their own groups' space usage. +The root user, or a user who has been granted the +.Sy groupused +privilege with +.Nm zfs Cm allow , +can access all groups' usage. +.It Sy groupobjused Ns @ Ns Ar group +The number of objects consumed by the specified group in this dataset. +Multiple objects may be charged to the group for each file when extended +attributes are in use. +See the +.Sy userobjused Ns @ Ns Ar user +property for more information. +.Pp +Unprivileged users can only access their own groups' object usage. +The root user, or a user who has been granted the +.Sy groupobjused +privilege with +.Nm zfs Cm allow , +can access all groups' usage. +.It Sy projectused Ns @ Ns Ar project +The amount of space consumed by the specified project in this dataset. +A project is identified by its project identifier (ID), which is an object-based +numeric attribute. +An object can inherit the project ID from its parent object (if the +parent has the inherit project ID flag, which can be set and changed via +.Nm chattr Fl /+P +or +.Nm zfs project Fl s ) +when being created. +The privileged user can set and change an object's project +ID via +.Nm chattr Fl p +or +.Nm zfs project Fl s +at any time. +Space is charged to the project of each file, as displayed by +.Nm lsattr Fl p +or +.Nm zfs project . +See the +.Sy userused Ns @ Ns Ar user +property for more information. +.Pp +The root user, or a user who has been granted the +.Sy projectused +privilege with +.Nm zfs allow , +can access all projects' usage. +.It Sy projectobjused Ns @ Ns Ar project +The +.Sy projectobjused +is similar to +.Sy projectused +but instead it counts the number of objects consumed by a project. +When the property +.Sy xattr Ns = Ns Sy on +is set on a fileset, ZFS will create additional objects per-file to store +extended attributes.
+These additional objects are reflected in the +.Sy projectobjused +value and are counted against the project's +.Sy projectobjquota . +When a filesystem is configured to use +.Sy xattr Ns = Ns Sy sa +no additional internal objects are required. +See the +.Sy userobjused Ns @ Ns Ar user +property for more information. +.Pp +The root user, or a user who has been granted the +.Sy projectobjused +privilege with +.Nm zfs allow , +can access all projects' objects usage. +.It Sy volblocksize +For volumes, specifies the block size of the volume. +The +.Sy blocksize +cannot be changed once the volume has been written, so it should be set at +volume creation time. +The default +.Sy blocksize +for volumes is 16 Kbytes. +Any power of 2 from 512 bytes to 128 Kbytes is valid. +.Pp +This property can also be referred to by its shortened column name, +.Sy volblock . +.It Sy written +The amount of space +.Sy referenced +by this dataset, that was written since the previous snapshot +.Pq i.e. that is not referenced by the previous snapshot . +.It Sy written Ns @ Ns Ar snapshot +The amount of +.Sy referenced +space written to this dataset since the specified snapshot. +This is the space that is referenced by this dataset but was not referenced by +the specified snapshot. +.Pp +The +.Ar snapshot +may be specified as a short snapshot name +.Pq just the part after the Sy @ , +in which case it will be interpreted as a snapshot in the same filesystem as +this dataset. +The +.Ar snapshot +may be a full snapshot name +.Pq Ar filesystem Ns @ Ns Ar snapshot , +which for clones may be a snapshot in the origin's filesystem +.Pq or the origin of the origin's filesystem, etc. +.El +.Pp +The following native properties can be used to change the behavior of a ZFS +dataset. 
+.Bl -tag -width "" +.It Xo +.Sy aclinherit Ns = Ns Sy discard Ns | Ns Sy noallow Ns | Ns +.Sy restricted Ns | Ns Sy passthrough Ns | Ns Sy passthrough-x +.Xc +Controls how ACEs are inherited when files and directories are created. +.Bl -tag -compact -offset 4n -width "passthrough-x" +.It Sy discard +does not inherit any ACEs. +.It Sy noallow +only inherits inheritable ACEs that specify +.Qq deny +permissions. +.It Sy restricted +default, removes the +.Sy write_acl +and +.Sy write_owner +permissions when the ACE is inherited. +.It Sy passthrough +inherits all inheritable ACEs without any modifications. +.It Sy passthrough-x +same meaning as +.Sy passthrough , +except that the +.Sy owner@ , group@ , No and Sy everyone@ +ACEs inherit the execute permission only if the file creation mode also requests +the execute bit. +.El +.Pp +When the property value is set to +.Sy passthrough , +files are created with a mode determined by the inheritable ACEs. +If no inheritable ACEs exist that affect the mode, then the mode is set in +accordance to the requested mode from the application. +.Pp +The +.Sy aclinherit +property does not apply to POSIX ACLs. +.It Xo +.Sy aclmode Ns = Ns Sy discard Ns | Ns Sy groupmask Ns | Ns +.Sy passthrough Ns | Ns Sy restricted Ns +.Xc +Controls how an ACL is modified during chmod(2) and how inherited ACEs +are modified by the file creation mode: +.Bl -tag -compact -offset 4n -width "passthrough" +.It Sy discard +default, deletes all +.Sy ACEs +except for those representing +the mode of the file or directory requested by +.Xr chmod 2 . +.It Sy groupmask +reduces permissions granted in all +.Sy ALLOW +entries found in the +.Sy ACL +such that they are no greater than the group permissions specified by +.Xr chmod 2 . +.It Sy passthrough +indicates that no changes are made to the ACL other than creating or updating +the necessary ACL entries to represent the new mode of the file or directory. 
+.It Sy restricted +will cause the +.Xr chmod 2 +operation to return an error when used on any file or directory which has +a non-trivial ACL whose entries can not be represented by a mode. +.Xr chmod 2 +is required to change the set user ID, set group ID, or sticky bits on a file +or directory, as they do not have equivalent ACL entries. +In order to use +.Xr chmod 2 +on a file or directory with a non-trivial ACL when +.Sy aclmode +is set to +.Sy restricted , +you must first remove all ACL entries which do not represent the current mode. +.El +.It Sy acltype Ns = Ns Sy off Ns | Ns Sy nfsv4 Ns | Ns Sy posix +Controls whether ACLs are enabled and if so what type of ACL to use. +When this property is set to a type of ACL not supported by the current +platform, the behavior is the same as if it were set to +.Sy off . +.Bl -tag -compact -offset 4n -width "posixacl" +.It Sy off +default on Linux, when a file system has the +.Sy acltype +property set to off then ACLs are disabled. +.It Sy noacl +an alias for +.Sy off +.It Sy nfsv4 +default on +.Fx , +indicates that NFSv4-style ZFS ACLs should be used. +These ACLs can be managed with the +.Xr getfacl 1 +and +.Xr setfacl 1 . +The +.Sy nfsv4 +ZFS ACL type is not yet supported on Linux. +.It Sy posix +indicates POSIX ACLs should be used. +POSIX ACLs are specific to Linux and are not functional on other platforms. +POSIX ACLs are stored as an extended +attribute and therefore will not overwrite any existing NFSv4 ACLs which +may be set. +.It Sy posixacl +an alias for +.Sy posix +.El +.Pp +To obtain the best performance when setting +.Sy posix +users are strongly encouraged to set the +.Sy xattr Ns = Ns Sy sa +property. +This will result in the POSIX ACL being stored more efficiently on disk. +But as a consequence, all new extended attributes will only be +accessible from OpenZFS implementations which support the +.Sy xattr Ns = Ns Sy sa +property. +See the +.Sy xattr +property for more details. 
+.It Sy atime Ns = Ns Sy on Ns | Ns Sy off +Controls whether the access time for files is updated when they are read. +Turning this property off avoids producing write traffic when reading files and +can result in significant performance gains, though it might confuse mailers +and other similar utilities. +The values +.Sy on +and +.Sy off +are equivalent to the +.Sy atime +and +.Sy noatime +mount options. +The default value is +.Sy on . +See also +.Sy relatime +below. +.It Sy canmount Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy noauto +If this property is set to +.Sy off , +the file system cannot be mounted, and is ignored by +.Nm zfs Cm mount Fl a . +Setting this property to +.Sy off +is similar to setting the +.Sy mountpoint +property to +.Sy none , +except that the dataset still has a normal +.Sy mountpoint +property, which can be inherited. +Setting this property to +.Sy off +allows datasets to be used solely as a mechanism to inherit properties. +One example of setting +.Sy canmount Ns = Ns Sy off +is to have two datasets with the same +.Sy mountpoint , +so that the children of both datasets appear in the same directory, but might +have different inherited characteristics. +.Pp +When set to +.Sy noauto , +a dataset can only be mounted and unmounted explicitly. +The dataset is not mounted automatically when the dataset is created or +imported, nor is it mounted by the +.Nm zfs Cm mount Fl a +command or unmounted by the +.Nm zfs Cm unmount Fl a +command. +.Pp +This property is not inherited. +.It Xo +.Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns +.Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns +.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr +.Xc +Controls the checksum used to verify data integrity. +The default value is +.Sy on , +which automatically selects an appropriate algorithm +.Po currently, +.Sy fletcher4 , +but this may change in future releases +.Pc . +The value +.Sy off +disables integrity checking on user data. 
+The value +.Sy noparity +not only disables integrity but also disables maintaining parity for user data. +This setting is used internally by a dump device residing on a RAID-Z pool and +should not be used by any other dataset. +Disabling checksums is +.Em NOT +a recommended practice. +.Pp +The +.Sy sha512 , +.Sy skein , +and +.Sy edonr +checksum algorithms require enabling the appropriate features on the pool. +.Fx +does not support the +.Sy edonr +algorithm. +.Pp +Please see +.Xr zpool-features 7 +for more information on these algorithms. +.Pp +Changing this property affects only newly-written data. +.It Xo +.Sy compression Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy gzip Ns | Ns +.Sy gzip- Ns Ar N Ns | Ns Sy lz4 Ns | Ns Sy lzjb Ns | Ns Sy zle Ns | Ns Sy zstd Ns | Ns +.Sy zstd- Ns Ar N Ns | Ns Sy zstd-fast Ns | Ns Sy zstd-fast- Ns Ar N +.Xc +Controls the compression algorithm used for this dataset. +.Pp +Setting compression to +.Sy on +indicates that the current default compression algorithm should be used. +The default balances compression and decompression speed, with compression ratio +and is expected to work well on a wide variety of workloads. +Unlike all other settings for this property, +.Sy on +does not select a fixed compression type. +As new compression algorithms are added to ZFS and enabled on a pool, the +default compression algorithm may change. +The current default compression algorithm is either +.Sy lzjb +or, if the +.Sy lz4_compress +feature is enabled, +.Sy lz4 . +.Pp +The +.Sy lz4 +compression algorithm is a high-performance replacement for the +.Sy lzjb +algorithm. +It features significantly faster compression and decompression, as well as a +moderately higher compression ratio than +.Sy lzjb , +but can only be used on pools with the +.Sy lz4_compress +feature set to +.Sy enabled . +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy lz4_compress +feature. 
+.Pp +The +.Sy lzjb +compression algorithm is optimized for performance while providing decent data +compression. +.Pp +The +.Sy gzip +compression algorithm uses the same compression as the +.Xr gzip 1 +command. +You can specify the +.Sy gzip +level by using the value +.Sy gzip- Ns Ar N , +where +.Ar N +is an integer from 1 +.Pq fastest +to 9 +.Pq best compression ratio . +Currently, +.Sy gzip +is equivalent to +.Sy gzip-6 +.Po which is also the default for +.Xr gzip 1 +.Pc . +.Pp +The +.Sy zstd +compression algorithm provides both high compression ratios and good performance. +You can specify the +.Sy zstd +level by using the value +.Sy zstd- Ns Ar N , +where +.Ar N +is an integer from 1 +.Pq fastest +to 19 +.Pq best compression ratio . +.Sy zstd +is equivalent to +.Sy zstd-3 . +.Pp +Faster speeds at the cost of the compression ratio can be requested by +setting a negative +.Sy zstd +level. +This is done using +.Sy zstd-fast- Ns Ar N , +where +.Ar N +is an integer in [1-9,10,20,30,...,100,500,1000] which maps to a negative +.Sy zstd +level. +The lower the level the faster the compression - +.Ar 1000 No provides the fastest compression and lowest compression ratio. +.Sy zstd-fast +is equivalent to +.Sy zstd-fast-1 . +.Pp +The +.Sy zle +compression algorithm compresses runs of zeros. +.Pp +This property can also be referred to by its shortened column name +.Sy compress . +Changing this property affects only newly-written data. +.Pp +When any setting except +.Sy off +is selected, compression will explicitly check for blocks consisting of only +zeroes (the NUL byte). +When a zero-filled block is detected, it is stored as +a hole and not compressed using the indicated compression algorithm. +.Pp +Any block being compressed must be no larger than 7/8 of its original size +after compression, otherwise the compression will not be considered worthwhile +and the block saved uncompressed. 
+Note that when the logical block is less than +8 times the disk sector size this effectively reduces the necessary compression +ratio; for example, 8kB blocks on disks with 4kB disk sectors must compress to 1/2 +or less of their original size. +.It Xo +.Sy context Ns = Ns Sy none Ns | Ns +.Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level +.Xc +This flag sets the SELinux context for all files in the file system under +a mount point for that file system. +See +.Xr selinux 8 +for more information. +.It Xo +.Sy fscontext Ns = Ns Sy none Ns | Ns +.Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level +.Xc +This flag sets the SELinux context for the file system being +mounted. +See +.Xr selinux 8 +for more information. +.It Xo +.Sy defcontext Ns = Ns Sy none Ns | Ns +.Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level +.Xc +This flag sets the SELinux default context for unlabeled files. +See +.Xr selinux 8 +for more information. +.It Xo +.Sy rootcontext Ns = Ns Sy none Ns | Ns +.Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level +.Xc +This flag sets the SELinux context for the root inode of the file system. +See +.Xr selinux 8 +for more information. +.It Sy copies Ns = Ns Sy 1 Ns | Ns Sy 2 Ns | Ns Sy 3 +Controls the number of copies of data stored for this dataset. +These copies are in addition to any redundancy provided by the pool, for +example, mirroring or RAID-Z. +The copies are stored on different disks, if possible. +The space used by multiple copies is charged to the associated file and dataset, +changing the +.Sy used +property and counting against quotas and reservations. +.Pp +Changing this property only affects newly-written data. +Therefore, set this property at file system creation time by using the +.Fl o Sy copies Ns = Ns Ar N +option. +.Pp +Remember that ZFS will not import a pool with a missing top-level vdev.
+Do +.Em NOT +create, for example a two-disk striped pool and set +.Sy copies Ns = Ns Ar 2 +on some datasets thinking you have setup redundancy for them. +When a disk fails you will not be able to import the pool +and will have lost all of your data. +.Pp +Encrypted datasets may not have +.Sy copies Ns = Ns Ar 3 +since the implementation stores some encryption metadata where the third copy +would normally be. +.It Sy devices Ns = Ns Sy on Ns | Ns Sy off +Controls whether device nodes can be opened on this file system. +The default value is +.Sy on . +The values +.Sy on +and +.Sy off +are equivalent to the +.Sy dev +and +.Sy nodev +mount options. +.It Xo +.Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns +.Sy sha256 Ns Oo , Ns Sy verify Oc Ns | Ns Sy sha512 Ns Oo , Ns Sy verify Oc Ns | Ns Sy skein Ns Oo , Ns Sy verify Oc Ns | Ns +.Sy edonr , Ns Sy verify +.Xc +Configures deduplication for a dataset. +The default value is +.Sy off . +The default deduplication checksum is +.Sy sha256 +(this may change in the future). +When +.Sy dedup +is enabled, the checksum defined here overrides the +.Sy checksum +property. +Setting the value to +.Sy verify +has the same effect as the setting +.Sy sha256 , Ns Sy verify . +.Pp +If set to +.Sy verify , +ZFS will do a byte-to-byte comparison in case of two blocks having the same +signature to make sure the block contents are identical. +Specifying +.Sy verify +is mandatory for the +.Sy edonr +algorithm. +.Pp +Unless necessary, deduplication should +.Em not +be enabled on a system. +See the +.Sx Deduplication +section of +.Xr zfsconcepts 7 . +.It Xo +.Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns +.Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k +.Xc +Specifies a compatibility mode or literal value for the size of dnodes in the +file system. +The default value is +.Sy legacy . +Setting this property to a value other than +.Sy legacy No requires the Sy large_dnode No pool feature to be enabled. 
+.Pp +Consider setting +.Sy dnodesize +to +.Sy auto +if the dataset uses the +.Sy xattr Ns = Ns Sy sa +property setting and the workload makes heavy use of extended attributes. +This +may be applicable to SELinux-enabled systems, Lustre servers, and Samba +servers, for example. +Literal values are supported for cases where the optimal +size is known in advance and for performance testing. +.Pp +Leave +.Sy dnodesize +set to +.Sy legacy +if you need to receive a send stream of this dataset on a pool that doesn't +enable the +.Sy large_dnode +feature, or if you need to import this pool on a system that doesn't support the +.Sy large_dnode No feature. +.Pp +This property can also be referred to by its shortened column name, +.Sy dnsize . +.It Xo +.Sy encryption Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy aes-128-ccm Ns | Ns +.Sy aes-192-ccm Ns | Ns Sy aes-256-ccm Ns | Ns Sy aes-128-gcm Ns | Ns +.Sy aes-192-gcm Ns | Ns Sy aes-256-gcm +.Xc +Controls the encryption cipher suite (block cipher, key length, and mode) used +for this dataset. +Requires the +.Sy encryption +feature to be enabled on the pool. +Requires a +.Sy keyformat +to be set at dataset creation time. +.Pp +Selecting +.Sy encryption Ns = Ns Sy on +when creating a dataset indicates that the default encryption suite will be +selected, which is currently +.Sy aes-256-gcm . +In order to provide consistent data protection, encryption must be specified at +dataset creation time and it cannot be changed afterwards. +.Pp +For more details and caveats about encryption see the +.Sx Encryption +section of +.Xr zfs-load-key 8 . +.It Sy keyformat Ns = Ns Sy raw Ns | Ns Sy hex Ns | Ns Sy passphrase +Controls what format the user's encryption key will be provided as. +This property is only set when the dataset is encrypted. +.Pp +Raw keys and hex keys must be 32 bytes long (regardless of the chosen +encryption suite) and must be randomly generated. 
+A raw key can be generated with the following command: +.Dl # Nm dd Sy if=/dev/urandom bs=32 count=1 Sy of= Ns Pa /path/to/output/key +.Pp +Passphrases must be between 8 and 512 bytes long and will be processed through +PBKDF2 before being used (see the +.Sy pbkdf2iters +property). +Even though the encryption suite cannot be changed after dataset creation, +the keyformat can be with +.Nm zfs Cm change-key . +.It Xo +.Sy keylocation Ns = Ns Sy prompt Ns | Ns Sy file:// Ns Ar /absolute/file/path Ns | Ns Sy https:// Ns Ar address Ns | Ns Sy http:// Ns Ar address +.Xc +Controls where the user's encryption key will be loaded from by default for +commands such as +.Nm zfs Cm load-key +and +.Nm zfs Cm mount Fl l . +This property is only set for encrypted datasets which are encryption roots. +If unspecified, the default is +.Sy prompt . +.Pp +Even though the encryption suite cannot be changed after dataset creation, the +keylocation can be with either +.Nm zfs Cm set +or +.Nm zfs Cm change-key . +If +.Sy prompt +is selected ZFS will ask for the key at the command prompt when it is required +to access the encrypted data (see +.Nm zfs Cm load-key +for details). +This setting will also allow the key to be passed in via the standard input stream, +but users should be careful not to place keys which should be kept secret on +the command line. +If a file URI is selected, the key will be loaded from the +specified absolute file path. +If an HTTPS or HTTP URL is selected, it will be GETted using +.Xr fetch 3 , +libcurl, or nothing, depending on compile-time configuration and run-time availability. +The +.Sy SSL_CA_CERT_FILE +environment variable can be set to set the location +of the concatenated certificate store. +The +.Sy SSL_CA_CERT_PATH +environment variable can be set to override the location +of the directory containing the certificate authority bundle. 
+The +.Sy SSL_CLIENT_CERT_FILE +and +.Sy SSL_CLIENT_KEY_FILE +environment variables can be set to configure the path +to the client certificate and its key. +.It Sy pbkdf2iters Ns = Ns Ar iterations +Controls the number of PBKDF2 iterations that a +.Sy passphrase +encryption key should be run through when processing it into an encryption key. +This property is only defined when encryption is enabled and a keyformat of +.Sy passphrase +is selected. +The goal of PBKDF2 is to significantly increase the +computational difficulty needed to brute force a user's passphrase. +This is accomplished by forcing the attacker to run each passphrase through a +computationally expensive hashing function many times before they arrive at the +resulting key. +A user who actually knows the passphrase will only have to pay this cost once. +As CPUs become better at processing, this number should be +raised to ensure that a brute force attack is still not possible. +The current default is +.Sy 350000 +and the minimum is +.Sy 100000 . +This property may be changed with +.Nm zfs Cm change-key . +.It Sy exec Ns = Ns Sy on Ns | Ns Sy off +Controls whether processes can be executed from within this file system. +The default value is +.Sy on . +The values +.Sy on +and +.Sy off +are equivalent to the +.Sy exec +and +.Sy noexec +mount options. +.It Sy filesystem_limit Ns = Ns Ar count Ns | Ns Sy none +Limits the number of filesystems and volumes that can exist under this point in +the dataset tree. +The limit is not enforced if the user is allowed to change the limit. +Setting a +.Sy filesystem_limit +on a descendent of a filesystem that already has a +.Sy filesystem_limit +does not override the ancestor's +.Sy filesystem_limit , +but rather imposes an additional limit. +This feature must be enabled to be used +.Po see +.Xr zpool-features 7 +.Pc .
+.It Sy special_small_blocks Ns = Ns Ar size +This value represents the threshold block size for including small file +blocks into the special allocation class. +Blocks smaller than or equal to this +value will be assigned to the special allocation class while greater blocks +will be assigned to the regular class. +Valid values are zero or a power of two from 512B up to 1M. +The default size is 0 which means no small file blocks +will be allocated in the special class. +.Pp +Before setting this property, a special class vdev must be added to the +pool. +See +.Xr zpoolconcepts 7 +for more details on the special allocation class. +.It Sy mountpoint Ns = Ns Pa path Ns | Ns Sy none Ns | Ns Sy legacy +Controls the mount point used for this file system. +See the +.Sx Mount Points +section of +.Xr zfsconcepts 7 +for more information on how this property is used. +.Pp +When the +.Sy mountpoint +property is changed for a file system, the file system and any children that +inherit the mount point are unmounted. +If the new value is +.Sy legacy , +then they remain unmounted. +Otherwise, they are automatically remounted in the new location if the property +was previously +.Sy legacy +or +.Sy none , +or if they were mounted before the property was changed. +In addition, any shared file systems are unshared and shared in the new +location. +.It Sy nbmand Ns = Ns Sy on Ns | Ns Sy off +Controls whether the file system should be mounted with +.Sy nbmand +.Pq Non-blocking mandatory locks . +This is used for SMB clients. +Changes to this property only take effect when the file system is unmounted and +remounted. +Support for these locks is scarce and not described by POSIX. +.It Sy overlay Ns = Ns Sy on Ns | Ns Sy off +Allow mounting on a busy directory or a directory which already contains +files or directories. +This is the default mount behavior for Linux and +.Fx +file systems. +On these platforms the property is +.Sy on +by default.
+Set to +.Sy off +to disable overlay mounts for consistency with OpenZFS on other platforms. +.It Sy primarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata +Controls what is cached in the primary cache +.Pq ARC . +If this property is set to +.Sy all , +then both user data and metadata is cached. +If this property is set to +.Sy none , +then neither user data nor metadata is cached. +If this property is set to +.Sy metadata , +then only metadata is cached. +The default value is +.Sy all . +.It Sy quota Ns = Ns Ar size Ns | Ns Sy none +Limits the amount of space a dataset and its descendents can consume. +This property enforces a hard limit on the amount of space used. +This includes all space consumed by descendents, including file systems and +snapshots. +Setting a quota on a descendent of a dataset that already has a quota does not +override the ancestor's quota, but rather imposes an additional limit. +.Pp +Quotas cannot be set on volumes, as the +.Sy volsize +property acts as an implicit quota. +.It Sy snapshot_limit Ns = Ns Ar count Ns | Ns Sy none +Limits the number of snapshots that can be created on a dataset and its +descendents. +Setting a +.Sy snapshot_limit +on a descendent of a dataset that already has a +.Sy snapshot_limit +does not override the ancestor's +.Sy snapshot_limit , +but rather imposes an additional limit. +The limit is not enforced if the user is allowed to change the limit. +For example, this means that recursive snapshots taken from the global zone are +counted against each delegated dataset within a zone. +This feature must be enabled to be used +.Po see +.Xr zpool-features 7 +.Pc . +.It Sy userquota@ Ns Ar user Ns = Ns Ar size Ns | Ns Sy none +Limits the amount of space consumed by the specified user. +User space consumption is identified by the +.Sy userspace@ Ns Ar user +property. +.Pp +Enforcement of user quotas may be delayed by several seconds. 
+This delay means that a user might exceed their quota before the system notices +that they are over quota and begins to refuse additional writes with the +.Er EDQUOT +error message. +See the +.Nm zfs Cm userspace +command for more information. +.Pp +Unprivileged users can only access their own groups' space usage. +The root user, or a user who has been granted the +.Sy userquota +privilege with +.Nm zfs Cm allow , +can get and set everyone's quota. +.Pp +This property is not available on volumes, on file systems before version 4, or +on pools before version 15. +The +.Sy userquota@ Ns Ar ... +properties are not displayed by +.Nm zfs Cm get Sy all . +The user's name must be appended after the +.Sy @ +symbol, using one of the following forms: +.Bl -bullet -compact -offset 4n +.It +POSIX name +.Pq Qq joe +.It +POSIX numeric ID +.Pq Qq 789 +.It +SID name +.Pq Qq joe.smith@mydomain +.It +SID numeric ID +.Pq Qq S-1-123-456-789 +.El +.Pp +Files created on Linux always have POSIX owners. +.It Sy userobjquota@ Ns Ar user Ns = Ns Ar size Ns | Ns Sy none +The +.Sy userobjquota +is similar to +.Sy userquota +but it limits the number of objects a user can create. +Please refer to +.Sy userobjused +for more information about how objects are counted. +.It Sy groupquota@ Ns Ar group Ns = Ns Ar size Ns | Ns Sy none +Limits the amount of space consumed by the specified group. +Group space consumption is identified by the +.Sy groupused@ Ns Ar group +property. +.Pp +Unprivileged users can access only their own groups' space usage. +The root user, or a user who has been granted the +.Sy groupquota +privilege with +.Nm zfs Cm allow , +can get and set all groups' quotas. +.It Sy groupobjquota@ Ns Ar group Ns = Ns Ar size Ns | Ns Sy none +The +.Sy groupobjquota +is similar to +.Sy groupquota +but it limits number of objects a group can consume. +Please refer to +.Sy userobjused +for more information about how objects are counted. 
+.It Sy projectquota@ Ns Ar project Ns = Ns Ar size Ns | Ns Sy none +Limits the amount of space consumed by the specified project. +Project space consumption is identified by the +.Sy projectused@ Ns Ar project +property. +Please refer to +.Sy projectused +for more information about how a project is identified and set/changed. +.Pp +The root user, or a user who has been granted the +.Sy projectquota +privilege with +.Nm zfs Cm allow , +can access all projects' quota. +.It Sy projectobjquota@ Ns Ar project Ns = Ns Ar size Ns | Ns Sy none +The +.Sy projectobjquota +is similar to +.Sy projectquota +but it limits the number of objects a project can consume. +Please refer to +.Sy userobjused +for more information about how objects are counted. +.It Sy readonly Ns = Ns Sy on Ns | Ns Sy off +Controls whether this dataset can be modified. +The default value is +.Sy off . +The values +.Sy on +and +.Sy off +are equivalent to the +.Sy ro +and +.Sy rw +mount options. +.Pp +This property can also be referred to by its shortened column name, +.Sy rdonly . +.It Sy recordsize Ns = Ns Ar size +Specifies a suggested block size for files in the file system. +This property is designed solely for use with database workloads that access +files in fixed-size records. +ZFS automatically tunes block sizes according to internal algorithms optimized +for typical access patterns. +.Pp +For databases that create very large files but access them in small random +chunks, these algorithms may be suboptimal. +Specifying a +.Sy recordsize +greater than or equal to the record size of the database can result in +significant performance gains. +Use of this property for general purpose file systems is strongly discouraged, +and may adversely affect performance. +.Pp +The size specified must be a power of two greater than or equal to +.Ar 512B +and less than or equal to +.Ar 128kB . +If the +.Sy large_blocks +feature is enabled on the pool, the size may be up to +.Ar 1MB .
+See +.Xr zpool-features 7 +for details on ZFS feature flags. +.Pp +Changing the file system's +.Sy recordsize +affects only files created afterward; existing files are unaffected. +.Pp +This property can also be referred to by its shortened column name, +.Sy recsize . +.It Sy redundant_metadata Ns = Ns Sy all Ns | Ns Sy most +Controls what types of metadata are stored redundantly. +ZFS stores an extra copy of metadata, so that if a single block is corrupted, +the amount of user data lost is limited. +This extra copy is in addition to any redundancy provided at the pool level +.Pq e.g. by mirroring or RAID-Z , +and is in addition to an extra copy specified by the +.Sy copies +property +.Pq up to a total of 3 copies . +For example if the pool is mirrored, +.Sy copies Ns = Ns 2 , +and +.Sy redundant_metadata Ns = Ns Sy most , +then ZFS stores 6 copies of most metadata, and 4 copies of data and some +metadata. +.Pp +When set to +.Sy all , +ZFS stores an extra copy of all metadata. +If a single on-disk block is corrupt, at worst a single block of user data +.Po which is +.Sy recordsize +bytes long +.Pc +can be lost. +.Pp +When set to +.Sy most , +ZFS stores an extra copy of most types of metadata. +This can improve performance of random writes, because less metadata must be +written. +In practice, at worst about 100 blocks +.Po of +.Sy recordsize +bytes each +.Pc +of user data can be lost if a single on-disk block is corrupt. +The exact behavior of which metadata blocks are stored redundantly may change in +future releases. +.Pp +The default value is +.Sy all . +.It Sy refquota Ns = Ns Ar size Ns | Ns Sy none +Limits the amount of space a dataset can consume. +This property enforces a hard limit on the amount of space used. +This hard limit does not include space used by descendents, including file +systems and snapshots. 
+.It Sy refreservation Ns = Ns Ar size Ns | Ns Sy none Ns | Ns Sy auto +The minimum amount of space guaranteed to a dataset, not including its +descendents. +When the amount of space used is below this value, the dataset is treated as if +it were taking up the amount of space specified by +.Sy refreservation . +The +.Sy refreservation +reservation is accounted for in the parent datasets' space used, and counts +against the parent datasets' quotas and reservations. +.Pp +If +.Sy refreservation +is set, a snapshot is only allowed if there is enough free pool space outside of +this reservation to accommodate the current number of +.Qq referenced +bytes in the dataset. +.Pp +If +.Sy refreservation +is set to +.Sy auto , +a volume is thick provisioned +.Po or +.Qq not sparse +.Pc . +.Sy refreservation Ns = Ns Sy auto +is only supported on volumes. +See +.Sy volsize +in the +.Sx Native Properties +section for more information about sparse volumes. +.Pp +This property can also be referred to by its shortened column name, +.Sy refreserv . +.It Sy relatime Ns = Ns Sy on Ns | Ns Sy off +Controls the manner in which the access time is updated when +.Sy atime Ns = Ns Sy on +is set. +Turning this property on causes the access time to be updated relative +to the modify or change time. +Access time is only updated if the previous +access time was earlier than the current modify or change time or if the +existing access time hasn't been updated within the past 24 hours. +The default value is +.Sy off . +The values +.Sy on +and +.Sy off +are equivalent to the +.Sy relatime +and +.Sy norelatime +mount options. +.It Sy reservation Ns = Ns Ar size Ns | Ns Sy none +The minimum amount of space guaranteed to a dataset and its descendants. +When the amount of space used is below this value, the dataset is treated as if +it were taking up the amount of space specified by its reservation. 
+Reservations are accounted for in the parent datasets' space used, and count +against the parent datasets' quotas and reservations. +.Pp +This property can also be referred to by its shortened column name, +.Sy reserv . +.It Sy secondarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata +Controls what is cached in the secondary cache +.Pq L2ARC . +If this property is set to +.Sy all , +then both user data and metadata is cached. +If this property is set to +.Sy none , +then neither user data nor metadata is cached. +If this property is set to +.Sy metadata , +then only metadata is cached. +The default value is +.Sy all . +.It Sy setuid Ns = Ns Sy on Ns | Ns Sy off +Controls whether the setuid bit is respected for the file system. +The default value is +.Sy on . +The values +.Sy on +and +.Sy off +are equivalent to the +.Sy suid +and +.Sy nosuid +mount options. +.It Sy sharesmb Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Ar opts +Controls whether the file system is shared by using +.Sy Samba USERSHARES +and what options are to be used. +Otherwise, the file system is automatically shared and unshared with the +.Nm zfs Cm share +and +.Nm zfs Cm unshare +commands. +If the property is set to on, the +.Xr net 8 +command is invoked to create a +.Sy USERSHARE . +.Pp +Because SMB shares requires a resource name, a unique resource name is +constructed from the dataset name. +The constructed name is a copy of the +dataset name except that the characters in the dataset name, which would be +invalid in the resource name, are replaced with underscore (_) characters. +Linux does not currently support additional options which might be available +on Solaris. +.Pp +If the +.Sy sharesmb +property is set to +.Sy off , +the file systems are unshared. +.Pp +The share is created with the ACL (Access Control List) "Everyone:F" ("F" +stands for "full permissions", i.e. 
read and write permissions) and no guest +access (which means Samba must be able to authenticate a real user, system +passwd/shadow, LDAP or smbpasswd based) by default. +This means that any additional access control +(disallow specific user specific access etc) must be done on the underlying file system. +.It Sy sharenfs Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Ar opts +Controls whether the file system is shared via NFS, and what options are to be +used. +A file system with a +.Sy sharenfs +property of +.Sy off +is managed with the +.Xr exportfs 8 +command and entries in the +.Pa /etc/exports +file. +Otherwise, the file system is automatically shared and unshared with the +.Nm zfs Cm share +and +.Nm zfs Cm unshare +commands. +If the property is set to +.Sy on , +the dataset is shared using the default options: +.Dl sec=sys,rw,crossmnt,no_subtree_check +.Pp +Please note that the options are comma-separated, unlike those found in +.Xr exports 5 . +This is done to negate the need for quoting, as well as to make parsing +with scripts easier. +.Pp +See +.Xr exports 5 +for the meaning of the default options. +Otherwise, the +.Xr exportfs 8 +command is invoked with options equivalent to the contents of this property. +.Pp +When the +.Sy sharenfs +property is changed for a dataset, the dataset and any children inheriting the +property are re-shared with the new options, only if the property was previously +.Sy off , +or if they were shared before the property was changed. +If the new property is +.Sy off , +the file systems are unshared. +.It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput +Provide a hint to ZFS about handling of synchronous requests in this dataset. +If +.Sy logbias +is set to +.Sy latency +.Pq the default , +ZFS will use pool log devices +.Pq if configured +to handle the requests at low latency. +If +.Sy logbias +is set to +.Sy throughput , +ZFS will not use configured pool log devices. 
+ZFS will instead optimize synchronous operations for global pool throughput and +efficient use of resources. +.It Sy snapdev Ns = Ns Sy hidden Ns | Ns Sy visible +Controls whether the volume snapshot devices under +.Pa /dev/zvol/ Ns Aq Ar pool +are hidden or visible. +The default value is +.Sy hidden . +.It Sy snapdir Ns = Ns Sy hidden Ns | Ns Sy visible +Controls whether the +.Pa .zfs +directory is hidden or visible in the root of the file system as discussed in +the +.Sx Snapshots +section of +.Xr zfsconcepts 7 . +The default value is +.Sy hidden . +.It Sy sync Ns = Ns Sy standard Ns | Ns Sy always Ns | Ns Sy disabled +Controls the behavior of synchronous requests +.Pq e.g. fsync, O_DSYNC . +.Sy standard +is the POSIX-specified behavior of ensuring all synchronous requests +are written to stable storage and all devices are flushed to ensure +data is not cached by device controllers +.Pq this is the default . +.Sy always +causes every file system transaction to be written and flushed before its +system call returns. +This has a large performance penalty. +.Sy disabled +disables synchronous requests. +File system transactions are only committed to stable storage periodically. +This option will give the highest performance. +However, it is very dangerous as ZFS would be ignoring the synchronous +transaction demands of applications such as databases or NFS. +Administrators should only use this option when the risks are understood. +.It Sy version Ns = Ns Ar N Ns | Ns Sy current +The on-disk version of this file system, which is independent of the pool +version. +This property can only be set to later supported versions. +See the +.Nm zfs Cm upgrade +command. +.It Sy volsize Ns = Ns Ar size +For volumes, specifies the logical size of the volume. +By default, creating a volume establishes a reservation of equal size. +For storage pools with a version number of 9 or higher, a +.Sy refreservation +is set instead. 
+Any changes to +.Sy volsize +are reflected in an equivalent change to the reservation +.Pq or Sy refreservation . +The +.Sy volsize +can only be set to a multiple of +.Sy volblocksize , +and cannot be zero. +.Pp +The reservation is kept equal to the volume's logical size to prevent unexpected +behavior for consumers. +Without the reservation, the volume could run out of space, resulting in +undefined behavior or data corruption, depending on how the volume is used. +These effects can also occur when the volume size is changed while it is in use +.Pq particularly when shrinking the size . +Extreme care should be used when adjusting the volume size. +.Pp +Though not recommended, a +.Qq sparse volume +.Po also known as +.Qq thin provisioned +.Pc +can be created by specifying the +.Fl s +option to the +.Nm zfs Cm create Fl V +command, or by changing the value of the +.Sy refreservation +property +.Po or +.Sy reservation +property on pool version 8 or earlier +.Pc +after the volume has been created. +A +.Qq sparse volume +is a volume where the value of +.Sy refreservation +is less than the size of the volume plus the space required to store its +metadata. +Consequently, writes to a sparse volume can fail with +.Er ENOSPC +when the pool is low on space. +For a sparse volume, changes to +.Sy volsize +are not reflected in the +.Sy refreservation . +A volume that is not sparse is said to be +.Qq thick provisioned . +A sparse volume can become thick provisioned by setting +.Sy refreservation +to +.Sy auto . +.It Sy volmode Ns = Ns Sy default Ns | Ns Sy full Ns | Ns Sy geom Ns | Ns Sy dev Ns | Ns Sy none +This property specifies how volumes should be exposed to the OS. +Setting it to +.Sy full +exposes volumes as fully fledged block devices, providing maximal +functionality. +The value +.Sy geom +is just an alias for +.Sy full +and is kept for compatibility. +Setting it to +.Sy dev +hides its partitions. 
+Volumes with this property set to +.Sy none +are not exposed outside ZFS, but can be snapshotted, cloned, replicated, etc., +which can be suitable for backup purposes. +The value +.Sy default +means that volume exposure is controlled by the system-wide tunable +.Sy zvol_volmode , +where +.Sy full , +.Sy dev +and +.Sy none +are encoded as 1, 2 and 3 respectively. +The default value is +.Sy full . +.It Sy vscan Ns = Ns Sy on Ns | Ns Sy off +Controls whether regular files should be scanned for viruses when a file is +opened and closed. +In addition to enabling this property, the virus scan service must also be +enabled for virus scanning to occur. +The default value is +.Sy off . +This property is not used by OpenZFS. +.It Sy xattr Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy sa +Controls whether extended attributes are enabled for this file system. +Two styles of extended attributes are supported: either directory based +or system attribute based. +.Pp +The default value of +.Sy on +enables directory based extended attributes. +This style of extended attribute imposes no practical limit +on either the size or number of attributes which can be set on a file. +Under Linux, however, the +.Xr getxattr 2 +and +.Xr setxattr 2 +system calls limit the maximum size to 64K. +This is the most compatible +style of extended attribute and is supported by all ZFS implementations. +.Pp +System attribute based xattrs can be enabled by setting the value to +.Sy sa . +The key advantage of this type of xattr is improved performance. +Storing extended attributes as system attributes +significantly decreases the amount of disk IO required. +Up to 64K of data may be stored per-file in the space reserved for system attributes. +If there is not enough space available for an extended attribute +then it will be automatically written as a directory based xattr. +System attribute based extended attributes are not accessible +on platforms which do not support the +.Sy xattr Ns = Ns Sy sa +feature.
+OpenZFS supports +.Sy xattr Ns = Ns Sy sa +on both +.Fx +and Linux. +.Pp +The use of system attribute based xattrs is strongly encouraged for users of +SELinux or POSIX ACLs. +Both of these features heavily rely on extended +attributes and benefit significantly from the reduced access time. +.Pp +The values +.Sy on +and +.Sy off +are equivalent to the +.Sy xattr +and +.Sy noxattr +mount options. +.It Sy jailed Ns = Ns Sy off Ns | Ns Sy on +Controls whether the dataset is managed from a jail. +See +.Xr zfs-jail 8 +for more information. +Jails are a +.Fx +feature and are not relevant on other platforms. +The default value is +.Sy off . +.It Sy zoned Ns = Ns Sy on Ns | Ns Sy off +Controls whether the dataset is managed from a non-global zone. +Zones are a Solaris feature and are not relevant on other platforms. +The default value is +.Sy off . +.El +.Pp +The following three properties cannot be changed after the file system is +created, and therefore, should be set when the file system is created. +If the properties are not set with the +.Nm zfs Cm create +or +.Nm zpool Cm create +commands, these properties are inherited from the parent dataset. +If the parent dataset lacks these properties due to having been created prior to +these features being supported, the new file system will have the default values +for these properties. +.Bl -tag -width "" +.It Xo +.Sy casesensitivity Ns = Ns Sy sensitive Ns | Ns +.Sy insensitive Ns | Ns Sy mixed +.Xc +Indicates whether the file name matching algorithm used by the file system +should be case-sensitive, case-insensitive, or allow a combination of both +styles of matching. +The default value for the +.Sy casesensitivity +property is +.Sy sensitive . +Traditionally, +.Ux +and POSIX file systems have case-sensitive file names. +.Pp +The +.Sy mixed +value for the +.Sy casesensitivity +property indicates that the file system can support requests for both +case-sensitive and case-insensitive matching behavior. 
+Currently, case-insensitive matching behavior on a file system that supports +mixed behavior is limited to the SMB server product. +For more information about the +.Sy mixed +value behavior, see the "ZFS Administration Guide". +.It Xo +.Sy normalization Ns = Ns Sy none Ns | Ns Sy formC Ns | Ns +.Sy formD Ns | Ns Sy formKC Ns | Ns Sy formKD +.Xc +Indicates whether the file system should perform a +.Sy unicode +normalization of file names whenever two file names are compared, and which +normalization algorithm should be used. +File names are always stored unmodified, names are normalized as part of any +comparison process. +If this property is set to a legal value other than +.Sy none , +and the +.Sy utf8only +property was left unspecified, the +.Sy utf8only +property is automatically set to +.Sy on . +The default value of the +.Sy normalization +property is +.Sy none . +This property cannot be changed after the file system is created. +.It Sy utf8only Ns = Ns Sy on Ns | Ns Sy off +Indicates whether the file system should reject file names that include +characters that are not present in the +.Sy UTF-8 +character code set. +If this property is explicitly set to +.Sy off , +the normalization property must either not be explicitly set or be set to +.Sy none . +The default value for the +.Sy utf8only +property is +.Sy off . +This property cannot be changed after the file system is created. +.El +.Pp +The +.Sy casesensitivity , +.Sy normalization , +and +.Sy utf8only +properties are also new permissions that can be assigned to non-privileged users +by using the ZFS delegated administration feature. +. +.Ss Temporary Mount Point Properties +When a file system is mounted, either through +.Xr mount 8 +for legacy mounts or the +.Nm zfs Cm mount +command for normal file systems, its mount options are set according to its +properties. 
+The correlation between properties and mount options is as follows: +.Bl -tag -compact -offset Ds -width "rootcontext=" +.It Sy atime +atime/noatime +.It Sy canmount +auto/noauto +.It Sy devices +dev/nodev +.It Sy exec +exec/noexec +.It Sy readonly +ro/rw +.It Sy relatime +relatime/norelatime +.It Sy setuid +suid/nosuid +.It Sy xattr +xattr/noxattr +.It Sy nbmand +mand/nomand +.It Sy context Ns = +context= +.It Sy fscontext Ns = +fscontext= +.It Sy defcontext Ns = +defcontext= +.It Sy rootcontext Ns = +rootcontext= +.El +.Pp +In addition, these options can be set on a per-mount basis using the +.Fl o +option, without affecting the property that is stored on disk. +The values specified on the command line override the values stored in the +dataset. +The +.Sy nosuid +option is an alias for +.Sy nodevices , Ns Sy nosetuid . +These properties are reported as +.Qq temporary +by the +.Nm zfs Cm get +command. +If the properties are changed while the dataset is mounted, the new setting +overrides any temporary settings. +. +.Ss User Properties +In addition to the standard native properties, ZFS supports arbitrary user +properties. +User properties have no effect on ZFS behavior, but applications or +administrators can use them to annotate datasets +.Pq file systems, volumes, and snapshots . +.Pp +User property names must contain a colon +.Pq Qq Sy \&: +character to distinguish them from native properties. +They may contain lowercase letters, numbers, and the following punctuation +characters: colon +.Pq Qq Sy \&: , +dash +.Pq Qq Sy - , +period +.Pq Qq Sy \&. , +and underscore +.Pq Qq Sy _ . +The expected convention is that the property name is divided into two portions +such as +.Ar module : Ns Ar property , +but this namespace is not enforced by ZFS. +User property names can be at most 256 characters, and cannot begin with a dash +.Pq Qq Sy - . 
+.Pp +When making programmatic use of user properties, it is strongly suggested to use +a reversed DNS domain name for the +.Ar module +component of property names to reduce the chance that two +independently-developed packages use the same property name for different +purposes. +.Pp +The values of user properties are arbitrary strings, are always inherited, and +are never validated. +All of the commands that operate on properties +.Po Nm zfs Cm list , +.Nm zfs Cm get , +.Nm zfs Cm set , +and so forth +.Pc +can be used to manipulate both native properties and user properties. +Use the +.Nm zfs Cm inherit +command to clear a user property. +If the property is not defined in any parent dataset, it is removed entirely. +Property values are limited to 8192 bytes. diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 new file mode 100644 index 0000000000..83ca911753 --- /dev/null +++ b/man/man7/zpool-features.7 @@ -0,0 +1,842 @@ +.\" +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.\" Copyright (c) 2019, Klara Inc. 
+.\" Copyright (c) 2019, Allan Jude +.\" Copyright (c) 2021, Colm Buckley +.\" +.Dd May 31, 2021 +.Dt ZPOOL-FEATURES 7 +.Os +. +.Sh NAME +.Nm zpool-features +.Nd description of ZFS pool features +. +.Sh DESCRIPTION +ZFS pool on-disk format versions are specified via "features" which replace +the old on-disk format numbers (the last supported on-disk format number is 28). +To enable a feature on a pool use the +.Nm zpool Cm upgrade , +or set the +.Sy feature Ns @ Ns Ar feature-name +property to +.Sy enabled . +Please also see the +.Sx Compatibility feature sets +section for information on how sets of features may be enabled together. +.Pp +The pool format does not affect file system version compatibility or the ability +to send file systems between pools. +.Pp +Since most features can be enabled independently of each other, the on-disk +format of the pool is specified by the set of all features marked as +.Sy active +on the pool. +If the pool was created by another software version +this set may include unsupported features. +. +.Ss Identifying features +Every feature has a GUID of the form +.Ar com.example : Ns Ar feature-name . +The reversed DNS name ensures that the feature's GUID is unique across all ZFS +implementations. +When unsupported features are encountered on a pool they will +be identified by their GUIDs. +Refer to the documentation for the ZFS +implementation that created the pool for information about those features. +.Pp +Each supported feature also has a short name. +By convention a feature's short name is the portion of its GUID which follows the +.Sq \&: +(i.e. +.Ar com.example : Ns Ar feature-name +would have the short name +.Ar feature-name ) , +however a feature's short name may differ across ZFS implementations if +following the convention would result in name conflicts. +. +.Ss Feature states +Features can be in one of three states: +.Bl -tag -width "disabled" +.It Sy active +This feature's on-disk format changes are in effect on the pool. 
+Support for this feature is required to import the pool in read-write mode. +If this feature is not read-only compatible, +support is also required to import the pool in read-only mode +.Pq see Sx Read-only compatibility . +.It Sy enabled +An administrator has marked this feature as enabled on the pool, but the +feature's on-disk format changes have not been made yet. +The pool can still be imported by software that does not support this feature, +but changes may be made to the on-disk format at any time +which will move the feature to the +.Sy active +state. +Some features may support returning to the +.Sy enabled +state after becoming +.Sy active . +See feature-specific documentation for details. +.It Sy disabled +This feature's on-disk format changes have not been made and will not be made +unless an administrator moves the feature to the +.Sy enabled +state. +Features cannot be disabled once they have been enabled. +.El +.Pp +The state of supported features is exposed through pool properties of the form +.Sy feature Ns @ Ns Ar short-name . +. +.Ss Read-only compatibility +Some features may make on-disk format changes that do not interfere with other +software's ability to read from the pool. +These features are referred to as +.Dq read-only compatible . +If all unsupported features on a pool are read-only compatible, +the pool can be imported in read-only mode by setting the +.Sy readonly +property during import (see +.Xr zpool-import 8 +for details on importing pools). +. +.Ss Unsupported features +For each unsupported feature enabled on an imported pool, a pool property +named +.Sy unsupported Ns @ Ns Ar feature-name +will indicate why the import was allowed despite the unsupported feature. +Possible values for this property are: +.Bl -tag -width "readonly" +.It Sy inactive +The feature is in the +.Sy enabled +state and therefore the pool's on-disk +format is still compatible with software that does not support this feature. 
+.It Sy readonly +The feature is read-only compatible and the pool has been imported in +read-only mode. +.El +. +.Ss Feature dependencies +Some features depend on other features being enabled in order to function. +Enabling a feature will automatically enable any features it depends on. +. +.Ss Compatibility feature sets +It is sometimes necessary for a pool to maintain compatibility with a +specific on-disk format, by enabling and disabling particular features. +The +.Sy compatibility +feature facilitates this by allowing feature sets to be read from text files. +When set to +.Sy off +(the default), compatibility feature sets are disabled +(i.e. all features are enabled); when set to +.Sy legacy , +no features are enabled. +When set to a comma-separated list of filenames +(each filename may either be an absolute path, or relative to +.Pa /etc/zfs/compatibility.d +or +.Pa /usr/share/zfs/compatibility.d ) , +the lists of requested features are read from those files, +separated by whitespace and/or commas. +Only features present in all files are enabled. +.Pp +Simple sanity checks are applied to the files: +they must be between 1B and 16kB in size, and must end with a newline character. +.Pp +The requested features are applied when a pool is created using +.Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar … +and control which features are enabled when using +.Nm zpool Cm upgrade . +.Nm zpool Cm status +will not show a warning about disabled features which are not part +of the requested feature set. +.Pp +The special value +.Sy legacy +prevents any features from being enabled, either via +.Nm zpool Cm upgrade +or +.Nm zpool Cm set Sy feature Ns @ Ns Ar feature-name Ns = Ns Sy enabled . +This setting also prevents pools from being upgraded to newer on-disk versions. +This is a safety measure to prevent new features from being +accidentally enabled, breaking compatibility.
+.Pp +By convention, compatibility files in +.Pa /usr/share/zfs/compatibility.d +are provided by the distribution, and include feature sets +supported by important versions of popular distributions, and feature +sets commonly supported at the start of each year. +Compatibility files in +.Pa /etc/zfs/compatibility.d , +if present, will take precedence over files with the same name in +.Pa /usr/share/zfs/compatibility.d . +.Pp +If an unrecognized feature is found in these files, an error message will +be shown. +If the unrecognized feature is in a file in +.Pa /etc/zfs/compatibility.d , +this is treated as an error and processing will stop. +If the unrecognized feature is under +.Pa /usr/share/zfs/compatibility.d , +this is treated as a warning and processing will continue. +This difference is to allow distributions to include features +which might not be recognized by the currently-installed binaries. +.Pp +Compatibility files may include comments: +any text from +.Sq # +to the end of the line is ignored. +.Pp +.Sy Example : +.Bd -literal -compact -offset 4n +.No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2 +# Features which are supported by GRUB2 +async_destroy +bookmarks +embedded_data +empty_bpobj +enabled_txg +extensible_dataset +filesystem_limits +hole_birth +large_blocks +lz4_compress +spacemap_histogram + +.No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev +.Ed +.Pp +See +.Xr zpool-create 8 +and +.Xr zpool-upgrade 8 +for more information on how these commands are affected by feature sets. +. +.de feature +.It Sy \\$2 +.Bl -tag -compact -width "READ-ONLY COMPATIBLE" +.It GUID +.Sy \\$1:\\$2 +.if !"\\$4"" \{\ +.It DEPENDENCIES +\fB\\$4\fP\c +.if !"\\$5"" , \fB\\$5\fP\c +.if !"\\$6"" , \fB\\$6\fP\c +.if !"\\$7"" , \fB\\$7\fP\c +.if !"\\$8"" , \fB\\$8\fP\c +.if !"\\$9"" , \fB\\$9\fP\c +.\} +.It READ-ONLY COMPATIBLE +\\$3 +.El +.Pp +.. +. 
+.ds instant-never \ +.No This feature becomes Sy active No as soon as it is enabled \ +and will never return to being Sy enabled . +. +.ds remount-upgrade \ +.No Each filesystem will be upgraded automatically when remounted, \ +or when a new file is created under that filesystem. \ +The upgrade can also be triggered on filesystems via \ +Nm zfs Cm set Sy version Ns = Ns Sy current Ar fs . \ +No The upgrade process runs in the background and may take a while to complete \ +for filesystems containing large amounts of files. +. +.de checksum-spiel +When the +.Sy \\$1 +feature is set to +.Sy enabled , +the administrator can turn on the +.Sy \\$1 +checksum on any dataset using +.Nm zfs Cm set Sy checksum Ns = Ns Sy \\$1 Ar dset +.Po see Xr zfs-set 8 Pc . +This feature becomes +.Sy active +once a +.Sy checksum +property has been set to +.Sy \\$1 , +and will return to being +.Sy enabled +once all filesystems that have ever had their checksum set to +.Sy \\$1 +are destroyed. +.. +. +.Sh FEATURES +The following features are supported on this system: +.Bl -tag -width Ds +.feature org.zfsonlinux allocation_classes yes +This feature enables support for separate allocation classes. +.Pp +This feature becomes +.Sy active +when a dedicated allocation class vdev (dedup or special) is created with the +.Nm zpool Cm create No or Nm zpool Cm add No commands . +With device removal, it can be returned to the +.Sy enabled +state if all the dedicated allocation class vdevs are removed. +. +.feature com.delphix async_destroy yes +Destroying a file system requires traversing all of its data in order to +return its used space to the pool. +Without +.Sy async_destroy , +the file system is not fully removed until all space has been reclaimed. +If the destroy operation is interrupted by a reboot or power outage, +the next attempt to open the pool will need to complete the destroy +operation synchronously. 
+.Pp +When +.Sy async_destroy +is enabled, the file system's data will be reclaimed by a background process, +allowing the destroy operation to complete +without traversing the entire file system. +The background process is able to resume +interrupted destroys after the pool has been opened, eliminating the need +to finish interrupted destroys as part of the open operation. +The amount of space remaining to be reclaimed by the background process +is available through the +.Sy freeing +property. +.Pp +This feature is only +.Sy active +while +.Sy freeing +is non-zero. +. +.feature com.delphix bookmarks yes extensible_dataset +This feature enables use of the +.Nm zfs Cm bookmark +command. +.Pp +This feature is +.Sy active +while any bookmarks exist in the pool. +All bookmarks in the pool can be listed by running +.Nm zfs Cm list Fl t Sy bookmark Fl r Ar poolname . +. +.feature com.datto bookmark_v2 no bookmarks extensible_dataset +This feature enables the creation and management of larger bookmarks which are +needed for other features in ZFS. +.Pp +This feature becomes +.Sy active +when a v2 bookmark is created and will be returned to the +.Sy enabled +state when all v2 bookmarks are destroyed. +. +.feature com.delphix bookmark_written no bookmarks extensible_dataset bookmark_v2 +This feature enables additional bookmark accounting fields, enabling the +.Sy written Ns # Ns Ar bookmark +property (space written since a bookmark) and estimates of +send stream sizes for incrementals from bookmarks. +.Pp +This feature becomes +.Sy active +when a bookmark is created and will be +returned to the +.Sy enabled +state when all bookmarks with these fields are destroyed. +. +.feature org.openzfs device_rebuild yes +This feature enables the ability for the +.Nm zpool Cm attach +and +.Nm zpool Cm replace +commands to perform sequential reconstruction +(instead of healing reconstruction) when resilvering.
+.Pp +Sequential reconstruction resilvers a device in LBA order without immediately +verifying the checksums. +Once complete, a scrub is started, which then verifies the checksums. +This approach allows full redundancy to be restored to the pool +in the minimum amount of time. +This two-phase approach will take longer than a healing resilver +when the time to verify the checksums is included. +However, unless there is additional pool damage, +no checksum errors should be reported by the scrub. +This feature is incompatible with raidz configurations. +. +This feature becomes +.Sy active +while a sequential resilver is in progress, and returns to +.Sy enabled +when the resilver completes. +. +.feature com.delphix device_removal no +This feature enables the +.Nm zpool Cm remove +command to remove top-level vdevs, +evacuating them to reduce the total size of the pool. +.Pp +This feature becomes +.Sy active +when the +.Nm zpool Cm remove +command is used +on a top-level vdev, and will never return to being +.Sy enabled . +. +.feature org.openzfs draid no +This feature enables use of the +.Sy draid +vdev type. +dRAID is a variant of raidz which provides integrated distributed +hot spares that allow faster resilvering while retaining the benefits of raidz. +Data, parity, and spare space are organized in redundancy groups +and distributed evenly over all of the devices. +.Pp +This feature becomes +.Sy active +when creating a pool which uses the +.Sy draid +vdev type, or when adding a new +.Sy draid +vdev to an existing pool. +. +.feature org.illumos edonr no extensible_dataset +This feature enables the use of the Edon-R hash algorithm for checksum, +including for nopwrite (if compression is also enabled, an overwrite of +a block whose checksum matches the data being written will be ignored). +In an abundance of caution, Edon-R requires verification when used with +dedup: +.Nm zfs Cm set Sy dedup Ns = Ns Sy edonr , Ns Sy verify +.Po see Xr zfs-set 8 Pc . 
+.Pp +Edon-R is a very high-performance hash algorithm that was part +of the NIST SHA-3 competition. +It provides extremely high hash performance (over 350% faster than SHA-256), +but was not selected because of its unsuitability +as a general purpose secure hash algorithm. +This implementation utilizes the new salted checksumming functionality +in ZFS, which means that the checksum is pre-seeded with a secret +256-bit random key (stored on the pool) before being fed the data block +to be checksummed. +Thus the produced checksums are unique to a given pool, +preventing hash collision attacks on systems with dedup. +.Pp +.checksum-spiel edonr +.Pp +.Fx does not support the Sy edonr No feature. +. +.feature com.delphix embedded_data no +This feature improves the performance and compression ratio of +highly-compressible blocks. +Blocks whose contents can compress to 112 bytes +or smaller can take advantage of this feature. +.Pp +When this feature is enabled, the contents of highly-compressible blocks are +stored in the block "pointer" itself (a misnomer in this case, as it contains +the compressed data, rather than a pointer to its location on disk). +Thus the space of the block (one sector, typically 512B or 4kB) is saved, +and no additional I/O is needed to read and write the data block. +. +\*[instant-never] +. +.feature com.delphix empty_bpobj yes +This feature increases the performance of creating and using a large +number of snapshots of a single filesystem or volume, and also reduces +the disk space required. +.Pp +When there are many snapshots, each snapshot uses many Block Pointer +Objects (bpobjs) to track blocks associated with that snapshot. +However, in common use cases, most of these bpobjs are empty. +This feature allows us to create each bpobj on-demand, +thus eliminating the empty bpobjs. +.Pp +This feature is +.Sy active +while there are any filesystems, volumes, +or snapshots which were created after enabling this feature. +. 
+.feature com.delphix enabled_txg yes +Once this feature is enabled, ZFS records the transaction group number +in which new features are enabled. +This has no user-visible impact, but other features may depend on this feature. +.Pp +This feature becomes +.Sy active +as soon as it is enabled and will +never return to being +.Sy enabled . +. +.feature com.datto encryption no bookmark_v2 extensible_dataset +This feature enables the creation and management of natively encrypted datasets. +.Pp +This feature becomes +.Sy active +when an encrypted dataset is created and will be returned to the +.Sy enabled +state when all datasets that use this feature are destroyed. +. +.feature com.delphix extensible_dataset no +This feature allows more flexible use of internal ZFS data structures, +and exists for other features to depend on. +.Pp +This feature will be +.Sy active +when the first dependent feature uses it, and will be returned to the +.Sy enabled +state when all datasets that use this feature are destroyed. +. +.feature com.joyent filesystem_limits yes extensible_dataset +This feature enables filesystem and snapshot limits. +These limits can be used to control how many filesystems and/or snapshots +can be created at the point in the tree on which the limits are set. +.Pp +This feature is +.Sy active +once either of the limit properties has been set on a dataset. +Once activated the feature is never deactivated. +. +.feature com.delphix hole_birth no enabled_txg +This feature has/had bugs, the result of which is that, if you do a +.Nm zfs Cm send Fl i +.Pq or Fl R , No since it uses Fl i +from an affected dataset, the receiving party will not see any checksum +or other errors, but the resulting destination snapshot +will not match the source. +Its use by +.Nm zfs Cm send Fl i +has been disabled by default +.Pq see Sy send_holes_without_birth_time No in Xr zfs 4 .
+.Pp +This feature improves performance of incremental sends +.Pq Nm zfs Cm send Fl i +and receives for objects with many holes. +The most common case of hole-filled objects is zvols. +.Pp +An incremental send stream from snapshot +.Sy A No to snapshot Sy B +contains information about every block that changed between +.Sy A No and Sy B . +Blocks which did not change between those snapshots can be +identified and omitted from the stream using a piece of metadata called +the "block birth time", but birth times are not recorded for holes +(blocks filled only with zeroes). +Since holes created after +.Sy A No cannot be distinguished from holes created before Sy A , +information about every hole in the entire filesystem or zvol +is included in the send stream. +.Pp +For workloads where holes are rare this is not a problem. +However, when incrementally replicating filesystems or zvols with many holes +(for example a zvol formatted with another filesystem) a lot of time will +be spent sending and receiving unnecessary information about holes that +already exist on the receiving side. +.Pp +Once the +.Sy hole_birth +feature has been enabled the block birth times +of all new holes will be recorded. +Incremental sends between snapshots created after this feature is enabled +will use this new metadata to avoid sending information about holes that +already exist on the receiving side. +.Pp +\*[instant-never] +. +.feature org.open-zfs large_blocks no extensible_dataset +This feature allows the record size on a dataset to be set larger than 128kB. +.Pp +This feature becomes +.Sy active +once a dataset contains a file with a block size larger than 128kB, +and will return to being +.Sy enabled +once all filesystems that have ever had their recordsize larger than 128kB +are destroyed. +. +.feature org.zfsonlinux large_dnode no extensible_dataset +This feature allows the size of dnodes in a dataset to be set larger than 512B. +. 
+This feature becomes +.Sy active +once a dataset contains an object with a dnode larger than 512B, +which occurs as a result of setting the +.Sy dnodesize +dataset property to a value other than +.Sy legacy . +The feature will return to being +.Sy enabled +once all filesystems that have ever contained a dnode larger than 512B +are destroyed. +Large dnodes allow more data to be stored in the bonus buffer, +thus potentially improving performance by avoiding the use of spill blocks. +. +.feature com.delphix livelist yes +This feature allows clones to be deleted faster than the traditional method +when a large number of random/sparse writes have been made to the clone. +All blocks allocated and freed after a clone is created are tracked by +the clone's livelist which is referenced during the deletion of the clone. +The feature is activated when a clone is created and remains +.Sy active +until all clones have been destroyed. +. +.feature com.delphix log_spacemap yes com.delphix:spacemap_v2 +This feature improves performance for heavily-fragmented pools, +especially when workloads are heavy in random-writes. +It does so by logging all the metaslab changes on a single spacemap every TXG +instead of scattering multiple writes to all the metaslab spacemaps. +.Pp +\*[instant-never] +. +.feature org.illumos lz4_compress no +.Sy lz4 +is a high-performance real-time compression algorithm that +features significantly faster compression and decompression as well as a +higher compression ratio than the older +.Sy lzjb +compression. +Typically, +.Sy lz4 +compression is approximately 50% faster on compressible data and 200% faster +on incompressible data than +.Sy lzjb . +It is also approximately 80% faster on decompression, +while giving approximately a 10% better compression ratio. +.Pp +When the +.Sy lz4_compress +feature is set to +.Sy enabled , +the administrator can turn on +.Sy lz4 +compression on any dataset on the pool using the +.Xr zfs-set 8 +command.
+All newly written metadata will be compressed with the +.Sy lz4 +algorithm. +.Pp +\*[instant-never] +. +.feature com.joyent multi_vdev_crash_dump no +This feature allows a dump device to be configured with a pool comprised +of multiple vdevs. +Those vdevs may be arranged in any mirrored or raidz configuration. +.Pp +When the +.Sy multi_vdev_crash_dump +feature is set to +.Sy enabled , +the administrator can use +.Xr dumpadm 1M +to configure a dump device on a pool comprised of multiple vdevs. +.Pp +Under +.Fx +and Linux this feature is unused, but registered for compatibility. +New pools created on these systems will have the feature +.Sy enabled +but will never transition to +.Sy active , +as this functionality is not required for crash dump support. +Existing pools where this feature is +.Sy active +can be imported. +. +.feature com.delphix obsolete_counts yes device_removal +This feature is an enhancement of +.Sy device_removal , +which will over time reduce the memory used to track removed devices. +When indirect blocks are freed or remapped, +we note that their part of the indirect mapping is "obsolete" – no longer needed. +.Pp +This feature becomes +.Sy active +when the +.Nm zpool Cm remove +command is used on a top-level vdev, and will never return to being +.Sy enabled . +. +.feature org.zfsonlinux project_quota yes extensible_dataset +This feature allows administrators to account the spaces and objects usage +information against the project identifier (ID). +.Pp +The project ID is an object-based attribute. +When upgrading an existing filesystem, +objects without a project ID will be assigned a zero project ID. +When this feature is enabled, newly created objects inherit +their parent directories' project ID if the parent's inherit flag is set +.Pq via Nm chattr Sy [+-]P No or Nm zfs Cm project Fl s Ns | Ns Fl C . +Otherwise, the new object's project ID will be zero. 
+An object's project ID can be changed at any time by the owner +(or privileged user) via +.Nm chattr Fl p Ar prjid +or +.Nm zfs Cm project Fl p Ar prjid . +.Pp +This feature will become +.Sy active +as soon as it is enabled and will never return to being +.Sy disabled . +\*[remount-upgrade] +. +.feature com.delphix redaction_bookmarks no bookmarks extensible_dataset +This feature enables the use of redacted +.Nm zfs Cm send Ns s , +which create redaction bookmarks storing the list of blocks +redacted by the send that created them. +For more information about redacted sends, see +.Xr zfs-send 8 . +. +.feature com.delphix redacted_datasets no extensible_dataset +This feature enables the receiving of redacted +.Nm zfs Cm send +streams, which create redacted datasets when received. +These datasets are missing some of their blocks, +and so cannot be safely mounted, and their contents cannot be safely read. +For more information about redacted receives, see +.Xr zfs-send 8 . +. +.feature com.datto resilver_defer yes +This feature allows ZFS to postpone new resilvers if an existing one is already +in progress. +Without this feature, any new resilvers will cause the currently +running one to be immediately restarted from the beginning. +.Pp +This feature becomes +.Sy active +once a resilver has been deferred, and returns to being +.Sy enabled +when the deferred resilver begins. +. +.feature org.illumos sha512 no extensible_dataset +This feature enables the use of the SHA-512/256 truncated hash algorithm +(FIPS 180-4) for checksum and dedup. +The native 64-bit arithmetic of SHA-512 provides an approximate 50% +performance boost over SHA-256 on 64-bit hardware +and is thus a good minimum-change replacement candidate +for systems where hash performance is important, +but these systems cannot for whatever reason utilize the faster +.Sy skein No and Sy edonr +algorithms. +.Pp +.checksum-spiel sha512 +.
+.feature org.illumos skein no extensible_dataset +This feature enables the use of the Skein hash algorithm for checksum and dedup. +Skein is a high-performance secure hash algorithm that was a +finalist in the NIST SHA-3 competition. +It provides a very high security margin and high performance on 64-bit hardware +(80% faster than SHA-256). +This implementation also utilizes the new salted checksumming +functionality in ZFS, which means that the checksum is pre-seeded with a +secret 256-bit random key (stored on the pool) before being fed the data +block to be checksummed. +Thus the produced checksums are unique to a given pool, +preventing hash collision attacks on systems with dedup. +.Pp +.checksum-spiel skein +. +.feature com.delphix spacemap_histogram yes +This feature allows ZFS to maintain more information about how free space +is organized within the pool. +If this feature is +.Sy enabled , +it will be activated when a new space map object is created, or +an existing space map is upgraded to the new format, +and never returns back to being +.Sy enabled . +. +.feature com.delphix spacemap_v2 yes +This feature enables the use of the new space map encoding which +consists of two words (instead of one) whenever it is advantageous. +The new encoding allows space maps to represent large regions of +space more efficiently on-disk while also increasing their maximum +addressable offset. +.Pp +This feature becomes +.Sy active +once it is +.Sy enabled , +and never returns back to being +.Sy enabled . +. +.feature org.zfsonlinux userobj_accounting yes extensible_dataset +This feature allows administrators to account the object usage information +by user and group. +.Pp +\*[instant-never] +\*[remount-upgrade] +. +.feature com.delphix zpool_checkpoint yes +This feature enables the +.Nm zpool Cm checkpoint +command that can checkpoint the state of the pool +at the time it was issued and later rewind back to it or discard it.
+.Pp +This feature becomes +.Sy active +when the +.Nm zpool Cm checkpoint +command is used to checkpoint the pool. +The feature will only return back to being +.Sy enabled +when the pool is rewound or the checkpoint has been discarded. +. +.feature org.freebsd zstd_compress no extensible_dataset +.Sy zstd +is a high-performance compression algorithm that features a +combination of high compression ratios and high speed. +Compared to +.Sy gzip , +.Sy zstd +offers slightly better compression at much higher speeds. +Compared to +.Sy lz4 , +.Sy zstd +offers much better compression while being only modestly slower. +Typically, +.Sy zstd +compression speed ranges from 250 to 500 MB/s per thread +and decompression speed is over 1 GB/s per thread. +.Pp +When the +.Sy zstd +feature is set to +.Sy enabled , +the administrator can turn on +.Sy zstd +compression of any dataset using +.Nm zfs Cm set Sy compress Ns = Ns Sy zstd Ar dset +.Po see Xr zfs-set 8 Pc . +This feature becomes +.Sy active +once a +.Sy compress +property has been set to +.Sy zstd , +and will return to being +.Sy enabled +once all filesystems that have ever had their +.Sy compress +property set to +.Sy zstd +are destroyed. +.El +. +.Sh SEE ALSO +.Xr zpool 8 diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 new file mode 100644 index 0000000000..58132baf50 --- /dev/null +++ b/man/man7/zpoolconcepts.7 @@ -0,0 +1,512 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. 
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd June 2, 2021 +.Dt ZPOOLCONCEPTS 7 +.Os +. +.Sh NAME +.Nm zpoolconcepts +.Nd overview of ZFS storage pools +. +.Sh DESCRIPTION +.Ss Virtual Devices (vdevs) +A "virtual device" describes a single device or a collection of devices +organized according to certain performance and fault characteristics. +The following virtual devices are supported: +.Bl -tag -width "special" +.It Sy disk +A block device, typically located under +.Pa /dev . +ZFS can use individual slices or partitions, though the recommended mode of +operation is to use whole disks. +A disk can be specified by a full path, or it can be a shorthand name +.Po the relative portion of the path under +.Pa /dev +.Pc . +A whole disk can be specified by omitting the slice or partition designation. +For example, +.Pa sda +is equivalent to +.Pa /dev/sda . +When given a whole disk, ZFS automatically labels the disk, if necessary. +.It Sy file +A regular file. +The use of files as a backing store is strongly discouraged. +It is designed primarily for experimental purposes, as the fault tolerance of a +file is only as good as the file system on which it resides. +A file must be specified by a full path. 
+.It Sy mirror +A mirror of two or more devices. +Data is replicated in an identical fashion across all components of a mirror. +A mirror with +.Em N No disks of size Em X No can hold Em X No bytes and can withstand Em N-1 +devices failing without losing data. +.It Sy raidz , raidz1 , raidz2 , raidz3 +A variation on RAID-5 that allows for better distribution of parity and +eliminates the RAID-5 +.Qq write hole +.Pq in which data and parity become inconsistent after a power loss . +Data and parity is striped across all disks within a raidz group. +.Pp +A raidz group can have single, double, or triple parity, meaning that the +raidz group can sustain one, two, or three failures, respectively, without +losing any data. +The +.Sy raidz1 +vdev type specifies a single-parity raidz group; the +.Sy raidz2 +vdev type specifies a double-parity raidz group; and the +.Sy raidz3 +vdev type specifies a triple-parity raidz group. +The +.Sy raidz +vdev type is an alias for +.Sy raidz1 . +.Pp +A raidz group with +.Em N No disks of size Em X No with Em P No parity disks can hold approximately +.Em (N-P)*X No bytes and can withstand Em P No devices failing without losing data. +The minimum number of devices in a raidz group is one more than the number of +parity disks. +The recommended number is between 3 and 9 to help increase performance. +.It Sy draid , draid1 , draid2 , draid3 +A variant of raidz that provides integrated distributed hot spares which +allows for faster resilvering while retaining the benefits of raidz. +A dRAID vdev is constructed from multiple internal raidz groups, each with +.Em D No data devices and Em P No parity devices. +These groups are distributed over all of the children in order to fully +utilize the available disk performance. +.Pp +Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with +zeros) to allow fully sequential resilvering. +This fixed stripe width significantly affects both usable capacity and IOPS.
+For example, with the default +.Em D=8 No and Em 4kB No disk sectors the minimum allocation size is Em 32kB . +If using compression, this relatively large allocation size can reduce the +effective compression ratio. +When using ZFS volumes and dRAID, the default of the +.Sy volblocksize +property is increased to account for the allocation size. +If a dRAID pool will hold a significant amount of small blocks, it is +recommended to also add a mirrored +.Sy special +vdev to store those blocks. +.Pp +In regards to I/O, performance is similar to raidz since for any read all +.Em D No data disks must be accessed. +Delivered random IOPS can be reasonably approximated as +.Sy floor((N-S)/(D+P))*single_drive_IOPS . +.Pp +Like raidz, a dRAID can have single-, double-, or triple-parity. +The +.Sy draid1 , +.Sy draid2 , +and +.Sy draid3 +types can be used to specify the parity level. +The +.Sy draid +vdev type is an alias for +.Sy draid1 . +.Pp +A dRAID with +.Em N No disks of size Em X , D No data disks per redundancy group, Em P +.No parity level, and Em S No distributed hot spares can hold approximately +.Em (N-S)*(D/(D+P))*X No bytes and can withstand Em P +devices failing without losing data. +.It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc +A non-default dRAID configuration can be specified by appending one or more +of the following optional arguments to the +.Sy draid +keyword: +.Bl -tag -compact -width "children" +.It Ar parity +The parity level (1-3). +.It Ar data +The number of data devices per redundancy group. +In general, a smaller value of +.Em D No will increase IOPS, improve the compression ratio, +and speed up resilvering at the expense of total usable capacity. +Defaults to +.Em 8 , No unless Em N-P-S No is less than Em 8 . +.It Ar children +The expected number of children. +Useful as a cross-check when listing a large number of devices.
+An error is returned when the provided number of children differs. +.It Ar spares +The number of distributed hot spares. +Defaults to zero. +.El +.It Sy spare +A pseudo-vdev which keeps track of available hot spares for a pool. +For more information, see the +.Sx Hot Spares +section. +.It Sy log +A separate intent log device. +If more than one log device is specified, then writes are load-balanced between +devices. +Log devices can be mirrored. +However, raidz vdev types are not supported for the intent log. +For more information, see the +.Sx Intent Log +section. +.It Sy dedup +A device dedicated solely for deduplication tables. +The redundancy of this device should match the redundancy of the other normal +devices in the pool. +If more than one dedup device is specified, then +allocations are load-balanced between those devices. +.It Sy special +A device dedicated solely for allocating various kinds of internal metadata, +and optionally small file blocks. +The redundancy of this device should match the redundancy of the other normal +devices in the pool. +If more than one special device is specified, then +allocations are load-balanced between those devices. +.Pp +For more information on special allocations, see the +.Sx Special Allocation Class +section. +.It Sy cache +A device used to cache storage pool data. +A cache device cannot be configured as a mirror or raidz group. +For more information, see the +.Sx Cache Devices +section. +.El +.Pp +Virtual devices cannot be nested, so a mirror or raidz virtual device can only +contain files or disks. +Mirrors of mirrors +.Pq or other combinations +are not allowed. +.Pp +A pool can have any number of virtual devices at the top of the configuration +.Po known as +.Qq root vdevs +.Pc . +Data is dynamically distributed across all top-level devices to balance data +among devices. +As new virtual devices are added, ZFS automatically places data on the newly +available devices. 
+.Pp +Virtual devices are specified one at a time on the command line, +separated by whitespace. +Keywords like +.Sy mirror No and Sy raidz +are used to distinguish where a group ends and another begins. +For example, the following creates a pool with two root vdevs, +each a mirror of two disks: +.Dl # Nm zpool Cm create Ar mypool Sy mirror Ar sda sdb Sy mirror Ar sdc sdd +. +.Ss Device Failure and Recovery +ZFS supports a rich set of mechanisms for handling device failure and data +corruption. +All metadata and data is checksummed, and ZFS automatically repairs bad data +from a good copy when corruption is detected. +.Pp +In order to take advantage of these features, a pool must make use of some form +of redundancy, using either mirrored or raidz groups. +While ZFS supports running in a non-redundant configuration, where each root +vdev is simply a disk or file, this is strongly discouraged. +A single case of bit corruption can render some or all of your data unavailable. +.Pp +A pool's health status is described by one of three states: +.Sy online , degraded , No or Sy faulted . +An online pool has all devices operating normally. +A degraded pool is one in which one or more devices have failed, but the data is +still available due to a redundant configuration. +A faulted pool has corrupted metadata, or one or more faulted devices, and +insufficient replicas to continue functioning. +.Pp +The health of the top-level vdev, such as a mirror or raidz device, +is potentially impacted by the state of its associated vdevs, +or component devices. +A top-level vdev or component device is in one of the following states: +.Bl -tag -width "DEGRADED" +.It Sy DEGRADED +One or more top-level vdevs is in the degraded state because one or more +component devices are offline. +Sufficient replicas exist to continue functioning. +.Pp +One or more component devices is in the degraded or faulted state, but +sufficient replicas exist to continue functioning. 
+The underlying conditions are as follows: +.Bl -bullet -compact +.It +The number of checksum errors exceeds acceptable levels and the device is +degraded as an indication that something may be wrong. +ZFS continues to use the device as necessary. +.It +The number of I/O errors exceeds acceptable levels. +The device could not be marked as faulted because there are insufficient +replicas to continue functioning. +.El +.It Sy FAULTED +One or more top-level vdevs is in the faulted state because one or more +component devices are offline. +Insufficient replicas exist to continue functioning. +.Pp +One or more component devices is in the faulted state, and insufficient +replicas exist to continue functioning. +The underlying conditions are as follows: +.Bl -bullet -compact +.It +The device could be opened, but the contents did not match expected values. +.It +The number of I/O errors exceeds acceptable levels and the device is faulted to +prevent further use of the device. +.El +.It Sy OFFLINE +The device was explicitly taken offline by the +.Nm zpool Cm offline +command. +.It Sy ONLINE +The device is online and functioning. +.It Sy REMOVED +The device was physically removed while the system was running. +Device removal detection is hardware-dependent and may not be supported on all +platforms. +.It Sy UNAVAIL +The device could not be opened. +If a pool is imported when a device was unavailable, then the device will be +identified by a unique identifier instead of its path since the path was never +correct in the first place. +.El +.Pp +Checksum errors represent events where a disk returned data that was expected +to be correct, but was not. +In other words, these are instances of silent data corruption. +The checksum errors are reported in +.Nm zpool Cm status +and +.Nm zpool Cm events . +When a block is stored redundantly, a damaged block may be reconstructed +(e.g. from raidz parity or a mirrored copy). 
+In this case, ZFS reports the checksum error against the disks that contained +damaged data. +If a block is unable to be reconstructed (e.g. due to 3 disks being damaged +in a raidz2 group), it is not possible to determine which disks were silently +corrupted. +In this case, checksum errors are reported for all disks on which the block +is stored. +.Pp +If a device is removed and later re-attached to the system, +ZFS attempts to online the device automatically. +Device attachment detection is hardware-dependent +and might not be supported on all platforms. +. +.Ss Hot Spares +ZFS allows devices to be associated with pools as +.Qq hot spares . +These devices are not actively used in the pool, but when an active device +fails, it is automatically replaced by a hot spare. +To create a pool with hot spares, specify a +.Sy spare +vdev with any number of devices. +For example, +.Dl # Nm zpool Cm create Ar pool Sy mirror Ar sda sdb Sy spare Ar sdc sdd +.Pp +Spares can be shared across multiple pools, and can be added with the +.Nm zpool Cm add +command and removed with the +.Nm zpool Cm remove +command. +Once a spare replacement is initiated, a new +.Sy spare +vdev is created within the configuration that will remain there until the +original device is replaced. +At this point, the hot spare becomes available again if another device fails. +.Pp +If a pool has a shared spare that is currently being used, the pool can not be +exported since other pools may use this shared spare, which may lead to +potential data corruption. +.Pp +Shared spares add some risk. +If the pools are imported on different hosts, +and both pools suffer a device failure at the same time, +both could attempt to use the spare at the same time. +This may not be detected, resulting in data corruption. +.Pp +An in-progress spare replacement can be cancelled by detaching the hot spare.
+If the original faulted device is detached, then the hot spare assumes its +place in the configuration, and is removed from the spare list of all active +pools. +.Pp +The +.Sy draid +vdev type provides distributed hot spares. +These hot spares are named after the dRAID vdev they're a part of +.Po Sy draid1 Ns - Ns Ar 2 Ns - Ns Ar 3 No specifies spare Ar 3 No of vdev Ar 2 , +.No which is a single parity dRAID Pc +and may only be used by that dRAID vdev. +Otherwise, they behave the same as normal hot spares. +.Pp +Spares cannot replace log devices. +. +.Ss Intent Log +The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous +transactions. +For instance, databases often require their transactions to be on stable storage +devices when returning from a system call. +NFS and other applications can also use +.Xr fsync 2 +to ensure data stability. +By default, the intent log is allocated from blocks within the main pool. +However, it might be possible to get better performance using separate intent +log devices such as NVRAM or a dedicated disk. +For example: +.Dl # Nm zpool Cm create Ar pool sda sdb Sy log Ar sdc +.Pp +Multiple log devices can also be specified, and they can be mirrored. +See the +.Sx EXAMPLES +section for an example of mirroring multiple log devices. +.Pp +Log devices can be added, replaced, attached, detached and removed. +In addition, log devices are imported and exported as part of the pool +that contains them. +Mirrored devices can be removed by specifying the top-level mirror vdev. +. +.Ss Cache Devices +Devices can be added to a storage pool as +.Qq cache devices . +These devices provide an additional layer of caching between main memory and +disk. +For read-heavy workloads, where the working set size is much larger than what +can be cached in main memory, using cache devices allows much more of this +working set to be served from low latency media. 
+Using cache devices provides the greatest performance improvement for random +read-workloads of mostly static content. +.Pp +To create a pool with cache devices, specify a +.Sy cache +vdev with any number of devices. +For example: +.Dl # Nm zpool Cm create Ar pool sda sdb Sy cache Ar sdc sdd +.Pp +Cache devices cannot be mirrored or part of a raidz configuration. +If a read error is encountered on a cache device, that read I/O is reissued to +the original storage pool device, which might be part of a mirrored or raidz +configuration. +.Pp +The content of the cache devices is persistent across reboots and restored +asynchronously when importing the pool in L2ARC (persistent L2ARC). +This can be disabled by setting +.Sy l2arc_rebuild_enabled Ns = Ns Sy 0 . +For cache devices smaller than +.Em 1GB , +we do not write the metadata structures +required for rebuilding the L2ARC in order not to waste space. +This can be changed with +.Sy l2arc_rebuild_blocks_min_l2size . +The cache device header +.Pq Em 512B +is updated even if no metadata structures are written. +Setting +.Sy l2arc_headroom Ns = Ns Sy 0 +will result in scanning the full-length ARC lists for cacheable content to be +written in L2ARC (persistent ARC). +If a cache device is added with +.Nm zpool Cm add +its label and header will be overwritten and its contents are not going to be +restored in L2ARC, even if the device was previously part of the pool. +If a cache device is onlined with +.Nm zpool Cm online +its contents will be restored in L2ARC. +This is useful in case of memory pressure +where the contents of the cache device are not fully restored in L2ARC. +The user can off- and online the cache device when there is less memory pressure +in order to fully restore its contents to L2ARC. +. 
+.Ss Pool checkpoint +Before starting critical procedures that include destructive actions +.Pq like Nm zfs Cm destroy , +an administrator can checkpoint the pool's state and in the case of a +mistake or failure, rewind the entire pool back to the checkpoint. +Otherwise, the checkpoint can be discarded when the procedure has completed +successfully. +.Pp +A pool checkpoint can be thought of as a pool-wide snapshot and should be used +with care as it contains every part of the pool's state, from properties to vdev +configuration. +Thus, certain operations are not allowed while a pool has a checkpoint. +Specifically, vdev removal/attach/detach, mirror splitting, and +changing the pool's GUID. +Adding a new vdev is supported, but in the case of a rewind it will have to be +added again. +Finally, users of this feature should keep in mind that scrubs in a pool that +has a checkpoint do not repair checkpointed data. +.Pp +To create a checkpoint for a pool: +.Dl # Nm zpool Cm checkpoint Ar pool +.Pp +To later rewind to its checkpointed state, you need to first export it and +then rewind it during import: +.Dl # Nm zpool Cm export Ar pool +.Dl # Nm zpool Cm import Fl -rewind-to-checkpoint Ar pool +.Pp +To discard the checkpoint from a pool: +.Dl # Nm zpool Cm checkpoint Fl d Ar pool +.Pp +Dataset reservations (controlled by the +.Sy reservation No and Sy refreservation +properties) may be unenforceable while a checkpoint exists, because the +checkpoint is allowed to consume the dataset's reservation. +Finally, data that is part of the checkpoint but has been freed in the +current state of the pool won't be scanned during a scrub. +. +.Ss Special Allocation Class +Allocations in the special class are dedicated to specific block types. +By default this includes all metadata, the indirect blocks of user data, and +any deduplication tables. +The class can also be provisioned to accept small file blocks. 
+.Pp +A pool must always have at least one normal +.Pq non- Ns Sy dedup Ns /- Ns Sy special +vdev before +other devices can be assigned to the special class. +If the +.Sy special +class becomes full, then allocations intended for it +will spill back into the normal class. +.Pp +Deduplication tables can be excluded from the special class by unsetting the +.Sy zfs_ddt_data_is_special +ZFS module parameter. +.Pp +Inclusion of small file blocks in the special class is opt-in. +Each dataset can control the size of small file blocks allowed +in the special class by setting the +.Sy special_small_blocks +property to nonzero. +See +.Xr zfsprops 7 +for more info on this property. diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 new file mode 100644 index 0000000000..513f02e031 --- /dev/null +++ b/man/man7/zpoolprops.7 @@ -0,0 +1,412 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. 
+.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2021, Colm Buckley +.\" +.Dd May 27, 2021 +.Dt ZPOOLPROPS 7 +.Os +. +.Sh NAME +.Nm zpoolprops +.Nd properties of ZFS storage pools +. +.Sh DESCRIPTION +Each pool has several properties associated with it. +Some properties are read-only statistics while others are configurable and +change the behavior of the pool. +.Pp +The following are read-only properties: +.Bl -tag -width "unsupported@guid" +.It Cm allocated +Amount of storage used within the pool. +See +.Sy fragmentation +and +.Sy free +for more information. +.It Sy capacity +Percentage of pool space used. +This property can also be referred to by its shortened column name, +.Sy cap . +.It Sy expandsize +Amount of uninitialized space within the pool or device that can be used to +increase the total capacity of the pool. +On whole-disk vdevs, this is the space beyond the end of the GPT – +typically occurring when a LUN is dynamically expanded +or a disk replaced with a larger one. +On partition vdevs, this is the space appended to the partition after it was +added to the pool – most likely by resizing it in-place. +The space can be claimed for the pool by bringing it online with +.Sy autoexpand=on +or using +.Nm zpool Cm online Fl e . +.It Sy fragmentation +The amount of fragmentation in the pool. +As the amount of space +.Sy allocated +increases, it becomes more difficult to locate +.Sy free +space. +This may result in lower write performance compared to pools with more +unfragmented free space. +.It Sy free +The amount of free space available in the pool. +By contrast, the +.Xr zfs 8 +.Sy available +property describes how much new data can be written to ZFS filesystems/volumes. +The zpool +.Sy free +property is not generally useful for this purpose, and can be substantially more than the zfs +.Sy available +space. 
+This discrepancy is due to several factors, including raidz parity; +zfs reservation, quota, refreservation, and refquota properties; and space set aside by +.Sy spa_slop_shift +(see +.Xr zfs 4 +for more information). +.It Sy freeing +After a file system or snapshot is destroyed, the space it was using is +returned to the pool asynchronously. +.Sy freeing +is the amount of space remaining to be reclaimed. +Over time +.Sy freeing +will decrease while +.Sy free +increases. +.It Sy health +The current health of the pool. +Health can be one of +.Sy ONLINE , DEGRADED , FAULTED , OFFLINE , REMOVED , UNAVAIL . +.It Sy guid +A unique identifier for the pool. +.It Sy load_guid +A unique identifier for the pool. +Unlike the +.Sy guid +property, this identifier is generated every time we load the pool (i.e. does +not persist across imports/exports) and never changes while the pool is loaded +(even if a +.Sy reguid +operation takes place). +.It Sy size +Total size of the storage pool. +.It Sy unsupported@ Ns Em guid +Information about unsupported features that are enabled on the pool. +See +.Xr zpool-features 7 +for details. +.El +.Pp +The space usage properties report actual physical space available to the +storage pool. +The physical space can be different from the total amount of space that any +contained datasets can actually use. +The amount of space used in a raidz configuration depends on the characteristics +of the data being written. +In addition, ZFS reserves some space for internal accounting that the +.Xr zfs 8 +command takes into account, but the +.Nm +command does not. +For non-full pools of a reasonable size, these effects should be invisible. +For small pools, or pools that are close to being completely full, these +discrepancies may become more noticeable. +.Pp +The following property can be set at creation time and import time: +.Bl -tag -width Ds +.It Sy altroot +Alternate root directory.
+If set, this directory is prepended to any mount points within the pool. +This can be used when examining an unknown pool where the mount points cannot be +trusted, or in an alternate boot environment, where the typical paths are not +valid. +.Sy altroot +is not a persistent property. +It is valid only while the system is up. +Setting +.Sy altroot +defaults to using +.Sy cachefile Ns = Ns Sy none , +though this may be overridden using an explicit setting. +.El +.Pp +The following property can be set only at import time: +.Bl -tag -width Ds +.It Sy readonly Ns = Ns Sy on Ns | Ns Sy off +If set to +.Sy on , +the pool will be imported in read-only mode. +This property can also be referred to by its shortened column name, +.Sy rdonly . +.El +.Pp +The following properties can be set at creation time and import time, and later +changed with the +.Nm zpool Cm set +command: +.Bl -tag -width Ds +.It Sy ashift Ns = Ns Sy ashift +Pool sector size exponent, to the power of +.Sy 2 +(internally referred to as +.Sy ashift ) . +Values from 9 to 16, inclusive, are valid; also, the +value 0 (the default) means to auto-detect using the kernel's block +layer and a ZFS internal exception list. +I/O operations will be aligned to the specified size boundaries. +Additionally, the minimum (disk) +write size will be set to the specified size, so this represents a +space vs. performance trade-off. +For optimal performance, the pool sector size should be greater than +or equal to the sector size of the underlying disks. +The typical case for setting this property is when +performance is important and the underlying disks use 4KiB sectors but +report 512B sectors to the OS (for compatibility reasons); in that +case, set +.Sy ashift Ns = Ns Sy 12 +(which is +.Sy 1<<12 No = Sy 4096 ) . +When set, this property is +used as the default hint value in subsequent vdev operations (add, +attach and replace). 
+Changing this value will not modify any existing +vdev, not even on disk replacement; however it can be used, for +instance, to replace a dying 512B sectors disk with a newer 4KiB +sectors device: this will probably result in bad performance but at the +same time could prevent loss of data. +.It Sy autoexpand Ns = Ns Sy on Ns | Ns Sy off +Controls automatic pool expansion when the underlying LUN is grown. +If set to +.Sy on , +the pool will be resized according to the size of the expanded device. +If the device is part of a mirror or raidz then all devices within that +mirror/raidz group must be expanded before the new space is made available to +the pool. +The default behavior is +.Sy off . +This property can also be referred to by its shortened column name, +.Sy expand . +.It Sy autoreplace Ns = Ns Sy on Ns | Ns Sy off +Controls automatic device replacement. +If set to +.Sy off , +device replacement must be initiated by the administrator by using the +.Nm zpool Cm replace +command. +If set to +.Sy on , +any new device, found in the same physical location as a device that previously +belonged to the pool, is automatically formatted and replaced. +The default behavior is +.Sy off . +This property can also be referred to by its shortened column name, +.Sy replace . +Autoreplace can also be used with virtual disks (like device +mapper) provided that you use the /dev/disk/by-vdev paths setup by +vdev_id.conf. +See the +.Xr vdev_id 8 +manual page for more details. +Autoreplace and autoonline require the ZFS Event Daemon be configured and +running. +See the +.Xr zed 8 +manual page for more details. +.It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off +When set to +.Sy on +space which has been recently freed, and is no longer allocated by the pool, +will be periodically trimmed. +This allows block device vdevs which support +BLKDISCARD, such as SSDs, or file vdevs on which the underlying file system +supports hole-punching, to reclaim unused blocks. 
+The default value for this property is +.Sy off . +.Pp +Automatic TRIM does not immediately reclaim blocks after a free. +Instead, it will optimistically delay allowing smaller ranges to be aggregated +into a few larger ones. +These can then be issued more efficiently to the storage. +TRIM on L2ARC devices is enabled by setting +.Sy l2arc_trim_ahead > 0 . +.Pp +Be aware that automatic trimming of recently freed data blocks can put +significant stress on the underlying storage devices. +This will vary depending on how well the specific device handles these commands. +For lower-end devices it is often possible to achieve most of the benefits +of automatic trimming by running an on-demand (manual) TRIM periodically +using the +.Nm zpool Cm trim +command. +.It Sy bootfs Ns = Ns Sy (unset) Ns | Ns Ar pool Ns Op / Ns Ar dataset +Identifies the default bootable dataset for the root pool. +This property is expected to be set mainly by the installation and upgrade programs. +Not all Linux distribution boot processes use the bootfs property. +.It Sy cachefile Ns = Ns Ar path Ns | Ns Sy none +Controls the location of where the pool configuration is cached. +Discovering all pools on system startup requires a cached copy of the +configuration data that is stored on the root file system. +All pools in this cache are automatically imported when the system boots. +Some environments, such as install and clustering, need to cache this +information in a different location so that pools are not automatically +imported. +Setting this property caches the pool configuration in a different location that +can later be imported with +.Nm zpool Cm import Fl c . +Setting it to the value +.Sy none +creates a temporary pool that is never cached, and the +.Qq +.Pq empty string +uses the default location. +.Pp +Multiple pools can share the same cache file. 
+Because the kernel destroys and recreates this file when pools are added and +removed, care should be taken when attempting to access this file. +When the last pool using a +.Sy cachefile +is exported or destroyed, the file will be empty. +.It Sy comment Ns = Ns Ar text +A text string consisting of printable ASCII characters that will be stored +such that it is available even if the pool becomes faulted. +An administrator can provide additional information about a pool using this +property. +.It Sy compatibility Ns = Ns Sy off Ns | Ns Sy legacy Ns | Ns Ar file Ns Oo , Ns Ar file Oc Ns … +Specifies that the pool maintain compatibility with specific feature sets. +When set to +.Sy off +(or unset) compatibility is disabled (all features may be enabled); when set to +.Sy legacy Ns +no features may be enabled. +When set to a comma-separated list of filenames +(each filename may either be an absolute path, or relative to +.Pa /etc/zfs/compatibility.d +or +.Pa /usr/share/zfs/compatibility.d ) +the lists of requested features are read from those files, separated by +whitespace and/or commas. +Only features present in all files may be enabled. +.Pp +See +.Xr zpool-features 7 , +.Xr zpool-create 8 +and +.Xr zpool-upgrade 8 +for more information on the operation of compatibility feature sets. +.It Sy dedupditto Ns = Ns Ar number +This property is deprecated and no longer has any effect. +.It Sy delegation Ns = Ns Sy on Ns | Ns Sy off +Controls whether a non-privileged user is granted access based on the dataset +permissions defined on the dataset. +See +.Xr zfs 8 +for more information on ZFS delegated administration. +.It Sy failmode Ns = Ns Sy wait Ns | Ns Sy continue Ns | Ns Sy panic +Controls the system behavior in the event of catastrophic pool failure. +This condition is typically a result of a loss of connectivity to the underlying +storage device(s) or a failure of all devices within the pool. 
+The behavior of such an event is determined as follows: +.Bl -tag -width "continue" +.It Sy wait +Blocks all I/O access until the device connectivity is recovered and the errors +are cleared. +This is the default behavior. +.It Sy continue +Returns +.Er EIO +to any new write I/O requests but allows reads to any of the remaining healthy +devices. +Any write requests that have yet to be committed to disk would be blocked. +.It Sy panic +Prints out a message to the console and generates a system crash dump. +.El +.It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled +The value of this property is the current state of +.Ar feature_name . +The only valid value when setting this property is +.Sy enabled +which moves +.Ar feature_name +to the enabled state. +See +.Xr zpool-features 7 +for details on feature states. +.It Sy listsnapshots Ns = Ns Sy on Ns | Ns Sy off +Controls whether information about snapshots associated with this pool is +output when +.Nm zfs Cm list +is run without the +.Fl t +option. +The default value is +.Sy off . +This property can also be referred to by its shortened name, +.Sy listsnaps . +.It Sy multihost Ns = Ns Sy on Ns | Ns Sy off +Controls whether a pool activity check should be performed during +.Nm zpool Cm import . +When a pool is determined to be active it cannot be imported, even with the +.Fl f +option. +This property is intended to be used in failover configurations +where multiple hosts have access to a pool on shared storage. +.Pp +Multihost provides protection on import only. +It does not protect against an +individual device being used in multiple pools, regardless of the type of vdev. +See the discussion under +.Nm zpool Cm create . +.Pp +When this property is on, periodic writes to storage occur to show the pool is +in use. +See +.Sy zfs_multihost_interval +in the +.Xr zfs 4 +manual page. +In order to enable this property each host must set a unique hostid. 
+See +.Xr genhostid 1 +.Xr zgenhostid 8 +.Xr spl 4 +for additional details. +The default value is +.Sy off . +.It Sy version Ns = Ns Ar version +The current on-disk version of the pool. +This can be increased, but never decreased. +The preferred method of updating pools is with the +.Nm zpool Cm upgrade +command, though this property can be used when a specific version is needed for +backwards compatibility. +Once feature flags are enabled on a pool this property will no longer have a +value. +.El diff --git a/man/man8/Makefile.am b/man/man8/Makefile.am deleted file mode 100644 index 5401ff06f2..0000000000 --- a/man/man8/Makefile.am +++ /dev/null @@ -1,31 +0,0 @@ -dist_man_MANS = \ - fsck.zfs.8 \ - mount.zfs.8 \ - vdev_id.8 \ - zdb.8 \ - zfs.8 \ - zfs-program.8 \ - zgenhostid.8 \ - zinject.8 \ - zpool.8 \ - zstreamdump.8 - -nodist_man_MANS = \ - zed.8 \ - zfs-mount-generator.8 - -EXTRA_DIST = \ - zed.8.in \ - zfs-mount-generator.8.in - -$(nodist_man_MANS): %: %.in - -$(SED) -e 's,@zfsexecdir\@,$(zfsexecdir),g' \ - -e 's,@runstatedir\@,$(runstatedir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' - -install-data-local: - $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man8" - -CLEANFILES = \ - $(nodist_man_MANS) diff --git a/man/man8/fsck.zfs.8 b/man/man8/fsck.zfs.8 index baa8c3330c..0ce7576ebe 100644 --- a/man/man8/fsck.zfs.8 +++ b/man/man8/fsck.zfs.8 @@ -1,4 +1,3 @@ -'\" t .\" .\" CDDL HEADER START .\" @@ -19,49 +18,60 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH fsck.zfs 8 "2013 MAR 16" "ZFS on Linux" "System Administration Commands" - -.SH NAME -fsck.zfs \- Dummy ZFS filesystem checker. - -.SH SYNOPSIS -.LP -.BI "fsck.zfs [" "options" "] <" "dataset" ">" - -.SH DESCRIPTION -.LP -\fBfsck.zfs\fR is a shell stub that does nothing and always returns -true. It is installed by ZoL because some Linux distributions expect -a fsck helper for all filesystems. 
- -.SH OPTIONS -.HP -All \fIoptions\fR and the \fIdataset\fR are ignored. - -.SH "NOTES" -.LP -ZFS datasets are checked by running \fBzpool scrub\fR on the -containing pool. An individual ZFS dataset is never checked -independently of its pool, which is unlike a regular filesystem. - -.SH "BUGS" -.LP -On some systems, if the \fIdataset\fR is in a degraded pool, then it -might be appropriate for \fBfsck.zfs\fR to return exit code 4 to -indicate an uncorrected filesystem error. -.LP -Similarly, if the \fIdataset\fR is in a faulted pool and has a legacy -/etc/fstab record, then \fBfsck.zfs\fR should return exit code 8 to -indicate a fatal operational error. - -.SH "AUTHORS" -.LP -Darik Horn . - -.SH "SEE ALSO" -.BR fsck (8), -.BR fstab (5), -.BR zpool (8) +.Dd May 26, 2021 +.Dt FSCK.ZFS 8 +.Os +. +.Sh NAME +.Nm fsck.zfs +.Nd dummy ZFS filesystem checker +.Sh SYNOPSIS +.Nm +.Op Ar options +.Ar dataset Ns No … +. +.Sh DESCRIPTION +.Nm +is a thin shell wrapper that at most checks the status of a dataset's container pool. +It is installed by OpenZFS because some Linux +distributions expect a fsck helper for all filesystems. +.Pp +If more than one +.Ar dataset +is specified, each is checked in turn and the results binary-ored. +. +.Sh OPTIONS +Ignored. +. +.Sh NOTES +ZFS datasets are checked by running +.Nm zpool Cm scrub +on the containing pool. +An individual ZFS dataset is never checked independently of its pool, +which is unlike a regular filesystem. +.Pp +However, the +.Xr fsck 8 +interface still allows it to communicate some errors: if the +.Ar dataset +is in a degraded pool, then +.Nm +will return exit code +.Sy 4 +to indicate an uncorrected filesystem error. +.Pp +Similarly, if the +.Ar dataset +is in a faulted pool and has a legacy +.Pa /etc/fstab +record, then +.Nm +will return exit code +.Sy 8 +to indicate a fatal operational error. 
+.Sh SEE ALSO +.Xr fstab 5 , +.Xr fsck 8 , +.Xr zpool-scrub 8 diff --git a/man/man8/mount.zfs.8 b/man/man8/mount.zfs.8 index 4b71367e23..2101f70cd5 100644 --- a/man/man8/mount.zfs.8 +++ b/man/man8/mount.zfs.8 @@ -1,4 +1,3 @@ -'\" t .\" .\" CDDL HEADER START .\" @@ -19,126 +18,75 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH mount.zfs 8 "2013 FEB 28" "ZFS on Linux" "System Administration Commands" - -.SH NAME -mount.zfs \- mount a ZFS filesystem -.SH SYNOPSIS -.LP -.BI "mount.zfs [\-sfnvh] [\-o " options "]" " dataset mountpoint - -.SH DESCRIPTION -.BR mount.zfs -is part of the zfsutils package for Linux. It is a helper program that -is usually invoked by the -.BR mount (8) +.Dd May 24, 2021 +.Dt MOUNT.ZFS 8 +.Os +. +.Sh NAME +.Nm mount.zfs +.Nd mount ZFS filesystem +.Sh SYNOPSIS +.Nm +.Op Fl sfnvh +.Op Fl o Ar options +.Ar dataset +.Ar mountpoint +. +.Sh DESCRIPTION +The +.Nm +helper is used by +.Xr mount 8 +to mount filesystem snapshots and +.Sy mountpoint= Ns Ar legacy +ZFS filesystems, as well as by +.Xr zfs 8 +when the +.Sy ZFS_MOUNT_HELPER +environment variable is not set. +Users should invoke either +.Xr mount 8 or -.BR zfs (8) -commands to mount a ZFS dataset. - -All -.I options -are handled according to the FILESYSTEM INDEPENDENT MOUNT OPTIONS -section in the -.BR mount (8) -manual, except for those described below. - -The -.I dataset -parameter is a ZFS filesystem name, as output by the -.B "zfs list -H -o name -command. This parameter never has a leading slash character and is -not a device name. - -The -.I mountpoint -parameter is the path name of a directory. - - -.SH OPTIONS -.TP -.BI "\-s" -Ignore bad or sloppy mount options. -.TP -.BI "\-f" -Do a fake mount; do not perform the mount operation. -.TP -.BI "\-n" -Do not update the /etc/mtab file. -.TP -.BI "\-v" -Increase verbosity. -.TP -.BI "\-h" +.Xr zfs 8 +in most cases. 
+.Pp +.Ar options +are handled according to the +.Em Temporary Mount Point Properties +section in +.Xr zfsprops 7 , +except for those described below. +.Pp +If +.Pa /etc/mtab +is a regular file and +.Fl n +was not specified, it will be updated via libmount. +. +.Sh OPTIONS +.Bl -tag -width "-o xa" +.It Fl s +Ignore unknown (sloppy) mount options. +.It Fl f +Do everything except actually executing the system call. +.It Fl n +Never update +.Pa /etc/mtab . +.It Fl v +Print resolved mount options and parser state. +.It Fl h Print the usage message. -.TP -.BI "\-o context" -This flag sets the SELinux context for all files in the filesystem -under that mountpoint. -.TP -.BI "\-o fscontext" -This flag sets the SELinux context for the filesystem being mounted. -.TP -.BI "\-o defcontext" -This flag sets the SELinux context for unlabeled files. -.TP -.BI "\-o rootcontext" -This flag sets the SELinux context for the root inode of the filesystem. -.TP -.BI "\-o legacy" -This private flag indicates that the -.I dataset -has an entry in the /etc/fstab file. -.TP -.BI "\-o noxattr" -This private flag disables extended attributes. -.TP -.BI "\-o xattr -This private flag enables directory-based extended attributes and, if -appropriate, adds a ZFS context to the selinux system policy. -.TP -.BI "\-o saxattr -This private flag enables system attributed-based extended attributes and, if -appropriate, adds a ZFS context to the selinux system policy. -.TP -.BI "\-o dirxattr -Equivalent to -.BR xattr . -.TP -.BI "\-o zfsutil" +.It Fl o Ar zfsutil This private flag indicates that -.BR mount (8) +.Xr mount 8 is being called by the -.BR zfs (8) +.Xr zfs 8 command. - -.SH NOTES -ZFS conventionally requires that the -.I mountpoint -be an empty directory, but the Linux implementation inconsistently -enforces the requirement. - -The -.BR mount.zfs -helper does not mount the contents of zvols. - -.SH FILES -.TP 18n -.I /etc/fstab -The static filesystem table. 
-.TP -.I /etc/mtab -The mounted filesystem table. -.SH "AUTHORS" -The primary author of -.BR mount.zfs -is Brian Behlendorf . - -This man page was written by Darik Horn . -.SH "SEE ALSO" -.BR fstab (5), -.BR mount (8), -.BR zfs (8) +.El +. +.Sh SEE ALSO +.Xr fstab 5 , +.Xr mount 8 , +.Xr zfs-mount 8 diff --git a/man/man8/vdev_id.8 b/man/man8/vdev_id.8 index 70956c634f..2b327b3192 100644 --- a/man/man8/vdev_id.8 +++ b/man/man8/vdev_id.8 @@ -1,77 +1,93 @@ -.TH vdev_id 8 -.SH NAME -vdev_id \- generate user-friendly names for JBOD disks -.SH SYNOPSIS -.LP -.nf -\fBvdev_id\fR <-d dev> [-c config_file] [-g sas_direct|sas_switch] - [-m] [-p phys_per_port] -\fBvdev_id\fR -h -.fi -.SH DESCRIPTION -The \fBvdev_id\fR command is a udev helper which parses the file -.BR /etc/zfs/vdev_id.conf (5) -to map a physical path in a storage topology to a channel name. The -channel name is combined with a disk enclosure slot number to create an -alias that reflects the physical location of the drive. This is -particularly helpful when it comes to tasks like replacing failed -drives. Slot numbers may also be re-mapped in case the default -numbering is unsatisfactory. The drive aliases will be created as -symbolic links in /dev/disk/by-vdev. - -The currently supported topologies are sas_direct and sas_switch. A -multipath mode is supported in which dm-mpath devices are handled by -examining the first-listed running component disk as reported by the -.BR multipath (8) -command. In multipath mode the configuration file should contain a +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. 
+.\" +.Dd May 26, 2021 +.Dt VDEV_ID 8 +.Os +. +.Sh NAME +.Nm vdev_id +.Nd generate user-friendly names for JBOD disks +.Sh SYNOPSIS +.Nm +.Fl d Ar dev +.Fl c Ar config_file +.Fl g Sy sas_direct Ns | Ns Sy sas_switch Ns | Ns Sy scsi +.Fl m +.Fl p Ar phys_per_port +. +.Sh DESCRIPTION +.Nm +is a udev helper which parses +.Xr vdev_id.conf 5 +to map a physical path in a storage topology to a channel name. +The channel name is combined with a disk enclosure slot number to create +an alias that reflects the physical location of the drive. +This is particularly helpful when it comes to tasks like replacing failed drives. +Slot numbers may also be remapped in case the default numbering is unsatisfactory. +The drive aliases will be created as symbolic links in +.Pa /dev/disk/by-vdev . +.Pp +The currently supported topologies are +.Sy sas_direct , +.Sy sas_switch , +and +.Sy scsi . +A multipath mode is supported in which dm-mpath devices are handled by +examining the first running component disk as reported by the driver. +In multipath mode the configuration file should contain a channel definition with the same name for each path to a given enclosure. - -.BR vdev_id +.Pp +.Nm also supports creating aliases based on existing udev links in the /dev -hierarchy using the \fIalias\fR configuration file keyword. See the -.BR vdev_id.conf (5) -man page for details. - -.SH OPTIONS -.TP -\fB\-c\fR -Specifies the path to an alternate configuration file. The default is -/etc/zfs/vdev_id.conf. -.TP -\fB\-d\fR -This is the only mandatory argument. Specifies the name of a device -in /dev, i.e. "sda". -.TP -\fB\-g\fR +hierarchy using the +.Sy alias +configuration file keyword. +See +.Xr vdev_id.conf 5 +for details. +. +.Sh OPTIONS +.Bl -tag -width "-m" +.It Fl d Ar device +The device node to classify, like +.Pa /dev/sda . +.It Fl c Ar config_file +Specifies the path to an alternate configuration file. +The default is +.Pa /etc/zfs/vdev_id.conf . 
+.It Fl g Sy sas_direct Ns | Ns Sy sas_switch Ns | Ns Sy scsi Identifies a physical topology that governs how physical paths are -mapped to channels. - -\fIsas_direct\fR - in this mode a channel is uniquely identified by -a PCI slot and a HBA port number - -\fIsas_switch\fR - in this mode a channel is uniquely identified by -a SAS switch port number -.TP -\fB\-m\fR -Specifies that -.BR vdev_id (8) -will handle only dm-multipath devices. If set to "yes" then -.BR vdev_id (8) -will examine the first running component disk of a dm-multipath -device as listed by the -.BR multipath (8) -command to determine the physical path. -.TP -\fB\-p\fR +mapped to channels: +.Bl -tag -compact -width "sas_direct and scsi" +.It Sy sas_direct No and Sy scsi +channels are uniquely identified by a PCI slot and HBA port number +.It Sy sas_switch +channels are uniquely identified by a SAS switch port number +.El +.It Fl m +Only handle dm-multipath devices. +If specified, examine the first running component disk of a dm-multipath +device as provided by the driver to determine the physical path. +.It Fl p Ar phys_per_port Specifies the number of PHY devices associated with a SAS HBA port or SAS switch port. -.BR vdev_id (8) +.Nm internally uses this value to determine which HBA or switch port a -device is connected to. The default is 4. -.TP -\fB\-h\fR +device is connected to. +The default is +.Sy 4 . +.It Fl h Print a usage summary. -.SH SEE ALSO -.LP -\fBvdev_id.conf\fR(5) +.El +. +.Sh SEE ALSO +.Xr vdev_id.conf 5 diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 57403cba74..a8a9442190 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -8,34 +8,36 @@ .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" -.\" .\" Copyright 2012, Richard Lowe. -.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2019 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. 
.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd April 14, 2019 -.Dt ZDB 8 SMM -.Os Linux +.Dd October 7, 2020 +.Dt ZDB 8 +.Os +. .Sh NAME .Nm zdb -.Nd display zpool debugging and consistency information +.Nd display ZFS storage pool debugging and consistency information .Sh SYNOPSIS .Nm -.Op Fl AbcdDFGhikLMPsvXY -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... +.Op Fl AbcdDFGhikLMPsvXYy +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl I Ar inflight I/Os -.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ... +.Oo Fl o Ar var Ns = Ns Ar value Oc Ns … .Op Fl t Ar txg .Op Fl U Ar cache .Op Fl x Ar dumpdir -.Op Ar poolname Op Ar object ... +.Op Ar poolname Ns Op / Ns Ar dataset | objset ID +.Op Ar object Ns | Ns Ar range Ns … .Nm .Op Fl AdiPv -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache -.Ar dataset Op Ar object ... +.Ar poolname Ns Op Ar / Ns Ar dataset | objset ID +.Op Ar object Ns | Ns Ar range Ns … .Nm .Fl C .Op Fl A @@ -43,7 +45,7 @@ .Nm .Fl E .Op Fl A -.Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15 +.Ar word0 : Ns Ar word1 Ns :…: Ns Ar word15 .Nm .Fl l .Op Fl Aqu @@ -51,25 +53,29 @@ .Nm .Fl m .Op Fl AFLPXY -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl t Ar txg .Op Fl U Ar cache -.Ar poolname Op Ar vdev Op Ar metaslab ... +.Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns … .Nm .Fl O .Ar dataset path .Nm +.Fl r +.Ar dataset path destination +.Nm .Fl R .Op Fl A -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache -.Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags +.Ar poolname vdev : Ns Ar offset : Ns Oo Ar lsize Ns / Oc Ns Ar psize Ns Op : Ns Ar flags .Nm .Fl S .Op Fl AP -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache .Ar poolname +. 
.Sh DESCRIPTION The .Nm @@ -95,11 +101,11 @@ or .Qq Sy @ characters, it is interpreted as a pool name. The root dataset can be specified as -.Ar pool Ns / -.Pq pool name followed by a slash . +.Qq Ar pool Ns / . .Pp When operating on an imported and active pool it is possible, though unlikely, that zdb may interpret inconsistent pool data and behave erratically. +. .Sh OPTIONS Display options: .Bl -tag -width Ds @@ -134,8 +140,48 @@ size, and object count. .Pp If specified multiple times provides greater and greater verbosity. .Pp -If object IDs are specified, display information about those specific objects -only. +If object IDs or object ID ranges are specified, display information about +those specific objects or ranges only. +.Pp +An object ID range is specified in terms of a colon-separated tuple of +the form +.Ao start Ac : Ns Ao end Ac Ns Op : Ns Ao flags Ac . +The fields +.Ar start +and +.Ar end +are integer object identifiers that denote the lower and upper bounds +of the range. +An +.Ar end +value of -1 specifies a range with no upper bound. +The +.Ar flags +field optionally specifies a set of flags, described below, that control +which object types are dumped. +By default, all object types are dumped. +A minus sign +.Pq - +negates the effect of the flag that follows it and has no effect unless +preceded by the +.Ar A +flag. +For example, the range 0:-1:A-d will dump all object types except for directories. +.Pp +.Bl -tag -compact -width Ds +.It Sy A +Dump all objects (this is the default) +.It Sy d +Dump ZFS directory objects +.It Sy f +Dump ZFS plain file objects +.It Sy m +Dump SPA space map objects +.It Sy z +Dump ZAP objects +.It Sy - +Negate the effect of next flag +.El .It Fl D Display deduplication statistics, including the deduplication ratio .Pq Sy dedup , @@ -157,7 +203,7 @@ Display the statistics independently for each deduplication table. Dump the contents of the deduplication tables describing duplicate blocks. 
.It Fl DDDDD Also dump the contents of the deduplication tables describing unique blocks. -.It Fl E Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15 +.It Fl E Ar word0 : Ns Ar word1 Ns :…: Ns Ar word15 Decode and display block from an embedded block pointer specified by the .Ar word arguments. @@ -174,30 +220,43 @@ If specified multiple times, display counts of each intent log transaction type. Examine the checkpointed state of the pool. Note, the on disk format of the pool is not reverted to the checkpointed state. .It Fl l Ar device -Read the vdev labels from the specified device. +Read the vdev labels and L2ARC header from the specified device. .Nm Fl l will return 0 if valid label was found, 1 if error occurred, and 2 if no valid -labels were found. Each unique configuration is displayed only once. +labels were found. +The presence of L2ARC header is indicated by a specific +sequence (L2ARC_DEV_HDR_MAGIC). +If there is an accounting error in the size or the number of L2ARC log blocks +.Nm Fl l +will return 1. +Each unique configuration is displayed only once. .It Fl ll Ar device In addition display label space usage stats. +If a valid L2ARC header was found +also display the properties of log blocks used for restoring L2ARC contents +(persistent L2ARC). .It Fl lll Ar device Display every configuration, unique or not. +If a valid L2ARC header was found +also display the properties of log entries in log blocks used for restoring +L2ARC contents (persistent L2ARC). .Pp If the .Fl q -option is also specified, don't print the labels. +option is also specified, don't print the labels or the L2ARC header. .Pp If the .Fl u -option is also specified, also display the uberblocks on this device. Specify -multiple times to increase verbosity. +option is also specified, also display the uberblocks on this device. +Specify multiple times to increase verbosity. .It Fl L Disable leak detection and the loading of space maps. 
By default, .Nm verifies that all non-free blocks are referenced, which can be very expensive. .It Fl m -Display the offset, spacemap, and free space of each metaslab. +Display the offset, spacemap, free space of each metaslab, all the log +spacemaps and their obsolete entry statistics. .It Fl mm Also display information about the on-disk free space histogram associated with each metaslab. @@ -226,8 +285,21 @@ must be relative to the root of This option can be combined with .Fl v for increasing verbosity. +.It Fl r Ar dataset path destination +Copy the specified +.Ar path +inside of the +.Ar dataset +to the specified destination. +Specified +.Ar path +must be relative to the root of +.Ar dataset . +This option can be combined with +.Fl v +for increasing verbosity. .It Xo -.Fl R Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags +.Fl R Ar poolname vdev : Ns Ar offset : Ns Oo Ar lsize Ns / Oc Ns Ar psize Ns Op : Ns Ar flags .Xc Read and display a block from the specified device. By default the block is displayed as a hex dump, but see the description of the @@ -240,17 +312,20 @@ The block is specified in terms of a colon-separated tuple .Ar offset .Pq the offset within the vdev .Ar size -.Pq the size of the block to read -and, optionally, +.Pq the physical size, or logical size / physical size +of the block to read and, optionally, .Ar flags .Pq a set of flags, described below . .Pp .Bl -tag -compact -width "b offset" .It Sy b Ar offset -Print block pointer +Print block pointer at hex offset +.It Sy c +Calculate and display checksums .It Sy d -Decompress the block. Set environment variable -.Nm ZBD_NO_ZLE +Decompress the block. +Set environment variable +.Nm ZDB_NO_ZLE to skip zle when guessing. 
.It Sy e Byte swap the block @@ -260,6 +335,8 @@ Dump gang block header Dump indirect block .It Sy r Dump raw uninterpreted block data +.It Sy v +Verbose output for guessing compression algorithm .El .It Fl s Report statistics on @@ -284,7 +361,7 @@ Enable panic recovery, certain errors which would otherwise be fatal are demoted to warnings. .It Fl AAA Do not abort if asserts fail and also enable panic recovery. -.It Fl e Op Fl p Ar path ... +.It Fl e Oo Fl p Ar path Oc Ns … Operate on an exported pool, not present in .Pa /etc/zfs/zpool.cache . The @@ -314,14 +391,16 @@ The default value is 200. This option affects the performance of the .Fl c option. -.It Fl o Ar var Ns = Ns Ar value ... +.It Fl o Ar var Ns = Ns Ar value … Set the given global libzpool variable to the provided value. The value must be an unsigned 32-bit integer. Currently only little-endian systems are supported to avoid accidentally setting the high 32 bits of 64-bit variables. .It Fl P -Print numbers in an unscaled form more amenable to parsing, eg. 1000000 rather -than 1M. +Print numbers in an unscaled form more amenable to parsing, e.g.\& +.Sy 1000000 +rather than +.Sy 1M . .It Fl t Ar transaction Specify the highest transaction to use when searching for uberblocks. See also the @@ -351,6 +430,12 @@ but read transactions otherwise deemed too old. Attempt all possible combinations when reconstructing indirect split blocks. This flag disables the individual I/O deadman timer in order to allow as much time as required for the attempted reconstruction. +.It Fl y +Perform validation for livelists that are being deleted. +Scans through the livelist and metaslabs, checking for duplicate entries +and compares the two, checking for potential double frees. +If it encounters issues, warnings will be printed, but the command will not +necessarily fail. 
.El .Pp Specifying a display option more than once enables verbosity for only that @@ -358,51 +443,51 @@ option, with more occurrences enabling more verbosity. .Pp If no options are specified, all information about the named pool will be displayed at default verbosity. +. .Sh EXAMPLES .Bl -tag -width Ds .It Xo -.Sy Example 1 +.Sy Example 1 : Display the configuration of imported pool -.Pa rpool +.Ar rpool .Xc .Bd -literal -# zdb -C rpool - +.No # Nm zdb Fl C Ar rpool MOS Configuration: version: 28 name: 'rpool' - ... + … .Ed .It Xo -.Sy Example 2 +.Sy Example 2 : Display basic dataset information about -.Pa rpool +.Ar rpool .Xc .Bd -literal -# zdb -d rpool +.No # Nm zdb Fl d Ar rpool Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects - ... + … .Ed .It Xo -.Sy Example 3 +.Sy Example 3 : Display basic information about object 0 in -.Pa rpool/export/home +.Ar rpool/export/home .Xc .Bd -literal -# zdb -d rpool/export/home 0 +.No # Nm zdb Fl d Ar rpool/export/home 0 Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects Object lvl iblk dblk dsize lsize %full type 0 7 16K 16K 15.0K 16K 25.00 DMU dnode .Ed .It Xo -.Sy Example 4 +.Sy Example 4 : Display the predicted effect of enabling deduplication on -.Pa rpool +.Ar rpool .Xc .Bd -literal -# zdb -S rpool +.No # Nm zdb Fl S Ar rpool Simulated DDT histogram: bucket allocated referenced @@ -411,10 +496,11 @@ refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE ------ ------ ----- ----- ----- ------ ----- ----- ----- 1 694K 27.1G 15.0G 15.0G 694K 27.1G 15.0G 15.0G 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G - ... + … dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00 .Ed .El +. 
.Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 diff --git a/man/man8/zed.8.in b/man/man8/zed.8.in index 097a8f4a7e..d329760520 100644 --- a/man/man8/zed.8.in +++ b/man/man8/zed.8.in @@ -1,9 +1,8 @@ .\" -.\" This file is part of the ZFS Event Daemon (ZED) -.\" for ZFS on Linux (ZoL) . +.\" This file is part of the ZFS Event Daemon (ZED). .\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). .\" Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -.\" Refer to the ZoL git commit log for authoritative copyright attribution. +.\" Refer to the OpenZFS git commit log for authoritative copyright attribution. .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License Version 1.0 (CDDL-1.0). @@ -11,250 +10,246 @@ .\" "OPENSOLARIS.LICENSE" or at . .\" You may not use this file except in compliance with the license. .\" -.TH ZED 8 "Octember 1, 2013" "ZFS on Linux" "System Administration Commands" - -.SH NAME -ZED \- ZFS Event Daemon - -.SH SYNOPSIS -.HP -.B zed -.\" [\fB\-c\fR \fIconfigfile\fR] -[\fB\-d\fR \fIzedletdir\fR] -[\fB\-f\fR] -[\fB\-F\fR] -[\fB\-h\fR] -[\fB\-L\fR] -[\fB\-M\fR] -[\fB\-p\fR \fIpidfile\fR] -[\fB\-P\fR \fIpath\fR] -[\fB\-s\fR \fIstatefile\fR] -[\fB\-v\fR] -[\fB\-V\fR] -[\fB\-Z\fR] - -.SH DESCRIPTION -.PP -\fBZED\fR (ZFS Event Daemon) monitors events generated by the ZFS kernel -module. When a zevent (ZFS Event) is posted, \fBZED\fR will run any ZEDLETs -(ZFS Event Daemon Linkage for Executable Tasks) that have been enabled for the -corresponding zevent class. - -.SH OPTIONS -.TP -.BI \-h +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) +.\" +.Dd May 26, 2021 +.Dt ZED 8 +.Os +. +.Sh NAME +.Nm ZED +.Nd ZFS Event Daemon +.Sh SYNOPSIS +.Nm +.Op Fl fFhILMvVZ +.Op Fl d Ar zedletdir +.Op Fl p Ar pidfile +.Op Fl P Ar path +.Op Fl s Ar statefile +.Op Fl j Ar jobs +. 
+.Sh DESCRIPTION +The +.Nm +(ZFS Event Daemon) monitors events generated by the ZFS kernel +module. +When a zevent (ZFS Event) is posted, the +.Nm +will run any ZEDLETs (ZFS Event Daemon Linkage for Executable Tasks) +that have been enabled for the corresponding zevent class. +. +.Sh OPTIONS +.Bl -tag -width "-h" +.It Fl h Display a summary of the command-line options. -.TP -.BI \-L +.It Fl L Display license information. -.TP -.BI \-V +.It Fl V Display version information. -.TP -.BI \-v +.It Fl v Be verbose. -.TP -.BI \-f +.It Fl f Force the daemon to run if at all possible, disabling security checks and -throwing caution to the wind. Not recommended for use in production. -.TP -.BI \-F -Run the daemon in the foreground. -.TP -.BI \-M +throwing caution to the wind. +Not recommended for use in production. +.It Fl F +Don't daemonise: remain attached to the controlling terminal, +log to the standard I/O streams. +.It Fl M Lock all current and future pages in the virtual memory address space. This may help the daemon remain responsive when the system is under heavy memory pressure. -.TP -.BI \-Z +.It Fl I +Request that the daemon idle rather than exit when the kernel modules are not loaded. +Processing of events will start, or resume, when the kernel modules are (re)loaded. +Under Linux the kernel modules cannot be unloaded while the daemon is running. +.It Fl Z Zero the daemon's state, thereby allowing zevents still within the kernel to be reprocessed. -.\" .TP -.\" .BI \-c\ configfile -.\" Read the configuration from the specified file. -.TP -.BI \-d\ zedletdir +.It Fl d Ar zedletdir Read the enabled ZEDLETs from the specified directory. -.TP -.BI \-p\ pidfile +.It Fl p Ar pidfile Write the daemon's process ID to the specified file. -.TP -.BI \-P\ path -Custom $PATH for zedlets to use. Normally zedlets run in a locked-down -environment, with hardcoded paths to the ZFS commands ($ZFS, $ZPOOL, $ZED, ...), -and a hardcoded $PATH. This is done for security reasons. 
However, the -ZFS test suite uses a custom PATH for its ZFS commands, and passes it to zed -with -P. In short, -P is only to be used by the ZFS test suite; never use +.It Fl P Ar path +Custom +.Ev $PATH +for zedlets to use. +Normally zedlets run in a locked-down environment, with hardcoded paths to the ZFS commands +.Pq Ev $ZFS , $ZPOOL , $ZED , ... , +and a hard-coded +.Ev $PATH . +This is done for security reasons. +However, the ZFS test suite uses a custom PATH for its ZFS commands, and passes it to +.Nm +with +.Fl P . +In short, +.Fl P +is only to be used by the ZFS test suite; never use it in production! -.TP -.BI \-s\ statefile +.It Fl s Ar statefile Write the daemon's state to the specified file. -.SH ZEVENTS -.PP -A zevent is comprised of a list of nvpairs (name/value pairs). Each zevent -contains an EID (Event IDentifier) that uniquely identifies it throughout +.It Fl j Ar jobs +Allow at most +.Ar jobs +ZEDLETs to run concurrently, +delaying execution of new ones until they finish. +Defaults to +.Sy 16 . +.El +.Sh ZEVENTS +A zevent is comprised of a list of nvpairs (name/value pairs). +Each zevent contains an EID (Event IDentifier) that uniquely identifies it throughout the lifetime of the loaded ZFS kernel module; this EID is a monotonically increasing integer that resets to 1 each time the kernel module is loaded. Each zevent also contains a class string that identifies the type of event. For brevity, a subclass string is defined that omits the leading components -of the class string. Additional nvpairs exist to provide event details. -.PP +of the class string. +Additional nvpairs exist to provide event details. +.Pp The kernel maintains a list of recent zevents that can be viewed (along with -their associated lists of nvpairs) using the "\fBzpool events \-v\fR" command. - -.SH CONFIGURATION -.PP +their associated lists of nvpairs) using the +.Nm zpool Cm events Fl v +command. +. 
+.Sh CONFIGURATION ZEDLETs to be invoked in response to zevents are located in the -\fIenabled-zedlets\fR directory. These can be symlinked or copied from the -\fIinstalled-zedlets\fR directory; symlinks allow for automatic updates +.Em enabled-zedlets +directory +.Pq Ar zedletdir . +These can be symlinked or copied from the +.Em installed-zedlets +directory; symlinks allow for automatic updates from the installed ZEDLETs, whereas copies preserve local modifications. -As a security measure, ZEDLETs must be owned by root. They must have -execute permissions for the user, but they must not have write permissions -for group or other. Dotfiles are ignored. -.PP +As a security measure, since ownership change is a privileged operation, +ZEDLETs must be owned by root. +They must have execute permissions for the user, +but they must not have write permissions for group or other. +Dotfiles are ignored. +.Pp ZEDLETs are named after the zevent class for which they should be invoked. In particular, a ZEDLET will be invoked for a given zevent if either its class or subclass string is a prefix of its filename (and is followed by -a non-alphabetic character). As a special case, the prefix "all" matches -all zevents. Multiple ZEDLETs may be invoked for a given zevent. - -.SH ZEDLETS -.PP +a non-alphabetic character). +As a special case, the prefix +.Sy all +matches all zevents. +Multiple ZEDLETs may be invoked for a given zevent. +. +.Sh ZEDLETS ZEDLETs are executables invoked by the ZED in response to a given zevent. They should be written under the presumption they can be invoked concurrently, and they should use appropriate locking to access any shared resources. Common variables used by ZEDLETs can be stored in the default rc file which -is sourced by scripts; these variables should be prefixed with "ZED_". -.PP +is sourced by scripts; these variables should be prefixed with +.Sy ZED_ . +.Pp The zevent nvpairs are passed to ZEDLETs as environment variables. 
Each nvpair name is converted to an environment variable in the following -manner: 1) it is prefixed with "ZEVENT_", 2) it is converted to uppercase, -and 3) each non-alphanumeric character is converted to an underscore. +manner: +.Bl -enum -compact +.It +it is prefixed with +.Sy ZEVENT_ , +.It +it is converted to uppercase, and +.It +each non-alphanumeric character is converted to an underscore. +.El +.Pp Some additional environment variables have been defined to present certain -nvpair values in a more convenient form. An incomplete list of zevent -environment variables is as follows: -.TP -.B -ZEVENT_EID +nvpair values in a more convenient form. +An incomplete list of zevent environment variables is as follows: +.Bl -tag -compact -width "ZEVENT_TIME_STRING" +.It Sy ZEVENT_EID The Event IDentifier. -.TP -.B -ZEVENT_CLASS +.It Sy ZEVENT_CLASS The zevent class string. -.TP -.B -ZEVENT_SUBCLASS +.It Sy ZEVENT_SUBCLASS The zevent subclass string. -.TP -.B -ZEVENT_TIME +.It Sy ZEVENT_TIME The time at which the zevent was posted as -"\fIseconds\fR\ \fInanoseconds\fR" since the Epoch. -.TP -.B -ZEVENT_TIME_SECS -The \fIseconds\fR component of ZEVENT_TIME. -.TP -.B -ZEVENT_TIME_NSECS -The \fInanoseconds\fR component of ZEVENT_TIME. -.TP -.B -ZEVENT_TIME_STRING -An almost-RFC3339-compliant string for ZEVENT_TIME. -.PP +.Dq Em seconds nanoseconds +since the Epoch. +.It Sy ZEVENT_TIME_SECS +The +.Em seconds +component of +.Sy ZEVENT_TIME . +.It Sy ZEVENT_TIME_NSECS +The +.Em nanoseconds +component of +.Sy ZEVENT_TIME . +.It Sy ZEVENT_TIME_STRING +An almost-RFC3339-compliant string for +.Sy ZEVENT_TIME . +.El +.Pp Additionally, the following ZED & ZFS variables are defined: -.TP -.B -ZED_PID +.Bl -tag -compact -width "ZEVENT_TIME_STRING" +.It Sy ZED_PID The daemon's process ID. -.TP -.B -ZED_ZEDLET_DIR -The daemon's current \fIenabled-zedlets\fR directory. -.TP -.B -ZFS_ALIAS -The ZFS alias (\fIname-version-release\fR) string used to build the daemon. 
-.TP -.B -ZFS_VERSION -The ZFS version used to build the daemon. -.TP -.B -ZFS_RELEASE -The ZFS release used to build the daemon. -.PP -ZEDLETs may need to call other ZFS commands. The installation paths of -the following executables are defined: \fBZDB\fR, \fBZED\fR, \fBZFS\fR, -\fBZINJECT\fR, and \fBZPOOL\fR. These variables can be overridden in the -rc file if needed. - -.SH FILES -.\" .TP -.\" @sysconfdir@/zfs/zed.conf -.\" The default configuration file for the daemon. -.TP -.I @sysconfdir@/zfs/zed.d +.It Sy ZED_ZEDLET_DIR +The daemon's current +.Em enabled-zedlets +directory. +.It Sy ZFS_ALIAS +The alias +.Pq Dq Em name Ns - Ns Em version Ns - Ns Em release +string of the ZFS distribution the daemon is part of. +.It Sy ZFS_VERSION +The ZFS version the daemon is part of. +.It Sy ZFS_RELEASE +The ZFS release the daemon is part of. +.El +.Pp +ZEDLETs may need to call other ZFS commands. +The installation paths of the following executables are defined as environment variables: +.Sy ZDB , +.Sy ZED , +.Sy ZFS , +.Sy ZINJECT , +and +.Sy ZPOOL . +These variables may be overridden in the rc file. +. +.Sh FILES +.Bl -tag -width "-c" +.It Pa @sysconfdir@/zfs/zed.d The default directory for enabled ZEDLETs. -.TP -.I @sysconfdir@/zfs/zed.d/zed.rc +.It Pa @sysconfdir@/zfs/zed.d/zed.rc The default rc file for common variables used by ZEDLETs. -.TP -.I @zfsexecdir@/zed.d +.It Pa @zfsexecdir@/zed.d The default directory for installed ZEDLETs. -.TP -.I @runstatedir@/zed.pid +.It Pa @runstatedir@/zed.pid The default file containing the daemon's process ID. -.TP -.I @runstatedir@/zed.state +.It Pa @runstatedir@/zed.state The default file containing the daemon's state. - -.SH SIGNALS -.TP -.B HUP +.El +. +.Sh SIGNALS +.Bl -tag -width "-c" +.It Sy SIGHUP Reconfigure the daemon and rescan the directory for enabled ZEDLETs. -.TP -.B TERM +.It Sy SIGTERM , SIGINT Terminate the daemon. - -.SH NOTES -.PP -\fBZED\fR requires root privileges. -.\" Do not taunt zed. 
- -.SH BUGS -.PP -Events are processed synchronously by a single thread. This can delay the -processing of simultaneous zevents. -.PP -There is no maximum timeout for ZEDLET execution. Consequently, a misbehaving -ZEDLET can delay the processing of subsequent zevents. -.PP -The ownership and permissions of the \fIenabled-zedlets\fR directory (along -with all parent directories) are not checked. If any of these directories -are improperly owned or permissioned, an unprivileged user could insert a -ZEDLET to be executed as root. The requirement that ZEDLETs be owned by -root mitigates this to some extent. -.PP +.El +. +.Sh SEE ALSO +.Xr zfs 8 , +.Xr zpool 8 , +.Xr zpool-events 8 +. +.Sh NOTES +The +.Nm +requires root privileges. +.Pp +Do not taunt the +.Nm . +. +.Sh BUGS ZEDLETs are unable to return state/status information to the kernel. -.PP -Some zevent nvpair types are not handled. These are denoted by zevent -environment variables having a "_NOT_IMPLEMENTED_" value. -.PP +.Pp Internationalization support via gettext has not been added. -.PP -The configuration file is not yet implemented. -.PP -The diagnosis engine is not yet implemented. - -.SH LICENSE -.PP -\fBZED\fR (ZFS Event Daemon) is distributed under the terms of the -Common Development and Distribution License Version 1.0 (CDDL\-1.0). -.PP -Developed at Lawrence Livermore National Laboratory (LLNL\-CODE\-403049). - -.SH SEE ALSO -.BR zfs (8), -.BR zpool (8) diff --git a/man/man8/zfs-allow.8 b/man/man8/zfs-allow.8 new file mode 100644 index 0000000000..bbd62edc28 --- /dev/null +++ b/man/man8/zfs-allow.8 @@ -0,0 +1,386 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. 
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 27, 2021 +.Dt ZFS-ALLOW 8 +.Os +. 
+.Sh NAME +.Nm zfs-allow +.Nd delegate ZFS administration permissions to unprivileged users +.Sh SYNOPSIS +.Nm zfs +.Cm allow +.Op Fl dglu +.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm allow +.Op Fl dl +.Fl e Ns | Ns Sy everyone +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm allow +.Fl c +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm allow +.Fl s No @ Ns Ar setname +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm unallow +.Op Fl dglru +.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm unallow +.Op Fl dlr +.Fl e Ns | Ns Sy everyone +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm unallow +.Op Fl r +.Fl c +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm unallow +.Op Fl r +.Fl s No @ Ns Ar setname +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm allow +.Ar filesystem Ns | Ns Ar volume +.Xc +Displays permissions that have been delegated on the specified filesystem or +volume. +See the other forms of +.Nm zfs Cm allow +for more information. 
+.Pp +Delegations are supported under Linux with the exception of +.Sy mount , +.Sy unmount , +.Sy mountpoint , +.Sy canmount , +.Sy rename , +and +.Sy share . +These permissions cannot be delegated because the Linux +.Xr mount 8 +command restricts modifications of the global namespace to the root user. +.It Xo +.Nm zfs +.Cm allow +.Op Fl dglu +.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Xc +.It Xo +.Nm zfs +.Cm allow +.Op Fl dl +.Fl e Ns | Ns Sy everyone +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Xc +Delegates ZFS administration permission for the file systems to non-privileged +users. +.Bl -tag -width "-d" +.It Fl d +Allow only for the descendent file systems. +.It Fl e Ns | Ns Sy everyone +Specifies that the permissions be delegated to everyone. +.It Fl g Ar group Ns Oo , Ns Ar group Oc Ns … +Explicitly specify that permissions are delegated to the group. +.It Fl l +Allow +.Qq locally +only for the specified file system. +.It Fl u Ar user Ns Oo , Ns Ar user Oc Ns … +Explicitly specify that permissions are delegated to the user. +.It Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … +Specifies to whom the permissions are delegated. +Multiple entities can be specified as a comma-separated list. +If neither of the +.Fl gu +options are specified, then the argument is interpreted preferentially as the +keyword +.Sy everyone , +then as a user name, and lastly as a group name. +To specify a user or group named +.Qq everyone , +use the +.Fl g +or +.Fl u +options. +To specify a group with the same name as a user, use the +.Fl g +option. +.It Xo +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Xc +The permissions to delegate.
+Multiple permissions may be specified as a comma-separated list. +Permission names are the same as ZFS subcommand and property names. +See the property list below. +Property set names, which begin with +.Sy @ , +may be specified. +See the +.Fl s +form below for details. +.El +.Pp +If neither of the +.Fl dl +options are specified, or both are, then the permissions are allowed for the +file system or volume, and all of its descendents. +.Pp +Permissions are generally the ability to use a ZFS subcommand or change a ZFS +property. +The following permissions are available: +.TS +l l l . +NAME TYPE NOTES +_ _ _ +allow subcommand Must also have the permission that is being allowed +bookmark subcommand +clone subcommand Must also have the \fBcreate\fR ability and \fBmount\fR ability in the origin file system +create subcommand Must also have the \fBmount\fR ability. Must also have the \fBrefreservation\fR ability to create a non-sparse volume. +destroy subcommand Must also have the \fBmount\fR ability +diff subcommand Allows lookup of paths within a dataset given an object number, and the ability to create snapshots necessary to \fBzfs diff\fR. +hold subcommand Allows adding a user hold to a snapshot +load-key subcommand Allows loading and unloading of encryption key (see \fBzfs load-key\fR and \fBzfs unload-key\fR). +change-key subcommand Allows changing an encryption key via \fBzfs change-key\fR. 
+mount subcommand Allows mounting/umounting ZFS datasets +promote subcommand Must also have the \fBmount\fR and \fBpromote\fR ability in the origin file system +receive subcommand Must also have the \fBmount\fR and \fBcreate\fR ability +release subcommand Allows releasing a user hold which might destroy the snapshot +rename subcommand Must also have the \fBmount\fR and \fBcreate\fR ability in the new parent +rollback subcommand Must also have the \fBmount\fR ability +send subcommand +share subcommand Allows sharing file systems over NFS or SMB protocols +snapshot subcommand Must also have the \fBmount\fR ability + +groupquota other Allows accessing any \fBgroupquota@\fI...\fR property +groupobjquota other Allows accessing any \fBgroupobjquota@\fI...\fR property +groupused other Allows reading any \fBgroupused@\fI...\fR property +groupobjused other Allows reading any \fBgroupobjused@\fI...\fR property +userprop other Allows changing any user property +userquota other Allows accessing any \fBuserquota@\fI...\fR property +userobjquota other Allows accessing any \fBuserobjquota@\fI...\fR property +userused other Allows reading any \fBuserused@\fI...\fR property +userobjused other Allows reading any \fBuserobjused@\fI...\fR property +projectobjquota other Allows accessing any \fBprojectobjquota@\fI...\fR property +projectquota other Allows accessing any \fBprojectquota@\fI...\fR property +projectobjused other Allows reading any \fBprojectobjused@\fI...\fR property +projectused other Allows reading any \fBprojectused@\fI...\fR property + +aclinherit property +aclmode property +acltype property +atime property +canmount property +casesensitivity property +checksum property +compression property +context property +copies property +dedup property +defcontext property +devices property +dnodesize property +encryption property +exec property +filesystem_limit property +fscontext property +keyformat property +keylocation property +logbias property +mlslabel property 
+mountpoint property +nbmand property +normalization property +overlay property +pbkdf2iters property +primarycache property +quota property +readonly property +recordsize property +redundant_metadata property +refquota property +refreservation property +relatime property +reservation property +rootcontext property +secondarycache property +setuid property +sharenfs property +sharesmb property +snapdev property +snapdir property +snapshot_limit property +special_small_blocks property +sync property +utf8only property +version property +volblocksize property +volmode property +volsize property +vscan property +xattr property +zoned property +.TE +.It Xo +.Nm zfs +.Cm allow +.Fl c +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Xc +Sets +.Qq create time +permissions. +These permissions are granted +.Pq locally +to the creator of any newly-created descendent file system. +.It Xo +.Nm zfs +.Cm allow +.Fl s No @ Ns Ar setname +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … +.Ar filesystem Ns | Ns Ar volume +.Xc +Defines or adds permissions to a permission set. +The set can be used by other +.Nm zfs Cm allow +commands for the specified file system and its descendents. +Sets are evaluated dynamically, so changes to a set are immediately reflected. +Permission sets follow the same naming restrictions as ZFS file systems, but the +name must begin with +.Sy @ , +and can be no more than 64 characters long. 
+.It Xo +.Nm zfs +.Cm unallow +.Op Fl dglru +.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +.Xc +.It Xo +.Nm zfs +.Cm unallow +.Op Fl dlr +.Fl e Ns | Ns Sy everyone +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +.Xc +.It Xo +.Nm zfs +.Cm unallow +.Op Fl r +.Fl c +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +.Xc +Removes permissions that were granted with the +.Nm zfs Cm allow +command. +No permissions are explicitly denied, so other permissions granted are still in +effect. +For example, if the permission is granted by an ancestor. +If no permissions are specified, then all permissions for the specified +.Ar user , +.Ar group , +or +.Sy everyone +are removed. +Specifying +.Sy everyone +.Po or using the +.Fl e +option +.Pc +only removes the permissions that were granted to everyone, not all permissions +for every user and group. +See the +.Nm zfs Cm allow +command for a description of the +.Fl ldugec +options. +.Bl -tag -width "-r" +.It Fl r +Recursively remove the permissions from this file system and all descendents. +.El +.It Xo +.Nm zfs +.Cm unallow +.Op Fl r +.Fl s No @ Ns Ar setname +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns … Oc +.Ar filesystem Ns | Ns Ar volume +.Xc +Removes permissions from a permission set. +If no permissions are specified, then all permissions are removed, thus removing +the set entirely. 
+.El diff --git a/man/man8/zfs-bookmark.8 b/man/man8/zfs-bookmark.8 new file mode 100644 index 0000000000..094a7b3090 --- /dev/null +++ b/man/man8/zfs-bookmark.8 @@ -0,0 +1,67 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" Copyright (c) 2019, 2020 by Christian Schwarz. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZFS-BOOKMARK 8 +.Os +. +.Sh NAME +.Nm zfs-bookmark +.Nd create bookmark of ZFS snapshot +.Sh SYNOPSIS +.Nm zfs +.Cm bookmark +.Ar snapshot Ns | Ns Ar bookmark +.Ar newbookmark +. +.Sh DESCRIPTION +Creates a new bookmark of the given snapshot or bookmark. 
+Bookmarks mark the point in time when the snapshot was created, and can be used +as the incremental source for a +.Nm zfs Cm send . +.Pp +When creating a bookmark from an existing redaction bookmark, the resulting +bookmark is +.Em not +a redaction bookmark. +.Pp +This feature must be enabled to be used. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy bookmarks +feature. +. +.Sh SEE ALSO +.Xr zfs-destroy 8 , +.Xr zfs-send 8 , +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-change-key.8 b/man/man8/zfs-change-key.8 new file mode 120000 index 0000000000..d027a419d1 --- /dev/null +++ b/man/man8/zfs-change-key.8 @@ -0,0 +1 @@ +zfs-load-key.8 \ No newline at end of file diff --git a/man/man8/zfs-clone.8 b/man/man8/zfs-clone.8 new file mode 100644 index 0000000000..0640244f20 --- /dev/null +++ b/man/man8/zfs-clone.8 @@ -0,0 +1,70 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. 
All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 27, 2021 +.Dt ZFS-CLONE 8 +.Os +. +.Sh NAME +.Nm zfs-clone +.Nd clone snapshot of ZFS dataset +.Sh SYNOPSIS +.Nm zfs +.Cm clone +.Op Fl p +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Ar snapshot Ar filesystem Ns | Ns Ar volume +. +.Sh DESCRIPTION +See the +.Sx Clones +section of +.Xr zfsconcepts 7 +for details. +The target dataset can be located anywhere in the ZFS hierarchy, +and is created as the same type as the original. +.Bl -tag -width Ds +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property; see +.Nm zfs Cm create +for details. +.It Fl p +Creates all the non-existing parent datasets. +Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +If the target filesystem or volume already exists, the operation completes +successfully. +.El +. +.Sh SEE ALSO +.Xr zfs-promote 8 , +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-create.8 b/man/man8/zfs-create.8 new file mode 100644 index 0000000000..55397fa661 --- /dev/null +++ b/man/man8/zfs-create.8 @@ -0,0 +1,249 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd December 1, 2020 +.Dt ZFS-CREATE 8 +.Os +. +.Sh NAME +.Nm zfs-create +.Nd create ZFS dataset +.Sh SYNOPSIS +.Nm zfs +.Cm create +.Op Fl Pnpuv +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Ar filesystem +.Nm zfs +.Cm create +.Op Fl ps +.Op Fl b Ar blocksize +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Fl V Ar size Ar volume +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm create +.Op Fl Pnpuv +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Ar filesystem +.Xc +Creates a new ZFS file system. +The file system is automatically mounted according to the +.Sy mountpoint +property inherited from the parent, unless the +.Fl u +option is used. +.Bl -tag -width "-o" +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property as if the command +.Nm zfs Cm set Ar property Ns = Ns Ar value +was invoked at the same time the dataset was created. +Any editable ZFS property can also be set at creation time. +Multiple +.Fl o +options can be specified. +An error results if the same property is specified in multiple +.Fl o +options. +.It Fl p +Creates all the non-existing parent datasets. 
+Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +Any property specified on the command line using the +.Fl o +option is ignored. +If the target filesystem already exists, the operation completes successfully. +.It Fl n +Do a dry-run +.Pq Qq No-op +creation. +No datasets will be created. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to validate properties that are passed via +.Fl o +options and those implied by other options. +The actual dataset creation can still fail due to insufficient privileges or +available capacity. +.It Fl P +Print machine-parsable verbose information about the created dataset. +Each line of output contains a key and one or two values, all separated by tabs. +The +.Sy create_ancestors +and +.Sy create +keys have +.Em filesystem +as their only value. +The +.Sy create_ancestors +key only appears if the +.Fl p +option is used. +The +.Sy property +key has two values, a property name and that property's value. +The +.Sy property +key may appear zero or more times, once for each property that will be set local +to +.Em filesystem +due to the use of the +.Fl o +option. +.It Fl u +Do not mount the newly created file system. +.It Fl v +Print verbose information about the created dataset. +.El +.It Xo +.Nm zfs +.Cm create +.Op Fl ps +.Op Fl b Ar blocksize +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Fl V Ar size Ar volume +.Xc +Creates a volume of the given size. +The volume is exported as a block device in +.Pa /dev/zvol/path , +where +.Em path +is the name of the volume in the ZFS namespace. +The size represents the logical size as exported by the device. +By default, a reservation of equal size is created. +.Pp +.Ar size +is automatically rounded up to the nearest multiple of the +.Sy blocksize . +.Bl -tag -width "-b" +.It Fl b Ar blocksize +Equivalent to +.Fl o Sy volblocksize Ns = Ns Ar blocksize .
+If this option is specified in conjunction with +.Fl o Sy volblocksize , +the resulting behavior is undefined. +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property as if the +.Nm zfs Cm set Ar property Ns = Ns Ar value +command was invoked at the same time the dataset was created. +Any editable ZFS property can also be set at creation time. +Multiple +.Fl o +options can be specified. +An error results if the same property is specified in multiple +.Fl o +options. +.It Fl p +Creates all the non-existing parent datasets. +Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +Any property specified on the command line using the +.Fl o +option is ignored. +If the target filesystem already exists, the operation completes successfully. +.It Fl s +Creates a sparse volume with no reservation. +See +.Sy volsize +in the +.Em Native Properties +section of +.Xr zfsprops 7 +for more information about sparse volumes. +.It Fl n +Do a dry-run +.Pq Qq No-op +creation. +No datasets will be created. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to validate properties that are passed via +.Fl o +options and those implied by other options. +The actual dataset creation can still fail due to insufficient privileges or +available capacity. +.It Fl P +Print machine-parsable verbose information about the created dataset. +Each line of output contains a key and one or two values, all separated by tabs. +The +.Sy create_ancestors +and +.Sy create +keys have +.Em volume +as their only value. +The +.Sy create_ancestors +key only appears if the +.Fl p +option is used. +The +.Sy property +key has two values, a property name and that property's value. +The +.Sy property +key may appear zero or more times, once for each property that will be set local +to +.Em volume +due to the use of the +.Fl b +or +.Fl o +options, as well as +.Sy refreservation +if the volume is not sparse. 
+.It Fl v +Print verbose information about the created dataset. +.El +.El +.Ss ZFS Volumes as Swap +ZFS volumes may be used as swap devices. +After creating the volume with the +.Nm zfs Cm create Fl V , +enable the swap area using the +.Xr swapon 8 +command. +Swapping to files on ZFS filesystems is not supported. +. +.Sh SEE ALSO +.Xr zfs-destroy 8 , +.Xr zfs-list 8 , +.Xr zpool-create 8 diff --git a/man/man8/zfs-destroy.8 b/man/man8/zfs-destroy.8 new file mode 100644 index 0000000000..51d9b7ab8e --- /dev/null +++ b/man/man8/zfs-destroy.8 @@ -0,0 +1,178 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. 
+.\" +.Dd June 30, 2019 +.Dt ZFS-DESTROY 8 +.Os +. +.Sh NAME +.Nm zfs-destroy +.Nd destroy ZFS dataset, snapshots, or bookmark +.Sh SYNOPSIS +.Nm zfs +.Cm destroy +.Op Fl Rfnprv +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm destroy +.Op Fl Rdnprv +.Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns +.Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns … +.Nm zfs +.Cm destroy +.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm destroy +.Op Fl Rfnprv +.Ar filesystem Ns | Ns Ar volume +.Xc +Destroys the given dataset. +By default, the command unshares any file systems that are currently shared, +unmounts any file systems that are currently mounted, and refuses to destroy a +dataset that has active dependents +.Pq children or clones . +.Bl -tag -width "-R" +.It Fl R +Recursively destroy all dependents, including cloned file systems outside the +target hierarchy. +.It Fl f +Forcibly unmount file systems. +This option has no effect on non-file systems or unmounted file systems. +.It Fl n +Do a dry-run +.Pq Qq No-op +deletion. +No data will be deleted. +This is useful in conjunction with the +.Fl v +or +.Fl p +flags to determine what data would be deleted. +.It Fl p +Print machine-parsable verbose information about the deleted data. +.It Fl r +Recursively destroy all children. +.It Fl v +Print verbose information about the deleted data. +.El +.Pp +Extreme care should be taken when applying either the +.Fl r +or the +.Fl R +options, as they can destroy large portions of a pool and cause unexpected +behavior for mounted file systems in use. +.It Xo +.Nm zfs +.Cm destroy +.Op Fl Rdnprv +.Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns +.Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns … +.Xc +The given snapshots are destroyed immediately if and only if the +.Nm zfs Cm destroy +command without the +.Fl d +option would have destroyed it. 
+Such immediate destruction would occur, for example, if the snapshot had no +clones and the user-initiated reference count were zero. +.Pp +If a snapshot does not qualify for immediate destruction, it is marked for +deferred deletion. +In this state, it exists as a usable, visible snapshot until both of the +preconditions listed above are met, at which point it is destroyed. +.Pp +An inclusive range of snapshots may be specified by separating the first and +last snapshots with a percent sign. +The first and/or last snapshots may be left blank, in which case the +filesystem's oldest or newest snapshot will be implied. +.Pp +Multiple snapshots +.Pq or ranges of snapshots +of the same filesystem or volume may be specified in a comma-separated list of +snapshots. +Only the snapshot's short name +.Po the part after the +.Sy @ +.Pc +should be specified when using a range or comma-separated list to identify +multiple snapshots. +.Bl -tag -width "-R" +.It Fl R +Recursively destroy all clones of these snapshots, including the clones, +snapshots, and children. +If this flag is specified, the +.Fl d +flag will have no effect. +.It Fl d +Destroy immediately. +If a snapshot cannot be destroyed now, mark it for deferred destruction. +.It Fl n +Do a dry-run +.Pq Qq No-op +deletion. +No data will be deleted. +This is useful in conjunction with the +.Fl p +or +.Fl v +flags to determine what data would be deleted. +.It Fl p +Print machine-parsable verbose information about the deleted data. +.It Fl r +Destroy +.Pq or mark for deferred deletion +all snapshots with this name in descendent file systems. +.It Fl v +Print verbose information about the deleted data. +.Pp +Extreme care should be taken when applying either the +.Fl r +or the +.Fl R +options, as they can destroy large portions of a pool and cause unexpected +behavior for mounted file systems in use. 
+.El +.It Xo +.Nm zfs +.Cm destroy +.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark +.Xc +The given bookmark is destroyed. +.El +. +.Sh SEE ALSO +.Xr zfs-create 8 , +.Xr zfs-hold 8 diff --git a/man/man8/zfs-diff.8 b/man/man8/zfs-diff.8 new file mode 100644 index 0000000000..49443bf47d --- /dev/null +++ b/man/man8/zfs-diff.8 @@ -0,0 +1,98 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 29, 2021 +.Dt ZFS-DIFF 8 +.Os +. +.Sh NAME +.Nm zfs-diff +.Nd show difference between ZFS snapshots +.Sh SYNOPSIS +.Nm zfs +.Cm diff +.Op Fl FHt +.Ar snapshot Ar snapshot Ns | Ns Ar filesystem +. 
+.Sh DESCRIPTION +Display the difference between a snapshot of a given filesystem and another +snapshot of that filesystem from a later time or the current contents of the +filesystem. +The first column is a character indicating the type of change, the other columns +indicate pathname, new pathname +.Pq in case of rename , +change in link count, and optionally file type and/or change time. +The types of change are: +.Bl -tag -compact -offset Ds -width "M" +.It Sy - +The path has been removed +.It Sy + +The path has been created +.It Sy M +The path has been modified +.It Sy R +The path has been renamed +.El +.Bl -tag -width "-F" +.It Fl F +Display an indication of the type of file, in a manner similar to the +.Fl F +option of +.Xr ls 1 . +.Bl -tag -compact -offset 2n -width "B" +.It Sy B +Block device +.It Sy C +Character device +.It Sy / +Directory +.It Sy > +Door +.It Sy |\& +Named pipe +.It Sy @ +Symbolic link +.It Sy P +Event port +.It Sy = +Socket +.It Sy F +Regular file +.El +.It Fl H +Give more parsable tab-separated output, without header lines and without +arrows. +.It Fl t +Display the path's inode change time as the first column of output. +.El +. +.Sh SEE ALSO +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-get.8 b/man/man8/zfs-get.8 new file mode 120000 index 0000000000..c70b41ae40 --- /dev/null +++ b/man/man8/zfs-get.8 @@ -0,0 +1 @@ +zfs-set.8 \ No newline at end of file diff --git a/man/man8/zfs-groupspace.8 b/man/man8/zfs-groupspace.8 new file mode 120000 index 0000000000..8bc2f1df30 --- /dev/null +++ b/man/man8/zfs-groupspace.8 @@ -0,0 +1 @@ +zfs-userspace.8 \ No newline at end of file diff --git a/man/man8/zfs-hold.8 b/man/man8/zfs-hold.8 new file mode 100644 index 0000000000..5e4652092e --- /dev/null +++ b/man/man8/zfs-hold.8 @@ -0,0 +1,112 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). 
+.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-HOLD 8 +.Os +. +.Sh NAME +.Nm zfs-hold +.Nd hold ZFS snapshots to prevent their removal +.Sh SYNOPSIS +.Nm zfs +.Cm hold +.Op Fl r +.Ar tag Ar snapshot Ns … +.Nm zfs +.Cm holds +.Op Fl rH +.Ar snapshot Ns … +.Nm zfs +.Cm release +.Op Fl r +.Ar tag Ar snapshot Ns … +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm hold +.Op Fl r +.Ar tag Ar snapshot Ns … +.Xc +Adds a single reference, named with the +.Ar tag +argument, to the specified snapshots. +Each snapshot has its own tag namespace, and tags must be unique within that +space. +.Pp +If a hold exists on a snapshot, attempts to destroy that snapshot by using the +.Nm zfs Cm destroy +command return +.Sy EBUSY . 
+.Bl -tag -width "-r" +.It Fl r +Specifies that a hold with the given tag is applied recursively to the snapshots +of all descendent file systems. +.El +.It Xo +.Nm zfs +.Cm holds +.Op Fl rH +.Ar snapshot Ns … +.Xc +Lists all existing user references for the given snapshot or snapshots. +.Bl -tag -width "-r" +.It Fl r +Lists the holds that are set on the named descendent snapshots, in addition to +listing the holds on the named snapshot. +.It Fl H +Do not print headers, use tab-delimited output. +.El +.It Xo +.Nm zfs +.Cm release +.Op Fl r +.Ar tag Ar snapshot Ns … +.Xc +Removes a single reference, named with the +.Ar tag +argument, from the specified snapshot or snapshots. +The tag must already exist for each snapshot. +If a hold exists on a snapshot, attempts to destroy that snapshot by using the +.Nm zfs Cm destroy +command return +.Sy EBUSY . +.Bl -tag -width "-r" +.It Fl r +Recursively releases a hold with the given tag on the snapshots of all +descendent file systems. +.El +.El +. +.Sh SEE ALSO +.Xr zfs-destroy 8 diff --git a/man/man8/zfs-inherit.8 b/man/man8/zfs-inherit.8 new file mode 120000 index 0000000000..c70b41ae40 --- /dev/null +++ b/man/man8/zfs-inherit.8 @@ -0,0 +1 @@ +zfs-set.8 \ No newline at end of file diff --git a/man/man8/zfs-jail.8 b/man/man8/zfs-jail.8 new file mode 100644 index 0000000000..4f9faaea9b --- /dev/null +++ b/man/man8/zfs-jail.8 @@ -0,0 +1,123 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. 
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2011, Pawel Jakub Dawidek +.\" Copyright (c) 2012, Glen Barber +.\" Copyright (c) 2012, Bryan Drewery +.\" Copyright (c) 2013, Steven Hartland +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright (c) 2014, Xin LI +.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. +.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 27, 2021 +.Dt ZFS-JAIL 8 +.Os +. +.Sh NAME +.Nm zfs-jail +.Nd attach or detach ZFS filesystem from FreeBSD jail +.Sh SYNOPSIS +.Nm zfs Cm jail +.Ar jailid Ns | Ns Ar jailname +.Ar filesystem +.Nm zfs Cm unjail +.Ar jailid Ns | Ns Ar jailname +.Ar filesystem +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm jail +.Ar jailid Ns | Ns Ar jailname +.Ar filesystem +.Xc +Attach the specified +.Ar filesystem +to the jail identified by JID +.Ar jailid +or name +.Ar jailname . +From now on this file system tree can be managed from within a jail if the +.Sy jailed +property has been set. 
+To use this functionality, the jail needs the +.Sy allow.mount +and +.Sy allow.mount.zfs +parameters set to +.Sy 1 +and the +.Sy enforce_statfs +parameter set to a value lower than +.Sy 2 . +.Pp +You cannot attach a jailed dataset's children to another jail. +You also cannot attach the root file system +of the jail or any dataset which needs to be mounted before the zfs rc script +is run inside the jail, as it would be attached unmounted until it is +mounted from the rc script inside the jail. +.Pp +To allow management of the dataset from within a jail, the +.Sy jailed +property has to be set and the jail needs access to the +.Pa /dev/zfs +device. +The +.Sy quota +property cannot be changed from within a jail. +.Pp +After a dataset is attached to a jail and the +.Sy jailed +property is set, a jailed file system cannot be mounted outside the jail, +since the jail administrator might have set the mount point to an unacceptable value. +.Pp +See +.Xr jail 8 +for more information on managing jails. +Jails are a +.Fx +feature and are not relevant on other platforms. +.It Xo +.Nm zfs +.Cm unjail +.Ar jailid Ns | Ns Ar jailname +.Ar filesystem +.Xc +Detaches the specified +.Ar filesystem +from the jail identified by JID +.Ar jailid +or name +.Ar jailname . +.El +.Sh SEE ALSO +.Xr zfsprops 7 , +.Xr jail 8 diff --git a/man/man8/zfs-list.8 b/man/man8/zfs-list.8 new file mode 100644 index 0000000000..5200483868 --- /dev/null +++ b/man/man8/zfs-list.8 @@ -0,0 +1,162 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. 
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 27, 2021 +.Dt ZFS-LIST 8 +.Os +. +.Sh NAME +.Nm zfs-list +.Nd list properties of ZFS datasets +.Sh SYNOPSIS +.Nm zfs +.Cm list +.Op Fl r Ns | Ns Fl d Ar depth +.Op Fl Hp +.Oo Fl o Ar property Ns Oo , Ns Ar property Oc Ns … Oc +.Oo Fl s Ar property Oc Ns … +.Oo Fl S Ar property Oc Ns … +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc +.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Oc Ns … +. +.Sh DESCRIPTION +If specified, you can list property information by the absolute pathname or the +relative pathname. +By default, all file systems and volumes are displayed. +Snapshots are displayed if the +.Sy listsnapshots +pool property is +.Sy on +.Po the default is +.Sy off +.Pc , +or if the +.Fl t Sy snapshot +or +.Fl t Sy all +options are specified. +The following fields are displayed: +.Sy name , Sy used , Sy available , Sy referenced , Sy mountpoint . +.Bl -tag -width "-H" +.It Fl H +Used for scripting mode. +Do not print headers and separate fields by a single tab instead of arbitrary +white space. 
+.It Fl S Ar property +Same as the +.Fl s +option, but sorts by property in descending order. +.It Fl d Ar depth +Recursively display any children of the dataset, limiting the recursion to +.Ar depth . +A +.Ar depth +of +.Sy 1 +will display only the dataset and its direct children. +.It Fl o Ar property +A comma-separated list of properties to display. +The property must be: +.Bl -bullet -compact +.It +One of the properties described in the +.Sx Native Properties +section of +.Xr zfsprops 7 +.It +A user property +.It +The value +.Sy name +to display the dataset name +.It +The value +.Sy space +to display space usage properties on file systems and volumes. +This is a shortcut for specifying +.Fl o Ns \ \& Ns Sy name , Ns Sy avail , Ns Sy used , Ns Sy usedsnap , Ns +.Sy usedds , Ns Sy usedrefreserv , Ns Sy usedchild +.Fl t Sy filesystem , Ns Sy volume . +.El +.It Fl p +Display numbers in parsable +.Pq exact +values. +.It Fl r +Recursively display any children of the dataset on the command line. +.It Fl s Ar property +A property for sorting the output by column in ascending order based on the +value of the property. +The property must be one of the properties described in the +.Sx Properties +section of +.Xr zfsprops 7 +or the value +.Sy name +to sort by the dataset name. +Multiple properties can be specified at one time using multiple +.Fl s +property options. +Multiple +.Fl s +options are evaluated from left to right in decreasing order of importance. +The following is a list of sorting criteria: +.Bl -bullet -compact +.It +Numeric types sort in numeric order. +.It +String types sort in alphabetical order. +.It +Types inappropriate for a row sort that row to the literal bottom, regardless of +the specified ordering. +.El +.Pp +If no sorting options are specified the existing behavior of +.Nm zfs Cm list +is preserved. 
+.It Fl t Ar type +A comma-separated list of types to display, where +.Ar type +is one of +.Sy filesystem , +.Sy snapshot , +.Sy volume , +.Sy bookmark , +or +.Sy all . +For example, specifying +.Fl t Sy snapshot +displays only snapshots. +.El +. +.Sh SEE ALSO +.Xr zfsprops 7 , +.Xr zfs-get 8 diff --git a/man/man8/zfs-load-key.8 b/man/man8/zfs-load-key.8 new file mode 100644 index 0000000000..ed89b65d71 --- /dev/null +++ b/man/man8/zfs-load-key.8 @@ -0,0 +1,301 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd January 13, 2020 +.Dt ZFS-LOAD-KEY 8 +.Os +. 
+.Sh NAME +.Nm zfs-load-key +.Nd load, unload, or change encryption key of ZFS dataset +.Sh SYNOPSIS +.Nm zfs +.Cm load-key +.Op Fl nr +.Op Fl L Ar keylocation +.Fl a Ns | Ns Ar filesystem +.Nm zfs +.Cm unload-key +.Op Fl r +.Fl a Ns | Ns Ar filesystem +.Nm zfs +.Cm change-key +.Op Fl l +.Op Fl o Ar keylocation Ns = Ns Ar value +.Op Fl o Ar keyformat Ns = Ns Ar value +.Op Fl o Ar pbkdf2iters Ns = Ns Ar value +.Ar filesystem +.Nm zfs +.Cm change-key +.Fl i +.Op Fl l +.Ar filesystem +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm load-key +.Op Fl nr +.Op Fl L Ar keylocation +.Fl a Ns | Ns Ar filesystem +.Xc +Load the key for +.Ar filesystem , +allowing it and all children that inherit the +.Sy keylocation +property to be accessed. +The key will be expected in the format specified by the +.Sy keyformat +and location specified by the +.Sy keylocation +property. +Note that if the +.Sy keylocation +is set to +.Sy prompt +the terminal will interactively wait for the key to be entered. +Loading a key will not automatically mount the dataset. +If that functionality is desired, +.Nm zfs Cm mount Fl l +will ask for the key and mount the dataset +.Po +see +.Xr zfs-mount 8 +.Pc . +Once the key is loaded the +.Sy keystatus +property will become +.Sy available . +.Bl -tag -width "-r" +.It Fl r +Recursively loads the keys for the specified filesystem and all descendent +encryption roots. +.It Fl a +Loads the keys for all encryption roots in all imported pools. +.It Fl n +Do a dry-run +.Pq Qq No-op +.Cm load-key . +This will cause +.Nm zfs +to simply check that the provided key is correct. +This command may be run even if the key is already loaded. +.It Fl L Ar keylocation +Use +.Ar keylocation +instead of the +.Sy keylocation +property. +This will not change the value of the property on the dataset. +Note that if used with either +.Fl r +or +.Fl a , +.Ar keylocation +may only be given as +.Sy prompt . 
+.El +.It Xo +.Nm zfs +.Cm unload-key +.Op Fl r +.Fl a Ns | Ns Ar filesystem +.Xc +Unloads a key from ZFS, removing the ability to access the dataset and all of +its children that inherit the +.Sy keylocation +property. +This requires that the dataset is not currently open or mounted. +Once the key is unloaded the +.Sy keystatus +property will become +.Sy unavailable . +.Bl -tag -width "-r" +.It Fl r +Recursively unloads the keys for the specified filesystem and all descendent +encryption roots. +.It Fl a +Unloads the keys for all encryption roots in all imported pools. +.El +.It Xo +.Nm zfs +.Cm change-key +.Op Fl l +.Op Fl o Ar keylocation Ns = Ns Ar value +.Op Fl o Ar keyformat Ns = Ns Ar value +.Op Fl o Ar pbkdf2iters Ns = Ns Ar value +.Ar filesystem +.Xc +.It Xo +.Nm zfs +.Cm change-key +.Fl i +.Op Fl l +.Ar filesystem +.Xc +Changes the user's key (e.g. a passphrase) used to access a dataset. +This command requires that the existing key for the dataset is already loaded. +This command may also be used to change the +.Sy keylocation , +.Sy keyformat , +and +.Sy pbkdf2iters +properties as needed. +If the dataset was not previously an encryption root it will become one. +Alternatively, the +.Fl i +flag may be provided to cause an encryption root to inherit the parent's key +instead. +.Pp +If the user's key is compromised, +.Nm zfs Cm change-key +does not necessarily protect existing or newly-written data from attack. +Newly-written data will continue to be encrypted with the same master key as +the existing data. +The master key is compromised if an attacker obtains a +user key and the corresponding wrapped master key. +Currently, +.Nm zfs Cm change-key +does not overwrite the previous wrapped master key on disk, so it is +accessible via forensic analysis for an indeterminate length of time. 
+.Pp +In the event of a master key compromise, ideally the drives should be securely +erased to remove all the old data (which is readable using the compromised +master key), a new pool created, and the data copied back. +This can be approximated in place by creating new datasets, copying the data +.Pq e.g. using Nm zfs Cm send | Nm zfs Cm recv , +and then clearing the free space with +.Nm zpool Cm trim Fl -secure +if supported by your hardware, otherwise +.Nm zpool Cm initialize . +.Bl -tag -width "-r" +.It Fl l +Ensures the key is loaded before attempting to change the key. +This is effectively equivalent to running +.Nm zfs Cm load-key Ar filesystem ; Nm zfs Cm change-key Ar filesystem +.It Fl o Ar property Ns = Ns Ar value +Allows the user to set encryption key properties +.Pq Sy keyformat , keylocation , No and Sy pbkdf2iters +while changing the key. +This is the only way to alter +.Sy keyformat +and +.Sy pbkdf2iters +after the dataset has been created. +.It Fl i +Indicates that zfs should make +.Ar filesystem +inherit the key of its parent. +Note that this command can only be run on an encryption root +that has an encrypted parent. +.El +.El +.Ss Encryption +Enabling the +.Sy encryption +feature allows for the creation of encrypted filesystems and volumes. +ZFS will encrypt file and volume data, file attributes, ACLs, permission bits, +directory listings, FUID mappings, and +.Sy userused Ns / Ns Sy groupused +data. +ZFS will not encrypt metadata related to the pool structure, including +dataset and snapshot names, dataset hierarchy, properties, file size, file +holes, and deduplication tables (though the deduplicated data itself is +encrypted). +.Pp +Key rotation is managed by ZFS. +Changing the user's key (e.g. a passphrase) +does not require re-encrypting the entire dataset. +Datasets can be scrubbed, +resilvered, renamed, and deleted without the encryption keys being loaded (see the +.Cm load-key +subcommand for more info on key loading). 
+.Pp +Creating an encrypted dataset requires specifying the +.Sy encryption No and Sy keyformat +properties at creation time, along with an optional +.Sy keylocation No and Sy pbkdf2iters . +After entering an encryption key, the +created dataset will become an encryption root. +Any descendant datasets will +inherit their encryption key from the encryption root by default, meaning that +loading, unloading, or changing the key for the encryption root will implicitly +do the same for all inheriting datasets. +If this inheritance is not desired, simply supply a +.Sy keyformat +when creating the child dataset or use +.Nm zfs Cm change-key +to break an existing relationship, creating a new encryption root on the child. +Note that the child's +.Sy keyformat +may match that of the parent while still creating a new encryption root, and +that changing the +.Sy encryption +property alone does not create a new encryption root; this would simply use a +different cipher suite with the same key as its encryption root. +The one exception is that clones will always use their origin's encryption key. +As a result of this exception, some encryption-related properties +.Pq namely Sy keystatus , keyformat , keylocation , No and Sy pbkdf2iters +do not inherit like other ZFS properties and instead use the value determined +by their encryption root. +Encryption root inheritance can be tracked via the read-only +.Sy encryptionroot +property. +.Pp +Encryption changes the behavior of a few ZFS +operations. +Encryption is applied after compression so compression ratios are preserved. +Normally checksums in ZFS are 256 bits long, but for encrypted data +the checksum is 128 bits of the user-chosen checksum and 128 bits of MAC from +the encryption suite, which provides additional protection against maliciously +altered data. +Deduplication is still possible with encryption enabled but for security, +datasets will only deduplicate against themselves, their snapshots, +and their clones. 
+.Pp +There are a few limitations on encrypted datasets. +Encrypted data cannot be embedded via the +.Sy embedded_data +feature. +Encrypted datasets may not have +.Sy copies Ns = Ns Em 3 +since the implementation stores some encryption metadata where the third copy +would normally be. +Since compression is applied before encryption, datasets may +be vulnerable to a CRIME-like attack if applications accessing the data allow for it. +Deduplication with encryption will leak information about which blocks +are equivalent in a dataset and will incur an extra CPU cost for each block written. +. +.Sh SEE ALSO +.Xr zfsprops 7 , +.Xr zfs-create 8 , +.Xr zfs-set 8 diff --git a/man/man8/zfs-mount-generator.8.in b/man/man8/zfs-mount-generator.8.in index 79720601d6..7aa332ba81 100644 --- a/man/man8/zfs-mount-generator.8.in +++ b/man/man8/zfs-mount-generator.8.in @@ -1,83 +1,192 @@ -.TH "ZFS\-MOUNT\-GENERATOR" "8" "ZFS" "zfs-mount-generator" "\"" -.SH "NAME" -zfs\-mount\-generator \- generates systemd mount units for ZFS -.SH SYNOPSIS -.B /lib/systemd/system-generators/zfs\-mount\-generator -.sp -.SH DESCRIPTION -zfs\-mount\-generator implements the \fBGenerators Specification\fP -of -.BR systemd (1), -and is called during early boot to generate -.BR systemd.mount (5) -units for automatically mounted datasets. Mount ordering and dependencies -are created for all tracked pools (see below). If a dataset has -.BR canmount=on -and -.BR mountpoint -set, the -.BR auto -mount option will be set, and a dependency for -.BR local-fs.target -on the mount will be created. - -Because zfs pools may not be available very early in the boot process, -information on ZFS mountpoints must be stored separately. 
The output -of the command -.PP -.RS 4 -zfs list -H -o name,mountpoint,canmount,atime,relatime,devices,exec,readonly,setuid,nbmand -.RE -.PP -for datasets that should be mounted by systemd, should be kept -separate from the pool, at -.PP -.RS 4 -.RI @sysconfdir@/zfs/zfs-list.cache/ POOLNAME +.\" +.\" Copyright 2018 Antonio Russo +.\" Copyright 2019 Kjeld Schouten-Lebbing +.\" Copyright 2020 InsanePrawn +.\" +.\" Permission is hereby granted, free of charge, to any person obtaining +.\" a copy of this software and associated documentation files (the +.\" "Software"), to deal in the Software without restriction, including +.\" without limitation the rights to use, copy, modify, merge, publish, +.\" distribute, sublicense, and/or sell copies of the Software, and to +.\" permit persons to whom the Software is furnished to do so, subject to +.\" the following conditions: +.\" +.\" The above copyright notice and this permission notice shall be +.\" included in all copies or substantial portions of the Software. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +.\" EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +.\" MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +.\" NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +.\" LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +.\" OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +.\" WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +.\" +.Dd May 31, 2021 +.Dt ZFS-MOUNT-GENERATOR 8 +.Os . -.RE -.PP -The cache file, if writeable, will be kept synchronized with the pool -state by the ZEDLET -.PP -.RS 4 -history_event-zfs-list-cacher.sh . -.RE -.PP -.sp -.SH EXAMPLE +.Sh NAME +.Nm zfs-mount-generator +.Nd generate systemd mount units for ZFS filesystems +.Sh SYNOPSIS +.Pa @systemdgeneratordir@/zfs-mount-generator +. 
+.Sh DESCRIPTION +.Nm +is a +.Xr systemd.generator 7 +that generates native +.Xr systemd.mount 5 +units for configured ZFS datasets. +. +.Ss Properties +.Bl -tag -compact -width "org.openzfs.systemd:required-by=unit[ unit]…" +.It Sy mountpoint Ns = +.No Skipped if Sy legacy No or Sy none . +. +.It Sy canmount Ns = +.No Skipped if Sy off . +.No Skipped if only Sy noauto +datasets exist for a given mountpoint and there's more than one. +.No Datasets with Sy on No take precedence over ones with Sy noauto No for the same mountpoint. +.No Sets logical Em noauto No flag if Sy noauto . +Encryption roots always generate +.Sy zfs-load-key@ Ns Ar root Ns Sy .service , +even if +.Sy off . +. +.It Sy atime Ns = , Sy relatime Ns = , Sy devices Ns = , Sy exec Ns = , Sy readonly Ns = , Sy setuid Ns = , Sy nbmand Ns = +Used to generate mount options equivalent to +.Nm zfs Cm mount . +. +.It Sy encroot Ns = , Sy keylocation Ns = +If the dataset is an encryption root, its mount unit will bind to +.Sy zfs-load-key@ Ns Ar root Ns Sy .service , +with additional dependencies as follows: +.Bl -tag -compact -offset Ds -width "keylocation=https://URL (et al.)" +.It Sy keylocation Ns = Ns Sy prompt +None, uses +.Xr systemd-ask-password 1 +.It Sy keylocation Ns = Ns Sy https:// Ns Ar URL Pq et al.\& +.Sy Wants Ns = , Sy After Ns = : Pa network-online.target +.It Sy keylocation Ns = Ns Sy file:// Ns < Ns Ar path Ns > +.Sy RequiresMountsFor Ns = Ns Ar path +.El +. +The service also uses the same +.Sy Wants Ns = , +.Sy After Ns = , +.Sy Requires Ns = , No and +.Sy RequiresMountsFor Ns = , +as the mount unit. +. +.It Sy org.openzfs.systemd:requires Ns = Ns Pa path Ns Oo " " Ns Pa path Oc Ns … +.No Sets Sy Requires Ns = for the mount- and key-loading unit. +. +.It Sy org.openzfs.systemd:requires-mounts-for Ns = Ns Pa path Ns Oo " " Ns Pa path Oc Ns … +.No Sets Sy RequiresMountsFor Ns = for the mount- and key-loading unit. +. 
+.It Sy org.openzfs.systemd:before Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … +.No Sets Sy Before Ns = for the mount unit. +. +.It Sy org.openzfs.systemd:after Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … +.No Sets Sy After Ns = for the mount unit. +. +.It Sy org.openzfs.systemd:wanted-by Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … +.No Sets logical Em noauto No flag (see below). +.No If not Sy none , No sets Sy WantedBy Ns = for the mount unit. +.It Sy org.openzfs.systemd:required-by Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … +.No Sets logical Em noauto No flag (see below). +.No If not Sy none , No sets Sy RequiredBy Ns = for the mount unit. +. +.It Sy org.openzfs.systemd:nofail Ns = Ns (unset) Ns | Ns Sy on Ns | Ns Sy off +Waxes or wanes strength of default reverse dependencies of the mount unit, see below. +. +.It Sy org.openzfs.systemd:ignore Ns = Ns Sy on Ns | Ns Sy off +.No Skip if Sy on . +.No Defaults to Sy off . +.El +. +.Ss Unit Ordering And Dependencies +Additionally, unless the pool the dataset resides on +is imported at generation time, both units gain +.Sy Wants Ns = Ns Pa zfs-import.target +and +.Sy After Ns = Ns Pa zfs-import.target . +.Pp +Additionally, unless the logical +.Em noauto +flag is set, the mount unit gains a reverse-dependency for +.Pa local-fs.target +of strength +.Bl -tag -compact -offset Ds -width "(unset)" +.It (unset) +.Sy WantedBy Ns = No + Sy Before Ns = +.It Sy on +.Sy WantedBy Ns = +.It Sy off +.Sy RequiredBy Ns = No + Sy Before Ns = +.El +. +.Ss Cache File +Because ZFS pools may not be available very early in the boot process, +information on ZFS mountpoints must be stored separately. 
+The output of +.Dl Nm zfs Cm list Fl Ho Ar name , Ns Aq every property above in order +for datasets that should be mounted by systemd should be kept at +.Pa @sysconfdir@/zfs/zfs-list.cache/ Ns Ar poolname , +and, if writeable, will be kept synchronized for the entire pool by the +.Pa history_event-zfs-list-cacher.sh +ZEDLET, if enabled +.Pq see Xr zed 8 . +. +.Sh ENVIRONMENT +The +.Sy ZFS_DEBUG +environment variable can either be +.Sy 0 +(default), +.Sy 1 +(print summary accounting information at the end), or at least +.Sy 2 +(print accounting information for each subprocess as it finishes). +. +If not present, +.Pa /proc/cmdline +is additionally checked for +.Qq debug , +in which case the debug level is set to +.Sy 2 . +. +.Sh EXAMPLES To begin, enable tracking for the pool: -.PP -.RS 4 -touch -.RI @sysconfdir@/zfs/zfs-list.cache/ POOLNAME -.RE -.PP -Then, enable the tracking ZEDLET: -.PP -.RS 4 -ln -s "@zfsexecdir@/zed.d/history_event-zfs-list-cacher.sh" "@sysconfdir@/zfs/zed.d" - -systemctl enable zed.service - -systemctl restart zed.service -.RE -.PP -Force the running of the ZEDLET by setting canmount=on for at least one dataset in the pool: -.PP -.RS 4 -zfs set canmount=on -.I DATASET -.RE -.PP -This forces an update to the stale cache file. 
-.sp -.SH SEE ALSO -.BR zfs (5) -.BR zfs-events (5) -.BR zed (8) -.BR zpool (5) -.BR systemd (1) -.BR systemd.target (5) -.BR systemd.special (7) -.BR systemd.mount (7) +.Dl # Nm touch Pa @sysconfdir@/zfs/zfs-list.cache/ Ns Ar poolname +Then enable the tracking ZEDLET: +.Dl # Nm ln Fl s Pa @zfsexecdir@/zed.d/history_event-zfs-list-cacher.sh @sysconfdir@/zfs/zed.d +.Dl # Nm systemctl Cm enable Pa zfs-zed.service +.Dl # Nm systemctl Cm restart Pa zfs-zed.service +.Pp +If no history event is in the queue, +inject one to ensure the ZEDLET runs to refresh the cache file +by setting a monitored property somewhere on the pool: +.Dl # Nm zfs Cm set Sy relatime Ns = Ns Sy off Ar poolname/dset +.Dl # Nm zfs Cm inherit Sy relatime Ar poolname/dset +.Pp +To test the generator output: +.Dl $ Nm mkdir Pa /tmp/zfs-mount-generator +.Dl $ Nm @systemdgeneratordir@/zfs-mount-generator Pa /tmp/zfs-mount-generator +. +If the generated units are satisfactory, instruct +.Nm systemd +to re-run all generators: +.Dl # Nm systemctl daemon-reload +. +.Sh SEE ALSO +.Xr systemd.mount 5 , +.Xr systemd.target 5 , +.Xr zfs 5 , +.Xr systemd.generator 7 , +.Xr systemd.special 7 , +.Xr zed 8 , +.Xr zpool-events 8 diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 new file mode 100644 index 0000000000..42ce6b5ca1 --- /dev/null +++ b/man/man8/zfs-mount.8 @@ -0,0 +1,130 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd February 16, 2019 +.Dt ZFS-MOUNT 8 +.Os +. +.Sh NAME +.Nm zfs-mount +.Nd manage mount state of ZFS filesystems +.Sh SYNOPSIS +.Nm zfs +.Cm mount +.Nm zfs +.Cm mount +.Op Fl Oflv +.Op Fl o Ar options +.Fl a Ns | Ns Ar filesystem +.Nm zfs +.Cm unmount +.Op Fl fu +.Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm mount +.Xc +Displays all ZFS file systems currently mounted. +.It Xo +.Nm zfs +.Cm mount +.Op Fl Oflv +.Op Fl o Ar options +.Fl a Ns | Ns Ar filesystem +.Xc +Mount ZFS filesystem on a path described by its +.Sy mountpoint +property, if the path exists and is empty. +If +.Sy mountpoint +is set to +.Em legacy , +the filesystem should be instead mounted using +.Xr mount 8 . +.Bl -tag -width "-O" +.It Fl O +Perform an overlay mount. +Allows mounting in non-empty +.Sy mountpoint . +See +.Xr mount 8 +for more information. +.It Fl a +Mount all available ZFS file systems. +Invoked automatically as part of the boot process if configured. +.It Ar filesystem +Mount the specified filesystem. +.It Fl o Ar options +An optional, comma-separated list of mount options to use temporarily for the +duration of the mount. 
+See the +.Em Temporary Mount Point Properties +section of +.Xr zfsprops 7 +for details. +.It Fl l +Load keys for encrypted filesystems as they are being mounted. +This is equivalent to executing +.Nm zfs Cm load-key +on each encryption root before mounting it. +Note that if a filesystem has +.Sy keylocation Ns = Ns Sy prompt , +this will cause the terminal to interactively block after asking for the key. +.It Fl v +Report mount progress. +.It Fl f +Attempt to force mounting of all filesystems, even those that couldn't normally be mounted (e.g. redacted datasets). +.El +.It Xo +.Nm zfs +.Cm unmount +.Op Fl fu +.Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint +.Xc +Unmounts currently mounted ZFS file systems. +.Bl -tag -width "-a" +.It Fl a +Unmount all available ZFS file systems. +Invoked automatically as part of the shutdown process. +.It Fl f +Forcefully unmount the file system, even if it is currently in use. +This option is not supported on Linux. +.It Fl u +Unload keys for any encryption roots unmounted by this command. +.It Ar filesystem Ns | Ns Ar mountpoint +Unmount the specified filesystem. +The command can also be given a path to a ZFS file system mount point on the +system. +.El +.El diff --git a/man/man8/zfs-program.8 b/man/man8/zfs-program.8 index 532fda19b6..4a9718cdcf 100644 --- a/man/man8/zfs-program.8 +++ b/man/man8/zfs-program.8 @@ -1,3 +1,4 @@ +.\" .\" This file and its contents are supplied under the terms of the .\" Common Development and Distribution License ("CDDL"), version 1.0. .\" You may only use this file in accordance with the terms of version @@ -7,23 +8,27 @@ .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" -.\" .\" Copyright (c) 2016, 2019 by Delphix. All Rights Reserved. +.\" Copyright (c) 2019, 2020 by Christian Schwarz. All Rights Reserved. +.\" Copyright 2020 Joyent, Inc. .\" -.Dd February 26, 2019 +.Dd May 27, 2021 .Dt ZFS-PROGRAM 8 .Os +. 
.Sh NAME -.Nm zfs program -.Nd executes ZFS channel programs +.Nm zfs-program +.Nd execute ZFS channel programs .Sh SYNOPSIS -.Cm "zfs program" +.Nm zfs +.Cm program .Op Fl jn .Op Fl t Ar instruction-limit .Op Fl m Ar memory-limit .Ar pool .Ar script -.\".Op Ar optional arguments to channel program +.Op Ar script arguments +. .Sh DESCRIPTION The ZFS channel program interface allows ZFS administrative operations to be run programmatically as a Lua script. @@ -34,22 +39,22 @@ Channel programs may only be run with root privileges. .Pp A modified version of the Lua 5.2 interpreter is used to run channel program scripts. -The Lua 5.2 manual can be found at: -.Bd -centered -offset indent +The Lua 5.2 manual can be found at .Lk http://www.lua.org/manual/5.2/ -.Ed .Pp The channel program given by .Ar script will be run on .Ar pool , and any attempts to access or modify other pools will cause an error. +. .Sh OPTIONS .Bl -tag -width "-t" .It Fl j -Display channel program output in JSON format. When this flag is specified and -standard output is empty - channel program encountered an error. The details of -such an error will be printed to standard error in plain text. +Display channel program output in JSON format. +When this flag is specified and standard output is empty - +channel program encountered an error. +The details of such an error will be printed to standard error in plain text. .It Fl n Executes a read-only channel program, which runs faster. The program cannot change on-disk state by calling functions from the @@ -75,15 +80,17 @@ All remaining argument strings will be passed directly to the Lua script as described in the .Sx LUA INTERFACE section below. +. .Sh LUA INTERFACE A channel program can be invoked either from the command line, or via a library call to .Fn lzc_channel_program . +. .Ss Arguments Arguments passed to the channel program are converted to a Lua table. 
If invoked from the command line, extra arguments to the Lua script will be accessible as an array stored in the argument table with the key 'argv': -.Bd -literal -offset indent +.Bd -literal -compact -offset indent args = ... argv = args["argv"] -- argv == {1="arg1", 2="arg2", ...} @@ -92,7 +99,7 @@ argv = args["argv"] If invoked from the libZFS interface, an arbitrary argument list can be passed to the channel program, which is accessible via the same "..." syntax in Lua: -.Bd -literal -offset indent +.Bd -literal -compact -offset indent args = ... -- args == {"foo"="bar", "baz"={...}, ...} .Ed @@ -105,37 +112,35 @@ in in a C array passed to a channel program will be stored in .Va arr[1] when accessed from Lua. +. .Ss Return Values Lua return statements take the form: -.Bd -literal -offset indent -return ret0, ret1, ret2, ... -.Ed +.Dl return ret0, ret1, ret2, ... .Pp Return statements returning multiple values are permitted internally in a channel program script, but attempting to return more than one value from the top level of the channel program is not permitted and will throw an error. However, tables containing multiple values can still be returned. If invoked from the command line, a return statement: -.Bd -literal -offset indent +.Bd -literal -compact -offset indent a = {foo="bar", baz=2} return a .Ed .Pp Will be output formatted as: -.Bd -literal -offset indent +.Bd -literal -compact -offset indent Channel program fully executed with return value: return: baz: 2 foo: 'bar' .Ed +. .Ss Fatal Errors If the channel program encounters a fatal error while running, a non-zero exit status will be returned. 
If more information about the error is available, a singleton list will be returned detailing the error: -.Bd -literal -offset indent -error: "error string, including Lua stack trace" -.Ed +.Dl error: \&"error string, including Lua stack trace" .Pp If a fatal error is returned, the channel program may have not executed at all, may have partially executed, or may have fully executed but failed to pass a @@ -153,12 +158,13 @@ can guarantee that it will finish successfully against a similar size system. If a channel program attempts to return too large a value, the program will fully execute but exit with a nonzero status code and no return value. .Pp -.Em Note: +.Em Note : ZFS API functions do not generate Fatal Errors when correctly invoked, they return an error code and the channel program continues executing. See the .Sx ZFS API section below for function-specific details on error return codes. +. .Ss Lua to C Value Conversion When invoking a channel program via the libZFS interface, it is necessary to translate arguments and return values from Lua values to their C equivalents, @@ -168,37 +174,37 @@ There is a correspondence between nvlist values in C and Lua tables. A Lua table which is returned from the channel program will be recursively converted to an nvlist, with table values converted to their natural equivalents: -.Bd -literal -offset indent -string -> string -number -> int64 -boolean -> boolean_value -nil -> boolean (no value) -table -> nvlist -.Ed +.TS +cw3 l c l . + string -> string + number -> int64 + boolean -> boolean_value + nil -> boolean (no value) + table -> nvlist +.TE .Pp Likewise, table keys are replaced by string equivalents as follows: -.Bd -literal -offset indent -string -> no change -number -> signed decimal string ("%lld") -boolean -> "true" | "false" -.Ed +.TS +cw3 l c l . 
+ string -> no change + number -> signed decimal string ("%lld") + boolean -> "true" | "false" +.TE .Pp Any collision of table key strings (for example, the string "true" and a true boolean value) will cause a fatal error. .Pp Lua numbers are represented internally as signed 64-bit integers. +. .Sh LUA STANDARD LIBRARY The following Lua built-in base library functions are available: -.Bd -literal -offset indent -assert rawlen -collectgarbage rawget -error rawset -getmetatable select -ipairs setmetatable -next tonumber -pairs tostring -rawequal type -.Ed +.TS +cw3 l l l l . + assert rawlen collectgarbage rawget + error rawset getmetatable select + ipairs setmetatable next tonumber + pairs tostring rawequal type +.TE .Pp All functions in the .Em coroutine , @@ -211,15 +217,13 @@ manual. .Pp The following functions base library functions have been disabled and are not available for use in channel programs: -.Bd -literal -offset indent -dofile -loadfile -load -pcall -print -xpcall -.Ed +.TS +cw3 l l l l l l . + dofile loadfile load pcall print xpcall +.TE +. .Sh ZFS API +. .Ss Function Arguments Each API function takes a fixed set of required positional arguments and optional keyword arguments. @@ -228,22 +232,17 @@ For example, the destroy function takes a single positional string argument argument. 
When using parentheses to specify the arguments to a Lua function, only positional arguments can be used: -.Bd -literal -offset indent -zfs.sync.destroy("rpool@snap") -.Ed +.Dl Sy zfs.sync.destroy Ns Pq \&"rpool@snap" .Pp To use keyword arguments, functions must be called with a single argument that is a Lua table containing entries mapping integers to positional arguments and strings to keyword arguments: -.Bd -literal -offset indent -zfs.sync.destroy({1="rpool@snap", defer=true}) -.Ed +.Dl Sy zfs.sync.destroy Ns Pq {1="rpool@snap", defer=true} .Pp The Lua language allows curly braces to be used in place of parenthesis as syntactic sugar for this calling convention: -.Bd -literal -offset indent -zfs.sync.snapshot{"rpool@snap", defer=true} -.Ed +.Dl Sy zfs.sync.snapshot Ns {"rpool@snap", defer=true} +. .Ss Function Return Values If an API function succeeds, it returns 0. If it fails, it returns an error code and the channel program continues @@ -258,13 +257,11 @@ Lua table, or Nil if no error details were returned. Different keys will exist in the error details table depending on the function and error case. 
Any such function may be called expecting a single return value: -.Bd -literal -offset indent -errno = zfs.sync.promote(dataset) -.Ed +.Dl errno = Sy zfs.sync.promote Ns Pq dataset .Pp Or, the error details can be retrieved: -.Bd -literal -offset indent -errno, details = zfs.sync.promote(dataset) +.Bd -literal -compact -offset indent +.No errno, details = Sy zfs.sync.promote Ns Pq dataset if (errno == EEXIST) then assert(details ~= Nil) list_of_conflicting_snapshots = details @@ -273,48 +270,46 @@ end .Pp The following global aliases for API function error return codes are defined for use in channel programs: -.Bd -literal -offset indent -EPERM ECHILD ENODEV ENOSPC -ENOENT EAGAIN ENOTDIR ESPIPE -ESRCH ENOMEM EISDIR EROFS -EINTR EACCES EINVAL EMLINK -EIO EFAULT ENFILE EPIPE -ENXIO ENOTBLK EMFILE EDOM -E2BIG EBUSY ENOTTY ERANGE -ENOEXEC EEXIST ETXTBSY EDQUOT -EBADF EXDEV EFBIG -.Ed +.TS +cw3 l l l l l l l . + EPERM ECHILD ENODEV ENOSPC ENOENT EAGAIN ENOTDIR + ESPIPE ESRCH ENOMEM EISDIR EROFS EINTR EACCES + EINVAL EMLINK EIO EFAULT ENFILE EPIPE ENXIO + ENOTBLK EMFILE EDOM E2BIG EBUSY ENOTTY ERANGE + ENOEXEC EEXIST ETXTBSY EDQUOT EBADF EXDEV EFBIG +.TE +. .Ss API Functions -For detailed descriptions of the exact behavior of any zfs administrative +For detailed descriptions of the exact behavior of any ZFS administrative operations, see the main -.Xr zfs 1 +.Xr zfs 8 manual page. .Bl -tag -width "xx" -.It Em zfs.debug(msg) +.It Fn zfs.debug msg Record a debug message in the zfs_dbgmsg log. A log of these messages can be printed via mdb's "::zfs_dbgmsg" command, or -can be monitored live by running: -.Bd -literal -offset indent - dtrace -n 'zfs-dbgmsg{trace(stringof(arg0))}' -.Ed +can be monitored live by running +.Dl dtrace -n 'zfs-dbgmsg{trace(stringof(arg0))}' .Pp -msg (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "property (string)" +.It Ar msg Pq string Debug message to be printed. 
-.Ed -.It Em zfs.exists(dataset) +.El +.It Fn zfs.exists dataset Returns true if the given dataset exists, or false if it doesn't. A fatal error will be thrown if the dataset is not in the target pool. That is, in a channel program running on rpool, -zfs.exists("rpool/nonexistent_fs") returns false, but -zfs.exists("somepool/fs_that_may_exist") will error. +.Sy zfs.exists Ns Pq \&"rpool/nonexistent_fs" +returns false, but +.Sy zfs.exists Ns Pq \&"somepool/fs_that_may_exist" +will error. .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "property (string)" +.It Ar dataset Pq string Dataset to check for existence. Must be in the target pool. -.Ed -.It Em zfs.get_prop(dataset, property) +.El +.It Fn zfs.get_prop dataset property Returns two values. First, a string, number or table containing the property value for the given dataset. @@ -323,22 +318,25 @@ dataset in which it was set or nil if it is readonly). Throws a Lua error if the dataset is invalid or the property doesn't exist. Note that Lua only supports int64 number types whereas ZFS number properties are uint64. -This means very large values (like guid) may wrap around and appear negative. +This means very large values (like GUIDs) may wrap around and appear negative. .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "property (string)" +.It Ar dataset Pq string Filesystem or snapshot path to retrieve properties from. -.Ed -.Pp -property (string) -.Bd -ragged -compact -offset "xxxx" +.It Ar property Pq string Name of property to retrieve. -All filesystem, snapshot and volume properties are supported except -for 'mounted' and 'iscsioptions.' -Also supports the 'written@snap' and 'written#bookmark' properties and -the '@id' properties, though the id must be in numeric -form. -.Ed +All filesystem, snapshot and volume properties are supported except for +.Sy mounted +and +.Sy iscsioptions . 
+Also supports the +.Sy written@ Ns Ar snap +and +.Sy written# Ns Ar bookmark +properties and the +.Ao Sy user Ns | Ns Sy group Ac Ns Ao Sy quota Ns | Ns Sy used Ac Ns Sy @ Ns Ar id +properties, though the id must be in numeric form. +.El .El .Bl -tag -width "xx" .It Sy zfs.sync submodule @@ -347,45 +345,73 @@ They are executed in "syncing context". .Pp The available sync submodule functions are as follows: .Bl -tag -width "xx" -.It Em zfs.sync.destroy(dataset, [defer=true|false]) +.It Sy zfs.sync.destroy Ns Pq Ar dataset , Op Ar defer Ns = Ns Sy true Ns | Ns Sy false Destroy the given dataset. Returns 0 on successful destroy, or a nonzero error code if the dataset could not be destroyed (for example, if the dataset has any active children or clones). .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "newbookmark (string)" +.It Ar dataset Pq string Filesystem or snapshot to be destroyed. -.Ed -.Pp -[optional] defer (boolean) -.Bd -ragged -compact -offset "xxxx" +.It Op Ar defer Pq boolean Valid only for destroying snapshots. If set to true, and the snapshot has holds or clones, allows the snapshot to be marked for deferred deletion rather than failing. -.Ed -.It Em zfs.sync.promote(dataset) +.El +.It Fn zfs.sync.inherit dataset property +Clears the specified property in the given dataset, causing it to be inherited +from an ancestor, or restored to the default if no ancestor property is set. +The +.Nm zfs Cm inherit Fl S +option has not been implemented. +Returns 0 on success, or a nonzero error code if the property could not be +cleared. +.Pp +.Bl -tag -compact -width "newbookmark (string)" +.It Ar dataset Pq string +Filesystem or snapshot containing the property to clear. +.It Ar property Pq string +The property to clear. +Allowed properties are the same as those for the +.Nm zfs Cm inherit +command. +.El +.It Fn zfs.sync.promote dataset Promote the given clone to a filesystem. 
Returns 0 on successful promotion, or a nonzero error code otherwise. If EEXIST is returned, the second return value will be an array of the clone's snapshots whose names collide with snapshots of the parent filesystem. .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "newbookmark (string)" +.It Ar dataset Pq string Clone to be promoted. -.Ed -.It Em zfs.sync.rollback(filesystem) +.El +.It Fn zfs.sync.rollback filesystem Rollback to the previous snapshot for a dataset. Returns 0 on successful rollback, or a nonzero error code otherwise. Rollbacks can be performed on filesystems or zvols, but not on snapshots or mounted datasets. EBUSY is returned in the case where the filesystem is mounted. .Pp -filesystem (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "newbookmark (string)" +.It Ar filesystem Pq string Filesystem to rollback. -.Ed -.It Em zfs.sync.snapshot(dataset) +.El +.It Fn zfs.sync.set_prop dataset property value +Sets the given property on a dataset. +Currently only user properties are supported. +Returns 0 if the property was set, or a nonzero error code otherwise. +.Pp +.Bl -tag -compact -width "newbookmark (string)" +.It Ar dataset Pq string +The dataset where the property will be set. +.It Ar property Pq string +The property to set. +.It Ar value Pq string +The value of the property to be set. +.El +.It Fn zfs.sync.snapshot dataset Create a snapshot of a filesystem. Returns 0 if the snapshot was successfully created, and a nonzero error code otherwise. @@ -393,89 +419,142 @@ and a nonzero error code otherwise. Note: Taking a snapshot will fail on any pool older than legacy version 27. To enable taking snapshots from ZCP scripts, the pool must be upgraded. .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "newbookmark (string)" +.It Ar dataset Pq string Name of snapshot to create. 
-.Ed +.El +.It Fn zfs.sync.bookmark source newbookmark +Create a bookmark of an existing source snapshot or bookmark. +Returns 0 if the new bookmark was successfully created, +and a nonzero error code otherwise. +.Pp +Note: Bookmarking requires the corresponding pool feature to be enabled. +.Pp +.Bl -tag -compact -width "newbookmark (string)" +.It Ar source Pq string +Full name of the existing snapshot or bookmark. +.It Ar newbookmark Pq string +Full name of the new bookmark. +.El .El .It Sy zfs.check submodule -For each function in the zfs.sync submodule, there is a corresponding zfs.check +For each function in the +.Sy zfs.sync +submodule, there is a corresponding +.Sy zfs.check function which performs a "dry run" of the same operation. -Each takes the same arguments as its zfs.sync counterpart and returns 0 if the -operation would succeed, or a non-zero error code if it would fail, along with -any other error details. +Each takes the same arguments as its +.Sy zfs.sync +counterpart and returns 0 if the operation would succeed, +or a non-zero error code if it would fail, along with any other error details. That is, each has the same behavior as the corresponding sync function except for actually executing the requested change. For example, -.Em zfs.check.destroy("fs") +.Fn zfs.check.destroy \&"fs" returns 0 if -.Em zfs.sync.destroy("fs") +.Fn zfs.sync.destroy \&"fs" would successfully destroy the dataset. 
.Pp -The available zfs.check functions are: -.Bl -tag -width "xx" -.It Em zfs.check.destroy(dataset, [defer=true|false]) -.It Em zfs.check.promote(dataset) -.It Em zfs.check.rollback(filesystem) -.It Em zfs.check.snapshot(dataset) +The available +.Sy zfs.check +functions are: +.Bl -tag -compact -width "xx" +.It Sy zfs.check.destroy Ns Pq Ar dataset , Op Ar defer Ns = Ns Sy true Ns | Ns Sy false +.It Fn zfs.check.promote dataset +.It Fn zfs.check.rollback filesystem +.It Fn zfs.check.set_property dataset property value +.It Fn zfs.check.snapshot dataset .El .It Sy zfs.list submodule The zfs.list submodule provides functions for iterating over datasets and properties. Rather than returning tables, these functions act as Lua iterators, and are generally used as follows: -.Bd -literal -offset indent -for child in zfs.list.children("rpool") do +.Bd -literal -compact -offset indent +.No for child in Fn zfs.list.children \&"rpool" No do ... end .Ed .Pp -The available zfs.list functions are: +The available +.Sy zfs.list +functions are: .Bl -tag -width "xx" -.It Em zfs.list.clones(snapshot) +.It Fn zfs.list.clones snapshot Iterate through all clones of the given snapshot. .Pp -snapshot (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "snapshot (string)" +.It Ar snapshot Pq string Must be a valid snapshot path in the current pool. -.Ed -.It Em zfs.list.snapshots(dataset) +.El +.It Fn zfs.list.snapshots dataset Iterate through all snapshots of the given dataset. -Each snapshot is returned as a string containing the full dataset name, e.g. -"pool/fs@snap". +Each snapshot is returned as a string containing the full dataset name, +e.g. "pool/fs@snap". .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "snapshot (string)" +.It Ar dataset Pq string Must be a valid filesystem or volume. -.Ed -.It Em zfs.list.children(dataset) +.El +.It Fn zfs.list.children dataset Iterate through all direct children of the given dataset. 
-Each child is returned as a string containing the full dataset name, e.g. -"pool/fs/child". +Each child is returned as a string containing the full dataset name, +e.g. "pool/fs/child". .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "snapshot (string)" +.It Ar dataset Pq string Must be a valid filesystem or volume. -.Ed -.It Em zfs.list.properties(dataset) -Iterate through all user properties for the given dataset. +.El +.It Fn zfs.list.bookmarks dataset +Iterate through all bookmarks of the given dataset. +Each bookmark is returned as a string containing the full dataset name, +e.g. "pool/fs#bookmark". .Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "snapshot (string)" +.It Ar dataset Pq string +Must be a valid filesystem or volume. +.El +.It Fn zfs.list.holds snapshot +Iterate through all user holds on the given snapshot. +Each hold is returned +as a pair of the hold's tag and the timestamp (in seconds since the epoch) at +which it was created. +.Pp +.Bl -tag -compact -width "snapshot (string)" +.It Ar snapshot Pq string +Must be a valid snapshot. +.El +.It Fn zfs.list.properties dataset +An alias for zfs.list.user_properties (see relevant entry). +.Pp +.Bl -tag -compact -width "snapshot (string)" +.It Ar dataset Pq string Must be a valid filesystem, snapshot, or volume. -.Ed -.It Em zfs.list.system_properties(dataset) +.El +.It Fn zfs.list.user_properties dataset +Iterate through all user properties for the given dataset. +For each step of the iteration, output the property name, its value, +and its source. +Throws a Lua error if the dataset is invalid. +.Pp +.Bl -tag -compact -width "snapshot (string)" +.It Ar dataset Pq string +Must be a valid filesystem, snapshot, or volume. +.El +.It Fn zfs.list.system_properties dataset Returns an array of strings, the names of the valid system (non-user defined) properties for the given dataset. Throws a Lua error if the dataset is invalid. 
.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" +.Bl -tag -compact -width "snapshot (string)" +.It Ar dataset Pq string Must be a valid filesystem, snapshot or volume. -.Ed .El .El +.El +. .Sh EXAMPLES +. .Ss Example 1 The following channel program recursively destroys a filesystem and all its snapshots and children in a naive manner. @@ -492,6 +571,7 @@ function destroy_recursive(root) end destroy_recursive("pool/somefs") .Ed +. .Ss Example 2 A more verbose and robust version of the same channel program, which properly detects and reports errors, and also takes the dataset to destroy @@ -530,6 +610,7 @@ results["succeeded"] = succeeded results["failed"] = failed return results .Ed +. .Ss Example 3 The following function performs a forced promote operation by attempting to promote the given clone and destroying any conflicting snapshots. diff --git a/man/man8/zfs-project.8 b/man/man8/zfs-project.8 new file mode 100644 index 0000000000..f264a110fc --- /dev/null +++ b/man/man8/zfs-project.8 @@ -0,0 +1,141 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. 
Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 27, 2021 +.Dt ZFS-PROJECT 8 +.Os +. +.Sh NAME +.Nm zfs-project +.Nd manage projects in ZFS filesystem +.Sh SYNOPSIS +.Nm zfs +.Cm project +.Oo Fl d Ns | Ns Fl r Ns Oc +.Ar file Ns | Ns Ar directory Ns … +.Nm zfs +.Cm project +.Fl C +.Oo Fl kr Ns Oc +.Ar file Ns | Ns Ar directory Ns … +.Nm zfs +.Cm project +.Fl c +.Oo Fl 0 Ns Oc +.Oo Fl d Ns | Ns Fl r Ns Oc +.Op Fl p Ar id +.Ar file Ns | Ns Ar directory Ns … +.Nm zfs +.Cm project +.Op Fl p Ar id +.Oo Fl rs Ns Oc +.Ar file Ns | Ns Ar directory Ns … +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm project +.Oo Fl d Ns | Ns Fl r Ns Oc +.Ar file Ns | Ns Ar directory Ns … +.Xc +List project identifier (ID) and inherit flag of files and directories. +.Bl -tag -width "-d" +.It Fl d +Show the directory project ID and inherit flag, not its children. +.It Fl r +List subdirectories recursively. +.El +.It Xo +.Nm zfs +.Cm project +.Fl C +.Oo Fl kr Ns Oc +.Ar file Ns | Ns Ar directory Ns … +.Xc +Clear project inherit flag and/or ID on the files and directories. +.Bl -tag -width "-k" +.It Fl k +Keep the project ID unchanged. +If not specified, the project ID will be reset to zero. +.It Fl r +Clear subdirectories' flags recursively. 
+.El +.It Xo +.Nm zfs +.Cm project +.Fl c +.Oo Fl 0 Ns Oc +.Oo Fl d Ns | Ns Fl r Ns Oc +.Op Fl p Ar id +.Ar file Ns | Ns Ar directory Ns … +.Xc +Check project ID and inherit flag on the files and directories: +report entries without the project inherit flag, or with project IDs different from the +target directory's project ID or the one specified with +.Fl p . +.Bl -tag -width "-p id" +.It Fl 0 +Delimit filenames with a NUL byte instead of newline. +.It Fl d +Check the directory project ID and inherit flag, not its children. +.It Fl p Ar id +Compare to +.Ar id +instead of the target files and directories' project IDs. +.It Fl r +Check subdirectories recursively. +.El +.It Xo +.Nm zfs +.Cm project +.Fl p Ar id +.Oo Fl rs Ns Oc +.Ar file Ns | Ns Ar directory Ns … +.Xc +Set project ID and/or inherit flag on the files and directories. +.Bl -tag -width "-p id" +.It Fl p Ar id +Set the project ID to the given value. +.It Fl r +Set on subdirectories recursively. +.It Fl s +Set project inherit flag on the given files and directories. +This is usually used for setting up tree quotas with +.Fl r . +In that case, the directory's project ID +will be set for all its descendants, unless specified explicitly with +.Fl p . +.El +.El +. +.Sh SEE ALSO +.Xr zfs-projectspace 8 diff --git a/man/man8/zfs-projectspace.8 b/man/man8/zfs-projectspace.8 new file mode 120000 index 0000000000..8bc2f1df30 --- /dev/null +++ b/man/man8/zfs-projectspace.8 @@ -0,0 +1 @@ +zfs-userspace.8 \ No newline at end of file diff --git a/man/man8/zfs-promote.8 b/man/man8/zfs-promote.8 new file mode 100644 index 0000000000..ba8cd5f6da --- /dev/null +++ b/man/man8/zfs-promote.8 @@ -0,0 +1,64 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. 
+.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-PROMOTE 8 +.Os +. +.Sh NAME +.Nm zfs-promote +.Nd promote clone dataset to no longer depend on origin snapshot +.Sh SYNOPSIS +.Nm zfs +.Cm promote +.Ar clone +. +.Sh DESCRIPTION +The +.Nm zfs Cm promote +command makes it possible to destroy the dataset that the clone was created from. +The clone parent-child dependency relationship is reversed, so that the origin +dataset becomes a clone of the specified dataset. +.Pp +The snapshot that was cloned, and any snapshots previous to this snapshot, are +now owned by the promoted clone. +The space they use moves from the origin dataset to the promoted clone, so +enough space must be available to accommodate these snapshots. +No new space is consumed by this operation, but the space accounting is +adjusted. 
+The promoted clone must not have any conflicting snapshot names of its own. +The +.Nm zfs Cm rename +subcommand can be used to rename any conflicting snapshots. +. +.Sh SEE ALSO +.Xr zfs-clone 8 , +.Xr zfs-rename 8 diff --git a/man/man8/zfs-receive.8 b/man/man8/zfs-receive.8 new file mode 100644 index 0000000000..d2cec42a8e --- /dev/null +++ b/man/man8/zfs-receive.8 @@ -0,0 +1,400 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd February 16, 2020 +.Dt ZFS-RECEIVE 8 +.Os +. 
+.Sh NAME +.Nm zfs-receive +.Nd create snapshot from backup stream +.Sh SYNOPSIS +.Nm zfs +.Cm receive +.Op Fl FhMnsuv +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Nm zfs +.Cm receive +.Op Fl FhMnsuv +.Op Fl d Ns | Ns Fl e +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem +.Nm zfs +.Cm receive +.Fl A +.Ar filesystem Ns | Ns Ar volume +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm receive +.Op Fl FhMnsuv +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Xc +.It Xo +.Nm zfs +.Cm receive +.Op Fl FhMnsuv +.Op Fl d Ns | Ns Fl e +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem +.Xc +Creates a snapshot whose contents are as specified in the stream provided on +standard input. +If a full stream is received, then a new file system is created as well. +Streams are created using the +.Nm zfs Cm send +subcommand, which by default creates a full stream. +.Nm zfs Cm recv +can be used as an alias for +.Nm zfs Cm receive . +.Pp +If an incremental stream is received, then the destination file system must +already exist, and its most recent snapshot must match the incremental stream's +source. +For +.Sy zvols , +the destination device link is destroyed and recreated, which means the +.Sy zvol +cannot be accessed during the +.Cm receive +operation. +.Pp +When a snapshot replication package stream that is generated by using the +.Nm zfs Cm send Fl R +command is received, any snapshots that do not exist on the sending location are +destroyed by using the +.Nm zfs Cm destroy Fl d +command. +.Pp +The ability to send and receive deduplicated send streams has been removed. 
+However, a deduplicated send stream created with older software can be converted +to a regular (non-deduplicated) stream by using the +.Nm zstream Cm redup +command. +.Pp +If +.Fl o Em property Ns = Ns Ar value +or +.Fl x Em property +is specified, it applies to the effective value of the property throughout +the entire subtree of replicated datasets. +Effective property values will be set +.Pq Fl o +or inherited +.Pq Fl x +on the topmost dataset in the replicated subtree. +In descendant datasets, if the +property is set by the send stream, it will be overridden by forcing the +property to be inherited from the top‐most file system. +Received properties are retained in spite of being overridden +and may be restored with +.Nm zfs Cm inherit Fl S . +Specifying +.Fl o Sy origin Ns = Ns Em snapshot +is a special case because, even if +.Sy origin +is a read-only property and cannot be set, it's allowed to receive the send +stream as a clone of the given snapshot. +.Pp +Raw encrypted send streams (created with +.Nm zfs Cm send Fl w ) +may only be received as is, and cannot be re-encrypted, decrypted, or +recompressed by the receive process. +Unencrypted streams can be received as +encrypted datasets, either through inheritance or by specifying encryption +parameters with the +.Fl o +options. +Note that the +.Sy keylocation +property cannot be overridden to +.Sy prompt +during a receive. +This is because the receive process itself is already using +the standard input for the send stream. +Instead, the property can be overridden after the receive completes. +.Pp +The added security provided by raw sends adds some restrictions to the send +and receive process. +ZFS will not allow a mix of raw receives and non-raw receives. +Specifically, any raw incremental receives that are attempted after +a non-raw receive will fail. +Non-raw receives do not have this restriction and, +therefore, are always possible. 
+Because of this, it is best practice to always +use either raw sends for their security benefits or non-raw sends for their +flexibility when working with encrypted datasets, but not a combination. +.Pp +The reason for this restriction stems from the inherent restrictions of the +AEAD ciphers that ZFS uses to encrypt data. +When using ZFS native encryption, +each block of data is encrypted against a randomly generated number known as +the "initialization vector" (IV), which is stored in the filesystem metadata. +This number is required by the encryption algorithms whenever the data is to +be decrypted. +Together, all of the IVs provided for all of the blocks in a +given snapshot are collectively called an "IV set". +When ZFS performs a raw send, the IV set is transferred from the source +to the destination in the send stream. +When ZFS performs a non-raw send, the data is decrypted by the source +system and re-encrypted by the destination system, creating a snapshot with +effectively the same data, but a different IV set. +In order for decryption to work after a raw send, ZFS must ensure that +the IV set used on both the source and destination side match. +When an incremental raw receive is performed on +top of an existing snapshot, ZFS will check to confirm that the "from" +snapshot on both the source and destination were using the same IV set, +ensuring the new IV set is consistent. +.Pp +The name of the snapshot +.Pq and file system, if a full stream is received +that this subcommand creates depends on the argument type and the use of the +.Fl d +or +.Fl e +options. +.Pp +If the argument is a snapshot name, the specified +.Ar snapshot +is created. +If the argument is a file system or volume name, a snapshot with the same name +as the sent snapshot is created within the specified +.Ar filesystem +or +.Ar volume . +If neither of the +.Fl d +or +.Fl e +options are specified, the provided target snapshot name is used exactly as +provided. 
+.Pp +The +.Fl d +and +.Fl e +options cause the file system name of the target snapshot to be determined by +appending a portion of the sent snapshot's name to the specified target +.Ar filesystem . +If the +.Fl d +option is specified, all but the first element of the sent snapshot's file +system path +.Pq usually the pool name +is used and any required intermediate file systems within the specified one are +created. +If the +.Fl e +option is specified, then only the last element of the sent snapshot's file +system name +.Pq i.e. the name of the source file system itself +is used as the target file system name. +.Bl -tag -width "-F" +.It Fl F +Force a rollback of the file system to the most recent snapshot before +performing the receive operation. +If receiving an incremental replication stream +.Po for example, one generated by +.Nm zfs Cm send Fl R Op Fl i Ns | Ns Fl I +.Pc , +destroy snapshots and file systems that do not exist on the sending side. +.It Fl d +Discard the first element of the sent snapshot's file system name, using the +remaining elements to determine the name of the target file system for the new +snapshot as described in the paragraph above. +.It Fl e +Discard all but the last element of the sent snapshot's file system name, using +that element to determine the name of the target file system for the new +snapshot as described in the paragraph above. +.It Fl h +Skip the receive of holds. +There is no effect if holds are not sent. +.It Fl M +Force an unmount of the file system while receiving a snapshot. +This option is not supported on Linux. +.It Fl n +Do not actually receive the stream. +This can be useful in conjunction with the +.Fl v +option to verify the name the receive operation would use. +.It Fl o Sy origin Ns = Ns Ar snapshot +Forces the stream to be received as a clone of the given snapshot. +If the stream is a full send stream, this will create the filesystem +described by the stream as a clone of the specified snapshot. 
+Which snapshot was specified will not affect the success or failure of the +receive, as long as the snapshot does exist. +If the stream is an incremental send stream, all the normal verification will be +performed. +.It Fl o Em property Ns = Ns Ar value +Sets the specified property as if the command +.Nm zfs Cm set Em property Ns = Ns Ar value +was invoked immediately before the receive. +When receiving a stream from +.Nm zfs Cm send Fl R , +causes the property to be inherited by all descendant datasets, as though +.Nm zfs Cm inherit Em property +was run on any descendant datasets that have this property set on the +sending system. +.Pp +If the send stream was sent with +.Fl c +then overriding the +.Sy compression +property will have no effect on received data but the +.Sy compression +property will be set. +To have the data recompressed on receive remove the +.Fl c +flag from the send stream. +.Pp +Any editable property can be set at receive time. +Set-once properties bound +to the received data, such as +.Sy normalization +and +.Sy casesensitivity , +cannot be set at receive time even when the datasets are newly created by +.Nm zfs Cm receive . +Additionally both settable properties +.Sy version +and +.Sy volsize +cannot be set at receive time. +.Pp +The +.Fl o +option may be specified multiple times, for different properties. +An error results if the same property is specified in multiple +.Fl o +or +.Fl x +options. +.Pp +The +.Fl o +option may also be used to override encryption properties upon initial receive. +This allows unencrypted streams to be received as encrypted datasets. +To cause the received dataset (or root dataset of a recursive stream) to be +received as an encryption root, specify encryption properties in the same +manner as is required for +.Nm zfs Cm create . 
+For instance: +.Dl # Nm zfs Cm send Pa tank/test@snap1 | Nm zfs Cm recv Fl o Sy encryption Ns = Ns Sy on Fl o keyformat=passphrase Fl o Sy keylocation Ns = Ns Pa file:///path/to/keyfile +.Pp +Note that +.Fl o Sy keylocation Ns = Ns Sy prompt +may not be specified here, since the standard input +is already being utilized for the send stream. +Once the receive has completed, you can use +.Nm zfs Cm set +to change this setting after the fact. +Similarly, you can receive a dataset as an encrypted child by specifying +.Op Fl x Ar encryption +to force the property to be inherited. +Overriding encryption properties (except for +.Sy keylocation ) +is not possible with raw send streams. +.It Fl s +If the receive is interrupted, save the partially received state, rather +than deleting it. +Interruption may be due to premature termination of the stream +.Po e.g. due to network failure or failure of the remote system +if the stream is being read over a network connection +.Pc , +a checksum error in the stream, termination of the +.Nm zfs Cm receive +process, or unclean shutdown of the system. +.Pp +The receive can be resumed with a stream generated by +.Nm zfs Cm send Fl t Ar token , +where the +.Ar token +is the value of the +.Sy receive_resume_token +property of the filesystem or volume which is received into. +.Pp +To use this flag, the storage pool must have the +.Sy extensible_dataset +feature enabled. +See +.Xr zpool-features 7 +for details on ZFS feature flags. +.It Fl u +File system that is associated with the received stream is not mounted. +.It Fl v +Print verbose information about the stream and the time required to perform the +receive operation. +.It Fl x Em property +Ensures that the effective value of the specified property after the +receive is unaffected by the value of that property in the send stream (if any), +as if the property had been excluded from the send stream. 
+.Pp +If the specified property is not present in the send stream, this option does +nothing. +.Pp +If a received property needs to be overridden, the effective value will be +set or inherited, depending on whether the property is inheritable or not. +.Pp +In the case of an incremental update, +.Fl x +leaves any existing local setting or explicit inheritance unchanged. +.Pp +All +.Fl o +restrictions (e.g. set-once) apply equally to +.Fl x . +.El +.It Xo +.Nm zfs +.Cm receive +.Fl A +.Ar filesystem Ns | Ns Ar volume +.Xc +Abort an interrupted +.Nm zfs Cm receive Fl s , +deleting its saved partially received state. +.El +. +.Sh SEE ALSO +.Xr zfs-send 8 , +.Xr zstream 8 diff --git a/man/man8/zfs-recv.8 b/man/man8/zfs-recv.8 new file mode 120000 index 0000000000..f11b7add7b --- /dev/null +++ b/man/man8/zfs-recv.8 @@ -0,0 +1 @@ +zfs-receive.8 \ No newline at end of file diff --git a/man/man8/zfs-redact.8 b/man/man8/zfs-redact.8 new file mode 120000 index 0000000000..f7c6057883 --- /dev/null +++ b/man/man8/zfs-redact.8 @@ -0,0 +1 @@ +zfs-send.8 \ No newline at end of file diff --git a/man/man8/zfs-release.8 b/man/man8/zfs-release.8 new file mode 120000 index 0000000000..58809d66a5 --- /dev/null +++ b/man/man8/zfs-release.8 @@ -0,0 +1 @@ +zfs-hold.8 \ No newline at end of file diff --git a/man/man8/zfs-rename.8 b/man/man8/zfs-rename.8 new file mode 100644 index 0000000000..6caee50657 --- /dev/null +++ b/man/man8/zfs-rename.8 @@ -0,0 +1,123 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. 
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd September 1, 2020 +.Dt ZFS-RENAME 8 +.Os +. +.Sh NAME +.Nm zfs-rename +.Nd rename ZFS dataset +.Sh SYNOPSIS +.Nm zfs +.Cm rename +.Op Fl f +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Nm zfs +.Cm rename +.Fl p +.Op Fl f +.Ar filesystem Ns | Ns Ar volume +.Ar filesystem Ns | Ns Ar volume +.Nm zfs +.Cm rename +.Fl u +.Op Fl f +.Ar filesystem Ar filesystem +.Nm zfs +.Cm rename +.Fl r +.Ar snapshot Ar snapshot +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm rename +.Op Fl f +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Xc +.It Xo +.Nm zfs +.Cm rename +.Fl p +.Op Fl f +.Ar filesystem Ns | Ns Ar volume +.Ar filesystem Ns | Ns Ar volume +.Xc +.It Xo +.Nm zfs +.Cm rename +.Fl u +.Op Fl f +.Ar filesystem +.Ar filesystem +.Xc +Renames the given dataset. +The new target can be located anywhere in the ZFS hierarchy, with the exception +of snapshots. +Snapshots can only be renamed within the parent file system or volume. 
+When renaming a snapshot, the parent file system of the snapshot does not need +to be specified as part of the second argument. +Renamed file systems can inherit new mount points, in which case they are +unmounted and remounted at the new mount point. +.Bl -tag -width "-a" +.It Fl f +Force unmount any file systems that need to be unmounted in the process. +This flag has no effect if used together with the +.Fl u +flag. +.It Fl p +Creates all the nonexistent parent datasets. +Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +.It Fl u +Do not remount file systems during rename. +If a file system's +.Sy mountpoint +property is set to +.Sy legacy +or +.Sy none , +the file system is not unmounted even if this option is not given. +.El +.It Xo +.Nm zfs +.Cm rename +.Fl r +.Ar snapshot Ar snapshot +.Xc +Recursively rename the snapshots of all descendent datasets. +Snapshots are the only dataset that can be renamed recursively. +.El diff --git a/man/man8/zfs-rollback.8 b/man/man8/zfs-rollback.8 new file mode 100644 index 0000000000..08e914b476 --- /dev/null +++ b/man/man8/zfs-rollback.8 @@ -0,0 +1,75 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 27, 2021 +.Dt ZFS-ROLLBACK 8 +.Os +. +.Sh NAME +.Nm zfs-rollback +.Nd roll ZFS dataset back to snapshot +.Sh SYNOPSIS +.Nm zfs +.Cm rollback +.Op Fl Rfr +.Ar snapshot +. +.Sh DESCRIPTION +When a dataset is rolled back, all data that has changed since the snapshot is +discarded, and the dataset reverts to the state at the time of the snapshot. +By default, the command refuses to roll back to a snapshot other than the most +recent one. +In order to do so, all intermediate snapshots and bookmarks must be destroyed by +specifying the +.Fl r +option. +.Pp +The +.Fl rR +options do not recursively destroy the child snapshots of a recursive snapshot. +Only direct snapshots of the specified filesystem are destroyed by either of +these options. +To completely roll back a recursive snapshot, you must roll back the individual +child snapshots. +.Bl -tag -width "-R" +.It Fl R +Destroy any more recent snapshots and bookmarks, as well as any clones of those +snapshots. +.It Fl f +Used with the +.Fl R +option to force an unmount of any clone file systems that are to be destroyed. +.It Fl r +Destroy any snapshots and bookmarks more recent than the one specified. +.El +. 
+.Sh SEE ALSO +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 new file mode 100644 index 0000000000..e83a92e4b3 --- /dev/null +++ b/man/man8/zfs-send.8 @@ -0,0 +1,655 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd April 15, 2021 +.Dt ZFS-SEND 8 +.Os +. 
+.Sh NAME +.Nm zfs-send +.Nd generate backup stream of ZFS dataset +.Sh SYNOPSIS +.Nm zfs +.Cm send +.Op Fl DLPRbcehnpsvw +.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot +.Ar snapshot +.Nm zfs +.Cm send +.Op Fl DLPcensvw +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Nm zfs +.Cm send +.Fl -redact Ar redaction_bookmark +.Op Fl DLPcenpv +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar snapshot +.Nm zfs +.Cm send +.Op Fl Penv +.Fl t +.Ar receive_resume_token +.Nm zfs +.Cm send +.Op Fl Pnv +.Fl S Ar filesystem +.Nm zfs +.Cm redact +.Ar snapshot redaction_bookmark +.Ar redaction_snapshot Ns … +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm send +.Op Fl DLPRbcehnpsvw +.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot +.Ar snapshot +.Xc +Creates a stream representation of the second +.Ar snapshot , +which is written to standard output. +The output can be redirected to a file or to a different system +.Po for example, using +.Xr ssh 1 +.Pc . +By default, a full stream is generated. +.Bl -tag -width "-D" +.It Fl D , -dedup +Deduplicated send is no longer supported. +This flag is accepted for backwards compatibility, but a regular, +non-deduplicated stream will be generated. +.It Fl I Ar snapshot +Generate a stream package that sends all intermediary snapshots from the first +snapshot to the second snapshot. +For example, +.Fl I Em @a Em fs@d +is similar to +.Fl i Em @a Em fs@b Ns \&; Fl i Em @b Em fs@c Ns \&; Fl i Em @c Em fs@d . +The incremental source may be specified as with the +.Fl i +option. +.It Fl L , -large-block +Generate a stream which may contain blocks larger than 128KB. +This flag has no effect if the +.Sy large_blocks +pool feature is disabled, or if the +.Sy recordsize +property of this filesystem has never been set above 128KB. +The receiving system must have the +.Sy large_blocks +pool feature enabled as well. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy large_blocks +feature.
+.It Fl P , -parsable +Print machine-parsable verbose information about the stream package generated. +.It Fl R , -replicate +Generate a replication stream package, which will replicate the specified +file system, and all descendent file systems, up to the named snapshot. +When received, all properties, snapshots, descendent file systems, and clones +are preserved. +.Pp +If the +.Fl i +or +.Fl I +flags are used in conjunction with the +.Fl R +flag, an incremental replication stream is generated. +The current values of properties, and current snapshot and file system names are +set when the stream is received. +If the +.Fl F +flag is specified when this stream is received, snapshots and file systems that +do not exist on the sending side are destroyed. +If the +.Fl R +flag is used to send encrypted datasets, then +.Fl w +must also be specified. +.It Fl e , -embed +Generate a more compact stream by using +.Sy WRITE_EMBEDDED +records for blocks which are stored more compactly on disk by the +.Sy embedded_data +pool feature. +This flag has no effect if the +.Sy embedded_data +feature is disabled. +The receiving system must have the +.Sy embedded_data +feature enabled. +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. +Datasets that are sent with this flag may not be +received as an encrypted dataset, since encrypted datasets cannot use the +.Sy embedded_data +feature. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy embedded_data +feature. +.It Fl b , -backup +Sends only received property values whether or not they are overridden by local +settings, but only if the dataset has ever been received. +Use this option when you want +.Nm zfs Cm receive +to restore received properties backed up on the sent dataset and to avoid +sending local settings that may have nothing to do with the source dataset, +but only with how the data is backed up. 
+.It Fl c , -compressed +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory +.Po see the +.Sy compression +property for details +.Pc . +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. +If the +.Sy large_blocks +feature is enabled on the sending system but the +.Fl L +option is not supplied in conjunction with +.Fl c , +then the data will be decompressed before sending so it can be split into +smaller block sizes. +Streams sent with +.Fl c +will not have their data recompressed on the receiver side using +.Fl o Sy compress Ns = Ar value . +The data will stay compressed as it was from the sender. +The new compression property will be set for future data. +Note that uncompressed data from the sender will still attempt to +compress on the receiver, unless you specify +.Fl o Sy compress Ns = Em off . +.It Fl w , -raw +For encrypted datasets, send data exactly as it exists on disk. +This allows backups to be taken even if encryption keys are not currently loaded. +The backup may then be received on an untrusted machine since that machine will +not have the encryption keys to read the protected data or alter it without +being detected. +Upon being received, the dataset will have the same encryption +keys as it did on the send side, although the +.Sy keylocation +property will be defaulted to +.Sy prompt +if not otherwise provided. +For unencrypted datasets, this flag will be equivalent to +.Fl Lec . +Note that if you do not use this flag for sending encrypted datasets, data will +be sent unencrypted and may be re-encrypted with a different encryption key on +the receiving system, which will disable the ability to do a raw send to that +system for incrementals. 
+.It Fl h , -holds +Generate a stream package that includes any snapshot holds (created with the +.Nm zfs Cm hold +command), and indicates to +.Nm zfs Cm receive +that the holds be applied to the dataset on the receiving system. +.It Fl i Ar snapshot +Generate an incremental stream from the first +.Ar snapshot +.Pq the incremental source +to the second +.Ar snapshot +.Pq the incremental target . +The incremental source can be specified as the last component of the snapshot +name +.Po the +.Sy @ +character and following +.Pc +and it is assumed to be from the same file system as the incremental target. +.Pp +If the destination is a clone, the source may be the origin snapshot, which must +be fully specified +.Po for example, +.Em pool/fs@origin , +not just +.Em @origin +.Pc . +.It Fl n , -dryrun +Do a dry-run +.Pq Qq No-op +send. +Do not generate any actual send data. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to determine what data will be sent. +In this case, the verbose output will be written to standard output +.Po contrast with a non-dry-run, where the stream is written to standard output +and the verbose output goes to standard error +.Pc . +.It Fl p , -props +Include the dataset's properties in the stream. +This flag is implicit when +.Fl R +is specified. +The receiving system must also support this feature. +Sends of encrypted datasets must use +.Fl w +when using this flag. +.It Fl s , -skip-missing +Allows sending a replication stream even when there are snapshots missing in the +hierarchy. +When a snapshot is missing, instead of throwing an error and aborting the send, +a warning is printed to the standard error stream and the dataset to which it belongs +and its descendents are skipped. +This flag can only be used in conjunction with +.Fl R . +.It Fl v , -verbose +Print verbose information about the stream package generated. +This information includes a per-second report of how much data has been sent.
+.Pp +The format of the stream is committed. +You will be able to receive your streams on future versions of ZFS. +.El +.It Xo +.Nm zfs +.Cm send +.Op Fl DLPcenvw +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Xc +Generate a send stream, which may be of a filesystem, and may be incremental +from a bookmark. +If the destination is a filesystem or volume, the pool must be read-only, or the +filesystem must not be mounted. +When the stream generated from a filesystem or volume is received, the default +snapshot name will be +.Qq --head-- . +.Bl -tag -width "-D" +.It Fl D , -dedup +Deduplicated send is no longer supported. +This flag is accepted for backwards compatibility, but a regular, +non-deduplicated stream will be generated. +.It Fl L , -large-block +Generate a stream which may contain blocks larger than 128KB. +This flag has no effect if the +.Sy large_blocks +pool feature is disabled, or if the +.Sy recordsize +property of this filesystem has never been set above 128KB. +The receiving system must have the +.Sy large_blocks +pool feature enabled as well. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy large_blocks +feature. +.It Fl P , -parsable +Print machine-parsable verbose information about the stream package generated. +.It Fl c , -compressed +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory +.Po see the +.Sy compression +property for details +.Pc . +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. +If the +.Sy large_blocks +feature is enabled on the sending system but the +.Fl L +option is not supplied in conjunction with +.Fl c , +then the data will be decompressed before sending so it can be split into +smaller block sizes. +.It Fl w , -raw +For encrypted datasets, send data exactly as it exists on disk. 
+This allows backups to be taken even if encryption keys are not currently loaded. +The backup may then be received on an untrusted machine since that machine will +not have the encryption keys to read the protected data or alter it without +being detected. +Upon being received, the dataset will have the same encryption +keys as it did on the send side, although the +.Sy keylocation +property will be defaulted to +.Sy prompt +if not otherwise provided. +For unencrypted datasets, this flag will be equivalent to +.Fl Lec . +Note that if you do not use this flag for sending encrypted datasets, data will +be sent unencrypted and may be re-encrypted with a different encryption key on +the receiving system, which will disable the ability to do a raw send to that +system for incrementals. +.It Fl e , -embed +Generate a more compact stream by using +.Sy WRITE_EMBEDDED +records for blocks which are stored more compactly on disk by the +.Sy embedded_data +pool feature. +This flag has no effect if the +.Sy embedded_data +feature is disabled. +The receiving system must have the +.Sy embedded_data +feature enabled. +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. +Datasets that are sent with this flag may not be received as an encrypted dataset, +since encrypted datasets cannot use the +.Sy embedded_data +feature. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy embedded_data +feature. +.It Fl i Ar snapshot Ns | Ns Ar bookmark +Generate an incremental send stream. +The incremental source must be an earlier snapshot in the destination's history. +It will commonly be an earlier snapshot in the destination's file system, in +which case it can be specified as the last component of the name +.Po the +.Sy # +or +.Sy @ +character and following +.Pc . 
+.Pp +If the incremental target is a clone, the incremental source can be the origin +snapshot, or an earlier snapshot in the origin's filesystem, or the origin's +origin, etc. +.It Fl n , -dryrun +Do a dry-run +.Pq Qq No-op +send. +Do not generate any actual send data. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to determine what data will be sent. +In this case, the verbose output will be written to standard output +.Po contrast with a non-dry-run, where the stream is written to standard output +and the verbose output goes to standard error +.Pc . +.It Fl v , -verbose +Print verbose information about the stream package generated. +This information includes a per-second report of how much data has been sent. +.El +.It Xo +.Nm zfs +.Cm send +.Fl -redact Ar redaction_bookmark +.Op Fl DLPcenpv +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar snapshot +.Xc +Generate a redacted send stream. +This send stream contains all blocks from the snapshot being sent that aren't +included in the redaction list contained in the bookmark specified by the +.Fl -redact +(or +.Fl d ) +flag. +The resulting send stream is said to be redacted with respect to the snapshots +the bookmark specified by the +.Fl -redact No flag was created with. +The bookmark must have been created by running +.Nm zfs Cm redact +on the snapshot being sent. +.Pp +This feature can be used to allow clones of a filesystem to be made available on +a remote system, in the case where their parent need not (or needs to not) be +usable. +For example, if a filesystem contains sensitive data, and it has clones where +that sensitive data has been secured or replaced with dummy data, redacted sends +can be used to replicate the secured data without replicating the original +sensitive data, while still sharing all possible blocks. 
+A snapshot that has been redacted with respect to a set of snapshots will +contain all blocks referenced by at least one snapshot in the set, but will +contain none of the blocks referenced by none of the snapshots in the set. +In other words, if all snapshots in the set have modified a given block in the +parent, that block will not be sent; but if one or more snapshots have not +modified a block in the parent, they will still reference the parent's block, so +that block will be sent. +Note that only user data will be redacted. +.Pp +When the redacted send stream is received, we will generate a redacted +snapshot. +Due to the nature of redaction, a redacted dataset can only be used in the +following ways: +.Bl -enum -width "a." +.It +To receive, as a clone, an incremental send from the original snapshot to one +of the snapshots it was redacted with respect to. +In this case, the stream will produce a valid dataset when received because all +blocks that were redacted in the parent are guaranteed to be present in the +child's send stream. +This use case will produce a normal snapshot, which can be used just like other +snapshots. +. +.It +To receive an incremental send from the original snapshot to something +redacted with respect to a subset of the set of snapshots the initial snapshot +was redacted with respect to. +In this case, each block that was redacted in the original is still redacted +(redacting with respect to additional snapshots causes less data to be redacted +(because the snapshots define what is permitted, and everything else is +redacted)). +This use case will produce a new redacted snapshot. +.It +To receive an incremental send from a redaction bookmark of the original +snapshot that was created when redacting with respect to a subset of the set of +snapshots the initial snapshot was created with respect to +anything else. 
+A send stream from such a redaction bookmark will contain all of the blocks +necessary to fill in any redacted data, should it be needed, because the sending +system is aware of what blocks were originally redacted. +This will either produce a normal snapshot or a redacted one, depending on +whether the new send stream is redacted. +.It +To receive an incremental send from a redacted version of the initial +snapshot that is redacted with respect to a subset of the set of snapshots the +initial snapshot was created with respect to. +A send stream from a compatible redacted dataset will contain all of the blocks +necessary to fill in any redacted data. +This will either produce a normal snapshot or a redacted one, depending on +whether the new send stream is redacted. +.It +To receive a full send as a clone of the redacted snapshot. +Since the stream is a full send, it definitionally contains all the data needed +to create a new dataset. +This use case will either produce a normal snapshot or a redacted one, depending +on whether the full send stream was redacted. +.El +.Pp +These restrictions are detected and enforced by +.Nm zfs Cm receive ; +a redacted send stream will contain the list of snapshots that the stream is +redacted with respect to. +These are stored with the redacted snapshot, and are used to detect and +correctly handle the cases above. +Note that for technical reasons, +raw sends and redacted sends cannot be combined at this time. +.It Xo +.Nm zfs +.Cm send +.Op Fl Penv +.Fl t +.Ar receive_resume_token +.Xc +Creates a send stream which resumes an interrupted receive. +The +.Ar receive_resume_token +is the value of this property on the filesystem or volume that was being +received into. +See the documentation for +.Nm zfs Cm receive Fl s +for more details. +.It Xo +.Nm zfs +.Cm send +.Op Fl Pnv +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Fl S +.Ar filesystem +.Xc +Generate a send stream from a dataset that has been partially received.
+.Bl -tag -width "-L" +.It Fl S , -saved +This flag requires that the specified filesystem previously received a resumable +send that did not finish and was interrupted. +In such scenarios this flag +enables the user to send this partially received state. +Using this flag will always use the last fully received snapshot +as the incremental source if it exists. +.El +.It Xo +.Nm zfs +.Cm redact +.Ar snapshot redaction_bookmark +.Ar redaction_snapshot Ns … +.Xc +Generate a new redaction bookmark. +In addition to the typical bookmark information, a redaction bookmark contains +the list of redacted blocks and the list of redaction snapshots specified. +The redacted blocks are blocks in the snapshot which are not referenced by any +of the redaction snapshots. +These blocks are found by iterating over the metadata in each redaction snapshot +to determine what has been changed since the target snapshot. +Redaction is designed to support redacted zfs sends; see the entry for +.Nm zfs Cm send +for more information on the purpose of this operation. +If a redact operation fails partway through (due to an error or a system +failure), the redaction can be resumed by rerunning the same command. +.El +.Ss Redaction +ZFS has support for a limited version of data subsetting, in the form of +redaction. +Using the +.Nm zfs Cm redact +command, a +.Sy redaction bookmark +can be created that stores a list of blocks containing sensitive information. +When provided to +.Nm zfs Cm send , +this causes a +.Sy redacted send +to occur. +Redacted sends omit the blocks containing sensitive information, +replacing them with REDACT records. +When these send streams are received, a +.Sy redacted dataset +is created. +A redacted dataset cannot be mounted by default, since it is incomplete. +It can be used to receive other send streams. 
+In this way datasets can be used for data backup and replication, +with all the benefits that zfs send and receive have to offer, +while protecting sensitive information from being +stored on less-trusted machines or services. +.Pp +For the purposes of redaction, there are two steps to the process: +a redact step, and a send/receive step. +First, a redaction bookmark is created. +This is done by providing the +.Nm zfs Cm redact +command with a parent snapshot, a bookmark to be created, and a number of +redaction snapshots. +These redaction snapshots must be descendants of the parent snapshot, +and they should modify data that is considered sensitive in some way. +Any blocks of data modified by all of the redaction snapshots will +be listed in the redaction bookmark, because they represent the truly sensitive +information. +When it comes to the send step, the send process will not send +the blocks listed in the redaction bookmark, instead replacing them with +REDACT records. +When received on the target system, this will create a +redacted dataset, missing the data that corresponds to the blocks in the +redaction bookmark on the sending system. +The incremental send streams from +the original parent to the redaction snapshots can then also be received on +the target system, and this will produce a complete snapshot that can be used +normally. +Incrementals from one snapshot on the parent filesystem and another +can also be done by sending from the redaction bookmark, rather than the +snapshots themselves. +.Pp +In order to make the purpose of the feature more clear, an example is provided. +Consider a zfs filesystem containing four files. +These files represent information for an online shopping service. +One file contains a list of usernames and passwords, another contains purchase histories, +a third contains click tracking data, and a fourth contains user preferences.
+The owner of this data wants to make it available for their development teams to +test against, and their market research teams to do analysis on. +The development teams need information about user preferences and the click +tracking data, while the market research teams need information about purchase +histories and user preferences. +Neither needs access to the usernames and passwords. +However, because all of this data is stored in one ZFS filesystem, +it must all be sent and received together. +In addition, the owner of the data +wants to take advantage of features like compression, checksumming, and +snapshots, so they do want to continue to use ZFS to store and transmit their data. +Redaction can help them do so. +First, they would make two clones of a snapshot of the data on the source. +In one clone, they create the setup they want their market research team to see; +they delete the usernames and passwords file, +and overwrite the click tracking data with dummy information. +In another, they create the setup they want the development teams +to see, by replacing the passwords with fake information and replacing the +purchase histories with randomly generated ones. +They would then create a redaction bookmark on the parent snapshot, +using snapshots on the two clones as redaction snapshots. +The parent can then be sent, redacted, to the target +server where the research and development teams have access. +Finally, incremental sends from the parent snapshot to each of the clones can be sent +to and received on the target server; these snapshots are identical to the +ones on the source, and are ready to be used, while the parent snapshot on the +target contains none of the username and password data present on the source, +because it was removed by the redacted send operation. +. 
+.Sh SEE ALSO +.Xr zfs-bookmark 8 , +.Xr zfs-receive 8 , +.Xr zfs-redact 8 , +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8 new file mode 100644 index 0000000000..a3588cc266 --- /dev/null +++ b/man/man8/zfs-set.8 @@ -0,0 +1,182 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 2, 2021 +.Dt ZFS-SET 8 +.Os +. 
+.Sh NAME +.Nm zfs-set +.Nd set properties on ZFS datasets +.Sh SYNOPSIS +.Nm zfs +.Cm set +.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns … +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … +.Nm zfs +.Cm get +.Op Fl r Ns | Ns Fl d Ar depth +.Op Fl Hp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns … Oc +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc +.Cm all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … +.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns … +.Nm zfs +.Cm inherit +.Op Fl rS +.Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm set +.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns … +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … +.Xc +Only some properties can be edited. +See +.Xr zfsprops 7 +for more information on what properties can be set and acceptable +values. +Numeric values can be specified as exact values, or in a human-readable form +with a suffix of +.Sy B , K , M , G , T , P , E , Z +.Po for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, +or zettabytes, respectively +.Pc . +User properties can be set on snapshots. +For more information, see the +.Em User Properties +section of +.Xr zfsprops 7 . +.It Xo +.Nm zfs +.Cm get +.Op Fl r Ns | Ns Fl d Ar depth +.Op Fl Hp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns … Oc +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc +.Cm all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … +.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns … +.Xc +Displays properties for the given datasets. +If no datasets are specified, then the command displays properties for all +datasets on the system. 
+For each property, the following columns are displayed: +.Bl -tag -compact -offset 4n -width "property" +.It Sy name +Dataset name +.It Sy property +Property name +.It Sy value +Property value +.It Sy source +Property source +.Sy local , default , inherited , temporary , received , No or Sy - Pq none . +.El +.Pp +All columns are displayed by default, though this can be controlled by using the +.Fl o +option. +This command takes a comma-separated list of properties as described in the +.Sx Native Properties +and +.Sx User Properties +sections of +.Xr zfsprops 7 . +.Pp +The value +.Sy all +can be used to display all properties that apply to the given dataset's type +.Pq Sy filesystem , volume , snapshot , No or Sy bookmark . +.Bl -tag -width "-s source" +.It Fl H +Display output in a form more easily parsed by scripts. +Any headers are omitted, and fields are explicitly separated by a single tab +instead of an arbitrary amount of space. +.It Fl d Ar depth +Recursively display any children of the dataset, limiting the recursion to +.Ar depth . +A depth of +.Sy 1 +will display only the dataset and its direct children. +.It Fl o Ar field +A comma-separated list of columns to display, defaults to +.Sy name , Ns Sy property , Ns Sy value , Ns Sy source . +.It Fl p +Display numbers in parsable +.Pq exact +values. +.It Fl r +Recursively display properties for any children. +.It Fl s Ar source +A comma-separated list of sources to display. +Those properties coming from a source other than those in this list are ignored. +Each source must be one of the following: +.Sy local , default , inherited , temporary , received , No or Sy none . +The default value is all sources. +.It Fl t Ar type +A comma-separated list of types to display, where +.Ar type +is one of +.Sy filesystem , snapshot , volume , bookmark , No or Sy all . 
+.El +.It Xo +.Nm zfs +.Cm inherit +.Op Fl rS +.Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … +.Xc +Clears the specified property, causing it to be inherited from an ancestor, +restored to default if no ancestor has the property set, or with the +.Fl S +option reverted to the received value if one exists. +See +.Xr zfsprops 7 +for a listing of default values, and details on which properties can be +inherited. +.Bl -tag -width "-r" +.It Fl r +Recursively inherit the given property for all children. +.It Fl S +Revert the property to the received value if one exists; otherwise operate as +if the +.Fl S +option was not specified. +.El +.El +. +.Sh SEE ALSO +.Xr zfsprops 7 , +.Xr zfs-list 8 diff --git a/man/man8/zfs-share.8 b/man/man8/zfs-share.8 new file mode 100644 index 0000000000..e30d538814 --- /dev/null +++ b/man/man8/zfs-share.8 @@ -0,0 +1,90 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-SHARE 8 +.Os +. +.Sh NAME +.Nm zfs-share +.Nd share and unshare ZFS filesystems +.Sh SYNOPSIS +.Nm zfs +.Cm share +.Fl a Ns | Ns Ar filesystem +.Nm zfs +.Cm unshare +.Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm share +.Fl a Ns | Ns Ar filesystem +.Xc +Shares available ZFS file systems. +.Bl -tag -width "-a" +.It Fl a +Share all available ZFS file systems. +Invoked automatically as part of the boot process. +.It Ar filesystem +Share the specified filesystem according to the +.Sy sharenfs +and +.Sy sharesmb +properties. +File systems are shared when the +.Sy sharenfs +or +.Sy sharesmb +property is set. +.El +.It Xo +.Nm zfs +.Cm unshare +.Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint +.Xc +Unshares currently shared ZFS file systems. +.Bl -tag -width "-a" +.It Fl a +Unshare all available ZFS file systems. +Invoked automatically as part of the shutdown process. +.It Ar filesystem Ns | Ns Ar mountpoint +Unshare the specified filesystem. +The command can also be given a path to a ZFS file system shared on the system. +.El +.El +. +.Sh SEE ALSO +.Xr exports 5 , +.Xr smb.conf 5 , +.Xr zfsprops 7 diff --git a/man/man8/zfs-snapshot.8 b/man/man8/zfs-snapshot.8 new file mode 100644 index 0000000000..225123f44b --- /dev/null +++ b/man/man8/zfs-snapshot.8 @@ -0,0 +1,76 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. 
+.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd May 27, 2021 +.Dt ZFS-SNAPSHOT 8 +.Os +. +.Sh NAME +.Nm zfs-snapshot +.Nd create snapshots of ZFS datasets +.Sh SYNOPSIS +.Nm zfs +.Cm snapshot +.Op Fl r +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Ar dataset Ns @ Ns Ar snapname Ns … +. +.Sh DESCRIPTION +All previous modifications by successful system calls to the file system are +part of the snapshots. +Snapshots are taken atomically, so that all snapshots correspond to the same +moment in time. +.Nm zfs Cm snap +can be used as an alias for +.Nm zfs Cm snapshot . +See the +.Sx Snapshots +section of +.Xr zfsconcepts 7 +for details. +.Bl -tag -width "-o" +.It Fl o Ar property Ns = Ns Ar value +Set the specified property; see +.Nm zfs Cm create +for details. +.It Fl r +Recursively create snapshots of all descendent datasets. +.El +.
+.Sh SEE ALSO +.Xr zfs-bookmark 8 , +.Xr zfs-clone 8 , +.Xr zfs-destroy 8 , +.Xr zfs-diff 8 , +.Xr zfs-hold 8 , +.Xr zfs-rename 8 , +.Xr zfs-rollback 8 , +.Xr zfs-send 8 diff --git a/man/man8/zfs-unallow.8 b/man/man8/zfs-unallow.8 new file mode 120000 index 0000000000..8886f334bf --- /dev/null +++ b/man/man8/zfs-unallow.8 @@ -0,0 +1 @@ +zfs-allow.8 \ No newline at end of file diff --git a/man/man8/zfs-unjail.8 b/man/man8/zfs-unjail.8 new file mode 120000 index 0000000000..04cc05a002 --- /dev/null +++ b/man/man8/zfs-unjail.8 @@ -0,0 +1 @@ +zfs-jail.8 \ No newline at end of file diff --git a/man/man8/zfs-unload-key.8 b/man/man8/zfs-unload-key.8 new file mode 120000 index 0000000000..d027a419d1 --- /dev/null +++ b/man/man8/zfs-unload-key.8 @@ -0,0 +1 @@ +zfs-load-key.8 \ No newline at end of file diff --git a/man/man8/zfs-unmount.8 b/man/man8/zfs-unmount.8 new file mode 120000 index 0000000000..be0d9dbf6c --- /dev/null +++ b/man/man8/zfs-unmount.8 @@ -0,0 +1 @@ +zfs-mount.8 \ No newline at end of file diff --git a/man/man8/zfs-upgrade.8 b/man/man8/zfs-upgrade.8 new file mode 100644 index 0000000000..f3620faa61 --- /dev/null +++ b/man/man8/zfs-upgrade.8 @@ -0,0 +1,103 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-UPGRADE 8 +.Os +. +.Sh NAME +.Nm zfs-upgrade +.Nd manage on-disk version of ZFS filesystems +.Sh SYNOPSIS +.Nm zfs +.Cm upgrade +.Nm zfs +.Cm upgrade +.Fl v +.Nm zfs +.Cm upgrade +.Op Fl r +.Op Fl V Ar version +.Fl a Ns | Ns Ar filesystem +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm upgrade +.Xc +Displays a list of file systems that are not the most recent version. +.It Xo +.Nm zfs +.Cm upgrade +.Fl v +.Xc +Displays a list of currently supported file system versions. +.It Xo +.Nm zfs +.Cm upgrade +.Op Fl r +.Op Fl V Ar version +.Fl a Ns | Ns Ar filesystem +.Xc +Upgrades file systems to a new on-disk version. +Once this is done, the file systems will no longer be accessible on systems +running older versions of ZFS. +.Nm zfs Cm send +streams generated from new snapshots of these file systems cannot be accessed on +systems running older versions of ZFS. +.Pp +In general, the file system version is independent of the pool version. +See +.Xr zpool-features 7 +for information on features of ZFS storage pools. 
+.Pp +In some cases, the file system version and the pool version are interrelated and +the pool version must be upgraded before the file system version can be +upgraded. +.Bl -tag -width "filesystem" +.It Fl V Ar version +Upgrade to +.Ar version . +If not specified, upgrade to the most recent version. +This +option can only be used to increase the version number, and only up to the most +recent version supported by this version of ZFS. +.It Fl a +Upgrade all file systems on all imported pools. +.It Ar filesystem +Upgrade the specified file system. +.It Fl r +Upgrade the specified file system and all descendent file systems. +.El +.El +.Sh SEE ALSO +.Xr zpool-upgrade 8 diff --git a/man/man8/zfs-userspace.8 b/man/man8/zfs-userspace.8 new file mode 100644 index 0000000000..b7bd61b570 --- /dev/null +++ b/man/man8/zfs-userspace.8 @@ -0,0 +1,187 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. 
All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-USERSPACE 8 +.Os +. +.Sh NAME +.Nm zfs-userspace +.Nd display space and quotas of ZFS dataset +.Sh SYNOPSIS +.Nm zfs +.Cm userspace +.Op Fl Hinp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar field Oc Ns … +.Oo Fl S Ar field Oc Ns … +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc +.Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path +.Nm zfs +.Cm groupspace +.Op Fl Hinp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar field Oc Ns … +.Oo Fl S Ar field Oc Ns … +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc +.Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path +.Nm zfs +.Cm projectspace +.Op Fl Hp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar field Oc Ns … +.Oo Fl S Ar field Oc Ns … +.Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path +. +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm zfs +.Cm userspace +.Op Fl Hinp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar field Oc Ns … +.Oo Fl S Ar field Oc Ns … +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc +.Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path +.Xc +Displays space consumed by, and quotas on, each user in the specified filesystem, +snapshot, or path. +If a path is given, the filesystem that contains that path will be used. +This corresponds to the +.Sy userused@ Ns Em user , +.Sy userobjused@ Ns Em user , +.Sy userquota@ Ns Em user , +and +.Sy userobjquota@ Ns Em user +properties. +.Bl -tag -width "-S field" +.It Fl H +Do not print headers, use tab-delimited output. +.It Fl S Ar field +Sort by this field in reverse order. +See +.Fl s . +.It Fl i +Translate SID to POSIX ID. +The POSIX ID may be ephemeral if no mapping exists. 
+Normal POSIX interfaces +.Pq like Xr stat 2 , Nm ls Fl l +perform this translation, so the +.Fl i +option allows the output from +.Nm zfs Cm userspace +to be compared directly with those utilities. +However, +.Fl i +may lead to confusion if some files were created by an SMB user before an +SMB-to-POSIX name mapping was established. +In such a case, some files will be owned by the SMB entity and some by the POSIX +entity. +However, the +.Fl i +option will report that the POSIX entity has the total usage and quota for both. +.It Fl n +Print numeric ID instead of user/group name. +.It Fl o Ar field Ns Oo , Ns Ar field Oc Ns … +Display only the specified fields from the following set: +.Sy type , +.Sy name , +.Sy used , +.Sy quota . +The default is to display all fields. +.It Fl p +Use exact +.Pq parsable +numeric output. +.It Fl s Ar field +Sort output by this field. +The +.Fl s +and +.Fl S +flags may be specified multiple times to sort first by one field, then by +another. +The default is +.Fl s Sy type Fl s Sy name . +.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns … +Print only the specified types from the following set: +.Sy all , +.Sy posixuser , +.Sy smbuser , +.Sy posixgroup , +.Sy smbgroup . +The default is +.Fl t Sy posixuser , Ns Sy smbuser . +The default can be changed to include group types. +.El +.It Xo +.Nm zfs +.Cm groupspace +.Op Fl Hinp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar field Oc Ns … +.Oo Fl S Ar field Oc Ns … +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc +.Ar filesystem Ns | Ns Ar snapshot +.Xc +Displays space consumed by, and quotas on, each group in the specified +filesystem or snapshot. +This subcommand is identical to +.Cm userspace , +except that the default types to display are +.Fl t Sy posixgroup , Ns Sy smbgroup .
+.It Xo +.Nm zfs +.Cm projectspace +.Op Fl Hp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc +.Oo Fl s Ar field Oc Ns … +.Oo Fl S Ar field Oc Ns … +.Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path +.Xc +Displays space consumed by, and quotas on, each project in the specified +filesystem or snapshot. +This subcommand is identical to +.Cm userspace , +except that the project identifier is a numeral, not a name. +It therefore needs neither the option +.Fl i +for SID to POSIX ID nor +.Fl n +for numeric ID, nor +.Fl t +for types. +.El +. +.Sh SEE ALSO +.Xr zfsprops 7 , +.Xr zfs-set 8 diff --git a/man/man8/zfs-wait.8 b/man/man8/zfs-wait.8 new file mode 100644 index 0000000000..81bc156365 --- /dev/null +++ b/man/man8/zfs-wait.8 @@ -0,0 +1,65 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" +.Dd May 31, 2021 +.Dt ZFS-WAIT 8 +.Os +. +.Sh NAME +.Nm zfs-wait +.Nd wait for activity in ZFS filesystem to stop +.Sh SYNOPSIS +.Nm zfs +.Cm wait +.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns … +.Ar filesystem +. +.Sh DESCRIPTION +Waits until all background activity of the given types has ceased in the given +filesystem. +The activity could cease because it has completed or because the filesystem has +been destroyed or unmounted. +If no activities are specified, the command waits until background activity of +every type listed below has ceased. +If there is no activity of the given types in progress, the command returns +immediately. +.Pp +These are the possible values for +.Ar activity , +along with what each one waits for: +.Bl -tag -compact -offset Ds -width "deleteq" +.It Sy deleteq +The filesystem's internal delete queue to empty +.El +.Pp +Note that the internal delete queue does not finish draining until +all large files have had time to be fully destroyed and all open file +handles to unlinked files are closed. +. +.Sh SEE ALSO +.Xr lsof 8 diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 8d7b0bbb6f..48453ef46c 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -18,317 +18,40 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2011, Pawel Jakub Dawidek +.\" Copyright (c) 2012, Glen Barber +.\" Copyright (c) 2012, Bryan Drewery +.\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright (c) 2014, Xin LI +.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. +.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. 
.\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. -.\" Copyright 2018 Joyent, Inc. +.\" Copyright 2019 Joyent, Inc. .\" -.Dd April 30, 2019 -.Dt ZFS 8 SMM -.Os Linux +.Dd June 30, 2019 +.Dt ZFS 8 +.Os +. .Sh NAME .Nm zfs -.Nd configures ZFS file systems +.Nd configure ZFS datasets .Sh SYNOPSIS .Nm .Fl ?V .Nm -.Cm create -.Op Fl p -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem -.Nm -.Cm create -.Op Fl ps -.Op Fl b Ar blocksize -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Fl V Ar size Ar volume -.Nm -.Cm destroy -.Op Fl Rfnprv -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm destroy -.Op Fl Rdnprv -.Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns -.Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns ... -.Nm -.Cm destroy -.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark -.Nm -.Cm snapshot -.Op Fl r -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem Ns @ Ns Ar snapname Ns | Ns Ar volume Ns @ Ns Ar snapname Ns ... -.Nm -.Cm rollback -.Op Fl Rfr -.Ar snapshot -.Nm -.Cm clone -.Op Fl p -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar snapshot Ar filesystem Ns | Ns Ar volume -.Nm -.Cm promote -.Ar clone-filesystem -.Nm -.Cm rename -.Op Fl f -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm -.Cm rename -.Op Fl fp -.Ar filesystem Ns | Ns Ar volume -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm rename -.Fl r -.Ar snapshot Ar snapshot -.Nm -.Cm list -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Oo Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... Oc -.Oo Fl s Ar property Oc Ns ... -.Oo Fl S Ar property Oc Ns ... -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc -.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Oc Ns ... -.Nm -.Cm set -.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... 
-.Nm -.Cm get -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... Oc -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc -.Cm all | Ar property Ns Oo , Ns Ar property Oc Ns ... -.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns ... -.Nm -.Cm inherit -.Op Fl rS -.Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Nm -.Cm upgrade -.Nm -.Cm upgrade -.Fl v -.Nm -.Cm upgrade -.Op Fl r -.Op Fl V Ar version -.Fl a | Ar filesystem -.Nm -.Cm userspace -.Op Fl Hinp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar snapshot -.Nm -.Cm groupspace -.Op Fl Hinp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar snapshot -.Nm -.Cm projectspace -.Op Fl Hp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Ar filesystem Ns | Ns Ar snapshot -.Nm -.Cm project -.Oo Fl d Ns | Ns Fl r Ns Oc -.Ar file Ns | Ns Ar directory Ns ... -.Nm -.Cm project -.Fl C -.Oo Fl kr Ns Oc -.Ar file Ns | Ns Ar directory Ns ... -.Nm -.Cm project -.Fl c -.Oo Fl 0 Ns Oc -.Oo Fl d Ns | Ns Fl r Ns Oc -.Op Fl p Ar id -.Ar file Ns | Ns Ar directory Ns ... -.Nm -.Cm project -.Op Fl p Ar id -.Oo Fl rs Ns Oc -.Ar file Ns | Ns Ar directory Ns ... 
-.Nm -.Cm mount -.Nm -.Cm mount -.Op Fl Olv -.Op Fl o Ar options -.Fl a | Ar filesystem -.Nm -.Cm unmount -.Op Fl f -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Nm -.Cm share -.Fl a | Ar filesystem -.Nm -.Cm unshare -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Nm -.Cm bookmark -.Ar snapshot bookmark -.Nm -.Cm send -.Op Fl DLPRbcehnpvw -.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot -.Ar snapshot -.Nm -.Cm send -.Op Fl LPcenvw -.Op Fl i Ar snapshot Ns | Ns Ar bookmark -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm -.Cm send -.Op Fl Penv -.Fl t Ar receive_resume_token -.Nm -.Cm receive -.Op Fl Fhnsuv -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Op Fl o Ar property Ns = Ns Ar value -.Op Fl x Ar property -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm -.Cm receive -.Op Fl Fhnsuv -.Op Fl d Ns | Ns Fl e -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Op Fl o Ar property Ns = Ns Ar value -.Op Fl x Ar property -.Ar filesystem -.Nm -.Cm receive -.Fl A -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Op Fl dglu -.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Op Fl dl -.Fl e Ns | Ns Sy everyone -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Fl c -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Fl s No @ Ns Ar setname -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl dglru -.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... 
Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl dlr -.Fl e Ns | Ns Sy everyone -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl r -.Fl c -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl r -.Fl s @ Ns Ar setname -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm hold -.Op Fl r -.Ar tag Ar snapshot Ns ... -.Nm -.Cm holds -.Op Fl rH -.Ar snapshot Ns ... -.Nm -.Cm release -.Op Fl r -.Ar tag Ar snapshot Ns ... -.Nm -.Cm diff -.Op Fl FHt -.Ar snapshot Ar snapshot Ns | Ns Ar filesystem -.Nm -.Cm program -.Op Fl jn -.Op Fl t Ar instruction-limit -.Op Fl m Ar memory-limit -.Ar pool script -.Op Ar arg1 No ... -.Nm -.Cm load-key -.Op Fl nr -.Op Fl L Ar keylocation -.Fl a | Ar filesystem -.Nm -.Cm unload-key -.Op Fl r -.Fl a | Ar filesystem -.Nm -.Cm change-key -.Op Fl l -.Op Fl o Ar keylocation Ns = Ns Ar value -.Op Fl o Ar keyformat Ns = Ns Ar value -.Op Fl o Ar pbkdf2iters Ns = Ns Ar value -.Ar filesystem -.Nm -.Cm change-key -.Fl i -.Op Fl l -.Ar filesystem -.Nm .Cm version +.Nm +.Cm subcommand +.Op Ar arguments +. .Sh DESCRIPTION The .Nm @@ -336,23 +59,18 @@ command configures ZFS datasets within a ZFS storage pool, as described in .Xr zpool 8 . A dataset is identified by a unique path within the ZFS namespace. For example: -.Bd -literal -pool/{filesystem,volume,snapshot} -.Ed +.Dl pool/{filesystem,volume,snapshot} .Pp where the maximum length of a dataset name is -.Dv MAXNAMELEN -.Pq 256 bytes +.Sy MAXNAMELEN Pq 256B and the maximum amount of nesting allowed in a path is 50 levels deep. 
.Pp A dataset can be one of the following: -.Bl -tag -width "file system" +.Bl -tag -offset Ds -width "file system" .It Sy file system -A ZFS dataset of type -.Sy filesystem -can be mounted within the standard system namespace and behaves like other file +Can be mounted within the standard system namespace and behaves like other file systems. -While ZFS file systems are designed to be POSIX compliant, known issues exist +While ZFS file systems are designed to be POSIX-compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to non-standard behavior when checking file system free space. @@ -369,2091 +87,40 @@ or .It Sy bookmark Much like a .Sy snapshot , -but without the hold on on-disk data. It can be used as the source of a send -(but not for a receive). It is specified as +but without the hold on on-disk data. +It can be used as the source of a send (but not for a receive). +It is specified as .Ar filesystem Ns # Ns Ar name or .Ar volume Ns # Ns Ar name . .El -.Ss ZFS File System Hierarchy -A ZFS storage pool is a logical collection of devices that provide space for -datasets. -A storage pool is also the root of the ZFS file system hierarchy. -.Pp -The root of the pool can be accessed as a file system, such as mounting and -unmounting, taking snapshots, and setting properties. -The physical storage characteristics, however, are managed by the -.Xr zpool 8 -command. .Pp See -.Xr zpool 8 -for more information on creating and administering pools. -.Ss Snapshots -A snapshot is a read-only copy of a file system or volume. -Snapshots can be created extremely quickly, and initially consume no additional -space within the pool. -As data within the active dataset changes, the snapshot consumes more data than -would otherwise be shared with the active dataset. -.Pp -Snapshots can have arbitrary names. 
-Snapshots of volumes can be cloned or rolled back, visibility is determined -by the -.Sy snapdev -property of the parent volume. -.Pp -File system snapshots can be accessed under the -.Pa .zfs/snapshot -directory in the root of the file system. -Snapshots are automatically mounted on demand and may be unmounted at regular -intervals. -The visibility of the -.Pa .zfs -directory can be controlled by the -.Sy snapdir -property. -.Ss Bookmarks -A bookmark is like a snapshot, a read-only copy of a file system or volume. -Bookmarks can be created extremely quickly, compared to snapshots, and they -consume no additional space within the pool. Bookmarks can also have arbitrary -names, much like snapshots. -.Pp -Unlike snapshots, bookmarks can not be accessed through the filesystem in any -way. From a storage standpoint a bookmark just provides a way to reference -when a snapshot was created as a distinct object. Bookmarks are initially -tied to a snapshot, not the filesystem or volume, and they will survive if the -snapshot itself is destroyed. Since they are very light weight there's little -incentive to destroy them. -.Ss Clones -A clone is a writable volume or file system whose initial contents are the same -as another dataset. -As with snapshots, creating a clone is nearly instantaneous, and initially -consumes no additional space. -.Pp -Clones can only be created from a snapshot. -When a snapshot is cloned, it creates an implicit dependency between the parent -and child. -Even though the clone is created somewhere else in the dataset hierarchy, the -original snapshot cannot be destroyed as long as a clone exists. -The -.Sy origin -property exposes this dependency, and the -.Cm destroy -command lists any such dependencies, if they exist. -.Pp -The clone parent-child dependency relationship can be reversed by using the -.Cm promote -subcommand. 
-This causes the -.Qq origin -file system to become a clone of the specified file system, which makes it -possible to destroy the file system that the clone was created from. -.Ss "Mount Points" -Creating a ZFS file system is a simple operation, so the number of file systems -per system is likely to be numerous. -To cope with this, ZFS automatically manages mounting and unmounting file -systems without the need to edit the -.Pa /etc/fstab -file. -All automatically managed file systems are mounted by ZFS at boot time. -.Pp -By default, file systems are mounted under -.Pa /path , -where -.Ar path -is the name of the file system in the ZFS namespace. -Directories are created and destroyed as needed. -.Pp -A file system can also have a mount point set in the -.Sy mountpoint -property. -This directory is created as needed, and ZFS automatically mounts the file -system when the -.Nm zfs Cm mount Fl a -command is invoked -.Po without editing -.Pa /etc/fstab -.Pc . -The -.Sy mountpoint -property can be inherited, so if -.Em pool/home -has a mount point of -.Pa /export/stuff , -then -.Em pool/home/user -automatically inherits a mount point of -.Pa /export/stuff/user . -.Pp -A file system -.Sy mountpoint -property of -.Sy none -prevents the file system from being mounted. -.Pp -If needed, ZFS file systems can also be managed with traditional tools -.Po -.Nm mount , -.Nm umount , -.Pa /etc/fstab -.Pc . -If a file system's mount point is set to -.Sy legacy , -ZFS makes no attempt to manage the file system, and the administrator is -responsible for mounting and unmounting the file system. Because pools must -be imported before a legacy mount can succeed, administrators should ensure -that legacy mounts are only attempted after the zpool import process -finishes at boot time. For example, on machines using systemd, the mount -option -.Pp -.Nm x-systemd.requires=zfs-import.target -.Pp -will ensure that the zfs-import completes before systemd attempts mounting -the filesystem. 
See systemd.mount(5) for details. -.Ss Deduplication -Deduplication is the process for removing redundant data at the block level, -reducing the total amount of data stored. If a file system has the -.Sy dedup -property enabled, duplicate data blocks are removed synchronously. The result -is that only unique data is stored and common components are shared among files. -.Pp -Deduplicating data is a very resource-intensive operation. It is generally -recommended that you have at least 1.25 GiB of RAM per 1 TiB of storage when -you enable deduplication. Calculating the exact requirement depends heavily -on the type of data stored in the pool. -.Pp -Enabling deduplication on an improperly-designed system can result in -performance issues (slow IO and administrative operations). It can potentially -lead to problems importing a pool due to memory exhaustion. Deduplication -can consume significant processing power (CPU) and memory as well as generate -additional disk IO. -.Pp -Before creating a pool with deduplication enabled, ensure that you have planned -your hardware requirements appropriately and implemented appropriate recovery -practices, such as regular backups. As an alternative to deduplication -consider using -.Sy compression=on , -as a less resource-intensive alternative. -.Ss Native Properties -Properties are divided into two types, native properties and user-defined -.Po or -.Qq user -.Pc +.Xr zfsconcepts 7 +for details. +. +.Ss Properties +Properties are divided into two types: native properties and user-defined +.Pq or Qq user properties. Native properties either export internal statistics or control ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on ZFS behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. -For more information about user properties, see the -.Sx User Properties -section, below. 
-.Pp -Every dataset has a set of properties that export statistics about the dataset -as well as control various behaviors. -Properties are inherited from the parent unless overridden by the child. -Some properties apply only to certain types of datasets -.Pq file systems, volumes, or snapshots . -.Pp -The values of numeric properties can be specified using human-readable suffixes -.Po for example, -.Sy k , -.Sy KB , -.Sy M , -.Sy Gb , -and so forth, up to -.Sy Z -for zettabyte -.Pc . -The following are all valid -.Pq and equal -specifications: -.Li 1536M, 1.5g, 1.50GB . -.Pp -The values of non-numeric properties are case sensitive and must be lowercase, -except for -.Sy mountpoint , -.Sy sharenfs , -and -.Sy sharesmb . -.Pp -The following native properties consist of read-only statistics about the -dataset. -These properties can be neither set, nor inherited. -Native properties apply to all dataset types unless otherwise noted. -.Bl -tag -width "usedbyrefreservation" -.It Sy available -The amount of space available to the dataset and all its children, assuming that -there is no other activity in the pool. -Because space is shared within a pool, availability can be limited by any number -of factors, including physical pool size, quotas, reservations, or other -datasets within the pool. -.Pp -This property can also be referred to by its shortened column name, -.Sy avail . -.It Sy compressratio -For non-snapshots, the compression ratio achieved for the -.Sy used -space of this dataset, expressed as a multiplier. -The -.Sy used -property includes descendant datasets, and, for clones, does not include the -space shared with the origin snapshot. -For snapshots, the -.Sy compressratio -is the same as the -.Sy refcompressratio -property. -Compression can be turned on by running: -.Nm zfs Cm set Sy compression Ns = Ns Sy on Ar dataset . -The default value is -.Sy off . -.It Sy createtxg -The transaction group (txg) in which the dataset was created. 
Bookmarks have -the same -.Sy createtxg -as the snapshot they are initially tied to. This property is suitable for -ordering a list of snapshots, e.g. for incremental send and receive. -.It Sy creation -The time this dataset was created. -.It Sy clones -For snapshots, this property is a comma-separated list of filesystems or volumes -which are clones of this snapshot. -The clones' -.Sy origin -property is this snapshot. -If the -.Sy clones -property is not empty, then this snapshot can not be destroyed -.Po even with the -.Fl r -or -.Fl f -options -.Pc . -The roles of origin and clone can be swapped by promoting the clone with the -.Nm zfs Cm promote -command. -.It Sy defer_destroy -This property is -.Sy on -if the snapshot has been marked for deferred destroy by using the -.Nm zfs Cm destroy Fl d -command. -Otherwise, the property is -.Sy off . -.It Sy encryptionroot -For encrypted datasets, indicates where the dataset is currently inheriting its -encryption key from. Loading or unloading a key for the -.Sy encryptionroot -will implicitly load / unload the key for any inheriting datasets (see -.Nm zfs Cm load-key -and -.Nm zfs Cm unload-key -for details). -Clones will always share an -encryption key with their origin. See the -.Sx Encryption -section for details. -.It Sy filesystem_count -The total number of filesystems and volumes that exist under this location in -the dataset tree. -This value is only available when a -.Sy filesystem_limit -has been set somewhere in the tree under which the dataset resides. -.It Sy keystatus -Indicates if an encryption key is currently loaded into ZFS. The possible -values are -.Sy none , -.Sy available , -and -.Sy unavailable . -See -.Nm zfs Cm load-key -and -.Nm zfs Cm unload-key . -.It Sy guid -The 64 bit GUID of this dataset or bookmark which does not change over its -entire lifetime. When a snapshot is sent to another pool, the received -snapshot has the same GUID. 
Thus, the -.Sy guid -is suitable to identify a snapshot across pools. -.It Sy logicalreferenced -The amount of space that is -.Qq logically -accessible by this dataset. -See the -.Sy referenced -property. -The logical space ignores the effect of the -.Sy compression -and -.Sy copies -properties, giving a quantity closer to the amount of data that applications -see. -However, it does include space consumed by metadata. -.Pp -This property can also be referred to by its shortened column name, -.Sy lrefer . -.It Sy logicalused -The amount of space that is -.Qq logically -consumed by this dataset and all its descendents. -See the -.Sy used -property. -The logical space ignores the effect of the -.Sy compression -and -.Sy copies -properties, giving a quantity closer to the amount of data that applications -see. -However, it does include space consumed by metadata. -.Pp -This property can also be referred to by its shortened column name, -.Sy lused . -.It Sy mounted -For file systems, indicates whether the file system is currently mounted. -This property can be either -.Sy yes -or -.Sy no . -.It Sy objsetid -A unique identifier for this dataset within the pool. Unlike the dataset's -.Sy guid -, the -.Sy objsetid -of a dataset is not transferred to other pools when the snapshot is copied -with a send/receive operation. -The -.Sy objsetid -can be reused (for a new datatset) after the dataset is deleted. -.It Sy origin -For cloned file systems or volumes, the snapshot from which the clone was -created. -See also the -.Sy clones -property. -.It Sy receive_resume_token -For filesystems or volumes which have saved partially-completed state from -.Sy zfs receive -s , -this opaque token can be provided to -.Sy zfs send -t -to resume and complete the -.Sy zfs receive . -.It Sy referenced -The amount of data that is accessible by this dataset, which may or may not be -shared with other datasets in the pool. 
-When a snapshot or clone is created, it initially references the same amount of -space as the file system or snapshot it was created from, since its contents are -identical. -.Pp -This property can also be referred to by its shortened column name, -.Sy refer . -.It Sy refcompressratio -The compression ratio achieved for the -.Sy referenced -space of this dataset, expressed as a multiplier. -See also the -.Sy compressratio -property. -.It Sy snapshot_count -The total number of snapshots that exist under this location in the dataset -tree. -This value is only available when a -.Sy snapshot_limit -has been set somewhere in the tree under which the dataset resides. -.It Sy type -The type of dataset: -.Sy filesystem , -.Sy volume , -or -.Sy snapshot . -.It Sy used -The amount of space consumed by this dataset and all its descendents. -This is the value that is checked against this dataset's quota and reservation. -The space used does not include this dataset's reservation, but does take into -account the reservations of any descendent datasets. -The amount of space that a dataset consumes from its parent, as well as the -amount of space that is freed if this dataset is recursively destroyed, is the -greater of its space used and its reservation. -.Pp -The used space of a snapshot -.Po see the -.Sx Snapshots -section -.Pc -is space that is referenced exclusively by this snapshot. -If this snapshot is destroyed, the amount of -.Sy used -space will be freed. -Space that is shared by multiple snapshots isn't accounted for in this metric. -When a snapshot is destroyed, space that was previously shared with this -snapshot can become unique to snapshots adjacent to it, thus changing the used -space of those snapshots. -The used space of the latest snapshot can also be affected by changes in the -file system. -Note that the -.Sy used -space of a snapshot is a subset of the -.Sy written -space of the snapshot. 
-.Pp -The amount of space used, available, or referenced does not take into account -pending changes. -Pending changes are generally accounted for within a few seconds. -Committing a change to a disk using -.Xr fsync 2 -or -.Dv O_SYNC -does not necessarily guarantee that the space usage information is updated -immediately. -.It Sy usedby* -The -.Sy usedby* -properties decompose the -.Sy used -properties into the various reasons that space is used. -Specifically, -.Sy used No = -.Sy usedbychildren No + -.Sy usedbydataset No + -.Sy usedbyrefreservation No + -.Sy usedbysnapshots . -These properties are only available for datasets created on -.Nm zpool -.Qo version 13 Qc -pools. -.It Sy usedbychildren -The amount of space used by children of this dataset, which would be freed if -all the dataset's children were destroyed. -.It Sy usedbydataset -The amount of space used by this dataset itself, which would be freed if the -dataset were destroyed -.Po after first removing any -.Sy refreservation -and destroying any necessary snapshots or descendents -.Pc . -.It Sy usedbyrefreservation -The amount of space used by a -.Sy refreservation -set on this dataset, which would be freed if the -.Sy refreservation -was removed. -.It Sy usedbysnapshots -The amount of space consumed by snapshots of this dataset. -In particular, it is the amount of space that would be freed if all of this -dataset's snapshots were destroyed. -Note that this is not simply the sum of the snapshots' -.Sy used -properties because space can be shared by multiple snapshots. -.It Sy userused Ns @ Ns Em user -The amount of space consumed by the specified user in this dataset. -Space is charged to the owner of each file, as displayed by -.Nm ls Fl l . -The amount of space charged is displayed by -.Nm du -and -.Nm ls Fl s . -See the -.Nm zfs Cm userspace -subcommand for more information. -.Pp -Unprivileged users can access only their own space usage. 
-The root user, or a user who has been granted the -.Sy userused -privilege with -.Nm zfs Cm allow , -can access everyone's usage. -.Pp -The -.Sy userused Ns @ Ns Em ... -properties are not displayed by -.Nm zfs Cm get Sy all . -The user's name must be appended after the @ symbol, using one of the following -forms: -.Bl -bullet -width "" -.It -.Em POSIX name -.Po for example, -.Sy joe -.Pc -.It -.Em POSIX numeric ID -.Po for example, -.Sy 789 -.Pc -.It -.Em SID name -.Po for example, -.Sy joe.smith@mydomain -.Pc -.It -.Em SID numeric ID -.Po for example, -.Sy S-1-123-456-789 -.Pc -.El -.Pp -Files created on Linux always have POSIX owners. -.It Sy userobjused Ns @ Ns Em user -The -.Sy userobjused -property is similar to -.Sy userused -but instead it counts the number of objects consumed by a user. This property -counts all objects allocated on behalf of the user, it may differ from the -results of system tools such as -.Nm df Fl i . -.Pp -When the property -.Sy xattr=on -is set on a file system additional objects will be created per-file to store -extended attributes. These additional objects are reflected in the -.Sy userobjused -value and are counted against the user's -.Sy userobjquota . -When a file system is configured to use -.Sy xattr=sa -no additional internal objects are normally required. -.It Sy userrefs -This property is set to the number of user holds on this snapshot. -User holds are set by using the -.Nm zfs Cm hold -command. -.It Sy groupused Ns @ Ns Em group -The amount of space consumed by the specified group in this dataset. -Space is charged to the group of each file, as displayed by -.Nm ls Fl l . -See the -.Sy userused Ns @ Ns Em user -property for more information. -.Pp -Unprivileged users can only access their own groups' space usage. -The root user, or a user who has been granted the -.Sy groupused -privilege with -.Nm zfs Cm allow , -can access all groups' usage. 
-.It Sy groupobjused Ns @ Ns Em group -The number of objects consumed by the specified group in this dataset. -Multiple objects may be charged to the group for each file when extended -attributes are in use. See the -.Sy userobjused Ns @ Ns Em user -property for more information. -.Pp -Unprivileged users can only access their own groups' space usage. -The root user, or a user who has been granted the -.Sy groupobjused -privilege with -.Nm zfs Cm allow , -can access all groups' usage. -.It Sy projectused Ns @ Ns Em project -The amount of space consumed by the specified project in this dataset. Project -is identified via the project identifier (ID) that is object-based numeral -attribute. An object can inherit the project ID from its parent object (if the -parent has the flag of inherit project ID that can be set and changed via -.Nm chattr Fl /+P -or -.Nm zfs project Fl s ) -when being created. The privileged user can set and change object's project -ID via -.Nm chattr Fl p -or -.Nm zfs project Fl s -anytime. Space is charged to the project of each file, as displayed by -.Nm lsattr Fl p -or -.Nm zfs project . -See the -.Sy userused Ns @ Ns Em user -property for more information. -.Pp -The root user, or a user who has been granted the -.Sy projectused -privilege with -.Nm zfs allow , -can access all projects' usage. -.It Sy projectobjused Ns @ Ns Em project -The -.Sy projectobjused -is similar to -.Sy projectused -but instead it counts the number of objects consumed by project. When the -property -.Sy xattr=on -is set on a fileset, ZFS will create additional objects per-file to store -extended attributes. These additional objects are reflected in the -.Sy projectobjused -value and are counted against the project's -.Sy projectobjquota . -When a filesystem is configured to use -.Sy xattr=sa -no additional internal objects are required. See the -.Sy userobjused Ns @ Ns Em user -property for more information. 
-.Pp -The root user, or a user who has been granted the -.Sy projectobjused -privilege with -.Nm zfs allow , -can access all projects' objects usage. -.It Sy volblocksize -For volumes, specifies the block size of the volume. -The -.Sy blocksize -cannot be changed once the volume has been written, so it should be set at -volume creation time. -The default -.Sy blocksize -for volumes is 8 Kbytes. -Any power of 2 from 512 bytes to 128 Kbytes is valid. -.Pp -This property can also be referred to by its shortened column name, -.Sy volblock . -.It Sy written -The amount of space -.Sy referenced -by this dataset, that was written since the previous snapshot -.Pq i.e. that is not referenced by the previous snapshot . -.It Sy written Ns @ Ns Em snapshot -The amount of -.Sy referenced -space written to this dataset since the specified snapshot. -This is the space that is referenced by this dataset but was not referenced by -the specified snapshot. -.Pp -The -.Em snapshot -may be specified as a short snapshot name -.Po just the part after the -.Sy @ -.Pc , -in which case it will be interpreted as a snapshot in the same filesystem as -this dataset. -The -.Em snapshot -may be a full snapshot name -.Po Em filesystem Ns @ Ns Em snapshot Pc , -which for clones may be a snapshot in the origin's filesystem -.Pq or the origin of the origin's filesystem, etc. -.El -.Pp -The following native properties can be used to change the behavior of a ZFS -dataset. -.Bl -tag -width "" -.It Xo -.Sy aclinherit Ns = Ns Sy discard Ns | Ns Sy noallow Ns | Ns -.Sy restricted Ns | Ns Sy passthrough Ns | Ns Sy passthrough-x -.Xc -Controls how ACEs are inherited when files and directories are created. -.Bl -tag -width "passthrough-x" -.It Sy discard -does not inherit any ACEs. -.It Sy noallow -only inherits inheritable ACEs that specify -.Qq deny -permissions. -.It Sy restricted -default, removes the -.Sy write_acl -and -.Sy write_owner -permissions when the ACE is inherited. 
-.It Sy passthrough -inherits all inheritable ACEs without any modifications. -.It Sy passthrough-x -same meaning as -.Sy passthrough , -except that the -.Sy owner@ , -.Sy group@ , -and -.Sy everyone@ -ACEs inherit the execute permission only if the file creation mode also requests -the execute bit. -.El -.Pp -When the property value is set to -.Sy passthrough , -files are created with a mode determined by the inheritable ACEs. -If no inheritable ACEs exist that affect the mode, then the mode is set in -accordance to the requested mode from the application. -.Pp -The -.Sy aclinherit -property does not apply to POSIX ACLs. -.It Sy acltype Ns = Ns Sy off Ns | Ns Sy noacl Ns | Ns Sy posixacl -Controls whether ACLs are enabled and if so what type of ACL to use. -.Bl -tag -width "posixacl" -.It Sy off -default, when a file system has the -.Sy acltype -property set to off then ACLs are disabled. -.It Sy noacl -an alias for -.Sy off -.It Sy posixacl -indicates POSIX ACLs should be used. POSIX ACLs are specific to Linux and are -not functional on other platforms. POSIX ACLs are stored as an extended -attribute and therefore will not overwrite any existing NFSv4 ACLs which -may be set. -.El -.Pp -To obtain the best performance when setting -.Sy posixacl -users are strongly encouraged to set the -.Sy xattr=sa -property. This will result in the POSIX ACL being stored more efficiently on -disk. But as a consequence, all new extended attributes will only be -accessible from OpenZFS implementations which support the -.Sy xattr=sa -property. See the -.Sy xattr -property for more details. -.It Sy atime Ns = Ns Sy on Ns | Ns Sy off -Controls whether the access time for files is updated when they are read. -Turning this property off avoids producing write traffic when reading files and -can result in significant performance gains, though it might confuse mailers -and other similar utilities. 
The values -.Sy on -and -.Sy off -are equivalent to the -.Sy atime -and -.Sy noatime -mount options. The default value is -.Sy on . -See also -.Sy relatime -below. -.It Sy canmount Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy noauto -If this property is set to -.Sy off , -the file system cannot be mounted, and is ignored by -.Nm zfs Cm mount Fl a . -Setting this property to -.Sy off -is similar to setting the -.Sy mountpoint -property to -.Sy none , -except that the dataset still has a normal -.Sy mountpoint -property, which can be inherited. -Setting this property to -.Sy off -allows datasets to be used solely as a mechanism to inherit properties. -One example of setting -.Sy canmount Ns = Ns Sy off -is to have two datasets with the same -.Sy mountpoint , -so that the children of both datasets appear in the same directory, but might -have different inherited characteristics. -.Pp -When set to -.Sy noauto , -a dataset can only be mounted and unmounted explicitly. -The dataset is not mounted automatically when the dataset is created or -imported, nor is it mounted by the -.Nm zfs Cm mount Fl a -command or unmounted by the -.Nm zfs Cm unmount Fl a -command. -.Pp -This property is not inherited. -.It Xo -.Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns -.Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns -.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr -.Xc -Controls the checksum used to verify data integrity. -The default value is -.Sy on , -which automatically selects an appropriate algorithm -.Po currently, -.Sy fletcher4 , -but this may change in future releases -.Pc . -The value -.Sy off -disables integrity checking on user data. -The value -.Sy noparity -not only disables integrity but also disables maintaining parity for user data. -This setting is used internally by a dump device residing on a RAID-Z pool and -should not be used by any other dataset. -Disabling checksums is -.Sy NOT -a recommended practice. 
-.Pp -The -.Sy sha512 , -.Sy skein , -and -.Sy edonr -checksum algorithms require enabling the appropriate features on the pool. -These pool features are not supported by GRUB and must not be used on the -pool if GRUB needs to access the pool (e.g. for /boot). -.Pp -Please see -.Xr zpool-features 5 -for more information on these algorithms. -.Pp -Changing this property affects only newly-written data. -.It Xo -.Sy compression Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy gzip Ns | Ns -.Sy gzip- Ns Em N Ns | Ns Sy lz4 Ns | Ns Sy lzjb Ns | Ns Sy zle -.Xc -Controls the compression algorithm used for this dataset. -.Pp -Setting compression to -.Sy on -indicates that the current default compression algorithm should be used. -The default balances compression and decompression speed, with compression ratio -and is expected to work well on a wide variety of workloads. -Unlike all other settings for this property, -.Sy on -does not select a fixed compression type. -As new compression algorithms are added to ZFS and enabled on a pool, the -default compression algorithm may change. -The current default compression algorithm is either -.Sy lzjb -or, if the -.Sy lz4_compress -feature is enabled, -.Sy lz4 . -.Pp -The -.Sy lz4 -compression algorithm is a high-performance replacement for the -.Sy lzjb -algorithm. -It features significantly faster compression and decompression, as well as a -moderately higher compression ratio than -.Sy lzjb , -but can only be used on pools with the -.Sy lz4_compress -feature set to -.Sy enabled . -See -.Xr zpool-features 5 -for details on ZFS feature flags and the -.Sy lz4_compress -feature. -.Pp -The -.Sy lzjb -compression algorithm is optimized for performance while providing decent data -compression. -.Pp -The -.Sy gzip -compression algorithm uses the same compression as the -.Xr gzip 1 -command. 
-You can specify the -.Sy gzip -level by using the value -.Sy gzip- Ns Em N , -where -.Em N -is an integer from 1 -.Pq fastest -to 9 -.Pq best compression ratio . -Currently, -.Sy gzip -is equivalent to -.Sy gzip-6 -.Po which is also the default for -.Xr gzip 1 -.Pc . -.Pp -The -.Sy zle -compression algorithm compresses runs of zeros. -.Pp -This property can also be referred to by its shortened column name -.Sy compress . -Changing this property affects only newly-written data. -.Pp -When any setting except -.Sy off -is selected, compression will explicitly check for blocks consisting of only -zeroes (the NUL byte). When a zero-filled block is detected, it is stored as -a hole and not compressed using the indicated compression algorithm. -.Pp -Any block being compressed must be no larger than 7/8 of its original size -after compression, otherwise the compression will not be considered worthwhile -and the block saved uncompressed. Note that when the logical block is less than -8 times the disk sector size this effectively reduces the necessary compression -ratio; for example 8k blocks on disks with 4k disk sectors must compress to 1/2 -or less of their original size. -.It Xo -.Sy context Ns = Ns Sy none Ns | Ns -.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level -.Xc -This flag sets the SELinux context for all files in the file system under -a mount point for that file system. See -.Xr selinux 8 -for more information. -.It Xo -.Sy fscontext Ns = Ns Sy none Ns | Ns -.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level -.Xc -This flag sets the SELinux context for the file system file system being -mounted. See -.Xr selinux 8 -for more information. -.It Xo -.Sy defcontext Ns = Ns Sy none Ns | Ns -.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level -.Xc -This flag sets the SELinux default context for unlabeled files. See -.Xr selinux 8 -for more information. 
-.It Xo -.Sy rootcontext Ns = Ns Sy none Ns | Ns -.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level -.Xc -This flag sets the SELinux context for the root inode of the file system. See -.Xr selinux 8 -for more information. -.It Sy copies Ns = Ns Sy 1 Ns | Ns Sy 2 Ns | Ns Sy 3 -Controls the number of copies of data stored for this dataset. -These copies are in addition to any redundancy provided by the pool, for -example, mirroring or RAID-Z. -The copies are stored on different disks, if possible. -The space used by multiple copies is charged to the associated file and dataset, -changing the -.Sy used -property and counting against quotas and reservations. -.Pp -Changing this property only affects newly-written data. -Therefore, set this property at file system creation time by using the -.Fl o Sy copies Ns = Ns Ar N -option. -.Pp -Remember that ZFS will not import a pool with a missing top-level vdev. Do -.Sy NOT -create, for example a two-disk striped pool and set -.Sy copies=2 -on some datasets thinking you have setup redundancy for them. When a disk -fails you will not be able to import the pool and will have lost all of your -data. -.Pp -Encrypted datasets may not have -.Sy copies Ns = Ns Em 3 -since the implementation stores some encryption metadata where the third copy -would normally be. -.It Sy devices Ns = Ns Sy on Ns | Ns Sy off -Controls whether device nodes can be opened on this file system. -The default value is -.Sy on . -The values -.Sy on -and -.Sy off -are equivalent to the -.Sy dev -and -.Sy nodev -mount options. -.It Xo -.Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns -.Sy sha256[,verify] Ns | Ns Sy sha512[,verify] Ns | Ns Sy skein[,verify] Ns | Ns -.Sy edonr,verify -.Xc -Configures deduplication for a dataset. The default value is -.Sy off . -The default deduplication checksum is -.Sy sha256 -(this may change in the future). When -.Sy dedup -is enabled, the checksum defined here overrides the -.Sy checksum -property. 
Setting the value to -.Sy verify -has the same effect as the setting -.Sy sha256,verify. -.Pp -If set to -.Sy verify , -ZFS will do a byte-to-byte comparsion in case of two blocks having the same -signature to make sure the block contents are identical. Specifying -.Sy verify -is mandatory for the -.Sy edonr -algorithm. -.Pp -Unless necessary, deduplication should NOT be enabled on a system. See -.Sx Deduplication -above. -.It Xo -.Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns -.Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k -.Xc -Specifies a compatibility mode or literal value for the size of dnodes in the -file system. The default value is -.Sy legacy . -Setting this property to a value other than -.Sy legacy -requires the large_dnode pool feature to be enabled. -.Pp -Consider setting -.Sy dnodesize -to -.Sy auto -if the dataset uses the -.Sy xattr=sa -property setting and the workload makes heavy use of extended attributes. This -may be applicable to SELinux-enabled systems, Lustre servers, and Samba -servers, for example. Literal values are supported for cases where the optimal -size is known in advance and for performance testing. -.Pp -Leave -.Sy dnodesize -set to -.Sy legacy -if you need to receive a send stream of this dataset on a pool that doesn't -enable the large_dnode feature, or if you need to import this pool on a system -that doesn't support the large_dnode feature. -.Pp -This property can also be referred to by its shortened column name, -.Sy dnsize . -.It Xo -.Sy encryption Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy aes-128-ccm Ns | Ns -.Sy aes-192-ccm Ns | Ns Sy aes-256-ccm Ns | Ns Sy aes-128-gcm Ns | Ns -.Sy aes-192-gcm Ns | Ns Sy aes-256-gcm -.Xc -Controls the encryption cipher suite (block cipher, key length, and mode) used -for this dataset. Requires the -.Sy encryption -feature to be enabled on the pool. -Requires a -.Sy keyformat -to be set at dataset creation time. 
-.Pp -Selecting -.Sy encryption Ns = Ns Sy on -when creating a dataset indicates that the default encryption suite will be -selected, which is currently -.Sy aes-256-ccm . -In order to provide consistent data protection, encryption must be specified at -dataset creation time and it cannot be changed afterwards. -.Pp -For more details and caveats about encryption see the -.Sy Encryption -section. -.It Sy keyformat Ns = Ns Sy raw Ns | Ns Sy hex Ns | Ns Sy passphrase -Controls what format the user's encryption key will be provided as. This -property is only set when the dataset is encrypted. -.Pp -Raw keys and hex keys must be 32 bytes long (regardless of the chosen -encryption suite) and must be randomly generated. A raw key can be generated -with the following command: -.Bd -literal -# dd if=/dev/urandom of=/path/to/output/key bs=32 count=1 -.Ed -.Pp -Passphrases must be between 8 and 512 bytes long and will be processed through -PBKDF2 before being used (see the -.Sy pbkdf2iters -property). Even though the -encryption suite cannot be changed after dataset creation, the keyformat can be -with -.Nm zfs Cm change-key . -.It Xo -.Sy keylocation Ns = Ns Sy prompt Ns | Ns Sy file:// Ns Em -.Xc -Controls where the user's encryption key will be loaded from by default for -commands such as -.Nm zfs Cm load-key -and -.Nm zfs Cm mount Cm -l . -This property is only set for encrypted datasets which are encryption roots. If -unspecified, the default is -.Sy prompt. -.Pp -Even though the encryption suite cannot be changed after dataset creation, the -keylocation can be with either -.Nm zfs Cm set -or -.Nm zfs Cm change-key . -If -.Sy prompt -is selected ZFS will ask for the key at the command prompt when it is required -to access the encrypted data (see -.Nm zfs Cm load-key -for details). This setting will also allow the key to be passed in via STDIN, -but users should be careful not to place keys which should be kept secret on -the command line. 
If a file URI is selected, the key will be loaded from the -specified absolute file path. -.It Sy pbkdf2iters Ns = Ns Ar iterations -Controls the number of PBKDF2 iterations that a -.Sy passphrase -encryption key should be run through when processing it into an encryption key. -This property is only defined when encryption is enabled and a keyformat of -.Sy passphrase -is selected. The goal of PBKDF2 is to significantly increase the -computational difficulty needed to brute force a user's passphrase. This is -accomplished by forcing the attacker to run each passphrase through a -computationally expensive hashing function many times before they arrive at the -resulting key. A user who actually knows the passphrase will only have to pay -this cost once. As CPUs become better at processing, this number should be -raised to ensure that a brute force attack is still not possible. The current -default is -.Sy 350000 -and the minimum is -.Sy 100000 . -This property may be changed with -.Nm zfs Cm change-key . -.It Sy exec Ns = Ns Sy on Ns | Ns Sy off -Controls whether processes can be executed from within this file system. -The default value is -.Sy on . -The values -.Sy on -and -.Sy off -are equivalent to the -.Sy exec -and -.Sy noexec -mount options. -.It Sy filesystem_limit Ns = Ns Em count Ns | Ns Sy none -Limits the number of filesystems and volumes that can exist under this point in -the dataset tree. -The limit is not enforced if the user is allowed to change the limit. -Setting a -.Sy filesystem_limit -to -.Sy on -a descendent of a filesystem that already has a -.Sy filesystem_limit -does not override the ancestor's -.Sy filesystem_limit , -but rather imposes an additional limit. -This feature must be enabled to be used -.Po see -.Xr zpool-features 5 -.Pc . -.It Sy special_small_blocks Ns = Ns Em size -This value represents the threshold block size for including small file -blocks into the special allocation class. 
Blocks smaller than or equal to this -value will be assigned to the special allocation class while greater blocks -will be assigned to the regular class. Valid values are zero or a power of two -from 512B up to 128K. The default size is 0 which means no small file blocks -will be allocated in the special class. -.Pp -Before setting this property, a special class vdev must be added to the -pool. See -.Xr zpool 8 -for more details on the special allocation class. -.It Sy mountpoint Ns = Ns Pa path Ns | Ns Sy none Ns | Ns Sy legacy -Controls the mount point used for this file system. -See the -.Sx Mount Points -section for more information on how this property is used. -.Pp -When the -.Sy mountpoint -property is changed for a file system, the file system and any children that -inherit the mount point are unmounted. -If the new value is -.Sy legacy , -then they remain unmounted. -Otherwise, they are automatically remounted in the new location if the property -was previously -.Sy legacy -or -.Sy none , -or if they were mounted before the property was changed. -In addition, any shared file systems are unshared and shared in the new -location. -.It Sy nbmand Ns = Ns Sy on Ns | Ns Sy off -Controls whether the file system should be mounted with -.Sy nbmand -.Pq Non Blocking mandatory locks . -This is used for SMB clients. -Changes to this property only take effect when the file system is umounted and -remounted. -See -.Xr mount 8 -for more information on -.Sy nbmand -mounts. This property is not used on Linux. -.It Sy overlay Ns = Ns Sy off Ns | Ns Sy on -Allow mounting on a busy directory or a directory which already contains -files or directories. This is the default mount behavior for Linux file systems. -For consistency with OpenZFS on other platforms overlay mounts are -.Sy off -by default. Set to -.Sy on -to enable overlay mounts. -.It Sy primarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata -Controls what is cached in the primary cache -.Pq ARC . 
-If this property is set to -.Sy all , -then both user data and metadata is cached. -If this property is set to -.Sy none , -then neither user data nor metadata is cached. -If this property is set to -.Sy metadata , -then only metadata is cached. -The default value is -.Sy all . -.It Sy quota Ns = Ns Em size Ns | Ns Sy none -Limits the amount of space a dataset and its descendents can consume. -This property enforces a hard limit on the amount of space used. -This includes all space consumed by descendents, including file systems and -snapshots. -Setting a quota on a descendent of a dataset that already has a quota does not -override the ancestor's quota, but rather imposes an additional limit. -.Pp -Quotas cannot be set on volumes, as the -.Sy volsize -property acts as an implicit quota. -.It Sy snapshot_limit Ns = Ns Em count Ns | Ns Sy none -Limits the number of snapshots that can be created on a dataset and its -descendents. -Setting a -.Sy snapshot_limit -on a descendent of a dataset that already has a -.Sy snapshot_limit -does not override the ancestor's -.Sy snapshot_limit , -but rather imposes an additional limit. -The limit is not enforced if the user is allowed to change the limit. -For example, this means that recursive snapshots taken from the global zone are -counted against each delegated dataset within a zone. -This feature must be enabled to be used -.Po see -.Xr zpool-features 5 -.Pc . -.It Sy userquota@ Ns Em user Ns = Ns Em size Ns | Ns Sy none -Limits the amount of space consumed by the specified user. -User space consumption is identified by the -.Sy userspace@ Ns Em user -property. -.Pp -Enforcement of user quotas may be delayed by several seconds. -This delay means that a user might exceed their quota before the system notices -that they are over quota and begins to refuse additional writes with the -.Er EDQUOT -error message. -See the -.Nm zfs Cm userspace -subcommand for more information. 
-.Pp -Unprivileged users can only access their own groups' space usage. -The root user, or a user who has been granted the -.Sy userquota -privilege with -.Nm zfs Cm allow , -can get and set everyone's quota. -.Pp -This property is not available on volumes, on file systems before version 4, or -on pools before version 15. -The -.Sy userquota@ Ns Em ... -properties are not displayed by -.Nm zfs Cm get Sy all . -The user's name must be appended after the -.Sy @ -symbol, using one of the following forms: -.Bl -bullet -.It -.Em POSIX name -.Po for example, -.Sy joe -.Pc -.It -.Em POSIX numeric ID -.Po for example, -.Sy 789 -.Pc -.It -.Em SID name -.Po for example, -.Sy joe.smith@mydomain -.Pc -.It -.Em SID numeric ID -.Po for example, -.Sy S-1-123-456-789 -.Pc -.El -.Pp -Files created on Linux always have POSIX owners. -.It Sy userobjquota@ Ns Em user Ns = Ns Em size Ns | Ns Sy none -The -.Sy userobjquota -is similar to -.Sy userquota -but it limits the number of objects a user can create. Please refer to -.Sy userobjused -for more information about how objects are counted. -.It Sy groupquota@ Ns Em group Ns = Ns Em size Ns | Ns Sy none -Limits the amount of space consumed by the specified group. -Group space consumption is identified by the -.Sy groupused@ Ns Em group -property. -.Pp -Unprivileged users can access only their own groups' space usage. -The root user, or a user who has been granted the -.Sy groupquota -privilege with -.Nm zfs Cm allow , -can get and set all groups' quotas. -.It Sy groupobjquota@ Ns Em group Ns = Ns Em size Ns | Ns Sy none -The -.Sy groupobjquota -is similar to -.Sy groupquota -but it limits number of objects a group can consume. Please refer to -.Sy userobjused -for more information about how objects are counted. -.It Sy projectquota@ Ns Em project Ns = Ns Em size Ns | Ns Sy none -Limits the amount of space consumed by the specified project. Project -space consumption is identified by the -.Sy projectused@ Ns Em project -property. 
Please refer to -.Sy projectused -for more information about how project is identified and set/changed. -.Pp -The root user, or a user who has been granted the -.Sy projectquota -privilege with -.Nm zfs allow , -can access all projects' quota. -.It Sy projectobjquota@ Ns Em project Ns = Ns Em size Ns | Ns Sy none -The -.Sy projectobjquota -is similar to -.Sy projectquota -but it limits number of objects a project can consume. Please refer to -.Sy userobjused -for more information about how objects are counted. -.It Sy readonly Ns = Ns Sy on Ns | Ns Sy off -Controls whether this dataset can be modified. -The default value is -.Sy off . -The values -.Sy on -and -.Sy off -are equivalent to the -.Sy ro -and -.Sy rw -mount options. -.Pp -This property can also be referred to by its shortened column name, -.Sy rdonly . -.It Sy recordsize Ns = Ns Em size -Specifies a suggested block size for files in the file system. -This property is designed solely for use with database workloads that access -files in fixed-size records. -ZFS automatically tunes block sizes according to internal algorithms optimized -for typical access patterns. -.Pp -For databases that create very large files but access them in small random -chunks, these algorithms may be suboptimal. -Specifying a -.Sy recordsize -greater than or equal to the record size of the database can result in -significant performance gains. -Use of this property for general purpose file systems is strongly discouraged, -and may adversely affect performance. -.Pp -The size specified must be a power of two greater than or equal to 512 and less -than or equal to 128 Kbytes. -If the -.Sy large_blocks -feature is enabled on the pool, the size may be up to 1 Mbyte. -See -.Xr zpool-features 5 -for details on ZFS feature flags. -.Pp -Changing the file system's -.Sy recordsize -affects only files created afterward; existing files are unaffected. -.Pp -This property can also be referred to by its shortened column name, -.Sy recsize . 
-.It Sy redundant_metadata Ns = Ns Sy all Ns | Ns Sy most -Controls what types of metadata are stored redundantly. -ZFS stores an extra copy of metadata, so that if a single block is corrupted, -the amount of user data lost is limited. -This extra copy is in addition to any redundancy provided at the pool level -.Pq e.g. by mirroring or RAID-Z , -and is in addition to an extra copy specified by the -.Sy copies -property -.Pq up to a total of 3 copies . -For example if the pool is mirrored, -.Sy copies Ns = Ns 2 , -and -.Sy redundant_metadata Ns = Ns Sy most , -then ZFS stores 6 copies of most metadata, and 4 copies of data and some -metadata. -.Pp -When set to -.Sy all , -ZFS stores an extra copy of all metadata. -If a single on-disk block is corrupt, at worst a single block of user data -.Po which is -.Sy recordsize -bytes long -.Pc -can be lost. -.Pp -When set to -.Sy most , -ZFS stores an extra copy of most types of metadata. -This can improve performance of random writes, because less metadata must be -written. -In practice, at worst about 100 blocks -.Po of -.Sy recordsize -bytes each -.Pc -of user data can be lost if a single on-disk block is corrupt. -The exact behavior of which metadata blocks are stored redundantly may change in -future releases. -.Pp -The default value is -.Sy all . -.It Sy refquota Ns = Ns Em size Ns | Ns Sy none -Limits the amount of space a dataset can consume. -This property enforces a hard limit on the amount of space used. -This hard limit does not include space used by descendents, including file -systems and snapshots. -.It Sy refreservation Ns = Ns Em size Ns | Ns Sy none Ns | Ns Sy auto -The minimum amount of space guaranteed to a dataset, not including its -descendents. -When the amount of space used is below this value, the dataset is treated as if -it were taking up the amount of space specified by -.Sy refreservation . 
-The -.Sy refreservation -reservation is accounted for in the parent datasets' space used, and counts -against the parent datasets' quotas and reservations. -.Pp -If -.Sy refreservation -is set, a snapshot is only allowed if there is enough free pool space outside of -this reservation to accommodate the current number of -.Qq referenced -bytes in the dataset. -.Pp -If -.Sy refreservation -is set to -.Sy auto , -a volume is thick provisioned -.Po or -.Qq not sparse -.Pc . -.Sy refreservation Ns = Ns Sy auto -is only supported on volumes. -See -.Sy volsize -in the -.Sx Native Properties -section for more information about sparse volumes. -.Pp -This property can also be referred to by its shortened column name, -.Sy refreserv . -.It Sy relatime Ns = Ns Sy on Ns | Ns Sy off -Controls the manner in which the access time is updated when -.Sy atime=on -is set. Turning this property on causes the access time to be updated relative -to the modify or change time. Access time is only updated if the previous -access time was earlier than the current modify or change time or if the -existing access time hasn't been updated within the past 24 hours. The default -value is -.Sy off . -The values -.Sy on -and -.Sy off -are equivalent to the -.Sy relatime -and -.Sy norelatime -mount options. -.It Sy reservation Ns = Ns Em size Ns | Ns Sy none -The minimum amount of space guaranteed to a dataset and its descendants. -When the amount of space used is below this value, the dataset is treated as if -it were taking up the amount of space specified by its reservation. -Reservations are accounted for in the parent datasets' space used, and count -against the parent datasets' quotas and reservations. -.Pp -This property can also be referred to by its shortened column name, -.Sy reserv . -.It Sy secondarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata -Controls what is cached in the secondary cache -.Pq L2ARC . 
-If this property is set to -.Sy all , -then both user data and metadata is cached. -If this property is set to -.Sy none , -then neither user data nor metadata is cached. -If this property is set to -.Sy metadata , -then only metadata is cached. -The default value is -.Sy all . -.It Sy setuid Ns = Ns Sy on Ns | Ns Sy off -Controls whether the setuid bit is respected for the file system. -The default value is -.Sy on . -The values -.Sy on -and -.Sy off -are equivalent to the -.Sy suid -and -.Sy nosuid -mount options. -.It Sy sharesmb Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Em opts -Controls whether the file system is shared by using -.Sy Samba USERSHARES -and what options are to be used. Otherwise, the file system is automatically -shared and unshared with the -.Nm zfs Cm share -and -.Nm zfs Cm unshare -commands. If the property is set to on, the -.Xr net 8 -command is invoked to create a -.Sy USERSHARE . -.Pp -Because SMB shares requires a resource name, a unique resource name is -constructed from the dataset name. The constructed name is a copy of the -dataset name except that the characters in the dataset name, which would be -invalid in the resource name, are replaced with underscore (_) characters. -Linux does not currently support additional options which might be available -on Solaris. -.Pp -If the -.Sy sharesmb -property is set to -.Sy off , -the file systems are unshared. -.Pp -The share is created with the ACL (Access Control List) "Everyone:F" ("F" -stands for "full permissions", ie. read and write permissions) and no guest -access (which means Samba must be able to authenticate a real user, system -passwd/shadow, LDAP or smbpasswd based) by default. This means that any -additional access control (disallow specific user specific access etc) must -be done on the underlying file system. -.It Sy sharenfs Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Em opts -Controls whether the file system is shared via NFS, and what options are to be -used. 
-A file system with a -.Sy sharenfs -property of -.Sy off -is managed with the -.Xr exportfs 8 -command and entries in the -.Em /etc/exports -file. -Otherwise, the file system is automatically shared and unshared with the -.Nm zfs Cm share -and -.Nm zfs Cm unshare -commands. -If the property is set to -.Sy on , -the dataset is shared using the default options: -.Pp -.Em sec=sys,rw,crossmnt,no_subtree_check,no_root_squash -.Pp -See -.Xr exports 5 -for the meaning of the default options. Otherwise, the -.Xr exportfs 8 -command is invoked with options equivalent to the contents of this property. -.Pp -When the -.Sy sharenfs -property is changed for a dataset, the dataset and any children inheriting the -property are re-shared with the new options, only if the property was previously -.Sy off , -or if they were shared before the property was changed. -If the new property is -.Sy off , -the file systems are unshared. -.It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput -Provide a hint to ZFS about handling of synchronous requests in this dataset. -If -.Sy logbias -is set to -.Sy latency -.Pq the default , -ZFS will use pool log devices -.Pq if configured -to handle the requests at low latency. -If -.Sy logbias -is set to -.Sy throughput , -ZFS will not use configured pool log devices. -ZFS will instead optimize synchronous operations for global pool throughput and -efficient use of resources. -.It Sy snapdev Ns = Ns Sy hidden Ns | Ns Sy visible -Controls whether the volume snapshot devices under -.Em /dev/zvol/ -are hidden or visible. The default value is -.Sy hidden . -.It Sy snapdir Ns = Ns Sy hidden Ns | Ns Sy visible -Controls whether the -.Pa .zfs -directory is hidden or visible in the root of the file system as discussed in -the -.Sx Snapshots -section. -The default value is -.Sy hidden . -.It Sy sync Ns = Ns Sy standard Ns | Ns Sy always Ns | Ns Sy disabled -Controls the behavior of synchronous requests -.Pq e.g. fsync, O_DSYNC . 
-.Sy standard -is the -.Tn POSIX -specified behavior of ensuring all synchronous requests are written to stable -storage and all devices are flushed to ensure data is not cached by device -controllers -.Pq this is the default . -.Sy always -causes every file system transaction to be written and flushed before its -system call returns. -This has a large performance penalty. -.Sy disabled -disables synchronous requests. -File system transactions are only committed to stable storage periodically. -This option will give the highest performance. -However, it is very dangerous as ZFS would be ignoring the synchronous -transaction demands of applications such as databases or NFS. -Administrators should only use this option when the risks are understood. -.It Sy version Ns = Ns Em N Ns | Ns Sy current -The on-disk version of this file system, which is independent of the pool -version. -This property can only be set to later supported versions. -See the -.Nm zfs Cm upgrade -command. -.It Sy volsize Ns = Ns Em size -For volumes, specifies the logical size of the volume. -By default, creating a volume establishes a reservation of equal size. -For storage pools with a version number of 9 or higher, a -.Sy refreservation -is set instead. -Any changes to -.Sy volsize -are reflected in an equivalent change to the reservation -.Po or -.Sy refreservation -.Pc . -The -.Sy volsize -can only be set to a multiple of -.Sy volblocksize , -and cannot be zero. -.Pp -The reservation is kept equal to the volume's logical size to prevent unexpected -behavior for consumers. -Without the reservation, the volume could run out of space, resulting in -undefined behavior or data corruption, depending on how the volume is used. -These effects can also occur when the volume size is changed while it is in use -.Pq particularly when shrinking the size . -Extreme care should be used when adjusting the volume size. 
-.Pp -Though not recommended, a -.Qq sparse volume -.Po also known as -.Qq thin provisioned -.Pc -can be created by specifying the -.Fl s -option to the -.Nm zfs Cm create Fl V -command, or by changing the value of the -.Sy refreservation -property -.Po or -.Sy reservation -property on pool version 8 or earlier -.Pc -after the volume has been created. -A -.Qq sparse volume -is a volume where the value of -.Sy refreservation -is less than the size of the volume plus the space required to store its -metadata. -Consequently, writes to a sparse volume can fail with -.Er ENOSPC -when the pool is low on space. -For a sparse volume, changes to -.Sy volsize -are not reflected in the -.Sy refreservation. -A volume that is not sparse is said to be -.Qq thick provisioned . -A sparse volume can become thick provisioned by setting -.Sy refreservation -to -.Sy auto . -.It Sy volmode Ns = Ns Cm default | full | geom | dev | none -This property specifies how volumes should be exposed to the OS. -Setting it to -.Sy full -exposes volumes as fully fledged block devices, providing maximal -functionality. The value -.Sy geom -is just an alias for -.Sy full -and is kept for compatibility. -Setting it to -.Sy dev -hides its partitions. -Volumes with property set to -.Sy none -are not exposed outside ZFS, but can be snapshoted, cloned, replicated, etc, -that can be suitable for backup purposes. -Value -.Sy default -means that volumes exposition is controlled by system-wide tunable -.Va zvol_volmode , -where -.Sy full , -.Sy dev -and -.Sy none -are encoded as 1, 2 and 3 respectively. -The default values is -.Sy full . -.It Sy vscan Ns = Ns Sy on Ns | Ns Sy off -Controls whether regular files should be scanned for viruses when a file is -opened and closed. -In addition to enabling this property, the virus scan service must also be -enabled for virus scanning to occur. -The default value is -.Sy off . -This property is not used on Linux. 
-.It Sy xattr Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy sa -Controls whether extended attributes are enabled for this file system. Two -styles of extended attributes are supported either directory based or system -attribute based. -.Pp -The default value of -.Sy on -enables directory based extended attributes. This style of extended attribute -imposes no practical limit on either the size or number of attributes which -can be set on a file. Although under Linux the -.Xr getxattr 2 -and -.Xr setxattr 2 -system calls limit the maximum size to 64K. This is the most compatible -style of extended attribute and is supported by all OpenZFS implementations. -.Pp -System attribute based xattrs can be enabled by setting the value to -.Sy sa . -The key advantage of this type of xattr is improved performance. Storing -extended attributes as system attributes significantly decreases the amount of -disk IO required. Up to 64K of data may be stored per-file in the space -reserved for system attributes. If there is not enough space available for -an extended attribute then it will be automatically written as a directory -based xattr. System attribute based extended attributes are not accessible -on platforms which do not support the -.Sy xattr=sa -feature. -.Pp -The use of system attribute based xattrs is strongly encouraged for users of -SELinux or POSIX ACLs. Both of these features heavily rely of extended -attributes and benefit significantly from the reduced access time. -.Pp -The values -.Sy on -and -.Sy off -are equivalent to the -.Sy xattr -and -.Sy noxattr -mount options. -.It Sy zoned Ns = Ns Sy on Ns | Ns Sy off -Controls whether the dataset is managed from a non-global zone. Zones are a -Solaris feature and are not relevant on Linux. The default value is -.Sy off . -.El -.Pp -The following three properties cannot be changed after the file system is -created, and therefore, should be set when the file system is created. 
-If the properties are not set with the -.Nm zfs Cm create -or -.Nm zpool Cm create -commands, these properties are inherited from the parent dataset. -If the parent dataset lacks these properties due to having been created prior to -these features being supported, the new file system will have the default values -for these properties. -.Bl -tag -width "" -.It Xo -.Sy casesensitivity Ns = Ns Sy sensitive Ns | Ns -.Sy insensitive Ns | Ns Sy mixed -.Xc -Indicates whether the file name matching algorithm used by the file system -should be case-sensitive, case-insensitive, or allow a combination of both -styles of matching. -The default value for the -.Sy casesensitivity -property is -.Sy sensitive . -Traditionally, -.Ux -and -.Tn POSIX -file systems have case-sensitive file names. -.Pp -The -.Sy mixed -value for the -.Sy casesensitivity -property indicates that the file system can support requests for both -case-sensitive and case-insensitive matching behavior. -Currently, case-insensitive matching behavior on a file system that supports -mixed behavior is limited to the SMB server product. -For more information about the -.Sy mixed -value behavior, see the "ZFS Administration Guide". -.It Xo -.Sy normalization Ns = Ns Sy none Ns | Ns Sy formC Ns | Ns -.Sy formD Ns | Ns Sy formKC Ns | Ns Sy formKD -.Xc -Indicates whether the file system should perform a -.Sy unicode -normalization of file names whenever two file names are compared, and which -normalization algorithm should be used. -File names are always stored unmodified, names are normalized as part of any -comparison process. -If this property is set to a legal value other than -.Sy none , -and the -.Sy utf8only -property was left unspecified, the -.Sy utf8only -property is automatically set to -.Sy on . -The default value of the -.Sy normalization -property is -.Sy none . -This property cannot be changed after the file system is created. 
-.It Sy utf8only Ns = Ns Sy on Ns | Ns Sy off -Indicates whether the file system should reject file names that include -characters that are not present in the -.Sy UTF-8 -character code set. -If this property is explicitly set to -.Sy off , -the normalization property must either not be explicitly set or be set to -.Sy none . -The default value for the -.Sy utf8only -property is -.Sy off . -This property cannot be changed after the file system is created. -.El -.Pp -The -.Sy casesensitivity , -.Sy normalization , -and -.Sy utf8only -properties are also new permissions that can be assigned to non-privileged users -by using the ZFS delegated administration feature. -.Ss "Temporary Mount Point Properties" -When a file system is mounted, either through -.Xr mount 8 -for legacy mounts or the -.Nm zfs Cm mount -command for normal file systems, its mount options are set according to its -properties. -The correlation between properties and mount options is as follows: -.Bd -literal - PROPERTY MOUNT OPTION - atime atime/noatime - canmount auto/noauto - devices dev/nodev - exec exec/noexec - readonly ro/rw - relatime relatime/norelatime - setuid suid/nosuid - xattr xattr/noxattr -.Ed -.Pp -In addition, these options can be set on a per-mount basis using the -.Fl o -option, without affecting the property that is stored on disk. -The values specified on the command line override the values stored in the -dataset. -The -.Sy nosuid -option is an alias for -.Sy nodevices Ns \&, Ns Sy nosetuid . -These properties are reported as -.Qq temporary -by the -.Nm zfs Cm get -command. -If the properties are changed while the dataset is mounted, the new setting -overrides any temporary settings. -.Ss "User Properties" -In addition to the standard native properties, ZFS supports arbitrary user -properties. -User properties have no effect on ZFS behavior, but applications or -administrators can use them to annotate datasets -.Pq file systems, volumes, and snapshots . 
-.Pp -User property names must contain a colon -.Pq Qq Sy \&: -character to distinguish them from native properties. -They may contain lowercase letters, numbers, and the following punctuation -characters: colon -.Pq Qq Sy \&: , -dash -.Pq Qq Sy - , -period -.Pq Qq Sy \&. , -and underscore -.Pq Qq Sy _ . -The expected convention is that the property name is divided into two portions -such as -.Em module Ns \&: Ns Em property , -but this namespace is not enforced by ZFS. -User property names can be at most 256 characters, and cannot begin with a dash -.Pq Qq Sy - . -.Pp -When making programmatic use of user properties, it is strongly suggested to use -a reversed -.Sy DNS -domain name for the -.Em module -component of property names to reduce the chance that two -independently-developed packages use the same property name for different -purposes. -.Pp -The values of user properties are arbitrary strings, are always inherited, and -are never validated. -All of the commands that operate on properties -.Po Nm zfs Cm list , -.Nm zfs Cm get , -.Nm zfs Cm set , -and so forth -.Pc -can be used to manipulate both native properties and user properties. -Use the -.Nm zfs Cm inherit -command to clear a user property. -If the property is not defined in any parent dataset, it is removed entirely. -Property values are limited to 8192 bytes. -.Ss ZFS Volumes as Swap -ZFS volumes may be used as swap devices. After creating the volume with the -.Nm zfs Cm create Fl V -command set up and enable the swap area using the -.Xr mkswap 8 -and -.Xr swapon 8 -commands. Do not swap to a file on a ZFS file system. A ZFS swap file -configuration is not supported. +For more information about properties, see +.Xr zfsprops 7 . +. .Ss Encryption Enabling the .Sy encryption -feature allows for the creation of encrypted filesystems and volumes. ZFS -will encrypt file and zvol data, file attributes, ACLs, permission bits, +feature allows for the creation of encrypted filesystems and volumes. 
+ZFS will encrypt file and zvol data, file attributes, ACLs, permission bits, directory listings, FUID mappings, and -.Sy userused -/ -.Sy groupused -data. ZFS will not encrypt metadata related to the pool structure, including -dataset and snapshot names, dataset hierarchy, properties, file size, file -holes, and deduplication tables (though the deduplicated data itself is -encrypted). -.Pp -Key rotation is managed by ZFS. Changing the user's key (e.g. a passphrase) -does not require re-encrypting the entire dataset. Datasets can be scrubbed, -resilvered, renamed, and deleted without the encryption keys being loaded (see the -.Nm zfs Cm load-key -subcommand for more info on key loading). -.Pp -Creating an encrypted dataset requires specifying the -.Sy encryption -and -.Sy keyformat -properties at creation time, along with an optional -.Sy keylocation -and -.Sy pbkdf2iters . -After entering an encryption key, the -created dataset will become an encryption root. Any descendant datasets will -inherit their encryption key from the encryption root by default, meaning that -loading, unloading, or changing the key for the encryption root will implicitly -do the same for all inheriting datasets. If this inheritance is not desired, -simply supply a -.Sy keyformat -when creating the child dataset or use -.Nm zfs Cm change-key -to break an existing relationship, creating a new encryption root on the child. -Note that the child's -.Sy keyformat -may match that of the parent while still creating a new encryption root, and -that changing the -.Sy encryption -property alone does not create a new encryption root; this would simply use a -different cipher suite with the same key as its encryption root. The one -exception is that clones will always use their origin's encryption key. 
-As a result of this exception, some encryption-related properties (namely -.Sy keystatus , -.Sy keyformat , -.Sy keylocation , -and -.Sy pbkdf2iters ) -do not inherit like other ZFS properties and instead use the value determined -by their encryption root. Encryption root inheritance can be tracked via the -read-only -.Sy encryptionroot -property. -.Pp -Encryption changes the behavior of a few ZFS -operations. Encryption is applied after compression so compression ratios are -preserved. Normally checksums in ZFS are 256 bits long, but for encrypted data -the checksum is 128 bits of the user-chosen checksum and 128 bits of MAC from -the encryption suite, which provides additional protection against maliciously -altered data. Deduplication is still possible with encryption enabled but for -security, datasets will only dedup against themselves, their snapshots, and -their clones. -.Pp -There are a few limitations on encrypted datasets. Encrypted data cannot be -embedded via the -.Sy embedded_data -feature. Encrypted datasets may not have -.Sy copies Ns = Ns Em 3 -since the implementation stores some encryption metadata where the third copy -would normally be. Since compression is applied before encryption datasets may -be vulnerable to a CRIME-like attack if applications accessing the data allow -for it. Deduplication with encryption will leak information about which blocks -are equivalent in a dataset and will incur an extra CPU cost per block written. +.Sy userused Ns / Ns Sy groupused Ns / Ns Sy projectused +data. +For an overview of encryption, see +.Xr zfs-load-key 8 . +. .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. @@ -2462,2184 +129,8 @@ original form. Displays a help message. .It Xo .Nm -.Fl V, -version +.Fl V , -version .Xc -An alias for the -.Nm zfs Cm version -subcommand. -.It Xo -.Nm -.Cm create -.Op Fl p -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... 
-.Ar filesystem -.Xc -Creates a new ZFS file system. -The file system is automatically mounted according to the -.Sy mountpoint -property inherited from the parent. -.Bl -tag -width "-o" -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property as if the command -.Nm zfs Cm set Ar property Ns = Ns Ar value -was invoked at the same time the dataset was created. -Any editable ZFS property can also be set at creation time. -Multiple -.Fl o -options can be specified. -An error results if the same property is specified in multiple -.Fl o -options. -.It Fl p -Creates all the non-existing parent datasets. -Datasets created in this manner are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. -Any property specified on the command line using the -.Fl o -option is ignored. -If the target filesystem already exists, the operation completes successfully. -.El -.It Xo -.Nm -.Cm create -.Op Fl ps -.Op Fl b Ar blocksize -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Fl V Ar size Ar volume -.Xc -Creates a volume of the given size. -The volume is exported as a block device in -.Pa /dev/zvol/path , -where -.Em path -is the name of the volume in the ZFS namespace. -The size represents the logical size as exported by the device. -By default, a reservation of equal size is created. -.Pp -.Ar size -is automatically rounded up to the nearest 128 Kbytes to ensure that the volume -has an integral number of blocks regardless of -.Sy blocksize . -.Bl -tag -width "-b" -.It Fl b Ar blocksize -Equivalent to -.Fl o Sy volblocksize Ns = Ns Ar blocksize . -If this option is specified in conjunction with -.Fl o Sy volblocksize , -the resulting behavior is undefined. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property as if the -.Nm zfs Cm set Ar property Ns = Ns Ar value -command was invoked at the same time the dataset was created. -Any editable ZFS property can also be set at creation time. 
-Multiple -.Fl o -options can be specified. -An error results if the same property is specified in multiple -.Fl o -options. -.It Fl p -Creates all the non-existing parent datasets. -Datasets created in this manner are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. -Any property specified on the command line using the -.Fl o -option is ignored. -If the target filesystem already exists, the operation completes successfully. -.It Fl s -Creates a sparse volume with no reservation. -See -.Sy volsize -in the -.Sx Native Properties -section for more information about sparse volumes. -.El -.It Xo -.Nm -.Cm destroy -.Op Fl Rfnprv -.Ar filesystem Ns | Ns Ar volume -.Xc -Destroys the given dataset. -By default, the command unshares any file systems that are currently shared, -unmounts any file systems that are currently mounted, and refuses to destroy a -dataset that has active dependents -.Pq children or clones . -.Bl -tag -width "-R" -.It Fl R -Recursively destroy all dependents, including cloned file systems outside the -target hierarchy. -.It Fl f -Force an unmount of any file systems using the -.Nm unmount Fl f -command. -This option has no effect on non-file systems or unmounted file systems. -.It Fl n -Do a dry-run -.Pq Qq No-op -deletion. -No data will be deleted. -This is useful in conjunction with the -.Fl v -or -.Fl p -flags to determine what data would be deleted. -.It Fl p -Print machine-parsable verbose information about the deleted data. -.It Fl r -Recursively destroy all children. -.It Fl v -Print verbose information about the deleted data. -.El -.Pp -Extreme care should be taken when applying either the -.Fl r -or the -.Fl R -options, as they can destroy large portions of a pool and cause unexpected -behavior for mounted file systems in use. -.It Xo -.Nm -.Cm destroy -.Op Fl Rdnprv -.Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns -.Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns ... 
-.Xc -The given snapshots are destroyed immediately if and only if the -.Nm zfs Cm destroy -command without the -.Fl d -option would have destroyed it. -Such immediate destruction would occur, for example, if the snapshot had no -clones and the user-initiated reference count were zero. -.Pp -If a snapshot does not qualify for immediate destruction, it is marked for -deferred deletion. -In this state, it exists as a usable, visible snapshot until both of the -preconditions listed above are met, at which point it is destroyed. -.Pp -An inclusive range of snapshots may be specified by separating the first and -last snapshots with a percent sign. -The first and/or last snapshots may be left blank, in which case the -filesystem's oldest or newest snapshot will be implied. -.Pp -Multiple snapshots -.Pq or ranges of snapshots -of the same filesystem or volume may be specified in a comma-separated list of -snapshots. -Only the snapshot's short name -.Po the part after the -.Sy @ -.Pc -should be specified when using a range or comma-separated list to identify -multiple snapshots. -.Bl -tag -width "-R" -.It Fl R -Recursively destroy all clones of these snapshots, including the clones, -snapshots, and children. -If this flag is specified, the -.Fl d -flag will have no effect. -.It Fl d -Destroy immediately. If a snapshot cannot be destroyed now, mark it for -deferred destruction. -.It Fl n -Do a dry-run -.Pq Qq No-op -deletion. -No data will be deleted. -This is useful in conjunction with the -.Fl p -or -.Fl v -flags to determine what data would be deleted. -.It Fl p -Print machine-parsable verbose information about the deleted data. -.It Fl r -Destroy -.Pq or mark for deferred deletion -all snapshots with this name in descendent file systems. -.It Fl v -Print verbose information about the deleted data. 
-.Pp -Extreme care should be taken when applying either the -.Fl r -or the -.Fl R -options, as they can destroy large portions of a pool and cause unexpected -behavior for mounted file systems in use. -.El -.It Xo -.Nm -.Cm destroy -.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark -.Xc -The given bookmark is destroyed. -.It Xo -.Nm -.Cm snapshot -.Op Fl r -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem Ns @ Ns Ar snapname Ns | Ns Ar volume Ns @ Ns Ar snapname Ns ... -.Xc -Creates snapshots with the given names. -All previous modifications by successful system calls to the file system are -part of the snapshots. -Snapshots are taken atomically, so that all snapshots correspond to the same -moment in time. -.Nm zfs Cm snap -can be used as an alias for -.Nm zfs Cm snapshot . -See the -.Sx Snapshots -section for details. -.Bl -tag -width "-o" -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property; see -.Nm zfs Cm create -for details. -.It Fl r -Recursively create snapshots of all descendent datasets. -.El -.It Xo -.Nm -.Cm rollback -.Op Fl Rfr -.Ar snapshot -.Xc -Roll back the given dataset to a previous snapshot. -When a dataset is rolled back, all data that has changed since the snapshot is -discarded, and the dataset reverts to the state at the time of the snapshot. -By default, the command refuses to roll back to a snapshot other than the most -recent one. -In order to do so, all intermediate snapshots and bookmarks must be destroyed by -specifying the -.Fl r -option. -.Pp -The -.Fl rR -options do not recursively destroy the child snapshots of a recursive snapshot. -Only direct snapshots of the specified filesystem are destroyed by either of -these options. -To completely roll back a recursive snapshot, you must roll back the individual -child snapshots. -.Bl -tag -width "-R" -.It Fl R -Destroy any more recent snapshots and bookmarks, as well as any clones of those -snapshots.
-.It Fl f -Used with the -.Fl R -option to force an unmount of any clone file systems that are to be destroyed. -.It Fl r -Destroy any snapshots and bookmarks more recent than the one specified. -.El -.It Xo -.Nm -.Cm clone -.Op Fl p -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar snapshot Ar filesystem Ns | Ns Ar volume -.Xc -Creates a clone of the given snapshot. -See the -.Sx Clones -section for details. -The target dataset can be located anywhere in the ZFS hierarchy, and is created -as the same type as the original. -.Bl -tag -width "-o" -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property; see -.Nm zfs Cm create -for details. -.It Fl p -Creates all the non-existing parent datasets. -Datasets created in this manner are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. -If the target filesystem or volume already exists, the operation completes -successfully. -.El -.It Xo -.Nm -.Cm promote -.Ar clone-filesystem -.Xc -Promotes a clone file system to no longer be dependent on its -.Qq origin -snapshot. -This makes it possible to destroy the file system that the clone was created -from. -The clone parent-child dependency relationship is reversed, so that the origin -file system becomes a clone of the specified file system. -.Pp -The snapshot that was cloned, and any snapshots previous to this snapshot, are -now owned by the promoted clone. -The space they use moves from the origin file system to the promoted clone, so -enough space must be available to accommodate these snapshots. -No new space is consumed by this operation, but the space accounting is -adjusted. -The promoted clone must not have any conflicting snapshot names of its own. -The -.Cm rename -subcommand can be used to rename any conflicting snapshots. 
-.It Xo -.Nm -.Cm rename -.Op Fl f -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Xc -.It Xo -.Nm -.Cm rename -.Op Fl fp -.Ar filesystem Ns | Ns Ar volume -.Ar filesystem Ns | Ns Ar volume -.Xc -Renames the given dataset. -The new target can be located anywhere in the ZFS hierarchy, with the exception -of snapshots. -Snapshots can only be renamed within the parent file system or volume. -When renaming a snapshot, the parent file system of the snapshot does not need -to be specified as part of the second argument. -Renamed file systems can inherit new mount points, in which case they are -unmounted and remounted at the new mount point. -.Bl -tag -width "-a" -.It Fl f -Force unmount any filesystems that need to be unmounted in the process. -.It Fl p -Creates all the nonexistent parent datasets. -Datasets created in this manner are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. -.El -.It Xo -.Nm -.Cm rename -.Fl r -.Ar snapshot Ar snapshot -.Xc -Recursively rename the snapshots of all descendent datasets. -Snapshots are the only dataset that can be renamed recursively. -.It Xo -.Nm -.Cm list -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Oo Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... Oc -.Oo Fl s Ar property Oc Ns ... -.Oo Fl S Ar property Oc Ns ... -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc -.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Oc Ns ... -.Xc -Lists the property information for the given datasets in tabular form. -If specified, you can list property information by the absolute pathname or the -relative pathname. -By default, all file systems and volumes are displayed. -Snapshots are displayed if the -.Sy listsnaps -property is -.Sy on -.Po the default is -.Sy off -.Pc . -The following fields are displayed: -.Sy name Ns \&, Sy used Ns \&, Sy available Ns \&, Sy referenced Ns \&, Sy mountpoint Ns . 
-.Bl -tag -width "-H" -.It Fl H -Used for scripting mode. -Do not print headers and separate fields by a single tab instead of arbitrary -white space. -.It Fl S Ar property -Same as the -.Fl s -option, but sorts by property in descending order. -.It Fl d Ar depth -Recursively display any children of the dataset, limiting the recursion to -.Ar depth . -A -.Ar depth -of -.Sy 1 -will display only the dataset and its direct children. -.It Fl o Ar property -A comma-separated list of properties to display. -The property must be: -.Bl -bullet -.It -One of the properties described in the -.Sx Native Properties -section -.It -A user property -.It -The value -.Sy name -to display the dataset name -.It -The value -.Sy space -to display space usage properties on file systems and volumes. -This is a shortcut for specifying -.Fl o Sy name Ns \&, Ns Sy avail Ns \&, Ns Sy used Ns \&, Ns Sy usedsnap Ns \&, Ns -.Sy usedds Ns \&, Ns Sy usedrefreserv Ns \&, Ns Sy usedchild Fl t -.Sy filesystem Ns \&, Ns Sy volume -syntax. -.El -.It Fl p -Display numbers in parsable -.Pq exact -values. -.It Fl r -Recursively display any children of the dataset on the command line. -.It Fl s Ar property -A property for sorting the output by column in ascending order based on the -value of the property. -The property must be one of the properties described in the -.Sx Properties -section or the value -.Sy name -to sort by the dataset name. -Multiple properties can be specified at one time using multiple -.Fl s -property options. -Multiple -.Fl s -options are evaluated from left to right in decreasing order of importance. -The following is a list of sorting criteria: -.Bl -bullet -.It -Numeric types sort in numeric order. -.It -String types sort in alphabetical order. -.It -Types inappropriate for a row sort that row to the literal bottom, regardless of -the specified ordering. -.El -.Pp -If no sorting options are specified the existing behavior of -.Nm zfs Cm list -is preserved. 
-.It Fl t Ar type -A comma-separated list of types to display, where -.Ar type -is one of -.Sy filesystem , -.Sy snapshot , -.Sy volume , -.Sy bookmark , -or -.Sy all . -For example, specifying -.Fl t Sy snapshot -displays only snapshots. -.El -.It Xo -.Nm -.Cm set -.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Xc -Sets the property or list of properties to the given value(s) for each dataset. -Only some properties can be edited. -See the -.Sx Properties -section for more information on what properties can be set and acceptable -values. -Numeric values can be specified as exact values, or in a human-readable form -with a suffix of -.Sy B , K , M , G , T , P , E , Z -.Po for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, -or zettabytes, respectively -.Pc . -User properties can be set on snapshots. -For more information, see the -.Sx User Properties -section. -.It Xo -.Nm -.Cm get -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... Oc -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc -.Cm all | Ar property Ns Oo , Ns Ar property Oc Ns ... -.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns ... -.Xc -Displays properties for the given datasets. -If no datasets are specified, then the command displays properties for all -datasets on the system. -For each property, the following columns are displayed: -.Bd -literal - name Dataset name - property Property name - value Property value - source Property source \fBlocal\fP, \fBdefault\fP, \fBinherited\fP, - \fBtemporary\fP, \fBreceived\fP or none (\fB-\fP). -.Ed -.Pp -All columns are displayed by default, though this can be controlled by using the -.Fl o -option. 
-This command takes a comma-separated list of properties as described in the -.Sx Native Properties -and -.Sx User Properties -sections. -.Pp -The value -.Sy all -can be used to display all properties that apply to the given dataset's type -.Pq filesystem, volume, snapshot, or bookmark . -.Bl -tag -width "-H" -.It Fl H -Display output in a form more easily parsed by scripts. -Any headers are omitted, and fields are explicitly separated by a single tab -instead of an arbitrary amount of space. -.It Fl d Ar depth -Recursively display any children of the dataset, limiting the recursion to -.Ar depth . -A depth of -.Sy 1 -will display only the dataset and its direct children. -.It Fl o Ar field -A comma-separated list of columns to display. -.Sy name Ns \&, Ns Sy property Ns \&, Ns Sy value Ns \&, Ns Sy source -is the default value. -.It Fl p -Display numbers in parsable -.Pq exact -values. -.It Fl r -Recursively display properties for any children. -.It Fl s Ar source -A comma-separated list of sources to display. -Those properties coming from a source other than those in this list are ignored. -Each source must be one of the following: -.Sy local , -.Sy default , -.Sy inherited , -.Sy temporary , -.Sy received , -and -.Sy none . -The default value is all sources. -.It Fl t Ar type -A comma-separated list of types to display, where -.Ar type -is one of -.Sy filesystem , -.Sy snapshot , -.Sy volume , -.Sy bookmark , -or -.Sy all . -.El -.It Xo -.Nm -.Cm inherit -.Op Fl rS -.Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Xc -Clears the specified property, causing it to be inherited from an ancestor, -restored to default if no ancestor has the property set, or with the -.Fl S -option reverted to the received value if one exists. -See the -.Sx Properties -section for a listing of default values, and details on which properties can be -inherited. -.Bl -tag -width "-r" -.It Fl r -Recursively inherit the given property for all children. 
-.It Fl S -Revert the property to the received value if one exists; otherwise operate as -if the -.Fl S -option was not specified. -.El -.It Xo -.Nm -.Cm upgrade -.Xc -Displays a list of file systems that are not the most recent version. -.It Xo -.Nm -.Cm upgrade -.Fl v -.Xc -Displays a list of currently supported file system versions. -.It Xo -.Nm -.Cm upgrade -.Op Fl r -.Op Fl V Ar version -.Fl a | Ar filesystem -.Xc -Upgrades file systems to a new on-disk version. -Once this is done, the file systems will no longer be accessible on systems -running older versions of the software. -.Nm zfs Cm send -streams generated from new snapshots of these file systems cannot be accessed on -systems running older versions of the software. -.Pp -In general, the file system version is independent of the pool version. -See -.Xr zpool 8 -for information on the -.Nm zpool Cm upgrade -command. -.Pp -In some cases, the file system version and the pool version are interrelated and -the pool version must be upgraded before the file system version can be -upgraded. -.Bl -tag -width "-V" -.It Fl V Ar version -Upgrade to the specified -.Ar version . -If the -.Fl V -flag is not specified, this command upgrades to the most recent version. -This -option can only be used to increase the version number, and only up to the most -recent version supported by this software. -.It Fl a -Upgrade all file systems on all imported pools. -.It Ar filesystem -Upgrade the specified file system. -.It Fl r -Upgrade the specified file system and all descendent file systems. -.El -.It Xo -.Nm -.Cm userspace -.Op Fl Hinp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar snapshot -.Xc -Displays space consumed by, and quotas on, each user in the specified filesystem -or snapshot. 
-This corresponds to the -.Sy userused@ Ns Em user , -.Sy userobjused@ Ns Em user , -.Sy userquota@ Ns Em user, -and -.Sy userobjquota@ Ns Em user -properties. -.Bl -tag -width "-H" -.It Fl H -Do not print headers, use tab-delimited output. -.It Fl S Ar field -Sort by this field in reverse order. -See -.Fl s . -.It Fl i -Translate SID to POSIX ID. -The POSIX ID may be ephemeral if no mapping exists. -Normal POSIX interfaces -.Po for example, -.Xr stat 2 , -.Nm ls Fl l -.Pc -perform this translation, so the -.Fl i -option allows the output from -.Nm zfs Cm userspace -to be compared directly with those utilities. -However, -.Fl i -may lead to confusion if some files were created by an SMB user before a -SMB-to-POSIX name mapping was established. -In such a case, some files will be owned by the SMB entity and some by the POSIX -entity. -However, the -.Fl i -option will report that the POSIX entity has the total usage and quota for both. -.It Fl n -Print numeric ID instead of user/group name. -.It Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... -Display only the specified fields from the following set: -.Sy type , -.Sy name , -.Sy used , -.Sy quota . -The default is to display all fields. -.It Fl p -Use exact -.Pq parsable -numeric output. -.It Fl s Ar field -Sort output by this field. -The -.Fl s -and -.Fl S -flags may be specified multiple times to sort first by one field, then by -another. -The default is -.Fl s Sy type Fl s Sy name . -.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -Print only the specified types from the following set: -.Sy all , -.Sy posixuser , -.Sy smbuser , -.Sy posixgroup , -.Sy smbgroup . -The default is -.Fl t Sy posixuser Ns \&, Ns Sy smbuser . -The default can be changed to include group types. -.El -.It Xo -.Nm -.Cm groupspace -.Op Fl Hinp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... 
Oc -.Ar filesystem Ns | Ns Ar snapshot -.Xc -Displays space consumed by, and quotas on, each group in the specified -filesystem or snapshot. -This subcommand is identical to -.Nm zfs Cm userspace , -except that the default types to display are -.Fl t Sy posixgroup Ns \&, Ns Sy smbgroup . -.It Xo -.Nm -.Cm projectspace -.Op Fl Hp -.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Ar filesystem Ns | Ns Ar snapshot -.Xc -Displays space consumed by, and quotas on, each project in the specified -filesystem or snapshot. This subcommand is identical to -.Nm zfs Cm userspace , -except that the project identifier is numeric, not a name, so neither the -.Fl i -option for SID to POSIX ID translation, nor the -.Fl n -option for numeric ID, nor the -.Fl t -option for types is needed. -.It Xo -.Nm -.Cm project -.Oo Fl d Ns | Ns Fl r Ns Oc -.Ar file Ns | Ns Ar directory Ns ... -.Xc -List project identifier (ID) and inherit flag of file(s) or directories. -.Bl -tag -width "-d" -.It Fl d -Show the directory project ID and inherit flag, not its children. It will -override a previously specified -.Fl r -option. -.It Fl r -Show on subdirectories recursively. It will override a previously specified -.Fl d -option. -.El -.It Xo -.Nm -.Cm project -.Fl C -.Oo Fl kr Ns Oc -.Ar file Ns | Ns Ar directory Ns ... -.Xc -Clear project inherit flag and/or ID on the file(s) or directories. -.Bl -tag -width "-k" -.It Fl k -Keep the project ID unchanged. If not specified, the project ID will be reset -to zero. -.It Fl r -Clear on subdirectories recursively. -.El -.It Xo -.Nm -.Cm project -.Fl c -.Oo Fl 0 Ns Oc -.Oo Fl d Ns | Ns Fl r Ns Oc -.Op Fl p Ar id -.Ar file Ns | Ns Ar directory Ns ... -.Xc -Check project ID and inherit flag on the file(s) or directories, report the -entries without project inherit flag or with different project IDs from the -specified (via -.Fl p -option) value or the target directory's project ID.
-.Bl -tag -width "-0" -.It Fl 0 -Print each file name followed by a NUL instead of the default newline, like -"find -print0". -.It Fl d -Check the directory project ID and inherit flag, not its children. It will -override a previously specified -.Fl r -option. -.It Fl p -Specify the referenced ID for comparing with the target file(s) or directories' -project IDs. If not specified, the target (top) directory's project ID will be -used as the referenced one. -.It Fl r -Check on subdirectories recursively. It will override a previously specified -.Fl d -option. -.El -.It Xo -.Nm -.Cm project -.Op Fl p Ar id -.Oo Fl rs Ns Oc -.Ar file Ns | Ns Ar directory Ns ... -.Xc -Set project ID and/or inherit flag on the file(s) or directories. -.Bl -tag -width "-p" -.It Fl p -Set the file(s)' or directories' project ID with the given value. -.It Fl r -Set on subdirectories recursively. -.It Fl s -Set project inherit flag on the given file(s) or directories. It is usually used -to set up a tree quota on the target directory together with the -.Fl r -option. When setting up a tree quota, by default the directory's -project ID will be applied to all its descendants unless you specify the project -ID explicitly via the -.Fl p -option. -.El -.It Xo -.Nm -.Cm mount -.Xc -Displays all ZFS file systems currently mounted. -.It Xo -.Nm -.Cm mount -.Op Fl Olv -.Op Fl o Ar options -.Fl a | Ar filesystem -.Xc -Mount ZFS filesystem on a path described by its -.Sy mountpoint -property, if the path exists and is empty. If -.Sy mountpoint -is set to -.Em legacy , -the filesystem should instead be mounted using -.Xr mount 8 . -.Bl -tag -width "-O" -.It Fl O -Perform an overlay mount. Allows mounting in non-empty -.Sy mountpoint . -See -.Xr mount 8 -for more information. -.It Fl a -Mount all available ZFS file systems. -Invoked automatically as part of the boot process if configured. -.It Ar filesystem -Mount the specified filesystem.
-.It Fl o Ar options -An optional, comma-separated list of mount options to use temporarily for the -duration of the mount. -See the -.Sx Temporary Mount Point Properties -section for details. -.It Fl l -Load keys for encrypted filesystems as they are being mounted. This is -equivalent to executing -.Nm zfs Cm load-key -on each encryption root before mounting it. Note that if a filesystem has a -.Sy keylocation -of -.Sy prompt -this will cause the terminal to interactively block after asking for the key. -.It Fl v -Report mount progress. -.El -.It Xo -.Nm -.Cm unmount -.Op Fl f -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Xc -Unmounts currently mounted ZFS file systems. -.Bl -tag -width "-a" -.It Fl a -Unmount all available ZFS file systems. -Invoked automatically as part of the shutdown process. -.It Ar filesystem Ns | Ns Ar mountpoint -Unmount the specified filesystem. -The command can also be given a path to a ZFS file system mount point on the -system. -.It Fl f -Forcefully unmount the file system, even if it is currently in use. -.El -.It Xo -.Nm -.Cm share -.Fl a | Ar filesystem -.Xc -Shares available ZFS file systems. -.Bl -tag -width "-a" -.It Fl a -Share all available ZFS file systems. -Invoked automatically as part of the boot process. -.It Ar filesystem -Share the specified filesystem according to the -.Sy sharenfs -and -.Sy sharesmb -properties. -File systems are shared when the -.Sy sharenfs -or -.Sy sharesmb -property is set. -.El -.It Xo -.Nm -.Cm unshare -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Xc -Unshares currently shared ZFS file systems. -.Bl -tag -width "-a" -.It Fl a -Unshare all available ZFS file systems. -Invoked automatically as part of the shutdown process. -.It Ar filesystem Ns | Ns Ar mountpoint -Unshare the specified filesystem. -The command can also be given a path to a ZFS file system shared on the system. -.El -.It Xo -.Nm -.Cm bookmark -.Ar snapshot bookmark -.Xc -Creates a bookmark of the given snapshot. 
-Bookmarks mark the point in time when the snapshot was created, and can be used -as the incremental source for a -.Nm zfs Cm send -command. -.Pp -This feature must be enabled to be used. -See -.Xr zpool-features 5 -for details on ZFS feature flags and the -.Sy bookmarks -feature. -.It Xo -.Nm -.Cm send -.Op Fl DLPRbcehnpvw -.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot -.Ar snapshot -.Xc -Creates a stream representation of the second -.Ar snapshot , -which is written to standard output. -The output can be redirected to a file or to a different system -.Po for example, using -.Xr ssh 1 -.Pc . -By default, a full stream is generated. -.Bl -tag -width "-D" -.It Fl D, -dedup -Generate a deduplicated stream. -Blocks which would have been sent multiple times in the send stream will only be -sent once. -The receiving system must also support this feature to receive a deduplicated -stream. -This flag can be used regardless of the dataset's -.Sy dedup -property, but performance will be much better if the filesystem uses a -dedup-capable checksum -.Po for example, -.Sy sha256 -.Pc . -.It Fl I Ar snapshot -Generate a stream package that sends all intermediary snapshots from the first -snapshot to the second snapshot. -For example, -.Fl I Em @a Em fs@d -is similar to -.Fl i Em @a Em fs@b Ns \&; Fl i Em @b Em fs@c Ns \&; Fl i Em @c Em fs@d . -The incremental source may be specified as with the -.Fl i -option. -.It Fl L, -large-block -Generate a stream which may contain blocks larger than 128KB. -This flag has no effect if the -.Sy large_blocks -pool feature is disabled, or if the -.Sy recordsize -property of this filesystem has never been set above 128KB. -The receiving system must have the -.Sy large_blocks -pool feature enabled as well. -See -.Xr zpool-features 5 -for details on ZFS feature flags and the -.Sy large_blocks -feature. -.It Fl P, -parsable -Print machine-parsable verbose information about the stream package generated. 
-.It Fl R, -replicate -Generate a replication stream package, which will replicate the specified -file system, and all descendent file systems, up to the named snapshot. -When received, all properties, snapshots, descendent file systems, and clones -are preserved. -.Pp -If the -.Fl i -or -.Fl I -flags are used in conjunction with the -.Fl R -flag, an incremental replication stream is generated. -The current values of properties, and current snapshot and file system names are -set when the stream is received. -If the -.Fl F -flag is specified when this stream is received, snapshots and file systems that -do not exist on the sending side are destroyed. If the -.Fl R -flag is used to send encrypted datasets, then -.Fl w -must also be specified. -.It Fl e, -embed -Generate a more compact stream by using -.Sy WRITE_EMBEDDED -records for blocks which are stored more compactly on disk by the -.Sy embedded_data -pool feature. -This flag has no effect if the -.Sy embedded_data -feature is disabled. -The receiving system must have the -.Sy embedded_data -feature enabled. -If the -.Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. Datasets that are sent with this flag may not be -received as an encrypted dataset, since encrypted datasets cannot use the -.Sy embedded_data -feature. -See -.Xr zpool-features 5 -for details on ZFS feature flags and the -.Sy embedded_data -feature. -.It Fl b, -backup -Sends only received property values whether or not they are overridden by local -settings, but only if the dataset has ever been received. Use this option when -you want -.Nm zfs Cm receive -to restore received properties backed up on the sent dataset and to avoid -sending local settings that may have nothing to do with the source dataset, -but only with how the data is backed up. 
-.It Fl c, -compressed -Generate a more compact stream by using compressed WRITE records for blocks -which are compressed on disk and in memory -.Po see the -.Sy compression -property for details -.Pc . -If the -.Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. -If the -.Sy large_blocks -feature is enabled on the sending system but the -.Fl L -option is not supplied in conjunction with -.Fl c , -then the data will be decompressed before sending so it can be split into -smaller block sizes. -.It Fl w, -raw -For encrypted datasets, send data exactly as it exists on disk. This allows -backups to be taken even if encryption keys are not currently loaded. The -backup may then be received on an untrusted machine since that machine will -not have the encryption keys to read the protected data or alter it without -being detected. Upon being received, the dataset will have the same encryption -keys as it did on the send side, although the -.Sy keylocation -property will be defaulted to -.Sy prompt -if not otherwise provided. For unencrypted datasets, this flag will be -equivalent to -.Fl Lec . -Note that if you do not use this flag for sending encrypted datasets, data will -be sent unencrypted and may be re-encrypted with a different encryption key on -the receiving system, which will disable the ability to do a raw send to that -system for incrementals. -.It Fl h, -holds -Generate a stream package that includes any snapshot holds (created with the -.Sy zfs hold -command), and indicating to -.Sy zfs receive -that the holds be applied to the dataset on the receiving system. -.It Fl i Ar snapshot -Generate an incremental stream from the first -.Ar snapshot -.Pq the incremental source -to the second -.Ar snapshot -.Pq the incremental target . 
-The incremental source can be specified as the last component of the snapshot -name -.Po the -.Sy @ -character and following -.Pc -and it is assumed to be from the same file system as the incremental target. -.Pp -If the destination is a clone, the source may be the origin snapshot, which must -be fully specified -.Po for example, -.Em pool/fs@origin , -not just -.Em @origin -.Pc . -.It Fl n, -dryrun -Do a dry-run -.Pq Qq No-op -send. -Do not generate any actual send data. -This is useful in conjunction with the -.Fl v -or -.Fl P -flags to determine what data will be sent. -In this case, the verbose output will be written to standard output -.Po contrast with a non-dry-run, where the stream is written to standard output -and the verbose output goes to standard error -.Pc . -.It Fl p, -props -Include the dataset's properties in the stream. -This flag is implicit when -.Fl R -is specified. -The receiving system must also support this feature. Sends of encrypted datasets -must use -.Fl w -when using this flag. -.It Fl v, -verbose -Print verbose information about the stream package generated. -This information includes a per-second report of how much data has been sent. -.Pp -The format of the stream is committed. -You will be able to receive your streams on future versions of ZFS. -.El -.It Xo -.Nm -.Cm send -.Op Fl LPcenvw -.Op Fl i Ar snapshot Ns | Ns Ar bookmark -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Xc -Generate a send stream, which may be of a filesystem, and may be incremental -from a bookmark. -If the destination is a filesystem or volume, the pool must be read-only, or the -filesystem must not be mounted. -When the stream generated from a filesystem or volume is received, the default -snapshot name will be -.Qq --head-- . -.Bl -tag -width "-L" -.It Fl L, -large-block -Generate a stream which may contain blocks larger than 128KB. 
-This flag has no effect if the -.Sy large_blocks -pool feature is disabled, or if the -.Sy recordsize -property of this filesystem has never been set above 128KB. -The receiving system must have the -.Sy large_blocks -pool feature enabled as well. -See -.Xr zpool-features 5 -for details on ZFS feature flags and the -.Sy large_blocks -feature. -.It Fl P, -parsable -Print machine-parsable verbose information about the stream package generated. -.It Fl c, -compressed -Generate a more compact stream by using compressed WRITE records for blocks -which are compressed on disk and in memory -.Po see the -.Sy compression -property for details -.Pc . -If the -.Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. -If the -.Sy large_blocks -feature is enabled on the sending system but the -.Fl L -option is not supplied in conjunction with -.Fl c , -then the data will be decompressed before sending so it can be split into -smaller block sizes. -.It Fl w, -raw -For encrypted datasets, send data exactly as it exists on disk. This allows -backups to be taken even if encryption keys are not currently loaded. The -backup may then be received on an untrusted machine since that machine will -not have the encryption keys to read the protected data or alter it without -being detected. Upon being received, the dataset will have the same encryption -keys as it did on the send side, although the -.Sy keylocation -property will be defaulted to -.Sy prompt -if not otherwise provided. For unencrypted datasets, this flag will be -equivalent to -.Fl Lec . -Note that if you do not use this flag for sending encrypted datasets, data will -be sent unencrypted and may be re-encrypted with a different encryption key on -the receiving system, which will disable the ability to do a raw send to that -system for incrementals. 
-.It Fl e, -embed -Generate a more compact stream by using -.Sy WRITE_EMBEDDED -records for blocks which are stored more compactly on disk by the -.Sy embedded_data -pool feature. -This flag has no effect if the -.Sy embedded_data -feature is disabled. -The receiving system must have the -.Sy embedded_data -feature enabled. -If the -.Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. Datasets that are sent with this flag may not be -received as an encrypted dataset, since encrypted datasets cannot use the -.Sy embedded_data -feature. -See -.Xr zpool-features 5 -for details on ZFS feature flags and the -.Sy embedded_data -feature. -.It Fl i Ar snapshot Ns | Ns Ar bookmark -Generate an incremental send stream. -The incremental source must be an earlier snapshot in the destination's history. -It will commonly be an earlier snapshot in the destination's file system, in -which case it can be specified as the last component of the name -.Po the -.Sy # -or -.Sy @ -character and following -.Pc . -.Pp -If the incremental target is a clone, the incremental source can be the origin -snapshot, or an earlier snapshot in the origin's filesystem, or the origin's -origin, etc. -.It Fl n, -dryrun -Do a dry-run -.Pq Qq No-op -send. -Do not generate any actual send data. -This is useful in conjunction with the -.Fl v -or -.Fl P -flags to determine what data will be sent. -In this case, the verbose output will be written to standard output -.Po contrast with a non-dry-run, where the stream is written to standard output -and the verbose output goes to standard error -.Pc . -.It Fl v, -verbose -Print verbose information about the stream package generated. -This information includes a per-second report of how much data has been sent. -.El -.It Xo -.Nm -.Cm send -.Op Fl Penv -.Fl t -.Ar receive_resume_token -.Xc -Creates a send stream which resumes an interrupted receive. 
-The -.Ar receive_resume_token -is the value of this property on the filesystem or volume that was being -received into. -See the documentation for -.Sy zfs receive -s -for more details. -.It Xo -.Nm -.Cm receive -.Op Fl Fhnsuv -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Op Fl o Ar property Ns = Ns Ar value -.Op Fl x Ar property -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Xc -.It Xo -.Nm -.Cm receive -.Op Fl Fhnsuv -.Op Fl d Ns | Ns Fl e -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Op Fl o Ar property Ns = Ns Ar value -.Op Fl x Ar property -.Ar filesystem -.Xc -Creates a snapshot whose contents are as specified in the stream provided on -standard input. -If a full stream is received, then a new file system is created as well. -Streams are created using the -.Nm zfs Cm send -subcommand, which by default creates a full stream. -.Nm zfs Cm recv -can be used as an alias for -.Nm zfs Cm receive. -.Pp -If an incremental stream is received, then the destination file system must -already exist, and its most recent snapshot must match the incremental stream's -source. -For -.Sy zvols , -the destination device link is destroyed and recreated, which means the -.Sy zvol -cannot be accessed during the -.Cm receive -operation. -.Pp -When a snapshot replication package stream that is generated by using the -.Nm zfs Cm send Fl R -command is received, any snapshots that do not exist on the sending location are -destroyed by using the -.Nm zfs Cm destroy Fl d -command. -.Pp -If -.Fl o Em property Ns = Ns Ar value -or -.Fl x Em property -is specified, it applies to the effective value of the property throughout -the entire subtree of replicated datasets. Effective property values will be -set ( -.Fl o -) or inherited ( -.Fl x -) on the topmost in the replicated subtree. In descendant datasets, if the -property is set by the send stream, it will be overridden by forcing the -property to be inherited from the top‐most file system. 
Received properties -are retained in spite of being overridden and may be restored with -.Nm zfs Cm inherit Fl S . -Specifying -.Fl o Sy origin Ns = Ns Em snapshot -is a special case because, even if -.Sy origin -is a read-only property and cannot be set, it's allowed to receive the send -stream as a clone of the given snapshot. -.Pp -Raw encrypted send streams (created with -.Nm zfs Cm send Fl w -) may only be received as is, and cannot be re-encrypted, decrypted, or -recompressed by the receive process. Unencrypted streams can be received as -encrypted datasets, either through inheritance or by specifying encryption -parameters with the -.Fl o -options. Note that the -.Sy keylocation -property cannot be overridden to -.Sy prompt -during a receive. This is because the receive process itself is already using -stdin for the send stream. Instead, the property can be overridden after the -receive completes. -.Pp -The added security provided by raw sends adds some restrictions to the send -and receive process. ZFS will not allow a mix of raw receives and non-raw -receives. Specifically, any raw incremental receives that are attempted after -a non-raw receive will fail. Non-raw receives do not have this restriction and, -therefore, are always possible. Because of this, it is best practice to always -use either raw sends for their security benefits or non-raw sends for their -flexibility when working with encrypted datasets, but not a combination. -.Pp -The reason for this restriction stems from the inherent restrictions of the -AEAD ciphers that ZFS uses to encrypt data. When using ZFS native encryption, -each block of data is encrypted against a randomly generated number known as -the "initialization vector" (IV), which is stored in the filesystem metadata. -This number is required by the encryption algorithms whenever the data is to -be decrypted. Together, all of the IVs provided for all of the blocks in a -given snapshot are collectively called an "IV set". 
When ZFS performs a raw -send, the IV set is transferred from the source to the destination in the send -stream. When ZFS performs a non-raw send, the data is decrypted by the source -system and re-encrypted by the destination system, creating a snapshot with -effectively the same data, but a different IV set. In order for decryption to -work after a raw send, ZFS must ensure that the IV set used on both the source -and destination side match. When an incremental raw receive is performed on -top of an existing snapshot, ZFS will check to confirm that the "from" -snapshot on both the source and destination were using the same IV set, -ensuring the new IV set is consistent. -.Pp -The name of the snapshot -.Pq and file system, if a full stream is received -that this subcommand creates depends on the argument type and the use of the -.Fl d -or -.Fl e -options. -.Pp -If the argument is a snapshot name, the specified -.Ar snapshot -is created. -If the argument is a file system or volume name, a snapshot with the same name -as the sent snapshot is created within the specified -.Ar filesystem -or -.Ar volume . -If neither of the -.Fl d -or -.Fl e -options are specified, the provided target snapshot name is used exactly as -provided. -.Pp -The -.Fl d -and -.Fl e -options cause the file system name of the target snapshot to be determined by -appending a portion of the sent snapshot's name to the specified target -.Ar filesystem . -If the -.Fl d -option is specified, all but the first element of the sent snapshot's file -system path -.Pq usually the pool name -is used and any required intermediate file systems within the specified one are -created. -If the -.Fl e -option is specified, then only the last element of the sent snapshot's file -system name -.Pq i.e. the name of the source file system itself -is used as the target file system name. 
-.Bl -tag -width "-F" -.It Fl F -Force a rollback of the file system to the most recent snapshot before -performing the receive operation. -If receiving an incremental replication stream -.Po for example, one generated by -.Nm zfs Cm send Fl R Op Fl i Ns | Ns Fl I -.Pc , -destroy snapshots and file systems that do not exist on the sending side. -.It Fl d -Discard the first element of the sent snapshot's file system name, using the -remaining elements to determine the name of the target file system for the new -snapshot as described in the paragraph above. -.It Fl e -Discard all but the last element of the sent snapshot's file system name, using -that element to determine the name of the target file system for the new -snapshot as described in the paragraph above. -.It Fl h -Skip the receive of holds. There is no effect if holds are not sent. -.It Fl n -Do not actually receive the stream. -This can be useful in conjunction with the -.Fl v -option to verify the name the receive operation would use. -.It Fl o Sy origin Ns = Ns Ar snapshot -Forces the stream to be received as a clone of the given snapshot. -If the stream is a full send stream, this will create the filesystem -described by the stream as a clone of the specified snapshot. -Which snapshot was specified will not affect the success or failure of the -receive, as long as the snapshot does exist. -If the stream is an incremental send stream, all the normal verification will be -performed. -.It Fl o Em property Ns = Ns Ar value -Sets the specified property as if the command -.Nm zfs Cm set Em property Ns = Ns Ar value -was invoked immediately before the receive. When receiving a stream from -.Nm zfs Cm send Fl R , -causes the property to be inherited by all descendant datasets, as through -.Nm zfs Cm inherit Em property -was run on any descendant datasets that have this property set on the -sending system. -.Pp -Any editable property can be set at receive time. 
Set-once properties bound -to the received data, such as -.Sy normalization -and -.Sy casesensitivity , -cannot be set at receive time even when the datasets are newly created by -.Nm zfs Cm receive . -Additionally both settable properties -.Sy version -and -.Sy volsize -cannot be set at receive time. -.Pp -The -.Fl o -option may be specified multiple times, for different properties. An error -results if the same property is specified in multiple -.Fl o -or -.Fl x -options. -.Pp -The -.Fl o -option may also be used to override encryption properties upon initial -receive. This allows unencrypted streams to be received as encrypted datasets. -To cause the received dataset (or root dataset of a recursive stream) to be -received as an encryption root, specify encryption properties in the same -manner as is required for -.Nm -.Cm create . -For instance: -.Bd -literal -# zfs send tank/test@snap1 | zfs recv -o encryption=on -o keyformat=passphrase -o keylocation=file:///path/to/keyfile -.Ed -.Pp -Note that -.Op Fl o Ar keylocation Ns = Ns Ar prompt -may not be specified here, since stdin is already being utilized for the send -stream. Once the receive has completed, you can use -.Nm -.Cm set -to change this setting after the fact. Similarly, you can receive a dataset as -an encrypted child by specifying -.Op Fl x Ar encryption -to force the property to be inherited. Overriding encryption properties (except -for -.Sy keylocation Ns ) -is not possible with raw send streams. -.It Fl s -If the receive is interrupted, save the partially received state, rather -than deleting it. -Interruption may be due to premature termination of the stream -.Po e.g. due to network failure or failure of the remote system -if the stream is being read over a network connection -.Pc , -a checksum error in the stream, termination of the -.Nm zfs Cm receive -process, or unclean shutdown of the system. 
-.Pp -The receive can be resumed with a stream generated by -.Nm zfs Cm send Fl t Ar token , -where the -.Ar token -is the value of the -.Sy receive_resume_token -property of the filesystem or volume which is received into. -.Pp -To use this flag, the storage pool must have the -.Sy extensible_dataset -feature enabled. -See -.Xr zpool-features 5 -for details on ZFS feature flags. -.It Fl u -File system that is associated with the received stream is not mounted. -.It Fl v -Print verbose information about the stream and the time required to perform the -receive operation. -.It Fl x Em property -Ensures that the effective value of the specified property after the -receive is unaffected by the value of that property in the send stream (if any), -as if the property had been excluded from the send stream. -.Pp -If the specified property is not present in the send stream, this option does -nothing. -.Pp -If a received property needs to be overridden, the effective value will be -set or inherited, depending on whether the property is inheritable or not. -.Pp -In the case of an incremental update, -.Fl x -leaves any existing local setting or explicit inheritance unchanged. -.Pp -All -.Fl o -restrictions (e.g. set-once) apply equally to -.Fl x . -.El -.It Xo -.Nm -.Cm receive -.Fl A -.Ar filesystem Ns | Ns Ar volume -.Xc -Abort an interrupted -.Nm zfs Cm receive Fl s , -deleting its saved partially received state. -.It Xo -.Nm -.Cm allow -.Ar filesystem Ns | Ns Ar volume -.Xc -Displays permissions that have been delegated on the specified filesystem or -volume. -See the other forms of -.Nm zfs Cm allow -for more information. -.Pp -Delegations are supported under Linux with the exception of -.Sy mount , -.Sy unmount , -.Sy mountpoint , -.Sy canmount , -.Sy rename , -and -.Sy share . -These permissions cannot be delegated because the Linux -.Xr mount 8 -command restricts modifications of the global namespace to the root user. 
-.It Xo -.Nm -.Cm allow -.Op Fl dglu -.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -.It Xo -.Nm -.Cm allow -.Op Fl dl -.Fl e Ns | Ns Sy everyone -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -Delegates ZFS administration permission for the file systems to non-privileged -users. -.Bl -tag -width "-d" -.It Fl d -Allow only for the descendent file systems. -.It Fl e Ns | Ns Sy everyone -Specifies that the permissions be delegated to everyone. -.It Fl g Ar group Ns Oo , Ns Ar group Oc Ns ... -Explicitly specify that permissions are delegated to the group. -.It Fl l -Allow -.Qq locally -only for the specified file system. -.It Fl u Ar user Ns Oo , Ns Ar user Oc Ns ... -Explicitly specify that permissions are delegated to the user. -.It Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... -Specifies to whom the permissions are delegated. -Multiple entities can be specified as a comma-separated list. -If neither of the -.Fl gu -options are specified, then the argument is interpreted preferentially as the -keyword -.Sy everyone , -then as a user name, and lastly as a group name. -To specify a user or group named -.Qq everyone , -use the -.Fl g -or -.Fl u -options. -To specify a group with the same name as a user, use the -.Fl g -options. -.It Xo -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Xc -The permissions to delegate. -Multiple permissions may be specified as a comma-separated list. -Permission names are the same as ZFS subcommand and property names. -See the property list below. -Property set names, which begin with -.Sy @ , -may be specified. -See the -.Fl s -form below for details. 
-.El -.Pp -If neither of the -.Fl dl -options are specified, or both are, then the permissions are allowed for the -file system or volume, and all of its descendents. -.Pp -Permissions are generally the ability to use a ZFS subcommand or change a ZFS -property. -The following permissions are available: -.Bd -literal -NAME TYPE NOTES -allow subcommand Must also have the permission that is - being allowed -clone subcommand Must also have the 'create' ability and - 'mount' ability in the origin file system -create subcommand Must also have the 'mount' ability. - Must also have the 'refreservation' ability to - create a non-sparse volume. -destroy subcommand Must also have the 'mount' ability -diff subcommand Allows lookup of paths within a dataset - given an object number, and the ability - to create snapshots necessary to - 'zfs diff'. -load-key subcommand Allows loading and unloading of encryption key - (see 'zfs load-key' and 'zfs unload-key'). -change-key subcommand Allows changing an encryption key via - 'zfs change-key'. -mount subcommand Allows mount/umount of ZFS datasets -promote subcommand Must also have the 'mount' and 'promote' - ability in the origin file system -receive subcommand Must also have the 'mount' and 'create' - ability -rename subcommand Must also have the 'mount' and 'create' - ability in the new parent -rollback subcommand Must also have the 'mount' ability -send subcommand -share subcommand Allows sharing file systems over NFS - or SMB protocols -snapshot subcommand Must also have the 'mount' ability - -groupquota other Allows accessing any groupquota@... - property -groupused other Allows reading any groupused@... property -userprop other Allows changing any user property -userquota other Allows accessing any userquota@... - property -userused other Allows reading any userused@... property -projectobjquota other Allows accessing any projectobjquota@... - property -projectquota other Allows accessing any projectquota@... 
property -projectobjused other Allows reading any projectobjused@... property -projectused other Allows reading any projectused@... property - -aclinherit property -acltype property -atime property -canmount property -casesensitivity property -checksum property -compression property -copies property -devices property -exec property -filesystem_limit property -mountpoint property -nbmand property -normalization property -primarycache property -quota property -readonly property -recordsize property -refquota property -refreservation property -reservation property -secondarycache property -setuid property -sharenfs property -sharesmb property -snapdir property -snapshot_limit property -utf8only property -version property -volblocksize property -volsize property -vscan property -xattr property -zoned property -.Ed -.It Xo -.Nm -.Cm allow -.Fl c -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -Sets -.Qq create time -permissions. -These permissions are granted -.Pq locally -to the creator of any newly-created descendent file system. -.It Xo -.Nm -.Cm allow -.Fl s No @ Ns Ar setname -.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -Defines or adds permissions to a permission set. -The set can be used by other -.Nm zfs Cm allow -commands for the specified file system and its descendents. -Sets are evaluated dynamically, so changes to a set are immediately reflected. -Permission sets follow the same naming restrictions as ZFS file systems, but the -name must begin with -.Sy @ , -and can be no more than 64 characters long. -.It Xo -.Nm -.Cm unallow -.Op Fl dglru -.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... 
Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -.It Xo -.Nm -.Cm unallow -.Op Fl dlr -.Fl e Ns | Ns Sy everyone -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -.It Xo -.Nm -.Cm unallow -.Op Fl r -.Fl c -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -Removes permissions that were granted with the -.Nm zfs Cm allow -command. -No permissions are explicitly denied, so other permissions granted are still in -effect. -For example, if the permission is granted by an ancestor. -If no permissions are specified, then all permissions for the specified -.Ar user , -.Ar group , -or -.Sy everyone -are removed. -Specifying -.Sy everyone -.Po or using the -.Fl e -option -.Pc -only removes the permissions that were granted to everyone, not all permissions -for every user and group. -See the -.Nm zfs Cm allow -command for a description of the -.Fl ldugec -options. -.Bl -tag -width "-r" -.It Fl r -Recursively remove the permissions from this file system and all descendents. -.El -.It Xo -.Nm -.Cm unallow -.Op Fl r -.Fl s No @ Ns Ar setname -.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns -.Ar setname Oc Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -Removes permissions from a permission set. -If no permissions are specified, then all permissions are removed, thus removing -the set entirely. -.It Xo -.Nm -.Cm hold -.Op Fl r -.Ar tag Ar snapshot Ns ... -.Xc -Adds a single reference, named with the -.Ar tag -argument, to the specified snapshot or snapshots. -Each snapshot has its own tag namespace, and tags must be unique within that -space. -.Pp -If a hold exists on a snapshot, attempts to destroy that snapshot by using the -.Nm zfs Cm destroy -command return -.Er EBUSY . 
-.Bl -tag -width "-r" -.It Fl r -Specifies that a hold with the given tag is applied recursively to the snapshots -of all descendent file systems. -.El -.It Xo -.Nm -.Cm holds -.Op Fl rH -.Ar snapshot Ns ... -.Xc -Lists all existing user references for the given snapshot or snapshots. -.Bl -tag -width "-r" -.It Fl r -Lists the holds that are set on the named descendent snapshots, in addition to -listing the holds on the named snapshot. -.It Fl H -Do not print headers, use tab-delimited output. -.El -.It Xo -.Nm -.Cm release -.Op Fl r -.Ar tag Ar snapshot Ns ... -.Xc -Removes a single reference, named with the -.Ar tag -argument, from the specified snapshot or snapshots. -The tag must already exist for each snapshot. -If a hold exists on a snapshot, attempts to destroy that snapshot by using the -.Nm zfs Cm destroy -command return -.Er EBUSY . -.Bl -tag -width "-r" -.It Fl r -Recursively releases a hold with the given tag on the snapshots of all -descendent file systems. -.El -.It Xo -.Nm -.Cm diff -.Op Fl FHt -.Ar snapshot Ar snapshot Ns | Ns Ar filesystem -.Xc -Display the difference between a snapshot of a given filesystem and another -snapshot of that filesystem from a later time or the current contents of the -filesystem. -The first column is a character indicating the type of change, the other columns -indicate pathname, new pathname -.Pq in case of rename , -change in link count, and optionally file type and/or change time. -The types of change are: -.Bd -literal -- The path has been removed -+ The path has been created -M The path has been modified -R The path has been renamed -.Ed -.Bl -tag -width "-F" -.It Fl F -Display an indication of the type of file, in a manner similar to the -.Fl -option of -.Xr ls 1 . -.Bd -literal -B Block device -C Character device -/ Directory -> Door -| Named pipe -@ Symbolic link -P Event port -= Socket -F Regular file -.Ed -.It Fl H -Give more parsable tab-separated output, without header lines and without -arrows. 
-.It Fl t -Display the path's inode change time as the first column of output. -.El -.It Xo -.Nm -.Cm program -.Op Fl jn -.Op Fl t Ar instruction-limit -.Op Fl m Ar memory-limit -.Ar pool script -.Op Ar arg1 No ... -.Xc -Executes -.Ar script -as a ZFS channel program on -.Ar pool . -The ZFS channel -program interface allows ZFS administrative operations to be run -programmatically via a Lua script. -The entire script is executed atomically, with no other administrative -operations taking effect concurrently. -A library of ZFS calls is made available to channel program scripts. -Channel programs may only be run with root privileges. -.sp -For full documentation of the ZFS channel program interface, see the manual -page for -.Xr zfs-program 8 . -.Bl -tag -width "" -.It Fl j -Display channel program output in JSON format. When this flag is specified and -standard output is empty - channel program encountered an error. The details of -such an error will be printed to standard error in plain text. -.It Fl n -Executes a read-only channel program, which runs faster. -The program cannot change on-disk state by calling functions from -the zfs.sync submodule. -The program can be used to gather information such as properties and -determining if changes would succeed (zfs.check.*). -Without this flag, all pending changes must be synced to disk before -a channel program can complete. -.It Fl t Ar instruction-limit -Limit the number of Lua instructions to execute. -If a channel program executes more than the specified number of instructions, -it will be stopped and an error will be returned. -The default limit is 10 million instructions, and it can be set to a maximum of -100 million instructions. -.It Fl m Ar memory-limit -Memory limit, in bytes. -If a channel program attempts to allocate more memory than the given limit, -it will be stopped and an error returned. -The default memory limit is 10 MB, and can be set to a maximum of 100 MB. 
-.sp -All remaining argument strings are passed directly to the channel program as -arguments. -See -.Xr zfs-program 8 -for more information. -.El -.It Xo -.Nm -.Cm load-key -.Op Fl nr -.Op Fl L Ar keylocation -.Fl a | Ar filesystem -.Xc -Load the key for -.Ar filesystem , -allowing it and all children that inherit the -.Sy keylocation -property to be accessed. The key will be expected in the format specified by the -.Sy keyformat -and location specified by the -.Sy keylocation -property. Note that if the -.Sy keylocation -is set to -.Sy prompt -the terminal will interactively wait for the key to be entered. Loading a key -will not automatically mount the dataset. If that functionality is desired, -.Nm zfs Cm mount Sy -l -will ask for the key and mount the dataset. Once the key is loaded the -.Sy keystatus -property will become -.Sy available . -.Bl -tag -width "-r" -.It Fl r -Recursively loads the keys for the specified filesystem and all descendent -encryption roots. -.It Fl a -Loads the keys for all encryption roots in all imported pools. -.It Fl n -Do a dry-run -.Pq Qq No-op -load-key. This will cause zfs to simply check that the -provided key is correct. This command may be run even if the key is already -loaded. -.It Fl L Ar keylocation -Use -.Ar keylocation -instead of the -.Sy keylocation -property. This will not change the value of the property on the dataset. Note -that if used with either -.Fl r -or -.Fl a , -.Ar keylocation -may only be given as -.Sy prompt . -.El -.It Xo -.Nm -.Cm unload-key -.Op Fl r -.Fl a | Ar filesystem -.Xc -Unloads a key from ZFS, removing the ability to access the dataset and all of -its children that inherit the -.Sy keylocation -property. This requires that the dataset is not currently open or mounted. Once -the key is unloaded the -.Sy keystatus -property will become -.Sy unavailable . -.Bl -tag -width "-r" -.It Fl r -Recursively unloads the keys for the specified filesystem and all descendent -encryption roots. 
-.It Fl a -Unloads the keys for all encryption roots in all imported pools. -.El -.It Xo -.Nm -.Cm change-key -.Op Fl l -.Op Fl o Ar keylocation Ns = Ns Ar value -.Op Fl o Ar keyformat Ns = Ns Ar value -.Op Fl o Ar pbkdf2iters Ns = Ns Ar value -.Ar filesystem -.Xc -.It Xo -.Nm -.Cm change-key -.Fl i -.Op Fl l -.Ar filesystem -.Xc -Allows a user to change the encryption key used to access a dataset. This -command requires that the existing key for the dataset is already loaded into -ZFS. This command may also be used to change the -.Sy keylocation , -.Sy keyformat , -and -.Sy pbkdf2iters -properties as needed. If the dataset was not previously an encryption root it -will become one. Alternatively, the -.Fl i -flag may be provided to cause an encryption root to inherit the parent's key -instead. -.Bl -tag -width "-r" -.It Fl l -Ensures the key is loaded before attempting to change the key. This is -effectively equivalent to -.Qq Nm zfs Cm load-key Ar filesystem ; Nm zfs Cm change-key Ar filesystem -.It Fl o Ar property Ns = Ns Ar value -Allows the user to set encryption key properties ( -.Sy keyformat , -.Sy keylocation , -and -.Sy pbkdf2iters -) while changing the key. This is the only way to alter -.Sy keyformat -and -.Sy pbkdf2iters -after the dataset has been created. -.It Fl i -Indicates that zfs should make -.Ar filesystem -inherit the key of its parent. Note that this command can only be run on an -encryption root that has an encrypted parent. -.El .It Xo .Nm .Cm version @@ -4648,95 +139,242 @@ Displays the software version of the .Nm userland utility and the zfs kernel module. .El +. +.Ss Dataset Management +.Bl -tag -width "" +.It Xr zfs-list 8 +Lists the property information for the given datasets in tabular form. +.It Xr zfs-create 8 +Creates a new ZFS file system or volume. +.It Xr zfs-destroy 8 +Destroys the given dataset(s), snapshot(s), or bookmark. +.It Xr zfs-rename 8 +Renames the given dataset (filesystem or snapshot). 
+.It Xr zfs-upgrade 8 +Manage upgrading the on-disk version of filesystems. +.El +. +.Ss Snapshots +.Bl -tag -width "" +.It Xr zfs-snapshot 8 +Creates snapshots with the given names. +.It Xr zfs-rollback 8 +Roll back the given dataset to a previous snapshot. +.It Xr zfs-hold 8 Ns / Ns Xr zfs-release 8 +Add or remove a hold reference to the specified snapshot or snapshots. +If a hold exists on a snapshot, attempts to destroy that snapshot by using the +.Nm zfs Cm destroy +command return +.Sy EBUSY . +.It Xr zfs-diff 8 +Display the difference between a snapshot of a given filesystem and another +snapshot of that filesystem from a later time or the current contents of the +filesystem. +.El +. +.Ss Clones +.Bl -tag -width "" +.It Xr zfs-clone 8 +Creates a clone of the given snapshot. +.It Xr zfs-promote 8 +Promotes a clone file system to no longer be dependent on its +.Qq origin +snapshot. +.El +. +.Ss Send & Receive +.Bl -tag -width "" +.It Xr zfs-send 8 +Generate a send stream, which may be of a filesystem, and may be incremental +from a bookmark. +.It Xr zfs-receive 8 +Creates a snapshot whose contents are as specified in the stream provided on +standard input. +If a full stream is received, then a new file system is created as well. +Streams are created using the +.Xr zfs-send 8 +subcommand, which by default creates a full stream. +.It Xr zfs-bookmark 8 +Creates a new bookmark of the given snapshot or bookmark. +Bookmarks mark the point in time when the snapshot was created, and can be used +as the incremental source for a +.Nm zfs Cm send +command. +.It Xr zfs-redact 8 +Generate a new redaction bookmark. +This feature can be used to allow clones of a filesystem to be made available on +a remote system, in the case where their parent need not (or needs to not) be +usable. +.El +. +.Ss Properties +.Bl -tag -width "" +.It Xr zfs-get 8 +Displays properties for the given datasets. 
+.It Xr zfs-set 8 +Sets the property or list of properties to the given value(s) for each dataset. +.It Xr zfs-inherit 8 +Clears the specified property, causing it to be inherited from an ancestor, +restored to default if no ancestor has the property set, or with the +.Fl S +option reverted to the received value if one exists. +.El +. +.Ss Quotas +.Bl -tag -width "" +.It Xr zfs-userspace 8 Ns / Ns Xr zfs-groupspace 8 Ns / Ns Xr zfs-projectspace 8 +Displays space consumed by, and quotas on, each user, group, or project +in the specified filesystem or snapshot. +.It Xr zfs-project 8 +List, set, or clear project ID and/or inherit flag on the file(s) or directories. +.El +. +.Ss Mountpoints +.Bl -tag -width "" +.It Xr zfs-mount 8 +Displays all ZFS file systems currently mounted, or mounts a ZFS file system +on a path described by its +.Sy mountpoint +property. +.It Xr zfs-unmount 8 +Unmounts currently mounted ZFS file systems. +.El +. +.Ss Shares +.Bl -tag -width "" +.It Xr zfs-share 8 +Shares available ZFS file systems. +.It Xr zfs-unshare 8 +Unshares currently shared ZFS file systems. +.El +. +.Ss Delegated Administration +.Bl -tag -width "" +.It Xr zfs-allow 8 +Delegate permissions on the specified filesystem or volume. +.It Xr zfs-unallow 8 +Remove delegated permissions on the specified filesystem or volume. +.El +. +.Ss Encryption +.Bl -tag -width "" +.It Xr zfs-change-key 8 +Add or change an encryption key on the specified dataset. +.It Xr zfs-load-key 8 +Load the key for the specified encrypted dataset, enabling access. +.It Xr zfs-unload-key 8 +Unload a key for the specified dataset, removing the ability to access the dataset. +.El +. +.Ss Channel Programs +.Bl -tag -width "" +.It Xr zfs-program 8 +Execute ZFS administrative operations +programmatically via a Lua script-language channel program. +.El +. +.Ss Jails +.Bl -tag -width "" +.It Xr zfs-jail 8 +Attaches a filesystem to a jail. +.It Xr zfs-unjail 8 +Detaches a filesystem from a jail. +.El +.
+.Ss Waiting +.Bl -tag -width "" +.It Xr zfs-wait 8 +Wait for background activity in a filesystem to complete. +.El +. .Sh EXIT STATUS The .Nm -utility exits 0 on success, 1 if an error occurs, and 2 if invalid command line -options were specified. +utility exits +.Sy 0 +on success, +.Sy 1 +if an error occurs, and +.Sy 2 +if invalid command line options were specified. +. .Sh EXAMPLES .Bl -tag -width "" -.It Sy Example 1 No Creating a ZFS File System Hierarchy +. +.It Sy Example 1 : No Creating a ZFS File System Hierarchy The following commands create a file system named -.Em pool/home +.Ar pool/home and a file system named -.Em pool/home/bob . +.Ar pool/home/bob . The mount point .Pa /export/home is set for the parent file system, and is automatically inherited by the child file system. -.Bd -literal -# zfs create pool/home -# zfs set mountpoint=/export/home pool/home -# zfs create pool/home/bob -.Ed -.It Sy Example 2 No Creating a ZFS Snapshot +.Dl # Nm zfs Cm create Ar pool/home +.Dl # Nm zfs Cm set Sy mountpoint Ns = Ns Ar /export/home pool/home +.Dl # Nm zfs Cm create Ar pool/home/bob +. +.It Sy Example 2 : No Creating a ZFS Snapshot The following command creates a snapshot named -.Sy yesterday . +.Ar yesterday . This snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of the -.Em pool/home/bob +.Ar pool/home/bob file system. -.Bd -literal -# zfs snapshot pool/home/bob@yesterday -.Ed -.It Sy Example 3 No Creating and Destroying Multiple Snapshots +.Dl # Nm zfs Cm snapshot Ar pool/home/bob Ns @ Ns Ar yesterday +. +.It Sy Example 3 : No Creating and Destroying Multiple Snapshots The following command creates snapshots named -.Sy yesterday -of -.Em pool/home +.Ar yesterday No of Ar pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of its file system. The second command destroys the newly created snapshots. 
-.Bd -literal -# zfs snapshot -r pool/home@yesterday -# zfs destroy -r pool/home@yesterday -.Ed -.It Sy Example 4 No Disabling and Enabling File System Compression +.Dl # Nm zfs Cm snapshot Fl r Ar pool/home Ns @ Ns Ar yesterday +.Dl # Nm zfs Cm destroy Fl r Ar pool/home Ns @ Ns Ar yesterday +. +.It Sy Example 4 : No Disabling and Enabling File System Compression The following command disables the .Sy compression property for all file systems under -.Em pool/home . +.Ar pool/home . The next command explicitly enables .Sy compression for -.Em pool/home/anne . -.Bd -literal -# zfs set compression=off pool/home -# zfs set compression=on pool/home/anne -.Ed -.It Sy Example 5 No Listing ZFS Datasets +.Ar pool/home/anne . +.Dl # Nm zfs Cm set Sy compression Ns = Ns Sy off Ar pool/home +.Dl # Nm zfs Cm set Sy compression Ns = Ns Sy on Ar pool/home/anne +. +.It Sy Example 5 : No Listing ZFS Datasets The following command lists all active file systems and volumes in the system. -Snapshots are displayed if the -.Sy listsnaps -property is -.Sy on . +Snapshots are displayed if +.Sy listsnaps Ns = Ns Sy on . The default is .Sy off . See -.Xr zpool 8 +.Xr zpoolprops 7 for more information on pool properties. -.Bd -literal -# zfs list +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm list NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /export/home pool/home/anne 18K 457G 18K /export/home/anne pool/home/bob 276K 457G 276K /export/home/bob .Ed -.It Sy Example 6 No Setting a Quota on a ZFS File System +. +.It Sy Example 6 : No Setting a Quota on a ZFS File System The following command sets a quota of 50 Gbytes for -.Em pool/home/bob . -.Bd -literal -# zfs set quota=50G pool/home/bob -.Ed -.It Sy Example 7 No Listing ZFS Properties +.Ar pool/home/bob : +.Dl # Nm zfs Cm set Sy quota Ns = Ns Ar 50G pool/home/bob +. +.It Sy Example 7 : No Listing ZFS Properties The following command lists all properties for -.Em pool/home/bob . 
-.Bd -literal -# zfs get all pool/home/bob +.Ar pool/home/bob : +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm get Sy all Ar pool/home/bob NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - @@ -4760,6 +398,7 @@ pool/home/bob readonly off default pool/home/bob zoned off default pool/home/bob snapdir hidden default pool/home/bob acltype off default +pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob xattr on default @@ -4781,63 +420,61 @@ pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - .Ed .Pp -The following command gets a single property value. -.Bd -literal -# zfs get -H -o value compression pool/home/bob +The following command gets a single property value: +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm get Fl H o Sy value compression Ar pool/home/bob on .Ed +.Pp The following command lists all properties with local settings for -.Em pool/home/bob . -.Bd -literal -# zfs get -r -s local -o name,property,value all pool/home/bob +.Ar pool/home/bob : +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm get Fl r s Sy local Fl o Sy name , Ns Sy property , Ns Sy value all Ar pool/home/bob NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .Ed -.It Sy Example 8 No Rolling Back a ZFS File System +. +.It Sy Example 8 : No Rolling Back a ZFS File System The following command reverts the contents of -.Em pool/home/anne +.Ar pool/home/anne to the snapshot named -.Sy yesterday , -deleting all intermediate snapshots. -.Bd -literal -# zfs rollback -r pool/home/anne@yesterday -.Ed -.It Sy Example 9 No Creating a ZFS Clone +.Ar yesterday , +deleting all intermediate snapshots: +.Dl # Nm zfs Cm rollback Fl r Ar pool/home/anne Ns @ Ns Ar yesterday +. 
+.It Sy Example 9 : No Creating a ZFS Clone The following command creates a writable file system whose initial contents are the same as -.Em pool/home/bob@yesterday . -.Bd -literal -# zfs clone pool/home/bob@yesterday pool/clone -.Ed -.It Sy Example 10 No Promoting a ZFS Clone +.Ar pool/home/bob@yesterday . +.Dl # Nm zfs Cm clone Ar pool/home/bob@yesterday pool/clone +. +.It Sy Example 10 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: -.Bd -literal -# zfs create pool/project/production +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data -# zfs snapshot pool/project/production@today -# zfs clone pool/project/production@today pool/project/beta +.No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today +.No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them -# zfs promote pool/project/beta -# zfs rename pool/project/production pool/project/legacy -# zfs rename pool/project/beta pool/project/production +.No # Nm zfs Cm promote Ar pool/project/beta +.No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy +.No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed -# zfs destroy pool/project/legacy +.No # Nm zfs Cm destroy Ar pool/project/legacy .Ed -.It Sy Example 11 No Inheriting ZFS Properties +. +.It Sy Example 11 : No Inheriting ZFS Properties The following command causes -.Em pool/home/bob -and -.Em pool/home/anne +.Ar pool/home/bob No and Ar pool/home/anne to inherit the .Sy checksum property from their parent. 
-.Bd -literal -# zfs inherit checksum pool/home/bob pool/home/anne -.Ed -.It Sy Example 12 No Remotely Replicating ZFS Data +.Dl # Nm zfs Cm inherit Sy checksum Ar pool/home/bob pool/home/anne +. +.It Sy Example 12 : No Remotely Replicating ZFS Data The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Em poolB/received/fs@a @@ -4849,147 +486,145 @@ must contain the file system .Em poolB/received , and must not initially contain .Em poolB/received/fs . -.Bd -literal -# zfs send pool/fs@a | \e - ssh host zfs receive poolB/received/fs@a -# zfs send -i a pool/fs@b | \e - ssh host zfs receive poolB/received/fs +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm send Ar pool/fs@a | +.No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs Ns @ Ns Ar a +.No # Nm zfs Cm send Fl i Ar a pool/fs@b | +.No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs .Ed -.It Sy Example 13 No Using the zfs receive -d Option +. +.It Sy Example 13 : No Using the Nm zfs Cm receive Fl d No Option The following command sends a full stream of -.Em poolA/fsA/fsB@snap +.Ar poolA/fsA/fsB@snap to a remote machine, receiving it into -.Em poolB/received/fsA/fsB@snap . +.Ar poolB/received/fsA/fsB@snap . The -.Em fsA/fsB@snap +.Ar fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. -.Em poolB +.Ar poolB must contain the file system -.Em poolB/received . +.Ar poolB/received . If -.Em poolB/received/fsA +.Ar poolB/received/fsA does not exist, it is created as an empty file system. -.Bd -literal -# zfs send poolA/fsA/fsB@snap | \e - ssh host zfs receive -d poolB/received +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm send Ar poolA/fsA/fsB@snap | +.No " " Nm ssh Ar host Nm zfs Cm receive Fl d Ar poolB/received .Ed -.It Sy Example 14 No Setting User Properties +. 
+.It Sy Example 14 : No Setting User Properties The following example sets the user-defined -.Sy com.example:department -property for a dataset. -.Bd -literal -# zfs set com.example:department=12345 tank/accounting -.Ed -.It Sy Example 15 No Performing a Rolling Snapshot +.Ar com.example : Ns Ar department +property for a dataset: +.Dl # Nm zfs Cm set Ar com.example : Ns Ar department Ns = Ns Ar 12345 tank/accounting +. +.It Sy Example 15 : No Performing a Rolling Snapshot The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: -.Bd -literal -# zfs destroy -r pool/users@7daysago -# zfs rename -r pool/users@6daysago @7daysago -# zfs rename -r pool/users@5daysago @6daysago -# zfs rename -r pool/users@4daysago @5daysago -# zfs rename -r pool/users@3daysago @4daysago -# zfs rename -r pool/users@2daysago @3daysago -# zfs rename -r pool/users@yesterday @2daysago -# zfs rename -r pool/users@today @yesterday -# zfs snapshot -r pool/users@today +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm destroy Fl r Ar pool/users@7daysago +.No # Nm zfs Cm rename Fl r Ar pool/users@6daysago No @ Ns Ar 7daysago +.No # Nm zfs Cm rename Fl r Ar pool/users@5daysago No @ Ns Ar 6daysago +.No # Nm zfs Cm rename Fl r Ar pool/users@4daysago No @ Ns Ar 5daysago +.No # Nm zfs Cm rename Fl r Ar pool/users@3daysago No @ Ns Ar 4daysago +.No # Nm zfs Cm rename Fl r Ar pool/users@2daysago No @ Ns Ar 3daysago +.No # Nm zfs Cm rename Fl r Ar pool/users@yesterday No @ Ns Ar 2daysago +.No # Nm zfs Cm rename Fl r Ar pool/users@today No @ Ns Ar yesterday +.No # Nm zfs Cm snapshot Fl r Ar pool/users Ns @ Ns Ar today .Ed -.It Sy Example 16 No Setting sharenfs Property Options on a ZFS File System +. 
+.It Sy Example 16 : No Setting sharenfs Property Options on a ZFS File System The following commands show how to set .Sy sharenfs -property options to enable -.Sy rw -access for a set of -.Sy IP -addresses and to enable root access for system -.Sy neo +property options to enable read-write +access for a set of IP addresses and to enable root access for system +.Qq neo on the -.Em tank/home -file system. -.Bd -literal -# zfs set sharenfs='rw=@123.123.0.0/16,root=neo' tank/home -.Ed +.Ar tank/home +file system: +.Dl # Nm zfs Cm set Sy sharenfs Ns = Ns ' Ns Ar rw Ns =@123.123.0.0/16:[::1],root= Ns Ar neo Ns ' tank/home .Pp -If you are using -.Sy DNS -for host name resolution, specify the fully qualified hostname. -.It Sy Example 17 No Delegating ZFS Administration Permissions on a ZFS Dataset +If you are using DNS for host name resolution, +specify the fully-qualified hostname. +. +.It Sy Example 17 : No Delegating ZFS Administration Permissions on a ZFS Dataset The following example shows how to set permissions so that user -.Sy cindys +.Ar cindys can create, destroy, mount, and take snapshots on -.Em tank/cindys . +.Ar tank/cindys . The permissions on -.Em tank/cindys +.Ar tank/cindys are also displayed. -.Bd -literal -# zfs allow cindys create,destroy,mount,snapshot tank/cindys -# zfs allow tank/cindys +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm allow Sy cindys create , Ns Sy destroy , Ns Sy mount , Ns Sy snapshot Ar tank/cindys +.No # Nm zfs Cm allow Ar tank/cindys ---- Permissions on tank/cindys -------------------------------------- Local+Descendent permissions: user cindys create,destroy,mount,snapshot .Ed .Pp Because the -.Em tank/cindys +.Ar tank/cindys mount point permission is set to 755 by default, user -.Sy cindys +.Ar cindys will be unable to mount file systems under -.Em tank/cindys . +.Ar tank/cindys . 
Add an ACE similar to the following syntax to provide mount point access: -.Bd -literal -# chmod A+user:cindys:add_subdirectory:allow /tank/cindys -.Ed -.It Sy Example 18 No Delegating Create Time Permissions on a ZFS Dataset +.Dl # Cm chmod No A+user: Ns Ar cindys Ns :add_subdirectory:allow Ar /tank/cindys +. +.It Sy Example 18 : No Delegating Create Time Permissions on a ZFS Dataset The following example shows how to grant anyone in the group -.Sy staff +.Ar staff to create file systems in -.Em tank/users . +.Ar tank/users . This syntax also allows staff members to destroy their own file systems, but not destroy anyone else's file system. The permissions on -.Em tank/users +.Ar tank/users are also displayed. -.Bd -literal -# zfs allow staff create,mount tank/users -# zfs allow -c destroy tank/users -# zfs allow tank/users +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm allow Ar staff Sy create , Ns Sy mount Ar tank/users +.No # Nm zfs Cm allow Fl c Sy destroy Ar tank/users +.No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: destroy Local+Descendent permissions: group staff create,mount .Ed -.It Sy Example 19 No Defining and Granting a Permission Set on a ZFS Dataset +. +.It Sy Example 19 : No Defining and Granting a Permission Set on a ZFS Dataset The following example shows how to define and grant a permission set on the -.Em tank/users +.Ar tank/users file system. The permissions on -.Em tank/users +.Ar tank/users are also displayed. 
-.Bd -literal -# zfs allow -s @pset create,destroy,snapshot,mount tank/users -# zfs allow staff @pset tank/users -# zfs allow tank/users +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm allow Fl s No @ Ns Ar pset Sy create , Ns Sy destroy , Ns Sy snapshot , Ns Sy mount Ar tank/users +.No # Nm zfs Cm allow staff No @ Ns Ar pset tank/users +.No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed -.It Sy Example 20 No Delegating Property Permissions on a ZFS Dataset +. +.It Sy Example 20 : No Delegating Property Permissions on a ZFS Dataset The following example shows to grant the ability to set quotas and reservations on the -.Em users/home +.Ar users/home file system. The permissions on -.Em users/home +.Ar users/home are also displayed. -.Bd -literal -# zfs allow cindys quota,reservation users/home -# zfs allow users/home +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm allow Ar cindys Sy quota , Ns Sy reservation Ar users/home +.No # Nm zfs Cm allow Ar users/home ---- Permissions on users/home --------------------------------------- Local+Descendent permissions: user cindys quota,reservation @@ -4998,32 +633,34 @@ cindys% zfs get quota users/home/marks NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .Ed -.It Sy Example 21 No Removing ZFS Delegated Permissions on a ZFS Dataset +. +.It Sy Example 21 : No Removing ZFS Delegated Permissions on a ZFS Dataset The following example shows how to remove the snapshot permission from the -.Sy staff +.Ar staff group on the -.Em tank/users +.Sy tank/users file system. The permissions on -.Em tank/users +.Sy tank/users are also displayed. 
-.Bd -literal -# zfs unallow staff snapshot tank/users -# zfs allow tank/users +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm unallow Ar staff Sy snapshot Ar tank/users +.No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed -.It Sy Example 22 No Showing the differences between a snapshot and a ZFS Dataset +. +.It Sy Example 22 : No Showing the differences between a snapshot and a ZFS Dataset The following example shows how to see what has changed between a prior snapshot of a ZFS dataset and its current state. The .Fl F option is used to indicate type information for the files affected. -.Bd -literal -# zfs diff -F tank/test@before tank/test +.Bd -literal -compact -offset Ds +.No # Nm zfs Cm diff Fl F Ar tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname @@ -5031,46 +668,55 @@ R F /tank/test/oldname -> /tank/test/newname + F /tank/test/created M F /tank/test/modified .Ed -.It Sy Example 23 No Creating a bookmark -The following example create a bookmark to a snapshot. This bookmark -can then be used instead of snapshot in send streams. -.Bd -literal -# zfs bookmark rpool@snapshot rpool#bookmark -.Ed -.It Sy Example 24 No Setting sharesmb Property Options on a ZFS File System -The following example show how to share SMB filesystem through ZFS. Note that -that a user and his/her password must be given. -.Bd -literal -# smbmount //127.0.0.1/share_tmp /mnt/tmp \\ - -o user=workgroup/turbo,password=obrut,uid=1000 -.Ed +. +.It Sy Example 23 : No Creating a bookmark +The following example creates a bookmark to a snapshot. +This bookmark can then be used instead of a snapshot in send streams. +.Dl # Nm zfs Cm bookmark Ar rpool Ns @ Ns Ar snapshot rpool Ns # Ns Ar bookmark +.
+.It Sy Example 24 : No Setting Sy sharesmb No Property Options on a ZFS File System +The following example shows how to share an SMB filesystem through ZFS. +Note that a user and their password must be given. +.Dl # Nm smbmount Ar //127.0.0.1/share_tmp /mnt/tmp Fl o No user=workgroup/turbo,password=obrut,uid=1000 .Pp Minimal -.Em /etc/samba/smb.conf -configuration required: +.Pa /etc/samba/smb.conf +configuration is required, as follows. .Pp -Samba will need to listen to 'localhost' (127.0.0.1) for the ZFS utilities to -communicate with Samba. This is the default behavior for most Linux -distributions. +Samba will need to bind to the loopback interface for the ZFS utilities to +communicate with Samba. +This is the default behavior for most Linux distributions. .Pp -Samba must be able to authenticate a user. This can be done in a number of -ways, depending on if using the system password file, LDAP or the Samba -specific smbpasswd file. How to do this is outside the scope of this manual. -Please refer to the +Samba must be able to authenticate a user. +This can be done in a number of ways +.Pq Xr passwd 5 , LDAP , Xr smbpasswd 5 , &c.\& . +How to do this is outside the scope of this document – refer to .Xr smb.conf 5 -man page for more information. +for more information. .Pp See the -.Sy USERSHARE section -of the -.Xr smb.conf 5 -man page for all configuration options in case you need to modify any options -to the share afterwards. Do note that any changes done with the +.Sx USERSHARES +section for all configuration options, +in case you need to modify any options of the share afterwards. +Do note that any changes done with the .Xr net 8 -command will be undone if the share is ever unshared (such as at a reboot etc). +command will be undone if the share is ever unshared (like via a reboot). .El +. +.Sh ENVIRONMENT VARIABLES +.Bl -tag -width "ZFS_MOUNT_HELPER" +.It Sy ZFS_MOUNT_HELPER +Cause +.Nm zfs Cm mount +to use +.Xr mount 8 +to mount ZFS datasets.
+This option is provided for backwards compatibility with older ZFS versions. +.El +. .Sh INTERFACE STABILITY .Sy Committed . +. .Sh SEE ALSO .Xr attr 1 , .Xr gzip 1 , @@ -5082,9 +728,46 @@ command will be undone if the share is ever unshared (such as at a reboot etc). .Xr acl 5 , .Xr attributes 5 , .Xr exports 5 , +.Xr zfsconcepts 7 , +.Xr zfsprops 7 , .Xr exportfs 8 , .Xr mount 8 , .Xr net 8 , .Xr selinux 8 , +.Xr zfs-allow 8 , +.Xr zfs-bookmark 8 , +.Xr zfs-change-key 8 , +.Xr zfs-clone 8 , +.Xr zfs-create 8 , +.Xr zfs-destroy 8 , +.Xr zfs-diff 8 , +.Xr zfs-get 8 , +.Xr zfs-groupspace 8 , +.Xr zfs-hold 8 , +.Xr zfs-inherit 8 , +.Xr zfs-jail 8 , +.Xr zfs-list 8 , +.Xr zfs-load-key 8 , +.Xr zfs-mount 8 , .Xr zfs-program 8 , +.Xr zfs-project 8 , +.Xr zfs-projectspace 8 , +.Xr zfs-promote 8 , +.Xr zfs-receive 8 , +.Xr zfs-redact 8 , +.Xr zfs-release 8 , +.Xr zfs-rename 8 , +.Xr zfs-rollback 8 , +.Xr zfs-send 8 , +.Xr zfs-set 8 , +.Xr zfs-share 8 , +.Xr zfs-snapshot 8 , +.Xr zfs-unallow 8 , +.Xr zfs-unjail 8 , +.Xr zfs-unload-key 8 , +.Xr zfs-unmount 8 , +.Xr zfs-unshare 8 , +.Xr zfs-upgrade 8 , +.Xr zfs-userspace 8 , +.Xr zfs-wait 8 , .Xr zpool 8 diff --git a/man/man8/zfs_ids_to_path.8 b/man/man8/zfs_ids_to_path.8 new file mode 100644 index 0000000000..d5b74678b2 --- /dev/null +++ b/man/man8/zfs_ids_to_path.8 @@ -0,0 +1,51 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2020 by Delphix. All rights reserved. +.\" +.Dd April 17, 2020 +.Dt ZFS_IDS_TO_PATH 8 +.Os +. +.Sh NAME +.Nm zfs_ids_to_path +.Nd convert objset and object ids to names and paths +.Sh SYNOPSIS +.Nm +.Op Fl v +.Ar pool +.Ar objset-id +.Ar object-id +. +.Sh DESCRIPTION +The +.Sy zfs_ids_to_path +utility converts a provided objset and object ids +into a path to the file they refer to. +.Bl -tag -width "-D" +.It Fl v +Verbose. +Print the dataset name and the file path within the dataset separately. +This will work correctly even if the dataset is not mounted. +.El +. +.Sh SEE ALSO +.Xr zdb 8 , +.Xr zfs 8 diff --git a/man/man8/zgenhostid.8 b/man/man8/zgenhostid.8 index 607efe17f8..0dcebef73c 100644 --- a/man/man8/zgenhostid.8 +++ b/man/man8/zgenhostid.8 @@ -18,54 +18,83 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright (c) 2017 by Lawrence Livermore National Security, LLC. .\" -.Dd September 16, 2017 -.Dt ZGENHOSTID 8 SMM -.Os Linux +.Dd May 26, 2021 +.Dt ZGENHOSTID 8 +.Os +. .Sh NAME .Nm zgenhostid -.Nd generate and store a hostid in -.Em /etc/hostid +.Nd generate host ID into /etc/hostid .Sh SYNOPSIS .Nm +.Op Fl f +.Op Fl o Ar filename .Op Ar hostid +. .Sh DESCRIPTION +Creates +.Pa /etc/hostid +file and stores the host ID in it. If -.Em /etc/hostid -does not exist, create it and store a hostid in it. If the user provides -.Op Ar hostid -on the command line, store that value. Otherwise, randomly generate a -value to store. -.Pp -This emulates the -.Xr genhostid 1 -utility and is provided for use on systems which do not include the utility. +.Ar hostid +was provided, validate and store that value. +Otherwise, randomly generate an ID. +. 
.Sh OPTIONS -.Op Ar hostid +.Bl -tag -width "-o filename" +.It Fl h +Display a summary of the command-line options. +.It Fl f +Allow output overwrite. +.It Fl o Ar filename +Write to +.Pa filename +instead of the default +.Pa /etc/hostid . +.It Ar hostid Specifies the value to be placed in -.Em /etc/hostid . -It must be a number with a value between 1 and 2^32-1. This value -.Sy must -be unique among your systems. It must be expressed in hexadecimal and be -exactly 8 digits long. -.Sh EXAMPLES -.Bl -tag -width Ds -.It Generate a random hostid and store it -.Bd -literal -# zgenhostid -.Ed -.It Record the libc-generated hostid in Em /etc/hostid -.Bd -literal -# zgenhostid $(hostid) -.Ed -.It Record a custom hostid (0xdeadbeef) in Em etc/hostid -.Bd -literal -# zgenhostid deadbeef -.Ed +.Pa /etc/hostid . +It should be a number with a value between 1 and 2^32-1. +If +.Sy 0 , +generate a random ID. +This value +.Em must +be unique among your systems. +It +.Em must +be an 8-digit-long hexadecimal number, optionally prefixed by +.Qq 0x . .El +. +.Sh FILES +.Pa /etc/hostid +. +.Sh EXAMPLES +.Bl -tag -width Bd +.It Generate a random hostid and store it +.Dl # Nm +.It Record the libc-generated hostid in Pa /etc/hostid +.Dl # Nm Qq $ Ns Pq Nm hostid +.It Record a custom hostid Po Ar 0xdeadbeef Pc in Pa /etc/hostid +.Dl # Nm Ar deadbeef +.It Record a custom hostid Po Ar 0x01234567 Pc in Pa /tmp/hostid No and overwrite the file if it exists +.Dl # Nm Fl f o Ar /tmp/hostid 0x01234567 +.El +. .Sh SEE ALSO .Xr genhostid 1 , .Xr hostid 1 , -.Xr spl-module-parameters 5 +.Xr sethostid 3 , +.Xr spl 4 +. +.Sh HISTORY +.Nm +emulates the +.Xr genhostid 1 +utility and is provided for use on systems which +do not include the utility or do not provide the +.Xr sethostid 3 +function.
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index f02e78ca20..a293469299 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -1,4 +1,3 @@ -'\" t .\" .\" CDDL HEADER START .\" @@ -19,180 +18,279 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH zinject 8 "2013 FEB 28" "ZFS on Linux" "System Administration Commands" - -.SH NAME -zinject \- ZFS Fault Injector -.SH DESCRIPTION -.BR zinject -creates artificial problems in a ZFS pool by simulating data corruption or device failures. This program is dangerous. -.SH SYNOPSIS -.TP -.B "zinject" +.\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS +.\" +.Dd May 26, 2021 +.Dt ZINJECT 8 +.Os +. +.Sh NAME +.Nm zinject +.Nd ZFS Fault Injector +.Sh DESCRIPTION +.Nm +creates artificial problems in a ZFS pool by simulating data corruption +or device failures. +This program is dangerous. +. +.Sh SYNOPSIS +.Bl -tag -width Ds +.It Xo +.Nm zinject +.Xc List injection records. -.TP -.B "zinject \-b \fIobjset:object:level:blkd\fB [\-f \fIfrequency\fB] [\-amu] \fIpool\fB" +. +.It Xo +.Nm zinject +.Fl b Ar objset : Ns Ar object : Ns Ar level : Ns Ar start : Ns Ar end +.Op Fl f Ar frequency +.Fl amu +.Op pool +.Xc Force an error into the pool at a bookmark. -.TP -.B "zinject \-c <\fIid\fB | all> +. +.It Xo +.Nm zinject +.Fl c Ar id Ns | Ns Sy all +.Xc Cancel injection records. -.TP -.B "zinject \-d \fIvdev\fB \-A \fIpool\fB +. +.It Xo +.Nm zinject +.Fl d Ar vdev +.Fl A Sy degrade Ns | Ns Sy fault +.Ar pool +.Xc Force a vdev into the DEGRADED or FAULTED state. -.TP -.B "zinject -d \fIvdev\fB -D latency:lanes \fIpool\fB - +. +.It Xo +.Nm zinject +.Fl d Ar vdev +.Fl D Ar latency : Ns Ar lanes +.Ar pool +.Xc Add an artificial delay to IO requests on a particular -device, such that the requests take a minimum of 'latency' -milliseconds to complete. 
Each delay has an associated -number of 'lanes' which defines the number of concurrent +device, such that the requests take a minimum of +.Ar latency +milliseconds to complete. +Each delay has an associated number of +.Ar lanes +which defines the number of concurrent IO requests that can be processed. - -For example, with a single lane delay of 10 ms (-D 10:1), +.Pp +For example, with a single lane delay of 10 ms +.No (\& Ns Fl D Ar 10 : Ns Ar 1 ) , the device will only be able to service a single IO request -at a time with each request taking 10 ms to complete. So, -if only a single request is submitted every 10 ms, the +at a time with each request taking 10 ms to complete. +So, if only a single request is submitted every 10 ms, the average latency will be 10 ms; but if more than one request is submitted every 10 ms, the average latency will be more than 10 ms. - +.Pp Similarly, if a delay of 10 ms is specified to have two -lanes (-D 10:2), then the device will be able to service -two requests at a time, each with a minimum latency of -10 ms. So, if two requests are submitted every 10 ms, then +lanes +.No (\& Ns Fl D Ar 10 : Ns Ar 2 ) , +then the device will be able to service +two requests at a time, each with a minimum latency of 10 ms. +So, if two requests are submitted every 10 ms, then the average latency will be 10 ms; but if more than two requests are submitted every 10 ms, the average latency will be more than 10 ms. - -Also note, these delays are additive. So two invocations -of '-D 10:1', is roughly equivalent to a single invocation -of '-D 10:2'. This also means, one can specify multiple -lanes with differing target latencies. For example, an -invocation of '-D 10:1' followed by '-D 25:2' will -create 3 lanes on the device; one lane with a latency +.Pp +Also note, these delays are additive. +So two invocations of +.Fl D Ar 10 : Ns Ar 1 +are roughly equivalent to a single invocation of +.Fl D Ar 10 : Ns Ar 2 . 
+This also means, that one can specify multiple +lanes with differing target latencies. +For example, an invocation of +.Fl D Ar 10 : Ns Ar 1 +followed by +.Fl D Ar 25 : Ns Ar 2 +will create 3 lanes on the device: one lane with a latency of 10 ms and two lanes with a 25 ms latency. - -.TP -.B "zinject \-d \fIvdev\fB [\-e \fIdevice_error\fB] [\-L \fIlabel_error\fB] [\-T \fIfailure\fB] [\-f \fIfrequency\fB] [\-F] \fIpool\fB" +. +.It Xo +.Nm zinject +.Fl d Ar vdev +.Op Fl e Ar device_error +.Op Fl L Ar label_error +.Op Fl T Ar failure +.Op Fl f Ar frequency +.Op Fl F +.Ar pool +.Xc Force a vdev error. -.TP -.B "zinject \-I [\-s \fIseconds\fB | \-g \fItxgs\fB] \fIpool\fB" +. +.It Xo +.Nm zinject +.Fl I +.Op Fl s Ar seconds Ns | Ns Fl g Ar txgs +.Ar pool +.Xc Simulate a hardware failure that fails to honor a cache flush. -.TP -.B "zinject \-p \fIfunction\fB \fIpool\fB +. +.It Xo +.Nm zinject +.Fl p Ar function +.Ar pool +.Xc Panic inside the specified function. -.TP -.B "zinject \-t data [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB" +. +.It Xo +.Nm zinject +.Fl t Sy data +.Fl C Ar dvas +.Op Fl e Ar device_error +.Op Fl f Ar frequency +.Op Fl l Ar level +.Op Fl r Ar range +.Op Fl amq +.Ar path +.Xc Force an error into the contents of a file. -.TP -.B "zinject \-t dnode [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB" +. +.It Xo +.Nm zinject +.Fl t Sy dnode +.Fl C Ar dvas +.Op Fl e Ar device_error +.Op Fl f Ar frequency +.Op Fl l Ar level +.Op Fl amq +.Ar path +.Xc Force an error into the metadnode for a file or directory. -.TP -.B "zinject \-t \fImos_type\fB [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB" +. 
+.It Xo +.Nm zinject +.Fl t Ar mos_type +.Fl C Ar dvas +.Op Fl e Ar device_error +.Op Fl f Ar frequency +.Op Fl l Ar level +.Op Fl r Ar range +.Op Fl amqu +.Ar pool +.Xc Force an error into the MOS of a pool. -.SH OPTIONS -.TP -.BI "\-a" +.El +.Sh OPTIONS +.Bl -tag -width "-C dvas" +.It Fl a Flush the ARC before injection. -.TP -.BI "\-b" " objset:object:level:start:end" -Force an error into the pool at this bookmark tuple. Each number is -in hexadecimal, and only one block can be specified. -.TP -.BI "\-C" " dvas" -Inject the given error only into specific DVAs. The mask should be -specified as a list of 0-indexed DVAs separated by commas (ex. '0,2'). This -option is not applicable to logical data errors such as -.BR "decompress" +.It Fl b Ar objset : Ns Ar object : Ns Ar level : Ns Ar start : Ns Ar end +Force an error into the pool at this bookmark tuple. +Each number is in hexadecimal, and only one block can be specified. +.It Fl C Ar dvas +Inject the given error only into specific DVAs. +The mask should be specified as a list of 0-indexed DVAs separated by commas +.No (ex. Ar 0,2 Ns No ). +This option is not applicable to logical data errors such as +.Sy decompress and -.BR "decrypt" . -.TP -.BI "\-d" " vdev" +.Sy decrypt . +.It Fl d Ar vdev A vdev specified by path or GUID. -.TP -.BI "\-e" " device_error" +.It Fl e Ar device_error Specify -.BR "checksum" " for an ECKSUM error," -.BR "decompress" " for a data decompression error," -.BR "decrypt" " for a data decryption error," -.BR "corrupt" " to flip a bit in the data after a read," -.BR "dtl" " for an ECHILD error," -.BR "io" " for an EIO error where reopening the device will succeed, or" -.BR "nxio" " for an ENXIO error where reopening the device will fail." -For EIO and ENXIO, the "failed" reads or writes still occur. The probe simply -sets the error value reported by the I/O pipeline so it appears the read or -write failed. Decryption errors only currently work with file data. 
-.TP -.BI "\-f" " frequency" -Only inject errors a fraction of the time. Expressed as a real number -percentage between 0.0001 and 100. -.TP -.BI "\-F" -Fail faster. Do fewer checks. -.TP -.BI "\-g" " txgs" +.Bl -tag -compact -width "decompress" +.It Sy checksum +for an ECKSUM error, +.It Sy decompress +for a data decompression error, +.It Sy decrypt +for a data decryption error, +.It Sy corrupt +to flip a bit in the data after a read, +.It Sy dtl +for an ECHILD error, +.It Sy io +for an EIO error where reopening the device will succeed, or +.It Sy nxio +for an ENXIO error where reopening the device will fail. +.El +.Pp +For EIO and ENXIO, the "failed" reads or writes still occur. +The probe simply sets the error value reported by the I/O pipeline +so it appears the read or write failed. +Decryption errors only currently work with file data. +.It Fl f Ar frequency +Only inject errors a fraction of the time. +Expressed as a real number percentage between +.Sy 0.0001 +and +.Sy 100 . +.It Fl F +Fail faster. +Do fewer checks. +.It Fl g Ar txgs Run for this many transaction groups before reporting failure. -.TP -.BI "\-h" +.It Fl h Print the usage message. -.TP -.BI "\-l" " level" -Inject an error at a particular block level. The default is 0. -.TP -.BI "\-L" " label_error" +.It Fl l Ar level +Inject an error at a particular block level. +The default is +.Sy 0 . +.It Fl L Ar label_error Set the label error region to one of -.BR " nvlist" "," -.BR " pad1" "," -.BR " pad2" ", or" -.BR " uber" "." -.TP -.BI "\-m" +.Sy nvlist , +.Sy pad1 , +.Sy pad2 , +or +.Sy uber . +.It Fl m Automatically remount the underlying filesystem. -.TP -.BI "\-q" -Quiet mode. Only print the handler number added. -.TP -.BI "\-r" " range" +.It Fl q +Quiet mode. +Only print the handler number added. +.It Fl r Ar range Inject an error over a particular logical range of an object, which will be translated to the appropriate blkid range according to the object's properties.
-.TP -.BI "\-s" " seconds" +.It Fl s Ar seconds Run for this many seconds before reporting failure. -.TP -.BI "\-T" " failure" +.It Fl T Ar failure Set the failure type to one of -.BR " all" "," -.BR " claim" "," -.BR " free" "," -.BR " read" ", or" -.BR " write" "." -.TP -.BI "\-t" " mos_type" +.Sy all , +.Sy claim , +.Sy free , +.Sy read , +or +.Sy write . +.It Fl t Ar mos_type Set this to -.BR "mos " "for any data in the MOS," -.BR "mosdir " "for an object directory," -.BR "config " "for the pool configuration," -.BR "bpobj " "for the block pointer list," -.BR "spacemap " "for the space map," -.BR "metaslab " "for the metaslab, or" -.BR "errlog " "for the persistent error log." -.TP -.BI "\-u" +.Bl -tag -compact -width "spacemap" +.It Sy mos +for any data in the MOS, +.It Sy mosdir +for an object directory, +.It Sy config +for the pool configuration, +.It Sy bpobj +for the block pointer list, +.It Sy spacemap +for the space map, +.It Sy metaslab +for the metaslab, or +.It Sy errlog +for the persistent error log. +.El +.It Fl u Unload the pool after injection. - -.SH "ENVIRONMENT VARIABLES" -.TP -.B "ZINJECT_DEBUG" -Run \fBzinject\fR in debug mode. - -.SH "AUTHORS" -This man page was written by Darik Horn -excerpting the \fBzinject\fR usage message and source code. - -.SH "SEE ALSO" -.BR zpool (8), -.BR zfs (8) +.El +. +.Sh ENVIRONMENT VARIABLES +.Bl -tag -width "ZF" +.It Ev ZINJECT_DEBUG +Run +.Nm +in debug mode. +.El +. +.Sh SEE ALSO +.Xr zfs 8 , +.Xr zpool 8 diff --git a/man/man8/zpool-add.8 b/man/man8/zpool-add.8 new file mode 100644 index 0000000000..26cf33c553 --- /dev/null +++ b/man/man8/zpool-add.8 @@ -0,0 +1,101 @@ +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License.
+.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-ADD 8 +.Os +. +.Sh NAME +.Nm zpool-add +.Nd add vdevs to ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm add +.Op Fl fgLnP +.Oo Fl o Ar property Ns = Ns Ar value Oc +.Ar pool vdev Ns … +. +.Sh DESCRIPTION +Adds the specified virtual devices to the given pool. +The +.Ar vdev +specification is described in the +.Em Virtual Devices +section of +.Xr zpoolconcepts 7 . +The behavior of the +.Fl f +option, and the device checks performed are described in the +.Nm zpool Cm create +subcommand. +.Bl -tag -width Ds +.It Fl f +Forces use of +.Ar vdev Ns s , +even if they appear in use or specify a conflicting replication level. +Not all devices can be overridden in this manner. +.It Fl g +Display +.Ar vdev +GUIDs instead of the normal device names. +These GUIDs can be used in place of +device names for the zpool detach/offline/remove/replace commands. +.It Fl L +Display real paths for +.Ar vdev Ns s +resolving all symbolic links.
+This can be used to look up the current block +device name regardless of the +.Pa /dev/disk +path used to open it. +.It Fl n +Displays the configuration that would be used without actually adding the +.Ar vdev Ns s . +The actual pool creation can still fail due to insufficient privileges or +device sharing. +.It Fl P +Display real paths for +.Ar vdev Ns s +instead of only the last component of the path. +This can be used in conjunction with the +.Fl L +flag. +.It Fl o Ar property Ns = Ns Ar value +Sets the given pool properties. +See the +.Xr zpoolprops 7 +manual page for a list of valid properties that can be set. +The only property supported at the moment is +.Sy ashift . +.El +. +.Sh SEE ALSO +.Xr zpool-attach 8 , +.Xr zpool-import 8 , +.Xr zpool-initialize 8 , +.Xr zpool-online 8 , +.Xr zpool-remove 8 diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 new file mode 100644 index 0000000000..19d8f6ac07 --- /dev/null +++ b/man/man8/zpool-attach.8 @@ -0,0 +1,98 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 15, 2020 +.Dt ZPOOL-ATTACH 8 +.Os +. +.Sh NAME +.Nm zpool-attach +.Nd attach new device to existing ZFS vdev +.Sh SYNOPSIS +.Nm zpool +.Cm attach +.Op Fl fsw +.Oo Fl o Ar property Ns = Ns Ar value Oc +.Ar pool device new_device +. +.Sh DESCRIPTION +Attaches +.Ar new_device +to the existing +.Ar device . +The existing device cannot be part of a raidz configuration. +If +.Ar device +is not currently part of a mirrored configuration, +.Ar device +automatically transforms into a two-way mirror of +.Ar device +and +.Ar new_device . +If +.Ar device +is part of a two-way mirror, attaching +.Ar new_device +creates a three-way mirror, and so on. +In either case, +.Ar new_device +begins to resilver immediately and any running scrub is cancelled. +.Bl -tag -width Ds +.It Fl f +Forces use of +.Ar new_device , +even if it appears to be in use. +Not all devices can be overridden in this manner. +.It Fl o Ar property Ns = Ns Ar value +Sets the given pool properties. +See the +.Xr zpoolprops 7 +manual page for a list of valid properties that can be set. +The only property supported at the moment is +.Sy ashift . +.It Fl s +The +.Ar new_device +is reconstructed sequentially to restore redundancy as quickly as possible. +Checksums are not verified during sequential reconstruction so a scrub is +started when the resilver completes. +Sequential reconstruction is not supported for raidz configurations. +.It Fl w +Waits until +.Ar new_device +has finished resilvering before returning. +.El +.
+.Sh SEE ALSO +.Xr zpool-add 8 , +.Xr zpool-detach 8 , +.Xr zpool-import 8 , +.Xr zpool-initialize 8 , +.Xr zpool-online 8 , +.Xr zpool-replace 8 , +.Xr zpool-resilver 8 diff --git a/man/man8/zpool-checkpoint.8 b/man/man8/zpool-checkpoint.8 new file mode 100644 index 0000000000..d5add14aed --- /dev/null +++ b/man/man8/zpool-checkpoint.8 @@ -0,0 +1,72 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-CHECKPOINT 8 +.Os +. +.Sh NAME +.Nm zpool-checkpoint +.Nd check-point current ZFS storage pool state +.Sh SYNOPSIS +.Nm zpool +.Cm checkpoint +.Op Fl d Op Fl w +.Ar pool +. +.Sh DESCRIPTION +Checkpoints the current state of +.Ar pool , +which can be later restored by +.Nm zpool Cm import --rewind-to-checkpoint .
+The existence of a checkpoint in a pool prohibits the following +.Nm zpool +subcommands: +.Cm remove , attach , detach , split , No and Cm reguid . +In addition, it may break reservation boundaries if the pool lacks free +space. +The +.Nm zpool Cm status +command indicates the existence of a checkpoint or the progress of discarding a +checkpoint from a pool. +.Nm zpool Cm list +can be used to check how much space the checkpoint takes from the pool. +. +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl d , -discard +Discards an existing checkpoint from +.Ar pool . +.It Fl w , -wait +Waits until the checkpoint has finished being discarded before returning. +.El +. +.Sh SEE ALSO +.Xr zfs-snapshot 8 , +.Xr zpool-import 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 new file mode 100644 index 0000000000..6e41566ca6 --- /dev/null +++ b/man/man8/zpool-clear.8 @@ -0,0 +1,56 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. 
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-CLEAR 8 +.Os +. +.Sh NAME +.Nm zpool-clear +.Nd clear device errors in ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm clear +.Ar pool +.Oo Ar device Oc Ns … +. +.Sh DESCRIPTION +Clears device errors in a pool. +If no arguments are specified, all device errors within the pool are cleared. +If one or more devices is specified, only those errors associated with the +specified device or devices are cleared. +If +.Sy multihost +is enabled and the pool has been suspended, this will not resume I/O. +While the pool was suspended, it may have been imported on +another host, and resuming I/O could result in pool damage. +. +.Sh SEE ALSO +.Xr zdb 8 , +.Xr zpool-reopen 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8 new file mode 100644 index 0000000000..e902c77007 --- /dev/null +++ b/man/man8/zpool-create.8 @@ -0,0 +1,211 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. 
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2021, Colm Buckley +.\" +.Dd June 2, 2021 +.Dt ZPOOL-CREATE 8 +.Os +. +.Sh NAME +.Nm zpool-create +.Nd create ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm create +.Op Fl dfn +.Op Fl m Ar mountpoint +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Oo Fl o Sy feature@ Ns Ar feature Ns = Ns Ar value Oc +.Op Fl o Ar compatibility Ns = Ns Sy off Ns | Ns Sy legacy Ns | Ns Ar file Ns Oo , Ns Ar file Oc Ns … +.Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns … +.Op Fl R Ar root +.Op Fl t Ar tname +.Ar pool +.Ar vdev Ns … +. +.Sh DESCRIPTION +Creates a new storage pool containing the virtual devices specified on the +command line. +The pool name must begin with a letter, and can only contain +alphanumeric characters as well as the underscore +.Pq Qq Sy _ , +dash +.Pq Qq Sy \&- , +colon +.Pq Qq Sy \&: , +space +.Pq Qq Sy \&\ , +and period +.Pq Qq Sy \&. . +The pool names +.Sy mirror , +.Sy raidz , +.Sy draid , +.Sy spare +and +.Sy log +are reserved, as are names beginning with +.Sy mirror , +.Sy raidz , +.Sy draid , +and +.Sy spare . +The +.Ar vdev +specification is described in the +.Sx Virtual Devices +section of +.Xr zpoolconcepts 7 . +.Pp +The command attempts to verify that each device specified is accessible and not +currently in use by another subsystem. +However this check is not robust enough +to detect simultaneous attempts to use a new device in different pools, even if +.Sy multihost Ns = Sy enabled . +The administrator must ensure that simultaneous invocations of any combination of +.Nm zpool Cm replace , +.Nm zpool Cm create , +.Nm zpool Cm add , +or +.Nm zpool Cm labelclear , +do not refer to the same device.
+Using the same device in two pools will result in pool corruption. +.Pp +There are some uses, such as being currently mounted, or specified as the +dedicated dump device, that prevent a device from ever being used by ZFS. +Other uses, such as having a preexisting UFS file system, can be overridden with +.Fl f . +.Pp +The command also checks that the replication strategy for the pool is +consistent. +An attempt to combine redundant and non-redundant storage in a single pool, +or to mix disks and files, results in an error unless +.Fl f +is specified. +The use of differently-sized devices within a single raidz or mirror group is +also flagged as an error unless +.Fl f +is specified. +.Pp +Unless the +.Fl R +option is specified, the default mount point is +.Pa / Ns Ar pool . +The mount point must not exist or must be empty, or else the root dataset +will not be able to be mounted. +This can be overridden with the +.Fl m +option. +.Pp +By default all supported features are enabled on the new pool. +The +.Fl d +option and the +.Fl o Ar compatibility +property +.Pq e.g Fl o Sy compatibility Ns = Ns Ar 2020 +can be used to restrict the features that are enabled, so that the +pool can be imported on other releases of ZFS. +.Bl -tag -width "-t tname" +.It Fl d +Do not enable any features on the new pool. +Individual features can be enabled by setting their corresponding properties to +.Sy enabled +with +.Fl o . +See +.Xr zpool-features 7 +for details about feature properties. +.It Fl f +Forces use of +.Ar vdev Ns s , +even if they appear in use or specify a conflicting replication level. +Not all devices can be overridden in this manner. +.It Fl m Ar mountpoint +Sets the mount point for the root dataset. +The default mount point is +.Pa /pool +or +.Pa altroot/pool +if +.Sy altroot +is specified. +The mount point must be an absolute path, +.Sy legacy , +or +.Sy none . +For more information on dataset mount points, see +.Xr zfsprops 7 .
+.It Fl n +Displays the configuration that would be used without actually creating the +pool. +The actual pool creation can still fail due to insufficient privileges or +device sharing. +.It Fl o Ar property Ns = Ns Ar value +Sets the given pool properties. +See +.Xr zpoolprops 7 +for a list of valid properties that can be set. +.It Fl o Ar compatibility Ns = Ns Sy off Ns | Ns Sy legacy Ns | Ns Ar file Ns Oo , Ns Ar file Oc Ns … +Specifies compatibility feature sets. +See +.Xr zpool-features 7 +for more information about compatibility feature sets. +.It Fl o Sy feature@ Ns Ar feature Ns = Ns Ar value +Sets the given pool feature. +See the +.Xr zpool-features 7 +section for a list of valid features that can be set. +Value can be either disabled or enabled. +.It Fl O Ar file-system-property Ns = Ns Ar value +Sets the given file system properties in the root file system of the pool. +See +.Xr zfsprops 7 +for a list of valid properties that can be set. +.It Fl R Ar root +Equivalent to +.Fl o Sy cachefile Ns = Ns Sy none Fl o Sy altroot Ns = Ns Ar root +.It Fl t Ar tname +Sets the in-core pool name to +.Ar tname +while the on-disk name will be the name specified as +.Ar pool . +This will set the default of the +.Sy cachefile +property to +.Sy none . +This is intended +to handle name space collisions when creating pools for other systems, +such as virtual machines or physical machines whose pools live on network +block devices. +.El +. +.Sh SEE ALSO +.Xr zpool-destroy 8 , +.Xr zpool-export 8 , +.Xr zpool-import 8 diff --git a/man/man8/zpool-destroy.8 b/man/man8/zpool-destroy.8 new file mode 100644 index 0000000000..a2f6729c8a --- /dev/null +++ b/man/man8/zpool-destroy.8 @@ -0,0 +1,48 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. 
+.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 31, 2021 +.Dt ZPOOL-DESTROY 8 +.Os +. +.Sh NAME +.Nm zpool-destroy +.Nd destroy ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm destroy +.Op Fl f +.Ar pool +. +.Sh DESCRIPTION +Destroys the given pool, freeing up any devices for other use. +This command tries to unmount any active datasets before destroying the pool. +.Bl -tag -width Ds +.It Fl f +Forcefully unmount all active datasets. +.El diff --git a/man/man8/zpool-detach.8 b/man/man8/zpool-detach.8 new file mode 100644 index 0000000000..952dd7882a --- /dev/null +++ b/man/man8/zpool-detach.8 @@ -0,0 +1,58 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. 
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-DETACH 8 +.Os +. +.Sh NAME +.Nm zpool-detach +.Nd detach device from ZFS mirror +.Sh SYNOPSIS +.Nm zpool +.Cm detach +.Ar pool device +. +.Sh DESCRIPTION +Detaches +.Ar device +from a mirror. +The operation is refused if there are no other valid replicas of the data. +If +.Ar device +may be re-added to the pool later on then consider the +.Nm zpool Cm offline +command instead. +. +.Sh SEE ALSO +.Xr zpool-attach 8 , +.Xr zpool-labelclear 8 , +.Xr zpool-offline 8 , +.Xr zpool-remove 8 , +.Xr zpool-replace 8 , +.Xr zpool-split 8 diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 new file mode 100644 index 0000000000..ab1d6ea562 --- /dev/null +++ b/man/man8/zpool-events.8 @@ -0,0 +1,483 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. 
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-EVENTS 8 +.Os +. +.Sh NAME +.Nm zpool-events +.Nd list recent events generated by kernel +.Sh SYNOPSIS +.Nm zpool +.Cm events +.Op Fl vHf +.Op Ar pool +.Nm zpool +.Cm events +.Fl c +. +.Sh DESCRIPTION +Lists all recent events generated by the ZFS kernel modules. +These events are consumed by the +.Xr zed 8 +and used to automate administrative tasks such as replacing a failed device +with a hot spare. +For more information about the subclasses and event payloads +that can be generated see +.Sx EVENTS +and the following sections. +. +.Sh OPTIONS +.Bl -tag -compact -width Ds +.It Fl c +Clear all previous events. +.It Fl f +Follow mode. +.It Fl H +Scripted mode. +Do not display headers, and separate fields by a +single tab instead of arbitrary space. +.It Fl v +Print the entire payload for each event. +.El +. +.Sh EVENTS +These are the different event subclasses. +The full event name would be +.Sy ereport.fs.zfs.\& Ns Em SUBCLASS , +but only the last part is listed here.
+.Pp +.Bl -tag -compact -width "vdev.bad_guid_sum" +.It Sy checksum +Issued when a checksum error has been detected. +.It Sy io +Issued when there is an I/O error in a vdev in the pool. +.It Sy data +Issued when there have been data errors in the pool. +.It Sy deadman +Issued when an I/O request is determined to be "hung", this can be caused +by lost completion events due to flaky hardware or drivers. +See +.Sy zfs_deadman_failmode +in +.Xr zfs 4 +for additional information regarding "hung" I/O detection and configuration. +.It Sy delay +Issued when a completed I/O request exceeds the maximum allowed time +specified by the +.Sy zio_slow_io_ms +module parameter. +This can be an indicator of problems with the underlying storage device. +The number of delay events is ratelimited by the +.Sy zfs_slow_io_events_per_second +module parameter. +.It Sy config +Issued every time a vdev change has been made to the pool. +.It Sy zpool +Issued when a pool cannot be imported. +.It Sy zpool.destroy +Issued when a pool is destroyed. +.It Sy zpool.export +Issued when a pool is exported. +.It Sy zpool.import +Issued when a pool is imported. +.It Sy zpool.reguid +Issued when a REGUID (new unique identifier for the pool has been regenerated) has been detected. +.It Sy vdev.unknown +Issued when the vdev is unknown. +Such as trying to clear device errors on a vdev that has failed/been kicked +from the system/pool and is no longer available. +.It Sy vdev.open_failed +Issued when a vdev could not be opened (because it didn't exist for example). +.It Sy vdev.corrupt_data +Issued when corrupt data has been detected on a vdev. +.It Sy vdev.no_replicas +Issued when there are no more replicas to sustain the pool. +This would lead to the pool being +.Em DEGRADED . +.It Sy vdev.bad_guid_sum +Issued when a missing device in the pool has been detected. +.It Sy vdev.too_small +Issued when the system (kernel) has removed a device, and ZFS +notices that the device isn't there any more.
+This is usually followed by a +.Sy probe_failure +event. +.It Sy vdev.bad_label +Issued when the label is OK but invalid. +.It Sy vdev.bad_ashift +Issued when the ashift alignment requirement has increased. +.It Sy vdev.remove +Issued when a vdev is detached from a mirror (or a spare detached from a +vdev where it has been used to replace a failed drive - only works if +the original drive has been readded). +.It Sy vdev.clear +Issued when clearing device errors in a pool. +Such as running +.Nm zpool Cm clear +on a device in the pool. +.It Sy vdev.check +Issued when a check to see if a given vdev could be opened is started. +.It Sy vdev.spare +Issued when a spare has kicked in to replace a failed device. +.It Sy vdev.autoexpand +Issued when a vdev can be automatically expanded. +.It Sy io_failure +Issued when there is an I/O failure in a vdev in the pool. +.It Sy probe_failure +Issued when a probe fails on a vdev. +This would occur if a vdev +has been kicked from the system outside of ZFS (such as the kernel +has removed the device). +.It Sy log_replay +Issued when the intent log cannot be replayed. +This can occur in the case of a missing or damaged log device. +.It Sy resilver.start +Issued when a resilver is started. +.It Sy resilver.finish +Issued when the running resilver has finished. +.It Sy scrub.start +Issued when a scrub is started on a pool. +.It Sy scrub.finish +Issued when a pool has finished scrubbing. +.It Sy scrub.abort +Issued when a scrub is aborted on a pool. +.It Sy scrub.resume +Issued when a scrub is resumed on a pool. +.It Sy scrub.paused +Issued when a scrub is paused on a pool. +.It Sy bootfs.vdev.attach +.El +. +.Sh PAYLOADS +This is the payload (data, information) that accompanies an +event. +.Pp +For +.Xr zed 8 , +these are set to uppercase and prefixed with +.Sy ZEVENT_ . +.Pp +.Bl -tag -compact -width "vdev_cksum_errors" +.It Sy pool +Pool name. +.It Sy pool_failmode +Failmode - +.Sy wait , +.Sy continue , +or +.Sy panic .
+See the +.Sy failmode +property in +.Xr zpoolprops 7 +for more information. +.It Sy pool_guid +The GUID of the pool. +.It Sy pool_context +The load state for the pool (0=none, 1=open, 2=import, 3=tryimport, 4=recover, +5=error). +.It Sy vdev_guid +The GUID of the vdev in question (the vdev failing or operated upon with +.Nm zpool Cm clear , +etc.). +.It Sy vdev_type +Type of vdev - +.Sy disk , +.Sy file , +.Sy mirror , +etc. +See the +.Sy Virtual Devices +section of +.Xr zpoolconcepts 7 +for more information on possible values. +.It Sy vdev_path +Full path of the vdev, including any +.Em -partX . +.It Sy vdev_devid +ID of vdev (if any). +.It Sy vdev_fru +Physical FRU location. +.It Sy vdev_state +State of vdev (0=uninitialized, 1=closed, 2=offline, 3=removed, 4=failed to open, 5=faulted, 6=degraded, 7=healthy). +.It Sy vdev_ashift +The ashift value of the vdev. +.It Sy vdev_complete_ts +The time the last I/O request completed for the specified vdev. +.It Sy vdev_delta_ts +The time since the last I/O request completed for the specified vdev. +.It Sy vdev_spare_paths +List of spares, including full path and any +.Em -partX . +.It Sy vdev_spare_guids +GUID(s) of spares. +.It Sy vdev_read_errors +How many read errors that have been detected on the vdev. +.It Sy vdev_write_errors +How many write errors that have been detected on the vdev. +.It Sy vdev_cksum_errors +How many checksum errors that have been detected on the vdev. +.It Sy parent_guid +GUID of the vdev parent. +.It Sy parent_type +Type of parent. +See +.Sy vdev_type . +.It Sy parent_path +Path of the vdev parent (if any). +.It Sy parent_devid +ID of the vdev parent (if any). +.It Sy zio_objset +The object set number for a given I/O request. +.It Sy zio_object +The object number for a given I/O request. +.It Sy zio_level +The indirect level for the block. +Level 0 is the lowest level and includes data blocks. +Values > 0 indicate metadata blocks at the appropriate level.
+.It Sy zio_blkid +The block ID for a given I/O request. +.It Sy zio_err +The error number for a failure when handling a given I/O request, +compatible with +.Xr errno 3 +with the value of +.Sy EBADE +used to indicate a ZFS checksum error. +.It Sy zio_offset +The offset in bytes of where to write the I/O request for the specified vdev. +.It Sy zio_size +The size in bytes of the I/O request. +.It Sy zio_flags +The current flags describing how the I/O request should be handled. +See the +.Sy I/O FLAGS +section for the full list of I/O flags. +.It Sy zio_stage +The current stage of the I/O in the pipeline. +See the +.Sy I/O STAGES +section for a full list of all the I/O stages. +.It Sy zio_pipeline +The valid pipeline stages for the I/O. +See the +.Sy I/O STAGES +section for a full list of all the I/O stages. +.It Sy zio_delay +The time elapsed (in nanoseconds) waiting for the block layer to complete the +I/O request. +Unlike +.Sy zio_delta , +this does not include any vdev queuing time and is +therefore solely a measure of the block layer performance. +.It Sy zio_timestamp +The time when a given I/O request was submitted. +.It Sy zio_delta +The time required to service a given I/O request. +.It Sy prev_state +The previous state of the vdev. +.It Sy cksum_expected +The expected checksum value for the block. +.It Sy cksum_actual +The actual checksum value for an errant block. +.It Sy cksum_algorithm +Checksum algorithm used. +See +.Xr zfsprops 7 +for more information on the available checksum algorithms. +.It Sy cksum_byteswap +Whether or not the data is byteswapped. +.It Sy bad_ranges +.No [\& Ns Ar start , end ) +pairs of corruption offsets. +Offsets are always aligned on a 64-bit boundary, +and can include some gaps of non-corruption. 
+(See +.Sy bad_ranges_min_gap ) +.It Sy bad_ranges_min_gap +In order to bound the size of the +.Sy bad_ranges +array, gaps of non-corruption +less than or equal to +.Sy bad_ranges_min_gap +bytes have been merged with +adjacent corruption. +Always at least 8 bytes, since corruption is detected on a 64-bit word basis. +.It Sy bad_range_sets +This array has one element per range in +.Sy bad_ranges . +Each element contains +the count of bits in that range which were clear in the good data and set +in the bad data. +.It Sy bad_range_clears +This array has one element per range in +.Sy bad_ranges . +Each element contains +the count of bits for that range which were set in the good data and clear in +the bad data. +.It Sy bad_set_bits +If this field exists, it is an array of +.Pq Ar bad data No & ~( Ns Ar good data ) ; +that is, the bits set in the bad data which are cleared in the good data. +Each element corresponds to a byte whose offset is in a range in +.Sy bad_ranges , +and the array is ordered by offset. +Thus, the first element is the first byte in the first +.Sy bad_ranges +range, and the last element is the last byte in the last +.Sy bad_ranges +range. +.It Sy bad_cleared_bits +Like +.Sy bad_set_bits , +but contains +.Pq Ar good data No & ~( Ns Ar bad data ) ; +that is, the bits set in the good data which are cleared in the bad data. +.It Sy bad_set_histogram +If this field exists, it is an array of counters. +Each entry counts bits set in a particular bit of a big-endian uint64 type. +The first entry counts bits +set in the high-order bit of the first byte, the 9th byte, etc., and the last +entry counts bits set in the low-order bit of the 8th byte, the 16th byte, etc. +This information is useful for observing a stuck bit in a parallel data path, +such as IDE or parallel SCSI. +.It Sy bad_cleared_histogram +If this field exists, it is an array of counters. +Each entry counts bit clears in a particular bit of a big-endian uint64 type.
+The first entry counts bit +clears of the high-order bit of the first byte, the 9th byte, etc., and the +last entry counts clears of the low-order bit of the 8th byte, the 16th byte, etc. +This information is useful for observing a stuck bit in a parallel data +path, such as IDE or parallel SCSI. +.El +. +.Sh I/O STAGES +The ZFS I/O pipeline is comprised of various stages which are defined below. +The individual stages are used to construct these basic I/O +operations: Read, Write, Free, Claim, and Ioctl. +These stages may be +set on an event to describe the life cycle of a given I/O request. +.Pp +.TS +tab(:); +l l l . +Stage:Bit Mask:Operations +_:_:_ +ZIO_STAGE_OPEN:0x00000001:RWFCI + +ZIO_STAGE_READ_BP_INIT:0x00000002:R---- +ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W--- +ZIO_STAGE_FREE_BP_INIT:0x00000008:--F-- +ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF-- +ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W--- + +ZIO_STAGE_ENCRYPT:0x00000040:-W--- +ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- + +ZIO_STAGE_NOP_WRITE:0x00000100:-W--- + +ZIO_STAGE_DDT_READ_START:0x00000200:R---- +ZIO_STAGE_DDT_READ_DONE:0x00000400:R---- +ZIO_STAGE_DDT_WRITE:0x00000800:-W--- +ZIO_STAGE_DDT_FREE:0x00001000:--F-- + +ZIO_STAGE_GANG_ASSEMBLE:0x00002000:RWFC- +ZIO_STAGE_GANG_ISSUE:0x00004000:RWFC- + +ZIO_STAGE_DVA_THROTTLE:0x00008000:-W--- +ZIO_STAGE_DVA_ALLOCATE:0x00010000:-W--- +ZIO_STAGE_DVA_FREE:0x00020000:--F-- +ZIO_STAGE_DVA_CLAIM:0x00040000:---C- + +ZIO_STAGE_READY:0x00080000:RWFCI + +ZIO_STAGE_VDEV_IO_START:0x00100000:RW--I +ZIO_STAGE_VDEV_IO_DONE:0x00200000:RW--I +ZIO_STAGE_VDEV_IO_ASSESS:0x00400000:RW--I + +ZIO_STAGE_CHECKSUM_VERIFY:0x00800000:R---- + +ZIO_STAGE_DONE:0x01000000:RWFCI +.TE +. +.Sh I/O FLAGS +Every I/O request in the pipeline contains a set of flags which describe its +function and are used to govern its behavior. +These flags will be set in an event as a +.Sy zio_flags +payload entry. +.Pp +.TS +tab(:); +l l .
+Flag:Bit Mask +_:_ +ZIO_FLAG_DONT_AGGREGATE:0x00000001 +ZIO_FLAG_IO_REPAIR:0x00000002 +ZIO_FLAG_SELF_HEAL:0x00000004 +ZIO_FLAG_RESILVER:0x00000008 +ZIO_FLAG_SCRUB:0x00000010 +ZIO_FLAG_SCAN_THREAD:0x00000020 +ZIO_FLAG_PHYSICAL:0x00000040 + +ZIO_FLAG_CANFAIL:0x00000080 +ZIO_FLAG_SPECULATIVE:0x00000100 +ZIO_FLAG_CONFIG_WRITER:0x00000200 +ZIO_FLAG_DONT_RETRY:0x00000400 +ZIO_FLAG_DONT_CACHE:0x00000800 +ZIO_FLAG_NODATA:0x00001000 +ZIO_FLAG_INDUCE_DAMAGE:0x00002000 + +ZIO_FLAG_IO_ALLOCATING:0x00004000 +ZIO_FLAG_IO_RETRY:0x00008000 +ZIO_FLAG_PROBE:0x00010000 +ZIO_FLAG_TRYHARD:0x00020000 +ZIO_FLAG_OPTIONAL:0x00040000 + +ZIO_FLAG_DONT_QUEUE:0x00080000 +ZIO_FLAG_DONT_PROPAGATE:0x00100000 +ZIO_FLAG_IO_BYPASS:0x00200000 +ZIO_FLAG_IO_REWRITE:0x00400000 +ZIO_FLAG_RAW_COMPRESS:0x00800000 +ZIO_FLAG_RAW_ENCRYPT:0x01000000 + +ZIO_FLAG_GANG_CHILD:0x02000000 +ZIO_FLAG_DDT_CHILD:0x04000000 +ZIO_FLAG_GODFATHER:0x08000000 +ZIO_FLAG_NOPWRITE:0x10000000 +ZIO_FLAG_REEXECUTED:0x20000000 +ZIO_FLAG_DELEGATED:0x40000000 +ZIO_FLAG_FASTWRITE:0x80000000 +.TE +. +.Sh SEE ALSO +.Xr zfs 4 , +.Xr zed 8 , +.Xr zpool-wait 8 diff --git a/man/man8/zpool-export.8 b/man/man8/zpool-export.8 new file mode 100644 index 0000000000..a15291a1f5 --- /dev/null +++ b/man/man8/zpool-export.8 @@ -0,0 +1,72 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd February 16, 2020 +.Dt ZPOOL-EXPORT 8 +.Os +. +.Sh NAME +.Nm zpool-export +.Nd export ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm export +.Op Fl f +.Fl a Ns | Ns Ar pool Ns … +. +.Sh DESCRIPTION +Exports the given pools from the system. +All devices are marked as exported, but are still considered in use by other +subsystems. +The devices can be moved between systems +.Pq even those of different endianness +and imported as long as a sufficient number of devices are present. +.Pp +Before exporting the pool, all datasets within the pool are unmounted. +A pool can not be exported if it has a shared spare that is currently being +used. +.Pp +For pools to be portable, you must give the +.Nm zpool +command whole disks, not just partitions, so that ZFS can label the disks with +portable EFI labels. +Otherwise, disk drivers on platforms of different endianness will not recognize +the disks. +.Bl -tag -width Ds +.It Fl a +Exports all pools imported on the system. +.It Fl f +Forcefully unmount all datasets, and allow export of pools with active shared spares. +.Pp +This command will forcefully export the pool even if it has a shared spare that +is currently being used. +This may lead to potential data corruption. +.El +. 
+.Sh SEE ALSO +.Xr zpool-import 8 diff --git a/man/man8/zpool-get.8 b/man/man8/zpool-get.8 new file mode 100644 index 0000000000..55904f169e --- /dev/null +++ b/man/man8/zpool-get.8 @@ -0,0 +1,108 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-GET 8 +.Os +. +.Sh NAME +.Nm zpool-get +.Nd retrieve properties of ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm get +.Op Fl Hp +.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns … +.Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … +.Oo Ar pool Oc Ns … +.Nm zpool +.Cm set +.Ar property Ns = Ns Ar value +.Ar pool +. 
+.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm zpool +.Cm get +.Op Fl Hp +.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns … +.Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … +.Oo Ar pool Oc Ns … +.Xc +Retrieves the given list of properties +.Po +or all properties if +.Sy all +is used +.Pc +for the specified storage pool(s). +These properties are displayed with the following fields: +.Bl -tag -compact -offset Ds -width "property" +.It Sy name +Name of storage pool. +.It Sy property +Property name. +.It Sy value +Property value. +.It Sy source +Property source, either +.Sy default No or Sy local . +.El +.Pp +See the +.Xr zpoolprops 7 +manual page for more information on the available pool properties. +.Bl -tag -compact -offset Ds -width "-o field" +.It Fl H +Scripted mode. +Do not display headers, and separate fields by a single tab instead of arbitrary +space. +.It Fl o Ar field +A comma-separated list of columns to display, defaults to +.Sy name , Ns Sy property , Ns Sy value , Ns Sy source . +.It Fl p +Display numbers in parsable (exact) values. +.El +.It Xo +.Nm zpool +.Cm set +.Ar property Ns = Ns Ar value +.Ar pool +.Xc +Sets the given property on the specified pool. +See the +.Xr zpoolprops 7 +manual page for more information on what properties can be set and acceptable +values. +.El +. +.Sh SEE ALSO +.Xr zpool-features 7 , +.Xr zpoolprops 7 , +.Xr zpool-list 8 diff --git a/man/man8/zpool-history.8 b/man/man8/zpool-history.8 new file mode 100644 index 0000000000..2a2d500b8b --- /dev/null +++ b/man/man8/zpool-history.8 @@ -0,0 +1,58 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. 
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-HISTORY 8 +.Os +. +.Sh NAME +.Nm zpool-history +.Nd inspect command history of ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm history +.Op Fl il +.Oo Ar pool Oc Ns … +. +.Sh DESCRIPTION +Displays the command history of the specified pool(s) or all pools if no pool is +specified. +.Bl -tag -width Ds +.It Fl i +Displays internally logged ZFS events in addition to user-initiated events. +.It Fl l +Displays log records in long format, which, in addition to the standard format, +includes the user name, the hostname, and the zone in which the operation was +performed. +.El +. +.Sh SEE ALSO +.Xr zpool-checkpoint 8 , +.Xr zpool-events 8 , +.Xr zpool-status 8 , +.Xr zpool-wait 8 diff --git a/man/man8/zpool-import.8 b/man/man8/zpool-import.8 new file mode 100644 index 0000000000..518e3cf1d7 --- /dev/null +++ b/man/man8/zpool-import.8 @@ -0,0 +1,409 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-IMPORT 8 +.Os +. +.Sh NAME +.Nm zpool-import +.Nd import ZFS storage pools or list available pools +.Sh SYNOPSIS +.Nm zpool +.Cm import +.Op Fl D +.Oo Fl d Ar dir Ns | Ns Ar device Oc Ns … +.Nm zpool +.Cm import +.Fl a +.Op Fl DflmN +.Op Fl F Op Fl nTX +.Op Fl -rewind-to-checkpoint +.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device +.Op Fl o Ar mntopts +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Op Fl R Ar root +.Nm zpool +.Cm import +.Op Fl Dflmt +.Op Fl F Op Fl nTX +.Op Fl -rewind-to-checkpoint +.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device +.Op Fl o Ar mntopts +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Op Fl R Ar root +.Op Fl s +.Ar pool Ns | Ns Ar id +.Op Ar newpool +. +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm zpool +.Cm import +.Op Fl D +.Oo Fl d Ar dir Ns | Ns Ar device Oc Ns … +.Xc +Lists pools available to import. 
+If the +.Fl d No or +.Fl c +options are not specified, this command searches for devices using libblkid +on Linux and geom on +.Fx . +The +.Fl d +option can be specified multiple times, and all directories are searched. +If the device appears to be part of an exported pool, this command displays a +summary of the pool with the name of the pool, a numeric identifier, as well as +the vdev layout and current health of the device for each device or file. +Destroyed pools, pools that were previously destroyed with the +.Nm zpool Cm destroy +command, are not listed unless the +.Fl D +option is specified. +.Pp +The numeric identifier is unique, and can be used instead of the pool name when +multiple exported pools of the same name are available. +.Bl -tag -width Ds +.It Fl c Ar cachefile +Reads configuration from the given +.Ar cachefile +that was created with the +.Sy cachefile +pool property. +This +.Ar cachefile +is used instead of searching for devices. +.It Fl d Ar dir Ns | Ns Ar device +Uses +.Ar device +or searches for devices or files in +.Ar dir . +The +.Fl d +option can be specified multiple times. +.It Fl D +Lists destroyed pools only. +.El +.It Xo +.Nm zpool +.Cm import +.Fl a +.Op Fl DflmN +.Op Fl F Op Fl nTX +.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device +.Op Fl o Ar mntopts +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Op Fl R Ar root +.Op Fl s +.Xc +Imports all pools found in the search directories. +Identical to the previous command, except that all pools with a sufficient +number of devices available are imported. +Destroyed pools, pools that were previously destroyed with the +.Nm zpool Cm destroy +command, will not be imported unless the +.Fl D +option is specified. +.Bl -tag -width Ds +.It Fl a +Searches for and imports all pools found. +.It Fl c Ar cachefile +Reads configuration from the given +.Ar cachefile +that was created with the +.Sy cachefile +pool property. +This +.Ar cachefile +is used instead of searching for devices.
+.It Fl d Ar dir Ns | Ns Ar device +Uses +.Ar device +or searches for devices or files in +.Ar dir . +The +.Fl d +option can be specified multiple times. +This option is incompatible with the +.Fl c +option. +.It Fl D +Imports destroyed pools only. +The +.Fl f +option is also required. +.It Fl f +Forces import, even if the pool appears to be potentially active. +.It Fl F +Recovery mode for a non-importable pool. +Attempt to return the pool to an importable state by discarding the last few +transactions. +Not all damaged pools can be recovered by using this option. +If successful, the data from the discarded transactions is irretrievably lost. +This option is ignored if the pool is importable or already imported. +.It Fl l +Indicates that this command will request encryption keys for all encrypted +datasets it attempts to mount as it is bringing the pool online. +Note that if any datasets have a +.Sy keylocation +of +.Sy prompt +this command will block waiting for the keys to be entered. +Without this flag +encrypted datasets will be left unavailable until the keys are loaded. +.It Fl m +Allows a pool to import when there is a missing log device. +Recent transactions can be lost because the log device will be discarded. +.It Fl n +Used with the +.Fl F +recovery option. +Determines whether a non-importable pool can be made importable again, but does +not actually perform the pool recovery. +For more details about pool recovery mode, see the +.Fl F +option, above. +.It Fl N +Import the pool without mounting any file systems. +.It Fl o Ar mntopts +Comma-separated list of mount options to use when mounting datasets within the +pool. +See +.Xr zfs 8 +for a description of dataset properties and mount options. +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property on the imported pool. +See the +.Xr zpoolprops 7 +manual page for more information on the available pool properties. 
+.It Fl R Ar root +Sets the +.Sy cachefile +property to +.Sy none +and the +.Sy altroot +property to +.Ar root . +.It Fl -rewind-to-checkpoint +Rewinds pool to the checkpointed state. +Once the pool is imported with this flag there is no way to undo the rewind. +All changes and data that were written after the checkpoint are lost! +The only exception is when the +.Sy readonly +mounting option is enabled. +In this case, the checkpointed state of the pool is opened and an +administrator can see what the pool would look like if they were +to fully rewind. +.It Fl s +Scan using the default search path, the libblkid cache will not be +consulted. +A custom search path may be specified by setting the +.Sy ZPOOL_IMPORT_PATH +environment variable. +.It Fl X +Used with the +.Fl F +recovery option. +Determines whether extreme measures to find a valid txg should take place. +This allows the pool to +be rolled back to a txg which is no longer guaranteed to be consistent. +Pools imported at an inconsistent txg may contain uncorrectable checksum errors. +For more details about pool recovery mode, see the +.Fl F +option, above. +WARNING: This option can be extremely hazardous to the +health of your pool and should only be used as a last resort. +.It Fl T +Specify the txg to use for rollback. +Implies +.Fl FX . +For more details +about pool recovery mode, see the +.Fl X +option, above. +WARNING: This option can be extremely hazardous to the +health of your pool and should only be used as a last resort. +.El +.It Xo +.Nm zpool +.Cm import +.Op Fl Dflmt +.Op Fl F Op Fl nTX +.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device +.Op Fl o Ar mntopts +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Op Fl R Ar root +.Op Fl s +.Ar pool Ns | Ns Ar id +.Op Ar newpool +.Xc +Imports a specific pool. +A pool can be identified by its name or the numeric identifier. +If +.Ar newpool +is specified, the pool is imported using the name +.Ar newpool .
+Otherwise, it is imported with the same name as its exported name. +.Pp +If a device is removed from a system without running +.Nm zpool Cm export +first, the device appears as potentially active. +It cannot be determined if this was a failed export, or whether the device is +really in use from another host. +To import a pool in this state, the +.Fl f +option is required. +.Bl -tag -width Ds +.It Fl c Ar cachefile +Reads configuration from the given +.Ar cachefile +that was created with the +.Sy cachefile +pool property. +This +.Ar cachefile +is used instead of searching for devices. +.It Fl d Ar dir Ns | Ns Ar device +Uses +.Ar device +or searches for devices or files in +.Ar dir . +The +.Fl d +option can be specified multiple times. +This option is incompatible with the +.Fl c +option. +.It Fl D +Imports destroyed pool. +The +.Fl f +option is also required. +.It Fl f +Forces import, even if the pool appears to be potentially active. +.It Fl F +Recovery mode for a non-importable pool. +Attempt to return the pool to an importable state by discarding the last few +transactions. +Not all damaged pools can be recovered by using this option. +If successful, the data from the discarded transactions is irretrievably lost. +This option is ignored if the pool is importable or already imported. +.It Fl l +Indicates that this command will request encryption keys for all encrypted +datasets it attempts to mount as it is bringing the pool online. +Note that if any datasets have a +.Sy keylocation +of +.Sy prompt +this command will block waiting for the keys to be entered. +Without this flag +encrypted datasets will be left unavailable until the keys are loaded. +.It Fl m +Allows a pool to import when there is a missing log device. +Recent transactions can be lost because the log device will be discarded. +.It Fl n +Used with the +.Fl F +recovery option. +Determines whether a non-importable pool can be made importable again, but does +not actually perform the pool recovery. 
+For more details about pool recovery mode, see the +.Fl F +option, above. +.It Fl o Ar mntopts +Comma-separated list of mount options to use when mounting datasets within the +pool. +See +.Xr zfs 8 +for a description of dataset properties and mount options. +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property on the imported pool. +See the +.Xr zpoolprops 7 +manual page for more information on the available pool properties. +.It Fl R Ar root +Sets the +.Sy cachefile +property to +.Sy none +and the +.Sy altroot +property to +.Ar root . +.It Fl s +Scan using the default search path, the libblkid cache will not be +consulted. +A custom search path may be specified by setting the +.Sy ZPOOL_IMPORT_PATH +environment variable. +.It Fl X +Used with the +.Fl F +recovery option. +Determines whether extreme measures to find a valid txg should take place. +This allows the pool to +be rolled back to a txg which is no longer guaranteed to be consistent. +Pools imported at an inconsistent txg may contain uncorrectable +checksum errors. +For more details about pool recovery mode, see the +.Fl F +option, above. +WARNING: This option can be extremely hazardous to the +health of your pool and should only be used as a last resort. +.It Fl T +Specify the txg to use for rollback. +Implies +.Fl FX . +For more details +about pool recovery mode, see the +.Fl X +option, above. +WARNING: This option can be extremely hazardous to the +health of your pool and should only be used as a last resort. +.It Fl t +Used with +.Sy newpool . +Specifies that +.Sy newpool +is temporary. +Temporary pool names last until export. +Ensures that the original pool name will be used +in all label updates and therefore is retained upon export. +Will also set +.Fl o Sy cachefile Ns = Ns Sy none +when not explicitly specified. +.El +.El +. 
+.Sh SEE ALSO +.Xr zpool-export 8 , +.Xr zpool-list 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-initialize.8 b/man/man8/zpool-initialize.8 new file mode 100644 index 0000000000..0a108180db --- /dev/null +++ b/man/man8/zpool-initialize.8 @@ -0,0 +1,73 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-INITIALIZE 8 +.Os +. +.Sh NAME +.Nm zpool-initialize +.Nd write to unallocated regions of ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm initialize +.Op Fl c Ns | Ns Fl s +.Op Fl w +.Ar pool +.Oo Ar device Oc Ns … +. +.Sh DESCRIPTION +Begins initializing by writing to all unallocated regions on the specified +devices, or all eligible devices in the pool if no individual devices are +specified. +Only leaf data or log devices may be initialized. 
+.Bl -tag -width Ds +.It Fl c , -cancel +Cancel initializing on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +initialized, the command will fail and no cancellation will occur on any device. +.It Fl s , -suspend +Suspend initializing on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +initialized, the command will fail and no suspension will occur on any device. +Initializing can then be resumed by running +.Nm zpool Cm initialize +with no flags on the relevant target devices. +.It Fl w , -wait +Wait until the devices have finished initializing before returning. +.El +. +.Sh SEE ALSO +.Xr zpool-add 8 , +.Xr zpool-attach 8 , +.Xr zpool-create 8 , +.Xr zpool-online 8 , +.Xr zpool-replace 8 , +.Xr zpool-trim 8 diff --git a/man/man8/zpool-iostat.8 b/man/man8/zpool-iostat.8 new file mode 100644 index 0000000000..969c74cf39 --- /dev/null +++ b/man/man8/zpool-iostat.8 @@ -0,0 +1,265 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. 
All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-IOSTAT 8 +.Os +. +.Sh NAME +.Nm zpool-iostat +.Nd display logical I/O statistics for ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm iostat +.Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw +.Op Fl T Sy u Ns | Ns Sy d +.Op Fl ghHLnpPvy +.Oo Ar pool Ns … Ns | Ns Oo Ar pool vdev Ns … Oc Ns | Ns Ar vdev Ns … Oc +.Op Ar interval Op Ar count +. +.Sh DESCRIPTION +Displays logical I/O statistics for the given pools/vdevs. +Physical I/O statistics may be observed via +.Xr iostat 1 . +If writes are located nearby, they may be merged into a single +larger operation. +Additional I/O may be generated depending on the level of vdev redundancy. +To filter output, you may pass in a list of pools, a pool and list of vdevs +in that pool, or a list of any vdevs from any pool. +If no items are specified, statistics for every pool in the system are shown. +When given an +.Ar interval , +the statistics are printed every +.Ar interval +seconds until killed. +If the +.Fl n +flag is specified, the headers are displayed only once; otherwise they are +displayed periodically. +If +.Ar count +is specified, the command exits after +.Ar count +reports are printed. +The first report printed is always the statistics since boot regardless of whether +.Ar interval +and +.Ar count +are passed. +However, this behavior can be suppressed with the +.Fl y +flag. +Also note that the units of +.Sy K , +.Sy M , +.Sy G Ns … +that are printed in the report are in base 1024. +To get the raw values, use the +.Fl p +flag.
+.Bl -tag -width Ds +.It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … +Run a script (or scripts) on each vdev and include the output as a new column +in the +.Nm zpool Cm iostat +output. +Users can run any script found in their +.Pa ~/.zpool.d +directory or from the system +.Pa /etc/zfs/zpool.d +directory. +Script names containing the slash +.Pq Sy / +character are not allowed. +The default search path can be overridden by setting the +.Sy ZPOOL_SCRIPTS_PATH +environment variable. +A privileged user can only run +.Fl c +if they have the +.Sy ZPOOL_SCRIPTS_AS_ROOT +environment variable set. +If a script requires the use of a privileged command, like +.Xr smartctl 8 , +then it's recommended you allow the user access to it in +.Pa /etc/sudoers +or add the user to the +.Pa /etc/sudoers.d/zfs +file. +.Pp +If +.Fl c +is passed without a script name, it prints a list of all scripts. +.Fl c +also sets verbose mode +.No \&( Ns Fl v Ns No \&). +.Pp +Script output should be in the form of "name=value". +The column name is set to "name" and the value is set to "value". +Multiple lines can be used to output multiple columns. +The first line of output not in the +"name=value" format is displayed without a column title, +and no more output after that is displayed. +This can be useful for printing error messages. +Blank or NULL values are printed as a '-' to make output AWKable. +.Pp +The following environment variables are set before running each script: +.Bl -tag -compact -width "VDEV_ENC_SYSFS_PATH" +.It Sy VDEV_PATH +Full path to the vdev +.It Sy VDEV_UPATH +Underlying path to the vdev +.Pq Pa /dev/sd* . +For use with device mapper, multipath, or partitioned vdevs. +.It Sy VDEV_ENC_SYSFS_PATH +The sysfs path to the enclosure for the vdev (if any). +.El +.It Fl T Sy u Ns | Ns Sy d +Display a time stamp. +Specify +.Sy u +for a printed representation of the internal representation of time. +See +.Xr time 2 . +Specify +.Sy d +for standard date format. +See +.Xr date 1 . 
+.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl H +Scripted mode. +Do not display headers, and separate fields by a +single tab instead of arbitrary space. +.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. +.It Fl n +Print headers only once when passed. +.It Fl p +Display numbers in parsable (exact) values. +Time values are in nanoseconds. +.It Fl P +Display full paths for vdevs instead of only the last component of the path. +This can be used in conjunction with the +.Fl L +flag. +.It Fl r +Print request size histograms for the leaf vdev's I/O. +This includes histograms of individual I/O (ind) and aggregate I/O (agg). +These stats can be useful for observing how well I/O aggregation is working. +Note that TRIM I/O may exceed 16M, but will be counted as 16M. +.It Fl v +Verbose statistics. Reports usage statistics for individual vdevs within the +pool, in addition to the pool-wide statistics. +.It Fl y +Normally the first line of output reports the statistics since boot: +suppress it. +.It Fl w +Display latency histograms: +.Bl -tag -compact -width "asyncq_read/write" +.It Sy total_wait +Total I/O time (queuing + disk I/O time). +.It Sy disk_wait +Disk I/O time (time reading/writing the disk). +.It Sy syncq_wait +Amount of time I/O spent in synchronous priority queues. +Does not include disk time. +.It Sy asyncq_wait +Amount of time I/O spent in asynchronous priority queues. +Does not include disk time. +.It Sy scrub +Amount of time I/O spent in scrub queue. +Does not include disk time. +.It Sy rebuild +Amount of time I/O spent in rebuild queue. +Does not include disk time.
+.El +.It Fl l +Include average latency statistics: +.Bl -tag -compact -width "asyncq_read/write" +.It Sy total_wait +Average total I/O time (queuing + disk I/O time). +.It Sy disk_wait +Average disk I/O time (time reading/writing the disk). +.It Sy syncq_wait +Average amount of time I/O spent in synchronous priority queues. +Does not include disk time. +.It Sy asyncq_wait +Average amount of time I/O spent in asynchronous priority queues. +Does not include disk time. +.It Sy scrub +Average queuing time in scrub queue. +Does not include disk time. +.It Sy trim +Average queuing time in trim queue. +Does not include disk time. +.It Sy rebuild +Average queuing time in rebuild queue. +Does not include disk time. +.El +.It Fl q +Include active queue statistics. +Each priority queue has both pending +.Sy ( pend ) +and active +.Sy ( activ ) +I/O requests. +Pending requests are waiting to be issued to the disk, +and active requests have been issued to disk and are waiting for completion. +These stats are broken out by priority queue: +.Bl -tag -compact -width "asyncq_read/write" +.It Sy syncq_read/write +Current number of entries in synchronous priority +queues. +.It Sy asyncq_read/write +Current number of entries in asynchronous priority queues. +.It Sy scrubq_read +Current number of entries in scrub queue. +.It Sy trimq_write +Current number of entries in trim queue. +.It Sy rebuildq_write +Current number of entries in rebuild queue. +.El +.Pp +All queue statistics are instantaneous measurements of the number of +entries in the queues. +If you specify an interval, +the measurements will be sampled from the end of the interval. +.El +. 
+.Sh SEE ALSO +.Xr iostat 1 , +.Xr smartctl 8 , +.Xr zpool-list 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-labelclear.8 b/man/man8/zpool-labelclear.8 new file mode 100644 index 0000000000..c7edc91160 --- /dev/null +++ b/man/man8/zpool-labelclear.8 @@ -0,0 +1,60 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 31, 2021 +.Dt ZPOOL-LABELCLEAR 8 +.Os +. +.Sh NAME +.Nm zpool-labelclear +.Nd remove ZFS label information from device +.Sh SYNOPSIS +.Nm zpool +.Cm labelclear +.Op Fl f +.Ar device +. +.Sh DESCRIPTION +Removes ZFS label information from the specified +.Ar device . +If the +.Ar device +is a cache device, it also removes the L2ARC header +(persistent L2ARC). The +.Ar device +must not be part of an active pool configuration. 
+.Bl -tag -width Ds +.It Fl f +Treat exported or foreign devices as inactive. +.El +. +.Sh SEE ALSO +.Xr zpool-destroy 8 , +.Xr zpool-detach 8 , +.Xr zpool-remove 8 , +.Xr zpool-replace 8 diff --git a/man/man8/zpool-list.8 b/man/man8/zpool-list.8 new file mode 100644 index 0000000000..dd4e13c160 --- /dev/null +++ b/man/man8/zpool-list.8 @@ -0,0 +1,112 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-LIST 8 +.Os +. +.Sh NAME +.Nm zpool-list +.Nd list information about ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm list +.Op Fl HgLpPv +.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns … +.Op Fl T Sy u Ns | Ns Sy d +.Oo Ar pool Oc Ns … +.Op Ar interval Op Ar count +. +.Sh DESCRIPTION +Lists the given pools along with a health status and space usage. 
+If no +.Ar pool Ns s +are specified, all pools in the system are listed. +When given an +.Ar interval , +the information is printed every +.Ar interval +seconds until killed. +If +.Ar count +is specified, the command exits after +.Ar count +reports are printed. +.Bl -tag -width Ds +.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl H +Scripted mode. +Do not display headers, and separate fields by a single tab instead of arbitrary +space. +.It Fl o Ar property +Comma-separated list of properties to display. +See the +.Xr zpoolprops 7 +manual page for a list of valid properties. +The default list is +.Sy name , size , allocated , free , checkpoint , expandsize , fragmentation , +.Sy capacity , dedupratio , health , altroot . +.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +.Pa /dev/disk +path used to open it. +.It Fl p +Display numbers in parsable +.Pq exact +values. +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. +This can be used in conjunction with the +.Fl L +flag. +.It Fl T Sy u Ns | Ns Sy d +Display a time stamp. +Specify +.Sy u +for a printed representation of the internal representation of time. +See +.Xr time 2 . +Specify +.Sy d +for standard date format. +See +.Xr date 1 . +.It Fl v +Verbose statistics. +Reports usage statistics for individual vdevs within the pool, in addition to +the pool-wide statistics. +.El +.
+.Sh SEE ALSO +.Xr zpool-import 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-offline.8 b/man/man8/zpool-offline.8 new file mode 100644 index 0000000000..9b2cf59cf4 --- /dev/null +++ b/man/man8/zpool-offline.8 @@ -0,0 +1,94 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-OFFLINE 8 +.Os +. +.Sh NAME +.Nm zpool-offline +.Nd take physical devices offline in ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm offline +.Op Fl ft +.Ar pool +.Ar device Ns … +.Nm zpool +.Cm online +.Op Fl e +.Ar pool +.Ar device Ns … +. +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm zpool +.Cm offline +.Op Fl ft +.Ar pool +.Ar device Ns … +.Xc +Takes the specified physical device offline. 
+While the +.Ar device +is offline, no attempt is made to read or write to the device. +This command is not applicable to spares. +.Bl -tag -width Ds +.It Fl f +Force fault. +Instead of offlining the disk, put it into a faulted state. +The fault will persist across imports unless the +.Fl t +flag was specified. +.It Fl t +Temporary. +Upon reboot, the specified physical device reverts to its previous state. +.El +.It Xo +.Nm zpool +.Cm online +.Op Fl e +.Ar pool +.Ar device Ns … +.Xc +Brings the specified physical device online. +This command is not applicable to spares. +.Bl -tag -width Ds +.It Fl e +Expand the device to use all available space. +If the device is part of a mirror or raidz then all devices must be expanded +before the new space will become available to the pool. +.El +.El +. +.Sh SEE ALSO +.Xr zpool-detach 8 , +.Xr zpool-remove 8 , +.Xr zpool-reopen 8 , +.Xr zpool-resilver 8 diff --git a/man/man8/zpool-online.8 b/man/man8/zpool-online.8 new file mode 120000 index 0000000000..537e00e1c4 --- /dev/null +++ b/man/man8/zpool-online.8 @@ -0,0 +1 @@ +zpool-offline.8 \ No newline at end of file diff --git a/man/man8/zpool-reguid.8 b/man/man8/zpool-reguid.8 new file mode 100644 index 0000000000..7bb7c1c726 --- /dev/null +++ b/man/man8/zpool-reguid.8 @@ -0,0 +1,48 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 31, 2021 +.Dt ZPOOL-REGUID 8 +.Os +. +.Sh NAME +.Nm zpool-reguid +.Nd generate new unique identifier for ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm reguid +.Ar pool +. +.Sh DESCRIPTION +Generates a new unique identifier for the pool. +You must ensure that all devices in this pool are online and healthy before +performing this action. +. +.Sh SEE ALSO +.Xr zpool-export 8 , +.Xr zpool-import 8 diff --git a/man/man8/zpool-remove.8 b/man/man8/zpool-remove.8 new file mode 100644 index 0000000000..a14218ee17 --- /dev/null +++ b/man/man8/zpool-remove.8 @@ -0,0 +1,111 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-REMOVE 8 +.Os +.Sh NAME +.Nm zpool-remove +.Nd remove devices from ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm remove +.Op Fl npw +.Ar pool Ar device Ns … +.Nm zpool +.Cm remove +.Fl s +.Ar pool +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm zpool +.Cm remove +.Op Fl npw +.Ar pool Ar device Ns … +.Xc +Removes the specified device from the pool. +This command supports removing hot spare, cache, log, and both mirrored and +non-redundant primary top-level vdevs, including dedup and special vdevs. +.Pp +Top-level vdevs can only be removed if the primary pool storage does not contain +a top-level raidz vdev, all top-level vdevs have the same sector size, and the +keys for all encrypted datasets are loaded. +.Pp +Removing a top-level vdev reduces the total amount of space in the storage pool. +The specified device will be evacuated by copying all allocated space from it to +the other devices in the pool. +In this case, the +.Nm zpool Cm remove +command initiates the removal and returns, while the evacuation continues in +the background. +The removal progress can be monitored with +.Nm zpool Cm status . +If an IO error is encountered during the removal process it will be cancelled. +The +.Sy device_removal +feature flag must be enabled to remove a top-level vdev, see +.Xr zpool-features 7 . 
+.Pp +A mirrored top-level device (log or data) can be removed by specifying the top-level mirror for the +same. +Non-log devices or data devices that are part of a mirrored configuration can be removed using +the +.Nm zpool Cm detach +command. +.Bl -tag -width Ds +.It Fl n +Do not actually perform the removal +.Pq Qq No-op . +Instead, print the estimated amount of memory that will be used by the +mapping table after the removal completes. +This is nonzero only for top-level vdevs. +.El +.Bl -tag -width Ds +.It Fl p +Used in conjunction with the +.Fl n +flag, displays numbers as parsable (exact) values. +.It Fl w +Waits until the removal has completed before returning. +.El +.It Xo +.Nm zpool +.Cm remove +.Fl s +.Ar pool +.Xc +Stops and cancels an in-progress removal of a top-level vdev. +.El +.Sh SEE ALSO +.Xr zpool-add 8 , +.Xr zpool-detach 8 , +.Xr zpool-labelclear 8 , +.Xr zpool-offline 8 , +.Xr zpool-replace 8 , +.Xr zpool-split 8 diff --git a/man/man8/zpool-reopen.8 b/man/man8/zpool-reopen.8 new file mode 100644 index 0000000000..f1f8606f12 --- /dev/null +++ b/man/man8/zpool-reopen.8 @@ -0,0 +1,52 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd June 2, 2021 +.Dt ZPOOL-REOPEN 8 +.Os +. +.Sh NAME +.Nm zpool-reopen +.Nd reopen vdevs associated with ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm reopen +.Op Fl n +.Oo Ar pool Oc Ns … +. +.Sh DESCRIPTION +Reopen all vdevs associated with the specified pools, +or all pools if none specified. +. +.Sh OPTIONS +.Bl -tag -width "-n" +.It Fl n +Do not restart an in-progress scrub operation. +This is not recommended and can +result in partially resilvered devices unless a second scrub is performed. +.El diff --git a/man/man8/zpool-replace.8 b/man/man8/zpool-replace.8 new file mode 100644 index 0000000000..2b2875ed42 --- /dev/null +++ b/man/man8/zpool-replace.8 @@ -0,0 +1,99 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 29, 2021 +.Dt ZPOOL-REPLACE 8 +.Os +. +.Sh NAME +.Nm zpool-replace +.Nd replace one device with another in ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm replace +.Op Fl fsw +.Oo Fl o Ar property Ns = Ns Ar value Oc +.Ar pool Ar device Op Ar new-device +. +.Sh DESCRIPTION +Replaces +.Ar device +with +.Ar new-device . +This is equivalent to attaching +.Ar new-device , +waiting for it to resilver, and then detaching +.Ar device . +Any in progress scrub will be cancelled. +.Pp +The size of +.Ar new-device +must be greater than or equal to the minimum size of all the devices in a mirror +or raidz configuration. +.Pp +.Ar new-device +is required if the pool is not redundant. +If +.Ar new-device +is not specified, it defaults to +.Ar device . +This form of replacement is useful after an existing disk has failed and has +been physically replaced. +In this case, the new disk may have the same +.Pa /dev +path as the old device, even though it is actually a different disk. +ZFS recognizes this. +.Bl -tag -width Ds +.It Fl f +Forces use of +.Ar new-device , +even if it appears to be in use. +Not all devices can be overridden in this manner. +.It Fl o Ar property Ns = Ns Ar value +Sets the given pool properties. +See the +.Xr zpoolprops 7 +manual page for a list of valid properties that can be set. 
+The only property supported at the moment is +.Sy ashift . +.It Fl s +The +.Ar new-device +is reconstructed sequentially to restore redundancy as quickly as possible. +Checksums are not verified during sequential reconstruction so a scrub is +started when the resilver completes. +Sequential reconstruction is not supported for raidz configurations. +.It Fl w +Waits until the replacement has completed before returning. +.El +. +.Sh SEE ALSO +.Xr zpool-detach 8 , +.Xr zpool-initialize 8 , +.Xr zpool-online 8 , +.Xr zpool-resilver 8 diff --git a/man/man8/zpool-resilver.8 b/man/man8/zpool-resilver.8 new file mode 100644 index 0000000000..1ef316ac18 --- /dev/null +++ b/man/man8/zpool-resilver.8 @@ -0,0 +1,56 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" +.Dd May 27, 2021 +.Dt ZPOOL-RESILVER 8 +.Os +. +.Sh NAME +.Nm zpool-resilver +.Nd resilver devices in ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm resilver +.Ar pool Ns … +. +.Sh DESCRIPTION +Starts a resilver of the specified pools. +If an existing resilver is already running it will be restarted from the beginning. +Any drives that were scheduled for a deferred +resilver will be added to the new one. +This requires the +.Sy resilver_defer +pool feature. +. +.Sh SEE ALSO +.Xr zpool-iostat 8 , +.Xr zpool-online 8 , +.Xr zpool-reopen 8 , +.Xr zpool-replace 8 , +.Xr zpool-scrub 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 new file mode 100644 index 0000000000..768f715392 --- /dev/null +++ b/man/man8/zpool-scrub.8 @@ -0,0 +1,123 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018, 2021 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. 
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd July 25, 2021 +.Dt ZPOOL-SCRUB 8 +.Os +. +.Sh NAME +.Nm zpool-scrub +.Nd begin or resume scrub of ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm scrub +.Op Fl s Ns | Ns Fl p +.Op Fl w +.Ar pool Ns … +. +.Sh DESCRIPTION +Begins a scrub or resumes a paused scrub. +The scrub examines all data in the specified pools to verify that it checksums +correctly. +For replicated +.Pq mirror, raidz, or draid +devices, ZFS automatically repairs any damage discovered during the scrub. +The +.Nm zpool Cm status +command reports the progress of the scrub and summarizes the results of the +scrub upon completion. +.Pp +Scrubbing and resilvering are very similar operations. +The difference is that resilvering only examines data that ZFS knows to be out +of date +.Po +for example, when attaching a new device to a mirror or replacing an existing +device +.Pc , +whereas scrubbing examines all data to discover silent errors due to hardware +faults or disk failure. +.Pp +Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows +one at a time. +.Pp +A scrub is split into two parts: metadata scanning and block scrubbing. +The metadata scanning sorts blocks into large sequential ranges which can then +be read much more efficiently from disk when issuing the scrub I/O. +.Pp +If a scrub is paused, the +.Nm zpool Cm scrub +resumes it. +If a resilver is in progress, ZFS does not allow a scrub to be started until the +resilver completes. +.Pp +Note that, due to changes in pool data on a live system, it is possible for +scrubs to progress slightly beyond 100% completion. +During this period, no completion time estimate will be provided. +. +.Sh OPTIONS +.Bl -tag -width "-s" +.It Fl s +Stop scrubbing. +.It Fl p +Pause scrubbing. +Scrub pause state and progress are periodically synced to disk. 
+If the system is restarted or pool is exported during a paused scrub, +even after import, scrub will remain paused until it is resumed. +Once resumed the scrub will pick up from the place where it was last +checkpointed to disk. +To resume a paused scrub issue +.Nm zpool Cm scrub +again. +.It Fl w +Wait until scrub has completed before returning. +.El +.Sh EXAMPLES +.Bl -tag -width "Exam" +.It Sy Example 1 : Status of pool with ongoing scrub: +Output: +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm status + ... + scan: scrub in progress since Sun Jul 25 16:07:49 2021 + 403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total + 0B repaired, 16.91% done, 00:00:04 to go + ... +.Ed +Where: +.Bl -dash -offset indent +.It +Metadata which references 403M of file data has been +scanned at 100M/s, and 68.4M of that file data has been +scrubbed sequentially at 10.0M/s. +.El +.El +. +.Sh SEE ALSO +.Xr zpool-iostat 8 , +.Xr zpool-resilver 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-set.8 b/man/man8/zpool-set.8 new file mode 120000 index 0000000000..2b8b8cfb7e --- /dev/null +++ b/man/man8/zpool-set.8 @@ -0,0 +1 @@ +zpool-get.8 \ No newline at end of file diff --git a/man/man8/zpool-split.8 b/man/man8/zpool-split.8 new file mode 100644 index 0000000000..c3b05c2366 --- /dev/null +++ b/man/man8/zpool-split.8 @@ -0,0 +1,116 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd June 2, 2021 +.Dt ZPOOL-SPLIT 8 +.Os +. +.Sh NAME +.Nm zpool-split +.Nd split devices off ZFS storage pool, creating new pool +.Sh SYNOPSIS +.Nm zpool +.Cm split +.Op Fl gLlnP +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns … +.Op Fl R Ar root +.Ar pool newpool +.Oo Ar device Oc Ns … +. +.Sh DESCRIPTION +Splits devices off +.Ar pool +creating +.Ar newpool . +All vdevs in +.Ar pool +must be mirrors and the pool must not be in the process of resilvering. +At the time of the split, +.Ar newpool +will be a replica of +.Ar pool . +By default, the +last device in each mirror is split from +.Ar pool +to create +.Ar newpool . +.Pp +The optional device specification causes the specified device(s) to be +included in the new +.Ar pool +and, should any devices remain unspecified, +the last device in each mirror is used as would be by default. +.Bl -tag -width Ds +.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. 
+.It Fl l +Indicates that this command will request encryption keys for all encrypted +datasets it attempts to mount as it is bringing the new pool online. +Note that if any datasets have +.Sy keylocation Ns = Ns Sy prompt , +this command will block waiting for the keys to be entered. +Without this flag, encrypted datasets will be left unavailable until the keys are loaded. +.It Fl n +Do a dry-run +.Pq Qq No-op +split: do not actually perform it. +Print out the expected configuration of +.Ar newpool . +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. +This can be used in conjunction with the +.Fl L +flag. +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property for +.Ar newpool . +See the +.Xr zpoolprops 7 +manual page for more information on the available pool properties. +.It Fl R Ar root +Set +.Sy altroot +for +.Ar newpool +to +.Ar root +and automatically import it. +.El +. +.Sh SEE ALSO +.Xr zpool-import 8 , +.Xr zpool-list 8 , +.Xr zpool-remove 8 diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 new file mode 100644 index 0000000000..7c825f69d8 --- /dev/null +++ b/man/man8/zpool-status.8 @@ -0,0 +1,134 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd June 2, 2021 +.Dt ZPOOL-STATUS 8 +.Os +. +.Sh NAME +.Nm zpool-status +.Nd show detailed health status for ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm status +.Op Fl DigLpPstvx +.Op Fl T Sy u Ns | Ns Sy d +.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … +.Oo Ar pool Oc Ns … +.Op Ar interval Op Ar count +. +.Sh DESCRIPTION +Displays the detailed health status for the given pools. +If no +.Ar pool +is specified, then the status of each pool in the system is displayed. +For more information on pool and device health, see the +.Sx Device Failure and Recovery +section of +.Xr zpoolconcepts 7 . +.Pp +If a scrub or resilver is in progress, this command reports the percentage done +and the estimated time to completion. +Both of these are only approximate, because the amount of data in the pool and +the other workloads on the system can change. +.Bl -tag -width Ds +.It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … +Run a script (or scripts) on each vdev and include the output as a new column +in the +.Nm zpool Cm status +output. +See the +.Fl c +option of +.Nm zpool Cm iostat +for complete details. +.It Fl i +Display vdev initialization status. +.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands.
+.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. +.It Fl p +Display numbers in parsable (exact) values. +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. +This can be used in conjunction with the +.Fl L +flag. +.It Fl D +Display a histogram of deduplication statistics, showing the allocated +.Pq physically present on disk +and referenced +.Pq logically referenced in the pool +block counts and sizes by reference count. +.It Fl s +Display the number of leaf VDEV slow IOs. +This is the number of IOs that +didn't complete in +.Sy zio_slow_io_ms +milliseconds (default 30 seconds). +This does not necessarily mean the IOs failed to complete, just took an +unreasonably long amount of time. +This may indicate a problem with the underlying storage. +.It Fl t +Display vdev TRIM status. +.It Fl T Sy u Ns | Ns Sy d +Display a time stamp. +Specify +.Sy u +for a printed representation of the internal representation of time. +See +.Xr time 2 . +Specify +.Sy d +for standard date format. +See +.Xr date 1 . +.It Fl v +Displays verbose data error information, printing out a complete list of all +data errors since the last complete pool scrub. +.It Fl x +Only display status for pools that are exhibiting errors or are otherwise +unavailable. +Warnings about pools not using the latest on-disk format will not be included. +.El +. +.Sh SEE ALSO +.Xr zpool-events 8 , +.Xr zpool-history 8 , +.Xr zpool-iostat 8 , +.Xr zpool-list 8 , +.Xr zpool-resilver 8 , +.Xr zpool-scrub 8 , +.Xr zpool-wait 8 diff --git a/man/man8/zpool-sync.8 b/man/man8/zpool-sync.8 new file mode 100644 index 0000000000..aa68a5729e --- /dev/null +++ b/man/man8/zpool-sync.8 @@ -0,0 +1,53 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). 
+.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-SYNC 8 +.Os +. +.Sh NAME +.Nm zpool-sync +.Nd flush data to primary storage of ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm sync +.Oo Ar pool Oc Ns … +. +.Sh DESCRIPTION +This command forces all in-core dirty data to be written to the primary +pool storage and not the ZIL. +It will also update administrative information including quota reporting. +Without arguments, +.Nm zpool Cm sync +will sync all pools on the system. +Otherwise, it will sync only the specified pools. +. 
+.Sh SEE ALSO +.Xr zpoolconcepts 7 , +.Xr zpool-export 8 , +.Xr zpool-iostat 8 diff --git a/man/man8/zpool-trim.8 b/man/man8/zpool-trim.8 new file mode 100644 index 0000000000..d9a7b44003 --- /dev/null +++ b/man/man8/zpool-trim.8 @@ -0,0 +1,91 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-TRIM 8 +.Os +. +.Sh NAME +.Nm zpool-trim +.Nd initiate TRIM of free space in ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm trim +.Op Fl dw +.Op Fl r Ar rate +.Op Fl c Ns | Ns Fl s +.Ar pool +.Oo Ar device Ns Oc Ns … +. +.Sh DESCRIPTION +Initiates an immediate on-demand TRIM operation for all of the free space in +a pool. 
+This operation informs the underlying storage devices of all blocks +in the pool which are no longer allocated and allows thinly provisioned +devices to reclaim the space. +.Pp +A manual on-demand TRIM operation can be initiated irrespective of the +.Sy autotrim +pool property setting. +See the documentation for the +.Sy autotrim +property above for the types of vdev devices which can be trimmed. +.Bl -tag -width Ds +.It Fl d , -secure +Causes a secure TRIM to be initiated. +When performing a secure TRIM, the +device guarantees that data stored on the trimmed blocks has been erased. +This requires support from the device and is not supported by all SSDs. +.It Fl r , -rate Ar rate +Controls the rate at which the TRIM operation progresses. +Without this +option TRIM is executed as quickly as possible. +The rate, expressed in bytes +per second, is applied on a per-vdev basis and may be set differently for +each leaf vdev. +.It Fl c , -cancel +Cancel trimming on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +trimmed, the command will fail and no cancellation will occur on any device. +.It Fl s , -suspend +Suspend trimming on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +trimmed, the command will fail and no suspension will occur on any device. +Trimming can then be resumed by running +.Nm zpool Cm trim +with no flags on the relevant target devices. +.It Fl w , -wait +Wait until the devices are done being trimmed before returning. +.El +. 
+.Sh SEE ALSO +.Xr zpoolprops 7 , +.Xr zpool-initialize 8 , +.Xr zpool-wait 8 diff --git a/man/man8/zpool-upgrade.8 b/man/man8/zpool-upgrade.8 new file mode 100644 index 0000000000..1b13bad898 --- /dev/null +++ b/man/man8/zpool-upgrade.8 @@ -0,0 +1,109 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2021, Colm Buckley +.\" +.Dd August 9, 2019 +.Dt ZPOOL-UPGRADE 8 +.Os +. +.Sh NAME +.Nm zpool-upgrade +.Nd manage version and feature flags of ZFS storage pools +.Sh SYNOPSIS +.Nm zpool +.Cm upgrade +.Nm zpool +.Cm upgrade +.Fl v +.Nm zpool +.Cm upgrade +.Op Fl V Ar version +.Fl a Ns | Ns Ar pool Ns … +. 
+.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm zpool +.Cm upgrade +.Xc +Displays pools which do not have all supported features enabled and pools +formatted using a legacy ZFS version number. +These pools can continue to be used, but some features may not be available. +Use +.Nm zpool Cm upgrade Fl a +to enable all features on all pools (subject to the +.Fl o Sy compatibility +property). +.It Xo +.Nm zpool +.Cm upgrade +.Fl v +.Xc +Displays legacy ZFS versions supported by this version of ZFS. +See +.Xr zpool-features 7 +for a description of feature flags features supported by this version of ZFS. +.It Xo +.Nm zpool +.Cm upgrade +.Op Fl V Ar version +.Fl a Ns | Ns Ar pool Ns … +.Xc +Enables all supported features on the given pool. +.Pp +If the pool has specified compatibility feature sets using the +.Fl o Sy compatibility +property, only the features present in all requested compatibility sets will be +enabled. +If this property is set to +.Ar legacy +then no upgrade will take place. +.Pp +Once this is done, the pool will no longer be accessible on systems that do not +support feature flags. +See +.Xr zpool-features 7 +for details on compatibility with systems that support feature flags, but do not +support all features enabled on the pool. +.Bl -tag -width Ds +.It Fl a +Enables all supported features (from specified compatibility sets, if any) on all +pools. +.It Fl V Ar version +Upgrade to the specified legacy version. +If specified, no features will be enabled on the pool. +This option can only be used to increase the version number up to the last +supported legacy version number. +.El +.El +.
+.Sh SEE ALSO +.Xr zpool-features 7 , +.Xr zpoolconcepts 7 , +.Xr zpoolprops 7 , +.Xr zpool-history 8 diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 new file mode 100644 index 0000000000..38f4812ace --- /dev/null +++ b/man/man8/zpool-wait.8 @@ -0,0 +1,116 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 27, 2021 +.Dt ZPOOL-WAIT 8 +.Os +. +.Sh NAME +.Nm zpool-wait +.Nd wait for activity to stop in a ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm wait +.Op Fl Hp +.Op Fl T Sy u Ns | Ns Sy d +.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns … +.Ar pool +.Op Ar interval +. +.Sh DESCRIPTION +Waits until all background activity of the given types has ceased in the given +pool. 
+The activity could cease because it has completed, or because it has been +paused or canceled by a user, or because the pool has been exported or +destroyed. +If no activities are specified, the command waits until background activity of +every type listed below has ceased. +If there is no activity of the given types in progress, the command returns +immediately. +.Pp +These are the possible values for +.Ar activity , +along with what each one waits for: +.Bl -tag -compact -offset Ds -width "initialize" +.It Sy discard +Checkpoint to be discarded +.It Sy free +.Sy freeing +property to become +.Sy 0 +.It Sy initialize +All initializations to cease +.It Sy replace +All device replacements to cease +.It Sy remove +Device removal to cease +.It Sy resilver +Resilver to cease +.It Sy scrub +Scrub to cease +.It Sy trim +Manual trim to cease +.El +.Pp +If an +.Ar interval +is provided, the amount of work remaining, in bytes, for each activity is +printed every +.Ar interval +seconds. +.Bl -tag -width Ds +.It Fl H +Scripted mode. +Do not display headers, and separate fields by a single tab instead of arbitrary +space. +.It Fl p +Display numbers in parsable (exact) values. +.It Fl T Sy u Ns | Ns Sy d +Display a time stamp. +Specify +.Sy u +for a printed representation of the internal representation of time. +See +.Xr time 2 . +Specify +.Sy d +for standard date format. +See +.Xr date 1 . +.El +. +.Sh SEE ALSO +.Xr zpool-checkpoint 8 , +.Xr zpool-initialize 8 , +.Xr zpool-remove 8 , +.Xr zpool-replace 8 , +.Xr zpool-resilver 8 , +.Xr zpool-scrub 8 , +.Xr zpool-status 8 , +.Xr zpool-trim 8 diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index bdad81149b..192a8e2eac 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -18,7 +18,6 @@ .\" .\" CDDL HEADER END .\" -.\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. 
@@ -27,9 +26,10 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 2, 2019 -.Dt ZPOOL 8 SMM -.Os Linux +.Dd June 2, 2021 +.Dt ZPOOL 8 +.Os +. .Sh NAME .Nm zpool .Nd configure ZFS storage pools @@ -37,181 +37,11 @@ .Nm .Fl ?V .Nm -.Cm add -.Op Fl fgLnP -.Oo Fl o Ar property Ns = Ns Ar value Oc -.Ar pool vdev Ns ... -.Nm -.Cm attach -.Op Fl f -.Oo Fl o Ar property Ns = Ns Ar value Oc -.Ar pool device new_device -.Nm -.Cm checkpoint -.Op Fl d, -discard -.Ar pool -.Nm -.Cm clear -.Ar pool -.Op Ar device -.Nm -.Cm create -.Op Fl dfn -.Op Fl m Ar mountpoint -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Oo Fl o Ar feature@feature Ns = Ns Ar value Oc -.Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Ar pool vdev Ns ... -.Nm -.Cm destroy -.Op Fl f -.Ar pool -.Nm -.Cm detach -.Ar pool device -.Nm -.Cm events -.Op Fl vHf Oo Ar pool Oc | Fl c -.Nm -.Cm export -.Op Fl a -.Op Fl f -.Ar pool Ns ... -.Nm -.Cm get -.Op Fl Hp -.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... -.Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ... -.Oo Ar pool Oc Ns ... -.Nm -.Cm history -.Op Fl il -.Oo Ar pool Oc Ns ... -.Nm -.Cm import -.Op Fl D -.Op Fl d Ar dir Ns | Ns device -.Nm -.Cm import -.Fl a -.Op Fl DflmN -.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc -.Op Fl -rewind-to-checkpoint -.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device -.Op Fl o Ar mntopts -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Nm -.Cm import -.Op Fl Dflm -.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc -.Op Fl -rewind-to-checkpoint -.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device -.Op Fl o Ar mntopts -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Op Fl s -.Ar pool Ns | Ns Ar id -.Op Ar newpool Oo Fl t Oc -.Nm -.Cm initialize -.Op Fl c | Fl s -.Ar pool -.Op Ar device Ns ... 
-.Nm -.Cm iostat -.Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw -.Op Fl T Sy u Ns | Ns Sy d -.Op Fl ghHLnpPvy -.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc -.Op Ar interval Op Ar count -.Nm -.Cm labelclear -.Op Fl f -.Ar device -.Nm -.Cm list -.Op Fl HgLpPv -.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... -.Op Fl T Sy u Ns | Ns Sy d -.Oo Ar pool Oc Ns ... -.Op Ar interval Op Ar count -.Nm -.Cm offline -.Op Fl f -.Op Fl t -.Ar pool Ar device Ns ... -.Nm -.Cm online -.Op Fl e -.Ar pool Ar device Ns ... -.Nm -.Cm reguid -.Ar pool -.Nm -.Cm reopen -.Op Fl n -.Ar pool -.Nm -.Cm remove -.Op Fl np -.Ar pool Ar device Ns ... -.Nm -.Cm remove -.Fl s -.Ar pool -.Nm -.Cm replace -.Op Fl f -.Oo Fl o Ar property Ns = Ns Ar value Oc -.Ar pool Ar device Op Ar new_device -.Nm -.Cm resilver -.Ar pool Ns ... -.Nm -.Cm scrub -.Op Fl s | Fl p -.Ar pool Ns ... -.Nm -.Cm trim -.Op Fl d -.Op Fl r Ar rate -.Op Fl c | Fl s -.Ar pool -.Op Ar device Ns ... -.Nm -.Cm set -.Ar property Ns = Ns Ar value -.Ar pool -.Nm -.Cm split -.Op Fl gLlnP -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Ar pool newpool -.Oo Ar device Oc Ns ... -.Nm -.Cm status -.Oo Fl c Ar SCRIPT Oc -.Op Fl DigLpPstvx -.Op Fl T Sy u Ns | Ns Sy d -.Oo Ar pool Oc Ns ... -.Op Ar interval Op Ar count -.Nm -.Cm sync -.Oo Ar pool Oc Ns ... -.Nm -.Cm upgrade -.Nm -.Cm upgrade -.Fl v -.Nm -.Cm upgrade -.Op Fl V Ar version -.Fl a Ns | Ns Ar pool Ns ... -.Nm .Cm version +.Nm +.Cm subcommand +.Op Ar arguments +. .Sh DESCRIPTION The .Nm @@ -222,706 +52,12 @@ All datasets within a storage pool share the same space. See .Xr zfs 8 for information on managing datasets. -.Ss Virtual Devices (vdevs) -A "virtual device" describes a single device or a collection of devices -organized according to certain performance and fault characteristics.
-The following virtual devices are supported: -.Bl -tag -width Ds -.It Sy disk -A block device, typically located under -.Pa /dev . -ZFS can use individual slices or partitions, though the recommended mode of -operation is to use whole disks. -A disk can be specified by a full path, or it can be a shorthand name -.Po the relative portion of the path under -.Pa /dev -.Pc . -A whole disk can be specified by omitting the slice or partition designation. -For example, -.Pa sda -is equivalent to -.Pa /dev/sda . -When given a whole disk, ZFS automatically labels the disk, if necessary. -.It Sy file -A regular file. -The use of files as a backing store is strongly discouraged. -It is designed primarily for experimental purposes, as the fault tolerance of a -file is only as good as the file system of which it is a part. -A file must be specified by a full path. -.It Sy mirror -A mirror of two or more devices. -Data is replicated in an identical fashion across all components of a mirror. -A mirror with N disks of size X can hold X bytes and can withstand (N-1) devices -failing before data integrity is compromised. -.It Sy raidz , raidz1 , raidz2 , raidz3 -A variation on RAID-5 that allows for better distribution of parity and -eliminates the RAID-5 -.Qq write hole -.Pq in which data and parity become inconsistent after a power loss . -Data and parity is striped across all disks within a raidz group. .Pp -A raidz group can have single-, double-, or triple-parity, meaning that the -raidz group can sustain one, two, or three failures, respectively, without -losing any data. -The -.Sy raidz1 -vdev type specifies a single-parity raidz group; the -.Sy raidz2 -vdev type specifies a double-parity raidz group; and the -.Sy raidz3 -vdev type specifies a triple-parity raidz group. -The -.Sy raidz -vdev type is an alias for -.Sy raidz1 . 
-.Pp -A raidz group with N disks of size X with P parity disks can hold approximately -(N-P)*X bytes and can withstand P device(s) failing before data integrity is -compromised. -The minimum number of devices in a raidz group is one more than the number of -parity disks. -The recommended number is between 3 and 9 to help increase performance. -.It Sy spare -A pseudo-vdev which keeps track of available hot spares for a pool. -For more information, see the -.Sx Hot Spares -section. -.It Sy log -A separate intent log device. -If more than one log device is specified, then writes are load-balanced between -devices. -Log devices can be mirrored. -However, raidz vdev types are not supported for the intent log. -For more information, see the -.Sx Intent Log -section. -.It Sy dedup -A device dedicated solely for deduplication tables. -The redundancy of this device should match the redundancy of the other normal -devices in the pool. If more than one dedup device is specified, then -allocations are load-balanced between those devices. -.It Sy special -A device dedicated solely for allocating various kinds of internal metadata, -and optionally small file blocks. -The redundancy of this device should match the redundancy of the other normal -devices in the pool. If more than one special device is specified, then -allocations are load-balanced between those devices. -.Pp -For more information on special allocations, see the -.Sx Special Allocation Class -section. -.It Sy cache -A device used to cache storage pool data. -A cache device cannot be configured as a mirror or raidz group. -For more information, see the -.Sx Cache Devices -section. -.El -.Pp -Virtual devices cannot be nested, so a mirror or raidz virtual device can only -contain files or disks. -Mirrors of mirrors -.Pq or other combinations -are not allowed. -.Pp -A pool can have any number of virtual devices at the top of the configuration -.Po known as -.Qq root vdevs -.Pc . 
-Data is dynamically distributed across all top-level devices to balance data -among devices. -As new virtual devices are added, ZFS automatically places data on the newly -available devices. -.Pp -Virtual devices are specified one at a time on the command line, separated by -whitespace. -The keywords -.Sy mirror -and -.Sy raidz -are used to distinguish where a group ends and another begins. -For example, the following creates two root vdevs, each a mirror of two disks: -.Bd -literal -# zpool create mypool mirror sda sdb mirror sdc sdd -.Ed -.Ss Device Failure and Recovery -ZFS supports a rich set of mechanisms for handling device failure and data -corruption. -All metadata and data is checksummed, and ZFS automatically repairs bad data -from a good copy when corruption is detected. -.Pp -In order to take advantage of these features, a pool must make use of some form -of redundancy, using either mirrored or raidz groups. -While ZFS supports running in a non-redundant configuration, where each root -vdev is simply a disk or file, this is strongly discouraged. -A single case of bit corruption can render some or all of your data unavailable. -.Pp -A pool's health status is described by one of three states: online, degraded, -or faulted. -An online pool has all devices operating normally. -A degraded pool is one in which one or more devices have failed, but the data is -still available due to a redundant configuration. -A faulted pool has corrupted metadata, or one or more faulted devices, and -insufficient replicas to continue functioning. -.Pp -The health of the top-level vdev, such as mirror or raidz device, is -potentially impacted by the state of its associated vdevs, or component -devices. -A top-level vdev or component device is in one of the following states: -.Bl -tag -width "DEGRADED" -.It Sy DEGRADED -One or more top-level vdevs is in the degraded state because one or more -component devices are offline. -Sufficient replicas exist to continue functioning. 
-.Pp -One or more component devices is in the degraded or faulted state, but -sufficient replicas exist to continue functioning. -The underlying conditions are as follows: -.Bl -bullet -.It -The number of checksum errors exceeds acceptable levels and the device is -degraded as an indication that something may be wrong. -ZFS continues to use the device as necessary. -.It -The number of I/O errors exceeds acceptable levels. -The device could not be marked as faulted because there are insufficient -replicas to continue functioning. -.El -.It Sy FAULTED -One or more top-level vdevs is in the faulted state because one or more -component devices are offline. -Insufficient replicas exist to continue functioning. -.Pp -One or more component devices is in the faulted state, and insufficient -replicas exist to continue functioning. -The underlying conditions are as follows: -.Bl -bullet -.It -The device could be opened, but the contents did not match expected values. -.It -The number of I/O errors exceeds acceptable levels and the device is faulted to -prevent further use of the device. -.El -.It Sy OFFLINE -The device was explicitly taken offline by the -.Nm zpool Cm offline -command. -.It Sy ONLINE -The device is online and functioning. -.It Sy REMOVED -The device was physically removed while the system was running. -Device removal detection is hardware-dependent and may not be supported on all -platforms. -.It Sy UNAVAIL -The device could not be opened. -If a pool is imported when a device was unavailable, then the device will be -identified by a unique identifier instead of its path since the path was never -correct in the first place. -.El -.Pp -If a device is removed and later re-attached to the system, ZFS attempts -to put the device online automatically. -Device attach detection is hardware-dependent and might not be supported on all -platforms. -.Ss Hot Spares -ZFS allows devices to be associated with pools as -.Qq hot spares . 
-These devices are not actively used in the pool, but when an active device -fails, it is automatically replaced by a hot spare. -To create a pool with hot spares, specify a -.Sy spare -vdev with any number of devices. -For example, -.Bd -literal -# zpool create pool mirror sda sdb spare sdc sdd -.Ed -.Pp -Spares can be shared across multiple pools, and can be added with the -.Nm zpool Cm add -command and removed with the -.Nm zpool Cm remove -command. -Once a spare replacement is initiated, a new -.Sy spare -vdev is created within the configuration that will remain there until the -original device is replaced. -At this point, the hot spare becomes available again if another device fails. -.Pp -If a pool has a shared spare that is currently being used, the pool can not be -exported since other pools may use this shared spare, which may lead to -potential data corruption. -.Pp -Shared spares add some risk. If the pools are imported on different hosts, and -both pools suffer a device failure at the same time, both could attempt to use -the spare at the same time. This may not be detected, resulting in data -corruption. -.Pp -An in-progress spare replacement can be cancelled by detaching the hot spare. -If the original faulted device is detached, then the hot spare assumes its -place in the configuration, and is removed from the spare list of all active -pools. -.Pp -Spares cannot replace log devices. -.Ss Intent Log -The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous -transactions. -For instance, databases often require their transactions to be on stable storage -devices when returning from a system call. -NFS and other applications can also use -.Xr fsync 2 -to ensure data stability. -By default, the intent log is allocated from blocks within the main pool. -However, it might be possible to get better performance using separate intent -log devices such as NVRAM or a dedicated disk. 
-For example: -.Bd -literal -# zpool create pool sda sdb log sdc -.Ed -.Pp -Multiple log devices can also be specified, and they can be mirrored. -See the -.Sx EXAMPLES -section for an example of mirroring multiple log devices. -.Pp -Log devices can be added, replaced, attached, detached and removed. In -addition, log devices are imported and exported as part of the pool -that contains them. -Mirrored devices can be removed by specifying the top-level mirror vdev. -.Ss Cache Devices -Devices can be added to a storage pool as -.Qq cache devices . -These devices provide an additional layer of caching between main memory and -disk. -For read-heavy workloads, where the working set size is much larger than what -can be cached in main memory, using cache devices allow much more of this -working set to be served from low latency media. -Using cache devices provides the greatest performance improvement for random -read-workloads of mostly static content. -.Pp -To create a pool with cache devices, specify a -.Sy cache -vdev with any number of devices. -For example: -.Bd -literal -# zpool create pool sda sdb cache sdc sdd -.Ed -.Pp -Cache devices cannot be mirrored or part of a raidz configuration. -If a read error is encountered on a cache device, that read I/O is reissued to -the original storage pool device, which might be part of a mirrored or raidz -configuration. -.Pp -The content of the cache devices is considered volatile, as is the case with -other system caches. -.Ss Pool checkpoint -Before starting critical procedures that include destructive actions (e.g -.Nm zfs Cm destroy -), an administrator can checkpoint the pool's state and in the case of a -mistake or failure, rewind the entire pool back to the checkpoint. -Otherwise, the checkpoint can be discarded when the procedure has completed -successfully. 
-.Pp -A pool checkpoint can be thought of as a pool-wide snapshot and should be used -with care as it contains every part of the pool's state, from properties to vdev -configuration. -Thus, while a pool has a checkpoint certain operations are not allowed. -Specifically, vdev removal/attach/detach, mirror splitting, and -changing the pool's guid. -Adding a new vdev is supported but in the case of a rewind it will have to be -added again. -Finally, users of this feature should keep in mind that scrubs in a pool that -has a checkpoint do not repair checkpointed data. -.Pp -To create a checkpoint for a pool: -.Bd -literal -# zpool checkpoint pool -.Ed -.Pp -To later rewind to its checkpointed state, you need to first export it and -then rewind it during import: -.Bd -literal -# zpool export pool -# zpool import --rewind-to-checkpoint pool -.Ed -.Pp -To discard the checkpoint from a pool: -.Bd -literal -# zpool checkpoint -d pool -.Ed -.Pp -Dataset reservations (controlled by the -.Nm reservation -or -.Nm refreservation -zfs properties) may be unenforceable while a checkpoint exists, because the -checkpoint is allowed to consume the dataset's reservation. -Finally, data that is part of the checkpoint but has been freed in the -current state of the pool won't be scanned during a scrub. -.Ss Special Allocation Class -The allocations in the special class are dedicated to specific block types. -By default this includes all metadata, the indirect blocks of user data, and -any deduplication tables. The class can also be provisioned to accept -small file blocks. -.Pp -A pool must always have at least one normal (non-dedup/special) vdev before -other devices can be assigned to the special class. If the special class -becomes full, then allocations intended for it will spill back into the -normal class. -.Pp -Deduplication tables can be excluded from the special class by setting the -.Sy zfs_ddt_data_is_special -zfs module parameter to false (0). 
-.Pp -Inclusion of small file blocks in the special class is opt-in. Each dataset -can control the size of small file blocks allowed in the special class by -setting the -.Sy special_small_blocks -dataset property. It defaults to zero, so you must opt-in by setting it to a -non-zero value. See -.Xr zfs 8 -for more info on setting this property. -.Ss Properties -Each pool has several properties associated with it. -Some properties are read-only statistics while others are configurable and -change the behavior of the pool. -.Pp -The following are read-only properties: -.Bl -tag -width Ds -.It Cm allocated -Amount of storage used within the pool. -See -.Sy fragmentation -and -.Sy free -for more information. -.It Sy capacity -Percentage of pool space used. -This property can also be referred to by its shortened column name, -.Sy cap . -.It Sy expandsize -Amount of uninitialized space within the pool or device that can be used to -increase the total capacity of the pool. -Uninitialized space consists of any space on an EFI labeled vdev which has not -been brought online -.Po e.g, using -.Nm zpool Cm online Fl e -.Pc . -This space occurs when a LUN is dynamically expanded. -.It Sy fragmentation -The amount of fragmentation in the pool. As the amount of space -.Sy allocated -increases, it becomes more difficult to locate -.Sy free -space. This may result in lower write performance compared to pools with more -unfragmented free space. -.It Sy free -The amount of free space available in the pool. -By contrast, the -.Xr zfs 8 -.Sy available -property describes how much new data can be written to ZFS filesystems/volumes. -The zpool -.Sy free -property is not generally useful for this purpose, and can be substantially more than the zfs -.Sy available -space. 
This discrepancy is due to several factors, including raidz party; zfs -reservation, quota, refreservation, and refquota properties; and space set aside by -.Sy spa_slop_shift -(see -.Xr zfs-module-parameters 5 -for more information). -.It Sy freeing -After a file system or snapshot is destroyed, the space it was using is -returned to the pool asynchronously. -.Sy freeing -is the amount of space remaining to be reclaimed. -Over time -.Sy freeing -will decrease while -.Sy free -increases. -.It Sy health -The current health of the pool. -Health can be one of -.Sy ONLINE , DEGRADED , FAULTED , OFFLINE, REMOVED , UNAVAIL . -.It Sy guid -A unique identifier for the pool. -.It Sy load_guid -A unique identifier for the pool. -Unlike the -.Sy guid -property, this identifier is generated every time we load the pool (e.g. does -not persist across imports/exports) and never changes while the pool is loaded -(even if a -.Sy reguid -operation takes place). -.It Sy size -Total size of the storage pool. -.It Sy unsupported@ Ns Em feature_guid -Information about unsupported features that are enabled on the pool. -See -.Xr zpool-features 5 -for details. -.El -.Pp -The space usage properties report actual physical space available to the -storage pool. -The physical space can be different from the total amount of space that any -contained datasets can actually use. -The amount of space used in a raidz configuration depends on the characteristics -of the data being written. -In addition, ZFS reserves some space for internal accounting that the -.Xr zfs 8 -command takes into account, but the -.Nm -command does not. -For non-full pools of a reasonable size, these effects should be invisible. -For small pools, or pools that are close to being completely full, these -discrepancies may become more noticeable. -.Pp -The following property can be set at creation time and import time: -.Bl -tag -width Ds -.It Sy altroot -Alternate root directory. 
-If set, this directory is prepended to any mount points within the pool. -This can be used when examining an unknown pool where the mount points cannot be -trusted, or in an alternate boot environment, where the typical paths are not -valid. -.Sy altroot -is not a persistent property. -It is valid only while the system is up. -Setting -.Sy altroot -defaults to using -.Sy cachefile Ns = Ns Sy none , -though this may be overridden using an explicit setting. -.El -.Pp -The following property can be set only at import time: -.Bl -tag -width Ds -.It Sy readonly Ns = Ns Sy on Ns | Ns Sy off -If set to -.Sy on , -the pool will be imported in read-only mode. -This property can also be referred to by its shortened column name, -.Sy rdonly . -.El -.Pp -The following properties can be set at creation time and import time, and later -changed with the -.Nm zpool Cm set -command: -.Bl -tag -width Ds -.It Sy ashift Ns = Ns Sy ashift -Pool sector size exponent, to the power of -.Sy 2 -(internally referred to as -.Sy ashift -). Values from 9 to 16, inclusive, are valid; also, the -value 0 (the default) means to auto-detect using the kernel's block -layer and a ZFS internal exception list. I/O operations will be aligned -to the specified size boundaries. Additionally, the minimum (disk) -write size will be set to the specified size, so this represents a -space vs. performance trade-off. For optimal performance, the pool -sector size should be greater than or equal to the sector size of the -underlying disks. The typical case for setting this property is when -performance is important and the underlying disks use 4KiB sectors but -report 512B sectors to the OS (for compatibility reasons); in that -case, set -.Sy ashift=12 -(which is 1<<12 = 4096). When set, this property is -used as the default hint value in subsequent vdev operations (add, -attach and replace). 
Changing this value will not modify any existing -vdev, not even on disk replacement; however it can be used, for -instance, to replace a dying 512B sectors disk with a newer 4KiB -sectors device: this will probably result in bad performance but at the -same time could prevent loss of data. -.It Sy autoexpand Ns = Ns Sy on Ns | Ns Sy off -Controls automatic pool expansion when the underlying LUN is grown. -If set to -.Sy on , -the pool will be resized according to the size of the expanded device. -If the device is part of a mirror or raidz then all devices within that -mirror/raidz group must be expanded before the new space is made available to -the pool. -The default behavior is -.Sy off . -This property can also be referred to by its shortened column name, -.Sy expand . -.It Sy autoreplace Ns = Ns Sy on Ns | Ns Sy off -Controls automatic device replacement. -If set to -.Sy off , -device replacement must be initiated by the administrator by using the -.Nm zpool Cm replace -command. -If set to -.Sy on , -any new device, found in the same physical location as a device that previously -belonged to the pool, is automatically formatted and replaced. -The default behavior is -.Sy off . -This property can also be referred to by its shortened column name, -.Sy replace . -Autoreplace can also be used with virtual disks (like device -mapper) provided that you use the /dev/disk/by-vdev paths setup by -vdev_id.conf. See the -.Xr vdev_id 8 -man page for more details. -Autoreplace and autoonline require the ZFS Event Daemon be configured and -running. See the -.Xr zed 8 -man page for more details. -.It Sy bootfs Ns = Ns Sy (unset) Ns | Ns Ar pool Ns / Ns Ar dataset -Identifies the default bootable dataset for the root pool. This property is -expected to be set mainly by the installation and upgrade programs. -Not all Linux distribution boot processes use the bootfs property. 
-.It Sy cachefile Ns = Ns Ar path Ns | Ns Sy none -Controls the location of where the pool configuration is cached. -Discovering all pools on system startup requires a cached copy of the -configuration data that is stored on the root file system. -All pools in this cache are automatically imported when the system boots. -Some environments, such as install and clustering, need to cache this -information in a different location so that pools are not automatically -imported. -Setting this property caches the pool configuration in a different location that -can later be imported with -.Nm zpool Cm import Fl c . -Setting it to the value -.Sy none -creates a temporary pool that is never cached, and the -.Qq -.Pq empty string -uses the default location. -.Pp -Multiple pools can share the same cache file. -Because the kernel destroys and recreates this file when pools are added and -removed, care should be taken when attempting to access this file. -When the last pool using a -.Sy cachefile -is exported or destroyed, the file will be empty. -.It Sy comment Ns = Ns Ar text -A text string consisting of printable ASCII characters that will be stored -such that it is available even if the pool becomes faulted. -An administrator can provide additional information about a pool using this -property. -.It Sy dedupditto Ns = Ns Ar number -This property is deprecated. In a future release, it will no longer have any -effect. -.Pp -Threshold for the number of block ditto copies. -If the reference count for a deduplicated block increases above this number, a -new ditto copy of this block is automatically stored. -The default setting is -.Sy 0 -which causes no ditto copies to be created for deduplicated blocks. -The minimum legal nonzero setting is -.Sy 100 . -.It Sy delegation Ns = Ns Sy on Ns | Ns Sy off -Controls whether a non-privileged user is granted access based on the dataset -permissions defined on the dataset. 
-See -.Xr zfs 8 -for more information on ZFS delegated administration. -.It Sy failmode Ns = Ns Sy wait Ns | Ns Sy continue Ns | Ns Sy panic -Controls the system behavior in the event of catastrophic pool failure. -This condition is typically a result of a loss of connectivity to the underlying -storage device(s) or a failure of all devices within the pool. -The behavior of such an event is determined as follows: -.Bl -tag -width "continue" -.It Sy wait -Blocks all I/O access until the device connectivity is recovered and the errors -are cleared. -This is the default behavior. -.It Sy continue -Returns -.Er EIO -to any new write I/O requests but allows reads to any of the remaining healthy -devices. -Any write requests that have yet to be committed to disk would be blocked. -.It Sy panic -Prints out a message to the console and generates a system crash dump. -.El -.It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off -When set to -.Sy on -space which has been recently freed, and is no longer allocated by the pool, -will be periodically trimmed. This allows block device vdevs which support -BLKDISCARD, such as SSDs, or file vdevs on which the underlying file system -supports hole-punching, to reclaim unused blocks. The default setting for -this property is -.Sy off . -.Pp -Automatic TRIM does not immediately reclaim blocks after a free. Instead, -it will optimistically delay allowing smaller ranges to be aggregated in to -a few larger ones. These can then be issued more efficiently to the storage. -.Pp -Be aware that automatic trimming of recently freed data blocks can put -significant stress on the underlying storage devices. This will vary -depending of how well the specific device handles these commands. For -lower end devices it is often possible to achieve most of the benefits -of automatic trimming by running an on-demand (manual) TRIM periodically -using the -.Nm zpool Cm trim -command. 
-.It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled -The value of this property is the current state of -.Ar feature_name . -The only valid value when setting this property is -.Sy enabled -which moves -.Ar feature_name -to the enabled state. -See -.Xr zpool-features 5 -for details on feature states. -.It Sy listsnapshots Ns = Ns Sy on Ns | Ns Sy off -Controls whether information about snapshots associated with this pool is -output when -.Nm zfs Cm list -is run without the -.Fl t -option. -The default value is -.Sy off . -This property can also be referred to by its shortened name, -.Sy listsnaps . -.It Sy multihost Ns = Ns Sy on Ns | Ns Sy off -Controls whether a pool activity check should be performed during -.Nm zpool Cm import . -When a pool is determined to be active it cannot be imported, even with the -.Fl f -option. This property is intended to be used in failover configurations -where multiple hosts have access to a pool on shared storage. -.Pp -Multihost provides protection on import only. It does not protect against an -individual device being used in multiple pools, regardless of the type of vdev. -See the discussion under -.Sy zpool create. -.Pp -When this property is on, periodic writes to storage occur to show the pool is -in use. See -.Sy zfs_multihost_interval -in the -.Xr zfs-module-parameters 5 -man page. In order to enable this property each host must set a unique hostid. -See -.Xr genhostid 1 -.Xr zgenhostid 8 -.Xr spl-module-parameters 5 -for additional details. The default value is -.Sy off . -.It Sy version Ns = Ns Ar version -The current on-disk version of the pool. -This can be increased, but never decreased. -The preferred method of updating pools is with the -.Nm zpool Cm upgrade -command, though this property can be used when a specific version is needed for -backwards compatibility. -Once feature flags are enabled on a pool this property will no longer have a -value. 
-.El -.Ss Subcommands +For an overview of creating and managing ZFS storage pools see the +.Xr zpoolconcepts 7 +manual page. +. +.Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Pp @@ -933,401 +69,70 @@ The following subcommands are supported: .Bl -tag -width Ds .It Xo .Nm -.Fl ? +.Fl ?\& .Xc Displays a help message. .It Xo .Nm -.Fl V, -version +.Fl V , -version .Xc -An alias for the -.Nm zpool Cm version -subcommand. .It Xo .Nm -.Cm add -.Op Fl fgLnP -.Oo Fl o Ar property Ns = Ns Ar value Oc -.Ar pool vdev Ns ... +.Cm version .Xc -Adds the specified virtual devices to the given pool. -The -.Ar vdev -specification is described in the -.Sx Virtual Devices -section. -The behavior of the -.Fl f -option, and the device checks performed are described in the -.Nm zpool Cm create -subcommand. -.Bl -tag -width Ds -.It Fl f -Forces use of -.Ar vdev Ns s , -even if they appear in use or specify a conflicting replication level. -Not all devices can be overridden in this manner. -.It Fl g -Display -.Ar vdev , -GUIDs instead of the normal device names. These GUIDs can be used in place of -device names for the zpool detach/offline/remove/replace commands. -.It Fl L -Display real paths for -.Ar vdev Ns s -resolving all symbolic links. This can be used to look up the current block -device name regardless of the /dev/disk/ path used to open it. -.It Fl n -Displays the configuration that would be used without actually adding the -.Ar vdev Ns s . -The actual pool creation can still fail due to insufficient privileges or -device sharing. -.It Fl P -Display real paths for -.Ar vdev Ns s -instead of only the last component of the path. This can be used in -conjunction with the -.Fl L -flag. -.It Fl o Ar property Ns = Ns Ar value -Sets the given pool properties. See the -.Sx Properties -section for a list of valid properties that can be set. The only property -supported at the moment is ashift. 
+Displays the software version of the +.Nm +userland utility and the ZFS kernel module. .El -.It Xo -.Nm -.Cm attach -.Op Fl f -.Oo Fl o Ar property Ns = Ns Ar value Oc -.Ar pool device new_device -.Xc -Attaches -.Ar new_device -to the existing -.Ar device . -The existing device cannot be part of a raidz configuration. -If -.Ar device -is not currently part of a mirrored configuration, -.Ar device -automatically transforms into a two-way mirror of -.Ar device -and -.Ar new_device . -If -.Ar device -is part of a two-way mirror, attaching -.Ar new_device -creates a three-way mirror, and so on. -In either case, -.Ar new_device -begins to resilver immediately. +. +.Ss Creation .Bl -tag -width Ds -.It Fl f -Forces use of -.Ar new_device , -even if it appears to be in use. -Not all devices can be overridden in this manner. -.It Fl o Ar property Ns = Ns Ar value -Sets the given pool properties. See the -.Sx Properties -section for a list of valid properties that can be set. The only property -supported at the moment is ashift. -.El -.It Xo -.Nm -.Cm checkpoint -.Op Fl d, -discard -.Ar pool -.Xc -Checkpoints the current state of -.Ar pool -, which can be later restored by -.Nm zpool Cm import --rewind-to-checkpoint . -The existence of a checkpoint in a pool prohibits the following -.Nm zpool -commands: -.Cm remove , -.Cm attach , -.Cm detach , -.Cm split , -and -.Cm reguid . -In addition, it may break reservation boundaries if the pool lacks free -space. -The -.Nm zpool Cm status -command indicates the existence of a checkpoint or the progress of discarding a -checkpoint from a pool. -The -.Nm zpool Cm list -command reports how much space the checkpoint takes from the pool. -.Bl -tag -width Ds -.It Fl d, -discard -Discards an existing checkpoint from -.Ar pool . -.El -.It Xo -.Nm -.Cm clear -.Ar pool -.Op Ar device -.Xc -Clears device errors in a pool. -If no arguments are specified, all device errors within the pool are cleared. 
-If one or more devices is specified, only those errors associated with the -specified device or devices are cleared. -If multihost is enabled, and the pool has been suspended, this will not -resume I/O. While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. -.It Xo -.Nm -.Cm create -.Op Fl dfn -.Op Fl m Ar mountpoint -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Oo Fl o Ar feature@feature Ns = Ns Ar value Oc Ns ... -.Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Op Fl t Ar tname -.Ar pool vdev Ns ... -.Xc +.It Xr zpool-create 8 Creates a new storage pool containing the virtual devices specified on the command line. -The pool name must begin with a letter, and can only contain -alphanumeric characters as well as underscore -.Pq Qq Sy _ , -dash -.Pq Qq Sy \&- , -colon -.Pq Qq Sy \&: , -space -.Pq Qq Sy \&\ , -and period -.Pq Qq Sy \&. . -The pool names -.Sy mirror , -.Sy raidz , -.Sy spare -and -.Sy log -are reserved, as are names beginning with -.Sy mirror , -.Sy raidz , -.Sy spare , -and the pattern -.Sy c[0-9] . -The -.Ar vdev -specification is described in the -.Sx Virtual Devices -section. -.Pp -The command attempts to verify that each device specified is accessible and not -currently in use by another subsystem. However this check is not robust enough -to detect simultaneous attempts to use a new device in different pools, even if -.Sy multihost -is -.Sy enabled. -The -administrator must ensure that simultaneous invocations of any combination of -.Sy zpool replace , -.Sy zpool create , -.Sy zpool add , -or -.Sy zpool labelclear , -do not refer to the same device. Using the same device in two pools will -result in pool corruption. -.Pp -There are some uses, such as being currently mounted, or specified as the -dedicated dump device, that prevents a device from ever being used by ZFS. 
-Other uses, such as having a preexisting UFS file system, can be overridden with -the -.Fl f -option. -.Pp -The command also checks that the replication strategy for the pool is -consistent. -An attempt to combine redundant and non-redundant storage in a single pool, or -to mix disks and files, results in an error unless -.Fl f -is specified. -The use of differently sized devices within a single raidz or mirror group is -also flagged as an error unless -.Fl f -is specified. -.Pp -Unless the -.Fl R -option is specified, the default mount point is -.Pa / Ns Ar pool . -The mount point must not exist or must be empty, or else the root dataset -cannot be mounted. -This can be overridden with the -.Fl m -option. -.Pp -By default all supported features are enabled on the new pool unless the -.Fl d -option is specified. -.Bl -tag -width Ds -.It Fl d -Do not enable any features on the new pool. -Individual features can be enabled by setting their corresponding properties to -.Sy enabled -with the -.Fl o -option. -See -.Xr zpool-features 5 -for details about feature properties. -.It Fl f -Forces use of -.Ar vdev Ns s , -even if they appear in use or specify a conflicting replication level. -Not all devices can be overridden in this manner. -.It Fl m Ar mountpoint -Sets the mount point for the root dataset. -The default mount point is -.Pa /pool -or -.Pa altroot/pool -if -.Ar altroot -is specified. -The mount point must be an absolute path, -.Sy legacy , -or -.Sy none . -For more information on dataset mount points, see -.Xr zfs 8 . -.It Fl n -Displays the configuration that would be used without actually creating the -pool. -The actual pool creation can still fail due to insufficient privileges or -device sharing. -.It Fl o Ar property Ns = Ns Ar value -Sets the given pool properties. -See the -.Sx Properties -section for a list of valid properties that can be set. -.It Fl o Ar feature@feature Ns = Ns Ar value -Sets the given pool feature. 
See the -.Xr zpool-features 5 -section for a list of valid features that can be set. -Value can be either disabled or enabled. -.It Fl O Ar file-system-property Ns = Ns Ar value -Sets the given file system properties in the root file system of the pool. -See the -.Sx Properties -section of -.Xr zfs 8 -for a list of valid properties that can be set. -.It Fl R Ar root -Equivalent to -.Fl o Sy cachefile Ns = Ns Sy none Fl o Sy altroot Ns = Ns Ar root -.It Fl t Ar tname -Sets the in-core pool name to -.Sy tname -while the on-disk name will be the name specified as the pool name -.Sy pool . -This will set the default cachefile property to none. This is intended -to handle name space collisions when creating pools for other systems, -such as virtual machines or physical machines whose pools live on network -block devices. +.It Xr zpool-initialize 8 +Begins initializing by writing to all unallocated regions on the specified +devices, or all eligible devices in the pool if no individual devices are +specified. .El -.It Xo -.Nm -.Cm destroy -.Op Fl f -.Ar pool -.Xc +. +.Ss Destruction +.Bl -tag -width Ds +.It Xr zpool-destroy 8 Destroys the given pool, freeing up any devices for other use. -This command tries to unmount any active datasets before destroying the pool. -.Bl -tag -width Ds -.It Fl f -Forces any active datasets contained within the pool to be unmounted. +.It Xr zpool-labelclear 8 +Removes ZFS label information from the specified +.Ar device . .El -.It Xo -.Nm -.Cm detach -.Ar pool device -.Xc -Detaches -.Ar device -from a mirror. -The operation is refused if there are no other valid replicas of the data. -If device may be re-added to the pool later on then consider the -.Sy zpool offline -command instead. -.It Xo -.Nm -.Cm events -.Op Fl vHf Oo Ar pool Oc | Fl c -.Xc -Lists all recent events generated by the ZFS kernel modules. 
These events -are consumed by the -.Xr zed 8 -and used to automate administrative tasks such as replacing a failed device -with a hot spare. For more information about the subclasses and event payloads -that can be generated see the -.Xr zfs-events 5 -man page. +. +.Ss Virtual Devices .Bl -tag -width Ds -.It Fl c -Clear all previous events. -.It Fl f -Follow mode. -.It Fl H -Scripted mode. Do not display headers, and separate fields by a -single tab instead of arbitrary space. -.It Fl v -Print the entire payload for each event. -.El .It Xo -.Nm -.Cm export -.Op Fl a -.Op Fl f -.Ar pool Ns ... +.Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8 .Xc -Exports the given pools from the system. -All devices are marked as exported, but are still considered in use by other -subsystems. -The devices can be moved between systems -.Pq even those of different endianness -and imported as long as a sufficient number of devices are present. -.Pp -Before exporting the pool, all datasets within the pool are unmounted. -A pool can not be exported if it has a shared spare that is currently being -used. -.Pp -For pools to be portable, you must give the -.Nm -command whole disks, not just partitions, so that ZFS can label the disks with -portable EFI labels. -Otherwise, disk drivers on platforms of different endianness will not recognize -the disks. -.Bl -tag -width Ds -.It Fl a -Exports all pools imported on the system. -.It Fl f -Forcefully unmount all datasets, using the -.Nm unmount Fl f -command. -.Pp -This command will forcefully export the pool even if it has a shared spare that -is currently being used. -This may lead to potential data corruption. -.El +Increases or decreases redundancy by +.Cm attach Ns ing or +.Cm detach Ns ing a device on an existing vdev (virtual device). .It Xo -.Nm -.Cm get -.Op Fl Hp -.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... -.Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ... -.Oo Ar pool Oc Ns ... 
+.Xr zpool-add 8 Ns / Ns Xr zpool-remove 8 +.Xc +Adds the specified virtual devices to the given pool, +or removes the specified device from the pool. +.It Xr zpool-replace 8 +Replaces an existing device (which may be faulted) with a new one. +.It Xr zpool-split 8 +Creates a new pool by splitting all mirrors in an existing pool (which decreases its redundancy). +.El +. +.Ss Properties +Available pool properties are listed in the +.Xr zpoolprops 7 +manual page. +.Bl -tag -width Ds +.It Xr zpool-list 8 +Lists the given pools along with a health status and space usage. +.It Xo +.Xr zpool-get 8 Ns / Ns Xr zpool-set 8 .Xc Retrieves the given list of properties .Po @@ -1336,1164 +141,86 @@ or all properties if is used .Pc for the specified storage pool(s). -These properties are displayed with the following fields: -.Bd -literal - name Name of storage pool - property Property name - value Property value - source Property source, either 'default' or 'local'. -.Ed -.Pp -See the -.Sx Properties -section for more information on the available pool properties. -.Bl -tag -width Ds -.It Fl H -Scripted mode. -Do not display headers, and separate fields by a single tab instead of arbitrary -space. -.It Fl o Ar field -A comma-separated list of columns to display. -.Sy name Ns \&, Ns Sy property Ns \&, Ns Sy value Ns \&, Ns Sy source -is the default value. -.It Fl p -Display numbers in parsable (exact) values. .El -.It Xo -.Nm -.Cm history -.Op Fl il -.Oo Ar pool Oc Ns ... -.Xc -Displays the command history of the specified pool(s) or all pools if no pool is -specified. +. +.Ss Monitoring .Bl -tag -width Ds -.It Fl i -Displays internally logged ZFS events in addition to user initiated events. -.It Fl l -Displays log records in long format, which in addition to standard format -includes, the user name, the hostname, and the zone in which the operation was -performed. -.El -.It Xo -.Nm -.Cm import -.Op Fl D -.Op Fl d Ar dir Ns | Ns device -.Xc -Lists pools available to import.
-If the -.Fl d -option is not specified, this command searches for devices in -.Pa /dev . -The -.Fl d -option can be specified multiple times, and all directories are searched. -If the device appears to be part of an exported pool, this command displays a -summary of the pool with the name of the pool, a numeric identifier, as well as -the vdev layout and current health of the device for each device or file. -Destroyed pools, pools that were previously destroyed with the -.Nm zpool Cm destroy -command, are not listed unless the -.Fl D -option is specified. -.Pp -The numeric identifier is unique, and can be used instead of the pool name when -multiple exported pools of the same name are available. -.Bl -tag -width Ds -.It Fl c Ar cachefile -Reads configuration from the given -.Ar cachefile -that was created with the -.Sy cachefile -pool property. -This -.Ar cachefile -is used instead of searching for devices. -.It Fl d Ar dir Ns | Ns Ar device -Uses -.Ar device -or searches for devices or files in -.Ar dir . -The -.Fl d -option can be specified multiple times. -.It Fl D -Lists destroyed pools only. -.El -.It Xo -.Nm -.Cm import -.Fl a -.Op Fl DflmN -.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc -.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device -.Op Fl o Ar mntopts -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Op Fl s -.Xc -Imports all pools found in the search directories. -Identical to the previous command, except that all pools with a sufficient -number of devices available are imported. -Destroyed pools, pools that were previously destroyed with the -.Nm zpool Cm destroy -command, will not be imported unless the -.Fl D -option is specified. -.Bl -tag -width Ds -.It Fl a -Searches for and imports all pools found. -.It Fl c Ar cachefile -Reads configuration from the given -.Ar cachefile -that was created with the -.Sy cachefile -pool property. -This -.Ar cachefile -is used instead of searching for devices. 
-.It Fl d Ar dir Ns | Ns Ar device -Uses -.Ar device -or searches for devices or files in -.Ar dir . -The -.Fl d -option can be specified multiple times. -This option is incompatible with the -.Fl c -option. -.It Fl D -Imports destroyed pools only. -The -.Fl f -option is also required. -.It Fl f -Forces import, even if the pool appears to be potentially active. -.It Fl F -Recovery mode for a non-importable pool. -Attempt to return the pool to an importable state by discarding the last few -transactions. -Not all damaged pools can be recovered by using this option. -If successful, the data from the discarded transactions is irretrievably lost. -This option is ignored if the pool is importable or already imported. -.It Fl l -Indicates that this command will request encryption keys for all encrypted -datasets it attempts to mount as it is bringing the pool online. Note that if -any datasets have a -.Sy keylocation -of -.Sy prompt -this command will block waiting for the keys to be entered. Without this flag -encrypted datasets will be left unavailable until the keys are loaded. -.It Fl m -Allows a pool to import when there is a missing log device. -Recent transactions can be lost because the log device will be discarded. -.It Fl n -Used with the -.Fl F -recovery option. -Determines whether a non-importable pool can be made importable again, but does -not actually perform the pool recovery. -For more details about pool recovery mode, see the -.Fl F -option, above. -.It Fl N -Import the pool without mounting any file systems. -.It Fl o Ar mntopts -Comma-separated list of mount options to use when mounting datasets within the -pool. -See -.Xr zfs 8 -for a description of dataset properties and mount options. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property on the imported pool. -See the -.Sx Properties -section for more information on the available pool properties. 
-.It Fl R Ar root -Sets the -.Sy cachefile -property to -.Sy none -and the -.Sy altroot -property to -.Ar root . -.It Fl -rewind-to-checkpoint -Rewinds pool to the checkpointed state. -Once the pool is imported with this flag there is no way to undo the rewind. -All changes and data that were written after the checkpoint are lost! -The only exception is when the -.Sy readonly -mounting option is enabled. -In this case, the checkpointed state of the pool is opened and an -administrator can see how the pool would look like if they were -to fully rewind. -.It Fl s -Scan using the default search path, the libblkid cache will not be -consulted. A custom search path may be specified by setting the -ZPOOL_IMPORT_PATH environment variable. -.It Fl X -Used with the -.Fl F -recovery option. Determines whether extreme -measures to find a valid txg should take place. This allows the pool to -be rolled back to a txg which is no longer guaranteed to be consistent. -Pools imported at an inconsistent txg may contain uncorrectable -checksum errors. For more details about pool recovery mode, see the -.Fl F -option, above. WARNING: This option can be extremely hazardous to the -health of your pool and should only be used as a last resort. -.It Fl T -Specify the txg to use for rollback. Implies -.Fl FX . -For more details -about pool recovery mode, see the -.Fl X -option, above. WARNING: This option can be extremely hazardous to the -health of your pool and should only be used as a last resort. -.El -.It Xo -.Nm -.Cm import -.Op Fl Dflm -.Op Fl F Oo Fl n Oc Oo Fl t Oc Oo Fl T Oc Oo Fl X Oc -.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device -.Op Fl o Ar mntopts -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Op Fl s -.Ar pool Ns | Ns Ar id -.Op Ar newpool -.Xc -Imports a specific pool. -A pool can be identified by its name or the numeric identifier. -If -.Ar newpool -is specified, the pool is imported using the name -.Ar newpool . 
-Otherwise, it is imported with the same name as its exported name. -.Pp -If a device is removed from a system without running -.Nm zpool Cm export -first, the device appears as potentially active. -It cannot be determined if this was a failed export, or whether the device is -really in use from another host. -To import a pool in this state, the -.Fl f -option is required. -.Bl -tag -width Ds -.It Fl c Ar cachefile -Reads configuration from the given -.Ar cachefile -that was created with the -.Sy cachefile -pool property. -This -.Ar cachefile -is used instead of searching for devices. -.It Fl d Ar dir Ns | Ns Ar device -Uses -.Ar device -or searches for devices or files in -.Ar dir . -The -.Fl d -option can be specified multiple times. -This option is incompatible with the -.Fl c -option. -.It Fl D -Imports destroyed pool. -The -.Fl f -option is also required. -.It Fl f -Forces import, even if the pool appears to be potentially active. -.It Fl F -Recovery mode for a non-importable pool. -Attempt to return the pool to an importable state by discarding the last few -transactions. -Not all damaged pools can be recovered by using this option. -If successful, the data from the discarded transactions is irretrievably lost. -This option is ignored if the pool is importable or already imported. -.It Fl l -Indicates that this command will request encryption keys for all encrypted -datasets it attempts to mount as it is bringing the pool online. Note that if -any datasets have a -.Sy keylocation -of -.Sy prompt -this command will block waiting for the keys to be entered. Without this flag -encrypted datasets will be left unavailable until the keys are loaded. -.It Fl m -Allows a pool to import when there is a missing log device. -Recent transactions can be lost because the log device will be discarded. -.It Fl n -Used with the -.Fl F -recovery option. -Determines whether a non-importable pool can be made importable again, but does -not actually perform the pool recovery. 
-For more details about pool recovery mode, see the -.Fl F -option, above. -.It Fl o Ar mntopts -Comma-separated list of mount options to use when mounting datasets within the -pool. -See -.Xr zfs 8 -for a description of dataset properties and mount options. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property on the imported pool. -See the -.Sx Properties -section for more information on the available pool properties. -.It Fl R Ar root -Sets the -.Sy cachefile -property to -.Sy none -and the -.Sy altroot -property to -.Ar root . -.It Fl s -Scan using the default search path, the libblkid cache will not be -consulted. A custom search path may be specified by setting the -ZPOOL_IMPORT_PATH environment variable. -.It Fl X -Used with the -.Fl F -recovery option. Determines whether extreme -measures to find a valid txg should take place. This allows the pool to -be rolled back to a txg which is no longer guaranteed to be consistent. -Pools imported at an inconsistent txg may contain uncorrectable -checksum errors. For more details about pool recovery mode, see the -.Fl F -option, above. WARNING: This option can be extremely hazardous to the -health of your pool and should only be used as a last resort. -.It Fl T -Specify the txg to use for rollback. Implies -.Fl FX . -For more details -about pool recovery mode, see the -.Fl X -option, above. WARNING: This option can be extremely hazardous to the -health of your pool and should only be used as a last resort. -.It Fl t -Used with -.Sy newpool . -Specifies that -.Sy newpool -is temporary. Temporary pool names last until export. Ensures that -the original pool name will be used in all label updates and therefore -is retained upon export. -Will also set -o cachefile=none when not explicitly specified. -.El -.It Xo -.Nm -.Cm initialize -.Op Fl c | Fl s -.Ar pool -.Op Ar device Ns ... 
-.Xc -Begins initializing by writing to all unallocated regions on the specified -devices, or all eligible devices in the pool if no individual devices are -specified. -Only leaf data or log devices may be initialized. -.Bl -tag -width Ds -.It Fl c, -cancel -Cancel initializing on the specified devices, or all eligible devices if none -are specified. -If one or more target devices are invalid or are not currently being -initialized, the command will fail and no cancellation will occur on any device. -.It Fl s -suspend -Suspend initializing on the specified devices, or all eligible devices if none -are specified. -If one or more target devices are invalid or are not currently being -initialized, the command will fail and no suspension will occur on any device. -Initializing can then be resumed by running -.Nm zpool Cm initialize -with no flags on the relevant target devices. -.El -.It Xo -.Nm -.Cm iostat -.Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw -.Op Fl T Sy u Ns | Ns Sy d -.Op Fl ghHLnpPvy -.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc -.Op Ar interval Op Ar count -.Xc +.It Xr zpool-status 8 +Displays the detailed health status for the given pools. +.It Xr zpool-iostat 8 Displays logical I/O statistics for the given pools/vdevs. Physical I/Os may be observed via .Xr iostat 1 . -If writes are located nearby, they may be merged into a single -larger operation. Additional I/O may be generated depending on the level of -vdev redundancy. -To filter output, you may pass in a list of pools, a pool and list of vdevs -in that pool, or a list of any vdevs from any pool. If no items are specified, -statistics for every pool in the system are shown. -When given an -.Ar interval , -the statistics are printed every -.Ar interval -seconds until ^C is pressed. If -.Fl n -flag is specified the headers are displayed only once, otherwise they are -displayed periodically. 
If count is specified, the command exits -after count reports are printed. The first report printed is always -the statistics since boot regardless of whether -.Ar interval -and -.Ar count -are passed. However, this behavior can be suppressed with the -.Fl y -flag. Also note that the units of -.Sy K , -.Sy M , -.Sy G ... -that are printed in the report are in base 1024. To get the raw -values, use the -.Fl p -flag. +.It Xr zpool-events 8 +Lists all recent events generated by the ZFS kernel modules. +These events are consumed by the +.Xr zed 8 +and used to automate administrative tasks such as replacing a failed device +with a hot spare. +That manual page also describes the subclasses and event payloads +that can be generated. +.It Xr zpool-history 8 +Displays the command history of the specified pool(s) or all pools if no pool is +specified. +.El +. +.Ss Maintenance .Bl -tag -width Ds -.It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... -Run a script (or scripts) on each vdev and include the output as a new column -in the -.Nm zpool Cm iostat -output. Users can run any script found in their -.Pa ~/.zpool.d -directory or from the system -.Pa /etc/zfs/zpool.d -directory. Script names containing the slash (/) character are not allowed. -The default search path can be overridden by setting the -ZPOOL_SCRIPTS_PATH environment variable. A privileged user can run -.Fl c -if they have the ZPOOL_SCRIPTS_AS_ROOT -environment variable set. If a script requires the use of a privileged -command, like -.Xr smartctl 8 , -then it's recommended you allow the user access to it in -.Pa /etc/sudoers -or add the user to the -.Pa /etc/sudoers.d/zfs -file. -.Pp -If -.Fl c -is passed without a script name, it prints a list of all scripts. -.Fl c -also sets verbose mode -.No \&( Ns Fl v Ns No \&). -.Pp -Script output should be in the form of "name=value". The column name is -set to "name" and the value is set to "value". Multiple lines can be -used to output multiple columns. 
The first line of output not in the -"name=value" format is displayed without a column title, and no more -output after that is displayed. This can be useful for printing error -messages. Blank or NULL values are printed as a '-' to make output -awk-able. -.Pp -The following environment variables are set before running each script: -.Bl -tag -width "VDEV_PATH" -.It Sy VDEV_PATH -Full path to the vdev -.El -.Bl -tag -width "VDEV_UPATH" -.It Sy VDEV_UPATH -Underlying path to the vdev (/dev/sd*). For use with device mapper, -multipath, or partitioned vdevs. -.El -.Bl -tag -width "VDEV_ENC_SYSFS_PATH" -.It Sy VDEV_ENC_SYSFS_PATH -The sysfs path to the enclosure for the vdev (if any). -.El -.It Fl T Sy u Ns | Ns Sy d -Display a time stamp. -Specify -.Sy u -for a printed representation of the internal representation of time. -See -.Xr time 2 . -Specify -.Sy d -for standard date format. -See -.Xr date 1 . -.It Fl g -Display vdev GUIDs instead of the normal device names. These GUIDs -can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl H -Scripted mode. Do not display headers, and separate fields by a -single tab instead of arbitrary space. -.It Fl L -Display real paths for vdevs resolving all symbolic links. This can -be used to look up the current block device name regardless of the -.Pa /dev/disk/ -path used to open it. -.It Fl n -Print headers only once when passed -.It Fl p -Display numbers in parsable (exact) values. Time values are in -nanoseconds. -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. This can be used in conjunction with the -.Fl L -flag. -.It Fl r -Print request size histograms for the leaf vdev's IO. This includes -histograms of individual IOs (ind) and aggregate IOs (agg). These stats -can be useful for observing how well IO aggregation is working. Note -that TRIM IOs may exceed 16M, but will be counted as 16M. 
-.It Fl v -Verbose statistics Reports usage statistics for individual vdevs within the -pool, in addition to the pool-wide statistics. -.It Fl y -Omit statistics since boot. -Normally the first line of output reports the statistics since boot. -This option suppresses that first line of output. -.Ar interval -.It Fl w -Display latency histograms: -.Pp -.Ar total_wait : -Total IO time (queuing + disk IO time). -.Ar disk_wait : -Disk IO time (time reading/writing the disk). -.Ar syncq_wait : -Amount of time IO spent in synchronous priority queues. Does not include -disk time. -.Ar asyncq_wait : -Amount of time IO spent in asynchronous priority queues. Does not include -disk time. -.Ar scrub : -Amount of time IO spent in scrub queue. Does not include disk time. -.It Fl l -Include average latency statistics: -.Pp -.Ar total_wait : -Average total IO time (queuing + disk IO time). -.Ar disk_wait : -Average disk IO time (time reading/writing the disk). -.Ar syncq_wait : -Average amount of time IO spent in synchronous priority queues. Does -not include disk time. -.Ar asyncq_wait : -Average amount of time IO spent in asynchronous priority queues. -Does not include disk time. -.Ar scrub : -Average queuing time in scrub queue. Does not include disk time. -.Ar trim : -Average queuing time in trim queue. Does not include disk time. -.It Fl q -Include active queue statistics. Each priority queue has both -pending ( -.Ar pend ) -and active ( -.Ar activ ) -IOs. Pending IOs are waiting to -be issued to the disk, and active IOs have been issued to disk and are -waiting for completion. These stats are broken out by priority queue: -.Pp -.Ar syncq_read/write : -Current number of entries in synchronous priority -queues. -.Ar asyncq_read/write : -Current number of entries in asynchronous priority queues. -.Ar scrubq_read : -Current number of entries in scrub queue. -.Ar trimq_write : -Current number of entries in trim queue. 
-.Pp -All queue statistics are instantaneous measurements of the number of -entries in the queues. If you specify an interval, the measurements -will be sampled from the end of the interval. -.El -.It Xo -.Nm -.Cm labelclear -.Op Fl f -.Ar device -.Xc -Removes ZFS label information from the specified -.Ar device . -The -.Ar device -must not be part of an active pool configuration. -.Bl -tag -width Ds -.It Fl f -Treat exported or foreign devices as inactive. -.El -.It Xo -.Nm -.Cm list -.Op Fl HgLpPv -.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... -.Op Fl T Sy u Ns | Ns Sy d -.Oo Ar pool Oc Ns ... -.Op Ar interval Op Ar count -.Xc -Lists the given pools along with a health status and space usage. -If no -.Ar pool Ns s -are specified, all pools in the system are listed. -When given an -.Ar interval , -the information is printed every -.Ar interval -seconds until ^C is pressed. -If -.Ar count -is specified, the command exits after -.Ar count -reports are printed. -.Bl -tag -width Ds -.It Fl g -Display vdev GUIDs instead of the normal device names. These GUIDs -can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl H -Scripted mode. -Do not display headers, and separate fields by a single tab instead of arbitrary -space. -.It Fl o Ar property -Comma-separated list of properties to display. -See the -.Sx Properties -section for a list of valid properties. -The default list is -.Cm name , size , allocated , free , checkpoint, expandsize , fragmentation , -.Cm capacity , dedupratio , health , altroot . -.It Fl L -Display real paths for vdevs resolving all symbolic links. This can -be used to look up the current block device name regardless of the -/dev/disk/ path used to open it. -.It Fl p -Display numbers in parsable -.Pq exact -values. -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. This can be used in conjunction with the -.Fl L -flag. 
-.It Fl T Sy u Ns | Ns Sy d -Display a time stamp. -Specify -.Sy u -for a printed representation of the internal representation of time. -See -.Xr time 2 . -Specify -.Sy d -for standard date format. -See -.Xr date 1 . -.It Fl v -Verbose statistics. -Reports usage statistics for individual vdevs within the pool, in addition to -the pool-wise statistics. -.El -.It Xo -.Nm -.Cm offline -.Op Fl f -.Op Fl t -.Ar pool Ar device Ns ... -.Xc -Takes the specified physical device offline. -While the -.Ar device -is offline, no attempt is made to read or write to the device. -This command is not applicable to spares. -.Bl -tag -width Ds -.It Fl f -Force fault. Instead of offlining the disk, put it into a faulted -state. The fault will persist across imports unless the -.Fl t -flag was specified. -.It Fl t -Temporary. -Upon reboot, the specified physical device reverts to its previous state. -.El -.It Xo -.Nm -.Cm online -.Op Fl e -.Ar pool Ar device Ns ... -.Xc -Brings the specified physical device online. -This command is not applicable to spares. -.Bl -tag -width Ds -.It Fl e -Expand the device to use all available space. -If the device is part of a mirror or raidz then all devices must be expanded -before the new space will become available to the pool. -.El -.It Xo -.Nm -.Cm reguid -.Ar pool -.Xc -Generates a new unique identifier for the pool. -You must ensure that all devices in this pool are online and healthy before -performing this action. -.It Xo -.Nm -.Cm reopen -.Op Fl n -.Ar pool -.Xc -Reopen all the vdevs associated with the pool. -.Bl -tag -width Ds -.It Fl n -Do not restart an in-progress scrub operation. This is not recommended and can -result in partially resilvered devices unless a second scrub is performed. -.El -.It Xo -.Nm -.Cm remove -.Op Fl np -.Ar pool Ar device Ns ... -.Xc -Removes the specified device from the pool. 
-This command supports removing hot spare, cache, log, and both mirrored and -non-redundant primary top-level vdevs, including dedup and special vdevs. -When the primary pool storage includes a top-level raidz vdev only hot spare, -cache, and log devices can be removed. -.sp -Removing a top-level vdev reduces the total amount of space in the storage pool. -The specified device will be evacuated by copying all allocated space from it to -the other devices in the pool. -In this case, the -.Nm zpool Cm remove -command initiates the removal and returns, while the evacuation continues in -the background. -The removal progress can be monitored with -.Nm zpool Cm status . -If an IO error is encountered during the removal process it will be -cancelled. The -.Sy device_removal -feature flag must be enabled to remove a top-level vdev, see -.Xr zpool-features 5 . -.Pp -A mirrored top-level device (log or data) can be removed by specifying the top-level mirror for the -same. -Non-log devices or data devices that are part of a mirrored configuration can be removed using -the -.Nm zpool Cm detach -command. -.Bl -tag -width Ds -.It Fl n -Do not actually perform the removal ("no-op"). -Instead, print the estimated amount of memory that will be used by the -mapping table after the removal completes. -This is nonzero only for top-level vdevs. -.El -.Bl -tag -width Ds -.It Fl p -Used in conjunction with the -.Fl n -flag, displays numbers as parsable (exact) values. -.El -.It Xo -.Nm -.Cm remove -.Fl s -.Ar pool -.Xc -Stops and cancels an in-progress removal of a top-level vdev. -.It Xo -.Nm -.Cm replace -.Op Fl f -.Op Fl o Ar property Ns = Ns Ar value -.Ar pool Ar device Op Ar new_device -.Xc -Replaces -.Ar old_device -with -.Ar new_device . -This is equivalent to attaching -.Ar new_device , -waiting for it to resilver, and then detaching -.Ar old_device . 
-.Pp -The size of -.Ar new_device -must be greater than or equal to the minimum size of all the devices in a mirror -or raidz configuration. -.Pp -.Ar new_device -is required if the pool is not redundant. -If -.Ar new_device -is not specified, it defaults to -.Ar old_device . -This form of replacement is useful after an existing disk has failed and has -been physically replaced. -In this case, the new disk may have the same -.Pa /dev -path as the old device, even though it is actually a different disk. -ZFS recognizes this. -.Bl -tag -width Ds -.It Fl f -Forces use of -.Ar new_device , -even if it appears to be in use. -Not all devices can be overridden in this manner. -.It Fl o Ar property Ns = Ns Ar value -Sets the given pool properties. See the -.Sx Properties -section for a list of valid properties that can be set. -The only property supported at the moment is -.Sy ashift . -.El -.It Xo -.Nm -.Cm scrub -.Op Fl s | Fl p -.Ar pool Ns ... -.Xc +.It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. -The scrub examines all data in the specified pools to verify that it checksums -correctly. -For replicated -.Pq mirror or raidz -devices, ZFS automatically repairs any damage discovered during the scrub. -The -.Nm zpool Cm status -command reports the progress of the scrub and summarizes the results of the -scrub upon completion. -.Pp -Scrubbing and resilvering are very similar operations. -The difference is that resilvering only examines data that ZFS knows to be out -of date -.Po -for example, when attaching a new device to a mirror or replacing an existing -device -.Pc , -whereas scrubbing examines all data to discover silent errors due to hardware -faults or disk failure. -.Pp -Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows -one at a time. -If a scrub is paused, the -.Nm zpool Cm scrub -resumes it. -If a resilver is in progress, ZFS does not allow a scrub to be started until the -resilver completes. 
-.Pp -Note that, due to changes in pool data on a live system, it is possible for -scrubs to progress slightly beyond 100% completion. During this period, no -completion time estimate will be provided. -.Bl -tag -width Ds -.It Fl s -Stop scrubbing. -.El -.Bl -tag -width Ds -.It Fl p -Pause scrubbing. -Scrub pause state and progress are periodically synced to disk. -If the system is restarted or pool is exported during a paused scrub, -even after import, scrub will remain paused until it is resumed. -Once resumed the scrub will pick up from the place where it was last -checkpointed to disk. -To resume a paused scrub issue -.Nm zpool Cm scrub -again. -.El -.It Xo -.Nm -.Cm resilver -.Ar pool Ns ... -.Xc -Starts a resilver. If an existing resilver is already running it will be -restarted from the beginning. Any drives that were scheduled for a deferred -resilver will be added to the new one. This requires the -.Sy resilver_defer -feature. -.It Xo -.Nm -.Cm trim -.Op Fl d -.Op Fl c | Fl s -.Ar pool -.Op Ar device Ns ... -.Xc -Initiates an immediate on-demand TRIM operation for all of the free space in -a pool. This operation informs the underlying storage devices of all blocks +.It Xr zpool-checkpoint 8 +Checkpoints the current state of +.Ar pool , +which can later be restored by +.Nm zpool Cm import Fl -rewind-to-checkpoint . +.It Xr zpool-trim 8 +Initiates an immediate on-demand TRIM operation for all of the free space in a pool. +This operation informs the underlying storage devices of all blocks in the pool which are no longer allocated and allows thinly provisioned devices to reclaim the space. -.Pp -A manual on-demand TRIM operation can be initiated irrespective of the -.Sy autotrim -pool property setting. See the documentation for the -.Sy autotrim -property above for the types of vdev devices which can be trimmed. -.Bl -tag -width Ds -.It Fl d -secure -Causes a secure TRIM to be initiated.
When performing a secure TRIM, the -device guarantees that data stored on the trimmed blocks has been erased. -This requires support from the device and is not supported by all SSDs. -.It Fl r -rate Ar rate -Controls the rate at which the TRIM operation progresses. Without this -option TRIM is executed as quickly as possible. The rate, expressed in bytes -per second, is applied on a per-vdev basis and may be set differently for -each leaf vdev. -.It Fl c, -cancel -Cancel trimming on the specified devices, or all eligible devices if none -are specified. -If one or more target devices are invalid or are not currently being -trimmed, the command will fail and no cancellation will occur on any device. -.It Fl s -suspend -Suspend trimming on the specified devices, or all eligible devices if none -are specified. -If one or more target devices are invalid or are not currently being -trimmed, the command will fail and no suspension will occur on any device. -Trimming can then be resumed by running -.Nm zpool Cm trim -with no flags on the relevant target devices. -.El -.It Xo -.Nm -.Cm set -.Ar property Ns = Ns Ar value -.Ar pool -.Xc -Sets the given property on the specified pool. -See the -.Sx Properties -section for more information on what properties can be set and acceptable -values. -.It Xo -.Nm -.Cm split -.Op Fl gLlnP -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Op Fl R Ar root -.Ar pool newpool -.Op Ar device ... -.Xc -Splits devices off -.Ar pool -creating -.Ar newpool . -All vdevs in -.Ar pool -must be mirrors and the pool must not be in the process of resilvering. -At the time of the split, -.Ar newpool -will be a replica of -.Ar pool . -By default, the -last device in each mirror is split from -.Ar pool -to create -.Ar newpool . -.Pp -The optional device specification causes the specified device(s) to be -included in the new -.Ar pool -and, should any devices remain unspecified, -the last device in each mirror is used as would be by default. 
-.Bl -tag -width Ds -.It Fl g -Display vdev GUIDs instead of the normal device names. These GUIDs -can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl L -Display real paths for vdevs resolving all symbolic links. This can -be used to look up the current block device name regardless of the -.Pa /dev/disk/ -path used to open it. -.It Fl l -Indicates that this command will request encryption keys for all encrypted -datasets it attempts to mount as it is bringing the new pool online. Note that -if any datasets have a -.Sy keylocation -of -.Sy prompt -this command will block waiting for the keys to be entered. Without this flag -encrypted datasets will be left unavailable until the keys are loaded. -.It Fl n -Do dry run, do not actually perform the split. -Print out the expected configuration of -.Ar newpool . -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. This can be used in conjunction with the -.Fl L -flag. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property for -.Ar newpool . -See the -.Sx Properties -section for more information on the available pool properties. -.It Fl R Ar root -Set -.Sy altroot -for -.Ar newpool -to -.Ar root -and automatically import it. -.El -.It Xo -.Nm -.Cm status -.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... -.Op Fl DigLpPstvx -.Op Fl T Sy u Ns | Ns Sy d -.Oo Ar pool Oc Ns ... -.Op Ar interval Op Ar count -.Xc -Displays the detailed health status for the given pools. -If no -.Ar pool -is specified, then the status of each pool in the system is displayed. -For more information on pool and device health, see the -.Sx Device Failure and Recovery -section. -.Pp -If a scrub or resilver is in progress, this command reports the percentage done -and the estimated time to completion. -Both of these are only approximate, because the amount of data in the pool and -the other workloads on the system can change. 
-.Bl -tag -width Ds -.It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... -Run a script (or scripts) on each vdev and include the output as a new column -in the -.Nm zpool Cm status -output. See the -.Fl c -option of -.Nm zpool Cm iostat -for complete details. -.It Fl i -Display vdev initialization status. -.It Fl g -Display vdev GUIDs instead of the normal device names. These GUIDs -can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl L -Display real paths for vdevs resolving all symbolic links. This can -be used to look up the current block device name regardless of the -.Pa /dev/disk/ -path used to open it. -.It Fl p -Display numbers in parsable (exact) values. -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. This can be used in conjunction with the -.Fl L -flag. -.It Fl D -Display a histogram of deduplication statistics, showing the allocated -.Pq physically present on disk -and referenced -.Pq logically referenced in the pool -block counts and sizes by reference count. -.It Fl s -Display the number of leaf VDEV slow IOs. This is the number of IOs that -didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds). -This does not necessarily mean the IOs failed to complete, just took an -unreasonably long amount of time. This may indicate a problem with the -underlying storage. -.It Fl t -Display vdev TRIM status. -.It Fl T Sy u Ns | Ns Sy d -Display a time stamp. -Specify -.Sy u -for a printed representation of the internal representation of time. -See -.Xr time 2 . -Specify -.Sy d -for standard date format. -See -.Xr date 1 . -.It Fl v -Displays verbose data error information, printing out a complete list of all -data errors since the last complete pool scrub. -.It Fl x -Only display status for pools that are exhibiting errors or are otherwise -unavailable. -Warnings about pools not using the latest on-disk format will not be included. 
-.El -.It Xo -.Nm -.Cm sync -.Op Ar pool ... -.Xc +.It Xr zpool-sync 8 This command forces all in-core dirty data to be written to the primary -pool storage and not the ZIL. It will also update administrative -information including quota reporting. Without arguments, -.Sy zpool sync -will sync all pools on the system. Otherwise, it will sync only the -specified pool(s). -.It Xo -.Nm -.Cm upgrade -.Xc -Displays pools which do not have all supported features enabled and pools -formatted using a legacy ZFS version number. -These pools can continue to be used, but some features may not be available. -Use -.Nm zpool Cm upgrade Fl a -to enable all features on all pools. -.It Xo -.Nm -.Cm upgrade -.Fl v -.Xc -Displays legacy ZFS versions supported by the current software. -See -.Xr zpool-features 5 -for a description of feature flags features supported by the current software. -.It Xo -.Nm -.Cm upgrade -.Op Fl V Ar version -.Fl a Ns | Ns Ar pool Ns ... -.Xc -Enables all supported features on the given pool. -Once this is done, the pool will no longer be accessible on systems that do not -support feature flags. -See -.Xr zpool-features 5 -for details on compatibility with systems that support feature flags, but do not -support all features enabled on the pool. +pool storage and not the ZIL. +It will also update administrative information including quota reporting. +Without arguments, +.Nm zpool Cm sync +will sync all pools on the system. +Otherwise, it will sync only the specified pool(s). +.It Xr zpool-upgrade 8 +Manage the on-disk format version of storage pools. +.It Xr zpool-wait 8 +Waits until all background activity of the given types has ceased in the given +pool. +.El +. +.Ss Fault Resolution .Bl -tag -width Ds -.It Fl a -Enables all supported features on all pools. -.It Fl V Ar version -Upgrade to the specified legacy version. -If the -.Fl V -flag is specified, no features will be enabled on the pool. 
-This option can only be used to increase the version number up to the last -supported legacy version number. -.El .It Xo -.Nm -.Cm version +.Xr zpool-offline 8 Ns / Ns Xr zpool-online 8 .Xc -Displays the software version of the -.Nm -userland utility and the zfs kernel module. +Takes the specified physical device offline or brings it online. +.It Xr zpool-resilver 8 +Starts a resilver. +If an existing resilver is already running it will be restarted from the beginning. +.It Xr zpool-reopen 8 +Reopen all the vdevs associated with the pool. +.It Xr zpool-clear 8 +Clears device errors in a pool. .El +. +.Ss Import & Export +.Bl -tag -width Ds +.It Xr zpool-import 8 +Make disks containing ZFS storage pools available for use on the system. +.It Xr zpool-export 8 +Exports the given pools from the system. +.It Xr zpool-reguid 8 +Generates a new unique identifier for the pool. +.El +. .Sh EXIT STATUS The following exit values are returned: -.Bl -tag -width Ds +.Bl -tag -compact -offset 4n -width "a" .It Sy 0 Successful completion. .It Sy 1 @@ -2501,74 +228,69 @@ An error occurred. .It Sy 2 Invalid command line options were specified. .El +. .Sh EXAMPLES -.Bl -tag -width Ds -.It Sy Example 1 No Creating a RAID-Z Storage Pool +.Bl -tag -width "Exam" +.It Sy Example 1 : No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that -consists of six disks. -.Bd -literal -# zpool create tank raidz sda sdb sdc sdd sde sdf -.Ed -.It Sy Example 2 No Creating a Mirrored Storage Pool +consists of six disks: +.Dl # Nm zpool Cm create Ar tank Sy raidz Ar sda sdb sdc sdd sde sdf +. +.It Sy Example 2 : No Creating a Mirrored Storage Pool The following command creates a pool with two mirrors, where each mirror -contains two disks. -.Bd -literal -# zpool create tank mirror sda sdb mirror sdc sdd -.Ed -.It Sy Example 3 No Creating a ZFS Storage Pool by Using Partitions -The following command creates an unmirrored pool using two disk partitions. 
-.Bd -literal -# zpool create tank sda1 sdb2 -.Ed -.It Sy Example 4 No Creating a ZFS Storage Pool by Using Files +contains two disks: +.Dl # Nm zpool Cm create Ar tank Sy mirror Ar sda sdb Sy mirror Ar sdc sdd +. +.It Sy Example 3 : No Creating a ZFS Storage Pool by Using Partitions +The following command creates an unmirrored pool using two disk partitions: +.Dl # Nm zpool Cm create Ar tank sda1 sdb2 +. +.It Sy Example 4 : No Creating a ZFS Storage Pool by Using Files The following command creates an unmirrored pool using files. While not recommended, a pool based on files can be useful for experimental purposes. -.Bd -literal -# zpool create tank /path/to/file/a /path/to/file/b -.Ed -.It Sy Example 5 No Adding a Mirror to a ZFS Storage Pool +.Dl # Nm zpool Cm create Ar tank /path/to/file/a /path/to/file/b +. +.It Sy Example 5 : No Adding a Mirror to a ZFS Storage Pool The following command adds two mirrored disks to the pool -.Em tank , +.Ar tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. -.Bd -literal -# zpool add tank mirror sda sdb -.Ed -.It Sy Example 6 No Listing Available ZFS Storage Pools +.Dl # Nm zpool Cm add Ar tank Sy mirror Ar sda sdb +. +.It Sy Example 6 : No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool -.Em zion +.Ar zion is faulted due to a missing device. The results from this command are similar to the following: -.Bd -literal -# zpool list +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm list NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT rpool 19.9G 8.43G 11.4G - 33% 42% 1.00x ONLINE - tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed -.It Sy Example 7 No Destroying a ZFS Storage Pool +. +.It Sy Example 7 : No Destroying a ZFS Storage Pool The following command destroys the pool -.Em tank -and any datasets contained within. 
-.Bd -literal -# zpool destroy -f tank -.Ed -.It Sy Example 8 No Exporting a ZFS Storage Pool +.Ar tank +and any datasets contained within: +.Dl # Nm zpool Cm destroy Fl f Ar tank +. +.It Sy Example 8 : No Exporting a ZFS Storage Pool The following command exports the devices in pool -.Em tank -so that they can be relocated or later imported. -.Bd -literal -# zpool export tank -.Ed -.It Sy Example 9 No Importing a ZFS Storage Pool +.Ar tank +so that they can be relocated or later imported: +.Dl # Nm zpool Cm export Ar tank +. +.It Sy Example 9 : No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool -.Em tank +.Ar tank for use on the system. The results from this command are similar to the following: -.Bd -literal -# zpool import +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm import pool: tank id: 15451357997522795478 state: ONLINE @@ -2580,66 +302,58 @@ config: sda ONLINE sdb ONLINE -# zpool import tank +.No # Nm zpool Cm import Ar tank .Ed -.It Sy Example 10 No Upgrading All ZFS Storage Pools to the Current Version +. +.It Sy Example 10 : No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of -the software. -.Bd -literal -# zpool upgrade -a +the software: +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm upgrade Fl a This system is currently running ZFS version 2. .Ed -.It Sy Example 11 No Managing Hot Spares +. +.It Sy Example 11 : No Managing Hot Spares The following command creates a new pool with an available hot spare: -.Bd -literal -# zpool create tank mirror sda sdb spare sdc -.Ed +.Dl # Nm zpool Cm create Ar tank Sy mirror Ar sda sdb Sy spare Ar sdc .Pp If one of the disks were to fail, the pool would be reduced to the degraded state. 
The failed device can be replaced using the following command: -.Bd -literal -# zpool replace tank sda sdd -.Ed +.Dl # Nm zpool Cm replace Ar tank sda sdd .Pp Once the data has been resilvered, the spare is automatically removed and is made available for use should another device fail. The hot spare can be permanently removed from the pool using the following command: -.Bd -literal -# zpool remove tank sdc -.Ed -.It Sy Example 12 No Creating a ZFS Pool with Mirrored Separate Intent Logs +.Dl # Nm zpool Cm remove Ar tank sdc +. +.It Sy Example 12 : No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: -.Bd -literal -# zpool create pool mirror sda sdb mirror sdc sdd log mirror \\ - sde sdf -.Ed -.It Sy Example 13 No Adding Cache Devices to a ZFS Pool +.Dl # Nm zpool Cm create Ar pool Sy mirror Ar sda sdb Sy mirror Ar sdc sdd Sy log mirror Ar sde sdf +. +.It Sy Example 13 : No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: -.Bd -literal -# zpool add pool cache sdc sdd -.Ed +.Dl # Nm zpool Cm add Ar pool Sy cache Ar sdc sdd .Pp Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. Capacity and reads can be monitored using the .Cm iostat -option as follows: -.Bd -literal -# zpool iostat -v pool 5 -.Ed -.It Sy Example 14 No Removing a Mirrored top-level (Log or Data) Device +subcommand as follows: +.Dl # Nm zpool Cm iostat Fl v Ar pool 5 +. +.It Sy Example 14 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device .Sy mirror-1 . 
.Pp Given this configuration: -.Bd -literal +.Bd -literal -compact -offset Ds pool: tank state: ONLINE scrub: none requested @@ -2660,27 +374,22 @@ config: .Ed .Pp The command to remove the mirrored log -.Sy mirror-2 -is: -.Bd -literal -# zpool remove tank mirror-2 -.Ed +.Ar mirror-2 No is: +.Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp The command to remove the mirrored data -.Sy mirror-1 -is: -.Bd -literal -# zpool remove tank mirror-1 -.Ed -.It Sy Example 15 No Displaying expanded space on a device +.Ar mirror-1 No is: +.Dl # Nm zpool Cm remove Ar tank mirror-1 +. +.It Sy Example 15 : No Displaying expanded space on a device The following command displays the detailed information for the pool -.Em data . +.Ar data . This pool is comprised of a single raidz vdev where one of its devices increased its capacity by 10GB. In this example, the pool will not be able to utilize this extra capacity until all the devices under the raidz vdev have been expanded. -.Bd -literal -# zpool list -v data +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm list Fl v Ar data NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - raidz1 23.9G 14.6G 9.30G - 48% @@ -2688,16 +397,12 @@ data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - sdb - - - 10G - sdc - - - - - .Ed -.It Sy Example 16 No Adding output columns +. +.It Sy Example 16 : No Adding output columns Additional columns can be added to the -.Nm zpool Cm status -and -.Nm zpool Cm iostat -output with -.Fl c -option. -.Bd -literal -# zpool status -c vendor,model,size +.Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c . +.Bd -literal -compact -offset Ds +.No # Nm zpool Cm status Fl c Ar vendor , Ns Ar model , Ns Ar size NAME STATE READ WRITE CKSUM vendor model size tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 @@ -2708,123 +413,148 @@ option. 
U13 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U14 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T -# zpool iostat -vc slaves - capacity operations bandwidth - pool alloc free read write read write slaves - ---------- ----- ----- ----- ----- ----- ----- --------- - tank 20.4G 7.23T 26 152 20.7M 21.6M - mirror 20.4G 7.23T 26 152 20.7M 21.6M - U1 - - 0 31 1.46K 20.6M sdb sdff - U10 - - 0 1 3.77K 13.3K sdas sdgw - U11 - - 0 1 288K 13.3K sdat sdgx - U12 - - 0 1 78.4K 13.3K sdau sdgy - U13 - - 0 1 128K 13.3K sdav sdgz - U14 - - 0 1 63.2K 13.3K sdfk sdg +.No # Nm zpool Cm iostat Fl vc Ar size + capacity operations bandwidth +pool alloc free read write read write size +---------- ----- ----- ----- ----- ----- ----- ---- +rpool 14.6G 54.9G 4 55 250K 2.69M + sda1 14.6G 54.9G 4 55 250K 2.69M 70G +---------- ----- ----- ----- ----- ----- ----- ---- .Ed .El +. .Sh ENVIRONMENT VARIABLES -.Bl -tag -width "ZFS_ABORT" -.It Ev ZFS_ABORT +.Bl -tag -compact -width "ZPOOL_IMPORT_UDEV_TIMEOUT_MS" +.It Sy ZFS_ABORT Cause -.Nm zpool +.Nm to dump core on exit for the purposes of running .Sy ::findleaks . -.El -.Bl -tag -width "ZPOOL_IMPORT_PATH" -.It Ev ZPOOL_IMPORT_PATH -The search path for devices or files to use with the pool. This is a colon-separated list of directories in which -.Nm zpool +.It Sy ZFS_COLOR +Use ANSI color in +.Nm zpool status +output. +.It Sy ZPOOL_IMPORT_PATH +The search path for devices or files to use with the pool. +This is a colon-separated list of directories in which +.Nm looks for device nodes and files. Similar to the .Fl d option in .Nm zpool import . -.El -.Bl -tag -width "ZPOOL_VDEV_NAME_GUID" -.It Ev ZPOOL_VDEV_NAME_GUID +.It Sy ZPOOL_IMPORT_UDEV_TIMEOUT_MS +The maximum time in milliseconds that +.Nm zpool import +will wait for an expected device to be available. +.It Sy ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE +If set, suppress warning about non-native vdev ashift in +.Nm zpool status . +The value is not used, only the presence or absence of the variable matters. 
+.It Sy ZPOOL_VDEV_NAME_GUID Cause -.Nm zpool -subcommands to output vdev guids by default. This behavior is identical to the -.Nm zpool status -g +.Nm +subcommands to output vdev guids by default. +This behavior is identical to the +.Nm zpool Cm status Fl g command line option. -.El -.Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS" -.It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS +.It Sy ZPOOL_VDEV_NAME_FOLLOW_LINKS Cause -.Nm zpool -subcommands to follow links for vdev names by default. This behavior is identical to the -.Nm zpool status -L +.Nm +subcommands to follow links for vdev names by default. +This behavior is identical to the +.Nm zpool Cm status Fl L command line option. -.El -.Bl -tag -width "ZPOOL_VDEV_NAME_PATH" -.It Ev ZPOOL_VDEV_NAME_PATH +.It Sy ZPOOL_VDEV_NAME_PATH Cause -.Nm zpool -subcommands to output full vdev path names by default. This -behavior is identical to the -.Nm zpool status -p +.Nm +subcommands to output full vdev path names by default. +This behavior is identical to the +.Nm zpool Cm status Fl P command line option. -.El -.Bl -tag -width "ZFS_VDEV_DEVID_OPT_OUT" -.It Ev ZFS_VDEV_DEVID_OPT_OUT -Older ZFS on Linux implementations had issues when attempting to display pool +.It Sy ZFS_VDEV_DEVID_OPT_OUT +Older OpenZFS implementations had issues when attempting to display pool config VDEV names if a .Sy devid NVP value is present in the pool's config. .Pp -For example, a pool that originated on illumos platform would have a devid +For example, a pool that originated on illumos platform would have a +.Sy devid value in the config and .Nm zpool status would fail when listing the config. -This would also be true for future Linux based pools. +This would also be true for future Linux-based pools. .Pp A pool can be stripped of any .Sy devid values on import or prevented from adding them on -.Nm zpool create +.Nm zpool Cm create or -.Nm zpool add +.Nm zpool Cm add by setting .Sy ZFS_VDEV_DEVID_OPT_OUT . 
-.El -.Bl -tag -width "ZPOOL_SCRIPTS_AS_ROOT" -.It Ev ZPOOL_SCRIPTS_AS_ROOT -Allow a privileged user to run the -.Nm zpool status/iostat -with the -.Fl c -option. Normally, only unprivileged users are allowed to run +.Pp +.It Sy ZPOOL_SCRIPTS_AS_ROOT +Allow a privileged user to run +.Nm zpool Cm status Ns / Ns Cm iostat Fl c . +Normally, only unprivileged users are allowed to run .Fl c . -.El -.Bl -tag -width "ZPOOL_SCRIPTS_PATH" -.It Ev ZPOOL_SCRIPTS_PATH +.It Sy ZPOOL_SCRIPTS_PATH The search path for scripts when running -.Nm zpool status/iostat -with the -.Fl c -option. This is a colon-separated list of directories and overrides the default +.Nm zpool Cm status Ns / Ns Cm iostat Fl c . +This is a colon-separated list of directories and overrides the default .Pa ~/.zpool.d and .Pa /etc/zfs/zpool.d search paths. -.El -.Bl -tag -width "ZPOOL_SCRIPTS_ENABLED" -.It Ev ZPOOL_SCRIPTS_ENABLED +.It Sy ZPOOL_SCRIPTS_ENABLED Allow a user to run -.Nm zpool status/iostat -with the -.Fl c -option. If +.Nm zpool Cm status Ns / Ns Cm iostat Fl c . +If .Sy ZPOOL_SCRIPTS_ENABLED is not set, it is assumed that the user is allowed to run -.Nm zpool status/iostat -c . +.Nm zpool Cm status Ns / Ns Cm iostat Fl c . .El +. .Sh INTERFACE STABILITY .Sy Evolving +. 
.Sh SEE ALSO -.Xr zfs-events 5 , -.Xr zfs-module-parameters 5 , -.Xr zpool-features 5 , +.Xr zfs 4 , +.Xr zpool-features 7 , +.Xr zpoolconcepts 7 , +.Xr zpoolprops 7 , .Xr zed 8 , -.Xr zfs 8 +.Xr zfs 8 , +.Xr zpool-add 8 , +.Xr zpool-attach 8 , +.Xr zpool-checkpoint 8 , +.Xr zpool-clear 8 , +.Xr zpool-create 8 , +.Xr zpool-destroy 8 , +.Xr zpool-detach 8 , +.Xr zpool-events 8 , +.Xr zpool-export 8 , +.Xr zpool-get 8 , +.Xr zpool-history 8 , +.Xr zpool-import 8 , +.Xr zpool-initialize 8 , +.Xr zpool-iostat 8 , +.Xr zpool-labelclear 8 , +.Xr zpool-list 8 , +.Xr zpool-offline 8 , +.Xr zpool-online 8 , +.Xr zpool-reguid 8 , +.Xr zpool-remove 8 , +.Xr zpool-reopen 8 , +.Xr zpool-replace 8 , +.Xr zpool-resilver 8 , +.Xr zpool-scrub 8 , +.Xr zpool-set 8 , +.Xr zpool-split 8 , +.Xr zpool-status 8 , +.Xr zpool-sync 8 , +.Xr zpool-trim 8 , +.Xr zpool-upgrade 8 , +.Xr zpool-wait 8 diff --git a/man/man8/zpool_influxdb.8 b/man/man8/zpool_influxdb.8 new file mode 100644 index 0000000000..021fbdeaac --- /dev/null +++ b/man/man8/zpool_influxdb.8 @@ -0,0 +1,98 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at +.\" https://opensource.org/licenses/CDDL-1.0 +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright 2020 Richard Elling +.\" +.Dd May 26, 2021 +.Dt ZPOOL_INFLUXDB 8 +.Os +. 
+.Sh NAME +.Nm zpool_influxdb +.Nd collect ZFS pool statistics in InfluxDB line protocol format +.Sh SYNOPSIS +.Nm +.Op Fl e Ns | Ns Fl -execd +.Op Fl n Ns | Ns Fl -no-histogram +.Op Fl s Ns | Ns Fl -sum-histogram-buckets +.Op Fl t Ns | Ns Fl -tags Ar key Ns = Ns Ar value Ns Oo , Ns Ar key Ns = Ns Ar value Oc Ns … +.Op Ar pool +. +.Sh DESCRIPTION +.Nm +produces InfluxDB-line-protocol-compatible metrics from zpools. +Like the +.Nm zpool +command, +.Nm +reads the current pool status and statistics. +Unlike the +.Nm zpool +command which is intended for humans, +.Nm +formats the output in the InfluxDB line protocol. +The expected use is as a plugin to a +metrics collector or aggregator, such as Telegraf. +.Pp +By default, +.Nm +prints pool metrics and status in the InfluxDB line protocol format. +All pools are printed, similar to the +.Nm zpool Cm status +command. +Providing a pool name restricts the output to the named pool. +. +.Sh OPTIONS +.Bl -tag -width "-e, --execd" +.It Fl e , -execd +Run in daemon mode compatible with Telegraf's +.Nm execd +plugin. +In this mode, the pools are sampled every time a +newline appears on the standard input. +.It Fl n , -no-histogram +Do not print latency and I/O size histograms. +This can reduce the total +amount of data, but one should consider the value brought by the insights +that latency and I/O size distributions provide. +The resulting values +are suitable for graphing with Grafana's heatmap plugin. +.It Fl s , -sum-histogram-buckets +Accumulates bucket values. +By default, the values are not accumulated and the raw data appears as shown by +.Nm zpool Cm iostat . +This works well for Grafana's heatmap plugin. +Summing the buckets produces output similar to Prometheus histograms. +.It Fl t , Fl -tags Ar key Ns = Ns Ar value Ns Oo , Ns Ar key Ns = Ns Ar value Oc Ns … +Adds specified tags to the tag set. +No sanity checking is performed. 
+See the InfluxDB Line Protocol format documentation for details on escaping +special characters used in tags. +.It Fl h , -help +Print a usage summary. +.El +. +.Sh SEE ALSO +.Xr zpool-iostat 8 , +.Xr zpool-status 8 , +.Lk https://github.com/influxdata/influxdb "InfluxDB" , +.Lk https://github.com/influxdata/telegraf "Telegraf" , +.Lk https://grafana.com "Grafana" , +.Lk https://prometheus.io "Prometheus" diff --git a/man/man8/zstream.8 b/man/man8/zstream.8 new file mode 100644 index 0000000000..c0322ee3ac --- /dev/null +++ b/man/man8/zstream.8 @@ -0,0 +1,117 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2020 by Delphix. All rights reserved. +.\" +.Dd May 8, 2021 +.Dt ZSTREAM 8 +.Os +. +.Sh NAME +.Nm zstream +.Nd manipulate ZFS send streams +.Sh SYNOPSIS +.Nm +.Cm dump +.Op Fl Cvd +.Op Ar file +.Nm +.Cm redup +.Op Fl v +.Ar file +.Nm +.Cm token +.Ar resume_token +. +.Sh DESCRIPTION +The +.Sy zstream +utility manipulates ZFS send streams output by the +.Sy zfs send +command. +.Bl -tag -width "" +.It Xo +.Nm +.Cm dump +.Op Fl Cvd +.Op Ar file +.Xc +Print information about the specified send stream, including headers and +record counts. 
+The send stream may either be in the specified +.Ar file , +or provided on standard input. +.Bl -tag -width "-D" +.It Fl C +Suppress the validation of checksums. +.It Fl v +Verbose. +Print metadata for each record. +.It Fl d +Dump data contained in each record. +Implies verbose. +.El +.Pp +The +.Nm zstreamdump +alias is provided for compatibility and is equivalent to running +.Nm +.Cm dump . +.It Xo +.Nm +.Cm token +.Ar resume_token +.Xc +Dumps ZFS resume token information. +.It Xo +.Nm +.Cm redup +.Op Fl v +.Ar file +.Xc +Deduplicated send streams can be generated by using the +.Nm zfs Cm send Fl D +command. +The ability to send deduplicated send streams is deprecated. +In the future, the ability to receive a deduplicated send stream with +.Nm zfs Cm receive +will be removed. +However, deduplicated send streams can still be received by utilizing +.Nm zstream Cm redup . +.Pp +The +.Nm zstream Cm redup +command is provided a +.Ar file +containing a deduplicated send stream, and outputs an equivalent +non-deduplicated send stream on standard output. +Therefore, a deduplicated send stream can be received by running: +.Dl # Nm zstream Cm redup Pa DEDUP_STREAM_FILE | Nm zfs Cm receive No … +.Bl -tag -width "-D" +.It Fl v +Verbose. +Print summary of converted records. +.El +.El +. +.Sh SEE ALSO +.Xr zfs 8 , +.Xr zfs-receive 8 , +.Xr zfs-send 8 diff --git a/man/man8/zstreamdump.8 b/man/man8/zstreamdump.8 deleted file mode 100644 index 33cd047f5d..0000000000 --- a/man/man8/zstreamdump.8 +++ /dev/null @@ -1,58 +0,0 @@ -'\" te -.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved -.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. 
-.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with -.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH zstreamdump 8 "29 Aug 2012" "ZFS pool 28, filesystem 5" "System Administration Commands" -.SH NAME -zstreamdump \- filter data in zfs send stream -.SH SYNOPSIS -.LP -.nf -\fBzstreamdump\fR [\fB-C\fR] [\fB-v\fR] [\fB-d\fR] -.fi - -.SH DESCRIPTION -.sp -.LP -The \fBzstreamdump\fR utility reads from the output of the \fBzfs send\fR -command, then displays headers and some statistics from that output. See -\fBzfs\fR(8). -.SH OPTIONS -.sp -.LP -The following options are supported: -.sp -.ne 2 -.na -\fB-C\fR -.ad -.sp .6 -.RS 4n -Suppress the validation of checksums. -.RE - -.sp -.ne 2 -.na -\fB-v\fR -.ad -.sp .6 -.RS 4n -Verbose. Dump all headers, not only begin and end headers. -.RE - -.sp -.ne 2 -.na -\fB-d\fR -.ad -.sp .6 -.RS 4n -Dump contents of blocks modified. Implies verbose. 
-.RE - -.SH SEE ALSO -.sp -.LP -\fBzfs\fR(8) diff --git a/man/man8/zstreamdump.8 b/man/man8/zstreamdump.8 new file mode 120000 index 0000000000..c6721daf11 --- /dev/null +++ b/man/man8/zstreamdump.8 @@ -0,0 +1 @@ +zstream.8 \ No newline at end of file diff --git a/module/.gitignore b/module/.gitignore index 1ea8ef0bb8..7a4bd3673e 100644 --- a/module/.gitignore +++ b/module/.gitignore @@ -2,13 +2,25 @@ *.ko.unsigned *.ko.out *.ko.out.sig +*.ko.debug +*.ko.full *.dwo .*.cmd .*.d +*.mod +/Kbuild /.cache.mk /.tmp_versions /Module.markers /Module.symvers +/vnode_if* +/bus_if.h +/device_if.h +/opt_global.h + +/export_syms +/machine +/x86 !Makefile.in diff --git a/module/Kbuild.in b/module/Kbuild.in new file mode 100644 index 0000000000..1507965c57 --- /dev/null +++ b/module/Kbuild.in @@ -0,0 +1,47 @@ +# When integrated into a monolithic kernel the spl module must appear +# first. This ensures its module initialization function is run before +# any of the other module initialization functions which depend on it. 
+ZFS_MODULES += spl/ +ZFS_MODULES += avl/ +ZFS_MODULES += icp/ +ZFS_MODULES += lua/ +ZFS_MODULES += nvpair/ +ZFS_MODULES += unicode/ +ZFS_MODULES += zcommon/ +ZFS_MODULES += zfs/ +ZFS_MODULES += zstd/ + +# The rest is only relevant when run by kbuild +ifneq ($(KERNELRELEASE),) + +obj-$(CONFIG_ZFS) := $(ZFS_MODULES) + +ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement +ZFS_MODULE_CFLAGS += -Wmissing-prototypes +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ + +ifneq ($(KBUILD_EXTMOD),) +zfs_include = @abs_top_srcdir@/include +ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h +ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include +else +zfs_include = $(srctree)/include/zfs +ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h +endif + +ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel +ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl +ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs +ZFS_MODULE_CFLAGS += -I$(zfs_include) +ZFS_MODULE_CPPFLAGS += -D_KERNEL +ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ + +ifneq ($(KBUILD_EXTMOD),) +@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include +@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ +endif + +subdir-asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) +subdir-ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) + +endif diff --git a/module/Makefile.bsd b/module/Makefile.bsd new file mode 100644 index 0000000000..8aa4ed2227 --- /dev/null +++ b/module/Makefile.bsd @@ -0,0 +1,369 @@ +.if !defined(WITH_CTF) +WITH_CTF=1 +.endif + +.include + +SRCDIR=${.CURDIR} +INCDIR=${.CURDIR:H}/include + +KMOD= openzfs + +.PATH: ${SRCDIR}/avl \ + ${SRCDIR}/lua \ + ${SRCDIR}/nvpair \ + ${SRCDIR}/os/freebsd/spl \ + ${SRCDIR}/os/freebsd/zfs \ + ${SRCDIR}/unicode \ + ${SRCDIR}/zcommon \ + ${SRCDIR}/zfs \ + ${SRCDIR}/zstd \ + ${SRCDIR}/zstd/lib + + + +CFLAGS+= -I${.OBJDIR:H}/include +CFLAGS+= -I${INCDIR} +CFLAGS+= -I${INCDIR}/os/freebsd +CFLAGS+= 
-I${INCDIR}/os/freebsd/spl +CFLAGS+= -I${INCDIR}/os/freebsd/zfs +CFLAGS+= -I${SRCDIR}/zstd/include +CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h + +CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ + -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \ + -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DHAVE_KSID -DCOMPAT_FREEBSD11 + +.if ${MACHINE_ARCH} == "amd64" +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3 +.endif + +.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" +CFLAGS+= -DZFS_DEBUG -g +.if defined(WITH_INVARIANTS) && ${WITH_INVARIANTS} == "true" + CFLAGS+= -DINVARIANTS -DWITNESS -DOPENSOLARIS_WITNESS +.endif +.if defined(WITH_O0) && ${WITH_O0} == "true" + CFLAGS+= -O0 +.endif +.else +CFLAGS += -DNDEBUG +.endif + +.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true" +# kernel must also be built with this option for this to work +CFLAGS+= -DDEBUG_VFS_LOCKS +.endif + +.if defined(WITH_GCOV) && ${WITH_GCOV} == "true" +CFLAGS+= -fprofile-arcs -ftest-coverage +.endif + +DEBUG_FLAGS=-g + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +CFLAGS+= -DBITS_PER_LONG=32 +.else +CFLAGS+= -DBITS_PER_LONG=64 +.endif + +SRCS= vnode_if.h device_if.h bus_if.h + +# avl +SRCS+= avl.c + +#lua +SRCS+= lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +#nvpair +SRCS+= nvpair.c \ + fnvpair.c \ + nvpair_alloc_spl.c \ + nvpair_alloc_fixed.c + +#os/freebsd/spl +SRCS+= acl_common.c \ + callb.c \ + list.c \ + sha256c.c \ + sha512c.c \ + spl_acl.c \ + spl_cmn_err.c \ + spl_dtrace.c \ + spl_kmem.c \ + spl_kstat.c \ + spl_misc.c \ + spl_policy.c \ + spl_procfs_list.c \ + spl_string.c \ + spl_sunddi.c \ + 
spl_sysevent.c \ + spl_taskq.c \ + spl_uio.c \ + spl_vfs.c \ + spl_vm.c \ + spl_zlib.c \ + spl_zone.c + + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +SRCS+= spl_atomic.c +.endif + +#os/freebsd/zfs +SRCS+= abd_os.c \ + arc_os.c \ + crypto_os.c \ + dmu_os.c \ + hkdf.c \ + kmod_core.c \ + spa_os.c \ + sysctl_os.c \ + vdev_file.c \ + vdev_geom.c \ + vdev_label_os.c \ + zfs_acl.c \ + zfs_ctldir.c \ + zfs_debug.c \ + zfs_dir.c \ + zfs_ioctl_compat.c \ + zfs_ioctl_os.c \ + zfs_racct.c \ + zfs_vfsops.c \ + zfs_vnops_os.c \ + zfs_znode.c \ + zio_crypt.c \ + zvol_os.c + +#unicode +SRCS+= uconv.c \ + u8_textprep.c + +#zcommon +SRCS+= zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_avx512.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zpool_prop.c \ + zprop_common.c + +#zfs +SRCS+= abd.c \ + aggsum.c \ + arc.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + btree.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + bptree.c \ + bqueue.c \ + dataset_kstats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_dataset.c \ + dsl_deadlist.c \ + dsl_deleg.c \ + dsl_bookmark.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_destroy.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_userhold.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + uberblock.c \ + unique.c \ + vdev.c \ + 
vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ + vdev_indirect.c \ + vdev_indirect_births.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_rebuild.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + vdev_removal.c \ + vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_file_os.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_ioctl.c \ + zfs_log.c \ + zfs_onexit.c \ + zfs_quota.c \ + zfs_ratelimit.c \ + zfs_replay.c \ + zfs_rlock.c \ + zfs_sa.c \ + zfs_vnops.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c \ + zvol.c + +#zstd +SRCS+= zfs_zstd.c \ + zstd.c + +beforeinstall: +.if ${MK_DEBUG_FILES} != "no" + mtree -eu \ + -f /etc/mtree/BSD.debug.dist \ + -p ${DESTDIR}/usr/lib +.endif + +.include + + +CFLAGS.gcc+= -Wno-pointer-to-int-cast + +CFLAGS.lapi.c= -Wno-cast-qual +CFLAGS.lcompat.c= -Wno-cast-qual +CFLAGS.lobject.c= -Wno-cast-qual +CFLAGS.ltable.c= -Wno-cast-qual +CFLAGS.lvm.c= -Wno-cast-qual +CFLAGS.nvpair.c= -DHAVE_RPC_TYPES -Wno-cast-qual +CFLAGS.spl_string.c= -Wno-cast-qual +CFLAGS.spl_vm.c= -Wno-cast-qual +CFLAGS.spl_zlib.c= -Wno-cast-qual +CFLAGS.abd.c= -Wno-cast-qual +CFLAGS.zfs_log.c= -Wno-cast-qual +CFLAGS.zfs_vnops_os.c= -Wno-pointer-arith +CFLAGS.u8_textprep.c= -Wno-cast-qual +CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zprop_common.c= -Wno-cast-qual +CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.dmu.c= 
-Wno-cast-qual +CFLAGS.dmu_traverse.c= -Wno-cast-qual +CFLAGS.dsl_dir.c= -Wno-cast-qual +CFLAGS.dsl_deadlist.c= -Wno-cast-qual +CFLAGS.dsl_prop.c= -Wno-cast-qual +CFLAGS.fm.c= -Wno-cast-qual +CFLAGS.lz4.c= -Wno-cast-qual +CFLAGS.spa.c= -Wno-cast-qual +CFLAGS.spa_misc.c= -Wno-cast-qual +CFLAGS.sysctl_os.c= -include ../zfs_config.h +CFLAGS.vdev_draid.c= -Wno-cast-qual +CFLAGS.vdev_raidz.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.zap_leaf.c= -Wno-cast-qual +CFLAGS.zap_micro.c= -Wno-cast-qual +CFLAGS.zcp.c= -Wno-cast-qual +CFLAGS.zfs_fm.c= -Wno-cast-qual +CFLAGS.zfs_ioctl.c= -Wno-cast-qual +CFLAGS.zil.c= -Wno-cast-qual +CFLAGS.zio.c= -Wno-cast-qual +CFLAGS.zrlock.c= -Wno-cast-qual +CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zstd.c= -fno-tree-vectorize -U__BMI__ diff --git a/module/Makefile.in b/module/Makefile.in index 935bd26630..b15ab91097 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -1,77 +1,143 @@ -subdir-m += avl -subdir-m += icp -subdir-m += lua -subdir-m += nvpair -subdir-m += spl -subdir-m += unicode -subdir-m += zcommon -subdir-m += zfs +include Kbuild INSTALL_MOD_DIR ?= extra +INSTALL_MOD_PATH ?= $(DESTDIR) -ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement -ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ -ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h -ZFS_MODULE_CFLAGS += -I@abs_top_srcdir@/include/spl -ZFS_MODULE_CFLAGS += -I@abs_top_srcdir@/include +SUBDIR_TARGETS = icp lua zstd -ZFS_MODULE_CPPFLAGS += -D_KERNEL -ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ +all: modules +distclean maintainer-clean: clean +install: modules_install +uninstall: modules_uninstall +check: 
-@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include -@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ +.PHONY: all distclean maintainer-clean install uninstall check distdir \ + modules modules-Linux modules-FreeBSD modules-unknown \ + clean clean-Linux clean-FreeBSD \ + modules_install modules_install-Linux modules_install-FreeBSD \ + modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \ + cppcheck cppcheck-Linux cppcheck-FreeBSD -export ZFS_MODULE_CFLAGS ZFS_MODULE_CPPFLAGS +# For FreeBSD, use debug options from ./configure if not overridden. +export WITH_DEBUG ?= @WITH_DEBUG@ +export WITH_INVARIANTS ?= @WITH_INVARIANTS@ -SUBDIR_TARGETS = icp lua +# Filter out options that FreeBSD make doesn't understand +getflags = ( \ +set -- \ + $(filter-out --%,$(firstword $(MFLAGS))) \ + $(filter -I%,$(MFLAGS)) \ + $(filter -j%,$(MFLAGS)); \ +fmakeflags=""; \ +while getopts :deiI:j:knqrstw flag; do \ + case $$flag in \ + \?) :;; \ + :) if [ $$OPTARG = "j" ]; then \ + ncpus=$$(sysctl -n kern.smp.cpus 2>/dev/null || :); \ + if [ -n "$$ncpus" ]; then fmakeflags="$$fmakeflags -j$$ncpus"; fi; \ + fi;; \ + d) fmakeflags="$$fmakeflags -dA";; \ + *) fmakeflags="$$fmakeflags -$$flag$$OPTARG";; \ + esac; \ +done; \ +echo $$fmakeflags \ +) +FMAKEFLAGS = -C @abs_srcdir@ -f Makefile.bsd $(shell $(getflags)) -modules: +ifneq (@abs_srcdir@,@abs_builddir@) +FMAKEFLAGS += MAKEOBJDIR=@abs_builddir@ +endif + +FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) + +modules-Linux: list='$(SUBDIR_TARGETS)'; for targetdir in $$list; do \ $(MAKE) -C $$targetdir; \ done - $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m $@ + $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m modules -clean: +modules-FreeBSD: + +$(FMAKE) + +modules-unknown: + @true + +modules: modules-@ac_system@ + +clean-Linux: @# Only cleanup the kernel build directories when CONFIG_KERNEL @# is defined. This indicates that kernel modules should be built. 
-@CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ $@ +@CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ clean if [ -f @LINUX_SYMBOLS@ ]; then $(RM) @LINUX_SYMBOLS@; fi if [ -f Module.markers ]; then $(RM) Module.markers; fi find . -name '*.ur-safe' -type f -print | xargs $(RM) -modules_install: +clean-FreeBSD: + +$(FMAKE) clean + +clean: clean-@ac_system@ + +modules_install-Linux: @# Install the kernel modules - $(MAKE) -C @LINUX_OBJ@ M=`pwd` $@ \ - INSTALL_MOD_PATH=$(DESTDIR)$(INSTALL_MOD_PATH) \ + $(MAKE) -C @LINUX_OBJ@ M=`pwd` modules_install \ + INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) \ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \ KERNELRELEASE=@LINUX_VERSION@ @# Remove extraneous build products when packaging - kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ + kmoddir=$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ if [ -n "$(DESTDIR)" ]; then \ find $$kmoddir -name 'modules.*' | xargs $(RM); \ fi - sysmap=$(DESTDIR)$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \ + sysmap=$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \ if [ -f $$sysmap ]; then \ depmod -ae -F $$sysmap @LINUX_VERSION@; \ fi -modules_uninstall: +modules_install-FreeBSD: + @# Install the kernel modules + +$(FMAKE) install + +modules_install: modules_install-@ac_system@ + +modules_uninstall-Linux: @# Uninstall the kernel modules - kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ - list='$(subdir-m)'; for subdir in $$list; do \ - $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$subdir; \ + kmoddir=$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ + for objdir in $(ZFS_MODULES); do \ + $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \ done +modules_uninstall-FreeBSD: + @false + +modules_uninstall: modules_uninstall-@ac_system@ + +cppcheck-Linux: + @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \ + --inline-suppr \ + --suppress=unmatchedSuppression \ + --suppress=noValidConfiguration \ + 
--enable=warning,information -D_KERNEL \ + --include=@LINUX_OBJ@/include/generated/autoconf.h \ + --include=@top_srcdir@/zfs_config.h \ + --config-exclude=@LINUX_OBJ@/include \ + -I @LINUX_OBJ@/include \ + -I @top_srcdir@/include/os/linux/kernel \ + -I @top_srcdir@/include/os/linux/spl \ + -I @top_srcdir@/include/os/linux/zfs \ + -I @top_srcdir@/include \ + avl icp lua nvpair spl unicode zcommon zfs zstd os/linux + +cppcheck-FreeBSD: + @true + +cppcheck: cppcheck-@ac_system@ + distdir: - list='$(subdir-m)'; for subdir in $$list; do \ - (cd @top_srcdir@/module && find $$subdir -name '*.c' -o -name '*.h' -o -name '*.S' |\ - xargs cp --parents -t $$distdir); \ - done - -distclean maintainer-clean: clean -install: modules_install -uninstall: modules_uninstall -all: modules -check: + (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \ + while read path; do \ + mkdir -p $$distdir/$${path%/*}; \ + cp @srcdir@/$$path $$distdir/$$path; \ + done; \ + cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd diff --git a/module/avl/Makefile.in b/module/avl/Makefile.in index 217fa3ca52..991d5f95b8 100644 --- a/module/avl/Makefile.in +++ b/module/avl/Makefile.in @@ -1,10 +1,10 @@ -src = @abs_top_srcdir@/module/avl +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ +endif MODULE := zavl obj-$(CONFIG_ZFS) := $(MODULE).o -ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) - $(MODULE)-objs += avl.o diff --git a/module/avl/avl.c b/module/avl/avl.c index 736dcee845..3d36d4c87e 100644 --- a/module/avl/avl.c +++ b/module/avl/avl.c @@ -96,6 +96,9 @@ * which each have their own compilation environments and subsequent * requirements. Each of these environments must be considered when adding * dependencies from avl.c. 
+ * + * Link to Illumos.org for more information on avl function: + * [1] https://illumos.org/man/9f/avl */ #include @@ -103,6 +106,7 @@ #include #include #include +#include /* * Small arrays to translate between balance (or diff) values and child indices. @@ -159,7 +163,7 @@ avl_walk(avl_tree_t *tree, void *oldnode, int left) node = node->avl_child[right]) ; /* - * Otherwise, return thru left children as far as we can. + * Otherwise, return through left children as far as we can. */ } else { for (;;) { @@ -268,7 +272,7 @@ avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); ASSERT(-1 <= diff && diff <= 1); if (diff == 0) { -#ifdef DEBUG +#ifdef ZFS_DEBUG if (where != NULL) *where = 0; #endif @@ -488,7 +492,6 @@ avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) int which_child = AVL_INDEX2CHILD(where); size_t off = tree->avl_offset; - ASSERT(tree); #ifdef _LP64 ASSERT(((uintptr_t)new_data & 0x7) == 0); #endif @@ -577,7 +580,7 @@ avl_insert_here( { avl_node_t *node; int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ -#ifdef DEBUG +#ifdef ZFS_DEBUG int diff; #endif @@ -592,7 +595,7 @@ avl_insert_here( */ node = AVL_DATA2NODE(here, tree->avl_offset); -#ifdef DEBUG +#ifdef ZFS_DEBUG diff = tree->avl_compar(new_data, here); ASSERT(-1 <= diff && diff <= 1); ASSERT(diff != 0); @@ -603,7 +606,7 @@ avl_insert_here( node = node->avl_child[child]; child = 1 - child; while (node->avl_child[child] != NULL) { -#ifdef DEBUG +#ifdef ZFS_DEBUG diff = tree->avl_compar(new_data, AVL_NODE2DATA(node, tree->avl_offset)); ASSERT(-1 <= diff && diff <= 1); @@ -612,7 +615,7 @@ avl_insert_here( #endif node = node->avl_child[child]; } -#ifdef DEBUG +#ifdef ZFS_DEBUG diff = tree->avl_compar(new_data, AVL_NODE2DATA(node, tree->avl_offset)); ASSERT(-1 <= diff && diff <= 1); @@ -676,8 +679,6 @@ avl_remove(avl_tree_t *tree, void *data) int which_child; size_t off = tree->avl_offset; - 
ASSERT(tree); - delete = AVL_DATA2NODE(data, off); /* @@ -808,6 +809,64 @@ avl_remove(avl_tree_t *tree, void *data) } while (parent != NULL); } +#define AVL_REINSERT(tree, obj) \ + avl_remove((tree), (obj)); \ + avl_add((tree), (obj)) + +boolean_t +avl_update_lt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) <= 0)); + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update_gt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) >= 0)); + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update(avl_tree_t *t, void *obj) +{ + void *neighbor; + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) { @@ -816,7 +875,6 @@ avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); - ASSERT3U(tree1->avl_size, ==, tree2->avl_size); temp_node = tree1->avl_root; temp_numnodes = tree1->avl_numnodes; @@ -844,7 +902,6 @@ avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), tree->avl_compar = compar; tree->avl_root = NULL; tree->avl_numnodes = 0; - tree->avl_size = size; tree->avl_offset = offset; } @@ -949,7 +1006,7 @@ avl_destroy_nodes(avl_tree_t *tree, void **cookie) --tree->avl_numnodes; /* - * If we just did a right child 
or there isn't one, go up to parent. + * If we just removed a right child or there isn't one, go up to parent. */ if (child == 1 || parent->avl_child[1] == NULL) { node = parent; @@ -993,7 +1050,6 @@ done: } #if defined(_KERNEL) -#include static int __init avl_init(void) @@ -1008,11 +1064,12 @@ avl_fini(void) module_init(avl_init); module_exit(avl_fini); +#endif -MODULE_DESCRIPTION("Generic AVL tree implementation"); -MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE(ZFS_META_LICENSE); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +ZFS_MODULE_DESCRIPTION("Generic AVL tree implementation"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(avl_create); EXPORT_SYMBOL(avl_find); @@ -1029,4 +1086,6 @@ EXPORT_SYMBOL(avl_remove); EXPORT_SYMBOL(avl_numnodes); EXPORT_SYMBOL(avl_destroy_nodes); EXPORT_SYMBOL(avl_destroy); -#endif +EXPORT_SYMBOL(avl_update_lt); +EXPORT_SYMBOL(avl_update_gt); +EXPORT_SYMBOL(avl_update); diff --git a/module/icp/Makefile.in b/module/icp/Makefile.in index 18e8dc313b..858c5a610c 100644 --- a/module/icp/Makefile.in +++ b/module/icp/Makefile.in @@ -1,34 +1,17 @@ -src = @abs_top_srcdir@/module/icp +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ +icp_include = $(src)/include +else +icp_include = $(srctree)/$(src)/include +endif MODULE := icp -TARGET_ASM_DIR = @TARGET_ASM_DIR@ - -ifeq ($(TARGET_ASM_DIR), asm-x86_64) -ASM_SOURCES := asm-x86_64/aes/aeskey.o -ASM_SOURCES += asm-x86_64/aes/aes_amd64.o -ASM_SOURCES += asm-x86_64/aes/aes_aesni.o -ASM_SOURCES += asm-x86_64/modes/gcm_pclmulqdq.o -ASM_SOURCES += asm-x86_64/sha1/sha1-x86_64.o -ASM_SOURCES += asm-x86_64/sha2/sha256_impl.o -ASM_SOURCES += asm-x86_64/sha2/sha512_impl.o -endif - -ifeq ($(TARGET_ASM_DIR), asm-i386) -ASM_SOURCES := -endif - -ifeq ($(TARGET_ASM_DIR), asm-generic) -ASM_SOURCES := -endif - obj-$(CONFIG_ZFS) := $(MODULE).o -asflags-y := -I$(src)/include 
-asflags-y += $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) -ccflags-y := -I$(src)/include -ccflags-y += $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) +asflags-y := -I$(icp_include) +ccflags-y := -I$(icp_include) $(MODULE)-objs += illumos-crypto.o $(MODULE)-objs += api/kcf_cipher.o @@ -62,16 +45,37 @@ $(MODULE)-objs += algs/aes/aes_modes.o $(MODULE)-objs += algs/edonr/edonr.o $(MODULE)-objs += algs/sha1/sha1.o $(MODULE)-objs += algs/sha2/sha2.o -$(MODULE)-objs += algs/sha1/sha1.o $(MODULE)-objs += algs/skein/skein.o $(MODULE)-objs += algs/skein/skein_block.o $(MODULE)-objs += algs/skein/skein_iv.o -$(MODULE)-objs += $(ASM_SOURCES) + +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aeskey.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_amd64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_aesni.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/gcm_pclmulqdq.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/aesni-gcm-x86_64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/ghash-x86_64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha1/sha1-x86_64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha256_impl.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha512_impl.o $(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o $(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o $(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o +# Suppress objtool "can't find jump dest instruction at" warnings. They +# are caused by the constants which are defined in the text section of the +# assembly file using .byte instructions (e.g. bswap_mask). The objtool +# utility tries to interpret them as opcodes and obviously fails doing so. +OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y +OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y +# Suppress objtool "unsupported stack pointer realignment" warnings. We are +# not using a DRAP register while aligning the stack to a 64 byte boundary. +# See #6950 for the reasoning. 
+OBJECT_FILES_NON_STANDARD_sha1-x86_64.o := y +OBJECT_FILES_NON_STANDARD_sha256_impl.o := y +OBJECT_FILES_NON_STANDARD_sha512_impl.o := y + ICP_DIRS = \ api \ core \ diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c index e150506357..037be0db60 100644 --- a/module/icp/algs/aes/aes_impl.c +++ b/module/icp/algs/aes/aes_impl.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -40,9 +41,9 @@ void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched) { - aes_impl_ops_t *ops = aes_impl_get_ops(); - aes_key_t *newbie = keysched; - uint_t keysize, i, j; + const aes_impl_ops_t *ops = aes_impl_get_ops(); + aes_key_t *newbie = keysched; + uint_t keysize, i, j; union { uint64_t ka64[4]; uint32_t ka32[8]; @@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0; static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)]; /* - * Selects the aes operations for encrypt/decrypt/key setup + * Returns the AES operations for encrypt/decrypt/key setup. When a + * SIMD implementation is not allowed in the current context, then + * fallback to the fastest generic implementation. 
*/ -aes_impl_ops_t * -aes_impl_get_ops() +const aes_impl_ops_t * +aes_impl_get_ops(void) { - aes_impl_ops_t *ops = NULL; + if (!kfpu_allowed()) + return (&aes_generic_impl); + + const aes_impl_ops_t *ops = NULL; const uint32_t impl = AES_IMPL_READ(icp_aes_impl); switch (impl) { @@ -266,15 +272,13 @@ aes_impl_get_ops() ops = &aes_fastest_impl; break; case IMPL_CYCLE: - { + /* Cycle through supported implementations */ ASSERT(aes_impl_initialized); ASSERT3U(aes_supp_impl_cnt, >, 0); - /* Cycle through supported implementations */ static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt; ops = aes_supp_impl[idx]; - } - break; + break; default: ASSERT3U(impl, <, aes_supp_impl_cnt); ASSERT3U(aes_supp_impl_cnt, >, 0); @@ -288,13 +292,16 @@ aes_impl_get_ops() return (ops); } +/* + * Initialize all supported implementations. + */ void aes_impl_init(void) { aes_impl_ops_t *curr_impl; int i, c; - /* move supported impl into aes_supp_impls */ + /* Move supported implementations into aes_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) { curr_impl = (aes_impl_ops_t *)aes_all_impl[i]; @@ -303,22 +310,27 @@ aes_impl_init(void) } aes_supp_impl_cnt = c; - /* set fastest implementation. assume hardware accelerated is fastest */ + /* + * Set the fastest implementation given the assumption that the + * hardware accelerated version is the fastest. 
+ */ #if defined(__x86_64) #if defined(HAVE_AES) - if (aes_aesni_impl.is_supported()) + if (aes_aesni_impl.is_supported()) { memcpy(&aes_fastest_impl, &aes_aesni_impl, sizeof (aes_fastest_impl)); - else + } else #endif + { memcpy(&aes_fastest_impl, &aes_x86_64_impl, sizeof (aes_fastest_impl)); + } #else memcpy(&aes_fastest_impl, &aes_generic_impl, sizeof (aes_fastest_impl)); #endif - strcpy(aes_fastest_impl.name, "fastest"); + strlcpy(aes_fastest_impl.name, "fastest", AES_IMPL_NAME_MAX); /* Finish initialization */ atomic_swap_32(&icp_aes_impl, user_sel_impl); @@ -393,8 +405,7 @@ aes_impl_set(const char *val) return (err); } -#if defined(_KERNEL) -#include +#if defined(_KERNEL) && defined(__linux__) static int icp_aes_impl_set(const char *val, zfs_kernel_param_t *kp) diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c index 97f7c3a478..4b5eefd71b 100644 --- a/module/icp/algs/aes/aes_impl_aesni.c +++ b/module/icp/algs/aes/aes_impl_aesni.c @@ -24,7 +24,8 @@ #if defined(__x86_64) && defined(HAVE_AES) -#include +#include +#include /* These functions are used to execute AES-NI instructions: */ extern int rijndael_key_setup_enc_intel(uint32_t rk[], @@ -108,7 +109,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4], static boolean_t aes_aesni_will_work(void) { - return (zfs_aes_available()); + return (kfpu_allowed() && zfs_aes_available()); } const aes_impl_ops_t aes_aesni_impl = { diff --git a/module/icp/algs/aes/aes_impl_generic.c b/module/icp/algs/aes/aes_impl_generic.c index a3b75dbf32..427c096c6a 100644 --- a/module/icp/algs/aes/aes_impl_generic.c +++ b/module/icp/algs/aes/aes_impl_generic.c @@ -1233,7 +1233,7 @@ const aes_impl_ops_t aes_generic_impl = { .encrypt = &aes_generic_encrypt, .decrypt = &aes_generic_decrypt, .is_supported = &aes_generic_will_work, -#if defined(_LITTLE_ENDIAN) +#if defined(_ZFS_LITTLE_ENDIAN) .needs_byteswap = B_TRUE, #else .needs_byteswap = B_FALSE, diff --git 
a/module/icp/algs/aes/aes_impl_x86-64.c b/module/icp/algs/aes/aes_impl_x86-64.c index b4515fa22c..19f8fd5012 100644 --- a/module/icp/algs/aes/aes_impl_x86-64.c +++ b/module/icp/algs/aes/aes_impl_x86-64.c @@ -24,19 +24,7 @@ #if defined(__x86_64) -#include - -/* These functions are used to execute amd64 instructions for AMD or Intel: */ -extern int rijndael_key_setup_enc_amd64(uint32_t rk[], - const uint32_t cipherKey[], int keyBits); -extern int rijndael_key_setup_dec_amd64(uint32_t rk[], - const uint32_t cipherKey[], int keyBits); -extern void aes_encrypt_amd64(const uint32_t rk[], int Nr, - const uint32_t pt[4], uint32_t ct[4]); -extern void aes_decrypt_amd64(const uint32_t rk[], int Nr, - const uint32_t ct[4], uint32_t pt[4]); - - +#include #include /* diff --git a/module/icp/algs/edonr/edonr.c b/module/icp/algs/edonr/edonr.c index 7c677095f1..ee96e692ef 100644 --- a/module/icp/algs/edonr/edonr.c +++ b/module/icp/algs/edonr/edonr.c @@ -337,7 +337,7 @@ Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p) * * Checksum functions like this one can go over the stack frame size check * Linux imposes on 32-bit platforms (-Wframe-larger-than=1024). We can - * safely ignore the compiler error since we know that in ZoL, that + * safely ignore the compiler error since we know that in OpenZFS, that * the function will be called from a worker thread that won't be using * much stack. The only function that goes over the 1k limit is Q512(), * which only goes over it by a hair (1248 bytes on ARM32). 
diff --git a/module/icp/algs/edonr/edonr_byteorder.h b/module/icp/algs/edonr/edonr_byteorder.h index 532dfd7434..2b5d48287f 100644 --- a/module/icp/algs/edonr/edonr_byteorder.h +++ b/module/icp/algs/edonr/edonr_byteorder.h @@ -52,10 +52,10 @@ #endif /* __BYTE_ORDER || BYTE_ORDER */ #if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN) -#if defined(_BIG_ENDIAN) || defined(_MIPSEB) +#if defined(_ZFS_BIG_ENDIAN) || defined(_MIPSEB) #define MACHINE_IS_BIG_ENDIAN #endif -#if defined(_LITTLE_ENDIAN) || defined(_MIPSEL) +#if defined(_ZFS_LITTLE_ENDIAN) || defined(_MIPSEL) #define MACHINE_IS_LITTLE_ENDIAN #endif #endif /* !MACHINE_IS_BIG_ENDIAN && !MACHINE_IS_LITTLE_ENDIAN */ diff --git a/module/icp/algs/modes/cbc.c b/module/icp/algs/modes/cbc.c index 2cc94ec726..85864f56de 100644 --- a/module/icp/algs/modes/cbc.c +++ b/module/icp/algs/modes/cbc.c @@ -60,8 +60,7 @@ cbc_encrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, } lastp = (uint8_t *)ctx->cbc_iv; - if (out != NULL) - crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_init_ptrs(out, &iov_or_mp, &offset); do { /* Unprocessed data from last call. */ @@ -79,47 +78,28 @@ cbc_encrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, blockp = datap; } - if (out == NULL) { - /* - * XOR the previous cipher block or IV with the - * current clear block. - */ - xor_block(lastp, blockp); - encrypt(ctx->cbc_keysched, blockp, blockp); + /* + * XOR the previous cipher block or IV with the + * current clear block. 
+ */ + xor_block(blockp, lastp); + encrypt(ctx->cbc_keysched, lastp, lastp); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); - ctx->cbc_lastp = blockp; - lastp = blockp; - - if (ctx->cbc_remainder_len > 0) { - bcopy(blockp, ctx->cbc_copy_to, - ctx->cbc_remainder_len); - bcopy(blockp + ctx->cbc_remainder_len, datap, - need); - } + /* copy block to where it belongs */ + if (out_data_1_len == block_size) { + copy_block(lastp, out_data_1); } else { - /* - * XOR the previous cipher block or IV with the - * current clear block. - */ - xor_block(blockp, lastp); - encrypt(ctx->cbc_keysched, lastp, lastp); - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); - - /* copy block to where it belongs */ - if (out_data_1_len == block_size) { - copy_block(lastp, out_data_1); - } else { - bcopy(lastp, out_data_1, out_data_1_len); - if (out_data_2 != NULL) { - bcopy(lastp + out_data_1_len, - out_data_2, - block_size - out_data_1_len); - } + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, + out_data_2, + block_size - out_data_1_len); } - /* update offset */ - out->cd_offset += block_size; } + /* update offset */ + out->cd_offset += block_size; /* Update pointer to next block of data to be processed. */ if (ctx->cbc_remainder_len != 0) { @@ -187,8 +167,7 @@ cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, } lastp = ctx->cbc_lastp; - if (out != NULL) - crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_init_ptrs(out, &iov_or_mp, &offset); do { /* Unprocessed data from last call. 
*/ @@ -209,13 +188,9 @@ cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, /* LINTED: pointer alignment */ copy_block(blockp, (uint8_t *)OTHER((uint64_t *)lastp, ctx)); - if (out != NULL) { - decrypt(ctx->cbc_keysched, blockp, - (uint8_t *)ctx->cbc_remainder); - blockp = (uint8_t *)ctx->cbc_remainder; - } else { - decrypt(ctx->cbc_keysched, blockp, blockp); - } + decrypt(ctx->cbc_keysched, blockp, + (uint8_t *)ctx->cbc_remainder); + blockp = (uint8_t *)ctx->cbc_remainder; /* * XOR the previous cipher block or IV with the @@ -226,25 +201,18 @@ cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, /* LINTED: pointer alignment */ lastp = (uint8_t *)OTHER((uint64_t *)lastp, ctx); - if (out != NULL) { - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); - bcopy(blockp, out_data_1, out_data_1_len); - if (out_data_2 != NULL) { - bcopy(blockp + out_data_1_len, out_data_2, - block_size - out_data_1_len); - } - - /* update offset */ - out->cd_offset += block_size; - - } else if (ctx->cbc_remainder_len > 0) { - /* copy temporary block to where it belongs */ - bcopy(blockp, ctx->cbc_copy_to, ctx->cbc_remainder_len); - bcopy(blockp + ctx->cbc_remainder_len, datap, need); + bcopy(blockp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(blockp + out_data_1_len, out_data_2, + block_size - out_data_1_len); } + /* update offset */ + out->cd_offset += block_size; + /* Update pointer to next block of data to be processed. 
*/ if (ctx->cbc_remainder_len != 0) { datap += need; diff --git a/module/icp/algs/modes/ccm.c b/module/icp/algs/modes/ccm.c index fb41194f81..5d6507c49d 100644 --- a/module/icp/algs/modes/ccm.c +++ b/module/icp/algs/modes/ccm.c @@ -68,8 +68,7 @@ ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, } lastp = (uint8_t *)ctx->ccm_cb; - if (out != NULL) - crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_init_ptrs(out, &iov_or_mp, &offset); mac_buf = (uint8_t *)ctx->ccm_mac_buf; @@ -108,13 +107,13 @@ ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, * Increment counter. Counter bits are confined * to the bottom 64 bits of the counter block. */ -#ifdef _LITTLE_ENDIAN +#ifdef _ZFS_LITTLE_ENDIAN counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask); counter = htonll(counter + 1); #else counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask; counter++; -#endif /* _LITTLE_ENDIAN */ +#endif /* _ZFS_LITTLE_ENDIAN */ counter &= ctx->ccm_counter_mask; ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; @@ -126,31 +125,22 @@ ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, ctx->ccm_processed_data_len += block_size; - if (out == NULL) { - if (ctx->ccm_remainder_len > 0) { - bcopy(blockp, ctx->ccm_copy_to, - ctx->ccm_remainder_len); - bcopy(blockp + ctx->ccm_remainder_len, datap, - need); - } - } else { - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); - /* copy block to where it belongs */ - if (out_data_1_len == block_size) { - copy_block(lastp, out_data_1); - } else { - bcopy(lastp, out_data_1, out_data_1_len); - if (out_data_2 != NULL) { - bcopy(lastp + out_data_1_len, - out_data_2, - block_size - out_data_1_len); - } + /* copy block to where it belongs */ + if (out_data_1_len == block_size) { + copy_block(lastp, out_data_1); 
+ } else { + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, + out_data_2, + block_size - out_data_1_len); } - /* update offset */ - out->cd_offset += block_size; } + /* update offset */ + out->cd_offset += block_size; /* Update pointer to next block of data to be processed. */ if (ctx->ccm_remainder_len != 0) { @@ -328,7 +318,7 @@ ccm_encrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size, * This will only deal with decrypting the last block of the input that * might not be a multiple of block length. */ -void +static void ccm_decrypt_incomplete_block(ccm_ctx_t *ctx, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) { @@ -468,13 +458,13 @@ ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, * Increment counter. * Counter bits are confined to the bottom 64 bits */ -#ifdef _LITTLE_ENDIAN +#ifdef _ZFS_LITTLE_ENDIAN counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask); counter = htonll(counter + 1); #else counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask; counter++; -#endif /* _LITTLE_ENDIAN */ +#endif /* _ZFS_LITTLE_ENDIAN */ counter &= ctx->ccm_counter_mask; ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; @@ -583,7 +573,7 @@ ccm_decrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size, return (CRYPTO_SUCCESS); } -int +static int ccm_validate_args(CK_AES_CCM_PARAMS *ccm_param, boolean_t is_encrypt_init) { size_t macSize, nonceSize; @@ -694,7 +684,7 @@ ccm_format_initial_blocks(uchar_t *nonce, ulong_t nonceSize, mask |= (1ULL << q); } -#ifdef _LITTLE_ENDIAN +#ifdef _ZFS_LITTLE_ENDIAN mask = htonll(mask); #endif aes_ctx->ccm_counter_mask = mask; @@ -768,11 +758,7 @@ encode_adata_len(ulong_t auth_data_len, uint8_t *encoded, size_t *encoded_len) } } -/* - * The following function should be call at encrypt or decrypt init time - * for AES CCM mode. 
- */ -int +static int ccm_init(ccm_ctx_t *ctx, unsigned char *nonce, size_t nonce_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), @@ -856,6 +842,10 @@ ccm_init(ccm_ctx_t *ctx, unsigned char *nonce, size_t nonce_len, return (CRYPTO_SUCCESS); } +/* + * The following function should be call at encrypt or decrypt init time + * for AES CCM mode. + */ int ccm_init_ctx(ccm_ctx_t *ccm_ctx, char *param, int kmflag, boolean_t is_encrypt_init, size_t block_size, @@ -885,15 +875,13 @@ ccm_init_ctx(ccm_ctx_t *ccm_ctx, char *param, int kmflag, ccm_ctx->ccm_flags |= CCM_MODE; } else { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - goto out; + return (CRYPTO_MECHANISM_PARAM_INVALID); } if (ccm_init(ccm_ctx, ccm_param->nonce, ccm_param->ulNonceSize, ccm_param->authData, ccm_param->ulAuthDataSize, block_size, encrypt_block, xor_block) != 0) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - goto out; + return (CRYPTO_MECHANISM_PARAM_INVALID); } if (!is_encrypt_init) { /* allocate buffer for storing decrypted plaintext */ @@ -903,7 +891,6 @@ ccm_init_ctx(ccm_ctx_t *ccm_ctx, char *param, int kmflag, rv = CRYPTO_HOST_MEMORY; } } -out: return (rv); } diff --git a/module/icp/algs/modes/ctr.c b/module/icp/algs/modes/ctr.c index e3b0e12382..0188bdd395 100644 --- a/module/icp/algs/modes/ctr.c +++ b/module/icp/algs/modes/ctr.c @@ -61,8 +61,7 @@ ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, } lastp = (uint8_t *)ctx->ctr_cb; - if (out != NULL) - crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_init_ptrs(out, &iov_or_mp, &offset); do { /* Unprocessed data from last call. 
*/ @@ -111,26 +110,17 @@ ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, */ xor_block(blockp, lastp); - if (out == NULL) { - if (ctx->ctr_remainder_len > 0) { - bcopy(lastp, ctx->ctr_copy_to, - ctx->ctr_remainder_len); - bcopy(lastp + ctx->ctr_remainder_len, datap, - need); - } - } else { - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); - /* copy block to where it belongs */ - bcopy(lastp, out_data_1, out_data_1_len); - if (out_data_2 != NULL) { - bcopy(lastp + out_data_1_len, out_data_2, - block_size - out_data_1_len); - } - /* update offset */ - out->cd_offset += block_size; + /* copy block to where it belongs */ + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, out_data_2, + block_size - out_data_1_len); } + /* update offset */ + out->cd_offset += block_size; /* Update pointer to next block of data to be processed. */ if (ctx->ctr_remainder_len != 0) { diff --git a/module/icp/algs/modes/ecb.c b/module/icp/algs/modes/ecb.c index 04e6c5eaa6..025f5825cf 100644 --- a/module/icp/algs/modes/ecb.c +++ b/module/icp/algs/modes/ecb.c @@ -58,8 +58,7 @@ ecb_cipher_contiguous_blocks(ecb_ctx_t *ctx, char *data, size_t length, } lastp = (uint8_t *)ctx->ecb_iv; - if (out != NULL) - crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_init_ptrs(out, &iov_or_mp, &offset); do { /* Unprocessed data from last call. 
*/ @@ -77,32 +76,18 @@ ecb_cipher_contiguous_blocks(ecb_ctx_t *ctx, char *data, size_t length, blockp = datap; } - if (out == NULL) { - cipher(ctx->ecb_keysched, blockp, blockp); + cipher(ctx->ecb_keysched, blockp, lastp); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); - ctx->ecb_lastp = blockp; - lastp = blockp; - - if (ctx->ecb_remainder_len > 0) { - bcopy(blockp, ctx->ecb_copy_to, - ctx->ecb_remainder_len); - bcopy(blockp + ctx->ecb_remainder_len, datap, - need); - } - } else { - cipher(ctx->ecb_keysched, blockp, lastp); - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); - - /* copy block to where it belongs */ - bcopy(lastp, out_data_1, out_data_1_len); - if (out_data_2 != NULL) { - bcopy(lastp + out_data_1_len, out_data_2, - block_size - out_data_1_len); - } - /* update offset */ - out->cd_offset += block_size; + /* copy block to where it belongs */ + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, out_data_2, + block_size - out_data_1_len); } + /* update offset */ + out->cd_offset += block_size; /* Update pointer to next block of data to be processed. 
*/ if (ctx->ecb_remainder_len != 0) { diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index 13bceef0f1..7332834cbe 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -28,13 +28,53 @@ #include #include #include +#include #include +#ifdef CAN_USE_GCM_ASM +#include +#endif #define GHASH(c, d, t, o) \ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ (uint64_t *)(void *)(t)); +/* Select GCM implementation */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX-1) +#ifdef CAN_USE_GCM_ASM +#define IMPL_AVX (UINT32_MAX-2) +#endif +#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) +static uint32_t icp_gcm_impl = IMPL_FASTEST; +static uint32_t user_sel_impl = IMPL_FASTEST; + +#ifdef CAN_USE_GCM_ASM +/* Does the architecture we run on support the MOVBE instruction? */ +boolean_t gcm_avx_can_use_movbe = B_FALSE; +/* + * Whether to use the optimized openssl gcm and ghash implementations. + * Set to true if module parameter icp_gcm_impl == "avx". + */ +static boolean_t gcm_use_avx = B_FALSE; +#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) + +extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); + +static inline boolean_t gcm_avx_will_work(void); +static inline void gcm_set_avx(boolean_t); +static inline boolean_t gcm_toggle_avx(void); +static inline size_t gcm_simd_get_htab_size(boolean_t); + +static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, + crypto_data_t *, size_t); + +static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); +static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); +static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, + size_t, size_t); +#endif /* ifdef CAN_USE_GCM_ASM */ + /* * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode * is done in another function. 
@@ -46,7 +86,13 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; +#ifdef CAN_USE_GCM_ASM + if (ctx->gcm_use_avx == B_TRUE) + return (gcm_mode_encrypt_contiguous_blocks_avx( + ctx, data, length, out, block_size)); +#endif + + const gcm_impl_ops_t *gops; size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; @@ -66,13 +112,14 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, length); ctx->gcm_remainder_len += length; - ctx->gcm_copy_to = datap; + if (ctx->gcm_copy_to == NULL) { + ctx->gcm_copy_to = datap; + } return (CRYPTO_SUCCESS); } lastp = (uint8_t *)ctx->gcm_cb; - if (out != NULL) - crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_init_ptrs(out, &iov_or_mp, &offset); gops = gcm_impl_get_ops(); do { @@ -108,31 +155,22 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, ctx->gcm_processed_data_len += block_size; - if (out == NULL) { - if (ctx->gcm_remainder_len > 0) { - bcopy(blockp, ctx->gcm_copy_to, - ctx->gcm_remainder_len); - bcopy(blockp + ctx->gcm_remainder_len, datap, - need); - } - } else { - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); - /* copy block to where it belongs */ - if (out_data_1_len == block_size) { - copy_block(lastp, out_data_1); - } else { - bcopy(lastp, out_data_1, out_data_1_len); - if (out_data_2 != NULL) { - bcopy(lastp + out_data_1_len, - out_data_2, - block_size - out_data_1_len); - } + /* copy block to where it belongs */ + if (out_data_1_len == block_size) { + copy_block(lastp, out_data_1); + } else { + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + 
out_data_1_len, + out_data_2, + block_size - out_data_1_len); } - /* update offset */ - out->cd_offset += block_size; } + /* update offset */ + out->cd_offset += block_size; /* add ciphertext to the hash */ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops); @@ -168,7 +206,12 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; +#ifdef CAN_USE_GCM_ASM + if (ctx->gcm_use_avx == B_TRUE) + return (gcm_encrypt_final_avx(ctx, out, block_size)); +#endif + + const gcm_impl_ops_t *gops; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint8_t *ghash, *macp = NULL; int i, rv; @@ -299,11 +342,13 @@ gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, if (length > 0) { new_len = ctx->gcm_pt_buf_len + length; new = vmem_alloc(new_len, ctx->gcm_kmflag); + if (new == NULL) { + vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); + ctx->gcm_pt_buf = NULL; + return (CRYPTO_HOST_MEMORY); + } bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len); vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); - if (new == NULL) - return (CRYPTO_HOST_MEMORY); - ctx->gcm_pt_buf = new; ctx->gcm_pt_buf_len = new_len; bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len], @@ -320,7 +365,12 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; +#ifdef CAN_USE_GCM_ASM + if (ctx->gcm_use_avx == B_TRUE) + return (gcm_decrypt_final_avx(ctx, out, block_size)); +#endif + + const gcm_impl_ops_t *gops; size_t pt_len; size_t remainder; uint8_t *ghash; @@ -427,7 +477,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; + const gcm_impl_ops_t *gops; uint8_t *cb; ulong_t remainder = iv_len; ulong_t 
processed = 0; @@ -470,18 +520,14 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, } } -/* - * The following function is called at encrypt or decrypt init time - * for AES GCM mode. - */ -int +static int gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { - gcm_impl_ops_t *gops; + const gcm_impl_ops_t *gops; uint8_t *ghash, *datap, *authp; size_t remainder, processed; @@ -525,6 +571,12 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, return (CRYPTO_SUCCESS); } +/* + * The following function is called at encrypt or decrypt init time + * for AES GCM mode. + * + * Init the GCM context struct. Handle the cycle and avx implementations here. + */ int gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), @@ -552,16 +604,65 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GCM_MODE; } else { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - goto out; + return (CRYPTO_MECHANISM_PARAM_INVALID); } - if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, - gcm_param->pAAD, gcm_param->ulAADLen, block_size, - encrypt_block, copy_block, xor_block) != 0) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; +#ifdef CAN_USE_GCM_ASM + if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { + gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + } else { + /* + * Handle the "cycle" implementation by creating avx and + * non-avx contexts alternately. + */ + gcm_ctx->gcm_use_avx = gcm_toggle_avx(); + /* + * We don't handle byte swapped key schedules in the avx + * code path. 
+ */ + aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; + if (ks->ops->needs_byteswap == B_TRUE) { + gcm_ctx->gcm_use_avx = B_FALSE; + } + /* Use the MOVBE and the BSWAP variants alternately. */ + if (gcm_ctx->gcm_use_avx == B_TRUE && + zfs_movbe_available() == B_TRUE) { + (void) atomic_toggle_boolean_nv( + (volatile boolean_t *)&gcm_avx_can_use_movbe); + } } -out: + /* Allocate Htab memory as needed. */ + if (gcm_ctx->gcm_use_avx == B_TRUE) { + size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + + if (htab_len == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + gcm_ctx->gcm_htab_len = htab_len; + gcm_ctx->gcm_Htable = + (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag); + + if (gcm_ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } + /* Avx and non avx context initialization differs from here on. */ + if (gcm_ctx->gcm_use_avx == B_FALSE) { +#endif /* ifdef CAN_USE_GCM_ASM */ + if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, + gcm_param->pAAD, gcm_param->ulAADLen, block_size, + encrypt_block, copy_block, xor_block) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } +#ifdef CAN_USE_GCM_ASM + } else { + if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, + gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } + } +#endif /* ifdef CAN_USE_GCM_ASM */ + return (rv); } @@ -587,16 +688,57 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GMAC_MODE; } else { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - goto out; + return (CRYPTO_MECHANISM_PARAM_INVALID); } - if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, - gmac_param->pAAD, gmac_param->ulAADLen, block_size, - encrypt_block, copy_block, xor_block) != 0) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; +#ifdef CAN_USE_GCM_ASM + /* + * Handle the "cycle" implementation by creating avx and non avx + * contexts alternately. 
+ */ + if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { + gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + } else { + gcm_ctx->gcm_use_avx = gcm_toggle_avx(); } -out: + /* We don't handle byte swapped key schedules in the avx code path. */ + aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; + if (ks->ops->needs_byteswap == B_TRUE) { + gcm_ctx->gcm_use_avx = B_FALSE; + } + /* Allocate Htab memory as needed. */ + if (gcm_ctx->gcm_use_avx == B_TRUE) { + size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + + if (htab_len == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + gcm_ctx->gcm_htab_len = htab_len; + gcm_ctx->gcm_Htable = + (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag); + + if (gcm_ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } + + /* Avx and non avx context initialization differs from here on. */ + if (gcm_ctx->gcm_use_avx == B_FALSE) { +#endif /* ifdef CAN_USE_GCM_ASM */ + if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, + gmac_param->pAAD, gmac_param->ulAADLen, block_size, + encrypt_block, copy_block, xor_block) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } +#ifdef CAN_USE_GCM_ASM + } else { + if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, + gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } + } +#endif /* ifdef CAN_USE_GCM_ASM */ + return (rv); } @@ -646,26 +788,22 @@ const gcm_impl_ops_t *gcm_all_impl[] = { /* Indicate that benchmark has been completed */ static boolean_t gcm_impl_initialized = B_FALSE; -/* Select aes implementation */ -#define IMPL_FASTEST (UINT32_MAX) -#define IMPL_CYCLE (UINT32_MAX-1) - -#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) - -static uint32_t icp_gcm_impl = IMPL_FASTEST; -static uint32_t user_sel_impl = IMPL_FASTEST; - /* Hold all supported implementations */ static size_t gcm_supp_impl_cnt = 0; static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; /* - * Selects the gcm operation + * Returns 
the GCM operations for encrypt/decrypt/key setup. When a + * SIMD implementation is not allowed in the current context, then + * fallback to the fastest generic implementation. */ -gcm_impl_ops_t * +const gcm_impl_ops_t * gcm_impl_get_ops() { - gcm_impl_ops_t *ops = NULL; + if (!kfpu_allowed()) + return (&gcm_generic_impl); + + const gcm_impl_ops_t *ops = NULL; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); switch (impl) { @@ -674,15 +812,23 @@ gcm_impl_get_ops() ops = &gcm_fastest_impl; break; case IMPL_CYCLE: - { + /* Cycle through supported implementations */ ASSERT(gcm_impl_initialized); ASSERT3U(gcm_supp_impl_cnt, >, 0); - /* Cycle through supported implementations */ static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; ops = gcm_supp_impl[idx]; - } - break; + break; +#ifdef CAN_USE_GCM_ASM + case IMPL_AVX: + /* + * Make sure that we return a valid implementation while + * switching to the avx implementation since there still + * may be unfinished non-avx contexts around. + */ + ops = &gcm_generic_impl; + break; +#endif default: ASSERT3U(impl, <, gcm_supp_impl_cnt); ASSERT3U(gcm_supp_impl_cnt, >, 0); @@ -696,13 +842,16 @@ gcm_impl_get_ops() return (ops); } +/* + * Initialize all supported implementations. + */ void gcm_impl_init(void) { gcm_impl_ops_t *curr_impl; int i, c; - /* move supported impl into aes_supp_impls */ + /* Move supported implementations into gcm_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; @@ -711,18 +860,39 @@ gcm_impl_init(void) } gcm_supp_impl_cnt = c; - /* set fastest implementation. assume hardware accelerated is fastest */ + /* + * Set the fastest implementation given the assumption that the + * hardware accelerated version is the fastest. 
+ */ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) - if (gcm_pclmulqdq_impl.is_supported()) + if (gcm_pclmulqdq_impl.is_supported()) { memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, sizeof (gcm_fastest_impl)); - else + } else #endif + { memcpy(&gcm_fastest_impl, &gcm_generic_impl, sizeof (gcm_fastest_impl)); + } - strcpy(gcm_fastest_impl.name, "fastest"); + strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX); +#ifdef CAN_USE_GCM_ASM + /* + * Use the avx implementation if it's available and the implementation + * hasn't changed from its default value of fastest on module load. + */ + if (gcm_avx_will_work()) { +#ifdef HAVE_MOVBE + if (zfs_movbe_available() == B_TRUE) { + atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); + } +#endif + if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { + gcm_set_avx(B_TRUE); + } + } +#endif /* Finish initialization */ atomic_swap_32(&icp_gcm_impl, user_sel_impl); gcm_impl_initialized = B_TRUE; @@ -734,6 +904,9 @@ static const struct { } gcm_impl_opts[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, +#ifdef CAN_USE_GCM_ASM + { "avx", IMPL_AVX }, +#endif }; /* @@ -742,7 +915,7 @@ static const struct { * If we are called before init(), user preference will be saved in * user_sel_impl, and applied in later init() call. This occurs when module * parameter is specified on module load. Otherwise, directly update - * icp_aes_impl. + * icp_gcm_impl. * * @val Name of gcm implementation to use * @param Unused. @@ -767,6 +940,12 @@ gcm_impl_set(const char *val) /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { +#ifdef CAN_USE_GCM_ASM + /* Ignore avx implementation if it won't work. 
*/ + if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { + continue; + } +#endif if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { impl = gcm_impl_opts[i].sel; err = 0; @@ -785,6 +964,18 @@ gcm_impl_set(const char *val) } } } +#ifdef CAN_USE_GCM_ASM + /* + * Use the avx implementation if available and the requested one is + * avx or fastest. + */ + if (gcm_avx_will_work() == B_TRUE && + (impl == IMPL_AVX || impl == IMPL_FASTEST)) { + gcm_set_avx(B_TRUE); + } else { + gcm_set_avx(B_FALSE); + } +#endif if (err == 0) { if (gcm_impl_initialized) @@ -796,8 +987,7 @@ gcm_impl_set(const char *val) return (err); } -#if defined(_KERNEL) -#include +#if defined(_KERNEL) && defined(__linux__) static int icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp) @@ -816,6 +1006,12 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { +#ifdef CAN_USE_GCM_ASM + /* Ignore avx implementation if it won't work. */ + if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { + continue; + } +#endif fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name); } @@ -832,4 +1028,560 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, NULL, 0644); MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); -#endif +#endif /* defined(__KERNEL) */ + +#ifdef CAN_USE_GCM_ASM +#define GCM_BLOCK_LEN 16 +/* + * The openssl asm routines are 6x aggregated and need that many bytes + * at minimum. + */ +#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) +#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) +/* + * Ensure the chunk size is reasonable since we are allocating a + * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. 
+ */ +#define GCM_AVX_MAX_CHUNK_SIZE \ + (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) + +/* Get the chunk size module parameter. */ +#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size + +/* Clear the FPU registers since they hold sensitive internal state. */ +#define clear_fpu_regs() clear_fpu_regs_avx() +#define GHASH_AVX(ctx, in, len) \ + gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ + in, len) + +#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) + +/* + * Module parameter: number of bytes to process at once while owning the FPU. + * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is + * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. + */ +static uint32_t gcm_avx_chunk_size = + ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; + +extern void clear_fpu_regs_avx(void); +extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); +extern void aes_encrypt_intel(const uint32_t rk[], int nr, + const uint32_t pt[4], uint32_t ct[4]); + +extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); +extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, + const uint8_t *in, size_t len); + +extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, + const void *, uint64_t *, uint64_t *); + +extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, + const void *, uint64_t *, uint64_t *); + +static inline boolean_t +gcm_avx_will_work(void) +{ + /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. 
*/ + return (kfpu_allowed() && + zfs_avx_available() && zfs_aes_available() && + zfs_pclmulqdq_available()); +} + +static inline void +gcm_set_avx(boolean_t val) +{ + if (gcm_avx_will_work() == B_TRUE) { + atomic_swap_32(&gcm_use_avx, val); + } +} + +static inline boolean_t +gcm_toggle_avx(void) +{ + if (gcm_avx_will_work() == B_TRUE) { + return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); + } else { + return (B_FALSE); + } +} + +static inline size_t +gcm_simd_get_htab_size(boolean_t simd_mode) +{ + switch (simd_mode) { + case B_TRUE: + return (2 * 6 * 2 * sizeof (uint64_t)); + + default: + return (0); + } +} + +/* + * Clear sensitive data in the context. + * + * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and + * ctx->gcm_Htable contain the hash sub key which protects authentication. + * + * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for + * a known plaintext attack, they consists of the IV and the first and last + * counter respectively. If they should be cleared is debatable. + */ +static inline void +gcm_clear_ctx(gcm_ctx_t *ctx) +{ + bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder)); + bzero(ctx->gcm_H, sizeof (ctx->gcm_H)); + bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0)); + bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp)); +} + +/* Increment the GCM counter block by n. */ +static inline void +gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) +{ + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); + + counter = htonll(counter + n); + counter &= counter_mask; + ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; +} + +/* + * Encrypt multiple blocks of data in GCM mode. + * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines + * if possible. While processing a chunk the FPU is "locked". 
+ */ +static int +gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, + size_t length, crypto_data_t *out, size_t block_size) +{ + size_t bleft = length; + size_t need = 0; + size_t done = 0; + uint8_t *datap = (uint8_t *)data; + size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); + uint64_t *ghash = ctx->gcm_ghash; + uint64_t *cb = ctx->gcm_cb; + uint8_t *ct_buf = NULL; + uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; + int rv = CRYPTO_SUCCESS; + + ASSERT(block_size == GCM_BLOCK_LEN); + /* + * If the last call left an incomplete block, try to fill + * it first. + */ + if (ctx->gcm_remainder_len > 0) { + need = block_size - ctx->gcm_remainder_len; + if (length < need) { + /* Accumulate bytes here and return. */ + bcopy(datap, (uint8_t *)ctx->gcm_remainder + + ctx->gcm_remainder_len, length); + + ctx->gcm_remainder_len += length; + if (ctx->gcm_copy_to == NULL) { + ctx->gcm_copy_to = datap; + } + return (CRYPTO_SUCCESS); + } else { + /* Complete incomplete block. */ + bcopy(datap, (uint8_t *)ctx->gcm_remainder + + ctx->gcm_remainder_len, need); + + ctx->gcm_copy_to = NULL; + } + } + + /* Allocate a buffer to encrypt to if there is enough input. */ + if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { + ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag); + if (ct_buf == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } + + /* If we completed an incomplete block, encrypt and write it out. 
*/ + if (ctx->gcm_remainder_len > 0) { + kfpu_begin(); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, + (const uint32_t *)cb, (uint32_t *)tmp); + + gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); + GHASH_AVX(ctx, tmp, block_size); + clear_fpu_regs(); + kfpu_end(); + rv = crypto_put_output_data(tmp, out, block_size); + out->cd_offset += block_size; + gcm_incr_counter_block(ctx); + ctx->gcm_processed_data_len += block_size; + bleft -= need; + datap += need; + ctx->gcm_remainder_len = 0; + } + + /* Do the bulk encryption in chunk_size blocks. */ + for (; bleft >= chunk_size; bleft -= chunk_size) { + kfpu_begin(); + done = aesni_gcm_encrypt( + datap, ct_buf, chunk_size, key, cb, ghash); + + clear_fpu_regs(); + kfpu_end(); + if (done != chunk_size) { + rv = CRYPTO_FAILED; + goto out_nofpu; + } + rv = crypto_put_output_data(ct_buf, out, chunk_size); + if (rv != CRYPTO_SUCCESS) { + goto out_nofpu; + } + out->cd_offset += chunk_size; + datap += chunk_size; + ctx->gcm_processed_data_len += chunk_size; + } + /* Check if we are already done. */ + if (bleft == 0) { + goto out_nofpu; + } + /* Bulk encrypt the remaining data. */ + kfpu_begin(); + if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { + done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); + if (done == 0) { + rv = CRYPTO_FAILED; + goto out; + } + rv = crypto_put_output_data(ct_buf, out, done); + if (rv != CRYPTO_SUCCESS) { + goto out; + } + out->cd_offset += done; + ctx->gcm_processed_data_len += done; + datap += done; + bleft -= done; + + } + /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ + while (bleft > 0) { + if (bleft < block_size) { + bcopy(datap, ctx->gcm_remainder, bleft); + ctx->gcm_remainder_len = bleft; + ctx->gcm_copy_to = datap; + goto out; + } + /* Encrypt, hash and write out. 
*/ + aes_encrypt_intel(key->encr_ks.ks32, key->nr, + (const uint32_t *)cb, (uint32_t *)tmp); + + gcm_xor_avx(datap, tmp); + GHASH_AVX(ctx, tmp, block_size); + rv = crypto_put_output_data(tmp, out, block_size); + if (rv != CRYPTO_SUCCESS) { + goto out; + } + out->cd_offset += block_size; + gcm_incr_counter_block(ctx); + ctx->gcm_processed_data_len += block_size; + datap += block_size; + bleft -= block_size; + } +out: + clear_fpu_regs(); + kfpu_end(); +out_nofpu: + if (ct_buf != NULL) { + vmem_free(ct_buf, chunk_size); + } + return (rv); +} + +/* + * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual + * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. + */ +static int +gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) +{ + uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; + uint32_t *J0 = (uint32_t *)ctx->gcm_J0; + uint8_t *remainder = (uint8_t *)ctx->gcm_remainder; + size_t rem_len = ctx->gcm_remainder_len; + const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; + int aes_rounds = ((aes_key_t *)keysched)->nr; + int rv; + + ASSERT(block_size == GCM_BLOCK_LEN); + + if (out->cd_length < (rem_len + ctx->gcm_tag_len)) { + return (CRYPTO_DATA_LEN_RANGE); + } + + kfpu_begin(); + /* Pad last incomplete block with zeros, encrypt and hash. */ + if (rem_len > 0) { + uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; + const uint32_t *cb = (uint32_t *)ctx->gcm_cb; + + aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); + bzero(remainder + rem_len, block_size - rem_len); + for (int i = 0; i < rem_len; i++) { + remainder[i] ^= tmp[i]; + } + GHASH_AVX(ctx, remainder, block_size); + ctx->gcm_processed_data_len += rem_len; + /* No need to increment counter_block, it's the last block. */ + } + /* Finish tag. 
*/ + ctx->gcm_len_a_len_c[1] = + htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); + GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); + aes_encrypt_intel(keysched, aes_rounds, J0, J0); + + gcm_xor_avx((uint8_t *)J0, ghash); + clear_fpu_regs(); + kfpu_end(); + + /* Output remainder. */ + if (rem_len > 0) { + rv = crypto_put_output_data(remainder, out, rem_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + } + out->cd_offset += rem_len; + ctx->gcm_remainder_len = 0; + rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + + out->cd_offset += ctx->gcm_tag_len; + /* Clear sensitive data in the context before returning. */ + gcm_clear_ctx(ctx); + return (CRYPTO_SUCCESS); +} + +/* + * Finalize decryption: We just have accumulated crypto text, so now we + * decrypt it here inplace. + */ +static int +gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) +{ + ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); + ASSERT3U(block_size, ==, 16); + + size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; + uint8_t *datap = ctx->gcm_pt_buf; + const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); + uint32_t *cb = (uint32_t *)ctx->gcm_cb; + uint64_t *ghash = ctx->gcm_ghash; + uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; + int rv = CRYPTO_SUCCESS; + size_t bleft, done; + + /* + * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be + * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of + * GCM_AVX_MIN_DECRYPT_BYTES. + */ + for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { + kfpu_begin(); + done = aesni_gcm_decrypt(datap, datap, chunk_size, + (const void *)key, ctx->gcm_cb, ghash); + clear_fpu_regs(); + kfpu_end(); + if (done != chunk_size) { + return (CRYPTO_FAILED); + } + datap += done; + } + /* Decrypt remainder, which is less than chunk size, in one go. 
*/ + kfpu_begin(); + if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { + done = aesni_gcm_decrypt(datap, datap, bleft, + (const void *)key, ctx->gcm_cb, ghash); + if (done == 0) { + clear_fpu_regs(); + kfpu_end(); + return (CRYPTO_FAILED); + } + datap += done; + bleft -= done; + } + ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); + + /* + * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain, + * decrypt them block by block. + */ + while (bleft > 0) { + /* Incomplete last block. */ + if (bleft < block_size) { + uint8_t *lastb = (uint8_t *)ctx->gcm_remainder; + + bzero(lastb, block_size); + bcopy(datap, lastb, bleft); + /* The GCM processing. */ + GHASH_AVX(ctx, lastb, block_size); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); + for (size_t i = 0; i < bleft; i++) { + datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; + } + break; + } + /* The GCM processing. */ + GHASH_AVX(ctx, datap, block_size); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); + gcm_xor_avx((uint8_t *)tmp, datap); + gcm_incr_counter_block(ctx); + + datap += block_size; + bleft -= block_size; + } + if (rv != CRYPTO_SUCCESS) { + clear_fpu_regs(); + kfpu_end(); + return (rv); + } + /* Decryption done, finish the tag. */ + ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); + GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, + (uint32_t *)ctx->gcm_J0); + + gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); + + /* We are done with the FPU, restore its state. */ + clear_fpu_regs(); + kfpu_end(); + + /* Compare the input authentication tag with what we calculated. */ + if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { + /* They don't match. 
*/ + return (CRYPTO_INVALID_MAC); + } + rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); + if (rv != CRYPTO_SUCCESS) { + return (rv); + } + out->cd_offset += pt_len; + gcm_clear_ctx(ctx); + return (CRYPTO_SUCCESS); +} + +/* + * Initialize the GCM params H, Htabtle and the counter block. Save the + * initial counter block. + */ +static int +gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, + unsigned char *auth_data, size_t auth_data_len, size_t block_size) +{ + uint8_t *cb = (uint8_t *)ctx->gcm_cb; + uint64_t *H = ctx->gcm_H; + const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; + int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; + uint8_t *datap = auth_data; + size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + size_t bleft; + + ASSERT(block_size == GCM_BLOCK_LEN); + + /* Init H (encrypt zero block) and create the initial counter block. */ + bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash)); + bzero(H, sizeof (ctx->gcm_H)); + kfpu_begin(); + aes_encrypt_intel(keysched, aes_rounds, + (const uint32_t *)H, (uint32_t *)H); + + gcm_init_htab_avx(ctx->gcm_Htable, H); + + if (iv_len == 12) { + bcopy(iv, cb, 12); + cb[12] = 0; + cb[13] = 0; + cb[14] = 0; + cb[15] = 1; + /* We need the ICB later. */ + bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0)); + } else { + /* + * Most consumers use 12 byte IVs, so it's OK to use the + * original routines for other IV sizes, just avoid nesting + * kfpu_begin calls. + */ + clear_fpu_regs(); + kfpu_end(); + gcm_format_initial_blocks(iv, iv_len, ctx, block_size, + aes_copy_block, aes_xor_block); + kfpu_begin(); + } + + /* Openssl post increments the counter, adjust for that. */ + gcm_incr_counter_block(ctx); + + /* Ghash AAD in chunk_size blocks. 
*/ + for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { + GHASH_AVX(ctx, datap, chunk_size); + datap += chunk_size; + clear_fpu_regs(); + kfpu_end(); + kfpu_begin(); + } + /* Ghash the remainder and handle possible incomplete GCM block. */ + if (bleft > 0) { + size_t incomp = bleft % block_size; + + bleft -= incomp; + if (bleft > 0) { + GHASH_AVX(ctx, datap, bleft); + datap += bleft; + } + if (incomp > 0) { + /* Zero pad and hash incomplete last block. */ + uint8_t *authp = (uint8_t *)ctx->gcm_tmp; + + bzero(authp, block_size); + bcopy(datap, authp, incomp); + GHASH_AVX(ctx, authp, block_size); + } + } + clear_fpu_regs(); + kfpu_end(); + return (CRYPTO_SUCCESS); +} + +#if defined(_KERNEL) +static int +icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + char val_rounded[16]; + int error = 0; + + error = kstrtoul(buf, 0, &val); + if (error) + return (error); + + val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; + + if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) + return (-EINVAL); + + snprintf(val_rounded, 16, "%u", (uint32_t)val); + error = param_set_uint(val_rounded, kp); + return (error); +} + +module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, + param_get_uint, &gcm_avx_chunk_size, 0644); + +MODULE_PARM_DESC(icp_gcm_avx_chunk_size, + "How many bytes to process while owning the FPU"); + +#endif /* defined(__KERNEL) */ +#endif /* ifdef CAN_USE_GCM_ASM */ diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c index be00ba37b6..05920115ce 100644 --- a/module/icp/algs/modes/gcm_pclmulqdq.c +++ b/module/icp/algs/modes/gcm_pclmulqdq.c @@ -24,12 +24,12 @@ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) -#include +#include +#include /* These functions are used to execute pclmulqdq based assembly methods */ extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *); - #include /* @@ -52,7 +52,7 @@ 
gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) static boolean_t gcm_pclmulqdq_will_work(void) { - return (zfs_pclmulqdq_available()); + return (kfpu_allowed() && zfs_pclmulqdq_available()); } const gcm_impl_ops_t gcm_pclmulqdq_impl = { diff --git a/module/icp/algs/modes/modes.c b/module/icp/algs/modes/modes.c index 1d33c42688..59743c7d68 100644 --- a/module/icp/algs/modes/modes.c +++ b/module/icp/algs/modes/modes.c @@ -43,17 +43,14 @@ crypto_init_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset) break; case CRYPTO_DATA_UIO: { - uio_t *uiop = out->cd_uio; - uintptr_t vec_idx; + zfs_uio_t *uiop = out->cd_uio; + uint_t vec_idx; offset = out->cd_offset; - for (vec_idx = 0; vec_idx < uiop->uio_iovcnt && - offset >= uiop->uio_iov[vec_idx].iov_len; - offset -= uiop->uio_iov[vec_idx++].iov_len) - ; + offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx); *current_offset = offset; - *iov_or_mp = (void *)vec_idx; + *iov_or_mp = (void *)(uintptr_t)vec_idx; break; } } /* end switch */ @@ -88,34 +85,35 @@ crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset, } case CRYPTO_DATA_UIO: { - uio_t *uio = out->cd_uio; - iovec_t *iov; + zfs_uio_t *uio = out->cd_uio; offset_t offset; - uintptr_t vec_idx; + uint_t vec_idx; uint8_t *p; + uint64_t iov_len; + void *iov_base; offset = *current_offset; vec_idx = (uintptr_t)(*iov_or_mp); - iov = (iovec_t *)&uio->uio_iov[vec_idx]; - p = (uint8_t *)iov->iov_base + offset; + zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); + p = (uint8_t *)iov_base + offset; *out_data_1 = p; - if (offset + amt <= iov->iov_len) { + if (offset + amt <= iov_len) { /* can fit one block into this iov */ *out_data_1_len = amt; *out_data_2 = NULL; *current_offset = offset + amt; } else { /* one block spans two iovecs */ - *out_data_1_len = iov->iov_len - offset; - if (vec_idx == uio->uio_iovcnt) + *out_data_1_len = iov_len - offset; + if (vec_idx == zfs_uio_iovcnt(uio)) return; vec_idx++; - iov = 
(iovec_t *)&uio->uio_iov[vec_idx]; - *out_data_2 = (uint8_t *)iov->iov_base; + zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); + *out_data_2 = (uint8_t *)iov_base; *current_offset = amt - *out_data_1_len; } - *iov_or_mp = (void *)vec_idx; + *iov_or_mp = (void *)(uintptr_t)vec_idx; break; } } /* end switch */ @@ -154,6 +152,14 @@ crypto_free_mode_ctx(void *ctx) vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf, ((gcm_ctx_t *)ctx)->gcm_pt_buf_len); +#ifdef CAN_USE_GCM_ASM + if (((gcm_ctx_t *)ctx)->gcm_Htable != NULL) { + gcm_ctx_t *gcm_ctx = (gcm_ctx_t *)ctx; + bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len); + kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len); + } +#endif + kmem_free(ctx, sizeof (gcm_ctx_t)); } } diff --git a/module/icp/algs/sha1/sha1.c b/module/icp/algs/sha1/sha1.c index 7f28b3796b..da34222c8f 100644 --- a/module/icp/algs/sha1/sha1.c +++ b/module/icp/algs/sha1/sha1.c @@ -80,28 +80,6 @@ static uint8_t PADDING[64] = { 0x80, /* all zeros */ }; #define G(b, c, d) ((b) ^ (c) ^ (d)) #define H(b, c, d) (((b) & (c)) | (((b)|(c)) & (d))) -/* - * ROTATE_LEFT rotates x left n bits. - */ - -#if defined(__GNUC__) && defined(_LP64) -static __inline__ uint64_t -ROTATE_LEFT(uint64_t value, uint32_t n) -{ - uint32_t t32; - - t32 = (uint32_t)value; - return ((t32 << n) | (t32 >> (32 - n))); -} - -#else - -#define ROTATE_LEFT(x, n) \ - (((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n)))) - -#endif - - /* * SHA1Init() * @@ -248,16 +226,14 @@ typedef uint32_t sha1word; * careful programming can guarantee this for us. 
*/ -#if defined(_BIG_ENDIAN) +#if defined(_ZFS_BIG_ENDIAN) #define LOAD_BIG_32(addr) (*(uint32_t *)(addr)) #elif defined(HAVE_HTONL) #define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr))) #else -/* little endian -- will work on big endian, but slowly */ -#define LOAD_BIG_32(addr) \ - (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3]) +#define LOAD_BIG_32(addr) BE_32(*((uint32_t *)(addr))) #endif /* _BIG_ENDIAN */ /* @@ -269,6 +245,27 @@ typedef uint32_t sha1word; #define W(n) w_ ## n #endif /* !defined(W_ARRAY) */ +/* + * ROTATE_LEFT rotates x left n bits. + */ + +#if defined(__GNUC__) && defined(_LP64) +static __inline__ uint64_t +ROTATE_LEFT(uint64_t value, uint32_t n) +{ + uint32_t t32; + + t32 = (uint32_t)value; + return ((t32 << n) | (t32 >> (32 - n))); +} + +#else + +#define ROTATE_LEFT(x, n) \ + (((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n)))) + +#endif + #if defined(__sparc) diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c index 05a2e6ad14..75f6a3c1af 100644 --- a/module/icp/algs/sha2/sha2.c +++ b/module/icp/algs/sha2/sha2.c @@ -43,7 +43,7 @@ #define _RESTRICT_KYWD -#ifdef _LITTLE_ENDIAN +#ifdef _ZFS_LITTLE_ENDIAN #include #define HAVE_HTONL #endif @@ -123,7 +123,7 @@ static uint8_t PADDING[128] = { 0x80, /* all zeros */ }; * careful programming can guarantee this for us. */ -#if defined(_BIG_ENDIAN) +#if defined(_ZFS_BIG_ENDIAN) #define LOAD_BIG_32(addr) (*(uint32_t *)(addr)) #define LOAD_BIG_64(addr) (*(uint64_t *)(addr)) diff --git a/module/icp/algs/skein/skein.c b/module/icp/algs/skein/skein.c index 0187f7be6c..83fe842603 100644 --- a/module/icp/algs/skein/skein.c +++ b/module/icp/algs/skein/skein.c @@ -5,21 +5,11 @@ */ /* Copyright 2013 Doug Whiting. This code is released to the public domain. 
*/ -#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ - #include #include #include /* get the Skein API definitions */ #include "skein_impl.h" /* get internal definitions */ -/* External function to process blkCnt (nonzero) full block(s) of data. */ -void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, - size_t blkCnt, size_t byteCntAdd); -void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, - size_t blkCnt, size_t byteCntAdd); -void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, - size_t blkCnt, size_t byteCntAdd); - /* 256-bit Skein */ /* init the context for a straight hashing operation */ int diff --git a/module/icp/algs/skein/skein_block.c b/module/icp/algs/skein/skein_block.c index 6d85cb7d9e..7ba165a485 100644 --- a/module/icp/algs/skein/skein_block.c +++ b/module/icp/algs/skein/skein_block.c @@ -159,7 +159,7 @@ Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, ts[r + (R) + 2] = ts[r + (R) - 1]; \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); - /* loop thru it */ + /* loop through it */ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) #endif { @@ -385,7 +385,7 @@ Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, ts[r + (R)+2] = ts[r + (R) - 1]; \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); - /* loop thru it */ + /* loop through it */ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) #endif /* end of looped code definitions */ { @@ -667,7 +667,7 @@ Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, ts[r + (R) + 2] = ts[r + (R) - 1]; \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); - /* loop thru it */ + /* loop through it */ for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) #endif { diff --git a/module/icp/algs/skein/skein_impl.h b/module/icp/algs/skein/skein_impl.h index ea834e6199..2f6307fa7b 100644 --- a/module/icp/algs/skein/skein_impl.h +++ 
b/module/icp/algs/skein/skein_impl.h @@ -26,7 +26,6 @@ #include #include -#include #include "skein_impl.h" #include "skein_port.h" @@ -139,7 +138,6 @@ #define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \ do { \ (ctxPtr)->h.T[TWK_NUM] = (tVal); \ - _NOTE(CONSTCOND) \ } while (0) #define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0) @@ -152,7 +150,6 @@ do { \ Skein_Set_T0(ctxPtr, (T0)); \ Skein_Set_T1(ctxPtr, (T1)); \ - _NOTE(CONSTCOND) \ } while (0) #define Skein_Set_Type(ctxPtr, BLK_TYPE) \ @@ -166,24 +163,20 @@ Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | \ SKEIN_T1_BLK_TYPE_ ## BLK_TYPE); \ (ctxPtr)->h.bCnt = 0; \ - _NOTE(CONSTCOND) \ } while (0) #define Skein_Clear_First_Flag(hdr) \ do { \ (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \ - _NOTE(CONSTCOND) \ } while (0) #define Skein_Set_Bit_Pad_Flag(hdr) \ do { \ (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \ - _NOTE(CONSTCOND) \ } while (0) #define Skein_Set_Tree_Level(hdr, height) \ do { \ (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \ - _NOTE(CONSTCOND) \ } while (0) /* @@ -212,7 +205,6 @@ do { \ if (!(x)) \ return (retCode); \ - _NOTE(CONSTCOND) \ } while (0) /* internal error */ #define Skein_assert(x) ASSERT(x) @@ -281,4 +273,12 @@ extern const uint64_t SKEIN1024_IV_384[]; extern const uint64_t SKEIN1024_IV_512[]; extern const uint64_t SKEIN1024_IV_1024[]; +/* Functions to process blkCnt (nonzero) full block(s) of data. 
*/ +void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); + #endif /* _SKEIN_IMPL_H_ */ diff --git a/module/icp/algs/skein/skein_port.h b/module/icp/algs/skein/skein_port.h index 4fe268bb5a..ce43530825 100644 --- a/module/icp/algs/skein/skein_port.h +++ b/module/icp/algs/skein/skein_port.h @@ -44,19 +44,16 @@ #include /* get endianness selection */ -#define PLATFORM_MUST_ALIGN _ALIGNMENT_REQUIRED -#if defined(_BIG_ENDIAN) +#if defined(_ZFS_BIG_ENDIAN) /* here for big-endian CPUs */ #define SKEIN_NEED_SWAP (1) #else /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ #define SKEIN_NEED_SWAP (0) -#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ #define Skein_Put64_LSB_First(dst08, src64, bCnt) bcopy(src64, dst08, bCnt) #define Skein_Get64_LSB_First(dst64, src08, wCnt) \ bcopy(src08, dst64, 8 * (wCnt)) #endif -#endif #endif /* ifndef SKEIN_NEED_SWAP */ @@ -80,9 +77,8 @@ #endif /* ifndef Skein_Swap64 */ #ifndef Skein_Put64_LSB_First -void +static inline void Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt) -#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ { /* * this version is fully portable (big-endian or little-endian), @@ -93,15 +89,11 @@ Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt) for (n = 0; n < bCnt; n++) dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7))); } -#else -; /* output only the function prototype */ -#endif #endif /* ifndef Skein_Put64_LSB_First */ #ifndef Skein_Get64_LSB_First -void +static inline void Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt) -#ifdef SKEIN_PORT_CODE /* instantiate the function code here? 
*/ { /* * this version is fully portable (big-endian or little-endian), @@ -119,9 +111,6 @@ Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt) (((uint64_t)src[n + 6]) << 48) + (((uint64_t)src[n + 7]) << 56); } -#else -; /* output only the function prototype */ -#endif #endif /* ifndef Skein_Get64_LSB_First */ #endif /* _SKEIN_PORT_H_ */ diff --git a/module/icp/api/kcf_cipher.c b/module/icp/api/kcf_cipher.c index 1c9f6873e2..d6aa48147e 100644 --- a/module/icp/api/kcf_cipher.c +++ b/module/icp/api/kcf_cipher.c @@ -30,9 +30,6 @@ #include #include -#define CRYPTO_OPS_OFFSET(f) offsetof(crypto_ops_t, co_##f) -#define CRYPTO_CIPHER_OFFSET(f) offsetof(crypto_cipher_ops_t, f) - /* * Encryption and decryption routines. */ @@ -916,8 +913,6 @@ crypto_decrypt_single(crypto_context_t context, crypto_data_t *ciphertext, } #if defined(_KERNEL) -EXPORT_SYMBOL(crypto_cipher_init_prov); -EXPORT_SYMBOL(crypto_cipher_init); EXPORT_SYMBOL(crypto_encrypt_prov); EXPORT_SYMBOL(crypto_encrypt); EXPORT_SYMBOL(crypto_encrypt_init_prov); diff --git a/module/icp/api/kcf_ctxops.c b/module/icp/api/kcf_ctxops.c index b9b9cb74e0..21b0977d36 100644 --- a/module/icp/api/kcf_ctxops.c +++ b/module/icp/api/kcf_ctxops.c @@ -63,7 +63,7 @@ * * Returns: * CRYPTO_SUCCESS when the context template is successfully created. - * CRYPTO_HOST_MEMEORY: mem alloc failure + * CRYPTO_HOST_MEMORY: mem alloc failure * CRYPTO_ARGUMENTS_BAD: NULL storage for the ctx template. * RYPTO_MECHANISM_INVALID: invalid mechanism 'mech'. */ @@ -123,7 +123,7 @@ crypto_create_ctx_template(crypto_mechanism_t *mech, crypto_key_t *key, * crypto_create_ctx_template() * * Description: - * Frees the inbedded crypto_spi_ctx_template_t, then the + * Frees the embedded crypto_spi_ctx_template_t, then the * kcf_ctx_template_t. 
* * Context: diff --git a/module/icp/api/kcf_digest.c b/module/icp/api/kcf_digest.c index 87090fd527..aa68d69bc1 100644 --- a/module/icp/api/kcf_digest.c +++ b/module/icp/api/kcf_digest.c @@ -30,9 +30,6 @@ #include #include -#define CRYPTO_OPS_OFFSET(f) offsetof(crypto_ops_t, co_##f) -#define CRYPTO_DIGEST_OFFSET(f) offsetof(crypto_digest_ops_t, f) - /* * Message digest routines */ diff --git a/module/icp/api/kcf_mac.c b/module/icp/api/kcf_mac.c index 21ab94fa5b..a7722d8f91 100644 --- a/module/icp/api/kcf_mac.c +++ b/module/icp/api/kcf_mac.c @@ -30,9 +30,6 @@ #include #include -#define CRYPTO_OPS_OFFSET(f) offsetof(crypto_ops_t, co_##f) -#define CRYPTO_MAC_OFFSET(f) offsetof(crypto_mac_ops_t, f) - /* * Message authentication codes routines. */ diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl index a2c4adcbe6..92c9e196a3 100644 --- a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl @@ -101,7 +101,7 @@ * must display the following acknowledgement: * "This product includes cryptographic software written by * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library + * The word 'cryptographic' can be left out if the routines from the library * being used are not cryptographic related :-). * 4. If you include any Windows specific code (or a derivative thereof) from * the apps directory (application code) you must include an acknowledgement: diff --git a/module/icp/asm-x86_64/aes/aesopt.h b/module/icp/asm-x86_64/aes/aesopt.h index 6aa61db827..472111f96e 100644 --- a/module/icp/asm-x86_64/aes/aesopt.h +++ b/module/icp/asm-x86_64/aes/aesopt.h @@ -327,7 +327,7 @@ extern "C" { * On some systems speed will be improved by aligning the AES large lookup * tables on particular boundaries. This define should be set to a power of * two giving the desired alignment. 
It can be left undefined if alignment - * is not needed. This option is specific to the Micrsoft VC++ compiler - + * is not needed. This option is specific to the Microsoft VC++ compiler - * it seems to sometimes cause trouble for the VC++ version 6 compiler. */ diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams new file mode 100644 index 0000000000..0de1883dc8 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams @@ -0,0 +1,36 @@ +Copyright (c) 2006-2017, CRYPTOGAMS by +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain copyright notices, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the CRYPTOGAMS nor the names of its + copyright holder and contributors may be used to endorse or + promote products derived from this software without specific + prior written permission. + +ALTERNATIVELY, provided that this notice is retained in full, this +product may be distributed under the terms of the GNU General Public +License (GPL), in which case the provisions of the GPL apply INSTEAD OF +those given above. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip new file mode 100644 index 0000000000..6184759c8b --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip @@ -0,0 +1 @@ +PORTIONS OF GCM and GHASH FUNCTIONALITY diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl new file mode 100644 index 0000000000..49cc83d2ee --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl @@ -0,0 +1,177 @@ + + Apache License + Version 2.0, January 2004 + https://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip new file mode 100644 index 0000000000..6184759c8b --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip @@ -0,0 +1 @@ +PORTIONS OF GCM and GHASH FUNCTIONALITY diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S new file mode 100644 index 0000000000..dc71ae2c1c --- /dev/null +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -0,0 +1,1261 @@ +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. +# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. 
who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, 0.74 - on +# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled +# measurements for favourable packet size, one divisible by 96. +# Applications using the EVP interface will observe a few percent +# worse performance.] +# +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +# Generated once from +# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl +# and modified for ICP. Modifications are kept at a bare minimum to ease later +# upstream merges. + +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) + +.extern gcm_avx_can_use_movbe + +.text + +#ifdef HAVE_MOVBE +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: +.cfi_startproc + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor 
%xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 
+ vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor 
%xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + cmpl $14,%ebp // ICP does not zero key schedule. + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 
+ vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + .byte 0xf3,0xc3 +.cfi_endproc +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +#endif /* ifdef HAVE_MOVBE */ + +.type _aesni_ctr32_ghash_no_movbe_6x,@function +.align 32 +_aesni_ctr32_ghash_no_movbe_6x: +.cfi_startproc + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x_nmb + +.align 32 +.Loop6x_nmb: + addl $100663296,%ebx + jc .Lhandle_ctr32_nmb + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32_nmb: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor 
%xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movq 88(%r14),%r13 + bswapq %r13 + vaesenc %xmm15,%xmm11,%xmm11 + movq 80(%r14),%r12 + bswapq %r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movq 72(%r14),%r13 + bswapq %r13 + vpxor 
%xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movq 64(%r14),%r12 + bswapq %r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movq 56(%r14),%r13 + bswapq %r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movq 48(%r14),%r12 + bswapq %r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movq 40(%r14),%r13 + bswapq %r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movq 32(%r14),%r12 + bswapq %r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movq 24(%r14),%r13 + bswapq %r13 + vaesenc %xmm15,%xmm11,%xmm11 + movq 16(%r14),%r12 + bswapq %r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq 
$0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movq 8(%r14),%r13 + bswapq %r13 + vaesenc %xmm1,%xmm13,%xmm13 + movq 0(%r14),%r12 + bswapq %r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. + jb .Lenc_tail_nmb + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + cmpl $14,%ebp // ICP does not zero key schedule. 
+ jb .Lenc_tail_nmb + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail_nmb + +.align 32 +.Lhandle_ctr32_nmb: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32_nmb + +.align 32 +.Lenc_tail_nmb: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 
+ vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done_nmb + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x_nmb + +.L6x_done_nmb: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + .byte 0xf3,0xc3 +.cfi_endproc +.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +.align 32 +aesni_gcm_decrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + pushq %r9 +.cfi_offset %r9,-64 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r9),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + movq 32(%r9),%r9 + leaq 32(%r9),%r9 + movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. 
+ vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + // Load the first six input blocks and byte-swap them with the mask in + // %xmm0; five are parked on the stack, the sixth stays in %xmm7. + vmovdqu 80(%rdi),%xmm7 + leaq (%rdi),%r14 + vmovdqu 64(%rdi),%xmm4 + leaq -192(%rdi,%rdx,1),%r15 + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %r10,%r10 + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + // Pick the MOVBE or the MOVQ+BSWAP flavour of the 6x routine at run time. +#ifdef HAVE_MOVBE +#ifdef _KERNEL + testl $1,gcm_avx_can_use_movbe(%rip) +#else + // @GOTPCREL(%rip) addresses the GOT slot, which holds the address of + // gcm_avx_can_use_movbe rather than its value, so fetch the pointer and + // test the flag through it. %r12 is free here: it was saved in the + // prologue and _aesni_ctr32_ghash_no_movbe_6x zeroes it before first use + // (the MOVBE variant is expected to do the same -- confirm). + movq gcm_avx_can_use_movbe@GOTPCREL(%rip),%r12 + testl $1,(%r12) +#endif + jz 1f + call _aesni_ctr32_ghash_6x + jmp 2f +1: +#endif + call _aesni_ctr32_ghash_no_movbe_6x +2: + // Store the last six output blocks left in %xmm9-%xmm14 by the 6x routine. + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + // Byte-swap the hash in %xmm8 and store it through the %r9 value that + // was pushed in the prologue. 
+ vpshufb (%r11),%xmm8,%xmm8 + movq -56(%rax),%r9 +.cfi_restore %r9 + vmovdqu %xmm8,(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_dec_abort: + // Return the byte count accumulated in %r10 (0 on the abort path). + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
+ vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + .byte 0xf3,0xc3 +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor 
%xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +/* + * aesni_gcm_encrypt: bulk AES-GCM encryption, six blocks at a time. + * Register roles as used by the code below: + * %rdi input, %rsi output, %rdx length in bytes, %rcx AES key schedule + * (round count read from byte offset 504), %r8 counter block, + * %r9 hash state: (%r9) is the running hash and 32(%r9) a pointer that + * is presumably the table of hash key powers (confirm against the caller). + * Returns in %rax the byte count accumulated in %r10, or 0 without + * processing anything when fewer than 288 bytes are supplied. + */ +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $288,%rdx + jb .Lgcm_enc_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + pushq %r9 +.cfi_offset %r9,-64 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + leaq (%rsi),%r14 + leaq -192(%rsi,%rdx,1),%r15 + shrq $4,%rdx + + // Encrypt the first six blocks and keep their byte-swapped ciphertext + // (in %xmm7 and on the stack) for the hashing done by the 6x routine. 
+ call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + // Twelve blocks (2 x 96 = 192 bytes) are already written at this point. + vmovdqu (%r9),%xmm8 + movq 32(%r9),%r9 + leaq 32(%r9),%r9 + subq $12,%rdx + movq $192,%r10 + vpshufb %xmm0,%xmm8,%xmm8 + + // Pick the MOVBE or the MOVQ+BSWAP flavour of the 6x routine at run time. +#ifdef HAVE_MOVBE +#ifdef _KERNEL + testl $1,gcm_avx_can_use_movbe(%rip) +#else + // @GOTPCREL(%rip) addresses the GOT slot, which holds the address of + // gcm_avx_can_use_movbe rather than its value, so fetch the pointer and + // test the flag through it. %r12 is free here: it was saved in the + // prologue, _aesni_ctr32_6x only used it as a scratch pointer, and + // _aesni_ctr32_ghash_no_movbe_6x zeroes it before first use (the MOVBE + // variant is expected to do the same -- confirm). + movq gcm_avx_can_use_movbe@GOTPCREL(%rip),%r12 + testl $1,(%r12) +#endif + jz 1f + call _aesni_ctr32_ghash_6x + jmp 2f +1: +#endif + call _aesni_ctr32_ghash_no_movbe_6x +2: + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 
32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq 
$0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + vpshufb (%r11),%xmm8,%xmm8 + movq -56(%rax),%r9 +.cfi_restore %r9 + vmovdqu %xmm8,(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_enc_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +/* Some utility routines */ + +/* + * Clear the AVX vector registers. + * void clear_fpu_regs_avx(void); + * + * Executes a single vzeroall, which zeroes %ymm0 through %ymm15; no other + * register state is written by this routine. + */ +.globl clear_fpu_regs_avx +.type clear_fpu_regs_avx,@function +.align 32 +clear_fpu_regs_avx: + vzeroall + ret +.size clear_fpu_regs_avx,.-clear_fpu_regs_avx + +/* + * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); + * + * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and + * stores the result at `dst'. The XOR is performed using FPU registers, + * so make sure FPU state is saved when running this in the kernel. + * + * `src' arrives in %rdi and `dst' in %rsi. Clobbers %xmm0 and %xmm1. 
+ * Despite the name, the body uses the legacy SSE encodings (movdqu/pxor), + * not VEX-encoded instructions. + */ +.globl gcm_xor_avx +.type gcm_xor_avx,@function +.align 32 +gcm_xor_avx: + movdqu (%rdi), %xmm0 // %xmm0 = block at src + movdqu (%rsi), %xmm1 // %xmm1 = block at dst + pxor %xmm1, %xmm0 // %xmm0 = src ^ dst + movdqu %xmm0, (%rsi) // write the result back to dst + ret +.size gcm_xor_avx,.-gcm_xor_avx + +/* + * Toggle a boolean_t value atomically and return the new value. 
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); + */ +.globl atomic_toggle_boolean_nv +.type atomic_toggle_boolean_nv,@function +.align 32 +atomic_toggle_boolean_nv: + xorl %eax, %eax + lock + xorl $1, (%rdi) + jz 1f + movl $1, %eax +1: + ret +.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv + +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 + +/* Mark the stack non-executable. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S new file mode 100644 index 0000000000..90cc36b43a --- /dev/null +++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -0,0 +1,714 @@ +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. 
+# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. + +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 
2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) +# Goldmont 1.08(+24%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, in +# 0.29 on Broadwell, and in 0.36 on Skylake. +# +# Knights Landing achieves 1.09 cpb. +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + +# Generated once from +# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl +# and modified for ICP. Modifications are kept at a bare minimum to ease later +# upstream merges. 
+ +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) + +.text + +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,@function +.align 16 +gcm_gmult_clmul: +.cfi_startproc +.L_gmult_clmul: + movdqu (%rdi),%xmm0 + movdqa .Lbswap_mask(%rip),%xmm5 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 +.byte 102,15,56,0,197 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,197 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size gcm_gmult_clmul,.-gcm_gmult_clmul + +.globl gcm_init_htab_avx +.type gcm_init_htab_avx,@function +.align 32 +gcm_init_htab_avx: +.cfi_startproc + vzeroupper + + vmovdqu (%rsi),%xmm2 + // KCF/ICP stores H in network byte order with the hi qword first + // so we need to swap all bytes, not the 2 qwords. 
+ vmovdqu .Lbswap_mask(%rip),%xmm4 + vpshufb %xmm4,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 +.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq 
$1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size gcm_init_htab_avx,.-gcm_init_htab_avx + +.globl gcm_gmult_avx +.type gcm_gmult_avx,@function +.align 32 +gcm_gmult_avx: +.cfi_startproc + jmp .L_gmult_clmul +.cfi_endproc +.size gcm_gmult_avx,.-gcm_gmult_avx +.globl gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: +.cfi_startproc + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor 
%xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + 
vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr 
$8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 
+ vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + 
vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size gcm_ghash_avx,.-gcm_ghash_avx +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.L7_mask_poly: +.long 7,0,450,0 +.align 64 +.type .Lrem_4bit,@object +.Lrem_4bit: +.long 0,0,0,471859200,0,943718400,0,610271232 +.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 +.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 +.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 +.type .Lrem_8bit,@object +.Lrem_8bit: +.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E +.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E +.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E +.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E +.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E +.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E +.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E +.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E +.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE +.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE +.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE +.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE +.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E +.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E +.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE +.value 
0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE +.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E +.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E +.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E +.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E +.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E +.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E +.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E +.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E +.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE +.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE +.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE +.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE +.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E +.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E +.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE +.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 + +/* Mark the stack non-executable. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... 
*/ diff --git a/module/icp/asm-x86_64/sha1/sha1-x86_64.S b/module/icp/asm-x86_64/sha1/sha1-x86_64.S index cb923784a7..fc844cd8c7 100644 --- a/module/icp/asm-x86_64/sha1/sha1-x86_64.S +++ b/module/icp/asm-x86_64/sha1/sha1-x86_64.S @@ -69,16 +69,27 @@ sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks) #define _ASM #include ENTRY_NP(sha1_block_data_order) - push %rbx - push %rbp - push %r12 +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_offset %rbx,-16 + push %rbp +.cfi_offset %rbp,-24 + push %r12 +.cfi_offset %r12,-32 mov %rdi,%r8 # reassigned argument +.cfi_register %rdi, %r8 sub $72,%rsp mov %rsi,%r9 # reassigned argument +.cfi_register %rsi, %r9 and $-64,%rsp mov %rdx,%r10 # reassigned argument +.cfi_register %rdx, %r10 mov %rax,64(%rsp) +# echo ".cfi_cfa_expression %rsp+64,deref,+8" | +# openssl/crypto/perlasm/x86_64-xlate.pl +.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 mov 0(%r8),%edx mov 4(%r8),%esi @@ -1337,10 +1348,15 @@ ENTRY_NP(sha1_block_data_order) sub $1,%r10 jnz .Lloop mov 64(%rsp),%rsp - pop %r12 - pop %rbp - pop %rbx +.cfi_def_cfa %rsp,8 + movq -24(%rsp),%r12 +.cfi_restore %r12 + movq -16(%rsp),%rbp +.cfi_restore %rbp + movq -8(%rsp),%rbx +.cfi_restore %rbx ret +.cfi_endproc SET_SIZE(sha1_block_data_order) .data diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index 766b75355f..28b048d2db 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -83,12 +83,21 @@ SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) #include ENTRY_NP(SHA256TransformBlocks) +.cfi_startproc + movq %rsp, %rax +.cfi_def_cfa_register %rax push %rbx +.cfi_offset %rbx,-16 push %rbp +.cfi_offset %rbp,-24 push %r12 +.cfi_offset %r12,-32 push %r13 +.cfi_offset %r13,-40 push %r14 +.cfi_offset %r14,-48 push %r15 +.cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*4+4*8,%rsp @@ -99,6 +108,9 @@ 
ENTRY_NP(SHA256TransformBlocks) mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*4+3*8(%rsp) # save copy of %rsp +# echo ".cfi_cfa_expression %rsp+88,deref,+56" | +# openssl/crypto/perlasm/x86_64-xlate.pl +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts @@ -2026,14 +2038,28 @@ ENTRY_NP(SHA256TransformBlocks) jb .Lloop mov 16*4+3*8(%rsp),%rsp +.cfi_def_cfa %rsp,56 pop %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 pop %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 pop %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 pop %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 pop %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp pop %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx ret +.cfi_endproc SET_SIZE(SHA256TransformBlocks) .data diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index 6e37618761..746c85a985 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -84,12 +84,21 @@ SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) #include ENTRY_NP(SHA512TransformBlocks) +.cfi_startproc + movq %rsp, %rax +.cfi_def_cfa_register %rax push %rbx +.cfi_offset %rbx,-16 push %rbp +.cfi_offset %rbp,-24 push %r12 +.cfi_offset %r12,-32 push %r13 +.cfi_offset %r13,-40 push %r14 +.cfi_offset %r14,-48 push %r15 +.cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*8+4*8,%rsp @@ -100,6 +109,9 @@ ENTRY_NP(SHA512TransformBlocks) mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*8+3*8(%rsp) # save copy of %rsp +# echo ".cfi_cfa_expression %rsp+152,deref,+56" | +# openssl/crypto/perlasm/x86_64-xlate.pl +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts @@ 
-2027,14 +2039,28 @@ ENTRY_NP(SHA512TransformBlocks) jb .Lloop mov 16*8+3*8(%rsp),%rsp +.cfi_def_cfa %rsp,56 pop %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 pop %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 pop %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 pop %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 pop %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp pop %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx ret +.cfi_endproc SET_SIZE(SHA512TransformBlocks) .data diff --git a/module/icp/core/kcf_mech_tabs.c b/module/icp/core/kcf_mech_tabs.c index 741dae7a74..2642b317d6 100644 --- a/module/icp/core/kcf_mech_tabs.c +++ b/module/icp/core/kcf_mech_tabs.c @@ -103,7 +103,7 @@ kcf_mech_entry_tab_t kcf_mech_tabs_tab[KCF_LAST_OPSCLASS + 1] = { * Per-algorithm internal thresholds for the minimum input size of before * offloading to hardware provider. * Dispatching a crypto operation to a hardware provider entails paying the - * cost of an additional context switch. Measurments with Sun Accelerator 4000 + * cost of an additional context switch. Measurements with Sun Accelerator 4000 * shows that 512-byte jobs or smaller are better handled in software. * There is room for refinement here. 
* diff --git a/module/icp/core/kcf_prov_lib.c b/module/icp/core/kcf_prov_lib.c index 3cae872ddc..1b115d9762 100644 --- a/module/icp/core/kcf_prov_lib.c +++ b/module/icp/core/kcf_prov_lib.c @@ -40,7 +40,7 @@ int crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, void *digest_ctx, void (*update)(void)) { - uio_t *uiop = data->cd_uio; + zfs_uio_t *uiop = data->cd_uio; off_t offset = data->cd_offset; size_t length = len; uint_t vec_idx; @@ -48,7 +48,7 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, uchar_t *datap; ASSERT(data->cd_format == CRYPTO_DATA_UIO); - if (uiop->uio_segflg != UIO_SYSSPACE) { + if (zfs_uio_segflg(uiop) != UIO_SYSSPACE) { return (CRYPTO_ARGUMENTS_BAD); } @@ -56,12 +56,9 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, * Jump to the first iovec containing data to be * processed. */ - for (vec_idx = 0; vec_idx < uiop->uio_iovcnt && - offset >= uiop->uio_iov[vec_idx].iov_len; - offset -= uiop->uio_iov[vec_idx++].iov_len) - ; + offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx); - if (vec_idx == uiop->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The caller specified an offset that is larger than * the total size of the buffers it provided. 
@@ -69,12 +66,11 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, return (CRYPTO_DATA_LEN_RANGE); } - while (vec_idx < uiop->uio_iovcnt && length > 0) { - cur_len = MIN(uiop->uio_iov[vec_idx].iov_len - + while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) - offset, length); - datap = (uchar_t *)(uiop->uio_iov[vec_idx].iov_base + - offset); + datap = (uchar_t *)(zfs_uio_iovbase(uiop, vec_idx) + offset); switch (cmd) { case COPY_FROM_DATA: bcopy(datap, buf, cur_len); @@ -101,7 +97,7 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, offset = 0; } - if (vec_idx == uiop->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed. @@ -149,6 +145,7 @@ crypto_update_iov(void *ctx, crypto_data_t *input, crypto_data_t *output, common_ctx_t *common_ctx = ctx; int rv; + ASSERT(input != output); if (input->cd_miscdata != NULL) { copy_block((uint8_t *)input->cd_miscdata, &common_ctx->cc_iv[0]); @@ -158,7 +155,7 @@ crypto_update_iov(void *ctx, crypto_data_t *input, crypto_data_t *output, return (CRYPTO_ARGUMENTS_BAD); rv = (cipher)(ctx, input->cd_raw.iov_base + input->cd_offset, - input->cd_length, (input == output) ? 
NULL : output); + input->cd_length, output); return (rv); } @@ -169,18 +166,19 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, void (*copy_block)(uint8_t *, uint64_t *)) { common_ctx_t *common_ctx = ctx; - uio_t *uiop = input->cd_uio; + zfs_uio_t *uiop = input->cd_uio; off_t offset = input->cd_offset; size_t length = input->cd_length; uint_t vec_idx; size_t cur_len; + ASSERT(input != output); if (input->cd_miscdata != NULL) { copy_block((uint8_t *)input->cd_miscdata, &common_ctx->cc_iv[0]); } - if (input->cd_uio->uio_segflg != UIO_SYSSPACE) { + if (zfs_uio_segflg(input->cd_uio) != UIO_SYSSPACE) { return (CRYPTO_ARGUMENTS_BAD); } @@ -188,11 +186,8 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, * Jump to the first iovec containing data to be * processed. */ - for (vec_idx = 0; vec_idx < uiop->uio_iovcnt && - offset >= uiop->uio_iov[vec_idx].iov_len; - offset -= uiop->uio_iov[vec_idx++].iov_len) - ; - if (vec_idx == uiop->uio_iovcnt && length > 0) { + offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -203,19 +198,22 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, /* * Now process the iovecs. */ - while (vec_idx < uiop->uio_iovcnt && length > 0) { - cur_len = MIN(uiop->uio_iov[vec_idx].iov_len - + while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) - offset, length); - (cipher)(ctx, uiop->uio_iov[vec_idx].iov_base + offset, - cur_len, (input == output) ? 
NULL : output); + int rv = (cipher)(ctx, zfs_uio_iovbase(uiop, vec_idx) + offset, + cur_len, output); + if (rv != CRYPTO_SUCCESS) { + return (rv); + } length -= cur_len; vec_idx++; offset = 0; } - if (vec_idx == uiop->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. diff --git a/module/icp/core/kcf_prov_tabs.c b/module/icp/core/kcf_prov_tabs.c index 94e6937bcd..9d303d0225 100644 --- a/module/icp/core/kcf_prov_tabs.c +++ b/module/icp/core/kcf_prov_tabs.c @@ -377,7 +377,7 @@ kcf_provider_zero_refcnt(kcf_provider_desc_t *desc) mutex_exit(&desc->pd_lock); break; } - /* FALLTHRU */ + fallthrough; case CRYPTO_HW_PROVIDER: case CRYPTO_LOGICAL_PROVIDER: diff --git a/module/icp/core/kcf_sched.c b/module/icp/core/kcf_sched.c index da2346f7ec..81fd15f8ea 100644 --- a/module/icp/core/kcf_sched.c +++ b/module/icp/core/kcf_sched.c @@ -182,7 +182,7 @@ kcf_areqnode_alloc(kcf_provider_desc_t *pd, kcf_context_t *ictx, * reached, signal the creator thread for more threads. * * If the two conditions above are not met, we don't need to do - * any thing. The request will be picked up by one of the + * anything. The request will be picked up by one of the * worker threads when it becomes available. */ static int @@ -872,7 +872,7 @@ kcf_free_req(kcf_areq_node_t *areq) * Utility routine to remove a request from the chain of requests * hanging off a context. */ -void +static void kcf_removereq_in_ctxchain(kcf_context_t *ictx, kcf_areq_node_t *areq) { kcf_areq_node_t *cur, *prev; @@ -909,7 +909,7 @@ kcf_removereq_in_ctxchain(kcf_context_t *ictx, kcf_areq_node_t *areq) * * The caller must hold the queue lock and request lock (an_lock). */ -void +static void kcf_remove_node(kcf_areq_node_t *node) { kcf_areq_node_t *nextp = node->an_next; @@ -1182,7 +1182,7 @@ kcf_aop_done(kcf_areq_node_t *areq, int error) /* * Handle recoverable errors. 
This has to be done first - * before doing any thing else in this routine so that + * before doing anything else in this routine so that * we do not change the state of the request. */ if (error != CRYPTO_SUCCESS && IS_RECOVERABLE(error)) { @@ -1308,9 +1308,7 @@ kcf_reqid_insert(kcf_areq_node_t *areq) kcf_areq_node_t *headp; kcf_reqid_table_t *rt; - kpreempt_disable(); - rt = kcf_reqid_table[CPU_SEQID & REQID_TABLE_MASK]; - kpreempt_enable(); + rt = kcf_reqid_table[CPU_SEQID_UNSTABLE & REQID_TABLE_MASK]; mutex_enter(&rt->rt_lock); @@ -1432,7 +1430,7 @@ crypto_cancel_req(crypto_req_id_t id) /* * There is no interface to remove an entry * once it is on the taskq. So, we do not do - * any thing for a hardware provider. + * anything for a hardware provider. */ break; default: @@ -1535,7 +1533,7 @@ kcf_misc_kstat_update(kstat_t *ksp, int rw) } /* - * Allocate and initiatize a kcf_dual_req, used for saving the arguments of + * Allocate and initialize a kcf_dual_req, used for saving the arguments of * a dual operation or an atomic operation that has to be internally * simulated with multiple single steps. * crq determines the memory allocation flags. 
@@ -1551,7 +1549,7 @@ kcf_alloc_req(crypto_call_req_t *crq) if (kcr == NULL) return (NULL); - /* Copy the whole crypto_call_req struct, as it isn't persistant */ + /* Copy the whole crypto_call_req struct, as it isn't persistent */ if (crq != NULL) kcr->kr_callreq = *crq; else @@ -1579,7 +1577,7 @@ kcf_next_req(void *next_req_arg, int status) kcf_provider_desc_t *pd = NULL; crypto_dual_data_t *ct = NULL; - /* Stop the processing if an error occured at this step */ + /* Stop the processing if an error occurred at this step */ if (error != CRYPTO_SUCCESS) { out: areq->an_reqarg = next_req->kr_callreq; diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c index c2fcf1ff72..3c5ef43939 100644 --- a/module/icp/illumos-crypto.c +++ b/module/icp/illumos-crypto.c @@ -93,7 +93,7 @@ * will use the generic implementation. * * 7) Removing sha384 and sha512 code: The sha code was actually very - * wasy to port. However, the generic sha384 and sha512 code actually + * easy to port. However, the generic sha384 and sha512 code actually * exceeds the stack size on arm and powerpc architectures. In an effort * to remove warnings, this code was removed. * diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h index 95cfddf9e0..41dccaa384 100644 --- a/module/icp/include/aes/aes_impl.h +++ b/module/icp/include/aes/aes_impl.h @@ -107,6 +107,11 @@ typedef union { } aes_ks_t; typedef struct aes_impl_ops aes_impl_ops_t; + +/* + * The absolute offset of the encr_ks (0) and the nr (504) fields are hard + * coded in aesni-gcm-x86_64, so please don't change (or adjust accordingly). 
+ */ typedef struct aes_key aes_key_t; struct aes_key { aes_ks_t encr_ks; /* encryption key schedule */ @@ -162,7 +167,7 @@ typedef enum aes_mech_type { #endif /* _AES_IMPL */ /* - * Methods used to define aes implementation + * Methods used to define AES implementation * * @aes_gen_f Key generation * @aes_enc_f Function encrypts one block @@ -190,6 +195,16 @@ struct aes_impl_ops { extern const aes_impl_ops_t aes_generic_impl; #if defined(__x86_64) extern const aes_impl_ops_t aes_x86_64_impl; + +/* These functions are used to execute amd64 instructions for AMD or Intel: */ +extern int rijndael_key_setup_enc_amd64(uint32_t rk[], + const uint32_t cipherKey[], int keyBits); +extern int rijndael_key_setup_dec_amd64(uint32_t rk[], + const uint32_t cipherKey[], int keyBits); +extern void aes_encrypt_amd64(const uint32_t rk[], int Nr, + const uint32_t pt[4], uint32_t ct[4]); +extern void aes_decrypt_amd64(const uint32_t rk[], int Nr, + const uint32_t ct[4], uint32_t pt[4]); #endif #if defined(__x86_64) && defined(HAVE_AES) extern const aes_impl_ops_t aes_aesni_impl; @@ -201,9 +216,9 @@ extern const aes_impl_ops_t aes_aesni_impl; void aes_impl_init(void); /* - * Get selected aes implementation + * Returns optimal allowed AES implementation */ -struct aes_impl_ops *aes_impl_get_ops(void); +const struct aes_impl_ops *aes_impl_get_ops(void); #ifdef __cplusplus } diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h index cbb904c059..28c8f63a7d 100644 --- a/module/icp/include/modes/gcm_impl.h +++ b/module/icp/include/modes/gcm_impl.h @@ -37,12 +37,12 @@ extern "C" { #include /* - * Methods used to define gcm implementation + * Methods used to define GCM implementation * * @gcm_mul_f Perform carry-less multiplication * @gcm_will_work_f Function tests whether implementation will function */ -typedef void (*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *); +typedef void (*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *); typedef boolean_t 
(*gcm_will_work_f)(void); #define GCM_IMPL_NAME_MAX (16) @@ -64,9 +64,9 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl; void gcm_impl_init(void); /* - * Get selected aes implementation + * Returns optimal allowed GCM implementation */ -struct gcm_impl_ops *gcm_impl_get_ops(void); +const struct gcm_impl_ops *gcm_impl_get_ops(void); #ifdef __cplusplus } diff --git a/module/icp/include/modes/modes.h b/module/icp/include/modes/modes.h index 7c1f10b16e..ab71197542 100644 --- a/module/icp/include/modes/modes.h +++ b/module/icp/include/modes/modes.h @@ -34,6 +34,17 @@ extern "C" { #include #include +/* + * Does the build chain support all instructions needed for the GCM assembler + * routines. AVX support should imply AES-NI and PCLMULQDQ, but make sure + * anyhow. + */ +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) +#define CAN_USE_GCM_ASM +extern boolean_t gcm_avx_can_use_movbe; +#endif + #define ECB_MODE 0x00000002 #define CBC_MODE 0x00000004 #define CTR_MODE 0x00000008 @@ -189,13 +200,17 @@ typedef struct ccm_ctx { * * gcm_H: Subkey. * + * gcm_Htable: Pre-computed and pre-shifted H, H^2, ... H^6 for the + * Karatsuba Algorithm in host byte order. + * * gcm_J0: Pre-counter block generated from the IV. * * gcm_len_a_len_c: 64-bit representations of the bit lengths of * AAD and ciphertext. * - * gcm_kmflag: Current value of kmflag. Used only for allocating - * the plaintext buffer during decryption. + * gcm_kmflag: Current value of kmflag. Used for allocating + * the plaintext buffer during decryption and a + * gcm_avx_chunk_size'd buffer for avx enabled encryption. */ typedef struct gcm_ctx { struct common_ctx gcm_common; @@ -203,12 +218,23 @@ typedef struct gcm_ctx { size_t gcm_processed_data_len; size_t gcm_pt_buf_len; uint32_t gcm_tmp[4]; + /* + * The offset of gcm_Htable relative to gcm_ghash, (32), is hard coded + * in aesni-gcm-x86_64.S, so please don't change (or adjust there). 
+ */ uint64_t gcm_ghash[2]; uint64_t gcm_H[2]; +#ifdef CAN_USE_GCM_ASM + uint64_t *gcm_Htable; + size_t gcm_htab_len; +#endif uint64_t gcm_J0[2]; uint64_t gcm_len_a_len_c[2]; uint8_t *gcm_pt_buf; int gcm_kmflag; +#ifdef CAN_USE_GCM_ASM + boolean_t gcm_use_avx; +#endif } gcm_ctx_t; #define gcm_keysched gcm_common.cc_keysched diff --git a/module/icp/include/sys/bitmap.h b/module/icp/include/sys/bitmap.h index b1f6823e61..4e86ee70ed 100644 --- a/module/icp/include/sys/bitmap.h +++ b/module/icp/include/sys/bitmap.h @@ -157,9 +157,9 @@ extern int odd_parity(ulong_t); * to 0 otherwise. */ #define BT_ATOMIC_SET(bitmap, bitindex) \ - { atomic_or_long(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); } + { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); } #define BT_ATOMIC_CLEAR(bitmap, bitindex) \ - { atomic_and_long(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); } + { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); } #define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \ { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \ diff --git a/module/icp/include/sys/crypto/impl.h b/module/icp/include/sys/crypto/impl.h index 258cb5fedc..0f37f3f635 100644 --- a/module/icp/include/sys/crypto/impl.h +++ b/module/icp/include/sys/crypto/impl.h @@ -237,7 +237,7 @@ typedef struct kcf_provider_list { struct kcf_provider_desc *pl_provider; } kcf_provider_list_t; -/* atomic operations in linux implictly form a memory barrier */ +/* atomic operations in linux implicitly form a memory barrier */ #define membar_exit() /* diff --git a/module/icp/include/sys/crypto/ioctl.h b/module/icp/include/sys/crypto/ioctl.h index dd59ca7f2b..6e371e3439 100644 --- a/module/icp/include/sys/crypto/ioctl.h +++ b/module/icp/include/sys/crypto/ioctl.h @@ -241,9 +241,6 @@ typedef struct crypto_logout32 { #define CRYPTO_LOGIN CRYPTO(40) #define CRYPTO_LOGOUT CRYPTO(41) -/* flag for encrypt and decrypt operations */ -#define CRYPTO_INPLACE_OPERATION 0x00000001 - /* * 
Cryptographic Ioctls */ diff --git a/module/icp/include/sys/crypto/sched_impl.h b/module/icp/include/sys/crypto/sched_impl.h index 32ffa77495..85ea0ba1d0 100644 --- a/module/icp/include/sys/crypto/sched_impl.h +++ b/module/icp/include/sys/crypto/sched_impl.h @@ -381,7 +381,7 @@ typedef struct kcf_pool { /* * cv & lock for the condition where more threads need to be - * created. kp_user_lock also protects the three fileds above. + * created. kp_user_lock also protects the three fields above. */ kcondvar_t kp_user_cv; /* Creator cond. variable */ kmutex_t kp_user_lock; /* Creator lock */ @@ -448,13 +448,13 @@ typedef struct kcf_ntfy_elem { * The following values are based on the assumption that it would * take around eight cpus to load a hardware provider (This is true for * at least one product) and a kernel client may come from different - * low-priority interrupt levels. We will have CYRPTO_TASKQ_MIN number + * low-priority interrupt levels. We will have CRYPTO_TASKQ_MIN number * of cached taskq entries. The CRYPTO_TASKQ_MAX number is based on * a throughput of 1GB/s using 512-byte buffers. These are just * reasonable estimates and might need to change in future. */ #define CRYPTO_TASKQ_THREADS 8 -#define CYRPTO_TASKQ_MIN 64 +#define CRYPTO_TASKQ_MIN 64 #define CRYPTO_TASKQ_MAX 2 * 1024 * 1024 extern int crypto_taskq_threads; diff --git a/module/icp/include/sys/crypto/spi.h b/module/icp/include/sys/crypto/spi.h index 0aae9181ad..2c62b57066 100644 --- a/module/icp/include/sys/crypto/spi.h +++ b/module/icp/include/sys/crypto/spi.h @@ -699,7 +699,7 @@ typedef struct crypto_provider_info { /* * Provider status passed by a provider to crypto_provider_notification(9F) - * and returned by the provider_stauts(9E) entry point. + * and returned by the provider_status(9E) entry point. 
*/ #define CRYPTO_PROVIDER_READY 0 #define CRYPTO_PROVIDER_BUSY 1 diff --git a/module/icp/include/sys/ia32/stack.h b/module/icp/include/sys/ia32/stack.h index c4deb7bcaf..9e7c089e11 100644 --- a/module/icp/include/sys/ia32/stack.h +++ b/module/icp/include/sys/ia32/stack.h @@ -126,7 +126,7 @@ extern "C" { #if defined(_KERNEL) && !defined(_ASM) -#if defined(DEBUG) +#if defined(ZFS_DEBUG) #if STACK_ALIGN == 4 #define ASSERT_STACK_ALIGNED() \ { \ diff --git a/module/icp/include/sys/modctl.h b/module/icp/include/sys/modctl.h index a0b94ef39d..6c26ad618c 100644 --- a/module/icp/include/sys/modctl.h +++ b/module/icp/include/sys/modctl.h @@ -398,7 +398,7 @@ typedef struct modctl { char mod_delay_unload; /* deferred unload */ struct modctl_list *mod_requisites; /* mods this one depends on. */ - void *__unused; /* NOTE: reuse (same size) is OK, */ + void *____unused; /* NOTE: reuse (same size) is OK, */ /* deletion causes mdb.vs.core issues */ int mod_loadcnt; /* number of times mod was loaded */ int mod_nenabled; /* # of enabled DTrace probes in mod */ diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index 53b1936938..c47c7567b9 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -92,11 +92,6 @@ static crypto_mech_info_t aes_mech_info_tab[] = { AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES} }; -/* operations are in-place if the output buffer is NULL */ -#define AES_ARG_INPLACE(input, output) \ - if ((output) == NULL) \ - (output) = (input); - static void aes_provider_status(crypto_provider_handle_t, uint_t *); static crypto_control_ops_t aes_control_ops = { @@ -206,7 +201,7 @@ aes_mod_init(void) { int ret; - /* find fastest implementations and set any requested implementations */ + /* Determine the fastest available implementation. 
*/ aes_impl_init(); gcm_impl_init(); @@ -413,7 +408,7 @@ aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext, == 0) && (plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0) return (CRYPTO_DATA_LEN_RANGE); - AES_ARG_INPLACE(plaintext, ciphertext); + ASSERT(ciphertext != NULL); /* * We need to just return the length needed to store the output. @@ -530,7 +525,7 @@ aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext, return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); } - AES_ARG_INPLACE(ciphertext, plaintext); + ASSERT(plaintext != NULL); /* * Return length needed to store the output. @@ -635,7 +630,7 @@ aes_encrypt_update(crypto_ctx_t *ctx, crypto_data_t *plaintext, ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; - AES_ARG_INPLACE(plaintext, ciphertext); + ASSERT(ciphertext != NULL); /* compute number of bytes that will hold the ciphertext */ out_len = aes_ctx->ac_remainder_len; @@ -705,7 +700,7 @@ aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext, ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; - AES_ARG_INPLACE(ciphertext, plaintext); + ASSERT(plaintext != NULL); /* * Compute number of bytes that will hold the plaintext. 
@@ -947,7 +942,7 @@ aes_encrypt_atomic(crypto_provider_handle_t provider, size_t length_needed; int ret; - AES_ARG_INPLACE(plaintext, ciphertext); + ASSERT(ciphertext != NULL); /* * CTR, CCM, GCM, and GMAC modes do not require that plaintext @@ -981,7 +976,7 @@ aes_encrypt_atomic(crypto_provider_handle_t provider, case AES_GMAC_MECH_INFO_TYPE: if (plaintext->cd_length != 0) return (CRYPTO_ARGUMENTS_BAD); - /* FALLTHRU */ + fallthrough; case AES_GCM_MECH_INFO_TYPE: length_needed = plaintext->cd_length + aes_ctx.ac_tag_len; break; @@ -1056,6 +1051,16 @@ out: bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); } +#ifdef CAN_USE_GCM_ASM + if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE) && + ((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) { + + gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx; + + bzero(ctx->gcm_Htable, ctx->gcm_htab_len); + kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); + } +#endif return (ret); } @@ -1073,7 +1078,7 @@ aes_decrypt_atomic(crypto_provider_handle_t provider, size_t length_needed; int ret; - AES_ARG_INPLACE(ciphertext, plaintext); + ASSERT(plaintext != NULL); /* * CCM, GCM, CTR, and GMAC modes do not require that ciphertext @@ -1214,6 +1219,14 @@ out: vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf, ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len); } +#ifdef CAN_USE_GCM_ASM + if (((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) { + gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx; + + bzero(ctx->gcm_Htable, ctx->gcm_htab_len); + kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); + } +#endif } return (ret); diff --git a/module/icp/io/edonr_mod.c b/module/icp/io/edonr_mod.c index 544814a984..a806af6106 100644 --- a/module/icp/io/edonr_mod.c +++ b/module/icp/io/edonr_mod.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/module/icp/io/sha1_mod.c b/module/icp/io/sha1_mod.c index e7c38542a7..6dcee6b2ec 100644 --- a/module/icp/io/sha1_mod.c +++ b/module/icp/io/sha1_mod.c @@ -27,6 +27,7 @@ 
#include #include #include +#include #include #include @@ -270,19 +271,15 @@ sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data) size_t cur_len; /* we support only kernel buffer */ - if (data->cd_uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing data to be * digested. */ - while (vec_idx < data->cd_uio->uio_iovcnt && - offset >= data->cd_uio->uio_iov[vec_idx].iov_len) { - offset -= data->cd_uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == data->cd_uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -293,12 +290,12 @@ sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data) /* * Now do the digesting on the iovecs. */ - while (vec_idx < data->cd_uio->uio_iovcnt && length > 0) { - cur_len = MIN(data->cd_uio->uio_iov[vec_idx].iov_len - + while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) - offset, length); SHA1Update(sha1_ctx, - (uint8_t *)data->cd_uio->uio_iov[vec_idx].iov_base + offset, + (uint8_t *)zfs_uio_iovbase(data->cd_uio, vec_idx) + offset, cur_len); length -= cur_len; @@ -306,7 +303,7 @@ sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data) offset = 0; } - if (vec_idx == data->cd_uio->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. 
@@ -333,19 +330,15 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, uint_t vec_idx = 0; /* we support only kernel buffer */ - if (digest->cd_uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing ptr to the digest to * be returned. */ - while (vec_idx < digest->cd_uio->uio_iovcnt && - offset >= digest->cd_uio->uio_iov[vec_idx].iov_len) { - offset -= digest->cd_uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == digest->cd_uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -355,7 +348,7 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, } if (offset + digest_len <= - digest->cd_uio->uio_iov[vec_idx].iov_len) { + zfs_uio_iovlen(digest->cd_uio, vec_idx)) { /* * The computed SHA1 digest will fit in the current * iovec. @@ -367,12 +360,12 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, * the user only what was requested. 
*/ SHA1Final(digest_scratch, sha1_ctx); - bcopy(digest_scratch, (uchar_t *)digest-> - cd_uio->uio_iov[vec_idx].iov_base + offset, + bcopy(digest_scratch, (uchar_t *) + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, digest_len); } else { - SHA1Final((uchar_t *)digest-> - cd_uio->uio_iov[vec_idx].iov_base + offset, + SHA1Final((uchar_t *)zfs_uio_iovbase(digest-> + cd_uio, vec_idx) + offset, sha1_ctx); } } else { @@ -389,11 +382,11 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, SHA1Final(digest_tmp, sha1_ctx); - while (vec_idx < digest->cd_uio->uio_iovcnt && length > 0) { - cur_len = MIN(digest->cd_uio->uio_iov[vec_idx].iov_len - + while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) - offset, length); bcopy(digest_tmp + scratch_offset, - digest->cd_uio->uio_iov[vec_idx].iov_base + offset, + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, cur_len); length -= cur_len; @@ -402,7 +395,7 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, offset = 0; } - if (vec_idx == digest->cd_uio->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. 
@@ -1103,16 +1096,12 @@ sha1_mac_verify_atomic(crypto_provider_handle_t provider, size_t cur_len; /* we support only kernel buffer */ - if (mac->cd_uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* jump to the first iovec containing the expected digest */ - while (vec_idx < mac->cd_uio->uio_iovcnt && - offset >= mac->cd_uio->uio_iov[vec_idx].iov_len) { - offset -= mac->cd_uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == mac->cd_uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -1123,12 +1112,12 @@ sha1_mac_verify_atomic(crypto_provider_handle_t provider, } /* do the comparison of computed digest vs specified one */ - while (vec_idx < mac->cd_uio->uio_iovcnt && length > 0) { - cur_len = MIN(mac->cd_uio->uio_iov[vec_idx].iov_len - + while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) - offset, length); if (bcmp(digest + scratch_offset, - mac->cd_uio->uio_iov[vec_idx].iov_base + offset, + zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset, cur_len) != 0) { ret = CRYPTO_INVALID_MAC; break; diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c index 3254f55975..d690cd0bcb 100644 --- a/module/icp/io/sha2_mod.c +++ b/module/icp/io/sha2_mod.c @@ -296,19 +296,15 @@ sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data) size_t cur_len; /* we support only kernel buffer */ - if (data->cd_uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing data to be * digested. 
*/ - while (vec_idx < data->cd_uio->uio_iovcnt && - offset >= data->cd_uio->uio_iov[vec_idx].iov_len) { - offset -= data->cd_uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == data->cd_uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -319,18 +315,18 @@ sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data) /* * Now do the digesting on the iovecs. */ - while (vec_idx < data->cd_uio->uio_iovcnt && length > 0) { - cur_len = MIN(data->cd_uio->uio_iov[vec_idx].iov_len - + while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) - offset, length); - SHA2Update(sha2_ctx, (uint8_t *)data->cd_uio-> - uio_iov[vec_idx].iov_base + offset, cur_len); + SHA2Update(sha2_ctx, (uint8_t *)zfs_uio_iovbase(data->cd_uio, + vec_idx) + offset, cur_len); length -= cur_len; vec_idx++; offset = 0; } - if (vec_idx == data->cd_uio->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. @@ -357,19 +353,15 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, uint_t vec_idx = 0; /* we support only kernel buffer */ - if (digest->cd_uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing ptr to the digest to * be returned. 
*/ - while (vec_idx < digest->cd_uio->uio_iovcnt && - offset >= digest->cd_uio->uio_iov[vec_idx].iov_len) { - offset -= digest->cd_uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == digest->cd_uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -379,7 +371,7 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, } if (offset + digest_len <= - digest->cd_uio->uio_iov[vec_idx].iov_len) { + zfs_uio_iovlen(digest->cd_uio, vec_idx)) { /* * The computed SHA2 digest will fit in the current * iovec. @@ -395,12 +387,12 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, */ SHA2Final(digest_scratch, sha2_ctx); - bcopy(digest_scratch, (uchar_t *)digest-> - cd_uio->uio_iov[vec_idx].iov_base + offset, + bcopy(digest_scratch, (uchar_t *) + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, digest_len); } else { - SHA2Final((uchar_t *)digest-> - cd_uio->uio_iov[vec_idx].iov_base + offset, + SHA2Final((uchar_t *)zfs_uio_iovbase(digest-> + cd_uio, vec_idx) + offset, sha2_ctx); } @@ -418,12 +410,12 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, SHA2Final(digest_tmp, sha2_ctx); - while (vec_idx < digest->cd_uio->uio_iovcnt && length > 0) { + while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) { cur_len = - MIN(digest->cd_uio->uio_iov[vec_idx].iov_len - + MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) - offset, length); bcopy(digest_tmp + scratch_offset, - digest->cd_uio->uio_iov[vec_idx].iov_base + offset, + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, cur_len); length -= cur_len; @@ -432,7 +424,7 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, offset = 0; } - if (vec_idx == digest->cd_uio->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) { /* * The end of the specified 
iovec's was reached but * the length requested could not be processed, i.e. @@ -1259,16 +1251,12 @@ sha2_mac_verify_atomic(crypto_provider_handle_t provider, size_t cur_len; /* we support only kernel buffer */ - if (mac->cd_uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* jump to the first iovec containing the expected digest */ - while (vec_idx < mac->cd_uio->uio_iovcnt && - offset >= mac->cd_uio->uio_iov[vec_idx].iov_len) { - offset -= mac->cd_uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == mac->cd_uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -1279,12 +1267,12 @@ sha2_mac_verify_atomic(crypto_provider_handle_t provider, } /* do the comparison of computed digest vs specified one */ - while (vec_idx < mac->cd_uio->uio_iovcnt && length > 0) { - cur_len = MIN(mac->cd_uio->uio_iov[vec_idx].iov_len - + while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) - offset, length); if (bcmp(digest + scratch_offset, - mac->cd_uio->uio_iov[vec_idx].iov_base + offset, + zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset, cur_len) != 0) { ret = CRYPTO_INVALID_MAC; break; diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c index afd7f56806..ac7d201eb7 100644 --- a/module/icp/io/skein_mod.c +++ b/module/icp/io/skein_mod.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #define SKEIN_MODULE_IMPL @@ -178,7 +179,6 @@ typedef struct skein_ctx { (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\ break; \ } \ - _NOTE(CONSTCOND) \ } while (0) static int @@ -271,22 +271,18 @@ skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) size_t length = data->cd_length; uint_t vec_idx = 0; size_t cur_len; - const uio_t *uio = 
data->cd_uio; + zfs_uio_t *uio = data->cd_uio; /* we support only kernel buffer */ - if (uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing data to be * digested. */ - while (vec_idx < uio->uio_iovcnt && - offset >= uio->uio_iov[vec_idx].iov_len) { - offset -= uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -297,16 +293,16 @@ skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) /* * Now do the digesting on the iovecs. */ - while (vec_idx < uio->uio_iovcnt && length > 0) { - cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset, length); - SKEIN_OP(ctx, Update, (uint8_t *)uio->uio_iov[vec_idx].iov_base + while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, length); + SKEIN_OP(ctx, Update, (uint8_t *)zfs_uio_iovbase(uio, vec_idx) + offset, cur_len); length -= cur_len; vec_idx++; offset = 0; } - if (vec_idx == uio->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. @@ -325,23 +321,19 @@ static int skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) { - off_t offset = digest->cd_offset; - uint_t vec_idx = 0; - uio_t *uio = digest->cd_uio; + off_t offset = digest->cd_offset; + uint_t vec_idx = 0; + zfs_uio_t *uio = digest->cd_uio; /* we support only kernel buffer */ - if (uio->uio_segflg != UIO_SYSSPACE) + if (zfs_uio_segflg(uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing ptr to the digest to be returned. 
*/ - while (vec_idx < uio->uio_iovcnt && - offset >= uio->uio_iov[vec_idx].iov_len) { - offset -= uio->uio_iov[vec_idx].iov_len; - vec_idx++; - } - if (vec_idx == uio->uio_iovcnt) { + offset = zfs_uio_index_at_offset(uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -349,10 +341,10 @@ skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, return (CRYPTO_DATA_LEN_RANGE); } if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <= - uio->uio_iov[vec_idx].iov_len) { + zfs_uio_iovlen(uio, vec_idx)) { /* The computed digest will fit in the current iovec. */ SKEIN_OP(ctx, Final, - (uchar_t *)uio->uio_iov[vec_idx].iov_base + offset); + (uchar_t *)zfs_uio_iovbase(uio, vec_idx) + offset); } else { uint8_t *digest_tmp; off_t scratch_offset = 0; @@ -364,11 +356,11 @@ skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, if (digest_tmp == NULL) return (CRYPTO_HOST_MEMORY); SKEIN_OP(ctx, Final, digest_tmp); - while (vec_idx < uio->uio_iovcnt && length > 0) { - cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset, + while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, length); bcopy(digest_tmp + scratch_offset, - uio->uio_iov[vec_idx].iov_base + offset, cur_len); + zfs_uio_iovbase(uio, vec_idx) + offset, cur_len); length -= cur_len; vec_idx++; @@ -377,7 +369,7 @@ skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, } kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen)); - if (vec_idx == uio->uio_iovcnt && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. 
diff --git a/module/icp/os/modhash.c b/module/icp/os/modhash.c index 497e843966..a897871001 100644 --- a/module/icp/os/modhash.c +++ b/module/icp/os/modhash.c @@ -48,7 +48,7 @@ * The number returned need _not_ be between 0 and nchains. The mod_hash * code will take care of doing that. The second argument (after the * key) to the hashing function is a void * that represents - * hash_alg_data-- this is provided so that the hashing algrorithm can + * hash_alg_data-- this is provided so that the hashing algorithm can * maintain some state across calls, or keep algorithm-specific * constants associated with the hash table. * @@ -453,17 +453,19 @@ mod_hash_create_extended( int sleep) /* whether to sleep for mem */ { mod_hash_t *mod_hash; + size_t size; ASSERT(hname && keycmp && hash_alg && vdtor && kdtor); if ((mod_hash = kmem_zalloc(MH_SIZE(nchains), sleep)) == NULL) return (NULL); - mod_hash->mh_name = kmem_alloc(strlen(hname) + 1, sleep); + size = strlen(hname) + 1; + mod_hash->mh_name = kmem_alloc(size, sleep); if (mod_hash->mh_name == NULL) { kmem_free(mod_hash, MH_SIZE(nchains)); return (NULL); } - (void) strcpy(mod_hash->mh_name, hname); + (void) strlcpy(mod_hash->mh_name, hname, size); rw_init(&mod_hash->mh_contents, NULL, RW_DEFAULT, NULL); mod_hash->mh_sleep = sleep; diff --git a/module/icp/spi/kcf_spi.c b/module/icp/spi/kcf_spi.c index 0a6e38df86..34b36b81c0 100644 --- a/module/icp/spi/kcf_spi.c +++ b/module/icp/spi/kcf_spi.c @@ -40,7 +40,7 @@ * minalloc and maxalloc values to be used for taskq_create(). 
*/ int crypto_taskq_threads = CRYPTO_TASKQ_THREADS; -int crypto_taskq_minalloc = CYRPTO_TASKQ_MIN; +int crypto_taskq_minalloc = CRYPTO_TASKQ_MIN; int crypto_taskq_maxalloc = CRYPTO_TASKQ_MAX; static void remove_provider(kcf_provider_desc_t *); @@ -261,7 +261,7 @@ crypto_register_provider(crypto_provider_info_t *info, prov_desc->pd_kstat->ks_update = kcf_prov_kstat_update; kstat_install(prov_desc->pd_kstat); } - strfree(ks_name); + kmem_strfree(ks_name); } if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER) diff --git a/module/lua/Makefile.in b/module/lua/Makefile.in index d49065fbe8..0a74c17e64 100644 --- a/module/lua/Makefile.in +++ b/module/lua/Makefile.in @@ -1,16 +1,13 @@ -src = @abs_top_srcdir@/module/lua +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ +endif MODULE := zlua obj-$(CONFIG_ZFS) := $(MODULE).o -asflags-y += $(ZFS_MODULE_CFLAGS) -ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) -ccflags-y += -DLUA_USE_LONGLONG - -# Suppress unused but set variable warnings often due to ASSERTs -ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE) +ccflags-y := -DLUA_USE_LONGLONG $(MODULE)-objs += lapi.o $(MODULE)-objs += lauxlib.o diff --git a/module/lua/lapi.c b/module/lua/lapi.c index 81969673b9..6a845c4610 100644 --- a/module/lua/lapi.c +++ b/module/lua/lapi.c @@ -1295,10 +1295,13 @@ lua_fini(void) module_init(lua_init); module_exit(lua_fini); -MODULE_DESCRIPTION("Lua Interpreter for ZFS"); -MODULE_AUTHOR("Lua.org"); -MODULE_LICENSE("MIT"); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +#endif +/* END CSTYLED */ + +ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS"); +ZFS_MODULE_AUTHOR("Lua.org"); +ZFS_MODULE_LICENSE("Dual MIT/GPL"); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(lua_absindex); EXPORT_SYMBOL(lua_atpanic); @@ -1340,6 +1343,3 @@ EXPORT_SYMBOL(lua_tonumberx); EXPORT_SYMBOL(lua_touserdata); EXPORT_SYMBOL(lua_type); EXPORT_SYMBOL(lua_typename); - -#endif -/* END CSTYLED */ diff --git 
a/module/lua/lcode.c b/module/lua/lcode.c index ae9a3d91d8..4d88c792a2 100644 --- a/module/lua/lcode.c +++ b/module/lua/lcode.c @@ -8,6 +8,10 @@ #define lcode_c #define LUA_CORE +#if defined(HAVE_IMPLICIT_FALLTHROUGH) +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#endif + #include #include "lcode.h" diff --git a/module/lua/ldebug.c b/module/lua/ldebug.c index 15fe91b0b7..da005c4437 100644 --- a/module/lua/ldebug.c +++ b/module/lua/ldebug.c @@ -597,10 +597,12 @@ l_noret luaG_errormsg (lua_State *L) { l_noret luaG_runerror (lua_State *L, const char *fmt, ...) { + L->runerror++; va_list argp; va_start(argp, fmt); addinfo(L, luaO_pushvfstring(L, fmt, argp)); va_end(argp); luaG_errormsg(L); + L->runerror--; } /* END CSTYLED */ diff --git a/module/lua/ldo.c b/module/lua/ldo.c index aca02b2347..f3c3dcb4d8 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -29,6 +29,26 @@ +/* Return the number of bytes available on the stack. */ +#if defined (_KERNEL) && defined(__linux__) +#include +static intptr_t stack_remaining(void) { + intptr_t local; + local = (intptr_t)&local - (intptr_t)current->stack; + return local; +} +#elif defined (_KERNEL) && defined(__FreeBSD__) +#include +static intptr_t stack_remaining(void) { + intptr_t local; + local = (intptr_t)&local - (intptr_t)curthread->td_kstack; + return local; +} +#else +static intptr_t stack_remaining(void) { + return INTPTR_MAX; +} +#endif /* ** {====================================================== @@ -46,6 +66,7 @@ #ifdef _KERNEL +#ifdef __linux__ #if defined(__i386__) #define JMP_BUF_CNT 6 #elif defined(__x86_64__) @@ -61,7 +82,9 @@ #elif defined(__mips__) #define JMP_BUF_CNT 12 #elif defined(__s390x__) -#define JMP_BUF_CNT 9 +#define JMP_BUF_CNT 18 +#elif defined(__riscv) +#define JMP_BUF_CNT 64 #else #define JMP_BUF_CNT 1 #endif @@ -75,7 +98,7 @@ extern void longjmp(label_t *) __attribute__((__noreturn__)); #define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } #define luai_jmpbuf label_t -/* 
unsupported archs will build but not be able to run lua programs */ +/* unsupported arches will build but not be able to run lua programs */ #if JMP_BUF_CNT == 1 int setjmp (label_t *buf) { return 1; @@ -85,6 +108,11 @@ void longjmp (label_t * buf) { for (;;); } #endif +#else +#define LUAI_THROW(L,c) longjmp((c)->b, 1) +#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } +#define luai_jmpbuf jmp_buf +#endif #else /* _KERNEL */ @@ -436,8 +464,13 @@ void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) { if (L->nCcalls == LUAI_MAXCCALLS) luaG_runerror(L, "C stack overflow"); else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3))) - luaD_throw(L, LUA_ERRERR); /* error while handing stack error */ + luaD_throw(L, LUA_ERRERR); /* error while handling stack error */ } + intptr_t remaining = stack_remaining(); + if (L->runerror == 0 && remaining < LUAI_MINCSTACK) + luaG_runerror(L, "C stack overflow"); + if (L->runerror != 0 && remaining < LUAI_MINCSTACK / 2) + luaD_throw(L, LUA_ERRERR); /* error while handling stack error */ if (!allowyield) L->nny++; if (!luaD_precall(L, func, nResults)) /* is a Lua function? 
*/ luaV_execute(L); /* call it */ diff --git a/module/lua/lgc.c b/module/lua/lgc.c index 55feb24119..227ad723a0 100644 --- a/module/lua/lgc.c +++ b/module/lua/lgc.c @@ -676,7 +676,7 @@ static void freeobj (lua_State *L, GCObject *o) { case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break; case LUA_TSHRSTR: G(L)->strt.nuse--; - /* FALLTHROUGH */ + fallthrough; case LUA_TLNGSTR: { luaM_freemem(L, o, sizestring(gco2ts(o))); break; diff --git a/module/lua/llex.c b/module/lua/llex.c index 8760155d05..f2c9bf826c 100644 --- a/module/lua/llex.c +++ b/module/lua/llex.c @@ -431,9 +431,12 @@ static int llex (LexState *ls, SemInfo *seminfo) { if (sep >= 0) { read_long_string(ls, seminfo, sep); return TK_STRING; - } - else if (sep == -1) return '['; - else lexerror(ls, "invalid long string delimiter", TK_STRING); + } else if (sep == -1) { + return '['; + } else { + lexerror(ls, "invalid long string delimiter", TK_STRING); + break; + } } case '=': { next(ls); @@ -474,7 +477,7 @@ static int llex (LexState *ls, SemInfo *seminfo) { else if (!lisdigit(ls->current)) return '.'; /* else go through */ } - /* FALLTHROUGH */ + fallthrough; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { read_numeral(ls, seminfo); diff --git a/module/lua/llimits.h b/module/lua/llimits.h index eee8f0c2d5..177092fbc2 100644 --- a/module/lua/llimits.h +++ b/module/lua/llimits.h @@ -98,7 +98,7 @@ typedef LUAI_UACNUMBER l_uacNumber; /* ** non-return type ** -** Supress noreturn attribute in kernel builds to avoid objtool check warnings +** Suppress noreturn attribute in kernel builds to avoid objtool check warnings */ #if defined(__GNUC__) && !defined(_KERNEL) #define l_noret void __attribute__((noreturn)) @@ -122,6 +122,12 @@ typedef LUAI_UACNUMBER l_uacNumber; #define LUAI_MAXCCALLS 20 #endif +/* + * Minimum amount of available stack space (in bytes) to make a C call. 
With + * gsub() recursion, the stack space between each luaD_call() is 1256 bytes. + */ +#define LUAI_MINCSTACK 4096 + /* ** maximum number of upvalues in a closure (both C and Lua). (Value ** must fit in an unsigned char.) @@ -281,8 +287,6 @@ union luai_Cast { double l_d; LUA_INT32 l_p[2]; }; #if defined(ltable_c) && !defined(luai_hashnum) -extern int lcompat_hashnum(int64_t); - #define luai_hashnum(i,n) (i = lcompat_hashnum(n)) #endif diff --git a/module/lua/lstate.c b/module/lua/lstate.c index 1b1d948fac..4d196eced6 100644 --- a/module/lua/lstate.c +++ b/module/lua/lstate.c @@ -214,6 +214,7 @@ static void preinit_state (lua_State *L, global_State *g) { L->nny = 1; L->status = LUA_OK; L->errfunc = 0; + L->runerror = 0; } diff --git a/module/lua/lstate.h b/module/lua/lstate.h index 22e575e9a2..b636396a60 100644 --- a/module/lua/lstate.h +++ b/module/lua/lstate.h @@ -166,6 +166,7 @@ struct lua_State { unsigned short nCcalls; /* number of nested C calls */ lu_byte hookmask; lu_byte allowhook; + lu_byte runerror; /* handling a runtime error */ int basehookcount; int hookcount; lua_Hook hook; diff --git a/module/lua/lstrlib.c b/module/lua/lstrlib.c index 49ba70fafd..46e3d8fb35 100644 --- a/module/lua/lstrlib.c +++ b/module/lua/lstrlib.c @@ -501,7 +501,7 @@ static const char *match (MatchState *ms, const char *s, const char *p) { } case '+': /* 1 or more repetitions */ s++; /* 1 match already done */ - /* FALLTHROUGH */ + fallthrough; case '*': /* 0 or more repetitions */ s = max_expand(ms, s, p, ep); break; @@ -853,9 +853,9 @@ static void addquoted (lua_State *L, luaL_Buffer *b, int arg) { else if (*s == '\0' || iscntrl(uchar(*s))) { char buff[10]; if (!isdigit(uchar(*(s+1)))) - sprintf(buff, "\\%d", (int)uchar(*s)); + snprintf(buff, sizeof(buff), "\\%d", (int)uchar(*s)); else - sprintf(buff, "\\%03d", (int)uchar(*s)); + snprintf(buff, sizeof(buff), "\\%03d", (int)uchar(*s)); luaL_addstring(b, buff); } else @@ -890,11 +890,11 @@ static const char *scanformat 
(lua_State *L, const char *strfrmt, char *form) { /* ** add length modifier into formats */ -static void addlenmod (char *form, const char *lenmod) { +static void addlenmod (char *form, const char *lenmod, size_t size) { size_t l = strlen(form); size_t lm = strlen(lenmod); char spec = form[l - 1]; - strcpy(form + l - 1, lenmod); + strlcpy(form + l - 1, lenmod, size - (l - 1)); form[l + lm - 1] = spec; form[l + lm] = '\0'; } @@ -931,7 +931,7 @@ static int str_format (lua_State *L) { lua_Number diff = n - (lua_Number)ni; luaL_argcheck(L, -1 < diff && diff < 1, arg, "not a number in proper range"); - addlenmod(form, LUA_INTFRMLEN); + addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT); nb = str_sprintf(buff, form, ni); break; } @@ -941,7 +941,7 @@ static int str_format (lua_State *L) { lua_Number diff = n - (lua_Number)ni; luaL_argcheck(L, -1 < diff && diff < 1, arg, "not a non-negative number in proper range"); - addlenmod(form, LUA_INTFRMLEN); + addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT); nb = str_sprintf(buff, form, ni); break; } @@ -951,7 +951,7 @@ static int str_format (lua_State *L) { case 'a': case 'A': #endif case 'g': case 'G': { - addlenmod(form, LUA_FLTFRMLEN); + addlenmod(form, LUA_FLTFRMLEN, MAX_FORMAT); nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg)); break; } diff --git a/module/lua/ltable.c b/module/lua/ltable.c index f60418721b..f6872babc6 100644 --- a/module/lua/ltable.c +++ b/module/lua/ltable.c @@ -492,7 +492,7 @@ const TValue *luaH_get (Table *t, const TValue *key) { return luaH_getint(t, k); /* use specialized version */ /* else go through */ } - /* FALLTHROUGH */ + fallthrough; default: { Node *n = mainposition(t, key); do { /* check whether `key' is somewhere in the chain */ diff --git a/module/lua/ltablib.c b/module/lua/ltablib.c index be5b6375e4..51cafffaaf 100644 --- a/module/lua/ltablib.c +++ b/module/lua/ltablib.c @@ -244,7 +244,7 @@ static void auxsort (lua_State *L, int l, int u) { } /* repeat the routine for the larger one 
*/ } -static int sort (lua_State *L) { +static int tsort (lua_State *L) { int n = aux_getn(L, 1); luaL_checkstack(L, 40, ""); /* assume array is smaller than 2^40 */ if (!lua_isnoneornil(L, 2)) /* is there a 2nd argument? */ @@ -266,7 +266,7 @@ static const luaL_Reg tab_funcs[] = { {"pack", pack}, {"unpack", unpack}, {"remove", tremove}, - {"sort", sort}, + {"sort", tsort}, {NULL, NULL} }; diff --git a/module/lua/lvm.c b/module/lua/lvm.c index bde1d30bc6..4685be52b4 100644 --- a/module/lua/lvm.c +++ b/module/lua/lvm.c @@ -929,32 +929,4 @@ void luaV_execute (lua_State *L) { } } -/* - * this can live in SPL - */ -#if BITS_PER_LONG == 32 -#if defined(_KERNEL) && !defined(SPL_HAS_MODDI3) -extern uint64_t __umoddi3(uint64_t dividend, uint64_t divisor); - -/* 64-bit signed modulo for 32-bit machines. */ -int64_t -__moddi3(int64_t n, int64_t d) -{ - int64_t q; - boolean_t nn = B_FALSE; - - if (n < 0) { - nn = B_TRUE; - n = -n; - } - if (d < 0) - d = -d; - - q = __umoddi3(n, d); - - return (nn ? 
-q : q); -} -EXPORT_SYMBOL(__moddi3); -#endif -#endif /* END CSTYLED */ diff --git a/module/lua/setjmp/setjmp.S b/module/lua/setjmp/setjmp.S index 8d06d3f8ca..1f461a0a4e 100644 --- a/module/lua/setjmp/setjmp.S +++ b/module/lua/setjmp/setjmp.S @@ -14,4 +14,6 @@ #include "setjmp_mips.S" #elif defined(__s390x__) #include "setjmp_s390x.S" +#elif defined(__riscv) +#include "setjmp_rv64g.S" #endif diff --git a/module/lua/setjmp/setjmp_arm.S b/module/lua/setjmp/setjmp_arm.S index 8c08f4e6f2..78bc3e0b34 100644 --- a/module/lua/setjmp/setjmp_arm.S +++ b/module/lua/setjmp/setjmp_arm.S @@ -31,12 +31,19 @@ #if defined(__arm__) && !defined(__aarch64__) +#if defined(__thumb2__) +#define _FUNC_MODE .code 16; .thumb_func +#else +#define _FUNC_MODE .code 32 +#endif + #define ENTRY(x) \ .text; \ + .syntax unified; \ .align 2; \ .global x; \ .type x,#function; \ - .code 32; \ + _FUNC_MODE; \ x: #define END(x) \ @@ -49,13 +56,23 @@ x: * setjump + longjmp */ ENTRY(setjmp) +#if defined(__thumb2__) + mov ip, sp + stmia r0, {r4-r12,r14} +#else stmia r0, {r4-r14} +#endif mov r0, #0x00000000 RET END(setjmp) ENTRY(longjmp) +#if defined(__thumb2__) + ldmia r0, {r4-r12,r14} + mov sp, ip +#else ldmia r0, {r4-r14} +#endif mov r0, #0x00000001 RET END(longjmp) diff --git a/module/lua/setjmp/setjmp_ppc.S b/module/lua/setjmp/setjmp_ppc.S index f787ef3491..72aa5d5ab5 100644 --- a/module/lua/setjmp/setjmp_ppc.S +++ b/module/lua/setjmp/setjmp_ppc.S @@ -56,7 +56,7 @@ #define ENTRY(name) \ .align 2 ; \ .type name,@function; \ - .globl name; \ + .weak name; \ name: #else /* PPC64_ELF_ABI_v1 */ @@ -65,8 +65,8 @@ name: #define GLUE(a,b) XGLUE(a,b) #define ENTRY(name) \ .align 2 ; \ - .globl name; \ - .globl GLUE(.,name); \ + .weak name; \ + .weak GLUE(.,name); \ .pushsection ".opd","aw"; \ name: \ .quad GLUE(.,name); \ @@ -83,8 +83,8 @@ GLUE(.,name): #define ENTRY(name) \ .text; \ .p2align 4; \ - .globl name; \ - .type name,@function; \ + .weak name; \ + .type name,@function; \ name: #endif /* __powerpc64__ 
*/ diff --git a/module/lua/setjmp/setjmp_rv64g.S b/module/lua/setjmp/setjmp_rv64g.S new file mode 100644 index 0000000000..7f6c50d25a --- /dev/null +++ b/module/lua/setjmp/setjmp_rv64g.S @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2015-2016 Ruslan Bukin + * All rights reserved. + * + * Portions of this software were developed by SRI International and the + * University of Cambridge Computer Laboratory under DARPA/AFRL contract + * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Portions of this software were developed by the University of Cambridge + * Computer Laboratory as part of the CTSRD Project, with support from the + * UK Higher Education Innovation Fund (HEIF). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#define ENTRY(sym) \ + .text; .globl sym; .type sym,@function; sym: +#define END(sym) .size sym, . - sym + + +ENTRY(setjmp) + /* Store the stack pointer */ + sd sp, (0 * 8)(a0) + addi a0, a0, (1 * 8) + + /* Store the general purpose registers and ra */ + sd s0, (0 * 8)(a0) + sd s1, (1 * 8)(a0) + sd s2, (2 * 8)(a0) + sd s3, (3 * 8)(a0) + sd s4, (4 * 8)(a0) + sd s5, (5 * 8)(a0) + sd s6, (6 * 8)(a0) + sd s7, (7 * 8)(a0) + sd s8, (8 * 8)(a0) + sd s9, (9 * 8)(a0) + sd s10, (10 * 8)(a0) + sd s11, (11 * 8)(a0) + sd ra, (12 * 8)(a0) + addi a0, a0, (13 * 8) + + /* Return value */ + li a0, 0 + ret +END(setjmp) + +ENTRY(longjmp) + /* Restore the stack pointer */ + ld t0, 0(a0) + mv sp, t0 + addi a0, a0, (1 * 8) + + /* Restore the general purpose registers and ra */ + ld s0, (0 * 8)(a0) + ld s1, (1 * 8)(a0) + ld s2, (2 * 8)(a0) + ld s3, (3 * 8)(a0) + ld s4, (4 * 8)(a0) + ld s5, (5 * 8)(a0) + ld s6, (6 * 8)(a0) + ld s7, (7 * 8)(a0) + ld s8, (8 * 8)(a0) + ld s9, (9 * 8)(a0) + ld s10, (10 * 8)(a0) + ld s11, (11 * 8)(a0) + ld ra, (12 * 8)(a0) + addi a0, a0, (13 * 8) + + /* Load the return value */ + mv a0, a1 + ret +END(longjmp) diff --git a/module/nvpair/Makefile.in b/module/nvpair/Makefile.in index f420ef98bc..d814523667 100644 --- a/module/nvpair/Makefile.in +++ b/module/nvpair/Makefile.in @@ -1,12 +1,12 @@ -src = @abs_top_srcdir@/module/nvpair +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ +endif MODULE := znvpair 
obj-$(CONFIG_ZFS) := $(MODULE).o -ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) - $(MODULE)-objs += nvpair.o $(MODULE)-objs += fnvpair.o $(MODULE)-objs += nvpair_alloc_spl.o diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index 5f6423ccce..9834dedd85 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -25,13 +25,24 @@ * Copyright 2018 RackTop Systems. */ +/* + * Links to Illumos.org for more information on Interface Libraries: + * [1] https://illumos.org/man/3lib/libnvpair + * [2] https://illumos.org/man/3nvpair/nvlist_alloc + * [3] https://illumos.org/man/9f/nvlist_alloc + * [4] https://illumos.org/man/9f/nvlist_next_nvpair + * [5] https://illumos.org/man/9f/nvpair_value_byte + */ + #include #include #include #include #include +#include #include #include +#include #if defined(_KERNEL) #include @@ -522,12 +533,14 @@ nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) uint64_t index = hash & (priv->nvp_nbuckets - 1); ASSERT3U(index, <, priv->nvp_nbuckets); + // cppcheck-suppress nullPointerRedundantCheck i_nvp_t *bucket = tab[index]; /* insert link at the beginning of the bucket */ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); new_entry->nvi_hashtable_next = bucket; + // cppcheck-suppress nullPointerRedundantCheck tab[index] = new_entry; priv->nvp_nentries++; @@ -557,10 +570,10 @@ nvlist_nv_alloc(int kmflag) switch (kmflag) { case KM_SLEEP: return (nv_alloc_sleep); - case KM_PUSHPAGE: - return (nv_alloc_pushpage); - default: + case KM_NOSLEEP: return (nv_alloc_nosleep); + default: + return (nv_alloc_pushpage); } #else return (nv_alloc_nosleep); @@ -1872,7 +1885,7 @@ nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate * multiple levels of embedded nvlists, with 'sep' as the separator. As an * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or - * "a.d[3].e[1]". 
This matches the C syntax for array embed (for convience, + * "a.d[3].e[1]". This matches the C syntax for array embed (for convenience, * code also supports "a.d[3]e[1]" syntax). * * If 'ip' is non-NULL and the last name component is an array, return the @@ -2553,12 +2566,14 @@ nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, int err = 0; nvstream_t nvs; int nvl_endian; -#ifdef _LITTLE_ENDIAN +#if defined(_ZFS_LITTLE_ENDIAN) int host_endian = 1; -#else +#elif defined(_ZFS_BIG_ENDIAN) int host_endian = 0; -#endif /* _LITTLE_ENDIAN */ - nvs_header_t *nvh = (void *)buf; +#else +#error "No endian defined!" +#endif /* _ZFS_LITTLE_ENDIAN */ + nvs_header_t *nvh; if (buflen == NULL || nvl == NULL || (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) @@ -2577,6 +2592,7 @@ nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); + nvh = (void *)buf; nvh->nvh_encoding = encoding; nvh->nvh_endian = nvl_endian = host_endian; nvh->nvh_reserved1 = 0; @@ -2588,6 +2604,7 @@ nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, return (EINVAL); /* get method of encoding from first byte */ + nvh = (void *)buf; encoding = nvh->nvh_encoding; nvl_endian = nvh->nvh_endian; break; @@ -3105,7 +3122,7 @@ nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) * * An xdr packed nvlist is encoded as: * - * - encoding methode and host endian (4 bytes) + * - encoding method and host endian (4 bytes) * - nvl_version (4 bytes) * - nvl_nvflag (4 bytes) * @@ -3198,6 +3215,56 @@ nvs_xdr_nvl_fini(nvstream_t *nvs) return (0); } +/* + * xdrproc_t-compatible callbacks for xdr_array() + */ + +#if defined(_KERNEL) && defined(__linux__) /* Linux kernel */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, void *ptr) \ +{ \ + return (xdr_##type(xdrs, ptr)); \ +} + +#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */ + 
+#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, ...) \ +{ \ + va_list args; \ + void *ptr; \ + \ + va_start(args, xdrs); \ + ptr = va_arg(args, void *); \ + va_end(args); \ + \ + return (xdr_##type(xdrs, ptr)); \ +} + +#else /* FreeBSD, sunrpc */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...) \ +{ \ + return (xdr_##type(xdrs, ptr)); \ +} + +#endif + +/* BEGIN CSTYLED */ +NVS_BUILD_XDRPROC_T(char); +NVS_BUILD_XDRPROC_T(short); +NVS_BUILD_XDRPROC_T(u_short); +NVS_BUILD_XDRPROC_T(int); +NVS_BUILD_XDRPROC_T(u_int); +NVS_BUILD_XDRPROC_T(longlong_t); +NVS_BUILD_XDRPROC_T(u_longlong_t); +/* END CSTYLED */ + /* * The format of xdr encoded nvpair is: * encode_size, decode_size, name string, data type, nelem, data @@ -3205,6 +3272,8 @@ nvs_xdr_nvl_fini(nvstream_t *nvs) static int nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { + ASSERT(nvs != NULL && nvp != NULL); + data_type_t type; char *buf; char *buf_end = (char *)nvp + nvp->nvp_size; @@ -3213,7 +3282,7 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) bool_t ret = FALSE; XDR *xdr = nvs->nvs_private; - ASSERT(xdr != NULL && nvp != NULL); + ASSERT(xdr != NULL); /* name string */ if ((buf = NVP_NAME(nvp)) >= buf_end) @@ -3320,38 +3389,38 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), - (xdrproc_t)xdr_char); + nvs_xdr_nvp_char); break; case DATA_TYPE_INT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), - sizeof (int16_t), (xdrproc_t)xdr_short); + sizeof (int16_t), nvs_xdr_nvp_short); break; case DATA_TYPE_UINT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), - sizeof (uint16_t), (xdrproc_t)xdr_u_short); + sizeof (uint16_t), nvs_xdr_nvp_u_short); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), - 
sizeof (int32_t), (xdrproc_t)xdr_int); + sizeof (int32_t), nvs_xdr_nvp_int); break; case DATA_TYPE_UINT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), - sizeof (uint32_t), (xdrproc_t)xdr_u_int); + sizeof (uint32_t), nvs_xdr_nvp_u_int); break; case DATA_TYPE_INT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), - sizeof (int64_t), (xdrproc_t)xdr_longlong_t); + sizeof (int64_t), nvs_xdr_nvp_longlong_t); break; case DATA_TYPE_UINT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), - sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); + sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t); break; case DATA_TYPE_STRING_ARRAY: { @@ -3499,7 +3568,7 @@ nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) * the strings. These pointers are not encoded into the packed xdr buffer. * * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are - * of length 0, then each string is endcoded in xdr format as a single word. + * of length 0, then each string is encoded in xdr format as a single word. * Therefore when expanded to an nvpair there will be 2.25 word used for * each string. (a int64_t allocated for pointer usage, and a single char * for the null termination.) 
@@ -3601,11 +3670,12 @@ nvpair_fini(void) module_init(nvpair_init); module_exit(nvpair_fini); +#endif -MODULE_DESCRIPTION("Generic name/value pair implementation"); -MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE(ZFS_META_LICENSE); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +ZFS_MODULE_DESCRIPTION("Generic name/value pair implementation"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(nv_alloc_init); EXPORT_SYMBOL(nv_alloc_reset); @@ -3720,5 +3790,3 @@ EXPORT_SYMBOL(nvpair_value_uint64_array); EXPORT_SYMBOL(nvpair_value_string_array); EXPORT_SYMBOL(nvpair_value_nvlist_array); EXPORT_SYMBOL(nvpair_value_hrtime); - -#endif diff --git a/module/os/freebsd/spl/acl_common.c b/module/os/freebsd/spl/acl_common.c new file mode 100644 index 0000000000..7fd0e36e1b --- /dev/null +++ b/module/os/freebsd/spl/acl_common.c @@ -0,0 +1,1709 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#if defined(_KERNEL) +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \ + ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \ + ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL) + + +#define ACL_SYNCHRONIZE_SET_DENY 0x0000001 +#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002 +#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004 +#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008 + +#define ACL_WRITE_OWNER_SET_DENY 0x0000010 +#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020 +#define ACL_WRITE_OWNER_ERR_DENY 0x0000040 +#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080 + +#define ACL_DELETE_SET_DENY 0x0000100 +#define ACL_DELETE_SET_ALLOW 0x0000200 +#define ACL_DELETE_ERR_DENY 0x0000400 +#define ACL_DELETE_ERR_ALLOW 0x0000800 + +#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000 +#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000 +#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000 +#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000 + +#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000 +#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000 +#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000 +#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000 + +#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000 +#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000 +#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000 +#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000 + +#define ACL_READ_NAMED_READER_SET_DENY 0x1000000 +#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000 +#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000 +#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000 + + +#define ACE_VALID_MASK_BITS (\ + ACE_READ_DATA | \ + ACE_LIST_DIRECTORY | \ + ACE_WRITE_DATA | \ + ACE_ADD_FILE | \ + ACE_APPEND_DATA | \ + ACE_ADD_SUBDIRECTORY | \ + ACE_READ_NAMED_ATTRS | \ + ACE_WRITE_NAMED_ATTRS | \ + ACE_EXECUTE | \ + ACE_DELETE_CHILD | \ + 
ACE_READ_ATTRIBUTES | \ + ACE_WRITE_ATTRIBUTES | \ + ACE_DELETE | \ + ACE_READ_ACL | \ + ACE_WRITE_ACL | \ + ACE_WRITE_OWNER | \ + ACE_SYNCHRONIZE) + +#define ACE_MASK_UNDEFINED 0x80000000 + +#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \ + ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \ + ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE) + +/* + * ACL conversion helpers + */ + +typedef enum { + ace_unused, + ace_user_obj, + ace_user, + ace_group, /* includes GROUP and GROUP_OBJ */ + ace_other_obj +} ace_to_aent_state_t; + +typedef struct acevals { + uid_t key; + avl_node_t avl; + uint32_t mask; + uint32_t allowed; + uint32_t denied; + int aent_type; +} acevals_t; + +typedef struct ace_list { + acevals_t user_obj; + avl_tree_t user; + int numusers; + acevals_t group_obj; + avl_tree_t group; + int numgroups; + acevals_t other_obj; + uint32_t acl_mask; + int hasmask; + int dfacl_flag; + ace_to_aent_state_t state; + int seen; /* bitmask of all aclent_t a_type values seen */ +} ace_list_t; + +/* + * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. 
+ * v = Ptr to array/vector of objs + * n = # objs in the array + * s = size of each obj (must be multiples of a word size) + * f = ptr to function to compare two objs + * returns (-1 = less than, 0 = equal, 1 = greater than + */ +void +ksort(caddr_t v, int n, int s, int (*f)(void *, void *)) +{ + int g, i, j, ii; + unsigned int *p1, *p2; + unsigned int tmp; + + /* No work to do */ + if (v == NULL || n <= 1) + return; + + /* Sanity check on arguments */ + ASSERT3U(((uintptr_t)v & 0x3), ==, 0); + ASSERT3S((s & 0x3), ==, 0); + ASSERT3S(s, >, 0); + for (g = n / 2; g > 0; g /= 2) { + for (i = g; i < n; i++) { + for (j = i - g; j >= 0 && + (*f)(v + j * s, v + (j + g) * s) == 1; + j -= g) { + p1 = (void *)(v + j * s); + p2 = (void *)(v + (j + g) * s); + for (ii = 0; ii < s / 4; ii++) { + tmp = *p1; + *p1++ = *p2; + *p2++ = tmp; + } + } + } + } +} + +/* + * Compare two acls, all fields. Returns: + * -1 (less than) + * 0 (equal) + * +1 (greater than) + */ +int +cmp2acls(void *a, void *b) +{ + aclent_t *x = (aclent_t *)a; + aclent_t *y = (aclent_t *)b; + + /* Compare types */ + if (x->a_type < y->a_type) + return (-1); + if (x->a_type > y->a_type) + return (1); + /* Equal types; compare id's */ + if (x->a_id < y->a_id) + return (-1); + if (x->a_id > y->a_id) + return (1); + /* Equal ids; compare perms */ + if (x->a_perm < y->a_perm) + return (-1); + if (x->a_perm > y->a_perm) + return (1); + /* Totally equal */ + return (0); +} + +static int +cacl_malloc(void **ptr, size_t size) +{ + *ptr = kmem_zalloc(size, KM_SLEEP); + return (0); +} + + +#if !defined(_KERNEL) +acl_t * +acl_alloc(enum acl_type type) +{ + acl_t *aclp; + + if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0) + return (NULL); + + aclp->acl_aclp = NULL; + aclp->acl_cnt = 0; + + switch (type) { + case ACE_T: + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + break; + case ACLENT_T: + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + break; + default: + acl_free(aclp); 
+ aclp = NULL; + } + return (aclp); +} + +/* + * Free acl_t structure + */ +void +acl_free(acl_t *aclp) +{ + int acl_size; + + if (aclp == NULL) + return; + + if (aclp->acl_aclp) { + acl_size = aclp->acl_cnt * aclp->acl_entry_size; + cacl_free(aclp->acl_aclp, acl_size); + } + + cacl_free(aclp, sizeof (acl_t)); +} + +static uint32_t +access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) +{ + uint32_t access_mask = 0; + int acl_produce; + int synchronize_set = 0, write_owner_set = 0; + int delete_set = 0, write_attrs_set = 0; + int read_named_set = 0, write_named_set = 0; + + acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_WRITER_SET_DENY); + + if (isallow) { + synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW; + write_owner_set = ACL_WRITE_OWNER_SET_ALLOW; + delete_set = ACL_DELETE_SET_ALLOW; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_ALLOW; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + } else { + + synchronize_set = ACL_SYNCHRONIZE_SET_DENY; + write_owner_set = ACL_WRITE_OWNER_SET_DENY; + delete_set = ACL_DELETE_SET_DENY; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_DENY; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY; + else + /* + * If the entity is not the owner and does not + * have write permissions ACE_WRITE_ATTRIBUTES will + * always go in the DENY ACE. 
+ */ + access_mask |= ACE_WRITE_ATTRIBUTES; + } + + if (acl_produce & synchronize_set) + access_mask |= ACE_SYNCHRONIZE; + if (acl_produce & write_owner_set) + access_mask |= ACE_WRITE_OWNER; + if (acl_produce & delete_set) + access_mask |= ACE_DELETE; + if (acl_produce & write_attrs_set) + access_mask |= ACE_WRITE_ATTRIBUTES; + if (acl_produce & read_named_set) + access_mask |= ACE_READ_NAMED_ATTRS; + if (acl_produce & write_named_set) + access_mask |= ACE_WRITE_NAMED_ATTRS; + + return (access_mask); +} + +/* + * Given an mode_t, convert it into an access_mask as used + * by nfsace, assuming aclent_t -> nfsace semantics. + */ +static uint32_t +mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) +{ + uint32_t access = 0; + int haswriteperm = 0; + int hasreadperm = 0; + + if (isallow) { + haswriteperm = (mode & S_IWOTH); + hasreadperm = (mode & S_IROTH); + } else { + haswriteperm = !(mode & S_IWOTH); + hasreadperm = !(mode & S_IROTH); + } + + /* + * The following call takes care of correctly setting the following + * mask bits in the access_mask: + * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE, + * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS + */ + access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow); + + if (isallow) { + access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES; + if (isowner) + access |= ACE_WRITE_ACL; + } else { + if (! isowner) + access |= ACE_WRITE_ACL; + } + + /* read */ + if (mode & S_IROTH) { + access |= ACE_READ_DATA; + } + /* write */ + if (mode & S_IWOTH) { + access |= ACE_WRITE_DATA | + ACE_APPEND_DATA; + if (isdir) + access |= ACE_DELETE_CHILD; + } + /* exec */ + if (mode & S_IXOTH) { + access |= ACE_EXECUTE; + } + + return (access); +} + +/* + * Given an nfsace (presumably an ALLOW entry), make a + * corresponding DENY entry at the address given. 
+ */ +static void +ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner) +{ + (void) memcpy(deny, allow, sizeof (ace_t)); + + deny->a_who = allow->a_who; + + deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS; + if (isdir) + deny->a_access_mask ^= ACE_DELETE_CHILD; + + deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER | + ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS | + ACE_WRITE_NAMED_ATTRS); + deny->a_access_mask |= access_mask_set((allow->a_access_mask & + ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner, + B_FALSE); +} +/* + * Make an initial pass over an array of aclent_t's. Gather + * information such as an ACL_MASK (if any), number of users, + * number of groups, and whether the array needs to be sorted. + */ +static int +ln_aent_preprocess(aclent_t *aclent, int n, + int *hasmask, mode_t *mask, + int *numuser, int *numgroup, int *needsort) +{ + int error = 0; + int i; + int curtype = 0; + + *hasmask = 0; + *mask = 07; + *needsort = 0; + *numuser = 0; + *numgroup = 0; + + for (i = 0; i < n; i++) { + if (aclent[i].a_type < curtype) + *needsort = 1; + else if (aclent[i].a_type > curtype) + curtype = aclent[i].a_type; + if (aclent[i].a_type & USER) + (*numuser)++; + if (aclent[i].a_type & (GROUP | GROUP_OBJ)) + (*numgroup)++; + if (aclent[i].a_type & CLASS_OBJ) { + if (*hasmask) { + error = EINVAL; + goto out; + } else { + *hasmask = 1; + *mask = aclent[i].a_perm; + } + } + } + + if ((! *hasmask) && (*numuser + *numgroup > 1)) { + error = EINVAL; + goto out; + } + +out: + return (error); +} + +/* + * Convert an array of aclent_t into an array of nfsace entries, + * following POSIX draft -> nfsv4 conversion semantics as outlined in + * the IETF draft. 
+ */ +static int +ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir) +{ + int error = 0; + mode_t mask; + int numuser, numgroup, needsort; + int resultsize = 0; + int i, groupi = 0, skip; + ace_t *acep, *result = NULL; + int hasmask; + + error = ln_aent_preprocess(aclent, n, &hasmask, &mask, + &numuser, &numgroup, &needsort); + if (error != 0) + goto out; + + /* allow + deny for each aclent */ + resultsize = n * 2; + if (hasmask) { + /* + * stick extra deny on the group_obj and on each + * user|group for the mask (the group_obj was added + * into the count for numgroup) + */ + resultsize += numuser + numgroup; + /* ... and don't count the mask itself */ + resultsize -= 2; + } + + /* sort the source if necessary */ + if (needsort) + ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls); + + if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) + goto out; + + acep = result; + + for (i = 0; i < n; i++) { + /* + * don't process CLASS_OBJ (mask); mask was grabbed in + * ln_aent_preprocess() + */ + if (aclent[i].a_type & CLASS_OBJ) + continue; + + /* If we need an ACL_MASK emulator, prepend it now */ + if ((hasmask) && + (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) { + acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + acep->a_flags = 0; + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= + (ACE_IDENTIFIER_GROUP|ACE_GROUP); + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + } else { + acep->a_who = aclent[i].a_id; + acep->a_flags |= ACE_IDENTIFIER_GROUP; + } + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + /* + * Set the access mask for the prepended deny + * ace. To do this, we invert the mask (found + * in ln_aent_preprocess()) then convert it to an + * DENY ace access_mask. 
+ */ + acep->a_access_mask = mode_to_ace_access((mask ^ 07), + isdir, 0, 0); + acep += 1; + } + + /* handle a_perm -> access_mask */ + acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm, + isdir, aclent[i].a_type & USER_OBJ, 1); + + /* emulate a default aclent */ + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + + /* + * handle a_perm and a_id + * + * this must be done last, since it involves the + * corresponding deny aces, which are handled + * differently for each different a_type. + */ + if (aclent[i].a_type & USER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_OWNER; + ace_make_deny(acep, acep + 1, isdir, B_TRUE); + acep += 2; + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) { + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_GROUP; + } else { + acep->a_who = aclent[i].a_id; + } + acep->a_flags |= ACE_IDENTIFIER_GROUP; + /* + * Set the corresponding deny for the group ace. + * + * The deny aces go after all of the groups, unlike + * everything else, where they immediately follow + * the allow ace. + * + * We calculate "skip", the number of slots to + * skip ahead for the deny ace, here. + * + * The pattern is: + * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3 + * thus, skip is + * (2 * numgroup) - 1 - groupi + * (2 * numgroup) to account for MD + A + * - 1 to account for the fact that we're on the + * access (A), not the mask (MD) + * - groupi to account for the fact that we have + * passed up groupi number of MD's. + */ + skip = (2 * numgroup) - 1 - groupi; + ace_make_deny(acep, acep + skip, isdir, B_FALSE); + /* + * If we just did the last group, skip acep past + * all of the denies; else, just move ahead one. 
+ */ + if (++groupi >= numgroup) + acep += numgroup + 1; + else + acep += 1; + } else if (aclent[i].a_type & OTHER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_EVERYONE; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else { + error = EINVAL; + goto out; + } + } + + *acepp = result; + *rescount = resultsize; + +out: + if (error != 0) { + if ((result != NULL) && (resultsize > 0)) { + cacl_free(result, resultsize * sizeof (ace_t)); + } + } + + return (error); +} + +static int +convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, + ace_t **retacep, int *retacecnt) +{ + ace_t *acep; + ace_t *dfacep; + int acecnt = 0; + int dfacecnt = 0; + int dfaclstart = 0; + int dfaclcnt = 0; + aclent_t *aclp; + int i; + int error; + int acesz, dfacesz; + + ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls); + + for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) { + if (aclp->a_type & ACL_DEFAULT) + break; + } + + if (i < aclcnt) { + dfaclstart = i; + dfaclcnt = aclcnt - i; + } + + if (dfaclcnt && !isdir) { + return (EINVAL); + } + + error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir); + if (error) + return (error); + + if (dfaclcnt) { + error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt, + &dfacep, &dfacecnt, isdir); + if (error) { + if (acep) { + cacl_free(acep, acecnt * sizeof (ace_t)); + } + return (error); + } + } + + if (dfacecnt != 0) { + acesz = sizeof (ace_t) * acecnt; + dfacesz = sizeof (ace_t) * dfacecnt; + acep = cacl_realloc(acep, acesz, acesz + dfacesz); + if (acep == NULL) + return (ENOMEM); + if (dfaclcnt) { + (void) memcpy(acep + acecnt, dfacep, dfacesz); + } + } + if (dfaclcnt) + cacl_free(dfacep, dfacecnt * sizeof (ace_t)); + + *retacecnt = acecnt + dfacecnt; + *retacep = acep; + return (0); +} + +static int +ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + int error = 0; + o_mode_t mode = 0; + uint32_t bits, wantbits; + + /* read */ + if (mask & ACE_READ_DATA) + mode |= 
S_IROTH; + + /* write */ + wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA); + if (isdir) + wantbits |= ACE_DELETE_CHILD; + bits = mask & wantbits; + if (bits != 0) { + if (bits != wantbits) { + error = ENOTSUP; + goto out; + } + mode |= S_IWOTH; + } + + /* exec */ + if (mask & ACE_EXECUTE) { + mode |= S_IXOTH; + } + + *modep = mode; + +out: + return (error); +} + +static void +acevals_init(acevals_t *vals, uid_t key) +{ + bzero(vals, sizeof (*vals)); + vals->allowed = ACE_MASK_UNDEFINED; + vals->denied = ACE_MASK_UNDEFINED; + vals->mask = ACE_MASK_UNDEFINED; + vals->key = key; +} + +static void +ace_list_init(ace_list_t *al, int dfacl_flag) +{ + acevals_init(&al->user_obj, 0); + acevals_init(&al->group_obj, 0); + acevals_init(&al->other_obj, 0); + al->numusers = 0; + al->numgroups = 0; + al->acl_mask = 0; + al->hasmask = 0; + al->state = ace_unused; + al->seen = 0; + al->dfacl_flag = dfacl_flag; +} + +/* + * Find or create an acevals holder for a given id and avl tree. + * + * Note that only one thread will ever touch these avl trees, so + * there is no need for locking. + */ +static acevals_t * +acevals_find(ace_t *ace, avl_tree_t *avl, int *num) +{ + acevals_t key, *rc; + avl_index_t where; + + key.key = ace->a_who; + rc = avl_find(avl, &key, &where); + if (rc != NULL) + return (rc); + + /* this memory is freed by ln_ace_to_aent()->ace_list_free() */ + if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0) + return (NULL); + + acevals_init(rc, ace->a_who); + avl_insert(avl, rc, where); + (*num)++; + + return (rc); +} + +static int +access_mask_check(ace_t *acep, int mask_bit, int isowner) +{ + int set_deny, err_deny; + int set_allow, err_allow; + int acl_consume; + int haswriteperm, hasreadperm; + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1; + } else { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 
1 : 0; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0; + } + + acl_consume = (ACL_SYNCHRONIZE_ERR_DENY | + ACL_DELETE_ERR_DENY | + ACL_WRITE_OWNER_ERR_DENY | + ACL_WRITE_OWNER_ERR_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_ERR_DENY | + ACL_WRITE_ATTRS_WRITER_SET_DENY | + ACL_WRITE_ATTRS_WRITER_ERR_ALLOW | + ACL_WRITE_NAMED_WRITER_ERR_DENY | + ACL_READ_NAMED_READER_ERR_DENY); + + if (mask_bit == ACE_SYNCHRONIZE) { + set_deny = ACL_SYNCHRONIZE_SET_DENY; + err_deny = ACL_SYNCHRONIZE_ERR_DENY; + set_allow = ACL_SYNCHRONIZE_SET_ALLOW; + err_allow = ACL_SYNCHRONIZE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_OWNER) { + set_deny = ACL_WRITE_OWNER_SET_DENY; + err_deny = ACL_WRITE_OWNER_ERR_DENY; + set_allow = ACL_WRITE_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_OWNER_ERR_ALLOW; + } else if (mask_bit == ACE_DELETE) { + set_deny = ACL_DELETE_SET_DENY; + err_deny = ACL_DELETE_ERR_DENY; + set_allow = ACL_DELETE_SET_ALLOW; + err_allow = ACL_DELETE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_ATTRIBUTES) { + if (isowner) { + set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW; + } else if (haswriteperm) { + set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW; + } else { + if ((acep->a_access_mask & mask_bit) && + (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) { + return (ENOTSUP); + } + return (0); + } + } else if (mask_bit == ACE_READ_NAMED_ATTRS) { + if (!hasreadperm) + return (0); + + set_deny = ACL_READ_NAMED_READER_SET_DENY; + err_deny = ACL_READ_NAMED_READER_ERR_DENY; + set_allow = ACL_READ_NAMED_READER_SET_ALLOW; + err_allow = ACL_READ_NAMED_READER_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) { + if (!haswriteperm) + return (0); + + set_deny = 
ACL_WRITE_NAMED_WRITER_SET_DENY; + err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY; + set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW; + } else { + return (EINVAL); + } + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + if (acl_consume & set_deny) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_deny) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } else { + /* ACE_ACCESS_ALLOWED_ACE_TYPE */ + if (acl_consume & set_allow) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_allow) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } + return (0); +} + +static int +ace_to_aent_legal(ace_t *acep) +{ + int error = 0; + int isowner; + + /* only ALLOW or DENY */ + if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) && + (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid flags */ + if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) { + error = EINVAL; + goto out; + } + + /* some flags are illegal */ + if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG | + ACE_FAILED_ACCESS_ACE_FLAG | + ACE_NO_PROPAGATE_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid masks */ + if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) { + error = EINVAL; + goto out; + } + + if ((acep->a_flags & ACE_OWNER)) { + isowner = 1; + } else { + isowner = 0; + } + + error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_OWNER, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_DELETE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner); + if (error) + goto out; + + error = access_mask_check(acep, 
ACE_WRITE_NAMED_ATTRS, isowner); + if (error) + goto out; + + /* more detailed checking of masks */ + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (! (acep->a_access_mask & ACE_APPEND_DATA))) { + error = ENOTSUP; + goto out; + } + if ((! (acep->a_access_mask & ACE_WRITE_DATA)) && + (acep->a_access_mask & ACE_APPEND_DATA)) { + error = ENOTSUP; + goto out; + } + } + + /* ACL enforcement */ + if ((acep->a_access_mask & ACE_READ_ACL) && + (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + if (acep->a_access_mask & ACE_WRITE_ACL) { + if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) && + (isowner)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) && + (! isowner)) { + error = ENOTSUP; + goto out; + } + } + +out: + return (error); +} + +static int +ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ + if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != + (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) { + return (ENOTSUP); + } + + return (ace_mask_to_mode(mask, modep, isdir)); +} + +static int +acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error; + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + + if (isdir) + flips |= ACE_DELETE_CHILD; + if (vals->allowed != (vals->denied ^ flips)) { + error = ENOTSUP; + goto out; + } + if ((list->hasmask) && (list->acl_mask != vals->mask) && + (vals->aent_type & (USER | GROUP | GROUP_OBJ))) { + error = ENOTSUP; + goto out; + } + error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir); + if (error != 0) + goto out; + dest->a_type = vals->aent_type; + if (dest->a_type & (USER | GROUP)) { + dest->a_id = vals->key; + } else if (dest->a_type & USER_OBJ) { + dest->a_id = owner; 
+ } else if (dest->a_type & GROUP_OBJ) { + dest->a_id = group; + } else if (dest->a_type & OTHER_OBJ) { + dest->a_id = 0; + } else { + error = EINVAL; + goto out; + } + +out: + return (error); +} + + +static int +ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error = 0; + aclent_t *aent, *result = NULL; + acevals_t *vals; + int resultcount; + + if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) != + (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) { + error = ENOTSUP; + goto out; + } + if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) { + error = ENOTSUP; + goto out; + } + + resultcount = 3 + list->numusers + list->numgroups; + /* + * This must be the same condition as below, when we add the CLASS_OBJ + * (aka ACL mask) + */ + if ((list->hasmask) || (! list->dfacl_flag)) + resultcount += 1; + + if (cacl_malloc((void **)&result, + resultcount * sizeof (aclent_t)) != 0) { + error = ENOMEM; + goto out; + } + aent = result; + + /* USER_OBJ */ + if (!(list->user_obj.aent_type & USER_OBJ)) { + error = EINVAL; + goto out; + } + + error = acevals_to_aent(&list->user_obj, aent, list, owner, group, + isdir); + + if (error != 0) + goto out; + ++aent; + /* USER */ + vals = NULL; + for (vals = avl_first(&list->user); vals != NULL; + vals = AVL_NEXT(&list->user, vals)) { + if (!(vals->aent_type & USER)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* GROUP_OBJ */ + if (!(list->group_obj.aent_type & GROUP_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->group_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + /* GROUP */ + vals = NULL; + for (vals = avl_first(&list->group); vals != NULL; + vals = AVL_NEXT(&list->group, vals)) { + if (!(vals->aent_type & GROUP)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, 
owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* + * CLASS_OBJ (aka ACL_MASK) + * + * An ACL_MASK is not fabricated if the ACL is a default ACL. + * This is to follow UFS's behavior. + */ + if ((list->hasmask) || (! list->dfacl_flag)) { + if (list->hasmask) { + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + if (isdir) + flips |= ACE_DELETE_CHILD; + error = ace_mask_to_mode(list->acl_mask ^ flips, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } else { + /* fabricate the ACL_MASK from the group permissions */ + error = ace_mask_to_mode(list->group_obj.allowed, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } + aent->a_id = 0; + aent->a_type = CLASS_OBJ | list->dfacl_flag; + ++aent; + } + /* OTHER_OBJ */ + if (!(list->other_obj.aent_type & OTHER_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->other_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + + *aclentp = result; + *aclcnt = resultcount; + +out: + if (error != 0) { + if (result != NULL) + cacl_free(result, resultcount * sizeof (aclent_t)); + } + + return (error); +} + + +/* + * free all data associated with an ace_list + */ +static void +ace_list_free(ace_list_t *al) +{ + acevals_t *node; + void *cookie; + + if (al == NULL) + return; + + cookie = NULL; + while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + cookie = NULL; + while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + + avl_destroy(&al->user); + avl_destroy(&al->group); + + /* free the container itself */ + cacl_free(al, sizeof (ace_list_t)); +} + +static int +acevals_compare(const void *va, const void *vb) +{ + const acevals_t *a = va, *b = vb; + + if (a->key == b->key) + return (0); + + if (a->key > b->key) + return (1); + + else + return (-1); +} + +/* + * Convert a list of ace_t entries to equivalent regular and default + * aclent_t lists. 
Return error (ENOTSUP) when conversion is not possible. + */ +static int +ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, + aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, + boolean_t isdir) +{ + int error = 0; + ace_t *acep; + uint32_t bits; + int i; + ace_list_t *normacl = NULL, *dfacl = NULL, *acl; + acevals_t *vals; + + *aclentp = NULL; + *aclcnt = 0; + *dfaclentp = NULL; + *dfaclcnt = 0; + + /* we need at least user_obj, group_obj, and other_obj */ + if (n < 6) { + error = ENOTSUP; + goto out; + } + if (ace == NULL) { + error = EINVAL; + goto out; + } + + error = cacl_malloc((void **)&normacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&normacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&normacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + + ace_list_init(normacl, 0); + + error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + ace_list_init(dfacl, ACL_DEFAULT); + + /* process every ace_t... */ + for (i = 0; i < n; i++) { + acep = &ace[i]; + + /* rule out certain cases quickly */ + error = ace_to_aent_legal(acep); + if (error != 0) + goto out; + + /* + * Turn off these bits in order to not have to worry about + * them when doing the checks for compliments. 
+ */ + acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE | + ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS); + + /* see if this should be a regular or default acl */ + bits = acep->a_flags & + (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE); + if (bits != 0) { + /* all or nothing on these inherit bits */ + if (bits != (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + acl = dfacl; + } else { + acl = normacl; + } + + if ((acep->a_flags & ACE_OWNER)) { + if (acl->state > ace_user_obj) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user_obj; + acl->seen |= USER_OBJ; + vals = &acl->user_obj; + vals->aent_type = USER_OBJ | acl->dfacl_flag; + } else if ((acep->a_flags & ACE_EVERYONE)) { + acl->state = ace_other_obj; + acl->seen |= OTHER_OBJ; + vals = &acl->other_obj; + vals->aent_type = OTHER_OBJ | acl->dfacl_flag; + } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) { + if (acl->state > ace_group) { + error = ENOTSUP; + goto out; + } + if ((acep->a_flags & ACE_GROUP)) { + acl->seen |= GROUP_OBJ; + vals = &acl->group_obj; + vals->aent_type = GROUP_OBJ | acl->dfacl_flag; + } else { + acl->seen |= GROUP; + vals = acevals_find(acep, &acl->group, + &acl->numgroups); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = GROUP | acl->dfacl_flag; + } + acl->state = ace_group; + } else { + if (acl->state > ace_user) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user; + acl->seen |= USER; + vals = acevals_find(acep, &acl->user, + &acl->numusers); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = USER | acl->dfacl_flag; + } + + if (!(acl->state > ace_unused)) { + error = EINVAL; + goto out; + } + + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + /* no more than one allowed per aclent_t */ + if (vals->allowed != ACE_MASK_UNDEFINED) { + error = ENOTSUP; + goto out; + } + 
vals->allowed = acep->a_access_mask; + } else { + /* + * it's a DENY; if there was a previous DENY, it + * must have been an ACL_MASK. + */ + if (vals->denied != ACE_MASK_UNDEFINED) { + /* ACL_MASK is for USER and GROUP only */ + if ((acl->state != ace_user) && + (acl->state != ace_group)) { + error = ENOTSUP; + goto out; + } + + if (! acl->hasmask) { + acl->hasmask = 1; + acl->acl_mask = vals->denied; + /* check for mismatched ACL_MASK emulations */ + } else if (acl->acl_mask != vals->denied) { + error = ENOTSUP; + goto out; + } + vals->mask = vals->denied; + } + vals->denied = acep->a_access_mask; + } + } + + /* done collating; produce the aclent_t lists */ + if (normacl->state != ace_unused) { + error = ace_list_to_aent(normacl, aclentp, aclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + if (dfacl->state != ace_unused) { + error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + +out: + if (normacl != NULL) + ace_list_free(normacl); + if (dfacl != NULL) + ace_list_free(dfacl); + + return (error); +} + +static int +convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, + uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) +{ + int error = 0; + aclent_t *aclentp, *dfaclentp; + int aclcnt, dfaclcnt; + int aclsz, dfaclsz; + + error = ln_ace_to_aent(acebufp, acecnt, owner, group, + &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir); + + if (error) + return (error); + + + if (dfaclcnt != 0) { + /* + * Slap aclentp and dfaclentp into a single array. 
+ */ + aclsz = sizeof (aclent_t) * aclcnt; + dfaclsz = sizeof (aclent_t) * dfaclcnt; + aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz); + if (aclentp != NULL) { + (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz); + } else { + error = ENOMEM; + } + } + + if (aclentp) { + *retaclentp = aclentp; + *retaclcnt = aclcnt + dfaclcnt; + } + + if (dfaclentp) + cacl_free(dfaclentp, dfaclsz); + + return (error); +} + + +int +acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, + gid_t group) +{ + int aclcnt; + void *acldata; + int error; + + /* + * See if we need to translate + */ + if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) || + (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACLENT_T)) + return (0); + + if (target_flavor == -1) { + error = EINVAL; + goto out; + } + + if (target_flavor == _ACL_ACE_ENABLED && + aclp->acl_type == ACLENT_T) { + error = convert_aent_to_ace(aclp->acl_aclp, + aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt); + if (error) + goto out; + + } else if (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACE_T) { + error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt, + isdir, owner, group, (aclent_t **)&acldata, &aclcnt); + if (error) + goto out; + } else { + error = ENOTSUP; + goto out; + } + + /* + * replace old acl with newly translated acl + */ + cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size); + aclp->acl_aclp = acldata; + aclp->acl_cnt = aclcnt; + if (target_flavor == _ACL_ACE_ENABLED) { + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + } else { + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + } + return (0); + +out: + +#if !defined(_KERNEL) + errno = error; + return (-1); +#else + return (error); +#endif +} +#endif /* !_KERNEL */ + +#define SET_ACE(acl, index, who, mask, type, flags) { \ + acl[0][index].a_who = (uint32_t)who; \ + acl[0][index].a_type = type; \ + acl[0][index].a_flags = flags; \ + 
acl[0][index++].a_access_mask = mask; \ +} + +void +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) +{ + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + (void) isdir; /* will need this later */ + + masks->deny1 = 0; + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + masks->deny1 |= read_mask; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + masks->deny1 |= write_mask; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + masks->deny1 |= execute_mask; + + masks->deny2 = 0; + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + masks->deny2 |= read_mask; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + masks->deny2 |= write_mask; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + masks->deny2 |= execute_mask; + + masks->allow0 = 0; + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + masks->allow0 |= read_mask; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + masks->allow0 |= write_mask; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + masks->allow0 |= execute_mask; + + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + masks->owner |= read_mask; + if (mode & S_IWUSR) + masks->owner |= write_mask; + if (mode & S_IXUSR) + masks->owner |= execute_mask; + + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + masks->group |= read_mask; + if (mode & S_IWGRP) + masks->group |= write_mask; + if (mode & S_IXGRP) + masks->group |= execute_mask; + + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + masks->everyone |= read_mask; + if (mode & S_IWOTH) + masks->everyone |= write_mask; + if (mode & S_IXOTH) + masks->everyone |= execute_mask; +} + 
+int +acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) +{ + int index = 0; + int error; + trivial_acl_t masks; + + *count = 3; + acl_trivial_access_masks(mode, isdir, &masks); + + if (masks.allow0) + (*count)++; + if (masks.deny1) + (*count)++; + if (masks.deny2) + (*count)++; + + if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) + return (error); + + if (masks.allow0) { + SET_ACE(acl, index, -1, masks.allow0, + ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny1) { + SET_ACE(acl, index, -1, masks.deny1, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny2) { + SET_ACE(acl, index, -1, masks.deny2, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); + } + + SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_OWNER); + SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_IDENTIFIER_GROUP|ACE_GROUP); + SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_EVERYONE); + + return (0); +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. 
+ */ +int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. + */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permissions are never set by default + */ + if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + return (0); +} diff --git a/module/os/freebsd/spl/callb.c b/module/os/freebsd/spl/callb.c new file mode 100644 index 0000000000..0b7fefc89a --- /dev/null +++ b/module/os/freebsd/spl/callb.c @@ -0,0 +1,373 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for delay() */ +#include /* For TASKQ_NAMELEN */ +#include + +#define CB_MAXNAME TASKQ_NAMELEN + +/* + * The callb mechanism provides generic event scheduling/echoing. + * A callb function is registered and called on behalf of the event. + */ +typedef struct callb { + struct callb *c_next; /* next in class or on freelist */ + kthread_id_t c_thread; /* ptr to caller's thread struct */ + char c_flag; /* info about the callb state */ + uchar_t c_class; /* this callb's class */ + kcondvar_t c_done_cv; /* signal callb completion */ + boolean_t (*c_func)(void *, int); + /* cb function: returns true if ok */ + void *c_arg; /* arg to c_func */ + char c_name[CB_MAXNAME+1]; /* debug:max func name length */ +} callb_t; + +/* + * callb c_flag bitmap definitions + */ +#define CALLB_FREE 0x0 +#define CALLB_TAKEN 0x1 +#define CALLB_EXECUTING 0x2 + +/* + * Basic structure for a callb table. + * All callbs are organized into different class groups described + * by ct_class array. + * The callbs within a class are single-linked and normally run by a + * serial execution. 
+ */ +typedef struct callb_table { + kmutex_t ct_lock; /* protect all callb states */ + callb_t *ct_freelist; /* free callb structures */ + boolean_t ct_busy; /* B_TRUE prevents additions */ + kcondvar_t ct_busy_cv; /* to wait for not busy */ + int ct_ncallb; /* num of callbs allocated */ + callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ +} callb_table_t; + +int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; + +static callb_id_t callb_add_common(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); + +static callb_table_t callb_table; /* system level callback table */ +static callb_table_t *ct = &callb_table; +static kmutex_t callb_safe_mutex; +callb_cpr_t callb_cprinfo_safe = { + &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} }; + +/* + * Init all callb tables in the system. + */ +static void +callb_init(void *dummy __unused) +{ + callb_table.ct_busy = B_FALSE; /* mark table open for additions */ + mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); +} + +static void +callb_fini(void *dummy __unused) +{ + callb_t *cp; + int i; + + mutex_enter(&ct->ct_lock); + for (i = 0; i < 16; i++) { + while ((cp = ct->ct_freelist) != NULL) { + ct->ct_freelist = cp->c_next; + ct->ct_ncallb--; + kmem_free(cp, sizeof (callb_t)); + } + if (ct->ct_ncallb == 0) + break; + /* Not all callbacks finished, waiting for the rest. */ + mutex_exit(&ct->ct_lock); + tsleep(ct, 0, "callb", hz / 4); + mutex_enter(&ct->ct_lock); + } + if (ct->ct_ncallb > 0) + printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb); + mutex_exit(&ct->ct_lock); + mutex_destroy(&callb_safe_mutex); + mutex_destroy(&callb_table.ct_lock); +} + +/* + * callout_add() is called to register func() be called later. 
+ */ +static callb_id_t +callb_add_common(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + callb_t *cp; + + ASSERT3S(class, <, NCBCLASS); + + mutex_enter(&ct->ct_lock); + while (ct->ct_busy) + cv_wait(&ct->ct_busy_cv, &ct->ct_lock); + if ((cp = ct->ct_freelist) == NULL) { + ct->ct_ncallb++; + cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); + } + ct->ct_freelist = cp->c_next; + cp->c_thread = t; + cp->c_func = func; + cp->c_arg = arg; + cp->c_class = (uchar_t)class; + cp->c_flag |= CALLB_TAKEN; +#ifdef ZFS_DEBUG + if (strlen(name) > CB_MAXNAME) + cmn_err(CE_WARN, "callb_add: name of callback function '%s' " + "too long -- truncated to %d chars", + name, CB_MAXNAME); +#endif + (void) strncpy(cp->c_name, name, CB_MAXNAME); + cp->c_name[CB_MAXNAME] = '\0'; + + /* + * Insert the new callb at the head of its class list. + */ + cp->c_next = ct->ct_first_cb[class]; + ct->ct_first_cb[class] = cp; + + mutex_exit(&ct->ct_lock); + return ((callb_id_t)cp); +} + +/* + * The default function to add an entry to the callback table. Since + * it uses curthread as the thread identifier to store in the table, + * it should be used for the normal case of a thread which is calling + * to add ITSELF to the table. + */ +callb_id_t +callb_add(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name) +{ + return (callb_add_common(func, arg, class, name, curthread)); +} + +/* + * A special version of callb_add() above for use by threads which + * might be adding an entry to the table on behalf of some other + * thread (for example, one which is constructed but not yet running). + * In this version the thread id is an argument. 
+ */ +callb_id_t +callb_add_thread(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + return (callb_add_common(func, arg, class, name, t)); +} + +/* + * callout_delete() is called to remove an entry identified by id + * that was originally placed there by a call to callout_add(). + * return -1 if fail to delete a callb entry otherwise return 0. + */ +int +callb_delete(callb_id_t id) +{ + callb_t **pp; + callb_t *me = (callb_t *)id; + + mutex_enter(&ct->ct_lock); + + for (;;) { + pp = &ct->ct_first_cb[me->c_class]; + while (*pp != NULL && *pp != me) + pp = &(*pp)->c_next; + +#ifdef ZFS_DEBUG + if (*pp != me) { + cmn_err(CE_WARN, "callb delete bogus entry 0x%p", + (void *)me); + mutex_exit(&ct->ct_lock); + return (-1); + } +#endif /* DEBUG */ + + /* + * It is not allowed to delete a callb in the middle of + * executing otherwise, the callb_execute() will be confused. + */ + if (!(me->c_flag & CALLB_EXECUTING)) + break; + + cv_wait(&me->c_done_cv, &ct->ct_lock); + } + /* relink the class list */ + *pp = me->c_next; + + /* clean up myself and return the free callb to the head of freelist */ + me->c_flag = CALLB_FREE; + me->c_next = ct->ct_freelist; + ct->ct_freelist = me; + + mutex_exit(&ct->ct_lock); + return (0); +} + +/* + * class: indicates to execute all callbs in the same class; + * code: optional argument for the callb functions. 
+ * return: = 0: success
+ *         != 0: ptr to string supplied when callback was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+	callb_t *cp;
+	void *ret = NULL;
+
+	ASSERT3S(class, <, NCBCLASS);
+
+	mutex_enter(&ct->ct_lock);
+
+	/* Walk the class list; stop at the first callback that fails. */
+	for (cp = ct->ct_first_cb[class];
+	    cp != NULL && ret == 0; cp = cp->c_next) {
+		/* Wait for any execution of this entry already in progress. */
+		while (cp->c_flag & CALLB_EXECUTING)
+			cv_wait(&cp->c_done_cv, &ct->ct_lock);
+		/*
+		 * Skip this entry if the callb was deleted while we were
+		 * sleeping.
+		 *
+		 * NOTE(review): callb_delete() reuses c_next as the freelist
+		 * link, so after this continue the loop appears to follow the
+		 * freelist rather than the rest of the class list -- confirm
+		 * this is intended.
+		 */
+		if (cp->c_flag == CALLB_FREE)
+			continue;
+		cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+		printf("callb_execute: name=%s func=%p arg=%p\n",
+		    cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+		/*
+		 * ct_lock is not held across the callback; CALLB_EXECUTING
+		 * makes callb_delete() wait on c_done_cv until it finishes.
+		 */
+		mutex_exit(&ct->ct_lock);
+		/* If callback function fails, pass back client's name */
+		if (!(*cp->c_func)(cp->c_arg, code))
+			ret = cp->c_name;
+		mutex_enter(&ct->ct_lock);
+
+		cp->c_flag &= ~CALLB_EXECUTING;
+		cv_broadcast(&cp->c_done_cv);
+	}
+	mutex_exit(&ct->ct_lock);
+	return (ret);
+}
+
+/*
+ * callers make sure no recursive entries to this func.
+ * cp->cc_lockp (cp being the callb_cpr_t passed as arg) is registered by
+ * callb_add to protect callb_cpr_t structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use a cv_timedwait() in case the kernel thread is blocked.  That wait
+ * is only compiled in when CPR_NOT_THREAD_SAFE is defined.
+ *
+ * Returns B_FALSE only when that timed wait reports a timeout (-1).
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a daemon.
+ * Individual daemons that require changes to the handler shall write
+ * callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+	callb_cpr_t *cp = (callb_cpr_t *)arg;
+	clock_t ret = 0;			/* assume success */
+
+	mutex_enter(cp->cc_lockp);
+
+	switch (code) {
+	case CB_CODE_CPR_CHKPT:
+		cp->cc_events |= CALLB_CPR_START;
+#ifdef CPR_NOT_THREAD_SAFE
+		while (!(cp->cc_events & CALLB_CPR_SAFE))
+			/* cv_timedwait() returns -1 if it times out.
*/
+			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+			    cp->cc_lockp, (callb_timeout_sec * hz),
+			    TR_CLOCK_TICK)) == -1)
+				break;
+#endif
+		break;
+
+	case CB_CODE_CPR_RESUME:
+		/* Clear the start request and wake one waiter on cc_stop_cv. */
+		cp->cc_events &= ~CALLB_CPR_START;
+		cv_signal(&cp->cc_stop_cv);
+		break;
+	}
+	mutex_exit(cp->cc_lockp);
+	/* ret is still 0 unless the timed wait above returned -1. */
+	return (ret != -1);
+}
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.  Both arguments are ignored and the
+ * result is always B_TRUE.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+	return (B_TRUE);
+}
+/*
+ * Prevent additions to callback table.
+ *
+ * While ct_busy is set, callb_add_common() sleeps on ct_busy_cv.  The
+ * table must not already be locked (asserted below).
+ */
+void
+callb_lock_table(void)
+{
+	mutex_enter(&ct->ct_lock);
+	ASSERT(!ct->ct_busy);
+	ct->ct_busy = B_TRUE;
+	mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ *
+ * Clears ct_busy and wakes every thread waiting on ct_busy_cv.  The
+ * table must currently be locked (asserted below).
+ */
+void
+callb_unlock_table(void)
+{
+	mutex_enter(&ct->ct_lock);
+	ASSERT(ct->ct_busy);
+	ct->ct_busy = B_FALSE;
+	cv_broadcast(&ct->ct_busy_cv);
+	mutex_exit(&ct->ct_lock);
+}
+
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/module/os/freebsd/spl/list.c b/module/os/freebsd/spl/list.c
new file mode 100644
index 0000000000..62374a4177
--- /dev/null
+++ b/module/os/freebsd/spl/list.c
@@ -0,0 +1,243 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Generic doubly-linked list implementation + */ + +#include +#include +#include +#include +#include + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = (node); \ + lnew->list_next = (node)->list_next; \ + (node)->list_next->list_prev = lnew; \ + (node)->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = (node); \ + lnew->list_prev = (node)->list_prev; \ + (node)->list_prev->list_next = lnew; \ + (node)->list_prev = lnew; \ +} + +#define list_remove_node(node) \ + (node)->list_prev->list_next = (node)->list_next; \ + (node)->list_next->list_prev = (node)->list_prev; \ + (node)->list_next = (node)->list_prev = NULL + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT3P(list, !=, NULL); + ASSERT3U(size, >=, offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT3P(list, !=, NULL); + ASSERT3P(list->list_head.list_next, ==, node); + ASSERT3P(list->list_head.list_prev, ==, node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if 
(object == NULL) {
+		list_insert_head(list, nobject);
+	} else {
+		/* Link nobject directly behind the existing element. */
+		list_node_t *anchor = list_d2l(list, object);
+		list_insert_after_node(list, anchor, nobject);
+	}
+}
+
+/*
+ * Insert nobject in front of object.  A NULL object appends nobject to
+ * the tail of the list.
+ */
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+	list_node_t *anchor;
+
+	if (object == NULL) {
+		list_insert_tail(list, nobject);
+		return;
+	}
+
+	anchor = list_d2l(list, object);
+	list_insert_before_node(list, anchor, nobject);
+}
+
+/* Make object the first element: link it right after the head sentinel. */
+void
+list_insert_head(list_t *list, void *object)
+{
+	list_insert_after_node(list, &list->list_head, object);
+}
+
+/* Make object the last element: link it right before the head sentinel. */
+void
+list_insert_tail(list_t *list, void *object)
+{
+	list_insert_before_node(list, &list->list_head, object);
+}
+
+/*
+ * Unlink object from the list.  The list must be non-empty and the
+ * object's node must be linked (both asserted).
+ */
+void
+list_remove(list_t *list, void *object)
+{
+	list_node_t *node = list_d2l(list, object);
+
+	ASSERT(!list_empty(list));
+	ASSERT3P(node->list_next, !=, NULL);
+	list_remove_node(node);
+}
+
+/* Unlink and return the first object, or NULL when the list is empty. */
+void *
+list_remove_head(list_t *list)
+{
+	list_node_t *first = list->list_head.list_next;
+	void *obj = NULL;
+
+	if (first != &list->list_head) {
+		list_remove_node(first);
+		obj = list_object(list, first);
+	}
+	return (obj);
+}
+
+/* Unlink and return the last object, or NULL when the list is empty. */
+void *
+list_remove_tail(list_t *list)
+{
+	list_node_t *last = list->list_head.list_prev;
+	void *obj = NULL;
+
+	if (last != &list->list_head) {
+		list_remove_node(last);
+		obj = list_object(list, last);
+	}
+	return (obj);
+}
+
+/* Return the first object without unlinking it, or NULL when empty. */
+void *
+list_head(list_t *list)
+{
+	return (list_empty(list) ? NULL :
+	    list_object(list, list->list_head.list_next));
+}
+
+/* Return the last object without unlinking it, or NULL when empty. */
+void *
+list_tail(list_t *list)
+{
+	return (list_empty(list) ? NULL :
+	    list_object(list, list->list_head.list_prev));
+}
+
+/* Return the object following "object", or NULL if it is the last one. */
+void *
+list_next(list_t *list, void *object)
+{
+	list_node_t *succ = list_d2l(list, object)->list_next;
+
+	if (succ == &list->list_head)
+		return (NULL);
+
+	return (list_object(list, succ));
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+	list_node_t *node = list_d2l(list, object);
+
+	if (node->list_prev != &list->list_head)
+		return (list_object(list, node->list_prev));
+
+
return (NULL);
+}
+
+/*
+ * Append every element of src to the tail of dst, preserving order, and
+ * leave src empty.  Both lists must have been created with the same
+ * object size and node offset (asserted).  An empty src is a no-op.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+	list_node_t *dstnode = &dst->list_head;
+	list_node_t *srcnode = &src->list_head;
+
+	ASSERT3U(dst->list_size, ==, src->list_size);
+	ASSERT3U(dst->list_offset, ==, src->list_offset);
+
+	if (list_empty(src))
+		return;
+
+	/* Join dst's old tail to src's first element ... */
+	dstnode->list_prev->list_next = srcnode->list_next;
+	srcnode->list_next->list_prev = dstnode->list_prev;
+	/* ... and make src's last element the new tail of dst. */
+	dstnode->list_prev = srcnode->list_prev;
+	srcnode->list_prev->list_next = dstnode;
+
+	/* empty src list */
+	srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+/*
+ * Put lnew into the list position currently held by lold.  lold must be
+ * linked and lnew must not be (both asserted); afterwards lold's links
+ * are NULL.
+ */
+void
+list_link_replace(list_node_t *lold, list_node_t *lnew)
+{
+	ASSERT(list_link_active(lold));
+	ASSERT(!list_link_active(lnew));
+
+	lnew->list_next = lold->list_next;
+	lnew->list_prev = lold->list_prev;
+	lold->list_prev->list_next = lnew;
+	lold->list_next->list_prev = lnew;
+	lold->list_next = lold->list_prev = NULL;
+}
+
+/* Mark a node as not being on any list (both links NULL). */
+void
+list_link_init(list_node_t *link)
+{
+	link->list_next = NULL;
+	link->list_prev = NULL;
+}
+
+/*
+ * Return non-zero when the node is linked, judged by list_next alone.
+ * EQUIV presumably asserts that both links are NULL or both are
+ * non-NULL -- confirm against its definition.
+ */
+int
+list_link_active(list_node_t *link)
+{
+	EQUIV(link->list_next == NULL, link->list_prev == NULL);
+	return (link->list_next != NULL);
+}
+
+/* Function form of the list_empty() macro. */
+int
+list_is_empty(list_t *list)
+{
+	return (list_empty(list));
+}
diff --git a/module/os/freebsd/spl/sha224.h b/module/os/freebsd/spl/sha224.h
new file mode 100644
index 0000000000..0abd430687
--- /dev/null
+++ b/module/os/freebsd/spl/sha224.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA224_H_ +#define _SHA224_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA224_BLOCK_LENGTH 64 +#define SHA224_DIGEST_LENGTH 28 +#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA224Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA224_BLOCK_LENGTH]; +} SHA224_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA224_Init +#define SHA224_Init _libmd_SHA224_Init +#endif +#ifndef SHA224_Update +#define SHA224_Update _libmd_SHA224_Update +#endif +#ifndef SHA224_Final +#define SHA224_Final _libmd_SHA224_Final +#endif +#ifndef SHA224_End +#define SHA224_End _libmd_SHA224_End +#endif +#ifndef SHA224_Fd +#define SHA224_Fd _libmd_SHA224_Fd +#endif +#ifndef SHA224_FdChunk +#define SHA224_FdChunk _libmd_SHA224_FdChunk +#endif +#ifndef SHA224_File +#define SHA224_File _libmd_SHA224_File +#endif +#ifndef SHA224_FileChunk +#define SHA224_FileChunk 
_libmd_SHA224_FileChunk +#endif +#ifndef SHA224_Data +#define SHA224_Data _libmd_SHA224_Data +#endif + +#ifndef SHA224_version +#define SHA224_version _libmd_SHA224_version +#endif + +void SHA224_Init(SHA224_CTX *); +void SHA224_Update(SHA224_CTX *, const void *, size_t); +void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)], + SHA224_CTX *); +#ifndef _KERNEL +char *SHA224_End(SHA224_CTX *, char *); +char *SHA224_Data(const void *, unsigned int, char *); +char *SHA224_Fd(int, char *); +char *SHA224_FdChunk(int, char *, off_t, off_t); +char *SHA224_File(const char *, char *); +char *SHA224_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA224_H_ */ diff --git a/module/os/freebsd/spl/sha256.h b/module/os/freebsd/spl/sha256.h new file mode 100644 index 0000000000..193c0c0251 --- /dev/null +++ b/module/os/freebsd/spl/sha256.h @@ -0,0 +1,99 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA256_BLOCK_LENGTH 64 +#define SHA256_DIGEST_LENGTH 32 +#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA256Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA256_BLOCK_LENGTH]; +} SHA256_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA256_Init +#define SHA256_Init _libmd_SHA256_Init +#endif +#ifndef SHA256_Update +#define SHA256_Update _libmd_SHA256_Update +#endif +#ifndef SHA256_Final +#define SHA256_Final _libmd_SHA256_Final +#endif +#ifndef SHA256_End +#define SHA256_End _libmd_SHA256_End +#endif +#ifndef SHA256_Fd +#define SHA256_Fd _libmd_SHA256_Fd +#endif +#ifndef SHA256_FdChunk +#define SHA256_FdChunk _libmd_SHA256_FdChunk +#endif +#ifndef SHA256_File +#define SHA256_File _libmd_SHA256_File +#endif +#ifndef SHA256_FileChunk +#define SHA256_FileChunk _libmd_SHA256_FileChunk +#endif +#ifndef SHA256_Data +#define SHA256_Data _libmd_SHA256_Data +#endif + +#ifndef SHA256_Transform +#define SHA256_Transform _libmd_SHA256_Transform +#endif +#ifndef SHA256_version +#define SHA256_version _libmd_SHA256_version +#endif + +void SHA256_Init(SHA256_CTX *); +void SHA256_Update(SHA256_CTX *, const void *, size_t); +void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)], + SHA256_CTX *); +#ifndef _KERNEL +char 
*SHA256_End(SHA256_CTX *, char *); +char *SHA256_Data(const void *, unsigned int, char *); +char *SHA256_Fd(int, char *); +char *SHA256_FdChunk(int, char *, off_t, off_t); +char *SHA256_File(const char *, char *); +char *SHA256_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA256_H_ */ diff --git a/module/os/freebsd/spl/sha256c.c b/module/os/freebsd/spl/sha256c.c new file mode 100644 index 0000000000..241cf8c9ae --- /dev/null +++ b/module/os/freebsd/spl/sha256c.c @@ -0,0 +1,378 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + + +#include +#include +#include "sha224.h" +#include "sha256.h" + +#if BYTE_ORDER == BIG_ENDIAN + +/* Copy a vector of big-endian uint32_t into a vector of bytes */ +#define be32enc_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +/* Copy a vector of bytes into a vector of big-endian uint32_t */ +#define be32dec_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +#else /* BYTE_ORDER != BIG_ENDIAN */ + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. + */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +#endif /* BYTE_ORDER != BIG_ENDIAN */ + +/* SHA256 round constants. 
*/
+static const uint32_t K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/*
+ * Elementary functions used by SHA256.
+ *
+ * The macro arguments are not parenthesized and are expanded more than
+ * once, so pass only simple, side-effect-free operands (variables or
+ * array elements).  ROTR assumes a 32-bit operand.
+ */
+#define	Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define	Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define	SHR(x, n)	(x >> n)
+#define	ROTR(x, n)	((x >> n) | (x << (32 - n)))
+#define	S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define	S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define	s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define	s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/*
+ * SHA256 round function.  Updates d and h in place; k is the sum of the
+ * schedule word and round constant.  Expands to three statements, so it
+ * must not be used as the body of an unbraced if/else or loop.
+ */
+#define	RND(a, b, c, d, e, f, g, h, k)	\
+	h += S1(e) + Ch(e, f, g) + k;	\
+	d += h;				\
+	h += S0(a) + Maj(a, b, c);
+
+/*
+ * Adjusted round function for rotating state: rather than shifting the
+ * eight working variables after every round, the index of each role in
+ * S[] moves back by one (mod 8) per round i.  ii is the offset of the
+ * current group of rounds, so the round number is i + ii.
+ */
+#define	RNDr(S, W, i, ii)			\
+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
+	    W[i + ii] + K[i + ii])
+
+/*
+ * Message schedule computation: with t = i + ii,
+ * W[t + 16] = s1(W[t + 14]) + W[t + 9] + s0(W[t + 1]) + W[t].
+ */
+#define	MSCH(W, ii, i)				\
+	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] +	\
+	    s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t *state, const unsigned char block[64])
+{
+	uint32_t W[64];
+	uint32_t S[8];
+	int i;
+
+	/* 1. Prepare the first part of the message schedule W. */
+	/* Decodes the 64 input bytes into W[0..15]. */
+	be32dec_vect(W, block, 64);
+
+	/* 2. Initialize working variables. */
+	/* 32 bytes == the eight 32-bit state words. */
+	memcpy(S, state, 32);
+
+	/* 3. Mix. */
+	/*
+	 * Four passes of 16 rounds each.  After every pass but the last,
+	 * MSCH extends the schedule by the 16 words the next pass consumes
+	 * (W[i + 16] .. W[i + 31]).
+	 */
+	for (i = 0; i < 64; i += 16) {
+		RNDr(S, W, 0, i);
+		RNDr(S, W, 1, i);
+		RNDr(S, W, 2, i);
+		RNDr(S, W, 3, i);
+		RNDr(S, W, 4, i);
+		RNDr(S, W, 5, i);
+		RNDr(S, W, 6, i);
+		RNDr(S, W, 7, i);
+		RNDr(S, W, 8, i);
+		RNDr(S, W, 9, i);
+		RNDr(S, W, 10, i);
+		RNDr(S, W, 11, i);
+		RNDr(S, W, 12, i);
+		RNDr(S, W, 13, i);
+		RNDr(S, W, 14, i);
+		RNDr(S, W, 15, i);
+
+		/* Rounds 48..63 were the last; no more schedule is needed. */
+		if (i == 48)
+			break;
+		MSCH(W, 0, i);
+		MSCH(W, 1, i);
+		MSCH(W, 2, i);
+		MSCH(W, 3, i);
+		MSCH(W, 4, i);
+		MSCH(W, 5, i);
+		MSCH(W, 6, i);
+		MSCH(W, 7, i);
+		MSCH(W, 8, i);
+		MSCH(W, 9, i);
+		MSCH(W, 10, i);
+		MSCH(W, 11, i);
+		MSCH(W, 12, i);
+		MSCH(W, 13, i);
+		MSCH(W, 14, i);
+		MSCH(W, 15, i);
+	}
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+}
+
+/*
+ * Padding source: a single 0x80 marker byte followed by zeroes.
+ * SHA256_Pad() copies a prefix of it into the context buffer.
+ */
+static unsigned char PAD[64] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * Add padding and terminating bit-count.
+ *
+ * The last 8 bytes of the final block (buf[56..63]) hold ctx->count,
+ * the message length in bits, encoded big-endian.
+ */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+	size_t r;
+
+	/* Figure out how many bytes we have buffered. */
+	/* count is in bits: >> 3 gives bytes, & 0x3f reduces modulo 64. */
+	r = (ctx->count >> 3) & 0x3f;
+
+	/* Pad to 56 mod 64, transforming if we finish a block en route. */
+	if (r < 56) {
+		/* Pad to 56 mod 64. */
+		memcpy(&ctx->buf[r], PAD, 56 - r);
+	} else {
+		/* Finish the current block and mix. */
+		memcpy(&ctx->buf[r], PAD, 64 - r);
+		SHA256_Transform(ctx->state, ctx->buf);
+
+		/* The start of the final block is all zeroes. */
+		memset(&ctx->buf[0], 0, 56);
+	}
+
+	/* Add the terminating bit-count. */
+	be64enc(&ctx->buf[56], ctx->count);
+
+	/* Mix in the final block.
*/
+	SHA256_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-256 initialization. Begins a SHA-256 operation. */
+void
+SHA256_Init(SHA256_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x6A09E667;
+	ctx->state[1] = 0xBB67AE85;
+	ctx->state[2] = 0x3C6EF372;
+	ctx->state[3] = 0xA54FF53A;
+	ctx->state[4] = 0x510E527F;
+	ctx->state[5] = 0x9B05688C;
+	ctx->state[6] = 0x1F83D9AB;
+	ctx->state[7] = 0x5BE0CD19;
+}
+
+/*
+ * Add bytes into the hash.
+ *
+ * Input is consumed in 64-byte blocks; a trailing partial block stays
+ * in ctx->buf until a later update or the final padding completes it.
+ */
+void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+	uint64_t bitlen;
+	uint32_t r;
+	const unsigned char *src = in;
+
+	/* Number of bytes left in the buffer from previous updates */
+	r = (ctx->count >> 3) & 0x3f;
+
+	/*
+	 * Convert the length into a number of bits.
+	 *
+	 * NOTE(review): the shift is evaluated in size_t before widening to
+	 * uint64_t, so where size_t is 32 bits the top three bits of len
+	 * would be lost -- confirm whether such lengths can occur.
+	 */
+	bitlen = len << 3;
+
+	/* Update number of bits */
+	ctx->count += bitlen;
+
+	/* Handle the case where we don't need to perform any transforms */
+	if (len < 64 - r) {
+		memcpy(&ctx->buf[r], src, len);
+		return;
+	}
+
+	/* Finish the current block */
+	memcpy(&ctx->buf[r], src, 64 - r);
+	SHA256_Transform(ctx->state, ctx->buf);
+	src += 64 - r;
+	len -= 64 - r;
+
+	/* Perform complete blocks */
+	/* These are hashed straight from the caller's buffer, uncopied. */
+	while (len >= 64) {
+		SHA256_Transform(ctx->state, src);
+		src += 64;
+		len -= 64;
+	}
+
+	/* Copy left over data into buffer */
+	memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ *
+ * digest must have room for SHA256_DIGEST_LENGTH bytes.  The context is
+ * zeroed on return, so it needs SHA256_Init() again before reuse.
+ */
+void
+SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA256_Pad(ctx);
+
+	/* Write the hash */
+	be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-224: ******************************************************* */
+/*
+ * the SHA224 and SHA256 transforms are identical
+ */
+
+/* SHA-224 initialization. Begins a SHA-224 operation.
*/
+void
+SHA224_Init(SHA224_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0xC1059ED8;
+	ctx->state[1] = 0x367CD507;
+	ctx->state[2] = 0x3070DD17;
+	ctx->state[3] = 0xF70E5939;
+	ctx->state[4] = 0xFFC00B31;
+	ctx->state[5] = 0x68581511;
+	ctx->state[6] = 0x64f98FA7;
+	ctx->state[7] = 0xBEFA4FA4;
+}
+
+/*
+ * Add bytes into the SHA-224 hash.
+ *
+ * Delegates to SHA256_Update(); the cast relies on SHA224_CTX and
+ * SHA256_CTX declaring the same members in the same order (see sha224.h
+ * and sha256.h).
+ */
+void
+SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA256_Update((SHA256_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-224 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ *
+ * Only SHA224_DIGEST_LENGTH bytes of the state are written to digest.
+ * The context is zeroed on return.
+ */
+void
+SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA256_Pad((SHA256_CTX *)ctx);
+
+	/* Write the hash */
+	be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#ifdef WEAK_REFS
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA256_Init
+__weak_reference(_libmd_SHA256_Init, SHA256_Init);
+#undef SHA256_Update
+__weak_reference(_libmd_SHA256_Update, SHA256_Update);
+#undef SHA256_Final
+__weak_reference(_libmd_SHA256_Final, SHA256_Final);
+#undef SHA256_Transform
+__weak_reference(_libmd_SHA256_Transform, SHA256_Transform);
+
+#undef SHA224_Init
+__weak_reference(_libmd_SHA224_Init, SHA224_Init);
+#undef SHA224_Update
+__weak_reference(_libmd_SHA224_Update, SHA224_Update);
+#undef SHA224_Final
+__weak_reference(_libmd_SHA224_Final, SHA224_Final);
+#endif
diff --git a/module/os/freebsd/spl/sha384.h b/module/os/freebsd/spl/sha384.h
new file mode 100644
index 0000000000..67250cee03
--- /dev/null
+++ b/module/os/freebsd/spl/sha384.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SHA384_H_ +#define _SHA384_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA384_BLOCK_LENGTH 128 +#define SHA384_DIGEST_LENGTH 48 +#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA384Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA384_BLOCK_LENGTH]; +} SHA384_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA384_Init +#define SHA384_Init _libmd_SHA384_Init +#endif +#ifndef SHA384_Update +#define SHA384_Update _libmd_SHA384_Update +#endif +#ifndef SHA384_Final +#define SHA384_Final _libmd_SHA384_Final +#endif +#ifndef SHA384_End +#define SHA384_End _libmd_SHA384_End +#endif +#ifndef SHA384_Fd +#define SHA384_Fd _libmd_SHA384_Fd +#endif +#ifndef SHA384_FdChunk +#define SHA384_FdChunk _libmd_SHA384_FdChunk +#endif +#ifndef SHA384_File +#define SHA384_File _libmd_SHA384_File +#endif +#ifndef SHA384_FileChunk +#define SHA384_FileChunk _libmd_SHA384_FileChunk +#endif +#ifndef SHA384_Data +#define SHA384_Data _libmd_SHA384_Data +#endif + +#ifndef SHA384_version +#define SHA384_version _libmd_SHA384_version +#endif + +void SHA384_Init(SHA384_CTX *); +void SHA384_Update(SHA384_CTX *, const void *, size_t); +void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)], + SHA384_CTX *); +#ifndef _KERNEL +char *SHA384_End(SHA384_CTX *, char *); +char *SHA384_Data(const void *, unsigned int, char *); +char *SHA384_Fd(int, char *); +char *SHA384_FdChunk(int, char *, off_t, off_t); +char *SHA384_File(const char *, char *); +char *SHA384_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA384_H_ */ diff --git a/module/os/freebsd/spl/sha512.h b/module/os/freebsd/spl/sha512.h new file mode 100644 index 0000000000..b6fb733ca5 --- /dev/null +++ b/module/os/freebsd/spl/sha512.h @@ -0,0 +1,101 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SHA512_H_ +#define _SHA512_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA512_BLOCK_LENGTH 128 +#define SHA512_DIGEST_LENGTH 64 +#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA512Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA512_BLOCK_LENGTH]; +} SHA512_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#if 0 +#ifndef SHA512_Init +#define SHA512_Init _libmd_SHA512_Init +#endif +#ifndef SHA512_Update +#define SHA512_Update _libmd_SHA512_Update +#endif +#ifndef SHA512_Final +#define SHA512_Final _libmd_SHA512_Final +#endif +#endif +#ifndef SHA512_End +#define SHA512_End _libmd_SHA512_End +#endif +#ifndef SHA512_Fd +#define SHA512_Fd _libmd_SHA512_Fd +#endif +#ifndef SHA512_FdChunk +#define SHA512_FdChunk _libmd_SHA512_FdChunk +#endif +#ifndef SHA512_File +#define SHA512_File _libmd_SHA512_File +#endif +#ifndef SHA512_FileChunk +#define SHA512_FileChunk _libmd_SHA512_FileChunk +#endif +#ifndef SHA512_Data +#define SHA512_Data _libmd_SHA512_Data +#endif + +#ifndef SHA512_Transform +#define SHA512_Transform _libmd_SHA512_Transform +#endif +#ifndef SHA512_version +#define SHA512_version _libmd_SHA512_version +#endif + +void SHA512_Init(SHA512_CTX *); +void SHA512_Update(SHA512_CTX *, const void *, size_t); +void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_End(SHA512_CTX *, char *); +char *SHA512_Data(const void *, unsigned int, char *); +char *SHA512_Fd(int, char *); +char *SHA512_FdChunk(int, char *, off_t, off_t); +char *SHA512_File(const char *, char *); +char *SHA512_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512_H_ */ diff --git a/module/os/freebsd/spl/sha512c.c b/module/os/freebsd/spl/sha512c.c new file mode 100644 index 0000000000..146f338f0e --- /dev/null +++ b/module/os/freebsd/spl/sha512c.c @@ -0,0 +1,508 @@ +/* + * 
Copyright 2005 Colin Percival + * Copyright (c) 2015 Allan Jude + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include "sha512.h" +#include "sha512t.h" +#include "sha384.h" + +#if BYTE_ORDER == BIG_ENDIAN + +/* Copy a vector of big-endian uint64_t into a vector of bytes */ +#define be64enc_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +/* Copy a vector of bytes into a vector of big-endian uint64_t */ +#define be64dec_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +#else /* BYTE_ORDER != BIG_ENDIAN */ + +/* + * Encode a length (len + 7) / 8 vector of (uint64_t) into a length len + * vector of (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 8; i++) + be64enc(dst + i * 8, src[i]); + /* 4-byte tail (28-byte SHA-512/224 digest): emit high 32 bits. */ + if (len % 8 == 4) + be32enc(dst + i * 8, src[i] >> 32); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/8 vector of (uint64_t). Assumes len is a multiple of 8. + */ +static void +be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 8; i++) + dst[i] = be64dec(src + i * 8); +} + +#endif /* BYTE_ORDER != BIG_ENDIAN */ + +/* SHA512 round constants. 
*/ +static const uint64_t K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, + 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL +}; + +/* Elementary functions used by 
SHA512 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (64 - n))) +#define S0(x) (ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39)) +#define S1(x) (ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41)) +#define s0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7)) +#define s1(x) (ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6)) + +/* SHA512 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(80 - i) % 8], S[(81 - i) % 8], \ + S[(82 - i) % 8], S[(83 - i) % 8], \ + S[(84 - i) % 8], S[(85 - i) % 8], \ + S[(86 - i) % 8], S[(87 - i) % 8], \ + W[i + ii] + K[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \ + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA512 block compression function. The 512-bit state is transformed via + * the 1024-bit input block to produce a new state. + */ +static void +SHA512_Transform(uint64_t *state, + const unsigned char block[SHA512_BLOCK_LENGTH]) +{ + uint64_t W[80]; + uint64_t S[8]; + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be64dec_vect(W, block, SHA512_BLOCK_LENGTH); + + /* 2. Initialize working variables. */ + memcpy(S, state, SHA512_DIGEST_LENGTH); + + /* 3. Mix. 
*/ + for (i = 0; i < 80; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 64) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static unsigned char PAD[SHA512_BLOCK_LENGTH] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA512_Pad(SHA512_CTX * ctx) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count[1] >> 3) & 0x7f; + + /* Pad to 112 mod 128, transforming if we finish a block en route. */ + if (r < 112) { + /* Pad to 112 mod 128. */ + memcpy(&ctx->buf[r], PAD, 112 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 128 - r); + SHA512_Transform(ctx->state, ctx->buf); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 112); + } + + /* Add the terminating bit-count. */ + be64enc_vect(&ctx->buf[112], ctx->count, 16); + + /* Mix in the final block. 
*/ + SHA512_Transform(ctx->state, ctx->buf); +} + +/* SHA-512 initialization. Begins a SHA-512 operation. */ +void +SHA512_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6a09e667f3bcc908ULL; + ctx->state[1] = 0xbb67ae8584caa73bULL; + ctx->state[2] = 0x3c6ef372fe94f82bULL; + ctx->state[3] = 0xa54ff53a5f1d36f1ULL; + ctx->state[4] = 0x510e527fade682d1ULL; + ctx->state[5] = 0x9b05688c2b3e6c1fULL; + ctx->state[6] = 0x1f83d9abfb41bd6bULL; + ctx->state[7] = 0x5be0cd19137e2179ULL; +} + +/* Add bytes into the hash */ +void +SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + uint64_t bitlen[2]; + uint64_t r; + const unsigned char *src = in; + + /* Number of bytes already buffered from previous updates */ + r = (ctx->count[1] >> 3) & 0x7f; + + /* Convert the length into a 128-bit bit count (bitlen[0] is high) */ + bitlen[1] = ((uint64_t)len) << 3; + bitlen[0] = ((uint64_t)len) >> 61; + + /* Update number of bits, carrying into the high word count[0] */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < SHA512_BLOCK_LENGTH - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r); + SHA512_Transform(ctx->state, ctx->buf); + src += SHA512_BLOCK_LENGTH - r; + len -= SHA512_BLOCK_LENGTH - r; + + /* Perform complete blocks */ + while (len >= SHA512_BLOCK_LENGTH) { + SHA512_Transform(ctx->state, src); + src += SHA512_BLOCK_LENGTH; + len -= SHA512_BLOCK_LENGTH; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-512 finalization. Pads the input data, exports the hash value, + * and clears the context state. 
+ */ +void +SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* SHA-512t: ******************************************************** */ +/* + * the SHA512t transforms are identical to SHA512 so reuse the existing function + */ +void +SHA512_224_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x8c3d37c819544da2ULL; + ctx->state[1] = 0x73e1996689dcd4d6ULL; + ctx->state[2] = 0x1dfab7ae32ff9c82ULL; + ctx->state[3] = 0x679dd514582f9fcfULL; + ctx->state[4] = 0x0f6d2b697bd44da8ULL; + ctx->state[5] = 0x77e36f7304c48942ULL; + ctx->state[6] = 0x3f9d85a86a1d36c8ULL; + ctx->state[7] = 0x1112e6ad91d692a1ULL; +} + +void +SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update(ctx, in, len); +} + +void +SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH], + SHA512_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +void +SHA512_256_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x22312194fc2bf72cULL; + ctx->state[1] = 0x9f555fa3c84c64c2ULL; + ctx->state[2] = 0x2393b86b6f53b151ULL; + ctx->state[3] = 0x963877195940eabdULL; + ctx->state[4] = 0x96283ee2a88effe3ULL; + ctx->state[5] = 0xbe5e1e2553863992ULL; + ctx->state[6] = 0x2b0199fc2c85b8aaULL; + ctx->state[7] = 0x0eb72ddc81c52ca2ULL; +} + +void +SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update(ctx, in, len); +} + +void 
+SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH], + SHA512_CTX * ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* ** SHA-384: ******************************************************** */ +/* + * the SHA384 and SHA512 transforms are identical, so SHA384 reuses SHA512's + */ + +/* SHA-384 initialization. Begins a SHA-384 operation. */ +void +SHA384_Init(SHA384_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0xcbbb9d5dc1059ed8ULL; + ctx->state[1] = 0x629a292a367cd507ULL; + ctx->state[2] = 0x9159015a3070dd17ULL; + ctx->state[3] = 0x152fecd8f70e5939ULL; + ctx->state[4] = 0x67332667ffc00b31ULL; + ctx->state[5] = 0x8eb44a8768581511ULL; + ctx->state[6] = 0xdb0c2e0d64f98fa7ULL; + ctx->state[7] = 0x47b5481dbefa4fa4ULL; +} + +/* Add bytes into the SHA-384 hash */ +void +SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update((SHA512_CTX *)ctx, in, len); +} + +/* + * SHA-384 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad((SHA512_CTX *)ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +#if 0 +/* + * When building libmd, provide weak references. Note: this is not + * activated in the context of compiling these sources for internal + * use in libcrypt. 
+ */ +#undef SHA512_Init +__weak_reference(_libmd_SHA512_Init, SHA512_Init); +#undef SHA512_Update +__weak_reference(_libmd_SHA512_Update, SHA512_Update); +#undef SHA512_Final +__weak_reference(_libmd_SHA512_Final, SHA512_Final); +#undef SHA512_Transform +__weak_reference(_libmd_SHA512_Transform, SHA512_Transform); + +#undef SHA512_224_Init +__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init); +#undef SHA512_224_Update +__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update); +#undef SHA512_224_Final +__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final); + +#undef SHA512_256_Init +__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init); +#undef SHA512_256_Update +__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update); +#undef SHA512_256_Final +__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final); + +#undef SHA384_Init +__weak_reference(_libmd_SHA384_Init, SHA384_Init); +#undef SHA384_Update +__weak_reference(_libmd_SHA384_Update, SHA384_Update); +#undef SHA384_Final +__weak_reference(_libmd_SHA384_Final, SHA384_Final); +#endif diff --git a/module/os/freebsd/spl/sha512t.h b/module/os/freebsd/spl/sha512t.h new file mode 100644 index 0000000000..703867fc02 --- /dev/null +++ b/module/os/freebsd/spl/sha512t.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 Allan Jude + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA512T_H_ +#define _SHA512T_H_ + +#include "sha512.h" + +#ifndef _KERNEL +#include +#endif + +#define SHA512_224_DIGEST_LENGTH 28 +#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1) +#define SHA512_256_DIGEST_LENGTH 32 +#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1) + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA512_224_Init +#define SHA512_224_Init _libmd_SHA512_224_Init +#endif +#ifndef SHA512_224_Update +#define SHA512_224_Update _libmd_SHA512_224_Update +#endif +#ifndef SHA512_224_Final +#define SHA512_224_Final _libmd_SHA512_224_Final +#endif +#ifndef SHA512_224_End +#define SHA512_224_End _libmd_SHA512_224_End +#endif +#ifndef SHA512_224_Fd +#define SHA512_224_Fd _libmd_SHA512_224_Fd +#endif +#ifndef SHA512_224_FdChunk +#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk +#endif +#ifndef SHA512_224_File +#define SHA512_224_File _libmd_SHA512_224_File +#endif +#ifndef SHA512_224_FileChunk +#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk +#endif +#ifndef SHA512_224_Data +#define SHA512_224_Data _libmd_SHA512_224_Data +#endif + +#ifndef SHA512_224_Transform 
+#define SHA512_224_Transform _libmd_SHA512_224_Transform +#endif +#ifndef SHA512_224_version +#define SHA512_224_version _libmd_SHA512_224_version +#endif + +#ifndef SHA512_256_Init +#define SHA512_256_Init _libmd_SHA512_256_Init +#endif +#ifndef SHA512_256_Update +#define SHA512_256_Update _libmd_SHA512_256_Update +#endif +#ifndef SHA512_256_Final +#define SHA512_256_Final _libmd_SHA512_256_Final +#endif +#ifndef SHA512_256_End +#define SHA512_256_End _libmd_SHA512_256_End +#endif +#ifndef SHA512_256_Fd +#define SHA512_256_Fd _libmd_SHA512_256_Fd +#endif +#ifndef SHA512_256_FdChunk +#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk +#endif +#ifndef SHA512_256_File +#define SHA512_256_File _libmd_SHA512_256_File +#endif +#ifndef SHA512_256_FileChunk +#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk +#endif +#ifndef SHA512_256_Data +#define SHA512_256_Data _libmd_SHA512_256_Data +#endif + +#ifndef SHA512_256_Transform +#define SHA512_256_Transform _libmd_SHA512_256_Transform +#endif +#ifndef SHA512_256_version +#define SHA512_256_version _libmd_SHA512_256_version +#endif + +void SHA512_224_Init(SHA512_CTX *); +void SHA512_224_Update(SHA512_CTX *, const void *, size_t); +void SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_224_End(SHA512_CTX *, char *); +char *SHA512_224_Data(const void *, unsigned int, char *); +char *SHA512_224_Fd(int, char *); +char *SHA512_224_FdChunk(int, char *, off_t, off_t); +char *SHA512_224_File(const char *, char *); +char *SHA512_224_FileChunk(const char *, char *, off_t, off_t); +#endif +void SHA512_256_Init(SHA512_CTX *); +void SHA512_256_Update(SHA512_CTX *, const void *, size_t); +void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_256_End(SHA512_CTX *, char *); +char *SHA512_256_Data(const void *, unsigned int, char *); +char *SHA512_256_Fd(int, char *); +char 
*SHA512_256_FdChunk(int, char *, off_t, off_t); +char *SHA512_256_File(const char *, char *); +char *SHA512_256_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512T_H_ */ diff --git a/module/os/freebsd/spl/spl_acl.c b/module/os/freebsd/spl/spl_acl.c new file mode 100644 index 0000000000..74c26d03f8 --- /dev/null +++ b/module/os/freebsd/spl/spl_acl.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2008, 2009 Edward Tomasz Napierała + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +struct zfs2bsd { + uint32_t zb_zfs; + int zb_bsd; +}; + +struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA}, + {ACE_WRITE_DATA, ACL_WRITE_DATA}, + {ACE_EXECUTE, ACL_EXECUTE}, + {ACE_APPEND_DATA, ACL_APPEND_DATA}, + {ACE_DELETE_CHILD, ACL_DELETE_CHILD}, + {ACE_DELETE, ACL_DELETE}, + {ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES}, + {ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES}, + {ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS}, + {ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS}, + {ACE_READ_ACL, ACL_READ_ACL}, + {ACE_WRITE_ACL, ACL_WRITE_ACL}, + {ACE_WRITE_OWNER, ACL_WRITE_OWNER}, + {ACE_SYNCHRONIZE, ACL_SYNCHRONIZE}, + {0, 0}}; + +struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE, + ACL_ENTRY_FILE_INHERIT}, + {ACE_DIRECTORY_INHERIT_ACE, + ACL_ENTRY_DIRECTORY_INHERIT}, + {ACE_NO_PROPAGATE_INHERIT_ACE, + ACL_ENTRY_NO_PROPAGATE_INHERIT}, + {ACE_INHERIT_ONLY_ACE, + ACL_ENTRY_INHERIT_ONLY}, + {ACE_INHERITED_ACE, + ACL_ENTRY_INHERITED}, + {ACE_SUCCESSFUL_ACCESS_ACE_FLAG, + ACL_ENTRY_SUCCESSFUL_ACCESS}, + {ACE_FAILED_ACCESS_ACE_FLAG, + ACL_ENTRY_FAILED_ACCESS}, + {0, 0}}; + +static int +_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table) +{ + const struct zfs2bsd *tmp; + int bsd = 0; + + for (tmp = table; tmp->zb_zfs != 0; tmp++) { + if (zfs & tmp->zb_zfs) + bsd |= tmp->zb_bsd; + } + + return (bsd); +} + +static uint32_t +_zfs_from_bsd(int bsd, const struct zfs2bsd *table) +{ + const struct zfs2bsd *tmp; + uint32_t zfs = 0; + + for (tmp = table; tmp->zb_bsd != 0; tmp++) { + if (bsd & tmp->zb_bsd) + zfs |= tmp->zb_zfs; + } + + return (zfs); +} + +int +acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries) +{ + int i; + struct acl_entry *entry; + const ace_t *ace; + + if (nentries < 1) { + printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n"); + return (EINVAL); + } + + if (nentries > ACL_MAX_ENTRIES) { + /* + * I believe it may happen 
only when moving a pool + * from SunOS to FreeBSD. + */ + printf("acl_from_aces: ZFS ACL too big to fit " + "into 'struct acl'; returning EINVAL.\n"); + return (EINVAL); + } + + bzero(aclp, sizeof (*aclp)); + aclp->acl_maxcnt = ACL_MAX_ENTRIES; + aclp->acl_cnt = nentries; + + for (i = 0; i < nentries; i++) { + entry = &(aclp->acl_entry[i]); + ace = &(aces[i]); + + if (ace->a_flags & ACE_OWNER) + entry->ae_tag = ACL_USER_OBJ; + else if (ace->a_flags & ACE_GROUP) + entry->ae_tag = ACL_GROUP_OBJ; + else if (ace->a_flags & ACE_EVERYONE) + entry->ae_tag = ACL_EVERYONE; + else if (ace->a_flags & ACE_IDENTIFIER_GROUP) + entry->ae_tag = ACL_GROUP; + else + entry->ae_tag = ACL_USER; + + if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP) + entry->ae_id = ace->a_who; + else + entry->ae_id = ACL_UNDEFINED_ID; + + entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms); + entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags); + + switch (ace->a_type) { + case ACE_ACCESS_ALLOWED_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW; + break; + case ACE_ACCESS_DENIED_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_DENY; + break; + case ACE_SYSTEM_AUDIT_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT; + break; + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM; + break; + default: + panic("acl_from_aces: a_type is 0x%x", ace->a_type); + } + } + + return (0); +} + +void +aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp) +{ + int i; + const struct acl_entry *entry; + ace_t *ace; + + bzero(aces, sizeof (*aces) * aclp->acl_cnt); + + *nentries = aclp->acl_cnt; + + for (i = 0; i < aclp->acl_cnt; i++) { + entry = &(aclp->acl_entry[i]); + ace = &(aces[i]); + + ace->a_who = entry->ae_id; + + if (entry->ae_tag == ACL_USER_OBJ) + ace->a_flags = ACE_OWNER; + else if (entry->ae_tag == ACL_GROUP_OBJ) + ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP); + else if (entry->ae_tag == ACL_GROUP) + ace->a_flags = 
ACE_IDENTIFIER_GROUP; + else if (entry->ae_tag == ACL_EVERYONE) + ace->a_flags = ACE_EVERYONE; + else /* ACL_USER */ + ace->a_flags = 0; + + ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms); + ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags); + + switch (entry->ae_entry_type) { + case ACL_ENTRY_TYPE_ALLOW: + ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_DENY: + ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_ALARM: + ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_AUDIT: + ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE; + break; + default: + panic("aces_from_acl: ae_entry_type is 0x%x", + entry->ae_entry_type); + } + } +} diff --git a/module/os/freebsd/spl/spl_atomic.c b/module/os/freebsd/spl/spl_atomic.c new file mode 100644 index 0000000000..80040fc6a3 --- /dev/null +++ b/module/os/freebsd/spl/spl_atomic.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#if !defined(__LP64__) && !defined(__mips_n32) && \ + !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \ + !defined(HAS_EMULATED_ATOMIC64) + +#ifdef _KERNEL +#include + +struct mtx atomic_mtx; +MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF); +#else +#include + +#define mtx_lock(lock) pthread_mutex_lock(lock) +#define mtx_unlock(lock) pthread_mutex_unlock(lock) + +static pthread_mutex_t atomic_mtx; + +static __attribute__((constructor)) void +atomic_init(void) +{ + pthread_mutex_init(&atomic_mtx, NULL); +} +#endif + +void +atomic_add_64(volatile uint64_t *target, int64_t delta) +{ + + mtx_lock(&atomic_mtx); + *target += delta; + mtx_unlock(&atomic_mtx); +} + +void +atomic_dec_64(volatile uint64_t *target) +{ + + mtx_lock(&atomic_mtx); + *target -= 1; + mtx_unlock(&atomic_mtx); +} + +uint64_t +atomic_swap_64(volatile uint64_t *a, uint64_t value) +{ + uint64_t ret; + + mtx_lock(&atomic_mtx); + ret = *a; + *a = value; + mtx_unlock(&atomic_mtx); + return (ret); +} + +uint64_t +atomic_load_64(volatile uint64_t *a) +{ + uint64_t ret; + + mtx_lock(&atomic_mtx); + ret = *a; + mtx_unlock(&atomic_mtx); + return (ret); +} + +uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + uint64_t newval; + + mtx_lock(&atomic_mtx); + newval = (*target += delta); + mtx_unlock(&atomic_mtx); + return (newval); +} + +uint64_t 
+atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval) +{ + uint64_t oldval; + + mtx_lock(&atomic_mtx); + oldval = *target; + if (oldval == cmp) + *target = newval; + mtx_unlock(&atomic_mtx); + return (oldval); +} +#endif diff --git a/module/os/freebsd/spl/spl_cmn_err.c b/module/os/freebsd/spl/spl_cmn_err.c new file mode 100644 index 0000000000..22c7338b73 --- /dev/null +++ b/module/os/freebsd/spl/spl_cmn_err.c @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2007 John Birrell . All rights reserved. + * Copyright 2012 Martin Matuska . All rights reserved. 
+ */ + +#include +#include +#include +#include + +void +vcmn_err(int ce, const char *fmt, va_list adx) +{ + char buf[256]; + const char *prefix; + + prefix = NULL; /* silence unwitty compilers */ + switch (ce) { + case CE_CONT: + prefix = "Solaris(cont): "; + break; + case CE_NOTE: + prefix = "Solaris: NOTICE: "; + break; + case CE_WARN: + prefix = "Solaris: WARNING: "; + break; + case CE_PANIC: + prefix = "Solaris(panic): "; + break; + case CE_IGNORE: + break; + default: + panic("Solaris: unknown severity level"); + } + if (ce == CE_PANIC) { + vsnprintf(buf, sizeof (buf), fmt, adx); + panic("%s%s", prefix, buf); + } + if (ce != CE_IGNORE) { + printf("%s", prefix); + vprintf(fmt, adx); + printf("\n"); + } +} + +void +cmn_err(int type, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(type, fmt, ap); + va_end(ap); +} diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c new file mode 100644 index 0000000000..6b2872bcc0 --- /dev/null +++ b/module/os/freebsd/spl/spl_dtrace.c @@ -0,0 +1,38 @@ +/* + * Copyright 2014 The FreeBSD Project. + * All rights reserved. + * + * This software was developed by Steven Hartland. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +/* CSTYLED */ +SDT_PROBE_DEFINE1(sdt, , , set__error, "int"); diff --git a/module/os/freebsd/spl/spl_kmem.c b/module/os/freebsd/spl/spl_kmem.c new file mode 100644 index 0000000000..ee8f1d851a --- /dev/null +++ b/module/os/freebsd/spl/spl_kmem.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include + +#ifdef KMEM_DEBUG +#include +#include +#endif + +#ifdef _KERNEL +MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris"); +#else +#define malloc(size, type, flags) malloc(size) +#define free(addr, type) free(addr) +#endif + +#ifdef KMEM_DEBUG +struct kmem_item { + struct stack stack; + LIST_ENTRY(kmem_item) next; +}; +static LIST_HEAD(, kmem_item) kmem_items; +static struct mtx kmem_items_mtx; +MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF); +#endif /* KMEM_DEBUG */ + +#include + +void * +zfs_kmem_alloc(size_t size, int kmflags) +{ + void *p; +#ifdef KMEM_DEBUG + struct kmem_item *i; + + size += sizeof (struct kmem_item); +#endif + p = malloc(MAX(size, 16), M_SOLARIS, kmflags); +#ifndef _KERNEL + if (kmflags & KM_SLEEP) + assert(p != NULL); +#endif +#ifdef KMEM_DEBUG + if (p != NULL) { + i = p; + p = (uint8_t *)p + sizeof (struct kmem_item); + stack_save(&i->stack); + mtx_lock(&kmem_items_mtx); + LIST_INSERT_HEAD(&kmem_items, i, next); + mtx_unlock(&kmem_items_mtx); + } +#endif + return (p); +} + +void +zfs_kmem_free(void *buf, size_t size __unused) +{ +#ifdef KMEM_DEBUG + if (buf == NULL) { + printf("%s: attempt to free NULL\n", __func__); + return; + } + struct kmem_item *i; + + buf = (uint8_t *)buf - sizeof (struct kmem_item); + 
mtx_lock(&kmem_items_mtx); + LIST_FOREACH(i, &kmem_items, next) { + if (i == buf) + break; + } + ASSERT3P(i, !=, NULL); + LIST_REMOVE(i, next); + mtx_unlock(&kmem_items_mtx); + memset(buf, 0xDC, MAX(size, 16)); +#endif + free(buf, M_SOLARIS); +} + +static uint64_t kmem_size_val; + +static void +kmem_size_init(void *unused __unused) +{ + + kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE; + if (kmem_size_val > vm_kmem_size) + kmem_size_val = vm_kmem_size; +} +SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL); + +uint64_t +kmem_size(void) +{ + + return (kmem_size_val); +} + +static int +kmem_std_constructor(void *mem, int size __unused, void *private, int flags) +{ + struct kmem_cache *cache = private; + + return (cache->kc_constructor(mem, cache->kc_private, flags)); +} + +static void +kmem_std_destructor(void *mem, int size __unused, void *private) +{ + struct kmem_cache *cache = private; + + cache->kc_destructor(mem, cache->kc_private); +} + +kmem_cache_t * +kmem_cache_create(char *name, size_t bufsize, size_t align, + int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), + void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags) +{ + kmem_cache_t *cache; + + ASSERT3P(vmp, ==, NULL); + + cache = kmem_alloc(sizeof (*cache), KM_SLEEP); + strlcpy(cache->kc_name, name, sizeof (cache->kc_name)); + cache->kc_constructor = constructor; + cache->kc_destructor = destructor; + cache->kc_private = private; +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + cache->kc_zone = uma_zcreate(cache->kc_name, bufsize, + constructor != NULL ? kmem_std_constructor : NULL, + destructor != NULL ? kmem_std_destructor : NULL, + NULL, NULL, align > 0 ? 
align - 1 : 0, cflags); +#else + cache->kc_size = bufsize; +#endif + + return (cache); +} + +void +kmem_cache_destroy(kmem_cache_t *cache) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + uma_zdestroy(cache->kc_zone); +#endif + kmem_free(cache, sizeof (*cache)); +} + +void * +kmem_cache_alloc(kmem_cache_t *cache, int flags) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + return (uma_zalloc_arg(cache->kc_zone, cache, flags)); +#else + void *p; + + p = kmem_alloc(cache->kc_size, flags); + if (p != NULL && cache->kc_constructor != NULL) + kmem_std_constructor(p, cache->kc_size, cache, flags); + return (p); +#endif +} + +void +kmem_cache_free(kmem_cache_t *cache, void *buf) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + uma_zfree_arg(cache->kc_zone, buf, cache); +#else + if (cache->kc_destructor != NULL) + kmem_std_destructor(buf, cache->kc_size, cache); + kmem_free(buf, cache->kc_size); +#endif +} + +/* + * Allow our caller to determine if there are running reaps. + * + * This call is very conservative and may return B_TRUE even when + * reaping activity isn't active. If it returns B_FALSE, then reaping + * activity is definitely inactive. + */ +boolean_t +kmem_cache_reap_active(void) +{ + + return (B_FALSE); +} + +/* + * Reap (almost) everything soon. + * + * Note: this does not wait for the reap-tasks to complete. Caller + * should use kmem_cache_reap_active() (above) and/or moderation to + * avoid scheduling too many reap-tasks. 
+ */ +#ifdef _KERNEL +void +kmem_cache_reap_soon(kmem_cache_t *cache) +{ +#ifndef KMEM_DEBUG +#if __FreeBSD_version >= 1300043 + uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN); +#else + zone_drain(cache->kc_zone); +#endif +#endif +} + +void +kmem_reap(void) +{ +#if __FreeBSD_version >= 1300043 + uma_reclaim(UMA_RECLAIM_TRIM); +#else + uma_reclaim(); +#endif +} +#else +void +kmem_cache_reap_soon(kmem_cache_t *cache __unused) +{ +} + +void +kmem_reap(void) +{ +} +#endif + +int +kmem_debugging(void) +{ + return (0); +} + +void * +calloc(size_t n, size_t s) +{ + return (kmem_zalloc(n * s, KM_NOSLEEP)); +} + +char * +kmem_vasprintf(const char *fmt, va_list adx) +{ + char *msg; + va_list adx2; + + va_copy(adx2, adx); + msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); + (void) vsprintf(msg, fmt, adx2); + va_end(adx2); + + return (msg); +} + +#include +#include +#ifdef KMEM_DEBUG +#error "KMEM_DEBUG not currently supported" +#endif + +uint64_t +spl_kmem_cache_inuse(kmem_cache_t *cache) +{ + return (uma_zone_get_cur(cache->kc_zone)); +} + +uint64_t +spl_kmem_cache_entry_size(kmem_cache_t *cache) +{ + return (cache->kc_zone->uz_size); +} + +/* + * Register a move callback for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. 
+ */ +void +spl_kmem_cache_set_move(kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT3P(move, !=, NULL); +} + +#ifdef KMEM_DEBUG +void kmem_show(void *); +void +kmem_show(void *dummy __unused) +{ + struct kmem_item *i; + + mtx_lock(&kmem_items_mtx); + if (LIST_EMPTY(&kmem_items)) + printf("KMEM_DEBUG: No leaked elements.\n"); + else { + printf("KMEM_DEBUG: Leaked elements:\n\n"); + LIST_FOREACH(i, &kmem_items, next) { + printf("address=%p\n", i); + stack_print_ddb(&i->stack); + printf("\n"); + } + } + mtx_unlock(&kmem_items_mtx); +} + +SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL); +#endif /* KMEM_DEBUG */ diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c new file mode 100644 index 0000000000..059ada235c --- /dev/null +++ b/module/os/freebsd/spl/spl_kstat.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Links to Illumos.org for more information on kstat function: + * [1] https://illumos.org/man/1M/kstat + * [2] https://illumos.org/man/9f/kstat_create + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); + +SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics"); + +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} + +void +__kstat_set_seq_raw_ops(kstat_t *ksp, + int (*headers)(struct seq_file *f), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.seq_headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT3P(ksp, !=, NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +static int +kstat_resize_raw(kstat_t *ksp) +{ + if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) + return (ENOMEM); + + free(ksp->ks_raw_buf, M_TEMP); + ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); + ksp->ks_raw_buf = malloc(ksp->ks_raw_bufsize, M_TEMP, M_WAITOK); + + return (0); +} + +static 
void * +kstat_raw_default_addr(kstat_t *ksp, loff_t n) +{ + if (n == 0) + return (ksp->ks_data); + return (NULL); +} + +static int +kstat_sysctl(SYSCTL_HANDLER_ARGS) +{ + kstat_t *ksp = arg1; + kstat_named_t *ksent; + uint64_t val; + + ksent = ksp->ks_data; + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + val = ksent->value.ui64; + + return (sysctl_handle_64(oidp, &val, 0, req)); +} + +static int +kstat_sysctl_string(SYSCTL_HANDLER_ARGS) +{ + kstat_t *ksp = arg1; + kstat_named_t *ksent = ksp->ks_data; + char *val; + uint32_t len = 0; + + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + val = KSTAT_NAMED_STR_PTR(ksent); + len = KSTAT_NAMED_STR_BUFLEN(ksent); + val[len-1] = '\0'; + + return (sysctl_handle_string(oidp, val, len, req)); +} + +static int +kstat_sysctl_io(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + kstat_t *ksp = arg1; + kstat_io_t *kip = ksp->ks_data; + int rc; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + /* though wlentime & friends are signed, they will never be negative */ + sbuf_printf(sb, + "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " + "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", + kip->nread, kip->nwritten, + kip->reads, kip->writes, + kip->wtime, kip->wlentime, kip->wlastupdate, + kip->rtime, kip->rlentime, kip->rlastupdate, + kip->wcnt, kip->rcnt); + rc = sbuf_finish(sb); + if (rc == 0) + rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + sbuf_delete(sb); + return (rc); +} + +static int +kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + void *data; + kstat_t *ksp = arg1; + void *(*addr_op)(kstat_t *ksp, loff_t index); + int n, has_header, rc = 0; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + + if (ksp->ks_raw_ops.addr) + addr_op = 
ksp->ks_raw_ops.addr; + else + addr_op = kstat_raw_default_addr; + + mutex_enter(ksp->ks_lock); + + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + ksp->ks_raw_bufsize = PAGE_SIZE; + ksp->ks_raw_buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK); + + n = 0; + has_header = (ksp->ks_raw_ops.headers || + ksp->ks_raw_ops.seq_headers); + +restart_headers: + if (ksp->ks_raw_ops.headers) { + rc = ksp->ks_raw_ops.headers( + ksp->ks_raw_buf, ksp->ks_raw_bufsize); + } else if (ksp->ks_raw_ops.seq_headers) { + struct seq_file f; + + f.sf_buf = ksp->ks_raw_buf; + f.sf_size = ksp->ks_raw_bufsize; + rc = ksp->ks_raw_ops.seq_headers(&f); + } + if (has_header) { + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart_headers; + if (rc == 0) + sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); + } + + while ((data = addr_op(ksp, n)) != NULL) { +restart: + if (ksp->ks_raw_ops.data) { + rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf, + ksp->ks_raw_bufsize, data); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (rc == 0) + sbuf_printf(sb, "%s", ksp->ks_raw_buf); + + } else { + ASSERT3U(ksp->ks_ndata, ==, 1); + sbuf_hexdump(sb, ksp->ks_data, + ksp->ks_data_size, NULL, 0); + } + n++; + } + free(ksp->ks_raw_buf, M_TEMP); + mutex_exit(ksp->ks_lock); + sbuf_trim(sb); + rc = sbuf_finish(sb); + if (rc == 0) + rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + sbuf_delete(sb); + return (rc); +} + +kstat_t * +__kstat_create(const char *module, int instance, const char *name, + const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags) +{ + char buf[KSTAT_STRLEN]; + struct sysctl_oid *root; + kstat_t *ksp; + char *pool; + + KASSERT(instance == 0, ("instance=%d", instance)); + if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) + ASSERT3U(ks_ndata, ==, 1); + + if (class == NULL) + class = "misc"; + + /* + * Allocate the main structure. 
We don't need to keep a copy of + * module in here, because it is only used for sysctl node creation + * done in this function. + */ + ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO); + + ksp->ks_crtime = gethrtime(); + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_instance = instance; + (void) strlcpy(ksp->ks_name, name, KSTAT_STRLEN); + (void) strlcpy(ksp->ks_class, class, KSTAT_STRLEN); + ksp->ks_type = ks_type; + ksp->ks_flags = flags; + ksp->ks_update = kstat_default_update; + + mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); + ksp->ks_lock = &ksp->ks_private_lock; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + panic("Undefined kstat type %d\n", ksp->ks_type); + } + + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) + ksp->ks_data = NULL; + else + ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); + + /* + * Some kstats use a module name like "zfs/poolname" to distinguish a + * set of kstats belonging to a specific pool. Split on '/' to add an + * extra node for the pool name if needed. + */ + (void) strlcpy(buf, module, KSTAT_STRLEN); + module = buf; + pool = strchr(module, '/'); + if (pool != NULL) + *pool++ = '\0'; + + /* + * Create sysctl tree for those statistics: + * + * kstat.[.].. 
+ */ + sysctl_ctx_init(&ksp->ks_sysctl_ctx); + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, + ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s tree!\n", __func__, module); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + if (pool != NULL) { + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(root), OID_AUTO, pool, CTLFLAG_RW, 0, ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s.%s tree!\n", + __func__, module, pool); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + } + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), + OID_AUTO, class, CTLFLAG_RW, 0, ""); + if (root == NULL) { + if (pool != NULL) + printf("%s: Cannot create kstat.%s.%s.%s tree!\n", + __func__, module, pool, class); + else + printf("%s: Cannot create kstat.%s.%s tree!\n", + __func__, module, class); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(root), + OID_AUTO, name, CTLFLAG_RW, 0, ""); + if (root == NULL) { + if (pool != NULL) + printf("%s: Cannot create kstat.%s.%s.%s.%s " + "tree!\n", __func__, module, pool, class, + name); + else + printf("%s: Cannot create kstat.%s.%s.%s " + "tree!\n", __func__, module, class, name); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + + } + ksp->ks_sysctl_root = root; + + return (ksp); +} + +static void +kstat_install_named(kstat_t *ksp) +{ + kstat_named_t *ksent; + char *namelast; + int typelast; + + ksent = ksp->ks_data; + + VERIFY((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) || ksent != NULL); + + typelast = 0; + namelast = NULL; + + for (int i = 0; i < ksp->ks_ndata; i++, ksent++) { + if (ksent->data_type != 0) { + typelast = ksent->data_type; + namelast = ksent->name; + } + switch (typelast) { + 
case KSTAT_DATA_CHAR: + /* Not Implemented */ + break; + case KSTAT_DATA_INT32: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_S32 | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, i, kstat_sysctl, "I", namelast); + break; + case KSTAT_DATA_UINT32: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_U32 | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, i, kstat_sysctl, "IU", namelast); + break; + case KSTAT_DATA_INT64: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, i, kstat_sysctl, "Q", namelast); + break; + case KSTAT_DATA_UINT64: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, i, kstat_sysctl, "QU", namelast); + break; + case KSTAT_DATA_LONG: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, i, kstat_sysctl, "L", namelast); + break; + case KSTAT_DATA_ULONG: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, i, kstat_sysctl, "LU", namelast); + break; + case KSTAT_DATA_STRING: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, i, kstat_sysctl_string, "A", namelast); + break; + default: + panic("unsupported type: %d", typelast); + } + } +} + +void +kstat_install(kstat_t *ksp) +{ + struct sysctl_oid *root; + + if (ksp->ks_ndata == UINT32_MAX) + VERIFY3U(ksp->ks_type, ==, KSTAT_TYPE_RAW); + + switch (ksp->ks_type) { + case KSTAT_TYPE_NAMED: + return (kstat_install_named(ksp)); + case KSTAT_TYPE_RAW: + if (ksp->ks_raw_ops.data) { + root = 
SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD + | CTLFLAG_MPSAFE | CTLFLAG_SKIP, + ksp, 0, kstat_sysctl_raw, "A", ksp->ks_name); + } else { + root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, ksp->ks_name, CTLTYPE_OPAQUE | CTLFLAG_RD + | CTLFLAG_MPSAFE | CTLFLAG_SKIP, + ksp, 0, kstat_sysctl_raw, "", ksp->ks_name); + } + break; + case KSTAT_TYPE_IO: + root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, ksp->ks_name, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, + ksp, 0, kstat_sysctl_io, "A", ksp->ks_name); + break; + case KSTAT_TYPE_TIMER: + case KSTAT_TYPE_INTR: + default: + panic("unsupported kstat type %d\n", ksp->ks_type); + } + VERIFY3P(root, !=, NULL); + ksp->ks_sysctl_root = root; +} + +void +kstat_delete(kstat_t *ksp) +{ + + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + ksp->ks_lock = NULL; + mutex_destroy(&ksp->ks_private_lock); + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, ksp->ks_data_size); + free(ksp, M_KSTAT); +} diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c new file mode 100644 index 0000000000..0354b986cd --- /dev/null +++ b/module/os/freebsd/spl/spl_misc.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct opensolaris_utsname hw_utsname = { + .machine = MACHINE +}; + +#ifndef KERNEL_STATIC +char hw_serial[11] = "0"; + +utsname_t * +utsname(void) +{ + return (&hw_utsname); +} +#endif + +static void +opensolaris_utsname_init(void *arg) +{ + + hw_utsname.sysname = ostype; + hw_utsname.nodename = prison0.pr_hostname; + hw_utsname.release = osrelease; + snprintf(hw_utsname.version, sizeof (hw_utsname.version), + "%d", osreldate); +} + +char * +kmem_strdup(const char *s) +{ + char *buf; + + buf = kmem_alloc(strlen(s) + 1, KM_SLEEP); + strcpy(buf, s); + return (buf); +} + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyin(from, to, len)); +} + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyout(from, 
to, len)); +} + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vpanic(fmt, ap); + va_end(ap); +} + + +SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, + opensolaris_utsname_init, NULL); diff --git a/module/os/freebsd/spl/spl_policy.c b/module/os/freebsd/spl/spl_policy.c new file mode 100644 index 0000000000..5ecd3d3103 --- /dev/null +++ b/module/os/freebsd/spl/spl_policy.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int +secpolicy_nfs(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON)); +} + +int +secpolicy_zfs(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_zfs_proc(cred_t *cr, proc_t *proc) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_sys_config(cred_t *cr, int checkonly __unused) +{ + + return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG)); +} + +int +secpolicy_zinject(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT)); +} + +int +secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT)); +} + +int +secpolicy_fs_owner(struct mount *mp, cred_t *cr) +{ + + if (zfs_super_owner) { + if (cr->cr_uid == mp->mnt_cred->cr_uid && + cr->cr_prison == mp->mnt_cred->cr_prison) { + return (0); + } + } + return (EPERM); +} + +/* + * This check is done in kern_link(), so we could just return 0 here. 
+ */ +extern int hardlink_check_uid; +int +secpolicy_basic_link(vnode_t *vp, cred_t *cr) +{ + + if (!hardlink_check_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_LINK)); +} + +int +secpolicy_vnode_stky_modify(cred_t *cr) +{ + + return (EPERM); +} + +int +secpolicy_vnode_remove(vnode_t *vp, cred_t *cr) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); +} + +int +secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0) + return (EACCES); + if ((accmode & VWRITE) && + spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) { + return (EACCES); + } + if (accmode & VEXEC) { + if (vp->v_type == VDIR) { + if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0) + return (EACCES); + } else { + if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0) + return (EACCES); + } + } + return (0); +} + +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. 
+ */ +int +secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t curmode, accmode_t wantmode) +{ + accmode_t mode; + + mode = ~curmode & wantmode; + + if (mode == 0) + return (0); + + return (secpolicy_vnode_access(cr, vp, owner, mode)); +} + +int +secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner) +{ + static int privs[] = { + PRIV_VFS_ADMIN, + PRIV_VFS_READ, + PRIV_VFS_WRITE, + PRIV_VFS_EXEC, + PRIV_VFS_LOOKUP + }; + int i; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* Same as secpolicy_vnode_setdac */ + if (owner == cr->cr_uid) + return (0); + + for (i = 0; i < sizeof (privs)/sizeof (int); i++) { + int priv; + + switch (priv = privs[i]) { + case PRIV_VFS_EXEC: + if (vp->v_type == VDIR) + continue; + break; + case PRIV_VFS_LOOKUP: + if (vp->v_type != VDIR) + continue; + break; + } + if (spl_priv_check_cred(cr, priv) == 0) + return (0); + } + return (EPERM); +} + +int +secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (owner == cr->cr_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); +} + +int +secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + int mask = vap->va_mask; + int error; + + if (mask & AT_SIZE) { + if (vp->v_type == VDIR) + return (EISDIR); + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + if (mask & AT_MODE) { + /* + * If not the owner of the file then check privilege + * for two things: the privilege to set the mode at all + * and, if we're setting setuid, we also need permissions + * to add the set-uid bit, if we're not the owner. + * In the specific case of creating a set-uid root + * file, we need even more permissions. 
+ */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr); + if (error) + return (error); + } else { + vap->va_mode = ovap->va_mode; + } + if (mask & (AT_UID | AT_GID)) { + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + + /* + * To change the owner of a file, or change the group of + * a file to a group of which we are not a member, the + * caller must have privilege. + */ + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid && + !groupmember(vap->va_gid, cr))) { + if (secpolicy_fs_owner(vp->v_mount, cr) != 0) { + error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN); + if (error) + return (error); + } + } + + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) { + secpolicy_setid_clear(vap, vp, cr); + } + } + if (mask & (AT_ATIME | AT_MTIME)) { + /* + * From utimes(2): + * If times is NULL, ... The caller must be the owner of + * the file, have permission to write the file, or be the + * super-user. + * If times is non-NULL, ... The caller must be the owner of + * the file or be the super-user. 
+ */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error && (vap->va_vaflags & VA_UTIMES_NULL)) + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + return (0); +} + +int +secpolicy_vnode_create_gid(cred_t *cr) +{ + + return (EPERM); +} + +int +secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) +{ + + if (groupmember(gid, cr)) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_SETGID)); +} + +int +secpolicy_vnode_setid_retain(znode_t *zp, cred_t *cr, + boolean_t issuidroot __unused) +{ + + if (secpolicy_fs_owner(ZTOV(zp)->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)); +} + +void +secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return; + + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) { + if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } + } +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, cred_t *cr) +{ + int error; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* + * Privileged processes may set the sticky bit on non-directories, + * as well as set the setgid bit on a file with a group that the process + * is not a member of. Both of these are allowed in jail(8). + */ + if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) { + if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE)) + return (EFTYPE); + } + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0) { + error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid); + if (error) + return (error); + } + /* + * Deny setting setuid if we are not the file owner. 
+ */ + if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) { + error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN); + if (error) + return (error); + } + return (0); +} + +int +secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (owner == cr->cr_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* XXX: vfs_suser()? */ + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER)); +} + +int +secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN)); +} + +void +secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp) +{ + + if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) { + MNT_ILOCK(vfsp); + vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER; + vfs_clearmntopt(vfsp, MNTOPT_SETUID); + vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0); + MNT_IUNLOCK(vfsp); + } +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr, + vtype_t vtype) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS)); +} + +int +secpolicy_smb(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_NETSMB)); +} diff --git a/module/os/freebsd/spl/spl_procfs_list.c b/module/os/freebsd/spl/spl_procfs_list.c new file mode 100644 index 0000000000..e8448ce006 --- /dev/null +++ b/module/os/freebsd/spl/spl_procfs_list.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +typedef struct procfs_list_iter { + procfs_list_t *pli_pl; + void *pli_elt; +} pli_t; + +void +seq_printf(struct seq_file *f, const char *fmt, ...) 
+{ + va_list adx; + + va_start(adx, fmt); + (void) vsnprintf(f->sf_buf, f->sf_size, fmt, adx); + va_end(adx); +} + +static int +procfs_list_update(kstat_t *ksp, int rw) +{ + procfs_list_t *pl = ksp->ks_private; + + if (rw == KSTAT_WRITE) + pl->pl_clear(pl); + + return (0); +} + +static int +procfs_list_data(char *buf, size_t size, void *data) +{ + pli_t *p; + void *elt; + procfs_list_t *pl; + struct seq_file f; + + p = data; + pl = p->pli_pl; + elt = p->pli_elt; + free(p, M_TEMP); + f.sf_buf = buf; + f.sf_size = size; + return (pl->pl_show(&f, elt)); +} + +static void * +procfs_list_addr(kstat_t *ksp, loff_t n) +{ + procfs_list_t *pl = ksp->ks_private; + void *elt = ksp->ks_private1; + pli_t *p = NULL; + + + if (n == 0) + ksp->ks_private1 = list_head(&pl->pl_list); + else if (elt) + ksp->ks_private1 = list_next(&pl->pl_list, elt); + + if (ksp->ks_private1) { + p = malloc(sizeof (*p), M_TEMP, M_WAITOK); + p->pli_pl = pl; + p->pli_elt = ksp->ks_private1; + } + + return (p); +} + +void +procfs_list_install(const char *module, + const char *submodule, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + kstat_t *procfs_kstat; + + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_show = show; + procfs_list->pl_show_header = show_header; + procfs_list->pl_clear = clear; + procfs_list->pl_next_id = 1; + procfs_list->pl_node_offset = procfs_list_node_off; + + procfs_kstat = kstat_create(module, 0, name, submodule, + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (procfs_kstat) { + procfs_kstat->ks_lock = &procfs_list->pl_lock; + procfs_kstat->ks_ndata = UINT32_MAX; + procfs_kstat->ks_private = procfs_list; + 
procfs_kstat->ks_update = procfs_list_update; + kstat_set_seq_raw_ops(procfs_kstat, show_header, + procfs_list_data, procfs_list_addr); + kstat_install(procfs_kstat); + procfs_list->pl_private = procfs_kstat; + } +} + +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{} + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + kstat_delete(procfs_list->pl_private); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} diff --git a/module/os/freebsd/spl/spl_string.c b/module/os/freebsd/spl/spl_string.c new file mode 100644 index 0000000000..00b1df766a --- /dev/null +++ b/module/os/freebsd/spl/spl_string.c @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include + +#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9') + +#define IS_ALPHA(c) \ + (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +char * +strpbrk(const char *s, const char *b) +{ + const char *p; + + do { + for (p = b; *p != '\0' && *p != *s; ++p) + ; + if (*p != '\0') + return ((char *)s); + } while (*s++); + + return (NULL); +} + +/* + * Convert a string into a valid C identifier by replacing invalid + * characters with '_'. Also makes sure the string is nul-terminated + * and takes up at most n bytes. + */ +void +strident_canon(char *s, size_t n) +{ + char c; + char *end = s + n - 1; + + if ((c = *s) == 0) + return; + + if (!IS_ALPHA(c) && c != '_') + *s = '_'; + + while (s < end && ((c = *(++s)) != 0)) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + *s = '_'; + } + *s = 0; +} + +/* + * Do not change the length of the returned string; it must be freed + * with strfree(). + */ +char * +kmem_asprintf(const char *fmt, ...) +{ + int size; + va_list adx; + char *buf; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx) + 1; + va_end(adx); + + buf = kmem_alloc(size, KM_SLEEP); + + va_start(adx, fmt); + (void) vsnprintf(buf, size, fmt, adx); + va_end(adx); + + return (buf); +} + +void +kmem_strfree(char *str) +{ + ASSERT3P(str, !=, NULL); + kmem_free(str, strlen(str) + 1); +} diff --git a/module/os/freebsd/spl/spl_sunddi.c b/module/os/freebsd/spl/spl_sunddi.c new file mode 100644 index 0000000000..ebec77bdb3 --- /dev/null +++ b/module/os/freebsd/spl/spl_sunddi.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + + *result = strtol(str, nptr, base); + return (0); +} + +int +ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) +{ + + if (str == hw_serial) { + *result = prison0.pr_hostid; + return (0); + } + + *result = strtoul(str, nptr, base); + return (0); +} + +int +ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) +{ + + *result = (unsigned long long)strtouq(str, nptr, base); + return (0); +} + +int +ddi_strtoll(const char *str, char **nptr, int base, long long *result) +{ + + *result = (long long)strtoq(str, nptr, base); + return (0); +} diff --git a/module/os/freebsd/spl/spl_sysevent.c b/module/os/freebsd/spl/spl_sysevent.c new file mode 100644 index 0000000000..d5d50080fa --- /dev/null +++ b/module/os/freebsd/spl/spl_sysevent.c @@ 
-0,0 +1,262 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +log_sysevent(nvlist_t *event) +{ + struct sbuf *sb; + const char *type; + char typestr[128]; + nvpair_t *elem = NULL; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + type = NULL; + + while ((elem = nvlist_next_nvpair(event, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + { + boolean_t value; + + (void) nvpair_value_boolean_value(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), + value ? "true" : "false"); + break; + } + case DATA_TYPE_UINT8: + { + uint8_t value; + + (void) nvpair_value_uint8(elem, &value); + sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); + break; + } + case DATA_TYPE_INT32: + { + int32_t value; + + (void) nvpair_value_int32(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT32: + { + uint32_t value; + + (void) nvpair_value_uint32(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_INT64: + { + int64_t value; + + (void) nvpair_value_int64(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT64: + { + uint64_t value; + + (void) nvpair_value_uint64(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_STRING: + { + char *value; + + (void) nvpair_value_string(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); + if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) + type = value; + break; + } + case DATA_TYPE_UINT8_ARRAY: + { + uint8_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint8_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, 
"%02hhx", value[ii]); + break; + } + case DATA_TYPE_UINT16_ARRAY: + { + uint16_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint16_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%04hx", value[ii]); + break; + } + case DATA_TYPE_UINT32_ARRAY: + { + uint32_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint32_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); + break; + } + case DATA_TYPE_INT64_ARRAY: + { + int64_t *value; + uint_t ii, nelem; + + (void) nvpair_value_int64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016lld", + (long long)value[ii]); + break; + } + case DATA_TYPE_UINT64_ARRAY: + { + uint64_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); + break; + } + case DATA_TYPE_STRING_ARRAY: + { + char **strarr; + uint_t ii, nelem; + + (void) nvpair_value_string_array(elem, &strarr, &nelem); + + for (ii = 0; ii < nelem; ii++) { + if (strarr[ii] == NULL) { + sbuf_printf(sb, " "); + continue; + } + + sbuf_printf(sb, " %s", strarr[ii]); + if (strcmp(FM_CLASS, strarr[ii]) == 0) + type = strarr[ii]; + } + break; + } + case DATA_TYPE_NVLIST: + /* XXX - requires recursing in log_sysevent */ + break; + default: + printf("%s: type %d is not implemented\n", __func__, + nvpair_type(elem)); + break; + } + } + + if (sbuf_finish(sb) != 0) { + sbuf_delete(sb); + return (ENOMEM); + } + + if (type == NULL) + type = ""; + if (strncmp(type, "ESC_ZFS_", 8) == 0) { + snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8); + type = typestr; + } + devctl_notify("ZFS", "ZFS", type, sbuf_data(sb)); + sbuf_delete(sb); + + return 
(0); +} + +static void +sysevent_worker(void *arg __unused) +{ + zfs_zevent_t *ze; + nvlist_t *event; + uint64_t dropped = 0; + uint64_t dst_size; + int error; + + zfs_zevent_init(&ze); + for (;;) { + dst_size = 131072; + dropped = 0; + event = NULL; + error = zfs_zevent_next(ze, &event, + &dst_size, &dropped); + if (error) { + error = zfs_zevent_wait(ze); + if (error == ESHUTDOWN) + break; + } else { + VERIFY3P(event, !=, NULL); + log_sysevent(event); + nvlist_free(event); + } + } + zfs_zevent_destroy(ze); + kthread_exit(); +} + +void +ddi_sysevent_init(void) +{ + kproc_kthread_add(sysevent_worker, NULL, &system_proc, NULL, 0, 0, + "zfskern", "sysevent"); +} diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c new file mode 100644 index 0000000000..3fa7939bdb --- /dev/null +++ b/module/os/freebsd/spl/spl_taskq.c @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2009 Pawel Jakub Dawidek + * All rights reserved. + * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) +#include +#endif + +#include + +#if __FreeBSD_version < 1201522 +#define taskqueue_start_threads_in_proc(tqp, count, pri, proc, name, ...) \ + taskqueue_start_threads(tqp, count, pri, name, __VA_ARGS__) +#endif + +static uint_t taskq_tsd; +static uma_zone_t taskq_zone; + +taskq_t *system_taskq = NULL; +taskq_t *system_delay_taskq = NULL; +taskq_t *dynamic_taskq = NULL; + +proc_t *system_proc; + +extern int uma_align_cache; + +static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); + +static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; +static unsigned long tqenthash; +static unsigned long tqenthashlock; +static struct sx *tqenthashtbl_lock; + +static taskqid_t tqidnext; + +#define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash]) +#define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)]) + +#define TIMEOUT_TASK 1 +#define NORMAL_TASK 2 + +static void +system_taskq_init(void *arg) +{ + int i; + + tsd_create(&taskq_tsd, NULL); + tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash); + tqenthashlock = (tqenthash + 1) / 8; + if (tqenthashlock > 0) + tqenthashlock--; + tqenthashtbl_lock = + malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1), + M_TASKQ, M_WAITOK | M_ZERO); + for (i = 0; 
i < tqenthashlock + 1; i++) + sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK); + taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t), + NULL, NULL, NULL, NULL, + UMA_ALIGN_CACHE, 0); + system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri, + 0, 0, 0); + system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus, + minclsyspri, 0, 0, 0); +} +SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, + NULL); + +static void +system_taskq_fini(void *arg) +{ + int i; + + taskq_destroy(system_delay_taskq); + taskq_destroy(system_taskq); + uma_zdestroy(taskq_zone); + tsd_destroy(&taskq_tsd); + for (i = 0; i < tqenthashlock + 1; i++) + sx_destroy(&tqenthashtbl_lock[i]); + for (i = 0; i < tqenthash + 1; i++) + VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i])); + free(tqenthashtbl_lock, M_TASKQ); + free(tqenthashtbl, M_TASKQ); +} +SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, + NULL); + +#ifdef __LP64__ +static taskqid_t +__taskq_genid(void) +{ + taskqid_t tqid; + + /* + * Assume a 64-bit counter will not wrap in practice. 
+ */ + tqid = atomic_add_64_nv(&tqidnext, 1); + VERIFY(tqid); + return (tqid); +} +#else +static taskqid_t +__taskq_genid(void) +{ + taskqid_t tqid; + + for (;;) { + tqid = atomic_add_32_nv(&tqidnext, 1); + if (__predict_true(tqid != 0)) + break; + } + VERIFY(tqid); + return (tqid); +} +#endif + +static taskq_ent_t * +taskq_lookup(taskqid_t tqid) +{ + taskq_ent_t *ent = NULL; + + sx_xlock(TQIDHASHLOCK(tqid)); + CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { + if (ent->tqent_id == tqid) + break; + } + if (ent != NULL) + refcount_acquire(&ent->tqent_rc); + sx_xunlock(TQIDHASHLOCK(tqid)); + return (ent); +} + +static taskqid_t +taskq_insert(taskq_ent_t *ent) +{ + taskqid_t tqid; + + tqid = __taskq_genid(); + ent->tqent_id = tqid; + ent->tqent_registered = B_TRUE; + sx_xlock(TQIDHASHLOCK(tqid)); + CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); + sx_xunlock(TQIDHASHLOCK(tqid)); + return (tqid); +} + +static void +taskq_remove(taskq_ent_t *ent) +{ + taskqid_t tqid = ent->tqent_id; + + if (!ent->tqent_registered) + return; + + sx_xlock(TQIDHASHLOCK(tqid)); + CK_LIST_REMOVE(ent, tqent_hash); + sx_xunlock(TQIDHASHLOCK(tqid)); + ent->tqent_registered = B_FALSE; +} + +static void +taskq_tsd_set(void *context) +{ + taskq_t *tq = context; + +#if defined(__amd64__) || defined(__i386__) || defined(__aarch64__) + if (context != NULL && tsd_get(taskq_tsd) == NULL) + fpu_kern_thread(FPU_KERN_NORMAL); +#endif + tsd_set(taskq_tsd, tq); +} + +static taskq_t * +taskq_create_impl(const char *name, int nthreads, pri_t pri, + proc_t *proc __maybe_unused, uint_t flags) +{ + taskq_t *tq; + + if ((flags & TASKQ_THREADS_CPU_PCT) != 0) + nthreads = MAX((mp_ncpus * nthreads) / 100, 1); + + tq = kmem_alloc(sizeof (*tq), KM_SLEEP); + tq->tq_queue = taskqueue_create(name, M_WAITOK, + taskqueue_thread_enqueue, &tq->tq_queue); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, + taskq_tsd_set, tq); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, + 
taskq_tsd_set, NULL); + (void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri, + proc, "%s", name); + + return ((taskq_t *)tq); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, + int maxalloc __unused, uint_t flags) +{ + return (taskq_create_impl(name, nthreads, pri, system_proc, flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, + int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags) +{ + return (taskq_create_impl(name, nthreads, pri, proc, flags)); +} + +void +taskq_destroy(taskq_t *tq) +{ + + taskqueue_free(tq->tq_queue); + kmem_free(tq, sizeof (*tq)); +} + +int +taskq_member(taskq_t *tq, kthread_t *thread) +{ + + return (taskqueue_member(tq->tq_queue, thread)); +} + +taskq_t * +taskq_of_curthread(void) +{ + return (tsd_get(taskq_tsd)); +} + +static void +taskq_free(taskq_ent_t *task) +{ + taskq_remove(task); + if (refcount_release(&task->tqent_rc)) + uma_zfree(taskq_zone, task); +} + +int +taskq_cancel_id(taskq_t *tq, taskqid_t tid) +{ + uint32_t pend; + int rc; + taskq_ent_t *ent; + + if (tid == 0) + return (0); + + if ((ent = taskq_lookup(tid)) == NULL) + return (0); + + ent->tqent_cancelled = B_TRUE; + if (ent->tqent_type == TIMEOUT_TASK) { + rc = taskqueue_cancel_timeout(tq->tq_queue, + &ent->tqent_timeout_task, &pend); + } else + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) { + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + } else if (pend) { + /* + * Tasks normally free themselves when run, but here the task + * was cancelled so it did not free itself. + */ + taskq_free(ent); + } + /* Free the extra reference we added with taskq_lookup. 
*/ + taskq_free(ent); + return (rc); +} + +static void +taskq_run(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + if (!task->tqent_cancelled) + task->tqent_func(task->tqent_arg); + taskq_free(task); +} + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskq_ent_t *task; + taskqid_t tqid; + clock_t timo; + int mflag; + + timo = expire_time - ddi_get_lbolt(); + if (timo <= 0) + return (taskq_dispatch(tq, func, arg, flags)); + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_type = TIMEOUT_TASK; + task->tqent_cancelled = B_FALSE; + refcount_init(&task->tqent_rc, 1); + tqid = taskq_insert(task); + TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, + taskq_run, task); + + taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task, + timo); + return (tqid); +} + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *task; + int mflag, prio; + taskqid_t tqid; + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. 
+ */ + prio = !!(flags & TQ_FRONT); + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + refcount_init(&task->tqent_rc, 1); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_cancelled = B_FALSE; + task->tqent_type = NORMAL_TASK; + tqid = taskq_insert(task); + TASK_INIT(&task->tqent_task, prio, taskq_run, task); + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); + return (tqid); +} + +static void +taskq_run_ent(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + task->tqent_func(task->tqent_arg); +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, + taskq_ent_t *task) +{ + int prio; + + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. + */ + prio = !!(flags & TQ_FRONT); + task->tqent_cancelled = B_FALSE; + task->tqent_registered = B_FALSE; + task->tqent_id = 0; + task->tqent_func = func; + task->tqent_arg = arg; + + TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task); + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); +} + +void +taskq_wait(taskq_t *tq) +{ + taskqueue_quiesce(tq->tq_queue); +} + +void +taskq_wait_id(taskq_t *tq, taskqid_t tid) +{ + taskq_ent_t *ent; + + if (tid == 0) + return; + if ((ent = taskq_lookup(tid)) == NULL) + return; + + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + taskq_free(ent); +} + +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) +{ + taskqueue_drain_all(tq->tq_queue); +} + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (t->tqent_task.ta_pending == 0); +} diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c new file mode 100644 index 0000000000..0bf251a1ed --- /dev/null +++ b/module/os/freebsd/spl/spl_uio.c @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +/* + * $FreeBSD$ + */ + +#include +#include +#include +#include + +int +zfs_uiomove(void *cp, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) +{ + ASSERT3U(zfs_uio_rw(uio), ==, dir); + return (uiomove(cp, (int)n, GET_UIO_STRUCT(uio))); +} + +/* + * same as zfs_uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. 
+ */ +int +zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) +{ + struct iovec small_iovec[1]; + struct uio small_uio_clone; + struct uio *uio_clone; + int error; + + ASSERT3U(zfs_uio_rw(uio), ==, rw); + if (zfs_uio_iovcnt(uio) == 1) { + small_uio_clone = *(GET_UIO_STRUCT(uio)); + small_iovec[0] = *(GET_UIO_STRUCT(uio)->uio_iov); + small_uio_clone.uio_iov = small_iovec; + uio_clone = &small_uio_clone; + } else { + uio_clone = cloneuio(GET_UIO_STRUCT(uio)); + } + + error = vn_io_fault_uiomove(p, n, uio_clone); + *cbytes = zfs_uio_resid(uio) - uio_clone->uio_resid; + if (uio_clone != &small_uio_clone) + free(uio_clone, M_IOV); + return (error); +} + +/* + * Drop the next n bytes out of *uio; a no-op if n exceeds the residual count. + */ +void +zfs_uioskip(zfs_uio_t *uio, size_t n) +{ + zfs_uio_seg_t segflg; + + /* For full compatibility with illumos. */ + if (n > zfs_uio_resid(uio)) + return; + + segflg = zfs_uio_segflg(uio); + zfs_uio_segflg(uio) = UIO_NOCOPY; + zfs_uiomove(NULL, n, zfs_uio_rw(uio), uio); + zfs_uio_segflg(uio) = segflg; +} + +int +zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) +{ + ASSERT3U(zfs_uio_rw(uio), ==, dir); + return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio))); +} diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c new file mode 100644 index 0000000000..3f4feb140d --- /dev/null +++ b/module/os/freebsd/spl/spl_vfs.c @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +MALLOC_DECLARE(M_MOUNT); + +void +vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, + int flags __unused) +{ + struct vfsopt *opt; + size_t namesize; + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + + if (vfsp->mnt_opt == NULL) { + void *opts; + + MNT_IUNLOCK(vfsp); + opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK); + MNT_ILOCK(vfsp); + if (vfsp->mnt_opt == NULL) { + vfsp->mnt_opt = opts; + TAILQ_INIT(vfsp->mnt_opt); + } else { + free(opts, M_MOUNT); + } + } + + MNT_IUNLOCK(vfsp); + + opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK); + namesize = strlen(name) + 1; + opt->name = malloc(namesize, M_MOUNT, M_WAITOK); + strlcpy(opt->name, name, namesize); + opt->pos = -1; + opt->seen = 1; + if (arg == NULL) { + opt->value = NULL; + opt->len = 0; + } 
else { + opt->len = strlen(arg) + 1; + opt->value = malloc(opt->len, M_MOUNT, M_WAITOK); + bcopy(arg, opt->value, opt->len); + } + + MNT_ILOCK(vfsp); + TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +void +vfs_clearmntopt(vfs_t *vfsp, const char *name) +{ + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + vfs_deleteopt(vfsp->mnt_opt, name); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +int +vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) +{ + struct vfsoptlist *opts = vfsp->mnt_optnew; + int error; + + if (opts == NULL) + return (0); + error = vfs_getopt(opts, opt, (void **)argp, NULL); + return (error != 0 ? 0 : 1); +} + +int +mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, + char *fspec, int fsflags) +{ + struct vfsconf *vfsp; + struct mount *mp; + vnode_t *vp, *mvp; + struct ucred *pcr, *tcr; + int error; + + ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); + + vp = *vpp; + *vpp = NULL; + error = 0; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + error = ENAMETOOLONG; + if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) + error = ENODEV; + if (error == 0 && vp->v_type != VDIR) + error = ENOTDIR; + /* + * We need vnode lock to protect v_mountedhere and vnode interlock + * to protect v_iflag. + */ + if (error == 0) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) + vp->v_iflag |= VI_MOUNT; + else + error = EBUSY; + VI_UNLOCK(vp); + } + if (error != 0) { + vput(vp); + return (error); + } + vn_seqc_write_begin(vp); + VOP_UNLOCK1(vp); + + /* + * Allocate and initialize the filesystem. + * We don't want regular user that triggered snapshot mount to be able + * to unmount it, so pass credentials of the parent mount. 
+ */ + mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred); + + mp->mnt_optnew = NULL; + vfs_setmntopt(mp, "from", fspec, 0); + mp->mnt_optnew = mp->mnt_opt; + mp->mnt_opt = NULL; + + /* + * Set the mount level flags. + */ + mp->mnt_flag = fsflags & MNT_UPDATEMASK; + /* + * Snapshots are always read-only. + */ + mp->mnt_flag |= MNT_RDONLY; + /* + * We don't want snapshots to allow access to vulnerable setuid + * programs, so we turn off setuid when mounting snapshots. + */ + mp->mnt_flag |= MNT_NOSUID; + /* + * We don't want snapshots to be visible in regular + * mount(8) and df(1) output. + */ + mp->mnt_flag |= MNT_IGNORE; + + /* + * XXX: This is evil, but we can't mount a snapshot as a regular user. + * XXX: Is it safe when a snapshot is mounted from within a jail? + */ + tcr = td->td_ucred; + pcr = td->td_proc->p_ucred; + td->td_ucred = kcred; + td->td_proc->p_ucred = kcred; + error = VFS_MOUNT(mp); + td->td_ucred = tcr; + td->td_proc->p_ucred = pcr; + + if (error != 0) { + /* + * Clear VI_MOUNT and decrement the use count "atomically", + * under the vnode lock. This is not strictly required, + * but makes it easier to reason about the life-cycle and + * ownership of the covered vnode. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vn_seqc_write_end(vp); + vput(vp); + vfs_unbusy(mp); + vfs_freeopts(mp->mnt_optnew); + mp->mnt_vnodecovered = NULL; + vfs_mount_destroy(mp); + return (error); + } + + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + (void) VFS_STATFS(mp, &mp->mnt_stat); + + /* + * Prevent external consumers of mount options from reading + * mnt_optnew. 
+ */ + mp->mnt_optnew = NULL; + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +#ifdef FREEBSD_NAMECACHE + cache_purge(vp); +#endif + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; +#ifdef VIRF_MOUNTPOINT + vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); +#endif + vp->v_mountedhere = mp; + VI_UNLOCK(vp); + /* Put the new filesystem on the mount list. */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + vfs_event_signal(NULL, VQ_MOUNT, 0); + if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp)) + panic("mount: lost mount"); + vn_seqc_write_end(vp); + VOP_UNLOCK1(vp); +#if __FreeBSD_version >= 1300048 + vfs_op_exit(mp); +#endif + vfs_unbusy(mp); + *vpp = mvp; + return (0); +} + +/* + * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it + * asynchronously using a taskq. This can avoid deadlocks caused by re-entering + * the file system as a result of releasing the vnode. Note, file systems + * already have to handle the race where the vnode is incremented before the + * inactive routine is called and does its locking. + * + * Warning: Excessive use of this routine can lead to performance problems. + * This is because taskqs throttle back allocation if too many are created. + */ +void +vn_rele_async(vnode_t *vp, taskq_t *taskq) +{ + VERIFY3U(vp->v_usecount, >, 0); + if (refcount_release_if_not_last(&vp->v_usecount)) { +#if __FreeBSD_version < 1300045 + vdrop(vp); +#endif + return; + } + VERIFY3U(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)vrele, vp, TQ_SLEEP), !=, 0); +} diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c new file mode 100644 index 0000000000..739ddb05e8 --- /dev/null +++ b/module/os/freebsd/spl/spl_vm.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +const int zfs_vm_pagerret_bad = VM_PAGER_BAD; +const int zfs_vm_pagerret_error = VM_PAGER_ERROR; +const int zfs_vm_pagerret_ok = VM_PAGER_OK; +const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC; +const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL; + +void +zfs_vmobject_assert_wlocked(vm_object_t object) +{ + + /* + * This is not ideal because FILE/LINE used by assertions will not + * be too helpful, but it must be a hard function for + * compatibility reasons. 
+ */ + VM_OBJECT_ASSERT_WLOCKED(object); +} + +void +zfs_vmobject_wlock(vm_object_t object) +{ + + VM_OBJECT_WLOCK(object); +} + +void +zfs_vmobject_wunlock(vm_object_t object) +{ + + VM_OBJECT_WUNLOCK(object); +} diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c new file mode 100644 index 0000000000..3644eba77c --- /dev/null +++ b/module/os/freebsd/spl/spl_zlib.c @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#if __FreeBSD_version >= 1300041 +#include +#else +#include +#endif +#include + + +/*ARGSUSED*/ +static void * +zcalloc(void *opaque, uint_t items, uint_t size) +{ + + return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT)); +} + +/*ARGSUSED*/ +static void +zcfree(void *opaque, void *ptr) +{ + + free(ptr, M_SOLARIS); +} + +static int +zlib_deflateInit(z_stream *stream, int level) +{ + + stream->zalloc = zcalloc; + stream->opaque = NULL; + stream->zfree = zcfree; + + return (deflateInit(stream, level)); +} + +static int +zlib_deflate(z_stream *stream, int flush) +{ + return (deflate(stream, flush)); +} + +static int +zlib_deflateEnd(z_stream *stream) +{ + return (deflateEnd(stream)); +} + +static int +zlib_inflateInit(z_stream *stream) +{ + stream->zalloc = zcalloc; + stream->opaque = NULL; + stream->zfree = zcfree; + + return (inflateInit(stream)); +} + +static int +zlib_inflate(z_stream *stream, int finish) +{ +#if __FreeBSD_version >= 1300024 + return (inflate(stream, finish)); +#else + return (_zlib104_inflate(stream, finish)); +#endif +} + + +static int +zlib_inflateEnd(z_stream *stream) +{ + return (inflateEnd(stream)); +} + +/* + * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc + * and vfree for every call. Using a kmem_cache also has the advantage + * that improves the odds that the memory used will be local to this cpu. + * To further improve things it might be wise to create a dedicated per-cpu + * workspace for use. This would take some additional care because we then + * must disable preemption around the critical section, and verify that + * zlib_deflate* and zlib_inflate* never internally call schedule(). 
+ */ +static void * +zlib_workspace_alloc(int flags) +{ + // return (kmem_cache_alloc(zlib_workspace_cache, flags)); + return (NULL); +} + +static void +zlib_workspace_free(void *workspace) +{ + // kmem_cache_free(zlib_workspace_cache, workspace); +} + +/* + * Compresses the source buffer into the destination buffer. The level + * parameter has the same meaning as in deflateInit. sourceLen is the byte + * length of the source buffer. Upon entry, destLen is the total size of the + * destination buffer, which must be at least 0.1% larger than sourceLen plus + * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. + * + * Returns Z_OK on success, Z_MEM_ERROR if there was not enough + * memory, Z_BUF_ERROR if there was not enough room in the output buffer, + * Z_STREAM_ERROR if the level parameter is invalid. + */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + bzero(&stream, sizeof (stream)); + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + stream.opaque = NULL; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.opaque = zlib_workspace_alloc(KM_SLEEP); +#if 0 + if (!stream.opaque) + return (Z_MEM_ERROR); +#endif + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.opaque); + return (err); + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.opaque); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.opaque); + return (err); +} + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. 
Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the uncompressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * z_uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted. + */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) +{ + z_stream stream; + int err; + + bzero(&stream, sizeof (stream)); + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.opaque = zlib_workspace_alloc(KM_SLEEP); +#if 0 + if (!stream.opaque) + return (Z_MEM_ERROR); +#endif + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.opaque); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.opaque); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.opaque); + + return (err); +} diff --git a/module/os/freebsd/spl/spl_zone.c b/module/os/freebsd/spl/spl_zone.c new file mode 100644 index 0000000000..bd3f019b2f --- /dev/null +++ b/module/os/freebsd/spl/spl_zone.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data"); + +/* + * Structure to record list of ZFS datasets exported to a zone. 
+ */ +typedef struct zone_dataset { + LIST_ENTRY(zone_dataset) zd_next; + char zd_dataset[0]; +} zone_dataset_t; + +LIST_HEAD(zone_dataset_head, zone_dataset); + +static int zone_slot; + +int +zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd, *zd2; + struct prison *pr; + int dofree, error; + + if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) + return (error); + + /* Allocate memory before we grab prison's mutex. */ + zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK); + + sx_slock(&allprison_lock); + pr = prison_find(jailid); /* Locks &pr->pr_mtx. */ + sx_sunlock(&allprison_lock); + if (pr == NULL) { + free(zd, M_ZONES); + return (ENOENT); + } + + head = osd_jail_get(pr, zone_slot); + if (head != NULL) { + dofree = 0; + LIST_FOREACH(zd2, head, zd_next) { + if (strcmp(dataset, zd2->zd_dataset) == 0) { + free(zd, M_ZONES); + error = EEXIST; + goto end; + } + } + } else { + dofree = 1; + prison_hold_locked(pr); + mtx_unlock(&pr->pr_mtx); + head = malloc(sizeof (*head), M_ZONES, M_WAITOK); + LIST_INIT(head); + mtx_lock(&pr->pr_mtx); + error = osd_jail_set(pr, zone_slot, head); + KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", + error)); + } + strcpy(zd->zd_dataset, dataset); + LIST_INSERT_HEAD(head, zd, zd_next); +end: + if (dofree) + prison_free_locked(pr); + else + mtx_unlock(&pr->pr_mtx); + return (error); +} + +int +zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + struct prison *pr; + int error; + + if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) + return (error); + + sx_slock(&allprison_lock); + pr = prison_find(jailid); + sx_sunlock(&allprison_lock); + if (pr == NULL) + return (ENOENT); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) { + error = ENOENT; + goto end; + } + LIST_FOREACH(zd, head, zd_next) { + if (strcmp(dataset, 
zd->zd_dataset) == 0) + break; + } + if (zd == NULL) + error = ENOENT; + else { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + if (LIST_EMPTY(head)) + osd_jail_del(pr, zone_slot); + error = 0; + } +end: + mtx_unlock(&pr->pr_mtx); + return (error); +} + +/* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + struct prison *pr; + size_t len; + int ret = 0; + + if (dataset[0] == '\0') + return (0); + if (INGLOBALZONE(curproc)) { + if (write != NULL) + *write = 1; + return (1); + } + pr = curthread->td_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) + goto end; + + /* + * Walk the list once, looking for datasets which match exactly, or + * specify a dataset underneath an exported dataset. If found, return + * true and note that it is writable. + */ + LIST_FOREACH(zd, head, zd_next) { + len = strlen(zd->zd_dataset); + if (strlen(dataset) >= len && + bcmp(dataset, zd->zd_dataset, len) == 0 && + (dataset[len] == '\0' || dataset[len] == '/' || + dataset[len] == '@')) { + if (write) + *write = 1; + ret = 1; + goto end; + } + } + + /* + * Walk the list a second time, searching for datasets which are parents + * of exported datasets. These should be visible, but read-only. + * + * Note that we also have to support forms such as 'pool/dataset/', with + * a trailing slash. 
+ */ + LIST_FOREACH(zd, head, zd_next) { + len = strlen(dataset); + if (dataset[len - 1] == '/') + len--; /* Ignore trailing slash */ + if (len < strlen(zd->zd_dataset) && + bcmp(dataset, zd->zd_dataset, len) == 0 && + zd->zd_dataset[len] == '/') { + if (write) + *write = 0; + ret = 1; + goto end; + } + } +end: + mtx_unlock(&pr->pr_mtx); + return (ret); +} + +static void +zone_destroy(void *arg) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + + head = arg; + while ((zd = LIST_FIRST(head)) != NULL) { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + } + free(head, M_ZONES); +} + +uint32_t +zone_get_hostid(void *ptr) +{ + + KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__)); + + return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid); +} + +static void +zone_sysinit(void *arg __unused) +{ + + zone_slot = osd_jail_register(zone_destroy, NULL); +} + +static void +zone_sysuninit(void *arg __unused) +{ + + osd_jail_deregister(zone_slot); +} + +SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL); +SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL); diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c new file mode 100644 index 0000000000..fa1034ff88 --- /dev/null +++ b/module/os/freebsd/zfs/abd_os.c @@ -0,0 +1,509 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). 
+ * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled. + */ + +#include +#include +#include +#include +#include +#include + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. 
+ */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +struct { + wmsum_t abdstat_struct_size; + wmsum_t abdstat_scatter_cnt; + wmsum_t abdstat_scatter_data_size; + wmsum_t abdstat_scatter_chunk_waste; + wmsum_t abdstat_linear_cnt; + wmsum_t abdstat_linear_data_size; +} abd_sums; + +/* + * zfs_abd_scatter_min_size is the minimum allocation size to use scatter + * ABD's for. Smaller allocations will use linear ABD's which use + * zio_[data_]buf_alloc(). + * + * Scatter ABD's use at least one page each, so sub-page allocations waste + * some space when allocated as scatter (e.g. 2KB scatter allocation wastes + * half of each page). Using linear ABD's for small allocations means that + * they will be put on slabs which contain many allocations. + * + * Linear ABDs for multi-page allocations are easier to use, and in some cases + * they allow buffer copying to be avoided. But allocating and especially + * freeing multi-page linear ABDs are expensive operations due to KVA mapping + * and unmapping, and over time they cause KVA fragmentation. + */ +size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1; + +#if defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, + &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN, + &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations."); +#endif + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are + * just a single zero'd page-sized buffer. This allows us to conserve + * memory by only using a single zero buffer for the scatter chunks. 
+ */ +abd_t *abd_zero_scatter = NULL; +static char *abd_zero_buf = NULL; + +static uint_t +abd_chunkcnt_for_bytes(size_t size) +{ + return ((size + PAGE_MASK) >> PAGE_SHIFT); +} + +static inline uint_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + ABD_SCATTER(abd).abd_offset + abd->abd_size)); +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + uint_t n = abd_scatter_chunkcnt(abd); + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + int waste = (n << PAGE_SHIFT) - abd->abd_size; + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste); + arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste); + arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + uint_t i, n; + + /* + * There are no scatter linear pages in FreeBSD, so it is + * an error if the ABD has been marked as a linear page. 
+ */ + ASSERT(!abd_is_linear_page(abd)); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, PAGE_SIZE); + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + } +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + uint_t i, n; + + n = abd_chunkcnt_for_bytes(size); + for (i = 0; i < n; i++) { + ABD_SCATTER(abd).abd_chunks[i] = + kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + } +} + +void +abd_free_chunks(abd_t *abd) +{ + uint_t i, n; + + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + kmem_cache_free(abd_chunk_cache, + ABD_SCATTER(abd).abd_chunks[i]); + } +} + +abd_t * +abd_alloc_struct_impl(size_t size) +{ + uint_t chunkcnt = abd_chunkcnt_for_bytes(size); + /* + * In the event we are allocating a gang ABD, the size passed in + * will be 0. We must make sure to set abd_size to the size of an + * ABD struct as opposed to an ABD scatter with 0 chunks. The gang + * ABD struct allocation accounts for an additional 24 bytes over + * a scatter ABD with 0 chunks. + */ + size_t abd_size = MAX(sizeof (abd_t), + offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); + abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +void +abd_free_struct_impl(abd_t *abd) +{ + uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : + abd_scatter_chunkcnt(abd); + ssize_t size = MAX(sizeof (abd_t), + offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where + * each chunk in the scatterlist will be set to abd_zero_buf. 
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+	uint_t i, n;
+
+	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+	abd_zero_buf = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+	bzero(abd_zero_buf, PAGE_SIZE);
+	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+
+	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
+	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+
+	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+
+	/* Every chunk aliases the single zeroed page. */
+	for (i = 0; i < n; i++) {
+		ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
+		    abd_zero_buf;
+	}
+
+	/* Only one page of real memory backs the whole ABD. */
+	ABDSTAT_BUMP(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, PAGE_SIZE);
+}
+
+/*
+ * Undo abd_alloc_zero_scatter(): reverse its statistics, free the ABD
+ * struct and return the shared zero page to abd_chunk_cache.
+ */
+static void
+abd_free_zero_scatter(void)
+{
+	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGE_SIZE);
+
+	abd_free_struct(abd_zero_scatter);
+	abd_zero_scatter = NULL;
+	kmem_cache_free(abd_chunk_cache, abd_zero_buf);
+}
+
+/*
+ * kstat update callback: copy the current wmsum counter values into the
+ * kstat data.  Writes are rejected with EACCES.
+ */
+static int
+abd_kstats_update(kstat_t *ksp, int rw)
+{
+	abd_stats_t *as = ksp->ks_data;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+	as->abdstat_struct_size.value.ui64 =
+	    wmsum_value(&abd_sums.abdstat_struct_size);
+	as->abdstat_scatter_cnt.value.ui64 =
+	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
+	as->abdstat_scatter_data_size.value.ui64 =
+	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
+	as->abdstat_scatter_chunk_waste.value.ui64 =
+	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
+	as->abdstat_linear_cnt.value.ui64 =
+	    wmsum_value(&abd_sums.abdstat_linear_cnt);
+	as->abdstat_linear_data_size.value.ui64 =
+	    wmsum_value(&abd_sums.abdstat_linear_data_size);
+	return (0);
+}
+
+/*
+ * Set up the ABD subsystem: the page-sized chunk cache, the statistic
+ * counters, the "abdstats" kstat (skipped if kstat_create() fails) and
+ * the shared all-zero scatter ABD.
+ */
+void
+abd_init(void)
+{
+	abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0,
+	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
+
+	wmsum_init(&abd_sums.abdstat_struct_size, 0);
+	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
+	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
+	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
+	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
+	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
+
+	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (abd_ksp != NULL) {
+		abd_ksp->ks_data = &abd_stats;
+		abd_ksp->ks_update = abd_kstats_update;
+		kstat_install(abd_ksp);
+	}
+
+	abd_alloc_zero_scatter();
+}
+
+/*
+ * Tear down everything abd_init() created, in reverse order.
+ */
+void
+abd_fini(void)
+{
+	abd_free_zero_scatter();
+
+	if (abd_ksp != NULL) {
+		kstat_delete(abd_ksp);
+		abd_ksp = NULL;
+	}
+
+	wmsum_fini(&abd_sums.abdstat_struct_size);
+	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
+	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
+	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
+	wmsum_fini(&abd_sums.abdstat_linear_cnt);
+	wmsum_fini(&abd_sums.abdstat_linear_data_size);
+
+	kmem_cache_destroy(abd_chunk_cache);
+	abd_chunk_cache = NULL;
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+	/*
+	 * FreeBSD does not have scatter linear pages,
+	 * so reaching this function is always an error.
+	 */
+	VERIFY(0);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * Build a scatter ABD that refers to the chunks of sabd starting off
+ * bytes in and covering size bytes.  The chunk pointers are copied, not
+ * the data.  If the caller-supplied abd struct is too small for the
+ * required chunk array (or is NULL), a new struct is allocated; the
+ * struct actually used is returned.
+ */
+abd_t *
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
+    size_t size)
+{
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+	size_t chunkcnt = abd_chunkcnt_for_bytes(
+	    (new_offset & PAGE_MASK) + size);
+
+	ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd));
+
+	/*
+	 * If an abd struct is provided, it is only the minimum size. If we
+	 * need additional chunks, we need to allocate a new struct.
+	 */
+	if (abd != NULL &&
+	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) >
+	    sizeof (abd_t)) {
+		abd = NULL;
+	}
+
+	if (abd == NULL)
+		abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that
+	 * if we own the underlying data buffer, which is not true in
+	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+	 */
+
+	ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK;
+
+	/* Copy the scatterlist starting at the correct offset */
+	(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
+	    &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT],
+	    chunkcnt * sizeof (void *));
+
+	return (abd);
+}
+
+/*
+ * Initialize the abd_iter: position 0, nothing mapped.  Gang ABDs are
+ * not accepted here.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	ASSERT(!abd_is_gang(abd));
+	abd_verify(abd);
+	aiter->iter_abd = abd;
+	aiter->iter_pos = 0;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+	return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter is already exhausted, in
+ * which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * is already exhausted, in which case this does nothing.
+ *
+ * On return iter_mapaddr points at the current position and iter_mapsize
+ * is the number of contiguous bytes available there: the rest of the
+ * buffer for a linear ABD, or the rest of the current page (capped at
+ * the remaining data) for a scatter ABD.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+	void *paddr;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	abd_t *abd = aiter->iter_abd;
+	size_t offset = aiter->iter_pos;
+	if (abd_is_linear(abd)) {
+		aiter->iter_mapsize = abd->abd_size - offset;
+		paddr = ABD_LINEAR_BUF(abd);
+	} else {
+		/* Pick the chunk, then reduce offset to within that page. */
+		offset += ABD_SCATTER(abd).abd_offset;
+		paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT];
+		offset &= PAGE_MASK;
+		aiter->iter_mapsize = MIN(PAGE_SIZE - offset,
+		    abd->abd_size - aiter->iter_pos);
+	}
+	aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * is already exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	if (!abd_iter_at_end(aiter)) {
+		ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+		ASSERT3U(aiter->iter_mapsize, >, 0);
+	}
+
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+/*
+ * Ask the kmem layer to reap abd_chunk_cache.
+ */
+void
+abd_cache_reap_now(void)
+{
+	kmem_cache_reap_soon(abd_chunk_cache);
+}
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
new file mode 100644
index 0000000000..fddb1f0e87
--- /dev/null
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -0,0 +1,271 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __FreeBSD_version >= 1300139 +static struct sx arc_vnlru_lock; +static struct vnode *arc_vnlru_marker; +#endif + +extern struct vfsops zfs_vfsops; + +uint_t zfs_arc_free_target = 0; + +static void +arc_free_target_init(void *unused __unused) +{ + zfs_arc_free_target = vm_cnt.v_free_target; +} +SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, + arc_free_target_init, NULL); + +/* + * We don't have a tunable for arc_free_target due to the dependency on + * pagedaemon initialisation. 
+ */
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	uint_t val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	/* Done on error, or when no new value was supplied. */
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	/* Accept only values within [minfree, v_page_count]. */
+	if (val < minfree)
+		return (EINVAL);
+	if (val > vm_cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+SYSCTL_DECL(_vfs_zfs);
+/* BEGIN CSTYLED */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint_t),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+/* END CSTYLED */
+
+/*
+ * Return the smallest of the memory-headroom figures computed below, in
+ * bytes.  A negative result means free memory is below the target.
+ */
+int64_t
+arc_available_memory(void)
+{
+	int64_t lowest = INT64_MAX;
+	int64_t n __unused;
+
+	/*
+	 * Cooperate with pagedaemon when it's time for it to scan
+	 * and reclaim some pages.
+	 */
+	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+	if (n < lowest) {
+		lowest = n;
+	}
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+	/*
+	 * If we're on an i386 platform, it's possible that we'll exhaust the
+	 * kernel heap space before we ever run out of available physical
+	 * memory. Most checks of the size of the heap_area compare against
+	 * tune.t_minarmem, which is the minimum available real memory that we
+	 * can have in the system. However, this is generally fixed at 25 pages
+	 * which is so low that it's useless. In this comparison, we seek to
+	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
+	 * heap is allocated. (Or, in the calculation, if less than 1/4th is
+	 * free)
+	 */
+	n = uma_avail() - (long)(uma_limit() / 4);
+	if (n < lowest) {
+		lowest = n;
+	}
+#endif
+
+	DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
+	return (lowest);
+}
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+	uint64_t size;
+
+	/*
+	 * Candidate: all memory minus 1 << 30 bytes, or min when there is
+	 * less memory than that.  The result is never below 5/8 of allmem.
+	 */
+	if (allmem >= 1 << 30)
+		size = allmem - (1 << 30);
+	else
+		size = min;
+	return (MAX(allmem * 5 / 8, size));
+}
+
+/*
+ * Helper function for arc_prune_async().  The task argument carries a
+ * count, nr_scan: the ARC target size is reduced by ptob(nr_scan) and
+ * the vnode LRU is asked to free up to nr_scan vnodes belonging to
+ * zfs_vfsops.  On newer FreeBSD the call is serialized by
+ * arc_vnlru_lock because it uses the shared arc_vnlru_marker.
+ */
+static void
+arc_prune_task(void *arg)
+{
+	int64_t nr_scan = (intptr_t)arg;
+
+	arc_reduce_target_size(ptob(nr_scan));
+#if __FreeBSD_version >= 1300139
+	sx_xlock(&arc_vnlru_lock);
+	vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker);
+	sx_xunlock(&arc_vnlru_lock);
+#else
+	vnlru_free(nr_scan, &zfs_vfsops);
+#endif
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+
+#ifndef __LP64__
+	/* The count travels in a pointer-sized task argument; clamp it. */
+	if (adjust > INTPTR_MAX)
+		adjust = INTPTR_MAX;
+#endif
+	taskq_dispatch(arc_prune_taskq, arc_prune_task,
+	    (void *)(intptr_t)adjust, TQ_SLEEP);
+	ARCSTAT_BUMP(arcstat_prune);
+}
+
+/* Total physical memory, in bytes. */
+uint64_t
+arc_all_memory(void)
+{
+	return (ptob(physmem));
+}
+
+/* No OS-specific throttling here: always returns 0. */
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+	return (0);
+}
+
+/* Currently free memory, in bytes. */
+uint64_t
+arc_free_memory(void)
+{
+	return (ptob(freemem));
+}
+
+static eventhandler_tag arc_event_lowmem = NULL;
+
+/*
+ * vm_lowmem event handler: stop ARC growth for arc_grow_retry seconds
+ * and shrink the ARC target by arc_c >> arc_shrink_shift plus any
+ * current memory deficit.
+ */
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+	int64_t free_memory, to_free;
+
+	arc_no_grow = B_TRUE;
+	arc_warm = B_TRUE;
+	arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+	free_memory = arc_available_memory();
+	/* MIN(free_memory, 0) is <= 0, so a deficit increases to_free. */
+	to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+	DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+	arc_reduce_target_size(to_free);
+
+	/*
+	 * It is unsafe to block here in arbitrary threads, because we can come
+	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
+	 * with ARC reclaim thread.
+	 */
+	if (curproc == pageproc)
+		arc_wait_for_eviction(to_free, B_FALSE);
+}
+
+/*
+ * Register the vm_lowmem handler and, on newer FreeBSD, allocate the
+ * vnode LRU marker and the lock that protects it.
+ */
+void
+arc_lowmem_init(void)
+{
+	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+	    EVENTHANDLER_PRI_FIRST);
+#if __FreeBSD_version >= 1300139
+	arc_vnlru_marker = vnlru_alloc_marker();
+	sx_init(&arc_vnlru_lock, "arc vnlru lock");
+#endif
+}
+
+/*
+ * Undo arc_lowmem_init().  The marker and its lock are released only
+ * when a marker was actually allocated.
+ */
+void
+arc_lowmem_fini(void)
+{
+	if (arc_event_lowmem != NULL)
+		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+#if __FreeBSD_version >= 1300139
+	if (arc_vnlru_marker != NULL) {
+		vnlru_free_marker(arc_vnlru_marker);
+		sx_destroy(&arc_vnlru_lock);
+	}
+#endif
+}
+
+/* Intentionally empty in this file. */
+void
+arc_register_hotplug(void)
+{
+}
+
+/* Intentionally empty in this file. */
+void
+arc_unregister_hotplug(void)
+{
+}
diff --git a/module/os/freebsd/zfs/crypto_os.c b/module/os/freebsd/zfs/crypto_os.c
new file mode 100644
index 0000000000..6a67dbc9f6
--- /dev/null
+++ b/module/os/freebsd/zfs/crypto_os.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2005-2010 Pawel Jakub Dawidek
+ * Copyright (c) 2018 Sean Eric Fagan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#include
+#else
+#include
+#endif
+
+#include
+#include
+#include
+
+#include
+
+#define SHA512_HMAC_BLOCK_SIZE 128
+
+static int crypt_sessions = 0;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD,
+    &crypt_sessions, 0, "Number of cryptographic sessions created");
+
+/*
+ * Start an HMAC-SHA512 computation: derive the padded key from c_key
+ * (ck_length is in bits) and prime the inner and outer SHA512 contexts
+ * in ctx.  Temporary key material is zeroed before returning.
+ */
+void
+crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key)
+{
+	uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE],
+	    k_opad[SHA512_HMAC_BLOCK_SIZE],
+	    key[SHA512_HMAC_BLOCK_SIZE];
+	SHA512_CTX lctx;
+	int i;
+	size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length);
+
+	/*
+	 * This code is based on the similar code in geom/eli/g_eli_hmac.c
+	 */
+	explicit_bzero(key, sizeof (key));
+	if (c_key->ck_length == 0)
+		/* do nothing */;
+	else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE)
+		bcopy(c_key->ck_data, key, cl_bytes);
+	else {
+		/*
+		 * If key is longer than 128 bytes reset it to
+		 * key = SHA512(key).
+		 */
+		SHA512_Init(&lctx);
+		SHA512_Update(&lctx, c_key->ck_data, cl_bytes);
+		SHA512_Final(key, &lctx);
+	}
+
+	/* XOR key with ipad and opad values. */
+	for (i = 0; i < sizeof (key); i++) {
+		k_ipad[i] = key[i] ^ 0x36;
+		k_opad[i] = key[i] ^ 0x5c;
+	}
+	explicit_bzero(key, sizeof (key));
+
+	/* Start inner SHA512. */
+	SHA512_Init(&ctx->innerctx);
+	SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad));
+	explicit_bzero(k_ipad, sizeof (k_ipad));
+	/* Start outer SHA512. */
+	SHA512_Init(&ctx->outerctx);
+	SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad));
+	explicit_bzero(k_opad, sizeof (k_opad));
+}
+
+/*
+ * Feed datasize bytes of message data into the inner hash.
+ */
+void
+crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize)
+{
+	SHA512_Update(&ctx->innerctx, data, datasize);
+}
+
+/*
+ * Finish the HMAC and copy the first mdsize bytes of it to md (the
+ * whole digest when mdsize is 0).  ctx is zeroed and must be
+ * re-initialized before reuse.
+ * NOTE(review): mdsize is not checked against SHA512_DIGEST_LENGTH;
+ * callers are presumably expected to pass at most that much.
+ */
+void
+crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize)
+{
+	uint8_t digest[SHA512_DIGEST_LENGTH];
+
+	/* Complete inner hash */
+	SHA512_Final(digest, &ctx->innerctx);
+
+	/* Complete outer hash */
+	SHA512_Update(&ctx->outerctx, digest, sizeof (digest));
+	SHA512_Final(digest, &ctx->outerctx);
+
+	explicit_bzero(ctx, sizeof (*ctx));
+	/* mdsize == 0 means "Give me the whole hash!" */
+	if (mdsize == 0)
+		mdsize = SHA512_DIGEST_LENGTH;
+	bcopy(digest, md, mdsize);
+	explicit_bzero(digest, sizeof (digest));
+}
+
+/*
+ * One-shot HMAC-SHA512 of in_data under key, written to out_data.
+ */
+void
+crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size,
+    void *out_data, size_t out_data_size)
+{
+	struct hmac_ctx ctx;
+
+	crypto_mac_init(&ctx, key);
+	crypto_mac_update(&ctx, in_data, in_data_size);
+	crypto_mac_final(&ctx, out_data, out_data_size);
+}
+
+/*
+ * Completion callback for requests issued by zfs_crypto_dispatch():
+ * mark the session done under its lock and wake the waiter sleeping on
+ * the request.
+ */
+static int
+freebsd_zfs_crypt_done(struct cryptop *crp)
+{
+	freebsd_crypt_session_t *ses;
+
+	ses = crp->crp_opaque;
+	mtx_lock(&ses->fs_lock);
+	ses->fs_done = true;
+	mtx_unlock(&ses->fs_lock);
+	wakeup(crp);
+	return (0);
+}
+
+/*
+ * Release the resources of a session set up by
+ * freebsd_crypt_newsession() and zero the structure.  The structure
+ * itself is not freed.
+ */
+void
+freebsd_crypt_freesession(freebsd_crypt_session_t *sess)
+{
+	mtx_destroy(&sess->fs_lock);
+	crypto_freesession(sess->fs_sid);
+	explicit_bzero(sess, sizeof (*sess));
+}
+
+/*
+ * Submit crp on session and sleep until the completion callback has
+ * run.  A request that completes with ENOMEM (after a short pause) or
+ * EAGAIN is resubmitted; any other completion status, or a dispatch
+ * failure, is returned.
+ */
+static int
+zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
+{
+	int error;
+
+	crp->crp_opaque = session;
+	crp->crp_callback = freebsd_zfs_crypt_done;
+	for (;;) {
+		error = crypto_dispatch(crp);
+		if (error)
+			break;
+		mtx_lock(&session->fs_lock);
+		while (session->fs_done == false)
+			msleep(crp, &session->fs_lock, 0,
+			    "zfs_crypto", 0);
+		mtx_unlock(&session->fs_lock);
+
+		if (crp->crp_etype == ENOMEM) {
+			pause("zcrnomem", 1);
+		} else if (crp->crp_etype != EAGAIN) {
+			error = crp->crp_etype;
+			break;
+		}
+		/* Reset the request state before retrying. */
+		crp->crp_etype = 0;
+		crp->crp_flags &= ~CRYPTO_F_DONE;
+		session->fs_done = false;
+#if __FreeBSD_version < 1300087
+		/*
+		 * Session ID changed, so we should record that,
+		 * and try again
+		 */
+		session->fs_sid = crp->crp_session;
+#endif
+	}
+	return (error);
+}
+/*
+ * Debug aid for freebsd_crypt_uio(): with FCRYPTO_DEBUG defined, print
+ * the arguments, the key bytes and every iovec.  Note that it also sets
+ * the uio's resid to the sum of the iovec lengths.  Without
+ * FCRYPTO_DEBUG it does nothing.
+ */
+static void
+freebsd_crypt_uio_debug_log(boolean_t encrypt,
+    freebsd_crypt_session_t *input_sessionp,
+    struct zio_crypt_info *c_info,
+    zfs_uio_t *data_uio,
+    crypto_key_t *key,
+    uint8_t *ivbuf,
+    size_t datalen,
+    size_t auth_len)
+{
+#ifdef FCRYPTO_DEBUG
+	struct cryptodesc *crd;
+	uint8_t *p = NULL;
+	size_t total = 0;
+
+	printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, "
+	    "%p, %u, %u)\n",
+	    __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp,
+	    c_info->ci_algname, c_info->ci_crypt_type,
+	    (unsigned int)c_info->ci_keylen, c_info->ci_name,
+	    data_uio, key->ck_format, key->ck_data,
+	    (unsigned int)key->ck_length,
+	    ivbuf, (unsigned int)datalen, (unsigned int)auth_len);
+	printf("\tkey = { ");
+	for (int i = 0; i < key->ck_length / 8; i++) {
+		uint8_t *b = (uint8_t *)key->ck_data;
+		printf("%02x ", b[i]);
+	}
+	printf("}\n");
+	for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++) {
+		printf("\tiovec #%d: <%p, %u>\n", i,
+		    zfs_uio_iovbase(data_uio, i),
+		    (unsigned int)zfs_uio_iovlen(data_uio, i));
+		total += zfs_uio_iovlen(data_uio, i);
+	}
+	zfs_uio_resid(data_uio) = total;
+#endif
+}
+/*
+ * Create a new cryptographic session. This should
+ * happen every time the key changes (including when
+ * it's first loaded).
+ */
+#if __FreeBSD_version >= 1300087
+/*
+ * Session setup for the crypto_session_params API.
+ *
+ * sessp  - caller-provided session structure to fill in
+ * c_info - selects the cipher (ZC_TYPE_GCM or ZC_TYPE_CCM)
+ * key    - raw key; ck_length is in bits
+ *
+ * Returns 0 on success, EINVAL for an unsupported key length, ENOTSUP
+ * for an unsupported cipher type, or the crypto_newsession() error.
+ * On any failure sessp is left without an initialized lock or session
+ * id, so the caller must NOT pass it to freebsd_crypt_freesession().
+ */
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+    struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+	struct crypto_session_params csp;
+	int error = 0;
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+	    __FUNCTION__, sessp,
+	    c_info->ci_algname, c_info->ci_crypt_type,
+	    (unsigned int)c_info->ci_keylen, c_info->ci_name,
+	    key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+	printf("\tkey = { ");
+	for (int i = 0; i < key->ck_length / 8; i++) {
+		uint8_t *b = (uint8_t *)key->ck_data;
+		printf("%02x ", b[i]);
+	}
+	printf("}\n");
+#endif
+	bzero(&csp, sizeof (csp));
+	csp.csp_mode = CSP_MODE_AEAD;
+	csp.csp_cipher_key = key->ck_data;
+	csp.csp_cipher_klen = key->ck_length / 8;
+	switch (c_info->ci_crypt_type) {
+	case ZC_TYPE_GCM:
+		csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16;
+		csp.csp_ivlen = AES_GCM_IV_LEN;
+		switch (key->ck_length/8) {
+		case AES_128_GMAC_KEY_LEN:
+		case AES_192_GMAC_KEY_LEN:
+		case AES_256_GMAC_KEY_LEN:
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	case ZC_TYPE_CCM:
+		csp.csp_cipher_alg = CRYPTO_AES_CCM_16;
+		csp.csp_ivlen = AES_CCM_IV_LEN;
+		switch (key->ck_length/8) {
+		case AES_128_CBC_MAC_KEY_LEN:
+		case AES_192_CBC_MAC_KEY_LEN:
+		case AES_256_CBC_MAC_KEY_LEN:
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	default:
+		error = ENOTSUP;
+		goto bad;
+	}
+
+	/*
+	 * Disable the use of hardware drivers on FreeBSD 13 and later since
+	 * common crypto offload drivers impose constraints on AES-GCM AAD
+	 * lengths that make them unusable for ZFS, and we currently do not have
+	 * a mechanism to fall back to a software driver for requests not
+	 * handled by a hardware driver.
+	 *
+	 * On 12 we continue to permit the use of hardware drivers since
+	 * CPU-accelerated drivers such as aesni(4) register themselves as
+	 * hardware drivers.
+	 */
+	error = crypto_newsession(&sessp->fs_sid, &csp, CRYPTOCAP_F_SOFTWARE);
+	/*
+	 * Only a session that really exists gets a lock and is counted;
+	 * this matches the pre-13 implementation below.
+	 */
+	if (error != 0)
+		goto bad;
+	mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+	    NULL, MTX_DEF);
+	crypt_sessions++;
+bad:
+#ifdef FCRYPTO_DEBUG
+	if (error)
+		printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+	return (error);
+}
+
+/*
+ * Encrypt or decrypt the data described by data_uio in place.
+ *
+ * The uio holds auth_len bytes of additional authenticated data,
+ * followed by datalen bytes of payload, followed by the digest.
+ * If input_sessionp is NULL a temporary session is created from
+ * c_info/key and released before returning; otherwise the caller's
+ * session is used and left intact.
+ *
+ * Returns 0 on success or an errno value.
+ */
+int
+freebsd_crypt_uio(boolean_t encrypt,
+    freebsd_crypt_session_t *input_sessionp,
+    struct zio_crypt_info *c_info,
+    zfs_uio_t *data_uio,
+    crypto_key_t *key,
+    uint8_t *ivbuf,
+    size_t datalen,
+    size_t auth_len)
+{
+	struct cryptop *crp;
+	freebsd_crypt_session_t *session = NULL;
+	int error = 0;
+	size_t total = 0;
+
+	freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+	    key, ivbuf, datalen, auth_len);
+	for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++)
+		total += zfs_uio_iovlen(data_uio, i);
+	zfs_uio_resid(data_uio) = total;
+	if (input_sessionp == NULL) {
+		session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+		error = freebsd_crypt_newsession(session, c_info, key);
+		if (error) {
+			/*
+			 * The session was never set up, so there is no lock
+			 * or session id to tear down: just free the memory.
+			 */
+			kmem_free(session, sizeof (*session));
+			session = NULL;
+			goto out;
+		}
+	} else
+		session = input_sessionp;
+
+	crp = crypto_getreq(session->fs_sid, M_WAITOK);
+	if (encrypt) {
+		crp->crp_op = CRYPTO_OP_ENCRYPT |
+		    CRYPTO_OP_COMPUTE_DIGEST;
+	} else {
+		crp->crp_op = CRYPTO_OP_DECRYPT |
+		    CRYPTO_OP_VERIFY_DIGEST;
+	}
+	crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE;
+	crypto_use_uio(crp, GET_UIO_STRUCT(data_uio));
+
+	crp->crp_aad_start = 0;
+	crp->crp_aad_length = auth_len;
+	crp->crp_payload_start = auth_len;
+	crp->crp_payload_length = datalen;
+	crp->crp_digest_start = auth_len + datalen;
+
+	bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN);
+	error = zfs_crypto_dispatch(session, crp);
+	crypto_freereq(crp);
+out:
+#ifdef FCRYPTO_DEBUG
+	if (error)
+		printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+	/* Tear down the temporary session, if one was fully created. */
+	if (input_sessionp == NULL && session != NULL) {
+		freebsd_crypt_freesession(session);
+		kmem_free(session, sizeof (*session));
+	}
+	return (error);
+}
+
+#else
+/*
+ * Session setup for the legacy cryptoini API.
+ *
+ * Same contract as the implementation above: returns 0 on success,
+ * EINVAL for an unsupported key length, ENOTSUP for an unsupported
+ * cipher type, or the crypto_newsession() error.  On failure sessp is
+ * not initialized and must NOT be passed to freebsd_crypt_freesession().
+ */
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+    struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+	struct cryptoini cria, crie, *crip;
+	struct enc_xform *xform;
+	struct auth_hash *xauth;
+	int error = 0;
+	crypto_session_t sid;
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+	    __FUNCTION__, sessp,
+	    c_info->ci_algname, c_info->ci_crypt_type,
+	    (unsigned int)c_info->ci_keylen, c_info->ci_name,
+	    key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+	printf("\tkey = { ");
+	for (int i = 0; i < key->ck_length / 8; i++) {
+		uint8_t *b = (uint8_t *)key->ck_data;
+		printf("%02x ", b[i]);
+	}
+	printf("}\n");
+#endif
+	switch (c_info->ci_crypt_type) {
+	case ZC_TYPE_GCM:
+		xform = &enc_xform_aes_nist_gcm;
+		switch (key->ck_length/8) {
+		case AES_128_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_128;
+			break;
+		case AES_192_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_192;
+			break;
+		case AES_256_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	case ZC_TYPE_CCM:
+		xform = &enc_xform_ccm;
+		switch (key->ck_length/8) {
+		case AES_128_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_128;
+			break;
+		case AES_192_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_192;
+			break;
+		case AES_256_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	default:
+		error = ENOTSUP;
+		goto bad;
+	}
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+	    "auth %s (key length %d)\n",
+	    __FUNCTION__, __LINE__,
+	    xform->name, (unsigned int)key->ck_length,
+	    (unsigned int)key->ck_length/8,
+	    xauth->name, xauth->keysize);
+#endif
+
+	bzero(&crie, sizeof (crie));
+	bzero(&cria, sizeof (cria));
+
+	crie.cri_alg = xform->type;
+	crie.cri_key = key->ck_data;
+	crie.cri_klen = key->ck_length;
+
+	cria.cri_alg = xauth->type;
+	cria.cri_key = key->ck_data;
+	cria.cri_klen = key->ck_length;
+
+	/* Chain the descriptors: authentication first, then encryption. */
+	cria.cri_next = &crie;
+	crie.cri_next = NULL;
+	crip = &cria;
+	/* Everything else is bzero'd */
+
+	error = crypto_newsession(&sid, crip,
+	    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+	if (error != 0) {
+		printf("%s(%d): crypto_newsession failed with %d\n",
+		    __FUNCTION__, __LINE__, error);
+		goto bad;
+	}
+	sessp->fs_sid = sid;
+	mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+	    NULL, MTX_DEF);
+	crypt_sessions++;
+bad:
+	return (error);
+}
+
+/*
+ * The meat of encryption/decryption.
+ * If input_sessionp is NULL, then it will create a
+ * temporary cryptographic session, and release
+ * it when done.
+ *
+ * The uio holds auth_len bytes of additional authenticated data,
+ * followed by datalen bytes of payload, followed by the digest.
+ * Returns 0 on success or an errno value.
+ */
+int
+freebsd_crypt_uio(boolean_t encrypt,
+    freebsd_crypt_session_t *input_sessionp,
+    struct zio_crypt_info *c_info,
+    zfs_uio_t *data_uio,
+    crypto_key_t *key,
+    uint8_t *ivbuf,
+    size_t datalen,
+    size_t auth_len)
+{
+	struct cryptop *crp;
+	struct cryptodesc *enc_desc, *auth_desc;
+	struct enc_xform *xform;
+	struct auth_hash *xauth;
+	freebsd_crypt_session_t *session = NULL;
+	int error;
+
+	freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+	    key, ivbuf, datalen, auth_len);
+	switch (c_info->ci_crypt_type) {
+	case ZC_TYPE_GCM:
+		xform = &enc_xform_aes_nist_gcm;
+		switch (key->ck_length/8) {
+		case AES_128_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_128;
+			break;
+		case AES_192_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_192;
+			break;
+		case AES_256_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	case ZC_TYPE_CCM:
+		xform = &enc_xform_ccm;
+		switch (key->ck_length/8) {
+		case AES_128_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_128;
+			break;
+		case AES_192_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_192;
+			break;
+		case AES_256_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	default:
+		error = ENOTSUP;
+		goto bad;
+	}
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+	    "auth %s (key length %d)\n",
+	    __FUNCTION__, __LINE__,
+	    xform->name, (unsigned int)key->ck_length,
+	    (unsigned int)key->ck_length/8,
+	    xauth->name, xauth->keysize);
+#endif
+
+	if (input_sessionp == NULL) {
+		session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+		error = freebsd_crypt_newsession(session, c_info, key);
+		if (error) {
+			/*
+			 * The session was never set up, so there is no lock
+			 * or session id to tear down: just free the memory.
+			 */
+			kmem_free(session, sizeof (*session));
+			goto bad;
+		}
+	} else
+		session = input_sessionp;
+
+	crp = crypto_getreq(2);
+	if (crp == NULL) {
+		error = ENOMEM;
+		/* Go through "out" so a temporary session is released. */
+		goto out;
+	}
+
+	auth_desc = crp->crp_desc;
+	enc_desc = auth_desc->crd_next;
+
+	crp->crp_session = session->fs_sid;
+	crp->crp_ilen = auth_len + datalen;
+	crp->crp_buf = (void*)GET_UIO_STRUCT(data_uio);
+	crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC;
+
+	auth_desc->crd_skip = 0;
+	auth_desc->crd_len = auth_len;
+	auth_desc->crd_inject = auth_len + datalen;
+	auth_desc->crd_alg = xauth->type;
+#ifdef FCRYPTO_DEBUG
+	printf("%s: auth: skip = %u, len = %u, inject = %u\n",
+	    __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len,
+	    auth_desc->crd_inject);
+#endif
+
+	enc_desc->crd_skip = auth_len;
+	enc_desc->crd_len = datalen;
+	enc_desc->crd_inject = auth_len;
+	enc_desc->crd_alg = xform->type;
+	enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT;
+	bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN);
+	enc_desc->crd_next = NULL;
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s: enc: skip = %u, len = %u, inject = %u\n",
+	    __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len,
+	    enc_desc->crd_inject);
+#endif
+
+	if (encrypt)
+		enc_desc->crd_flags |= CRD_F_ENCRYPT;
+
+	error = zfs_crypto_dispatch(session, crp);
+	crypto_freereq(crp);
+out:
+	/* Only reached with a fully created session. */
+	if (input_sessionp == NULL) {
+		freebsd_crypt_freesession(session);
+		kmem_free(session, sizeof (*session));
+	}
+bad:
+#ifdef FCRYPTO_DEBUG
+	if (error)
+		printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+	return (error);
+}
+#endif
diff --git a/module/os/freebsd/zfs/dmu_os.c
b/module/os/freebsd/zfs/dmu_os.c new file mode 100644 index 0000000000..38488dbda6 --- /dev/null +++ b/module/os/freebsd/zfs/dmu_os.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef IDX_TO_OFF +#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) +#endif + +#if __FreeBSD_version < 1300051 +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY +#else +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY +#endif + + +#if __FreeBSD_version < 1300072 +#define dmu_page_lock(m) vm_page_lock(m) +#define dmu_page_unlock(m) vm_page_unlock(m) +#else +#define dmu_page_lock(m) +#define dmu_page_unlock(m) +#endif + +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + vm_page_t *ma, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + struct sf_buf *sf; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT3U(size, >, 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(ptoa((*ma)->pindex), ==, + db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(*ma, &sf); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(sf); + ma += 1; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, 
FTAG); + return (err); +} + +int +dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size) +{ + struct sf_buf *sf; + vm_object_t vmobj; + vm_page_t m; + dmu_buf_t **dbp; + dmu_buf_t *db; + caddr_t va; + int numbufs, i; + int bufoff, pgoff, tocpy; + int mi, di; + int err; + + ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); + ASSERT3S(last_size, <=, PAGE_SIZE); + + err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), + IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); + if (err != 0) + return (err); + +#ifdef ZFS_DEBUG + IMPLY(last_size < PAGE_SIZE, *rahead == 0); + if (dbp[0]->db_offset != 0 || numbufs > 1) { + for (i = 0; i < numbufs; i++) { + ASSERT(ISP2(dbp[i]->db_size)); + ASSERT3U((dbp[i]->db_offset % dbp[i]->db_size), ==, 0); + ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); + } + } +#endif + + vmobj = ma[0]->object; + zfs_vmobject_wlock_12(vmobj); + + db = dbp[0]; + for (i = 0; i < *rbehind; i++) { + m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT3U(m->dirty, ==, 0); + ASSERT(!pmap_page_is_write_mapped(m)); + + ASSERT3U(db->db_size, >, PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, PAGESIZE); + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rbehind = i; + + bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; + pgoff = 0; + for (mi = 0, di = 0; mi < count && di < numbufs; ) { + if (pgoff == 0) { + m = ma[mi]; + if (m != bogus_page) { + vm_page_assert_xbusied(m); + ASSERT(vm_page_none_valid(m)); + 
ASSERT3U(m->dirty, ==, 0); + ASSERT(!pmap_page_is_write_mapped(m)); + va = zfs_map_page(m, &sf); + } + } + if (bufoff == 0) + db = dbp[di]; + + if (m != bogus_page) { + ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, + db->db_offset + bufoff); + } + + /* + * We do not need to clamp the copy size by the file + * size as the last block is zero-filled beyond the + * end of file anyway. + */ + tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); + ASSERT3S(tocpy, >=, 0); + if (m != bogus_page) + bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); + + pgoff += tocpy; + ASSERT3S(pgoff, >=, 0); + ASSERT3S(pgoff, <=, PAGESIZE); + if (pgoff == PAGESIZE) { + if (m != bogus_page) { + zfs_unmap_page(sf); + vm_page_valid(m); + } + ASSERT3S(mi, <, count); + mi++; + pgoff = 0; + } + + bufoff += tocpy; + ASSERT3S(bufoff, >=, 0); + ASSERT3S(bufoff, <=, db->db_size); + if (bufoff == db->db_size) { + ASSERT3S(di, <, numbufs); + di++; + bufoff = 0; + } + } + +#ifdef ZFS_DEBUG + /* + * Three possibilities: + * - last requested page ends at a buffer boundary and , thus, + * all pages and buffers have been iterated; + * - all requested pages are filled, but the last buffer + * has not been exhausted; + * the read-ahead is possible only in this case; + * - all buffers have been read, but the last page has not been + * fully filled; + * this is only possible if the file has only a single buffer + * with a size that is not a multiple of the page size. 
+ */ + if (mi == count) { + ASSERT3S(di, >=, numbufs - 1); + IMPLY(*rahead != 0, di == numbufs - 1); + IMPLY(*rahead != 0, bufoff != 0); + ASSERT0(pgoff); + } + if (di == numbufs) { + ASSERT3S(mi, >=, count - 1); + ASSERT0(*rahead); + IMPLY(pgoff == 0, mi == count); + if (pgoff != 0) { + ASSERT3S(mi, ==, count - 1); + ASSERT3U((dbp[0]->db_size & PAGE_MASK), !=, 0); + } + } +#endif + if (pgoff != 0) { + ASSERT3P(m, !=, bogus_page); + bzero(va + pgoff, PAGESIZE - pgoff); + zfs_unmap_page(sf); + vm_page_valid(m); + } + + for (i = 0; i < *rahead; i++) { + m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT3U(m->dirty, ==, 0); + ASSERT(!pmap_page_is_write_mapped(m)); + + ASSERT3U(db->db_size, >, PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + tocpy = MIN(db->db_size - bufoff, PAGESIZE); + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, tocpy); + if (tocpy < PAGESIZE) { + ASSERT3S(i, ==, *rahead - 1); + ASSERT3U((db->db_size & PAGE_MASK), !=, 0); + bzero(va + tocpy, PAGESIZE - tocpy); + } + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rahead = i; + zfs_vmobject_wunlock_12(vmobj); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} diff --git a/module/os/freebsd/zfs/hkdf.c b/module/os/freebsd/zfs/hkdf.c new file mode 100644 index 0000000000..8324ff2319 --- /dev/null +++ b/module/os/freebsd/zfs/hkdf.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include +#include +#include +#include + +static int +hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, + uint_t km_len, uint8_t *out_buf) +{ + crypto_key_t key; + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(salt_len); + key.ck_data = salt; + + crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH); + + return (0); +} + +/* + * Expand step: fill out_buf with out_len bytes derived from extract_key + * and info. Block i is MAC(extract_key, T(i-1) | info | i), where T(0) is + * empty and T(i-1) is the digest produced by the previous iteration. + * Returns EINVAL when the block count N exceeds 255, 0 otherwise. + */ +static int +hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, + uint8_t *out_buf, uint_t out_len) +{ + struct hmac_ctx ctx; + crypto_key_t key; + uint_t i, T_len = 0, pos = 0; + uint8_t c; + uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH; + uint8_t T[SHA512_DIGEST_LENGTH]; + + if (N > 255) + return (SET_ERROR(EINVAL)); + + /* use the extracted key as the MAC key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH); + key.ck_data = extract_key; + + for (i = 1; i <= N; i++) { + c = i; + + crypto_mac_init(&ctx, &key); + crypto_mac_update(&ctx, T, T_len); + crypto_mac_update(&ctx, info, info_len); + crypto_mac_update(&ctx, &c, 1); + crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH); + /* later blocks must chain in the digest just produced */ + T_len = SHA512_DIGEST_LENGTH; + bcopy(T, out_buf + pos, + (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos)); + pos += SHA512_DIGEST_LENGTH; + } + + return (0); +} + +/* + * HKDF is designed to be a relatively fast function for deriving keys from a + * master key + a salt. We use this function to generate new encryption keys + * so as to avoid hitting the cryptographic limits of the underlying + * encryption modes.
Note that, for the sake of deriving encryption keys, the + * info parameter is called the "salt" everywhere else in the code. + */ +int +hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt, + uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key, + uint_t out_len) +{ + int ret; + uint8_t extract_key[SHA512_DIGEST_LENGTH]; + + ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len, + extract_key); + if (ret != 0) + return (ret); + + ret = hkdf_sha512_expand(extract_key, info, info_len, output_key, + out_len); + if (ret != 0) + return (ret); + + return (0); +} diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c new file mode 100644 index 0000000000..2b808357ec --- /dev/null +++ b/module/os/freebsd/zfs/kmod_core.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_comutil.h" +#include "zfs_deleg.h" +#include "zfs_namecheck.h" +#include "zfs_prop.h" + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_DECL(_vfs_zfs_vdev); + +extern uint_t rrw_tsd_key; +static int zfs_version_ioctl = ZFS_IOCVER_OZFS; +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, + 0, "ZFS_IOCTL_VERSION"); + +static struct cdev *zfsdev; + +static struct root_hold_token *zfs_root_token; + +extern uint_t rrw_tsd_key; +extern uint_t zfs_allow_log_key; +extern uint_t zfs_geom_probe_vdev_key; + +static int zfs__init(void); +static int zfs__fini(void); +static void zfs_shutdown(void *, int); + +static eventhandler_tag zfs_shutdown_event_tag; + +#define ZFS_MIN_KSTACK_PAGES 4 + +static int +zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, + struct thread *td) 
+{ + uint_t len; + int vecnum; + zfs_iocparm_t *zp; + zfs_cmd_t *zc; + zfs_cmd_legacy_t *zcl; + int rc, error; + void *uaddr; + + len = IOCPARM_LEN(zcmd); + vecnum = zcmd & 0xff; + zp = (void *)arg; + uaddr = (void *)zp->zfs_cmd; + error = 0; + zcl = NULL; + + if (len != sizeof (zfs_iocparm_t)) { + printf("len %d vecnum: %d sizeof (zfs_cmd_t) %ju\n", + len, vecnum, (uintmax_t)sizeof (zfs_cmd_t)); + return (EINVAL); + } + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + /* + * Remap ioctl code for legacy user binaries + */ + if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) { + vecnum = zfs_ioctl_legacy_to_ozfs(vecnum); + if (vecnum < 0) { + kmem_free(zc, sizeof (zfs_cmd_t)); + return (ENOTSUP); + } + zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); + if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + zfs_cmd_legacy_to_ozfs(zcl, zc); + } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + error = zfsdev_ioctl_common(vecnum, zc, 0); + if (zcl) { + zfs_cmd_ozfs_to_legacy(zc, zcl); + rc = copyout(zcl, uaddr, sizeof (*zcl)); + } else { + rc = copyout(zc, uaddr, sizeof (*zc)); + } + if (error == 0 && rc != 0) + error = SET_ERROR(EFAULT); +out: + if (zcl) + kmem_free(zcl, sizeof (zfs_cmd_legacy_t)); + kmem_free(zc, sizeof (zfs_cmd_t)); + MPASS(tsd_get(rrw_tsd_key) == NULL); + return (error); +} + +static void +zfsdev_close(void *data) +{ + zfsdev_state_destroy(data); +} + +void +zfsdev_private_set_state(void *priv __unused, zfsdev_state_t *zs) +{ + devfs_set_cdevpriv(zs, zfsdev_close); +} + +zfsdev_state_t * +zfsdev_private_get_state(void *priv) +{ + return (priv); +} + +static int +zfsdev_open(struct cdev *devp __unused, int flag __unused, int mode __unused, + struct thread *td __unused) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfsdev_state_init(NULL); + mutex_exit(&zfsdev_state_lock); + + return (error); +} + +static struct cdevsw zfs_cdevsw = { + 
.d_version = D_VERSION, + .d_open = zfsdev_open, + .d_ioctl = zfsdev_ioctl, + .d_name = ZFS_DRIVER +}; + +int +zfsdev_attach(void) +{ + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, + ZFS_DRIVER); + return (0); +} + +void +zfsdev_detach(void) +{ + if (zfsdev != NULL) + destroy_dev(zfsdev); +} + +int +zfs__init(void) +{ + int error; + +#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES + printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " + "overflow panic!\nPlease consider adding " + "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, + ZFS_MIN_KSTACK_PAGES); +#endif + zfs_root_token = root_mount_hold("ZFS"); + if ((error = zfs_kmod_init()) != 0) { + printf("ZFS: Failed to Load ZFS Filesystem" + ", rc = %d\n", error); + root_mount_rel(zfs_root_token); + return (error); + } + + + tsd_create(&zfs_geom_probe_vdev_key, NULL); + + printf("ZFS storage pool version: features support (" + SPA_VERSION_STRING ")\n"); + root_mount_rel(zfs_root_token); + ddi_sysevent_init(); + return (0); +} + +int +zfs__fini(void) +{ + if (zfs_busy() || zvol_busy() || + zio_injection_enabled) { + return (EBUSY); + } + zfs_kmod_fini(); + tsd_destroy(&zfs_geom_probe_vdev_key); + return (0); +} + +static void +zfs_shutdown(void *arg __unused, int howto __unused) +{ + + /* + * ZFS fini routines can not properly work in a panic-ed system. 
+ */ + if (panicstr == NULL) + zfs__fini(); +} + +static int +zfs_modevent(module_t mod, int type, void *unused __unused) +{ + int err; + + switch (type) { + case MOD_LOAD: + err = zfs__init(); + if (err == 0) + zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( + shutdown_post_sync, zfs_shutdown, NULL, + SHUTDOWN_PRI_FIRST); + return (err); + case MOD_UNLOAD: + err = zfs__fini(); + if (err == 0 && zfs_shutdown_event_tag != NULL) + EVENTHANDLER_DEREGISTER(shutdown_post_sync, + zfs_shutdown_event_tag); + return (err); + case MOD_SHUTDOWN: + return (0); + default: + break; + } + return (EOPNOTSUPP); +} + +static moduledata_t zfs_mod = { + "zfsctrl", + zfs_modevent, + 0 +}; + +#ifdef _KERNEL +EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); +#endif + +DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); +MODULE_VERSION(zfsctrl, 1); +#if __FreeBSD_version > 1300092 +MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1); +#else +MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); +#endif +MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); +MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1); +MODULE_DEPEND(zfsctrl, zlib, 1, 1, 1); diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c new file mode 100644 index 0000000000..070e7a5b9f --- /dev/null +++ b/module/os/freebsd/zfs/spa_os.c @@ -0,0 +1,272 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska . All rights reserved. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +static nvlist_t * +spa_generate_rootconf(const char *name) +{ + nvlist_t **configs, **tops; + nvlist_t *config; + nvlist_t *best_cfg, *nvtop, *nvroot; + uint64_t *holes; + uint64_t best_txg; + uint64_t nchildren; + uint64_t pgid; + uint64_t count; + uint64_t i; + uint_t nholes; + + if (vdev_geom_read_pool_label(name, &configs, &count) != 0) + return (NULL); + + ASSERT3U(count, !=, 0); + best_txg = 0; + for (i = 0; i < count; i++) { + uint64_t txg; + + txg = fnvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG); + if (txg > best_txg) { + best_txg = txg; + best_cfg = configs[i]; + } + } + + nchildren = 1; + nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); + holes = NULL; + nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, + &holes, &nholes); + + tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP); + for (i = 0; i < nchildren; i++) { + if (i >= count) + break; + if (configs[i] == NULL) + continue; + nvtop = fnvlist_lookup_nvlist(configs[i], + ZPOOL_CONFIG_VDEV_TREE); + tops[i] = fnvlist_dup(nvtop); + } + 
for (i = 0; holes != NULL && i < nholes; i++) { + if (i >= nchildren) + continue; + if (tops[holes[i]] != NULL) + continue; + tops[holes[i]] = fnvlist_alloc(); + fnvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE); + fnvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, holes[i]); + fnvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 0); + } + for (i = 0; i < nchildren; i++) { + if (tops[i] != NULL) + continue; + tops[i] = fnvlist_alloc(); + fnvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING); + fnvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, i); + fnvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 0); + } + + /* + * Create pool config based on the best vdev config. + */ + config = fnvlist_dup(best_cfg); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + pgid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID); + nvroot = fnvlist_alloc(); + fnvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); + fnvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL); + fnvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, tops, + nchildren); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); + + /* + * Drop vdev config elements that should not be present at pool level. 
+ */ + fnvlist_remove(config, ZPOOL_CONFIG_GUID); + fnvlist_remove(config, ZPOOL_CONFIG_TOP_GUID); + + for (i = 0; i < count; i++) + fnvlist_free(configs[i]); + kmem_free(configs, count * sizeof (void *)); + for (i = 0; i < nchildren; i++) + fnvlist_free(tops[i]); + kmem_free(tops, nchildren * sizeof (void *)); + fnvlist_free(nvroot); + return (config); +} + +int +spa_import_rootpool(const char *name, bool checkpointrewind) +{ + spa_t *spa; + vdev_t *rvd; + nvlist_t *config, *nvtop; + uint64_t txg; + char *pname; + int error; + + /* + * Read the label from the boot device and generate a configuration. + */ + config = spa_generate_rootconf(name); + + mutex_enter(&spa_namespace_lock); + if (config != NULL) { + pname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); + VERIFY0(strcmp(name, pname)); + txg = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG); + + if ((spa = spa_lookup(pname)) != NULL) { + /* + * The pool could already be imported, + * e.g., after reboot -r. + */ + if (spa->spa_state == POOL_STATE_ACTIVE) { + mutex_exit(&spa_namespace_lock); + fnvlist_free(config); + return (0); + } + + /* + * Remove the existing root pool from the namespace so + * that we can replace it with the correct config + * we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); + + /* + * Set spa_ubsync.ub_version as it can be used in vdev_alloc() + * via spa_version(). 
+ */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + } else if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + fnvlist_free(config); + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } else { + config = fnvlist_dup(spa->spa_config); + } + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + if (checkpointrewind) { + spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT; + } + + /* + * Build up a vdev tree based on the boot device's label config. + */ + nvtop = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + fnvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + fnvlist_free(config); + return (0); +} + +const char * +spa_history_zone(void) +{ + return ("freebsd"); +} diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c new file mode 100644 index 0000000000..5315b60982 --- /dev/null +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + + +/* BEGIN CSTYLED */ +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, 
"ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); + +SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, + "ZFS livelist condense"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, + "ZFS VDEV mirror"); + +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, + (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); + +extern arc_state_t 
ARC_anon; +extern arc_state_t ARC_mru; +extern arc_state_t ARC_mru_ghost; +extern arc_state_t ARC_mfu; +extern arc_state_t ARC_mfu_ghost; +extern arc_state_t ARC_l2c_only; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ + +/* arc.c */ + +int +param_set_arc_max(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_max; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min || + val >= arc_all_memory())) + return (SET_ERROR(EINVAL)); + + zfs_arc_max = val; + arc_tuning_update(B_TRUE); + + /* Update the sysctl to the tuned value */ + if (val != 0) + zfs_arc_max = arc_c_max; + + return (0); +} + +int +param_set_arc_min(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_min; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max)) + return (SET_ERROR(EINVAL)); + + zfs_arc_min = val; + arc_tuning_update(B_TRUE); + + /* Update the sysctl to the tuned value */ + if (val != 0) + zfs_arc_min = arc_c_min; + + return (0); +} + +/* legacy compat */ +extern uint64_t l2arc_write_max; /* def max write size */ +extern uint64_t l2arc_write_boost; /* extra warmup write */ +extern uint64_t l2arc_headroom; /* # of dev writes */ +extern uint64_t l2arc_headroom_boost; +extern uint64_t l2arc_feed_secs; /* interval seconds */ +extern uint64_t l2arc_feed_min_ms; /* min interval msecs */ +extern int l2arc_noprefetch; /* don't cache prefetch bufs */ +extern int l2arc_feed_again; /* turbo warmup */ +extern int l2arc_norw; /* no reads during writes */ + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, + &l2arc_write_max, 0, "max write size (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, + &l2arc_write_boost, 0, "extra write 
during warmup (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, + &l2arc_headroom, 0, "number of dev writes (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, + &l2arc_feed_secs, 0, "interval seconds (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, + &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, + &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, + &l2arc_feed_again, 0, "turbo warmup (LEGACY)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, + &l2arc_norw, 0, "no reads during writes (LEGACY)"); +#if 0 +extern int zfs_compressed_arc_enabled; +SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW, + &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)"); +#endif + +/* read-only views of the per-state ARC size counters */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, + &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in anonymous state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, + &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, + &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, +
&ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru ghost state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, + &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu ghost state"); + +/* Exports ARC_l2c_only, so the description must name that state. */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, + &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state"); + +/* + * Sysctl handler for arc_no_grow_shift. Works on a local copy and only + * stores it when 0 <= val < arc_shrink_shift; otherwise returns EINVAL. + */ +static int +sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) +{ + int err, val; + + val = arc_no_grow_shift; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val >= arc_shrink_shift) + return (EINVAL); + + arc_no_grow_shift = val; + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, sizeof (int), + sysctl_vfs_zfs_arc_no_grow_shift, "I", + "log2(fraction of ARC which must be free to allow growing)"); + +/* + * Sysctl handler for long-sized ARC tunables: updates *arg1 through + * sysctl_handle_long() and, on a successful write, re-runs + * arc_tuning_update(). + */ +int +param_set_arc_long(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_long(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + 
arc_tuning_update(B_TRUE); + + return (0); +} +/* + * Sysctl handler for int-sized ARC tunables: updates *arg1 through + * sysctl_handle_int() and, on a successful write, re-runs + * arc_tuning_update(). + */ +int +param_set_arc_int(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_int(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + arc_tuning_update(B_TRUE); + + return (0); +} +/* Legacy arc_min / arc_max nodes, routed through the validating handlers. */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + &zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_min, "LU", + "min arc size (LEGACY)"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + &zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_max, "LU", + "max arc size (LEGACY)"); + +/* dbuf.c */ + + +/* dmu.c */ + +/* dmu_zfetch.c */ +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); + +/* max bytes to prefetch per stream (default 8MB) */ +extern uint32_t zfetch_max_distance; +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, + &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); + +/* max bytes to prefetch indirects for per stream (default 64MB) */ +extern uint32_t zfetch_max_idistance; +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, + &zfetch_max_idistance, 0, + "Max bytes to prefetch indirects for per stream (LEGACY)"); + +/* dsl_pool.c */ + +/* dnode.c */ +extern int zfs_default_bs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, + &zfs_default_bs, 0, "Default dnode block shift"); + +extern int zfs_default_ibs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, + &zfs_default_ibs, 0, "Default dnode indirect block shift"); + + +/* dsl_scan.c */ + +/* metaslab.c */ + +/* + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. 
So a sane default for the space map block size + * is 8~16K. + */ +extern int zfs_metaslab_sm_blksz_no_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_no_log, 0, + "Block size for space map in pools with log space map disabled. " + "Power of 2 and greater than 4096."); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +extern int zfs_metaslab_sm_blksz_with_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_with_log, 0, + "Block size for space map in pools with log space map enabled. " + "Power of 2 and greater than 4096."); + +/* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. + */ +extern int zfs_condense_pct; +SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, + &zfs_condense_pct, 0, + "Condense on-disk spacemap when it is more than this many percents" + " of in-memory counterpart"); +/* Device-removal knobs, exported with CTLFLAG_RWTUN like the ones above. */ +extern int zfs_remove_max_segment; +SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, + &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to" + " allocate when removing a device"); + +extern int zfs_removal_suspend_progress; +SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, + &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while" + " in the middle of a removal"); + + +/* + * Minimum size which forces the dynamic allocator to change + * its allocation strategy. 
Once the space map cannot satisfy + * an allocation of this size then it switches to using a more + * aggressive strategy (i.e. search by size rather than offset). + * NOTE(review): the variable is declared uint64_t but is exported with + * SYSCTL_QUAD rather than SYSCTL_UQUAD -- confirm the signedness is intended. + */ +extern uint64_t metaslab_df_alloc_threshold; +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, + &metaslab_df_alloc_threshold, 0, + "Minimum size which forces the dynamic allocator to change it's allocation strategy"); + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space map's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ +extern int metaslab_df_free_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, + &metaslab_df_free_pct, 0, + "The minimum free space, in percent, which must be available in a " + "space map to continue allocations in a first-fit fashion"); + +/* + * Percentage of all cpus that can be used by the metaslab taskq. + */ +extern int metaslab_load_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, + &metaslab_load_pct, 0, + "Percentage of cpus that can be used by the metaslab taskq"); + +/* + * Max number of metaslabs per group to preload. 
+ */ +extern int metaslab_preload_limit; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, + &metaslab_preload_limit, 0, + "Max number of metaslabs per group to preload"); + +/* spa.c */ +extern int zfs_ccw_retry_interval; +SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, + &zfs_ccw_retry_interval, 0, + "Configuration cache file write, retry after failure, interval (seconds)"); + +extern uint64_t zfs_max_missing_tvds_cachefile; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_cachefile, 0, + "allow importing pools with missing top-level vdevs in cache file"); + +extern uint64_t zfs_max_missing_tvds_scan; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_scan, 0, + "allow importing pools with missing top-level vdevs during scan"); + +/* spa_misc.c */ +extern int zfs_flags; +static int +sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) +{ + int err, val; + /* Work on a copy so a failed or read-only request leaves zfs_flags as is. */ + val = zfs_flags; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + /* + * ZFS_DEBUG_MODIFY must be enabled prior to boot so all + * arc buffers in the system have the necessary additional + * checksum data. However, it is safe to disable at any + * time. 
+ */ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + val &= ~ZFS_DEBUG_MODIFY; + zfs_flags = val; + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, + sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); + +/* + * Sysctl handler for zfs_deadman_synctime_ms: stores the new value and + * passes it, converted to nanoseconds, to spa_set_deadman_synctime(). + */ +int +param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_deadman_synctime_ms; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + zfs_deadman_synctime_ms = val; + + spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); + + return (0); +} + +/* + * Sysctl handler for zfs_deadman_ziotime_ms: stores the new value and + * passes it, converted to nanoseconds, to spa_set_deadman_ziotime(). + */ +int +param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_deadman_ziotime_ms; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + zfs_deadman_ziotime_ms = val; + + /* Propagate the zio timeout that was just set, not the sync timeout. */ + spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms)); + + return (0); +} + +/* + * Sysctl handler for zfs_deadman_failmode. Accepts "wait", "continue" or + * "panic"; any string is then handed to param_set_deadman_failmode_common() + * and its negated result is returned. + */ +int +param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) +{ + char buf[16]; + int rc; + + /* + * Always seed buf with the current mode: sysctl_handle_string() reads + * the buffer it is given before copying in a new value, so it must + * never see uninitialised stack contents. + */ + strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); + + rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (rc || req->newptr == NULL) + return (rc); + if (strcmp(buf, zfs_deadman_failmode) == 0) + return (0); + if (!strcmp(buf, "wait")) + zfs_deadman_failmode = "wait"; + if (!strcmp(buf, "continue")) + zfs_deadman_failmode = "continue"; + if (!strcmp(buf, "panic")) + zfs_deadman_failmode = "panic"; + + return (-param_set_deadman_failmode_common(buf)); +} + + +/* spacemap.c */ +extern int space_map_ibs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, + &space_map_ibs, 0, "Space map indirect block shift"); + + +/* vdev.c */ +/* + * Sysctl handler for zfs_vdev_min_auto_ashift: the new value must lie + * between ASHIFT_MIN and the current zfs_vdev_max_auto_ashift. + */ +int +param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_vdev_min_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val < ASHIFT_MIN 
|| val > zfs_vdev_max_auto_ashift) + return (SET_ERROR(EINVAL)); + + zfs_vdev_min_auto_ashift = val; + + return (0); +} +/* + * Sysctl handler for zfs_vdev_max_auto_ashift: the new value must not + * exceed ASHIFT_MAX nor fall below the current zfs_vdev_min_auto_ashift; + * otherwise EINVAL is returned and the setting is left unchanged. + */ +int +param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_vdev_max_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) + return (SET_ERROR(EINVAL)); + + zfs_vdev_max_auto_ashift = val; + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), + param_set_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdev. (LEGACY)"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), + param_set_max_auto_ashift, "QU", + "Max ashift used when optimizing for logical -> physical sector size on " + "new top-level vdevs. (LEGACY)"); + +/* + * Since the DTL space map of a vdev is not expected to have a lot of + * entries, we default its block size to 4K. + */ +extern int zfs_vdev_dtl_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_dtl_sm_blksz, 0, + "Block size for DTL space map. Power of 2 and greater than 4096."); + +/* + * vdev-wide space maps that have lots of entries written to them at + * the end of each transaction can benefit from a higher I/O bandwidth + * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. + */ +extern int zfs_vdev_standard_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_standard_sm_blksz, 0, + "Block size for standard space map. 
Power of 2 and greater than 4096."); + +extern int vdev_validate_skip; +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, + &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); + + +/* vdev_cache.c */ + +/* vdev_mirror.c */ +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * non_rotating_seek_inc to 0 may well provide better results as it + * will direct more reads to the non-rotating vdevs which are more + * likely to have a higher performance. + */ + + +/* vdev_queue.c */ +/* + * Helper macros that declare zfs_vdev_<name>_{min,max}_active and export + * it as a read-write tunable under _vfs_zfs_vdev. + * NOTE(review): no invocation of either macro is visible nearby -- confirm + * whether they are still needed. + */ +#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ +extern uint32_t zfs_vdev_ ## name ## _min_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ + &zfs_vdev_ ## name ## _min_active, 0, \ + "Initial number of I/O requests of type " #name \ + " active for each device"); + +#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ +extern uint32_t zfs_vdev_ ## name ## _max_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \ + &zfs_vdev_ ## name ## _max_active, 0, \ + "Maximum number of I/O requests of type " #name \ + " active for each device"); + + +/* Undefine the macros that were actually defined above. */ +#undef ZFS_VDEV_QUEUE_KNOB_MIN +#undef ZFS_VDEV_QUEUE_KNOB_MAX + +extern uint32_t zfs_vdev_max_active; +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, + &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device. 
(LEGACY)"); + +extern int zfs_vdev_def_queue_depth; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, + &zfs_vdev_def_queue_depth, 0, + "Default queue depth for each allocator"); + +/*extern uint64_t zfs_multihost_history; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN, + &zfs_multihost_history, 0, + "Historical statistics for the last N multihost updates");*/ + +#ifdef notyet +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW, + &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); +#endif + + +/* zio.c */ +#if defined(__LP64__) +int zio_use_uma = 1; +#else +int zio_use_uma = 0; +#endif + +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, + "Use uma(9) for ZIO allocations"); +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, + "Exclude metadata buffers from dumps as well"); +/* + * Sysctl handler for an int "slop shift" stored at arg1: works on a local + * copy and only writes it back when 1 <= val <= 31, else returns EINVAL. + */ +int +param_set_slop_shift(SYSCTL_HANDLER_ARGS) +{ + int val; + int err; + + val = *(int *)arg1; + + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 1 || val > 31) + return (EINVAL); + + *(int *)arg1 = val; + + return (0); +} +/* + * Sysctl handler for the multihost interval: updates *arg1 in place via + * sysctl_handle_long() and, unless spa_mode_global is still + * SPA_MODE_UNINIT, calls mmp_signal_all_threads(). + */ +int +param_set_multihost_interval(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_long(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (spa_mode_global != SPA_MODE_UNINIT) + mmp_signal_all_threads(); + + return (0); +} diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c new file mode 100644 index 0000000000..fc04a74761 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_file.c @@ -0,0 +1,355 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for files. + */ + +static taskq_t *vdev_file_taskq; + +unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; +unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; +/* Create the "z_vdev_file" taskq stored in vdev_file_taskq. */ +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), + minclsyspri, max_ncpus, INT_MAX, 0); +} +/* Destroy the taskq created by vdev_file_init(). */ +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} +/* Hold and release only assert that the vdev has a path. */ +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT3P(vd->vdev_path, !=, NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT3P(vd->vdev_path, !=, NULL); +} +/* + * Map the SPA_MODE_READ / SPA_MODE_WRITE bits of spa_mode to O_RDONLY, + * O_WRONLY or O_RDWR; O_LARGEFILE is always OR-ed into the result. + */ +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} +/* + * Open the file behind vd (or, when vdev_tsd is already set, just re-stat + * it) and report its size and the configured ashift values through the + * four out parameters. Returns 0 or an errno. + */ +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_file_t *vf; + zfs_file_t *fp; + zfs_file_attr_t zfa; + int 
error; + + /* + * Rotational optimizations only make sense on block devices. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + /* vdev_tsd is set before the open, so vf stays attached if the open fails. */ + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT3P(vd->vdev_path, !=, NULL); + ASSERT(vd->vdev_path[0] == '/'); + /* The open flags are derived from the pool's spa_mode(). */ + error = zfs_file_open(vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_file = fp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. 
+ */ + if (zfs_file_getattr(fp, &zfa)) { /* NOTE(review): vs_aux is not set on this path, unlike the other failures -- confirm. */ + return (SET_ERROR(ENODEV)); + } + if (!S_ISREG(zfa.zfa_mode)) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (SET_ERROR(ENODEV)); + } +#endif + +skip_open: + /* Reached on a fresh open and on a reopen: refresh the size from the file. */ + error = zfs_file_getattr(vf->vf_file, &zfa); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *max_psize = *psize = zfa.zfa_size; + *logical_ashift = vdev_file_logical_ashift; + *physical_ashift = vdev_file_physical_ashift; + + return (0); +} +/* + * Close the backing file and free the vdev_file_t hanging off vdev_tsd. + * Does nothing while the vdev is reopening or when vdev_tsd is NULL. + */ +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + /* vf_file is NULL when the open itself failed after vf was allocated. */ + if (vf->vf_file != NULL) { + zfs_file_close(vf->vf_file); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. For consistency, the code structure mimics disk vdev + * types. 
+ */ +static void +vdev_file_io_intr(zio_t *zio) +{ + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf; + void *buf; + ssize_t resid; + loff_t off; + ssize_t size; + int err; + + off = zio->io_offset; + size = zio->io_size; + resid = 0; + + vf = vd->vdev_tsd; + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + if (zio->io_type == ZIO_TYPE_READ) { + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); + abd_return_buf_copy(zio->io_abd, buf, size); + } else { + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + abd_return_buf(zio->io_abd, buf, size); + } + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + + vdev_file_io_intr(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC|O_DSYNC); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { +#ifdef notyet + int mode = 0; + + ASSERT3U(zio->io_size, !=, 0); + + /* XXX FreeBSD has no fallocate routine in file ops */ + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); +#endif + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, 0); +} + +/* ARGSUSED */ +static void 
+vdev_file_io_done(zio_t *zio) +{ +} +/* Operations vector for VDEV_TYPE_FILE; unset callbacks are explicitly NULL. */ +vdev_ops_t vdev_file_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. + * The table below is the same as vdev_file_ops except for vdev_op_type. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +#endif + +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW, + "Logical ashift for file-based devices"); +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW, + "Physical ashift for file-based devices"); diff --git 
a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c new file mode 100644 index 0000000000..2ef4811a8a --- /dev/null +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -0,0 +1,1324 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Portions Copyright (c) 2012 Martin Matuska + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef g_topology_locked +#define g_topology_locked() sx_xlocked(&topology_lock) +#endif + +/* + * Virtual device vector for GEOM. 
+ */ + +static g_attrchanged_t vdev_geom_attrchanged; +struct g_class zfs_vdev_class = { + .name = "ZFS::VDEV", + .version = G_VERSION, + .attrchanged = vdev_geom_attrchanged, +}; +/* + * List entry tying one vdev to a consumer. The list head type declared + * with SLIST_HEAD below is consumer_priv_t; the _Static_assert checks that + * a pointer to it is the same size as g_consumer.private. + */ +struct consumer_vdev_elem { + SLIST_ENTRY(consumer_vdev_elem) elems; + vdev_t *vd; +}; + +SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); +/* BEGIN CSTYLED */ +_Static_assert(sizeof (((struct g_consumer *)NULL)->private) + == sizeof (struct consumer_priv_t*), + "consumer_priv_t* can't be stored in g_consumer.private"); + +DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); + +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ +static int vdev_geom_bio_flush_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); +/* END CSTYLED */ + +/* Declare local functions */ +static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); + +/* + * Thread local storage used to indicate when a thread is probing geoms + * for their guids. If NULL, this thread is not tasting geoms. If non NULL, + * it is looking for a replacement for the vdev_t* that is its value. 
+ */ +uint_t zfs_geom_probe_vdev_key; +/* + * Query "GEOM::physpath" from the consumer and, on success, replace + * vd->vdev_physpath with a copy of it. A config update is requested when + * the path changed, or, if there was no previous path, when do_null_update + * is set. + */ +static void +vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, + boolean_t do_null_update) +{ + boolean_t needs_update = B_FALSE; + char *physpath; + int error, physpath_len; + + physpath_len = MAXPATHLEN; + physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); + error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); + if (error == 0) { + char *old_physpath; + + /* g_topology lock ensures that vdev has not been closed */ + g_topology_assert(); + old_physpath = vd->vdev_physpath; + vd->vdev_physpath = spa_strdup(physpath); + + if (old_physpath != NULL) { + needs_update = (strcmp(old_physpath, + vd->vdev_physpath) != 0); + spa_strfree(old_physpath); + } else + needs_update = do_null_update; + } + g_free(physpath); + + /* + * If the physical path changed, update the config. + * Only request an update for previously unset physpaths if + * requested by the caller. + */ + if (needs_update) + spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); + +} +/* + * GEOM attrchanged callback. For "GEOM::physpath" it refreshes the + * physpath of the first vdev on the consumer's list and returns; other + * attributes are ignored. + */ +static void +vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vdev_t *vd = elem->vd; + if (strcmp(attr, "GEOM::physpath") == 0) { + vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); + return; + } + } +} +/* + * GEOM resize callback: for every healthy vdev on the consumer whose pool + * has spa_autoexpand set, call vdev_online() with ZFS_ONLINE_EXPAND. + */ +static void +vdev_geom_resize(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + spa_t *spa; + vdev_t *vd; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vd = elem->vd; + if (vd->vdev_state != VDEV_STATE_HEALTHY) + continue; + spa = vd->vdev_spa; + if (!spa->spa_autoexpand) + continue; + vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); + } +} + +static void +vdev_geom_orphan(struct g_consumer *cp) +{ + struct 
consumer_priv_t *priv; + // cppcheck-suppress uninitvar + struct consumer_vdev_elem *elem; + + g_topology_assert(); + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + /* Vdev close in progress. Ignore the event. */ + return; + + /* + * Orphan callbacks occur from the GEOM event thread. + * Concurrent with this call, new I/O requests may be + * working their way through GEOM about to find out + * (only once executed by the g_down thread) that we've + * been orphaned from our disk provider. These I/Os + * must be retired before we can detach our consumer. + * This is most easily achieved by acquiring the + * SPA ZIO configuration lock as a writer, but doing + * so with the GEOM topology lock held would cause + * a lock order reversal. Instead, rely on the SPA's + * async removal support to invoke a close on this + * vdev once it is safe to do so. + */ + SLIST_FOREACH(elem, priv, elems) { + // cppcheck-suppress uninitvar + vdev_t *vd = elem->vd; + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + } +} +/* + * Attach to provider pp and return the consumer, or NULL on failure. + * With sanity set, providers whose sector size exceeds VDEV_PAD_SIZE or is + * not a power of two, or whose media size is below SPA_MINDEVSIZE, are + * refused. When vd is non-NULL the consumer is stored in vd->vdev_tsd. + */ +static struct g_consumer * +vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + g_topology_assert(); + + ZFS_LOG(1, "Attaching to %s.", pp->name); + + if (sanity) { + if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible sectorsize %d\n", + pp->name, pp->sectorsize); + return (NULL); + } else if (pp->mediasize < SPA_MINDEVSIZE) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible mediasize %ju\n", + pp->name, pp->mediasize); + return (NULL); + } + } + + /* Do we have geom already? No? Create one. 
*/ + LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + if (strcmp(gp->name, "zfs::vdev") != 0) + continue; + break; + } + if (gp == NULL) { + gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); + gp->orphan = vdev_geom_orphan; + gp->attrchanged = vdev_geom_attrchanged; + gp->resize = vdev_geom_resize; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); /* deltas: read +1, write 0, exclusive +1 */ + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); + } else { + /* Check if we are already connected to this provider. */ + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (cp->provider == pp) { + ZFS_LOG(1, "Found consumer for %s.", pp->name); + break; + } + } + if (cp == NULL) { + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created consumer for %s.", pp->name); + } else { + /* Reusing a consumer: on failure it is left attached, not detached. */ + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + return (NULL); + } + ZFS_LOG(1, "Used existing consumer for %s.", pp->name); + } + } + /* Record the consumer on the vdev when the caller supplied one. */ + if (vd != NULL) + vd->vdev_tsd = cp; + + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + return (cp); +} +/* + * Undo vdev_geom_attach() for cp. When open_for_read is set, one read and + * one exclusive access count are dropped first; the consumer is destroyed + * once its read and exclusive counts are zero, and the geom is withered + * when no consumers remain. + */ +static void +vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) +{ + struct g_geom *gp; + + g_topology_assert(); + + ZFS_LOG(1, "Detaching from %s.", + cp->provider && 
cp->provider->name ? cp->provider->name : "NULL"); + + gp = cp->geom; + if (open_for_read) + g_access(cp, -1, 0, -1); + /* Destroy consumer on last close. */ + if (cp->acr == 0 && cp->ace == 0) { + if (cp->acw > 0) + g_access(cp, 0, -cp->acw, 0); + if (cp->provider != NULL) { + ZFS_LOG(1, "Destroying consumer for %s.", + cp->provider->name ? cp->provider->name : "NULL"); + g_detach(cp); + } + g_destroy_consumer(cp); + } + /* Destroy geom if there are no consumers left. */ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(gp, ENXIO); + } +} + +static void +vdev_geom_close_locked(vdev_t *vd) +{ + struct g_consumer *cp; + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem, *elem_temp; + + g_topology_assert(); + + cp = vd->vdev_tsd; + vd->vdev_delayed_close = B_FALSE; + if (cp == NULL) + return; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); + priv = (struct consumer_priv_t *)&cp->private; + vd->vdev_tsd = NULL; + SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { + if (elem->vd == vd) { + SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); + g_free(elem); + } + } + + vdev_geom_detach(cp, B_TRUE); +} + +/* + * Issue one or more bios to the vdev in parallel + * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO + * operation is described by parallel entries from each array. 
There may be + * more bios actually issued than entries in the array + */ +static void +vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, + off_t *sizes, int *errors, int ncmds) +{ + struct bio **bios; + uint8_t *p; + off_t off, maxio, s, end; + int i, n_bios, j; + size_t bios_size; + +#if __FreeBSD_version > 1300130 + maxio = maxphys - (maxphys % cp->provider->sectorsize); +#else + maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); +#endif + n_bios = 0; + + /* How many bios are required for all commands ? */ + for (i = 0; i < ncmds; i++) + n_bios += (sizes[i] + maxio - 1) / maxio; + + /* Allocate memory for the bios */ + bios_size = n_bios * sizeof (struct bio *); + bios = kmem_zalloc(bios_size, KM_SLEEP); + + /* Prepare and issue all of the bios */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + p = datas[i]; + s = sizes[i]; + end = off + s; + ASSERT0(off % cp->provider->sectorsize); + ASSERT0(s % cp->provider->sectorsize); + + for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { + bios[j] = g_alloc_bio(); + bios[j]->bio_cmd = cmds[i]; + bios[j]->bio_done = NULL; + bios[j]->bio_offset = off; + bios[j]->bio_length = MIN(s, maxio); + bios[j]->bio_data = (caddr_t)p; + g_io_request(bios[j], cp); + } + } + ASSERT3S(j, ==, n_bios); + + /* Wait for all of the bios to complete, and clean them up */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + s = sizes[i]; + end = off + s; + + for (; off < end; off += maxio, s -= maxio, j++) { + errors[i] = biowait(bios[j], "vdev_geom_io") || + errors[i]; + g_destroy_bio(bios[j]); + } + } + kmem_free(bios, bios_size); +} + +/* + * Read the vdev config from a device. Return the number of valid labels that + * were found. The vdev config will be returned in config if and only if at + * least one valid label was found. 
+ */ +static int +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) +{ + struct g_provider *pp; + nvlist_t *config; + vdev_phys_t *vdev_lists[VDEV_LABELS]; + char *buf; + size_t buflen; + uint64_t psize, state, txg; + off_t offsets[VDEV_LABELS]; + off_t size; + off_t sizes[VDEV_LABELS]; + int cmds[VDEV_LABELS]; + int errors[VDEV_LABELS]; + int l, nlabels; + + g_topology_assert_not(); + + pp = cp->provider; + ZFS_LOG(1, "Reading config from %s...", pp->name); + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + + size = sizeof (*vdev_lists[0]) + pp->sectorsize - + ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; + + buflen = sizeof (vdev_lists[0]->vp_nvlist); + + /* Create all of the IO requests */ + for (l = 0; l < VDEV_LABELS; l++) { + cmds[l] = BIO_READ; + vdev_lists[l] = kmem_alloc(size, KM_SLEEP); + offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; + sizes[l] = size; + errors[l] = 0; + ASSERT0(offsets[l] % pp->sectorsize); + } + + /* Issue the IO requests */ + vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, + VDEV_LABELS); + + /* Parse the labels */ + config = *configp = NULL; + nlabels = 0; + for (l = 0; l < VDEV_LABELS; l++) { + if (errors[l] != 0) + continue; + + buf = vdev_lists[l]->vp_nvlist; + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(config); + continue; + } + + if (state != POOL_STATE_SPARE && + state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(config); + continue; + } + + if (*configp != NULL) + nvlist_free(*configp); + *configp = config; + nlabels++; + } + + /* Free the label storage */ + for (l = 0; l < VDEV_LABELS; l++) + kmem_free(vdev_lists[l], size); + + return (nlabels); +} + +static void +resize_configs(nvlist_t ***configs, uint64_t *count, 
uint64_t id) +{ + nvlist_t **new_configs; + uint64_t i; + + if (id < *count) + return; + new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), + KM_SLEEP); + for (i = 0; i < *count; i++) + new_configs[i] = (*configs)[i]; + if (*configs != NULL) + kmem_free(*configs, *count * sizeof (void *)); + *configs = new_configs; + *count = id + 1; +} + +static void +process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, + const char *name, uint64_t *known_pool_guid) +{ + nvlist_t *vdev_tree; + uint64_t pool_guid; + uint64_t vdev_guid; + uint64_t id, txg, known_txg; + char *pname; + + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + strcmp(pname, name) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) + goto ignore; + + if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + goto ignore; + + if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) + goto ignore; + + txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG); + + if (*known_pool_guid != 0) { + if (pool_guid != *known_pool_guid) + goto ignore; + } else + *known_pool_guid = pool_guid; + + resize_configs(configs, count, id); + + if ((*configs)[id] != NULL) { + known_txg = fnvlist_lookup_uint64((*configs)[id], + ZPOOL_CONFIG_POOL_TXG); + if (txg <= known_txg) + goto ignore; + nvlist_free((*configs)[id]); + } + + (*configs)[id] = cfg; + return; + +ignore: + nvlist_free(cfg); +} + +int +vdev_geom_read_pool_label(const char *name, + nvlist_t ***configs, uint64_t *count) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *zcp; + nvlist_t *vdev_cfg; + uint64_t pool_guid; + int nlabels; + + DROP_GIANT(); + g_topology_lock(); + + *configs = NULL; + *count = 0; + pool_guid = 0; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, 
&mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + zcp = vdev_geom_attach(pp, NULL, B_TRUE); + if (zcp == NULL) + continue; + g_topology_unlock(); + nlabels = vdev_geom_read_config(zcp, &vdev_cfg); + g_topology_lock(); + vdev_geom_detach(zcp, B_TRUE); + if (nlabels == 0) + continue; + ZFS_LOG(1, "successfully read vdev config"); + + process_vdev_config(configs, count, + vdev_cfg, name, &pool_guid); + } + } + } + g_topology_unlock(); + PICKUP_GIANT(); + + return (*count > 0 ? 0 : ENOENT); +} + +enum match { + NO_MATCH = 0, /* No matching labels found */ + TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ + ZERO_MATCH = 1, /* Should never be returned */ + ONE_MATCH = 2, /* 1 label matching the vdev_guid */ + TWO_MATCH = 3, /* 2 label matching the vdev_guid */ + THREE_MATCH = 4, /* 3 label matching the vdev_guid */ + FULL_MATCH = 5 /* all labels match the vdev_guid */ +}; + +static enum match +vdev_attach_ok(vdev_t *vd, struct g_provider *pp) +{ + nvlist_t *config; + uint64_t pool_guid, top_guid, vdev_guid; + struct g_consumer *cp; + int nlabels; + + cp = vdev_geom_attach(pp, NULL, B_TRUE); + if (cp == NULL) { + ZFS_LOG(1, "Unable to attach tasting instance to %s.", + pp->name); + return (NO_MATCH); + } + g_topology_unlock(); + nlabels = vdev_geom_read_config(cp, &config); + g_topology_lock(); + vdev_geom_detach(cp, B_TRUE); + if (nlabels == 0) { + ZFS_LOG(1, "Unable to read config from %s.", pp->name); + return (NO_MATCH); + } + + pool_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); + top_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); + vdev_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + nvlist_free(config); + + /* + * Check that the label's pool guid matches the desired guid. + * Inactive spares and L2ARCs do not have any pool guid in the label. 
+ */ + if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { + ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", + pp->name, + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); + return (NO_MATCH); + } + + /* + * Check that the label's vdev guid matches the desired guid. + * The second condition handles possible race on vdev detach, when + * remaining vdev receives GUID of destroyed top level mirror vdev. + */ + if (vdev_guid == vd->vdev_guid) { + ZFS_LOG(1, "guids match for provider %s.", pp->name); + return (ZERO_MATCH + nlabels); + } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { + ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); + return (TOPGUID_MATCH); + } + ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", + pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); + return (NO_MATCH); +} + +static struct g_consumer * +vdev_geom_attach_by_guids(vdev_t *vd) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp, *best_pp; + struct g_consumer *cp; + const char *vdpath; + enum match match, best_match; + + g_topology_assert(); + + vdpath = vd->vdev_path + sizeof ("/dev/") - 1; + cp = NULL; + best_pp = NULL; + best_match = NO_MATCH; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + match = vdev_attach_ok(vd, pp); + if (match > best_match) { + best_match = match; + best_pp = pp; + } else if (match == best_match) { + if (strcmp(pp->name, vdpath) == 0) { + best_pp = pp; + } + } + if (match == FULL_MATCH) + goto out; + } + } + } + +out: + if (best_pp) { + cp = vdev_geom_attach(best_pp, vd, B_TRUE); + if (cp == NULL) { + printf("ZFS WARNING: Unable to attach to %s.\n", + best_pp->name); + } + } + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_guids(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t 
len; + + g_topology_assert(); + + ZFS_LOG(1, "Searching by guids [%ju:%ju].", + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); + cp = vdev_geom_attach_by_guids(vd); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid, cp->provider->name); + } else { + ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid); + } + + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_path(vdev_t *vd, int check_guid) +{ + struct g_provider *pp; + struct g_consumer *cp; + + g_topology_assert(); + + cp = NULL; + pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); + if (pp != NULL) { + ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); + if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) + cp = vdev_geom_attach(pp, vd, B_FALSE); + } + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + struct g_provider *pp; + struct g_consumer *cp; + int error, has_trim; + uint16_t rate; + + /* + * Set the TLS to indicate downstack that we + * should not access zvols + */ + VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd)); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. 
+ */ + if ((cp = vd->vdev_tsd) != NULL) { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + + DROP_GIANT(); + g_topology_lock(); + error = 0; + + if (vd->vdev_spa->spa_is_splitting || + ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && + (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { + /* + * We are dealing with a vdev that hasn't been previously + * opened (since boot), and we are not loading an + * existing pool configuration. This looks like a + * vdev add operation to a new or existing pool. + * Assume the user really wants to do this, and find + * GEOM provider by its name, ignoring GUID mismatches. + * + * XXPOLICY: It would be safer to only allow a device + * that is unlabeled or labeled but missing + * GUID information to be opened in this fashion, + * unless we are doing a split, in which case we + * should allow any guid. + */ + cp = vdev_geom_open_by_path(vd, 0); + } else { + /* + * Try using the recorded path for this device, but only + * accept it if its label data contains the expected GUIDs. + */ + cp = vdev_geom_open_by_path(vd, 1); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the + * expected GUIDs. The disks might have merely + * moved around so try all other GEOM providers + * to find one with the right GUIDs. 
+ */ + cp = vdev_geom_open_by_guids(vd); + } + } + + /* Clear the TLS now that tasting is done */ + VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL)); + + if (cp == NULL) { + ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); + error = ENOENT; + } else { + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + int spamode; + + priv = (struct consumer_priv_t *)&cp->private; + if (cp->private == NULL) + SLIST_INIT(priv); + elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); + elem->vd = vd; + SLIST_INSERT_HEAD(priv, elem, elems); + + spamode = spa_mode(vd->vdev_spa); + if (cp->provider->sectorsize > VDEV_PAD_SIZE || + !ISP2(cp->provider->sectorsize)) { + ZFS_LOG(1, "Provider %s has unsupported sectorsize.", + cp->provider->name); + + vdev_geom_close_locked(vd); + error = EINVAL; + cp = NULL; + } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { + int i; + + for (i = 0; i < 5; i++) { + error = g_access(cp, 0, 1, 0); + if (error == 0) + break; + g_topology_unlock(); + tsleep(vd, 0, "vdev", hz / 2); + g_topology_lock(); + } + if (error != 0) { + printf("ZFS WARNING: Unable to open %s for " + "writing (error=%d).\n", + cp->provider->name, error); + vdev_geom_close_locked(vd); + cp = NULL; + } + } + } + + /* Fetch initial physical path information for this device. */ + if (cp != NULL) { + vdev_geom_attrchanged(cp, "GEOM::physpath"); + + /* Set other GEOM characteristics */ + vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); + } + + g_topology_unlock(); + PICKUP_GIANT(); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", + error); + return (error); + } +skip_open: + pp = cp->provider; + + /* + * Determine the actual size of the device. + */ + *max_psize = *psize = pp->mediasize; + + /* + * Determine the device's minimum transfer size and preferred + * transfer size. 
+ */ + *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = 0; + if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && + ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && + pp->stripeoffset == 0) + *physical_ashift = highbit(pp->stripesize) - 1; + + /* + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* Inform the ZIO pipeline that we are non-rotational. */ + error = g_getattr("GEOM::rotation_rate", cp, &rate); + if (error == 0 && rate == DISK_RR_NON_ROTATING) + vd->vdev_nonrot = B_TRUE; + else + vd->vdev_nonrot = B_FALSE; + + /* Set when device reports it supports TRIM. */ + error = g_getattr("GEOM::candelete", cp, &has_trim); + vd->vdev_has_trim = (error == 0 && has_trim); + + /* Set when device reports it supports secure TRIM. */ + /* unavailable on FreeBSD */ + vd->vdev_has_securetrim = B_FALSE; + + return (0); +} + +static void +vdev_geom_close(vdev_t *vd) +{ + struct g_consumer *cp; + boolean_t locked; + + cp = vd->vdev_tsd; + + DROP_GIANT(); + locked = g_topology_locked(); + if (!locked) + g_topology_lock(); + + if (!vd->vdev_reopening || + (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || + (cp->provider != NULL && cp->provider->error != 0)))) + vdev_geom_close_locked(vd); + + if (!locked) + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +vdev_geom_io_intr(struct bio *bp) +{ + vdev_t *vd; + zio_t *zio; + + zio = bp->bio_caller1; + vd = zio->io_vd; + zio->io_error = bp->bio_error; + if (zio->io_error == 0 && bp->bio_resid != 0) + zio->io_error = SET_ERROR(EIO); + + switch (zio->io_error) { + case ENOTSUP: + /* + * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know + * that future attempts will never succeed. In this case + * we set a persistent flag so that we don't bother with + * requests in the future. 
+ */ + switch (bp->bio_cmd) { + case BIO_FLUSH: + vd->vdev_nowritecache = B_TRUE; + break; + case BIO_DELETE: + break; + } + break; + case ENXIO: + if (!vd->vdev_remove_wanted) { + /* + * If provider's error is set we assume it is being + * removed. + */ + if (bp->bio_to->error != 0) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, + SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } + break; + } + + /* + * We have to split bio freeing into two parts, because the ABD code + * cannot be called in this context and vdev_op_io_done is not called + * for ZIO_TYPE_IOCTL zio-s. + */ + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + g_destroy_bio(bp); + zio->io_bio = NULL; + } + zio_delay_interrupt(zio); +} + +struct vdev_geom_check_unmapped_cb_state { + int pages; + uint_t end; +}; + +/* + * Callback to check the ABD segment size/alignment and count the pages. + * GEOM requires data buffer to look virtually contiguous. It means only + * the first page of the buffer may not start and only the last may not + * end on a page boundary. All other physical pages must be full. + */ +static int +vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv) +{ + struct vdev_geom_check_unmapped_cb_state *s = priv; + vm_offset_t off = (vm_offset_t)buf & PAGE_MASK; + + if (s->pages != 0 && off != 0) + return (1); + if (s->end != 0) + return (1); + s->end = (off + len) & PAGE_MASK; + s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT; + return (0); +} + +/* + * Check whether we can use unmapped I/O for this ZIO on this device to + * avoid data copying between scattered and/or gang ABD buffer and linear. + */ +static int +vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp) +{ + struct vdev_geom_check_unmapped_cb_state s; + + /* If unmapped I/O is administratively disabled, respect that. 
*/
+	if (!unmapped_buf_allowed)
+		return (0);
+
+	/* If the buffer is already linear, then nothing to do here. */
+	if (abd_is_linear(zio->io_abd))
+		return (0);
+
+	/*
+	 * If unmapped I/O is not supported by the GEOM provider,
+	 * then we can't do anything and have to copy the data.
+	 */
+	if ((cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0)
+		return (0);
+
+	/* Check the buffer chunks sizes/alignments and count pages. */
+	s.pages = s.end = 0;
+	if (abd_iterate_func(zio->io_abd, 0, zio->io_size,
+	    vdev_geom_check_unmapped_cb, &s))
+		return (0);
+	return (s.pages);
+}
+
+/*
+ * Callback to translate the ABD segment into array of physical pages.
+ *
+ * buf/len describe one segment's kernel virtual range; priv is the bio
+ * whose bio_ma[] array is being filled.  bio_ma_n is used as the fill
+ * cursor, and on the first segment (bio_ma_n == 0) bio_ma_offset is set
+ * to the byte offset of the data within its first page.
+ *
+ * Only the first segment may start in the middle of a page.  Its address
+ * is rounded down to the page boundary before the walk, so a segment that
+ * starts mid-page and crosses into the next page contributes both pages.
+ * That matches the (off + len + PAGE_MASK) >> PAGE_SHIFT count used to
+ * size bio_ma[]; walking from the unaligned address would stop one page
+ * short and leave the array under-filled.
+ *
+ * Always returns 0.
+ */
+static int
+vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv)
+{
+	struct bio *bp = priv;
+	vm_offset_t addr = (vm_offset_t)buf;
+	vm_offset_t end = addr + len;
+
+	if (bp->bio_ma_n == 0) {
+		/* First segment: record the in-page offset, then align. */
+		bp->bio_ma_offset = addr & PAGE_MASK;
+		addr &= ~PAGE_MASK;
+	} else {
+		/* Later segments are expected to start on a page boundary. */
+		ASSERT0(addr & PAGE_MASK);
+	}
+	/* One bio_ma[] entry per page touched by [addr, end). */
+	do {
+		bp->bio_ma[bp->bio_ma_n++] =
+		    PHYS_TO_VM_PAGE(pmap_kextract(addr));
+		addr += PAGE_SIZE;
+	} while (addr < end);
+	return (0);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+	vdev_t *vd;
+	struct g_consumer *cp;
+	struct bio *bp;
+
+	vd = zio->io_vd;
+
+	switch (zio->io_type) {
+	case ZIO_TYPE_IOCTL:
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		} else {
+			switch (zio->io_cmd) {
+			case DKIOCFLUSHWRITECACHE:
+				if (zfs_nocacheflush ||
+				    vdev_geom_bio_flush_disable)
+					break;
+				if (vd->vdev_nowritecache) {
+					zio->io_error = SET_ERROR(ENOTSUP);
+					break;
+				}
+				goto sendreq;
+			default:
+				zio->io_error = SET_ERROR(ENOTSUP);
+			}
+		}
+
+		zio_execute(zio);
+		return;
+	case ZIO_TYPE_TRIM:
+		if (!vdev_geom_bio_delete_disable) {
+			goto sendreq;
+		}
+		zio_execute(zio);
+		return;
+	default:
+		;
+		/* PASSTHROUGH --- placate compiler */
+	}
+sendreq:
+	ASSERT(zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_TRIM ||
+	    zio->io_type == ZIO_TYPE_IOCTL);
+
+	cp =
vd->vdev_tsd; + if (cp == NULL) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + bp = g_alloc_bio(); + bp->bio_caller1 = zio; + switch (zio->io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + if (zio->io_type == ZIO_TYPE_READ) + bp->bio_cmd = BIO_READ; + else + bp->bio_cmd = BIO_WRITE; + + /* + * If possible, represent scattered and/or gang ABD buffer to + * GEOM as an array of physical pages. It allows to satisfy + * requirement of virtually contiguous buffer without copying. + */ + int pgs = vdev_geom_check_unmapped(zio, cp); + if (pgs > 0) { + bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs, + M_DEVBUF, M_WAITOK); + bp->bio_ma_n = 0; + bp->bio_ma_offset = 0; + abd_iterate_func(zio->io_abd, 0, zio->io_size, + vdev_geom_fill_unmap_cb, bp); + bp->bio_data = unmapped_buf; + bp->bio_flags |= BIO_UNMAPPED; + } else { + if (zio->io_type == ZIO_TYPE_READ) { + bp->bio_data = abd_borrow_buf(zio->io_abd, + zio->io_size); + } else { + bp->bio_data = abd_borrow_buf_copy(zio->io_abd, + zio->io_size); + } + } + break; + case ZIO_TYPE_TRIM: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_IOCTL: + bp->bio_cmd = BIO_FLUSH; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + default: + panic("invalid zio->io_type: %d\n", zio->io_type); + } + bp->bio_done = vdev_geom_io_intr; + zio->io_bio = bp; + + g_io_request(bp, cp); +} + +static void +vdev_geom_io_done(zio_t *zio) +{ + struct bio *bp = zio->io_bio; + + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + ASSERT3P(bp, ==, NULL); + return; + } + + if (bp == NULL) { + ASSERT3S(zio->io_error, ==, ENXIO); + return; + } + + if (bp->bio_ma != NULL) { + free(bp->bio_ma, M_DEVBUF); + } else { + if (zio->io_type == 
ZIO_TYPE_READ) { + abd_return_buf_copy(zio->io_abd, bp->bio_data, + zio->io_size); + } else { + abd_return_buf(zio->io_abd, bp->bio_data, + zio->io_size); + } + } + + g_destroy_bio(bp); + zio->io_bio = NULL; +} + +static void +vdev_geom_hold(vdev_t *vd) +{ +} + +static void +vdev_geom_rele(vdev_t *vd) +{ +} + +vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_geom_open, + .vdev_op_close = vdev_geom_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_geom_io_start, + .vdev_op_io_done = vdev_geom_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_geom_hold, + .vdev_op_rele = vdev_geom_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c new file mode 100644 index 0000000000..48f58807e8 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) +{ + spa_t *spa = vd->vdev_spa; + zio_t *zio; + abd_t *pad2; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error; + + if (size > VDEV_PAD_SIZE) + return (EINVAL); + + if (!vd->vdev_ops->vdev_op_leaf) + return (ENODEV); + if (vdev_is_dead(vd)) + return (ENXIO); + + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_WRITER), ==, SCL_ALL); + + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); + abd_copy_from_buf(pad2, buf, size); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + vdev_label_write(zio, vd, 0, pad2, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(pad2); + return (error); +} diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c new file mode 100644 index 0000000000..ae758bcefe --- /dev/null +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -0,0 +1,2672 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR 
(ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t 
zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); 
+ if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + fallthrough; + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. 
+ */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa); + VERIFY3S(error, ==, ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. 
+ */ + VERIFY(zp->z_is_sa); + VERIFY3S(error, ==, ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || 
+ entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT3P(aclp, !=, NULL); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = 
aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +static int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof 
(ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT3U(aclp->z_version, ==, ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * everytime. 
+ */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY0(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr)); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + 
if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
+ */ +int +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize; + int acl_count; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int 
+zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (zp->z_zfsvfs->z_replay == B_FALSE) { + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + ASSERT_VOP_IN_SEQC(ZTOV(zp)); + } + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + return (error); +} + +/* + * common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + if (zp->z_zfsvfs->z_replay == B_FALSE) { + ASSERT_VOP_IN_SEQC(ZTOV(zp)); + } + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT3U(aclp->z_version, >=, ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. 
+ */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? 
+ */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = (vtype == VDIR); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = 
(void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. 
+ * Applies when the "aclmode" property is set to + * "groupmask". + */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? 
+ */ +static int +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = (vtype == VDIR); + boolean_t isreg = (vtype == VREG); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(vtype, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
+ */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + data2sz = aclp->z_ops->ace_data(acep, &data2); + VERIFY3U(data2sz, ==, data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= 
~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + aclp->z_acl_count != 0) { + *need_chmod = B_FALSE; /* at least one ACE was inherited */ + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + * + * acl_ids is zeroed and then filled in. When vsecp is non-NULL the ACL + * is built from it; otherwise it is inherited from dzp or a new ACL is + * allocated. Returns 0, or the error from zfs_vsec_2_aclp(). + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + if ((flag & IS_ROOT_NODE) == 0) { + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + } else + ASSERT3P(dzp->z_vnode, ==, NULL); + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. 
+ */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & AT_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; /* rejected: use the parent's gid below */ + } + if (acl_ids->z_fgid == 0) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = + zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set it on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. 
+ */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY0(zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure; both + * pointers are reset to NULL afterwards + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} +/* + * Return true if zfs_id_overquota() reports the new object's owner, + * group or project as over quota; the project is only checked when + * projid is neither ZFS_DEFAULT_PROJID nor ZFS_INVALID_PROJID. + */ +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != 
ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + * + * vsecp->vsa_mask selects what is returned: the ACE count (VSA_ACECNT), + * the ACEs themselves (VSA_ACE, in a buffer allocated here with + * kmem_alloc()) and/or the ACL-wide flags (VSA_ACE_ACLFLAGS). + * Returns ENOSYS if none of the supported mask bits is set, or the + * error from zfs_zaccess() / zfs_acl_node_read(). + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs; the object ACE types + * are tallied separately in largeace + */ + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = (int)aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; /* recorded alongside the buffer */ + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + 
bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + ASSERT3U((caddr_t)start - (caddr_t)vsecp->vsa_aclentp, + ==, aclp->z_acl_bytes); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_pflags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_pflags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + return (0); +} +/* + * Convert the ACEs described by vsecp into a newly allocated zfs_acl_t + * that is handed back through *zaclp. Returns EINVAL unless + * 0 < vsa_aclcnt <= MAX_ACL_ENTRIES, or the error from the ACE copy + * routine, in which case the ACL and its node are freed here. + */ +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); /* sized as if every ACE were a zfs_object_ace_t */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); /* the ACL now owns the single node */ + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & 
ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a file's ACL + * + * Returns ENOSYS if vsecp->vsa_mask has neither VSA_ACE nor VSA_ACECNT + * set and EPERM if the znode is ZFS_IMMUTABLE. ACE_WRITE_ACL is then + * checked through zfs_zaccess(), which is passed skipaclchk, and the + * new ACL is built with zfs_vsec_2_aclp() before the transaction starts. + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); + } +top: /* re-entered when dmu_tx_assign() returns ERESTART */ + mutex_enter(&zp->z_acl_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + 
mutex_exit(&zp->z_acl_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(error); + ASSERT3P(zp->z_acl_cached, ==, NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && + (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). + */ + if ((v4_mode & WRITE_MASK_DATA) && + (zp->z_pflags & ZFS_IMMUTABLE)) { + return (SET_ERROR(EPERM)); + } + + /* + * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK + * (sunlnk) is set. We just don't allow directory removal, which is + * handled in zfs_zaccess_delete(). + */ + if ((v4_mode & ACE_DELETE) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (EPERM); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. 
This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCESS if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; /* AoI bits matched by a DENY ACE so far */ + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT3P(zp->z_acl_cached, !=, NULL); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; /* inherit-only ACEs are skipped for directories */ + + /* Skip 
ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + fallthrough; + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != UID_NOBODY && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); /* unrecognised entry type */ + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; /* decided: later ACEs cannot change these bits */ + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); /* nothing denied, but some AoI matched no applicable ACE */ + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. 
+ */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { /* no grant found, or an error */ + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} +/* + * Common part of the access checks: the dataset attribute check + * followed, unless skipaclchk is set, by the ACL check. + * *working_mode starts as v4_mode and is zeroed on the short-circuit + * paths; otherwise it is left as zfs_zaccess_aces_check() updates it. + * *check_privs is cleared only when zfs_zaccess_dataset_check() fails; + * zfs_zaccess() returns such an error without consulting privileges. + */ +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests (and everything during replay) + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. 
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + (ZTOV(zp)->v_type != VDIR) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} +/* + * Retry a failed access check as an append. Only applies when + * ACE_WRITE_DATA is the sole bit left in *working_mode (EACCES + * otherwise); the common check is then re-run for ACE_APPEND_DATA. + */ +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +/* + * Check if VEXEC is allowed. + * + * This routine is based on zfs_fastaccesschk_execute which has slowpath + * calling zfs_zaccess. This would be incorrect on FreeBSD (see + * zfs_freebsd_access for the difference). Thus this variant lets the + * caller handle the slowpath (if necessary). + * + * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED. + * + * Safe access to znode_t is provided by the vnode lock. + * + * Returns 0 only when ZFS_NO_EXECS_DENIED is set and the znode is + * neither ZFS_AV_QUARANTINED nor an xattr directory; returns 1 in + * every other case, leaving the decision to the caller's slowpath. + */ +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t is_attr; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (1); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + return (1); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) + return (0); + + return (1); +} + + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsystem is always consulted as a basic privilege + * can define any form of access. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp = NULL; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); + + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES. 
+ */ + if (zp->z_pflags & ZFS_XATTR) + return (0); + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in checkmode, which is computed further down once the + * ACL check has failed. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); /* NOTE(review): looks unreachable - ZFS_XATTR znodes returned above, so is_attr is 0 and xzp stays NULL; confirm */ + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { /* dataset check failed: do not consult privileges */ + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + vnode_t *check_vp = ZTOV(check_zp); + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT3U(working_mode, !=, 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if 
(working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, check_vp, owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(check_vp, cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(check_vp, cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(check_vp, cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(check_vp, cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); + } + + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * NFSv4-style ZFS ACL format and call zfs_zaccess(). + * The mode is shifted right by six bits before translation. + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr; same translation as + * zfs_zaccess_rwx() but always with flags == 0 + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} +/* + * Last step of the delete check. Calls secpolicy_vnode_access2() on + * the parent directory with available_perms and VWRITE|VEXEC + * (presumably "already held" and "wanted" modes - confirm against + * secpolicy_vnode_access2()), then, if that succeeds, + * zfs_sticky_remove_access(). + */ +static int +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t available_perms, cred_t *cr) +{ + int error; + uid_t downer; + + downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); + + error = secpolicy_vnode_access2(cr, ZTOV(dzp), + downer, available_perms, VWRITE|VEXEC); + + if (error == 0) + error = zfs_sticky_remove_access(dzp, zp, cr); + + return (error); +} + +/* + * Determine whether Access should be granted/denied, without + * consulting least priv subsystem. 
+ * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. + * + * ------------------------------------------------------- + * | Parent Dir    |     Target Object Permissions       | + * | permissions   |                                     | + * ------------------------------------------------------- + * |               | ACL Allows | ACL Denies| Delete     | + * |               | Delete     | Delete    | unspecified| + * ------------------------------------------------------- + * | ACL Allows    | Permit     | Permit    | Permit     | + * | DELETE_CHILD  |                                     | + * ------------------------------------------------------- + * | ACL Denies    | Permit     | Deny      | Deny       | + * | DELETE_CHILD  |            |           |            | + * ------------------------------------------------------- + * | ACL specifies |            |           |            | + * | only allow    | Permit     | Permit    | Permit     | + * | write and     |            |           |            | + * | execute       |            |           |            | + * ------------------------------------------------------- + * | ACL denies    |            |           |            | + * | write and     | Permit     | Deny      | Deny       | + * | execute       |            |           |            | + * ------------------------------------------------------- + *    ^ + *    | + *    No search privilege, can't even look up file? + * + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + mode_t available_perms; + boolean_t dzpcheck_privs = B_TRUE; + boolean_t zpcheck_privs = B_TRUE; + + /* + * We want specific DELETE permissions to + * take precedence over WRITE/EXECUTE. We don't + * want an ACL such as this to mess us up. + * user:joe:write_data:deny,user:joe:delete:allow + * + * However, deny permissions may ultimately be overridden + * by secpolicy_vnode_access(). + * + * We will ask for all of the necessary permissions and then + * look at the working modes from the directory and target object + * to determine what was found. + */ + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * First row + * If the directory permissions allow the delete, we are done. 
+ */ + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + /* + * If target object has delete permission then we are done + */ + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + ASSERT(dzp_error); + ASSERT(zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + + /* + * Second row + * + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. + */ + + if (dzp_error == EACCES) { + /* XXXPJD: s/dzp/zp/ ? */ + return (secpolicy_vnode_remove(ZTOV(dzp), cr)); + } + /* + * Third Row + * only need to see if we have write/execute on directory. + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + + if (dzp_error != 0 && !dzpcheck_privs) + return (dzp_error); + + /* + * Fourth row + * + * A bit still set in dzp_working_mode was denied or left + * unresolved by the ACL, so only the cleared bits are passed + * on as available. + */ + + available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; + available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; + + return (zfs_delete_final_check(zp, dzp, available_perms, cr)); + +} +/* + * Check permission to rename szp out of sdzp and into tdzp. A + * non-NULL tzp must itself be deletable from tdzp. Returns 0 or the + * first error encountered; a ZFS_AV_QUARANTINED source gives EACCES. + */ +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = (ZTOV(szp)->v_type == VDIR) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. 
+ */ + if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { + if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + return (error); + } + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr))) + return (error); + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c new file mode 100644 index 0000000000..3b405e9d68 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -0,0 +1,1361 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * ZFS control directory (a.k.a. 
".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' directory, but this may expand in the + * future. The elements are built using the GFS primitives, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by the kernel, but unmounts are + * (currently) handled from user land. The main reason is that there is no + * reliable way to auto-unmount the filesystem when it's "no longer in use". + * When the user unmounts a filesystem, we call zfsctl_unmount(), which + * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted ontop of the GFS nodes '.zfs/snapshot/' + * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t + * so that it cannot be freed until all snapshots have been unmounted. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" + +#include +#include + +/* Common access mode for all virtual directories under the ctldir (r-xr-xr-x) */ +const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + +/* + * "Synthetic" filesystem implementation. + */ + +/* + * Assert that A implies B. + * + * NOTE: the expansion already ends in ';', so a use that adds its own + * ';' produces an extra empty statement. + */ +#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); + +static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); + +typedef struct sfs_node { + char sn_name[ZFS_MAX_DATASET_NAME_LEN]; /* NUL-terminated node name */ + uint64_t sn_parent_id; /* compared together with sn_id, see sfs_compare_ids() */ + uint64_t sn_id; +} sfs_node_t; + +/* + * Check the parent's ID as well as the node's to account for a chance + * that IDs originating from different domains (snapshot IDs, artificial + * IDs, znode IDs) may clash. + */ +static int +sfs_compare_ids(struct vnode *vp, void *arg) +{ + sfs_node_t *n1 = vp->v_data; + sfs_node_t *n2 = arg; + bool equal; + + equal = n1->sn_id == n2->sn_id && + n1->sn_parent_id == n2->sn_parent_id; + + /* Zero means equality. 
*/ + return (!equal); +} + +static int +sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + sfs_node_t search; + int err; + + search.sn_id = id; + search.sn_parent_id = parent_id; + err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, &search); + return (err); +} + +static int +sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + int err; + + KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); + err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, vp->v_data); + return (err); +} + +static void +sfs_vnode_remove(struct vnode *vp) +{ + vfs_hash_remove(vp); +} + +typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); + +static int +sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, + const char *tag, struct vop_vector *vops, + sfs_vnode_setup_fn setup, void *arg, + struct vnode **vpp) +{ + struct vnode *vp; + int error; + + error = sfs_vnode_get(mp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + /* Allocate a new vnode/inode. */ + error = getnewvnode(tag, mp, vops, &vp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + /* + * Exclusively lock the vnode while it's being constructed. 
+ */ + lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); + error = insmntque(vp, mp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + setup(vp, arg); + + error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + *vpp = vp; + return (0); +} + +static void +sfs_print_node(sfs_node_t *node) +{ + printf("\tname = %s\n", node->sn_name); + printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); + printf("\tid = %ju\n", (uintmax_t)node->sn_id); +} + +static sfs_node_t * +sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) +{ + struct sfs_node *node; + + KASSERT(strlen(name) < sizeof (node->sn_name), + ("sfs node name is too long")); + KASSERT(size >= sizeof (*node), ("sfs node size is too small")); + node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); + strlcpy(node->sn_name, name, sizeof (node->sn_name)); + node->sn_parent_id = parent_id; + node->sn_id = id; + + return (node); +} + +static void +sfs_destroy_node(sfs_node_t *node) +{ + free(node, M_SFSNODES); +} + +static void * +sfs_reclaim_vnode(vnode_t *vp) +{ + void *data; + + sfs_vnode_remove(vp); + data = vp->v_data; + vp->v_data = NULL; + return (data); +} + +static int +sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, + zfs_uio_t *uio, off_t *offp) +{ + struct dirent entry; + int error; + + /* Reset ncookies for subsequent use of vfs_read_dirent. 
*/ + if (ap->a_ncookies != NULL) + *ap->a_ncookies = 0; + + if (zfs_uio_resid(uio) < sizeof (entry)) + return (SET_ERROR(EINVAL)); + + if (zfs_uio_offset(uio) < 0) + return (SET_ERROR(EINVAL)); + if (zfs_uio_offset(uio) == 0) { + entry.d_fileno = id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '\0'; + entry.d_namlen = 1; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio)); + if (error != 0) + return (SET_ERROR(error)); + } + + if (zfs_uio_offset(uio) < sizeof (entry)) + return (SET_ERROR(EINVAL)); + if (zfs_uio_offset(uio) == sizeof (entry)) { + entry.d_fileno = parent_id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '.'; + entry.d_name[2] = '\0'; + entry.d_namlen = 2; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio)); + if (error != 0) + return (SET_ERROR(error)); + } + + if (offp != NULL) + *offp = 2 * sizeof (entry); + return (0); +} + + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/<snap> objectid(snap) + */ +#define ZFSCTL_INO_SNAP(id) (id) + +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; + +void +zfsctl_init(void) +{ +} + +void +zfsctl_fini(void) +{ +} + +boolean_t +zfsctl_is_node(vnode_t *vp) +{ + return (vn_matchops(vp, zfsctl_ops_root) || + vn_matchops(vp, zfsctl_ops_snapdir) || + vn_matchops(vp, zfsctl_ops_snapshot)); + +} + +typedef struct zfsctl_root { + sfs_node_t node; + sfs_node_t *snapdir; + timestruc_t cmtime; +} zfsctl_root_t; + + +/* + * Create the '.zfs' directory. 
+ */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + zfsctl_root_t *dot_zfs; + sfs_node_t *snapdir; + vnode_t *rvp; + uint64_t crtime[2]; + + ASSERT3P(zfsvfs->z_ctldir, ==, NULL); + + snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT, + ZFSCTL_INO_SNAPDIR); + dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0, + ZFSCTL_INO_ROOT); + dot_zfs->snapdir = snapdir; + + VERIFY0(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp)); + VERIFY0(sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof (crtime))); + ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); + vput(rvp); + + zfsvfs->z_ctldir = dot_zfs; +} + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. + * The nodes must not have any associated vnodes by now as they should be + * vflush-ed. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + sfs_destroy_node(zfsvfs->z_ctldir->snapdir); + sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; +} + +static int +zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + return (VFS_ROOT(mp, flags, vpp)); +} + +static void +zfsctl_common_vnode_setup(vnode_t *vp, void *arg) +{ + ASSERT_VOP_ELOCKED(vp, __func__); + + /* We support shared locking. 
*/ + VN_LOCK_ASHARE(vp); + vp->v_type = VDIR; + vp->v_data = arg; +} + +static int +zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir; + err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, + zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +static int +zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir; + err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", + &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +int +zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) +{ + int error; + + error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); + return (error); +} + +/* + * Common open routine. Disallow any write access. + */ +static int +zfsctl_common_open(struct vop_open_args *ap) +{ + int flags = ap->a_mode; + + if (flags & FWRITE) + return (SET_ERROR(EACCES)); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(struct vop_close_args *ap) +{ + return (0); +} + +/* + * Common access routine. Disallow writes. + */ +static int +zfsctl_common_access(struct vop_access_args *ap) +{ + accmode_t accmode = ap->a_accmode; + + if (accmode & VWRITE) + return (SET_ERROR(EACCES)); + return (0); +} + +/* + * Common getattr function. Fill in basic information. + */ +static void +zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) +{ + timestruc_t now; + sfs_node_t *node; + + node = vp->v_data; + + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purely virtual object, so we have no + * blocksize or allocated blocks. 
+ */ + vap->va_blksize = 0; + vap->va_nblocks = 0; + vap->va_seq = 0; + vn_fsid(vp, vap); + vap->va_mode = zfsctl_ctldir_mode; + vap->va_type = VDIR; + /* + * We live in the now (for atime). + */ + gethrestime(&now); + vap->va_atime = now; + /* FreeBSD: Reset chflags(2) flags. */ + vap->va_flags = 0; + + vap->va_nodeid = node->sn_id; + + /* At least '.' and '..'. */ + vap->va_nlink = 2; +} + +#ifndef _OPENSOLARIS_SYS_VNODE_H_ +struct vop_fid_args { + struct vnode *a_vp; + struct fid *a_fid; +}; +#endif + +static int +zfsctl_common_fid(struct vop_fid_args *ap) +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + sfs_node_t *node = vp->v_data; + uint64_t object = node->sn_id; + zfid_short_t *zfid; + int i; + + zfid = (zfid_short_t *)fidp; + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs nodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_reclaim_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfsctl_common_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + + (void) sfs_reclaim_vnode(vp); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_print_args { + struct vnode *a_vp; +}; +#endif + +static int +zfsctl_common_print(struct vop_print_args *ap) +{ + sfs_print_node(ap->a_vp->v_data); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +/* + * Get root directory attributes. 
+ */ +static int +zfsctl_root_getattr(struct vop_getattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsctl_root_t *node = vp->v_data; + + zfsctl_common_getattr(vp, vap); + vap->va_ctime = node->cmtime; + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + vap->va_nlink += 1; /* snapdir */ + vap->va_size = vap->va_nlink; + return (0); +} + +/* + * A lookup of "." may ask for a lock type different from the one currently + * held on the directory vnode. Take a new reference on dvp and upgrade or + * downgrade its lock to the requested type. Returns ENOENT (dropping the + * reference) if the vnode turns out to be doomed after relocking. + */ +static int +zfsctl_relock_dot(vnode_t *dvp, int ltype) +{ + vref(dvp); + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* Relocking for "." may leave us with a reclaimed vnode. */ + if (VN_IS_DOOMED(dvp)) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); +} + +/* + * Lookup in the '.zfs' directory. "." returns the directory itself, ".." is + * special-cased to fetch the root vnode of the filesystem through + * vn_vget_ino_gen(), and "snapshot" yields the snapshot directory vnode. + * Any other name fails with ENOENT; non-LOOKUP operations on the last + * component fail with ENOTSUP. + * + * NOTE(review): the strncmp() below is bounded by cn_namelen, so any leading + * prefix of "snapshot" (e.g. "snap") also compares equal; confirm whether + * that is intended. + */ +static int +zfsctl_root_lookup(struct vop_lookup_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + int flags = ap->a_cnp->cn_flags; + int lkflags = ap->a_cnp->cn_lkflags; + int nameiop = ap->a_cnp->cn_nameiop; + int err; + + ASSERT3S(dvp->v_type, ==, VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + } else if ((flags & ISDOTDOT) != 0) { + err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, + lkflags, vpp); + } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { + err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); + } else { + err = SET_ERROR(ENOENT); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_root_readdir(struct vop_readdir_args *ap) +{ + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = 
vp->v_vfsp->vfs_data; + zfsctl_root_t *node = vp->v_data; + zfs_uio_t uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + zfs_uio_init(&uio, ap->a_uio); + + ASSERT3S(vp->v_type, ==, VDIR); + + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + if (zfs_uio_offset(&uio) != dots_offset) + return (SET_ERROR(EINVAL)); + + CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name)); + entry.d_fileno = node->snapdir->sn_id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, node->snapdir->sn_name); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + return (SET_ERROR(error)); + } + if (eofp != NULL) + *eofp = 1; + return (0); +} + +static int +zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) +{ + static const char dotzfs_name[4] = ".zfs"; + vnode_t *dvp; + int error; + + if (*ap->a_buflen < sizeof (dotzfs_name)) + return (SET_ERROR(ENOMEM)); + + error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, + LK_SHARED, &dvp); + if (error != 0) + return (SET_ERROR(error)); + + VOP_UNLOCK1(dvp); + *ap->a_vpp = dvp; + *ap->a_buflen -= sizeof (dotzfs_name); + bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); + return (0); +} + +static int +zfsctl_common_pathconf(struct vop_pathconf_args *ap) +{ + /* + * We care about ACL variables so that user land utilities like ls + * can display them correctly. Since the ctldir's st_dev is set to be + * the same as the parent dataset, we must support all variables that + * it supports. 
+ */ + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *ap->a_retval = 64; + return (0); + + case _PC_MIN_HOLE_SIZE: + *ap->a_retval = (int)SPA_MINBLOCKSIZE; + return (0); + + case _PC_ACL_EXTENDED: + *ap->a_retval = 0; + return (0); + + case _PC_ACL_NFS4: + *ap->a_retval = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *ap->a_retval = ACL_MAX_ENTRIES; + return (0); + + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + + default: + return (vop_stdpathconf(ap)); + } +} + +/* + * Returns a trivial ACL + */ +static int +zfsctl_common_getacl(struct vop_getacl_args *ap) +{ + int i; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); + /* + * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify + * attributes. That is not the case for the ctldir, so we must clear + * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs + * aren't supported by the ctldir. 
+ */ + for (i = 0; i < ap->a_aclp->acl_cnt; i++) { + struct acl_entry *entry; + entry = &(ap->a_aclp->acl_entry[i]); + entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | + ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | + ACL_READ_NAMED_ATTRS); + } + + return (0); +} + +static struct vop_vector zfsctl_ops_root = { + .vop_default = &default_vnodeops, +#if __FreeBSD_version >= 1300121 + .vop_fplookup_vexec = VOP_EAGAIN, +#endif + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_root_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_root_readdir, + .vop_lookup = zfsctl_root_lookup, + .vop_inactive = VOP_NULL, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_vptocnp = zfsctl_root_vptocnp, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root); + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (SET_ERROR(ENAMETOOLONG)); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + int err; + + err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); + return (err); +} + +/* + * Given a vnode get a root vnode of a filesystem mounted on top of + * the vnode, if any. The root vnode is referenced and locked. + * If no filesystem is mounted then the original vnode remains referenced + * and locked. If any error happens the original vnode is unlocked and + * released. 
+ */ +static int +zfsctl_mounted_here(vnode_t **vpp, int flags) +{ + struct mount *mp; + int err; + + ASSERT_VOP_LOCKED(*vpp, __func__); + ASSERT3S((*vpp)->v_type, ==, VDIR); + + if ((mp = (*vpp)->v_mountedhere) != NULL) { + err = vfs_busy(mp, 0); + KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); + KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); + vput(*vpp); + err = VFS_ROOT(mp, flags, vpp); + vfs_unbusy(mp); + return (err); + } + return (EJUSTRETURN); +} + +typedef struct { + const char *snap_name; + uint64_t snap_id; +} snapshot_setup_arg_t; + +static void +zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) +{ + snapshot_setup_arg_t *ssa = arg; + sfs_node_t *node; + + ASSERT_VOP_ELOCKED(vp, __func__); + + node = sfs_alloc_node(sizeof (sfs_node_t), + ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); + zfsctl_common_vnode_setup(vp, node); + + /* We have to support recursive locking. */ + VN_LOCK_AREC(vp); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exists, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. + * There are four possibilities: + * - the snapshot node and vnode do not exist + * - the snapshot vnode is covered by the mounted snapshot + * - the snapshot vnode is not covered yet, the mount operation is in progress + * - the snapshot vnode is not covered, because the snapshot has been unmounted + * The last two states are transient and should be relatively short-lived. 
+ */ +static int +zfsctl_snapdir_lookup(struct vop_lookup_args *ap) +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + char name[NAME_MAX + 1]; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; + char *mountpoint; + size_t mountpoint_len; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + uint64_t snap_id; + int nameiop = cnp->cn_nameiop; + int lkflags = cnp->cn_lkflags; + int flags = cnp->cn_flags; + int err; + + ASSERT3S(dvp->v_type, ==, VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + return (err); + } + if (flags & ISDOTDOT) { + err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, + vpp); + return (err); + } + + if (cnp->cn_namelen >= sizeof (name)) + return (SET_ERROR(ENAMETOOLONG)); + + strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + err = zfsctl_snapshot_lookup(dvp, name, &snap_id); + if (err != 0) + return (SET_ERROR(ENOENT)); + + for (;;) { + snapshot_setup_arg_t ssa; + + ssa.snap_name = name; + ssa.snap_id = snap_id; + err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, + snap_id, "zfs", &zfsctl_ops_snapshot, + zfsctl_snapshot_vnode_setup, &ssa, vpp); + if (err != 0) + return (err); + + /* Check if a new vnode has just been created. */ + if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) + break; + + /* + * Check if a snapshot is already mounted on top of the vnode. + */ + err = zfsctl_mounted_here(vpp, lkflags); + if (err != EJUSTRETURN) + return (err); + + /* + * If the vnode is not covered, then either the mount operation + * is in progress or the snapshot has already been unmounted + * but the vnode hasn't been inactivated and reclaimed yet. + * We can try to re-use the vnode in the latter case. 
+ */ + VI_LOCK(*vpp); + if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + /* + * Upgrade to exclusive lock in order to: + * - avoid race conditions + * - satisfy the contract of mount_snapshot() + */ + err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + if (err == 0) + break; + } else { + VI_UNLOCK(*vpp); + } + + /* + * In this state we can loop on uncontested locks and starve + * the thread doing the lengthy, non-trivial mount operation. + * So, yield to prevent that from happening. + */ + vput(*vpp); + kern_yield(PRI_USER); + } + + VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname)); + + mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, + "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", + dvp->v_vfsp->mnt_stat.f_mntonname, name); + + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); + kmem_free(mountpoint, mountpoint_len); + if (err == 0) { + /* + * Fix up the root vnode mounted on .zfs/snapshot/<snapname>. + * + * This is where we lie about our v_vfsp in order to + * make .zfs/snapshot/<snapname> accessible over NFS + * without requiring manual mounts of <snapname>. + */ + ASSERT3P(VTOZ(*vpp)->z_zfsvfs, !=, zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + + /* Clear the root flag (set via VFS_ROOT) as well. 
*/ + (*vpp)->v_vflag &= ~VV_ROOT; + } + + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_snapdir_readdir(struct vop_readdir_args *ap) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfs_uio_t uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + zfs_uio_init(&uio, ap->a_uio); + + ASSERT3S(vp->v_type, ==, VDIR); + + error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, + &uio, &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + + ZFS_ENTER(zfsvfs); + for (;;) { + uint64_t cookie; + uint64_t id; + + cookie = zfs_uio_offset(&uio) - dots_offset; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) { + if (eofp != NULL) + *eofp = 1; + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + entry.d_fileno = id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, snapname); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(error)); + } + zfs_uio_setoffset(&uio, cookie + dots_offset); + } + __builtin_unreachable(); +} + +static int +zfsctl_snapdir_getattr(struct vop_getattr_args *ap) +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + dsl_dataset_t *ds; + uint64_t snap_count; + int err; + + ZFS_ENTER(zfsvfs); + ds = dmu_objset_ds(zfsvfs->z_os); + zfsctl_common_getattr(vp, vap); + vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; 
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (err != 0) { + ZFS_EXIT(zfsvfs); + return (err); + } + vap->va_nlink += snap_count; + } + vap->va_size = vap->va_nlink; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, +#if __FreeBSD_version >= 1300121 + .vop_fplookup_vexec = VOP_EAGAIN, +#endif + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_snapdir_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir); + + +static int +zfsctl_snapshot_inactive(struct vop_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + + VERIFY3S(vrecycle(vp), ==, 1); + return (0); +} + +static int +zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + void *data = vp->v_data; + + sfs_reclaim_vnode(vp); + sfs_destroy_node(data); + return (0); +} + +static int +zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) +{ + struct mount *mp; + vnode_t *dvp; + vnode_t *vp; + sfs_node_t *node; + size_t len; + int locked; + int error; + + vp = ap->a_vp; + node = vp->v_data; + len = strlen(node->sn_name); + if (*ap->a_buflen < len) + return (SET_ERROR(ENOMEM)); + + /* + * Prevent unmounting of the snapshot while the vnode lock + * is not held. That is not strictly required, but allows + * us to assert that an uncovered snapshot vnode is never + * "leaked". 
+ */ + mp = vp->v_mountedhere; + if (mp == NULL) + return (SET_ERROR(ENOENT)); + error = vfs_busy(mp, 0); + KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); + + /* + * We can vput the vnode as we can now depend on the reference owned + * by the busied mp. But we also need to hold the vnode, because + * the reference may go after vfs_unbusy() which has to be called + * before we can lock the vnode again. + */ + locked = VOP_ISLOCKED(vp); +#if __FreeBSD_version >= 1300045 + enum vgetstate vs = vget_prep(vp); +#else + vhold(vp); +#endif + vput(vp); + + /* Look up .zfs/snapshot, our parent. */ + error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); + if (error == 0) { + VOP_UNLOCK1(dvp); + *ap->a_vpp = dvp; + *ap->a_buflen -= len; + bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); + } + vfs_unbusy(mp); +#if __FreeBSD_version >= 1300045 + vget_finish(vp, locked | LK_RETRY, vs); +#else + vget(vp, locked | LK_VNHELD | LK_RETRY, curthread); +#endif + return (error); +} + +/* + * These VP's should never see the light of day. They should always + * be covered. 
+ */ +static struct vop_vector zfsctl_ops_snapshot = { + .vop_default = NULL, /* ensure very restricted access */ +#if __FreeBSD_version >= 1300121 + .vop_fplookup_vexec = VOP_EAGAIN, +#endif + .vop_inactive = zfsctl_snapshot_inactive, +#if __FreeBSD_version >= 1300045 + .vop_need_inactive = vop_stdneed_inactive, +#endif + .vop_reclaim = zfsctl_snapshot_reclaim, + .vop_vptocnp = zfsctl_snapshot_vptocnp, + .vop_lock1 = vop_stdlock, + .vop_unlock = vop_stdunlock, + .vop_islocked = vop_stdislocked, + .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ + .vop_print = zfsctl_common_print, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot); + +/* + * Find the zfsvfs of the snapshot with objset id 'objsetid' that is mounted + * under this filesystem's .zfs/snapshot directory. The snapshot's sfs vnode + * is looked up by (ZFSCTL_INO_SNAPDIR, objsetid); if a filesystem is mounted + * on it, *zfsvfsp is set to that mount's mnt_data and 0 is returned. + * Otherwise (lookup error, no such vnode, or nothing mounted on it) + * *zfsvfsp is left NULL and EINVAL is returned. + */ +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + zfsvfs_t *zfsvfs __unused = vfsp->vfs_data; + vnode_t *vp; + int error; + + ASSERT3P(zfsvfs->z_ctldir, !=, NULL); + *zfsvfsp = NULL; + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, objsetid, &vp); + if (error == 0 && vp != NULL) { + /* + * XXX Probably need to at least reference, if not busy, the mp. + */ + if (vp->v_mountedhere != NULL) + *zfsvfsp = vp->v_mountedhere->mnt_data; + vput(vp); + } + if (*zfsvfsp == NULL) + return (SET_ERROR(EINVAL)); + return (0); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. 
+ */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct mount *mp; + vnode_t *vp; + uint64_t cookie; + int error; + + ASSERT3P(zfsvfs->z_ctldir, !=, NULL); + + cookie = 0; + for (;;) { + uint64_t id; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) + error = 0; + break; + } + + for (;;) { + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, id, &vp); + if (error != 0 || vp == NULL) + break; + + mp = vp->v_mountedhere; + + /* + * v_mountedhere being NULL means that the + * (uncovered) vnode is in a transient state + * (mounting or unmounting), so loop until it + * settles down. + */ + if (mp != NULL) + break; + vput(vp); + } + if (error != 0) + break; + if (vp == NULL) + continue; /* no mountpoint, nothing to do */ + + /* + * The mount-point vnode is kept locked to avoid spurious EBUSY + * from a concurrent umount. + * The vnode lock must have recursive locking enabled. 
+ */ + vfs_ref(mp); + error = dounmount(mp, fflags, curthread); + KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, + ("extra references after unmount")); + vput(vp); + if (error != 0) + break; + } + KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, + ("force unmounting failed")); + return (error); +} + +int +zfsctl_snapshot_unmount(const char *snapname, int flags __unused) +{ + vfs_t *vfsp = NULL; + zfsvfs_t *zfsvfs = NULL; + + if (strchr(snapname, '@') == NULL) + return (0); + + int err = getzfsvfs(snapname, &zfsvfs); + if (err != 0) { + ASSERT3P(zfsvfs, ==, NULL); + return (0); + } + vfsp = zfsvfs->z_vfs; + + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); + + vfs_ref(vfsp); + vfs_unbusy(vfsp); + return (dounmount(vfsp, MS_FORCE, curthread)); +} diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c new file mode 100644 index 0000000000..dad342b06f --- /dev/null +++ b/module/os/freebsd/zfs/zfs_debug.c @@ -0,0 +1,254 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
+ */
+
+#include
+#include
+
+/* One logged debug message; allocated with room for the full string. */
+typedef struct zfs_dbgmsg {
+	list_node_t zdm_node;
+	time_t zdm_timestamp;
+	int zdm_size;
+	char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+kstat_t *zfs_dbgmsg_kstat;
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages as they're logged
+ * dtrace -n 'zfs-dbgmsg { print(stringof(arg0)); }'
+ *
+ * # Print all logged dbgmsg entries
+ * sysctl kstat.zfs.misc.dbgmsg
+ *
+ * # Disable the kernel debug message log.
+ * sysctl vfs.zfs.dbgmsg_enable=0
+ */
+int zfs_dbgmsg_enable = 1;
+
+/* kstat raw-ops callback: emit the column header line into 'buf'. */
+static int
+zfs_dbgmsg_headers(char *buf, size_t size)
+{
+	(void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message");
+
+	return (0);
+}
+
+/* kstat raw-ops callback: format one zfs_dbgmsg_t ('data') into 'buf'. */
+static int
+zfs_dbgmsg_data(char *buf, size_t size, void *data)
+{
+	zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data;
+
+	(void) snprintf(buf, size, "%-12llu %-s\n",
+	    (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+
+	return (0);
+}
+
+/*
+ * kstat raw-ops callback: return the message for position 'n'.
+ * n == 0 restarts at the list head; otherwise the cursor kept in
+ * ks_private is advanced by one. Returns NULL at the end of the list.
+ */
+static void *
+zfs_dbgmsg_addr(kstat_t *ksp, loff_t n)
+{
+	zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private;
+
+	ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+	if (n == 0)
+		ksp->ks_private = list_head(&zfs_dbgmsgs);
+	else if (zdm)
+		ksp->ks_private = list_next(&zfs_dbgmsgs, zdm);
+
+	return (ksp->ks_private);
+}
+
+/*
+ * Free the oldest messages until the log occupies at most 'max_size'
+ * bytes. The caller must hold zfs_dbgmsgs_lock.
+ */
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+	zfs_dbgmsg_t *zdm;
+	int size;
+
+	ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+	while (zfs_dbgmsg_size > max_size) {
+		zdm = list_remove_head(&zfs_dbgmsgs);
+		if (zdm == NULL)
+			return;
+
+		size = zdm->zdm_size;
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+}
+
+/* kstat update callback: a write to the kstat clears the whole log. */
+static int
+zfs_dbgmsg_update(kstat_t *ksp, int rw)
+{
+	if (rw == KSTAT_WRITE)
+		zfs_dbgmsg_purge(0);
+
+	return (0);
+}
+
+/* Create the message list, its lock, and the "dbgmsg" raw kstat. */
+void
+zfs_dbgmsg_init(void)
+{
+	list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+	    offsetof(zfs_dbgmsg_t, zdm_node));
+	mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+	if (zfs_dbgmsg_kstat) {
+		zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock;
+		zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX;
+		zfs_dbgmsg_kstat->ks_private = NULL;
+		zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update;
+		kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers,
+		    zfs_dbgmsg_data, zfs_dbgmsg_addr);
+		kstat_install(zfs_dbgmsg_kstat);
+	}
+}
+
+/* Delete the kstat and, in the kernel, free all messages and the lock. */
+void
+zfs_dbgmsg_fini(void)
+{
+	if (zfs_dbgmsg_kstat)
+		kstat_delete(zfs_dbgmsg_kstat);
+	/*
+	 * TODO - decide how to make this permanent
+	 */
+#ifdef _KERNEL
+	mutex_enter(&zfs_dbgmsgs_lock);
+	zfs_dbgmsg_purge(0);
+	mutex_exit(&zfs_dbgmsgs_lock);
+	mutex_destroy(&zfs_dbgmsgs_lock);
+#endif
+}
+
+/*
+ * Append a copy of 'buf' to the in-memory debug log, fire the
+ * zfs__dbgmsg probe, and trim the log back to zfs_dbgmsg_maxsize.
+ */
+void
+__zfs_dbgmsg(char *buf)
+{
+	zfs_dbgmsg_t *zdm;
+	int size;
+
+	DTRACE_PROBE1(zfs__dbgmsg, char *, buf);
+
+	/* zdm_msg[1] already accounts for the terminating NUL. */
+	size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+	zdm = kmem_zalloc(size, KM_SLEEP);
+	zdm->zdm_size = size;
+	zdm->zdm_timestamp = gethrestime_sec();
+	strcpy(zdm->zdm_msg, buf);
+
+	mutex_enter(&zfs_dbgmsgs_lock);
+	list_insert_tail(&zfs_dbgmsgs, zdm);
+	zfs_dbgmsg_size += size;
+	zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
+
+/* Log "error N" with its origin when ZFS_DEBUG_SET_ERROR is set. */
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+	/*
+	 * To enable this:
+	 *
+	 * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+	 *
+	 * NOTE(review): the path above is the Linux module-parameter path;
+	 * on FreeBSD the knob is presumably the matching vfs.zfs sysctl --
+	 * confirm the exact name.
+	 */
+	if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+		__dprintf(B_FALSE, file, func, line, "error %lu", (ulong_t)err);
+}
+
+#ifdef _KERNEL
+/*
+ * Format "file:line:func(): <message>" into a temporary buffer and hand
+ * it to __zfs_dbgmsg(). Output longer than the 1024-byte buffer is
+ * truncated. 'dprint' is not used by this implementation.
+ */
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+    int line, const char *fmt, ...)
+{
+	const char *newfile;
+	va_list adx;
+	size_t size;
+	size_t len;
+	char *buf;
+	int i;
+
+	size = 1024;
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Get rid of annoying prefix to filename.
+	 */
+	newfile = strrchr(file, '/');
+	if (newfile != NULL) {
+		newfile = newfile + 1; /* Get rid of leading / */
+	} else {
+		newfile = file;
+	}
+
+	i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
+
+	if (i < size) {
+		va_start(adx, fmt);
+		(void) vsnprintf(buf + i, size - i, fmt, adx);
+		va_end(adx);
+	}
+
+	/*
+	 * Get rid of trailing newline. Only the final character is
+	 * examined so that text following an embedded newline is kept
+	 * rather than silently cut off.
+	 */
+	len = strlen(buf);
+	if (len > 0 && buf[len - 1] == '\n')
+		buf[len - 1] = '\0';
+
+	__zfs_dbgmsg(buf);
+
+	kmem_free(buf, size);
+}
+
+#else
+
+/* Userland only: dump every logged message to stdout under 'tag'. */
+void
+zfs_dbgmsg_print(const char *tag)
+{
+	zfs_dbgmsg_t *zdm;
+
+	(void) printf("ZFS_DBGMSG(%s):\n", tag);
+	mutex_enter(&zfs_dbgmsgs_lock);
+	for (zdm = list_head(&zfs_dbgmsgs); zdm;
+	    zdm = list_next(&zfs_dbgmsgs, zdm))
+		(void) printf("%s\n", zdm->zdm_msg);
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
+#endif /* _KERNEL */
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_enable, INT, ZMOD_RW,
+    "Enable ZFS debug message log");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_maxsize, INT, ZMOD_RW,
+    "Maximum ZFS debug log size");
+/* END CSTYLED */
diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c
new file mode 100644
index 0000000000..7fff329a93
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_dir.c
@@ -0,0 +1,963 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + matchtype_t mt, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, NULL, 0, NULL); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Look up a directory entry under a locked vnode. + * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. 
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	matchtype_t mt = 0;
+	uint64_t zoid;
+	int error = 0;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
+	*zpp = NULL;
+
+	/*
+	 * Verify that we are not trying to lock '.', '..', or '.zfs'
+	 */
+	if (name[0] == '.' &&
+	    (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) ||
+	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)))
+		return (SET_ERROR(EEXIST));
+
+	/*
+	 * Case sensitivity and normalization preferences are set when
+	 * the file system is created. These are stored in the
+	 * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+	 * affect how we perform zap lookups.
+	 *
+	 * When matching we may need to normalize & change case according to
+	 * FS settings.
+	 *
+	 * Note that a normalized match is necessary for a case insensitive
+	 * filesystem when the lookup request is not exact because normalization
+	 * can fold case independent of normalizing code point sequences.
+	 *
+	 * See the table above zfs_dropname().
+	 */
+	if (zfsvfs->z_norm != 0) {
+		mt = MT_NORMALIZE;
+
+		/*
+		 * Determine if the match needs to honor the case specified in
+		 * lookup, and if so keep track of that so that during
+		 * normalization we don't fold case.
+		 */
+		if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+			mt |= MT_MATCH_CASE;
+		}
+	}
+
+	/*
+	 * Only look in or update the DNLC if we are looking for the
+	 * name on a file system that does not require normalization
+	 * or case folding. We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name.
+	 *
+	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+	 * because in that case MT_EXACT and MT_FIRST should produce exactly
+	 * the same result.
+	 */
+
+	/* An unlinked directory has no entries, but may still own xattrs. */
+	if (dzp->z_unlinked && !(flag & ZXATTR))
+		return (SET_ERROR(ENOENT));
+	if (flag & ZXATTR) {
+		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+		    sizeof (zoid));
+		if (error == 0)
+			error = (zoid == 0 ? ENOENT : 0);
+	} else {
+		error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+	}
+	if (error) {
+		/* A missing entry is only an error when ZEXISTS was asked. */
+		if (error != ENOENT || (flag & ZEXISTS)) {
+			return (error);
+		}
+	} else {
+		if (flag & ZNEW) {
+			return (SET_ERROR(EEXIST));
+		}
+		error = zfs_zget(zfsvfs, zoid, &zp);
+		if (error)
+			return (error);
+		ASSERT(!zp->z_unlinked);
+		*zpp = zp;
+	}
+
+	return (0);
+}
+
+/*
+ * Look up the parent ("..") of dzp via its SA_ZPL_PARENT attribute.
+ * On success *zpp is the znode returned by zfs_zget() for the parent.
+ */
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	uint64_t parent;
+	int error;
+
+#ifdef ZFS_DEBUG
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+#endif
+	if (dzp->z_unlinked)
+		return (SET_ERROR(ENOENT));
+
+	if ((error = sa_lookup(dzp->z_sa_hdl,
+	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+		return (error);
+
+	error = zfs_zget(zfsvfs, parent, &zp);
+	if (error == 0)
+		*zpp = zp;
+	return (error);
+}
+
+/*
+ * Resolve 'name' relative to directory dzp.
+ * "" and "." yield dzp itself, ".." is resolved through zfs_dd_lookup(),
+ * and anything else goes through zfs_dirent_lookup() with ZEXISTS.
+ * *zpp is only written on success.
+ */
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+	zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs;
+	znode_t *zp = NULL;
+	int error = 0;
+
+#ifdef ZFS_DEBUG
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+#endif
+	if (dzp->z_unlinked)
+		return (SET_ERROR(ENOENT));
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		*zpp = dzp;
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		error = zfs_dd_lookup(dzp, &zp);
+		if (error == 0)
+			*zpp = zp;
+	} else {
+		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+		if (error == 0) {
+			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+			*zpp = zp;
+		}
+	}
+	return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ASSERT(zp->z_unlinked);
+	ASSERT3U(zp->z_links, ==, 0);
+
+	VERIFY0(zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+	zap_cursor_t zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t *zp;
+	dmu_tx_t *tx;
+	int error;
+
+	/*
+	 * Iterate over the contents of the unlinked set.
+	 */
+	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		/*
+		 * See what kind of object we have in list
+		 */
+
+		error = dmu_object_info(zfsvfs->z_os,
+		    zap.za_first_integer, &doi);
+		if (error != 0)
+			continue;
+
+		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+		/*
+		 * We need to re-mark these list entries for deletion,
+		 * so we pull them back into core and set zp->z_unlinked.
+		 */
+		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+		/*
+		 * We may pick up znodes that are already marked for deletion.
+		 * This could happen during the purge of an extended attribute
+		 * directory. All we need to do is skip over them, since they
+		 * are already in the system marked z_unlinked.
+		 */
+		if (error != 0)
+			continue;
+
+		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+
+		/*
+		 * Due to changes in zfs_rmnode we need to make sure the
+		 * link count is set to zero here.
+		 */
+		if (zp->z_links != 0) {
+			tx = dmu_tx_create(zfsvfs->z_os);
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error != 0) {
+				dmu_tx_abort(tx);
+				vput(ZTOV(zp));
+				continue;
+			}
+			zp->z_links = 0;
+			VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+			    &zp->z_links, sizeof (zp->z_links), tx));
+			dmu_tx_commit(tx);
+		}
+
+		zp->z_unlinked = B_TRUE;
+		vput(ZTOV(zp));
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents is *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+	zap_cursor_t zc;
+	zap_attribute_t zap;
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	int skipped = 0;
+	int error;
+
+	/*
+	 * 'error' is reused inside the loop body, but the loop condition
+	 * reassigns it on every pass, so after the loop it always holds
+	 * the result of the last zap_cursor_retrieve().
+	 */
+	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+	    zap_cursor_advance(&zc)) {
+		error = zfs_zget(zfsvfs,
+		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+		if (error) {
+			skipped += 1;
+			continue;
+		}
+
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
+		ASSERT((ZTOV(xzp)->v_type == VREG) ||
+		    (ZTOV(xzp)->v_type == VLNK));
+
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+		/* Is this really needed ? */
+		zfs_sa_upgrade_txholds(tx, xzp);
+		dmu_tx_mark_netfree(tx);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			vput(ZTOV(xzp));
+			skipped += 1;
+			continue;
+		}
+
+		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
+		if (error)
+			skipped += 1;
+		dmu_tx_commit(tx);
+
+		vput(ZTOV(xzp));
+	}
+	zap_cursor_fini(&zc);
+	/* Anything other than a clean end-of-directory counts as a skip. */
+	if (error != ENOENT)
+		skipped += 1;
+	return (skipped);
+}
+
+extern taskq_t *zfsvfs_taskq;
+
+/*
+ * Destroy the on-disk object behind a znode whose link count is zero.
+ *
+ * The object's data (or, for an xattr directory, its entries) is freed
+ * first; then a single transaction removes the znode from the unlinked
+ * set and deletes it. On any failure the znode is released in core and
+ * left in the unlinked set. If the file has an xattr directory, that
+ * directory is added to the unlinked set and a drain task is queued.
+ */
+void
+zfs_rmnode(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	objset_t *os = zfsvfs->z_os;
+	dmu_tx_t *tx;
+	uint64_t acl_obj;
+	uint64_t xattr_obj;
+	uint64_t count;
+	int error;
+
+	ASSERT3U(zp->z_links, ==, 0);
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+	/*
+	 * If this is an attribute directory, purge its contents.
+	 */
+	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+	    (zp->z_pflags & ZFS_XATTR)) {
+		if (zfs_purgedir(zp) != 0) {
+			/*
+			 * Not enough space to delete some xattrs.
+			 * Leave it in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
+	} else {
+		/*
+		 * Free up all the data in the file. We don't do this for
+		 * XATTR directories because we need truncate and remove to be
+		 * in the same tx, like in zfs_znode_delete(). Otherwise, if
+		 * we crash here we'll end up with an inconsistent truncated
+		 * zap object in the delete queue. Note a truncated file is
+		 * harmless since it only contains user data.
+		 */
+		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+		if (error) {
+			/*
+			 * Not enough space or we were interrupted by unmount.
+			 * Leave the file in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
+	}
+
+	/*
+	 * If the file has extended attributes, we're going to unlink
+	 * the xattr dir.
+	 */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error)
+		xattr_obj = 0;
+
+	acl_obj = zfs_external_acl(zp);
+
+	/*
+	 * Set up the final transaction.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	if (xattr_obj)
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+	if (acl_obj)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		/*
+		 * Not enough space to delete the file. Leave it in the
+		 * unlinked set, leaking it until the fs is remounted (at
+		 * which point we'll call zfs_unlinked_drain() to process it).
+		 */
+		dmu_tx_abort(tx);
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_free(zp);
+		return;
+	}
+
+	/*
+	 * FreeBSD's implementation of zfs_zget requires a vnode to back it.
+	 * This means that we could end up calling into getnewvnode while
+	 * calling zfs_rmnode as a result of a prior call to getnewvnode
+	 * trying to clear vnodes out of the cache. If this repeats we can
+	 * recurse enough that we overflow our stack. To avoid this, we
+	 * avoid calling zfs_zget on the xattr znode and instead simply add
+	 * it to the unlinked set and schedule a call to zfs_unlinked_drain.
+	 */
+	if (xattr_obj) {
+		/* Add extended attribute directory to the unlinked set. */
+		VERIFY3U(0, ==,
+		    zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
+	}
+
+	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	/* Remove this znode from the unlinked set */
+	VERIFY3U(0, ==,
+	    zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	/* Wake dd_activity_cv waiters once the unlinked set is empty. */
+	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+	}
+
+	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+	zfs_znode_delete(zp, tx);
+
+	dmu_tx_commit(tx);
+
+	if (xattr_obj) {
+		/*
+		 * We're using the FreeBSD taskqueue API here instead of
+		 * the Solaris taskq API since the FreeBSD API allows for a
+		 * task to be enqueued multiple times but executed once.
+		 */
+		taskqueue_enqueue(zfsvfs_taskq->tq_queue,
+		    &zfsvfs->z_unlinked_drain_task);
+	}
+}
+
+/*
+ * Build the 64-bit value stored in a directory ZAP entry for zp: the
+ * object id, with the file type from 'mode' in the top four bits when
+ * the filesystem version is at least ZPL_VERSION_DIRENT_TYPE.
+ */
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+	uint64_t de = zp->z_id;
+
+	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
+		de |= IFTODT(mode) << 60;
+	return (de);
+}
+
+/*
+ * Link zp into dzp. Fails with ENOENT if zp has been unlinked, with
+ * EMLINK if a link count would exceed ZFS_LINK_MAX, or with the error
+ * from zap_add() if the directory entry cannot be added.
+ */
+int
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	uint64_t value;
+	int zp_is_dir = (vp->v_type == VDIR);
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE) {
+		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	}
+	/* A new subdirectory adds a ".." link to the parent. */
+	if (zp_is_dir) {
+		if (dzp->z_links >= ZFS_LINK_MAX)
+			return (SET_ERROR(EMLINK));
+	}
+	if (!(flag & ZRENAMING)) {
+		if (zp->z_unlinked) {	/* no new links to unlinked zp */
+			ASSERT(!(flag & (ZNEW | ZEXISTS)));
+			return (SET_ERROR(ENOENT));
+		}
+		if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
+			return (SET_ERROR(EMLINK));
+		}
+		zp->z_links++;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+		    &zp->z_links, sizeof (zp->z_links));
+
+	} else {
+		ASSERT(!zp->z_unlinked);
+	}
+	value = zfs_dirent(zp, zp->z_mode);
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
+	    8, 1, &value, tx);
+
+	/*
+	 * zap_add could fail to add the entry if it exceeds the capacity of the
+	 * leaf-block and zap_leaf_split() failed to help.
+	 * The caller of this routine is responsible for failing the transaction
+	 * which will rollback the SA updates done above.
+	 */
+	if (error != 0) {
+		if (!(flag & ZRENAMING) && !(flag & ZNEW))
+			zp->z_links--;
+		return (error);
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+	    &dzp->z_id, sizeof (dzp->z_id));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (!(flag & ZNEW)) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+		    ctime);
+	}
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+
+	/* Now account for the new entry in the parent directory. */
+	dzp->z_size++;
+	dzp->z_links += zp_is_dir;
+	count = 0;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+	return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
+{
+	int error;
+
+	if (zp->z_zfsvfs->z_norm) {
+		matchtype_t mt = MT_NORMALIZE;
+
+		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
+			mt |= MT_MATCH_CASE;
+		}
+
+		error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
+		    name, mt, tx);
+	} else {
+		error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
+	}
+
+	return (error);
+}
+
+/*
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
+ * Fails with ENOTEMPTY if zp is a non-empty directory (unless ZRENAMING
+ * is set), or with the error from zfs_dropname() if the name cannot be
+ * removed from dzp.
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag, boolean_t *unlinkedp)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	int zp_is_dir = (vp->v_type == VDIR);
+	boolean_t unlinked = B_FALSE;
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE) {
+		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	}
+	if (!(flag & ZRENAMING)) {
+
+		if (zp_is_dir && !zfs_dirempty(zp))
+			return (SET_ERROR(ENOTEMPTY));
+
+		/*
+		 * If we get here, we are going to try to remove the object.
+		 * First try removing the name from the directory; if that
+		 * fails, return the error.
+		 */
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0) {
+			return (error);
+		}
+
+		/* A directory always carries one extra link for ".". */
+		if (zp->z_links <= zp_is_dir) {
+			zfs_panic_recover("zfs: link count on vnode %p is %u, "
+			    "should be at least %u", zp->z_vnode,
+			    (unsigned int)zp->z_links,
+			    (unsigned int)(zp_is_dir + 1));
+			zp->z_links = zp_is_dir + 1;
+		}
+		if (--zp->z_links == zp_is_dir) {
+			zp->z_unlinked = B_TRUE;
+			zp->z_links = 0;
+			unlinked = B_TRUE;
+		} else {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+			    NULL, ctime, sizeof (ctime));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+			    ctime);
+		}
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+		    NULL, &zp->z_links, sizeof (zp->z_links));
+		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		count = 0;
+		ASSERT0(error);
+	} else {
+		ASSERT(!zp->z_unlinked);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0)
+			return (error);
+	}
+
+	dzp->z_size--;		/* one dirent removed */
+	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+	    NULL, &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+	    NULL, ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+	    NULL, mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+
+	if (unlinkedp != NULL)
+		*unlinkedp = unlinked;
+	else if (unlinked)
+		zfs_unlinked_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+	/* presumably the two remaining entries are "." and ".." -- confirm */
+	return (dzp->z_size == 2);
+}
+
+/*
+ * Create the extended attribute directory for zp and record its object
+ * id in zp's SA_ZPL_XATTR attribute, all within one transaction.
+ *
+ * On success *xvpp points at the new directory's znode and 0 is
+ * returned. On failure *xvpp is NULL and the error from ACL id
+ * creation, the quota check (EDQUOT), or dmu_tx_assign() is returned.
+ */
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	int error;
+	zfs_acl_ids_t acl_ids;
+	boolean_t fuid_dirtied;
+	uint64_t parent __maybe_unused;
+
+	*xvpp = NULL;
+
+	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+	    &acl_ids)) != 0)
+		return (error);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+		zfs_acl_ids_free(&acl_ids);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	/* Every exit path below must drop this vnode reservation. */
+	getnewvnode_reserve_();
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		return (error);
+	}
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	ASSERT0(sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent,
+	    sizeof (parent)));
+	ASSERT3U(parent, ==, zp->z_id);
+
+	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+	    sizeof (xzp->z_id), tx));
+
+	zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL,
+	    acl_ids.z_fuidp, vap);
+
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	*xvpp = xzp;
+
+	return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ *
+ * The directory is only created when CREATE_XATTR_DIR is set in flags;
+ * otherwise a missing directory yields ENOATTR. Creation on a read-only
+ * filesystem fails with EROFS.
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *xzp;
+	vattr_t va;
+	int error;
+top:
+	/* With ZXATTR a missing directory returns 0 and leaves xzp NULL. */
+	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
+	if (error)
+		return (error);
+
+	if (xzp != NULL) {
+		*xzpp = xzp;
+		return (0);
+	}
+
+
+	if (!(flags & CREATE_XATTR_DIR))
+		return (SET_ERROR(ENOATTR));
+
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * The ability to 'create' files in an attribute
+	 * directory comes from the write_xattr permission on the base file.
+	 *
+	 * The ability to 'search' an attribute directory requires
+	 * read_xattr permission on the base file.
+	 *
+	 * Once in a directory the ability to read/write attributes
+	 * is controlled by the permissions on the attribute file.
+	 */
+	va.va_mask = AT_MODE | AT_UID | AT_GID;
+	va.va_type = VDIR;
+	va.va_mode = S_IFDIR | S_ISVTX | 0777;
+	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+	error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+
+	if (error == ERESTART) {
+		/* NB: we already did dmu_tx_wait() if necessary */
+		goto top;
+	}
+	/* The freshly created directory is handed back unlocked. */
+	if (error == 0)
+		VOP_UNLOCK1(ZTOV(*xzpp));
+
+	return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+	uid_t uid;
+	uid_t downer;
+	uid_t fowner;
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+
+	/* Log replay bypasses the sticky-bit check. */
+	if (zdp->z_zfsvfs->z_replay)
+		return (0);
+
+	if ((zdp->z_mode & S_ISVTX) == 0)
+		return (0);
+
+	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+	    (ZTOV(zp)->v_type == VREG &&
+	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
+		return (0);
+	else
+		return (secpolicy_vnode_remove(ZTOV(zp), cr));
+}
diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c
new file mode 100644
index 0000000000..fd86a75416
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_file_os.c
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Open 'path' from kernel context and return the backing file in *fpp.
+ *
+ * Returns 0 on success. If either the open or the descriptor-to-file
+ * lookup fails, the error is returned and *fpp must not be used; in the
+ * latter case the descriptor that was just opened is closed again.
+ */
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+	struct thread *td;
+	int rc, fd;
+
+	td = curthread;
+	pwd_ensure_dirs();
+	/* 12.x doesn't take a const char * */
+	rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path),
+	    UIO_SYSSPACE, flags, mode);
+	if (rc)
+		return (SET_ERROR(rc));
+	fd = td->td_retval[0];
+	td->td_retval[0] = 0;
+	rc = fget(curthread, fd, &cap_no_rights, fpp);
+	if (rc != 0) {
+		/* Do not report success without a usable file pointer. */
+		kern_close(td, fd);
+		return (SET_ERROR(rc));
+	}
+	return (0);
+}
+
+/* Close a file obtained from zfs_file_open(). */
+void
+zfs_file_close(zfs_file_t *fp)
+{
+	fo_close(fp, curthread);
+}
+
+/*
+ * Write 'count' bytes from 'buf' at *offp, advancing *offp by the number
+ * of bytes written. If 'resid' is non-NULL it receives the unwritten
+ * byte count; if it is NULL a short write is reported as EIO.
+ */
+static int
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp,
+    ssize_t *resid)
+{
+	ssize_t rc;
+	struct uio auio;
+	struct thread *td;
+	struct iovec aiov;
+
+	td = curthread;
+	aiov.iov_base = (void *)(uintptr_t)buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = count;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_td = td;
+	auio.uio_offset = *offp;
+
+	if ((fp->f_flag & FWRITE) == 0)
+		return (SET_ERROR(EBADF));
+
+	if (fp->f_type == DTYPE_VNODE)
+		bwillwrite();
+
+	rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	if (resid)
+		*resid = auio.uio_resid;
+	else if (auio.uio_resid)
+		return (SET_ERROR(EIO));
+	*offp += count - auio.uio_resid;
+	return (rc);
+}
+
+/*
+ * Write at the file's current offset; the offset is only advanced when
+ * the write succeeds.
+ */
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	int rc;
+
+	/* The helper has already tagged any error with SET_ERROR(). */
+	rc = zfs_file_write_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+
+	return (rc);
+}
+
+/* Write at an explicit offset without touching the file's own offset. */
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_write_impl(fp, buf, count, &off, resid));
+}
+
+/*
+ * Read up to 'count' bytes into 'buf' from *offp, advancing *offp by the
+ * number of bytes read. If 'resid' is non-NULL it receives the unread
+ * byte count; a short read is not treated as an error.
+ */
+static int
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
+    ssize_t *resid)
+{
+	ssize_t rc;
+	struct uio auio;
+	struct thread *td;
+	struct iovec aiov;
+
+	td = curthread;
+	aiov.iov_base = (void *)(uintptr_t)buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = count;
+	auio.uio_rw = UIO_READ;
+	auio.uio_td = td;
+	auio.uio_offset = *offp;
+
+	if ((fp->f_flag & FREAD) == 0)
+		return (SET_ERROR(EBADF));
+
+	rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	if (resid)
+		*resid = auio.uio_resid;
+	*offp += count - auio.uio_resid;
+	/*
+	 * Success is returned as a plain 0. NOTE(review): SET_ERROR()
+	 * presumably feeds __set_error(), which would log a bogus
+	 * "error 0" when error tracing is on -- confirm.
+	 */
+	return (0);
+}
+
+/*
+ * Read at the file's current offset; the offset is only advanced when
+ * the read succeeds.
+ */
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+	return (rc);
+}
+
+/* Read at an explicit offset without touching the file's own offset. */
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_read_impl(fp, buf, count, &off, resid));
+}
+
+/*
+ * Seek 'fp' to *offp as interpreted by 'whence'. On success *offp is
+ * replaced with the resulting offset. Files whose ops lack
+ * DFLAG_SEEKABLE fail with ESPIPE.
+ */
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+	int rc;
+	struct thread *td;
+
+	td = curthread;
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+		return (SET_ERROR(ESPIPE));
+	rc = fo_seek(fp, *offp, whence, td);
+	if (rc != 0)
+		return (SET_ERROR(rc));
+	*offp = td->td_uretoff.tdu_off;
+	return (0);
+}
+
+int
+zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) +{ + struct thread *td; + struct stat sb; + int rc; + + td = curthread; + +#if __FreeBSD_version < 1400037 + rc = fo_stat(fp, &sb, td->td_ucred, td); +#else + rc = fo_stat(fp, &sb, td->td_ucred); +#endif + if (rc) + return (SET_ERROR(rc)); + zfattr->zfa_size = sb.st_size; + zfattr->zfa_mode = sb.st_mode; + + return (0); +} + +static __inline int +zfs_vop_fsync(vnode_t *vp) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto drop; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(vp, MNT_WAIT, curthread); + VOP_UNLOCK1(vp); + vn_finished_write(mp); +drop: + return (SET_ERROR(error)); +} + +int +zfs_file_fsync(zfs_file_t *fp, int flags) +{ + if (fp->f_type != DTYPE_VNODE) + return (EINVAL); + + return (zfs_vop_fsync(fp->f_vnode)); +} + +zfs_file_t * +zfs_file_get(int fd) +{ + struct file *fp; + + if (fget(curthread, fd, &cap_no_rights, &fp)) + return (NULL); + + return (fp); +} + +void +zfs_file_put(zfs_file_t *fp) +{ + fdrop(fp, curthread); +} + +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (fp->f_offset); +} + +void * +zfs_file_private(zfs_file_t *fp) +{ + file_t *tmpfp; + void *data; + int error; + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + curthread->td_fpop = tmpfp; + if (error != 0) + return (NULL); + return (data); +} + +int +zfs_file_unlink(const char *fnamep) +{ + zfs_uio_seg_t seg = UIO_SYSSPACE; + int rc; + +#if __FreeBSD_version >= 1300018 + rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); +#elif __FreeBSD_version >= 1202504 || defined(AT_BENEATH) + rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), + seg, 0, 0); +#else + rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), + seg, 0); +#endif + return (SET_ERROR(rc)); +} diff --git a/module/os/freebsd/zfs/zfs_ioctl_compat.c b/module/os/freebsd/zfs/zfs_ioctl_compat.c new file 
mode 100644 index 0000000000..81967bed73 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ioctl_compat.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum zfs_ioc_legacy { + ZFS_IOC_LEGACY_NONE = -1, + ZFS_IOC_LEGACY_FIRST = 0, + ZFS_LEGACY_IOC = ZFS_IOC_LEGACY_FIRST, + ZFS_IOC_LEGACY_POOL_CREATE = ZFS_IOC_LEGACY_FIRST, + ZFS_IOC_LEGACY_POOL_DESTROY, + ZFS_IOC_LEGACY_POOL_IMPORT, + ZFS_IOC_LEGACY_POOL_EXPORT, + ZFS_IOC_LEGACY_POOL_CONFIGS, + ZFS_IOC_LEGACY_POOL_STATS, + ZFS_IOC_LEGACY_POOL_TRYIMPORT, + ZFS_IOC_LEGACY_POOL_SCAN, + ZFS_IOC_LEGACY_POOL_FREEZE, + ZFS_IOC_LEGACY_POOL_UPGRADE, + ZFS_IOC_LEGACY_POOL_GET_HISTORY, + ZFS_IOC_LEGACY_VDEV_ADD, + ZFS_IOC_LEGACY_VDEV_REMOVE, + ZFS_IOC_LEGACY_VDEV_SET_STATE, + ZFS_IOC_LEGACY_VDEV_ATTACH, + ZFS_IOC_LEGACY_VDEV_DETACH, + ZFS_IOC_LEGACY_VDEV_SETPATH, + ZFS_IOC_LEGACY_VDEV_SETFRU, + ZFS_IOC_LEGACY_OBJSET_STATS, + ZFS_IOC_LEGACY_OBJSET_ZPLPROPS, + ZFS_IOC_LEGACY_DATASET_LIST_NEXT, + ZFS_IOC_LEGACY_SNAPSHOT_LIST_NEXT, + ZFS_IOC_LEGACY_SET_PROP, + ZFS_IOC_LEGACY_CREATE, + ZFS_IOC_LEGACY_DESTROY, + ZFS_IOC_LEGACY_ROLLBACK, + ZFS_IOC_LEGACY_RENAME, + ZFS_IOC_LEGACY_RECV, + ZFS_IOC_LEGACY_SEND, + ZFS_IOC_LEGACY_INJECT_FAULT, + ZFS_IOC_LEGACY_CLEAR_FAULT, + ZFS_IOC_LEGACY_INJECT_LIST_NEXT, + ZFS_IOC_LEGACY_ERROR_LOG, + ZFS_IOC_LEGACY_CLEAR, + ZFS_IOC_LEGACY_PROMOTE, + ZFS_IOC_LEGACY_DESTROY_SNAPS, + ZFS_IOC_LEGACY_SNAPSHOT, + ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, + ZFS_IOC_LEGACY_OBJ_TO_PATH, + ZFS_IOC_LEGACY_POOL_SET_PROPS, + ZFS_IOC_LEGACY_POOL_GET_PROPS, + ZFS_IOC_LEGACY_SET_FSACL, + ZFS_IOC_LEGACY_GET_FSACL, + ZFS_IOC_LEGACY_SHARE, + ZFS_IOC_LEGACY_INHERIT_PROP, + ZFS_IOC_LEGACY_SMB_ACL, + ZFS_IOC_LEGACY_USERSPACE_ONE, + ZFS_IOC_LEGACY_USERSPACE_MANY, + ZFS_IOC_LEGACY_USERSPACE_UPGRADE, + ZFS_IOC_LEGACY_HOLD, + ZFS_IOC_LEGACY_RELEASE, + ZFS_IOC_LEGACY_GET_HOLDS, + ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, + ZFS_IOC_LEGACY_VDEV_SPLIT, + ZFS_IOC_LEGACY_NEXT_OBJ, + ZFS_IOC_LEGACY_DIFF, + ZFS_IOC_LEGACY_TMP_SNAPSHOT, + 
ZFS_IOC_LEGACY_OBJ_TO_STATS, + ZFS_IOC_LEGACY_JAIL, + ZFS_IOC_LEGACY_UNJAIL, + ZFS_IOC_LEGACY_POOL_REGUID, + ZFS_IOC_LEGACY_SPACE_WRITTEN, + ZFS_IOC_LEGACY_SPACE_SNAPS, + ZFS_IOC_LEGACY_SEND_PROGRESS, + ZFS_IOC_LEGACY_POOL_REOPEN, + ZFS_IOC_LEGACY_LOG_HISTORY, + ZFS_IOC_LEGACY_SEND_NEW, + ZFS_IOC_LEGACY_SEND_SPACE, + ZFS_IOC_LEGACY_CLONE, + ZFS_IOC_LEGACY_BOOKMARK, + ZFS_IOC_LEGACY_GET_BOOKMARKS, + ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, + ZFS_IOC_LEGACY_NEXTBOOT, + ZFS_IOC_LEGACY_CHANNEL_PROGRAM, + ZFS_IOC_LEGACY_REMAP, + ZFS_IOC_LEGACY_POOL_CHECKPOINT, + ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, + ZFS_IOC_LEGACY_POOL_INITIALIZE, + ZFS_IOC_LEGACY_POOL_SYNC, + ZFS_IOC_LEGACY_LAST +}; + +unsigned static long zfs_ioctl_legacy_to_ozfs_[] = { + ZFS_IOC_POOL_CREATE, /* 0x00 */ + ZFS_IOC_POOL_DESTROY, /* 0x01 */ + ZFS_IOC_POOL_IMPORT, /* 0x02 */ + ZFS_IOC_POOL_EXPORT, /* 0x03 */ + ZFS_IOC_POOL_CONFIGS, /* 0x04 */ + ZFS_IOC_POOL_STATS, /* 0x05 */ + ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */ + ZFS_IOC_POOL_SCAN, /* 0x07 */ + ZFS_IOC_POOL_FREEZE, /* 0x08 */ + ZFS_IOC_POOL_UPGRADE, /* 0x09 */ + ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */ + ZFS_IOC_VDEV_ADD, /* 0x0b */ + ZFS_IOC_VDEV_REMOVE, /* 0x0c */ + ZFS_IOC_VDEV_SET_STATE, /* 0x0d */ + ZFS_IOC_VDEV_ATTACH, /* 0x0e */ + ZFS_IOC_VDEV_DETACH, /* 0x0f */ + ZFS_IOC_VDEV_SETPATH, /* 0x10 */ + ZFS_IOC_VDEV_SETFRU, /* 0x11 */ + ZFS_IOC_OBJSET_STATS, /* 0x12 */ + ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */ + ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */ + ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */ + ZFS_IOC_SET_PROP, /* 0x16 */ + ZFS_IOC_CREATE, /* 0x17 */ + ZFS_IOC_DESTROY, /* 0x18 */ + ZFS_IOC_ROLLBACK, /* 0x19 */ + ZFS_IOC_RENAME, /* 0x1a */ + ZFS_IOC_RECV, /* 0x1b */ + ZFS_IOC_SEND, /* 0x1c */ + ZFS_IOC_INJECT_FAULT, /* 0x1d */ + ZFS_IOC_CLEAR_FAULT, /* 0x1e */ + ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */ + ZFS_IOC_ERROR_LOG, /* 0x20 */ + ZFS_IOC_CLEAR, /* 0x21 */ + ZFS_IOC_PROMOTE, /* 0x22 */ + /* start of mismatch */ + + ZFS_IOC_DESTROY_SNAPS, /* 0x23:0x3b */ + 
ZFS_IOC_SNAPSHOT, /* 0x24:0x23 */ + ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x25:0x24 */ + ZFS_IOC_OBJ_TO_PATH, /* 0x26:0x25 */ + ZFS_IOC_POOL_SET_PROPS, /* 0x27:0x26 */ + ZFS_IOC_POOL_GET_PROPS, /* 0x28:0x27 */ + ZFS_IOC_SET_FSACL, /* 0x29:0x28 */ + ZFS_IOC_GET_FSACL, /* 0x2a:0x29 */ + ZFS_IOC_SHARE, /* 0x2b:0x2a */ + ZFS_IOC_INHERIT_PROP, /* 0x2c:0x2b */ + ZFS_IOC_SMB_ACL, /* 0x2d:0x2c */ + ZFS_IOC_USERSPACE_ONE, /* 0x2e:0x2d */ + ZFS_IOC_USERSPACE_MANY, /* 0x2f:0x2e */ + ZFS_IOC_USERSPACE_UPGRADE, /* 0x30:0x2f */ + ZFS_IOC_HOLD, /* 0x31:0x30 */ + ZFS_IOC_RELEASE, /* 0x32:0x31 */ + ZFS_IOC_GET_HOLDS, /* 0x33:0x32 */ + ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x34:0x33 */ + ZFS_IOC_VDEV_SPLIT, /* 0x35:0x34 */ + ZFS_IOC_NEXT_OBJ, /* 0x36:0x35 */ + ZFS_IOC_DIFF, /* 0x37:0x36 */ + ZFS_IOC_TMP_SNAPSHOT, /* 0x38:0x37 */ + ZFS_IOC_OBJ_TO_STATS, /* 0x39:0x38 */ + ZFS_IOC_JAIL, /* 0x3a:0xc2 */ + ZFS_IOC_UNJAIL, /* 0x3b:0xc3 */ + ZFS_IOC_POOL_REGUID, /* 0x3c:0x3c */ + ZFS_IOC_SPACE_WRITTEN, /* 0x3d:0x39 */ + ZFS_IOC_SPACE_SNAPS, /* 0x3e:0x3a */ + ZFS_IOC_SEND_PROGRESS, /* 0x3f:0x3e */ + ZFS_IOC_POOL_REOPEN, /* 0x40:0x3d */ + ZFS_IOC_LOG_HISTORY, /* 0x41:0x3f */ + ZFS_IOC_SEND_NEW, /* 0x42:0x40 */ + ZFS_IOC_SEND_SPACE, /* 0x43:0x41 */ + ZFS_IOC_CLONE, /* 0x44:0x42 */ + ZFS_IOC_BOOKMARK, /* 0x45:0x43 */ + ZFS_IOC_GET_BOOKMARKS, /* 0x46:0x44 */ + ZFS_IOC_DESTROY_BOOKMARKS, /* 0x47:0x45 */ + ZFS_IOC_NEXTBOOT, /* 0x48:0xc1 */ + ZFS_IOC_CHANNEL_PROGRAM, /* 0x49:0x48 */ + ZFS_IOC_REMAP, /* 0x4a:0x4c */ + ZFS_IOC_POOL_CHECKPOINT, /* 0x4b:0x4d */ + ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x4c:0x4e */ + ZFS_IOC_POOL_INITIALIZE, /* 0x4d:0x4f */ +}; + +unsigned static long zfs_ioctl_ozfs_to_legacy_common_[] = { + ZFS_IOC_POOL_CREATE, /* 0x00 */ + ZFS_IOC_POOL_DESTROY, /* 0x01 */ + ZFS_IOC_POOL_IMPORT, /* 0x02 */ + ZFS_IOC_POOL_EXPORT, /* 0x03 */ + ZFS_IOC_POOL_CONFIGS, /* 0x04 */ + ZFS_IOC_POOL_STATS, /* 0x05 */ + ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */ + ZFS_IOC_POOL_SCAN, /* 0x07 */ + ZFS_IOC_POOL_FREEZE, /* 0x08 
*/ + ZFS_IOC_POOL_UPGRADE, /* 0x09 */ + ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */ + ZFS_IOC_VDEV_ADD, /* 0x0b */ + ZFS_IOC_VDEV_REMOVE, /* 0x0c */ + ZFS_IOC_VDEV_SET_STATE, /* 0x0d */ + ZFS_IOC_VDEV_ATTACH, /* 0x0e */ + ZFS_IOC_VDEV_DETACH, /* 0x0f */ + ZFS_IOC_VDEV_SETPATH, /* 0x10 */ + ZFS_IOC_VDEV_SETFRU, /* 0x11 */ + ZFS_IOC_OBJSET_STATS, /* 0x12 */ + ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */ + ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */ + ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */ + ZFS_IOC_SET_PROP, /* 0x16 */ + ZFS_IOC_CREATE, /* 0x17 */ + ZFS_IOC_DESTROY, /* 0x18 */ + ZFS_IOC_ROLLBACK, /* 0x19 */ + ZFS_IOC_RENAME, /* 0x1a */ + ZFS_IOC_RECV, /* 0x1b */ + ZFS_IOC_SEND, /* 0x1c */ + ZFS_IOC_INJECT_FAULT, /* 0x1d */ + ZFS_IOC_CLEAR_FAULT, /* 0x1e */ + ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */ + ZFS_IOC_ERROR_LOG, /* 0x20 */ + ZFS_IOC_CLEAR, /* 0x21 */ + ZFS_IOC_PROMOTE, /* 0x22 */ + /* start of mismatch */ + ZFS_IOC_LEGACY_SNAPSHOT, /* 0x23 */ + ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, /* 0x24 */ + ZFS_IOC_LEGACY_OBJ_TO_PATH, /* 0x25 */ + ZFS_IOC_LEGACY_POOL_SET_PROPS, /* 0x26 */ + ZFS_IOC_LEGACY_POOL_GET_PROPS, /* 0x27 */ + ZFS_IOC_LEGACY_SET_FSACL, /* 0x28 */ + ZFS_IOC_LEGACY_GET_FSACL, /* 0x29 */ + ZFS_IOC_LEGACY_SHARE, /* 0x2a */ + ZFS_IOC_LEGACY_INHERIT_PROP, /* 0x2b */ + ZFS_IOC_LEGACY_SMB_ACL, /* 0x2c */ + ZFS_IOC_LEGACY_USERSPACE_ONE, /* 0x2d */ + ZFS_IOC_LEGACY_USERSPACE_MANY, /* 0x2e */ + ZFS_IOC_LEGACY_USERSPACE_UPGRADE, /* 0x2f */ + ZFS_IOC_LEGACY_HOLD, /* 0x30 */ + ZFS_IOC_LEGACY_RELEASE, /* 0x31 */ + ZFS_IOC_LEGACY_GET_HOLDS, /* 0x32 */ + ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, /* 0x33 */ + ZFS_IOC_LEGACY_VDEV_SPLIT, /* 0x34 */ + ZFS_IOC_LEGACY_NEXT_OBJ, /* 0x35 */ + ZFS_IOC_LEGACY_DIFF, /* 0x36 */ + ZFS_IOC_LEGACY_TMP_SNAPSHOT, /* 0x37 */ + ZFS_IOC_LEGACY_OBJ_TO_STATS, /* 0x38 */ + ZFS_IOC_LEGACY_SPACE_WRITTEN, /* 0x39 */ + ZFS_IOC_LEGACY_SPACE_SNAPS, /* 0x3a */ + ZFS_IOC_LEGACY_DESTROY_SNAPS, /* 0x3b */ + ZFS_IOC_LEGACY_POOL_REGUID, /* 0x3c */ + ZFS_IOC_LEGACY_POOL_REOPEN, /* 0x3d */ 
+ ZFS_IOC_LEGACY_SEND_PROGRESS, /* 0x3e */ + ZFS_IOC_LEGACY_LOG_HISTORY, /* 0x3f */ + ZFS_IOC_LEGACY_SEND_NEW, /* 0x40 */ + ZFS_IOC_LEGACY_SEND_SPACE, /* 0x41 */ + ZFS_IOC_LEGACY_CLONE, /* 0x42 */ + ZFS_IOC_LEGACY_BOOKMARK, /* 0x43 */ + ZFS_IOC_LEGACY_GET_BOOKMARKS, /* 0x44 */ + ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, /* 0x45 */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_RECV_NEW */ + ZFS_IOC_LEGACY_POOL_SYNC, /* 0x47 */ + ZFS_IOC_LEGACY_CHANNEL_PROGRAM, /* 0x48 */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_LOAD_KEY */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_UNLOAD_KEY */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_CHANGE_KEY */ + ZFS_IOC_LEGACY_REMAP, /* 0x4c */ + ZFS_IOC_LEGACY_POOL_CHECKPOINT, /* 0x4d */ + ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, /* 0x4e */ + ZFS_IOC_LEGACY_POOL_INITIALIZE, /* 0x4f */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_POOL_TRIM */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_REDACT */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOKMARK_PROPS */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT_FS */ +}; + +unsigned static long zfs_ioctl_ozfs_to_legacy_platform_[] = { + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_NEXT */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_CLEAR */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_SEEK */ + ZFS_IOC_LEGACY_NEXTBOOT, + ZFS_IOC_LEGACY_JAIL, + ZFS_IOC_LEGACY_UNJAIL, + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_SET_BOOTENV */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOTENV */ +}; + +int +zfs_ioctl_legacy_to_ozfs(int request) +{ + if (request >= sizeof (zfs_ioctl_legacy_to_ozfs_)/sizeof (long)) + return (-1); + return (zfs_ioctl_legacy_to_ozfs_[request]); +} + +int +zfs_ioctl_ozfs_to_legacy(int request) +{ + if (request > ZFS_IOC_LAST) + return (-1); + + if (request > ZFS_IOC_PLATFORM) { + request -= ZFS_IOC_PLATFORM + 1; + return (zfs_ioctl_ozfs_to_legacy_platform_[request]); + } + if (request >= sizeof (zfs_ioctl_ozfs_to_legacy_common_)/sizeof (long)) + return (-1); + return (zfs_ioctl_ozfs_to_legacy_common_[request]); +} + +void 
+zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + memcpy(&dst->zc_begin_record, &src->zc_begin_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_begin_record)); + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_zoneid = src->zc_jailid; +} + +void +zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + *&dst->zc_begin_record.drr_u.drr_begin = *&src->zc_begin_record; + dst->zc_begin_record.drr_payloadlen = 0; + dst->zc_begin_record.drr_type = 0; + + memcpy(&dst->zc_inject_record, &src->zc_inject_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_inject_record)); + dst->zc_resumable = B_FALSE; + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_jailid = src->zc_zoneid; +} diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c new file mode 100644 index 0000000000..7f7e2b72c5 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __FreeBSD_version < 1201517 +#define vm_page_max_user_wired vm_page_max_wired +#endif + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + int error = 0; + + if (*zfvp == NULL) + return (SET_ERROR(ESRCH)); + + error = vfs_busy((*zfvp)->z_vfs, 0); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + return (error); +} + +int +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_vfs != NULL); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + vfs_unbusy(zfsvfs->z_vfs); +} + +static const zfs_ioc_key_t zfs_keys_nextboot[] = { + {"command", DATA_TYPE_STRING, 0}, + { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0}, + { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0} +}; + +static int +zfs_ioc_jail(zfs_cmd_t *zc) +{ + + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int +zfs_ioc_unjail(zfs_cmd_t *zc) +{ + + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int +zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) +{ + char name[MAXNAMELEN]; + 
spa_t *spa; + vdev_t *vd; + char *command; + uint64_t pool_guid; + uint64_t vdev_guid; + int error; + + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + return (EINVAL); + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_GUID, &vdev_guid) != 0) + return (EINVAL); + if (nvlist_lookup_string(innvl, + "command", &command) != 0) + return (EINVAL); + + mutex_enter(&spa_namespace_lock); + spa = spa_by_guid(pool_guid, vdev_guid); + if (spa != NULL) + strcpy(name, spa_name(spa)); + mutex_exit(&spa_namespace_lock); + if (spa == NULL) + return (ENOENT); + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENXIO); + spa_close(spa, FTAG); + return (ENODEV); + } + error = vdev_label_write_pad2(vd, command, strlen(command)); + (void) spa_vdev_state_exit(spa, NULL, 0); + txg_wait_synced(spa->spa_dsl_pool, 0); + spa_close(spa, FTAG); + return (error); +} + +/* Update the VFS's cache of mountpoint properties */ +void +zfs_ioctl_update_mount_cache(const char *dsname) +{ + zfsvfs_t *zfsvfs; + + if (getzfsvfs(dsname, &zfsvfs) == 0) { + struct mount *mp = zfsvfs->z_vfs; + VFS_STATFS(mp, &mp->mnt_stat); + zfs_vfs_rele(zfsvfs); + } + /* + * Ignore errors; we can't do anything useful if either getzfsvfs or + * VFS_STATFS fails. 
+ */ +} + +uint64_t +zfs_max_nvlist_src_size_os(void) +{ + if (zfs_max_nvlist_src_size != 0) + return (zfs_max_nvlist_src_size); + + return (ptob(vm_page_max_user_wired) / 4); +} + +void +zfs_ioctl_init_os(void) +{ + zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, + zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3); + +} diff --git a/module/os/freebsd/zfs/zfs_racct.c b/module/os/freebsd/zfs/zfs_racct.c new file mode 100644 index 0000000000..b46cc04626 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_racct.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 iXsystems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +void +zfs_racct_read(uint64_t size, uint64_t iops) +{ + curthread->td_ru.ru_inblock += iops; +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_READBPS, size); + racct_add_force(curproc, RACCT_READIOPS, iops); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ +} + +void +zfs_racct_write(uint64_t size, uint64_t iops) +{ + curthread->td_ru.ru_oublock += iops; +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_WRITEBPS, size); + racct_add_force(curproc, RACCT_WRITEIOPS, iops); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ +} diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c new file mode 100644 index 0000000000..42e11eeb18 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -0,0 +1,2328 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_comutil.h" + +#ifndef MNTK_VMSETSIZE_BUG +#define MNTK_VMSETSIZE_BUG 0 +#endif +#ifndef MNTK_NOMSYNC +#define MNTK_NOMSYNC 8 +#endif + +/* BEGIN CSTYLED */ +struct mtx zfs_debug_mtx; +MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + +SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + +int zfs_debug_level; +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, + "Debug level"); + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static 
int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); +/* END CSTYLED */ + +#if __FreeBSD_version >= 1400018 +static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, + bool *mp_busy); +#else +static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); +#endif +static int zfs_mount(vfs_t *vfsp); +static int zfs_umount(vfs_t *vfsp, int fflag); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); +static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); +static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); +static int zfs_sync(vfs_t *vfsp, int waitfor); +#if __FreeBSD_version >= 1300098 +static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, + struct ucred **credanonp, int *numsecflavors, int *secflavors); +#else +static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors); +#endif +static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); +static void zfs_freevfs(vfs_t *vfsp); + +struct vfsops zfs_vfsops = { + .vfs_mount = zfs_mount, + .vfs_unmount = zfs_umount, +#if __FreeBSD_version >= 1300049 + .vfs_root = vfs_cache_root, + .vfs_cachedroot = zfs_root, +#else + .vfs_root = zfs_root, +#endif + .vfs_statfs = zfs_statfs, + .vfs_vget = zfs_vget, + .vfs_sync = zfs_sync, + .vfs_checkexp = zfs_checkexp, + .vfs_fhtovp = zfs_fhtovp, + .vfs_quotactl = zfs_quotactl, +}; + +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); + +/* + * We need to keep a count of active fs's. 
+ * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +int +zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, + char *setpoint) +{ + int error; + zfsvfs_t *zfvp; + vfs_t *vfsp; + objset_t *os; + uint64_t tmp = *val; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + error = getzfsvfs_impl(os, &zfvp); + if (error != 0) + return (error); + if (zfvp == NULL) + return (ENOENT); + vfsp = zfvp->z_vfs; + switch (zfs_prop) { + case ZFS_PROP_ATIME: + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) + tmp = 1; + break; + case ZFS_PROP_DEVICES: + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) + tmp = 1; + break; + case ZFS_PROP_EXEC: + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + tmp = 1; + break; + case ZFS_PROP_SETUID: + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + tmp = 1; + break; + case ZFS_PROP_READONLY: + if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) + tmp = 1; + break; + case ZFS_PROP_XATTR: + if (zfvp->z_flags & ZSB_XATTR) + tmp = zfvp->z_xattr; + break; + case ZFS_PROP_NBMAND: + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) + tmp = 1; + break; + default: + vfs_unbusy(vfsp); + return (ENOENT); + } + + vfs_unbusy(vfsp); + if (tmp != *val) { + (void) strcpy(setpoint, "temporary"); + *val = tmp; + } + return (0); +} + +static int +zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) +{ + int error = 0; + char buf[32]; + uint64_t usedobj, quotaobj; + uint64_t quota, used = 0; + timespec_t now; + + usedobj = isgroup ? 
DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) { + error = ENOENT; + goto done; + } + (void) sprintf(buf, "%llx", (longlong_t)id); + if ((error = zap_lookup(zfsvfs->z_os, quotaobj, + buf, sizeof (quota), 1, &quota)) != 0) { + dprintf("%s(%d): quotaobj lookup failed\n", + __FUNCTION__, __LINE__); + goto done; + } + /* + * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit". + * So we set them to be the same. + */ + dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); + error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); + if (error && error != ENOENT) { + dprintf("%s(%d): usedobj failed; %d\n", + __FUNCTION__, __LINE__, error); + goto done; + } + dqp->dqb_curblocks = btodb(used); + dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; + vfs_timestamp(&now); + /* + * Setting this to 0 causes FreeBSD quota(8) to print + * the number of days since the epoch, which isn't + * particularly useful. 
+ */ + dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; +done: + return (error); +} + +static int +#if __FreeBSD_version >= 1400018 +zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy) +#else +zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) +#endif +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct thread *td; + int cmd, type, error = 0; + int bitsize; + zfs_userquota_prop_t quota_type; + struct dqblk64 dqblk = { 0 }; + + td = curthread; + cmd = cmds >> SUBCMDSHIFT; + type = cmds & SUBCMDMASK; + + ZFS_ENTER(zfsvfs); + if (id == -1) { + switch (type) { + case USRQUOTA: + id = td->td_ucred->cr_ruid; + break; + case GRPQUOTA: + id = td->td_ucred->cr_rgid; + break; + default: + error = EINVAL; +#if __FreeBSD_version < 1400018 + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) + vfs_unbusy(vfsp); +#endif + goto done; + } + } + /* + * Map BSD type to: + * ZFS_PROP_USERUSED, + * ZFS_PROP_USERQUOTA, + * ZFS_PROP_GROUPUSED, + * ZFS_PROP_GROUPQUOTA + */ + switch (cmd) { + case Q_SETQUOTA: + case Q_SETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERQUOTA; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPQUOTA; + else + error = EINVAL; + break; + case Q_GETQUOTA: + case Q_GETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERUSED; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPUSED; + else + error = EINVAL; + break; + } + + /* + * Depending on the cmd, we may need to get + * the ruid and domain (see fuidstr_to_sid?), + * the fuid (how?), or other information. + * Create fuid using zfs_fuid_create(zfsvfs, id, + * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? + * I think I can use just the id? + * + * Look at zfs_id_overquota() to look up a quota. + * zap_lookup(something, quotaobj, fuidstring, + * sizeof (long long), 1, &quota) + * + * See zfs_set_userquota() to set a quota. 
+ */ + if ((uint32_t)type >= MAXQUOTAS) { + error = EINVAL; + goto done; + } + + switch (cmd) { + case Q_GETQUOTASIZE: + bitsize = 64; + error = copyout(&bitsize, arg, sizeof (int)); + break; + case Q_QUOTAON: + // As far as I can tell, you can't turn quotas on or off on zfs + error = 0; +#if __FreeBSD_version < 1400018 + vfs_unbusy(vfsp); +#endif + break; + case Q_QUOTAOFF: + error = ENOTSUP; +#if __FreeBSD_version < 1400018 + vfs_unbusy(vfsp); +#endif + break; + case Q_SETQUOTA: + error = copyin(arg, &dqblk, sizeof (dqblk)); + if (error == 0) + error = zfs_set_userquota(zfsvfs, quota_type, + "", id, dbtob(dqblk.dqb_bhardlimit)); + break; + case Q_GETQUOTA: + error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); + if (error == 0) + error = copyout(&dqblk, arg, sizeof (dqblk)); + break; + default: + error = EINVAL; + break; + } +done: + ZFS_EXIT(zfsvfs); + return (error); +} + + +boolean_t +zfs_is_readonly(zfsvfs_t *zfsvfs) +{ + return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); +} + +/*ARGSUSED*/ +static int +zfs_sync(vfs_t *vfsp, int waitfor) +{ + + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * Ignore the system syncher. ZFS already commits async data + * at zfs_txg_timeout intervals. + */ + if (waitfor == MNT_LAZY) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; + int error; + + error = vfs_stdsync(vfsp, waitfor); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (rebooting && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. 
This is what happens when you + * run sync(8). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_flags &= ~ZSB_XATTR; + } else { + zfsvfs->z_flags |= ZSB_XATTR; + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->mnt_stat.f_iosize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + } else { + /* XXX locking on vfs_flag? 
*/ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +/* + * The nbmand mount option can be changed at mount time. 
+ * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static void +acl_type_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_type = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; + boolean_t do_xattr = B_FALSE; + int error = 0; + + ASSERT3P(vfsp, !=, NULL); + zfsvfs = vfsp->vfs_data; + ASSERT3P(zfsvfs, !=, NULL); + os = zfsvfs->z_os; + + /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. 
+ */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * We need to enter pool configuration here, so that we can use + * dsl_prop_get_int_ds() to handle the special nbmand property below. + * dsl_prop_get_integer() can not be used, because it has to acquire + * spa_namespace_lock and we can not do that because we already hold + * z_teardown_lock. 
The problem is that spa_write_cachefile() is called + * with spa_namespace_lock held and the function calls ZFS vnode + * operations to write the cache file and thus z_teardown_lock is + * acquired after spa_namespace_lock. + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + return (error); + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? 
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. 
Pool must be upgraded to mount " + "this file system.", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); + if (error != 0) + return (error); + zfsvfs->z_acl_type = (uint_t)val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); + if (error == 0 && val == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT3U(zfsvfs->z_root, !=, 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, 
&zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], + 8, 1, &zfsvfs->z_projectquota_obj); + if (error == ENOENT) + zfsvfs->z_projectquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], + 8, 1, &zfsvfs->z_userobjquota_obj); + if (error == ENOENT) + zfsvfs->z_userobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], + 8, 1, &zfsvfs->z_groupobjquota_obj); + if (error == ENOENT) + zfsvfs->z_groupobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], + 8, 1, &zfsvfs->z_projectobjquota_obj); + if (error == ENOENT) + zfsvfs->z_projectobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). 
+ */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + + return (0); +} + +taskq_t *zfsvfs_taskq; + +static void +zfsvfs_task_unlinked_drain(void *context, int pending __unused) +{ + + zfs_unlinked_drain((zfsvfs_t *)context); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); + + /* + * XXX: Fix struct statfs so this isn't necessary! + * + * The 'osname' is used as the filesystem's special node, which means + * it must fit in statfs.f_mntfromname, or else it can't be + * enumerated, so libzfs_mnttab_find() returns NULL, which causes + * 'zfs unmount' to think it's not mounted when it is. + */ + if (strlen(osname) >= MNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, + &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + + return (error); +} + + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, + zfsvfs_task_unlinked_drain, zfsvfs); + ZFS_TEARDOWN_INIT(zfsvfs); + ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + *zfvp = NULL; + 
kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) + return (SET_ERROR(EROFS)); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + if (readonly != 0) { + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + } else { + dsl_dir_t *dd; + zap_stats_t zs; + + if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + &zs) == 0) { + dataset_kstats_update_nunlinks_kstat( + &zfsvfs->z_kstat, zs.zs_num_entries); + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, + "num_entries in unlinked set: %llu", + (u_longlong_t)zs.zs_num_entries); + } + + zfs_unlinked_drain(zfsvfs); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dd->dd_activity_cancelled = B_FALSE; + } + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. 
+ * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + boolean_t use_nc = zfsvfs->z_use_namecache; + zfsvfs->z_use_namecache = B_FALSE; + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + zfsvfs->z_use_namecache = use_nc; + } + } + + /* restore readonly bit */ + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + } + + /* + * Set the objset user_ptr to track its zfsvfs. 
+ */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i; + + zfs_fuid_destroy(zfsvfs); + + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + ASSERT3U(zfsvfs->z_nr_znodes, ==, 0); + list_destroy(&zfsvfs->z_all_znodes); + ZFS_TEARDOWN_DESTROY(zfsvfs); + ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); + rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + dataset_kstats_destroy(&zfsvfs->z_kstat); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) +{ + uint64_t recordsize, fsid_guid; + int error = 0; + zfsvfs_t *zfsvfs; + + ASSERT3P(vfsp, !=, NULL); + ASSERT3P(osname, !=, NULL); + + error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + + if ((error = dsl_prop_get_integer(osname, + "recordsize", &recordsize, NULL))) + goto out; + 
zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; + + vfsp->vfs_data = zfsvfs; + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; + vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; + /* + * This can cause a loss of coherence between ARC and page cache + * on ZoF - unclear if the problem is in FreeBSD or ZoF + */ + vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ + vfsp->mnt_kern_flag |= MNTK_NOMSYNC; + vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; + +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; +#endif + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | + (vfsp->mnt_vfc->vfc_typenum & 0xFF); + + /* + * Set features for file system. 
+ */ + zfs_set_fuid_feature(zfsvfs); + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, + "xattr", &pval, NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + if ((error = dsl_prop_get_integer(osname, + "acltype", &pval, NULL))) + goto out; + acl_type_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + + vfs_mountedfrom(vfsp, osname); + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +static void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + if (!dmu_objset_is_snapshot(os)) + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); +} + +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(poolname, osname); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); +} + +static void +fetch_osname_options(char *name, bool *checkpointrewind) 
+{ + + if (name[0] == '!') { + *checkpointrewind = true; + memmove(name, name + 1, strlen(name)); + } else { + *checkpointrewind = false; + } +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp) +{ + kthread_t *td = curthread; + vnode_t *mvp = vfsp->mnt_vnodecovered; + cred_t *cr = td->td_ucred; + char *osname; + int error = 0; + int canwrite; + bool checkpointrewind; + + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (SET_ERROR(EINVAL)); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. + */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + fetch_osname_options(osname, &checkpointrewind); + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) + goto out; + + if (!(vfsp->vfs_flag & MS_REMOUNT)) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, cr)) { + VOP_UNLOCK1(mvp); + goto out; + } + + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + VOP_UNLOCK1(mvp); + goto out; + } + VOP_UNLOCK1(mvp); + } + + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = SET_ERROR(EPERM); + goto out; + } + + vfsp->vfs_flag |= MNT_NFS4ACLS; + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. 
+ */ + if (vfsp->vfs_flag & MS_REMOUNT) { + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * Refresh mount options with z_teardown_lock blocking I/O while + * the filesystem is in an inconsistent state. + * The lock also serializes this code with filesystem + * manipulations between entry to zfs_suspend_fs() and return + * from zfs_resume_fs(). + */ + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); + goto out; + } + + /* Initial root mount: try hard to import the requested root pool. */ + if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && + (vfsp->vfs_flag & MNT_UPDATE) == 0) { + char pname[MAXNAMELEN]; + + error = getpoolname(osname, pname); + if (error == 0) + error = spa_import_rootpool(pname, checkpointrewind); + if (error) + goto out; + } + DROP_GIANT(); + error = zfs_domount(vfsp, osname); + PICKUP_GIANT(); + +out: + return (error); +} + +static int +zfs_statfs(vfs_t *vfsp, struct statfs *statp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + uint64_t refdbytes, availbytes, usedobjs, availobjs; + + statp->f_version = STATFS_VERSION; + + ZFS_ENTER(zfsvfs); + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + statp->f_bsize = SPA_MINBLOCKSIZE; + statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + + statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; + statp->f_bfree = availbytes / statp->f_bsize; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. 
ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + statp->f_ffree = MIN(availobjs, statp->f_bfree); + statp->f_files = statp->f_ffree + usedobjs; + + /* + * We're a zfs filesystem. + */ + strlcpy(statp->f_fstypename, "zfs", + sizeof (statp->f_fstypename)); + + strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, + sizeof (statp->f_mntfromname)); + strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, + sizeof (statp->f_mntonname)); + + statp->f_namemax = MAXNAMELEN - 1; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + + if (error == 0) { + error = vn_lock(*vpp, flags); + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + } + } + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + dsl_dir_t *dd; + + /* + * If someone has not already unmounted this file system, + * drain the zrele_taskq to ensure all active references to the + * zfsvfs_t have been handled only then can it be safely destroyed. + */ + if (zfsvfs->z_os) { + /* + * If we're unmounting we have to wait for the list to + * drain completely. + * + * If we're not unmounting there's no guarantee the list + * will drain completely, but zreles run from the taskq + * may add the parents of dir-based xattrs to the taskq + * so we want to wait for these. 
+ * + * We can safely read z_nr_znodes without locking because the + * VFS has already blocked operations which add to the + * z_all_znodes list and thus increment z_nr_znodes. + */ + int round = 0; + while (zfsvfs->z_nr_znodes > 0) { + taskq_wait_outstanding(dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + if (++round > 1 && !unmounting) + break; + } + } + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ +#ifdef FREEBSD_NAMECACHE +#if __FreeBSD_version >= 1300117 + cache_purgevfs(zfsvfs->z_parent->z_vfs); +#else + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); +#endif +#endif + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); + return (SET_ERROR(EIO)); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. 
+ */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + if (zp->z_sa_hdl != NULL) { + zfs_znode_dmu_fini(zp); + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (!zfs_is_readonly(zfsvfs)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + dmu_objset_evict_dbufs(zfsvfs->z_os); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag) +{ + kthread_t *td = curthread; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; + cred_t *cr = td->td_ucred; + int ret; + + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + if (dsl_deleg_access((char *)vfsp->vfs_resource, + ZFS_DELEG_PERM_MOUNT, cr)) + return (ret); + } + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL) { + if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); + } + + if (fflag & MS_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. 
+ */ + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); + zfsvfs->z_unmounted = B_TRUE; + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); + } + + /* + * Flush all the files. + */ + ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); + if (ret != 0) + return (ret); + while (taskqueue_cancel(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task, NULL) != 0) + taskqueue_drain(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + + VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + /* + * We can now safely destroy the '.zfs' directory node. + */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + zfs_freevfs(vfsp); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + int err; + + /* + * zfs_zget() can't operate on virtual entries like .zfs/ or + * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. + * This will make NFS to switch to LOOKUP instead of using VGET. 
+ */ + if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || + (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) + return (EOPNOTSUPP); + + ZFS_ENTER(zfsvfs); + err = zfs_zget(zfsvfs, ino, &zp); + if (err == 0 && zp->z_unlinked) { + vrele(ZTOV(zp)); + err = EINVAL; + } + if (err == 0) + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + if (err == 0) { + err = vn_lock(*vpp, flags); + if (err != 0) + vrele(*vpp); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +#if __FreeBSD_version >= 1300098 +zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, + struct ucred **credanonp, int *numsecflavors, int *secflavors) +#else +zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) +#endif +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * If this is regular file system vfsp is the same as + * zfsvfs->z_parent->z_vfs, but if it is snapshot, + * zfsvfs->z_parent->z_vfs represents parent file system + * which we have to use here, because only this file system + * has mnt_export configured. + */ + return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, + credanonp, numsecflavors, secflavors)); +} + +CTASSERT(SHORT_FID_LEN <= sizeof (struct fid)); +CTASSERT(LONG_FID_LEN <= sizeof (struct fid)); + +static int +zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) +{ + struct componentname cn; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + vnode_t *dvp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + /* + * On FreeBSD we can get snapshot's mount point or its parent file + * system mount point depending if snapshot is already mounted or not. 
+ */ + if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (SET_ERROR(EINVAL)); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * A zero fid_gen means we are in .zfs or the .zfs/snapshot + * directory tree. If the object == zfsvfs->z_shares_dir, then + * we are in the .zfs/shares directory tree. + */ + if ((fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || + (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { + ZFS_EXIT(zfsvfs); + VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); + if (object == ZFSCTL_INO_SNAPDIR) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | LOCKLEAF; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else if (object == zfsvfs->z_shares_dir) { + /* + * XXX This branch must not be taken, + * if it is, then the lookup below will + * explode. 
+ */ + cn.cn_nameptr = "shares"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else { + *vpp = dvp; + } + return (err); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object, + (u_longlong_t)fid_gen, + (u_longlong_t)gen_mask); + if ((err = zfs_zget(zfsvfs, object, &zp))) { + ZFS_EXIT(zfsvfs); + return (err); + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%llu) != fid gen (%llu)\n", + (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); + vrele(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + err = vn_lock(*vpp, flags); + if (err == 0) + vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; + return (err); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. 
+ */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err; + znode_t *zp; + + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); + ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + ds->ds_dir->dd_activity_cancelled = B_FALSE; + VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); + + zfs_set_fuid_feature(zfsvfs); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + +bail: + /* release the VOPs */ + ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); + + if (err) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. 
+ */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { + vfs_ref(zfsvfs->z_vfs); + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } + } + return (err); +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + zfsvfs_free(zfsvfs); + + atomic_dec_32(&zfs_active_fs_count); +} + +#ifdef __i386__ +static int desiredvnodes_backup; +#include + + +#include +#include +#include +#include +#endif + +static void +zfs_vnodes_adjust(void) +{ +#ifdef __i386__ + int newdesiredvnodes; + + desiredvnodes_backup = desiredvnodes; + + /* + * We calculate newdesiredvnodes the same way it is done in + * vntblinit(). If it is equal to desiredvnodes, it means that + * it wasn't tuned by the administrator and we can tune it down. + */ + newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof (struct vm_object) + + sizeof (struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; +#endif +} + +static void +zfs_vnodes_adjust_back(void) +{ + +#ifdef __i386__ + desiredvnodes = desiredvnodes_backup; +#endif +} + +void +zfs_init(void) +{ + + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); + + /* + * Reduce number of vnodes. Originally number of vnodes is calculated + * with UFS inode in mind. We reduce it here, because it's too big for + * ZFS/i386. + */ + zfs_vnodes_adjust(); + + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); + + zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); +} + +void +zfs_fini(void) +{ + taskq_destroy(zfsvfs_taskq); + zfsctl_fini(); + zfs_znode_fini(); + zfs_vnodes_adjust_back(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +/* + * Release VOPs and unmount a suspended filesystem. 
+ */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); + ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); + + /* + * Try to force unmount this file system. + */ + (void) zfs_umount(zfsvfs->z_vfs, 0); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, 
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY0(sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %ju to %ju", (uintmax_t)zfsvfs->z_version, + (uintmax_t)newvers); + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. 
+ */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_NFSV4; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the corresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. 
+ */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +#ifdef _KERNEL +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ + char tmpbuf[MAXPATHLEN]; + struct mount *mp; + char *fromname; + size_t oldlen; + + oldlen = strlen(oldname); + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + fromname = mp->mnt_stat.f_mntfromname; + if (strcmp(fromname, oldname) == 0) { + (void) strlcpy(fromname, newname, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + if (strncmp(fromname, oldname, oldlen) == 0 && + (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { + (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", + newname, fromname + oldlen); + (void) strlcpy(fromname, tmpbuf, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + } + mtx_unlock(&mountlist_mtx); +} +#endif diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c new file mode 100644 index 0000000000..6f63fb9db5 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -0,0 +1,6221 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if __FreeBSD_version >= 1300102 +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#ifndef VN_OPEN_INVFS +#define VN_OPEN_INVFS 0x0 +#endif + +VFS_SMR_DECLARE; + +#if __FreeBSD_version >= 1300047 +#define vm_page_wire_lock(pp) +#define vm_page_wire_unlock(pp) +#else +#define vm_page_wire_lock(pp) vm_page_lock(pp) +#define vm_page_wire_unlock(pp) vm_page_unlock(pp) +#endif + +#ifdef DEBUG_VFS_LOCKS +#define VNCHECKREF(vp) \ + VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ + ("%s: wrong ref counts", __func__)); +#else +#define VNCHECKREF(vp) +#endif + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. 
To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). 
+ * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + atomic_dec_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +static int +zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, + int *rvalp) +{ + loff_t off; + int error; + + switch (com) { + case _FIOFFS: + { + return (0); + + /* + * The following two ioctls 
are used by bfu. Faking out, + * necessary to avoid bfu errors. + */ + } + case _FIOGDIO: + case _FIOSDIO: + { + return (0); + } + + case F_SEEK_DATA: + case F_SEEK_HOLE: + { + off = *(offset_t *)data; + /* offset parameter is in/out */ + error = zfs_holey(VTOZ(vp), com, &off); + if (error) + return (error); + *(offset_t *)data = off; + return (0); + } + } + return (SET_ERROR(ENOTTY)); +} + +static vm_page_t +page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) +{ + vm_object_t obj; + vm_page_t pp; + int64_t end; + + /* + * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE + * aligned boundaries, if the range is not aligned. As a result a + * DEV_BSIZE subrange with partially dirty data may get marked as clean. + * It may happen that all DEV_BSIZE subranges are marked clean and thus + * the whole page would be considered clean despite have some + * dirty data. + * For this reason we should shrink the range to DEV_BSIZE aligned + * boundaries before calling vm_page_clear_dirty. + */ + end = rounddown2(off + nbytes, DEV_BSIZE); + off = roundup2(off, DEV_BSIZE); + nbytes = end - off; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked_12(obj); +#if __FreeBSD_version < 1300050 + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. 
+ */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + vm_page_sbusy(pp); + } else if (pp != NULL) { + ASSERT(!pp->valid); + pp = NULL; + } + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } + break; + } +#else + vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), + VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | + VM_ALLOC_IGN_SBUSY); + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } +#endif + return (pp); +} + +static void +page_unbusy(vm_page_t pp) +{ + + vm_page_sunbusy(pp); +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(pp->object); +#else + vm_object_pip_subtract(pp->object, 1); +#endif +} + +#if __FreeBSD_version > 1300051 +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t m; + + obj = vp->v_object; + vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), + VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | + VM_ALLOC_NOBUSY); + return (m); +} +#else +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t pp; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. 
+ */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_wire_lock(pp); + vm_page_hold(pp); + vm_page_wire_unlock(pp); + + } else + pp = NULL; + break; + } + return (pp); +} +#endif + +static void +page_unhold(vm_page_t pp) +{ + + vm_page_wire_lock(pp); +#if __FreeBSD_version >= 1300035 + vm_page_unwire(pp, PQ_ACTIVE); +#else + vm_page_unhold(pp); +#endif + vm_page_wire_unlock(pp); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +void +update_pages(znode_t *zp, int64_t start, int len, objset_t *os) +{ + vm_object_t obj; + struct sf_buf *sf; + vnode_t *vp = ZTOV(zp); + caddr_t va; + int off; + + ASSERT3P(vp->v_mount, !=, NULL); + obj = vp->v_object; + ASSERT3P(obj, !=, NULL); + + off = start & PAGEOFFSET; + zfs_vmobject_wlock_12(obj); +#if __FreeBSD_version >= 1300041 + vm_object_pip_add(obj, 1); +#endif + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + int nbytes = imin(PAGESIZE - off, len); + + if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { + zfs_vmobject_wunlock_12(obj); + + va = zfs_map_page(pp, &sf); + (void) dmu_read(os, zp->z_id, start + off, nbytes, + va + off, DMU_READ_PREFETCH); + zfs_unmap_page(sf); + + zfs_vmobject_wlock_12(obj); + page_unbusy(pp); + } + len -= nbytes; + off = 0; + } +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(obj); +#else + vm_object_pip_wakeupn(obj, 0); +#endif + zfs_vmobject_wunlock_12(obj); +} + +/* + * Read with UIO_NOCOPY flag means that sendfile(2) requests + * ZFS to populate a range of page cache pages with data. 
+ * + * NOTE: this function could be optimized to pre-allocate + * all pages in advance, drain exclusive busy on all of them, + * map them into contiguous KVA region and populate them + * in one single dmu_read() call. + */ +int +mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) +{ + vnode_t *vp = ZTOV(zp); + objset_t *os = zp->z_zfsvfs->z_os; + struct sf_buf *sf; + vm_object_t obj; + vm_page_t pp; + int64_t start; + caddr_t va; + int len = nbytes; + int error = 0; + + ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY); + ASSERT3P(vp->v_mount, !=, NULL); + obj = vp->v_object; + ASSERT3P(obj, !=, NULL); + ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET); + + zfs_vmobject_wlock_12(obj); + for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) { + int bytes = MIN(PAGESIZE, len); + + pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), + VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); + if (vm_page_none_valid(pp)) { + zfs_vmobject_wunlock_12(obj); + va = zfs_map_page(pp, &sf); + error = dmu_read(os, zp->z_id, start, bytes, va, + DMU_READ_PREFETCH); + if (bytes != PAGESIZE && error == 0) + bzero(va + bytes, PAGESIZE - bytes); + zfs_unmap_page(sf); + zfs_vmobject_wlock_12(obj); +#if __FreeBSD_version >= 1300081 + if (error == 0) { + vm_page_valid(pp); + vm_page_activate(pp); + vm_page_do_sunbusy(pp); + } else { + zfs_vmobject_wlock(obj); + if (!vm_page_wired(pp) && pp->valid == 0 && + vm_page_busy_tryupgrade(pp)) + vm_page_free(pp); + else + vm_page_sunbusy(pp); + zfs_vmobject_wunlock(obj); + } +#else + vm_page_do_sunbusy(pp); + vm_page_lock(pp); + if (error) { + if (pp->wire_count == 0 && pp->valid == 0 && + !vm_page_busied(pp)) + vm_page_free(pp); + } else { + pp->valid = VM_PAGE_BITS_ALL; + vm_page_activate(pp); + } + vm_page_unlock(pp); +#endif + } else { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(pp); + } + if (error) + break; + zfs_uio_advance(uio, bytes); + len -= bytes; + } + zfs_vmobject_wunlock_12(obj); + return (error); +} + +/* + 
* When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +int +mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) +{ + vnode_t *vp = ZTOV(zp); + vm_object_t obj; + int64_t start; + int len = nbytes; + int off; + int error = 0; + + ASSERT3P(vp->v_mount, !=, NULL); + obj = vp->v_object; + ASSERT3P(obj, !=, NULL); + + start = zfs_uio_offset(uio); + off = start & PAGEOFFSET; + zfs_vmobject_wlock_12(obj); + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + if ((pp = page_hold(vp, start))) { + struct sf_buf *sf; + caddr_t va; + + zfs_vmobject_wunlock_12(obj); + va = zfs_map_page(pp, &sf); + error = vn_io_fault_uiomove(va + off, bytes, + GET_UIO_STRUCT(uio)); + zfs_unmap_page(sf); + zfs_vmobject_wlock_12(obj); + page_unhold(pp); + } else { + zfs_vmobject_wunlock_12(obj); + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + zfs_vmobject_wlock_12(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + zfs_vmobject_wunlock_12(obj); + return (error); +} + +int +zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *presid) +{ + int error = 0; + ssize_t resid; + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos, + UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread); + + if (error) { + return (SET_ERROR(error)); + } else if (presid == NULL) { + if (resid != 0) { + error = SET_ERROR(EIO); + } + } else { + *presid = resid; + } + return (error); +} + +void +zfs_zrele_async(znode_t *zp) +{ + vnode_t *vp = ZTOV(zp); + objset_t *os = ITOZSB(vp)->z_os; + + VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os))); +} + +static int 
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) +{ + int error; + + *vpp = arg; + error = vn_lock(*vpp, lkflags); + if (error != 0) + vrele(*vpp); + return (error); +} + +static int +zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs; + int error; + int ltype; + + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(dvp, __func__); + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + ASSERT3P(dvp, ==, vp); + vref(dvp); + ltype = lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* + * Relock for the "." case could leave us with + * reclaimed vnode. + */ + if (VN_IS_DOOMED(dvp)) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + /* + * Note that in this case, dvp is the child vnode, and we + * are looking up the parent vnode - exactly reverse from + * normal operation. Unlocking dvp requires some rather + * tricky unlock/relock dance to prevent mp from being freed; + * use vn_vget_ino_gen() which takes care of all that. + * + * XXX Note that there is a time window when both vnodes are + * unlocked. It is possible, although highly unlikely, that + * during that window the parent-child relationship between + * the vnodes may change, for example, get reversed. + * In that case we would have a wrong lock order for the vnodes. + * All other filesystems seem to ignore this problem, so we + * do the same here. 
+ * A potential solution could be implemented as follows: + * - using LK_NOWAIT when locking the second vnode and retrying + * if necessary + * - checking that the parent-child relationship still holds + * after locking both vnodes and retrying if it doesn't + */ + error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); + return (error); + } else { + error = vn_lock(vp, lkflags); + if (error != 0) + vrele(vp); + return (error); + } +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * cnp - componentname of the lookup; cn_flags may be updated. + * nameiop - name operation (LOOKUP, CREATE, RENAME or DELETE). + * cr - credentials of caller (td is not referenced). + * flags - LOOKUP_XATTR set if looking for an attribute. + * cached - if set, the ACE_EXECUTE check on dvp is skipped. + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +static int +zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, + struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, + int flags, boolean_t cached) +{ + znode_t *zdp = VTOZ(dvp); + znode_t *zp; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; +#if __FreeBSD_version > 1300124 + seqc_t dvp_seqc; +#endif + int error = 0; + + /* + * Fast path lookup, however we must skip DNLC lookup + * for case folding or normalizing lookups because the + * DNLC code only stores the passed in name. This means + * creating 'a' and removing 'A' on a case insensitive + * file system would work, but DNLC still thinks 'a' + * exists and won't let you create it again on the next + * pass through fast path.
+ */ + if (!(flags & LOOKUP_XATTR)) { + if (dvp->v_type != VDIR) { + return (SET_ERROR(ENOTDIR)); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); + } + } + + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, + const char *, nm); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + +#if __FreeBSD_version > 1300124 + dvp_seqc = vn_seqc_read_notmodify(dvp); +#endif + + *vpp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * If the xattr property is off, refuse the lookup request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) { + ZFS_EXIT(zfsvfs); + return (error); + } + *vpp = ZTOV(zp); + + /* + * Do we have permission to get into attribute directory? + */ + error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr); + if (error) { + vrele(ZTOV(zp)); + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Check accessibility of directory if we're not coming in via + * VOP_CACHEDLOOKUP. + */ + if (!cached) { +#ifdef NOEXECCHECK + if ((cnp->cn_flags & NOEXECCHECK) != 0) { + cnp->cn_flags &= ~NOEXECCHECK; + } else +#endif + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + + /* + * First handle the special cases. + */ + if ((cnp->cn_flags & ISDOTDOT) != 0) { + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. 
+ */ + if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { + struct componentname cn; + vnode_t *zfsctl_vp; + int ltype; + + ZFS_EXIT(zfsvfs); + ltype = VOP_ISLOCKED(dvp); + VOP_UNLOCK1(dvp); + error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, + &zfsctl_vp); + if (error == 0) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = cnp->cn_nameiop; + cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; + cn.cn_lkflags = cnp->cn_lkflags; + error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); + vput(zfsctl_vp); + } + vn_lock(dvp, ltype | LK_RETRY); + return (error); + } + } + if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { + ZFS_EXIT(zfsvfs); + if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); + return (error); + } + + /* + * Retry the dot-dot lookup until the locked vnode still matches + * the parent recorded for dvp; other lookups run the loop once. + */ + for (;;) { + uint64_t parent; + + error = zfs_dirlook(zdp, nm, &zp); + if (error == 0) + *vpp = ZTOV(zp); + + ZFS_EXIT(zfsvfs); + if (error != 0) + break; + + error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); + if (error != 0) { + /* + * If we've got a locking error, then the vnode + * got reclaimed because of a force unmount. + * We never enter doomed vnodes into the name cache. + */ + *vpp = NULL; + return (error); + } + + if ((cnp->cn_flags & ISDOTDOT) == 0) + break; + + ZFS_ENTER(zfsvfs); + if (zdp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + } else { + error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + } + if (error != 0) { + ZFS_EXIT(zfsvfs); + vput(ZTOV(zp)); + break; + } + if (zp->z_id == parent) { + ZFS_EXIT(zfsvfs); + break; + } + vput(ZTOV(zp)); + } + + if (error != 0) + *vpp = NULL; + + /* Translate errors and add SAVENAME when needed.
*/ + if (cnp->cn_flags & ISLASTCN) { + switch (nameiop) { + case CREATE: + case RENAME: + if (error == ENOENT) { + error = EJUSTRETURN; + cnp->cn_flags |= SAVENAME; + break; + } + fallthrough; + case DELETE: + if (error == 0) + cnp->cn_flags |= SAVENAME; + break; + } + } + +#if __FreeBSD_version > 1300124 + if ((cnp->cn_flags & ISDOTDOT) != 0) { + /* + * FIXME: zfs_lookup_lock relocks vnodes and does nothing to + * handle races. In particular different callers may end up + * with different vnodes and will try to add conflicting + * entries to the namecache. + * + * While finding different result may be acceptable in face + * of concurrent modification, adding conflicting entries + * trips over an assert in the namecache. + * + * Ultimately let an entry through once everything settles. + */ + if (!vn_seqc_consistent(dvp, dvp_seqc)) { + cnp->cn_flags &= ~MAKEENTRY; + } + } +#endif + + /* Insert name into cache (as non-existent) if appropriate. */ + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && + error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(dvp, NULL, cnp); + + /* Insert name into cache if appropriate. */ + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && + error == 0 && (cnp->cn_flags & MAKEENTRY)) { + if (!(cnp->cn_flags & ISLASTCN) || + (nameiop != DELETE && nameiop != RENAME)) { + cache_enter(dvp, *vpp, cnp); + } + } + + return (error); +} + +/* + * Attempt to create a new entry in a directory. The name is looked + * up with ZNEW first and a lookup error is returned unchanged; no + * existing file is truncated here. Return the znode of the new file. + * + * IN: dzp - znode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode + * [UNUSED]. + * mode - mode to open file with [UNUSED]. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created entry.
+ * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, + znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + objset_t *os; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + uint64_t projid = ZFS_DEFAULT_PROJID; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype; +#ifdef DEBUG_VFS_LOCKS + vnode_t *dvp = ZTOV(dzp); +#endif + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + *zpp = NULL; + + if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~S_ISVTX; + + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. 
+ */ + + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = SET_ERROR(EINVAL); + goto out; + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + getnewvnode_reserve_(); + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + +out: + VNCHECKREF(dvp); + if (error == 0) { + *zpp = zp; + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * vp - vnode of the entry to remove. + * name - name of entry to remove. + * cr - credentials of caller. + * (the flags of zfs_remove() are not passed down here.) + * + * RETURN: 0 on success, error code on failure.
+ * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ + +/*ARGSUSED*/ +static int +zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t xattr_obj; + uint64_t obj = 0; + dmu_tx_t *tx; + boolean_t unlinked; + uint64_t txtype; + int error; + + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zp = VTOZ(vp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + xattr_obj = 0; + xzp = NULL; + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = SET_ERROR(EPERM); + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + obj = zp->z_id; + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + + if (xzp) { + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); /* NOTE(review): bypasses xzp vrele at out: - leak? */ + } + + /* + * Remove the directory entry.
+ */ + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + zfs_unlinked_add(zp, tx); + vp->v_vflag |= VV_NOSYNC; + } + /* XXX check changes to linux vnops */ + txtype = TX_REMOVE; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + + if (xzp) + vrele(ZTOV(xzp)); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + + ZFS_EXIT(zfsvfs); + return (error); +} + + +static int +zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, + struct componentname *cnp, int nameiop) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + cnp->cn_nameptr = __DECONST(char *, name); + cnp->cn_namelen = strlen(name); + cnp->cn_nameiop = nameiop; + cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + cnp->cn_cred = kcred; +#if __FreeBSD_version < 1400037 + cnp->cn_thread = curthread; +#endif + + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { + struct vop_lookup_args a; + + a.a_gen.a_desc = &vop_lookup_desc; + a.a_dvp = ZTOV(dzp); + a.a_vpp = vpp; + a.a_cnp = cnp; + error = vfs_cache_lookup(&a); + } else { + error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, + curthread, 0, B_FALSE); + } +#ifdef ZFS_DEBUG + if (error) { + printf("got error %d on name %s on op %d\n", error, name, + nameiop); + kdb_backtrace(); + } +#endif + return (error); +} + +int +zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) +{ + vnode_t *vp; + int error; + struct componentname cn; + + if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) + return (error); + + error = zfs_remove_(ZTOV(dzp), vp, name, cr); + vput(vp); + return (error); +} +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. 
+ * vap - attributes of new directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, + cred_t *cr, int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t txtype; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + + ASSERT3U(vap->va_type, ==, VDIR); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + ((vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. 
+ */ + *zpp = NULL; + + if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + getnewvnode_reserve_(); + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* + * Now put new name in parent dir. + */ + (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, + acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if __FreeBSD_version < 1300124 +static void +cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) +{ + + cache_purge(dvp); + cache_purge(vp); +} +#endif + +/* + * Remove a directory subdir entry. 
If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (vp->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + vnevent_rmdir(vp, dvp, name, ct); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + zfs_log_remove(zilog, tx, txtype, dzp, name, + ZFS_NO_OBJECT, B_FALSE); + } + + dmu_tx_commit(tx); + + cache_vop_rmdir(dvp, vp); +out: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +int +zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) +{ + struct componentname cn; + vnode_t *vp; + int error; + + if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) + return (error); + + error = zfs_rmdir_(ZTOV(dzp), vp, name, cr); + 
vput(vp); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure). + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * ncookies - if non-NULL, an array of entry offsets is + * allocated and handed back through cookies. + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected (may be NULL). + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap are always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +static int +zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, + int *ncookies, ulong_t **cookies) +{ + znode_t *zp = VTOZ(vp); + iovec_t *iovp; + edirent_t *eodp; + dirent64_t *odp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; + int local_eof; + int outcount; + int error; + uint8_t prefetch; + boolean_t check_sysattrs; + uint8_t type; + int ncooks; + ulong_t *cooks = NULL; + int flags = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len.
+ */ + if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + error = 0; + os = zfsvfs->z_os; + offset = zfs_uio_offset(uio); + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + iovp = GET_UIO_STRUCT(uio)->uio_iov; + bytes_wanted = iovp->iov_len; + if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + outbuf = NULL; + odp = (struct dirent64 *)iovp->iov_base; + } + eodp = (struct edirent *)odp; + + if (ncookies != NULL) { + /* + * Minimum entry size is dirent size and 1 byte for a file name. + */ + ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) - + sizeof (((struct dirent *)NULL)->d_name) + 1); + cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK); + *cookies = cooks; + *ncookies = ncooks; + } + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space. 
+ */ +#ifdef TODO + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); +#else + check_sysattrs = 0; +#endif + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + off64_t *next = NULL; + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + if (check_sysattrs && !zap.za_normalization_conflict) { +#ifdef TODO + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); +#else + panic("%s:%u: TODO", __func__, __LINE__); +#endif + } + } + + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + 
vrele(ZTOV(ezp)); + goto skip_entry; + } + vrele(ZTOV(ezp)); + } + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = SET_ERROR(EINVAL); + goto update; + } + break; + } + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry */ + next = &(eodp->ed_off); + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + /* NOTE: d_off is the offset for the *next* entry. */ + next = &odp->d_off; + strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + dirent_terminate(odp); + odp = (dirent64_t *)((intptr_t)odp + reclen); + } + outcount += reclen; + + ASSERT3S(outcount, <=, bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + + skip_entry: + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + /* Fill the offset right after advancing the cursor. 
*/ + if (next != NULL) + *next = offset; + if (cooks != NULL) { + *cooks++ = offset; + ncooks--; + KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); + } + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + /* Subtract unused cookies */ + if (ncookies != NULL) + *ncookies -= ncooks; + + if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + zfs_uio_resid(uio) -= outcount; + } else if ((error = + zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) { + /* + * Reset the pointer. + */ + offset = zfs_uio_offset(uio); + } + +update: + zap_cursor_fini(&zc); + if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + zfs_uio_setoffset(uio, offset); + ZFS_EXIT(zfsvfs); + if (error != 0 && cookies != NULL) { + free(*cookies, M_TEMP); + *cookies = NULL; + *ncookies = 0; + } + return (error); +} + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds). + */ +/* ARGSUSED */ +static int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint32_t blksize; + u_longlong_t nblocks; + uint64_t mtime[2], ctime[2], crtime[2], rdev; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vp->v_type == VBLK || vp->v_type == VCHR) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; + vn_fsid(vp, vap); + vap->va_nodeid = zp->z_id; + vap->va_nlink = zp->z_links; + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && + zp->z_links < ZFS_LINK_MAX) + vap->va_nlink++; + vap->va_size = zp->z_size; + if (vp->v_type == VBLK || vp->v_type == VCHR) + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_seq = zp->z_seq; + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + vap->va_filerev = zp->z_seq; + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. 
+ */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG) { + zfs_sa_get_scanstamp(zp, xvap); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + 
xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + xoap->xoa_projinherit = + ((zp->z_pflags & ZFS_PROJINHERIT) != 0); + XVA_SET_RTN(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + xoap->xoa_projid = zp->z_projid; + XVA_SET_RTN(xvap, XAT_PROJID); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, crtime); + + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + vap->va_blksize = blksize; + vap->va_bytes = nblocks << 9; /* nblocks * 512 */ + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. 
+ */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + vnode_t *vp = ZTOV(zp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + uint64_t saved_mode; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + sa_bulk_attr_t bulk[7], xattr_bulk[7]; + int count = 0, xattr_count = 0; + + if (mask == 0) + return (0); + + if (mask & AT_NOSET) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & AT_SIZE && vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. 
+ */ + xoap = xva_getxoptattr(xvap); + + xva_init(&tmpxvattr); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Note: ZFS_READONLY is handled in zfs_zaccess_common. + */ + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. + */ + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + } + if (xoap != NULL && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && + TIMESPEC_OVERFLOW(&vap->va_birthtime)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + } + + attrzp = NULL; + aclp = NULL; + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * First validate permissions + */ + + if (mask & 
AT_SIZE) { + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. 
+ * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, vp, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. + */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, 
XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + if (trim_mask & AT_MODE) { + /* + * Save the mode, as secpolicy_vnode_setattr() + * will overwrite it with ova.va_mode. + */ + saved_mode = vap->va_mode; + } + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) { + vap->va_mask |= saved_mask; + if (trim_mask & AT_MODE) { + /* + * Recover the mode after + * secpolicy_vnode_setattr(). 
+ */ + vap->va_mode = saved_mode; + } + } + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); + if (err == 0) { + err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); + if (err != 0) + vrele(ZTOV(attrzp)); + } + if (err) + goto out2; + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != zp->z_uid && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_uid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != zp->z_gid && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_gid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = SET_ERROR(EPERM); + goto out; + } + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? 
+ */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. + * But quota accounting logic needs to access related slots by + * offset directly. So we need to adjust old objects' layout + * to make the project ID to some unified and fixed offset. 
+ */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT0(err); + if (attrzp) { + vn_seqc_write_begin(ZTOV(attrzp)); + err = zfs_acl_chown_setattr(attrzp); + vn_seqc_write_end(ZTOV(attrzp)); + ASSERT0(err); + } + } + + if (mask & AT_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3P(aclp, !=, NULL); + err = 
zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + } + + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime); + } + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + xoap->xoa_createtime = vap->va_birthtime; + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT3S(vp->v_type, ==, VREG); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + } +out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT0(err2); + } + + if (attrzp) + vput(ZTOV(attrzp)); + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. 
On error release all vnode locks + * and decrement references the way tmpfs_rename() would do. + */ +static int +zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, + struct vnode *tdvp, struct vnode **tvpp, + const struct componentname *scnp, const struct componentname *tcnp) +{ + zfsvfs_t *zfsvfs; + struct vnode *nvp, *svp, *tvp; + znode_t *sdzp, *tdzp, *szp, *tzp; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error; + + VOP_UNLOCK1(tdvp); + if (*tvpp != NULL && *tvpp != tdvp) + VOP_UNLOCK1(*tvpp); + +relock: + error = vn_lock(sdvp, LK_EXCLUSIVE); + if (error) + goto out; + sdzp = VTOZ(sdvp); + + error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + if (error != EBUSY) + goto out; + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto out; + VOP_UNLOCK1(tdvp); + goto relock; + } + tdzp = VTOZ(tdvp); + + /* + * Before using sdzp and tdzp we must ensure that they are live. + * As a porting legacy from illumos we have two things to worry + * about. One is typical for FreeBSD and it is that the vnode is + * not reclaimed (doomed). The other is that the znode is live. + * The current code can invalidate the znode without acquiring the + * corresponding vnode lock if the object represented by the znode + * and vnode is no longer valid after a rollback or receive operation. + * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * that protects the znodes from the invalidation. + */ + zfsvfs = sdzp->z_zfsvfs; + ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); + ZFS_ENTER(zfsvfs); + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + error = SET_ERROR(EIO); + goto out; + } + + /* + * Re-resolve svp to be certain it still exists and fetch the + * correct vnode. 
+ */ + error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); + if (error != 0) { + /* Source entry invalid or not there. */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + if ((scnp->cn_flags & ISDOTDOT) != 0 || + (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) + error = SET_ERROR(EINVAL); + goto out; + } + svp = ZTOV(szp); + + /* + * Re-resolve tvp, if it disappeared we just carry on. + */ + error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); + if (error != 0) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + vrele(svp); + if ((tcnp->cn_flags & ISDOTDOT) != 0) + error = SET_ERROR(EINVAL); + goto out; + } + if (tzp != NULL) + tvp = ZTOV(tzp); + else + tvp = NULL; + + /* + * At present the vnode locks must be acquired before z_teardown_lock, + * although it would be more logical to use the opposite order. + */ + ZFS_EXIT(zfsvfs); + + /* + * Now try acquire locks on svp and tvp. + */ + nvp = svp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + if (tvp != NULL) + vrele(tvp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + VOP_UNLOCK1(nvp); + /* + * Concurrent rename race. + * XXX ? 
+ */ + if (nvp == tdvp) { + vrele(nvp); + error = SET_ERROR(EINVAL); + goto out; + } + vrele(*svpp); + *svpp = nvp; + goto relock; + } + vrele(*svpp); + *svpp = nvp; + + if (*tvpp != NULL) + vrele(*tvpp); + *tvpp = NULL; + if (tvp != NULL) { + nvp = tvp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + VOP_UNLOCK1(*svpp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + vput(nvp); + goto relock; + } + *tvpp = nvp; + } + + return (0); + +out: + return (error); +} + +/* + * Note that we must use VRELE_ASYNC in this function as it walks + * up the directory tree and vrele may need to acquire an exclusive + * lock if a last reference to a vnode is dropped. + */ +static int +zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) +{ + zfsvfs_t *zfsvfs; + znode_t *zp, *zp1; + uint64_t parent; + int error; + + zfsvfs = tdzp->z_zfsvfs; + if (tdzp == szp) + return (SET_ERROR(EINVAL)); + if (tdzp == sdzp) + return (0); + if (tdzp->z_id == zfsvfs->z_root) + return (0); + zp = tdzp; + for (;;) { + ASSERT(!zp->z_unlinked); + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + break; + + if (parent == szp->z_id) { + error = SET_ERROR(EINVAL); + break; + } + if (parent == zfsvfs->z_root) + break; + if (parent == sdzp->z_id) + break; + + error = zfs_zget(zfsvfs, parent, &zp1); + if (error != 0) + break; + + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os))); + zp = zp1; + } + + if (error == ENOTDIR) + panic("checkpath: .. 
not a directory\n"); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + return (error); +} + +#if __FreeBSD_version < 1300124 +static void +cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, + struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) +{ + + cache_purge(fvp); + if (tvp != NULL) + cache_purge(tvp); + cache_purge_negative(tdvp); +} +#endif + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, + vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, + cred_t *cr, int log) +{ + zfsvfs_t *zfsvfs; + znode_t *sdzp, *tdzp, *szp, *tzp; + zilog_t *zilog = NULL; + dmu_tx_t *tx; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error = 0; + bool want_seqc_end __maybe_unused = false; + + /* Reject renames across filesystems. */ + if ((*svpp)->v_mount != tdvp->v_mount || + ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { + error = SET_ERROR(EXDEV); + goto out; + } + + if (zfsctl_is_node(tdvp)) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Lock all four vnodes to ensure safety and semantics of renaming. 
+ */ + error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); + if (error != 0) { + /* no vnodes are locked in the case of error here */ + return (error); + } + + tdzp = VTOZ(tdvp); + sdzp = VTOZ(sdvp); + zfsvfs = tdzp->z_zfsvfs; + zilog = zfsvfs->z_log; + + /* + * After we re-enter ZFS_ENTER() we will have to revalidate all + * znodes involved. + */ + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + error = SET_ERROR(EILSEQ); + goto unlockout; + } + + /* If source and target are the same file, there is nothing to do. */ + if ((*svpp) == (*tvpp)) { + error = 0; + goto unlockout; + } + + if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || + ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && + (*tvpp)->v_mountedhere != NULL)) { + error = SET_ERROR(EXDEV); + goto unlockout; + } + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + goto unlockout; + } + + szp = VTOZ(*svpp); + tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); + if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { + error = SET_ERROR(EIO); + goto unlockout; + } + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + error = SET_ERROR(EINVAL); + goto unlockout; + } + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow renames into our tree when the project + * IDs are the same. 
+ */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + error = SET_ERROR(EXDEV); + goto unlockout; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto unlockout; + + if ((*svpp)->v_type == VDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || + sdzp == szp || + (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = EINVAL; + goto unlockout; + } + + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_check(szp, sdzp, tdzp))) + goto unlockout; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if ((*svpp)->v_type == VDIR) { + if ((*tvpp)->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto unlockout; + } else { + cache_purge(tdvp); + if (sdvp != tdvp) + cache_purge(sdvp); + } + } else { + if ((*tvpp)->v_type == VDIR) { + error = SET_ERROR(EISDIR); + goto unlockout; + } + } + } + + vn_seqc_write_begin(*svpp); + vn_seqc_write_begin(sdvp); + if (*tvpp != NULL) + vn_seqc_write_begin(*tvpp); + if (tdvp != *tvpp) + vn_seqc_write_begin(tdvp); +#if __FreeBSD_version >= 1300102 + want_seqc_end = true; +#endif + vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); + if (tzp) + vnevent_rename_dest(*tvpp, tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. 
+ */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto unlockout; + } + + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); + + if (error == 0) { + error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, + NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + snm, tdzp, tnm, szp); + + /* + * Update path information for the target vnode + */ + vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. 
+ */ + VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx, + ZRENAMING, NULL)); + } + } + if (error == 0) { + cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp); + } + } + + dmu_tx_commit(tx); + +unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ + if (want_seqc_end) { + vn_seqc_write_end(*svpp); + vn_seqc_write_end(sdvp); + if (*tvpp != NULL) + vn_seqc_write_end(*tvpp); + if (tdvp != *tvpp) + vn_seqc_write_end(tdvp); + want_seqc_end = false; + } + VOP_UNLOCK1(*svpp); + VOP_UNLOCK1(sdvp); + + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); + +out: /* original two vnodes are locked */ + MPASS(!want_seqc_end); + + if (*tvpp != NULL) + VOP_UNLOCK1(*tvpp); + if (tdvp != *tvpp) + VOP_UNLOCK1(tdvp); + return (error); +} + +int +zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, + cred_t *cr, int flags) +{ + struct componentname scn, tcn; + vnode_t *sdvp, *tdvp; + vnode_t *svp, *tvp; + int error; + svp = tvp = NULL; + + sdvp = ZTOV(sdzp); + tdvp = ZTOV(tdzp); + error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); + if (sdzp->z_zfsvfs->z_replay == B_FALSE) + VOP_UNLOCK1(sdvp); + if (error != 0) + goto fail; + VOP_UNLOCK1(svp); + + vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); + error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME); + if (error == EJUSTRETURN) + tvp = NULL; + else if (error != 0) { + VOP_UNLOCK1(tdvp); + goto fail; + } + + error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0); +fail: + if (svp != NULL) + vrele(svp); + if (tvp != NULL) + vrele(tvp); + + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dvp - Directory to contain new symbolic link. + * link - Name for new symlink entry. + * vap - Attributes of new entry. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. 
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
+    const char *link, znode_t **zpp, cred_t *cr, int flags)
+{
+	znode_t		*zp;		/* new symlink; handed back via *zpp */
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	uint64_t	len = strlen(link);	/* target length, no NUL */
+	int		error;
+	zfs_acl_ids_t	acl_ids;	/* freed on each exit once created */
+	boolean_t	fuid_dirtied;	/* snapshot of zfsvfs->z_fuid_dirty */
+	uint64_t	txtype = TX_SYMLINK;
+
+	ASSERT3S(vap->va_type, ==, VLNK);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);	/* macro: may return on its own */
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));	/* name failed u8_validate() */
+	}
+
+	if (len > MAXPATHLEN) {	/* limits the target text, not name */
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENAMETOOLONG));
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0,
+	    vap, cr, NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
+	    0 /* projid */)) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	getnewvnode_reserve_();	/* dropped on abort and on success */
+	tx = dmu_tx_create(zfsvfs->z_os);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE + len);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create a new object for the symlink.
+	 * for version 4 ZPL datasets the symlink will be an SA attribute
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	if (zp->z_is_sa)	/* only this branch assigns error */
+		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+		    __DECONST(void *, link), len, tx);
+	else
+		zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
+
+	zp->z_size = len;
+	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx);
+	/*
+	 * Insert the new object into the directory.
+	 */
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+	*zpp = zp;		/* stored even if error != 0 */
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);		/* 0, or SA_ZPL_SYMLINK update error */
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ *	IN:	vp	- vnode of symbolic link.
+ *		uio	- structure to contain the link path.
+ *		cr	- credentials of caller (not referenced below).
+ *		ct	- caller context (not referenced below).
+ *
+ *	OUT:	uio	- structure containing the link path.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (zp->z_is_sa)	/* picks where the target is read from */
+		error = sa_lookup_uio(zp->z_sa_hdl,
+		    SA_ZPL_SYMLINK(zfsvfs), uio);
+	else
+		error = zfs_sa_readlink(zp, uio);
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);	/* runs even if error != 0 */
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ *	IN:	tdvp	- Directory to contain new entry.
+ * svp - vnode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +int +zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, + int flags) +{ + znode_t *tzp; + zfsvfs_t *zfsvfs = tdzp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + int error; + uint64_t parent; + uid_t owner; + + ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(tdzp); + zilog = zfsvfs->z_log; + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (ZTOV(szp)->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + ZFS_VERIFY_ZP(szp); + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow hard link creation in our tree when the + * project IDs are the same. 
+ */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + if (szp->z_pflags & (ZFS_APPENDONLY | + ZFS_IMMUTABLE | ZFS_READONLY)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + + owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. 
+ */ + error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, tdzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(tdzp, name, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + } + + dmu_tx_commit(tx); + + if (error == 0) { + vnevent_link(ZTOV(szp), ct); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: ip - inode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * ip - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. 
+ */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. + */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +static void +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); + vrecycle(vp); + return; + } + + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. 
+ */ + ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); + vrecycle(vp); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + dmu_tx_commit(tx); + } + } + ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); +} + + +CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid)); +CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid)); + +/*ARGSUSED*/ +static int +zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; + fidp->fid_len = size; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + if (size == LONG_FID_LEN) { + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + zfid_long_t *zlfid; + + zlfid = (zfid_long_t *)fidp; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + /* XXX - this should be the generation number for the objset */ + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp; + zfsvfs_t *zfsvfs; + + switch (cmd) { + case _PC_LINK_MAX: + *valp = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *valp = 64; + return (0); + case _PC_MIN_HOLE_SIZE: + *valp = (int)SPA_MINBLOCKSIZE; + return (0); + case _PC_ACL_EXTENDED: +#if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0; + ZFS_EXIT(zfsvfs); +#else + *valp = 0; +#endif + return (0); + + case _PC_ACL_NFS4: + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 
1 : 0; + ZFS_EXIT(zfsvfs); + return (0); + + case _PC_ACL_PATH_MAX: + *valp = ACL_MAX_ENTRIES; + return (0); + + default: + return (EOPNOTSUPP); + } +} + +static int +zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, + int *rahead) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + vm_object_t object; + off_t start, end, obj_size; + uint_t blksz; + int pgsin_b, pgsin_a; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + start = IDX_TO_OFF(ma[0]->pindex); + end = IDX_TO_OFF(ma[count - 1]->pindex + 1); + + /* + * Lock a range covering all required and optional pages. + * Note that we need to handle the case of the block size growing. + */ + for (;;) { + blksz = zp->z_blksz; + lr = zfs_rangelock_tryenter(&zp->z_rangelock, + rounddown(start, blksz), + roundup(end, blksz) - rounddown(start, blksz), RL_READER); + if (lr == NULL) { + if (rahead != NULL) { + *rahead = 0; + rahead = NULL; + } + if (rbehind != NULL) { + *rbehind = 0; + rbehind = NULL; + } + break; + } + if (blksz == zp->z_blksz) + break; + zfs_rangelock_exit(lr); + } + + object = ma[0]->object; + zfs_vmobject_wlock(object); + obj_size = object->un_pager.vnp.vnp_size; + zfs_vmobject_wunlock(object); + if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { + if (lr != NULL) + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (zfs_vm_pagerret_bad); + } + + pgsin_b = 0; + if (rbehind != NULL) { + pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); + pgsin_b = MIN(*rbehind, pgsin_b); + } + + pgsin_a = 0; + if (rahead != NULL) { + pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); + if (end + IDX_TO_OFF(pgsin_a) >= obj_size) + pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); + pgsin_a = MIN(*rahead, pgsin_a); + } + + /* + * NB: we need to pass the exact byte size of the data that we expect + * to read after accounting for the file size. 
This is required because + * ZFS will panic if we request DMU to read beyond the end of the last + * allocated block. + */ + error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, + &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + + if (lr != NULL) + zfs_rangelock_exit(lr); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + + if (error != 0) + return (zfs_vm_pagerret_error); + + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); + if (rbehind != NULL) + *rbehind = pgsin_b; + if (rahead != NULL) + *rahead = pgsin_a; + return (zfs_vm_pagerret_ok); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getpages_args { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int *a_rbehind; + int *a_rahead; +}; +#endif + +static int +zfs_freebsd_getpages(struct vop_getpages_args *ap) +{ + + return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead)); +} + +static int +zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, + int *rtvals) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + dmu_tx_t *tx; + struct sf_buf *sf; + vm_object_t object; + vm_page_t m; + caddr_t va; + size_t tocopy; + size_t lo_len; + vm_ooffset_t lo_off; + vm_ooffset_t off; + uint_t blksz; + int ncount; + int pcount; + int err; + int i; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + object = vp->v_object; + pcount = btoc(len); + ncount = pcount; + + KASSERT(ma[0]->object == object, ("mismatching object")); + KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); + + for (i = 0; i < pcount; i++) + rtvals[i] = zfs_vm_pagerret_error; + + off = IDX_TO_OFF(ma[0]->pindex); + blksz = zp->z_blksz; + lo_off = rounddown(off, blksz); + lo_len = roundup(len + (off - lo_off), blksz); + lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); + + zfs_vmobject_wlock(object); + if (len + off > object->un_pager.vnp.vnp_size) { + if (object->un_pager.vnp.vnp_size > off) 
{ + int pgoff; + + len = object->un_pager.vnp.vnp_size - off; + ncount = btoc(len); + if ((pgoff = (int)len & PAGE_MASK) != 0) { + /* + * If the object is locked and the following + * conditions hold, then the page's dirty + * field cannot be concurrently changed by a + * pmap operation. + */ + m = ma[ncount - 1]; + vm_page_assert_sbusied(m); + KASSERT(!pmap_page_is_write_mapped(m), + ("zfs_putpages: page %p is not read-only", + m)); + vm_page_clear_dirty(m, pgoff, PAGE_SIZE - + pgoff); + } + } else { + len = 0; + ncount = 0; + } + if (ncount < pcount) { + for (i = ncount; i < pcount; i++) { + rtvals[i] = zfs_vm_pagerret_bad; + } + } + } + zfs_vmobject_wunlock(object); + + if (ncount == 0) + goto out; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + goto out; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + goto out; + } + + if (zp->z_blksz < PAGE_SIZE) { + for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { + tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; + va = zfs_map_page(ma[i], &sf); + dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); + zfs_unmap_page(sf); + } + } else { + err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); + } + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(err); + /* + * XXX we should be passing a callback to undirty + * but that would make the locking messier + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, + len, 0, NULL, NULL); + + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, ncount); + } + dmu_tx_commit(tx); + +out: + zfs_rangelock_exit(lr); + if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + return (rtvals[0]); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_putpages_args { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; +}; +#endif + +static int +zfs_freebsd_putpages(struct vop_putpages_args *ap) +{ + + return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, + ap->a_rtvals)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_bmap_args { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; +}; +#endif + +static int +zfs_freebsd_bmap(struct vop_bmap_args *ap) +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = 
ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_open_args { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_open(struct vop_open_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + int error; + + error = zfs_open(&vp, ap->a_mode, ap->a_cred); + if (error == 0) + vnode_create_vobject(vp, zp->z_size, ap->a_td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_close_args { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_close(struct vop_close_args *ap) +{ + + return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_ioctl_args { + struct vnode *a_vp; + ulong_t a_command; + caddr_t a_data; + int a_fflag; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_ioctl(struct vop_ioctl_args *ap) +{ + + return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, + ap->a_fflag, ap->a_cred, NULL)); +} + +static int +ioflags(int ioflags) +{ + int flags = 0; + + if (ioflags & IO_APPEND) + flags |= FAPPEND; + if (ioflags & IO_NDELAY) + flags |= FNONBLOCK; + if (ioflags & IO_SYNC) + flags |= (FSYNC | FDSYNC | FRSYNC); + + return (flags); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_read_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_read(struct vop_read_args *ap) +{ + zfs_uio_t uio; + zfs_uio_init(&uio, ap->a_uio); + return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), + ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_write_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_write(struct vop_write_args *ap) +{ + zfs_uio_t uio; + zfs_uio_init(&uio, 
ap->a_uio); + return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), + ap->a_cred)); +} + +#if __FreeBSD_version >= 1300102 +/* + * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see + * the comment above cache_fplookup for details. + */ +static int +zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v) +{ + vnode_t *vp; + znode_t *zp; + uint64_t pflags; + + vp = v->a_vp; + zp = VTOZ_SMR(vp); + if (__predict_false(zp == NULL)) + return (EAGAIN); + pflags = atomic_load_64(&zp->z_pflags); + if (pflags & ZFS_AV_QUARANTINED) + return (EAGAIN); + if (pflags & ZFS_XATTR) + return (EAGAIN); + if ((pflags & ZFS_NO_EXECS_DENIED) == 0) + return (EAGAIN); + return (0); +} +#endif + +#if __FreeBSD_version >= 1300139 +static int +zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v) +{ + vnode_t *vp; + znode_t *zp; + char *target; + + vp = v->a_vp; + zp = VTOZ_SMR(vp); + if (__predict_false(zp == NULL)) { + return (EAGAIN); + } + + target = atomic_load_consume_ptr(&zp->z_cached_symlink); + if (target == NULL) { + return (EAGAIN); + } + return (cache_symlink_resolve(v->a_fpl, target, strlen(target))); +} +#endif + +#ifndef _SYS_SYSPROTO_H_ +struct vop_access_args { + struct vnode *a_vp; + accmode_t a_accmode; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_access(struct vop_access_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + accmode_t accmode; + int error = 0; + + + if (ap->a_accmode == VEXEC) { + if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0) + return (0); + } + + /* + * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, + */ + accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); + if (accmode != 0) + error = zfs_access(zp, accmode, 0, ap->a_cred); + + /* + * VADMIN has to be handled by vaccess(). 
+	 */
+	if (error == 0) {
+		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+		if (accmode != 0) {
+#if __FreeBSD_version >= 1300105
+			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+			    zp->z_gid, accmode, ap->a_cred);
+#else
+			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+			    zp->z_gid, accmode, ap->a_cred, NULL);
+#endif
+		}
+	}
+
+	/*
+	 * For VEXEC, ensure that at least one execute bit is set for
+	 * non-directories.
+	 */
+	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
+	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+		error = EACCES;
+	}
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * Common lookup helper used by zfs_cache_lookup() and
+ * zfs_freebsd_cachedlookup().  The component name is copied into a local,
+ * NUL-terminated buffer (bounded by cn_namelen and NAME_MAX) and handed to
+ * zfs_lookup(); "cached" is passed through to zfs_lookup() unchanged.
+ */
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+	struct componentname *cnp = ap->a_cnp;
+	char nm[NAME_MAX + 1];
+
+	ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
+	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
+
+	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
+	    cnp->cn_cred, curthread, 0, cached));
+}
+
+/*
+ * vop_cachedlookup entry point: forwards to zfs_freebsd_lookup() with
+ * cached = B_TRUE.  The argument structure is cast to vop_lookup_args.
+ */
+static int
+zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
+{
+
+	return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vop_lookup entry point.  Goes through vfs_cache_lookup() when
+ * zfsvfs->z_use_namecache is set; otherwise calls zfs_freebsd_lookup()
+ * directly with cached = B_FALSE.
+ */
+static int
+zfs_cache_lookup(struct vop_lookup_args *ap)
+{
+	zfsvfs_t *zfsvfs;
+
+	zfsvfs = ap->a_dvp->v_mount->mnt_data;
+	if (zfsvfs->z_use_namecache)
+		return (vfs_cache_lookup(ap));
+	else
+		return (zfs_freebsd_lookup(ap, B_FALSE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_create_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+};
+#endif
+
+/*
+ * vop_create entry point (also installed, via a cast, as vop_mknod in
+ * zfs_vnodeops).  Calls zfs_create() with the permission bits of va_mode;
+ * on success the new vnode is returned in *a_vpp and, when the name cache
+ * is in use and MAKEENTRY is set, entered into the name cache.
+ */
+static int
+zfs_freebsd_create(struct vop_create_args *ap)
+{
+	zfsvfs_t *zfsvfs;
+	struct componentname *cnp = ap->a_cnp;
+	vattr_t *vap = ap->a_vap;
+	znode_t *zp = NULL;
+	int rc, mode;
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	vattr_init_mask(vap);
+	mode = vap->va_mode & ALLPERMS;
+	zfsvfs = ap->a_dvp->v_mount->mnt_data;
+	*ap->a_vpp = NULL;
+
+	rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode,
+	    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */);
+	if (rc == 0)
+		*ap->a_vpp = ZTOV(zp);
+	if (zfsvfs->z_use_namecache &&
+	    rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+
+	return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_remove_args {
+	struct vnode *a_dvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vop_remove entry point: forwards the directory vnode, target vnode, name
+ * and credentials to zfs_remove_().
+ */
+static int
+zfs_freebsd_remove(struct vop_remove_args *ap)
+{
+
+	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+	return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+	    ap->a_cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_mkdir_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+};
+#endif
+
+/*
+ * vop_mkdir entry point: calls zfs_mkdir() and, on success, returns the new
+ * directory's vnode in *a_vpp (which is NULL on failure).
+ */
+static int
+zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
+{
+	vattr_t *vap = ap->a_vap;
+	znode_t *zp = NULL;
+	int rc;
+
+	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+	vattr_init_mask(vap);
+	*ap->a_vpp = NULL;
+
+	rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
+	    ap->a_cnp->cn_cred, 0, NULL);
+
+	if (rc == 0)
+		*ap->a_vpp = ZTOV(zp);
+	return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rmdir_args {
+	struct vnode *a_dvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vop_rmdir entry point: forwards to zfs_rmdir_().
+ */
+static int
+zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
+{
+	struct componentname *cnp = ap->a_cnp;
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readdir_args {
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	struct ucred *a_cred;
+	int *a_eofflag;
+	int *a_ncookies;
+	ulong_t **a_cookies;
+};
+#endif
+
+/*
+ * vop_readdir entry point: wraps the caller's uio in a zfs_uio_t and
+ * forwards everything to zfs_readdir().
+ */
+static int
+zfs_freebsd_readdir(struct vop_readdir_args *ap)
+{
+	zfs_uio_t uio;
+	zfs_uio_init(&uio, ap->a_uio);
+	return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
+	    ap->a_ncookies, ap->a_cookies));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fsync_args {
+	struct vnode *a_vp;
+	int a_waitfor;
+	struct thread *a_td;
+};
+#endif
+
+/*
+ * vop_fsync entry point: runs vop_stdfsync() (its return value is not
+ * examined) and then returns the result of zfs_fsync().
+ */
+static int
+zfs_freebsd_fsync(struct vop_fsync_args *ap)
+{
+
+	vop_stdfsync(ap);
+	return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+};
+#endif
+
+/*
+ * vop_getattr entry point.  Requests the optional ZFS attributes listed
+ * below through an xvattr, calls zfs_getattr(), copies the plain vattr back
+ * to *a_vap and sets va_flags to the chflags-style bits derived from the
+ * optional attributes that were returned and are non-zero.
+ */
+static int
+zfs_freebsd_getattr(struct vop_getattr_args *ap)
+{
+	vattr_t *vap = ap->a_vap;
+	xvattr_t xvap;
+	ulong_t fflags = 0;
+	int error;
+
+	xva_init(&xvap);
+	xvap.xva_vattr = *vap;
+	xvap.xva_vattr.va_mask |= AT_XVATTR;
+
+	/* Convert chflags into ZFS-type flags. */
+	/* XXX: what about SF_SETTABLE?. */
+	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
+	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
+	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
+	XVA_SET_REQ(&xvap, XAT_NODUMP);
+	XVA_SET_REQ(&xvap, XAT_READONLY);
+	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+	XVA_SET_REQ(&xvap, XAT_SYSTEM);
+	XVA_SET_REQ(&xvap, XAT_HIDDEN);
+	XVA_SET_REQ(&xvap, XAT_REPARSE);
+	XVA_SET_REQ(&xvap, XAT_OFFLINE);
+	XVA_SET_REQ(&xvap, XAT_SPARSE);
+
+	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
+	if (error != 0)
+		return (error);
+
+	/* Convert ZFS xattr into chflags. */
+#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
+	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
+		fflags |= (fflag);					\
+} while (0)
+	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
+	    xvap.xva_xoptattrs.xoa_immutable);
+	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
+	    xvap.xva_xoptattrs.xoa_appendonly);
+	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
+	    xvap.xva_xoptattrs.xoa_nounlink);
+	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+	    xvap.xva_xoptattrs.xoa_archive);
+	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
+	    xvap.xva_xoptattrs.xoa_nodump);
+	FLAG_CHECK(UF_READONLY, XAT_READONLY,
+	    xvap.xva_xoptattrs.xoa_readonly);
+	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+	    xvap.xva_xoptattrs.xoa_system);
+	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+	    xvap.xva_xoptattrs.xoa_hidden);
+	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+	    xvap.xva_xoptattrs.xoa_reparse);
+	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+	    xvap.xva_xoptattrs.xoa_offline);
+	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+	    xvap.xva_xoptattrs.xoa_sparse);
+
+#undef FLAG_CHECK
+	*vap = xvap.xva_vattr;
+	vap->va_flags = fflags;
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setattr_args {
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+};
+#endif
+
+/*
+ * vop_setattr entry point.  When va_flags is set, the requested chflags
+ * bits are validated and permission-checked, and every bit that differs
+ * from the znode's current z_pflags is turned into an XAT_* request.  When
+ * va_birthtime is set, XAT_CREATETIME is requested.  The resulting xvattr
+ * is then passed to zfs_setattr().
+ */
+static int
+zfs_freebsd_setattr(struct vop_setattr_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	vattr_t *vap = ap->a_vap;
+	cred_t *cred = ap->a_cred;
+	xvattr_t xvap;
+	ulong_t fflags;
+	uint64_t zflags;
+
+	vattr_init_mask(vap);
+	vap->va_mask &= ~AT_NOSET;
+
+	xva_init(&xvap);
+	xvap.xva_vattr = *vap;
+
+	zflags = VTOZ(vp)->z_pflags;
+
+	if (vap->va_flags != VNOVAL) {
+		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
+		int error;
+
+		if (zfsvfs->z_use_fuids == B_FALSE)
+			return (EOPNOTSUPP);
+
+		fflags = vap->va_flags;
+		/*
+		 * XXX KDM
+		 * We need to figure out whether it makes sense to allow
+		 * UF_REPARSE through, since we don't really have other
+		 * facilities to handle reparse points and zfs_setattr()
+		 * doesn't currently allow setting that attribute anyway.
+		 */
+		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+		    UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+		    UF_OFFLINE|UF_SPARSE)) != 0)
+			return (EOPNOTSUPP);
+		/*
+		 * Unprivileged processes are not permitted to unset system
+		 * flags, or modify flags if any system flags are set.
+		 * Privileged non-jail processes may not modify system flags
+		 * if securelevel > 0 and any existing system flags are set.
+		 * Privileged jail processes behave like privileged non-jail
+		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
+		 * otherwise, they behave like unprivileged processes.
+		 */
+		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
+		    spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
+			if (zflags &
+			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
+				error = securelevel_gt(cred, 0);
+				if (error != 0)
+					return (error);
+			}
+		} else {
+			/*
+			 * Callers may only modify the file flags on
+			 * objects they have VADMIN rights for.
+			 */
+			if ((error = VOP_ACCESS(vp, VADMIN, cred,
+			    curthread)) != 0)
+				return (error);
+			if (zflags &
+			    (ZFS_IMMUTABLE | ZFS_APPENDONLY |
+			    ZFS_NOUNLINK)) {
+				return (EPERM);
+			}
+			if (fflags &
+			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
+				return (EPERM);
+			}
+		}
+
+		/*
+		 * Request a change only for bits whose requested state
+		 * (fflags) differs from the current state (zflags).
+		 */
+#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
+	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
+	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
+		XVA_SET_REQ(&xvap, (xflag));				\
+		(xfield) = ((fflags & (fflag)) != 0);			\
+	}								\
+} while (0)
+		/* Convert chflags into ZFS-type flags. */
+		/* XXX: what about SF_SETTABLE?. */
+		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+		    xvap.xva_xoptattrs.xoa_immutable);
+		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+		    xvap.xva_xoptattrs.xoa_appendonly);
+		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
+		    xvap.xva_xoptattrs.xoa_nounlink);
+		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+		    xvap.xva_xoptattrs.xoa_archive);
+		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+		    xvap.xva_xoptattrs.xoa_nodump);
+		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+		    xvap.xva_xoptattrs.xoa_readonly);
+		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+		    xvap.xva_xoptattrs.xoa_system);
+		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+		    xvap.xva_xoptattrs.xoa_hidden);
+		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+		    xvap.xva_xoptattrs.xoa_reparse);
+		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+		    xvap.xva_xoptattrs.xoa_offline);
+		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+		    xvap.xva_xoptattrs.xoa_sparse);
+#undef FLAG_CHANGE
+	}
+	if (vap->va_birthtime.tv_sec != VNOVAL) {
+		xvap.xva_vattr.va_mask |= AT_XVATTR;
+		XVA_SET_REQ(&xvap, XAT_CREATETIME);
+	}
+	return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rename_args {
+	struct vnode *a_fdvp;
+	struct vnode *a_fvp;
+	struct componentname *a_fcnp;
+	struct vnode *a_tdvp;
+	struct vnode *a_tvp;
+	struct componentname *a_tcnp;
+};
+#endif
+
+/*
+ * vop_rename entry point: calls zfs_rename_() and then drops a reference on
+ * fdvp, fvp, tdvp and (when non-NULL) tvp, whether or not the rename
+ * succeeded.  fvp and tvp are passed by address, so the values released are
+ * whatever zfs_rename_() left in them.
+ */
+static int
+zfs_freebsd_rename(struct vop_rename_args *ap)
+{
+	vnode_t *fdvp = ap->a_fdvp;
+	vnode_t *fvp = ap->a_fvp;
+	vnode_t *tdvp = ap->a_tdvp;
+	vnode_t *tvp = ap->a_tvp;
+	int error;
+
+	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
+	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
+
+	error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+	    ap->a_tcnp, ap->a_fcnp->cn_cred, 1);
+
+	vrele(fdvp);
+	vrele(fvp);
+	vrele(tdvp);
+	if (tvp != NULL)
+		vrele(tvp);
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_symlink_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+	char *a_target;
+};
+#endif
+
+/*
+ * vop_symlink entry point: creates the link with zfs_symlink() and returns
+ * its vnode in *a_vpp.  On FreeBSD >= 1300139 a NUL-terminated copy of the
+ * target is also stored in zp->z_cached_symlink.
+ */
+static int
+zfs_freebsd_symlink(struct vop_symlink_args *ap)
+{
+	struct componentname *cnp = ap->a_cnp;
+	vattr_t *vap = ap->a_vap;
+	znode_t *zp = NULL;
+#if __FreeBSD_version >= 1300139
+	char *symlink;
+	size_t symlink_len;
+#endif
+	int rc;
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
+	vattr_init_mask(vap);
+	*ap->a_vpp = NULL;
+
+	rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
+	    ap->a_target, &zp, cnp->cn_cred, 0 /* flags */);
+	if (rc == 0) {
+		*ap->a_vpp = ZTOV(zp);
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#if __FreeBSD_version >= 1300139
+		MPASS(zp->z_cached_symlink == NULL);
+		symlink_len = strlen(ap->a_target);
+		symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
+		if (symlink != NULL) {
+			memcpy(symlink, ap->a_target, symlink_len);
+			symlink[symlink_len] = '\0';
+			atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
+			    (uintptr_t)symlink);
+		}
+#endif
+	}
+	return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readlink_args {
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	struct ucred *a_cred;
+};
+#endif
+
+/*
+ * vop_readlink entry point: reads the link target with zfs_readlink().
+ * On FreeBSD >= 1300139, when the read succeeded into a single
+ * kernel-space iovec and z_cached_symlink is still NULL, a NUL-terminated
+ * copy of the bytes just read is installed in z_cached_symlink with a
+ * compare-and-set; the copy is freed if the slot was no longer NULL.
+ */
+static int
+zfs_freebsd_readlink(struct vop_readlink_args *ap)
+{
+	zfs_uio_t uio;
+	int error;
+#if __FreeBSD_version >= 1300139
+	znode_t	*zp = VTOZ(ap->a_vp);
+	char *symlink, *base;
+	size_t symlink_len;
+	bool trycache;
+#endif
+
+	zfs_uio_init(&uio, ap->a_uio);
+#if __FreeBSD_version >= 1300139
+	trycache = false;
+	if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
+	    zfs_uio_iovcnt(&uio) == 1) {
+		/* Remember the buffer before zfs_readlink() advances the uio. */
+		base = zfs_uio_iovbase(&uio, 0);
+		symlink_len = zfs_uio_iovlen(&uio, 0);
+		trycache = true;
+	}
+#endif
+	error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
+#if __FreeBSD_version >= 1300139
+	if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
+	    error != 0 || !trycache) {
+		return (error);
+	}
+	/* Bytes actually read = original iovec length - remaining resid. */
+	symlink_len -= zfs_uio_resid(&uio);
+	symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
+	if (symlink != NULL) {
+		memcpy(symlink, base, symlink_len);
+		symlink[symlink_len] = '\0';
+		if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
+		    (uintptr_t)NULL, (uintptr_t)symlink)) {
+			cache_symlink_free(symlink, symlink_len + 1);
+		}
+	}
+#endif
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_link_args {
+	struct vnode *a_tdvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vop_link entry point: returns EXDEV when the target directory and the
+ * source vnode are on different mounts, otherwise forwards to zfs_link().
+ */
+static int
+zfs_freebsd_link(struct vop_link_args *ap)
+{
+	struct componentname *cnp = ap->a_cnp;
+	vnode_t *vp = ap->a_vp;
+	vnode_t *tdvp = ap->a_tdvp;
+
+	if (tdvp->v_mount != vp->v_mount)
+		return (EXDEV);
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	return (zfs_link(VTOZ(tdvp), VTOZ(vp),
+	    cnp->cn_nameptr, cnp->cn_cred, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_inactive_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+/*
+ * vop_inactive entry point: calls zfs_inactive() with the credentials of
+ * curthread (FreeBSD >= 1300123) or of ap->a_td (older); always returns 0.
+ */
+static int
+zfs_freebsd_inactive(struct vop_inactive_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+
+#if __FreeBSD_version >= 1300123
+	zfs_inactive(vp, curthread->td_ucred, NULL);
+#else
+	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
+#endif
+	return (0);
+}
+
+#if __FreeBSD_version >= 1300042
+#ifndef _SYS_SYSPROTO_H_
+struct vop_need_inactive_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+/*
+ * vop_need_inactive entry point.  Returns 1 when vn_need_pageq_flush()
+ * says so or when the teardown-inactive lock cannot be try-entered for
+ * reading; otherwise returns non-zero iff the znode has no SA handle, is
+ * unlinked, or has a dirty atime.
+ */
+static int
+zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int need;
+
+	if (vn_need_pageq_flush(vp))
+		return (1);
+
+	if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
+		return (1);
+	need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
+	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+
+	return (need);
+}
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+/*
+ * vop_reclaim entry point: under the teardown-inactive read lock, frees the
+ * znode when it has no SA handle or otherwise calls zfs_zinactive(), then
+ * clears vp->v_data.  Always returns 0.
+ */
+static int
+zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
+{
+	vnode_t	*vp = ap->a_vp;
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ASSERT3P(zp, !=, NULL);
+
+#if __FreeBSD_version < 1300042
+	/* Destroy the vm object and flush associated pages. */
+	vnode_destroy_vobject(vp);
+#endif
+	/*
+	 * z_teardown_inactive_lock protects from a race with
+	 * zfs_znode_dmu_fini in zfsvfs_teardown during
+	 * force unmount.
+	 */
+	ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
+	if (zp->z_sa_hdl == NULL)
+		zfs_znode_free(zp);
+	else
+		zfs_zinactive(zp);
+	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+
+	vp->v_data = NULL;
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fid_args {
+	struct vnode *a_vp;
+	struct fid *a_fid;
+};
+#endif
+
+/*
+ * vop_fid entry point: forwards to zfs_fid().
+ */
+static int
+zfs_freebsd_fid(struct vop_fid_args *ap)
+{
+
+	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_pathconf_args {
+	struct vnode *a_vp;
+	int a_name;
+	register_t *a_retval;
+} *ap;
+#endif
+
+/*
+ * vop_pathconf entry point.  zfs_pathconf() is asked first; only when it
+ * returns EOPNOTSUPP are the names below answered here, with everything
+ * else delegated to vop_stdpathconf().
+ */
+static int
+zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
+{
+	ulong_t val;
+	int error;
+
+	error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
+	    curthread->td_ucred, NULL);
+	if (error == 0) {
+		*ap->a_retval = val;
+		return (error);
+	}
+	if (error != EOPNOTSUPP)
+		return (error);
+
+	switch (ap->a_name) {
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		return (0);
+#if __FreeBSD_version >= 1400032
+	case _PC_DEALLOC_PRESENT:
+		*ap->a_retval = 1;
+		return (0);
+#endif
+	case _PC_PIPE_BUF:
+		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
+			*ap->a_retval = PIPE_BUF;
+			return (0);
+		}
+		return (EINVAL);
+	default:
+		return (vop_stdpathconf(ap));
+	}
+}
+
+/*
+ * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
+ * extended attribute name:
+ *
+ *	NAMESPACE	PREFIX
+ *	system		freebsd:system:
+ *	user		(none, can be used to access ZFS fsattr(5) attributes
+ *			created on Solaris)
+ *
+ * Builds the on-disk attribute name for (attrnamespace, name) into
+ * attrname[size].  Returns EINVAL for names containing '/' or starting
+ * with "freebsd:" and for unsupported namespaces, ENAMETOOLONG when the
+ * result does not fit, and 0 on success.
+ */
+static int
+zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
+    size_t size)
+{
+	const char *namespace, *prefix, *suffix;
+
+	/* We don't allow '/' character in attribute name. */
+	if (strchr(name, '/') != NULL)
+		return (SET_ERROR(EINVAL));
+	/* We don't allow attribute names that start with "freebsd:" string. */
+	if (strncmp(name, "freebsd:", 8) == 0)
+		return (SET_ERROR(EINVAL));
+
+	bzero(attrname, size);
+
+	switch (attrnamespace) {
+	case EXTATTR_NAMESPACE_USER:
+#if 0
+		prefix = "freebsd:";
+		namespace = EXTATTR_NAMESPACE_USER_STRING;
+		suffix = ":";
+#else
+		/*
+		 * This is the default namespace by which we can access all
+		 * attributes created on Solaris.
+		 */
+		prefix = namespace = suffix = "";
+#endif
+		break;
+	case EXTATTR_NAMESPACE_SYSTEM:
+		prefix = "freebsd:";
+		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
+		suffix = ":";
+		break;
+	case EXTATTR_NAMESPACE_EMPTY:
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
+	    name) >= size) {
+		return (SET_ERROR(ENAMETOOLONG));
+	}
+	return (0);
+}
+
+/*
+ * Make sure zp->z_xattr_cached is populated.  The caller must hold
+ * z_xattr_lock.  A caller holding it as writer keeps it as writer.  A
+ * caller holding it as reader has it upgraded (or dropped and re-acquired
+ * as writer when the upgrade fails) for the load and downgraded again
+ * before return; because the lock may be dropped, the cache is re-checked
+ * after the writer lock is obtained.
+ */
+static int
+zfs_ensure_xattr_cached(znode_t *zp)
+{
+	int error = 0;
+
+	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+	if (zp->z_xattr_cached != NULL)
+		return (0);
+
+	if (rw_write_held(&zp->z_xattr_lock))
+		return (zfs_sa_get_xattr(zp));
+
+	if (!rw_tryupgrade(&zp->z_xattr_lock)) {
+		rw_exit(&zp->z_xattr_lock);
+		rw_enter(&zp->z_xattr_lock, RW_WRITER);
+	}
+	if (zp->z_xattr_cached == NULL)
+		error = zfs_sa_get_xattr(zp);
+	rw_downgrade(&zp->z_xattr_lock);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	IN const char *a_name;
+	INOUT struct uio *a_uio;
+	OUT size_t *a_size;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Read an attribute stored as a file in the xattr directory.  The file
+ * named attrname is opened relative to the xattr directory vnode; when
+ * a_size is set only its size is reported, otherwise (with a_uio set) its
+ * contents are read into a_uio.
+ */
+static int
+zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
+{
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	struct vattr va;
+	vnode_t *xvp = NULL, *vp;
+	int error, flags;
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR, B_FALSE);
+	if (error != 0)
+		return (error);
+
+	flags = FREAD;
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+	    xvp, td);
+	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
+	vp = nd.ni_vp;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	if (ap->a_size != NULL) {
+		error = VOP_GETATTR(vp, &va, ap->a_cred);
+		if (error == 0)
+			*ap->a_size = (size_t)va.va_size;
+	} else if (ap->a_uio != NULL)
+		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+	VOP_UNLOCK1(vp);
+	vn_close(vp, flags, ap->a_cred, td);
+	return (error);
+}
+
+/*
+ * Read an attribute from the cached SA xattr nvlist.  When a_size is set
+ * only the value's size is reported, otherwise (with a_uio set) the value
+ * is copied to a_uio.  The lookup error (e.g. when the name is absent) is
+ * returned as-is through SET_ERROR().
+ */
+static int
+zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
+{
+	znode_t *zp = VTOZ(ap->a_vp);
+	uchar_t *nv_value;
+	uint_t nv_size;
+	int error;
+
+	error = zfs_ensure_xattr_cached(zp);
+	if (error != 0)
+		return (error);
+
+	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+	ASSERT3P(zp->z_xattr_cached, !=, NULL);
+
+	error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
+	    &nv_value, &nv_size);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	if (ap->a_size != NULL)
+		*ap->a_size = nv_size;
+	else if (ap->a_uio != NULL)
+		error = uiomove(nv_value, nv_size, ap->a_uio);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	return (0);
+}
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ *
+ * The SA copy is tried first (when SA xattrs apply to this znode); the
+ * xattr directory is consulted only when that yields ENOENT.  A final
+ * ENOENT is reported to the caller as ENOATTR.
+ */
+static int
+zfs_getextattr(struct vop_getextattr_args *ap)
+{
+	znode_t *zp = VTOZ(ap->a_vp);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	char attrname[EXTATTR_MAXNAMELEN+1];
+	int error;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+	 */
+	if (!(zfsvfs->z_flags & ZSB_XATTR))
+		return (SET_ERROR(EOPNOTSUPP));
+
+	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+	    ap->a_cred, ap->a_td, VREAD);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+	    sizeof (attrname));
+	if (error != 0)
+		return (error);
+
+	/* Start at ENOENT so the dir path runs when the SA path is skipped. */
+	error = ENOENT;
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	rw_enter(&zp->z_xattr_lock, RW_READER);
+	if (zfsvfs->z_use_sa && zp->z_is_sa)
+		error = zfs_getextattr_sa(ap, attrname);
+	if (error == ENOENT)
+		error = zfs_getextattr_dir(ap, attrname);
+	rw_exit(&zp->z_xattr_lock);
+	ZFS_EXIT(zfsvfs);
+	if (error == ENOENT)
+		error = SET_ERROR(ENOATTR);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_deleteextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	IN const char *a_name;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Remove the file named attrname from the xattr directory.  Only a_vp,
+ * a_cred and a_td of *ap are used.
+ */
+static int
+zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
+{
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	vnode_t *xvp = NULL, *vp;
+	int error;
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR, B_FALSE);
+	if (error != 0)
+		return (error);
+
+	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
+	    UIO_SYSSPACE, attrname, xvp, td);
+	error = namei(&nd);
+	vp = nd.ni_vp;
+	if (error != 0) {
+		NDFREE(&nd, NDF_ONLY_PNBUF);
+		return (SET_ERROR(error));
+	}
+
+	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vput(nd.ni_dvp);
+	if (vp == nd.ni_dvp)
+		vrele(vp);
+	else
+		vput(vp);
+
+	return (error);
+}
+
+/*
+ * Remove attrname from the cached SA xattr nvlist and write the list back
+ * with zfs_sa_set_xattr().  z_xattr_lock must be held as writer.  On any
+ * failure the cached nvlist is discarded and freed.  Only a_vp of *ap is
+ * used.
+ */
+static int
+zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
+{
+	znode_t *zp = VTOZ(ap->a_vp);
+	nvlist_t *nvl;
+	int error;
+
+	error = zfs_ensure_xattr_cached(zp);
+	if (error != 0)
+		return (error);
+
+	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
+	ASSERT3P(zp->z_xattr_cached, !=, NULL);
+
+	nvl = zp->z_xattr_cached;
+	error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
+	if (error != 0)
+		error = SET_ERROR(error);
+	else
+		error = zfs_sa_set_xattr(zp);
+	if (error != 0) {
+		zp->z_xattr_cached = NULL;
+		nvlist_free(nvl);
+	}
+	return (error);
+}
+
+/*
+ * Vnode operation to remove a named attribute.
+ *
+ * Each store is probed with the matching getextattr helper (size query
+ * only) before the delete helper is called.  The xattr directory is tried
+ * only when the SA path yields ENOENT; a final ENOENT becomes ENOATTR.
+ */
+static int
+zfs_deleteextattr(struct vop_deleteextattr_args *ap)
+{
+	znode_t *zp = VTOZ(ap->a_vp);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	char attrname[EXTATTR_MAXNAMELEN+1];
+	int error;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+	 */
+	if (!(zfsvfs->z_flags & ZSB_XATTR))
+		return (SET_ERROR(EOPNOTSUPP));
+
+	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+	    ap->a_cred, ap->a_td, VWRITE);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+	    sizeof (attrname));
+	if (error != 0)
+		return (error);
+
+	size_t size = 0;
+	struct vop_getextattr_args vga = {
+		.a_vp = ap->a_vp,
+		.a_size = &size,
+		.a_cred = ap->a_cred,
+		.a_td = ap->a_td,
+	};
+	error = ENOENT;
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	rw_enter(&zp->z_xattr_lock, RW_WRITER);
+	if (zfsvfs->z_use_sa && zp->z_is_sa) {
+		error = zfs_getextattr_sa(&vga, attrname);
+		if (error == 0)
+			error = zfs_deleteextattr_sa(ap, attrname);
+	}
+	if (error == ENOENT) {
+		error = zfs_getextattr_dir(&vga, attrname);
+		if (error == 0)
+			error = zfs_deleteextattr_dir(ap, attrname);
+	}
+	rw_exit(&zp->z_xattr_lock);
+	ZFS_EXIT(zfsvfs);
+	if (error == ENOENT)
+		error = SET_ERROR(ENOATTR);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	IN const char *a_name;
+	INOUT struct uio *a_uio;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Store an attribute as a file in the xattr directory (creating the
+ * directory if needed).  The file is opened/created with mode 0600,
+ * truncated to zero length and then written from a_uio.  Errors from the
+ * truncate and from the write are both returned to the caller.
+ */
+static int
+zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
+{
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	struct vattr va;
+	vnode_t *xvp = NULL, *vp;
+	int error, flags;
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
+	if (error != 0)
+		return (error);
+
+	flags = FFLAGS(O_WRONLY | O_CREAT);
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td);
+	error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
+	    NULL);
+	vp = nd.ni_vp;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	VATTR_NULL(&va);
+	va.va_size = 0;
+	error = VOP_SETATTR(vp, &va, ap->a_cred);
+	if (error == 0) {
+		/*
+		 * Keep the write's status: a failed write must not be
+		 * reported as a successfully stored attribute.
+		 */
+		error = VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+	}
+
+	VOP_UNLOCK1(vp);
+	vn_close(vp, flags, ap->a_cred, td);
+	return (error);
+}
+
+/*
+ * Store an attribute in the cached SA xattr nvlist and write the list back
+ * with zfs_sa_set_xattr().  z_xattr_lock must be held as writer.  Returns
+ * EFBIG when the value exceeds DXATTR_MAX_ENTRY_SIZE or the current list
+ * already exceeds DXATTR_MAX_SA_SIZE.  On failure after the size checks
+ * the cached nvlist is discarded and freed.
+ *
+ * NOTE(review): a failure after uiomove() has presumably already consumed
+ * ap->a_uio, while zfs_setextattr() falls back to the directory path with
+ * that same uio -- confirm the fallback still has data to write.
+ */
+static int
+zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
+{
+	znode_t *zp = VTOZ(ap->a_vp);
+	nvlist_t *nvl;
+	size_t sa_size;
+	int error;
+
+	error = zfs_ensure_xattr_cached(zp);
+	if (error != 0)
+		return (error);
+
+	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
+	ASSERT3P(zp->z_xattr_cached, !=, NULL);
+
+	nvl = zp->z_xattr_cached;
+	size_t entry_size = ap->a_uio->uio_resid;
+	if (entry_size > DXATTR_MAX_ENTRY_SIZE)
+		return (SET_ERROR(EFBIG));
+	error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
+	if (error != 0)
+		return (SET_ERROR(error));
+	if (sa_size > DXATTR_MAX_SA_SIZE)
+		return (SET_ERROR(EFBIG));
+	uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
+	error = uiomove(buf, entry_size, ap->a_uio);
+	if (error != 0) {
+		error = SET_ERROR(error);
+	} else {
+		error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
+		if (error != 0)
+			error = SET_ERROR(error);
+	}
+	kmem_free(buf, entry_size);
+	if (error == 0)
+		error = zfs_sa_set_xattr(zp);
+	if (error != 0) {
+		zp->z_xattr_cached = NULL;
+		nvlist_free(nvl);
+	}
+	return (error);
+}
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+static int
+zfs_setextattr(struct vop_setextattr_args *ap)
+{
+	znode_t *zp = VTOZ(ap->a_vp);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	char attrname[EXTATTR_MAXNAMELEN+1];
+	int error;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+	 */
+	if (!(zfsvfs->z_flags & ZSB_XATTR))
+		return (SET_ERROR(EOPNOTSUPP));
+
+	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+	    ap->a_cred, ap->a_td, VWRITE);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+	    sizeof (attrname));
+	if (error != 0)
+		return (error);
+	/*
+	 * Minimal argument block used below to remove a stale copy of the
+	 * attribute from the other store; the delete helpers receive the
+	 * attribute name as a separate parameter.
+	 */
+	struct vop_deleteextattr_args vda = {
+		.a_vp = ap->a_vp,
+		.a_cred = ap->a_cred,
+		.a_td = ap->a_td,
+	};
+	error = ENOENT;	/* non-zero: dir path runs unless the SA path succeeds */
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	rw_enter(&zp->z_xattr_lock, RW_WRITER);
+	if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) {
+		error = zfs_setextattr_sa(ap, attrname);
+		if (error == 0)
+			/*
+			 * Successfully put into SA, we need to clear the one
+			 * in dir if present.  The return value of the delete
+			 * is not examined.
+			 */
+			zfs_deleteextattr_dir(&vda, attrname);
+	}
+	if (error) {
+		error = zfs_setextattr_dir(ap, attrname);
+		if (error == 0 && zp->z_is_sa)
+			/*
+			 * Successfully put into dir, we need to clear the one
+			 * in SA if present.  The return value of the delete
+			 * is not examined.
+			 */
+			zfs_deleteextattr_sa(&vda, attrname);
+	}
+	rw_exit(&zp->z_xattr_lock);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_listextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	INOUT struct uio *a_uio;
+	OUT size_t *a_size;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+/*
+ * List the attributes kept as files in the xattr directory whose names
+ * begin with attrprefix.  With a_size set, only the encoded length of each
+ * match (one length byte plus the name without the prefix) is added to
+ * *a_size; otherwise, with a_uio set, each match is copied to a_uio in that
+ * encoding.  When attrprefix is empty, names starting with "freebsd:" are
+ * skipped.  A missing xattr directory (ENOATTR from zfs_lookup()) is
+ * treated as an empty list.
+ */
+static int
+zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
+{
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	uint8_t dirbuf[sizeof (struct dirent)];
+	struct iovec aiov;
+	struct uio auio;
+	vnode_t *xvp = NULL, *vp;
+	int error, eof;
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR, B_FALSE);
+	if (error != 0) {
+		/*
+		 * ENOATTR means that the EA directory does not yet exist,
+		 * i.e. there are no extended attributes there.
+		 */
+		if (error == ENOATTR)
+			error = 0;
+		return (error);
+	}
+
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
+	    UIO_SYSSPACE, ".", xvp, td);
+	error = namei(&nd);
+	vp = nd.ni_vp;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_td = td;
+	auio.uio_rw = UIO_READ;
+	auio.uio_offset = 0;
+
+	size_t plen = strlen(attrprefix);
+
+	do {
+		aiov.iov_base = (void *)dirbuf;	/* reset the buffer each pass */
+		aiov.iov_len = sizeof (dirbuf);
+		auio.uio_resid = sizeof (dirbuf);
+		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
+		if (error != 0)
+			break;
+		int done = sizeof (dirbuf) - auio.uio_resid;
+		for (int pos = 0; pos < done; ) {
+			struct dirent *dp = (struct dirent *)(dirbuf + pos);
+			pos += dp->d_reclen;
+			/*
+			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
+			 * is what we get when attribute was created on Solaris.
+			 */
+			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
+				continue;
+			else if (plen == 0 &&
+			    strncmp(dp->d_name, "freebsd:", 8) == 0)
+				continue;
+			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
+				continue;
+			uint8_t nlen = dp->d_namlen - plen; /* prefix removed */
+			if (ap->a_size != NULL) {
+				*ap->a_size += 1 + nlen;
+			} else if (ap->a_uio != NULL) {
+				/*
+				 * Format of extattr name entry is one byte for
+				 * length and the rest for name.
+				 */
+				error = uiomove(&nlen, 1, ap->a_uio);
+				if (error == 0) {
+					char *namep = dp->d_name + plen;
+					error = uiomove(namep, nlen, ap->a_uio);
+				}
+				if (error != 0) {
+					error = SET_ERROR(error);
+					break;
+				}
+			}
+		}
+	} while (!eof && error == 0);
+
+	vput(vp);
+	return (error);
+}
+/*
+ * Same listing as zfs_listextattr_dir(), but over the names held in the
+ * cached SA xattr nvlist (zp->z_xattr_cached), which is loaded on demand by
+ * zfs_ensure_xattr_cached().  The caller must hold z_xattr_lock.
+ */
+static int
+zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
+{
+	znode_t *zp = VTOZ(ap->a_vp);
+	int error;
+
+	error = zfs_ensure_xattr_cached(zp);
+	if (error != 0)
+		return (error);
+
+	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+	ASSERT3P(zp->z_xattr_cached, !=, NULL);
+
+	size_t plen = strlen(attrprefix);
+	nvpair_t *nvp = NULL;
+	while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
+		ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
+
+		const char *name = nvpair_name(nvp);
+		if (plen == 0 && strncmp(name, "freebsd:", 8) == 0)
+			continue;
+		else if (strncmp(name, attrprefix, plen) != 0)
+			continue;
+		uint8_t nlen = strlen(name) - plen;	/* prefix removed */
+		if (ap->a_size != NULL) {
+			*ap->a_size += 1 + nlen;
+		} else if (ap->a_uio != NULL) {
+			/*
+			 * Format of extattr name entry is one byte for
+			 * length and the rest for name.
+			 */
+			error = uiomove(&nlen, 1, ap->a_uio);
+			if (error == 0) {
+				char *namep = __DECONST(char *, name) + plen;
+				error = uiomove(namep, nlen, ap->a_uio);
+			}
+			if (error != 0) {
+				error = SET_ERROR(error);
+				break;
+			}
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Vnode operation to retrieve extended attributes on a vnode.
+ */ +static int +zfs_listextattr(struct vop_listextattr_args *ap) +{ + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + char attrprefix[16]; + int error; + + if (ap->a_size != NULL) + *ap->a_size = 0; + + /* + * If the xattr property is off, refuse the request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) + return (SET_ERROR(EOPNOTSUPP)); + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (SET_ERROR(error)); + + error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, + sizeof (attrprefix)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zfsvfs->z_use_sa && zp->z_is_sa) + error = zfs_listextattr_sa(ap, attrprefix); + if (error == 0) + error = zfs_listextattr_dir(ap, attrprefix); + rw_exit(&zp->z_xattr_lock); + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_getacl(struct vop_getacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; + if ((error = zfs_getsecattr(VTOZ(ap->a_vp), + &vsecattr, 0, ap->a_cred))) + return (error); + + error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt); + if (vsecattr.vsa_aclentp != NULL) + kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_setacl(struct vop_setacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + int aclbsize; /* size of acl list in bytes */ + aclent_t *aaclp; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + if 
(ap->a_aclp == NULL) + return (EINVAL); + + if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) + return (EINVAL); + + /* + * With NFSv4 ACLs, chmod(2) may need to add additional entries, + * splitting every entry into two and appending "canonical six" + * entries at the end. Don't allow for setting an ACL that would + * cause chmod(2) to run out of ACL entries. + */ + if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) + return (ENOSPC); + + error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); + if (error != 0) + return (error); + + vsecattr.vsa_mask = VSA_ACE; + aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); + vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclentsz = aclbsize; + + aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); + error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); + kmem_free(aaclp, aclbsize); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_aclcheck_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) +{ + + return (EOPNOTSUPP); +} + +static int +zfs_vptocnp(struct vop_vptocnp_args *ap) +{ + vnode_t *covered_vp; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *zp = VTOZ(vp); + int ltype; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If we are a snapshot mounted under .zfs, run the operation + * on the covered vnode. 
#if __FreeBSD_version >= 1400032
/*
 * VOP_DEALLOCATE handler: free the byte range [*a_offset, *a_offset + *a_len)
 * of the file, clamped to the current file size.
 *
 * On success *a_offset is advanced past the freed range and *a_len is set
 * to 0.  A request that lies entirely beyond end of file succeeds without
 * doing any work (only *a_len is zeroed).  Returns EROFS on a read-only
 * filesystem, otherwise the result of zfs_freesp().
 */
static int
zfs_deallocate(struct vop_deallocate_args *ap)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog;
	off_t start, nbytes, eof;
	int error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		error = SET_ERROR(EROFS);
		goto out;
	}

	zilog = zfsvfs->z_log;
	start = *ap->a_offset;
	nbytes = *ap->a_len;
	eof = zp->z_size;

	/* Never free past the current end of file. */
	if (start + nbytes > eof)
		nbytes = eof - start;

	/* Fast path for out-of-range request. */
	if (nbytes <= 0) {
		*ap->a_len = 0;
		goto out;
	}

	error = zfs_freesp(zp, start, nbytes, O_RDWR, TRUE);
	if (error != 0)
		goto out;

	/* Honour sync=always and explicit synchronous I/O requests. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
	    (ap->a_ioflag & IO_SYNC) != 0)
		zil_commit(zilog, zp->z_id);

	*ap->a_offset = start + nbytes;
	*ap->a_len = 0;

out:
	ZFS_EXIT(zfsvfs);
	return (error);
}
#endif
/*
 * special share hidden files vnode operations template
 *
 * Only access, inactive, reclaim, fid and pathconf are routed to ZFS
 * handlers here; every other operation is left to the vector named by
 * .vop_default (default_vnodeops).
 *
 * The two fplookup entries are set to VOP_EAGAIN rather than to the ZFS
 * fast-path handlers used by the other vectors in this file.
 * NOTE(review): presumably this forces lockless lookup to fall back to the
 * regular locked path for these vnodes -- confirm against the VFS
 * documentation for the targeted FreeBSD versions.
 */
struct vop_vector zfs_shareops = {
	.vop_default =		&default_vnodeops,
#if __FreeBSD_version >= 1300121
	.vop_fplookup_vexec =	VOP_EAGAIN,
#endif
#if __FreeBSD_version >= 1300139
	.vop_fplookup_symlink =	VOP_EAGAIN,
#endif
	.vop_access =		zfs_freebsd_access,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_fid =		zfs_freebsd_fid,
	.vop_pathconf =		zfs_freebsd_pathconf,
};
VFS_VOP_VECTOR_REGISTER(zfs_shareops);
the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2011 Martin Matuska */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* Used by fstat(1). */ +SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, + SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); + +/* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. 
+ */ +#ifdef ZFS_DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +#if !defined(KMEM_DEBUG) && __FreeBSD_version >= 1300102 +#define _ZFS_USE_SMR +static uma_zone_t znode_uma_zone; +#else +static kmem_cache_t *znode_cache = NULL; +#endif + +extern struct vop_vector zfs_vnodeops; +extern struct vop_vector zfs_fifoops; +extern struct vop_vector zfs_shareops; + + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ +static void +zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. + */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. 
+ */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + POINTER_INVALIDATE(&zp->z_zfsvfs); + + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); + + zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); + + zp->z_acl_cached = NULL; + zp->z_xattr_cached = NULL; + zp->z_xattr_parent = 0; + zp->z_vnode = NULL; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT3P(zp->z_vnode, ==, NULL); + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_lock); + mutex_destroy(&zp->z_acl_lock); + rw_destroy(&zp->z_xattr_lock); + zfs_rangelock_fini(&zp->z_rangelock); + + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); +} + + +#ifdef _ZFS_USE_SMR +VFS_SMR_DECLARE; + +static int +zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, + int flags) +{ + return (zfs_znode_cache_constructor(mem, private, flags)); +} + +static void +zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private) +{ + zfs_znode_cache_destructor(mem, private); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache + */ + ASSERT3P(znode_uma_zone, ==, NULL); + znode_uma_zone = uma_zcreate("zfs_znode_cache", + sizeof (znode_t), zfs_znode_cache_constructor_smr, + zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); + VFS_SMR_ZONE_SET(znode_uma_zone); +} + +static znode_t * +zfs_znode_alloc_kmem(int flags) +{ + return (uma_zalloc_smr(znode_uma_zone, flags)); +} + +static 
/*
 * Create the on-disk shares directory object and register it in the master
 * node under ZFS_SHARES_DIR.
 *
 * A throw-away znode (no vnode attached) is used as the "root" passed to
 * zfs_mknode(); it is released again before returning.  The new object id
 * is stored in zfsvfs->z_shares_dir.
 *
 * Returns the result of zap_add().  Note that z_shares_dir is assigned even
 * when zap_add() fails.
 */
static int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	znode_t *zp;
	int error;

	/* Directory with mode 0555, owned by the kernel credential. */
	vattr.va_mask = AT_MODE|AT_UID|AT_GID;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	/* Temporary in-core znode; only the fields needed below are set. */
	sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;
	sharezp->z_is_sa = zfsvfs->z_use_sa;

	VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	/*
	 * With IS_ROOT_NODE, zfs_mknode() fills in the znode it was handed
	 * (object id and SA handle) and returns that same znode via zp.
	 */
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
	/* Restore the "not attached to a zfsvfs" marker before freeing. */
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	/* Release everything that was only needed to create the object. */
	zfs_acl_ids_free(&acl_ids);
	sa_handle_destroy(sharezp->z_sa_hdl);
	zfs_znode_free_kmem(sharezp);

	return (error);
}
/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 *	zfsvfs   - filesystem the znode belongs to
 *	db       - bonus buffer of the object; supplies the object id
 *	blksz    - value stored in z_blksz
 *	obj_type - bonus type, passed to zfs_znode_sa_init()
 *	hdl      - existing SA handle to adopt, or NULL to have one created
 *
 * Returns the new znode with its vnode locked exclusively, or NULL when
 * no vnode could be obtained or the object's attributes could not be read
 * (including a generation number of 0).  On the NULL path the SA handle
 * is destroyed only if it was created here (hdl == NULL).
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	vnode_t *vp;
	uint64_t mode;
	uint64_t parent;
#ifdef notyet
	uint64_t mtime[2], ctime[2];
#endif
	uint64_t projid = ZFS_DEFAULT_PROJID;
	/*
	 * Exactly nine attributes are added below.  NOTE(review): enabling
	 * the "notyet" sections adds two more and would need a larger array.
	 */
	sa_bulk_attr_t bulk[9];
	int count = 0;
	int error;

	zp = zfs_znode_alloc_kmem(KM_SLEEP);

#ifndef _ZFS_USE_SMR
	KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0,
	    ("%s: fast path lookup enabled without smr", __func__));
#endif

	/* The caller must have reserved a vnode before calling us. */
#if __FreeBSD_version >= 1300076
	KASSERT(curthread->td_vp_reserved != NULL,
	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
#else
	KASSERT(curthread->td_vp_reserv > 0,
	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
#endif
	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
	if (error != 0) {
		zfs_znode_free_kmem(zp);
		return (NULL);
	}
	/* Cross-link znode and vnode. */
	zp->z_vnode = vp;
	vp->v_data = zp;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	zp->z_sa_hdl = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
#if __FreeBSD_version >= 1300139
	atomic_store_ptr(&zp->z_cached_symlink, NULL);
#endif

	vp = ZTOV(zp);

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	/* Attributes read from the object in a single bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &zp->z_links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &zp->z_atime, 16);
#ifdef notyet
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);
#endif
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &zp->z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &zp->z_gid, 8);

	/*
	 * Fail if the bulk lookup fails, the generation is 0, or the
	 * project id is expected (project quota enabled and ZFS_PROJID set
	 * in the flags) but cannot be read.
	 */
	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		/* Only tear down a handle we created ourselves. */
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zfs_vnode_forget(vp);
		zp->z_vnode = NULL;
		zfs_znode_free_kmem(zp);
		return (NULL);
	}

	zp->z_projid = projid;
	zp->z_mode = mode;

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	vp->v_type = IFTOVT((mode_t)mode);

	/* Pick a specialised vnode operations vector where needed. */
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VFIFO:
		vp->v_op = &zfs_fifoops;
		break;
	case VREG:
		/* Regular files directly under the shares directory. */
		if (parent == zfsvfs->z_shares_dir) {
			ASSERT0(zp->z_uid);
			ASSERT0(zp->z_gid);
			vp->v_op = &zfs_shareops;
		}
		break;
	default:
		break;
	}

	/* Publish the znode on the filesystem's list of all znodes. */
	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	zfsvfs->z_nr_znodes++;
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * Acquire vnode lock before making it available to the world.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	VN_LOCK_AREC(vp);
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

	return (zp);
}
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (vap->va_type == VDIR) { + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); + } + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + /* + * No execs denied will be determined when zfs_mode_compute() is called. 
+ */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); 
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx)); + + if (!(flag & IS_ROOT_NODE)) { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + ASSERT3P(*zpp, !=, NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. 
+ */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + } + if (!(flag & IS_ROOT_NODE)) { + vnode_t *vp; + + vp = ZTOV(*zpp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + } + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. + */ +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + xoptattr_t *xoap; + + xoap = xva_getxoptattr(xvap); + ASSERT3P(xoap, !=, NULL); + + ASSERT_VOP_IN_SEQC(ZTOV(zp)); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + ×, sizeof (times), tx); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); + 
XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined, zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + zfs_sa_set_scanstamp(zp, xvap, tx); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + vnode_t *vp; + sa_handle_t *hdl; + struct thread *td; + int locked; + int err; + + td = curthread; + getnewvnode_reserve_(); +again: + *zpp = NULL; + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, 
obj_num); + getnewvnode_drop_reserve(); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (SET_ERROR(EINVAL)); + } + + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + ASSERT3P(zp, !=, NULL); + ASSERT3U(zp->z_id, ==, obj_num); + if (zp->z_unlinked) { + err = SET_ERROR(ENOENT); + } else { + vp = ZTOV(zp); + /* + * Don't let the vnode disappear after + * ZFS_OBJ_HOLD_EXIT. + */ + VN_HOLD(vp); + *zpp = zp; + err = 0; + } + + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + if (err) { + getnewvnode_drop_reserve(); + return (err); + } + + locked = VOP_ISLOCKED(vp); + VI_LOCK(vp); + if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { + /* + * The vnode is doomed and this thread doesn't + * hold the exclusive lock on it, so the vnode + * must be being reclaimed by another thread. + * Otherwise the doomed vnode is being reclaimed + * by this thread and zfs_zget is called from + * ZIL internals. + */ + VI_UNLOCK(vp); + + /* + * XXX vrele() locks the vnode when the last reference + * is dropped. Although in this case the vnode is + * doomed / dead and so no inactivation is required, + * the vnode lock is still acquired. That could result + * in a LOR with z_teardown_lock if another thread holds + * the vnode's lock and tries to take z_teardown_lock. + * But that is only possible if the other thread peforms + * a ZFS vnode operation on the vnode. That either + * should not happen if the vnode is dead or the thread + * should also have a reference to the vnode and thus + * our reference is not last. 
+ */ + VN_RELE(vp); + goto again; + } + VI_UNLOCK(vp); + getnewvnode_drop_reserve(); + return (err); + } + + /* + * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. + */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + } else { + *zpp = zp; + } + if (err == 0) { + vnode_t *vp = ZTOV(zp); + + err = insmntque(vp, zfsvfs->z_vfs); + if (err == 0) { + vp->v_hash = obj_num; + VOP_UNLOCK1(vp); + } else { + zp->z_vnode = NULL; + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + *zpp = NULL; + } + } + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (err); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + vnode_t *vp; + uint64_t obj_num = zp->z_id; + uint64_t mode, size; + sa_bulk_attr_t bulk[8]; + int err; + int count = 0; + uint64_t gen; + + /* + * Remove cached pages before reloading the znode, so that they are not + * lingering after we run into any error. Ideally, we should vgone() + * the vnode in case of error, but currently we cannot do that + * because of the LOR between the vnode lock and z_teardown_lock. + * So, instead, we have to "doom" the znode in the illumos style. 
+ */ + vp = ZTOV(zp); + vn_pages_remove(vp, 0, 0); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + mutex_exit(&zp->z_acl_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + rw_exit(&zp->z_xattr_lock); + + ASSERT3P(zp->z_sa_hdl, ==, NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + size = zp->z_size; + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, sizeof (zp->z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, sizeof (zp->z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + zp->z_mode = mode; + + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + 
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * It is highly improbable but still quite possible that two + * objects in different datasets are created with the same + * object numbers and in transaction groups with the same + * numbers. znodes corresponding to those objects would + * have the same z_id and z_gen, but their other attributes + * may be different. + * zfs recv -F may replace one of such objects with the other. + * As a result file properties recorded in the replaced + * object's vnode may no longer match the received object's + * properties. At present the only cached property is the + * files type recorded in v_type. + * So, handle this case by leaving the old vnode and znode + * disassociated from the actual object. A new vnode and a + * znode will be created if the object is accessed + * (e.g. via a look-up). The old vnode and znode will be + * recycled when the last vnode reference is dropped. + */ + if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatically removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. 
+ */ + zp->z_unlinked = (zp->z_links == 0); + if (zp->z_unlinked) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (0); + } + + zp->z_blksz = doi.doi_data_block_size; + if (zp->z_size != size) + vnode_pager_setsize(vp, zp->z_size); + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY0(dmu_object_free(os, acl_obj, tx)); + } + VERIFY0(dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + zfs_znode_free(zp); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + + ASSERT3P(zp->z_sa_hdl, !=, NULL); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. 
+ */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_rmnode(zp); + return; + } + } + + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_znode_free(zp); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; +#if __FreeBSD_version >= 1300139 + char *symlink; +#endif + + ASSERT3P(zp->z_sa_hdl, ==, NULL); + zp->z_vnode = NULL; + mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); + list_remove(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes--; + mutex_exit(&zfsvfs->z_znodes_lock); + +#if __FreeBSD_version >= 1300139 + symlink = atomic_load_ptr(&zp->z_cached_symlink); + if (symlink != NULL) { + atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, + (uintptr_t)NULL); + cache_symlink_free(symlink, strlen(symlink) + 1); + } +#endif + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + zfs_znode_free_kmem(zp); +} + +void +zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2], boolean_t have_tx) +{ + timestruc_t now; + + vfs_timestamp(&now); + + if (have_tx) { /* will sa_bulk_update happen really soon? */ + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & AT_ATIME) { + ZFS_TIME_ENCODE(&now, zp->z_atime); + } + + if (flag & AT_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + if (zp->z_zfsvfs->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & AT_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + + +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2]) +{ + zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); +} +/* + * Grow the block size for a file. + * + * IN: zp - znode of file to free data in. 
+ * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT0(error); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
+ */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx)); + + vnode_pager_setsize(ZTOV(zp), end); + + zfs_rangelock_exit(lr); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { +#if __FreeBSD_version >= 1400032 + vnode_pager_purge_range(ZTOV(zp), off, off + len); +#else + /* + * Before __FreeBSD_version 1400032 we cannot free block in the + * middle of a file, but only at the end of a file, so this code + * path should never happen. + */ + vnode_pager_setsize(ZTOV(zp), off); +#endif + } + + zfs_rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. 
+ * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_rangelock_exit(lr); + return (error); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + vnode_pager_setsize(vp, end); + + zfs_rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. 
+ * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int error; + int i; + znode_t *rootzp = NULL; + zfsvfs_t *zfsvfs; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. 
+ */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT0(error); + + /* + * Set starting attributes. + */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT3S(nvpair_type(elem), ==, DATA_TYPE_UINT64); + val = fnvpair_value_uint64(elem); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT0(error); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT3U(version, !=, 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT0(error); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. 
+ */ + VATTR_NULL(&vattr); + vattr.va_mask = AT_MODE|AT_UID|AT_GID; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + rootzp = zfs_znode_alloc_kmem(KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); + + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT0(error); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + rootzp->z_zfsvfs = zfsvfs; + VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT0(error); + zfs_acl_ids_free(&acl_ids); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + sa_handle_destroy(rootzp->z_sa_hdl); + zfs_znode_free_kmem(rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(zfsvfs, tx); + + ASSERT0(error); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + 
error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + + return (0); +} + +static void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. + */ +static int +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + uint64_t parent_mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, sizeof (pflags)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, sizeof (mode)); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) + return (error); + + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. 
There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (SET_ERROR(EINVAL)); + + *pobjp = parent; + + return (0); +} + +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) +{ + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; + char *path = buf + len - 1; + int error; + + *path = '\0'; + sa_hdl = hdl; + + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + + for (;;) { + 
uint64_t pobj; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir; + + if (prevdb) { + ASSERT3P(prevhdl, !=, NULL); + zfs_release_sa_handle(prevhdl, prevdb, FTAG); + } + + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) sprintf(component + 1, ""); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT3P(path, >=, buf); + bcopy(component, path, complen); + obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); + if (error != 0) { + sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT3P(sa_db, !=, NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = 
zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + + +void +zfs_znode_update_vfs(znode_t *zp) +{ + vm_object_t object; + + if ((object = ZTOV(zp)->v_object) == NULL || + zp->z_size == object->un_pager.vnp.vnp_size) + return; + + vnode_pager_setsize(ZTOV(zp), zp->z_size); +} + + +#ifdef _KERNEL +int +zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t parent; + int is_xattrdir; + int err; + + /* Extended attributes should not be visible as regular files. */ + if ((zp->z_pflags & ZFS_XATTR) != 0) + return (SET_ERROR(EINVAL)); + + err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, + &parent, &is_xattrdir); + if (err != 0) + return (err); + ASSERT0(is_xattrdir); + + /* No name as this is a root object. */ + if (parent == zp->z_id) + return (SET_ERROR(EINVAL)); + + err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), buf); + if (err != 0) + return (err); + err = zfs_zget(zfsvfs, parent, dzpp); + return (err); +} +#endif /* _KERNEL */ diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c new file mode 100644 index 0000000000..832378a92a --- /dev/null +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -0,0 +1,1824 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. 
All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. + * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). 
In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate the actual encryption keys via HKDF. We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authentication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable.
To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. + * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. In addition, ZIL blocks have some pieces that must be left in plaintext + * for claiming even though all of the sensitive user data still needs to be + * encrypted. The function zio_crypt_init_uios_zil() handles parsing which + * pieces of the block need to be encrypted. All data that is not encrypted is + * authenticated using the AAD mechanisms that the supported encryption modes + * provide for. In order to preserve the semantics of the ZIL for encrypted + * datasets, the ZIL is not protected at the objset level as described below. + * + * DNODE ENCRYPTION: + * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left + * in plaintext for scrubbing and claiming, but the bonus buffers might contain + * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing + * which pieces of the block need to be encrypted. For more details about + * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). + * + * OBJECT SET AUTHENTICATION: + * Up to this point, everything we have encrypted and authenticated has been + * at level 0 (or -2 for the ZIL). If we did not do any further work the + * on-disk format would be susceptible to attacks that deleted or rearranged + * the order of level 0 blocks. 
Ideally, the cleanest solution would be to + * maintain a tree of authentication MACs going up the bp tree. However, this + * presents a problem for raw sends. Send files do not send information about + * indirect blocks so there would be no convenient way to transfer the MACs and + * they cannot be recalculated on the receive side without the master key which + * would defeat one of the purposes of raw sends in the first place. Instead, + * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs + * from the level below. We also include some portable fields from blk_prop such + * as the lsize and compression algorithm to prevent the data from being + * misinterpreted. + * + * At the objset level, we maintain 2 separate 256 bit MACs in the + * objset_phys_t. The first one is "portable" and is the logical root of the + * MAC tree maintained in the metadnode's bps. The second, is "local" and is + * used as the root MAC for the user accounting objects, which are also not + * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload + * of the send file. The useraccounting code ensures that the useraccounting + * info is not present upon a receive, so the local MAC can simply be cleared + * out at that time. For more info about objset_phys_t authentication, see + * zio_crypt_do_objset_hmacs(). + * + * CONSIDERATIONS FOR DEDUP: + * In order for dedup to work, blocks that we want to dedup with one another + * need to use the same IV and encryption key, so that they will have the same + * ciphertext. Normally, one should never reuse an IV with the same encryption + * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both + * blocks. In this case, however, since we are using the same plaintext as + * well all that we end up with is a duplicate of the original ciphertext we + * already had. 
As a result, an attacker with read access to the raw disk will + * be able to tell which blocks are the same but this information is given away + * by dedup anyway. In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC + * here so that a reproducible checksum of the plaintext is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. + */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function.
+ */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +/* + * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many + * calls, to test decryption error handling code paths. + */ +uint64_t zio_decrypt_fail_fraction = 0; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +static void +zio_crypt_key_destroy_early(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + bzero(&key->zk_session, sizeof (key->zk_session)); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + + freebsd_crypt_freesession(&key->zk_session); + zio_crypt_key_destroy_early(key); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech __unused; + uint_t keydata_len; + zio_crypt_info_t *ci = NULL; + + ASSERT3P(key, !=, NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + 
rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = &key->zk_hmac_key; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + ret = freebsd_crypt_newsession(&key->zk_session, ci, + &key->zk_current_key); + if (ret) + goto error; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech __unused; + + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt rotation, just unlock and return */ + if (key->zk_salt_count < 
ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + freebsd_crypt_freesession(&key->zk_session); + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[key->zk_crypt], &key->zk_current_key); + if (ret != 0) + goto out_unlock; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* See comment above zfs_key_max_salt_uses definition for details */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +void *failed_decrypt_buf; +int failed_decrypt_size; + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the ciphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. + */ +/* + * The implementation for FreeBSD's OpenCrypto. + * + * The big difference between ICP and FOC is that FOC uses a single + * buffer for input and output. This means that (for AES-GCM, the + * only one supported right now) the source must be copied into the + * destination, and the destination must have the AAD, and the tag/MAC, + * already associated with it. 
(Both implementations can use a uio.) + * + * Since the auth data is part of the iovec array, all we need to know + * is the length: 0 means there's no AAD. + * + */ +static int +zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, + uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, + zfs_uio_t *uio, uint_t auth_len) +{ + zio_crypt_info_t *ci; + int ret; + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + + ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, + datalen, auth_len); + if (ret != 0) { +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Returning error %s\n", + __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); +#endif + /* any backend failure is reported as EIO (encrypt) or ECKSUM (decrypt) */ + ret = SET_ERROR(encrypt ? EIO : ECKSUM); + } + + return (ret); +} + +/* + * Wrap (encrypt) the master and HMAC key material of 'key' with the + * wrapping key 'cwkey'. A freshly generated IV is written to 'iv', the + * ciphertexts to 'keydata_out' and 'hmac_keydata_out', and the MAC to + * 'mac'. The guid (plus the crypt suite and key version for non-zero + * versions) is passed as additional authenticated data. Returns 0 on + * success or an error code. + */ +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GCM at least) + * needs to logically go in front. + */ + zfs_uio_t cuio; + struct uio cuio_s; + iovec_t iovecs[4]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + zfs_uio_init(&cuio, &cuio_s); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + /* + * Since we only support one buffer, we need to copy + * the plain text (source) to the cipher buffer (dest). + * We set iovecs[0] -- the authentication data -- below.
+ */ + bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len); + bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out, + SHA512_HMAC_KEYLEN); + iovecs[1].iov_base = keydata_out; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = hmac_keydata_out; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. + */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + /* only the two key buffers are encrypted; AAD and MAC are not counted */ + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + + GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; + zfs_uio_iovcnt(&cuio) = 4; + zfs_uio_segflg(&cuio) = UIO_SYSSPACE; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + if (ret != 0) + goto error; + + return (0); + +error: + return (ret); +} + +/* + * Unwrap (decrypt and authenticate) the wrapped master key 'keydata' and + * HMAC key 'hmac_keydata' using the wrapping key 'cwkey', 'iv' and 'mac', + * and initialize 'key' from the result: a fresh salt is generated, the + * current encryption key is derived from it and a crypto session is + * created. 'guid' (plus 'crypt' and 'version' for non-zero versions) is + * passed as additional authenticated data. Returns 0 on success or an + * error code; on failure 'key' is torn down with + * zio_crypt_key_destroy_early(). + */ +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GCM at least) + * needs to logically go in front.
+ */ + zfs_uio_t cuio; + struct uio cuio_s; + iovec_t iovecs[4]; + void *src, *dst; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + zfs_uio_init(&cuio, &cuio_s); + + /* + * Since we only support one buffer, we need to copy + * the encrypted buffer (source) to the plain buffer + * (dest). We set iovecs[0] -- the authentication data -- + * below. + */ + dst = key->zk_master_keydata; + src = keydata; + + bcopy(src, dst, keydata_len); + + dst = key->zk_hmac_keydata; + src = hmac_keydata; + bcopy(src, dst, SHA512_HMAC_KEYLEN); + + iovecs[1].iov_base = key->zk_master_keydata; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = key->zk_hmac_keydata; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + + GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; + zfs_uio_iovcnt(&cuio) = 4; + zfs_uio_segflg(&cuio) = UIO_SYSSPACE; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* 
initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[crypt], &key->zk_current_key); + if (ret != 0) + goto error; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + crypto_mac(&key->zk_hmac_key, data, datalen, + raw_digestbuf, SHA512_DIGEST_LENGTH); + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. 
The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. + */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + 
+void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. 
+ */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + int avoidlint = SPA_MINBLOCKSIZE; + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, avoidlint); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. 
+ */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpreted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, avoidlint); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. 
+ */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + crypto_mac_update(ctx, &bab, bab_len); + + return (0); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); + + for (i = 0; 
i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 separate HMACS for + * protecting the integrity of their data. The portable_mac protects the + * metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. 
+ */ +/* ARGSUSED */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + struct hmac_ctx hash_ctx; + struct hmac_ctx *ctx = &hash_ctx; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + + /* calculate the portable MAC from the portable fields and metadnode */ + crypto_mac_init(ctx, &key->zk_hmac_key); + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user, group and project accounting. + * If these objects are not present, the local MAC is zeroed out. 
+	 */
+	if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+	    (datalen >= OBJSET_PHYS_SIZE_V2 &&
+	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+	    (datalen <= OBJSET_PHYS_SIZE_V1)) {
+		bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+		return (0);
+	}
+
+	/* calculate the local MAC from the userused and groupused dnodes */
+	crypto_mac_init(ctx, &key->zk_hmac_key);
+
+	/* add in the non-portable os_flags */
+	intval = osp->os_flags;
+	if (should_bswap)
+		intval = BSWAP_64(intval);
+	intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+	if (!ZFS_HOST_BYTEORDER)
+		intval = BSWAP_64(intval);
+
+	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+	/* XXX check dnode type ... */
+	/* add in fields from the user accounting dnodes */
+	if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_userused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_groupused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+	    datalen >= OBJSET_PHYS_SIZE_V3) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_projectused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH);
+
+	bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+	return (0);
+
+error:
+	bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+	bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+	return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(zfs_uio_t *uio)
+{
+	if (GET_UIO_STRUCT(uio)->uio_iov)	/* free iovec array, if any */
+		kmem_free(GET_UIO_STRUCT(uio)->uio_iov,
+		    zfs_uio_iovcnt(uio) * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+    uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+	blkptr_t *bp;
+	int i, epb = datalen >> SPA_BLKPTRSHIFT;	/* blkptrs in buf */
+	SHA2_CTX ctx;
+	uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+	/* checksum all of the MACs from the layer below */
+	SHA2Init(SHA512, &ctx);
+	for (i = 0, bp = buf; i < epb; i++, bp++) {
+		zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+		    byteswap, bp);
+	}
+	SHA2Final(digestbuf, &ctx);
+
+	if (generate) {		/* hand the truncated digest to the caller */
+		bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+		return (0);
+	}
+
+	if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) {
+#ifdef FCRYPTO_DEBUG
+		printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__);
+#endif
+		return (SET_ERROR(ECKSUM));
+	}
+	return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+    uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+	int ret;
+
+	/*
+	 * Unfortunately, callers of this function will not always have
+	 * easy access to the on-disk format version. This info is
+	 * normally found in the DSL Crypto Key, but the checksum-of-MACs
+	 * is expected to be verifiable even when the key isn't loaded.
+	 * Here, instead of doing a ZAP lookup for the version for each
+	 * zio, we simply try both existing formats.
+	 */
+	ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+	    datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+	if (ret == ECKSUM) {
+		ASSERT(!generate);	/* only verification can fail */
+		ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+		    buf, datalen, 0, byteswap, cksum);
+	}
+
+	return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+    uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+	int ret;
+	void *buf;
+
+	buf = abd_borrow_buf_copy(abd, datalen);
+	ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+	    byteswap, cksum);
+	abd_return_buf(abd, buf, datalen);
+
+	return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+/*
+ * The OpenCrypto used in FreeBSD does not use separate source and
+ * destination buffers; instead, the same buffer is used. Further, to
+ * accommodate some of the drivers, the authbuf needs to be logically before
+ * the data. This means that we need to copy the source to the destination,
+ * and set up an extra iovec_t at the beginning to handle the authbuf.
+ * It also means we'll only return one zfs_uio_t.
+ */
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
+    zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+    boolean_t *no_crypt)
+{
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+	iovec_t *dst_iovecs;
+	zil_chain_t *zilc;
+	lr_t *lr;
+	uint64_t txtype, lr_len;
+	uint_t crypt_len, nr_iovecs, vec;
+	uint_t aad_len = 0, total_len = 0;
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+	}
+	bcopy(src, dst, datalen);
+
+	/* Find the start and end record of the log block. */
+	zilc = (zil_chain_t *)src;
+	slrp = src + sizeof (zil_chain_t);
+	aadp = aadbuf;
+	blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+	/*
+	 * Calculate the number of encrypted iovecs we will need.
+	 */
+
+	/* We need at least two iovecs -- one for the AAD, one for the MAC. */
+	nr_iovecs = 2;
+
+	for (; slrp < blkend; slrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (byteswap) {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		} else {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		}
+
+		nr_iovecs++;
+		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+			nr_iovecs++;
+	}
+
+	dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
+
+	/*
+	 * Copy the plain zil header over and authenticate everything except
+	 * the checksum that will store our MAC. If we are writing the data
+	 * the embedded checksum will not have been calculated yet, so we don't
+	 * authenticate that.
+	 */
+	bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+	slrp = src + sizeof (zil_chain_t);
+	dlrp = dst + sizeof (zil_chain_t);
+
+	/*
+	 * Loop over records again, filling in iovecs.
+	 */
+
+	/* The first iovec will contain the authbuf. */
+	vec = 1;
+
+	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (!byteswap) {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		} else {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		}
+
+		/* copy the common lr_t */
+		bcopy(slrp, dlrp, sizeof (lr_t));
+		bcopy(slrp, aadp, sizeof (lr_t));
+		aadp += sizeof (lr_t);
+		aad_len += sizeof (lr_t);
+
+		/*
+		 * If this is a TX_WRITE record we want to encrypt everything
+		 * except the bp if it exists. If the bp does exist we want to
+		 * authenticate it.
+		 */
+		if (txtype == TX_WRITE) {
+			crypt_len = sizeof (lr_write_t) -
+			    sizeof (lr_t) - sizeof (blkptr_t);
+			dst_iovecs[vec].iov_base = (char *)dlrp +
+			    sizeof (lr_t);
+			dst_iovecs[vec].iov_len = crypt_len;
+
+			/* copy the bp now since it will not be encrypted */
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    sizeof (blkptr_t));
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    aadp, sizeof (blkptr_t));
+			aadp += sizeof (blkptr_t);
+			aad_len += sizeof (blkptr_t);
+			vec++;
+			total_len += crypt_len;
+
+			if (lr_len != sizeof (lr_write_t)) {
+				crypt_len = lr_len - sizeof (lr_write_t);
+				dst_iovecs[vec].iov_base = (char *)
+				    dlrp + sizeof (lr_write_t);
+				dst_iovecs[vec].iov_len = crypt_len;
+				vec++;
+				total_len += crypt_len;
+			}
+		} else {
+			crypt_len = lr_len - sizeof (lr_t);
+			dst_iovecs[vec].iov_base = (char *)dlrp +
+			    sizeof (lr_t);
+			dst_iovecs[vec].iov_len = crypt_len;
+			vec++;
+			total_len += crypt_len;
+		}
+	}
+
+	/* The last iovec will contain the MAC. */
+	ASSERT3U(vec, ==, nr_iovecs - 1);
+
+	/* AAD */
+	dst_iovecs[0].iov_base = aadbuf;
+	dst_iovecs[0].iov_len = aad_len;
+	/* MAC */
+	dst_iovecs[vec].iov_base = 0;
+	dst_iovecs[vec].iov_len = 0;
+
+	*no_crypt = (vec == 1);	/* true when no encrypted iovec was added */
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+	GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
+	zfs_uio_iovcnt(out_uio) = nr_iovecs;
+
+	return (0);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
+    uint_t *auth_len, boolean_t *no_crypt)
+{
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+	uint8_t *src, *dst, *aadp;
+	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+	iovec_t *dst_iovecs;
+	uint_t nr_iovecs, crypt_len, vec;
+	uint_t aad_len = 0, total_len = 0;
+	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+	}
+	bcopy(src, dst, datalen);
+
+	sdnp = (dnode_phys_t *)src;
+	ddnp = (dnode_phys_t *)dst;
+	aadp = aadbuf;
+
+	/*
+	 * Count the number of iovecs we will need to do the encryption by
+	 * counting the number of bonus buffers that need to be encrypted.
+	 */
+
+	/* We need at least two iovecs -- one for the AAD, one for the MAC. */
+	nr_iovecs = 2;
+
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		/*
+		 * This block may still be byteswapped. However, all of the
+		 * values we use are either uint8_t's (for which byteswapping
+		 * is a noop) or a != 0 check, which will work regardless
+		 * of whether or not we byteswap.
+		 */
+		if (sdnp[i].dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+		    sdnp[i].dn_bonuslen != 0) {
+			nr_iovecs++;
+		}
+	}
+
+	dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
+
+	/*
+	 * Iterate through the dnodes again, this time filling in the uios
+	 * we allocated earlier. We also concatenate any data we want to
+	 * authenticate onto aadbuf.
+	 */
+
+	/* The first iovec will contain the authbuf. */
+	vec = 1;
+
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		dnp = &sdnp[i];
+
+		/* copy over the core fields and blkptrs (kept as plaintext) */
+		bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+			    sizeof (blkptr_t));
+		}
+
+		/*
+		 * Handle authenticated data. We authenticate everything in
+		 * the dnode that can be brought over when we do a raw send.
+		 * This includes all of the core fields as well as the MACs
+		 * stored in the bp checksums and all of the portable bits
+		 * from blk_prop. We include the dnode padding here in case it
+		 * ever gets used in the future. Some dn_flags and dn_used are
+		 * not portable so we mask those values out of the
+		 * authenticated data.
+		 */
+		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+		bcopy(dnp, aadp, crypt_len);
+		adnp = (dnode_phys_t *)aadp;
+		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+		adnp->dn_used = 0;
+		aadp += crypt_len;
+		aad_len += crypt_len;
+
+		for (j = 0; j < dnp->dn_nblkptr; j++) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, &dnp->dn_blkptr[j]);
+		}
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, DN_SPILL_BLKPTR(dnp));
+		}
+
+		/*
+		 * If this bonus buffer needs to be encrypted, we prepare an
+		 * iovec_t. The encryption / decryption functions will fill
+		 * this in for us with the encrypted or decrypted data.
+		 * Otherwise we add the bonus buffer to the authenticated
+		 * data buffer and copy it over to the destination. The
+		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+		 * we can guarantee alignment with the AES block size
+		 * (128 bits).
+		 */
+		crypt_len = DN_MAX_BONUS_LEN(dnp);
+		if (dnp->dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+		    dnp->dn_bonuslen != 0) {
+			dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]);
+			dst_iovecs[vec].iov_len = crypt_len;
+
+			vec++;
+			total_len += crypt_len;
+		} else {
+			bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+			bcopy(DN_BONUS(dnp), aadp, crypt_len);
+			aadp += crypt_len;
+			aad_len += crypt_len;
+		}
+	}
+
+	/* The last iovec will contain the MAC. */
+	ASSERT3U(vec, ==, nr_iovecs - 1);
+
+	/* AAD */
+	dst_iovecs[0].iov_base = aadbuf;
+	dst_iovecs[0].iov_len = aad_len;
+	/* MAC */
+	dst_iovecs[vec].iov_base = 0;
+	dst_iovecs[vec].iov_len = 0;
+
+	*no_crypt = (vec == 1);	/* true when no encrypted iovec was added */
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+	GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
+	zfs_uio_iovcnt(out_uio) = nr_iovecs;
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio,
+    uint_t *enc_len)
+{
+	int ret;
+	uint_t nr_plain = 1, nr_cipher = 2;	/* cipher: data + MAC slot */
+	iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+	void *src, *dst;
+
+	cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+	    KM_SLEEP);
+	if (!cipher_iovecs) {
+		ret = SET_ERROR(ENOMEM);
+		goto error;
+	}
+	bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+	}
+	bcopy(src, dst, datalen);
+	cipher_iovecs[0].iov_base = dst;
+	cipher_iovecs[0].iov_len = datalen;
+
+	*enc_len = datalen;
+	GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs;
+	zfs_uio_iovcnt(out_uio) = nr_cipher;
+
+	return (0);
+
+error:
+	if (plain_iovecs != NULL)
+		kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+	if (cipher_iovecs != NULL)
+		kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+	*enc_len = 0;
+	GET_UIO_STRUCT(out_uio)->uio_iov = NULL;
+	zfs_uio_iovcnt(out_uio) = 0;
+
+	return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
+    uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
+{
+	int ret;
+	iovec_t *mac_iov;
+
+	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+	/* route to handler */
+	switch (ot) {
+	case DMU_OT_INTENT_LOG:
+		ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+		    no_crypt);
+		break;
+	case DMU_OT_DNODE:
+		ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+		    auth_len, no_crypt);
+		break;
+	default:
+		ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+		    datalen, puio, cuio, enc_len);
+		*authbuf = NULL;
+		*auth_len = 0;
+		*no_crypt = B_FALSE;
+		break;
+	}
+
+	if (ret != 0)
+		goto error;
+
+	/* populate the uios */
+	zfs_uio_segflg(cuio) = UIO_SYSSPACE;
+
+	mac_iov =
+	    ((iovec_t *)&(GET_UIO_STRUCT(cuio)->
+	    uio_iov[zfs_uio_iovcnt(cuio) - 1]));
+	mac_iov->iov_base = (void *)mac;
+	mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+void *failed_decrypt_buf;	/* cipherbuf copy from last failed decrypt */
+int failed_decrypt_size;	/* allocated size of failed_decrypt_buf */
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+    dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+    uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+    boolean_t *no_crypt)
+{
+	int ret;
+	boolean_t locked = B_FALSE;	/* B_TRUE while zk_salt_lock is held */
+	uint64_t crypt = key->zk_crypt;
+	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+	uint_t enc_len, auth_len;
+	zfs_uio_t puio, cuio;
+	struct uio puio_s, cuio_s;
+	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+	crypto_key_t tmp_ckey, *ckey = NULL;
+	freebsd_crypt_session_t *tmpl = NULL;
+	uint8_t *authbuf = NULL;
+
+
+	zfs_uio_init(&puio, &puio_s);
+	zfs_uio_init(&cuio, &cuio_s);
+	bzero(GET_UIO_STRUCT(&puio), sizeof (struct uio));
+	bzero(GET_UIO_STRUCT(&cuio), sizeof (struct uio));
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
+	    __FUNCTION__,
+	    encrypt ? "encrypt" : "decrypt",
+	    key, salt, ot, iv, mac, datalen,
+	    byteswap ? "byteswap" : "native_endian", plainbuf,
+	    cipherbuf, no_crypt);
+
+	printf("\tkey = {");
+	for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
+		printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
+	printf("}\n");
+#endif
+	/* create uios for encryption */
+	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+	    &authbuf, &auth_len, no_crypt);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * If the needed key is the current one, just use it. Otherwise we
+	 * need to generate a temporary one from the given salt + master key.
+	 * If we are encrypting, we must return a copy of the current salt
+	 * so that it can be stored in the blkptr_t.
+	 */
+	rw_enter(&key->zk_salt_lock, RW_READER);
+	locked = B_TRUE;
+
+	if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+		ckey = &key->zk_current_key;
+		tmpl = &key->zk_session;
+	} else {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+
+		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+		if (ret != 0)
+			goto error;
+		tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+		tmp_ckey.ck_data = enc_keydata;
+		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+		ckey = &tmp_ckey;
+		tmpl = NULL;	/* derived key: no cached session to reuse */
+	}
+
+	/* perform the encryption / decryption */
+	ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
+	    ckey, iv, enc_len, &cuio, auth_len);
+	if (ret != 0)
+		goto error;
+	if (locked) {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+	}
+
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	if (ckey == &tmp_ckey)		/* scrub the derived key material */
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+
+	return (0);
+
+error:
+	if (!encrypt) {		/* stash cipherbuf in failed_decrypt_buf */
+		if (failed_decrypt_buf != NULL)
+			kmem_free(failed_decrypt_buf, failed_decrypt_size);
+		failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
+		failed_decrypt_size = datalen;
+		bcopy(cipherbuf, failed_decrypt_buf, datalen);
+	}
+	if (locked)
+		rw_exit(&key->zk_salt_lock);
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	if (ckey == &tmp_ckey)		/* scrub the derived key material */
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+	return (SET_ERROR(ret));
+}
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+    boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+    uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+	int ret;
+	void *ptmp, *ctmp;	/* buffers borrowed from pabd / cabd */
+
+	if (encrypt) {		/* the input side is borrowed with a copy */
+		ptmp = abd_borrow_buf_copy(pabd, datalen);
+		ctmp = abd_borrow_buf(cabd, datalen);
+	} else {
+		ptmp = abd_borrow_buf(pabd, datalen);
+		ctmp = abd_borrow_buf_copy(cabd, datalen);
+	}
+
+	ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+	    datalen, ptmp, ctmp, no_crypt);
+	if (ret != 0)
+		goto error;
+
+	if (encrypt) {		/* the output side is returned with a copy */
+		abd_return_buf(pabd, ptmp, datalen);
+		abd_return_buf_copy(cabd, ctmp, datalen);
+	} else {
+		abd_return_buf_copy(pabd, ptmp, datalen);
+		abd_return_buf(cabd, ctmp, datalen);
+	}
+
+	return (0);
+
+error:
+	if (encrypt) {		/* same hand-back as the success path */
+		abd_return_buf(pabd, ptmp, datalen);
+		abd_return_buf_copy(cabd, ctmp, datalen);
+	} else {
+		abd_return_buf_copy(pabd, ptmp, datalen);
+		abd_return_buf(cabd, ctmp, datalen);
+	}
+
+	return (SET_ERROR(ret));
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+	"can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
new file mode 100644
index 0000000000..4503691925
--- /dev/null
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -0,0 +1,1543 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2006-2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Portions Copyright 2010 Robert Milkowski + * + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2011 Martin Matuska */ + +/* + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev/zvol// + * + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. + * + * On FreeBSD ZVOLs are simply GEOM providers like any other storage device + * in the system. Except when they're simply character devices (volmode=dev). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zfs_namecheck.h" + +#define ZVOL_DUMPSIZE "dumpsize" + +#ifdef ZVOL_LOCK_DEBUG +#define ZVOL_RW_READER RW_WRITER +#define ZVOL_RW_READ_HELD RW_WRITE_HELD +#else +#define ZVOL_RW_READER RW_READER +#define ZVOL_RW_READ_HELD RW_READ_HELD +#endif + +enum zvol_geom_state { + ZVOL_GEOM_UNINIT, + ZVOL_GEOM_STOPPED, + ZVOL_GEOM_RUNNING, +}; + +struct zvol_state_os { +#define zso_dev _zso_state._zso_dev +#define zso_geom _zso_state._zso_geom + union { + /* volmode=dev */ + struct zvol_state_dev { + struct cdev *zsd_cdev; + uint64_t zsd_sync_cnt; + } _zso_dev; + + /* volmode=geom */ + struct zvol_state_geom { + struct g_provider *zsg_provider; + struct bio_queue_head zsg_queue; + struct mtx zsg_queue_mtx; + enum zvol_geom_state zsg_state; + } _zso_geom; + } _zso_state; + int zso_dying; +}; + +static uint32_t zvol_minors; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, + "Expose as GEOM providers (1), device files (2) or neither"); +static boolean_t zpool_on_zvol = B_FALSE; +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, + "Allow zpools to use zvols as vdevs (DANGEROUS)"); + +/* + * Toggle unmap functionality. + */ +boolean_t zvol_unmap_enabled = B_TRUE; + +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, + &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); + +/* + * zvol maximum transfer in one DMU tx. 
+ */
+int zvol_maxphys = DMU_MAX_ACCESS / 2;
+
+static void zvol_ensure_zilog(zvol_state_t *zv);
+
+static d_open_t zvol_cdev_open;
+static d_close_t zvol_cdev_close;
+static d_ioctl_t zvol_cdev_ioctl;
+static d_read_t zvol_cdev_read;
+static d_write_t zvol_cdev_write;
+static d_strategy_t zvol_geom_bio_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+	.d_name = "zvol",
+	.d_version = D_VERSION,
+	.d_flags = D_DISK | D_TRACKCLOSE,
+	.d_open = zvol_cdev_open,
+	.d_close = zvol_cdev_close,
+	.d_ioctl = zvol_cdev_ioctl,
+	.d_read = zvol_cdev_read,
+	.d_write = zvol_cdev_write,
+	.d_strategy = zvol_geom_bio_strategy,
+};
+
+extern uint_t zfs_geom_probe_vdev_key;
+
+struct g_class zfs_zvol_class = {
+	.name = "ZFS::ZVOL",
+	.version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+static int zvol_geom_open(struct g_provider *pp, int flag, int count);
+static int zvol_geom_close(struct g_provider *pp, int flag, int count);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_worker(void *arg);
+static void zvol_geom_bio_start(struct bio *bp);
+static int zvol_geom_bio_getattr(struct bio *bp);
+/* zvol_geom_bio_strategy is already declared above, before zvol_cdevsw. */
+
+/*
+ * GEOM mode implementation
+ */
+
+/*ARGSUSED*/
+static int
+zvol_geom_open(struct g_provider *pp, int flag, int count)
+{
+	zvol_state_t *zv;
+	int err = 0;
+	boolean_t drop_suspend = B_FALSE;
+	boolean_t drop_namespace = B_FALSE;
+
+	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+		/*
+		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
+		 * attempting to probe geom providers while looking for a
+		 * replacement for a missing VDEV. In this case, the
+		 * spa_namespace_lock will not be held, but it is still illegal
+		 * to use a zvol as a vdev. Deadlocks can result if another
+		 * thread has spa_namespace_lock.
+		 */
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
+retry:
+	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+	zv = pp->private;
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		err = SET_ERROR(ENXIO);
+		goto out_locked;
+	}
+
+	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+		/*
+		 * We need to guarantee that the namespace lock is held
+		 * to avoid spurious failures in zvol_first_open.
+		 */
+		drop_namespace = B_TRUE;
+		if (!mutex_tryenter(&spa_namespace_lock)) {
+			rw_exit(&zvol_state_lock);
+			mutex_enter(&spa_namespace_lock);
+			goto retry;
+		}
+	}
+	mutex_enter(&zv->zv_state_lock);
+	if (zv->zv_zso->zso_dying) {
+		rw_exit(&zvol_state_lock);
+		err = SET_ERROR(ENXIO);
+		goto out_zv_locked;
+	}
+	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+	/*
+	 * Make sure the zvol is not suspended during first open
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock.
+	 */
+	if (zv->zv_open_count == 0) {
+		drop_suspend = B_TRUE;
+		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			if (zv->zv_open_count != 0) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	if (zv->zv_open_count == 0) {
+		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+		err = zvol_first_open(zv, !(flag & FWRITE));
+		if (err)
+			goto out_zv_locked;
+		pp->mediasize = zv->zv_volsize;
+		pp->stripeoffset = 0;
+		pp->stripesize = zv->zv_volblocksize;
+	}
+
+	/*
+	 * Check for a bad on-disk format version now since we
+	 * lied about owning the dataset readonly before.
+	 */
+	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
+	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
+		err = SET_ERROR(EROFS);
+		goto out_opened;
+	}
+	if (zv->zv_flags & ZVOL_EXCL) {
+		err = SET_ERROR(EBUSY);
+		goto out_opened;
+	}
+#ifdef FEXCL
+	if (flag & FEXCL) {
+		if (zv->zv_open_count != 0) {
+			err = SET_ERROR(EBUSY);
+			goto out_opened;
+		}
+		zv->zv_flags |= ZVOL_EXCL;
+	}
+#endif
+
+	zv->zv_open_count += count;
+out_opened:
+	if (zv->zv_open_count == 0) {	/* undo zvol_first_open on failure */
+		zvol_last_close(zv);
+		wakeup(zv);
+	}
+out_zv_locked:
+	mutex_exit(&zv->zv_state_lock);
+out_locked:
+	if (drop_namespace)
+		mutex_exit(&spa_namespace_lock);
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (err);
+}
+
+/*ARGSUSED*/
+static int
+zvol_geom_close(struct g_provider *pp, int flag, int count)
+{
+	zvol_state_t *zv;
+	boolean_t drop_suspend = B_TRUE;
+	int new_open_count;
+
+	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+	zv = pp->private;
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	mutex_enter(&zv->zv_state_lock);
+	if (zv->zv_flags & ZVOL_EXCL) {
+		ASSERT3U(zv->zv_open_count, ==, 1);
+		zv->zv_flags &= ~ZVOL_EXCL;
+	}
+
+	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+	/*
+	 * If the open count is zero, this is a spurious close.
+	 * That indicates a bug in the kernel / DDI framework.
+	 */
+	ASSERT3U(zv->zv_open_count, >, 0);
+
+	/*
+	 * Make sure the zvol is not suspended during last close
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock.
+	 */
+	new_open_count = zv->zv_open_count - count;
+	if (new_open_count == 0) {
+		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			new_open_count = zv->zv_open_count - count;
+			if (new_open_count != 0) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	} else {
+		drop_suspend = B_FALSE;
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	/*
+	 * You may get multiple opens, but only one close.
+	 */
+	zv->zv_open_count = new_open_count;
+	if (zv->zv_open_count == 0) {
+		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+		zvol_last_close(zv);
+		wakeup(zv);
+	}
+
+	mutex_exit(&zv->zv_state_lock);
+
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (0);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+	struct g_provider *pp = zsg->zsg_provider;
+
+	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+	g_error_provider(pp, 0);
+
+	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
+	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+	struct g_provider *pp = zsg->zsg_provider;
+
+	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+	g_topology_assert();
+
+	mutex_enter(&zv->zv_state_lock);
+	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
+	mutex_exit(&zv->zv_state_lock);
+	zsg->zsg_provider = NULL;
+	g_wither_geom(pp->geom, ENXIO);
+}
+
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+
+	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
+		return;
+	mutex_enter(&zv->zv_state_lock);
+	zv->zv_zso->zso_dying = B_TRUE;	/* makes zvol_geom_open fail ENXIO */
+
+	if (zv->zv_open_count)
+		msleep(zv, &zv->zv_state_lock,
+		    PRIBIO, "zvol:dying", 10*hz);
+	mutex_exit(&zv->zv_state_lock);
+}
+
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+	int count, error, flags;
+
+	g_topology_assert();
+
+	/*
+	 * To make it easier we expect either open or close, but not both
+	 * at the same time.
+	 */
+	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+	    (acr <= 0 && acw <= 0 && ace <= 0),
+	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+	    pp->name, acr, acw, ace));
+
+	if (pp->private == NULL) {
+		if (acr <= 0 && acw <= 0 && ace <= 0)
+			return (0);
+		return (pp->error);
+	}
+
+	/*
+	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
+	 * ace != 0, because GEOM already handles that and handles it a bit
+	 * differently. GEOM allows for multiple read/exclusive consumers and
+	 * ZFS allows only one exclusive consumer, no matter if it is reader or
+	 * writer. GEOM's behavior is preferred here, so it is left to GEOM
+	 * to decide what to do.
+	 */
+
+	count = acr + acw + ace;
+	if (count == 0)
+		return (0);
+
+	flags = 0;
+	if (acr != 0 || ace != 0)
+		flags |= FREAD;
+	if (acw != 0)
+		flags |= FWRITE;
+
+	g_topology_unlock();
+	if (count > 0)
+		error = zvol_geom_open(pp, flags, count);
+	else
+		error = zvol_geom_close(pp, flags, -count);
+	g_topology_lock();
+	return (error);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+	zvol_state_t *zv = arg;
+	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+	struct bio *bp;
+
+	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+	thread_lock(curthread);
+	sched_prio(curthread, PRIBIO);
+	thread_unlock(curthread);
+
+	for (;;) {
+		mtx_lock(&zsg->zsg_queue_mtx);
+		bp = bioq_takefirst(&zsg->zsg_queue);
+		if (bp == NULL) {
+			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
+				zsg->zsg_state = ZVOL_GEOM_RUNNING;
+				wakeup(&zsg->zsg_state);
+				mtx_unlock(&zsg->zsg_queue_mtx);
+				kthread_exit();
+			}
+			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
+			    PRIBIO | PDROP, "zvol:io", 0);
+			continue;
+		}
+		mtx_unlock(&zsg->zsg_queue_mtx);
+		zvol_geom_bio_strategy(bp);
+	}
+}
+
+static void
+zvol_geom_bio_start(struct bio *bp)
+{
+	zvol_state_t *zv = bp->bio_to->private;
+	struct zvol_state_geom *zsg;
+	boolean_t first;
+
+	if (zv == NULL) {
+		g_io_deliver(bp, ENXIO);
+		return;
+	}
+	if (bp->bio_cmd == BIO_GETATTR) {
+		if (zvol_geom_bio_getattr(bp))
+			g_io_deliver(bp, EOPNOTSUPP);
+		return;
+	}
+
+	if (!THREAD_CAN_SLEEP()) {	/* queue it for zvol_geom_worker */
+		zsg = &zv->zv_zso->zso_geom;
+		mtx_lock(&zsg->zsg_queue_mtx);
+		first = (bioq_first(&zsg->zsg_queue) == NULL);
+		bioq_insert_tail(&zsg->zsg_queue, bp);
+		mtx_unlock(&zsg->zsg_queue_mtx);
+		if (first)
+			wakeup_one(&zsg->zsg_queue);
+		return;
+	}
+
+	zvol_geom_bio_strategy(bp);
+}
+
+static int
+zvol_geom_bio_getattr(struct bio *bp)
+{
+	zvol_state_t *zv;
+
+	zv = bp->bio_to->private;
+	ASSERT3P(zv, !=, NULL);
+
+	spa_t *spa = dmu_objset_spa(zv->zv_objset);
+	uint64_t refd, avail, usedobjs, availobjs;
+
+	if (g_handleattr_int(bp, "GEOM::candelete", 1))
+		return (0);
+	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+		dmu_objset_space(zv->zv_objset, &refd, &avail,
+		    &usedobjs, &availobjs);
+		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
+			return (0);
+	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+		dmu_objset_space(zv->zv_objset, &refd, &avail,
+		    &usedobjs, &availobjs);
+		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
+			return (0);
+	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+		avail = metaslab_class_get_space(spa_normal_class(spa));
+		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+		if (g_handleattr_off_t(bp, "poolblocksavail",
+		    avail / DEV_BSIZE))
+			return (0);
+	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+		refd = metaslab_class_get_alloc(spa_normal_class(spa));
+		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
+			return (0);
+	}
+	return (1);	/* not handled; the caller delivers EOPNOTSUPP */
+}
+
+static void
+zvol_geom_bio_strategy(struct bio *bp)
+{
+	zvol_state_t *zv;
+	uint64_t off, volsize;
+	size_t resid;
+	char *addr;
+	objset_t *os;
+	zfs_locked_range_t *lr;
+	int error = 0;
+	boolean_t doread = B_FALSE;
+	boolean_t is_dumpified;
+	boolean_t sync;
+
+	if (bp->bio_to)
+		zv = bp->bio_to->private;
+	else
+		zv = bp->bio_dev->si_drv2;
+
+	if (zv == NULL) {
+		error = SET_ERROR(ENXIO);
+		goto out;
+	}
+
+	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+
+	switch (bp->bio_cmd) {
+	case BIO_READ:
+		doread = B_TRUE;
+		break;
+	case BIO_WRITE:
+	case BIO_FLUSH:
+	case BIO_DELETE:
+		if (zv->zv_flags & ZVOL_RDONLY) {
+			error = SET_ERROR(EROFS);
+			goto resume;
+		}
+		zvol_ensure_zilog(zv);
+		if (bp->bio_cmd == BIO_FLUSH)
+			goto sync;
+		break;
+	default:
+		error = SET_ERROR(EOPNOTSUPP);
+		goto resume;
+	}
+
+	off = bp->bio_offset;
+	volsize = zv->zv_volsize;
+
+	os = zv->zv_objset;
+	ASSERT3P(os, !=, NULL);
+
+	addr = bp->bio_data;
+	resid = bp->bio_length;
+
+	if (resid > 0 && off >= volsize) {
+		error = SET_ERROR(EIO);
+		goto resume;
+	}
+
+	is_dumpified = 
B_FALSE; + sync = !doread && !is_dumpified && + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + /* + * There must be no buffer changes when doing a dmu_sync() because + * we can't change the data whilst calculating the checksum. + */ + lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, + doread ? RL_READER : RL_WRITER); + + if (bp->bio_cmd == BIO_DELETE) { + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, off, resid, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + off, resid); + resid = 0; + } + goto unlock; + } + while (resid != 0 && off < volsize) { + size_t size = MIN(resid, zvol_maxphys); + if (doread) { + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); + } else { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size, sync); + dmu_tx_commit(tx); + } + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + off += size; + addr += size; + resid -= size; + } +unlock: + zfs_rangelock_exit(lr); + + bp->bio_completed = bp->bio_length - resid; + if (bp->bio_completed < bp->bio_length && off > volsize) + error = SET_ERROR(EINVAL); + + switch (bp->bio_cmd) { + case BIO_FLUSH: + break; + case BIO_READ: + dataset_kstats_update_read_kstats(&zv->zv_kstat, + bp->bio_completed); + break; + case BIO_WRITE: + dataset_kstats_update_write_kstats(&zv->zv_kstat, + bp->bio_completed); + break; + case BIO_DELETE: + break; + default: + break; + } + + if (sync) { +sync: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } +resume: + rw_exit(&zv->zv_suspend_lock); +out: + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); 
+} + +/* + * Character device mode implementation + */ + +static int +zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + zfs_uio_t uio; + + zfs_uio_init(&uio, uio_s); + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + /* + * uio_loffset == volsize isn't an error as + * it's required for EOF processing. + */ + if (zfs_uio_resid(&uio) > 0 && + (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) + return (SET_ERROR(EIO)); + + ssize_t start_resid = zfs_uio_resid(&uio); + lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), + zfs_uio_resid(&uio), RL_READER); + while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { + uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - zfs_uio_offset(&uio)) + bytes = volsize - zfs_uio_offset(&uio); + + error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + } + zfs_rangelock_exit(lr); + int64_t nread = start_resid - zfs_uio_resid(&uio); + dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); + + return (error); +} + +static int +zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + boolean_t sync; + zfs_uio_t uio; + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + + zfs_uio_init(&uio, uio_s); + + if (zfs_uio_resid(&uio) > 0 && + (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) + return (SET_ERROR(EIO)); + + ssize_t start_resid = zfs_uio_resid(&uio); + sync = (ioflag & IO_SYNC) || + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), + zfs_uio_resid(&uio), 
RL_WRITER); + while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { + uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); + uint64_t off = zfs_uio_offset(&uio); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); + if (error == 0) + zvol_log_write(zv, tx, off, bytes, sync); + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + int64_t nwritten = start_resid - zfs_uio_resid(&uio); + dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); + return (error); +} + +static int +zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + int err = 0; + boolean_t drop_suspend = B_FALSE; + boolean_t drop_namespace = B_FALSE; + +retry: + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + err = SET_ERROR(ENXIO); + goto out_locked; + } + + if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + /* + * We need to guarantee that the namespace lock is held + * to avoid spurious failures in zvol_first_open. 
+ */ + drop_namespace = B_TRUE; + if (!mutex_tryenter(&spa_namespace_lock)) { + rw_exit(&zvol_state_lock); + mutex_enter(&spa_namespace_lock); + goto retry; + } + } + mutex_enter(&zv->zv_state_lock); + + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + drop_suspend = B_TRUE; + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + err = zvol_first_open(zv, !(flags & FWRITE)); + if (err) + goto out_zv_locked; + } + + if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = SET_ERROR(EROFS); + goto out_opened; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = SET_ERROR(EBUSY); + goto out_opened; + } +#ifdef FEXCL + if (flags & FEXCL) { + if (zv->zv_open_count != 0) { + err = SET_ERROR(EBUSY); + goto out_opened; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_open_count++; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt++; + if (zsd->zsd_sync_cnt == 1 && + (zv->zv_flags & ZVOL_WRITTEN_TO) != 0) + zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); + } +out_opened: + if (zv->zv_open_count == 0) { + zvol_last_close(zv); + wakeup(zv); + } +out_zv_locked: + mutex_exit(&zv->zv_state_lock); +out_locked: + if (drop_namespace) + mutex_exit(&spa_namespace_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (err); +} + +static int +zvol_cdev_close(struct cdev *dev, int flags, int fmt, 
struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT3U(zv->zv_open_count, ==, 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT3U(zv->zv_open_count, >, 0); + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 1) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* + * You may get multiple opens, but only one close. 
+ */ + zv->zv_open_count--; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt--; + } + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + wakeup(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); +} + +static int +zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, + int fflag, struct thread *td) +{ + zvol_state_t *zv; + zfs_locked_range_t *lr; + off_t offset, length; + int i, error; + boolean_t sync; + + zv = dev->si_drv2; + + error = 0; + KASSERT(zv->zv_open_count > 0, + ("Device with zero access count in %s", __func__)); + + i = IOCPARM_LEN(cmd); + switch (cmd) { + case DIOCGSECTORSIZE: + *(uint32_t *)data = DEV_BSIZE; + break; + case DIOCGMEDIASIZE: + *(off_t *)data = zv->zv_volsize; + break; + case DIOCGFLUSH: + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + if (zv->zv_zilog != NULL) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); + break; + case DIOCGDELETE: + if (!zvol_unmap_enabled) + break; + + offset = ((off_t *)data)[0]; + length = ((off_t *)data)[1]; + if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || + offset < 0 || offset >= zv->zv_volsize || + length <= 0) { + printf("%s: offset=%jd length=%jd\n", __func__, offset, + length); + error = SET_ERROR(EINVAL); + break; + } + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, + RL_WRITER); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + sync = FALSE; + dmu_tx_abort(tx); + } else { + sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + zvol_log_truncate(zv, tx, offset, length, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + offset, length); + } + zfs_rangelock_exit(lr); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + 
rw_exit(&zv->zv_suspend_lock); + break; + case DIOCGSTRIPESIZE: + *(off_t *)data = zv->zv_volblocksize; + break; + case DIOCGSTRIPEOFFSET: + *(off_t *)data = 0; + break; + case DIOCGATTR: { + spa_t *spa = dmu_objset_spa(zv->zv_objset); + struct diocgattr_arg *arg = (struct diocgattr_arg *)data; + uint64_t refd, avail, usedobjs, availobjs; + + if (strcmp(arg->name, "GEOM::candelete") == 0) + arg->value.i = 1; + else if (strcmp(arg->name, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = refd / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc( + spa_normal_class(spa)); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + arg->value.off = refd / DEV_BSIZE; + } else + error = SET_ERROR(ENOIOCTL); + break; + } + case FIOSEEKHOLE: + case FIOSEEKDATA: { + off_t *off = (off_t *)data; + uint64_t noff; + boolean_t hole; + + hole = (cmd == FIOSEEKHOLE); + noff = *off; + error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); + *off = noff; + break; + } + default: + error = SET_ERROR(ENOIOCTL); + } + + return (error); +} + +/* + * Misc. helpers + */ + +static void +zvol_ensure_zilog(zvol_state_t *zv) +{ + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. 
+ */ + if (zv->zv_zilog == NULL) { + if (!rw_tryupgrade(&zv->zv_suspend_lock)) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + } + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + /* replay / destroy done in zvol_create_minor_impl() */ + VERIFY0(zv->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED); + } + rw_downgrade(&zv->zv_suspend_lock); + } +} + +static boolean_t +zvol_is_zvol_impl(const char *device) +{ + return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); +} + +static void +zvol_rename_minor(zvol_state_t *zv, const char *newname) +{ + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); + + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + struct g_geom *gp; + + g_topology_lock(); + gp = pp->geom; + ASSERT3P(gp, !=, NULL); + + zsg->zsg_provider = NULL; + g_wither_provider(pp, ENXIO); + + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = zv->zv_volsize; + pp->private = zv; + zsg->zsg_provider = pp; + g_error_provider(pp, 0); + g_topology_unlock(); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + dev = zsd->zsd_cdev; + if (dev != NULL) { + destroy_dev(dev); + dev = zsd->zsd_cdev = NULL; + if (zv->zv_open_count > 0) { + zv->zv_flags &= ~ZVOL_EXCL; + zv->zv_open_count = 0; + /* XXX need suspend lock but lock order */ + zvol_last_close(zv); + } + } + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + 
args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) + == 0) { +#if __FreeBSD_version > 1300130 + dev->si_iosize_max = maxphys; +#else + dev->si_iosize_max = MAXPHYS; +#endif + zsd->zsd_cdev = dev; + } + } + strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); +} + +/* + * Remove minor node for the specified volume. + */ +static void +zvol_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp __maybe_unused = zsg->zsg_provider; + + ASSERT3P(pp->private, ==, NULL); + + g_topology_lock(); + zvol_geom_destroy(zv); + g_topology_unlock(); + mtx_destroy(&zsg->zsg_queue_mtx); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev = zsd->zsd_cdev; + + if (dev != NULL) { + ASSERT3P(dev->si_drv2, ==, NULL); + destroy_dev(dev); + } + } + + mutex_destroy(&zv->zv_state_lock); + dataset_kstats_destroy(&zv->zv_kstat); + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + zvol_minors--; +} + +/* + * Create a minor node (plus a whole lot more) for the specified volume. 
+ */ +static int +zvol_create_minor_impl(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t *doi; + uint64_t volsize; + uint64_t volmode, hash; + int error; + + ZFS_LOG(1, "Creating ZVOL %s...", name); + hash = zvol_name_hash(name); + if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(EEXIST)); + } + + DROP_GIANT(); + + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + if (error) + goto out_doi; + + error = dmu_object_info(os, ZVOL_OBJ, doi); + if (error) + goto out_dmu_objset_disown; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_disown; + + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); + if (error || volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + error = 0; + + /* + * zvol_alloc equivalent ... 
+ */ + zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); + zv->zv_hash = hash; + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_volmode = volmode; + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp; + struct g_geom *gp; + + zsg->zsg_state = ZVOL_GEOM_UNINIT; + mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); + + g_topology_lock(); + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_geom_bio_start; + gp->access = zvol_geom_access; + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = 0; + pp->private = zv; + + zsg->zsg_provider = pp; + bioq_init(&zsg->zsg_queue); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) + == 0) { +#if __FreeBSD_version > 1300130 + dev->si_iosize_max = maxphys; +#else + dev->si_iosize_max = MAXPHYS; +#endif + zsd->zsd_cdev = dev; + } + } + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + + if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) + zv->zv_flags |= ZVOL_RDONLY; + + zv->zv_volblocksize = doi->doi_data_block_size; + zv->zv_volsize = volsize; + zv->zv_objset = os; + + ASSERT3P(zv->zv_zilog, ==, NULL); + zv->zv_zilog = zil_open(os, zvol_get_data); + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + 
zil_destroy(zv->zv_zilog, B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; + ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); + + /* TODO: prefetch for geom tasting */ + + zv->zv_objset = NULL; +out_dmu_objset_disown: + dmu_objset_disown(os, B_TRUE, FTAG); + + if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { + zvol_geom_run(zv); + g_topology_unlock(); + } +out_doi: + kmem_free(doi, sizeof (dmu_object_info_t)); + if (error == 0) { + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_insert(zv); + zvol_minors++; + rw_exit(&zvol_state_lock); + ZFS_LOG(1, "ZVOL %s created.", name); + } + PICKUP_GIANT(); + return (error); +} + +static void +zvol_clear_private(zvol_state_t *zv) +{ + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + if (pp->private == NULL) /* already cleared */ + return; + + mtx_lock(&zsg->zsg_queue_mtx); + zsg->zsg_state = ZVOL_GEOM_STOPPED; + pp->private = NULL; + wakeup_one(&zsg->zsg_queue); + while (zsg->zsg_state != ZVOL_GEOM_RUNNING) + msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, + 0, "zvol:w", 0); + mtx_unlock(&zsg->zsg_queue_mtx); + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev = zsd->zsd_cdev; + + if (dev != NULL) + dev->si_drv2 = NULL; + } +} + +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + zv->zv_volsize = volsize; + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + g_topology_lock(); + + if (pp->private == NULL) { + g_topology_unlock(); + return (SET_ERROR(ENXIO)); + } + + /* + * Do not invoke resize event when initial size was zero. 
+ * ZVOL initializes the size on first open, this is not + * real resizing. + */ + if (pp->mediasize == 0) + pp->mediasize = zv->zv_volsize; + else + g_resize_provider(pp, zv->zv_volsize); + + g_topology_unlock(); + } + return (0); +} + +static void +zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) +{ + // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) +{ + // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +const static zvol_platform_ops_t zvol_freebsd_ops = { + .zv_free = zvol_free, + .zv_rename_minor = zvol_rename_minor, + .zv_create_minor = zvol_create_minor_impl, + .zv_update_volsize = zvol_update_volsize, + .zv_clear_private = zvol_clear_private, + .zv_is_zvol = zvol_is_zvol_impl, + .zv_set_disk_ro = zvol_set_disk_ro_impl, + .zv_set_capacity = zvol_set_capacity_impl, +}; + +/* + * Public interfaces + */ + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +int +zvol_init(void) +{ + zvol_init_impl(); + zvol_register_ops(&zvol_freebsd_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); +} diff --git a/module/os/linux/spl/Makefile.in b/module/os/linux/spl/Makefile.in new file mode 100644 index 0000000000..b2325f91b4 --- /dev/null +++ b/module/os/linux/spl/Makefile.in @@ -0,0 +1,17 @@ +$(MODULE)-objs += ../os/linux/spl/spl-atomic.o +$(MODULE)-objs += ../os/linux/spl/spl-condvar.o +$(MODULE)-objs += ../os/linux/spl/spl-cred.o +$(MODULE)-objs += ../os/linux/spl/spl-err.o +$(MODULE)-objs += ../os/linux/spl/spl-generic.o +$(MODULE)-objs += ../os/linux/spl/spl-kmem.o +$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o +$(MODULE)-objs += ../os/linux/spl/spl-kstat.o +$(MODULE)-objs += ../os/linux/spl/spl-proc.o +$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o +$(MODULE)-objs += ../os/linux/spl/spl-taskq.o +$(MODULE)-objs += ../os/linux/spl/spl-thread.o +$(MODULE)-objs += ../os/linux/spl/spl-trace.o +$(MODULE)-objs += ../os/linux/spl/spl-tsd.o 
+$(MODULE)-objs += ../os/linux/spl/spl-vmem.o +$(MODULE)-objs += ../os/linux/spl/spl-xdr.o +$(MODULE)-objs += ../os/linux/spl/spl-zlib.o diff --git a/module/spl/README.md b/module/os/linux/spl/README.md similarity index 87% rename from module/spl/README.md rename to module/os/linux/spl/README.md index 57f635aed8..906530bcf2 100644 --- a/module/spl/README.md +++ b/module/os/linux/spl/README.md @@ -1,5 +1,5 @@ The Solaris Porting Layer, SPL, is a Linux kernel module which provides a -compatibility layer used by the [ZFS on Linux](http://zfsonlinux.org) project. +compatibility layer used by the [OpenZFS](https://github.com/openzfs/zfs) project. # Installation diff --git a/module/spl/THIRDPARTYLICENSE.gplv2 b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 similarity index 100% rename from module/spl/THIRDPARTYLICENSE.gplv2 rename to module/os/linux/spl/THIRDPARTYLICENSE.gplv2 diff --git a/module/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip similarity index 100% rename from module/spl/THIRDPARTYLICENSE.gplv2.descrip rename to module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip diff --git a/module/spl/spl-atomic.c b/module/os/linux/spl/spl-atomic.c similarity index 96% rename from module/spl/spl-atomic.c rename to module/os/linux/spl/spl-atomic.c index 47ed1886e1..accf656fbc 100644 --- a/module/spl/spl-atomic.c +++ b/module/os/linux/spl/spl-atomic.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/module/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c similarity index 76% rename from module/spl/spl-condvar.c rename to module/os/linux/spl/spl-condvar.c index a7a9d1db9a..d0461a9f12 100644 --- a/module/spl/spl-condvar.c +++ b/module/os/linux/spl/spl-condvar.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -26,8 +25,44 @@ #include #include +#include #include #include +#include + +#include + +#ifdef HAVE_SCHED_SIGNAL_HEADER +#include +#endif + +#define MAX_HRTIMEOUT_SLACK_US 1000 +unsigned int spl_schedule_hrtimeout_slack_us = 0; + +static int +param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + int error; + + error = kstrtoul(buf, 0, &val); + if (error) + return (error); + + if (val > MAX_HRTIMEOUT_SLACK_US) + return (-EINVAL); + + error = param_set_uint(buf, kp); + if (error < 0) + return (error); + + return (0); +} + +module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack, + param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644); +MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us, + "schedule_hrtimeout_range() delta/slack value in us, default(0)"); void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) @@ -144,13 +179,36 @@ __cv_wait_io(kcondvar_t *cvp, kmutex_t *mp) } EXPORT_SYMBOL(__cv_wait_io); -void +int +__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1); + + return (signal_pending(current) ? 
0 : 1); +} +EXPORT_SYMBOL(__cv_wait_io_sig); + +int __cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) { cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0); + + return (signal_pending(current) ? 0 : 1); } EXPORT_SYMBOL(__cv_wait_sig); +void +__cv_wait_idle(kcondvar_t *cvp, kmutex_t *mp) +{ + sigset_t blocked, saved; + + sigfillset(&blocked); + (void) sigprocmask(SIG_BLOCK, &blocked, &saved); + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0); + (void) sigprocmask(SIG_SETMASK, &saved, NULL); +} +EXPORT_SYMBOL(__cv_wait_idle); + #if defined(HAVE_IO_SCHEDULE_TIMEOUT) #define spl_io_schedule_timeout(t) io_schedule_timeout(t) #else @@ -254,10 +312,10 @@ __cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time, * with a thread holding the mutex and call cv_destroy. */ mutex_enter(mp); - return (time_left > 0 ? time_left : -1); + return (time_left > 0 ? 1 : -1); } -clock_t +int __cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) { return (__cv_timedwait_common(cvp, mp, exp_time, @@ -265,7 +323,7 @@ __cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) } EXPORT_SYMBOL(__cv_timedwait); -clock_t +int __cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) { return (__cv_timedwait_common(cvp, mp, exp_time, @@ -273,26 +331,45 @@ __cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) } EXPORT_SYMBOL(__cv_timedwait_io); -clock_t +int __cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) { - return (__cv_timedwait_common(cvp, mp, exp_time, - TASK_INTERRUPTIBLE, 0)); + int rc; + + rc = __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE, 0); + return (signal_pending(current) ? 
0 : rc); } EXPORT_SYMBOL(__cv_timedwait_sig); +int +__cv_timedwait_idle(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + sigset_t blocked, saved; + int rc; + + sigfillset(&blocked); + (void) sigprocmask(SIG_BLOCK, &blocked, &saved); + rc = __cv_timedwait_common(cvp, mp, exp_time, + TASK_INTERRUPTIBLE, 0); + (void) sigprocmask(SIG_SETMASK, &saved, NULL); + + return (rc); +} +EXPORT_SYMBOL(__cv_timedwait_idle); /* * 'expire_time' argument is an absolute clock time in nanoseconds. * Return value is time left (expire_time - now) or -1 if timeout occurred. */ static clock_t __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, - int state) + hrtime_t res, int state) { DEFINE_WAIT(wait); kmutex_t *m; hrtime_t time_left; ktime_t ktime_left; + u64 slack = 0; + int rc; ASSERT(cvp); ASSERT(mp); @@ -319,13 +396,11 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, * race where 'cvp->cv_waiters > 0' but the list is empty. */ mutex_exit(mp); - /* - * Allow a 100 us range to give kernel an opportunity to coalesce - * interrupts - */ + ktime_left = ktime_set(0, time_left); - schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC, - HRTIMER_MODE_REL); + slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC), + MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC); + rc = schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL); /* No more waiters a different mutex could be used */ if (atomic_dec_and_test(&cvp->cv_waiters)) { @@ -341,33 +416,23 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, atomic_dec(&cvp->cv_refs); mutex_enter(mp); - time_left = expire_time - gethrtime(); - return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1); + return (rc == -EINTR ? 1 : -1); } /* * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. 
*/ -static clock_t +static int cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag, int state) { - if (res > 1) { - /* - * Align expiration to the specified resolution. - */ - if (flag & CALLOUT_FLAG_ROUNDUP) - tim += res - 1; - tim = (tim / res) * res; - } - if (!(flag & CALLOUT_FLAG_ABSOLUTE)) tim += gethrtime(); - return (__cv_timedwait_hires(cvp, mp, tim, state)); + return (__cv_timedwait_hires(cvp, mp, tim, res, state)); } -clock_t +int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { @@ -376,15 +441,35 @@ cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, } EXPORT_SYMBOL(cv_timedwait_hires); -clock_t +int cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { - return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, - TASK_INTERRUPTIBLE)); + int rc; + + rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE); + return (signal_pending(current) ? 0 : rc); } EXPORT_SYMBOL(cv_timedwait_sig_hires); +int +cv_timedwait_idle_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + sigset_t blocked, saved; + int rc; + + sigfillset(&blocked); + (void) sigprocmask(SIG_BLOCK, &blocked, &saved); + rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE); + (void) sigprocmask(SIG_SETMASK, &saved, NULL); + + return (rc); +} +EXPORT_SYMBOL(cv_timedwait_idle_hires); + void __cv_signal(kcondvar_t *cvp) { @@ -394,8 +479,8 @@ __cv_signal(kcondvar_t *cvp) /* * All waiters are added with WQ_FLAG_EXCLUSIVE so only one - * waiter will be set runable with each call to wake_up(). - * Additionally wake_up() holds a spin_lock assoicated with + * waiter will be set runnable with each call to wake_up(). + * Additionally wake_up() holds a spin_lock associated with * the wait queue to ensure we don't race waking up processes. 
*/ if (atomic_read(&cvp->cv_waiters) > 0) diff --git a/module/spl/spl-cred.c b/module/os/linux/spl/spl-cred.c similarity index 96% rename from module/spl/spl-cred.c rename to module/os/linux/spl/spl-cred.c index ea3e903f90..8fe1cc30ba 100644 --- a/module/spl/spl-cred.c +++ b/module/os/linux/spl/spl-cred.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -27,11 +26,7 @@ #include static int -#ifdef HAVE_KUIDGID_T cr_groups_search(const struct group_info *group_info, kgid_t grp) -#else -cr_groups_search(const struct group_info *group_info, gid_t grp) -#endif { unsigned int left, right, mid; int cmp; diff --git a/module/spl/spl-err.c b/module/os/linux/spl/spl-err.c similarity index 98% rename from module/spl/spl-err.c rename to module/os/linux/spl/spl-err.c index 3c0bb71c06..10b768d573 100644 --- a/module/spl/spl-err.c +++ b/module/os/linux/spl/spl-err.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/module/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c similarity index 87% rename from module/spl/spl-generic.c rename to module/os/linux/spl/spl-generic.c index cd2fa20205..5ea4fc6351 100644 --- a/module/spl/spl-generic.c +++ b/module/os/linux/spl/spl-generic.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -27,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -40,12 +38,16 @@ #include #include #include +#include #include #include #include #include #include #include "zfs_gitrev.h" +#include +#include +#include char spl_gitrev[64] = ZFS_META_GITREV; @@ -79,7 +81,7 @@ EXPORT_SYMBOL(p0); * to generate words larger than 128 bits will paradoxically be limited to * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` * 128-bit words and selecting the first will implicitly select the second. If - * a caller finds this behavior undesireable, random_get_bytes() should be used + * a caller finds this behavior undesirable, random_get_bytes() should be used * instead. * * XXX: Linux interrupt handlers that trigger within the critical section @@ -90,7 +92,7 @@ EXPORT_SYMBOL(p0); * and use them when in_interrupt() from linux/preempt_mask.h evaluates to * true. */ -static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy); +void __percpu *spl_pseudo_entropy; /* * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed @@ -139,7 +141,7 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) ASSERT(ptr); - xp = get_cpu_var(spl_pseudo_entropy); + xp = get_cpu_ptr(spl_pseudo_entropy); s[0] = xp[0]; s[1] = xp[1]; @@ -161,7 +163,7 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) xp[0] = s[0]; xp[1] = s[1]; - put_cpu_var(spl_pseudo_entropy); + put_cpu_ptr(spl_pseudo_entropy); return (0); } @@ -170,6 +172,7 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) EXPORT_SYMBOL(random_get_pseudo_bytes); #if BITS_PER_LONG == 32 + /* * Support 64/64 => 64 division on a 32-bit platform. 
While the kernel * provides a div64_u64() function for this we do not use it because the @@ -207,7 +210,7 @@ nlz64(uint64_t x) /* * Newer kernels have a div_u64() function but we define our own - * to simplify portibility between kernel versions. + * to simplify portability between kernel versions. */ static inline uint64_t __div_u64(uint64_t u, uint32_t v) @@ -216,6 +219,14 @@ __div_u64(uint64_t u, uint32_t v) return (u); } +/* + * Turn off missing prototypes warning for these functions. They are + * replacements for libgcc-provided functions and will never be called + * directly. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-prototypes" + /* * Implementation of 64-bit unsigned division for 32-bit machines. * @@ -289,6 +300,26 @@ __umoddi3(uint64_t dividend, uint64_t divisor) } EXPORT_SYMBOL(__umoddi3); +/* 64-bit signed modulo for 32-bit machines. */ +int64_t +__moddi3(int64_t n, int64_t d) +{ + int64_t q; + boolean_t nn = B_FALSE; + + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) + d = -d; + + q = __umoddi3(n, d); + + return (nn ? -q : q); +} +EXPORT_SYMBOL(__moddi3); + /* * Implementation of 64-bit unsigned division/modulo for 32-bit machines. 
*/ @@ -392,6 +423,9 @@ __aeabi_ldivmod(int64_t u, int64_t v) } EXPORT_SYMBOL(__aeabi_ldivmod); #endif /* __arm || __arm__ */ + +#pragma GCC diagnostic pop + #endif /* BITS_PER_LONG */ /* @@ -519,6 +553,50 @@ ddi_copyout(const void *from, void *to, size_t len, int flags) } EXPORT_SYMBOL(ddi_copyout); +static ssize_t +spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_READ_PPOS) + return (kernel_read(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(KERNEL_DS); + + ret = vfs_read(file, (void __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +static int +spl_getattr(struct file *filp, struct kstat *stat) +{ + int rc; + + ASSERT(filp); + ASSERT(stat); + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&filp->f_path, stat); +#elif defined(HAVE_3ARGS_VFS_GETATTR) + rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, stat); +#else +#error "No available vfs_getattr()" +#endif + if (rc) + return (-rc); + + return (0); +} + /* * Read the unique system identifier from the /etc/hostid file. 
* @@ -562,38 +640,43 @@ static int hostid_read(uint32_t *hostid) { uint64_t size; - struct _buf *file; uint32_t value = 0; int error; + loff_t off; + struct file *filp; + struct kstat stat; - file = kobj_open_file(spl_hostid_path); - if (file == (struct _buf *)-1) + filp = filp_open(spl_hostid_path, 0, 0); + + if (IS_ERR(filp)) return (ENOENT); - error = kobj_get_filesize(file, &size); + error = spl_getattr(filp, &stat); if (error) { - kobj_close_file(file); + filp_close(filp, 0); return (error); } - + size = stat.size; + // cppcheck-suppress sizeofwithnumericparameter if (size < sizeof (HW_HOSTID_MASK)) { - kobj_close_file(file); + filp_close(filp, 0); return (EINVAL); } + off = 0; /* * Read directly into the variable like eglibc does. * Short reads are okay; native behavior is preserved. */ - error = kobj_read_file(file, (char *)&value, sizeof (value), 0); + error = spl_kernel_read(filp, &value, sizeof (value), &off); if (error < 0) { - kobj_close_file(file); + filp_close(filp, 0); return (EIO); } /* Mask down to 32 bits like coreutils does. 
*/ *hostid = (value & HW_HOSTID_MASK); - kobj_close_file(file); + filp_close(filp, 0); return (0); } @@ -649,7 +732,10 @@ static void __init spl_random_init(void) { uint64_t s[2]; - int i; + int i = 0; + + spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t), + sizeof (uint64_t)); get_random_bytes(s, sizeof (s)); @@ -667,7 +753,7 @@ spl_random_init(void) } for_each_possible_cpu(i) { - uint64_t *wordp = per_cpu(spl_pseudo_entropy, i); + uint64_t *wordp = per_cpu_ptr(spl_pseudo_entropy, i); spl_rand_jump(s); @@ -676,6 +762,12 @@ spl_random_init(void) } } +static void +spl_random_fini(void) +{ + free_percpu(spl_pseudo_entropy); +} + static void spl_kvmem_fini(void) { @@ -694,51 +786,36 @@ spl_init(void) if ((rc = spl_kvmem_init())) goto out1; - if ((rc = spl_mutex_init())) + if ((rc = spl_tsd_init())) goto out2; - if ((rc = spl_rw_init())) + if ((rc = spl_taskq_init())) goto out3; - if ((rc = spl_tsd_init())) + if ((rc = spl_kmem_cache_init())) goto out4; - if ((rc = spl_taskq_init())) + if ((rc = spl_proc_init())) goto out5; - if ((rc = spl_kmem_cache_init())) + if ((rc = spl_kstat_init())) goto out6; - if ((rc = spl_vn_init())) - goto out7; - - if ((rc = spl_proc_init())) - goto out8; - - if ((rc = spl_kstat_init())) - goto out9; - if ((rc = spl_zlib_init())) - goto out10; + goto out7; return (rc); -out10: - spl_kstat_fini(); -out9: - spl_proc_fini(); -out8: - spl_vn_fini(); out7: - spl_kmem_cache_fini(); + spl_kstat_fini(); out6: - spl_taskq_fini(); + spl_proc_fini(); out5: - spl_tsd_fini(); + spl_kmem_cache_fini(); out4: - spl_rw_fini(); + spl_taskq_fini(); out3: - spl_mutex_fini(); + spl_tsd_fini(); out2: spl_kvmem_fini(); out1: @@ -751,19 +828,17 @@ spl_fini(void) spl_zlib_fini(); spl_kstat_fini(); spl_proc_fini(); - spl_vn_fini(); spl_kmem_cache_fini(); spl_taskq_fini(); spl_tsd_fini(); - spl_rw_fini(); - spl_mutex_fini(); spl_kvmem_fini(); + spl_random_fini(); } module_init(spl_init); module_exit(spl_fini); -MODULE_DESCRIPTION("Solaris Porting Layer"); 
-MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE("GPL"); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +ZFS_MODULE_DESCRIPTION("Solaris Porting Layer"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE("GPL"); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/module/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c similarity index 71% rename from module/spl/spl-kmem-cache.c rename to module/os/linux/spl/spl-kmem-cache.c index 44e112cccb..2151ef008f 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -22,9 +21,9 @@ * with the SPL. If not, see . */ +#include #include #include -#include #include #include #include @@ -57,20 +56,7 @@ #define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) #endif -/* - * Cache expiration was implemented because it was part of the default Solaris - * kmem_cache behavior. The idea is that per-cpu objects which haven't been - * accessed in several seconds should be returned to the cache. On the other - * hand Linux slabs never move objects back to the slabs unless there is - * memory pressure on the system. By default the Linux method is enabled - * because it has been shown to improve responsiveness on low memory systems. - * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. 
- */ /* BEGIN CSTYLED */ -unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; -EXPORT_SYMBOL(spl_kmem_cache_expire); -module_param(spl_kmem_cache_expire, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); /* * Cache magazines are an optimization designed to minimize the cost of @@ -106,11 +92,6 @@ unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); -unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; -module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, - "Minimal number of objects per slab"); - unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; module_param(spl_kmem_cache_max_size, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); @@ -119,28 +100,14 @@ MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); * For small objects the Linux slab allocator should be used to make the most * efficient use of the memory. However, large objects are not supported by * the Linux slab and therefore the SPL implementation is preferred. A cutoff - * of 16K was determined to be optimal for architectures using 4K pages. + * of 16K was determined to be optimal for architectures using 4K pages and + * to also work well on architectures using larger 64K page sizes. */ -#if PAGE_SIZE == 4096 unsigned int spl_kmem_cache_slab_limit = 16384; -#else -unsigned int spl_kmem_cache_slab_limit = 0; -#endif module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); -/* - * This value defaults to a threshold designed to avoid allocations which - * have been deemed costly by the kernel. 
- */ -unsigned int spl_kmem_cache_kmem_limit = - ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) / - SPL_KMEM_CACHE_OBJ_PER_SLAB; -module_param(spl_kmem_cache_kmem_limit, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, - "Objects less than N bytes use the kmalloc"); - /* * The number of threads available to allocate new slabs for caches. This * should not need to be tuned but it is available for performance analysis. @@ -185,26 +152,17 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ -taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); -SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); -SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, - spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); - static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { gfp_t lflags = kmem_flags_convert(flags); void *ptr; - if (skc->skc_flags & KMC_KMEM) { - ASSERT(ISP2(size)); - ptr = (void *)__get_free_pages(lflags, get_order(size)); - } else { - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); - } + ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); @@ -227,12 +185,7 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; - if (skc->skc_flags & KMC_KMEM) { - ASSERT(ISP2(size)); - free_pages((unsigned long)ptr, get_order(size)); - } else { - vfree(ptr); - } + vfree(ptr); } /* @@ -257,6 +210,20 @@ spl_obj_size(spl_kmem_cache_t *skc) P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); } +uint64_t +spl_kmem_cache_inuse(kmem_cache_t *cache) +{ + return (cache->skc_obj_total); +} 
+EXPORT_SYMBOL(spl_kmem_cache_inuse); + +uint64_t +spl_kmem_cache_entry_size(kmem_cache_t *cache) +{ + return (cache->skc_obj_size); +} +EXPORT_SYMBOL(spl_kmem_cache_entry_size); + /* * Lookup the spl_kmem_object_t for an object given that object. */ @@ -267,16 +234,6 @@ spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) skc->skc_obj_align, uint32_t)); } -/* - * Required space for each offslab object taking in to account alignment - * restrictions and the power-of-two requirement of kv_alloc(). - */ -static inline uint32_t -spl_offslab_size(spl_kmem_cache_t *skc) -{ - return (1UL << (fls64(spl_obj_size(skc)) + 1)); -} - /* * It's important that we pack the spl_kmem_obj_t structure and the * actual objects in to one large address space to minimize the number @@ -297,25 +254,21 @@ spl_offslab_size(spl_kmem_cache_t *skc) * different allocation functions for small and large objects should * give us the best of both worlds. * - * KMC_ONSLAB KMC_OFFSLAB - * - * +------------------------+ +-----------------+ - * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ - * | skc_obj_size <-+ | | +-----------------+ | | - * | spl_kmem_obj_t | | | | - * | skc_obj_size <---+ | +-----------------+ | | - * | spl_kmem_obj_t | | | skc_obj_size | <-+ | - * | ... v | | spl_kmem_obj_t | | - * +------------------------+ +-----------------+ v + * +------------------------+ + * | spl_kmem_slab_t --+-+ | + * | skc_obj_size <-+ | | + * | spl_kmem_obj_t | | + * | skc_obj_size <---+ | + * | spl_kmem_obj_t | | + * | ... 
v | + * +------------------------+ */ static spl_kmem_slab_t * spl_slab_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; - spl_kmem_obj_t *sko, *n; - void *base, *obj; - uint32_t obj_size, offslab_size = 0; - int i, rc = 0; + void *base; + uint32_t obj_size; base = kv_alloc(skc, skc->skc_slab_size, flags); if (base == NULL) @@ -331,22 +284,11 @@ spl_slab_alloc(spl_kmem_cache_t *skc, int flags) sks->sks_ref = 0; obj_size = spl_obj_size(skc); - if (skc->skc_flags & KMC_OFFSLAB) - offslab_size = spl_offslab_size(skc); - - for (i = 0; i < sks->sks_objs; i++) { - if (skc->skc_flags & KMC_OFFSLAB) { - obj = kv_alloc(skc, offslab_size, flags); - if (!obj) { - rc = -ENOMEM; - goto out; - } - } else { - obj = base + spl_sks_size(skc) + (i * obj_size); - } + for (int i = 0; i < sks->sks_objs; i++) { + void *obj = base + spl_sks_size(skc) + (i * obj_size); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); - sko = spl_sko_from_obj(skc, obj); + spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj); sko->sko_addr = obj; sko->sko_magic = SKO_MAGIC; sko->sko_slab = sks; @@ -354,18 +296,6 @@ spl_slab_alloc(spl_kmem_cache_t *skc, int flags) list_add_tail(&sko->sko_list, &sks->sks_free_list); } -out: - if (rc) { - if (skc->skc_flags & KMC_OFFSLAB) - list_for_each_entry_safe(sko, - n, &sks->sks_free_list, sko_list) { - kv_free(skc, sko->sko_addr, offslab_size); - } - - kv_free(skc, base, skc->skc_slab_size); - sks = NULL; - } - return (sks); } @@ -405,11 +335,10 @@ spl_slab_free(spl_kmem_slab_t *sks, static void spl_slab_reclaim(spl_kmem_cache_t *skc) { - spl_kmem_slab_t *sks, *m; - spl_kmem_obj_t *sko, *n; + spl_kmem_slab_t *sks = NULL, *m = NULL; + spl_kmem_obj_t *sko = NULL, *n = NULL; LIST_HEAD(sks_list); LIST_HEAD(sko_list); - uint32_t size = 0; /* * Empty slabs and objects must be moved to a private list so they @@ -429,21 +358,15 @@ spl_slab_reclaim(spl_kmem_cache_t *skc) spin_unlock(&skc->skc_lock); /* - * The following two loops ensure all the object destructors are - 
* run, any offslab objects are freed, and the slabs themselves - * are freed. This is all done outside the skc->skc_lock since - * this allows the destructor to sleep, and allows us to perform - * a conditional reschedule when a freeing a large number of - * objects and slabs back to the system. + * The following two loops ensure all the object destructors are run, + * and the slabs themselves are freed. This is all done outside the + * skc->skc_lock since this allows the destructor to sleep, and + * allows us to perform a conditional reschedule when freeing a + * large number of objects and slabs back to the system. */ - if (skc->skc_flags & KMC_OFFSLAB) - size = spl_offslab_size(skc); list_for_each_entry_safe(sko, n, &sko_list, sko_list) { ASSERT(sko->sko_magic == SKO_MAGIC); - - if (skc->skc_flags & KMC_OFFSLAB) - kv_free(skc, sko->sko_addr, size); } list_for_each_entry_safe(sks, m, &sks_list, sks_list) { @@ -579,148 +502,45 @@ spl_emergency_free(spl_kmem_cache_t *skc, void *obj) * argument contains the max number of entries to remove from the magazine. 
*/ static void -__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) { - int i, count = MIN(flush, skm->skm_avail); + spin_lock(&skc->skc_lock); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); - for (i = 0; i < count; i++) + int count = MIN(flush, skm->skm_avail); + for (int i = 0; i < count; i++) spl_cache_shrink(skc, skm->skm_objs[i]); skm->skm_avail -= count; memmove(skm->skm_objs, &(skm->skm_objs[count]), sizeof (void *) * skm->skm_avail); -} -static void -spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) -{ - spin_lock(&skc->skc_lock); - __spl_cache_flush(skc, skm, flush); spin_unlock(&skc->skc_lock); } -static void -spl_magazine_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; - - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_cpu == smp_processor_id()); - ASSERT(irqs_disabled()); - - /* There are no available objects or they are too young to age out */ - if ((skm->skm_avail == 0) || - time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) - return; - - /* - * Because we're executing in interrupt context we may have - * interrupted the holder of this lock. To avoid a potential - * deadlock return if the lock is contended. - */ - if (!spin_trylock(&skc->skc_lock)) - return; - - __spl_cache_flush(skc, skm, skm->skm_refill); - spin_unlock(&skc->skc_lock); -} - -/* - * Called regularly to keep a downward pressure on the cache. - * - * Objects older than skc->skc_delay seconds in the per-cpu magazines will - * be returned to the caches. This is done to prevent idle magazines from - * holding memory which could be better used elsewhere. The delay is - * present to prevent thrashing the magazine. - * - * The newly released objects may result in empty partial slabs. Those - * slabs should be released to the system. 
Otherwise moving the objects - * out of the magazines is just wasted work. - */ -static void -spl_cache_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - taskqid_t id = 0; - - ASSERT(skc->skc_magic == SKC_MAGIC); - - /* Dynamically disabled at run time */ - if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) - return; - - atomic_inc(&skc->skc_ref); - - if (!(skc->skc_flags & KMC_NOMAGAZINE)) - on_each_cpu(spl_magazine_age, skc, 1); - - spl_slab_reclaim(skc); - - while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { - id = taskq_dispatch_delay( - spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); - - /* Destroy issued after dispatch immediately cancel it */ - if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) - taskq_cancel_id(spl_kmem_cache_taskq, id); - } - - spin_lock(&skc->skc_lock); - skc->skc_taskqid = id; - spin_unlock(&skc->skc_lock); - - atomic_dec(&skc->skc_ref); -} - /* * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, * for very small objects we may end up with more than this so as not - * to waste space in the minimal allocation of a single page. Also for - * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, - * lower than this and we will fail. + * to waste space in the minimal allocation of a single page. 
*/ static int spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) { uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; - if (skc->skc_flags & KMC_OFFSLAB) { - tgt_objs = spl_kmem_cache_obj_per_slab; - tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); - if ((skc->skc_flags & KMC_KMEM) && - (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) - return (-ENOSPC); + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; } else { - sks_size = spl_sks_size(skc); - obj_size = spl_obj_size(skc); - max_size = (spl_kmem_cache_max_size * 1024 * 1024); - tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); - - /* - * KMC_KMEM slabs are allocated by __get_free_pages() which - * rounds up to the nearest order. Knowing this the size - * should be rounded up to the next power of two with a hard - * maximum defined by the maximum allowed allocation order. 
- */ - if (skc->skc_flags & KMC_KMEM) { - max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; - tgt_size = MIN(max_size, - PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); - } - - if (tgt_size <= max_size) { - tgt_objs = (tgt_size - sks_size) / obj_size; - } else { - tgt_objs = (max_size - sks_size) / obj_size; - tgt_size = (tgt_objs * obj_size) + sks_size; - } + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; } if (tgt_objs == 0) @@ -778,7 +598,6 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) skm->skm_size = skc->skc_mag_size; skm->skm_refill = skc->skc_mag_refill; skm->skm_cache = skc; - skm->skm_age = jiffies; skm->skm_cpu = cpu; } @@ -802,10 +621,9 @@ spl_magazine_free(spl_kmem_magazine_t *skm) static int spl_magazine_create(spl_kmem_cache_t *skc) { - int i; + int i = 0; - if (skc->skc_flags & KMC_NOMAGAZINE) - return (0); + ASSERT((skc->skc_flags & KMC_SLAB) == 0); skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); @@ -833,10 +651,9 @@ static void spl_magazine_destroy(spl_kmem_cache_t *skc) { spl_kmem_magazine_t *skm; - int i; + int i = 0; - if (skc->skc_flags & KMC_NOMAGAZINE) - return; + ASSERT((skc->skc_flags & KMC_SLAB) == 0); for_each_possible_cpu(i) { skm = skc->skc_mag[i]; @@ -858,19 +675,13 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) * priv cache private data for ctor/dtor/reclaim * vmp unused must be NULL * flags - * KMC_KMEM Force SPL kmem backed cache - * KMC_VMEM Force SPL vmem backed cache + * KMC_KVMEM Force kvmem backed SPL cache * KMC_SLAB Force Linux slab backed cache - * KMC_OFFSLAB Locate objects off the slab - * KMC_NOTOUCH unsupported - * KMC_NODEBUG unsupported - * KMC_NOHASH unsupported - * KMC_QCACHE unsupported - * KMC_NOMAGAZINE unsupported + * KMC_NODEBUG Disable debugging (unsupported) */ spl_kmem_cache_t * spl_kmem_cache_create(char *name, size_t size, size_t align, - spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, 
spl_kmem_reclaim_t reclaim, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags) { gfp_t lflags = kmem_flags_convert(KM_SLEEP); @@ -880,10 +691,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, /* * Unsupported flags */ - ASSERT0(flags & KMC_NOMAGAZINE); - ASSERT0(flags & KMC_NOHASH); - ASSERT0(flags & KMC_QCACHE); ASSERT(vmp == NULL); + ASSERT(reclaim == NULL); might_sleep(); @@ -902,15 +711,12 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, skc->skc_ctor = ctor; skc->skc_dtor = dtor; - skc->skc_reclaim = reclaim; skc->skc_private = priv; skc->skc_vmp = vmp; skc->skc_linux_cache = NULL; skc->skc_flags = flags; skc->skc_obj_size = size; skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; - skc->skc_delay = SPL_KMEM_CACHE_DELAY; - skc->skc_reap = SPL_KMEM_CACHE_REAP; atomic_set(&skc->skc_ref, 0); INIT_LIST_HEAD(&skc->skc_list); @@ -932,6 +738,13 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, skc->skc_obj_emergency = 0; skc->skc_obj_emergency_max = 0; + rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0, + GFP_KERNEL); + if (rc != 0) { + kfree(skc); + return (NULL); + } + /* * Verify the requested alignment restriction is sane. */ @@ -947,8 +760,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, * linuxslab) then select a cache type based on the object size * and default tunables. */ - if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { - + if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) { if (spl_kmem_cache_slab_limit && size <= (size_t)spl_kmem_cache_slab_limit) { /* @@ -956,26 +768,19 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, * use the Linux slab for better space-efficiency. */ skc->skc_flags |= KMC_SLAB; - } else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) { - /* - * Small objects, less than spl_kmem_cache_kmem_limit - * per object should use kmem because their slabs are - * small. 
- */ - skc->skc_flags |= KMC_KMEM; } else { /* * All other objects are considered large and are - * placed on vmem backed slabs. + * placed on kvmem backed slabs. */ - skc->skc_flags |= KMC_VMEM; + skc->skc_flags |= KMC_KVMEM; } } /* * Given the type of slab allocate the required resources. */ - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + if (skc->skc_flags & KMC_KVMEM) { rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size); if (rc) @@ -995,7 +800,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, #if defined(SLAB_USERCOPY) /* * Required for PAX-enabled kernels if the slab is to be - * used for coping between user and kernel space. + * used for copying between user and kernel space. */ slabflags |= SLAB_USERCOPY; #endif @@ -1015,19 +820,6 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, rc = ENOMEM; goto out; } - -#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) - skc->skc_linux_cache->allocflags |= __GFP_COMP; -#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) - skc->skc_linux_cache->gfpflags |= __GFP_COMP; -#endif - skc->skc_flags |= KMC_NOMAGAZINE; - } - - if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) { - skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, - spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); } down_write(&spl_kmem_cache_sem); @@ -1037,6 +829,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, return (skc); out: kfree(skc->skc_name); + percpu_counter_destroy(&skc->skc_linux_alloc); kfree(skc); return (NULL); } @@ -1064,7 +857,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) taskqid_t id; ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); + ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB)); down_write(&spl_kmem_cache_sem); list_del_init(&skc->skc_list); @@ -1086,7 +879,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) */ wait_event(wq, atomic_read(&skc->skc_ref) == 0); - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + if 
(skc->skc_flags & KMC_KVMEM) { spl_magazine_destroy(skc); spl_slab_reclaim(skc); } else { @@ -1107,6 +900,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); + ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); + percpu_counter_destroy(&skc->skc_linux_alloc); + spin_unlock(&skc->skc_lock); kfree(skc->skc_name); @@ -1175,7 +971,6 @@ __spl_cache_grow(spl_kmem_cache_t *skc, int flags) smp_mb__before_atomic(); clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); smp_mb__after_atomic(); - wake_up_all(&skc->skc_waitq); } spin_unlock(&skc->skc_lock); @@ -1188,12 +983,14 @@ spl_cache_grow_work(void *data) spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; spl_kmem_cache_t *skc = ska->ska_cache; - (void) __spl_cache_grow(skc, ska->ska_flags); + int error = __spl_cache_grow(skc, ska->ska_flags); atomic_dec(&skc->skc_ref); smp_mb__before_atomic(); clear_bit(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); + if (error == 0) + wake_up_all(&skc->skc_waitq); kfree(ska); } @@ -1234,19 +1031,13 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) } /* - * To reduce the overhead of context switch and improve NUMA locality, - * it tries to allocate a new slab in the current process context with - * KM_NOSLEEP flag. If it fails, it will launch a new taskq to do the - * allocation. + * Note: It would be nice to reduce the overhead of context switch + * and improve NUMA locality, by trying to allocate a new slab in the + * current process context with KM_NOSLEEP flag. * - * However, this can't be applied to KVM_VMEM due to a bug that - * __vmalloc() doesn't honor gfp flags in page table allocation. + * However, this can't be applied to vmem/kvmem due to a bug that + * spl_vmalloc() doesn't honor gfp flags in page table allocation. 
*/ - if (!(skc->skc_flags & KMC_VMEM)) { - rc = __spl_cache_grow(skc, flags | KM_NOSLEEP); - if (rc == 0) - return (0); - } /* * This is handled by dispatching a work request to the global work @@ -1453,6 +1244,15 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); + if (obj != NULL) { + /* + * Even though we leave everything up to the + * underlying cache we still keep track of + * how many objects we've allocated in it for + * better debuggability. + */ + percpu_counter_inc(&skc->skc_linux_alloc); + } goto ret; } @@ -1471,7 +1271,6 @@ restart: if (likely(skm->skm_avail)) { /* Object available in CPU cache, use it */ obj = skm->skm_objs[--skm->skm_avail]; - skm->skm_age = jiffies; } else { obj = spl_cache_refill(skc, skm, flags); if ((obj == NULL) && !(flags & KM_NOSLEEP)) @@ -1526,6 +1325,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) */ if (skc->skc_flags & KMC_SLAB) { kmem_cache_free(skc->skc_linux_cache, obj); + percpu_counter_dec(&skc->skc_linux_alloc); return; } @@ -1576,101 +1376,22 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) EXPORT_SYMBOL(spl_kmem_cache_free); /* - * The generic shrinker function for all caches. Under Linux a shrinker - * may not be tightly coupled with a slab cache. In fact Linux always - * systematically tries calling all registered shrinker callbacks which - * report that they contain unused objects. Because of this we only - * register one shrinker function in the shim layer for all slab caches. - * We always attempt to shrink all caches when this generic shrinker - * is called. - * - * If sc->nr_to_scan is zero, the caller is requesting a query of the - * number of objects which can potentially be freed. If it is nonzero, - * the request is to free that many objects. 
- * - * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks - * in struct shrinker and also require the shrinker to return the number - * of objects freed. - * - * Older kernels require the shrinker to return the number of freeable - * objects following the freeing of nr_to_free. - * - * Linux semantics differ from those under Solaris, which are to - * free all available objects which may (and probably will) be more - * objects than the requested nr_to_scan. - */ -static spl_shrinker_t -__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, - struct shrink_control *sc) -{ - spl_kmem_cache_t *skc; - int alloc = 0; - - /* - * No shrinking in a transaction context. Can cause deadlocks. - */ - if (sc->nr_to_scan && spl_fstrans_check()) - return (SHRINK_STOP); - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - if (sc->nr_to_scan) { -#ifdef HAVE_SPLIT_SHRINKER_CALLBACK - uint64_t oldalloc = skc->skc_obj_alloc; - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); - if (oldalloc > skc->skc_obj_alloc) - alloc += oldalloc - skc->skc_obj_alloc; -#else - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); - alloc += skc->skc_obj_alloc; -#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ - } else { - /* Request to query number of freeable objects */ - alloc += skc->skc_obj_alloc; - } - } - up_read(&spl_kmem_cache_sem); - - /* - * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. - * This functionality only exists to work around a rare issue where - * shrink_slabs() is repeatedly invoked by many cores causing the - * system to thrash. - */ - if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) - return (SHRINK_STOP); - - return (MAX(alloc, 0)); -} - -SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); - -/* - * Call the registered reclaim function for a cache. 
Depending on how - * many and which objects are released it may simply repopulate the - * local magazine which will then need to age-out. Objects which cannot - * fit in the magazine we will be released back to their slabs which will - * also need to age out before being release. This is all just best - * effort and we do not want to thrash creating and destroying slabs. + * Depending on how many and which objects are released it may simply + * repopulate the local magazine which will then need to age-out. Objects + * which cannot fit in the magazine will be released back to their slabs + * which will also need to age out before being released. This is all just + * best effort and we do not want to thrash creating and destroying slabs. */ void -spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) { ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - atomic_inc(&skc->skc_ref); + if (skc->skc_flags & KMC_SLAB) + return; - /* - * Execute the registered reclaim callback if it exists. - */ - if (skc->skc_flags & KMC_SLAB) { - if (skc->skc_reclaim) - skc->skc_reclaim(skc->skc_private); - goto out; - } + atomic_inc(&skc->skc_ref); /* * Prevent concurrent cache reaping when contended. @@ -1678,49 +1399,12 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) goto out; - /* - * When a reclaim function is available it may be invoked repeatedly - * until at least a single slab can be freed. This ensures that we - * do free memory back to the system. This helps minimize the chance - * of an OOM event when the bulk of memory is used by the slab. - * - * When free slabs are already available the reclaim callback will be - * skipped. Additionally, if no forward progress is detected despite - * a reclaim function the cache will be skipped to avoid deadlock. 
- * - * Longer term this would be the correct place to add the code which - * repacks the slabs in order minimize fragmentation. - */ - if (skc->skc_reclaim) { - uint64_t objects = UINT64_MAX; - int do_reclaim; - - do { - spin_lock(&skc->skc_lock); - do_reclaim = - (skc->skc_slab_total > 0) && - ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && - (skc->skc_obj_alloc < objects); - - objects = skc->skc_obj_alloc; - spin_unlock(&skc->skc_lock); - - if (do_reclaim) - skc->skc_reclaim(skc->skc_private); - - } while (do_reclaim); - } - /* Reclaim from the magazine and free all now empty slabs. */ - if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { - spl_kmem_magazine_t *skm; - unsigned long irq_flags; - - local_irq_save(irq_flags); - skm = skc->skc_mag[smp_processor_id()]; - spl_cache_flush(skc, skm, skm->skm_avail); - local_irq_restore(irq_flags); - } + unsigned long irq_flags; + local_irq_save(irq_flags); + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); spl_slab_reclaim(skc); clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); @@ -1749,12 +1433,13 @@ EXPORT_SYMBOL(spl_kmem_cache_reap_active); void spl_kmem_reap(void) { - struct shrink_control sc; + spl_kmem_cache_t *skc = NULL; - sc.nr_to_scan = KMC_REAP_CHUNK; - sc.gfp_mask = GFP_KERNEL; - - (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + spl_kmem_cache_reap_now(skc); + } + up_read(&spl_kmem_cache_sem); } EXPORT_SYMBOL(spl_kmem_reap); @@ -1767,7 +1452,6 @@ spl_kmem_cache_init(void) spl_kmem_cache_kmem_threads, maxclsyspri, spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - spl_register_shrinker(&spl_kmem_cache_shrinker); return (0); } @@ -1775,6 +1459,5 @@ spl_kmem_cache_init(void) void spl_kmem_cache_fini(void) { - spl_unregister_shrinker(&spl_kmem_cache_shrinker); taskq_destroy(spl_kmem_cache_taskq); 
} diff --git a/module/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c similarity index 80% rename from module/spl/spl-kmem.c rename to module/os/linux/spl/spl-kmem.c index 1fdb61e6fc..2b342140d0 100644 --- a/module/spl/spl-kmem.c +++ b/module/os/linux/spl/spl-kmem.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -26,7 +25,6 @@ #include #include #include -#include /* * As a general rule kmem_alloc() allocations should be small, preferably @@ -120,18 +118,81 @@ __strdup(const char *str, int flags) } char * -strdup(const char *str) +kmem_strdup(const char *str) { return (__strdup(str, KM_SLEEP)); } -EXPORT_SYMBOL(strdup); +EXPORT_SYMBOL(kmem_strdup); void -strfree(char *str) +kmem_strfree(char *str) { kfree(str); } -EXPORT_SYMBOL(strfree); +EXPORT_SYMBOL(kmem_strfree); + +void * +spl_kvmalloc(size_t size, gfp_t lflags) +{ +#ifdef HAVE_KVMALLOC + /* + * GFP_KERNEL allocations can safely use kvmalloc which may + * improve performance by avoiding a) high latency caused by + * vmalloc's on-access allocation, b) performance loss due to + * MMU memory address mapping and c) vmalloc locking overhead. + * This has the side-effect that the slab statistics will + * incorrectly report this as a vmem allocation, but that is + * purely cosmetic. + */ + if ((lflags & GFP_KERNEL) == GFP_KERNEL) + return (kvmalloc(size, lflags)); +#endif + + gfp_t kmalloc_lflags = lflags; + + if (size > PAGE_SIZE) { + /* + * We need to set __GFP_NOWARN here since spl_kvmalloc is not + * only called by spl_kmem_alloc_impl but can be called + * directly with custom lflags, too. In that case + * kmem_flags_convert does not get called, which would + * implicitly set __GFP_NOWARN. + */ + kmalloc_lflags |= __GFP_NOWARN; + + /* + * N.B. 
__GFP_RETRY_MAYFAIL is supported only for large + * (>32kB) allocations. + * + * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY + * for !costly requests because there is no other way to tell + * the allocator that we want to fail rather than retry + * endlessly. + */ + if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) || + (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + kmalloc_lflags |= __GFP_NORETRY; + } + } + + /* + * We first try kmalloc - even for big sizes - and fall back to + * spl_vmalloc if that fails. + * + * For non-__GFP_RECLAIM allocations we always stick to + * kmalloc_node, and fail when kmalloc is not successful (returns + * NULL). + * We cannot fall back to spl_vmalloc in this case because spl_vmalloc + * internally uses GFP_KERNEL allocations. + */ + void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE); + if (ptr || size <= PAGE_SIZE || + (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) { + return (ptr); + } + + return (spl_vmalloc(size, lflags | __GFP_HIGHMEM)); +} /* * General purpose unified implementation of kmem_alloc(). It is an @@ -144,7 +205,6 @@ inline void * spl_kmem_alloc_impl(size_t size, int flags, int node) { gfp_t lflags = kmem_flags_convert(flags); - int use_vmem = 0; void *ptr; /* @@ -156,7 +216,7 @@ spl_kmem_alloc_impl(size_t size, int flags, int node) !(flags & KM_VMEM)) { printk(KERN_WARNING "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" - "https://github.com/zfsonlinux/zfs/issues/new\n", + "https://github.com/openzfs/zfs/issues/new\n", (unsigned long)size, flags); dump_stack(); } @@ -172,33 +232,49 @@ spl_kmem_alloc_impl(size_t size, int flags, int node) * kmem_zalloc() callers. * * For vmem_alloc() and vmem_zalloc() callers it is permissible - * to use __vmalloc(). However, in general use of __vmalloc() - * is strongly discouraged because a global lock must be - * acquired. Contention on this lock can significantly + * to use spl_vmalloc().
However, in general use of + * spl_vmalloc() is strongly discouraged because a global lock + * must be acquired. Contention on this lock can significantly * impact performance so frequently manipulating the virtual * address space is strongly discouraged. */ - if ((size > spl_kmem_alloc_max) || use_vmem) { + if (size > spl_kmem_alloc_max) { if (flags & KM_VMEM) { - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); } else { return (NULL); } } else { - ptr = kmalloc_node(size, lflags, node); + /* + * We use kmalloc when doing kmem_alloc(KM_NOSLEEP), + * because kvmalloc/vmalloc may sleep. We also use + * kmalloc on systems with limited kernel VA space (e.g. + * 32-bit), which have HIGHMEM. Otherwise we use + * kvmalloc, which tries to get contiguous physical + * memory (fast, like kmalloc) and falls back on using + * virtual memory to stitch together pages (slow, like + * vmalloc). + */ +#ifdef CONFIG_HIGHMEM + if (flags & KM_VMEM) { +#else + if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) { +#endif + ptr = spl_kvmalloc(size, lflags); + } else { + ptr = kmalloc_node(size, lflags, node); + } } if (likely(ptr) || (flags & KM_NOSLEEP)) return (ptr); /* - * For vmem_alloc() and vmem_zalloc() callers retry immediately - * using __vmalloc() which is unlikely to fail. + * Try hard to satisfy the allocation. However, when progress + * cannot be made, the allocation is allowed to fail. 
*/ - if ((flags & KM_VMEM) && (use_vmem == 0)) { - use_vmem = 1; - continue; - } + if ((lflags & GFP_KERNEL) == GFP_KERNEL) + lflags |= __GFP_RETRY_MAYFAIL; /* * Use cond_resched() instead of congestion_wait() to avoid @@ -302,7 +378,7 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr) { struct hlist_head *head; - struct hlist_node *node; + struct hlist_node *node = NULL; struct kmem_debug *p; unsigned long flags; @@ -499,7 +575,7 @@ static void spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) { unsigned long flags; - kmem_debug_t *kd; + kmem_debug_t *kd = NULL; char str[17]; spin_lock_irqsave(lock, flags); diff --git a/module/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c similarity index 91% rename from module/spl/spl-kstat.c rename to module/os/linux/spl/spl-kstat.c index 1f67bf157f..0c46708326 100644 --- a/module/spl/spl-kstat.c +++ b/module/os/linux/spl/spl-kstat.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -22,6 +21,10 @@ * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Kstat Implementation. 
+ * + * Links to Illumos.org for more information on kstat function: + * [1] https://illumos.org/man/1M/kstat + * [2] https://illumos.org/man/9f/kstat_create */ #include @@ -47,72 +50,6 @@ kstat_resize_raw(kstat_t *ksp) return (0); } -void -kstat_waitq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt++; - if (wcnt != 0) { - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; - } -} -EXPORT_SYMBOL(kstat_waitq_enter); - -void -kstat_waitq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt--; - ASSERT((int)wcnt > 0); - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; -} -EXPORT_SYMBOL(kstat_waitq_exit); - -void -kstat_runq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt++; - if (rcnt != 0) { - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; - } -} -EXPORT_SYMBOL(kstat_runq_enter); - -void -kstat_runq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt--; - ASSERT((int)rcnt > 0); - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; -} -EXPORT_SYMBOL(kstat_runq_exit); - static int kstat_seq_show_headers(struct seq_file *f) { @@ -431,7 +368,7 @@ static struct seq_operations kstat_seq_ops = { static kstat_module_t * kstat_find_module(char *name) { - kstat_module_t *module; + kstat_module_t *module = NULL; list_for_each_entry(module, &kstat_module_list, ksm_module_list) { if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) @@ -483,7 +420,7 @@ proc_kstat_open(struct inode *inode, struct file *filp) f = filp->private_data; f->private = PDE_DATA(inode); - return (rc); + 
return (0); } static ssize_t @@ -507,12 +444,20 @@ proc_kstat_write(struct file *filp, const char __user *buf, size_t len, return (len); } -static struct file_operations proc_kstat_operations = { +static const kstat_proc_op_t proc_kstat_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = proc_kstat_open, + .proc_write = proc_kstat_write, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else .open = proc_kstat_open, .write = proc_kstat_write, .read = seq_read, .llseek = seq_lseek, .release = seq_release, +#endif }; void @@ -624,14 +569,14 @@ static int kstat_detect_collision(kstat_proc_entry_t *kpep) { kstat_module_t *module; - kstat_proc_entry_t *tmp; + kstat_proc_entry_t *tmp = NULL; char *parent; char *cp; parent = kmem_asprintf("%s", kpep->kpe_module); if ((cp = strrchr(parent, '/')) == NULL) { - strfree(parent); + kmem_strfree(parent); return (0); } @@ -639,13 +584,13 @@ kstat_detect_collision(kstat_proc_entry_t *kpep) if ((module = kstat_find_module(parent)) != NULL) { list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { - strfree(parent); + kmem_strfree(parent); return (EEXIST); } } } - strfree(parent); + kmem_strfree(parent); return (0); } @@ -656,10 +601,10 @@ kstat_detect_collision(kstat_proc_entry_t *kpep) */ void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, - const struct file_operations *file_ops, void *data) + const kstat_proc_op_t *proc_ops, void *data) { kstat_module_t *module; - kstat_proc_entry_t *tmp; + kstat_proc_entry_t *tmp = NULL; ASSERT(kpep); @@ -690,7 +635,7 @@ kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, kpep->kpe_owner = module; kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode, - module->ksm_proc, file_ops, data); + module->ksm_proc, proc_ops, data); if (kpep->kpe_proc == NULL) { list_del_init(&kpep->kpe_list); if (list_empty(&module->ksm_kstat_list)) diff --git a/module/spl/spl-proc.c 
b/module/os/linux/spl/spl-proc.c similarity index 82% rename from module/spl/spl-proc.c rename to module/os/linux/spl/spl-proc.c index a75bcc2145..c4af27a7fc 100644 --- a/module/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -54,73 +53,19 @@ static struct proc_dir_entry *proc_spl_taskq_all = NULL; static struct proc_dir_entry *proc_spl_taskq = NULL; struct proc_dir_entry *proc_spl_kstat = NULL; -static int -proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer, - int ubuffer_size) -{ - int size; - - if (ubuffer_size > kbuffer_size) - return (-EOVERFLOW); - - if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size)) - return (-EFAULT); - - /* strip trailing whitespace */ - size = strnlen(kbuffer, ubuffer_size); - while (size-- >= 0) - if (!isspace(kbuffer[size])) - break; - - /* empty string */ - if (size < 0) - return (-EINVAL); - - /* no space to terminate */ - if (size == kbuffer_size) - return (-EOVERFLOW); - - kbuffer[size + 1] = 0; - return (0); -} - -static int -proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer, - char *append) -{ - /* - * NB if 'append' != NULL, it's a single character to append to the - * copied out string - usually "\n", for /proc entries and - * (i.e. 
a terminating zero byte) for sysctl entries - */ - int size = MIN(strlen(kbuffer), ubuffer_size); - - if (copy_to_user(ubuffer, kbuffer, size)) - return (-EFAULT); - - if (append != NULL && size < ubuffer_size) { - if (copy_to_user(ubuffer + size, append, 1)) - return (-EFAULT); - - size++; - } - - return (size); -} - #ifdef DEBUG_KMEM static int proc_domemused(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; - unsigned long min = 0, max = ~0, val; + unsigned long val; spl_ctl_table dummy = *table; dummy.data = &val; dummy.proc_handler = &proc_dointvec; - dummy.extra1 = &min; - dummy.extra2 = &max; + dummy.extra1 = &table_min; + dummy.extra2 = &table_max; if (write) { *ppos += *lenp; @@ -142,14 +87,14 @@ proc_doslab(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; - unsigned long min = 0, max = ~0, val = 0, mask; + unsigned long val = 0, mask; spl_ctl_table dummy = *table; - spl_kmem_cache_t *skc; + spl_kmem_cache_t *skc = NULL; dummy.data = &val; dummy.proc_handler = &proc_dointvec; - dummy.extra1 = &min; - dummy.extra2 = &max; + dummy.extra1 = &table_min; + dummy.extra2 = &table_max; if (write) { *ppos += *lenp; @@ -188,39 +133,34 @@ static int proc_dohostid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int len, rc = 0; char *end, str[32]; + unsigned long hid; + spl_ctl_table dummy = *table; + + dummy.data = str; + dummy.maxlen = sizeof (str) - 1; + + if (!write) + snprintf(str, sizeof (str), "%lx", + (unsigned long) zone_get_hostid(NULL)); + + /* always returns 0 */ + proc_dostring(&dummy, write, buffer, lenp, ppos); if (write) { /* * We can't use proc_doulongvec_minmax() in the write - * case here because hostid while a hex value has no - * leading 0x which confuses the helper function. + * case here because hostid, while a hex value, has no + * leading 0x, which confuses the helper function. 
*/ - rc = proc_copyin_string(str, sizeof (str), buffer, *lenp); - if (rc < 0) - return (rc); - spl_hostid = simple_strtoul(str, &end, 16); + hid = simple_strtoul(str, &end, 16); if (str == end) return (-EINVAL); - - } else { - len = snprintf(str, sizeof (str), "%lx", - (unsigned long) zone_get_hostid(NULL)); - if (*ppos >= len) - rc = 0; - else - rc = proc_copyout_string(buffer, - *lenp, str + *ppos, "\n"); - - if (rc >= 0) { - *lenp = rc; - *ppos += rc; - } + spl_hostid = hid; } - return (rc); + return (0); } static void @@ -249,7 +189,7 @@ static int taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) { taskq_t *tq = p; - taskq_thread_t *tqt; + taskq_thread_t *tqt = NULL; spl_wait_queue_entry_t *wq; struct task_struct *tsk; taskq_ent_t *tqe; @@ -437,11 +377,31 @@ slab_seq_show(struct seq_file *f, void *p) ASSERT(skc->skc_magic == SKC_MAGIC); - /* - * Backed by Linux slab see /proc/slabinfo. - */ - if (skc->skc_flags & KMC_SLAB) + if (skc->skc_flags & KMC_SLAB) { + /* + * This cache is backed by a generic Linux kmem cache which + * has its own accounting. For these caches we only track + * the number of active allocated objects that exist within + * the underlying Linux slabs. For the overall statistics of + * the underlying Linux cache please refer to /proc/slabinfo. 
+ */ + spin_lock(&skc->skc_lock); + uint64_t objs_allocated = + percpu_counter_sum(&skc->skc_linux_alloc); + seq_printf(f, "%-36s ", skc->skc_name); + seq_printf(f, "0x%05lx %9s %9lu %8s %8u " + "%5s %5s %5s %5s %5lu %5s %5s %5s %5s\n", + (long unsigned)skc->skc_flags, + "-", + (long unsigned)(skc->skc_obj_size * objs_allocated), + "-", + (unsigned)skc->skc_obj_size, + "-", "-", "-", "-", + (long unsigned)objs_allocated, + "-", "-", "-", "-"); + spin_unlock(&skc->skc_lock); return (0); + } spin_lock(&skc->skc_lock); seq_printf(f, "%-36s ", skc->skc_name); @@ -461,9 +421,7 @@ slab_seq_show(struct seq_file *f, void *p) (long unsigned)skc->skc_obj_deadlock, (long unsigned)skc->skc_obj_emergency, (long unsigned)skc->skc_obj_emergency_max); - spin_unlock(&skc->skc_lock); - return (0); } @@ -516,11 +474,18 @@ proc_slab_open(struct inode *inode, struct file *filp) return (seq_open(filp, &slab_seq_ops)); } -static struct file_operations proc_slab_operations = { - .open = proc_slab_open, - .read = seq_read, - .llseek = seq_lseek, +static const kstat_proc_op_t proc_slab_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = proc_slab_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else + .open = proc_slab_open, + .read = seq_read, + .llseek = seq_lseek, .release = seq_release, +#endif }; static void @@ -555,18 +520,32 @@ proc_taskq_open(struct inode *inode, struct file *filp) return (seq_open(filp, &taskq_seq_ops)); } -static struct file_operations proc_taskq_all_operations = { +static const kstat_proc_op_t proc_taskq_all_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = proc_taskq_all_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else .open = proc_taskq_all_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, +#endif }; -static struct file_operations proc_taskq_operations = { +static const kstat_proc_op_t proc_taskq_operations = { +#ifdef HAVE_PROC_OPS_STRUCT 
+ .proc_open = proc_taskq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else .open = proc_taskq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, +#endif }; static struct ctl_table spl_kmem_table[] = { @@ -593,17 +572,8 @@ static struct ctl_table spl_kmem_table[] = { }, #endif /* DEBUG_KMEM */ { - .procname = "slab_kmem_total", - .data = (void *)(KMC_KMEM | KMC_TOTAL), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_kmem_alloc", - .data = (void *)(KMC_KMEM | KMC_ALLOC), + .procname = "slab_kvmem_total", + .data = (void *)(KMC_KVMEM | KMC_TOTAL), .maxlen = sizeof (unsigned long), .extra1 = &table_min, .extra2 = &table_max, @@ -611,8 +581,8 @@ static struct ctl_table spl_kmem_table[] = { .proc_handler = &proc_doslab, }, { - .procname = "slab_kmem_max", - .data = (void *)(KMC_KMEM | KMC_MAX), + .procname = "slab_kvmem_alloc", + .data = (void *)(KMC_KVMEM | KMC_ALLOC), .maxlen = sizeof (unsigned long), .extra1 = &table_min, .extra2 = &table_max, @@ -620,26 +590,8 @@ static struct ctl_table spl_kmem_table[] = { .proc_handler = &proc_doslab, }, { - .procname = "slab_vmem_total", - .data = (void *)(KMC_VMEM | KMC_TOTAL), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_vmem_alloc", - .data = (void *)(KMC_VMEM | KMC_ALLOC), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_vmem_max", - .data = (void *)(KMC_VMEM | KMC_MAX), + .procname = "slab_kvmem_max", + .data = (void *)(KMC_KVMEM | KMC_MAX), .maxlen = sizeof (unsigned long), .extra1 = &table_min, .extra2 = &table_max, @@ -696,9 +648,6 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { -#ifdef 
HAVE_CTL_NAME - .ctl_name = CTL_KERN, -#endif .procname = "kernel", .mode = 0555, .child = spl_dir, diff --git a/module/spl/spl-procfs-list.c b/module/os/linux/spl/spl-procfs-list.c similarity index 90% rename from module/spl/spl-procfs-list.c rename to module/os/linux/spl/spl-procfs-list.c index f6a00da5c9..cae13228c6 100644 --- a/module/spl/spl-procfs-list.c +++ b/module/os/linux/spl/spl-procfs-list.c @@ -89,7 +89,17 @@ procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos) cursor->cached_node = next_node; cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node); *pos = cursor->cached_pos; + } else { + /* + * seq_read() expects ->next() to update the position even + * when there are no more entries. Advance the position to + * prevent a warning from being logged. + */ + cursor->cached_node = NULL; + cursor->cached_pos++; + *pos = cursor->cached_pos; } + return (next_node); } @@ -105,6 +115,8 @@ procfs_list_seq_start(struct seq_file *f, loff_t *pos) cursor->cached_node = SEQ_START_TOKEN; cursor->cached_pos = 0; return (SEQ_START_TOKEN); + } else if (cursor->cached_node == NULL) { + return (NULL); } /* @@ -185,13 +197,20 @@ procfs_list_write(struct file *filp, const char __user *buf, size_t len, return (len); } -static struct file_operations procfs_list_operations = { - .owner = THIS_MODULE, +static const kstat_proc_op_t procfs_list_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = procfs_list_open, + .proc_write = procfs_list_write, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release_private, +#else .open = procfs_list_open, .write = procfs_list_write, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, +#endif }; /* @@ -200,6 +219,7 @@ static struct file_operations procfs_list_operations = { */ void procfs_list_install(const char *module, + const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, @@ -208,6 +228,12 @@ procfs_list_install(const char *module, int 
(*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off) { + char *modulestr; + + if (submodule != NULL) + modulestr = kmem_asprintf("%s/%s", module, submodule); + else + modulestr = kmem_asprintf("%s", module); mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), @@ -218,9 +244,10 @@ procfs_list_install(const char *module, procfs_list->pl_clear = clear; procfs_list->pl_node_offset = procfs_list_node_off; - kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name); + kstat_proc_entry_init(&procfs_list->pl_kstat_entry, modulestr, name); kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode, &procfs_list_operations, procfs_list); + kmem_strfree(modulestr); } EXPORT_SYMBOL(procfs_list_install); diff --git a/module/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c similarity index 88% rename from module/spl/spl-taskq.c rename to module/os/linux/spl/spl-taskq.c index a39f94e4cc..fb25a41544 100644 --- a/module/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -28,6 +27,10 @@ #include #include #include +#include +#ifdef HAVE_CPU_HOTPLUG +#include +#endif int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); @@ -35,7 +38,7 @@ MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); int spl_taskq_thread_dynamic = 1; -module_param(spl_taskq_thread_dynamic, int, 0644); +module_param(spl_taskq_thread_dynamic, int, 0444); MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); int spl_taskq_thread_priority = 1; @@ -59,6 +62,11 @@ EXPORT_SYMBOL(system_delay_taskq); static taskq_t *dynamic_taskq; static taskq_thread_t *taskq_thread_create(taskq_t *); +#ifdef HAVE_CPU_HOTPLUG +/* Multi-callback id for cpu hotplugging. */ +static int spl_taskq_cpuhp_state; +#endif + /* List of all taskqs */ LIST_HEAD(tq_list); struct rw_semaphore tq_list_sem; @@ -82,7 +90,7 @@ task_km_flags(uint_t flags) static int taskq_find_by_name(const char *name) { - struct list_head *tql; + struct list_head *tql = NULL; taskq_t *tq; list_for_each_prev(tql, &tq_list) { @@ -211,7 +219,7 @@ task_expire_impl(taskq_ent_t *t) { taskq_ent_t *w; taskq_t *tq = t->tqent_taskq; - struct list_head *l; + struct list_head *l = NULL; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); @@ -223,6 +231,8 @@ task_expire_impl(taskq_ent_t *t) } t->tqent_birth = jiffies; + DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); + /* * The priority list must be maintained in strict task id order * from lowest to highest for lowest_id to be easily calculable. 
@@ -264,8 +274,6 @@ taskq_lowest_id(taskq_t *tq) taskq_ent_t *t; taskq_thread_t *tqt; - ASSERT(tq); - if (!list_empty(&tq->tq_pend_list)) { t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); @@ -298,7 +306,7 @@ static void taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) { taskq_thread_t *w; - struct list_head *l; + struct list_head *l = NULL; ASSERT(tq); ASSERT(tqt); @@ -321,7 +329,7 @@ taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) static taskq_ent_t * taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) { - struct list_head *l; + struct list_head *l = NULL; taskq_ent_t *t; list_for_each(l, lh) { @@ -347,7 +355,7 @@ static taskq_ent_t * taskq_find(taskq_t *tq, taskqid_t id) { taskq_thread_t *tqt; - struct list_head *l; + struct list_head *l = NULL; taskq_ent_t *t; t = taskq_find_list(tq, &tq->tq_delay_list, id); @@ -488,6 +496,13 @@ taskq_member(taskq_t *tq, kthread_t *t) } EXPORT_SYMBOL(taskq_member); +taskq_t * +taskq_of_curthread(void) +{ + return (tsd_get(taskq_tsd)); +} +EXPORT_SYMBOL(taskq_of_curthread); + /* * Cancel an already dispatched task given the task id. Still pending tasks * will be immediately canceled, and if the task is active the function will @@ -593,7 +608,9 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) t->tqent_taskq = tq; t->tqent_timer.function = NULL; t->tqent_timer.expires = 0; + t->tqent_birth = jiffies; + DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); @@ -706,7 +723,9 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; + t->tqent_birth = jiffies; + DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); spin_unlock(&t->tqent_lock); @@ -906,15 +925,15 @@ taskq_thread(void *args) * tqent_flags here. * * Also use an on stack taskq_ent_t for tqt_task - * assignment in this case. 
We only populate the two - * fields used by the only user in taskq proc file. + * assignment in this case; we want to make sure + * to duplicate all fields, so the values are + * correct when it's accessed via DTRACE_PROBE*. */ tqt->tqt_id = t->tqent_id; tqt->tqt_flags = t->tqent_flags; if (t->tqent_flags & TQENT_FLAG_PREALLOC) { - dup_task.tqent_func = t->tqent_func; - dup_task.tqent_arg = t->tqent_arg; + dup_task = *t; t = &dup_task; } tqt->tqt_task = t; @@ -923,9 +942,13 @@ taskq_thread(void *args) tq->tq_nactive++; spin_unlock_irqrestore(&tq->tq_lock, flags); + DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); + /* Perform the requested task */ t->tqent_func(t->tqent_arg); + DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_nactive--; @@ -970,6 +993,7 @@ error: spin_unlock_irqrestore(&tq->tq_lock, flags); tsd_set(taskq_tsd, NULL); + thread_exit(); return (0); } @@ -1007,13 +1031,14 @@ taskq_thread_create(taskq_t *tq) } taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, +taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) { taskq_t *tq; taskq_thread_t *tqt; int count = 0, rc = 0, i; unsigned long irqflags; + int nthreads = threads_arg; ASSERT(name != NULL); ASSERT(minalloc >= 0); @@ -1024,23 +1049,36 @@ taskq_create(const char *name, int nthreads, pri_t pri, if (flags & TASKQ_THREADS_CPU_PCT) { ASSERT(nthreads <= 100); ASSERT(nthreads >= 0); - nthreads = MIN(nthreads, 100); + nthreads = MIN(threads_arg, 100); nthreads = MAX(nthreads, 0); - nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + nthreads = MAX((num_online_cpus() * nthreads) /100, 1); } tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); if (tq == NULL) return (NULL); + tq->tq_hp_support = B_FALSE; +#ifdef HAVE_CPU_HOTPLUG + if (flags & TASKQ_THREADS_CPU_PCT) { + tq->tq_hp_support = B_TRUE; + if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state, + 
&tq->tq_hp_cb_node) != 0) { + kmem_free(tq, sizeof (*tq)); + return (NULL); + } + } +#endif + spin_lock_init(&tq->tq_lock); INIT_LIST_HEAD(&tq->tq_thread_list); INIT_LIST_HEAD(&tq->tq_active_list); - tq->tq_name = strdup(name); + tq->tq_name = kmem_strdup(name); tq->tq_nactive = 0; tq->tq_nthreads = 0; tq->tq_nspawn = 0; tq->tq_maxthreads = nthreads; + tq->tq_cpu_pct = threads_arg; tq->tq_pri = pri; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; @@ -1114,6 +1152,12 @@ taskq_destroy(taskq_t *tq) tq->tq_flags &= ~TASKQ_ACTIVE; spin_unlock_irqrestore(&tq->tq_lock, flags); +#ifdef HAVE_CPU_HOTPLUG + if (tq->tq_hp_support) { + VERIFY0(cpuhp_state_remove_instance_nocalls( + spl_taskq_cpuhp_state, &tq->tq_hp_cb_node)); + } +#endif /* * When TASKQ_ACTIVE is clear new tasks may not be added nor may * new worker threads be spawned for dynamic taskq. @@ -1176,12 +1220,11 @@ taskq_destroy(taskq_t *tq) spin_unlock_irqrestore(&tq->tq_lock, flags); - strfree(tq->tq_name); + kmem_strfree(tq->tq_name); kmem_free(tq, sizeof (taskq_t)); } EXPORT_SYMBOL(taskq_destroy); - static unsigned int spl_taskq_kick = 0; /* @@ -1198,7 +1241,7 @@ param_set_taskq_kick(const char *val, struct kernel_param *kp) #endif { int ret; - taskq_t *tq; + taskq_t *tq = NULL; taskq_ent_t *t; unsigned long flags; @@ -1238,12 +1281,97 @@ module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, MODULE_PARM_DESC(spl_taskq_kick, "Write nonzero to kick stuck taskqs to spawn more threads"); +#ifdef HAVE_CPU_HOTPLUG +/* + * This callback will be called exactly once for each core that comes online, + * for each dynamic taskq. We attempt to expand taskqs that have + * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every + * time, to correctly determine whether or not to add a thread. 
+ */ +static int +spl_taskq_expand(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + int err = 0; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (err); + } + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads > tq->tq_nthreads) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + taskq_thread_t *tqt = taskq_thread_create(tq); + if (tqt == NULL) + err = -1; + return (err); + } + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (err); +} + +/* + * While we don't support offlining CPUs, it is possible that CPUs will fail + * to online successfully. We do need to be able to handle this case + * gracefully. 
+ */ +static int +spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads < tq->tq_nthreads) { + ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1); + taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + struct task_struct *thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + return (0); + } + +out: + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); +} +#endif + int spl_taskq_init(void) { init_rwsem(&tq_list_sem); tsd_create(&taskq_tsd, NULL); +#ifdef HAVE_CPU_HOTPLUG + spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down); +#endif + system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) @@ -1252,6 +1380,9 @@ spl_taskq_init(void) system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_delay_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); +#endif taskq_destroy(system_taskq); return (1); } @@ -1259,6 +1390,9 @@ spl_taskq_init(void) dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); if (dynamic_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); 
+#endif taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); return (1); @@ -1287,4 +1421,9 @@ spl_taskq_fini(void) system_taskq = NULL; tsd_destroy(&taskq_tsd); + +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); + spl_taskq_cpuhp_state = 0; +#endif } diff --git a/module/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c similarity index 78% rename from module/spl/spl-thread.c rename to module/os/linux/spl/spl-thread.c index d441ad65f3..834c527117 100644 --- a/module/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -153,8 +152,60 @@ spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) if (PTR_ERR(tsk) == -ENOMEM) continue; return (NULL); - } else + } else { return (tsk); + } } while (1); } EXPORT_SYMBOL(spl_kthread_create); + +/* + * The "why" argument indicates the allowable side-effects of the call: + * + * FORREAL: Extract the next pending signal from p_sig into p_cursig; + * stop the process if a stop has been requested or if a traced signal + * is pending. + * + * JUSTLOOKING: Don't stop the process, just indicate whether or not + * a signal might be pending (FORREAL is needed to tell for sure). 
+ */ +int +issig(int why) +{ + ASSERT(why == FORREAL || why == JUSTLOOKING); + + if (!signal_pending(current)) + return (0); + + if (why != FORREAL) + return (1); + + struct task_struct *task = current; + spl_kernel_siginfo_t __info; + sigset_t set; + siginitsetinv(&set, 1ULL << (SIGSTOP - 1) | 1ULL << (SIGTSTP - 1)); + sigorsets(&set, &task->blocked, &set); + + spin_lock_irq(&task->sighand->siglock); + int ret; + if ((ret = dequeue_signal(task, &set, &__info)) != 0) { +#ifdef HAVE_SIGNAL_STOP + spin_unlock_irq(&task->sighand->siglock); + kernel_signal_stop(); +#else + if (current->jobctl & JOBCTL_STOP_DEQUEUED) + spl_set_special_state(TASK_STOPPED); + + spin_unlock_irq(&current->sighand->siglock); + + schedule(); +#endif + return (0); + } + + spin_unlock_irq(&task->sighand->siglock); + + return (1); +} + +EXPORT_SYMBOL(issig); diff --git a/module/os/linux/spl/spl-trace.c b/module/os/linux/spl/spl-trace.c new file mode 100644 index 0000000000..7912a38129 --- /dev/null +++ b/module/os/linux/spl/spl-trace.c @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Each DTRACE_PROBE must define its trace point in one (and only one) + * source file, so this dummy file exists for that purpose. + */ + +#include + +#ifdef _KERNEL +#define CREATE_TRACE_POINTS +#include +#include +#endif diff --git a/module/spl/spl-tsd.c b/module/os/linux/spl/spl-tsd.c similarity index 99% rename from module/spl/spl-tsd.c rename to module/os/linux/spl/spl-tsd.c index 4c800292ae..546db9ab8b 100644 --- a/module/spl/spl-tsd.c +++ b/module/os/linux/spl/spl-tsd.c @@ -5,7 +5,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -42,7 +41,7 @@ * type is entry is called a 'key' entry and it is added to the hash during * tsd_create(). It is used to store the address of the destructor function * and it is used as an anchor point. All tsd entries which use the same - * key will be linked to this entry. This is used during tsd_destory() to + * key will be linked to this entry. This is used during tsd_destroy() to * quickly call the destructor function for all tsd associated with the key. * The 'key' entry may be looked up with tsd_hash_search() by passing the * key you wish to lookup and DTOR_PID constant as the pid. 
@@ -98,7 +97,7 @@ static tsd_hash_table_t *tsd_hash_table = NULL; static tsd_hash_entry_t * tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid) { - struct hlist_node *node; + struct hlist_node *node = NULL; tsd_hash_entry_t *entry; tsd_hash_bin_t *bin; ulong_t hash; @@ -269,7 +268,7 @@ tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor) * @table: hash table * @pid: search pid * - * For every process these is a single entry in the hash which is used + * For every process there is a single entry in the hash which is used * as anchor. All other thread specific entries for this process are * linked to this anchor via the 'he_pid_list' list head. */ diff --git a/module/spl/spl-vmem.c b/module/os/linux/spl/spl-vmem.c similarity index 66% rename from module/spl/spl-vmem.c rename to module/os/linux/spl/spl-vmem.c index e1a84a9117..cab3e9549c 100644 --- a/module/spl/spl-vmem.c +++ b/module/os/linux/spl/spl-vmem.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -22,57 +21,13 @@ * with the SPL. If not, see . */ +#include #include #include #include #include #include -vmem_t *heap_arena = NULL; -EXPORT_SYMBOL(heap_arena); - -vmem_t *zio_alloc_arena = NULL; -EXPORT_SYMBOL(zio_alloc_arena); - -vmem_t *zio_arena = NULL; -EXPORT_SYMBOL(zio_arena); - -#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */ - -/* - * Return approximate virtual memory usage based on these assumptions: - * - * 1) The major SPL consumer of virtual memory is the kmem cache. - * 2) Memory allocated with vmem_alloc() is short lived and can be ignored. - * 3) Allow a 4MB floor as a generous pad given normal consumption. - * 4) The spl_kmem_cache_sem only contends with cache create/destroy. 
- */ -size_t -vmem_size(vmem_t *vmp, int typemask) -{ - spl_kmem_cache_t *skc; - size_t alloc = VMEM_FLOOR_SIZE; - - if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE)) - return (VMALLOC_TOTAL); - - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - if (skc->skc_flags & KMC_VMEM) - alloc += skc->skc_slab_size * skc->skc_slab_total; - } - up_read(&spl_kmem_cache_sem); - - if (typemask & VMEM_ALLOC) - return (MIN(alloc, VMALLOC_TOTAL)); - else if (typemask & VMEM_FREE) - return (MAX(VMALLOC_TOTAL - alloc, 0)); - else - return (0); -} -EXPORT_SYMBOL(vmem_size); - /* * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. */ diff --git a/module/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c similarity index 99% rename from module/spl/spl-xdr.c rename to module/os/linux/spl/spl-xdr.c index 1dd31ffc14..5e763c2560 100644 --- a/module/spl/spl-xdr.c +++ b/module/os/linux/spl/spl-xdr.c @@ -3,7 +3,6 @@ * Written by Ricardo Correia * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/module/spl/spl-zlib.c b/module/os/linux/spl/spl-zlib.c similarity index 97% rename from module/spl/spl-zlib.c rename to module/os/linux/spl/spl-zlib.c index 229e6a44b0..589496da0c 100644 --- a/module/spl/spl-zlib.c +++ b/module/os/linux/spl/spl-zlib.c @@ -6,7 +6,6 @@ * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -54,6 +53,7 @@ */ +#include #include #include #include @@ -196,13 +196,13 @@ spl_zlib_init(void) { int size; - size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + size = MAX(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); zlib_workspace_cache = kmem_cache_create( "spl_zlib_workspace_cache", size, 0, NULL, NULL, NULL, NULL, NULL, - KMC_VMEM | KMC_NOEMERGENCY); + KMC_KVMEM); if (!zlib_workspace_cache) return (1); diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in new file mode 100644 index 0000000000..fa990776db --- /dev/null +++ b/module/os/linux/zfs/Makefile.in @@ -0,0 +1,38 @@ +# +# Linux specific sources included from module/zfs/Makefile.in +# + +# Suppress unused-value warnings in sparc64 architecture headers +ccflags-$(CONFIG_SPARC64) += -Wno-unused-value + +$(MODULE)-objs += ../os/linux/zfs/abd_os.o +$(MODULE)-objs += ../os/linux/zfs/arc_os.o +$(MODULE)-objs += ../os/linux/zfs/mmp_os.o +$(MODULE)-objs += ../os/linux/zfs/policy.o +$(MODULE)-objs += ../os/linux/zfs/trace.o +$(MODULE)-objs += ../os/linux/zfs/qat.o +$(MODULE)-objs += ../os/linux/zfs/qat_compress.o +$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o +$(MODULE)-objs += ../os/linux/zfs/spa_misc_os.o +$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o +$(MODULE)-objs += ../os/linux/zfs/vdev_file.o +$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o +$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o +$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o +$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o +$(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o +$(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o +$(MODULE)-objs += ../os/linux/zfs/zfs_racct.o +$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o +$(MODULE)-objs += ../os/linux/zfs/zfs_uio.o +$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o +$(MODULE)-objs += 
../os/linux/zfs/zfs_vnops_os.o +$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o +$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o +$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o +$(MODULE)-objs += ../os/linux/zfs/zpl_export.o +$(MODULE)-objs += ../os/linux/zfs/zpl_file.o +$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o +$(MODULE)-objs += ../os/linux/zfs/zpl_super.o +$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o +$(MODULE)-objs += ../os/linux/zfs/zvol_os.o diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c new file mode 100644 index 0000000000..a8f1ea7ca3 --- /dev/null +++ b/module/os/linux/zfs/abd_os.c @@ -0,0 +1,1147 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. 
Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. + * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we are using HIGHMEM (see next point) we + * wouldn't need to worry about finding a contiguous address range. + * + * (3) If we are not using HIGHMEM, then all physical memory is always + * mapped into the kernel's address space, so we also avoid the map / + * unmap costs on each ABD access. + * + * If we are not using HIGHMEM, scattered buffers which have only one chunk + * can be treated as linear buffers, because they are contiguous in the + * kernel's virtual address space. See abd_alloc_chunks() for details. 
+ */ + +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#else +#define MAX_ORDER 1 +#endif + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_page_multi_chunk; + kstat_named_t abdstat_scatter_page_multi_zone; + kstat_named_t abdstat_scatter_page_alloc_retry; + kstat_named_t abdstat_scatter_sg_table_retry; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of compound allocations of a given order. These + * allocations are spread over all currently allocated ABDs, and + * act as a measure of memory fragmentation. 
+ */ + { { "scatter_order_N", KSTAT_DATA_UINT64 } }, + /* + * The number of scatter ABDs which contain multiple chunks. + * ABDs are preferentially allocated from the minimum number of + * contiguous multi-page chunks, a single chunk is optimal. + */ + { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are split across memory zones. + * ABDs are preferentially allocated using pages from a single zone. + */ + { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the pages to populate the scatter ABD. + */ + { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the sg table for an ABD. + */ + { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, +}; + +struct { + wmsum_t abdstat_struct_size; + wmsum_t abdstat_linear_cnt; + wmsum_t abdstat_linear_data_size; + wmsum_t abdstat_scatter_cnt; + wmsum_t abdstat_scatter_data_size; + wmsum_t abdstat_scatter_chunk_waste; + wmsum_t abdstat_scatter_orders[MAX_ORDER]; + wmsum_t abdstat_scatter_page_multi_chunk; + wmsum_t abdstat_scatter_page_multi_zone; + wmsum_t abdstat_scatter_page_alloc_retry; + wmsum_t abdstat_scatter_sg_table_retry; +} abd_sums; + +#define abd_for_each_sg(abd, sg, n, i) \ + for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) + +unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; + +/* + * zfs_abd_scatter_min_size is the minimum allocation size to use scatter + * ABD's. Smaller allocations will use linear ABD's which uses + * zio_[data_]buf_alloc(). + * + * Scatter ABD's use at least one page each, so sub-page allocations waste + * some space when allocated as scatter (e.g. 2KB scatter allocation wastes + * half of each page). Using linear ABD's for small allocations means that + * they will be put on slabs which contain many allocations. 
This can + * improve memory efficiency, but it also makes it much harder for ARC + * evictions to actually free pages, because all the buffers on one slab need + * to be freed in order for the slab (and underlying pages) to be freed. + * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's + * possible for them to actually waste more memory than scatter (one page per + * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). + * + * Spill blocks are typically 512B and are heavily used on systems running + * selinux with the default dnode size and the `xattr=sa` property set. + * + * By default we use linear allocations for 512B and 1KB, and scatter + * allocations for larger (1.5KB and up). + */ +int zfs_abd_scatter_min_size = 512 * 3; + +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are + * just a single zero'd page. This allows us to conserve memory by + * only using a single zero page for the scatterlist. + */ +abd_t *abd_zero_scatter = NULL; + +struct page; +/* + * abd_zero_page we will be an allocated zero'd PAGESIZE buffer, which is + * assigned to set each of the pages of abd_zero_scatter. + */ +static struct page *abd_zero_page = NULL; + +static kmem_cache_t *abd_cache = NULL; +static kstat_t *abd_ksp; + +static uint_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); +} + +abd_t * +abd_alloc_struct_impl(size_t size) +{ + /* + * In Linux we do not use the size passed in during ABD + * allocation, so we just ignore it. 
+ */ + abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); + + return (abd); +} + +void +abd_free_struct_impl(abd_t *abd) +{ + kmem_cache_free(abd_cache, abd); + ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); +} + +#ifdef _KERNEL +/* + * Mark zfs data pages so they can be excluded from kernel crash dumps + */ +#ifdef _LP64 +#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E + +static inline void +abd_mark_zfs_page(struct page *page) +{ + get_page(page); + SetPagePrivate(page); + set_page_private(page, ABD_FILE_CACHE_PAGE); +} + +static inline void +abd_unmark_zfs_page(struct page *page) +{ + set_page_private(page, 0UL); + ClearPagePrivate(page); + put_page(page); +} +#else +#define abd_mark_zfs_page(page) +#define abd_unmark_zfs_page(page) +#endif /* _LP64 */ + +#ifndef CONFIG_HIGHMEM + +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + +/* + * The goal is to minimize fragmentation by preferentially populating ABDs + * with higher order compound pages from a single zone. Allocation size is + * progressively decreased until it can be satisfied without performing + * reclaim or compaction. When necessary this function will degenerate to + * allocating individual pages and allowing reclaim to satisfy allocations. 
+ */ +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + struct list_head pages; + struct sg_table table; + struct scatterlist *sg; + struct page *page, *tmp_page = NULL; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; + int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + int nr_pages = abd_chunkcnt_for_bytes(size); + int chunks = 0, zones = 0; + size_t remaining_size; + int nid = NUMA_NO_NODE; + int alloc_pages = 0; + + INIT_LIST_HEAD(&pages); + + while (alloc_pages < nr_pages) { + unsigned chunk_pages; + int order; + + order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); + chunk_pages = (1U << order); + + page = alloc_pages_node(nid, order ? gfp_comp : gfp, order); + if (page == NULL) { + if (order == 0) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } else { + max_order = MAX(0, order - 1); + } + continue; + } + + list_add_tail(&page->lru, &pages); + + if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) + zones++; + + nid = page_to_nid(page); + ABDSTAT_BUMP(abdstat_scatter_orders[order]); + chunks++; + alloc_pages += chunk_pages; + } + + ASSERT3S(alloc_pages, ==, nr_pages); + + while (sg_alloc_table(&table, chunks, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + sg = table.sgl; + remaining_size = size; + list_for_each_entry_safe(page, tmp_page, &pages, lru) { + size_t sg_size = MIN(PAGESIZE << compound_order(page), + remaining_size); + sg_set_page(sg, page, sg_size, 0); + abd_mark_zfs_page(page); + remaining_size -= sg_size; + + sg = sg_next(sg); + list_del(&page->lru); + } + + /* + * These conditions ensure that a possible transformation to a linear + * ABD would be valid. 
+ */ + ASSERT(!PageHighMem(sg_page(table.sgl))); + ASSERT0(ABD_SCATTER(abd).abd_offset); + + if (table.nents == 1) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's can be + * represented this way. Some multi-page ABD's can also be + * represented this way, if we were able to allocate a single + * "chunk" (higher-order "page" which represents a power-of-2 + * series of physically-contiguous pages). This is often the + * case for 2-page (8K) ABD's. + * + * Representing a single-entry scatter ABD as a linear ABD + * has the performance advantage of avoiding the copy (and + * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. + * A performance increase of around 5% has been observed for + * ARC-cached reads (of small blocks which can take advantage + * of this). + * + * Note that this optimization is only possible because the + * pages are always mapped into the kernel's address space. + * This is not the case for highmem pages, so the + * optimization can not be made there. + */ + abd->abd_flags |= ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl)); + } else if (table.nents > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + if (zones) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); + abd->abd_flags |= ABD_FLAG_MULTI_ZONE; + } + + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + } +} +#else + +/* + * Allocate N individual pages to construct a scatter ABD. This function + * makes no attempt to request contiguous pages and requires the minimal + * number of kernel interfaces. It's designed for maximum compatibility. 
+ */ +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + struct page *page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(size); + int i = 0; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + ASSERT3U(table.nents, ==, nr_pages); + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = nr_pages; + + abd_for_each_sg(abd, sg, nr_pages, i) { + while ((page = __page_cache_alloc(gfp)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + + ABDSTAT_BUMP(abdstat_scatter_orders[0]); + sg_set_page(sg, page, PAGESIZE, 0); + abd_mark_zfs_page(page); + } + + if (nr_pages > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } +} +#endif /* !CONFIG_HIGHMEM */ + +/* + * This must be called if any of the sg_table allocation functions + * are called. 
+ */ +static void +abd_free_sg_table(abd_t *abd) +{ + struct sg_table table; + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; + sg_free_table(&table); +} + +void +abd_free_chunks(abd_t *abd) +{ + struct scatterlist *sg = NULL; + struct page *page; + int nr_pages = ABD_SCATTER(abd).abd_nents; + int order, i = 0; + + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } + abd_free_sg_table(abd); +} + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in + * the scatterlist will be set to the zero'd out buffer abd_zero_page. 
+ */ +static void +abd_alloc_zero_scatter(void) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_zero_page = gfp | __GFP_ZERO; + int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + int i = 0; + + while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + abd_mark_zfs_page(abd_zero_page); + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + ASSERT3U(table.nents, ==, nr_pages); + + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; + ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; + + abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { + sg_set_page(sg, abd_zero_page, PAGESIZE, 0); + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); +} + +#else /* _KERNEL */ + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) +#endif + +#define zfs_kmap_atomic(chunk) ((void *)chunk) +#define zfs_kunmap_atomic(addr) do { (void)(addr); } while (0) +#define local_irq_save(flags) do { (void)(flags); } while (0) +#define local_irq_restore(flags) do { (void)(flags); } while (0) +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) +{ + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +/* + * This must be called if any of the sg_table allocation functions + * are 
called. + */ +static void +abd_free_sg_table(abd_t *abd) +{ + int nents = ABD_SCATTER(abd).abd_nents; + vmem_free(ABD_SCATTER(abd).abd_sgl, + nents * sizeof (struct scatterlist)); +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +void +abd_free_chunks(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + + abd_for_each_sg(abd, sg, n, i) { + for (int j = 0; j < sg->length; j += PAGESIZE) { + struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); + umem_free(p, PAGESIZE); + } + } + abd_free_sg_table(abd); +} + +static void +abd_alloc_zero_scatter(void) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + struct scatterlist *sg; + int i; + + abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + memset(abd_zero_page, 0, PAGESIZE); + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; + 
ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + zfs_refcount_create(&abd_zero_scatter->abd_children); + ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + + sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); + + abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { + sg_set_page(sg, abd_zero_page, PAGESIZE, 0); + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); +} + +#endif /* _KERNEL */ + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size; + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste); + arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste); + arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + size_t n; + int i = 0; + struct scatterlist *sg = NULL; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + 
ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } +} + +static void +abd_free_zero_scatter(void) +{ + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE); + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_free_sg_table(abd_zero_scatter); + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; + ASSERT3P(abd_zero_page, !=, NULL); +#if defined(_KERNEL) + abd_unmark_zfs_page(abd_zero_page); + __free_page(abd_zero_page); +#else + umem_free(abd_zero_page, PAGESIZE); +#endif /* _KERNEL */ +} + +static int +abd_kstats_update(kstat_t *ksp, int rw) +{ + abd_stats_t *as = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + as->abdstat_struct_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_struct_size); + as->abdstat_linear_cnt.value.ui64 = + wmsum_value(&abd_sums.abdstat_linear_cnt); + as->abdstat_linear_data_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_linear_data_size); + as->abdstat_scatter_cnt.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_cnt); + as->abdstat_scatter_data_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_data_size); + as->abdstat_scatter_chunk_waste.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); + for (int i = 0; i < MAX_ORDER; i++) { + as->abdstat_scatter_orders[i].value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_orders[i]); + } + as->abdstat_scatter_page_multi_chunk.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk); + as->abdstat_scatter_page_multi_zone.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone); + as->abdstat_scatter_page_alloc_retry.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry); + as->abdstat_scatter_sg_table_retry.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry); + return (0); +} + +void +abd_init(void) +{ + int i; + + abd_cache = 
kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + wmsum_init(&abd_sums.abdstat_struct_size, 0); + wmsum_init(&abd_sums.abdstat_linear_cnt, 0); + wmsum_init(&abd_sums.abdstat_linear_data_size, 0); + wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); + wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); + wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); + for (i = 0; i < MAX_ORDER; i++) + wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0); + wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0); + wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0); + wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0); + wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + for (i = 0; i < MAX_ORDER; i++) { + snprintf(abd_stats.abdstat_scatter_orders[i].name, + KSTAT_STRLEN, "scatter_order_%d", i); + abd_stats.abdstat_scatter_orders[i].data_type = + KSTAT_DATA_UINT64; + } + abd_ksp->ks_data = &abd_stats; + abd_ksp->ks_update = abd_kstats_update; + kstat_install(abd_ksp); + } + + abd_alloc_zero_scatter(); +} + +void +abd_fini(void) +{ + abd_free_zero_scatter(); + + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + wmsum_fini(&abd_sums.abdstat_struct_size); + wmsum_fini(&abd_sums.abdstat_linear_cnt); + wmsum_fini(&abd_sums.abdstat_linear_data_size); + wmsum_fini(&abd_sums.abdstat_scatter_cnt); + wmsum_fini(&abd_sums.abdstat_scatter_data_size); + wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); + for (int i = 0; i < MAX_ORDER; i++) + wmsum_fini(&abd_sums.abdstat_scatter_orders[i]); + wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk); + wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone); + wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry); + wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry); + + if (abd_cache) { + 
kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* Transform it back into a scatter ABD for freeing */ + struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + abd->abd_flags &= ~ABD_FLAG_LINEAR; + abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; + ABD_SCATTER(abd).abd_nents = 1; + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_sgl = sg; + abd_free_chunks(abd); + + abd_update_scatter_stats(abd, ABDSTAT_DECR); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc(size, is_metadata)); +} + +abd_t * +abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, + size_t size) +{ + int i = 0; + struct scatterlist *sg = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + + if (abd == NULL) + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. 
+ */ + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + + return (abd); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_gang(abd)); + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; + aiter->iter_pos = 0; + if (abd_is_linear(abd)) { + aiter->iter_offset = 0; + aiter->iter_sg = NULL; + } else { + aiter->iter_offset = ABD_SCATTER(abd).abd_offset; + aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + } +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; + aiter->iter_offset += amount; + if (!abd_is_linear(aiter->iter_abd)) { + while (aiter->iter_offset >= aiter->iter_sg->length) { + aiter->iter_offset -= aiter->iter_sg->length; + aiter->iter_sg = sg_next(aiter->iter_sg); + if (aiter->iter_sg == NULL) { + ASSERT0(aiter->iter_offset); + break; + } + } + } +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
+ */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + offset = aiter->iter_offset; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + offset = aiter->iter_offset; + aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + + paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg)); + } + + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (!abd_is_linear(aiter->iter_abd)) { + /* LINTED E_FUNC_SET_NOT_USED */ + zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset); + } + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_cache_reap_now(void) +{ +} + +#if defined(_KERNEL) +/* + * bio_nr_pages for ABD. 
+ * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + if (abd_is_gang(abd)) { + unsigned long count = 0; + + for (abd_t *cabd = abd_gang_get_offset(abd, &off); + cabd != NULL && size != 0; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + ASSERT3U(off, <, cabd->abd_size); + int mysize = MIN(size, cabd->abd_size - off); + count += abd_nr_pages_off(cabd, mysize, off); + size -= mysize; + off = 0; + } + return (count); + } + + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = ABD_SCATTER(abd).abd_offset + off; + + return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT)); +} + +static unsigned int +bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size) +{ + unsigned int offset, size, i; + struct page *page; + + offset = offset_in_page(buf_ptr); + for (i = 0; i < bio->bi_max_vecs; i++) { + size = PAGE_SIZE - offset; + + if (bio_size <= 0) + break; + + if (size > bio_size) + size = bio_size; + + if (is_vmalloc_addr(buf_ptr)) + page = vmalloc_to_page(buf_ptr); + else + page = virt_to_page(buf_ptr); + + /* + * Some network related block devices use tcp_sendpage, which + * doesn't behave well with 0-count pages; this is a + * safety net to catch them. + */ + ASSERT3S(page_count(page), >, 0); + + if (bio_add_page(bio, page, size, offset) != size) + break; + + buf_ptr += size; + bio_size -= size; + offset = 0; + } + + return (bio_size); +} + +/* + * bio_map for gang ABD. 
+ */ +static unsigned int +abd_gang_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + ASSERT(abd_is_gang(abd)); + + for (abd_t *cabd = abd_gang_get_offset(abd, &off); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + ASSERT3U(off, <, cabd->abd_size); + int size = MIN(io_size, cabd->abd_size - off); + int remainder = abd_bio_map_off(bio, cabd, size, off); + io_size -= (size - remainder); + if (io_size == 0 || remainder > 0) + return (io_size); + off = 0; + } + ASSERT0(io_size); + return (io_size); +} + +/* + * bio_map for ABD. + * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + struct abd_iter aiter; + + ASSERT3U(io_size, <=, abd->abd_size - off); + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size)); + + ASSERT(!abd_is_linear(abd)); + if (abd_is_gang(abd)) + return (abd_gang_bio_map_off(bio, abd, io_size, off)); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + for (int i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, sgoff, pgoff; + struct scatterlist *sg; + + if (io_size <= 0) + break; + + sg = aiter.iter_sg; + sgoff = aiter.iter_offset; + pgoff = sgoff & (PAGESIZE - 1); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +module_param(zfs_abd_scatter_min_size, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_min_size, + "Minimum size of scatter allocations."); +/* CSTYLED */ +module_param(zfs_abd_scatter_max_order, uint, 0644); 
+MODULE_PARM_DESC(zfs_abd_scatter_max_order, + "Maximum order allocation used for a scatter ABD."); +#endif diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c new file mode 100644 index 0000000000..f96cd1271e --- /dev/null +++ b/module/os/linux/zfs/arc_os.c @@ -0,0 +1,541 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +/* + * This is a limit on how many pages the ARC shrinker makes available for + * eviction in response to one page allocation attempt. 
Note that in + * practice, the kernel's shrinker can ask us to evict up to about 4x this + * for one allocation attempt. + * + * The default limit of 10,000 (in practice, 160MB per allocation attempt + * with 4K pages) limits the amount of time spent attempting to reclaim ARC + * memory to less than 100ms per allocation attempt, even with a small + * average compressed block size of ~8KB. + * + * See also the comment in arc_shrinker_count(). + * Set to 0 to disable limit. + */ +int zfs_arc_shrinker_limit = 10000; + +#ifdef CONFIG_MEMORY_HOTPLUG +static struct notifier_block arc_hotplug_callback_mem_nb; +#endif + +/* + * Return a default max arc size based on the amount of physical memory. + */ +uint64_t +arc_default_max(uint64_t min, uint64_t allmem) +{ + /* Default to 1/2 of all memory. */ + return (MAX(allmem / 2, min)); +} + +#ifdef _KERNEL +/* + * Return maximum amount of memory that we could possibly use. Reduced + * to half of all memory in user space which is primarily used for testing. + */ +uint64_t +arc_all_memory(void) +{ +#ifdef CONFIG_HIGHMEM + return (ptob(zfs_totalram_pages - zfs_totalhigh_pages)); +#else + return (ptob(zfs_totalram_pages)); +#endif /* CONFIG_HIGHMEM */ +} + +/* + * Return the amount of memory that is considered free. In user space + * which is primarily used for testing we pretend that free memory ranges + * from 0-20% of all memory. + */ +uint64_t +arc_free_memory(void) +{ +#ifdef CONFIG_HIGHMEM + struct sysinfo si; + si_meminfo(&si); + return (ptob(si.freeram - si.freehigh)); +#else + return (ptob(nr_free_pages() + + nr_inactive_file_pages())); +#endif /* CONFIG_HIGHMEM */ +} + +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. 
+ */ +int64_t +arc_available_memory(void) +{ + return (arc_free_memory() - arc_sys_free); +} + +static uint64_t +arc_evictable_memory(void) +{ + int64_t asize = aggsum_value(&arc_sums.arcstat_size); + uint64_t arc_clean = + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0); + + /* + * Scale reported evictable memory in proportion to page cache, cap + * at specified min/max. + */ + uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent; + min = MAX(arc_c_min, MIN(arc_c_max, min)); + + if (arc_dirty >= min) + return (arc_clean); + + return (MAX((int64_t)asize - (int64_t)min, 0)); +} + +/* + * The _count() function returns the number of free-able objects. + * The _scan() function returns the number of objects that were freed. + */ +static unsigned long +arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) +{ + /* + * __GFP_FS won't be set if we are called from ZFS code (see + * kmem_flags_convert(), which removes it). To avoid a deadlock, we + * don't allow evicting in this case. We return 0 rather than + * SHRINK_STOP so that the shrinker logic doesn't accumulate a + * deficit against us. + */ + if (!(sc->gfp_mask & __GFP_FS)) { + return (0); + } + + /* + * This code is reached in the "direct reclaim" case, where the + * kernel (outside ZFS) is trying to allocate a page, and the system + * is low on memory. + * + * The kernel's shrinker code doesn't understand how many pages the + * ARC's callback actually frees, so it may ask the ARC to shrink a + * lot for one page allocation. This is problematic because it may + * take a long time, thus delaying the page allocation, and because + * it may force the ARC to unnecessarily shrink very small. 
+ * + * Therefore, we limit the amount of data that we say is evictable, + * which limits the amount that the shrinker will ask us to evict for + * one page allocation attempt. + * + * In practice, we may be asked to shrink 4x the limit to satisfy one + * page allocation, before the kernel's shrinker code gives up on us. + * When that happens, we rely on the kernel code to find the pages + * that we freed before invoking the OOM killer. This happens in + * __alloc_pages_slowpath(), which retries and finds the pages we + * freed when it calls get_page_from_freelist(). + * + * See also the comment above zfs_arc_shrinker_limit. + */ + int64_t limit = zfs_arc_shrinker_limit != 0 ? + zfs_arc_shrinker_limit : INT64_MAX; + return (MIN(limit, btop((int64_t)arc_evictable_memory()))); +} + +static unsigned long +arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + ASSERT((sc->gfp_mask & __GFP_FS) != 0); + + /* The arc is considered warm once reclaim has occurred */ + if (unlikely(arc_warm == B_FALSE)) + arc_warm = B_TRUE; + + /* + * Evict the requested number of pages by reducing arc_c and waiting + * for the requested amount of data to be evicted. + */ + arc_reduce_target_size(ptob(sc->nr_to_scan)); + arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); + if (current->reclaim_state != NULL) + current->reclaim_state->reclaimed_slab += sc->nr_to_scan; + + /* + * We are experiencing memory pressure which the arc_evict_zthr was + * unable to keep up with. Set arc_no_grow to briefly pause arc + * growth to avoid compounding the memory pressure. + */ + arc_no_grow = B_TRUE; + + /* + * When direct reclaim is observed it usually indicates a rapid + * increase in memory pressure. This occurs because the kswapd + * threads were unable to asynchronously keep enough free memory + * available. 
+ */ + if (current_is_kswapd()) { + ARCSTAT_BUMP(arcstat_memory_indirect_count); + } else { + ARCSTAT_BUMP(arcstat_memory_direct_count); + } + + return (sc->nr_to_scan); +} + +SPL_SHRINKER_DECLARE(arc_shrinker, + arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + uint64_t free_memory = arc_free_memory(); + + if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100) + return (0); + + if (txg > spa->spa_lowmem_last_txg) { + spa->spa_lowmem_last_txg = txg; + spa->spa_lowmem_page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. + */ + if (current_is_kswapd()) { + if (spa->spa_lowmem_page_load > + MAX(arc_sys_free / 4, free_memory) / 4) { + DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); + return (SET_ERROR(ERESTART)); + } + /* Note: reserve is inflated, so we deflate */ + atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); + return (0); + } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { + /* memory is low, delay before restarting */ + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); + return (SET_ERROR(EAGAIN)); + } + spa->spa_lowmem_page_load = 0; + return (0); +} + +static void +arc_set_sys_free(uint64_t allmem) +{ + /* + * The ARC tries to keep at least this much memory available for the + * system. This gives the ARC time to shrink in response to memory + * pressure, before running completely out of memory and invoking the + * direct-reclaim ARC shrinker. + * + * This should be more than twice high_wmark_pages(), so that + * arc_wait_for_eviction() will wait until at least the + * high_wmark_pages() are free (see arc_evict_state_impl()). 
+ * + * Note: Even when the system is very low on memory, the kernel's + * shrinker code may only ask for one "batch" of pages (512KB) to be + * evicted. If concurrent allocations consume these pages, there may + * still be insufficient free pages, and the OOM killer takes action. + * + * By setting arc_sys_free large enough, and having + * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 + * free memory, it is much less likely that concurrent allocations can + * consume all the memory that was evicted before checking for + * OOM. + * + * It's hard to iterate the zones from a linux kernel module, which + * makes it difficult to determine the watermark dynamically. Instead + * we compute the maximum high watermark for this system, based + * on the amount of memory, assuming default parameters on Linux kernel + * 5.3. + */ + + /* + * Base wmark_low is 4 * the square root of Kbytes of RAM. + */ + long wmark = 4 * int_sqrt(allmem/1024) * 1024; + + /* + * Clamp to between 128K and 64MB. + */ + wmark = MAX(wmark, 128 * 1024); + wmark = MIN(wmark, 64 * 1024 * 1024); + + /* + * watermark_boost can increase the wmark by up to 150%. + */ + wmark += wmark * 150 / 100; + + /* + * arc_sys_free needs to be more than 2x the watermark, because + * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up + * to 3x to ensure we're above it. + */ + arc_sys_free = wmark * 3 + allmem / 32; +} + +void +arc_lowmem_init(void) +{ + uint64_t allmem = arc_all_memory(); + + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. 
+ */ + spl_register_shrinker(&arc_shrinker); + arc_set_sys_free(allmem); +} + +void +arc_lowmem_fini(void) +{ + spl_unregister_shrinker(&arc_shrinker); +} + +int +param_set_arc_long(const char *buf, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_long(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + arc_tuning_update(B_TRUE); + + return (0); +} + +int +param_set_arc_min(const char *buf, zfs_kernel_param_t *kp) +{ + return (param_set_arc_long(buf, kp)); +} + +int +param_set_arc_max(const char *buf, zfs_kernel_param_t *kp) +{ + return (param_set_arc_long(buf, kp)); +} + +int +param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_int(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + arc_tuning_update(B_TRUE); + + return (0); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +/* ARGSUSED */ +static int +arc_hotplug_callback(struct notifier_block *self, unsigned long action, + void *arg) +{ + uint64_t allmem = arc_all_memory(); + if (action != MEM_ONLINE) + return (NOTIFY_OK); + + arc_set_limits(allmem); + +#ifdef __LP64__ + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#else + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#endif + + arc_set_sys_free(allmem); + return (NOTIFY_OK); +} +#endif + +void +arc_register_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback; + /* There is no significance to the value 100 */ + arc_hotplug_callback_mem_nb.priority = 100; + register_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} + +void +arc_unregister_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + unregister_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} +#else /* _KERNEL */ +int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + + 
/* Every 100 calls, free a small amount */ + if (random_in_range(100) == 0) + lowest = -1024; + + return (lowest); +} + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + return (0); +} + +uint64_t +arc_all_memory(void) +{ + return (ptob(physmem) / 2); +} + +uint64_t +arc_free_memory(void) +{ + return (random_in_range(arc_all_memory() * 20 / 100)); +} + +void +arc_register_hotplug(void) +{ +} + +void +arc_unregister_hotplug(void) +{ +} +#endif /* _KERNEL */ + +/* + * Helper function for arc_prune_async(); it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffers they reference. This provides a mechanism to ensure the ARC can + * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * is analogous to dnlc_reduce_cache() but more generic. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. 
+ */ +void +arc_prune_async(int64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, + "Limit on number of pages that ARC shrinker can reclaim at once"); +/* END CSTYLED */ diff --git a/module/os/linux/zfs/mmp_os.c b/module/os/linux/zfs/mmp_os.c new file mode 100644 index 0000000000..ff3ef1bf6a --- /dev/null +++ b/module/os/linux/zfs/mmp_os.c @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. 
+ */ + +#include +#include + +int +param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return (ret); + + if (spa_mode_global != SPA_MODE_UNINIT) + mmp_signal_all_threads(); + + return (ret); +} diff --git a/module/zfs/policy.c b/module/os/linux/zfs/policy.c similarity index 75% rename from module/zfs/policy.c rename to module/os/linux/zfs/policy.c index 55c9327479..bbccb2e572 100644 --- a/module/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -42,15 +42,13 @@ * all other cases this function must fail and return the passed err. */ static int -priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err, +priv_policy_ns(const cred_t *cr, int capability, int err, struct user_namespace *ns) { - ASSERT3S(all, ==, B_FALSE); - if (cr != CRED() && (cr != kcred)) return (err); -#if defined(CONFIG_USER_NS) && defined(HAVE_NS_CAPABLE) +#if defined(CONFIG_USER_NS) if (!(ns ? ns_capable(ns, capability) : capable(capability))) #else if (!capable(capability)) @@ -61,25 +59,24 @@ priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err, } static int -priv_policy(const cred_t *cr, int capability, boolean_t all, int err) +priv_policy(const cred_t *cr, int capability, int err) { - return (priv_policy_ns(cr, capability, all, err, NULL)); + return (priv_policy_ns(cr, capability, err, NULL)); } static int -priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err) +priv_policy_user(const cred_t *cr, int capability, int err) { /* - * All priv_policy_user checks are preceeded by kuid/kgid_has_mapping() + * All priv_policy_user checks are preceded by kuid/kgid_has_mapping() * checks. If we cannot do them, we shouldn't be using ns_capable() * since we don't know whether the affected files are valid in our - * namespace. Note that kuid_has_mapping() came after cred->user_ns, so - * we shouldn't need to re-check for HAVE_CRED_USER_NS + * namespace. 
*/ -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) - return (priv_policy_ns(cr, capability, all, err, cr->user_ns)); +#if defined(CONFIG_USER_NS) + return (priv_policy_ns(cr, capability, err, cr->user_ns)); #else - return (priv_policy_ns(cr, capability, all, err, NULL)); + return (priv_policy_ns(cr, capability, err, NULL)); #endif } @@ -90,7 +87,7 @@ priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err) int secpolicy_nfs(const cred_t *cr) { - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); + return (priv_policy(cr, CAP_SYS_ADMIN, EPERM)); } /* @@ -99,7 +96,7 @@ secpolicy_nfs(const cred_t *cr) int secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) { - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); + return (priv_policy(cr, CAP_SYS_ADMIN, EPERM)); } /* @@ -127,18 +124,18 @@ secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) if (crgetfsuid(cr) == owner) return (0); - if (zpl_inode_owner_or_capable(ip)) + if (zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (0); -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) +#if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif - if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0) + if (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0) return (0); - if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0) + if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0) return (0); return (EPERM); @@ -153,12 +150,12 @@ secpolicy_vnode_chown(const cred_t *cr, uid_t owner) if (crgetfsuid(cr) == owner) return (0); -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) +#if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif - return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); + return (priv_policy_user(cr, CAP_FOWNER, EPERM)); } /* @@ -167,7 +164,7 @@ secpolicy_vnode_chown(const cred_t 
*cr, uid_t owner) int secpolicy_vnode_create_gid(const cred_t *cr) { - return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM)); + return (priv_policy(cr, CAP_SETGID, EPERM)); } /* @@ -177,7 +174,7 @@ secpolicy_vnode_create_gid(const cred_t *cr) int secpolicy_vnode_remove(const cred_t *cr) { - return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); + return (priv_policy(cr, CAP_FOWNER, EPERM)); } /* @@ -190,12 +187,12 @@ secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) if (crgetfsuid(cr) == owner) return (0); -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) +#if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif - return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); + return (priv_policy_user(cr, CAP_FOWNER, EPERM)); } /* @@ -207,9 +204,10 @@ secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) * Enforced in the Linux VFS. */ int -secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) +secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr, + boolean_t issuidroot) { - return (0); + return (priv_policy_user(cr, CAP_FSETID, EPERM)); } /* @@ -218,12 +216,12 @@ secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) int secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) { -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) +#if defined(CONFIG_USER_NS) if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) return (EPERM); #endif if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) - return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); + return (priv_policy_user(cr, CAP_FSETID, EPERM)); return (0); } @@ -235,7 +233,7 @@ secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) int secpolicy_zinject(const cred_t *cr) { - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); + return (priv_policy(cr, CAP_SYS_ADMIN, EACCES)); } /* @@ -245,14 +243,36 @@ secpolicy_zinject(const cred_t *cr) int secpolicy_zfs(const cred_t 
*cr) { - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); + return (priv_policy(cr, CAP_SYS_ADMIN, EACCES)); +} + +/* + * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of + * the current process. Takes both cred_t and proc_t so that this can work + * easily on all platforms. + * + * The has_capability() function was first exported in the 4.10 Linux kernel + * then backported to some LTS kernels. Prior to this change there was no + * mechanism to perform this check therefore EACCES is returned when the + * functionality is not present in the kernel. + */ +int +secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) +{ +#if defined(HAVE_HAS_CAPABILITY) + if (!has_capability(proc, CAP_SYS_ADMIN)) + return (EACCES); + return (0); +#else + return (EACCES); +#endif } void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, + secpolicy_vnode_setid_retain(NULL, cr, (vap->va_mode & S_ISUID) != 0 && (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { vap->va_mask |= AT_MODE; @@ -269,12 +289,12 @@ secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) if (crgetfsuid(cr) == owner) return (0); -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) +#if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif - return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); + return (priv_policy_user(cr, CAP_FSETID, EPERM)); } /* @@ -325,7 +345,7 @@ secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, * Check privileges for setting xvattr attributes */ int -secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) +secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type) { return (secpolicy_vnode_chown(cr, owner)); } diff --git a/module/zfs/qat.c b/module/os/linux/zfs/qat.c similarity index 99% rename from module/zfs/qat.c rename to module/os/linux/zfs/qat.c index 
a6f024cb44..08613b3a20 100644 --- a/module/zfs/qat.c +++ b/module/os/linux/zfs/qat.c @@ -21,7 +21,7 @@ #if defined(_KERNEL) && defined(HAVE_QAT) #include -#include "qat.h" +#include qat_stats_t qat_stats = { { "comp_requests", KSTAT_DATA_UINT64 }, diff --git a/module/zfs/qat_compress.c b/module/os/linux/zfs/qat_compress.c similarity index 87% rename from module/zfs/qat_compress.c rename to module/os/linux/zfs/qat_compress.c index 1c5c0a4e72..1d099c95bc 100644 --- a/module/zfs/qat_compress.c +++ b/module/os/linux/zfs/qat_compress.c @@ -24,11 +24,10 @@ #include #include #include -#include #include #include #include -#include "qat.h" +#include /* * Max instances in a QAT device, each instance is a channel to submit @@ -249,7 +248,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, Cpa8U *buffer_meta_dst = NULL; Cpa32U buffer_meta_size = 0; CpaDcRqResults dc_results; - CpaStatus status = CPA_STATUS_SUCCESS; + CpaStatus status = CPA_STATUS_FAIL; Cpa32U hdr_sz = 0; Cpa32U compressed_sz; Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2; @@ -278,16 +277,19 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) + ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer)); - if (QAT_PHYS_CONTIG_ALLOC(&in_pages, - num_src_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC(&in_pages, + num_src_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) goto fail; - if (QAT_PHYS_CONTIG_ALLOC(&out_pages, - num_dst_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC(&out_pages, + num_dst_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) goto fail; - if (QAT_PHYS_CONTIG_ALLOC(&add_pages, - num_add_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC(&add_pages, + num_add_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) goto fail; i = 
(Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; @@ -296,19 +298,19 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf, &buffer_meta_size); - if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size) != - CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) goto fail; cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf, &buffer_meta_size); - if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size) != - CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) goto fail; /* build source buffer list */ - if (QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size) != - CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) goto fail; flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1); @@ -316,8 +318,8 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, buf_list_src->pBuffers = flat_buf_src; /* always point to first one */ /* build destination buffer list */ - if (QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size) != - CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) goto fail; flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); @@ -404,11 +406,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, } /* we now wait until the completion of the operation. 
*/ - if (!wait_for_completion_interruptible_timeout(&complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + wait_for_completion(&complete); if (dc_results.status != CPA_STATUS_SUCCESS) { status = CPA_STATUS_FAIL; @@ -421,30 +419,11 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, goto fail; } - flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); - /* move to the last page */ - flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT; + /* get adler32 checksum and append footer */ + *(Cpa32U*)(dst + hdr_sz + compressed_sz) = + BSWAP_32(dc_results.checksum); - /* no space for gzip footer in the last page */ - if (((compressed_sz + hdr_sz) % PAGE_SIZE) - + ZLIB_FOOT_SZ > PAGE_SIZE) { - status = CPA_STATUS_INCOMPRESSIBLE; - goto fail; - } - - /* jump to the end of the buffer and append footer */ - flat_buf_dst->pData = - (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK) - + ((compressed_sz + hdr_sz) % PAGE_SIZE); - flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ; - - dc_results.produced = 0; - status = cpaDcGenerateFooter(session_handle, - flat_buf_dst, &dc_results); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - *c_len = compressed_sz + dc_results.produced + hdr_sz; + *c_len = hdr_sz + compressed_sz + ZLIB_FOOT_SZ; QAT_STAT_INCR(comp_total_out_bytes, *c_len); } else { ASSERT3U(dir, ==, QAT_DECOMPRESS); @@ -463,11 +442,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, } /* we now wait until the completion of the operation. 
*/ - if (!wait_for_completion_interruptible_timeout(&complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + wait_for_completion(&complete); if (dc_results.status != CPA_STATUS_SUCCESS) { status = CPA_STATUS_FAIL; @@ -547,7 +522,7 @@ qat_compress(qat_compress_dir_t dir, char *src, int src_len, } static int -param_set_qat_compress(const char *val, struct kernel_param *kp) +param_set_qat_compress(const char *val, zfs_kernel_param_t *kp) { int ret; int *pvalue = kp->arg; diff --git a/module/zfs/qat_crypt.c b/module/os/linux/zfs/qat_crypt.c similarity index 96% rename from module/zfs/qat_crypt.c rename to module/os/linux/zfs/qat_crypt.c index 34c19b5823..4771b2f3be 100644 --- a/module/zfs/qat_crypt.c +++ b/module/os/linux/zfs/qat_crypt.c @@ -31,12 +31,11 @@ #include #include #include -#include #include #include #include "lac/cpa_cy_im.h" #include "lac/cpa_cy_common.h" -#include "qat.h" +#include /* * Max instances in a QAT device, each instance is a channel to submit @@ -415,6 +414,9 @@ qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, op_data.messageLenToCipherInBytes = enc_len; op_data.ivLenInBytes = ZIO_DATA_IV_LEN; bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN); + /* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */ + if (dir == QAT_DECRYPT) + bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN); cb.verify_result = CPA_FALSE; init_completion(&cb.complete); @@ -423,23 +425,21 @@ qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, if (status != CPA_STATUS_SUCCESS) goto fail; - if (!wait_for_completion_interruptible_timeout(&cb.complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + /* we now wait until the completion of the operation. 
*/ + wait_for_completion(&cb.complete); if (cb.verify_result == CPA_FALSE) { status = CPA_STATUS_FAIL; goto fail; } - /* save digest result to digest_buf */ - bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); - if (dir == QAT_ENCRYPT) + if (dir == QAT_ENCRYPT) { + /* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */ + bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); QAT_STAT_INCR(encrypt_total_out_bytes, enc_len); - else + } else { QAT_STAT_INCR(decrypt_total_out_bytes, enc_len); + } fail: if (status != CPA_STATUS_SUCCESS) @@ -549,11 +549,9 @@ qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) if (status != CPA_STATUS_SUCCESS) goto fail; - if (!wait_for_completion_interruptible_timeout(&cb.complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + if (cb.verify_result == CPA_FALSE) { status = CPA_STATUS_FAIL; goto fail; @@ -578,7 +576,7 @@ fail: } static int -param_set_qat_encrypt(const char *val, struct kernel_param *kp) +param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp) { int ret; int *pvalue = kp->arg; @@ -600,7 +598,7 @@ param_set_qat_encrypt(const char *val, struct kernel_param *kp) } static int -param_set_qat_checksum(const char *val, struct kernel_param *kp) +param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp) { int ret; int *pvalue = kp->arg; diff --git a/module/os/linux/zfs/spa_misc_os.c b/module/os/linux/zfs/spa_misc_os.c new file mode 100644 index 0000000000..5672cd6d5c --- /dev/null +++ b/module/os/linux/zfs/spa_misc_os.c @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_prop.h" + + +int +param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = -param_set_deadman_failmode_common(val); + if (error == 0) + error = param_set_charp(val, kp); + + return (error); +} + +int +param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_ulong(val, kp); + if (error < 0) + return (SET_ERROR(error)); + + spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms)); + + return (0); +} + +int +param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_ulong(val, kp); + if (error < 0) + return (SET_ERROR(error)); + + spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); + + return (0); +} + +int +param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + int error; + + error = kstrtoul(buf, 0, &val); + if (error) + return (SET_ERROR(error)); + + if (val < 1 || val > 31) + return (SET_ERROR(-EINVAL)); + + error = param_set_int(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} + +const char * +spa_history_zone(void) +{ + return ("linux"); +} diff --git a/module/zfs/trace.c b/module/os/linux/zfs/trace.c similarity index 87% rename from module/zfs/trace.c rename to module/os/linux/zfs/trace.c index eb6efe841c..a690822ae1 100644 --- a/module/zfs/trace.c +++ b/module/os/linux/zfs/trace.c @@ -18,9 +18,10 @@ * * CDDL HEADER END */ + /* - * Each Linux tracepoints subsystem must define CREATE_TRACE_POINTS in one - * (and only one) C file, so this dummy file exists for that purpose. + * Each DTRACE_PROBE must define its trace point in one (and only one) + * source file, so this dummy file exists for that purpose. 
*/ #include @@ -32,21 +33,23 @@ #include #include #include -#include #include #include -#include +#ifdef _KERNEL #define CREATE_TRACE_POINTS #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include +#endif diff --git a/module/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c similarity index 69% rename from module/zfs/vdev_disk.c rename to module/os/linux/zfs/vdev_disk.c index 1419ae6ad5..42144322a3 100644 --- a/module/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -34,14 +34,30 @@ #include #include #include -#include +#include #include #include -char *zfs_vdev_scheduler = VDEV_SCHEDULER; +typedef struct vdev_disk { + struct block_device *vd_bdev; + krwlock_t vd_lock; +} vdev_disk_t; + +/* + * Unique identifier for the exclusive vdev holder. + */ static void *zfs_vdev_holder = VDEV_HOLDER; -/* size of the "reserved" partition, in blocks */ +/* + * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the + * device is missing. The missing path may be transient since the links + * can be briefly removed and recreated in response to udev events. + */ +static unsigned zfs_vdev_open_timeout_ms = 1000; + +/* + * Size of the "reserved" partition, in blocks. 
+ */ #define EFI_MIN_RESV_SIZE (16 * 1024) /* @@ -55,37 +71,19 @@ typedef struct dio_request { struct bio *dr_bio[0]; /* Attached bio's */ } dio_request_t; - -#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH) static fmode_t -vdev_bdev_mode(int smode) +vdev_bdev_mode(spa_mode_t spa_mode) { fmode_t mode = 0; - ASSERT3S(smode & (FREAD | FWRITE), !=, 0); - - if (smode & FREAD) + if (spa_mode & SPA_MODE_READ) mode |= FMODE_READ; - if (smode & FWRITE) + if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; return (mode); } -#else -static int -vdev_bdev_mode(int smode) -{ - int mode = 0; - - ASSERT3S(smode & (FREAD | FWRITE), !=, 0); - - if ((smode & FREAD) && !(smode & FWRITE)) - mode = SB_RDONLY; - - return (mode); -} -#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ /* * Returns the usable capacity (in bytes) for the partition or disk. @@ -96,6 +94,14 @@ bdev_capacity(struct block_device *bdev) return (i_size_read(bdev->bd_inode)); } +#if !defined(HAVE_BDEV_WHOLE) +static inline struct block_device * +bdev_whole(struct block_device *bdev) +{ + return (bdev->bd_contains); +} +#endif + /* * Returns the maximum expansion capacity of the block device (in bytes). * @@ -120,7 +126,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) uint64_t psize; int64_t available; - if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) { + if (wholedisk && bdev != bdev_whole(bdev)) { /* * When reporting maximum expansion capacity for a wholedisk * deduct any capacity which is expected to be lost due to @@ -134,7 +140,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) * "reserved" EFI partition: in such cases return the device * usable capacity. 
*/ - available = i_size_read(bdev->bd_contains->bd_inode) - + available = i_size_read(bdev_whole(bdev)->bd_inode) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); @@ -160,83 +166,13 @@ vdev_disk_error(zio_t *zio) zio->io_flags); } -/* - * Use the Linux 'noop' elevator for zfs managed block devices. This - * strikes the ideal balance by allowing the zfs elevator to do all - * request ordering and prioritization. While allowing the Linux - * elevator to do the maximum front/back merging allowed by the - * physical device. This yields the largest possible requests for - * the device with the lowest total overhead. - */ -static void -vdev_elevator_switch(vdev_t *v, char *elevator) -{ - vdev_disk_t *vd = v->vdev_tsd; - struct request_queue *q; - char *device; - int error; - - for (int c = 0; c < v->vdev_children; c++) - vdev_elevator_switch(v->vdev_child[c], elevator); - - if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL) - return; - - q = bdev_get_queue(vd->vd_bdev); - device = vd->vd_bdev->bd_disk->disk_name; - - /* - * Skip devices which are not whole disks (partitions). - * Device-mapper devices are excepted since they may be whole - * disks despite the vdev_wholedisk flag, in which case we can - * and should switch the elevator. If the device-mapper device - * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the - * "Skip devices without schedulers" check below will fail. - */ - if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0) - return; - - /* Leave existing scheduler when set to "none" */ - if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4)) - return; - - /* - * The elevator_change() function was available in kernels from - * 2.6.36 to 4.11. When not available fall back to using the user - * mode helper functionality to set the elevator via sysfs. 
This - * requires /bin/echo and sysfs to be mounted which may not be true - * early in the boot process. - */ -#ifdef HAVE_ELEVATOR_CHANGE - error = elevator_change(q, elevator); -#else -#define SET_SCHEDULER_CMD \ - "exec 0/sys/block/%s/queue/scheduler " \ - " 2>/dev/null; " \ - "echo %s" - - char *argv[] = { "/bin/sh", "-c", NULL, NULL }; - char *envp[] = { NULL }; - - argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator); - error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - strfree(argv[2]); -#endif /* HAVE_ELEVATOR_CHANGE */ - if (error) { - zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d", - elevator, v->vdev_path, device, error); - } -} - static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); - int count = 0, block_size; - int bdev_retry_count = 50; + hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; /* Must have a pathname and it must be absolute. */ @@ -248,10 +184,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* * Reopen the device if it is currently open. When expanding a - * partition force re-scanning the partition table while closed + * partition force re-scanning the partition table if userland + * did not take care of this already. We need to do this while closed * in order to get an accurate updated block device size. Then * since udev may need to recreate the device links increase the - * open retry count before reporting the device as unavailable. + * open retry timeout before reporting the device as unavailable. 
*/ vd = v->vdev_tsd; if (vd) { @@ -263,21 +200,40 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, vd->vd_bdev = NULL; if (bdev) { - if (v->vdev_expanding && bdev != bdev->bd_contains) { - bdevname(bdev->bd_contains, disk_name + 5); - reread_part = B_TRUE; + if (v->vdev_expanding && bdev != bdev_whole(bdev)) { + bdevname(bdev_whole(bdev), disk_name + 5); + /* + * If userland has BLKPG_RESIZE_PARTITION, + * then it should have updated the partition + * table already. We can detect this by + * comparing our current physical size + * with that of the device. If they are + * the same, then we must not have + * BLKPG_RESIZE_PARTITION or it failed to + * update the partition table online. We + * fall back to rescanning the partition + * table from the kernel below. However, + * if the capacity already reflects the + * updated partition, then we skip + * rescanning the partition table here. + */ + if (v->vdev_psize == bdev_capacity(bdev)) + reread_part = B_TRUE; } - vdev_bdev_close(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); } if (reread_part) { - bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder); + bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, + zfs_vdev_holder); if (!IS_ERR(bdev)) { int error = vdev_bdev_reread_part(bdev); - vdev_bdev_close(bdev, mode); - if (error == 0) - bdev_retry_count = 100; + blkdev_put(bdev, mode | FMODE_EXCL); + if (error == 0) { + timeout = MSEC2NSEC( + zfs_vdev_open_timeout_ms * 2); + } } } } else { @@ -310,12 +266,13 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. 
*/ + hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); - while (IS_ERR(bdev) && count < bdev_retry_count) { - bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder); + while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { + bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, + zfs_vdev_holder); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { schedule_timeout(MSEC_TO_TICK(10)); - count++; } else if (IS_ERR(bdev)) { break; } @@ -323,7 +280,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, if (IS_ERR(bdev)) { int error = -PTR_ERR(bdev); - vdev_dbgmsg(v, "open error=%d count=%d", error, count); + vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, + (u_longlong_t)(gethrtime() - start), + (u_longlong_t)timeout); vd->vd_bdev = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); @@ -337,7 +296,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, struct request_queue *q = bdev_get_queue(vd->vd_bdev); /* Determine the physical block size */ - block_size = vdev_bdev_block_size(vd->vd_bdev); + int physical_block_size = bdev_physical_block_size(vd->vd_bdev); + + /* Determine the logical block size */ + int logical_block_size = bdev_logical_block_size(vd->vd_bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. 
*/ v->vdev_nowritecache = B_FALSE; @@ -358,10 +320,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ - *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = highbit64(MAX(physical_block_size, + SPA_MINBLOCKSIZE)) - 1; - /* Try to set the io scheduler elevator algorithm */ - (void) vdev_elevator_switch(v, zfs_vdev_scheduler); + *logical_ashift = highbit64(MAX(logical_block_size, + SPA_MINBLOCKSIZE)) - 1; return (0); } @@ -375,8 +338,8 @@ vdev_disk_close(vdev_t *v) return; if (vd->vd_bdev != NULL) { - vdev_bdev_close(vd->vd_bdev, - vdev_bdev_mode(spa_mode(v->vdev_spa))); + blkdev_put(vd->vd_bdev, + vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); } rw_destroy(&vd->vd_lock); @@ -387,19 +350,14 @@ vdev_disk_close(vdev_t *v) static dio_request_t * vdev_disk_dio_alloc(int bio_count) { - dio_request_t *dr; - int i; - - dr = kmem_zalloc(sizeof (dio_request_t) + + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); - if (dr) { - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; - for (i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - } + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; return (dr); } @@ -471,66 +429,62 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) rc = vdev_disk_dio_put(dr); } -static unsigned int -bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) -{ - unsigned int offset, size, i; - struct page *page; - - offset = offset_in_page(bio_ptr); - for (i = 0; i < bio->bi_max_vecs; i++) { - size = PAGE_SIZE - offset; - - if (bio_size <= 0) - break; - - if (size > bio_size) - size = bio_size; - - if (is_vmalloc_addr(bio_ptr)) - page = vmalloc_to_page(bio_ptr); - else - page = 
virt_to_page(bio_ptr); - - /* - * Some network related block device uses tcp_sendpage, which - * doesn't behave well when using 0-count page, this is a - * safety net to catch them. - */ - ASSERT3S(page_count(page), >, 0); - - if (bio_add_page(bio, page, size, offset) != size) - break; - - bio_ptr += size; - bio_size -= size; - offset = 0; - } - - return (bio_size); -} - -static unsigned int -bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) -{ - if (abd_is_linear(abd)) - return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); - - return (abd_scatter_bio_map_off(bio, abd, size, off)); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { #ifdef HAVE_1ARG_SUBMIT_BIO - submit_bio(bio); + (void) submit_bio(bio); #else - submit_bio(0, bio); + (void) submit_bio(0, bio); #endif } +/* + * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so + * replace it with preempt_schedule under the following condition: + */ +#if defined(CONFIG_ARM64) && \ + defined(CONFIG_PREEMPTION) && \ + defined(CONFIG_BLK_CGROUP) +#define preempt_schedule_notrace(x) preempt_schedule(x) +#endif + #ifdef HAVE_BIO_SET_DEV #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) +/* + * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by + * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). + * As a side effect the function was converted to GPL-only. Define our + * own version when needed which uses rcu_read_lock_sched(). 
+ */ +#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) +static inline bool +vdev_blkg_tryget(struct blkcg_gq *blkg) +{ + struct percpu_ref *ref = &blkg->refcnt; + unsigned long __percpu *count; + bool rc; + + rcu_read_lock_sched(); + + if (__ref_is_percpu(ref, &count)) { + this_cpu_inc(*count); + rc = true; + } else { +#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA + rc = atomic_long_inc_not_zero(&ref->data->count); +#else + rc = atomic_long_inc_not_zero(&ref->count); +#endif + } + + rcu_read_unlock_sched(); + + return (rc); +} +#elif defined(HAVE_BLKG_TRYGET) +#define vdev_blkg_tryget(bg) blkg_tryget(bg) +#endif /* * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the * GPL-only bio_associate_blkg() symbol thus inadvertently converting @@ -540,12 +494,16 @@ vdev_submit_bio_impl(struct bio *bio) static inline void vdev_bio_associate_blkg(struct bio *bio) { +#if defined(HAVE_BIO_BDEV_DISK) + struct request_queue *q = bio->bi_bdev->bd_disk->queue; +#else struct request_queue *q = bio->bi_disk->queue; +#endif ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); - if (blkg_tryget(q->root_blkg)) + if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_associate_blkg vdev_bio_associate_blkg @@ -564,17 +522,10 @@ bio_set_dev(struct bio *bio, struct block_device *bdev) static inline void vdev_submit_bio(struct bio *bio) { -#ifdef HAVE_CURRENT_BIO_TAIL - struct bio **bio_tail = current->bio_tail; - current->bio_tail = NULL; - vdev_submit_bio_impl(bio); - current->bio_tail = bio_tail; -#else struct bio_list *bio_list = current->bio_list; current->bio_list = NULL; vdev_submit_bio_impl(bio); current->bio_list = bio_list; -#endif } static int @@ -584,25 +535,25 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; - int bio_size, bio_count = 16; - int i = 0, error = 0; -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) + int bio_size; + int bio_count = 16; + int error = 0; struct 
blk_plug plug; -#endif + /* * Accessing outside the block device is never allowed. */ if (io_offset + io_size > bdev->bd_inode->i_size) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", - io_offset, io_size, i_size_read(bdev->bd_inode)); + (u_longlong_t)io_offset, + (u_longlong_t)io_size, + (u_longlong_t)i_size_read(bdev->bd_inode)); return (SET_ERROR(EIO)); } retry: dr = vdev_disk_dio_alloc(bio_count); - if (dr == NULL) - return (SET_ERROR(ENOMEM)); if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bio_set_flags_failfast(bdev, &flags); @@ -610,26 +561,28 @@ retry: dr->dr_zio = zio; /* - * When the IO size exceeds the maximum bio size for the request - * queue we are forced to break the IO in multiple bio's and wait - * for them all to complete. Ideally, all pool users will set - * their volume block size to match the maximum request size and - * the common case will be one bio per vdev IO request. + * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which + * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio + * can cover at least 128KB and at most 1MB. When the required number + * of iovec's exceeds this, we are forced to break the IO in multiple + * bio's and wait for them all to complete. This is likely if the + * recordsize property is increased beyond 1MB. The default + * bio_count=16 should typically accommodate the maximum-size zio of + * 16MB. */ abd_offset = 0; bio_offset = io_offset; - bio_size = io_size; - for (i = 0; i <= dr->dr_bio_count; i++) { + bio_size = io_size; + for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* - * By default only 'bio_count' bio's per dio are allowed. - * However, if we find ourselves in a situation where more - * are needed we allocate a larger dio and warn the user. + * If additional bio's are required, we have to retry, but + * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { vdev_disk_dio_free(dr); @@ -638,9 +591,14 @@ retry: } /* bio_alloc() with __GFP_WAIT never returns NULL */ +#ifdef HAVE_BIO_MAX_SEGS + dr->dr_bio[i] = bio_alloc(GFP_NOIO, bio_max_segs( + abd_nr_pages_off(zio->io_abd, bio_size, abd_offset))); +#else dr->dr_bio[i] = bio_alloc(GFP_NOIO, MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), BIO_MAX_PAGES)); +#endif if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (SET_ERROR(ENOMEM)); @@ -656,7 +614,7 @@ retry: bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ - bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, + bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ @@ -667,20 +625,17 @@ retry: /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) if (dr->dr_bio_count > 1) blk_start_plug(&plug); -#endif /* Submit all bio's associated with this dio */ - for (i = 0; i < dr->dr_bio_count; i++) + for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); + } -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) if (dr->dr_bio_count > 1) blk_finish_plug(&plug); -#endif (void) vdev_disk_dio_put(dr); @@ -737,7 +692,7 @@ vdev_disk_io_start(zio_t *zio) vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; unsigned long trim_flags = 0; - int rw, flags, error; + int rw, error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 
@@ -802,24 +757,10 @@ vdev_disk_io_start(zio_t *zio) return; case ZIO_TYPE_WRITE: rw = WRITE; -#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) - flags = (1 << BIO_RW_UNPLUG); -#elif defined(REQ_UNPLUG) - flags = REQ_UNPLUG; -#else - flags = 0; -#endif break; case ZIO_TYPE_READ: rw = READ; -#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) - flags = (1 << BIO_RW_UNPLUG); -#elif defined(REQ_UNPLUG) - flags = REQ_UNPLUG; -#else - flags = 0; -#endif break; case ZIO_TYPE_TRIM: @@ -844,7 +785,7 @@ vdev_disk_io_start(zio_t *zio) zio->io_target_timestamp = zio_handle_io_delay(zio); error = __vdev_disk_physio(vd->vd_bdev, zio, - zio->io_size, zio->io_offset, rw, flags); + zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); if (error) { @@ -866,8 +807,8 @@ vdev_disk_io_done(zio_t *zio) vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - if (check_disk_change(vd->vd_bdev)) { - vdev_bdev_invalidate(vd->vd_bdev); + if (zfs_check_media_change(vd->vd_bdev)) { + invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } @@ -890,9 +831,6 @@ vdev_disk_hold(vdev_t *vd) if (vd->vdev_tsd != NULL) return; - /* XXX: Implement me as a vnode lookup for the device */ - vd->vdev_name_vp = NULL; - vd->vdev_devid_vp = NULL; } static void @@ -903,53 +841,90 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_disk_open, + .vdev_op_close = vdev_disk_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_disk_io_start, + .vdev_op_io_done = vdev_disk_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_disk_hold, + .vdev_op_rele = vdev_disk_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init 
= NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +/* + * The zfs_vdev_scheduler module option has been deprecated. Setting this + * value no longer has any effect. It has not yet been entirely removed + * to allow the module to be loaded if this option is specified in the + * /etc/modprobe.d/zfs.conf file. The following warning will be logged. + */ static int param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) { - spa_t *spa = NULL; - char *p; - - if (val == NULL) - return (SET_ERROR(-EINVAL)); - - if ((p = strchr(val, '\n')) != NULL) - *p = '\0'; - - if (spa_mode_global != 0) { - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || - !spa_writeable(spa) || spa_suspended(spa)) - continue; - - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - vdev_elevator_switch(spa->spa_root_vdev, (char *)val); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - } - mutex_exit(&spa_namespace_lock); + int error = param_set_charp(val, kp); + if (error == 0) { + printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " + "is not supported.\n"); } - return (param_set_charp(val, kp)); + return (error); } -vdev_ops_t vdev_disk_ops = { - vdev_disk_open, - vdev_disk_close, - vdev_default_asize, - vdev_disk_io_start, - vdev_disk_io_done, - NULL, - NULL, - vdev_disk_hold, - vdev_disk_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - +char *zfs_vdev_scheduler = "unused"; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, param_get_charp, &zfs_vdev_scheduler, 0644); MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); + +int +param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) +{ + uint64_t val; + int error; + + error = kstrtoull(buf, 0, 
&val); + if (error < 0) + return (SET_ERROR(error)); + + if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) + return (SET_ERROR(-EINVAL)); + + error = param_set_ulong(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} + +int +param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) +{ + uint64_t val; + int error; + + error = kstrtoull(buf, 0, &val); + if (error < 0) + return (SET_ERROR(error)); + + if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) + return (SET_ERROR(-EINVAL)); + + error = param_set_ulong(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} diff --git a/module/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c similarity index 58% rename from module/zfs/vdev_file.c rename to module/os/linux/zfs/vdev_file.c index c155057852..bf8a13ae61 100644 --- a/module/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. */ #include @@ -35,13 +35,27 @@ #include #include #include - +#include +#ifdef _KERNEL +#include +#endif /* * Virtual device vector for files. */ static taskq_t *vdev_file_taskq; +/* + * By default, the logical/physical ashift for file vdevs is set to + * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9) + * blocksizes. Users may opt to change one or both of these for testing + * or performance reasons. Care should be taken as these values will + * impact the vdev_ashift setting which can only be set at vdev creation + * time. 
+ */ +unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; +unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; + static void vdev_file_hold(vdev_t *vd) { @@ -54,13 +68,29 @@ vdev_file_rele(vdev_t *vd) ASSERT(vd->vdev_path != NULL); } +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} + static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; - vnode_t *vp; - vattr_t vattr; + zfs_file_t *fp; + zfs_file_attr_t zfa; int error; /* @@ -108,39 +138,40 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + error = zfs_file_open(vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } - vf->vf_vnode = vp; + vf->vf_file = fp; #ifdef _KERNEL /* * Make sure it's a regular file. */ - if (vp->v_type != VREG) { + if (zfs_file_getattr(fp, &zfa)) { + return (SET_ERROR(ENODEV)); + } + if (!S_ISREG(zfa.zfa_mode)) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (SET_ERROR(ENODEV)); } #endif skip_open: - /* - * Determine the physical size of the file. 
- */ - vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); + + error = zfs_file_getattr(vf->vf_file, &zfa); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } - *max_psize = *psize = vattr.va_size; - *ashift = SPA_MINBLOCKSHIFT; + *max_psize = *psize = zfa.zfa_size; + *logical_ashift = vdev_file_logical_ashift; + *physical_ashift = vdev_file_physical_ashift; return (0); } @@ -153,10 +184,8 @@ vdev_file_close(vdev_t *vd) if (vd->vdev_reopening || vf == NULL) return; - if (vf->vf_vnode != NULL) { - (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, - kcred, NULL); + if (vf->vf_file != NULL) { + (void) zfs_file_close(vf->vf_file); } vd->vdev_delayed_close = B_FALSE; @@ -172,21 +201,24 @@ vdev_file_io_strategy(void *arg) vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; void *buf; + loff_t off; + ssize_t size; + int err; - if (zio->io_type == ZIO_TYPE_READ) + off = zio->io_offset; + size = zio->io_size; + resid = 0; + + if (zio->io_type == ZIO_TYPE_READ) { buf = abd_borrow_buf(zio->io_abd, zio->io_size); - else + err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); + abd_return_buf_copy(zio->io_abd, buf, size); + } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
- UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, - zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, buf, zio->io_size); - else - abd_return_buf(zio->io_abd, buf, zio->io_size); - + err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + abd_return_buf(zio->io_abd, buf, size); + } + zio->io_error = err; if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); @@ -199,7 +231,7 @@ vdev_file_io_fsync(void *arg) zio_t *zio = (zio_t *)arg; vdev_file_t *vf = zio->io_vd->vdev_tsd; - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL); + zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); zio_interrupt(zio); } @@ -238,8 +270,8 @@ vdev_file_io_start(zio_t *zio) return; } - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, - kcred, NULL); + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC | O_DSYNC); break; default: zio->io_error = SET_ERROR(ENOTSUP); @@ -248,18 +280,14 @@ vdev_file_io_start(zio_t *zio) zio_execute(zio); return; } else if (zio->io_type == ZIO_TYPE_TRIM) { - struct flock flck; + int mode = 0; ASSERT3U(zio->io_size, !=, 0); - bzero(&flck, sizeof (flck)); - flck.l_type = F_FREESP; - flck.l_start = zio->io_offset; - flck.l_len = zio->io_size; - flck.l_whence = SEEK_SET; - - zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck, - 0, 0, kcred, NULL); - +#ifdef __linux__ + mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; +#endif + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); zio_execute(zio); return; } @@ -277,19 +305,28 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + 
.vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; void @@ -313,19 +350,33 @@ vdev_file_fini(void) #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif + +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW, + "Logical ashift for file-based devices"); 
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW, + "Physical ashift for file-based devices"); diff --git a/module/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c similarity index 82% rename from module/zfs/zfs_acl.c rename to module/os/linux/zfs/zfs_acl.c index b1af4da2f4..cf37aecf8a 100644 --- a/module/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ @@ -36,14 +37,13 @@ #include #include #include -#include #include -#include #include #include #include #include #include +#include #include #include #include @@ -269,7 +269,7 @@ zfs_ace_fuid_size(void *acep) entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (sizeof (zfs_ace_hdr_t)); - /*FALLTHROUGH*/ + fallthrough; default: return (sizeof (zfs_ace_t)); } @@ -648,7 +648,7 @@ zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, * While processing the ACL each ACE will be validated for correctness. * ACE FUIDs will be created later. */ -int +static int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) @@ -810,7 +810,7 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) * for zfs_copy_ace_2_fuid(). * * We only convert an ACL once, so this won't happen - * everytime. + * every time. */ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, KM_SLEEP); @@ -879,7 +879,6 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, /* * Determine mode of file based on ACL. 
- * Also, create FUIDs for any User/Group ACEs */ uint64_t zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, @@ -905,11 +904,9 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, entry_type = (iflags & ACE_TYPE_FLAGS); /* - * Skip over owner@, group@ or everyone@ inherit only ACEs + * Skip over any inherit_only ACEs */ - if ((iflags & ACE_INHERIT_ONLY_ACE) && - (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - entry_type == OWNING_GROUP)) + if (iflags & ACE_INHERIT_ONLY_ACE) continue; if (entry_type == ACE_OWNER || (entry_type == 0 && @@ -1156,7 +1153,7 @@ zfs_acl_chown_setattr(znode_t *zp) int error; zfs_acl_t *aclp; - if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL) + if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIX) return (0); ASSERT(MUTEX_HELD(&zp->z_lock)); @@ -1183,60 +1180,77 @@ zfs_acl_chown_setattr(znode_t *zp) return (error); } +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; + static void -acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1, - uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone) +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) { - *deny1 = *deny2 = *allow0 = *group = 0; + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + if (isdir) + write_mask |= ACE_DELETE_CHILD; + + masks->deny1 = 0; if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) - *deny1 |= ACE_READ_DATA; + masks->deny1 |= read_mask; if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) - *deny1 |= ACE_WRITE_DATA; + masks->deny1 |= write_mask; if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) - 
*deny1 |= ACE_EXECUTE; + masks->deny1 |= execute_mask; + masks->deny2 = 0; if (!(mode & S_IRGRP) && (mode & S_IROTH)) - *deny2 = ACE_READ_DATA; + masks->deny2 |= read_mask; if (!(mode & S_IWGRP) && (mode & S_IWOTH)) - *deny2 |= ACE_WRITE_DATA; + masks->deny2 |= write_mask; if (!(mode & S_IXGRP) && (mode & S_IXOTH)) - *deny2 |= ACE_EXECUTE; + masks->deny2 |= execute_mask; + masks->allow0 = 0; if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) - *allow0 |= ACE_READ_DATA; + masks->allow0 |= read_mask; if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) - *allow0 |= ACE_WRITE_DATA; + masks->allow0 |= write_mask; if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) - *allow0 |= ACE_EXECUTE; + masks->allow0 |= execute_mask; - *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; if (mode & S_IRUSR) - *owner |= ACE_READ_DATA; + masks->owner |= read_mask; if (mode & S_IWUSR) - *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA; + masks->owner |= write_mask; if (mode & S_IXUSR) - *owner |= ACE_EXECUTE; + masks->owner |= execute_mask; - *group = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IRGRP) - *group |= ACE_READ_DATA; + masks->group |= read_mask; if (mode & S_IWGRP) - *group |= ACE_WRITE_DATA|ACE_APPEND_DATA; + masks->group |= write_mask; if (mode & S_IXGRP) - *group |= ACE_EXECUTE; + masks->group |= execute_mask; - *everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IROTH) - *everyone |= ACE_READ_DATA; + masks->everyone |= read_mask; if (mode & S_IWOTH) - *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA; + masks->everyone |= write_mask; if (mode & 
S_IXOTH) - *everyone |= ACE_EXECUTE; + masks->everyone |= execute_mask; } /* @@ -1284,10 +1298,17 @@ ace_trivial_common(void *acep, int aclcnt, return (1); /* - * Delete permissions are never set by default + * Delete permission is never set by default */ - if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) + if (mask & ACE_DELETE) return (1); + + /* + * Child delete permission should be accompanied by write + */ + if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA)) + return (1); + /* * only allow owner@ to have * write_acl/write_owner/write_attributes/write_xattr/ @@ -1463,7 +1484,8 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) } static void -zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) +zfs_acl_chmod(boolean_t isdir, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) { void *acep = NULL; uint64_t who; @@ -1475,31 +1497,29 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) zfs_acl_node_t *newnode; size_t abstract_size = aclp->z_ops->ace_abstract_size(); void *zacep; - uint32_t owner, group, everyone; - uint32_t deny1, deny2, allow0; + trivial_acl_t masks; new_count = new_bytes = 0; - acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2, - &owner, &group, &everyone); + acl_trivial_access_masks((mode_t)mode, isdir, &masks); newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); zacep = newnode->z_acldata; - if (allow0) { - zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER); + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } - if (deny1) { - zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } - if (deny2) { - zfs_set_ace(aclp, zacep, deny2, DENY, -1, 
OWNING_GROUP); + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; @@ -1507,21 +1527,40 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { - uint16_t inherit_flags; - entry_type = (iflags & ACE_TYPE_FLAGS); - inherit_flags = (iflags & ALL_INHERIT); - - if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - (entry_type == OWNING_GROUP)) && - ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) { - continue; + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; } + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. 
+ */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + if ((type != ALLOW && type != DENY) || - (inherit_flags & ACE_INHERIT_ONLY_ACE)) { - if (inherit_flags) - aclp->z_hints |= ZFS_INHERIT_ACE; + (iflags & ACE_INHERIT_ONLY_ACE)) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: @@ -1531,23 +1570,15 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) break; } } else { - /* * Limit permissions to be no greater than - * group permissions + * group permissions. + * The "aclinherit" and "aclmode" properties + * affect policy for create and chmod(2), + * respectively. */ - if (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) { - if (!(mode & S_IRGRP)) - access_mask &= ~ACE_READ_DATA; - if (!(mode & S_IWGRP)) - access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - if (!(mode & S_IXGRP)) - access_mask &= ~ACE_EXECUTE; - access_mask &= - ~(ACE_WRITE_OWNER|ACE_WRITE_ACL| - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS); - } + if ((type == ALLOW) && trim) + access_mask &= masks.group; } zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); ace_size = aclp->z_ops->ace_size(acep); @@ -1555,11 +1586,11 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) new_count++; new_bytes += ace_size; } - zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER); + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); new_count += 3; new_bytes += abstract_size * 3; @@ -1571,32 +1602,27 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) list_insert_tail(&aclp->z_acl, newnode); } -void +int 
zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { + int error = 0; + mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); - *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); - (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(ZTOZSB(zp), mode, *aclp); + if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(S_ISDIR(ZTOI(zp)->i_mode), mode, B_TRUE, + (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); - ASSERT(*aclp); -} -/* - * strip off write_owner and write_acl - */ -static void -zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) -{ - uint32_t mask = aclp->z_ops->ace_mask_get(acep); - - if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && - (aclp->z_ops->ace_type_get(acep) == ALLOW)) { - mask &= ~RESTRICTED_CLEAR; - aclp->z_ops->ace_mask_set(acep, mask); - } + return (error); } /* @@ -1619,10 +1645,10 @@ zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags) * inherit inheritable ACEs from parent */ static zfs_acl_t * -zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp, +zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp, uint64_t mode, boolean_t *need_chmod) { - void *pacep; + void *pacep = NULL; void *acep; zfs_acl_node_t *aclnode; zfs_acl_t *aclp = NULL; @@ -1632,22 +1658,17 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp, size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; - boolean_t vdir = S_ISDIR(obj_mode); - boolean_t vreg = S_ISREG(obj_mode); - boolean_t passthrough, passthrough_x, noallow; - - passthrough_x = - zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X; - passthrough = passthrough_x || - zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH; - noallow = - zfsvfs->z_acl_inherit == 
ZFS_ACL_NOALLOW; + uint_t aclinherit; + boolean_t isdir = S_ISDIR(va_mode); + boolean_t isreg = S_ISREG(va_mode); *need_chmod = B_TRUE; - pacep = NULL; + aclp = zfs_acl_alloc(paclp->z_version); - if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || S_ISLNK(obj_mode)) + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || S_ISLNK(va_mode)) return (aclp); + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type))) { @@ -1657,31 +1678,42 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp, if (!zfs_acl_valid_ace_type(type, iflags)) continue; - if (noallow && type == ALLOW) - continue; - - ace_size = aclp->z_ops->ace_size(pacep); - - if (!zfs_ace_can_use(obj_mode, iflags)) + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(va_mode, iflags)) continue; /* * If owner@, group@, or everyone@ inheritable * then zfs_acl_chmod() isn't needed. */ - if (passthrough && + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && ((iflags & (ACE_OWNER|ACE_EVERYONE)) || - ((iflags & OWNING_GROUP) == - OWNING_GROUP)) && (vreg || (vdir && (iflags & - ACE_DIRECTORY_INHERIT_ACE)))) { + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) *need_chmod = B_FALSE; - } - if (!vdir && passthrough_x && - ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { access_mask &= ~ACE_EXECUTE; } + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); aclnode = zfs_acl_node_alloc(ace_size); 
list_insert_tail(&aclp->z_acl, aclnode); acep = aclnode->z_acldata; @@ -1703,18 +1735,21 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp, aclp->z_acl_bytes += aclnode->z_size; newflags = aclp->z_ops->ace_flags_get(acep); - if (vdir) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) { + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { newflags &= ~ALL_INHERIT; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); - zfs_restricted_update(zfsvfs, aclp, acep); continue; } - ASSERT(vdir); + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; /* * If only FILE_INHERIT is set then turn on @@ -1731,12 +1766,18 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp, newflags|ACE_INHERITED_ACE); } } + if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + aclp->z_acl_count != 0) { + *need_chmod = B_FALSE; + } + return (aclp); } /* * Create file system object initial permissions * including inheritable ACEs. + * Also, create FUIDs for owner and group. 
*/ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, @@ -1747,6 +1788,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, zfs_acl_t *paclp; gid_t gid = vap->va_gid; boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; boolean_t inherited = B_FALSE; bzero(acl_ids, sizeof (zfs_acl_ids_t)); @@ -1833,8 +1875,8 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); mutex_enter(&dzp->z_lock); - if (!(flag & IS_ROOT_NODE) && (S_ISDIR(ZTOI(dzp)->i_mode) && - (dzp->z_pflags & ZFS_INHERIT_ACE)) && + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, &paclp, B_FALSE)); @@ -1848,10 +1890,18 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, } mutex_exit(&dzp->z_lock); mutex_exit(&dzp->z_acl_lock); + if (need_chmod) { - acl_ids->z_aclp->z_hints |= S_ISDIR(vap->va_mode) ? - ZFS_ACL_AUTO_INHERIT : 0; - zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp); + if (S_ISDIR(vap->va_mode)) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); } } @@ -2158,19 +2208,17 @@ static int zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && - (!S_ISDEV(ZTOI(zp)->i_mode) || - (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { + (!Z_ISDEV(ZTOI(zp)->i_mode) || + (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { return (SET_ERROR(EROFS)); } /* - * Only check for READONLY on non-directories. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). 
*/ if ((v4_mode & WRITE_MASK_DATA) && - ((!S_ISDIR(ZTOI(zp)->i_mode) && - (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || - (S_ISDIR(ZTOI(zp)->i_mode) && - (zp->z_pflags & ZFS_IMMUTABLE)))) { + (zp->z_pflags & ZFS_IMMUTABLE)) { return (SET_ERROR(EPERM)); } @@ -2269,7 +2317,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, break; case OWNING_GROUP: who = gowner; - /*FALLTHROUGH*/ + fallthrough; case ACE_IDENTIFIER_GROUP: checkit = zfs_groupmember(zfsvfs, who, cr); break; @@ -2384,6 +2432,24 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, return (0); } + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. 
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + S_ISDIR(ZTOI(zp)->i_mode) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); } @@ -2540,14 +2606,14 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, &check_privs, skipaclchk, cr)) == 0) { if (is_attr) - iput(ZTOI(xzp)); + zrele(xzp); return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, needed_bits, needed_bits)); } if (error && !check_privs) { if (is_attr) - iput(ZTOI(xzp)); + zrele(xzp); return (error); } @@ -2609,14 +2675,14 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } if (is_attr) - iput(ZTOI(xzp)); + zrele(xzp); return (error); } /* * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into - * native ACL format and call zfs_zaccess() + * NFSv4-style ZFS ACL format and call zfs_zaccess() */ int zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) @@ -2635,47 +2701,32 @@ zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); } -static int -zfs_delete_final_check(znode_t *zp, znode_t *dzp, - mode_t available_perms, cred_t *cr) -{ - int error; - uid_t downer; - - downer = zfs_fuid_map_id(ZTOZSB(dzp), KUID_TO_SUID(ZTOI(dzp)->i_uid), - cr, ZFS_OWNER); - - error = secpolicy_vnode_access2(cr, ZTOI(dzp), - downer, available_perms, S_IWUSR|S_IXUSR); - - if (error == 0) - error = zfs_sticky_remove_access(dzp, zp, cr); - - return (error); -} +/* See zfs_zaccess_delete() */ +int zfs_write_implies_delete_child = 1; /* - * Determine whether Access should be granted/deny, without - * consulting least priv subsystem. + * Determine whether delete access should be granted. * - * The following chart is the recommended NFSv4 enforcement for - * ability to delete an object. 
+ * The following chart outlines how we handle delete permissions which is + * how recent versions of windows (Windows 2008) handles it. The efficiency + * comes from not having to check the parent ACL where the object itself grants + * delete: * * ------------------------------------------------------- - * | Parent Dir | Target Object Permissions | + * | Parent Dir | Target Object Permissions | * | permissions | | * ------------------------------------------------------- * | | ACL Allows | ACL Denies| Delete | * | | Delete | Delete | unspecified| * ------------------------------------------------------- - * | ACL Allows | Permit | Permit | Permit | - * | DELETE_CHILD | | + * | ACL Allows | Permit | Deny * | Permit | + * | DELETE_CHILD | | | | * ------------------------------------------------------- - * | ACL Denies | Permit | Deny | Deny | - * | DELETE_CHILD | | | | + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | * ------------------------------------------------------- * | ACL specifies | | | | - * | only allow | Permit | Permit | Permit | + * | only allow | Permit | Deny * | Permit | * | write and | | | | * | execute | | | | * ------------------------------------------------------- @@ -2685,91 +2736,172 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, * ------------------------------------------------------- * ^ * | - * No search privilege, can't even look up file? + * Re. execute permission on the directory: if that's missing, + * the vnode lookup of the target will fail before we get here. * + * Re [*] in the table above: NFSv4 would normally Permit delete for + * these two cells of the matrix. + * See acl.h for notes on which ACE_... flags should be checked for which + * operations. Specifically, the NFSv4 committee recommendation is in + * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs + * should take precedence ahead of ALLOW ACEs. + * + * This implementation always consults the target object's ACL first. 
+ * If a DENY ACE is present on the target object that specifies ACE_DELETE, + * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on + * the target object, access is allowed. If and only if no entries with + * ACE_DELETE are present in the object's ACL, check the container's ACL + * for entries with ACE_DELETE_CHILD. + * + * A summary of the logic implemented from the table above is as follows: + * + * First check for DENY ACEs that apply. + * If either target or container has a deny, EACCES. + * + * Delete access can then be summarized as follows: + * 1: The object to be deleted grants ACE_DELETE, or + * 2: The containing directory grants ACE_DELETE_CHILD. + * In a Windows system, that would be the end of the story. + * In this system, (2) has some complications... + * 2a: "sticky" bit on a directory adds restrictions, and + * 2b: existing ACEs from previous versions of ZFS may + * not carry ACE_DELETE_CHILD where they should, so we + * also allow delete when ACE_WRITE_DATA is granted. + * + * Note: 2b is technically a work-around for a prior bug, + * which hopefully can go away some day. For those who + * no longer need the work around, and for testing, this + * work-around is made conditional via the tunable: + * zfs_write_implies_delete_child */ int zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) { + uint32_t wanted_dirperms; uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; int dzp_error, zp_error; - mode_t available_perms; - boolean_t dzpcheck_privs = B_TRUE; - boolean_t zpcheck_privs = B_TRUE; - - /* - * We want specific DELETE permissions to - * take precedence over WRITE/EXECUTE. We don't - * want an ACL such as this to mess us up. - * user:joe:write_data:deny,user:joe:delete:allow - * - * However, deny permissions may ultimately be overridden - * by secpolicy_vnode_access(). 
- * - * We will ask for all of the necessary permissions and then - * look at the working modes from the directory and target object - * to determine what was found. - */ + boolean_t dzpcheck_privs; + boolean_t zpcheck_privs; if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) return (SET_ERROR(EPERM)); /* - * First row - * If the directory permissions allow the delete, we are done. - */ - if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - /* - * If target object has delete permission then we are done - */ - if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, - &zpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - ASSERT(dzp_error && zp_error); - - if (!dzpcheck_privs) - return (dzp_error); - if (!zpcheck_privs) - return (zp_error); - - /* - * Second row + * Case 1: + * If target object grants ACE_DELETE then we are done. This is + * indicated by a return value of 0. For this case we don't worry + * about the sticky bit because sticky only applies to the parent + * directory and this is the child access result. * - * If directory returns EACCES then delete_child was denied - * due to deny delete_child. In this case send the request through - * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() - * since that *could* allow the delete based on write/execute permission - * and we want delete permissions to override write/execute. + * If we encounter a DENY ACE here, we're also done (EACCES). + * Note that if we hit a DENY ACE here (on the target) it should + * take precedence over a DENY ACE on the container, so that when + * we have more complete auditing support we will be able to + * report an access failure against the specific target. + * (This is part of why we're checking the target first.) 
*/ - - if (dzp_error == EACCES) + zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr); + if (zp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!zpcheck_privs) + return (SET_ERROR(zp_error)); return (secpolicy_vnode_remove(cr)); - /* - * Third Row - * only need to see if we have write/execute on directory. - */ + } + if (zp_error == 0) + return (0); - dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + /* + * Case 2: + * If the containing directory grants ACE_DELETE_CHILD, + * or we're in backward compatibility mode and the + * containing directory has ACE_WRITE_DATA, allow. + * Case 2b is handled with wanted_dirperms. + */ + wanted_dirperms = ACE_DELETE_CHILD; + if (zfs_write_implies_delete_child) + wanted_dirperms |= ACE_WRITE_DATA; + dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); - - if (dzp_error != 0 && !dzpcheck_privs) - return (dzp_error); + if (dzp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!dzpcheck_privs) + return (SET_ERROR(dzp_error)); + return (secpolicy_vnode_remove(cr)); + } /* - * Fourth row + * Cases 2a, 2b (continued) + * + * Note: dzp_working_mode now contains any permissions + * that were NOT granted. Therefore, if any of the + * wanted_dirperms WERE granted, we will have: + * dzp_working_mode != wanted_dirperms + * We're really asking if ANY of those permissions + * were granted, and if so, grant delete access. */ + if (dzp_working_mode != wanted_dirperms) + dzp_error = 0; - available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : S_IWUSR; - available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : S_IXUSR; + /* + * dzp_error is 0 if the container granted us permissions to "modify". + * If we do not have permission via one or more ACEs, our current + * privileges may still permit us to modify the container. + * + * dzpcheck_privs is false when i.e. the FS is read-only. 
+ * Otherwise, do privilege checks for the container. + */ + if (dzp_error != 0 && dzpcheck_privs) { + uid_t owner; - return (zfs_delete_final_check(zp, dzp, available_perms, cr)); + /* + * The secpolicy call needs the requested access and + * the current access mode of the container, but it + * only knows about Unix-style modes (VEXEC, VWRITE), + * so this must condense the fine-grained ACE bits into + * Unix modes. + * + * The VEXEC flag is easy, because we know that has + * always been checked before we get here (during the + * lookup of the target vnode). The container has not + * granted us permissions to "modify", so we do not set + * the VWRITE flag in the current access mode. + */ + owner = zfs_fuid_map_id(ZTOZSB(dzp), + KUID_TO_SUID(ZTOI(dzp)->i_uid), cr, ZFS_OWNER); + dzp_error = secpolicy_vnode_access2(cr, ZTOI(dzp), + owner, S_IXUSR, S_IWUSR|S_IXUSR); + } + if (dzp_error != 0) { + /* + * Note: We may have dzp_error = -1 here (from + * zfs_zaccess_common). Don't return that. + */ + return (SET_ERROR(EACCES)); + } + + /* + * At this point, we know that the directory permissions allow + * us to modify, but we still need to check for the additional + * restrictions that apply when the "sticky bit" is set. + * + * Yes, zfs_sticky_remove_access() also checks this bit, but + * checking it here and skipping the call below is nice when + * you're watching all of this with dtrace. + */ + if ((dzp->z_mode & S_ISVTX) == 0) + return (0); + + /* + * zfs_sticky_remove_access will succeed if: + * 1. The sticky bit is absent. + * 2. We pass the sticky bit restrictions. + * 3. We have privileges that always allow file removal. 
+ */ + return (zfs_sticky_remove_access(dzp, zp, cr)); } int diff --git a/module/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c similarity index 90% rename from module/zfs/zfs_ctldir.c rename to module/os/linux/zfs/zfs_ctldir.c index c8071a7c21..c58d851d77 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -30,6 +30,8 @@ * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2018 George Melikov. All Rights Reserved. + * Copyright (c) 2019 Datto, Inc. All rights reserved. + * Copyright (c) 2020 The MathWorks, Inc. All rights reserved. */ /* @@ -116,6 +118,7 @@ typedef struct { spa_t *se_spa; /* pool spa */ uint64_t se_objsetid; /* snapshot objset id */ struct dentry *se_root_dentry; /* snapshot root dentry */ + krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */ taskqid_t se_taskqid; /* scheduled unmount taskqid */ avl_node_t se_node_name; /* zfs_snapshots_by_name link */ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ @@ -129,19 +132,20 @@ static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); * the snapshot name and provided mount point. No reference is taken. 
*/ static zfs_snapentry_t * -zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa, +zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa, uint64_t objsetid, struct dentry *root_dentry) { zfs_snapentry_t *se; se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); - se->se_name = strdup(full_name); - se->se_path = strdup(full_path); + se->se_name = kmem_strdup(full_name); + se->se_path = kmem_strdup(full_path); se->se_spa = spa; se->se_objsetid = objsetid; se->se_root_dentry = root_dentry; se->se_taskqid = TASKQID_INVALID; + rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL); zfs_refcount_create(&se->se_refcount); @@ -156,8 +160,9 @@ static void zfsctl_snapshot_free(zfs_snapentry_t *se) { zfs_refcount_destroy(&se->se_refcount); - strfree(se->se_name); - strfree(se->se_path); + kmem_strfree(se->se_name); + kmem_strfree(se->se_path); + rw_destroy(se->se_taskqid_lock); kmem_free(se, sizeof (zfs_snapentry_t)); } @@ -191,7 +196,7 @@ static void zfsctl_snapshot_add(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); - zfs_refcount_add(&se->se_refcount, NULL); + zfsctl_snapshot_hold(se); avl_add(&zfs_snapshots_by_name, se); avl_add(&zfs_snapshots_by_objsetid, se); } @@ -259,16 +264,16 @@ snapentry_compare_by_objsetid(const void *a, const void *b) * NULL will be returned. 
*/ static zfs_snapentry_t * -zfsctl_snapshot_find_by_name(char *snapname) +zfsctl_snapshot_find_by_name(const char *snapname) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); - search.se_name = snapname; + search.se_name = (char *)snapname; se = avl_find(&zfs_snapshots_by_name, &search, NULL); if (se) - zfs_refcount_add(&se->se_refcount, NULL); + zfsctl_snapshot_hold(se); return (se); } @@ -289,7 +294,7 @@ zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) search.se_objsetid = objsetid; se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); if (se) - zfs_refcount_add(&se->se_refcount, NULL); + zfsctl_snapshot_hold(se); return (se); } @@ -299,7 +304,7 @@ zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) * removed, renamed, and added back to the new correct location in the tree. */ static int -zfsctl_snapshot_rename(char *old_snapname, char *new_snapname) +zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname) { zfs_snapentry_t *se; @@ -310,8 +315,8 @@ zfsctl_snapshot_rename(char *old_snapname, char *new_snapname) return (SET_ERROR(ENOENT)); zfsctl_snapshot_remove(se); - strfree(se->se_name); - se->se_name = strdup(new_snapname); + kmem_strfree(se->se_name); + se->se_name = kmem_strdup(new_snapname); zfsctl_snapshot_add(se); zfsctl_snapshot_rele(se); @@ -333,7 +338,9 @@ snapentry_expire(void *data) return; } + rw_enter(&se->se_taskqid_lock, RW_WRITER); se->se_taskqid = TASKQID_INVALID; + rw_exit(&se->se_taskqid_lock); (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); zfsctl_snapshot_rele(se); @@ -357,8 +364,18 @@ snapentry_expire(void *data) static void zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) { - if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) { - se->se_taskqid = TASKQID_INVALID; + int err = 0; + rw_enter(&se->se_taskqid_lock, RW_WRITER); + err = taskq_cancel_id(system_delay_taskq, se->se_taskqid); + /* + * if we get ENOENT, the taskq couldn't be found to be + 
* canceled, so we can just mark it as invalid because + * it's already gone. If we got EBUSY, then we already + * blocked until it was gone _anyway_, so we don't care. + */ + se->se_taskqid = TASKQID_INVALID; + rw_exit(&se->se_taskqid_lock); + if (err == 0) { zfsctl_snapshot_rele(se); } } @@ -369,14 +386,16 @@ zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) { - ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); if (delay <= 0) return; zfsctl_snapshot_hold(se); + rw_enter(&se->se_taskqid_lock, RW_WRITER); + ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); + rw_exit(&se->se_taskqid_lock); } /* @@ -408,7 +427,7 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) * and zero when unmounted. */ static boolean_t -zfsctl_snapshot_ismounted(char *snapname) +zfsctl_snapshot_ismounted(const char *snapname) { zfs_snapentry_t *se; boolean_t ismounted = B_FALSE; @@ -462,10 +481,13 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_id = id; - zp->z_unlinked = 0; - zp->z_atime_dirty = 0; - zp->z_zn_prefetch = 0; - zp->z_moved = 0; + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_zn_prefetch = B_FALSE; + zp->z_is_sa = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_TRUE; + zp->z_is_stale = B_FALSE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; @@ -474,10 +496,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; - zp->z_is_mapped = B_FALSE; - zp->z_is_ctldir = B_TRUE; - zp->z_is_sa = B_FALSE; - zp->z_is_stale = B_FALSE; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); @@ -589,13 +607,14 @@ struct inode * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); - 
igrab(ZTOZSB(zp)->z_ctldir); + /* Must have an existing ref, so igrab() cannot return NULL */ + VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL); return (ZTOZSB(zp)->z_ctldir); } /* * Generate a long fid to indicate a snapdir. We encode whether snapdir is - * already monunted in gen field. We do this because nfsd lookup will not + * already mounted in gen field. We do this because nfsd lookup will not * trigger automount. Next time the nfsd does fh_to_dentry, we will notice * this and do automount and return ESTALE to force nfsd revalidate and follow * mount. @@ -703,37 +722,6 @@ zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, return (0); } -/* - * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" - */ -static int -zfsctl_snapshot_path(struct path *path, int len, char *full_path) -{ - char *path_buffer, *path_ptr; - int path_len, error = 0; - - path_buffer = kmem_alloc(len, KM_SLEEP); - - path_ptr = d_path(path, path_buffer, len); - if (IS_ERR(path_ptr)) { - error = -PTR_ERR(path_ptr); - goto out; - } - - path_len = path_buffer + len - 1 - path_ptr; - if (path_len > len) { - error = SET_ERROR(EFAULT); - goto out; - } - - memcpy(full_path, path_ptr, path_len); - full_path[path_len] = '\0'; -out: - kmem_free(path_buffer, len); - - return (error); -} - /* * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" */ @@ -780,7 +768,7 @@ out: * Special case the handling of "..". */ int -zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, +zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); @@ -813,7 +801,7 @@ zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, * snapshot if it exist, creating the pseudo filesystem inode as necessary. 
*/ int -zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, +zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); @@ -844,8 +832,8 @@ zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. */ int -zfsctl_snapdir_rename(struct inode *sdip, char *snm, - struct inode *tdip, char *tnm, cred_t *cr, int flags) +zfsctl_snapdir_rename(struct inode *sdip, const char *snm, + struct inode *tdip, const char *tnm, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(sdip); char *to, *from, *real, *fsname; @@ -922,7 +910,8 @@ out: * the removal of the snapshot with the given name. */ int -zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) +zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr, + int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); char *snapname, *real; @@ -970,7 +959,7 @@ out: * the creation of a new snapshot with the given name. */ int -zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, +zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap, struct inode **ipp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); @@ -1007,6 +996,22 @@ out: return (error); } +/* + * Flush everything out of the kernel's export table and such. + * This is needed as once the snapshot is used over NFS, its + * entries in svc_export and svc_expkey caches hold reference + * to the snapshot mount point. There is no known way of flushing + * only the entries related to the snapshot. + */ +static void +exportfs_flush(void) +{ + char *argv[] = { "/usr/sbin/exportfs", "-f", NULL }; + char *envp[] = { NULL }; + + (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); +} + /* * Attempt to unmount a snapshot by making a call to user space. 
* There is no assurance that this can or will succeed, is just a @@ -1014,7 +1019,7 @@ out: * it's in use, the unmount will fail harmlessly. */ int -zfsctl_snapshot_unmount(char *snapname, int flags) +zfsctl_snapshot_unmount(const char *snapname, int flags) { char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, NULL }; @@ -1029,6 +1034,8 @@ zfsctl_snapshot_unmount(char *snapname, int flags) } rw_exit(&zfs_snapshot_lock); + exportfs_flush(); + if (flags & MNT_FORCE) argv[4] = "-fn"; argv[5] = se->se_path; @@ -1077,9 +1084,14 @@ zfsctl_snapshot_mount(struct path *path, int flags) if (error) goto error; - error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path); - if (error) - goto error; + /* + * Construct a mount point path from sb of the ctldir inode and dirent + * name, instead of from d_path(), so that chroot'd process doesn't fail + * on mount.zfs(8). + */ + snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", + dname(dentry)); /* * Multiple concurrent automounts of a snapshot are never allowed. 
@@ -1108,8 +1120,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { - cmn_err(CE_WARN, "Unable to automount %s/%s: %d", - full_path, full_name, error); + zfs_dbgmsg("Unable to automount %s error=%d", + full_path, error); error = SET_ERROR(EISDIR); } else { /* @@ -1131,7 +1143,7 @@ zfsctl_snapshot_mount(struct path *path, int flags) */ spath = *path; path_get(&spath); - if (zpl_follow_down_one(&spath)) { + if (follow_down_one(&spath)) { snap_zfsvfs = ITOZSB(spath.dentry->d_inode); snap_zfsvfs->z_parent = zfsvfs; dentry = spath.dentry; @@ -1210,7 +1222,7 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); - struct inode *ip; + znode_t *zp; znode_t *dzp; int error; @@ -1222,8 +1234,8 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { - error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL); - iput(ZTOI(dzp)); + error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL); + zrele(dzp); } ZFS_EXIT(zfsvfs); diff --git a/module/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c similarity index 97% rename from module/zfs/zfs_debug.c rename to module/os/linux/zfs/zfs_debug.c index 538533d27d..98c9923d59 100644 --- a/module/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -24,10 +24,11 @@ */ #include +#include typedef struct zfs_dbgmsg { procfs_list_node_t zdm_node; - time_t zdm_timestamp; + uint64_t zdm_timestamp; int zdm_size; char zdm_msg[1]; /* variable length allocation */ } zfs_dbgmsg_t; @@ -93,6 +94,7 @@ void zfs_dbgmsg_init(void) { procfs_list_install("zfs", + NULL, "dbgmsg", 0600, &zfs_dbgmsgs, @@ -125,7 +127,8 @@ __set_error(const char *file, const char *func, int line, int err) * $ echo 512 >/sys/module/zfs/parameters/zfs_flags */ if 
(zfs_flags & ZFS_DEBUG_SET_ERROR) - __dprintf(B_FALSE, file, func, line, "error %lu", err); + __dprintf(B_FALSE, file, func, line, "error %lu", + (ulong_t)err); } void diff --git a/module/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c similarity index 93% rename from module/zfs/zfs_dir.c rename to module/os/linux/zfs/zfs_dir.c index 63ac97754d..82b32d1cc3 100644 --- a/module/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -53,14 +52,17 @@ #include #include #include +#include +#include /* - * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups + * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups * of names after deciding which is the appropriate lookup interface. */ static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, - boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp, + uint64_t *zoid) { boolean_t conflict = B_FALSE; int error; @@ -138,8 +140,8 @@ zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, * but return znode pointers to a single match. */ int -zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag, int *direntflags, pathname_t *realpnp) +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, + znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(dzp); zfs_dirlock_t *dl; @@ -232,7 +234,7 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, /* * Wait until there are no locks on this name. * - * Don't grab the the lock if it is already held. However, cannot + * Don't grab the lock if it is already held. However, cannot * have both ZSHARED and ZHAVELOCK together. 
*/ ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); @@ -378,17 +380,18 @@ zfs_dirent_unlock(zfs_dirlock_t *dl) * special pseudo-directory. */ int -zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags, +zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, int *deflg, pathname_t *rpnp) { zfs_dirlock_t *dl; znode_t *zp; + struct inode *ip; int error = 0; uint64_t parent; if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - *ipp = ZTOI(dzp); - igrab(*ipp); + *zpp = dzp; + zhold(*zpp); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -402,16 +405,18 @@ zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags, if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", ipp, 0, kcred, NULL, NULL); + "snapshot", &ip, 0, kcred, NULL, NULL); + *zpp = ITOZ(ip); return (error); } rw_enter(&dzp->z_parent_lock, RW_READER); error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) - *ipp = ZTOI(zp); + *zpp = zp; rw_exit(&dzp->z_parent_lock); } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { - *ipp = zfsctl_root(dzp); + ip = zfsctl_root(dzp); + *zpp = ITOZ(ip); } else { int zf; @@ -421,7 +426,7 @@ zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags, error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); if (error == 0) { - *ipp = ZTOI(zp); + *zpp = zp; zfs_dirent_unlock(dl); dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ } @@ -514,13 +519,13 @@ zfs_unlinked_drain_task(void *arg) zp->z_unlinked = B_TRUE; /* - * iput() is Linux's equivalent to illumos' VN_RELE(). It will - * decrement the inode's ref count and may cause the inode to be - * synchronously freed. We interrupt freeing of this inode, by - * checking the return value of dmu_objset_zfs_unmounting() in - * dmu_free_long_range(), when an unmount is requested. 
+ * zrele() decrements the znode's ref count and may cause + * it to be synchronously freed. We interrupt freeing + * of this znode by checking the return value of + * dmu_objset_zfs_unmounting() in dmu_free_long_range() + * when an unmount is requested. */ - iput(ZTOI(zp)); + zrele(zp); ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); } zap_cursor_fini(&zc); @@ -616,7 +621,7 @@ zfs_purgedir(znode_t *dzp) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_iput_async(ZTOI(xzp)); + zfs_zrele_async(xzp); skipped += 1; continue; } @@ -629,7 +634,7 @@ zfs_purgedir(znode_t *dzp) skipped += 1; dmu_tx_commit(tx); - zfs_iput_async(ZTOI(xzp)); + zfs_zrele_async(xzp); } zap_cursor_fini(&zc); if (error != ENOENT) @@ -737,9 +742,24 @@ zfs_rmnode(znode_t *zp) zfs_unlinked_add(xzp, tx); } - /* Remove this znode from the unlinked set */ - VERIFY3U(0, ==, - zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + /* + * Remove this znode from the unlinked set. If a rollback has + * occurred while a file is open and unlinked, then when the file + * is closed post rollback it will not exist in the rolled back + * version of the unlinked object.
+ */ + error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + zp->z_id, tx); + VERIFY(error == 0 || error == ENOENT); + + uint64_t count; + if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { + cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); + } + + mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); @@ -748,7 +768,7 @@ zfs_rmnode(znode_t *zp) dmu_tx_commit(tx); out: if (xzp) - zfs_iput_async(ZTOI(xzp)); + zfs_zrele_async(xzp); } static uint64_t @@ -1032,7 +1052,7 @@ zfs_dirempty(znode_t *dzp) } int -zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); znode_t *xzp; @@ -1040,11 +1060,11 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; -#ifdef DEBUG +#ifdef ZFS_DEBUG uint64_t parent; #endif - *xipp = NULL; + *xzpp = NULL; if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) return (error); @@ -1076,7 +1096,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); -#ifdef DEBUG +#ifdef ZFS_DEBUG error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); ASSERT(error == 0 && parent == zp->z_id); @@ -1086,13 +1106,13 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) sizeof (xzp->z_id), tx)); if (!zp->z_unlinked) - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, acl_ids.z_fuidp, vap); + zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, + acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); - *xipp = ZTOI(xzp); + *xzpp = xzp; return (0); } @@ -1111,7 +1131,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) * error number on 
failure */ int -zfs_get_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr, int flags) +zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ZTOZSB(zp); znode_t *xzp; @@ -1124,7 +1144,7 @@ top: return (error); if (xzp != NULL) { - *xipp = ZTOI(xzp); + *xzpp = xzp; zfs_dirent_unlock(dl); return (0); } @@ -1154,7 +1174,7 @@ top: zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); va.va_dentry = NULL; - error = zfs_make_xattrdir(zp, &va, xipp, cr); + error = zfs_make_xattrdir(zp, &va, xzpp, cr); zfs_dirent_unlock(dl); if (error == ERESTART) { diff --git a/module/os/linux/zfs/zfs_file_os.c b/module/os/linux/zfs/zfs_file_os.c new file mode 100644 index 0000000000..e12f7c3ced --- /dev/null +++ b/module/os/linux/zfs/zfs_file_os.c @@ -0,0 +1,428 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_FDTABLE_HEADER +#include +#endif + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + * Returns 0 on success underlying error on failure. 
+ */ +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + struct file *filp; + int saved_umask; + + if (!(flags & O_CREAT) && (flags & O_WRONLY)) + flags |= O_EXCL; + + if (flags & O_CREAT) + saved_umask = xchg(&current->fs->umask, 0); + + filp = filp_open(path, flags, mode); + + if (flags & O_CREAT) + (void) xchg(&current->fs->umask, saved_umask); + + if (IS_ERR(filp)) + return (-PTR_ERR(filp)); + + *fpp = filp; + return (0); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + filp_close(fp, 0); +} + +static ssize_t +zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *off) +{ +#if defined(HAVE_KERNEL_WRITE_PPOS) + return (kernel_write(fp, buf, count, off)); +#else + mm_segment_t saved_fs; + ssize_t rc; + + saved_fs = get_fs(); + set_fs(KERNEL_DS); + + rc = vfs_write(fp, (__force const char __user __user *)buf, count, off); + + set_fs(saved_fs); + + return (rc); +#endif +} + +/* + * Stateful write - use os internal file pointer to determine where to + * write and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_pos; + ssize_t rc; + + rc = zfs_file_write_impl(fp, buf, count, &off); + if (rc < 0) + return (-rc); + + fp->f_pos = off; + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless write - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * off - file offset to write to (only valid for seekable types) + * resid - pointer to count of unwritten bytes + * + * Returns 0 on success errno on failure.
+ */ +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + ssize_t rc; + + rc = zfs_file_write_impl(fp, buf, count, &off); + if (rc < 0) + return (-rc); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +static ssize_t +zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off) +{ +#if defined(HAVE_KERNEL_READ_PPOS) + return (kernel_read(fp, buf, count, off)); +#else + mm_segment_t saved_fs; + ssize_t rc; + + saved_fs = get_fs(); + set_fs(KERNEL_DS); + + rc = vfs_read(fp, (void __user *)buf, count, off); + set_fs(saved_fs); + + return (rc); +#endif +} + +/* + * Stateful read - use os internal file pointer to determine where to + * read and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to read into + * count - # of bytes to read + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_pos; + ssize_t rc; + + rc = zfs_file_read_impl(fp, buf, count, &off); + if (rc < 0) + return (-rc); + + fp->f_pos = off; + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless read - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to read into + * count - # of bytes to read + * off - file offset to read from (only valid for seekable types) + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success errno on failure.
+ */ +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + ssize_t rc; + + rc = zfs_file_read_impl(fp, buf, count, &off); + if (rc < 0) + return (-rc); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * lseek - set / get file pointer + * + * fp - pointer to file (pipe, socket, etc) to read from + * offp - value to seek to, returns current value plus passed offset + * whence - see man pages for standard lseek whence values + * + * Returns 0 on success errno on failure (ESPIPE for non seekable types) + */ +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + loff_t rc; + + if (*offp < 0 || *offp > MAXOFFSET_T) + return (EINVAL); + + rc = vfs_llseek(fp, *offp, whence); + if (rc < 0) + return (-rc); + + *offp = rc; + + return (0); +} + +/* + * Get file attributes + * + * filp - file pointer + * zfattr - pointer to file attr structure + * + * Currently only used for fetching size and file mode. + * + * Returns 0 on success or error code of underlying getattr call on failure. + */ +int +zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr) +{ + struct kstat stat; + int rc; + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&filp->f_path, &stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&filp->f_path, &stat); +#elif defined(HAVE_3ARGS_VFS_GETATTR) + rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, &stat); +#else +#error "No available vfs_getattr()" +#endif + if (rc) + return (-rc); + + zfattr->zfa_size = stat.size; + zfattr->zfa_mode = stat.mode; + + return (0); +} + +/* + * Sync file to disk + * + * filp - file pointer + * flags - O_SYNC and or O_DSYNC + * + * Returns 0 on success or error code of underlying sync call on failure. 
+ */ +int +zfs_file_fsync(zfs_file_t *filp, int flags) +{ + int datasync = 0; + int error; + int fstrans; + + if (flags & O_DSYNC) + datasync = 1; + + /* + * May enter XFS which generates a warning when PF_FSTRANS is set. + * To avoid this the flag is cleared over vfs_fsync() and then reset. + */ + fstrans = __spl_pf_fstrans_check(); + if (fstrans) + current->flags &= ~(__SPL_PF_FSTRANS); + + error = -vfs_fsync(filp, datasync); + + if (fstrans) + current->flags |= __SPL_PF_FSTRANS; + + return (error); +} + +/* + * fallocate - allocate or free space on disk + * + * fp - file pointer + * mode - mode flags (non-standard options for hole punching etc) + * offset - offset to start allocating or freeing from + * len - length to free / allocate + * + * OPTIONAL + */ +int +zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) +{ + /* + * May enter XFS which generates a warning when PF_FSTRANS is set. + * To avoid this the flag is cleared over the fallocate() callback + * and then reset. + */ + int fstrans = __spl_pf_fstrans_check(); + if (fstrans) + current->flags &= ~(__SPL_PF_FSTRANS); + + /* + * When supported by the underlying file system preferentially + * use the fallocate() callback to preallocate the space. + */ + int error = EOPNOTSUPP; + if (fp->f_op->fallocate) + error = fp->f_op->fallocate(fp, mode, offset, len); + + if (fstrans) + current->flags |= __SPL_PF_FSTRANS; + + return (error); +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (fp->f_pos); +} + +/* + * Request file pointer private data + * + * fp - pointer to file + * + * Returns pointer to file private data. + */ +void * +zfs_file_private(zfs_file_t *fp) +{ + return (fp->private_data); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success.
+ * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (EOPNOTSUPP); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * + * Returns pointer to file struct or NULL + */ +zfs_file_t * +zfs_file_get(int fd) +{ + return (fget(fd)); +} + +/* + * Drop reference to file pointer + * + * fp - input file struct pointer + */ +void +zfs_file_put(zfs_file_t *fp) +{ + fput(fp); +} diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c new file mode 100644 index 0000000000..fee3fe540b --- /dev/null +++ b/module/os/linux/zfs/zfs_ioctl_os.c @@ -0,0 +1,280 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Portions Copyright 2012 Pawel Jakub Dawidek + * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright 2017 RackTop Systems. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019 Datto Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +boolean_t +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_sb != NULL); +} + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + if (*zfvp == NULL || (*zfvp)->z_sb == NULL || + !atomic_inc_not_zero(&((*zfvp)->z_sb->s_active))) { + return (SET_ERROR(ESRCH)); + } + return (0); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + deactivate_super(zfsvfs->z_sb); +} + +void +zfsdev_private_set_state(void *priv, zfsdev_state_t *zs) +{ + struct file *filp = priv; + + filp->private_data = zs; +} + +zfsdev_state_t * +zfsdev_private_get_state(void *priv) +{ + struct file *filp = priv; + + return (filp->private_data); +} + +static int +zfsdev_open(struct inode *ino, struct file *filp) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfsdev_state_init(filp); + mutex_exit(&zfsdev_state_lock); + + return (-error); +} + +static int +zfsdev_release(struct inode *ino, struct file *filp) +{ + zfsdev_state_destroy(filp); + + return (0); +} + +static long +zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) +{ + uint_t vecnum; + zfs_cmd_t *zc; + int error, rc; + + vecnum = cmd - ZFS_IOC_FIRST; + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + + if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) { + error = -SET_ERROR(EFAULT); + goto out; + } + error = 
-zfsdev_ioctl_common(vecnum, zc, 0); + rc = ddi_copyout(zc, (void *)(uintptr_t)arg, sizeof (zfs_cmd_t), 0); + if (error == 0 && rc != 0) + error = -SET_ERROR(EFAULT); +out: + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); + +} + +uint64_t +zfs_max_nvlist_src_size_os(void) +{ + if (zfs_max_nvlist_src_size != 0) + return (zfs_max_nvlist_src_size); + + return (MIN(ptob(zfs_totalram_pages) / 4, 128 * 1024 * 1024)); +} + +/* Update the VFS's cache of mountpoint properties */ +void +zfs_ioctl_update_mount_cache(const char *dsname) +{ +} + +void +zfs_ioctl_init_os(void) +{ +} + +#ifdef CONFIG_COMPAT +static long +zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) +{ + return (zfsdev_ioctl(filp, cmd, arg)); +} +#else +#define zfsdev_compat_ioctl NULL +#endif + +static const struct file_operations zfsdev_fops = { + .open = zfsdev_open, + .release = zfsdev_release, + .unlocked_ioctl = zfsdev_ioctl, + .compat_ioctl = zfsdev_compat_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice zfs_misc = { + .minor = ZFS_DEVICE_MINOR, + .name = ZFS_DRIVER, + .fops = &zfsdev_fops, +}; + +MODULE_ALIAS_MISCDEV(ZFS_DEVICE_MINOR); +MODULE_ALIAS("devname:zfs"); + +int +zfsdev_attach(void) +{ + int error; + + error = misc_register(&zfs_misc); + if (error == -EBUSY) { + /* + * Fallback to dynamic minor allocation in the event of a + * collision with a reserved minor in linux/miscdevice.h. + * In this case the kernel modules must be manually loaded. 
+ */ + printk(KERN_INFO "ZFS: misc_register() with static minor %d " + "failed %d, retrying with MISC_DYNAMIC_MINOR\n", + ZFS_DEVICE_MINOR, error); + + zfs_misc.minor = MISC_DYNAMIC_MINOR; + error = misc_register(&zfs_misc); + } + + if (error) + printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); + + return (error); +} + +void +zfsdev_detach(void) +{ + misc_deregister(&zfs_misc); +} + +#ifdef ZFS_DEBUG +#define ZFS_DEBUG_STR " (DEBUG mode)" +#else +#define ZFS_DEBUG_STR "" +#endif + +static int __init +openzfs_init(void) +{ + int error; + + if ((error = zfs_kmod_init()) != 0) { + printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s" + ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, + ZFS_DEBUG_STR, error); + + return (-error); + } + + zfs_sysfs_init(); + + printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, " + "ZFS pool version %s, ZFS filesystem version %s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, + SPA_VERSION_STRING, ZPL_VERSION_STRING); +#ifndef CONFIG_FS_POSIX_ACL + printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n"); +#endif /* CONFIG_FS_POSIX_ACL */ + + return (0); +} + +static void __exit +openzfs_fini(void) +{ + zfs_sysfs_fini(); + zfs_kmod_fini(); + + printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); +} + +#if defined(_KERNEL) +module_init(openzfs_init); +module_exit(openzfs_fini); +#endif + +ZFS_MODULE_DESCRIPTION("ZFS"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c new file mode 100644 index 0000000000..7897e0f9ed --- /dev/null +++ b/module/os/linux/zfs/zfs_racct.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021 iXsystems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +void +zfs_racct_read(uint64_t size, uint64_t iops) +{ +} + +void +zfs_racct_write(uint64_t size, uint64_t iops) +{ +} diff --git a/module/zfs/zfs_sysfs.c b/module/os/linux/zfs/zfs_sysfs.c similarity index 98% rename from module/zfs/zfs_sysfs.c rename to module/os/linux/zfs/zfs_sysfs.c index 30b5edb01e..fb7c689873 100644 --- a/module/zfs/zfs_sysfs.c +++ b/module/os/linux/zfs/zfs_sysfs.c @@ -144,6 +144,10 @@ zfs_kobj_release(struct kobject *kobj) zkobj->zko_attr_count = 0; } +#ifndef sysfs_attr_init +#define sysfs_attr_init(attr) do {} while (0) +#endif + static void zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) { @@ -154,6 +158,7 @@ zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) zkobj->zko_attr_list[attr_num].name = attr_name; zkobj->zko_attr_list[attr_num].mode = 0444; zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num]; + sysfs_attr_init(&zkobj->zko_attr_list[attr_num]); } static int @@ -259,6 +264,7 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, char *buf, size_t buflen) { const char *show_str; + char number[32]; /* For dataset properties list the dataset types that apply */ if (strcmp(attr_name, "datasets") == 0 && @@ -286,8 +292,6 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, } else if (strcmp(attr_name, "values") == 0) { show_str = property->pd_values ? property->pd_values : ""; } else if (strcmp(attr_name, "default") == 0) { - char number[32]; - switch (property->pd_proptype) { case PROP_TYPE_NUMBER: (void) snprintf(number, sizeof (number), "%llu", @@ -349,13 +353,14 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf) * This list is intended for kernel features that don't have a pool feature * association or that extend existing user kernel interfaces. 
* - * A user processes can easily check if the running zfs kernel module + * A user process can easily check if the running zfs kernel module * supports the new feature. */ static const char *zfs_kernel_features[] = { /* --> Add new kernel features here */ "com.delphix:vdev_initialize", "org.zfsonlinux:vdev_trim", + "org.openzfs:l2arc_persistent", }; #define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features) diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c new file mode 100644 index 0000000000..a3d5d5f83b --- /dev/null +++ b/module/os/linux/zfs/zfs_uio.c @@ -0,0 +1,330 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ +/* + * Copyright (c) 2015 by Chunwei Chen. 
All rights reserved. + */ + +#ifdef _KERNEL + +#include +#include +#include +#include +#include +#include + +/* + * Move "n" bytes at byte address "p"; "rw" indicates the direction + * of the move, and the I/O parameters are provided in "uio", which is + * updated to reflect the data which was moved. Returns 0 on success or + * a non-zero errno on failure. + */ +static int +zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +{ + const struct iovec *iov = uio->uio_iov; + size_t skip = uio->uio_skip; + ulong_t cnt; + + while (n && uio->uio_resid) { + cnt = MIN(iov->iov_len - skip, n); + switch (uio->uio_segflg) { + case UIO_USERSPACE: + /* + * p = kernel data pointer + * iov->iov_base = user data pointer + */ + if (rw == UIO_READ) { + if (copy_to_user(iov->iov_base+skip, p, cnt)) + return (EFAULT); + } else { + unsigned long b_left = 0; + if (uio->uio_fault_disable) { + if (!zfs_access_ok(VERIFY_READ, + (iov->iov_base + skip), cnt)) { + return (EFAULT); + } + pagefault_disable(); + b_left = + __copy_from_user_inatomic(p, + (iov->iov_base + skip), cnt); + pagefault_enable(); + } else { + b_left = + copy_from_user(p, + (iov->iov_base + skip), cnt); + } + if (b_left > 0) { + unsigned long c_bytes = + cnt - b_left; + uio->uio_skip += c_bytes; + ASSERT3U(uio->uio_skip, <, + iov->iov_len); + uio->uio_resid -= c_bytes; + uio->uio_loffset += c_bytes; + return (EFAULT); + } + } + break; + case UIO_SYSSPACE: + if (rw == UIO_READ) + bcopy(p, iov->iov_base + skip, cnt); + else + bcopy(iov->iov_base + skip, p, cnt); + break; + default: + ASSERT(0); + } + skip += cnt; + if (skip == iov->iov_len) { + skip = 0; + uio->uio_iov = (++iov); + uio->uio_iovcnt--; + } + uio->uio_skip = skip; + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + p = (caddr_t)p + cnt; + n -= cnt; + } + return (0); +} + +static int +zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +{ + const struct bio_vec *bv = uio->uio_bvec; + size_t skip = uio->uio_skip; + ulong_t cnt; +
+ while (n && uio->uio_resid) { + void *paddr; + cnt = MIN(bv->bv_len - skip, n); + + paddr = zfs_kmap_atomic(bv->bv_page); + if (rw == UIO_READ) + bcopy(p, paddr + bv->bv_offset + skip, cnt); + else + bcopy(paddr + bv->bv_offset + skip, p, cnt); + zfs_kunmap_atomic(paddr); + + skip += cnt; + if (skip == bv->bv_len) { + skip = 0; + uio->uio_bvec = (++bv); + uio->uio_iovcnt--; + } + uio->uio_skip = skip; + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + p = (caddr_t)p + cnt; + n -= cnt; + } + return (0); +} + +#if defined(HAVE_VFS_IOV_ITER) +static int +zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, + boolean_t revert) +{ + size_t cnt = MIN(n, uio->uio_resid); + + if (uio->uio_skip) + iov_iter_advance(uio->uio_iter, uio->uio_skip); + + if (rw == UIO_READ) + cnt = copy_to_iter(p, cnt, uio->uio_iter); + else + cnt = copy_from_iter(p, cnt, uio->uio_iter); + + /* + * When operating on a full pipe no bytes are processed. + * In which case return EFAULT which is converted to EAGAIN + * by the kernel's generic_file_splice_read() function. + */ + if (cnt == 0) + return (EFAULT); + + /* + * Revert advancing the uio_iter. This is set by zfs_uiocopy() + * to avoid consuming the uio and its iov_iter structure. + */ + if (revert) + iov_iter_revert(uio->uio_iter, cnt); + + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + + return (0); +} +#endif + +int +zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) +{ + if (uio->uio_segflg == UIO_BVEC) + return (zfs_uiomove_bvec(p, n, rw, uio)); +#if defined(HAVE_VFS_IOV_ITER) + else if (uio->uio_segflg == UIO_ITER) + return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE)); +#endif + else + return (zfs_uiomove_iov(p, n, rw, uio)); +} +EXPORT_SYMBOL(zfs_uiomove); + +/* + * Fault in the pages of the first n bytes specified by the uio structure. + * 1 byte in each page is touched and the uio struct is unmodified. 
Any + * error will terminate the process as this is only a best attempt to get + * the pages resident. + */ +int +zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) +{ + if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) { + /* There's never a need to fault in kernel pages */ + return (0); +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + /* + * With at least a Linux 4.9 kernel, iov_iter_fault_in_readable() + * can be relied on to fault in user pages when referenced. + */ + if (iov_iter_fault_in_readable(uio->uio_iter, n)) + return (EFAULT); +#endif + } else { + /* Fault in all user pages; only local cursors advance, not the uio */ + ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE); + const struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + size_t skip = uio->uio_skip; + uint8_t tmp; + caddr_t p; + + for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { + ulong_t cnt = MIN(iov->iov_len - skip, n); + /* empty iov */ + if (cnt == 0) + continue; + n -= cnt; + /* touch each page in this segment: read 1 byte every PAGESIZE bytes. */ + p = iov->iov_base + skip; + while (cnt) { + if (get_user(tmp, (uint8_t *)p)) + return (EFAULT); + ulong_t incr = MIN(cnt, PAGESIZE); + p += incr; + cnt -= incr; + } + /* touch the last byte in case it straddles a page. */ + p--; + if (get_user(tmp, (uint8_t *)p)) + return (EFAULT); + } + } + + return (0); +} +EXPORT_SYMBOL(zfs_uio_prefaultpages); + +/* + * The same as zfs_uiomove() but doesn't modify uio structure. + * Returns in cbytes how many bytes were copied. 
+ * + * The move is performed on a local copy of the uio. For UIO_ITER, + * zfs_uiomove_iter() is called with revert set so the iov_iter is rewound + * after the copy. The return value is that of the underlying move. + */ +int +zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) +{ + zfs_uio_t uio_copy; + int ret; + + bcopy(uio, &uio_copy, sizeof (zfs_uio_t)); + + if (uio->uio_segflg == UIO_BVEC) + ret = zfs_uiomove_bvec(p, n, rw, &uio_copy); +#if defined(HAVE_VFS_IOV_ITER) + else if (uio->uio_segflg == UIO_ITER) + ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE); +#endif + else + ret = zfs_uiomove_iov(p, n, rw, &uio_copy); + + *cbytes = uio->uio_resid - uio_copy.uio_resid; + + return (ret); +} +EXPORT_SYMBOL(zfs_uiocopy); + +/* + * Drop the next n chars out of *uio. A request for more than uio_resid + * bytes is ignored and leaves the uio untouched. + */ +void +zfs_uioskip(zfs_uio_t *uio, size_t n) +{ + if (n > uio->uio_resid) + return; + + if (uio->uio_segflg == UIO_BVEC) { + uio->uio_skip += n; + while (uio->uio_iovcnt && + uio->uio_skip >= uio->uio_bvec->bv_len) { + uio->uio_skip -= uio->uio_bvec->bv_len; + uio->uio_bvec++; + uio->uio_iovcnt--; + } +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + iov_iter_advance(uio->uio_iter, n); +#endif + } else { + uio->uio_skip += n; + while (uio->uio_iovcnt && + uio->uio_skip >= uio->uio_iov->iov_len) { + uio->uio_skip -= uio->uio_iov->iov_len; + uio->uio_iov++; + uio->uio_iovcnt--; + } + } + uio->uio_loffset += n; + uio->uio_resid -= n; +} +EXPORT_SYMBOL(zfs_uioskip); + +#endif /* _KERNEL */ diff --git a/module/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c similarity index 77% rename from module/zfs/zfs_vfsops.c rename to module/os/linux/zfs/zfs_vfsops.c index 371c412f6b..ff0b0d9df8 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -52,9 +52,12 @@ #include #include #include +#include #include #include +#include #include +#include #include #include "zfs_comutil.h" @@ -110,7 +113,7 @@ zfsvfs_vfs_free(vfs_t *vfsp) { if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) - strfree(vfsp->vfs_mntpoint); + kmem_strfree(vfsp->vfs_mntpoint); kmem_free(vfsp, sizeof (vfs_t)); } @@ -221,7 +224,7 @@ zfsvfs_parse_options(char 
*mntopts, vfs_t **vfsp) char *tmp_mntopts, *p, *t; int token; - tmp_mntopts = t = strdup(mntopts); + tmp_mntopts = t = kmem_strdup(mntopts); if (tmp_mntopts == NULL) return (SET_ERROR(ENOMEM)); @@ -233,13 +236,13 @@ zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) token = match_token(p, zpl_tokens, args); error = zfsvfs_parse_option(p, token, args, tmp_vfsp); if (error) { - strfree(tmp_mntopts); + kmem_strfree(tmp_mntopts); zfsvfs_vfs_free(tmp_vfsp); return (error); } } - strfree(tmp_mntopts); + kmem_strfree(tmp_mntopts); } *vfsp = tmp_vfsp; @@ -291,7 +294,7 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) } else { /* * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the + * run sync(1). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); @@ -349,13 +352,14 @@ acltype_changed_cb(void *arg, uint64_t newval) zfsvfs_t *zfsvfs = arg; switch (newval) { + case ZFS_ACLTYPE_NFSV4: case ZFS_ACLTYPE_OFF: zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; break; - case ZFS_ACLTYPE_POSIXACL: + case ZFS_ACLTYPE_POSIX: #ifdef CONFIG_FS_POSIX_ACL - zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL; + zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX; zfsvfs->z_sb->s_flags |= SB_POSIXACL; #else zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; @@ -430,9 +434,11 @@ snapdir_changed_cb(void *arg, uint64_t newval) } static void -vscan_changed_cb(void *arg, uint64_t newval) +acl_mode_changed_cb(void *arg, uint64_t newval) { - ((zfsvfs_t *)arg)->z_vscan = newval; + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; } static void @@ -494,11 +500,11 @@ zfs_register_callbacks(vfs_t *vfsp) zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); + error = error ? 
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); @@ -532,452 +538,79 @@ unregister: return (error); } -static int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp) -{ - sa_hdr_phys_t sa; - sa_hdr_phys_t *sap = data; - uint64_t flags; - int hdrsize; - boolean_t swap = B_FALSE; - - /* - * Is it a valid type of object to track? - */ - if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) - return (SET_ERROR(ENOENT)); - - /* - * If we have a NULL data pointer - * then assume the id's aren't changing and - * return EEXIST to the dmu to let it know to - * use the same ids - */ - if (data == NULL) - return (SET_ERROR(EEXIST)); - - if (bonustype == DMU_OT_ZNODE) { - znode_phys_t *znp = data; - *userp = znp->zp_uid; - *groupp = znp->zp_gid; - *projectp = ZFS_DEFAULT_PROJID; - return (0); - } - - if (sap->sa_magic == 0) { - /* - * This should only happen for newly created files - * that haven't had the znode data filled in yet. 
- */ - *userp = 0; - *groupp = 0; - *projectp = ZFS_DEFAULT_PROJID; - return (0); - } - - sa = *sap; - if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { - sa.sa_magic = SA_MAGIC; - sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); - swap = B_TRUE; - } else { - VERIFY3U(sa.sa_magic, ==, SA_MAGIC); - } - - hdrsize = sa_hdrsize(&sa); - VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); - - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); - flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); - if (swap) - flags = BSWAP_64(flags); - - if (flags & ZFS_PROJID) - *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_PROJID_OFFSET)); - else - *projectp = ZFS_DEFAULT_PROJID; - - if (swap) { - *userp = BSWAP_64(*userp); - *groupp = BSWAP_64(*groupp); - *projectp = BSWAP_64(*projectp); - } - return (0); -} - -static void -fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, - char *domainbuf, int buflen, uid_t *ridp) -{ - uint64_t fuid; - const char *domain; - - fuid = zfs_strtonum(fuidstr, NULL); - - domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); - if (domain) - (void) strlcpy(domainbuf, domain, buflen); - else - domainbuf[0] = '\0'; - *ridp = FUID_RID(fuid); -} - -static uint64_t -zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) -{ - switch (type) { - case ZFS_PROP_USERUSED: - case ZFS_PROP_USEROBJUSED: - return (DMU_USERUSED_OBJECT); - case ZFS_PROP_GROUPUSED: - case ZFS_PROP_GROUPOBJUSED: - return (DMU_GROUPUSED_OBJECT); - case ZFS_PROP_PROJECTUSED: - case ZFS_PROP_PROJECTOBJUSED: - return (DMU_PROJECTUSED_OBJECT); - case ZFS_PROP_USERQUOTA: - return (zfsvfs->z_userquota_obj); - case ZFS_PROP_GROUPQUOTA: - return (zfsvfs->z_groupquota_obj); - case ZFS_PROP_USEROBJQUOTA: - return (zfsvfs->z_userobjquota_obj); - case ZFS_PROP_GROUPOBJQUOTA: - return (zfsvfs->z_groupobjquota_obj); - case ZFS_PROP_PROJECTQUOTA: - return 
(zfsvfs->z_projectquota_obj); - case ZFS_PROP_PROJECTOBJQUOTA: - return (zfsvfs->z_projectobjquota_obj); - default: - return (ZFS_NO_OBJECT); - } -} - +/* + * Takes a dataset, a property, a value and that value's setpoint as + * found in the ZAP. Checks if the property has been changed in the vfs. + * If so, val and setpoint will be overwritten with updated content. + * Otherwise, they are left unchanged. + */ int -zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, + char *setpoint) { int error; - zap_cursor_t zc; - zap_attribute_t za; - zfs_useracct_t *buf = vbuf; - uint64_t obj; - int offset = 0; + zfsvfs_t *zfvp; + vfs_t *vfsp; + objset_t *os; + uint64_t tmp = *val; - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); - if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || - type == ZFS_PROP_PROJECTOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED) && - !dmu_objset_projectquota_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); + if (dmu_objset_type(os) != DMU_OST_ZFS) + return (EINVAL); - if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED || - type == ZFS_PROP_PROJECTOBJQUOTA) && - !dmu_objset_userobjspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + mutex_exit(&os->os_user_ptr_lock); + if (zfvp == NULL) + return (ESRCH); - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == ZFS_NO_OBJECT) { - *bufsizep = 0; - return (0); - } + vfsp = zfvp->z_vfs; - if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_PROJECTOBJUSED) - offset = DMU_OBJACCT_PREFIX_LEN; - - for 
(zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > - *bufsizep) - break; - - /* - * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) - * when dealing with block quota and vice versa. - */ - if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, - DMU_OBJACCT_PREFIX_LEN) == 0)) - continue; - - fuidstr_to_sid(zfsvfs, za.za_name + offset, - buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); - - buf->zu_space = za.za_first_integer; - buf++; - } - if (error == ENOENT) - error = 0; - - ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); - *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; - *cookiep = zap_cursor_serialize(&zc); - zap_cursor_fini(&zc); - return (error); -} - -/* - * buf must be big enough (eg, 32 bytes) - */ -static int -id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, - char *buf, boolean_t addok) -{ - uint64_t fuid; - int domainid = 0; - - if (domain && domain[0]) { - domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); - if (domainid == -1) - return (SET_ERROR(ENOENT)); - } - fuid = FUID_ENCODE(domainid, rid); - (void) sprintf(buf, "%llx", (longlong_t)fuid); - return (0); -} - -int -zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valp) -{ - char buf[20 + DMU_OBJACCT_PREFIX_LEN]; - int offset = 0; - int err; - uint64_t obj; - - *valp = 0; - - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED || - type == ZFS_PROP_PROJECTOBJQUOTA) && - !dmu_objset_userobjspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || - 
type == ZFS_PROP_PROJECTOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED) { - if (!dmu_objset_projectquota_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - if (!zpl_is_valid_projid(rid)) - return (SET_ERROR(EINVAL)); - } - - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == ZFS_NO_OBJECT) - return (0); - - if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_PROJECTOBJUSED) { - strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); - offset = DMU_OBJACCT_PREFIX_LEN; - } - - err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE); - if (err) - return (err); - - err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); - if (err == ENOENT) - err = 0; - return (err); -} - -int -zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota) -{ - char buf[32]; - int err; - dmu_tx_t *tx; - uint64_t *objp; - boolean_t fuid_dirtied; - - if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) - return (SET_ERROR(ENOTSUP)); - - switch (type) { - case ZFS_PROP_USERQUOTA: - objp = &zfsvfs->z_userquota_obj; + switch (zfs_prop) { + case ZFS_PROP_ATIME: + if (vfsp->vfs_do_atime) + tmp = vfsp->vfs_atime; break; - case ZFS_PROP_GROUPQUOTA: - objp = &zfsvfs->z_groupquota_obj; + case ZFS_PROP_RELATIME: + if (vfsp->vfs_do_relatime) + tmp = vfsp->vfs_relatime; break; - case ZFS_PROP_USEROBJQUOTA: - objp = &zfsvfs->z_userobjquota_obj; + case ZFS_PROP_DEVICES: + if (vfsp->vfs_do_devices) + tmp = vfsp->vfs_devices; break; - case ZFS_PROP_GROUPOBJQUOTA: - objp = &zfsvfs->z_groupobjquota_obj; + case ZFS_PROP_EXEC: + if (vfsp->vfs_do_exec) + tmp = vfsp->vfs_exec; break; - case ZFS_PROP_PROJECTQUOTA: - if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - if (!zpl_is_valid_projid(rid)) - return (SET_ERROR(EINVAL)); - - objp = &zfsvfs->z_projectquota_obj; + case ZFS_PROP_SETUID: + if (vfsp->vfs_do_setuid) + tmp = vfsp->vfs_setuid; break; - case 
ZFS_PROP_PROJECTOBJQUOTA: - if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - if (!zpl_is_valid_projid(rid)) - return (SET_ERROR(EINVAL)); - - objp = &zfsvfs->z_projectobjquota_obj; + case ZFS_PROP_READONLY: + if (vfsp->vfs_do_readonly) + tmp = vfsp->vfs_readonly; + break; + case ZFS_PROP_XATTR: + if (vfsp->vfs_do_xattr) + tmp = vfsp->vfs_xattr; + break; + case ZFS_PROP_NBMAND: + if (vfsp->vfs_do_nbmand) + tmp = vfsp->vfs_nbmand; break; default: - return (SET_ERROR(EINVAL)); + return (ENOENT); } - err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); - if (err) - return (err); - fuid_dirtied = zfsvfs->z_fuid_dirty; - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); - if (*objp == 0) { - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, - zfs_userquota_prop_prefixes[type]); + if (tmp != *val) { + (void) strcpy(setpoint, "temporary"); + *val = tmp; } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - mutex_enter(&zfsvfs->z_lock); - if (*objp == 0) { - *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, - DMU_OT_NONE, 0, tx); - VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); - } - mutex_exit(&zfsvfs->z_lock); - - if (quota == 0) { - err = zap_remove(zfsvfs->z_os, *objp, buf, tx); - if (err == ENOENT) - err = 0; - } else { - err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); - } - ASSERT(err == 0); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - dmu_tx_commit(tx); - return (err); -} - -boolean_t -zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) -{ - char buf[20 + DMU_OBJACCT_PREFIX_LEN]; - uint64_t used, quota, quotaobj; - int err; - - if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { - if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { - dsl_pool_config_enter( - 
dmu_objset_pool(zfsvfs->z_os), FTAG); - dmu_objset_id_quota_upgrade(zfsvfs->z_os); - dsl_pool_config_exit( - dmu_objset_pool(zfsvfs->z_os), FTAG); - } - return (B_FALSE); - } - - if (usedobj == DMU_PROJECTUSED_OBJECT) { - if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { - if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { - dsl_pool_config_enter( - dmu_objset_pool(zfsvfs->z_os), FTAG); - dmu_objset_id_quota_upgrade(zfsvfs->z_os); - dsl_pool_config_exit( - dmu_objset_pool(zfsvfs->z_os), FTAG); - } - return (B_FALSE); - } - quotaobj = zfsvfs->z_projectobjquota_obj; - } else if (usedobj == DMU_USERUSED_OBJECT) { - quotaobj = zfsvfs->z_userobjquota_obj; - } else if (usedobj == DMU_GROUPUSED_OBJECT) { - quotaobj = zfsvfs->z_groupobjquota_obj; - } else { - return (B_FALSE); - } - if (quotaobj == 0 || zfsvfs->z_replay) - return (B_FALSE); - - (void) sprintf(buf, "%llx", (longlong_t)id); - err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); - if (err != 0) - return (B_FALSE); - - (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id); - err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); - if (err != 0) - return (B_FALSE); - return (used >= quota); -} - -boolean_t -zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) -{ - char buf[20]; - uint64_t used, quota, quotaobj; - int err; - - if (usedobj == DMU_PROJECTUSED_OBJECT) { - if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { - if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { - dsl_pool_config_enter( - dmu_objset_pool(zfsvfs->z_os), FTAG); - dmu_objset_id_quota_upgrade(zfsvfs->z_os); - dsl_pool_config_exit( - dmu_objset_pool(zfsvfs->z_os), FTAG); - } - return (B_FALSE); - } - quotaobj = zfsvfs->z_projectquota_obj; - } else if (usedobj == DMU_USERUSED_OBJECT) { - quotaobj = zfsvfs->z_userquota_obj; - } else if (usedobj == DMU_GROUPUSED_OBJECT) { - quotaobj = zfsvfs->z_groupquota_obj; - } else { - return (B_FALSE); - } - if (quotaobj == 0 || zfsvfs->z_replay) - return 
(B_FALSE); - - (void) sprintf(buf, "%llx", (longlong_t)id); - err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); - if (err != 0) - return (B_FALSE); - - err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); - if (err != 0) - return (B_FALSE); - return (used >= quota); -} - -boolean_t -zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) -{ - return (zfs_id_overblockquota(zfsvfs, usedobj, id) || - zfs_id_overobjquota(zfsvfs, usedobj, id)); + return (0); } /* @@ -1174,7 +807,7 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); + ZFS_TEARDOWN_INIT(zfsvfs); rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); @@ -1238,11 +871,13 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, + "num_entries in unlinked set: %llu", + zs.zs_num_entries); } - dprintf_ds(zfsvfs->z_os->os_dsl_dataset, - "num_entries in unlinked set: %llu", - zs.zs_num_entries); zfs_unlinked_drain(zfsvfs); + dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dd->dd_activity_cancelled = B_FALSE; } /* @@ -1308,7 +943,7 @@ zfsvfs_free(zfsvfs_t *zfsvfs) mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); - rrm_destroy(&zfsvfs->z_teardown_lock); + ZFS_TEARDOWN_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != size; i++) { @@ -1329,7 +964,7 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs) zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } -void +static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; @@ -1361,7 +996,7 @@ 
zfs_check_global_label(const char *dsname, const char *hexsl) if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); - return (rdonly ? 0 : EACCES); + return (rdonly ? 0 : SET_ERROR(EACCES)); } return (SET_ERROR(EACCES)); } @@ -1378,7 +1013,8 @@ zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, int err; strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); - err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE); + err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, + sizeof (buf) - offset, B_FALSE); if (err) return (err); @@ -1445,9 +1081,9 @@ objs: } int -zfs_statvfs(struct dentry *dentry, struct kstatfs *statp) +zfs_statvfs(struct inode *ip, struct kstatfs *statp) { - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; @@ -1476,7 +1112,7 @@ zfs_statvfs(struct dentry *dentry, struct kstatfs *statp) * "preferred" size. */ - /* Round up so we never have a filesytem using 0 blocks. */ + /* Round up so we never have a filesystem using 0 blocks. 
*/ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize); statp->f_blocks = (refdbytes + availbytes) >> bshift; statp->f_bfree = availbytes >> bshift; @@ -1505,7 +1141,7 @@ zfs_statvfs(struct dentry *dentry, struct kstatfs *statp) if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && dmu_objset_projectquota_present(zfsvfs->z_os)) { - znode_t *zp = ITOZ(dentry->d_inode); + znode_t *zp = ITOZ(ip); if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && zpl_is_valid_projid(zp->z_projid)) @@ -1516,7 +1152,7 @@ zfs_statvfs(struct dentry *dentry, struct kstatfs *statp) return (err); } -int +static int zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) { znode_t *rootzp; @@ -1532,7 +1168,6 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) return (error); } -#ifdef HAVE_D_PRUNE_ALIASES /* * Linux kernels older than 3.1 do not support a per-filesystem shrinker. * To accommodate this we must improvise and manually walk the list of znodes @@ -1583,14 +1218,13 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) if (atomic_read(&ZTOI(zp)->i_count) == 1) objects++; - iput(ZTOI(zp)); + zrele(zp); } kmem_free(zp_array, max_array * sizeof (znode_t *)); return (objects); } -#endif /* HAVE_D_PRUNE_ALIASES */ /* * The ARC has requested that the filesystem drop entries from the dentry @@ -1602,13 +1236,11 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; -#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) struct shrinker *shrinker = &sb->s_shrink; struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, }; -#endif ZFS_ENTER(zfsvfs); @@ -1626,7 +1258,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) #elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) *objects = (*shrinker->scan_objects)(shrinker, &sc); -#elif defined(HAVE_SHRINK) +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) *objects = (*shrinker->shrink)(shrinker, &sc); #elif 
defined(HAVE_D_PRUNE_ALIASES) #define D_PRUNE_ALIASES_IS_DEFAULT @@ -1670,7 +1302,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) /* * If someone has not already unmounted this file system, - * drain the iput_taskq to ensure all active references to the + * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { @@ -1689,14 +1321,14 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) */ int round = 0; while (zfsvfs->z_nr_znodes > 0) { - taskq_wait_outstanding(dsl_pool_iput_taskq( + taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* @@ -1727,7 +1359,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } @@ -1736,7 +1368,12 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * will fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * - * Release all holds on dbufs. + * Release all holds on dbufs. We also grab an extra reference to all + * the remaining inodes so that the kernel does not attempt to free + * any inodes of a suspended fs. This can cause deadlocks since the + * zfs_resume_fs() process may involve starting threads, which might + * attempt to free unreferenced inodes to free up memory for the new + * thread. 
*/ if (!unmounting) { mutex_enter(&zfsvfs->z_znodes_lock); @@ -1744,6 +1381,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); + if (igrab(ZTOI(zp)) != NULL) + zp->z_suspended = B_TRUE; + } mutex_exit(&zfsvfs->z_znodes_lock); } @@ -1756,7 +1396,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* @@ -1777,15 +1417,25 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * Evict cached data. We must write out any dirty data before * disowning the dataset. */ - if (!zfs_is_readonly(zfsvfs)) + objset_t *os = zfsvfs->z_os; + boolean_t os_dirty = B_FALSE; + for (int t = 0; t < TXG_SIZE; t++) { + if (dmu_objset_is_dirty(os, t)) { + os_dirty = B_TRUE; + break; + } + } + if (!zfs_is_readonly(zfsvfs) && os_dirty) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + } dmu_objset_evict_dbufs(zfsvfs->z_os); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); return (0); } -#if !defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) && \ - !defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER) +#if defined(HAVE_SUPER_SETUP_BDI_NAME) atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); #endif @@ -1793,7 +1443,7 @@ int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) { const char *osname = zm->mnt_osname; - struct inode *root_inode; + struct inode *root_inode = NULL; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; @@ -1838,9 +1488,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; -#ifdef HAVE_S_D_OP sb->s_d_op = &zpl_dentry_operations; -#endif /* HAVE_S_D_OP */ /* Set features for file system. 
*/ zfs_set_fuid_feature(zfsvfs); @@ -1922,10 +1570,10 @@ zfs_preumount(struct super_block *sb) zfs_unlinked_drain_stop_wait(zfsvfs); zfsctl_destroy(sb->s_fs_info); /* - * Wait for iput_async before entering evict_inodes in + * Wait for zrele_async before entering evict_inodes in * generic_shutdown_super. The reason we must finish before * evict_inodes is when lazytime is on, or when zfs_purgedir - * calls zfs_zget, iput would bump i_count from 0 to 1. This + * calls zfs_zget, zrele would bump i_count from 0 to 1. This * would race with the i_count check in evict_inodes. This means * it could destroy the inode while we are still using it. * @@ -1933,12 +1581,12 @@ zfs_preumount(struct super_block *sb) * may add xattr entries in zfs_purgedir, so in the second pass * we wait for them. We don't use taskq_wait here because it is * a pool wide taskq. Other mounted filesystems can constantly - * do iput_async and there's no guarantee when taskq will be + * do zrele_async and there's no guarantee when taskq will be * empty. 
*/ - taskq_wait_outstanding(dsl_pool_iput_taskq( + taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); - taskq_wait_outstanding(dsl_pool_iput_taskq( + taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); } } @@ -2078,7 +1726,11 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); } else { - igrab(*ipp); + /* + * Must have an existing ref, so igrab() + * cannot return NULL + */ + VERIFY3P(igrab(*ipp), !=, NULL); } ZFS_EXIT(zfsvfs); return (0); @@ -2094,7 +1746,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) /* Don't export xattr stuff */ if (zp->z_pflags & ZFS_XATTR) { - iput(ZTOI(zp)); + zrele(zp); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } @@ -2109,14 +1761,14 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); - iput(ZTOI(zp)); + zrele(zp); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } *ipp = ZTOI(zp); if (*ipp) - zfs_inode_update(ITOZ(*ipp)); + zfs_znode_update_vfs(ITOZ(*ipp)); ZFS_EXIT(zfsvfs); return (0); @@ -2154,7 +1806,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) int err, err2; znode_t *zp; - ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* @@ -2164,12 +1816,16 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; + ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); @@ -2192,6 
+1848,12 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) remove_inode_hash(ZTOI(zp)); zp->z_is_stale = B_TRUE; } + + /* see comment in zfs_suspend_fs() */ + if (zp->z_suspended) { + zfs_zrele_async(zp); + zp->z_suspended = B_FALSE; + } } mutex_exit(&zfsvfs->z_znodes_lock); @@ -2204,12 +1866,25 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) zfs_unlinked_drain(zfsvfs); } + /* + * Most of the time zfs_suspend_fs is used for changing the contents + * of the underlying dataset. ZFS rollback and receive operations + * might create files for which negative dentries are present in + * the cache. Since walking the dcache would require a lot of GPL-only + * code duplication, it's much easier on these rather rare occasions + * just to flush the whole dcache for the given dataset/filesystem. + */ + shrink_dcache_sb(zfsvfs->z_sb); + bail: + if (err != 0) + zfsvfs->z_unmounted = B_TRUE; + /* release the VFS ops */ rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); - if (err) { + if (err != 0) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. @@ -2220,6 +1895,60 @@ bail: return (err); } +/* + * Release VOPs and unmount a suspended filesystem. + */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. 
+ */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); + + /* + * Try to force unmount this file system. + */ + (void) zfs_umount(zfsvfs->z_sb); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +/* + * Automounted snapshots rely on periodic revalidation + * to defer snapshots from being automatically unmounted. + */ + +inline void +zfs_exit_fs(zfsvfs_t *zfsvfs) +{ + if (!zfsvfs->z_issnap) + return; + + if (time_after(jiffies, zfsvfs->z_snap_defer_time + + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { + zfsvfs->z_snap_defer_time = jiffies; + zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, + dmu_objset_id(zfsvfs->z_os), + zfs_expire_snapshot); + } +} + int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { @@ -2373,7 +2102,7 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) } /* - * Return true if the coresponding vfs's unmounted flag is set. + * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ @@ -2394,12 +2123,22 @@ zfs_get_vfs_flag_unmounted(objset_t *os) return (unmounted); } +/*ARGSUSED*/ +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ + /* + * We don't need to do anything here, the devname is always current by + * virtue of zfsvfs->z_sb->s_op->show_devname. 
+ */ +} + void zfs_init(void) { zfsctl_init(); zfs_znode_init(); - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); } @@ -2419,12 +2158,6 @@ zfs_fini(void) #if defined(_KERNEL) EXPORT_SYMBOL(zfs_suspend_fs); EXPORT_SYMBOL(zfs_resume_fs); -EXPORT_SYMBOL(zfs_userspace_one); -EXPORT_SYMBOL(zfs_userspace_many); -EXPORT_SYMBOL(zfs_set_userquota); -EXPORT_SYMBOL(zfs_id_overblockquota); -EXPORT_SYMBOL(zfs_id_overobjquota); -EXPORT_SYMBOL(zfs_id_overquota); EXPORT_SYMBOL(zfs_set_version); EXPORT_SYMBOL(zfsvfs_create); EXPORT_SYMBOL(zfsvfs_free); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c new file mode 100644 index 0000000000..e0dc6ed957 --- /dev/null +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -0,0 +1,3999 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. 
+ */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) zrele() should always be the last thing except for zil_commit() (if + * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the + * last reference, the vnode/znode can be freed, so the zp may point to + * freed memory. Second, the last reference will call zfs_zinactive(), + * which may induce a lot of work -- pushing cached pages (which acquires + * range locks) and syncing out cached atime changes. Third, + * zfs_zinactive() may require a new tx, which could deadlock the system + * if you were already holding one. 
This deadlock occurs because the tx + * currently being operated on prevents a txg from syncing, which + * prevents the new tx from progressing, resulting in a deadlock. If you + * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() + * is a synonym for zrele(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. 
+ *	During ZIL replay the zfs_log_* functions will update the sequence
+ *	number to indicate the zil transaction has replayed.
+ *
+ *  (6)	At the end of each vnode op, the DMU tx must always commit,
+ *	regardless of whether there were any errors.
+ *
+ *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
+ *	to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ *	ZFS_ENTER(zfsvfs);		// exit if unmounted
+ * top:
+ *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
+ *	rw_enter(...);			// grab any other locks you need
+ *	tx = dmu_tx_create(...);	// get DMU tx
+ *	dmu_tx_hold_*();		// hold each object you might modify
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ *	if (error) {
+ *		rw_exit(...);		// drop locks
+ *		zfs_dirent_unlock(dl);	// unlock directory entry
+ *		zrele(...);		// release held znodes
+ *		if (error == ERESTART) {
+ *			waited = B_TRUE;
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		}
+ *		dmu_tx_abort(tx);	// abort DMU tx
+ *		ZFS_EXIT(zfsvfs);	// finished in zfs
+ *		return (error);		// really out of space
+ *	}
+ *	error = do_real_work();		// do whatever this VOP does
+ *	if (error == 0)
+ *		zfs_log_*(...);		// on success, make ZIL entry
+ *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
+ *	rw_exit(...);			// drop locks
+ *	zfs_dirent_unlock(dl);		// unlock directory entry
+ *	zrele(...);			// release held znodes
+ *	zil_commit(zilog, foid);	// synchronous when necessary
+ *	ZFS_EXIT(zfsvfs);		// finished in zfs
+ *	return (error);			// done, report error
+ */
+
+/*
+ * Open-time checks for a ZFS file.
+ *
+ * Fails with EPERM when a file flagged ZFS_APPENDONLY is opened for
+ * writing without O_APPEND.  Otherwise, when O_SYNC is set, bumps the
+ * znode's count of synchronous opens.
+ */
+/* ARGSUSED */
+int
+zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	znode_t *zp = ITOZ(ip);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* Append-only files may only be written through O_APPEND opens. */
+	if ((flag & O_APPEND) == 0 && (zp->z_pflags & ZFS_APPENDONLY) != 0 &&
+	    (mode & FMODE_WRITE) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/* Count this open if it is synchronous; zfs_close() undoes it. */
+	if ((flag & O_SYNC) != 0)
+		atomic_inc_32(&zp->z_sync_cnt);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Close-time counterpart of zfs_open(): drops the znode's count of
+ * synchronous opens when the file was opened with O_SYNC.
+ */
+/* ARGSUSED */
+int
+zfs_close(struct inode *ip, int flag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	znode_t *zp = ITOZ(ip);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* Undo the accounting done by zfs_open() for O_SYNC opens. */
+	if ((flag & O_SYNC) != 0)
+		atomic_dec_32(&zp->z_sync_cnt);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+#if defined(_KERNEL)
+/*
+ * Keep memory mapped pages in step with the DMU after a write.
+ *
+ * Walks the byte range [start, start + len) one page at a time.  For each
+ * page that find_lock_page() returns from the inode's mapping, the bytes
+ * covered by the range are read back from the DMU into that page, so the
+ * written data ends up in *both* the page and the dmu buffer.  Pages for
+ * which no page is returned are simply skipped.
+ */
+void
+update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
+{
+	struct address_space *mapping = ZTOI(zp)->i_mapping;
+	int64_t pgoff = start & (PAGE_SIZE-1);
+
+	/* Round down to the page holding the first byte. */
+	start &= PAGE_MASK;
+
+	while (len > 0) {
+		/* Only the first page can begin at a non-zero offset. */
+		uint64_t chunk = MIN(PAGE_SIZE - pgoff, len);
+		struct page *pp = find_lock_page(mapping,
+		    start >> PAGE_SHIFT);
+
+		if (pp != NULL) {
+			void *kaddr;
+
+			if (mapping_writably_mapped(mapping))
+				flush_dcache_page(pp);
+
+			/* Refresh the page contents from the DMU. */
+			kaddr = kmap(pp);
+			(void) dmu_read(os, zp->z_id, start + pgoff, chunk,
+			    kaddr + pgoff, DMU_READ_PREFETCH);
+			kunmap(pp);
+
+			if (mapping_writably_mapped(mapping))
+				flush_dcache_page(pp);
+
+			mark_page_accessed(pp);
+			SetPageUptodate(pp);
+			ClearPageError(pp);
+			unlock_page(pp);
+			put_page(pp);
+		}
+
+		len -= chunk;
+		pgoff = 0;
+		start += PAGE_SIZE;
+	}
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Read:	We "read" preferentially from memory mapped pages,
+ *		else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	 the file is memory mapped.
+ */
+int
+mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
+{
+	struct address_space *mapping = ZTOI(zp)->i_mapping;
+	int64_t pgstart = uio->uio_loffset;
+	int64_t pgoff = pgstart & (PAGE_SIZE-1);
+	int remaining = nbytes;
+	int error = 0;
+
+	/* Round down to the page holding the first byte. */
+	pgstart &= PAGE_MASK;
+
+	while (remaining > 0) {
+		/* Only the first page can begin at a non-zero offset. */
+		uint64_t chunk = MIN(PAGE_SIZE - pgoff, remaining);
+		struct page *pp = find_lock_page(mapping,
+		    pgstart >> PAGE_SHIFT);
+
+		if (pp == NULL) {
+			/* No page returned: take the bytes from the DMU. */
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, chunk);
+		} else {
+			void *kaddr;
+
+			ASSERT(PageUptodate(pp));
+			unlock_page(pp);
+
+			/* Copy out of the page into the caller's uio. */
+			kaddr = kmap(pp);
+			error = zfs_uiomove(kaddr + pgoff, chunk,
+			    UIO_READ, uio);
+			kunmap(pp);
+
+			if (mapping_writably_mapped(mapping))
+				flush_dcache_page(pp);
+
+			mark_page_accessed(pp);
+			put_page(pp);
+		}
+
+		remaining -= chunk;
+		pgoff = 0;
+		if (error != 0)
+			break;
+		pgstart += PAGE_SIZE;
+	}
+
+	return (error);
+}
+#endif /* _KERNEL */
+
+/*
+ * Block-count threshold used by zfs_remove(): a file whose size exceeds
+ * z_blksz * zfs_delete_blocks is treated as too big to free inline.
+ */
+unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	zp	- znode of file to be written to
+ *		data	- bytes to write
+ *		len	- number of bytes to write
+ *		pos	- offset to start writing at
+ *
+ *	OUT:	resid	- remaining bytes to write
+ *
+ *	RETURN:	0 if success
+ *		positive error code if failure.  EIO is returned
+ *		for a short write when residp isn't provided.
+ *
+ *	Timestamps:
+ *		zp - ctime|mtime updated if byte count > 0
+ */
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+    loff_t pos, size_t *residp)
+{
+	struct iovec iov = {
+		.iov_base = (void *)data,
+		.iov_len = len,
+	};
+	fstrans_cookie_t cookie;
+	zfs_uio_t uio;
+	int error;
+
+	/* Describe the kernel buffer as a single-segment uio at 'pos'. */
+	zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
+
+	cookie = spl_fstrans_mark();
+	error = zfs_write(zp, &uio, 0, kcred);
+	spl_fstrans_unmark(cookie);
+
+	if (error != 0)
+		return (error);
+
+	/* The caller wants the residual: report it and succeed. */
+	if (residp != NULL) {
+		*residp = zfs_uio_resid(&uio);
+		return (0);
+	}
+
+	/* Nobody to report a short write to, so it becomes an error. */
+	if (zfs_uio_resid(&uio) != 0)
+		return (SET_ERROR(EIO));
+
+	return (0);
+}
+
+/*
+ * Taskq callback for zfs_zrele_async(): drops the inode reference that
+ * could not be released inline.  'arg' is the struct inode pointer handed
+ * to taskq_dispatch().
+ */
+static void
+zfs_rele_async_task(void *arg)
+{
+	iput(arg);
+}
+
+/*
+ * Release a hold on a znode without ever doing the final iput() in the
+ * calling context.
+ *
+ * If the inode's reference count is above 1 it is simply decremented.
+ * If this would be the last reference, the iput() is handed to the
+ * pool's zrele taskq instead, because a synchronous final iput is
+ * unsafe here (see the header comment of this file).
+ */
+void
+zfs_zrele_async(znode_t *zp)
+{
+	struct inode *ip = ZTOI(zp);
+	objset_t *os = ITOZSB(ip)->z_os;
+
+	ASSERT(atomic_read(&ip->i_count) > 0);
+	ASSERT(os != NULL);
+
+	/* Common case: not the last reference, decrement inline. */
+	if (atomic_add_unless(&ip->i_count, -1, 1))
+		return;
+
+	/* Last reference: let the taskq perform the iput() later. */
+	VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
+	    zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
+}
+
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held inode reference for it.
+ *
+ *	IN:	zdp	- znode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		cr	- credentials of caller.
+ *		direntflags - directory lookup flags
+ *		realpnp - returned pathname.
+ *
+ *	OUT:	zpp	- znode of located entry, NULL if not found.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ *	Timestamps:
+ *		NA
+ */
+/* ARGSUSED */
+int
+zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
+    int *direntflags, pathname_t *realpnp)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
+	int error = 0;
+
+	/*
+	 * Fast path: plain lookups only.  Case folding or normalizing
+	 * lookups must not use it because the DNLC code only stores the
+	 * passed in name; creating 'a' and removing 'A' on a case
+	 * insensitive file system would work, but DNLC would still think
+	 * 'a' exists and refuse to create it again on the next pass
+	 * through the fast path.
+	 */
+	if ((flags & (LOOKUP_XATTR | FIGNORECASE)) == 0) {
+		if (!S_ISDIR(ZTOI(zdp)->i_mode))
+			return (SET_ERROR(ENOTDIR));
+		if (zdp->z_sa_hdl == NULL)
+			return (SET_ERROR(EIO));
+
+		/* "" and "." both resolve to the directory itself. */
+		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+			error = zfs_fastaccesschk_execute(zdp, cr);
+			if (error)
+				return (error);
+
+			*zpp = zdp;
+			zhold(*zpp);
+			return (0);
+		}
+	}
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zdp);
+
+	*zpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * Recursive attributes are not allowed (maybe someday).
+		 */
+		if (zdp->z_pflags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		error = zfs_get_xattrdir(zdp, zpp, cr, flags);
+		if (error) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * The caller needs execute permission on the attribute
+		 * directory; without it, give the hold back.
+		 */
+		error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_FALSE, cr);
+		if (error) {
+			zrele(*zpp);
+			*zpp = NULL;
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOTDIR));
+	}
+
+	/*
+	 * The caller must be allowed to search the directory.
+	 */
+	error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/* On utf8-only file systems reject names that fail validation. */
+	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
+	if (error == 0 && *zpp != NULL)
+		zfs_znode_update_vfs(*zpp);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory.  If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error.  Return the ip of the created or trunc'd file.
+ *
+ *	IN:	dzp	- znode of directory to put new file entry in.
+ *		name	- name of new file entry.
+ *		vap	- attributes of new file.
+ *		excl	- flag indicating exclusive or non-exclusive mode.
+ *		mode	- mode to open file with.
+ *		cr	- credentials of caller.
+ *		flag	- file flag.
+ *		vsecp	- ACL to be set
+ *
+ *	OUT:	zpp	- znode of created or trunc'd entry.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ *	Timestamps:
+ *		dzp - ctime|mtime updated if new entry created
+ *		 zp - ctime|mtime always, atime if new
+ *
+ *	Notes:
+ *		An empty name refers to dzp itself, so the call then takes
+ *		the "entry already exists" path on the directory.
+ *		When dmu_tx_assign() returns ERESTART the directory entry
+ *		lock is dropped and the operation restarts at "top"; ACL ids
+ *		that were already computed are kept across the retry
+ *		(tracked by have_acl).
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
+    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	objset_t	*os;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	int		error;
+	uid_t		uid;
+	gid_t		gid;
+	zfs_acl_ids_t   acl_ids;
+	boolean_t	fuid_dirtied;
+	boolean_t	have_acl = B_FALSE;	/* acl_ids holds live state */
+	boolean_t	waited = B_FALSE;	/* dmu_tx_wait() already done */
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 */
+
+	gid = crgetgid(cr);
+	uid = crgetuid(cr);
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	os = zfsvfs->z_os;
+	zilog = zfsvfs->z_log;
+
+	/* On utf8-only file systems reject names that fail validation. */
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (vap->va_mask & ATTR_XVATTR) {
+		if ((error = secpolicy_xvattr((xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_mode)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+top:
+	/* Re-entered from the ERESTART path below; start from scratch. */
+	*zpp = NULL;
+	if (*name == '\0') {
+		/*
+		 * Null component name refers to the directory itself.
+		 */
+		zhold(dzp);
+		zp = dzp;
+		dl = NULL;
+		error = 0;
+	} else {
+		/* possible igrab(zp) */
+		int zflg = 0;
+
+		if (flag & FIGNORECASE)
+			zflg |= ZCILOOK;
+
+		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+		    NULL, NULL);
+		if (error) {
+			if (have_acl)
+				zfs_acl_ids_free(&acl_ids);
+			/* Any lock failure on ".." is reported as EISDIR. */
+			if (strcmp(name, "..") == 0)
+				error = SET_ERROR(EISDIR);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if (zp == NULL) {
+		uint64_t txtype;
+		uint64_t projid = ZFS_DEFAULT_PROJID;
+
+		/*
+		 * Create a new file object and update the directory
+		 * to reference it.
+		 */
+		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+			if (have_acl)
+				zfs_acl_ids_free(&acl_ids);
+			goto out;
+		}
+
+		/*
+		 * We only support the creation of regular files in
+		 * extended attribute directories.
+		 */
+
+		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
+			if (have_acl)
+				zfs_acl_ids_free(&acl_ids);
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+
+		/* Compute the ACL ids once; a retry from "top" reuses them. */
+		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+		    cr, vsecp, &acl_ids)) != 0)
+			goto out;
+		have_acl = B_TRUE;
+
+		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+			projid = zfs_inherit_projid(dzp);
+		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+			zfs_acl_ids_free(&acl_ids);
+			error = SET_ERROR(EDQUOT);
+			goto out;
+		}
+
+		tx = dmu_tx_create(os);
+
+		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+		    ZFS_SA_BASE_ATTR_SIZE);
+
+		fuid_dirtied = zfsvfs->z_fuid_dirty;
+		if (fuid_dirtied)
+			zfs_fuid_txhold(zfsvfs, tx);
+		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+		if (!zfsvfs->z_use_sa &&
+		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, acl_ids.z_aclp->z_acl_bytes);
+		}
+
+		/*
+		 * The dirent lock is held, so assign without blocking
+		 * (rule 4 in the header comment of this file).
+		 */
+		error = dmu_tx_assign(tx,
+		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+		if (error) {
+			zfs_dirent_unlock(dl);
+			if (error == ERESTART) {
+				waited = B_TRUE;
+				dmu_tx_wait(tx);
+				dmu_tx_abort(tx);
+				goto top;
+			}
+			zfs_acl_ids_free(&acl_ids);
+			dmu_tx_abort(tx);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+		error = zfs_link_create(dl, zp, tx, ZNEW);
+		if (error != 0) {
+			/*
+			 * Since we failed to add the directory entry for it,
+			 * delete the newly created dnode.
+			 */
+			zfs_znode_delete(zp, tx);
+			remove_inode_hash(ZTOI(zp));
+			zfs_acl_ids_free(&acl_ids);
+			dmu_tx_commit(tx);
+			goto out;
+		}
+
+		if (fuid_dirtied)
+			zfs_fuid_sync(zfsvfs, tx);
+
+		/* Log the create before the tx commits and locks drop. */
+		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+		if (flag & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+		    vsecp, acl_ids.z_fuidp, vap);
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_commit(tx);
+	} else {
+		int aflags = (flag & O_APPEND) ? V_APPEND : 0;
+
+		/* ACL ids left over from an earlier pass are not needed. */
+		if (have_acl)
+			zfs_acl_ids_free(&acl_ids);
+		have_acl = B_FALSE;
+
+		/*
+		 * A directory entry already exists for this name.
+		 */
+		/*
+		 * Can't truncate an existing file if in exclusive mode.
+		 */
+		if (excl) {
+			error = SET_ERROR(EEXIST);
+			goto out;
+		}
+		/*
+		 * Can't open a directory for writing.
+		 */
+		if (S_ISDIR(ZTOI(zp)->i_mode)) {
+			error = SET_ERROR(EISDIR);
+			goto out;
+		}
+		/*
+		 * Verify requested access to file.
+		 */
+		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
+			goto out;
+		}
+
+		mutex_enter(&dzp->z_lock);
+		dzp->z_seq++;
+		mutex_exit(&dzp->z_lock);
+
+		/*
+		 * Truncate regular files if requested.
+		 */
+		if (S_ISREG(ZTOI(zp)->i_mode) &&
+		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
+			/* we can't hold any locks when calling zfs_freesp() */
+			if (dl) {
+				zfs_dirent_unlock(dl);
+				dl = NULL;
+			}
+			error = zfs_freesp(zp, 0, 0, mode, TRUE);
+		}
+	}
+out:
+
+	if (dl)
+		zfs_dirent_unlock(dl);
+
+	if (error) {
+		/* Drop the hold taken on zp; nothing is returned. */
+		if (zp)
+			zrele(zp);
+	} else {
+		zfs_znode_update_vfs(dzp);
+		zfs_znode_update_vfs(zp);
+		*zpp = zp;
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Create an unnamed file in the file system of directory dip.
+ *
+ * The new znode is created with IS_TMPFILE, marked z_unlinked and put on
+ * the unlinked set; no directory entry is created and no ZIL record is
+ * written here.  The permission and quota checks and the ACL id handling
+ * mirror the "new file" path of zfs_create().
+ *
+ *	IN:	dip	- inode of the directory the file is created under.
+ *		vap	- attributes of new file.
+ *		cr	- credentials of caller.
+ *		vsecp	- ACL to be set
+ *		excl, mode, flag - not used by this function.
+ *
+ *	OUT:	ipp	- inode of the created file (NULL on failure).
+ *
+ *	RETURN:	0 on success, error code on failure.
+ */
+/* ARGSUSED */
+int
+zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
+    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+	znode_t		*zp = NULL, *dzp = ITOZ(dip);
+	zfsvfs_t	*zfsvfs = ITOZSB(dip);
+	objset_t	*os;
+	dmu_tx_t	*tx;
+	int		error;
+	uid_t		uid;
+	gid_t		gid;
+	zfs_acl_ids_t   acl_ids;
+	uint64_t	projid = ZFS_DEFAULT_PROJID;
+	boolean_t	fuid_dirtied;
+	boolean_t	have_acl = B_FALSE;	/* acl_ids holds live state */
+	boolean_t	waited = B_FALSE;	/* dmu_tx_wait() already done */
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 */
+
+	gid = crgetgid(cr);
+	uid = crgetuid(cr);
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	os = zfsvfs->z_os;
+
+	if (vap->va_mask & ATTR_XVATTR) {
+		if ((error = secpolicy_xvattr((xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_mode)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+top:
+	/* Re-entered from the ERESTART path below. */
+	*ipp = NULL;
+
+	/*
+	 * Check that the caller may add a file to dzp.  (No directory
+	 * entry is created here; the new znode goes on the unlinked set.)
+	 */
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		if (have_acl)
+			zfs_acl_ids_free(&acl_ids);
+		goto out;
+	}
+
+	/* Compute the ACL ids once; a retry from "top" reuses them. */
+	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+	    cr, vsecp, &acl_ids)) != 0)
+		goto out;
+	have_acl = B_TRUE;
+
+	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+		projid = zfs_inherit_projid(dzp);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+		zfs_acl_ids_free(&acl_ids);
+		error = SET_ERROR(EDQUOT);
+		goto out;
+	}
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+	/* The unlinked-set ZAP is updated by zfs_unlinked_add() below. */
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	if (!zfsvfs->z_use_sa &&
+	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+		    0, acl_ids.z_aclp->z_acl_bytes);
+	}
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	/* Add to unlinked set */
+	zp->z_unlinked = B_TRUE;
+	zfs_unlinked_add(zp, tx);
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+out:
+
+	if (error) {
+		/* zp is still NULL on every path that reaches here with an error. */
+		if (zp)
+			zrele(zp);
+	} else {
+		zfs_znode_update_vfs(dzp);
+		zfs_znode_update_vfs(zp);
+		*ipp = ZTOI(zp);
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ *	IN:	dzp	- znode of directory to remove entry from.
+ *		name	- name of entry to remove.
+ *		cr	- credentials of caller.
+ *		flags	- case flags.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime
+ *	 ip - ctime (if nlink > 0)
+ *
+ * Notes:
+ *	Directories are refused with EPERM; callers must use rmdir.
+ *	The tx is set up for both possible outcomes (inline delete, or
+ *	deferral through the unlinked set); which one applies is only
+ *	decided after zfs_link_destroy() reports whether the last link
+ *	went away.  On ERESTART from dmu_tx_assign() all holds and the
+ *	dirent lock are dropped and the operation restarts at "top".
+ */
+
+/*
+ * Zero value written over SA_ZPL_XATTR when the xattr directory reference
+ * is cleared on a znode that is not z_is_sa (see zfs_remove()).
+ */
+uint64_t null_xattr = 0;
+
+/*ARGSUSED*/
+int
+zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
+{
+	znode_t		*zp;
+	znode_t		*xzp;	/* xattr directory of zp, if it has one */
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	uint64_t	acl_obj, xattr_obj;
+	uint64_t	xattr_obj_unlinked = 0;
+	uint64_t	obj = 0;
+	uint64_t	links;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	boolean_t	may_delete_now, delete_now = FALSE;
+	boolean_t	unlinked, toobig = FALSE;
+	uint64_t	txtype;
+	pathname_t	*realnmp = NULL;
+	pathname_t	realnm;
+	int		error;
+	int		zflg = ZEXISTS;
+	boolean_t	waited = B_FALSE;
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	/* Case-insensitive removal: also collect the real on-disk name. */
+	if (flags & FIGNORECASE) {
+		zflg |= ZCILOOK;
+		pn_alloc(&realnm);
+		realnmp = &realnm;
+	}
+
+top:
+	/* Re-entered from the ERESTART path below. */
+	xattr_obj = 0;
+	xzp = NULL;
+	/*
+	 * Attempt to lock directory; fail if entry doesn't exist.
+	 */
+	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+	    NULL, realnmp))) {
+		if (realnmp)
+			pn_free(realnmp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+		goto out;
+	}
+
+	/*
+	 * Need to use rmdir for removing directories.
+	 */
+	if (S_ISDIR(ZTOI(zp)->i_mode)) {
+		error = SET_ERROR(EPERM);
+		goto out;
+	}
+
+	/* First guess: only our hold exists and the file is not mapped. */
+	mutex_enter(&zp->z_lock);
+	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
+	    !(zp->z_is_mapped);
+	mutex_exit(&zp->z_lock);
+
+	/*
+	 * We may delete the znode now, or we may put it in the unlinked set;
+	 * it depends on whether we're the last link, and on whether there are
+	 * other holds on the inode.  So we dmu_tx_hold() the right things to
+	 * allow for either case.
+	 */
+	obj = zp->z_id;
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	if (may_delete_now) {
+		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
+		/* if the file is too big, only hold_free a token amount */
+		dmu_tx_hold_free(tx, zp->z_id, 0,
+		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+	}
+
+	/* are there any extended attributes? */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error == 0 && xattr_obj) {
+		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+		ASSERT0(error);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+	}
+
+	mutex_enter(&zp->z_lock);
+	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+	mutex_exit(&zp->z_lock);
+
+	/* charge as an update -- would be nice not to charge at all */
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+	/*
+	 * Mark this transaction as typically resulting in a net free of space
+	 */
+	dmu_tx_mark_netfree(tx);
+
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			/* Drop every hold before waiting, then start over. */
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			zrele(zp);
+			if (xzp)
+				zrele(xzp);
+			goto top;
+		}
+		if (realnmp)
+			pn_free(realnmp);
+		dmu_tx_abort(tx);
+		zrele(zp);
+		if (xzp)
+			zrele(xzp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Remove the directory entry.
+	 */
+	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+
+	if (error) {
+		dmu_tx_commit(tx);
+		goto out;
+	}
+
+	if (unlinked) {
+		/*
+		 * Hold z_lock so that we can make sure that the ACL obj
+		 * hasn't changed.  Could have been deleted due to
+		 * zfs_sa_upgrade().
+		 */
+		mutex_enter(&zp->z_lock);
+		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
+		/*
+		 * Inline delete only if nothing changed since the tx holds
+		 * were computed: same hold count, still unmapped, same xattr
+		 * object and same external ACL object.
+		 */
+		delete_now = may_delete_now && !toobig &&
+		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
+		    !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
+		    zfs_external_acl(zp) == acl_obj;
+	}
+
+	/* NOTE: zp->z_lock is still held here whenever 'unlinked' is set. */
+	if (delete_now) {
+		if (xattr_obj_unlinked) {
+			/* Unlink the xattr directory along with the file. */
+			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
+			mutex_enter(&xzp->z_lock);
+			xzp->z_unlinked = B_TRUE;
+			clear_nlink(ZTOI(xzp));
+			links = 0;
+			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+			    &links, sizeof (links), tx);
+			ASSERT3U(error, ==, 0);
+			mutex_exit(&xzp->z_lock);
+			zfs_unlinked_add(xzp, tx);
+
+			/* Clear zp's reference to the xattr directory. */
+			if (zp->z_is_sa)
+				error = sa_remove(zp->z_sa_hdl,
+				    SA_ZPL_XATTR(zfsvfs), tx);
+			else
+				error = sa_update(zp->z_sa_hdl,
+				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
+				    sizeof (uint64_t), tx);
+			ASSERT0(error);
+		}
+		/*
+		 * Add to the unlinked set because a new reference could be
+		 * taken concurrently resulting in a deferred destruction.
+		 */
+		zfs_unlinked_add(zp, tx);
+		mutex_exit(&zp->z_lock);
+	} else if (unlinked) {
+		mutex_exit(&zp->z_lock);
+		zfs_unlinked_add(zp, tx);
+	}
+
+	/* Log the remove before the tx commits and locks drop. */
+	txtype = TX_REMOVE;
+	if (flags & FIGNORECASE)
+		txtype |= TX_CI;
+	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+	dmu_tx_commit(tx);
+out:
+	if (realnmp)
+		pn_free(realnmp);
+
+	zfs_dirent_unlock(dl);
+	zfs_znode_update_vfs(dzp);
+	zfs_znode_update_vfs(zp);
+
+	/*
+	 * Only the delete_now case releases zp synchronously; every other
+	 * case hands a possible final release to the zrele taskq.
+	 */
+	if (delete_now)
+		zrele(zp);
+	else
+		zfs_zrele_async(zp);
+
+	if (xzp) {
+		zfs_znode_update_vfs(xzp);
+		zfs_zrele_async(xzp);
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Create a new directory and insert it into dzp using the name
+ * provided.  Return a pointer to the inserted directory.
+ *
+ *	IN:	dzp	- znode of directory to add subdir to.
+ *		dirname	- name of new directory.
+ *		vap	- attributes of new directory.
+ *		cr	- credentials of caller.
+ * flags - case flags. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created directory. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime updated + * zpp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, + cred_t *cr, int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t waited = B_FALSE; + + ASSERT(S_ISDIR(vap->va_mode)); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (dirname == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. 
+ */ +top: + *zpp = NULL; + + if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + /* + * Now put new name in parent dir. 
+ */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); + +out: + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (error != 0) { + zrele(zp); + } else { + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); + } + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dzp - znode of directory to remove from. + * name - name of directory to be removed. + * cwd - inode of current working directory. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, + int flags) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. 
+ */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (!S_ISDIR(ZTOI(zp)->i_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + if (zp == cwd) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * Grab a lock on the directory to make sure that no one is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + goto top; + } + dmu_tx_abort(tx); + zrele(zp); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, + B_FALSE); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +out: + zfs_dirent_unlock(dl); + + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); + zrele(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read directory entries from the given directory cursor position and emit + * name and position for each entry. 
+ * + * IN: ip - inode of directory to read. + * ctx - directory entry context. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + * + * Note that the low 4 bits of the cookie returned by zap are always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +int +zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + zap_cursor_t zc; + zap_attribute_t zap; + int error; + uint8_t prefetch; + uint8_t type; + int done = 0; + uint64_t parent; + uint64_t offset; /* must be unsigned; checks for < 1 */ + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) + goto out; + + /* + * Quit if directory has been removed (posix) + */ + if (zp->z_unlinked) + goto out; + + error = 0; + os = zfsvfs->z_os; + offset = ctx->pos; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Transform to file-system independent format + */ + while (!done) { + uint64_t objnum; + /* + * Special case `.', `..', and `.zfs'.
+ */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if (error == ENOENT) + break; + else + goto update; + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (zap.za_integer_length != 8 || + zap.za_num_integers == 0) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld, " + "length = %d, num = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset, + zap.za_integer_length, + (u_longlong_t)zap.za_num_integers); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + } + + done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), + objnum, type); + if (done) + break; + + /* Prefetch znode */ + if (prefetch) { + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + } + + /* + * Move to the next entry, fill in the previous offset. 
+ */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + ctx->pos = offset; + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + +update: + zap_cursor_fini(&zc); + if (error == ENOENT) + error = 0; +out: + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Get the basic file attributes and place them in the provided kstat + * structure. The inode is assumed to be the authoritative source + * for most of the attributes. However, the znode currently has the + * authoritative atime, blksize, and block count. + * + * IN: ip - inode of file. + * + * OUT: sp - kstat values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +int +zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, + struct kstat *sp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint32_t blksize; + u_longlong_t nblocks; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + + zpl_generic_fillattr(user_ns, ip, sp); + /* + * +1 link count for root inode with visible '.zfs' directory. + */ + if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) + if (sp->nlink < ZFS_LINK_MAX) + sp->nlink++; + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + sp->blksize = blksize; + sp->blocks = nblocks; + + if (unlikely(zp->z_blksz == 0)) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + sp->blksize = zfsvfs->z_max_blksz; + } + + mutex_exit(&zp->z_lock); + + /* + * Required to prevent NFS client from detecting different inode + * numbers of snapshot root dentry before and after snapshot mount. 
+ */ + if (zfsvfs->z_issnap) { + if (ip->i_sb->s_root->d_inode == ip) + sp->ino = ZFSCTL_INO_SNAPDIRS - + dmu_objset_id(zfsvfs->z_os); + } + + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * When a file's user/group/project is changed, the change has to be applied + * not only to the main object that is assigned to the file directly, but + * also to the objects that the file uses via its hidden xattr directory. + * + * The xattr directory may contain many EA entries, so it may be impossible + * to change all of them within the transaction that changes the main + * object's user/group/project attributes. Instead, the entries are changed + * one by one, each in its own independent transaction. This may not be a + * good solution, but we have no better idea yet. + * + * IN: dzp - znode of the directory whose entries are updated to match + * its uid, gid and project ID. + * + * RETURN: 0 if success (a terminating ENOENT is reported as 0) + * error code if failure + */ +static int +zfs_setattr_dir(znode_t *dzp) +{ + struct inode *dxip = ZTOI(dzp); + struct inode *xip = NULL; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + objset_t *os = zfsvfs->z_os; + zap_cursor_t zc; + zap_attribute_t zap; + zfs_dirlock_t *dl; + znode_t *zp = NULL; + dmu_tx_t *tx = NULL; + uint64_t uid, gid; + sa_bulk_attr_t bulk[4]; + int count; + int err; + + zap_cursor_init(&zc, os, dzp->z_id); + while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { + count = 0; + /* Each entry must be exactly one 8-byte integer. */ + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + err = ENXIO; + break; + } + + err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, + ZEXISTS, NULL, NULL); + if (err == ENOENT) + goto next; /* skip this entry and advance the cursor */ + if (err) + break; + + xip = ZTOI(zp); + /* Skip entries whose uid, gid and projid already match dzp's. */ + if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && + KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && + zp->z_projid == dzp->z_projid) + goto next; + + tx = dmu_tx_create(os); + /* NOTE(review): B_TRUE is passed only when ZFS_PROJID is not yet set, presumably because adding the project ID can grow the SA -- confirm. */ + if (!(zp->z_pflags & ZFS_PROJID)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + break; /* tx is still set and is aborted after the loop */ + + mutex_enter(&dzp->z_lock); + + if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { + xip->i_uid = dxip->i_uid; +
uid = zfs_uid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, sizeof (uid)); + } + + if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { + xip->i_gid = dxip->i_gid; + gid = zfs_gid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, sizeof (gid)); + } + + if (zp->z_projid != dzp->z_projid) { + if (!(zp->z_pflags & ZFS_PROJID)) { + zp->z_pflags |= ZFS_PROJID; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, + sizeof (zp->z_pflags)); + } + + zp->z_projid = dzp->z_projid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), + NULL, &zp->z_projid, sizeof (zp->z_projid)); + } + + mutex_exit(&dzp->z_lock); + + /* Commit only if an attribute was queued; otherwise abort the empty tx. */ + if (likely(count > 0)) { + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + tx = NULL; /* already committed or aborted above */ + if (err != 0 && err != ENOENT) + break; + +next: + if (zp) { + zrele(zp); + zp = NULL; + zfs_dirent_unlock(dl); + } + zap_cursor_advance(&zc); + } + + /* A break out of the loop can leave a tx, a znode hold and a dirent lock behind. */ + if (tx) + dmu_tx_abort(tx); + if (zp) { + zrele(zp); + zfs_dirent_unlock(dl); + } + zap_cursor_fini(&zc); + + return (err == ENOENT ? 0 : err); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If ATTR_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime updated, mtime updated if size changed.
+ */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + struct inode *ip; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t *tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2], atime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2 = 0; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + boolean_t handle_eadir = B_FALSE; + sa_bulk_attr_t *bulk, *xattr_bulk; + int count = 0, xattr_count = 0, bulks = 8; + + if (mask == 0) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + ip = ZTOI(zp); + + /* + * If this is a xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. 
+ */ + xoap = xva_getxoptattr(xvap); + if (xoap != NULL && (mask & ATTR_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + } + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & ATTR_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); + xva_init(tmpxvattr); + + bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || + ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + err = SET_ERROR(EPERM); + goto out3; + } + + if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) 
{ + err = SET_ERROR(EPERM); + goto out3; + } + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. + */ + if (mask & (ATTR_ATIME | ATTR_MTIME)) { + if (((mask & ATTR_ATIME) && + TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & ATTR_MTIME) && + TIMESPEC_OVERFLOW(&vap->va_mtime))) { + err = SET_ERROR(EOVERFLOW); + goto out3; + } + } + +top: + attrzp = NULL; + aclp = NULL; + + /* Can this be moved to before the top label? */ + if (zfs_is_readonly(zfsvfs)) { + err = SET_ERROR(EROFS); + goto out3; + } + + /* + * First validate permissions + */ + + if (mask & ATTR_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + if (err) + goto out3; + + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) + goto out3; + } + + if (mask & (ATTR_ATIME|ATTR_MTIME) || + ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (ATTR_UID|ATTR_GID)) { + int idmask = (mask & (ATTR_UID|ATTR_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. 
+ */ + + if (!(mask & ATTR_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & ATTR_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both ATTR_UID and ATTR_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (ATTR_UID|ATTR_GID)) && + take_owner && take_group) || + ((idmask == ATTR_UID) && take_owner) || + ((idmask == ATTR_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + (void) secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (ATTR_UID|ATTR_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & ATTR_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((!S_ISREG(ip->i_mode) && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + err = SET_ERROR(EPERM); + goto out3; + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + 
} + + mutex_exit(&zp->z_lock); + + if (mask & ATTR_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(ip, vap, + &oldva, cr); + if (err) + goto out3; + + trim_mask |= ATTR_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + } + err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) + goto out3; + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { + handle_eadir = B_TRUE; + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); + if (err) + goto out2; + } + if (mask & ATTR_UID) { + new_kuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_kuid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & ATTR_GID) { + new_kgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); + if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_kgid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + zrele(attrzp); + err = 
EDQUOT; + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & ATTR_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = EPERM; + goto out; + } + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + mutex_exit(&zp->z_lock); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & ATTR_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. 
+ */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. + * But quota accounting logic needs to access related slots by + * offset directly. So we need to adjust old objects' layout + * to make the project ID to some unified and fixed offset. + */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&attrzp->z_acl_lock); + mutex_enter(&attrzp->z_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (ATTR_UID|ATTR_GID)) { + + if (mask & ATTR_UID) { + ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); + new_uid = zfs_uid_read(ZTOI(zp)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); + } + } + + if (mask & ATTR_GID) { + ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); + new_gid = zfs_gid_read(ZTOI(zp)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), 
NULL, &new_gid, + sizeof (new_gid)); + ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); + } + } + if (!(mask & ATTR_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & ATTR_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = ZTOI(zp)->i_mode = new_mode; + ASSERT3P(aclp, !=, NULL); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { + zp->z_atime_dirty = B_FALSE; + ZFS_TIME_ENCODE(&ip->i_atime, atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, sizeof (atime)); + } + + if (mask & (ATTR_MTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( + vap->va_mtime, ZTOI(zp)); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (mask & (ATTR_CTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_ctime, ctime); + ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, + ZTOI(zp)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + if (attrzp && mask) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, &ctime, + sizeof (ctime)); + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & ATTR_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(S_ISREG(ip->i_mode)); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + mutex_exit(&zp->z_lock); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } +out: + if (err == 0 && xattr_count > 0) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + if (attrzp) + zrele(attrzp); + if (err == ERESTART) + goto top; + } else { + if (count > 0) + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + if (attrzp) { + if (err2 == 0 && handle_eadir) + err2 = zfs_setattr_dir(attrzp); + zrele(attrzp); + } + zfs_znode_update_vfs(zp); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + +out3: + kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(tmpxvattr, sizeof (xvattr_t)); + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef 
struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + zfs_zrele_async(zl->zl_znode); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = ZTOZSB(zp)->z_root; + uint64_t oidp = zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. + */ + if (rw == RW_READER && zp->z_id > szp->z_id) { + /* + * Drop our locks and restart + */ + zfs_rename_unlock(&zl); + *zlpp = NULL; + zp = tdzp; + oidp = zp->z_id; + rwlp = &szp->z_parent_lock; + rw = RW_WRITER; + continue; + } else { + /* + * Wait for other thread to drop its locks + */ + rw_enter(rwlp, rw); + } + } + + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + if (oidp == szp->z_id) /* We're a descendant of szp */ + return (SET_ERROR(EINVAL)); + + if (oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. 
not the first pass */ + int error = zfs_zget(ZTOZSB(zp), oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), + &oidp, sizeof (oidp)); + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdzp - Source directory containing the "old entry". + * snm - Old entry name. + * tdzp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdzp,tdzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, + cred_t *cr, int flags) +{ + znode_t *szp, *tzp; + zfsvfs_t *zfsvfs = ZTOZSB(sdzp); + zilog_t *zilog; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int cmp, serr, terr; + int error = 0; + int zflg = 0; + boolean_t waited = B_FALSE; + + if (snm == NULL || tnm == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; + + ZFS_VERIFY_ZP(tdzp); + + /* + * We check i_sb because snapshots and the ctldir must have different + * super blocks. + */ + if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || + zfsctl_is_node(ZTOI(tdzp))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. 
+ */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + cmp = -1; + } else if (sdzp->z_id > tdzp->z_id) { + cmp = 1; + } else { + /* + * First compare the two name arguments without + * considering any case folding. + */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); + if (cmp == 0) { + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + */ + ZFS_EXIT(zfsvfs); + return (0); + } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. 
+ */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } + } + + /* + * If the source and destination directories are the same, we should + * grab the z_name_lock of that directory only once. + */ + if (sdzp == tdzp) { + zflg |= ZHAVELOCK; + rw_enter(&sdzp->z_name_lock, RW_READER); + } + + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); + } else { + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + zrele(tzp); + } + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(snm, "..") == 0) + serr = EINVAL; + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + zrele(szp); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow renames into our tree when the project + * IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. 
+ * Note that if target and source are the same, this can be + * done in a single check. + */ + + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto out; + + if (S_ISDIR(ZTOI(szp)->i_mode)) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) + goto out; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (S_ISDIR(ZTOI(szp)->i_mode)) { + if (!S_ISDIR(ZTOI(tzp)->i_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + } else { + if (S_ISDIR(ZTOI(tzp)->i_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(szp); + if (tzp) + zrele(tzp); + goto top; + } + dmu_tx_abort(tx); + zrele(szp); + if (tzp) + zrele(tzp); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + + if (error == 0) { + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + if (tdzp->z_pflags & ZFS_PROJINHERIT) + szp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME | + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdl, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } else { + /* + * If we had removed the existing target, subsequent + * call to zfs_link_create() to add back the same entry + * but, the new dnode (szp) should not fail. 
+ */ + ASSERT(tzp == NULL); + } + } + + dmu_tx_commit(tx); +out: + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + zfs_znode_update_vfs(sdzp); + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (sdzp != tdzp) + zfs_znode_update_vfs(tdzp); + + zfs_znode_update_vfs(szp); + zrele(szp); + if (tzp) { + zfs_znode_update_vfs(tzp); + zrele(tzp); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dzp - Directory to contain new symbolic link. + * name - Name of directory entry in dip. + * vap - Attributes of new entry. + * link - Name for new symlink entry. + * cr - credentials of caller. + * flags - case flags + * + * OUT: zpp - Znode for new symbolic link. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dip - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, + znode_t **zpp, cred_t *cr, int flags) +{ + znode_t *zp; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + uint64_t len = strlen(link); + int error; + int zflg = ZNEW; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; + boolean_t waited = B_FALSE; + + ASSERT(S_ISLNK(vap->va_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENAMETOOLONG)); + } + + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } +top: + *zpp = NULL; + + /* + * Attempt to 
lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create a new object for the symlink. + * for version 4 ZPL datasets the symlink will be an SA attribute + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + mutex_exit(&zp->z_lock); + + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); + /* + * Insert the new object into the directory. 
+ */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + } else { + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); + } + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (error == 0) { + *zpp = zp; + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } else { + zrele(zp); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by ip. + * + * IN: ip - inode of symbolic link + * uio - structure to contain the link path. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + */ +/* ARGSUSED */ +int +zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdzp referencing szp. + * + * IN: tdzp - Directory to contain new entry. + * szp - znode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * flags - case flags. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdzp - ctime|mtime updated + * szp - ctime updated + */ +/* ARGSUSED */ +int +zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, + int flags) +{ + struct inode *sip = ZTOI(szp); + znode_t *tzp; + zfsvfs_t *zfsvfs = ZTOZSB(tdzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uint64_t parent; + uid_t owner; + boolean_t waited = B_FALSE; + boolean_t is_tmpfile = 0; + uint64_t txg; +#ifdef HAVE_TMPFILE + is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); +#endif + ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(tdzp); + zilog = zfsvfs->z_log; + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (S_ISDIR(sip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + ZFS_VERIFY_ZP(szp); + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow hard link creation in our tree when the + * project IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + /* + * We check i_sb because snapshots and the ctldir must have different + * super blocks. 
+ */ + if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), + cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + +top: + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); + if (is_tmpfile) + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, tdzp); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + /* unmark z_unlinked so zfs_link_create will not reject */ + if (is_tmpfile) + szp->z_unlinked = B_FALSE; + error = zfs_link_create(dl, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + /* + * tmpfile is created to be in z_unlinkedobj, so remove it. + * Also, we don't log in ZIL, because all previous file + * operation on the tmpfile are ignored by ZIL. Instead we + * always wait for txg to sync to make sure all previous + * operation are sync safe. + */ + if (is_tmpfile) { + VERIFY(zap_remove_int(zfsvfs->z_os, + zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); + } else { + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + } + } else if (is_tmpfile) { + /* restore z_unlinked since when linking failed */ + szp->z_unlinked = B_TRUE; + } + txg = dmu_tx_get_txg(tx); + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); + + zfs_znode_update_vfs(tdzp); + zfs_znode_update_vfs(szp); + ZFS_EXIT(zfsvfs); + return (error); +} + +static void +zfs_putpage_commit_cb(void *arg) +{ + struct page *pp = arg; + + ClearPageError(pp); + end_page_writeback(pp); +} + +/* + * Push a page out to disk, once the page is on stable storage the + * registered commit callback will be run as notification of completion. + * + * IN: ip - page mapped for inode. 
+ * pp - page to push (page is locked) + * wbc - writeback control data + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + loff_t offset; + loff_t pgoff; + unsigned int pglen; + dmu_tx_t *tx; + caddr_t va; + int err = 0; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int cnt = 0; + struct address_space *mapping; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + ASSERT(PageLocked(pp)); + + pgoff = page_offset(pp); /* Page byte-offset in file */ + offset = i_size_read(ip); /* File length in bytes */ + pglen = MIN(PAGE_SIZE, /* Page length in bytes */ + P2ROUNDUP(offset, PAGE_SIZE)-pgoff); + + /* Page is beyond end of file */ + if (pgoff >= offset) { + unlock_page(pp); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* Truncate page length to end of file */ + if (pgoff + pglen > offset) + pglen = offset - pgoff; + +#if 0 + /* + * FIXME: Allow mmap writes past its quota. The correct fix + * is to register a page_mkwrite() handler to count the page + * against its quota when it is about to be dirtied. + */ + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, + KUID_TO_SUID(ip->i_uid)) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, + KGID_TO_SGID(ip->i_gid)) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + err = EDQUOT; + } +#endif + + /* + * The ordering here is critical and must adhere to the following + * rules in order to avoid deadlocking in either zfs_read() or + * zfs_free_range() due to a lock inversion. + * + * 1) The page must be unlocked prior to acquiring the range lock. + * This is critical because zfs_read() calls find_lock_page() + * which may block on the page lock while holding the range lock. 
+ * + * 2) Before setting or clearing write back on a page the range lock + * must be held in order to prevent a lock inversion with the + * zfs_free_range() function. + * + * This presents a problem because upon entering this function the + * page lock is already held. To safely acquire the range lock the + * page lock must be dropped. This creates a window where another + * process could truncate, invalidate, dirty, or write out the page. + * + * Therefore, after successfully reacquiring the range and page locks + * the current page state is checked. In the common case everything + * will be as is expected and it can be written out. However, if + * the page state has changed it must be handled accordingly. + */ + mapping = pp->mapping; + redirty_page_for_writepage(wbc, pp); + unlock_page(pp); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, + pgoff, pglen, RL_WRITER); + lock_page(pp); + + /* Page mapping changed or it was no longer dirty, we're done */ + if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { + unlock_page(pp); + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* Another process started write block if required */ + if (PageWriteback(pp)) { + unlock_page(pp); + zfs_rangelock_exit(lr); + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(pp)) + wait_on_page_bit(pp, PG_writeback); + } + + ZFS_EXIT(zfsvfs); + return (0); + } + + /* Clear the dirty flag the required locks are held */ + if (!clear_page_dirty_for_io(pp)) { + unlock_page(pp); + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Counterpart for redirty_page_for_writepage() above. This page + * was in fact not skipped and should not be counted as if it were. 
+ */ + wbc->pages_skipped--; + set_page_writeback(pp); + unlock_page(pp); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_NOWAIT); + if (err != 0) { + if (err == ERESTART) + dmu_tx_wait(tx); + + dmu_tx_abort(tx); + __set_page_dirty_nobuffers(pp); + ClearPageError(pp); + end_page_writeback(pp); + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (err); + } + + va = kmap(pp); + ASSERT3U(pglen, <=, PAGE_SIZE); + dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); + kunmap(pp); + + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* Preserve the mtime and ctime provided by the inode */ + ZFS_TIME_ENCODE(&ip->i_mtime, mtime); + ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + zp->z_atime_dirty = B_FALSE; + zp->z_seq++; + + err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); + + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, + zfs_putpage_commit_cb, pp); + dmu_tx_commit(tx); + + zfs_rangelock_exit(lr); + + if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * Note that this is rarely called under writepages(), because + * writepages() normally handles the entire commit for + * performance reasons. + */ + zil_commit(zfsvfs->z_log, zp->z_id); + } + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Update the system attributes when the inode has been dirtied. For the + * moment we only update the mode, atime, mtime, and ctime. 
+ */ +int +zfs_dirty_inode(struct inode *ip, int flags) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + dmu_tx_t *tx; + uint64_t mode, atime[2], mtime[2], ctime[2]; + sa_bulk_attr_t bulk[4]; + int error = 0; + int cnt = 0; + + if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + +#ifdef I_DIRTY_TIME + /* + * This is the lazytime semantic introduced in Linux 4.0 + * This flag will only be called from update_time when lazytime is set. + * (Note, I_DIRTY_SYNC will also set if not lazytime) + * Fortunately mtime and ctime are managed within ZFS itself, so we + * only need to dirty atime. + */ + if (flags == I_DIRTY_TIME) { + zp->z_atime_dirty = B_TRUE; + goto out; + } +#endif + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + mutex_enter(&zp->z_lock); + zp->z_atime_dirty = B_FALSE; + + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + /* Preserve the mode, mtime and ctime provided by the inode */ + ZFS_TIME_ENCODE(&ip->i_atime, atime); + ZFS_TIME_ENCODE(&ip->i_mtime, mtime); + ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + mode = ip->i_mode; + + zp->z_mode = mode; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); + mutex_exit(&zp->z_lock); + + dmu_tx_commit(tx); +out: + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(struct inode *ip) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint64_t atime[2]; + int error; + int need_unlock = 0; + + /* Only read lock if we haven't already write locked, e.g. 
rollback */ + if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { + need_unlock = 1; + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + } + if (zp->z_sa_hdl == NULL) { + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + ZFS_TIME_ENCODE(&ip->i_atime, atime); + mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&atime, sizeof (atime), tx); + zp->z_atime_dirty = B_FALSE; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } + + zfs_zinactive(zp); + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + +/* + * Fill pages with data from the disk. + */ +static int +zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + struct page *cur_pp; + u_offset_t io_off, total; + size_t io_len; + loff_t i_size; + unsigned page_idx; + int err; + + os = zfsvfs->z_os; + io_len = nr_pages << PAGE_SHIFT; + i_size = i_size_read(ip); + io_off = page_offset(pl[0]); + + if (io_off + io_len > i_size) + io_len = i_size - io_off; + + /* + * Iterate over list of pages and read each page individually. + */ + page_idx = 0; + for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + + cur_pp = pl[page_idx++]; + va = kmap(cur_pp); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); + kunmap(cur_pp); + if (err) { + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = SET_ERROR(EIO); + return (err); + } + } + + return (0); +} + +/* + * Uses zfs_fillpage to read data from the file and fill the pages. + * + * IN: ip - inode of file to get data from. 
+ * pl - list of pages to read + * nr_pages - number of pages to read + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +int +zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int err; + + if (pl == NULL) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + err = zfs_fillpage(ip, pl, nr_pages); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Check ZFS specific permissions to memory map a section of a file. + * + * IN: ip - inode of the file to mmap + * off - file offset + * addrp - start address in memory region + * len - length of memory region + * vm_flags- address flags + * + * RETURN: 0 if success + * error code if failure + */ +/*ARGSUSED*/ +int +zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, + unsigned long vm_flags) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((vm_flags & VM_WRITE) && (zp->z_pflags & + (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((vm_flags & (VM_READ | VM_EXEC)) && + (zp->z_pflags & ZFS_AV_QUARANTINED)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + if (off < 0 || len > MAXOFFSET_T - off) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENXIO)); + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: zp - znode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. 
+ * + * Timestamps: + * zp - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. + */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +int +zfs_fid(struct inode *ip, fid_t *fidp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + + if (fidp->fid_len < SHORT_FID_LEN) { + fidp->fid_len = SHORT_FID_LEN; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOSPC)); + } + + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = SHORT_FID_LEN; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number 
to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_open); +EXPORT_SYMBOL(zfs_close); +EXPORT_SYMBOL(zfs_lookup); +EXPORT_SYMBOL(zfs_create); +EXPORT_SYMBOL(zfs_tmpfile); +EXPORT_SYMBOL(zfs_remove); +EXPORT_SYMBOL(zfs_mkdir); +EXPORT_SYMBOL(zfs_rmdir); +EXPORT_SYMBOL(zfs_readdir); +EXPORT_SYMBOL(zfs_getattr_fast); +EXPORT_SYMBOL(zfs_setattr); +EXPORT_SYMBOL(zfs_rename); +EXPORT_SYMBOL(zfs_symlink); +EXPORT_SYMBOL(zfs_readlink); +EXPORT_SYMBOL(zfs_link); +EXPORT_SYMBOL(zfs_inactive); +EXPORT_SYMBOL(zfs_space); +EXPORT_SYMBOL(zfs_fid); +EXPORT_SYMBOL(zfs_getpage); +EXPORT_SYMBOL(zfs_putpage); +EXPORT_SYMBOL(zfs_dirty_inode); +EXPORT_SYMBOL(zfs_map); + +/* BEGIN CSTYLED */ +module_param(zfs_delete_blocks, ulong, 0644); +MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); +/* END CSTYLED */ + +#endif diff --git a/module/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c similarity index 94% rename from module/zfs/zfs_znode.c rename to module/os/linux/zfs/zfs_znode.c index 203a599093..859c51baff 100644 --- a/module/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -55,7 +54,7 @@ #include #include #include -#include +#include #include #include #include @@ -90,7 +89,7 @@ int zfs_unlink_suspend_progress = 0; * called with the rangelock_t's rl_lock held, which avoids races. 
*/ static void -zfs_rangelock_cb(locked_range_t *new, void *arg) +zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; @@ -129,13 +128,12 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); - rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); + zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; - zp->z_moved = 0; return (0); } @@ -151,11 +149,11 @@ zfs_znode_cache_destructor(void *buf, void *arg) rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); - rangelock_fini(&zp->z_rangelock); + zfs_rangelock_fini(&zp->z_rangelock); - ASSERT(zp->z_dirlocks == NULL); - ASSERT(zp->z_acl_cached == NULL); - ASSERT(zp->z_xattr_cached == NULL); + ASSERT3P(zp->z_dirlocks, ==, NULL); + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); } static int @@ -219,7 +217,7 @@ zfs_znode_fini(void) * created or destroyed. This kind of locking would normally reside in the * znode itself but in this case that's impossible because the znode and SA * buffer may not yet exist. Therefore the locking is handled externally - * with an array of mutexs and AVLs trees which contain per-object locks. + * with an array of mutexes and AVLs trees which contain per-object locks. * * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted * in to the correct AVL tree and finally the per-object lock is held. 
In @@ -247,10 +245,10 @@ zfs_znode_hold_compare(const void *a, const void *b) const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; - return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj)); + return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } -boolean_t +static boolean_t __maybe_unused zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, search; @@ -321,6 +319,12 @@ zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) kmem_cache_free(znode_hold_cache, zh); } +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (dev); +} + static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) @@ -426,7 +430,7 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) case S_IFBLK: (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)); - /*FALLTHROUGH*/ + fallthrough; case S_IFIFO: case S_IFSOCK: init_special_inode(ip, ip->i_mode, rdev); @@ -446,7 +450,7 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) } } -void +static void zfs_set_inode_flags(znode_t *zp, struct inode *ip) { /* @@ -475,14 +479,10 @@ zfs_set_inode_flags(znode_t *zp, struct inode *ip) } /* - * Update the embedded inode given the znode. We should work toward - * eliminating this function as soon as possible by removing values - * which are duplicated between the znode and inode. If the generic - * inode has the correct field it should be used, and the ZFS code - * updated to access the inode. This can be done incrementally. + * Update the embedded inode given the znode. 
*/ void -zfs_inode_update(znode_t *zp) +zfs_znode_update_vfs(znode_t *zp) { zfsvfs_t *zfsvfs; struct inode *ip; @@ -500,6 +500,7 @@ zfs_inode_update(znode_t *zp) dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); + ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); @@ -515,7 +516,7 @@ zfs_inode_update(znode_t *zp) */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, - dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl) + dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; @@ -524,9 +525,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, uint64_t tmp_gen; uint64_t links; uint64_t z_uid, z_gid; - uint64_t atime[2], mtime[2], ctime[2]; + uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; - sa_bulk_attr_t bulk[11]; + sa_bulk_attr_t bulk[12]; int count = 0; ASSERT(zfsvfs != NULL); @@ -539,18 +540,18 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT(zp->z_dirlocks == NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); - zp->z_moved = 0; + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_FALSE; + zp->z_is_stale = B_FALSE; + zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; - zp->z_unlinked = 0; - zp->z_atime_dirty = 0; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - zp->z_is_mapped = B_FALSE; - zp->z_is_ctldir = B_FALSE; - zp->z_is_stale = B_FALSE; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); @@ -568,6 +569,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + 
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && @@ -595,9 +597,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); ZFS_TIME_DECODE(&ip->i_ctime, ctime); + ZFS_TIME_DECODE(&zp->z_btime, btime); - ip->i_ino = obj; - zfs_inode_update(zp); + ip->i_ino = zp->z_id; + zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* @@ -605,18 +608,24 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, * number is already hashed for this super block. This can never * happen because the inode numbers map 1:1 with the object numbers. * - * The one exception is rolling back a mounted file system, but in - * this case all the active inode are unhashed during the rollback. + * Exceptions include rolling back a mounted file system, either + * from the zfs rollback or zfs recv command. + * + * Active inodes are unhashed during the rollback, but since zrele + * can happen asynchronously, we can't guarantee they've been + * unhashed. This can cause hash collisions in unlinked drain + * processing so do not hash unlinked znodes. */ - VERIFY3S(insert_inode_locked(ip), ==, 0); + if (links > 0) + VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; - membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); - unlock_new_inode(ip); + if (links > 0) + unlock_new_inode(ip); return (zp); error: @@ -787,7 +796,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } /* - * No execs denied will be deterimed when zfs_mode_compute() is called. + * No execs denied will be determined when zfs_mode_compute() is called. 
*/ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| @@ -910,8 +919,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, * not fail retry until sufficient memory has been reclaimed. */ do { - *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj, - sa_hdl); + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); @@ -1094,6 +1102,10 @@ again: mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); /* + * If zp->z_unlinked is set, the znode is already marked + * for deletion and should not be discovered. Check this + * after checking igrab() due to fsetxattr() & O_TMPFILE. + * * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. @@ -1108,18 +1120,24 @@ again: * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { - mutex_exit(&zp->z_lock); - sa_buf_rele(db, NULL); - zfs_znode_hold_exit(zfsvfs, zh); + if (zp->z_unlinked) + err = SET_ERROR(ENOENT); + else + err = SET_ERROR(EAGAIN); + } else { + *zpp = zp; + err = 0; + } + + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + + if (err == EAGAIN) { /* inode might need this to finish evict */ cond_resched(); goto again; } - *zpp = zp; - err = 0; - mutex_exit(&zp->z_lock); - sa_buf_rele(db, NULL); - zfs_znode_hold_exit(zfsvfs, zh); return (err); } @@ -1134,7 +1152,7 @@ again: * bonus buffer. 
*/ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, - doi.doi_bonus_type, obj_num, NULL); + doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1153,12 +1171,12 @@ zfs_rezget(znode_t *zp) uint64_t obj_num = zp->z_id; uint64_t mode; uint64_t links; - sa_bulk_attr_t bulk[10]; + sa_bulk_attr_t bulk[11]; int err; int count = 0; uint64_t gen; uint64_t z_uid, z_gid; - uint64_t atime[2], mtime[2], ctime[2]; + uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; @@ -1228,6 +1246,7 @@ zfs_rezget(znode_t *zp) &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); @@ -1253,8 +1272,9 @@ zfs_rezget(znode_t *zp) ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); + ZFS_TIME_DECODE(&zp->z_btime, btime); - if (gen != ZTOI(zp)->i_generation) { + if ((uint32_t)gen != ZTOI(zp)->i_generation) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); @@ -1264,14 +1284,14 @@ zfs_rezget(znode_t *zp) zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; - zp->z_atime_dirty = 0; - zfs_inode_update(zp); + zp->z_atime_dirty = B_FALSE; + zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the - * stale data and to prevent automatical removal of the file in + * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. 
@@ -1468,20 +1488,20 @@ zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; - locked_range_t *lr; + zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); @@ -1511,7 +1531,7 @@ zfs_extend(znode_t *zp, uint64_t end) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (error); } @@ -1523,7 +1543,7 @@ zfs_extend(znode_t *zp, uint64_t end) VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); - rangelock_exit(lr); + zfs_rangelock_exit(lr); dmu_tx_commit(tx); @@ -1586,19 +1606,19 @@ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = ZTOZSB(zp); - locked_range_t *lr; + zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ - lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (0); } @@ -1648,7 +1668,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) page_len); } } - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (error); } @@ -1666,7 +1686,7 @@ zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; - locked_range_t *lr; + zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; @@ -1674,20 +1694,20 @@ zfs_trunc(znode_t *zp, uint64_t end) /* * We will change zp_size, lock the whole file. 
*/ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); @@ -1697,7 +1717,7 @@ zfs_trunc(znode_t *zp, uint64_t end) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (error); } @@ -1713,7 +1733,7 @@ zfs_trunc(znode_t *zp, uint64_t end) VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); - rangelock_exit(lr); + zfs_rangelock_exit(lr); return (0); } @@ -1783,7 +1803,7 @@ log: dmu_tx_commit(tx); - zfs_inode_update(zp); + zfs_znode_update_vfs(zp); error = 0; out: @@ -1884,9 +1904,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); - rootzp->z_moved = 0; - rootzp->z_unlinked = 0; - rootzp->z_atime_dirty = 0; + rootzp->z_unlinked = B_FALSE; + rootzp->z_atime_dirty = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; @@ -1998,7 +2017,7 @@ zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, return (0); } -void +static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) { sa_handle_destroy(hdl); @@ -2115,8 +2134,10 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, size_t complen; int is_xattrdir = 0; - if (prevdb) + if (prevdb) { + ASSERT(prevhdl != NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); + } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) diff --git a/module/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c similarity index 97% rename from 
module/zfs/zio_crypt.c rename to module/os/linux/zfs/zio_crypt.c index eb781b64fa..52e62f4d1d 100644 --- a/module/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -26,7 +26,7 @@ #include #include #include -#include "qat.h" +#include /* * This file is responsible for handling all of the details of generating @@ -115,7 +115,7 @@ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing - * which which pieces of the block need to be encrypted. For more details about + * which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: @@ -190,7 +190,7 @@ unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ - uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; @@ -369,14 +369,14 @@ error: /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to - * reference the cphertext. cuio must have enough space for the + * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. 
*/ static int zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, - uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) + zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len) { int ret; crypto_data_t plaindata, cipherdata; @@ -479,7 +479,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; - uio_t puio, cuio; + zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint64_t crypt = key->zk_crypt; @@ -495,7 +495,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, if (ret != 0) goto error; - /* initialize uio_ts */ + /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; @@ -549,12 +549,12 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { - int ret; crypto_mechanism_t mech; - uio_t puio, cuio; + zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; + int ret; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); @@ -563,7 +563,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, keydata_len = zio_crypt_table[crypt].ci_keylen; - /* initialize uio_ts */ + /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; @@ -934,7 +934,7 @@ zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) /* * At L0 we want to verify these fields to ensure that data blocks - * can not be reinterpretted. 
For instance, we do not want an attacker + * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information @@ -1045,17 +1045,23 @@ zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; - dnode_phys_t *adnp; + dnode_phys_t *adnp, tmp_dncore; + size_t dn_core_size = offsetof(dnode_phys_t, dn_blkptr); boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); crypto_data_t cd; - uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; - /* authenticate the core dnode (masking out non-portable bits) */ - bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); - adnp = (dnode_phys_t *)tmp_dncore; + /* + * Authenticate the core dnode (masking out non-portable bits). + * We only copy the first 64 bytes we operate on to avoid the overhead + * of copying 512-64 unneeded bytes. The compiler seems to be fine + * with that. 
+ */ + bcopy(dnp, &tmp_dncore, dn_core_size); + adnp = &tmp_dncore; + if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); @@ -1065,7 +1071,7 @@ zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; - cd.cd_length = sizeof (tmp_dncore); + cd.cd_length = dn_core_size; cd.cd_raw.iov_base = (char *)adnp; cd.cd_raw.iov_len = cd.cd_length; @@ -1283,7 +1289,7 @@ error: } static void -zio_crypt_destroy_uio(uio_t *uio) +zio_crypt_destroy_uio(zfs_uio_t *uio) { if (uio->uio_iov) kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); @@ -1373,8 +1379,8 @@ zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, - uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, + zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; @@ -1399,6 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, nr_src = 1; nr_dst = 0; } + bzero(dst, datalen); /* find the start and end record of the log block */ zilc = (zil_chain_t *)src; @@ -1568,7 +1575,7 @@ error: static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; @@ -1751,7 +1758,7 @@ error: static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio, + uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, 
uint_t *enc_len) { int ret; @@ -1811,8 +1818,8 @@ error: static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, - uint_t *auth_len, boolean_t *no_crypt) + uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, + uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; @@ -1871,7 +1878,7 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; - uio_t puio, cuio; + zfs_uio_t puio, cuio; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; crypto_ctx_template_t tmpl; @@ -1937,8 +1944,8 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, /* If the hardware implementation fails fall back to software */ } - bzero(&puio, sizeof (uio_t)); - bzero(&cuio, sizeof (uio_t)); + bzero(&puio, sizeof (zfs_uio_t)); + bzero(&cuio, sizeof (zfs_uio_t)); /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, diff --git a/module/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c similarity index 85% rename from module/zfs/zpl_ctldir.c rename to module/os/linux/zfs/zpl_ctldir.c index 6df367b817..9b526afd00 100644 --- a/module/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -27,9 +27,9 @@ * Brian Behlendorf */ +#include #include #include -#include #include #include @@ -55,7 +55,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (!zpl_dir_emit_dots(filp, ctx)) goto out; @@ -76,7 +76,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) ctx->pos++; } out: - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (error); } @@ -101,12 +101,22 @@ 
zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) */ /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_root_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_root_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, ip, stat); +#else generic_fillattr(ip, stat); +#endif stat->atime = current_time(ip); return (0); @@ -114,11 +124,7 @@ zpl_root_getattr_impl(const struct path *path, struct kstat *stat, ZPL_GETATTR_WRAPPER(zpl_root_getattr); static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) -#else zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) -#endif { cred_t *cr = CRED(); struct inode *ip; @@ -160,7 +166,6 @@ const struct inode_operations zpl_ops_root = { .getattr = zpl_root_getattr, }; -#ifdef HAVE_AUTOMOUNT static struct vfsmount * zpl_snapdir_automount(struct path *path) { @@ -179,7 +184,6 @@ zpl_snapdir_automount(struct path *path) */ return (NULL); } -#endif /* HAVE_AUTOMOUNT */ /* * Negative dentries must always be revalidated so newly created snapshots @@ -206,21 +210,13 @@ dentry_operations_t zpl_dops_snapdirs = { * name space. While it might be possible to add compatibility * code to accomplish this it would require considerable care. 
*/ -#ifdef HAVE_AUTOMOUNT .d_automount = zpl_snapdir_automount, -#endif /* HAVE_AUTOMOUNT */ .d_revalidate = zpl_snapdir_revalidate, }; static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, - struct nameidata *nd) -#else zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) -#endif - { fstrans_cookie_t cookie; cred_t *cr = CRED(); @@ -241,9 +237,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, ASSERT(error == 0 || ip == NULL); d_clear_d_op(dentry); d_set_d_op(dentry, &zpl_dops_snapdirs); -#ifdef HAVE_AUTOMOUNT dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; -#endif return (d_splice_alias(ip, dentry)); } @@ -258,13 +252,14 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) uint64_t id, pos; int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (!zpl_dir_emit_dots(filp, ctx)) goto out; - pos = ctx->pos; + /* Start the position at 0 if it already emitted . and .. */ + pos = (ctx->pos == 2 ? 
0 : ctx->pos); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, @@ -281,7 +276,7 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) } out: spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); if (error == -ENOENT) return (0); @@ -305,8 +300,14 @@ zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ static int +#ifdef HAVE_IOPS_RENAME_USERNS +zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) +#else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#endif { cred_t *cr = CRED(); int error; @@ -324,7 +325,7 @@ zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, return (error); } -#ifndef HAVE_RENAME_WANTS_FLAGS +#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -348,7 +349,12 @@ zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) } static int -zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) +#ifdef HAVE_IOPS_MKDIR_USERNS +zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, + struct dentry *dentry, umode_t mode) +#else +zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +#endif { cred_t *cr = CRED(); vattr_t *vap; @@ -378,19 +384,29 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) */ /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_snapdir_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else 
zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, ip, stat); +#else generic_fillattr(ip, stat); +#endif stat->nlink = stat->size = 2; stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (0); } @@ -423,7 +439,7 @@ const struct file_operations zpl_fops_snapdir = { const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, -#ifdef HAVE_RENAME_WANTS_FLAGS +#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, @@ -433,13 +449,8 @@ const struct inode_operations zpl_ops_snapdir = { }; static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_shares_lookup(struct inode *dip, struct dentry *dentry, - struct nameidata *nd) -#else zpl_shares_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) -#endif { fstrans_cookie_t cookie; cred_t *cr = CRED(); @@ -473,7 +484,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) znode_t *dzp; int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { @@ -492,7 +503,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); @@ -515,31 +526,45 @@ zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_shares_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + 
unsigned int query_flags) +#else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *dzp; int error; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, path->dentry->d_inode, stat); +#else generic_fillattr(path->dentry->d_inode, stat); +#endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (0); } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { - error = -zfs_getattr_fast(ZTOI(dzp), stat); +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); +#else + error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); +#endif iput(ZTOI(dzp)); } - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); diff --git a/module/zfs/zpl_export.c b/module/os/linux/zfs/zpl_export.c similarity index 81% rename from module/zfs/zpl_export.c rename to module/os/linux/zfs/zpl_export.c index a264d664cb..5be63532d3 100644 --- a/module/zfs/zpl_export.c +++ b/module/os/linux/zfs/zpl_export.c @@ -24,8 +24,8 @@ */ -#include #include +#include #include #include @@ -41,15 +41,19 @@ zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) struct inode *ip = dentry->d_inode; #endif /* HAVE_ENCODE_FH_WITH_INODE */ fstrans_cookie_t cookie; - fid_t *fid = (fid_t *)fh; + ushort_t empty_fid = 0; + fid_t *fid; int len_bytes, rc; len_bytes = *max_len * sizeof (__u32); - if (len_bytes < offsetof(fid_t, fid_data)) - return (255); + if (len_bytes < offsetof(fid_t, fid_data)) { + fid = (fid_t *)&empty_fid; + } else { + fid = (fid_t *)fh; + fid->fid_len = len_bytes - offsetof(fid_t, fid_data); + } - fid->fid_len = 
len_bytes - offsetof(fid_t, fid_data); cookie = spl_fstrans_mark(); if (zfsctl_is_node(ip)) @@ -64,25 +68,6 @@ zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) return (rc == 0 ? FILEID_INO32_GEN : 255); } -static struct dentry * -zpl_dentry_obtain_alias(struct inode *ip) -{ - struct dentry *result; - -#ifdef HAVE_D_OBTAIN_ALIAS - result = d_obtain_alias(ip); -#else - result = d_alloc_anon(ip); - - if (result == NULL) { - iput(ip); - result = ERR_PTR(-ENOMEM); - } -#endif /* HAVE_D_OBTAIN_ALIAS */ - - return (result); -} - static struct dentry * zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) @@ -121,7 +106,7 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, ASSERT((ip != NULL) && !IS_ERR(ip)); - return (zpl_dentry_obtain_alias(ip)); + return (d_obtain_alias(ip)); } static struct dentry * @@ -129,12 +114,12 @@ zpl_get_parent(struct dentry *child) { cred_t *cr = CRED(); fstrans_cookie_t cookie; - struct inode *ip; + znode_t *zp; int error; crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_lookup(child->d_inode, "..", &ip, 0, cr, NULL, NULL); + error = -zfs_lookup(ITOZ(child->d_inode), "..", &zp, 0, cr, NULL, NULL); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); @@ -142,10 +127,9 @@ zpl_get_parent(struct dentry *child) if (error) return (ERR_PTR(error)); - return (zpl_dentry_obtain_alias(ip)); + return (d_obtain_alias(ZTOI(zp))); } -#ifdef HAVE_COMMIT_METADATA static int zpl_commit_metadata(struct inode *inode) { @@ -158,20 +142,17 @@ zpl_commit_metadata(struct inode *inode) crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_fsync(inode, 0, cr); + error = -zfs_fsync(ITOZ(inode), 0, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } -#endif /* HAVE_COMMIT_METADATA */ const struct export_operations zpl_export_operations = { .encode_fh = zpl_encode_fh, .fh_to_dentry = zpl_fh_to_dentry, .get_parent = zpl_get_parent, -#ifdef 
HAVE_COMMIT_METADATA .commit_metadata = zpl_commit_metadata, -#endif /* HAVE_COMMIT_METADATA */ }; diff --git a/module/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c similarity index 74% rename from module/zfs/zpl_file.c rename to module/os/linux/zfs/zpl_file.c index acad4670d1..63002fe3b9 100644 --- a/module/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -29,11 +29,19 @@ #endif #include #include +#include #include #include -#include #include +#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS +#include +#endif +/* + * When using fallocate(2) to preallocate space, inflate the requested + * capacity check by 10% to account for the required metadata blocks. + */ +unsigned int zfs_fallocate_reserve_percent = 110; static int zpl_open(struct inode *ip, struct file *filp) @@ -108,40 +116,7 @@ zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ -#if defined(HAVE_FSYNC_WITH_DENTRY) -/* - * Linux 2.6.x - 2.6.34 API, - * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *' - * to the fops->fsync() hook. For this reason, we must be careful not to - * use filp unconditionally. 
- */ -static int -zpl_fsync(struct file *filp, struct dentry *dentry, int datasync) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_fsync(dentry->d_inode, datasync, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#ifdef HAVE_FILE_AIO_FSYNC -static int -zpl_aio_fsync(struct kiocb *kiocb, int datasync) -{ - struct file *filp = kiocb->ki_filp; - return (zpl_fsync(filp, file_dentry(filp), datasync)); -} -#endif - -#elif defined(HAVE_FSYNC_WITHOUT_DENTRY) +#if defined(HAVE_FSYNC_WITHOUT_DENTRY) /* * Linux 2.6.35 - 3.0 API, * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed @@ -158,7 +133,7 @@ zpl_fsync(struct file *filp, int datasync) crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_fsync(inode, datasync, cr); + error = -zfs_fsync(ITOZ(inode), datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); @@ -176,7 +151,7 @@ zpl_aio_fsync(struct kiocb *kiocb, int datasync) #elif defined(HAVE_FSYNC_RANGE) /* - * Linux 3.1 - 3.x API, + * Linux 3.1 API, * As of 3.1 the responsibility to call filemap_write_and_wait_range() has * been pushed down in to the .fsync() vfs hook. 
Additionally, the i_mutex * lock is no longer held by the caller, for zfs we don't require the lock @@ -196,7 +171,7 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_fsync(inode, datasync, cr); + error = -zfs_fsync(ITOZ(inode), datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); @@ -223,261 +198,235 @@ zfs_io_flags(struct kiocb *kiocb) #if defined(IOCB_DSYNC) if (kiocb->ki_flags & IOCB_DSYNC) - flags |= FDSYNC; + flags |= O_DSYNC; #endif #if defined(IOCB_SYNC) if (kiocb->ki_flags & IOCB_SYNC) - flags |= FSYNC; + flags |= O_SYNC; #endif #if defined(IOCB_APPEND) if (kiocb->ki_flags & IOCB_APPEND) - flags |= FAPPEND; + flags |= O_APPEND; #endif #if defined(IOCB_DIRECT) if (kiocb->ki_flags & IOCB_DIRECT) - flags |= FDIRECT; + flags |= O_DIRECT; #endif return (flags); } -static ssize_t -zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, - unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr, size_t skip) +/* + * If relatime is enabled, call file_accessed() if zfs_relatime_need_update() + * is true. This is needed since datasets with inherited "relatime" property + * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after + * `zfs set relatime=...`), which is what relatime test in VFS by + * relatime_need_update() is based on. 
+ */ +static inline void +zpl_file_accessed(struct file *filp) { - ssize_t read; - uio_t uio = { { 0 }, 0 }; - int error; - fstrans_cookie_t cookie; - - uio.uio_iov = iovp; - uio.uio_iovcnt = nr_segs; - uio.uio_loffset = *ppos; - uio.uio_segflg = segment; - uio.uio_limit = MAXOFFSET_T; - uio.uio_resid = count; - uio.uio_skip = skip; - - cookie = spl_fstrans_mark(); - error = -zfs_read(ip, &uio, flags, cr); - spl_fstrans_unmark(cookie); - if (error < 0) - return (error); - - read = count - uio.uio_resid; - *ppos += read; - - return (read); -} - -inline ssize_t -zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, - uio_seg_t segment, int flags, cred_t *cr) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment, - flags, cr, 0)); -} - -static ssize_t -zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; - zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip)); - ssize_t read; - unsigned int f_flags = filp->f_flags; - f_flags |= zfs_io_flags(kiocb); - crhold(cr); - read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count, - nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); - crfree(cr); - - /* - * If relatime is enabled, call file_accessed() only if - * zfs_relatime_need_update() is true. This is needed since datasets - * with inherited "relatime" property aren't necessarily mounted with - * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what - * relatime test in VFS by relatime_need_update() is based on. 
- */ - if (!IS_NOATIME(ip) && zfsvfs->z_relatime) { + if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) { if (zfs_relatime_need_update(ip)) file_accessed(filp); } else { file_accessed(filp); } +} + +#if defined(HAVE_VFS_RW_ITERATE) + +/* + * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports + * iovecs, kvevs, bvecs and pipes, plus all the required interfaces to + * manipulate the iov_iter are available. In which case the full iov_iter + * can be attached to the uio and correctly handled in the lower layers. + * Otherwise, for older kernels extract the iovec and pass it instead. + */ +static void +zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, + loff_t pos, ssize_t count, size_t skip) +{ +#if defined(HAVE_VFS_IOV_ITER) + zfs_uio_iov_iter_init(uio, to, pos, count, skip); +#else + zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, + to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, + count, skip); +#endif +} + +static ssize_t +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + ssize_t count = iov_iter_count(to); + zfs_uio_t uio; + + zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); return (read); } -#if defined(HAVE_VFS_RW_ITERATE) -static ssize_t -zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +static inline ssize_t +zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, + size_t *countp) { - ssize_t ret; - uio_seg_t seg = UIO_USERSPACE; - if (to->type & ITER_KVEC) - seg = UIO_SYSSPACE; - if (to->type & ITER_BVEC) - seg = UIO_BVEC; - ret = zpl_iter_read_common(kiocb, to->iov, 
to->nr_segs, - iov_iter_count(to), seg, to->iov_offset); - if (ret > 0) - iov_iter_advance(to, ret); - return (ret); -} -#else -static ssize_t -zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, loff_t pos) -{ - ssize_t ret; - size_t count; - - ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE); - if (ret) +#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB + ssize_t ret = generic_write_checks(kiocb, from); + if (ret <= 0) return (ret); - return (zpl_iter_read_common(kiocb, iovp, nr_segs, count, - UIO_USERSPACE, 0)); -} -#endif /* HAVE_VFS_RW_ITERATE */ + *countp = ret; +#else + struct file *file = kiocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *ip = mapping->host; + int isblk = S_ISBLK(ip->i_mode); -static ssize_t -zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, - unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr, size_t skip) -{ - ssize_t wrote; - uio_t uio = { { 0 }, 0 }; - int error; - fstrans_cookie_t cookie; + *countp = iov_iter_count(from); + ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk); + if (ret) + return (ret); +#endif - if (flags & O_APPEND) - *ppos = i_size_read(ip); - - uio.uio_iov = iovp; - uio.uio_iovcnt = nr_segs; - uio.uio_loffset = *ppos; - uio.uio_segflg = segment; - uio.uio_limit = MAXOFFSET_T; - uio.uio_resid = count; - uio.uio_skip = skip; - - cookie = spl_fstrans_mark(); - error = -zfs_write(ip, &uio, flags, cr); - spl_fstrans_unmark(cookie); - if (error < 0) - return (error); - - wrote = count - uio.uio_resid; - *ppos += wrote; - - return (wrote); + return (0); } -inline ssize_t -zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, - uio_seg_t segment, int flags, cred_t *cr) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment, - flags, cr, 0)); -} - -static ssize_t 
-zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - ssize_t wrote; - unsigned int f_flags = filp->f_flags; - - f_flags |= zfs_io_flags(kiocb); - crhold(cr); - wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count, - nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); - crfree(cr); - - return (wrote); -} - -#if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { - size_t count; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + zfs_uio_t uio; + size_t count = 0; ssize_t ret; - uio_seg_t seg = UIO_USERSPACE; -#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB - struct file *file = kiocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *ip = mapping->host; - int isblk = S_ISBLK(ip->i_mode); - - count = iov_iter_count(from); - ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); + ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); -#else - /* - * XXX - ideally this check should be in the same lock region with - * write operations, so that there's no TOCTTOU race when doing - * append and someone else grow the file. 
- */ - ret = generic_write_checks(kiocb, from); - if (ret <= 0) - return (ret); - count = ret; -#endif - if (from->type & ITER_KVEC) - seg = UIO_SYSSPACE; - if (from->type & ITER_BVEC) - seg = UIO_BVEC; + zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, - count, seg, from->iov_offset); - if (ret > 0) - iov_iter_advance(from, ret); + crhold(cr); + cookie = spl_fstrans_mark(); - return (ret); + int error = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } -#else + +#else /* !HAVE_VFS_RW_ITERATE */ + static ssize_t -zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp, +zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = kiocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *ip = mapping->host; - int isblk = S_ISBLK(ip->i_mode); + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; size_t count; ssize_t ret; - ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ); + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); - ret = generic_write_checks(file, &pos, &count, isblk); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); + + return (read); +} + +static ssize_t +zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, + unsigned 
long nr_segs, loff_t pos) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + size_t count; + ssize_t ret; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); - return (zpl_iter_write_common(kiocb, iovp, nr_segs, count, - UIO_USERSPACE, 0)); + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); + if (ret) + return (ret); + + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } #endif /* HAVE_VFS_RW_ITERATE */ @@ -514,13 +463,26 @@ zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) #error "Unknown direct IO interface" #endif -#else +#else /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_DIRECT_IO_IOVEC) static ssize_t -zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp, +zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { + if (rw == WRITE) + return (zpl_aio_write(kiocb, iov, nr_segs, pos)); + else + return (zpl_aio_read(kiocb, iov, nr_segs, pos)); +} +#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) +static ssize_t +zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) +{ + const struct iovec *iovp = iov_iter_iovec(iter); + unsigned long nr_segs = iter->nr_segs; + + ASSERT3S(pos, ==, kiocb->ki_pos); if (rw == WRITE) return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); else @@ -545,7 +507,7 @@ zpl_llseek(struct file *filp, loff_t offset, int whence) spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); - error = -zfs_holey(ip, whence, &offset); + error = 
-zfs_holey(ITOZ(ip), whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); @@ -631,13 +593,9 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). - * - * Current this function relies on zpl_read_common() and the O_DIRECT - * flag to read in a page. This works but the more correct way is to - * update zfs_fillpage() to be Linux friendly and use that interface. */ -static int -zpl_readpage(struct file *filp, struct page *pp) +static inline int +zpl_readpage_common(struct page *pp) { struct inode *ip; struct page *pl[1]; @@ -665,6 +623,18 @@ zpl_readpage(struct file *filp, struct page *pp) return (error); } +static int +zpl_readpage(struct file *filp, struct page *pp) +{ + return (zpl_readpage_common(pp)); +} + +static int +zpl_readpage_filler(void *data, struct page *pp) +{ + return (zpl_readpage_common(pp)); +} + /* * Populate a set of pages with data for the Linux page cache. 
This * function will only be called for read ahead and never for demand @@ -675,11 +645,10 @@ static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return (read_cache_pages(mapping, pages, - (filler_t *)zpl_readpage, filp)); + return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL)); } -int +static int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; @@ -703,10 +672,10 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) enum writeback_sync_modes sync_mode; int result; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); sync_mode = wbc->sync_mode; /* @@ -719,11 +688,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); if (sync_mode != wbc->sync_mode) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); /* * We need to call write_cache_pages() again (we can't just @@ -754,24 +723,24 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) } /* - * The only flag combination which matches the behavior of zfs_space() - * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE + * The flag combination which matches the behavior of zfs_space() is + * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE * flag was introduced in the 2.6.38 kernel. + * + * The original mode=0 (allocate space) behavior can be reasonably emulated + * by checking if enough space exists and creating a sparse file, as real + * persistent space reservation is not possible due to COW, snapshots, etc. 
*/ -#if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) -long +static long zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) { - int error = -EOPNOTSUPP; - -#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) cred_t *cr = CRED(); - flock64_t bf; loff_t olen; fstrans_cookie_t cookie; + int error = 0; - if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return (error); + if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0) + return (-EOPNOTSUPP); if (offset < 0 || len <= 0) return (-EINVAL); @@ -779,40 +748,68 @@ zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) spl_inode_lock(ip); olen = i_size_read(ip); - if (offset > olen) { - spl_inode_unlock(ip); - return (0); - } - if (offset + len > olen) - len = olen - offset; - bf.l_type = F_WRLCK; - bf.l_whence = SEEK_SET; - bf.l_start = offset; - bf.l_len = len; - bf.l_pid = 0; - crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr); + if (mode & FALLOC_FL_PUNCH_HOLE) { + flock64_t bf; + + if (offset > olen) + goto out_unmark; + + if (offset + len > olen) + len = olen - offset; + bf.l_type = F_WRLCK; + bf.l_whence = SEEK_SET; + bf.l_start = offset; + bf.l_len = len; + bf.l_pid = 0; + + error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr); + } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) { + unsigned int percent = zfs_fallocate_reserve_percent; + struct kstatfs statfs; + + /* Legacy mode, disable fallocate compatibility. */ + if (percent == 0) { + error = -EOPNOTSUPP; + goto out_unmark; + } + + /* + * Use zfs_statvfs() instead of dmu_objset_space() since it + * also checks project quota limits, which are relevant here. + */ + error = zfs_statvfs(ip, &statfs); + if (error) + goto out_unmark; + + /* + * Shrink available space a bit to account for overhead/races. 
+ * We know the product previously fit into availbytes from + * dmu_objset_space(), so the smaller product will also fit. + */ + if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) { + error = -ENOSPC; + goto out_unmark; + } + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen) + error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE); + } +out_unmark: spl_fstrans_unmark(cookie); spl_inode_unlock(ip); crfree(cr); -#endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */ - ASSERT3S(error, <=, 0); return (error); } -#endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */ -#ifdef HAVE_FILE_FALLOCATE static long zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) { return zpl_fallocate_common(file_inode(filp), mode, offset, len); } -#endif /* HAVE_FILE_FALLOCATE */ #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) @@ -881,9 +878,9 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) - return (-EACCES); + return (-EPERM); - if (!zpl_inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EACCES); xva_init(xva); @@ -927,7 +924,7 @@ zpl_ioctl_setflags(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); spl_fstrans_unmark(cookie); crfree(cr); @@ -975,7 +972,7 @@ zpl_ioctl_setxattr(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); spl_fstrans_unmark(cookie); crfree(cr); @@ -1024,6 +1021,9 @@ const struct 
address_space_operations zpl_address_space_operations = { .writepage = zpl_writepage, .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, +#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS + .set_page_dirty = __set_page_dirty_nobuffers, +#endif }; const struct file_operations zpl_file_operations = { @@ -1037,6 +1037,10 @@ const struct file_operations zpl_file_operations = { #endif .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, +#ifdef HAVE_VFS_IOV_ITER + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +#endif #else .read = do_sync_read, .write = do_sync_write, @@ -1048,9 +1052,7 @@ const struct file_operations zpl_file_operations = { #ifdef HAVE_FILE_AIO_FSYNC .aio_fsync = zpl_aio_fsync, #endif -#ifdef HAVE_FILE_FALLOCATE .fallocate = zpl_fallocate, -#endif /* HAVE_FILE_FALLOCATE */ .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, @@ -1073,3 +1075,9 @@ const struct file_operations zpl_dir_file_operations = { .compat_ioctl = zpl_compat_ioctl, #endif }; + +/* BEGIN CSTYLED */ +module_param(zfs_fallocate_reserve_percent, uint, 0644); +MODULE_PARM_DESC(zfs_fallocate_reserve_percent, + "Percentage of length to use for the available capacity check"); +/* END CSTYLED */ diff --git a/module/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c similarity index 73% rename from module/zfs/zpl_inode.c rename to module/os/linux/zfs/zpl_inode.c index 3f3b2e2dc5..24a8b036bf 100644 --- a/module/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -35,14 +35,11 @@ static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -#else zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -#endif { cred_t *cr = CRED(); struct inode *ip; + znode_t *zp; int error; fstrans_cookie_t cookie; pathname_t *ppn = NULL; @@ -63,16 +60,14 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) ppn = &pn; } - error = 
-zfs_lookup(dir, dname(dentry), &ip, zfs_flags, cr, NULL, ppn); + error = -zfs_lookup(ITOZ(dir), dname(dentry), &zp, + zfs_flags, cr, NULL, ppn); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); crfree(cr); spin_lock(&dentry->d_lock); dentry->d_time = jiffies; -#ifndef HAVE_S_D_OP - d_set_d_op(dentry, &zpl_dentry_operations); -#endif /* HAVE_S_D_OP */ spin_unlock(&dentry->d_lock); if (error) { @@ -92,6 +87,7 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) else return (ERR_PTR(error)); } + ip = ZTOI(zp); /* * If we are case insensitive, call the correct function @@ -116,7 +112,7 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } void -zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr) +zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; @@ -132,16 +128,15 @@ zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr) } static int -#ifdef HAVE_CREATE_NAMEIDATA -zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - struct nameidata *nd) +#ifdef HAVE_IOPS_CREATE_USERNS +zpl_create(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool flag) #else -zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - bool flag) +zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) #endif { cred_t *cr = CRED(); - struct inode *ip; + znode_t *zp; vattr_t *vap; int error; fstrans_cookie_t cookie; @@ -151,16 +146,20 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, zpl_vap_init(vap, dir, mode, cr); cookie = spl_fstrans_mark(); - error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); + error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, + mode, &zp, cr, 0, NULL); if (error == 0) { - d_instantiate(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + error = 
zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) - error = zpl_init_acl(ip, dir); + error = zpl_init_acl(ZTOI(zp), dir); - if (error) - (void) zfs_remove(dir, dname(dentry), cr, 0); + if (error) { + (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); + remove_inode_hash(ZTOI(zp)); + iput(ZTOI(zp)); + } else { + d_instantiate(dentry, ZTOI(zp)); + } } spl_fstrans_unmark(cookie); @@ -172,11 +171,16 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, } static int -zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, +#ifdef HAVE_IOPS_MKNOD_USERNS +zpl_mknod(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, +#else +zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, +#endif dev_t rdev) { cred_t *cr = CRED(); - struct inode *ip; + znode_t *zp; vattr_t *vap; int error; fstrans_cookie_t cookie; @@ -194,16 +198,20 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, vap->va_rdev = rdev; cookie = spl_fstrans_mark(); - error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); + error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, + mode, &zp, cr, 0, NULL); if (error == 0) { - d_instantiate(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) - error = zpl_init_acl(ip, dir); + error = zpl_init_acl(ZTOI(zp), dir); - if (error) - (void) zfs_remove(dir, dname(dentry), cr, 0); + if (error) { + (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); + remove_inode_hash(ZTOI(zp)); + iput(ZTOI(zp)); + } else { + d_instantiate(dentry, ZTOI(zp)); + } } spl_fstrans_unmark(cookie); @@ -216,7 +224,12 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, #ifdef HAVE_TMPFILE static int -zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) +#ifdef HAVE_TMPFILE_USERNS +zpl_tmpfile(struct user_namespace 
*userns, struct inode *dir, + struct dentry *dentry, umode_t mode) +#else +zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +#endif { cred_t *cr = CRED(); struct inode *ip; @@ -226,6 +239,12 @@ zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + /* + * The VFS does not apply the umask, therefore it is applied here + * when POSIX ACLs are not enabled. + */ + if (!IS_POSIXACL(dir)) + mode &= ~current_umask(); zpl_vap_init(vap, dir, mode, cr); cookie = spl_fstrans_mark(); @@ -263,7 +282,7 @@ zpl_unlink(struct inode *dir, struct dentry *dentry) crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_remove(dir, dname(dentry), cr, 0); + error = -zfs_remove(ITOZ(dir), dname(dentry), cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the @@ -280,11 +299,16 @@ zpl_unlink(struct inode *dir, struct dentry *dentry) } static int -zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) +#ifdef HAVE_IOPS_MKDIR_USERNS +zpl_mkdir(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode) +#else +zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +#endif { cred_t *cr = CRED(); vattr_t *vap; - struct inode *ip; + znode_t *zp; int error; fstrans_cookie_t cookie; @@ -293,16 +317,19 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) zpl_vap_init(vap, dir, mode | S_IFDIR, cr); cookie = spl_fstrans_mark(); - error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); + error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL); if (error == 0) { - d_instantiate(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) - error = zpl_init_acl(ip, dir); + error = zpl_init_acl(ZTOI(zp), dir); - if (error) - (void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0); + if (error) { + (void) 
zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); + remove_inode_hash(ZTOI(zp)); + iput(ZTOI(zp)); + } else { + d_instantiate(dentry, ZTOI(zp)); + } } spl_fstrans_unmark(cookie); @@ -323,7 +350,7 @@ zpl_rmdir(struct inode *dir, struct dentry *dentry) crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_rmdir(dir, dname(dentry), NULL, cr, 0); + error = -zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the @@ -340,19 +367,57 @@ zpl_rmdir(struct inode *dir, struct dentry *dentry) } static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { int error; fstrans_cookie_t cookie; + struct inode *ip = path->dentry->d_inode; + znode_t *zp __maybe_unused = ITOZ(ip); cookie = spl_fstrans_mark(); /* - * XXX request_mask and query_flags currently ignored. + * XXX query_flags currently ignored. 
*/ - error = -zfs_getattr_fast(path->dentry->d_inode, stat); +#ifdef HAVE_USERNS_IOPS_GETATTR + error = -zfs_getattr_fast(user_ns, ip, stat); +#else + error = -zfs_getattr_fast(kcred->user_ns, ip, stat); +#endif + +#ifdef STATX_BTIME + if (request_mask & STATX_BTIME) { + stat->btime = zp->z_btime; + stat->result_mask |= STATX_BTIME; + } +#endif + +#ifdef STATX_ATTR_IMMUTABLE + if (zp->z_pflags & ZFS_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= STATX_ATTR_IMMUTABLE; +#endif + +#ifdef STATX_ATTR_APPEND + if (zp->z_pflags & ZFS_APPENDONLY) + stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= STATX_ATTR_APPEND; +#endif + +#ifdef STATX_ATTR_NODUMP + if (zp->z_pflags & ZFS_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= STATX_ATTR_NODUMP; +#endif + spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); @@ -361,7 +426,12 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, ZPL_GETATTR_WRAPPER(zpl_getattr); static int +#ifdef HAVE_SETATTR_PREPARE_USERNS +zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry, + struct iattr *ia) +#else zpl_setattr(struct dentry *dentry, struct iattr *ia) +#endif { struct inode *ip = dentry->d_inode; cred_t *cr = CRED(); @@ -369,7 +439,7 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) int error; fstrans_cookie_t cookie; - error = setattr_prepare(dentry, ia); + error = zpl_setattr_prepare(kcred->user_ns, dentry, ia); if (error) return (error); @@ -384,13 +454,11 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) vap->va_mtime = ia->ia_mtime; vap->va_ctime = ia->ia_ctime; - if (vap->va_mask & ATTR_ATIME) { - ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime, - ip->i_sb->s_time_gran); - } + if (vap->va_mask & ATTR_ATIME) + ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); cookie = spl_fstrans_mark(); - error = -zfs_setattr(ip, vap, 0, cr); + error = -zfs_setattr(ITOZ(ip), vap, 0, cr); if (!error && 
(ia->ia_valid & ATTR_MODE)) error = zpl_chmod_acl(ip); @@ -403,8 +471,14 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) } static int +#ifdef HAVE_IOPS_RENAME_USERNS +zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) +#else zpl_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#endif { cred_t *cr = CRED(); int error; @@ -416,7 +490,8 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry, crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); + error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), + dname(tdentry), cr, 0); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); @@ -424,7 +499,7 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry, return (error); } -#ifndef HAVE_RENAME_WANTS_FLAGS +#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -434,11 +509,16 @@ zpl_rename(struct inode *sdip, struct dentry *sdentry, #endif static int +#ifdef HAVE_IOPS_SYMLINK_USERNS +zpl_symlink(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, const char *name) +#else zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) +#endif { cred_t *cr = CRED(); vattr_t *vap; - struct inode *ip; + znode_t *zp; int error; fstrans_cookie_t cookie; @@ -447,13 +527,17 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); cookie = spl_fstrans_mark(); - error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); + error = -zfs_symlink(ITOZ(dir), dname(dentry), vap, + (char *)name, &zp, cr, 0); if (error == 0) { - d_instantiate(dentry, ip); - - error = 
zpl_xattr_security_init(ip, dir, &dentry->d_name); - if (error) - (void) zfs_remove(dir, dname(dentry), cr, 0); + error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); + if (error) { + (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); + remove_inode_hash(ZTOI(zp)); + iput(ZTOI(zp)); + } else { + d_instantiate(dentry, ZTOI(zp)); + } } spl_fstrans_unmark(cookie); @@ -492,19 +576,17 @@ zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) { fstrans_cookie_t cookie; cred_t *cr = CRED(); - struct iovec iov; - uio_t uio = { { 0 }, 0 }; int error; crhold(cr); *link = NULL; + + struct iovec iov; iov.iov_len = MAXPATHLEN; iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = (MAXPATHLEN - 1); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0); cookie = spl_fstrans_mark(); error = -zfs_readlink(ip, &uio, cr); @@ -520,7 +602,7 @@ zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) } #if defined(HAVE_GET_LINK_DELAYED) -const char * +static const char * zpl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { @@ -539,7 +621,7 @@ zpl_get_link(struct dentry *dentry, struct inode *inode, return (link); } #elif defined(HAVE_GET_LINK_COOKIE) -const char * +static const char * zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) { char *link = NULL; @@ -555,7 +637,7 @@ zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) return (*cookie = link); } #elif defined(HAVE_FOLLOW_LINK_COOKIE) -const char * +static const char * zpl_follow_link(struct dentry *dentry, void **cookie) { char *link = NULL; @@ -597,10 +679,11 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) crhold(cr); ip->i_ctime = current_time(ip); - igrab(ip); /* Use ihold() if available */ + /* Must have an existing ref, so igrab() cannot return 
NULL */ + VERIFY3P(igrab(ip), !=, NULL); cookie = spl_fstrans_mark(); - error = -zfs_link(dir, ip, dname(dentry), cr, 0); + error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0); if (error) { iput(ip); goto out; @@ -615,46 +698,6 @@ out: return (error); } -#ifdef HAVE_INODE_TRUNCATE_RANGE -static void -zpl_truncate_range(struct inode *ip, loff_t start, loff_t end) -{ - cred_t *cr = CRED(); - flock64_t bf; - fstrans_cookie_t cookie; - - ASSERT3S(start, <=, end); - - /* - * zfs_freesp() will interpret (len == 0) as meaning "truncate until - * the end of the file". We don't want that. - */ - if (start == end) - return; - - crhold(cr); - - bf.l_type = F_WRLCK; - bf.l_whence = SEEK_SET; - bf.l_start = start; - bf.l_len = end - start; - bf.l_pid = 0; - cookie = spl_fstrans_mark(); - zfs_space(ip, F_FREESP, &bf, FWRITE, start, cr); - spl_fstrans_unmark(cookie); - - crfree(cr); -} -#endif /* HAVE_INODE_TRUNCATE_RANGE */ - -#ifdef HAVE_INODE_FALLOCATE -static long -zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) -{ - return (zpl_fallocate_common(ip, mode, offset, len)); -} -#endif /* HAVE_INODE_FALLOCATE */ - static int #ifdef HAVE_D_REVALIDATE_NAMEIDATA zpl_revalidate(struct dentry *dentry, struct nameidata *nd) @@ -671,19 +714,6 @@ zpl_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return (-ECHILD); - /* - * Automounted snapshots rely on periodic dentry revalidation - * to defer snapshots from being automatically unmounted. - */ - if (zfsvfs->z_issnap) { - if (time_after(jiffies, zfsvfs->z_snap_defer_time + - MAX(zfs_expire_snapshot * HZ / 2, HZ))) { - zfsvfs->z_snap_defer_time = jiffies; - zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, - dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); - } - } - /* * After a rollback negative dentries created before the rollback * time must be invalidated. 
Otherwise they can obscure files which @@ -717,23 +747,11 @@ const struct inode_operations zpl_inode_operations = { .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, -#ifdef HAVE_INODE_TRUNCATE_RANGE - .truncate_range = zpl_truncate_range, -#endif /* HAVE_INODE_TRUNCATE_RANGE */ -#ifdef HAVE_INODE_FALLOCATE - .fallocate = zpl_fallocate, -#endif /* HAVE_INODE_FALLOCATE */ #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, -#endif -#if defined(HAVE_GET_ACL) +#endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, -#elif defined(HAVE_CHECK_ACL) - .check_acl = zpl_check_acl, -#elif defined(HAVE_PERMISSION) - .permission = zpl_permission, -#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ #endif /* CONFIG_FS_POSIX_ACL */ }; @@ -746,7 +764,7 @@ const struct inode_operations zpl_dir_inode_operations = { .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, -#ifdef HAVE_RENAME_WANTS_FLAGS +#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_rename2, #else .rename = zpl_rename, @@ -765,14 +783,8 @@ const struct inode_operations zpl_dir_inode_operations = { #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, -#endif -#if defined(HAVE_GET_ACL) +#endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, -#elif defined(HAVE_CHECK_ACL) - .check_acl = zpl_check_acl, -#elif defined(HAVE_PERMISSION) - .permission = zpl_permission, -#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ #endif /* CONFIG_FS_POSIX_ACL */ }; @@ -810,14 +822,8 @@ const struct inode_operations zpl_special_inode_operations = { #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, -#endif -#if defined(HAVE_GET_ACL) +#endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, -#elif defined(HAVE_CHECK_ACL) - .check_acl = zpl_check_acl, -#elif defined(HAVE_PERMISSION) - .permission = zpl_permission, -#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ 
#endif /* CONFIG_FS_POSIX_ACL */ }; diff --git a/module/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c similarity index 73% rename from module/zfs/zpl_super.c rename to module/os/linux/zfs/zpl_super.c index 216c794015..c2fd3fee14 100644 --- a/module/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -23,9 +23,9 @@ */ +#include #include #include -#include #include #include @@ -81,18 +81,6 @@ zpl_dirty_inode(struct inode *ip) * unhashed and has no links the default policy is to evict it * immediately. * - * Prior to 2.6.36 this eviction was accomplished by the vfs calling - * ->delete_inode(). It was ->delete_inode()'s responsibility to - * truncate the inode pages and call clear_inode(). The call to - * clear_inode() synchronously invalidates all the buffers and - * calls ->clear_inode(). It was ->clear_inode()'s responsibility - * to cleanup and filesystem specific data before freeing the inode. - * - * This elaborate mechanism was replaced by ->evict_inode() which - * does the job of both ->delete_inode() and ->clear_inode(). It - * will be called exactly once, and when it returns the inode must - * be in a state where it can simply be freed.i - * * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the @@ -102,7 +90,6 @@ zpl_dirty_inode(struct inode *ip) * any remaining inode specific data via zfs_inactive(). * remaining filesystem specific data. 
*/ -#ifdef HAVE_EVICT_INODE static void zpl_evict_inode(struct inode *ip) { @@ -115,32 +102,6 @@ zpl_evict_inode(struct inode *ip) spl_fstrans_unmark(cookie); } -#else - -static void -zpl_drop_inode(struct inode *ip) -{ - generic_delete_inode(ip); -} - -static void -zpl_clear_inode(struct inode *ip) -{ - fstrans_cookie_t cookie; - - cookie = spl_fstrans_mark(); - zfs_inactive(ip); - spl_fstrans_unmark(cookie); -} - -static void -zpl_inode_delete(struct inode *ip) -{ - truncate_setsize(ip, 0); - clear_inode(ip); -} -#endif /* HAVE_EVICT_INODE */ - static void zpl_put_super(struct super_block *sb) { @@ -177,7 +138,7 @@ zpl_statfs(struct dentry *dentry, struct kstatfs *statp) int error; cookie = spl_fstrans_mark(); - error = -zfs_statvfs(dentry, statp); + error = -zfs_statvfs(dentry->d_inode, statp); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); @@ -221,6 +182,40 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data) return (error); } +static int +__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) +{ + ZPL_ENTER(zfsvfs); + + char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + dmu_objset_name(zfsvfs->z_os, fsname); + + for (int i = 0; fsname[i] != 0; i++) { + /* + * Spaces in the dataset name must be converted to their + * octal escape sequence for getmntent(3) to correctly + * parse then fsname portion of /proc/self/mounts. 
+ */ + if (fsname[i] == ' ') { + seq_puts(seq, "\\040"); + } else { + seq_putc(seq, fsname[i]); + } + } + + kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); + + ZPL_EXIT(zfsvfs); + + return (0); +} + +static int +zpl_show_devname(struct seq_file *seq, struct dentry *root) +{ + return (__zpl_show_devname(seq, root->d_sb->s_fs_info)); +} + static int __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) { @@ -229,7 +224,7 @@ __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) #ifdef CONFIG_FS_POSIX_ACL switch (zfsvfs->z_acl_type) { - case ZFS_ACLTYPE_POSIXACL: + case ZFS_ACLTYPE_POSIX: seq_puts(seq, ",posixacl"); break; default: @@ -241,19 +236,11 @@ __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) return (0); } -#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY static int zpl_show_options(struct seq_file *seq, struct dentry *root) { return (__zpl_show_options(seq, root->d_sb->s_fs_info)); } -#else -static int -zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp) -{ - return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info)); -} -#endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */ static int zpl_fill_super(struct super_block *sb, void *data, int silent) @@ -297,11 +284,15 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) * The dsl pool lock must be released prior to calling sget(). * It is possible sget() may block on the lock in grab_super() * while deactivate_super() holds that same lock and waits for - * a txg sync. If the dsl_pool lock is held over over sget() + * a txg sync. If the dsl_pool lock is held over sget() * this can prevent the pool sync and cause a deadlock. 
*/ + dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); dsl_pool_rele(dmu_objset_pool(os), FTAG); - s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os); + + s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); + + dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) @@ -322,7 +313,6 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) return (s); } -#ifdef HAVE_FST_MOUNT static struct dentry * zpl_mount(struct file_system_type *fs_type, int flags, const char *osname, void *data) @@ -335,32 +325,12 @@ zpl_mount(struct file_system_type *fs_type, int flags, return (dget(sb->s_root)); } -#else -static int -zpl_get_sb(struct file_system_type *fs_type, int flags, - const char *osname, void *data, struct vfsmount *mnt) -{ - zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; - - struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); - if (IS_ERR(sb)) - return (PTR_ERR(sb)); - - (void) simple_set_mnt(mnt, sb); - - return (0); -} -#endif /* HAVE_FST_MOUNT */ static void zpl_kill_sb(struct super_block *sb) { zfs_preumount(sb); kill_anon_super(sb); - -#ifdef HAVE_S_INSTANCES_LIST_HEAD - sb->s_instances.next = &(zpl_fs_type.fs_supers); -#endif /* HAVE_S_INSTANCES_LIST_HEAD */ } void @@ -372,55 +342,24 @@ zpl_prune_sb(int64_t nr_to_scan, void *arg) (void) -zfs_prune(sb, nr_to_scan, &objects); } -#ifdef HAVE_NR_CACHED_OBJECTS -static int -zpl_nr_cached_objects(struct super_block *sb) -{ - return (0); -} -#endif /* HAVE_NR_CACHED_OBJECTS */ - -#ifdef HAVE_FREE_CACHED_OBJECTS -static void -zpl_free_cached_objects(struct super_block *sb, int nr_to_scan) -{ - /* noop */ -} -#endif /* HAVE_FREE_CACHED_OBJECTS */ - const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, -#ifdef HAVE_EVICT_INODE .evict_inode = zpl_evict_inode, -#else - .drop_inode = 
zpl_drop_inode, - .clear_inode = zpl_clear_inode, - .delete_inode = zpl_inode_delete, -#endif /* HAVE_EVICT_INODE */ .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, .statfs = zpl_statfs, .remount_fs = zpl_remount_fs, + .show_devname = zpl_show_devname, .show_options = zpl_show_options, .show_stats = NULL, -#ifdef HAVE_NR_CACHED_OBJECTS - .nr_cached_objects = zpl_nr_cached_objects, -#endif /* HAVE_NR_CACHED_OBJECTS */ -#ifdef HAVE_FREE_CACHED_OBJECTS - .free_cached_objects = zpl_free_cached_objects, -#endif /* HAVE_FREE_CACHED_OBJECTS */ }; struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, -#ifdef HAVE_FST_MOUNT .mount = zpl_mount, -#else - .get_sb = zpl_get_sb, -#endif /* HAVE_FST_MOUNT */ .kill_sb = zpl_kill_sb, }; diff --git a/module/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c similarity index 89% rename from module/zfs/zpl_xattr.c rename to module/os/linux/zfs/zpl_xattr.c index 8ee6e9a97f..e7726e8458 100644 --- a/module/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -77,9 +77,9 @@ * largely avoids the issue except in the overflow case. */ +#include #include #include -#include #include #include #include @@ -113,9 +113,6 @@ zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len) #elif defined(HAVE_XATTR_LIST_HANDLER) if (!handler->list(handler, d, NULL, 0, name, name_len)) return (0); -#elif defined(HAVE_XATTR_LIST_INODE) - if (!handler->list(d->d_inode, NULL, 0, name, name_len)) - return (0); #endif } @@ -151,7 +148,7 @@ zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) * Read as many directory entry names as will fit in to the provided buffer, * or when no buffer is provided calculate the required buffer size. 
*/ -int +static int zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf) { zap_cursor_t zc; @@ -187,10 +184,12 @@ zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) { struct inode *ip = xf->dentry->d_inode; struct inode *dxip = NULL; + znode_t *dxzp; int error; /* Lookup the xattr directory */ - error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); + error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, + cr, NULL, NULL); if (error) { if (error == -ENOENT) error = 0; @@ -198,6 +197,7 @@ zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) return (error); } + dxip = ZTOI(dxzp); error = zpl_xattr_readdir(dxip, xf); iput(dxip); @@ -274,21 +274,24 @@ static int zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { - struct inode *dxip = NULL; + fstrans_cookie_t cookie; struct inode *xip = NULL; - loff_t pos = 0; + znode_t *dxzp = NULL; + znode_t *xzp = NULL; int error; /* Lookup the xattr directory */ - error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); + error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, + cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ - error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); + error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error) goto out; + xip = ZTOI(xzp); if (!size) { error = i_size_read(xip); goto out; @@ -299,13 +302,25 @@ zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, goto out; } - error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); -out: - if (xip) - iput(xip); + struct iovec iov; + iov.iov_base = (void *)value; + iov.iov_len = size; - if (dxip) - iput(dxip); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0); + + cookie = spl_fstrans_mark(); + error = -zfs_read(ITOZ(xip), &uio, 0, cr); + spl_fstrans_unmark(cookie); + + if (error == 0) + error = size - zfs_uio_resid(&uio); +out: + if (xzp) + zrele(xzp); 
+ + if (dxzp) + zrele(dxzp); return (error); } @@ -435,10 +450,9 @@ static int zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, size_t size, int flags, cred_t *cr) { - struct inode *dxip = NULL; - struct inode *xip = NULL; + znode_t *dxzp = NULL; + znode_t *xzp = NULL; vattr_t *vap = NULL; - ssize_t wrote; int lookup_flags, error; const int xattr_mode = S_IFREG | 0644; loff_t pos = 0; @@ -453,12 +467,13 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, if (value != NULL) lookup_flags |= CREATE_XATTR_DIR; - error = -zfs_lookup(ip, NULL, &dxip, lookup_flags, cr, NULL, NULL); + error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags, + cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ - error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); + error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error && (error != -ENOENT)) goto out; @@ -466,38 +481,34 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, /* Remove a specific name xattr when value is set to NULL. */ if (value == NULL) { - if (xip) - error = -zfs_remove(dxip, (char *)name, cr, 0); + if (xzp) + error = -zfs_remove(dxzp, (char *)name, cr, 0); goto out; } /* Lookup failed create a new xattr. 
*/ - if (xip == NULL) { + if (xzp == NULL) { vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mode = xattr_mode; vap->va_mask = ATTR_MODE; vap->va_uid = crgetfsuid(cr); vap->va_gid = crgetfsgid(cr); - error = -zfs_create(dxip, (char *)name, vap, 0, 0644, &xip, + error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, cr, 0, NULL); if (error) goto out; } - ASSERT(xip != NULL); + ASSERT(xzp != NULL); - error = -zfs_freesp(ITOZ(xip), 0, 0, xattr_mode, TRUE); + error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE); if (error) goto out; - wrote = zpl_write_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); - if (wrote < 0) - error = wrote; - + error = -zfs_write_simple(xzp, value, size, pos, NULL); out: - if (error == 0) { ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); @@ -506,11 +517,11 @@ out: if (vap) kmem_free(vap, sizeof (vattr_t)); - if (xip) - iput(xip); + if (xzp) + zrele(xzp); - if (dxip) - iput(dxip); + if (dxzp) + zrele(dxzp); if (error == -ENOENT) error = -ENODATA; @@ -594,7 +605,7 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, cookie = spl_fstrans_mark(); ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); - rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); + rw_enter(&zp->z_xattr_lock, RW_WRITER); /* * Before setting the xattr check to see if it already exists. 
@@ -645,7 +656,7 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, if (error == 0 && (where & XATTR_IN_SA)) zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); out: - rw_exit(&ITOZ(ip)->z_xattr_lock); + rw_exit(&zp->z_xattr_lock); ZPL_EXIT(zfsvfs); spl_fstrans_unmark(cookie); crfree(cr); @@ -707,7 +718,7 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); - strfree(xattr_name); + kmem_strfree(xattr_name); return (error); } @@ -729,7 +740,7 @@ __zpl_xattr_user_set(struct inode *ip, const char *name, xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); - strfree(xattr_name); + kmem_strfree(xattr_name); return (error); } @@ -776,7 +787,7 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name, #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); - strfree(xattr_name); + kmem_strfree(xattr_name); return (error); } @@ -798,7 +809,7 @@ __zpl_xattr_trusted_set(struct inode *ip, const char *name, #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); - strfree(xattr_name); + kmem_strfree(xattr_name); return (error); } @@ -845,7 +856,7 @@ __zpl_xattr_security_get(struct inode *ip, const char *name, #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); - strfree(xattr_name); + kmem_strfree(xattr_name); return (error); } @@ -864,15 +875,14 @@ __zpl_xattr_security_set(struct inode *ip, const char *name, #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); - strfree(xattr_name); + kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); -#ifdef 
HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY static int -__zpl_xattr_security_init(struct inode *ip, const struct xattr *xattrs, +zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs, void *fs_info) { const struct xattr *xattr; @@ -894,37 +904,9 @@ zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr) { return security_inode_init_security(ip, dip, qstr, - &__zpl_xattr_security_init, NULL); + &zpl_xattr_security_init_impl, NULL); } -#else -int -zpl_xattr_security_init(struct inode *ip, struct inode *dip, - const struct qstr *qstr) -{ - int error; - size_t len; - void *value; - char *name; - - error = zpl_security_inode_init_security(ip, dip, qstr, - &name, &value, &len); - if (error) { - if (error == -EOPNOTSUPP) - return (0); - - return (error); - } - - error = __zpl_xattr_security_set(ip, name, value, len, 0); - - kfree(name); - kfree(value); - - return (error); -} -#endif /* HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY */ - /* * Security xattr namespace handlers. */ @@ -944,8 +926,8 @@ xattr_handler_t zpl_xattr_security_handler = { * attribute implemented by filesystems in the kernel." 
- xattr(7) */ #ifdef CONFIG_FS_POSIX_ACL -int -zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) +static int +zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type) { char *name, *value = NULL; int error = 0; @@ -958,7 +940,7 @@ zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - zpl_equivmode_t mode = ip->i_mode; + umode_t mode = ip->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error < 0) { return (error); @@ -1017,13 +999,25 @@ zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) return (error); } -struct posix_acl * -zpl_get_acl(struct inode *ip, int type) +#ifdef HAVE_SET_ACL +int +#ifdef HAVE_SET_ACL_USERNS +zpl_set_acl(struct user_namespace *userns, struct inode *ip, + struct posix_acl *acl, int type) +#else +zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) +#endif /* HAVE_SET_ACL_USERNS */ +{ + return (zpl_set_acl_impl(ip, acl, type)); +} +#endif /* HAVE_SET_ACL */ + +static struct posix_acl * +zpl_get_acl_impl(struct inode *ip, int type) { struct posix_acl *acl; void *value = NULL; char *name; - int size; /* * As of Linux 3.14, the kernel get_acl will check this for us. 
@@ -1047,7 +1041,7 @@ zpl_get_acl(struct inode *ip, int type) return (ERR_PTR(-EINVAL)); } - size = zpl_xattr_get(ip, name, NULL, 0); + int size = zpl_xattr_get(ip, name, NULL, 0); if (size > 0) { value = kmem_alloc(size, KM_SLEEP); size = zpl_xattr_get(ip, name, value, size); @@ -1073,52 +1067,24 @@ zpl_get_acl(struct inode *ip, int type) return (acl); } -#if !defined(HAVE_GET_ACL) -static int -__zpl_check_acl(struct inode *ip, int mask) +#if defined(HAVE_GET_ACL_RCU) +struct posix_acl * +zpl_get_acl(struct inode *ip, int type, bool rcu) { - struct posix_acl *acl; - int error; + if (rcu) + return (ERR_PTR(-ECHILD)); - acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return (PTR_ERR(acl)); - - if (acl) { - error = posix_acl_permission(ip, acl, mask); - zpl_posix_acl_release(acl); - return (error); - } - - return (-EAGAIN); + return (zpl_get_acl_impl(ip, type)); } - -#if defined(HAVE_CHECK_ACL_WITH_FLAGS) -int -zpl_check_acl(struct inode *ip, int mask, unsigned int flags) +#elif defined(HAVE_GET_ACL) +struct posix_acl * +zpl_get_acl(struct inode *ip, int type) { - return (__zpl_check_acl(ip, mask)); + return (zpl_get_acl_impl(ip, type)); } -#elif defined(HAVE_CHECK_ACL) -int -zpl_check_acl(struct inode *ip, int mask) -{ - return (__zpl_check_acl(ip, mask)); -} -#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA) -int -zpl_permission(struct inode *ip, int mask, struct nameidata *nd) -{ - return (generic_permission(ip, mask, __zpl_check_acl)); -} -#elif defined(HAVE_PERMISSION) -int -zpl_permission(struct inode *ip, int mask) -{ - return (generic_permission(ip, mask, __zpl_check_acl)); -} -#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */ -#endif /* !HAVE_GET_ACL */ +#else +#error "Unsupported iops->get_acl() implementation" +#endif /* HAVE_GET_ACL_RCU */ int zpl_init_acl(struct inode *ip, struct inode *dir) @@ -1126,16 +1092,13 @@ zpl_init_acl(struct inode *ip, struct inode *dir) struct posix_acl *acl = NULL; int error = 0; - if (ITOZSB(ip)->z_acl_type != 
ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (!S_ISLNK(ip->i_mode)) { - if (ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) { - acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return (PTR_ERR(acl)); - } - + acl = zpl_get_acl_impl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); if (!acl) { ip->i_mode &= ~current_umask(); ip->i_ctime = current_time(ip); @@ -1144,11 +1107,11 @@ zpl_init_acl(struct inode *ip, struct inode *dir) } } - if ((ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) && acl) { + if (acl) { umode_t mode; if (S_ISDIR(ip->i_mode)) { - error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT); + error = zpl_set_acl_impl(ip, acl, ACL_TYPE_DEFAULT); if (error) goto out; } @@ -1158,8 +1121,10 @@ zpl_init_acl(struct inode *ip, struct inode *dir) if (error >= 0) { ip->i_mode = mode; zfs_mark_inode_dirty(ip); - if (error > 0) - error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); + if (error > 0) { + error = zpl_set_acl_impl(ip, acl, + ACL_TYPE_ACCESS); + } } } out: @@ -1174,19 +1139,19 @@ zpl_chmod_acl(struct inode *ip) struct posix_acl *acl; int error; - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (S_ISLNK(ip->i_mode)) return (-EOPNOTSUPP); - acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); + acl = zpl_get_acl_impl(ip, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return (PTR_ERR(acl)); error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); if (!error) - error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); + error = zpl_set_acl_impl(ip, acl, ACL_TYPE_ACCESS); zpl_posix_acl_release(acl); @@ -1200,7 +1165,7 @@ __zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) @@ 
-1217,7 +1182,7 @@ __zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) @@ -1239,10 +1204,10 @@ __zpl_xattr_acl_get_access(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - acl = zpl_get_acl(ip, type); + acl = zpl_get_acl_impl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) @@ -1267,10 +1232,10 @@ __zpl_xattr_acl_get_default(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - acl = zpl_get_acl(ip, type); + acl = zpl_get_acl_impl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) @@ -1295,10 +1260,10 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!zpl_inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { @@ -1315,8 +1280,7 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, } else { acl = NULL; } - - error = zpl_set_acl(ip, acl, type); + error = zpl_set_acl_impl(ip, acl, type); zpl_posix_acl_release(acl); return (error); @@ -1335,10 +1299,10 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != 
ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!zpl_inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { @@ -1356,7 +1320,7 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name, acl = NULL; } - error = zpl_set_acl(ip, acl, type); + error = zpl_set_acl_impl(ip, acl, type); zpl_posix_acl_release(acl); return (error); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c new file mode 100644 index 0000000000..c174234263 --- /dev/null +++ b/module/os/linux/zfs/zvol_os.c @@ -0,0 +1,1174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +unsigned int zvol_major = ZVOL_MAJOR; +unsigned int zvol_request_sync = 0; +unsigned int zvol_prefetch_bytes = (128 * 1024); +unsigned long zvol_max_discard_blocks = 16384; +unsigned int zvol_threads = 32; + +struct zvol_state_os { + struct gendisk *zvo_disk; /* generic disk */ + struct request_queue *zvo_queue; /* request queue */ + dev_t zvo_dev; /* device id */ +}; + +taskq_t *zvol_taskq; +static struct ida zvol_ida; + +typedef struct zv_request_stack { + zvol_state_t *zv; + struct bio *bio; +} zv_request_t; + +typedef struct zv_request_task { + zv_request_t zvr; + taskq_ent_t ent; +} zv_request_task_t; + +static zv_request_task_t * +zv_request_task_create(zv_request_t zvr) +{ + zv_request_task_t *task; + task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); + taskq_init_ent(&task->ent); + task->zvr = zvr; + return (task); +} + +static void +zv_request_task_free(zv_request_task_t *task) +{ + kmem_free(task, sizeof (*task)); +} + +/* + * Given a path, return TRUE if path is a ZVOL. + */ +static boolean_t +zvol_is_zvol_impl(const char *path) +{ + dev_t dev = 0; + + if (vdev_lookup_bdev(path, &dev) != 0) + return (B_FALSE); + + if (MAJOR(dev) == zvol_major) + return (B_TRUE); + + return (B_FALSE); +} + +static void +zvol_write(zv_request_t *zvr) +{ + struct bio *bio = zvr->bio; + int error = 0; + zfs_uio_t uio; + + zfs_uio_bvec_init(&uio, bio); + + zvol_state_t *zv = zvr->zv; + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + ASSERT3P(zv->zv_zilog, !=, NULL); + + /* bio marked as FLUSH need to flush before write */ + if (bio_is_flush(bio)) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + /* Some requests are just for flush and nothing else. 
*/ + if (uio.uio_resid == 0) { + rw_exit(&zv->zv_suspend_lock); + BIO_END_IO(bio, 0); + return; + } + + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + ssize_t start_resid = uio.uio_resid; + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); + + boolean_t sync = + bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, + uio.uio_loffset, uio.uio_resid, RL_WRITER); + + uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { + uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio.uio_loffset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + + /* This will only fail for ENOSPC */ + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); + if (error == 0) { + zvol_log_write(zv, tx, off, bytes, sync); + } + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + + int64_t nwritten = start_resid - uio.uio_resid; + dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); + task_io_account_write(nwritten); + + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + rw_exit(&zv->zv_suspend_lock); + + if (acct) + blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + + BIO_END_IO(bio, -error); +} + +static void +zvol_write_task(void *arg) +{ + zv_request_task_t *task = arg; + zvol_write(&task->zvr); + zv_request_task_free(task); +} + +static void +zvol_discard(zv_request_t *zvr) +{ + struct bio *bio = zvr->bio; + zvol_state_t *zv = zvr->zv; + uint64_t start = BIO_BI_SECTOR(bio) << 9; + uint64_t size = BIO_BI_SIZE(bio); + uint64_t end = start + 
size; + boolean_t sync; + int error = 0; + dmu_tx_t *tx; + + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + ASSERT3P(zv->zv_zilog, !=, NULL); + + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); + + sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + if (end > zv->zv_volsize) { + error = SET_ERROR(EIO); + goto unlock; + } + + /* + * Align the request to volume block boundaries when a secure erase is + * not required. This will prevent dnode_free_range() from zeroing out + * the unaligned parts which is slow (read-modify-write) and useless + * since we are not freeing any space by doing so. + */ + if (!bio_is_secure_erase(bio)) { + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN(end, zv->zv_volblocksize); + size = end - start; + } + + if (start >= end) + goto unlock; + + zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, + start, size, RL_WRITER); + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, start, size, B_TRUE); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, start, size); + } + zfs_rangelock_exit(lr); + + if (error == 0 && sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + +unlock: + rw_exit(&zv->zv_suspend_lock); + + if (acct) + blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + + BIO_END_IO(bio, -error); +} + +static void +zvol_discard_task(void *arg) +{ + zv_request_task_t *task = arg; + zvol_discard(&task->zvr); + zv_request_task_free(task); +} + +static void +zvol_read(zv_request_t *zvr) +{ + struct bio *bio = zvr->bio; + int error = 0; + zfs_uio_t uio; + + zfs_uio_bvec_init(&uio, bio); + + zvol_state_t *zv = zvr->zv; + ASSERT3P(zv, !=, 
NULL); + ASSERT3U(zv->zv_open_count, >, 0); + + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + ssize_t start_resid = uio.uio_resid; + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, READ, bio); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, + uio.uio_loffset, uio.uio_resid, RL_READER); + + uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { + uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - uio.uio_loffset) + bytes = volsize - uio.uio_loffset; + + error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + } + zfs_rangelock_exit(lr); + + int64_t nread = start_resid - uio.uio_resid; + dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); + task_io_account_read(nread); + + rw_exit(&zv->zv_suspend_lock); + + if (acct) + blk_generic_end_io_acct(q, disk, READ, bio, start_time); + + BIO_END_IO(bio, -error); +} + +static void +zvol_read_task(void *arg) +{ + zv_request_task_t *task = arg; + zvol_read(&task->zvr); + zv_request_task_free(task); +} + +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +static blk_qc_t +zvol_submit_bio(struct bio *bio) +#else +static MAKE_REQUEST_FN_RET +zvol_request(struct request_queue *q, struct bio *bio) +#endif +{ +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#if defined(HAVE_BIO_BDEV_DISK) + struct request_queue *q = bio->bi_bdev->bd_disk->queue; +#else + struct request_queue *q = bio->bi_disk->queue; +#endif +#endif + zvol_state_t *zv = q->queuedata; + fstrans_cookie_t cookie = spl_fstrans_mark(); + uint64_t offset = BIO_BI_SECTOR(bio) << 9; + uint64_t size = BIO_BI_SIZE(bio); + int rw = bio_data_dir(bio); + + if (bio_has_data(bio) && offset + size > 
zv->zv_volsize) { + printk(KERN_INFO + "%s: bad access: offset=%llu, size=%lu\n", + zv->zv_zso->zvo_disk->disk_name, + (long long unsigned)offset, + (long unsigned)size); + + BIO_END_IO(bio, -SET_ERROR(EIO)); + goto out; + } + + zv_request_t zvr = { + .zv = zv, + .bio = bio, + }; + zv_request_task_t *task; + + if (rw == WRITE) { + if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { + BIO_END_IO(bio, -SET_ERROR(EROFS)); + goto out; + } + + /* + * Prevents the zvol from being suspended, or the ZIL being + * concurrently opened. Will be released after the i/o + * completes. + */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. + */ + if (zv->zv_zilog == NULL) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + /* replay / destroy done in zvol_create_minor */ + VERIFY0((zv->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); + } + rw_downgrade(&zv->zv_suspend_lock); + } + + /* + * We don't want this thread to be blocked waiting for i/o to + * complete, so we instead wait from a taskq callback. The + * i/o may be a ZIL write (via zil_commit()), or a read of an + * indirect block, or a read of a data block (if this is a + * partial-block write). We will indicate that the i/o is + * complete by calling BIO_END_IO() from the taskq callback. + * + * This design allows the calling thread to continue and + * initiate more concurrent operations by calling + * zvol_request() again. There are typically only a small + * number of threads available to call zvol_request() (e.g. + * one per iSCSI target), so keeping the latency of + * zvol_request() low is important for performance. 
+ * + * The zvol_request_sync module parameter allows this + * behavior to be altered, for performance evaluation + * purposes. If the callback blocks, setting + * zvol_request_sync=1 will result in much worse performance. + * + * We can have up to zvol_threads concurrent i/o's being + * processed for all zvols on the system. This is typically + * a vast improvement over the zvol_request_sync=1 behavior + * of one i/o at a time per zvol. However, an even better + * design would be for zvol_request() to initiate the zio + * directly, and then be notified by the zio_done callback, + * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL + * interfaces lack this functionality (they block waiting for + * the i/o to complete). + */ + if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { + if (zvol_request_sync) { + zvol_discard(&zvr); + } else { + task = zv_request_task_create(zvr); + taskq_dispatch_ent(zvol_taskq, + zvol_discard_task, task, 0, &task->ent); + } + } else { + if (zvol_request_sync) { + zvol_write(&zvr); + } else { + task = zv_request_task_create(zvr); + taskq_dispatch_ent(zvol_taskq, + zvol_write_task, task, 0, &task->ent); + } + } + } else { + /* + * The SCST driver, and possibly others, may issue READ I/Os + * with a length of zero bytes. These empty I/Os contain no + * data and require no additional handling. + */ + if (size == 0) { + BIO_END_IO(bio, 0); + goto out; + } + + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* See comment in WRITE case above. 
*/ + if (zvol_request_sync) { + zvol_read(&zvr); + } else { + task = zv_request_task_create(zvr); + taskq_dispatch_ent(zvol_taskq, + zvol_read_task, task, 0, &task->ent); + } + } + +out: + spl_fstrans_unmark(cookie); +#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ + defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) + return (BLK_QC_T_NONE); +#endif +} + +static int +zvol_open(struct block_device *bdev, fmode_t flag) +{ + zvol_state_t *zv; + int error = 0; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, RW_READER); + /* + * Obtain a copy of private_data under the zvol_state_lock to make + * sure that either the result of zvol free code path setting + * bdev->bd_disk->private_data to NULL is observed, or zvol_free() + * is not called on this zv because of the positive zv_open_count. + */ + zv = bdev->bd_disk->private_data; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(-ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); + if (error) + goto out_mutex; + } + + if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + error = -EROFS; + goto out_open_count; + } + + zv->zv_open_count++; + + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + 
rw_exit(&zv->zv_suspend_lock); + + zfs_check_media_change(bdev); + + return (0); + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); + +out_mutex: + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + if (error == -EINTR) { + error = -ERESTARTSYS; + schedule(); + } + return (SET_ERROR(error)); +} + +static void +zvol_release(struct gendisk *disk, fmode_t mode) +{ + zvol_state_t *zv; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, RW_READER); + zv = disk->private_data; + + mutex_enter(&zv->zv_state_lock); + ASSERT3U(zv->zv_open_count, >, 0); + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 1) { + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + zv->zv_open_count--; + if (zv->zv_open_count == 0) { + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); +} + +static int +zvol_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + int error = 0; + + ASSERT3U(zv->zv_open_count, >, 0); + + switch (cmd) { + case BLKFLSBUF: + fsync_bdev(bdev); + invalidate_bdev(bdev); + rw_enter(&zv->zv_suspend_lock, RW_READER); + + if (!(zv->zv_flags & ZVOL_RDONLY)) + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + + rw_exit(&zv->zv_suspend_lock); + break; + + case BLKZNAME: + 
mutex_enter(&zv->zv_state_lock); + error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); + mutex_exit(&zv->zv_state_lock); + break; + + default: + error = -ENOTTY; + break; + } + + return (SET_ERROR(error)); +} + +#ifdef CONFIG_COMPAT +static int +zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + return (zvol_ioctl(bdev, mode, cmd, arg)); +} +#else +#define zvol_compat_ioctl NULL +#endif + +static unsigned int +zvol_check_events(struct gendisk *disk, unsigned int clearing) +{ + unsigned int mask = 0; + + rw_enter(&zvol_state_lock, RW_READER); + + zvol_state_t *zv = disk->private_data; + if (zv != NULL) { + mutex_enter(&zv->zv_state_lock); + mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; + zv->zv_changed = 0; + mutex_exit(&zv->zv_state_lock); + } + + rw_exit(&zvol_state_lock); + + return (mask); +} + +static int +zvol_revalidate_disk(struct gendisk *disk) +{ + rw_enter(&zvol_state_lock, RW_READER); + + zvol_state_t *zv = disk->private_data; + if (zv != NULL) { + mutex_enter(&zv->zv_state_lock); + set_capacity(zv->zv_zso->zvo_disk, + zv->zv_volsize >> SECTOR_BITS); + mutex_exit(&zv->zv_state_lock); + } + + rw_exit(&zvol_state_lock); + + return (0); +} + +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + struct gendisk *disk = zv->zv_zso->zvo_disk; + +#if defined(HAVE_REVALIDATE_DISK_SIZE) + revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); +#elif defined(HAVE_REVALIDATE_DISK) + revalidate_disk(disk); +#else + zvol_revalidate_disk(disk); +#endif + return (0); +} + +static void +zvol_clear_private(zvol_state_t *zv) +{ + /* + * Cleared while holding zvol_state_lock as a writer + * which will prevent zvol_open() from opening it. + */ + zv->zv_zso->zvo_disk->private_data = NULL; +} + +/* + * Provide a simple virtual geometry for legacy compatibility. For devices + * smaller than 1 MiB a small head and sector count is used to allow very + * tiny devices. 
For devices over 1 MiB a standard head and sector count + * is used to keep the cylinders count reasonable. + */ +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + sector_t sectors; + + ASSERT3U(zv->zv_open_count, >, 0); + + sectors = get_capacity(zv->zv_zso->zvo_disk); + + if (sectors > 2048) { + geo->heads = 16; + geo->sectors = 63; + } else { + geo->heads = 2; + geo->sectors = 4; + } + + geo->start = 0; + geo->cylinders = sectors / (geo->heads * geo->sectors); + + return (0); +} + +static struct block_device_operations zvol_ops = { + .open = zvol_open, + .release = zvol_release, + .ioctl = zvol_ioctl, + .compat_ioctl = zvol_compat_ioctl, + .check_events = zvol_check_events, +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK + .revalidate_disk = zvol_revalidate_disk, +#endif + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS + .submit_bio = zvol_submit_bio, +#endif +}; + +/* + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. 
+ */ +static zvol_state_t * +zvol_alloc(dev_t dev, const char *name) +{ + zvol_state_t *zv; + struct zvol_state_os *zso; + uint64_t volmode; + + if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) + return (NULL); + + if (volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + + if (volmode == ZFS_VOLMODE_NONE) + return (NULL); + + zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); + zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_zso = zso; + zv->zv_volmode = volmode; + + list_link_init(&zv->zv_next); + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +#ifdef HAVE_BLK_ALLOC_DISK + zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); + if (zso->zvo_disk == NULL) + goto out_kmem; + + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#else + zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + goto out_kmem; + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + goto out_kmem; + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_BLK_ALLOC_DISK */ +#else + zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); + if (zso->zvo_queue == NULL) + goto out_kmem; + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) { + blk_cleanup_queue(zso->zvo_queue); + goto out_kmem; + } + + zso->zvo_disk->queue = zso->zvo_queue; +#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + + blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); + + /* Limit read-ahead to a single page to prevent over-prefetching. */ + blk_queue_set_read_ahead(zso->zvo_queue, 1); + + /* Disable write merging in favor of the ZIO pipeline. 
*/ + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); + + /* Enable /proc/diskstats */ + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); + + zso->zvo_queue->queuedata = zv; + zso->zvo_dev = dev; + zv->zv_open_count = 0; + strlcpy(zv->zv_name, name, MAXNAMELEN); + + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + + zso->zvo_disk->major = zvol_major; + zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; + + if (volmode == ZFS_VOLMODE_DEV) { + /* + * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set + * gendisk->minors = 1 as noted in include/linux/genhd.h. + * Also disable extended partition numbers (GENHD_FL_EXT_DEVT) + * and suppress partition scanning (GENHD_FL_NO_PART_SCAN) by + * setting gendisk->flags accordingly. + */ + zso->zvo_disk->minors = 1; +#if defined(GENHD_FL_EXT_DEVT) + zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; +#endif +#if defined(GENHD_FL_NO_PART_SCAN) + zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN; +#endif + } + zso->zvo_disk->first_minor = (dev & MINORMASK); + zso->zvo_disk->fops = &zvol_ops; + zso->zvo_disk->private_data = zv; + snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", + ZVOL_DEV_NAME, (dev & MINORMASK)); + + return (zv); + +out_kmem: + kmem_free(zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + return (NULL); +} + +/* + * Cleanup then free a zvol_state_t which was created by zvol_alloc(). + * At this time, the structure is not opened by anyone, is taken off + * the zvol_state_list, and has its private data set to NULL. + * The zvol_state_lock is dropped. + * + * This function may take many milliseconds to complete (e.g. we've seen + * it take over 256ms), due to the calls to "blk_cleanup_queue" and + * "del_gendisk". Thus, consumers need to be careful to account for this + * latency when calling this function. 
+ */ +static void +zvol_free(zvol_state_t *zv) +{ + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + + del_gendisk(zv->zv_zso->zvo_disk); +#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ + defined(HAVE_BLK_ALLOC_DISK) + blk_cleanup_disk(zv->zv_zso->zvo_disk); +#else + blk_cleanup_queue(zv->zv_zso->zvo_queue); + put_disk(zv->zv_zso->zvo_disk); +#endif + + ida_simple_remove(&zvol_ida, + MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + + mutex_destroy(&zv->zv_state_lock); + dataset_kstats_destroy(&zv->zv_kstat); + + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); +} + +void +zvol_wait_close(zvol_state_t *zv) +{ +} + +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. 
+ */ +static int +zvol_os_create_minor(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t *doi; + uint64_t volsize; + uint64_t len; + unsigned minor = 0; + int error = 0; + int idx; + uint64_t hash = zvol_name_hash(name); + + if (zvol_inhibit_dev) + return (0); + + idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + if (idx < 0) + return (SET_ERROR(-idx)); + minor = idx << ZVOL_MINOR_BITS; + + zv = zvol_find_by_name_hash(name, hash, RW_NONE); + if (zv) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); + ida_simple_remove(&zvol_ida, idx); + return (SET_ERROR(EEXIST)); + } + + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + if (error) + goto out_doi; + + error = dmu_object_info(os, ZVOL_OBJ, doi); + if (error) + goto out_dmu_objset_disown; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_disown; + + zv = zvol_alloc(MKDEV(zvol_major, minor), name); + if (zv == NULL) { + error = SET_ERROR(EAGAIN); + goto out_dmu_objset_disown; + } + zv->zv_hash = hash; + + if (dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; + + zv->zv_volblocksize = doi->doi_data_block_size; + zv->zv_volsize = volsize; + zv->zv_objset = os; + + set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); + + blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, + (DMU_MAX_ACCESS / 4) >> 9); + blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); + blk_queue_physical_block_size(zv->zv_zso->zvo_queue, + zv->zv_volblocksize); + blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); + blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, + (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); + blk_queue_discard_granularity(zv->zv_zso->zvo_queue, + zv->zv_volblocksize); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, 
zv->zv_zso->zvo_queue); +#ifdef QUEUE_FLAG_NONROT + blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); +#endif +#ifdef QUEUE_FLAG_ADD_RANDOM + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); +#endif + /* This flag was introduced in kernel version 4.12. */ +#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH + blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); +#endif + + ASSERT3P(zv->zv_zilog, ==, NULL); + zv->zv_zilog = zil_open(os, zvol_get_data); + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + zil_destroy(zv->zv_zilog, B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; + ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); + + /* + * When udev detects the addition of the device it will immediately + * invoke blkid(8) to determine the type of content on the device. + * Prefetching the blocks commonly scanned by blkid(8) will speed + * up this process. + */ + len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); + if (len > 0) { + dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, + ZIO_PRIORITY_SYNC_READ); + } + + zv->zv_objset = NULL; +out_dmu_objset_disown: + dmu_objset_disown(os, B_TRUE, FTAG); +out_doi: + kmem_free(doi, sizeof (dmu_object_info_t)); + + /* + * Keep in mind that once add_disk() is called, the zvol is + * announced to the world, and zvol_open()/zvol_release() can + * be called at any time. Incidentally, add_disk() itself calls + * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() + * directly as well. 
+ */ + if (error == 0) { + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_insert(zv); + rw_exit(&zvol_state_lock); + add_disk(zv->zv_zso->zvo_disk); + } else { + ida_simple_remove(&zvol_ida, idx); + } + + return (error); +} + +static void +zvol_rename_minor(zvol_state_t *zv, const char *newname) +{ + int readonly = get_disk_ro(zv->zv_zso->zvo_disk); + + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); + + /* + * The block device's read-only state is briefly changed causing + * a KOBJ_CHANGE uevent to be issued. This ensures udev detects + * the name change and fixes the symlinks. This does not change + * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never + * changes. This would normally be done using kobject_uevent() but + * that is a GPL-only symbol which is why we need this workaround. 
+ */ + set_disk_ro(zv->zv_zso->zvo_disk, !readonly); + set_disk_ro(zv->zv_zso->zvo_disk, readonly); +} + +static void +zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) +{ + + set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) +{ + + set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +const static zvol_platform_ops_t zvol_linux_ops = { + .zv_free = zvol_free, + .zv_rename_minor = zvol_rename_minor, + .zv_create_minor = zvol_os_create_minor, + .zv_update_volsize = zvol_update_volsize, + .zv_clear_private = zvol_clear_private, + .zv_is_zvol = zvol_is_zvol_impl, + .zv_set_disk_ro = zvol_set_disk_ro_impl, + .zv_set_capacity = zvol_set_capacity_impl, +}; + +int +zvol_init(void) +{ + int error; + int threads = MIN(MAX(zvol_threads, 1), 1024); + + error = register_blkdev(zvol_major, ZVOL_DRIVER); + if (error) { + printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); + return (error); + } + zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, + threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (zvol_taskq == NULL) { + unregister_blkdev(zvol_major, ZVOL_DRIVER); + return (-ENOMEM); + } + zvol_init_impl(); + ida_init(&zvol_ida); + zvol_register_ops(&zvol_linux_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + taskq_destroy(zvol_taskq); + ida_destroy(&zvol_ida); +} + +/* BEGIN CSTYLED */ +module_param(zvol_inhibit_dev, uint, 0644); +MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); + +module_param(zvol_major, uint, 0444); +MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); + +module_param(zvol_threads, uint, 0444); +MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); + +module_param(zvol_request_sync, uint, 0644); +MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); + +module_param(zvol_max_discard_blocks, ulong, 0444); 
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); + +module_param(zvol_prefetch_bytes, uint, 0644); +MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); + +module_param(zvol_volmode, uint, 0644); +MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); +/* END CSTYLED */ diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in index 3bcbf63cbc..cedbfe92b5 100644 --- a/module/spl/Makefile.in +++ b/module/spl/Makefile.in @@ -1,29 +1,13 @@ -src = @abs_top_srcdir@/module/spl +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ +mfdir = $(obj) +else +mfdir = $(srctree)/$(src) +endif MODULE := spl obj-$(CONFIG_ZFS) := $(MODULE).o -ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) - -$(MODULE)-objs += spl-atomic.o -$(MODULE)-objs += spl-condvar.o -$(MODULE)-objs += spl-cred.o -$(MODULE)-objs += spl-err.o -$(MODULE)-objs += spl-generic.o -$(MODULE)-objs += spl-kmem.o -$(MODULE)-objs += spl-kmem-cache.o -$(MODULE)-objs += spl-kobj.o -$(MODULE)-objs += spl-kstat.o -$(MODULE)-objs += spl-mutex.o -$(MODULE)-objs += spl-proc.o -$(MODULE)-objs += spl-procfs-list.o -$(MODULE)-objs += spl-rwlock.o -$(MODULE)-objs += spl-taskq.o -$(MODULE)-objs += spl-thread.o -$(MODULE)-objs += spl-tsd.o -$(MODULE)-objs += spl-vmem.o -$(MODULE)-objs += spl-vnode.o -$(MODULE)-objs += spl-xdr.o -$(MODULE)-objs += spl-zlib.o +include $(mfdir)/../os/linux/spl/Makefile diff --git a/module/spl/spl-kobj.c b/module/spl/spl-kobj.c deleted file mode 100644 index 7019369bd2..0000000000 --- a/module/spl/spl-kobj.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . 
- * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - * - * Solaris Porting Layer (SPL) Kobj Implementation. - */ - -#include - -struct _buf * -kobj_open_file(const char *name) -{ - struct _buf *file; - vnode_t *vp; - int rc; - - file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP)); - if (file == NULL) - return ((_buf_t *)-1UL); - - if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) { - kfree(file); - return ((_buf_t *)-1UL); - } - - file->vp = vp; - - return (file); -} /* kobj_open_file() */ -EXPORT_SYMBOL(kobj_open_file); - -void -kobj_close_file(struct _buf *file) -{ - VOP_CLOSE(file->vp, 0, 0, 0, 0, 0); - kfree(file); -} /* kobj_close_file() */ -EXPORT_SYMBOL(kobj_close_file); - -int -kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - ssize_t resid; - - if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off, - UIO_SYSSPACE, 0, 0, 0, &resid) != 0) - return (-1); - - return (size - resid); -} /* kobj_read_file() */ -EXPORT_SYMBOL(kobj_read_file); - -int -kobj_get_filesize(struct _buf *file, uint64_t *size) -{ - vattr_t vap; - int rc; - - rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL); - if (rc) - return (rc); - - *size = vap.va_size; - - return (rc); -} /* kobj_get_filesize() */ -EXPORT_SYMBOL(kobj_get_filesize); diff --git a/module/spl/spl-mutex.c b/module/spl/spl-mutex.c deleted file mode 100644 index ba818862b6..0000000000 --- a/module/spl/spl-mutex.c +++ 
/dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - * - * Solaris Porting Layer (SPL) Mutex Implementation. - */ - -#include - -int spl_mutex_init(void) { return 0; } -void spl_mutex_fini(void) { } diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c deleted file mode 100644 index 86727ed195..0000000000 --- a/module/spl/spl-rwlock.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. 
- * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - * - * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation. - */ - -#include -#include - -#if defined(CONFIG_PREEMPT_RT_FULL) - -#include -#define RT_MUTEX_OWNER_MASKALL 1UL - -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ -#if defined(READER_BIAS) && defined(WRITER_BIAS) - /* - * After the 4.9.20-rt16 kernel the realtime patch series lifted the - * single reader restriction. While this could be accommodated by - * adding additional compatibility code assume the rwsem can never - * be upgraded. All caller must already cleanly handle this case. - */ - return (0); -#else - ASSERT((struct task_struct *) - ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) == - current); - - /* - * Prior to 4.9.20-rt16 kernel the realtime patch series, rwsem is - * implemented as a single mutex held by readers and writers alike. - * However, this implementation would prevent a thread from taking - * a read lock twice, as the mutex would already be locked on - * the second attempt. Therefore the implementation allows a - * single thread to take a rwsem as read lock multiple times - * tracking that nesting as read_depth counter. - */ - if (rwsem->read_depth <= 1) { - /* - * In case, the current thread has not taken the lock - * more than once as read lock, we can allow an - * upgrade to a write lock. rwsem_rt.h implements - * write locks as read_depth == 0. 
- */ - rwsem->read_depth = 0; - return (1); - } - return (0); -#endif -} -#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK) -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - int ret = 0; - unsigned long flags; - spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags); - if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE && - list_empty(&rwsem->wait_list)) { - ret = 1; - RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE; - } - spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags); - return (ret); -} -#elif defined(RWSEM_ACTIVE_MASK) -#if defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - long val; - val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, - SPL_RWSEM_SINGLE_WRITER_VALUE); - return (val == SPL_RWSEM_SINGLE_READER_VALUE); -} -#else -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - typeof(rwsem->count) val; - val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, - SPL_RWSEM_SINGLE_WRITER_VALUE); - return (val == SPL_RWSEM_SINGLE_READER_VALUE); -} -#endif -#else -static int -__rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - return (0); -} -#endif - -int -rwsem_tryupgrade(struct rw_semaphore *rwsem) -{ - if (__rwsem_tryupgrade(rwsem)) { - rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - rwsem->owner = current; -#endif - return (1); - } - return (0); -} -EXPORT_SYMBOL(rwsem_tryupgrade); - -int spl_rw_init(void) { return 0; } -void spl_rw_fini(void) { } diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c deleted file mode 100644 index 11b5e4e5a2..0000000000 --- a/module/spl/spl-vnode.c +++ /dev/null @@ -1,777 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . 
- * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see . - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . - * - * Solaris Porting Layer (SPL) Vnode Implementation. - */ - -#include -#include -#include -#include -#include -#include -#ifdef HAVE_FDTABLE_HEADER -#include -#endif - -vnode_t *rootdir = (vnode_t *)0xabcd1234; -EXPORT_SYMBOL(rootdir); - -static spl_kmem_cache_t *vn_cache; -static spl_kmem_cache_t *vn_file_cache; - -static spinlock_t vn_file_lock; -static LIST_HEAD(vn_file_list); - -static int -spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len) -{ - int error = -EOPNOTSUPP; - -#ifdef HAVE_FILE_FALLOCATE - if (fp->f_op->fallocate) - error = fp->f_op->fallocate(fp, mode, offset, len); -#else -#ifdef HAVE_INODE_FALLOCATE - if (fp->f_dentry && fp->f_dentry->d_inode && - fp->f_dentry->d_inode->i_op->fallocate) - error = fp->f_dentry->d_inode->i_op->fallocate( - fp->f_dentry->d_inode, mode, offset, len); -#endif /* HAVE_INODE_FALLOCATE */ -#endif /* HAVE_FILE_FALLOCATE */ - - return (error); -} - -static int -spl_filp_fsync(struct file *fp, int sync) -{ -#ifdef HAVE_2ARGS_VFS_FSYNC - return (vfs_fsync(fp, sync)); -#else - return (vfs_fsync(fp, (fp)->f_dentry, sync)); -#endif /* HAVE_2ARGS_VFS_FSYNC */ -} - -static ssize_t -spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) -{ -#if 
defined(HAVE_KERNEL_WRITE_PPOS) - return (kernel_write(file, buf, count, pos)); -#else - mm_segment_t saved_fs; - ssize_t ret; - - saved_fs = get_fs(); - set_fs(KERNEL_DS); - - ret = vfs_write(file, (__force const char __user *)buf, count, pos); - - set_fs(saved_fs); - - return (ret); -#endif -} - -static ssize_t -spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) -{ -#if defined(HAVE_KERNEL_READ_PPOS) - return (kernel_read(file, buf, count, pos)); -#else - mm_segment_t saved_fs; - ssize_t ret; - - saved_fs = get_fs(); - set_fs(KERNEL_DS); - - ret = vfs_read(file, (void __user *)buf, count, pos); - - set_fs(saved_fs); - - return (ret); -#endif -} - -vtype_t -vn_mode_to_vtype(mode_t mode) -{ - if (S_ISREG(mode)) - return (VREG); - - if (S_ISDIR(mode)) - return (VDIR); - - if (S_ISCHR(mode)) - return (VCHR); - - if (S_ISBLK(mode)) - return (VBLK); - - if (S_ISFIFO(mode)) - return (VFIFO); - - if (S_ISLNK(mode)) - return (VLNK); - - if (S_ISSOCK(mode)) - return (VSOCK); - - return (VNON); -} /* vn_mode_to_vtype() */ -EXPORT_SYMBOL(vn_mode_to_vtype); - -mode_t -vn_vtype_to_mode(vtype_t vtype) -{ - if (vtype == VREG) - return (S_IFREG); - - if (vtype == VDIR) - return (S_IFDIR); - - if (vtype == VCHR) - return (S_IFCHR); - - if (vtype == VBLK) - return (S_IFBLK); - - if (vtype == VFIFO) - return (S_IFIFO); - - if (vtype == VLNK) - return (S_IFLNK); - - if (vtype == VSOCK) - return (S_IFSOCK); - - return (VNON); -} /* vn_vtype_to_mode() */ -EXPORT_SYMBOL(vn_vtype_to_mode); - -vnode_t * -vn_alloc(int flag) -{ - vnode_t *vp; - - vp = kmem_cache_alloc(vn_cache, flag); - if (vp != NULL) { - vp->v_file = NULL; - vp->v_type = 0; - } - - return (vp); -} /* vn_alloc() */ -EXPORT_SYMBOL(vn_alloc); - -void -vn_free(vnode_t *vp) -{ - kmem_cache_free(vn_cache, vp); -} /* vn_free() */ -EXPORT_SYMBOL(vn_free); - -int -vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp, - int x1, void *x2) -{ - struct file *fp; - struct kstat stat; - int 
rc, saved_umask = 0; - gfp_t saved_gfp; - vnode_t *vp; - - ASSERT(flags & (FWRITE | FREAD)); - ASSERT(seg == UIO_SYSSPACE); - ASSERT(vpp); - *vpp = NULL; - - if (!(flags & FCREAT) && (flags & FWRITE)) - flags |= FEXCL; - - /* - * Note for filp_open() the two low bits must be remapped to mean: - * 01 - read-only -> 00 read-only - * 10 - write-only -> 01 write-only - * 11 - read-write -> 10 read-write - */ - flags--; - - if (flags & FCREAT) - saved_umask = xchg(&current->fs->umask, 0); - - fp = filp_open(path, flags, mode); - - if (flags & FCREAT) - (void) xchg(&current->fs->umask, saved_umask); - - if (IS_ERR(fp)) - return (-PTR_ERR(fp)); - -#if defined(HAVE_4ARGS_VFS_GETATTR) - rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); -#elif defined(HAVE_2ARGS_VFS_GETATTR) - rc = vfs_getattr(&fp->f_path, &stat); -#else - rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat); -#endif - if (rc) { - filp_close(fp, 0); - return (-rc); - } - - vp = vn_alloc(KM_SLEEP); - if (!vp) { - filp_close(fp, 0); - return (ENOMEM); - } - - saved_gfp = mapping_gfp_mask(fp->f_mapping); - mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS)); - - mutex_enter(&vp->v_lock); - vp->v_type = vn_mode_to_vtype(stat.mode); - vp->v_file = fp; - vp->v_gfp_mask = saved_gfp; - *vpp = vp; - mutex_exit(&vp->v_lock); - - return (0); -} /* vn_open() */ -EXPORT_SYMBOL(vn_open); - -int -vn_openat(const char *path, uio_seg_t seg, int flags, int mode, - vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd) -{ - char *realpath; - int len, rc; - - ASSERT(vp == rootdir); - - len = strlen(path) + 2; - realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP)); - if (!realpath) - return (ENOMEM); - - (void) snprintf(realpath, len, "/%s", path); - rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2); - kfree(realpath); - - return (rc); -} /* vn_openat() */ -EXPORT_SYMBOL(vn_openat); - -int -vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off, - uio_seg_t seg, int ioflag, 
rlim64_t x2, void *x3, ssize_t *residp) -{ - struct file *fp = vp->v_file; - loff_t offset = off; - int rc; - - ASSERT(uio == UIO_WRITE || uio == UIO_READ); - ASSERT(seg == UIO_SYSSPACE); - ASSERT((ioflag & ~FAPPEND) == 0); - - if (ioflag & FAPPEND) - offset = fp->f_pos; - - if (uio & UIO_WRITE) - rc = spl_kernel_write(fp, addr, len, &offset); - else - rc = spl_kernel_read(fp, addr, len, &offset); - - fp->f_pos = offset; - - if (rc < 0) - return (-rc); - - if (residp) { - *residp = len - rc; - } else { - if (rc != len) - return (EIO); - } - - return (0); -} /* vn_rdwr() */ -EXPORT_SYMBOL(vn_rdwr); - -int -vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4) -{ - int rc; - - ASSERT(vp); - ASSERT(vp->v_file); - - mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask); - rc = filp_close(vp->v_file, 0); - vn_free(vp); - - return (-rc); -} /* vn_close() */ -EXPORT_SYMBOL(vn_close); - -/* - * vn_seek() does not actually seek it only performs bounds checking on the - * proposed seek. We perform minimal checking and allow vn_rdwr() to catch - * anything more serious. - */ -int -vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct) -{ - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); -} -EXPORT_SYMBOL(vn_seek); - -int -vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4) -{ - struct file *fp; - struct kstat stat; - int rc; - - ASSERT(vp); - ASSERT(vp->v_file); - ASSERT(vap); - - fp = vp->v_file; - -#if defined(HAVE_4ARGS_VFS_GETATTR) - rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS, - AT_STATX_SYNC_AS_STAT); -#elif defined(HAVE_2ARGS_VFS_GETATTR) - rc = vfs_getattr(&fp->f_path, &stat); -#else - rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat); -#endif - if (rc) - return (-rc); - - vap->va_type = vn_mode_to_vtype(stat.mode); - vap->va_mode = stat.mode; - vap->va_uid = KUID_TO_SUID(stat.uid); - vap->va_gid = KGID_TO_SGID(stat.gid); - vap->va_fsid = 0; - vap->va_nodeid = stat.ino; - vap->va_nlink = stat.nlink; - vap->va_size = stat.size; - vap->va_blksize = stat.blksize; - vap->va_atime = stat.atime; - vap->va_mtime = stat.mtime; - vap->va_ctime = stat.ctime; - vap->va_rdev = stat.rdev; - vap->va_nblocks = stat.blocks; - - return (0); -} -EXPORT_SYMBOL(vn_getattr); - -int -vn_fsync(vnode_t *vp, int flags, void *x3, void *x4) -{ - int datasync = 0; - int error; - int fstrans; - - ASSERT(vp); - ASSERT(vp->v_file); - - if (flags & FDSYNC) - datasync = 1; - - /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. 
- */ - fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - - error = -spl_filp_fsync(vp->v_file, datasync); - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - - return (error); -} /* vn_fsync() */ -EXPORT_SYMBOL(vn_fsync); - -int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, - offset_t offset, void *x6, void *x7) -{ - int error = EOPNOTSUPP; -#ifdef FALLOC_FL_PUNCH_HOLE - int fstrans; -#endif - - if (cmd != F_FREESP || bfp->l_whence != SEEK_SET) - return (EOPNOTSUPP); - - ASSERT(vp); - ASSERT(vp->v_file); - ASSERT(bfp->l_start >= 0 && bfp->l_len > 0); - -#ifdef FALLOC_FL_PUNCH_HOLE - /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. - */ - fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - - /* - * When supported by the underlying file system preferentially - * use the fallocate() callback to preallocate the space. - */ - error = -spl_filp_fallocate(vp->v_file, - FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, - bfp->l_start, bfp->l_len); - - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - - if (error == 0) - return (0); -#endif - -#ifdef HAVE_INODE_TRUNCATE_RANGE - if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode && - vp->v_file->f_dentry->d_inode->i_op && - vp->v_file->f_dentry->d_inode->i_op->truncate_range) { - off_t end = bfp->l_start + bfp->l_len; - /* - * Judging from the code in shmem_truncate_range(), - * it seems the kernel expects the end offset to be - * inclusive and aligned to the end of a page. 
- */ - if (end % PAGE_SIZE != 0) { - end &= ~(off_t)(PAGE_SIZE - 1); - if (end <= bfp->l_start) - return (0); - } - --end; - - vp->v_file->f_dentry->d_inode->i_op->truncate_range( - vp->v_file->f_dentry->d_inode, bfp->l_start, end); - - return (0); - } -#endif - - return (error); -} -EXPORT_SYMBOL(vn_space); - -/* Function must be called while holding the vn_file_lock */ -static file_t * -file_find(int fd, struct task_struct *task) -{ - file_t *fp; - - list_for_each_entry(fp, &vn_file_list, f_list) { - if (fd == fp->f_fd && fp->f_task == task) { - ASSERT(atomic_read(&fp->f_ref) != 0); - return (fp); - } - } - - return (NULL); -} /* file_find() */ - -file_t * -vn_getf(int fd) -{ - struct kstat stat; - struct file *lfp; - file_t *fp; - vnode_t *vp; - int rc = 0; - - if (fd < 0) - return (NULL); - - /* Already open just take an extra reference */ - spin_lock(&vn_file_lock); - - fp = file_find(fd, current); - if (fp) { - lfp = fget(fd); - fput(fp->f_file); - /* - * areleasef() can cause us to see a stale reference when - * userspace has reused a file descriptor before areleasef() - * has run. fput() the stale reference and replace it. We - * retain the original reference count such that the concurrent - * areleasef() will decrement its reference and terminate. 
- */ - if (lfp != fp->f_file) { - fp->f_file = lfp; - fp->f_vnode->v_file = lfp; - } - atomic_inc(&fp->f_ref); - spin_unlock(&vn_file_lock); - return (fp); - } - - spin_unlock(&vn_file_lock); - - /* File was not yet opened create the object and setup */ - fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP); - if (fp == NULL) - goto out; - - mutex_enter(&fp->f_lock); - - fp->f_fd = fd; - fp->f_task = current; - fp->f_offset = 0; - atomic_inc(&fp->f_ref); - - lfp = fget(fd); - if (lfp == NULL) - goto out_mutex; - - vp = vn_alloc(KM_SLEEP); - if (vp == NULL) - goto out_fget; - -#if defined(HAVE_4ARGS_VFS_GETATTR) - rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE, - AT_STATX_SYNC_AS_STAT); -#elif defined(HAVE_2ARGS_VFS_GETATTR) - rc = vfs_getattr(&lfp->f_path, &stat); -#else - rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat); -#endif - if (rc) - goto out_vnode; - - mutex_enter(&vp->v_lock); - vp->v_type = vn_mode_to_vtype(stat.mode); - vp->v_file = lfp; - mutex_exit(&vp->v_lock); - - fp->f_vnode = vp; - fp->f_file = lfp; - - /* Put it on the tracking list */ - spin_lock(&vn_file_lock); - list_add(&fp->f_list, &vn_file_list); - spin_unlock(&vn_file_lock); - - mutex_exit(&fp->f_lock); - return (fp); - -out_vnode: - vn_free(vp); -out_fget: - fput(lfp); -out_mutex: - mutex_exit(&fp->f_lock); - kmem_cache_free(vn_file_cache, fp); -out: - return (NULL); -} /* getf() */ -EXPORT_SYMBOL(getf); - -static void releasef_locked(file_t *fp) -{ - ASSERT(fp->f_file); - ASSERT(fp->f_vnode); - - /* Unlinked from list, no refs, safe to free outside mutex */ - fput(fp->f_file); - vn_free(fp->f_vnode); - - kmem_cache_free(vn_file_cache, fp); -} - -void -vn_releasef(int fd) -{ - areleasef(fd, P_FINFO(current)); -} -EXPORT_SYMBOL(releasef); - -void -vn_areleasef(int fd, uf_info_t *fip) -{ - file_t *fp; - struct task_struct *task = (struct task_struct *)fip; - - if (fd < 0) - return; - - spin_lock(&vn_file_lock); - fp = file_find(fd, task); - if (fp) { - atomic_dec(&fp->f_ref); - if 
(atomic_read(&fp->f_ref) > 0) { - spin_unlock(&vn_file_lock); - return; - } - - list_del(&fp->f_list); - releasef_locked(fp); - } - spin_unlock(&vn_file_lock); -} /* releasef() */ -EXPORT_SYMBOL(areleasef); - - -static void -vn_set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - -#ifdef HAVE_FS_STRUCT_SPINLOCK - spin_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - spin_unlock(&fs->lock); -#else - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); -#endif /* HAVE_FS_STRUCT_SPINLOCK */ - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -int -vn_set_pwd(const char *filename) -{ - struct path path; - mm_segment_t saved_fs; - int rc; - - /* - * user_path_dir() and __user_walk() both expect 'filename' to be - * a user space address so we must briefly increase the data segment - * size to ensure strncpy_from_user() does not fail with -EFAULT. - */ - saved_fs = get_fs(); - set_fs(KERNEL_DS); - - rc = user_path_dir(filename, &path); - if (rc) - goto out; - - rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); - if (rc) - goto dput_and_out; - - vn_set_fs_pwd(current->fs, &path); - -dput_and_out: - path_put(&path); -out: - set_fs(saved_fs); - - return (-rc); -} /* vn_set_pwd() */ -EXPORT_SYMBOL(vn_set_pwd); - -static int -vn_cache_constructor(void *buf, void *cdrarg, int kmflags) -{ - struct vnode *vp = buf; - - mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); - - return (0); -} /* vn_cache_constructor() */ - -static void -vn_cache_destructor(void *buf, void *cdrarg) -{ - struct vnode *vp = buf; - - mutex_destroy(&vp->v_lock); -} /* vn_cache_destructor() */ - -static int -vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags) -{ - file_t *fp = buf; - - atomic_set(&fp->f_ref, 0); - mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL); - INIT_LIST_HEAD(&fp->f_list); - - return (0); -} /* vn_file_cache_constructor() */ - -static void 
-vn_file_cache_destructor(void *buf, void *cdrarg) -{ - file_t *fp = buf; - - mutex_destroy(&fp->f_lock); -} /* vn_file_cache_destructor() */ - -int -spl_vn_init(void) -{ - spin_lock_init(&vn_file_lock); - - vn_cache = kmem_cache_create("spl_vn_cache", - sizeof (struct vnode), 64, vn_cache_constructor, - vn_cache_destructor, NULL, NULL, NULL, 0); - - vn_file_cache = kmem_cache_create("spl_vn_file_cache", - sizeof (file_t), 64, vn_file_cache_constructor, - vn_file_cache_destructor, NULL, NULL, NULL, 0); - - return (0); -} /* spl_vn_init() */ - -void -spl_vn_fini(void) -{ - file_t *fp, *next_fp; - int leaked = 0; - - spin_lock(&vn_file_lock); - - list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) { - list_del(&fp->f_list); - releasef_locked(fp); - leaked++; - } - - spin_unlock(&vn_file_lock); - - if (leaked > 0) - printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked); - - kmem_cache_destroy(vn_file_cache); - kmem_cache_destroy(vn_cache); -} /* spl_vn_fini() */ diff --git a/module/unicode/Makefile.in b/module/unicode/Makefile.in index 82c90373a2..59c07c4555 100644 --- a/module/unicode/Makefile.in +++ b/module/unicode/Makefile.in @@ -1,11 +1,11 @@ -src = @abs_top_srcdir@/module/unicode +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ +endif MODULE := zunicode obj-$(CONFIG_ZFS) := $(MODULE).o -ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) - $(MODULE)-objs += u8_textprep.o $(MODULE)-objs += uconv.o diff --git a/module/unicode/u8_textprep.c b/module/unicode/u8_textprep.c index 4e6105b2e8..bce5f19625 100644 --- a/module/unicode/u8_textprep.c +++ b/module/unicode/u8_textprep.c @@ -46,7 +46,7 @@ #include #include #include - +#include /* The maximum possible number of bytes in a UTF-8 character. */ #define U8_MB_CUR_MAX (4) @@ -330,7 +330,7 @@ const uint8_t u8_valid_max_2nd_byte[0x100] = { * specific to UTF-8 and Unicode. 
*/ int -u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) +u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum) { uchar_t *ib; uchar_t *ibtail; @@ -865,7 +865,9 @@ do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; } else { + // cppcheck-suppress arrayIndexOutOfBoundsCond start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; + // cppcheck-suppress arrayIndexOutOfBoundsCond end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; } @@ -884,7 +886,7 @@ do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, * | B0| B1| ... | Bm| * +---+---+-...-+---+ * - * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). + * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH). * * (2) Canonical decomposition mappings: * @@ -1012,7 +1014,9 @@ find_composition_start(size_t uv, uchar_t *s, size_t sz) start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; } else { + // cppcheck-suppress arrayIndexOutOfBoundsCond start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; + // cppcheck-suppress arrayIndexOutOfBoundsCond end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; } @@ -1710,7 +1714,7 @@ TURN_STREAM_SAFE: } /* - * The do_norm_compare() function does string comparion based on Unicode + * The do_norm_compare() function does string comparison based on Unicode * simple case mappings and Unicode Normalization definitions. 
* * It does so by collecting a sequence of character at a time and comparing @@ -2139,13 +2143,13 @@ unicode_fini(void) module_init(unicode_init); module_exit(unicode_fini); +#endif -MODULE_DESCRIPTION("Unicode implementation"); -MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE(ZFS_META_LICENSE); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +ZFS_MODULE_DESCRIPTION("Unicode implementation"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(u8_validate); EXPORT_SYMBOL(u8_strcmp); EXPORT_SYMBOL(u8_textprep_str); -#endif diff --git a/module/unicode/uconv.c b/module/unicode/uconv.c index d812d5f969..fe84979d08 100644 --- a/module/unicode/uconv.c +++ b/module/unicode/uconv.c @@ -69,7 +69,7 @@ #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) /* Native and reversed endian macros. */ -#ifdef _BIG_ENDIAN +#ifdef _ZFS_BIG_ENDIAN #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN diff --git a/module/zcommon/Makefile.in b/module/zcommon/Makefile.in index 0ac0d43ee8..ebc5384404 100644 --- a/module/zcommon/Makefile.in +++ b/module/zcommon/Makefile.in @@ -1,18 +1,16 @@ -src = @abs_top_srcdir@/module/zcommon +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ -target_cpu = @target_cpu@ +endif MODULE := zcommon obj-$(CONFIG_ZFS) := $(MODULE).o -ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) - # Suppress unused-value warnings in sparc64 architecture headers -ifeq ($(target_cpu),sparc64) -ccflags-y += -Wno-unused-value -endif +ccflags-$(CONFIG_SPARC64) += -Wno-unused-value +$(MODULE)-objs += cityhash.o $(MODULE)-objs += zfeature_common.o $(MODULE)-objs += zfs_comutil.o $(MODULE)-objs += zfs_deleg.o @@ -21,7 +19,6 @@ $(MODULE)-objs += zfs_fletcher_superscalar.o $(MODULE)-objs += zfs_fletcher_superscalar4.o $(MODULE)-objs += 
zfs_namecheck.o $(MODULE)-objs += zfs_prop.o -$(MODULE)-objs += zfs_uio.o $(MODULE)-objs += zpool_prop.o $(MODULE)-objs += zprop_common.o diff --git a/module/zfs/cityhash.c b/module/zcommon/cityhash.c similarity index 96% rename from module/zfs/cityhash.c rename to module/zcommon/cityhash.c index 2b62edad03..413a96df2c 100644 --- a/module/zfs/cityhash.c +++ b/module/zcommon/cityhash.c @@ -22,7 +22,7 @@ * Copyright (c) 2017 by Delphix. All rights reserved. */ -#include +#include #define HASH_K1 0xb492b66fbe98f273ULL #define HASH_K2 0x9ae16a3b2f90404fULL @@ -61,3 +61,7 @@ cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) a + rotate(b + HASH_K2, 18) + c, mul)); } + +#if defined(_KERNEL) +EXPORT_SYMBOL(cityhash4); +#endif diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index dc0c1161f8..fc0e09605e 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -25,6 +25,8 @@ * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #ifndef _KERNEL @@ -36,6 +38,7 @@ #include #include #include +#include #include #include "zfeature_common.h" @@ -97,6 +100,8 @@ zfeature_is_supported(const char *guid) for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; + if (!feature->fi_zfs_mod_supported) + continue; if (strcmp(guid, feature->fi_guid) == 0) return (B_TRUE); } @@ -217,8 +222,17 @@ zfs_mod_supported_feature(const char *name) * libzpool, always supports all the features. libzfs needs to * query the running module, via sysfs, to determine which * features are supported. + * + * The equivalent _can_ be done on FreeBSD by way of the sysctl + * tree, but this has not been done yet. Therefore, we return + * that all features except edonr are supported. 
*/ -#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) +#if defined(__FreeBSD__) + if (strcmp(name, "org.illumos:edonr") == 0) + return (B_FALSE); + else + return (B_TRUE); +#elif defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) return (B_TRUE); #else return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name)); @@ -256,6 +270,19 @@ zfeature_register(spa_feature_t fid, const char *guid, const char *name, feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid); } +/* + * Every feature has a GUID of the form com.example:feature_name. The + * reversed DNS name ensures that the feature's GUID is unique across all ZFS + * implementations. This allows companies to independently develop and + * release features. Examples include org.delphix and org.datto. Previously, + * features developed on one implementation have used that implementation's + * domain name (e.g. org.illumos and org.zfsonlinux). Use of the org.openzfs + * domain name is recommended for new features which are developed by the + * OpenZFS community and its platforms. This domain may optionally be used by + * companies developing features for initial release through an OpenZFS + * implementation. Use of the org.openzfs domain requires reserving the + * feature name in advance with the OpenZFS project. 
+ */ void zpool_feature_init(void) { @@ -348,6 +375,31 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); + { + static const spa_feature_t livelist_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LIVELIST, + "com.delphix:livelist", "livelist", + "Improved clone deletion performance.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + livelist_deps); + } + + { + static const spa_feature_t log_spacemap_deps[] = { + SPA_FEATURE_SPACEMAP_V2, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LOG_SPACEMAP, + "com.delphix:log_spacemap", "log_spacemap", + "Log metaslab changes on a single spacemap and " + "flush them periodically.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + log_spacemap_deps); + } + { static const spa_feature_t large_blocks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, @@ -408,6 +460,47 @@ zpool_feature_init(void) edonr_deps); } + { + static const spa_feature_t redact_books_deps[] = { + SPA_FEATURE_BOOKMARK_V2, + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS, + "com.delphix:redaction_bookmarks", "redaction_bookmarks", + "Support for bookmarks which store redaction lists for zfs " + "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN, + redact_books_deps); + } + + { + static const spa_feature_t redact_datasets_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTED_DATASETS, + "com.delphix:redacted_datasets", "redacted_datasets", "Support for " + "redacted datasets, produced by receiving a redacted zfs send " + "stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY, + redact_datasets_deps); + } + + { + static const spa_feature_t bookmark_written_deps[] = { + SPA_FEATURE_BOOKMARK_V2, + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_NONE + }; + 
zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN, + "com.delphix:bookmark_written", "bookmark_written", + "Additional accounting, enabling the written# property " + "(space written since a bookmark), and estimates of send stream " + "sizes for incrementals from bookmarks.", + 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps); + } + zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, "com.delphix:device_removal", "device_removal", "Top-level vdevs can be removed, reducing logical pool size.", @@ -476,17 +569,35 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, project_quota_deps); } - { zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, "org.zfsonlinux:allocation_classes", "allocation_classes", "Support for separate allocation classes.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); - } zfeature_register(SPA_FEATURE_RESILVER_DEFER, "com.datto:resilver_defer", "resilver_defer", - "Support for defering new resilvers when one is already running.", + "Support for deferring new resilvers when one is already running.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_DEVICE_REBUILD, + "org.openzfs:device_rebuild", "device_rebuild", + "Support for sequential mirror/dRAID device rebuilds", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + { + static const spa_feature_t zstd_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_ZSTD_COMPRESS, + "org.freebsd:zstd_compress", "zstd_compress", + "zstd compression algorithm support.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); + } + + zfeature_register(SPA_FEATURE_DRAID, + "org.openzfs:draid", "draid", "Support for distributed spare RAID", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c index 5daa6907c5..886167759b 100644 --- a/module/zcommon/zfs_comutil.c +++ b/module/zcommon/zfs_comutil.c @@ -26,7 
+26,7 @@ /* * This file is intended for functions that ought to be common between user * land (libzfs) and the kernel. When many common routines need to be shared - * then a separate file should to be created. + * then a separate file should be created. */ #if !defined(_KERNEL) @@ -64,6 +64,37 @@ zfs_allocatable_devs(nvlist_t *nv) return (B_FALSE); } +/* + * Are there special vdevs? + */ +boolean_t +zfs_special_devs(nvlist_t *nv, char *type) +{ + char *bias; + uint_t c; + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + return (B_FALSE); + } + for (c = 0; c < children; c++) { + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0 || + strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) { + if (type != NULL && strcmp(bias, type) == 0) { + return (B_TRUE); + } else if (type == NULL) { + return (B_TRUE); + } + } + } + } + return (B_FALSE); +} + void zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp) { @@ -223,6 +254,7 @@ zfs_dataset_name_hidden(const char *name) #if defined(_KERNEL) EXPORT_SYMBOL(zfs_allocatable_devs); +EXPORT_SYMBOL(zfs_special_devs); EXPORT_SYMBOL(zpool_get_load_policy); EXPORT_SYMBOL(zfs_zpl_version_map); EXPORT_SYMBOL(zfs_spa_version_map); diff --git a/module/zcommon/zfs_deleg.c b/module/zcommon/zfs_deleg.c index 8d98f720a6..e1f5a353b7 100644 --- a/module/zcommon/zfs_deleg.c +++ b/module/zcommon/zfs_deleg.c @@ -52,7 +52,6 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_MOUNT}, {ZFS_DELEG_PERM_PROMOTE}, {ZFS_DELEG_PERM_RECEIVE}, - {ZFS_DELEG_PERM_REMAP}, {ZFS_DELEG_PERM_RENAME}, {ZFS_DELEG_PERM_ROLLBACK}, {ZFS_DELEG_PERM_SNAPSHOT}, diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 5a991ba607..7a9de4a430 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -137,6 +137,7 @@ #include #include #include +#include 
#include #include #include @@ -183,7 +184,10 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = { #if defined(__x86_64) && defined(HAVE_AVX512F) &fletcher_4_avx512f_ops, #endif -#if defined(__aarch64__) +#if defined(__x86_64) && defined(HAVE_AVX512BW) + &fletcher_4_avx512bw_ops, +#endif +#if defined(__aarch64__) && !defined(__FreeBSD__) &fletcher_4_aarch64_neon_ops, #endif }; @@ -205,21 +209,19 @@ static struct fletcher_4_impl_selector { const char *fis_name; uint32_t fis_sel; } fletcher_4_impl_selectors[] = { -#if !defined(_KERNEL) { "cycle", IMPL_CYCLE }, -#endif { "fastest", IMPL_FASTEST }, { "scalar", IMPL_SCALAR } }; #if defined(_KERNEL) static kstat_t *fletcher_4_kstat; -#endif static struct fletcher_4_kstat { uint64_t native; uint64_t byteswap; } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; +#endif /* Indicate that benchmark has been completed */ static boolean_t fletcher_4_initialized = B_FALSE; @@ -408,32 +410,36 @@ fletcher_4_impl_set(const char *val) return (err); } +/* + * Returns the Fletcher 4 operations for checksums. When a SIMD + * implementation is not allowed in the current context, then fallback + * to the fastest generic implementation. 
+ */ static inline const fletcher_4_ops_t * fletcher_4_impl_get(void) { - fletcher_4_ops_t *ops = NULL; - const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + if (!kfpu_allowed()) + return (&fletcher_4_superscalar4_ops); + + const fletcher_4_ops_t *ops = NULL; + uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); switch (impl) { case IMPL_FASTEST: ASSERT(fletcher_4_initialized); ops = &fletcher_4_fastest_impl; break; -#if !defined(_KERNEL) - case IMPL_CYCLE: { + case IMPL_CYCLE: + /* Cycle through supported implementations */ ASSERT(fletcher_4_initialized); ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); - static uint32_t cycle_count = 0; uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; ops = fletcher_4_supp_impls[idx]; - } - break; -#endif + break; default: ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); - ops = fletcher_4_supp_impls[impl]; break; } @@ -592,8 +598,9 @@ fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) } #if defined(_KERNEL) -/* Fletcher 4 kstats */ - +/* + * Fletcher 4 kstats + */ static int fletcher_4_kstat_headers(char *buf, size_t size) { @@ -653,11 +660,12 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ } -#define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ +#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */ typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); +#if defined(_KERNEL) static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { @@ -669,7 +677,6 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); - fletcher_checksum_func_t *fletcher_4_test = native ? 
fletcher_4_native : fletcher_4_byteswap; @@ -716,16 +723,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) /* restore original selection */ atomic_swap_32(&fletcher_4_impl_chosen, sel_save); } +#endif /* _KERNEL */ -void -fletcher_4_init(void) +/* + * Initialize and benchmark all supported implementations. + */ +static void +fletcher_4_benchmark(void) { - static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ fletcher_4_ops_t *curr_impl; - char *databuf; int i, c; - /* move supported impl into fletcher_4_supp_impls */ + /* Move supported implementations into fletcher_4_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; @@ -735,19 +744,10 @@ fletcher_4_init(void) membar_producer(); /* complete fletcher_4_supp_impls[] init */ fletcher_4_supp_impls_cnt = c; /* number of supported impl */ -#if !defined(_KERNEL) - /* Skip benchmarking and use last implementation as fastest */ - memcpy(&fletcher_4_fastest_impl, - fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1], - sizeof (fletcher_4_fastest_impl)); - fletcher_4_fastest_impl.name = "fastest"; - membar_producer(); +#if defined(_KERNEL) + static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ + char *databuf = vmem_alloc(data_size, KM_SLEEP); - fletcher_4_initialized = B_TRUE; - return; -#endif - /* Benchmark all supported implementations */ - databuf = vmem_alloc(data_size, KM_SLEEP); for (i = 0; i < data_size / sizeof (uint64_t); i++) ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ @@ -755,9 +755,28 @@ fletcher_4_init(void) fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); vmem_free(databuf, data_size); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. 
+ */ + memcpy(&fletcher_4_fastest_impl, + fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], + sizeof (fletcher_4_fastest_impl)); + fletcher_4_fastest_impl.name = "fastest"; + membar_producer(); +#endif /* _KERNEL */ +} + +void +fletcher_4_init(void) +{ + /* Determine the fastest available implementation. */ + fletcher_4_benchmark(); #if defined(_KERNEL) - /* install kstats for all implementations */ + /* Install kstats for all implementations */ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (fletcher_4_kstat != NULL) { @@ -866,24 +885,26 @@ zio_abd_checksum_func_t fletcher_4_abd_ops = { .acf_iter = abd_fletcher_4_iter }; - #if defined(_KERNEL) -#include + +#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") + +#if defined(__linux__) static int fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); char *fmt; - int i, cnt = 0; + int cnt = 0; /* list fastest */ - fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s "; + fmt = IMPL_FMT(impl, IMPL_FASTEST); cnt += sprintf(buffer + cnt, fmt, "fastest"); /* list all supported implementations */ - for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { - fmt = (i == impl) ? 
"[%s] " : "%s "; + for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); cnt += sprintf(buffer + cnt, fmt, fletcher_4_supp_impls[i]->name); } @@ -897,14 +918,62 @@ fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) return (fletcher_4_impl_set(val)); } +#else + +#include + +static int +fletcher_4_param(ZFS_MODULE_PARAM_ARGS) +{ + int err; + + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, + fletcher_4_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); + } + + char buf[16]; + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (-fletcher_4_impl_set(buf)); +} + +#endif + +#undef IMPL_FMT + /* * Choose a fletcher 4 implementation in ZFS. * Users can choose "cycle" to exercise all implementations, but this is * for testing purpose therefore it can only be set in user space. 
*/ -module_param_call(zfs_fletcher_4_impl, - fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); -MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); +/* BEGIN CSTYLED */ +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl, + fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW, + "Select fletcher 4 implementation."); +/* END CSTYLED */ EXPORT_SYMBOL(fletcher_init); EXPORT_SYMBOL(fletcher_2_incremental_native); diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c index bd2db2b20f..c95a716815 100644 --- a/module/zcommon/zfs_fletcher_aarch64_neon.c +++ b/module/zcommon/zfs_fletcher_aarch64_neon.c @@ -43,7 +43,7 @@ #if defined(__aarch64__) -#include +#include #include #include #include @@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16))); static boolean_t fletcher_4_aarch64_neon_valid(void) { - return (B_TRUE); + return (kfpu_allowed()); } const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = { diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c index 7260a9864b..963f089b04 100644 --- a/module/zcommon/zfs_fletcher_avx512.c +++ b/module/zcommon/zfs_fletcher_avx512.c @@ -24,14 +24,16 @@ #if defined(__x86_64) && defined(HAVE_AVX512F) -#include #include #include #include #include +#include #include +#ifdef __linux__ #define __asm __asm__ __volatile__ +#endif static void fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) @@ -157,7 +159,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap); static boolean_t fletcher_4_avx512f_valid(void) { - return (zfs_avx512f_available()); + return (kfpu_allowed() && zfs_avx512f_available()); } const fletcher_4_ops_t fletcher_4_avx512f_ops = { @@ -171,4 +173,59 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = { .name = "avx512f" }; +#if defined(HAVE_AVX512BW) +static void +fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) +{ + static const zfs_fletcher_avx512_t mask 
= { + .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, + 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, + 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, + 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B } + }; + const uint32_t *ip = buf; + const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_AVX512_RESTORE_CTX(ctx); + + __asm("vmovdqu64 %0, %%zmm5" :: "m" (mask)); + + for (; ip < ipend; ip += 8) { + __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip)); + + __asm("vpshufb %zmm5, %zmm4, %zmm4"); + + __asm("vpaddq %zmm4, %zmm0, %zmm0"); + __asm("vpaddq %zmm0, %zmm1, %zmm1"); + __asm("vpaddq %zmm1, %zmm2, %zmm2"); + __asm("vpaddq %zmm2, %zmm3, %zmm3"); + } + + FLETCHER_4_AVX512_SAVE_CTX(ctx) + + kfpu_end(); +} +STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap); + +static boolean_t +fletcher_4_avx512bw_valid(void) +{ + return (fletcher_4_avx512f_valid() && zfs_avx512bw_available()); +} + +const fletcher_4_ops_t fletcher_4_avx512bw_ops = { + .init_native = fletcher_4_avx512f_init, + .fini_native = fletcher_4_avx512f_fini, + .compute_native = fletcher_4_avx512f_native, + .init_byteswap = fletcher_4_avx512f_init, + .fini_byteswap = fletcher_4_avx512f_fini, + .compute_byteswap = fletcher_4_avx512bw_byteswap, + .valid = fletcher_4_avx512bw_valid, + .name = "avx512bw" +}; +#endif + #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */ diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c index 6dac047dad..5136a01eca 100644 --- a/module/zcommon/zfs_fletcher_intel.c +++ b/module/zcommon/zfs_fletcher_intel.c @@ -42,8 +42,8 @@ #if defined(HAVE_AVX) && defined(HAVE_AVX2) -#include #include +#include #include #include @@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_avx2_valid(void) { - return (zfs_avx_available() && zfs_avx2_available()); + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); } const fletcher_4_ops_t fletcher_4_avx2_ops = { 
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c index a0b42e5f5f..15ce9b07ff 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -43,7 +43,7 @@ #if defined(HAVE_SSE2) -#include +#include #include #include #include @@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_sse2_valid(void) { - return (zfs_sse2_available()); + return (kfpu_allowed() && zfs_sse2_available()); } const fletcher_4_ops_t fletcher_4_sse2_ops = { @@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_ssse3_valid(void) { - return (zfs_sse2_available() && zfs_ssse3_available()); + return (kfpu_allowed() && zfs_sse2_available() && + zfs_ssse3_available()); } const fletcher_4_ops_t fletcher_4_ssse3_ops = { diff --git a/module/zcommon/zfs_fletcher_superscalar.c b/module/zcommon/zfs_fletcher_superscalar.c index fbbbf80603..153f5c7d75 100644 --- a/module/zcommon/zfs_fletcher_superscalar.c +++ b/module/zcommon/zfs_fletcher_superscalar.c @@ -41,6 +41,7 @@ * SOFTWARE. */ +#include #include #include #include diff --git a/module/zcommon/zfs_fletcher_superscalar4.c b/module/zcommon/zfs_fletcher_superscalar4.c index 97fdb7b7d3..75e6a3baf9 100644 --- a/module/zcommon/zfs_fletcher_superscalar4.c +++ b/module/zcommon/zfs_fletcher_superscalar4.c @@ -41,6 +41,7 @@ * SOFTWARE. */ +#include #include #include #include diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index 58b23b0e00..7ecce451b4 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -74,7 +74,7 @@ get_dataset_depth(const char *path) /* * Keep track of nesting until you hit the end of the - * path or found the snapshot/bookmark seperator. + * path or found the snapshot/bookmark separator. 
*/ for (int i = 0; loc[i] != '\0' && loc[i] != '@' && @@ -171,7 +171,7 @@ dataset_nestcheck(const char *path) * Where each component is made up of alphanumeric characters plus the following * characters: * - * [-_.:%] + * [-_.: %] * * We allow '%' here as we use that character internally to create unique * names for temporary clones (for online recv). @@ -183,6 +183,8 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) { const char *end; + EQUIV(why == NULL, what == NULL); + /* * Make sure the name is not too long. */ @@ -232,6 +234,27 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) } } + if (*end == '\0' || *end == '/') { + int component_length = end - start; + /* Validate the contents of this component is not '.' */ + if (component_length == 1) { + if (start[0] == '.') { + if (why) + *why = NAME_ERR_SELF_REF; + return (-1); + } + } + + /* Validate the content of this component is not '..' */ + if (component_length == 2) { + if (start[0] == '.' 
&& start[1] == '.') { + if (why) + *why = NAME_ERR_PARENT_REF; + return (-1); + } + } + } + /* Snapshot or bookmark delimiter found */ if (*end == '@' || *end == '#') { /* Multiple delimiters are not allowed */ @@ -289,6 +312,44 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what) return (ret); } +/* + * Assert path is a valid bookmark name + */ +int +bookmark_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + int ret = entity_namecheck(path, why, what); + + if (ret == 0 && strchr(path, '#') == NULL) { + if (why != NULL) { + *why = NAME_ERR_NO_POUND; + *what = '#'; + } + return (-1); + } + + return (ret); +} + +/* + * Assert path is a valid snapshot name + */ +int +snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + int ret = entity_namecheck(path, why, what); + + if (ret == 0 && strchr(path, '@') == NULL) { + if (why != NULL) { + *why = NAME_ERR_NO_AT; + *what = '@'; + } + return (-1); + } + + return (ret); +} + /* * mountpoint names must be of the following form: * @@ -381,29 +442,26 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); } - if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) { - if (why) - *why = NAME_ERR_DISKLIKE; - return (-1); - } - return (0); } -#if defined(_KERNEL) +EXPORT_SYMBOL(entity_namecheck); EXPORT_SYMBOL(pool_namecheck); EXPORT_SYMBOL(dataset_namecheck); +EXPORT_SYMBOL(bookmark_namecheck); +EXPORT_SYMBOL(snapshot_namecheck); EXPORT_SYMBOL(zfs_component_namecheck); EXPORT_SYMBOL(dataset_nestcheck); EXPORT_SYMBOL(get_dataset_depth); EXPORT_SYMBOL(zfs_max_dataset_nesting); -module_param(zfs_max_dataset_nesting, int, 0644); -MODULE_PARM_DESC(zfs_max_dataset_nesting, "Maximum depth of nested datasets"); -#endif 
+ZFS_MODULE_PARAM(zfs, zfs_, max_dataset_nesting, INT, ZMOD_RW, + "Limit to the amount of nesting a path can have. Defaults to 50."); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index dab749138a..d173219908 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -20,9 +20,11 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright 2016, Joyent, Inc. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ /* Portions Copyright 2010 Robert Milkowski */ @@ -81,7 +83,10 @@ zfs_prop_init(void) { "noparity", ZIO_CHECKSUM_NOPARITY }, { "sha512", ZIO_CHECKSUM_SHA512 }, { "skein", ZIO_CHECKSUM_SKEIN }, +#if !defined(__FreeBSD__) + { "edonr", ZIO_CHECKSUM_EDONR }, +#endif { NULL } }; @@ -98,8 +103,11 @@ zfs_prop_init(void) { "skein", ZIO_CHECKSUM_SKEIN }, { "skein,verify", ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, +#if !defined(__FreeBSD__) + { "edonr,verify", ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, +#endif { NULL } }; @@ -119,6 +127,87 @@ zfs_prop_init(void) { "gzip-9", ZIO_COMPRESS_GZIP_9 }, { "zle", ZIO_COMPRESS_ZLE }, { "lz4", ZIO_COMPRESS_LZ4 }, + { "zstd", ZIO_COMPRESS_ZSTD }, + { "zstd-fast", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_DEFAULT) }, + + /* + * ZSTD 1-19 are synthetic. We store the compression level in a + * separate hidden property to avoid wasting a large amount of + * space in the ZIO_COMPRESS enum. + * + * The compression level is also stored within the header of the + * compressed block since we may need it for later recompression + * to avoid checksum errors (L2ARC). + * + * Note that the level here is defined as bit shifted mask on + * top of the method. 
+ */ + { "zstd-1", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_1) }, + { "zstd-2", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_2) }, + { "zstd-3", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_3) }, + { "zstd-4", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_4) }, + { "zstd-5", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_5) }, + { "zstd-6", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_6) }, + { "zstd-7", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_7) }, + { "zstd-8", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_8) }, + { "zstd-9", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_9) }, + { "zstd-10", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_10) }, + { "zstd-11", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_11) }, + { "zstd-12", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_12) }, + { "zstd-13", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_13) }, + { "zstd-14", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_14) }, + { "zstd-15", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_15) }, + { "zstd-16", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_16) }, + { "zstd-17", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_17) }, + { "zstd-18", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_18) }, + { "zstd-19", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_19) }, + + /* + * The ZSTD-Fast levels are also synthetic. 
+ */ + { "zstd-fast-1", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1) }, + { "zstd-fast-2", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_2) }, + { "zstd-fast-3", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_3) }, + { "zstd-fast-4", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_4) }, + { "zstd-fast-5", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_5) }, + { "zstd-fast-6", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_6) }, + { "zstd-fast-7", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_7) }, + { "zstd-fast-8", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_8) }, + { "zstd-fast-9", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_9) }, + { "zstd-fast-10", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_10) }, + { "zstd-fast-20", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_20) }, + { "zstd-fast-30", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_30) }, + { "zstd-fast-40", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_40) }, + { "zstd-fast-50", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_50) }, + { "zstd-fast-60", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_60) }, + { "zstd-fast-70", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_70) }, + { "zstd-fast-80", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_80) }, + { "zstd-fast-90", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_90) }, + { "zstd-fast-100", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_100) }, + { "zstd-fast-500", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_500) }, + { "zstd-fast-1000", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1000) }, { NULL } }; @@ -154,11 +243,21 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t acl_mode_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "groupmask", ZFS_ACL_GROUPMASK }, + { "passthrough", ZFS_ACL_PASSTHROUGH }, + { "restricted", ZFS_ACL_RESTRICTED }, + { NULL } + }; + static zprop_index_t acltype_table[] = { { "off", ZFS_ACLTYPE_OFF }, - { "disabled", ZFS_ACLTYPE_OFF }, - { "noacl", ZFS_ACLTYPE_OFF }, - { "posixacl", ZFS_ACLTYPE_POSIXACL }, + { "posix", ZFS_ACLTYPE_POSIX }, + { "nfsv4", ZFS_ACLTYPE_NFSV4 }, + { "disabled", ZFS_ACLTYPE_OFF }, /* bkwrd 
compatibility */ + { "noacl", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */ + { "posixacl", ZFS_ACLTYPE_POSIX }, /* bkwrd compatibility */ { NULL } }; @@ -297,26 +396,48 @@ zfs_prop_init(void) zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | fletcher2 | fletcher4 | sha256 | sha512 | " - "skein | edonr", "CHECKSUM", checksum_table); +#if !defined(__FreeBSD__) + "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein" + " | edonr", +#else + "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein", +#endif + "CHECKSUM", checksum_table); zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | verify | sha256[,verify], sha512[,verify], " - "skein[,verify], edonr,verify", "DEDUP", dedup_table); + "on | off | verify | sha256[,verify] | sha512[,verify] | " +#if !defined(__FreeBSD__) + "skein[,verify] | edonr,verify", +#else + "skein[,verify]", +#endif + "DEDUP", dedup_table); zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", "COMPRESS", - compress_table); + "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4 | " + "zstd | zstd-[1-19] | " + "zstd-fast | zstd-fast-[1-10,20,30,40,50,60,70,80,90,100,500,1000]", + "COMPRESS", compress_table); zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); zprop_register_index(ZFS_PROP_SNAPDEV, "snapdev", ZFS_SNAPDEV_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "hidden | visible", "SNAPDEV", snapdev_table); - zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", ZFS_ACLTYPE_OFF, + zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "discard | groupmask | passthrough | 
restricted", "ACLMODE", + acl_mode_table); + zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", +#ifdef __linux__ + /* Linux doesn't natively support ZFS's NFSv4-style ACLs. */ + ZFS_ACLTYPE_OFF, +#else + ZFS_ACLTYPE_NFSV4, +#endif PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "noacl | posixacl", "ACLTYPE", acltype_table); + "off | nfsv4 | posix", "ACLTYPE", acltype_table); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", @@ -363,14 +484,19 @@ zfs_prop_init(void) zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", boolean_table); +#ifdef __FreeBSD__ + zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); +#else zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); +#endif zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", boolean_table); - zprop_register_index(ZFS_PROP_OVERLAY, "overlay", 0, PROP_INHERIT, + zprop_register_index(ZFS_PROP_OVERLAY, "overlay", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "OVERLAY", boolean_table); /* default index properties */ @@ -425,14 +551,14 @@ zfs_prop_init(void) PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", "MOUNTPOINT"); zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | NFS share options", "SHARENFS"); zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "filesystem | volume | snapshot | bookmark", 
"TYPE"); zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "on | off | sharemgr(1M) options", "SHARESMB"); + "on | off | SMB share options", "SHARESMB"); zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); @@ -457,7 +583,11 @@ zfs_prop_init(void) "ENCROOT"); zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation", "none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "prompt | ", "KEYLOCATION"); + "prompt | | | ", "KEYLOCATION"); + zprop_register_string(ZFS_PROP_REDACT_SNAPS, + "redact_snaps", NULL, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "[,...]", + "RSNAPS"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, @@ -465,9 +595,10 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, - PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", + "REFER"); zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, - PROP_READONLY, ZFS_TYPE_DATASET, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<1.00x or higher if compressed>", "RATIO"); zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, @@ -495,7 +626,8 @@ zfs_prop_init(void) PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "LUSED"); zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", - 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); + 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", + "LREFER"); zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "", "FSCOUNT"); @@ -506,8 +638,6 @@ zfs_prop_init(void) ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "GUID"); 
zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATETXG"); - zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); zprop_register_number(ZFS_PROP_PBKDF2_ITERS, "pbkdf2iters", 0, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "PBKDF2ITERS"); @@ -540,7 +670,7 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS"); + "zero or 512 to 1M, power of 2", "SPECIAL_SMALL_BLOCKS"); /* hidden properties */ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, @@ -569,13 +699,16 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT"); zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID"); + zprop_register_hidden(ZFS_PROP_REDACTED, "redacted", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "REDACTED"); /* - * Property to be removed once libbe is integrated + * Properties that are obsolete and not used. These are retained so + * that we don't have to change the values of the zfs_prop_t enum, or + * have NULL pointers in the zfs_prop_table[]. 
*/ - zprop_register_hidden(ZFS_PROP_PRIVATE, "priv_prop", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_FILESYSTEM, - "PRIV_PROP"); + zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); /* oddball properties */ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, @@ -668,8 +801,10 @@ zfs_prop_userquota(const char *name) boolean_t zfs_prop_written(const char *name) { - static const char *prefix = "written@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); + static const char *prop_prefix = "written@"; + static const char *book_prefix = "written#"; + return (strncmp(name, prop_prefix, strlen(prop_prefix)) == 0 || + strncmp(name, book_prefix, strlen(book_prefix)) == 0); } /* @@ -801,12 +936,17 @@ zfs_prop_valid_keylocation(const char *str, boolean_t encrypted) return (B_TRUE); else if (strlen(str) > 8 && strncmp("file:///", str, 8) == 0) return (B_TRUE); + else if (strlen(str) > 8 && strncmp("https://", str, 8) == 0) + return (B_TRUE); + else if (strlen(str) > 7 && strncmp("http://", str, 7) == 0) + return (B_TRUE); return (B_FALSE); } #ifndef _KERNEL +#include /* * Returns a string describing the set of acceptable values for the given @@ -853,10 +993,23 @@ zfs_prop_align_right(zfs_prop_t prop) #endif #if defined(_KERNEL) + +#include + +#if defined(HAVE_KERNEL_FPU_INTERNAL) +union fpregs_state **zfs_kfpu_fpregs; +EXPORT_SYMBOL(zfs_kfpu_fpregs); +#endif /* HAVE_KERNEL_FPU_INTERNAL */ + static int __init zcommon_init(void) { + int error = kfpu_init(); + if (error) + return (error); + fletcher_4_init(); + return (0); } @@ -864,15 +1017,18 @@ static void __exit zcommon_fini(void) { fletcher_4_fini(); + kfpu_fini(); } -module_init(zcommon_init); +module_init_early(zcommon_init); module_exit(zcommon_fini); -MODULE_DESCRIPTION("Generic ZFS support"); -MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE(ZFS_META_LICENSE); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +#endif + 
+ZFS_MODULE_DESCRIPTION("Generic ZFS support"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); /* zfs dataset property functions */ EXPORT_SYMBOL(zfs_userquota_prop_prefixes); @@ -898,5 +1054,3 @@ EXPORT_SYMBOL(zfs_prop_index_to_string); EXPORT_SYMBOL(zfs_prop_string_to_index); EXPORT_SYMBOL(zfs_prop_valid_for_type); EXPORT_SYMBOL(zfs_prop_written); - -#endif diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c deleted file mode 100644 index c1e31f51be..0000000000 --- a/module/zcommon/zfs_uio.c +++ /dev/null @@ -1,278 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ - -/* - * University Copyright- Copyright (c) 1982, 1986, 1988 - * The Regents of the University of California - * All Rights Reserved - * - * University Acknowledgment- Portions of this document are derived from - * software developed by the University of California, Berkeley, and its - * contributors. 
- */ -/* - * Copyright (c) 2015 by Chunwei Chen. All rights reserved. - */ - -/* - * The uio support from OpenSolaris has been added as a short term - * work around. The hope is to adopt native Linux type and drop the - * use of uio's entirely. Under Linux they only add overhead and - * when possible we want to use native APIs for the ZPL layer. - */ -#ifdef _KERNEL - -#include -#include -#include -#include -#include -#include - -/* - * Move "n" bytes at byte address "p"; "rw" indicates the direction - * of the move, and the I/O parameters are provided in "uio", which is - * update to reflect the data which was moved. Returns 0 on success or - * a non-zero errno on failure. - */ -static int -uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) -{ - const struct iovec *iov = uio->uio_iov; - size_t skip = uio->uio_skip; - ulong_t cnt; - - while (n && uio->uio_resid) { - cnt = MIN(iov->iov_len - skip, n); - switch (uio->uio_segflg) { - case UIO_USERSPACE: - case UIO_USERISPACE: - /* - * p = kernel data pointer - * iov->iov_base = user data pointer - */ - if (rw == UIO_READ) { - if (copy_to_user(iov->iov_base+skip, p, cnt)) - return (EFAULT); - } else { - if (uio->uio_fault_disable) { - if (!zfs_access_ok(VERIFY_READ, - (iov->iov_base + skip), cnt)) { - return (EFAULT); - } - pagefault_disable(); - if (__copy_from_user_inatomic(p, - (iov->iov_base + skip), cnt)) { - pagefault_enable(); - return (EFAULT); - } - pagefault_enable(); - } else { - if (copy_from_user(p, - (iov->iov_base + skip), cnt)) - return (EFAULT); - } - } - break; - case UIO_SYSSPACE: - if (rw == UIO_READ) - bcopy(p, iov->iov_base + skip, cnt); - else - bcopy(iov->iov_base + skip, p, cnt); - break; - default: - ASSERT(0); - } - skip += cnt; - if (skip == iov->iov_len) { - skip = 0; - uio->uio_iov = (++iov); - uio->uio_iovcnt--; - } - uio->uio_skip = skip; - uio->uio_resid -= cnt; - uio->uio_loffset += cnt; - p = (caddr_t)p + cnt; - n -= cnt; - } - return (0); -} - -static int 
-uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio) -{ - const struct bio_vec *bv = uio->uio_bvec; - size_t skip = uio->uio_skip; - ulong_t cnt; - - while (n && uio->uio_resid) { - void *paddr; - cnt = MIN(bv->bv_len - skip, n); - - paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1); - if (rw == UIO_READ) - bcopy(p, paddr + bv->bv_offset + skip, cnt); - else - bcopy(paddr + bv->bv_offset + skip, p, cnt); - zfs_kunmap_atomic(paddr, KM_USER1); - - skip += cnt; - if (skip == bv->bv_len) { - skip = 0; - uio->uio_bvec = (++bv); - uio->uio_iovcnt--; - } - uio->uio_skip = skip; - uio->uio_resid -= cnt; - uio->uio_loffset += cnt; - p = (caddr_t)p + cnt; - n -= cnt; - } - return (0); -} - -int -uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) -{ - if (uio->uio_segflg != UIO_BVEC) - return (uiomove_iov(p, n, rw, uio)); - else - return (uiomove_bvec(p, n, rw, uio)); -} -EXPORT_SYMBOL(uiomove); - -#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) - -/* - * Fault in the pages of the first n bytes specified by the uio structure. - * 1 byte in each page is touched and the uio struct is unmodified. Any - * error will terminate the process as this is only a best attempt to get - * the pages resident. - */ -int -uio_prefaultpages(ssize_t n, struct uio *uio) -{ - const struct iovec *iov; - ulong_t cnt, incr; - caddr_t p; - uint8_t tmp; - int iovcnt; - size_t skip; - - /* no need to fault in kernel pages */ - switch (uio->uio_segflg) { - case UIO_SYSSPACE: - case UIO_BVEC: - return (0); - case UIO_USERSPACE: - case UIO_USERISPACE: - break; - default: - ASSERT(0); - } - - iov = uio->uio_iov; - iovcnt = uio->uio_iovcnt; - skip = uio->uio_skip; - - for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { - cnt = MIN(iov->iov_len - skip, n); - /* empty iov */ - if (cnt == 0) - continue; - n -= cnt; - /* - * touch each page in this segment. 
- */ - p = iov->iov_base + skip; - while (cnt) { - if (fuword8((uint8_t *)p, &tmp)) - return (EFAULT); - incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* - * touch the last byte in case it straddles a page. - */ - p--; - if (fuword8((uint8_t *)p, &tmp)) - return (EFAULT); - } - - return (0); -} -EXPORT_SYMBOL(uio_prefaultpages); - -/* - * same as uiomove() but doesn't modify uio structure. - * return in cbytes how many bytes were copied. - */ -int -uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) -{ - struct uio uio_copy; - int ret; - - bcopy(uio, &uio_copy, sizeof (struct uio)); - ret = uiomove(p, n, rw, &uio_copy); - *cbytes = uio->uio_resid - uio_copy.uio_resid; - return (ret); -} -EXPORT_SYMBOL(uiocopy); - -/* - * Drop the next n chars out of *uiop. - */ -void -uioskip(uio_t *uiop, size_t n) -{ - if (n > uiop->uio_resid) - return; - - uiop->uio_skip += n; - if (uiop->uio_segflg != UIO_BVEC) { - while (uiop->uio_iovcnt && - uiop->uio_skip >= uiop->uio_iov->iov_len) { - uiop->uio_skip -= uiop->uio_iov->iov_len; - uiop->uio_iov++; - uiop->uio_iovcnt--; - } - } else { - while (uiop->uio_iovcnt && - uiop->uio_skip >= uiop->uio_bvec->bv_len) { - uiop->uio_skip -= uiop->uio_bvec->bv_len; - uiop->uio_bvec++; - uiop->uio_iovcnt--; - } - } - uiop->uio_loffset += n; - uiop->uio_resid -= n; -} -EXPORT_SYMBOL(uioskip); -#endif /* _KERNEL */ diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index ac1c42b3f0..6299d371f2 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -22,6 +22,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2021, Colm Buckley */ #include @@ -71,6 +72,9 @@ zpool_prop_init(void) PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE"); zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT"); + zprop_register_string(ZPOOL_PROP_COMPATIBILITY, "compatibility", + "off", PROP_DEFAULT, ZFS_TYPE_POOL, + " | off | legacy", "COMPATIBILITY"); /* readonly number properties */ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, @@ -104,8 +108,6 @@ zpool_prop_init(void) /* default number properties */ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); - zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "", "ASHIFT"); @@ -131,7 +133,7 @@ zpool_prop_init(void) ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, "wait | continue | panic", "FAILMODE", failuremode_table); zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim", - SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL, + SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "AUTOTRIM", boolean_table); /* hidden properties */ @@ -143,6 +145,8 @@ zpool_prop_init(void) PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE"); + zprop_register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", + PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO"); } /* @@ -156,7 +160,7 @@ zpool_name_to_prop(const char *propname) /* * Given a pool property ID, returns the corresponding name. - * Assuming the pool propety ID is valid. + * Assuming the pool property ID is valid. 
*/ const char * zpool_prop_to_name(zpool_prop_t prop) @@ -235,6 +239,7 @@ zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) } #ifndef _KERNEL +#include const char * zpool_prop_values(zpool_prop_t prop) diff --git a/module/zcommon/zprop_common.c b/module/zcommon/zprop_common.c index 8416983fd9..faab9d9a74 100644 --- a/module/zcommon/zprop_common.c +++ b/module/zcommon/zprop_common.c @@ -41,11 +41,7 @@ #include "zfs_prop.h" #include "zfs_deleg.h" -#if defined(_KERNEL) -#include -#define qsort(base, num, size, cmp) \ - sort(base, num, size, cmp, NULL) -#else +#if !defined(_KERNEL) #include #include #include @@ -77,8 +73,11 @@ zfs_mod_supported_prop(const char *name, zfs_type_t type) * The zfs module spa_feature_table[], whether in-kernel or in libzpool, * always supports all the properties. libzfs needs to query the running * module, via sysfs, to determine which properties are supported. + * + * The equivalent _can_ be done on FreeBSD by way of the sysctl + * tree, but this has not been done yet. */ -#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) +#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) return (B_TRUE); #else return (zfs_mod_supported(type == ZFS_TYPE_POOL ? 
@@ -144,7 +143,7 @@ zprop_register_index(int prop, const char *name, uint64_t def, const char *colname, const zprop_index_t *idx_tbl) { zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, - objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl); + objset_types, values, colname, B_FALSE, B_TRUE, idx_tbl); } void diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index b2460f0d65..653ea0da9b 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -1,20 +1,17 @@ -src = @abs_top_srcdir@/module/zfs +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ obj = @abs_builddir@ -target_cpu = @target_cpu@ +mfdir = $(obj) +else +mfdir = $(srctree)/$(src) +endif MODULE := zfs obj-$(CONFIG_ZFS) := $(MODULE).o -ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) - # Suppress unused-value warnings in sparc64 architecture headers -ifeq ($(target_cpu),sparc64) -ccflags-y += -Wno-unused-value -endif - -# Suppress unused but set variable warnings often due to ASSERTs -ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE) +ccflags-$(CONFIG_SPARC64) += -Wno-unused-value $(MODULE)-objs += abd.o $(MODULE)-objs += aggsum.o @@ -22,12 +19,12 @@ $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o $(MODULE)-objs += bpobj.o -$(MODULE)-objs += cityhash.o -$(MODULE)-objs += dbuf.o -$(MODULE)-objs += dbuf_stats.o $(MODULE)-objs += bptree.o +$(MODULE)-objs += btree.o $(MODULE)-objs += bqueue.o $(MODULE)-objs += dataset_kstats.o +$(MODULE)-objs += dbuf.o +$(MODULE)-objs += dbuf_stats.o $(MODULE)-objs += ddt.o $(MODULE)-objs += ddt_zap.o $(MODULE)-objs += dmu.o @@ -35,33 +32,36 @@ $(MODULE)-objs += dmu_diff.o $(MODULE)-objs += dmu_object.o $(MODULE)-objs += dmu_objset.o $(MODULE)-objs += dmu_recv.o +$(MODULE)-objs += dmu_redact.o $(MODULE)-objs += dmu_send.o $(MODULE)-objs += dmu_traverse.o $(MODULE)-objs += dmu_tx.o $(MODULE)-objs += dmu_zfetch.o $(MODULE)-objs += dnode.o $(MODULE)-objs += dnode_sync.o +$(MODULE)-objs += dsl_bookmark.o +$(MODULE)-objs 
+= dsl_crypt.o $(MODULE)-objs += dsl_dataset.o $(MODULE)-objs += dsl_deadlist.o $(MODULE)-objs += dsl_deleg.o -$(MODULE)-objs += dsl_bookmark.o +$(MODULE)-objs += dsl_destroy.o $(MODULE)-objs += dsl_dir.o -$(MODULE)-objs += dsl_crypt.o $(MODULE)-objs += dsl_pool.o $(MODULE)-objs += dsl_prop.o $(MODULE)-objs += dsl_scan.o $(MODULE)-objs += dsl_synctask.o +$(MODULE)-objs += dsl_userhold.o $(MODULE)-objs += edonr_zfs.o $(MODULE)-objs += fm.o $(MODULE)-objs += gzip.o $(MODULE)-objs += hkdf.o -$(MODULE)-objs += lzjb.o $(MODULE)-objs += lz4.o +$(MODULE)-objs += lzjb.o $(MODULE)-objs += metaslab.o $(MODULE)-objs += mmp.o $(MODULE)-objs += multilist.o +$(MODULE)-objs += objlist.o $(MODULE)-objs += pathname.o -$(MODULE)-objs += policy.o $(MODULE)-objs += range_tree.o $(MODULE)-objs += refcount.o $(MODULE)-objs += rrwlock.o @@ -74,18 +74,18 @@ $(MODULE)-objs += spa_checkpoint.o $(MODULE)-objs += spa_config.o $(MODULE)-objs += spa_errlog.o $(MODULE)-objs += spa_history.o +$(MODULE)-objs += spa_log_spacemap.o $(MODULE)-objs += spa_misc.o $(MODULE)-objs += spa_stats.o $(MODULE)-objs += space_map.o $(MODULE)-objs += space_reftree.o $(MODULE)-objs += txg.o -$(MODULE)-objs += trace.o $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o -$(MODULE)-objs += vdev_disk.o -$(MODULE)-objs += vdev_file.o +$(MODULE)-objs += vdev_draid.o +$(MODULE)-objs += vdev_draid_rand.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o @@ -97,6 +97,7 @@ $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_rebuild.o $(MODULE)-objs += vdev_removal.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += vdev_trim.o @@ -107,47 +108,30 @@ $(MODULE)-objs += zcp.o $(MODULE)-objs += zcp_get.o $(MODULE)-objs += zcp_global.o $(MODULE)-objs += zcp_iter.o +$(MODULE)-objs += zcp_set.o 
$(MODULE)-objs += zcp_synctask.o $(MODULE)-objs += zfeature.o -$(MODULE)-objs += zfs_acl.o $(MODULE)-objs += zfs_byteswap.o -$(MODULE)-objs += zfs_ctldir.o -$(MODULE)-objs += zfs_debug.o -$(MODULE)-objs += zfs_dir.o $(MODULE)-objs += zfs_fm.o $(MODULE)-objs += zfs_fuid.o $(MODULE)-objs += zfs_ioctl.o $(MODULE)-objs += zfs_log.o $(MODULE)-objs += zfs_onexit.o +$(MODULE)-objs += zfs_quota.o $(MODULE)-objs += zfs_ratelimit.o $(MODULE)-objs += zfs_replay.o $(MODULE)-objs += zfs_rlock.o $(MODULE)-objs += zfs_sa.o -$(MODULE)-objs += zfs_sysfs.o -$(MODULE)-objs += zfs_vfsops.o $(MODULE)-objs += zfs_vnops.o -$(MODULE)-objs += zfs_znode.o $(MODULE)-objs += zil.o $(MODULE)-objs += zio.o $(MODULE)-objs += zio_checksum.o $(MODULE)-objs += zio_compress.o -$(MODULE)-objs += zio_crypt.o $(MODULE)-objs += zio_inject.o $(MODULE)-objs += zle.o -$(MODULE)-objs += zpl_ctldir.o -$(MODULE)-objs += zpl_export.o -$(MODULE)-objs += zpl_file.o -$(MODULE)-objs += zpl_inode.o -$(MODULE)-objs += zpl_super.o -$(MODULE)-objs += zpl_xattr.o $(MODULE)-objs += zrlock.o $(MODULE)-objs += zthr.o $(MODULE)-objs += zvol.o -$(MODULE)-objs += dsl_destroy.o -$(MODULE)-objs += dsl_userhold.o -$(MODULE)-objs += qat.o -$(MODULE)-objs += qat_compress.o -$(MODULE)-objs += qat_crypt.o # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. 
@@ -162,3 +146,12 @@ $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o + +$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o +$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o + +ifeq ($(CONFIG_ALTIVEC),y) +$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec +endif + +include $(mfdir)/../os/linux/zfs/Makefile diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 9041bd8b18..bf39cd6133 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -59,31 +59,6 @@ * +----------------->| chunk N-1 | * +-----------+ * - * Linear buffers act exactly like normal buffers and are always mapped into the - * kernel's virtual memory space, while scattered ABD data chunks are allocated - * as physical pages and then mapped in only while they are actually being - * accessed through one of the abd_* library functions. Using scattered ABDs - * provides several benefits: - * - * (1) They avoid use of kmem_*, preventing performance problems where running - * kmem_reap on very large memory systems never finishes and causes - * constant TLB shootdowns. - * - * (2) Fragmentation is less of an issue since when we are at the limit of - * allocatable space, we won't have to search around for a long free - * hole in the VA space for large ARC allocations. Each chunk is mapped in - * individually, so even if we weren't using segkpm (see next point) we - * wouldn't need to worry about finding a contiguous address range. - * - * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs - * on each ABD access. (If segkpm isn't available then we use all linear - * ABDs to avoid this penalty.) See seg_kpm.c for more details. - * - * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to - * B_FALSE. 
However, it is not possible to use scattered ABDs if segkpm is not - * available, which is the case on all 32-bit systems and any 64-bit systems - * where kpm_enable is turned off. - * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset * within an existing ABD. In linear buffers this is simple (set abd_buf of @@ -112,490 +87,91 @@ * compare, copy, read, write, and fill with zeroes. If you need a custom * function which progressively accesses the whole ABD, use the abd_iterate_* * functions. + * + * As an additional feature, linear and scatter ABD's can be stitched together + * by using the gang ABD type (abd_alloc_gang_abd()). This allows for + * multiple ABDs to be viewed as a singular ABD. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. */ -#include +#include #include #include #include #include -#ifdef _KERNEL -#include -#include -#else -#define MAX_ORDER 1 -#endif - -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_scatter_orders[MAX_ORDER]; - kstat_named_t abdstat_scatter_page_multi_chunk; - kstat_named_t abdstat_scatter_page_multi_zone; - kstat_named_t abdstat_scatter_page_alloc_retry; - kstat_named_t abdstat_scatter_sg_table_retry; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. 
- */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. - */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of compound allocations of a given order. These - * allocations are spread over all currently allocated ABDs, and - * act as a measure of memory fragmentation. - */ - { { "scatter_order_N", KSTAT_DATA_UINT64 } }, - /* - * The number of scatter ABDs which contain multiple chunks. - * ABDs are preferentially allocated from the minimum number of - * contiguous multi-page chunks, a single chunk is optimal. - */ - { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are split across memory zones. - * ABDs are preferentially allocated using pages from a single zone. - */ - { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, - /* - * The total number of retries encountered when attempting to - * allocate the pages to populate the scatter ABD. - */ - { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, - /* - * The total number of retries encountered when attempting to - * allocate the sg table for an ABD. 
- */ - { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) -#define abd_for_each_sg(abd, sg, n, i) \ - for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; -unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; - -/* - * zfs_abd_scatter_min_size is the minimum allocation size to use scatter - * ABD's. Smaller allocations will use linear ABD's which uses - * zio_[data_]buf_alloc(). - * - * Scatter ABD's use at least one page each, so sub-page allocations waste - * some space when allocated as scatter (e.g. 2KB scatter allocation wastes - * half of each page). Using linear ABD's for small allocations means that - * they will be put on slabs which contain many allocations. This can - * improve memory efficiency, but it also makes it much harder for ARC - * evictions to actually free pages, because all the buffers on one slab need - * to be freed in order for the slab (and underlying pages) to be freed. - * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's - * possible for them to actually waste more memory than scatter (one page per - * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). - * - * Spill blocks are typically 512B and are heavily used on systems running - * selinux with the default dnode size and the `xattr=sa` property set. - * - * By default we use linear allocations for 512B and 1KB, and scatter - * allocations for larger (1.5KB and up). 
- */ -int zfs_abd_scatter_min_size = 512 * 3; - -static kmem_cache_t *abd_cache = NULL; -static kstat_t *abd_ksp; - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); -} - -#ifdef _KERNEL -#ifndef CONFIG_HIGHMEM - -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - -static unsigned long -abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order) -{ - struct page *page; - - page = alloc_pages_node(nid, gfp, order); - if (!page) - return (0); - - return ((unsigned long) page_address(page)); -} - -/* - * The goal is to minimize fragmentation by preferentially populating ABDs - * with higher order compound pages from a single zone. Allocation size is - * progressively decreased until it can be satisfied without performing - * reclaim or compaction. When necessary this function will degenerate to - * allocating individual pages and allowing reclaim to satisfy allocations. - */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct list_head pages; - struct sg_table table; - struct scatterlist *sg; - struct page *page, *tmp_page = NULL; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); - int nr_pages = abd_chunkcnt_for_bytes(size); - int chunks = 0, zones = 0; - size_t remaining_size; - int nid = NUMA_NO_NODE; - int alloc_pages = 0; - int order; - - INIT_LIST_HEAD(&pages); - - while (alloc_pages < nr_pages) { - unsigned long paddr; - unsigned chunk_pages; - - order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); - chunk_pages = (1U << order); - - paddr = abd_alloc_chunk(nid, order ? 
gfp_comp : gfp, order); - if (paddr == 0) { - if (order == 0) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } else { - max_order = MAX(0, order - 1); - } - continue; - } - - page = virt_to_page(paddr); - list_add_tail(&page->lru, &pages); - - if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) - zones++; - - nid = page_to_nid(page); - ABDSTAT_BUMP(abdstat_scatter_orders[order]); - chunks++; - alloc_pages += chunk_pages; - } - - ASSERT3S(alloc_pages, ==, nr_pages); - - while (sg_alloc_table(&table, chunks, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - sg = table.sgl; - remaining_size = size; - list_for_each_entry_safe(page, tmp_page, &pages, lru) { - size_t sg_size = MIN(PAGESIZE << compound_order(page), - remaining_size); - sg_set_page(sg, page, sg_size, 0); - remaining_size -= sg_size; - - sg = sg_next(sg); - list_del(&page->lru); - } - - if (chunks > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - - if (zones) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); - abd->abd_flags |= ABD_FLAG_MULTI_ZONE; - } - } - - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = table.nents; -} -#else -/* - * Allocate N individual pages to construct a scatter ABD. This function - * makes no attempt to request contiguous pages and requires the minimal - * number of kernel interfaces. It's designed for maximum compatibility. 
- */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - int nr_pages = abd_chunkcnt_for_bytes(size); - int i = 0; - - while (sg_alloc_table(&table, nr_pages, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - ASSERT3U(table.nents, ==, nr_pages); - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = nr_pages; - - abd_for_each_sg(abd, sg, nr_pages, i) { - while ((page = __page_cache_alloc(gfp)) == NULL) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } - - ABDSTAT_BUMP(abdstat_scatter_orders[0]); - sg_set_page(sg, page, PAGESIZE, 0); - } - - if (nr_pages > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - } -} -#endif /* !CONFIG_HIGHMEM */ - -static void -abd_free_pages(abd_t *abd) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - int nr_pages = ABD_SCATTER(abd).abd_nents; - int order, i = 0; - - if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); - - if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - - abd_for_each_sg(abd, sg, nr_pages, i) { - page = sg_page(sg); - order = compound_order(page); - __free_pages(page, order); - ASSERT3U(sg->length, <=, PAGE_SIZE << order); - ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); - } - - table.sgl = ABD_SCATTER(abd).abd_sgl; - table.nents = table.orig_nents = nr_pages; - sg_free_table(&table); -} - -#else /* _KERNEL */ - -#ifndef PAGE_SHIFT -#define PAGE_SHIFT (highbit64(PAGESIZE)-1) -#endif - -struct page; - -#define kpm_enable 1 -#define abd_alloc_chunk(o) \ - ((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP)) -#define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o)) -#define 
zfs_kmap_atomic(chunk, km) ((void *)chunk) -#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) -#define local_irq_save(flags) do { (void)(flags); } while (0) -#define local_irq_restore(flags) do { (void)(flags); } while (0) -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page *page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); -} - -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; - - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); - - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = abd_alloc_chunk(0); - sg_set_page(sg, p, PAGESIZE, 0); - } - ABD_SCATTER(abd).abd_nents = nr_pages; -} - -static void -abd_free_pages(abd_t *abd) -{ - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - int j; - - abd_for_each_sg(abd, sg, n, i) { - for (j = 0; j < sg->length; j += PAGESIZE) { - struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT); - abd_free_chunk(p, 0); - } - } - - vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); -} - -#endif /* _KERNEL */ void -abd_init(void) -{ - int i; - - abd_cache = 
kmem_cache_create("abd_t", sizeof (abd_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - - for (i = 0; i < MAX_ORDER; i++) { - snprintf(abd_stats.abdstat_scatter_orders[i].name, - KSTAT_STRLEN, "scatter_order_%d", i); - abd_stats.abdstat_scatter_orders[i].data_type = - KSTAT_DATA_UINT64; - } - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - if (abd_cache) { - kmem_cache_destroy(abd_cache); - abd_cache = NULL; - } -} - -static inline void abd_verify(abd_t *abd) { +#ifdef ZFS_DEBUG ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | - ABD_FLAG_MULTI_CHUNK)); + ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | + ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, - ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; - abd_for_each_sg(abd, sg, n, i) { - ASSERT3P(sg_page(sg), !=, NULL); + ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); + } else if (abd_is_gang(abd)) { + uint_t child_sizes = 0; + for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + ASSERT(list_link_active(&cabd->abd_gang_link)); + child_sizes += cabd->abd_size; + abd_verify(cabd); } + 
ASSERT3U(abd->abd_size, ==, child_sizes); + } else { + abd_verify_scatter(abd); } +#endif } -static inline abd_t * -abd_alloc_struct(void) +static void +abd_init_struct(abd_t *abd) { - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + abd->abd_flags = 0; +#ifdef ZFS_DEBUG + zfs_refcount_create(&abd->abd_children); + abd->abd_parent = NULL; +#endif + abd->abd_size = 0; +} - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); +static void +abd_fini_struct(abd_t *abd) +{ + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_gang_link)); +#ifdef ZFS_DEBUG + zfs_refcount_destroy(&abd->abd_children); +#endif +} +abd_t * +abd_alloc_struct(size_t size) +{ + abd_t *abd = abd_alloc_struct_impl(size); + abd_init_struct(abd); + abd->abd_flags |= ABD_FLAG_ALLOCD; return (abd); } -static inline void +void abd_free_struct(abd_t *abd) { - kmem_cache_free(abd_cache, abd); - ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); + abd_fini_struct(abd); + abd_free_struct_impl(abd); } /* @@ -605,47 +181,26 @@ abd_free_struct(abd_t *abd) abd_t * abd_alloc(size_t size, boolean_t is_metadata) { - /* see the comment above zfs_abd_scatter_min_size */ - if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size) + if (abd_size_alloc_linear(size)) return (abd_alloc_linear(size, is_metadata)); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - abd_t *abd = abd_alloc_struct(); - abd->abd_flags = ABD_FLAG_OWNER; - abd_alloc_pages(abd, size); + abd_t *abd = abd_alloc_struct(size); + abd->abd_flags |= ABD_FLAG_OWNER; + abd->abd_u.abd_scatter.abd_offset = 0; + abd_alloc_chunks(abd, size); if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - abd->abd_u.abd_scatter.abd_offset = 0; - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - 
ABDSTAT_INCR(abdstat_scatter_chunk_waste, - P2ROUNDUP(size, PAGESIZE) - size); + abd_update_scatter_stats(abd, ABDSTAT_INCR); return (abd); } -static void -abd_free_scatter(abd_t *abd) -{ - abd_free_pages(abd); - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); - - abd_free_struct(abd); -} - /* * Allocate an ABD that must be linear, along with its own underlying data * buffer. Only use this when it would be very annoying to write your ABD @@ -654,26 +209,23 @@ abd_free_scatter(abd_t *abd) abd_t * abd_alloc_linear(size_t size, boolean_t is_metadata) { - abd_t *abd = abd_alloc_struct(); + abd_t *abd = abd_alloc_struct(0); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + ABD_LINEAR_BUF(abd) = zio_buf_alloc(size); } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size); } - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); + abd_update_linear_stats(abd, ABDSTAT_INCR); return (abd); } @@ -681,33 +233,91 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) static void abd_free_linear(abd_t *abd) { + if (abd_is_linear_page(abd)) { + abd_free_linear_page(abd); + return; + } if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } - 
zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + abd_update_linear_stats(abd, ABDSTAT_DECR); +} - abd_free_struct(abd); +static void +abd_free_gang(abd_t *abd) +{ + ASSERT(abd_is_gang(abd)); + abd_t *cabd; + + while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) { + /* + * We must acquire the child ABDs mutex to ensure that if it + * is being added to another gang ABD we will set the link + * as inactive when removing it from this gang ABD and before + * adding it to the other gang ABD. + */ + mutex_enter(&cabd->abd_mtx); + ASSERT(list_link_active(&cabd->abd_gang_link)); + list_remove(&ABD_GANG(abd).abd_gang_chain, cabd); + mutex_exit(&cabd->abd_mtx); + if (cabd->abd_flags & ABD_FLAG_GANG_FREE) + abd_free(cabd); + } + list_destroy(&ABD_GANG(abd).abd_gang_chain); +} + +static void +abd_free_scatter(abd_t *abd) +{ + abd_free_chunks(abd); + abd_update_scatter_stats(abd, ABDSTAT_DECR); } /* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). + * Free an ABD. Use with any kind of abd: those created with abd_alloc_*() + * and abd_get_*(), including abd_get_offset_struct(). + * + * If the ABD was created with abd_alloc_*(), the underlying data + * (scatterlist or linear buffer) will also be freed. (Subject to ownership + * changes via abd_*_ownership_of_buf().) + * + * Unless the ABD was created with abd_get_offset_struct(), the abd_t will + * also be freed. 
*/ void abd_free(abd_t *abd) { + if (abd == NULL) + return; + abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); +#ifdef ZFS_DEBUG + IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL); +#endif + + if (abd_is_gang(abd)) { + abd_free_gang(abd); + } else if (abd_is_linear(abd)) { + if (abd->abd_flags & ABD_FLAG_OWNER) + abd_free_linear(abd); + } else { + if (abd->abd_flags & ABD_FLAG_OWNER) + abd_free_scatter(abd); + } + +#ifdef ZFS_DEBUG + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } +#endif + + abd_fini_struct(abd); + if (abd->abd_flags & ABD_FLAG_ALLOCD) + abd_free_struct_impl(abd); } /* @@ -718,7 +328,8 @@ abd_t * abd_alloc_sametype(abd_t *sabd, size_t size) { boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd)) { + if (abd_is_linear(sabd) && + !abd_is_linear_page(sabd)) { return (abd_alloc_linear(size, is_metadata)); } else { return (abd_alloc(size, is_metadata)); @@ -726,110 +337,265 @@ abd_alloc_sametype(abd_t *sabd, size_t size) } /* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - * - * On Linux the optimal thing to do would be to use abd_get_offset() and - * construct a new ABD which shares the original pages thereby eliminating - * the copy. But for the moment a new linear ABD is allocated until this - * performance optimization can be implemented. 
+ * Create gang ABD that will be the head of a list of ABD's. This is used + * to "chain" scatter/gather lists together when constructing aggregated + * IO's. To free this abd, abd_free() must be called. */ abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) +abd_alloc_gang(void) { - return (abd_alloc(size, is_metadata)); + abd_t *abd = abd_alloc_struct(0); + abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER; + list_create(&ABD_GANG(abd).abd_gang_chain, + sizeof (abd_t), offsetof(abd_t, abd_gang_link)); + return (abd); } /* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. + * Add a child gang ABD to a parent gang ABDs chained list. */ -static inline abd_t * -abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +static void +abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) { - abd_t *abd; + ASSERT(abd_is_gang(pabd)); + ASSERT(abd_is_gang(cabd)); + if (free_on_free) { + /* + * If the parent is responsible for freeing the child gang + * ABD we will just splice the child's children ABD list to + * the parent's list and immediately free the child gang ABD + * struct. The parent gang ABDs children from the child gang + * will retain all the free_on_free settings after being + * added to the parents list. + */ + pabd->abd_size += cabd->abd_size; + list_move_tail(&ABD_GANG(pabd).abd_gang_chain, + &ABD_GANG(cabd).abd_gang_chain); + ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); + abd_verify(pabd); + abd_free(cabd); + } else { + for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain); + child != NULL; + child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) { + /* + * We always pass B_FALSE for free_on_free as it is the + * original child gang ABDs responsibility to determine + * if any of its child ABDs should be free'd on the call + * to abd_free(). 
+ */ + abd_gang_add(pabd, child, B_FALSE); + } + abd_verify(pabd); + } +} + +/* + * Add a child ABD to a gang ABD's chained list. + */ +void +abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) +{ + ASSERT(abd_is_gang(pabd)); + abd_t *child_abd = NULL; + + /* + * If the child being added is a gang ABD, we will add the + * child's ABDs to the parent gang ABD. This allows us to account + * for the offset correctly in the parent gang ABD. + */ + if (abd_is_gang(cabd)) { + ASSERT(!list_link_active(&cabd->abd_gang_link)); + ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); + return (abd_gang_add_gang(pabd, cabd, free_on_free)); + } + ASSERT(!abd_is_gang(cabd)); + + /* + * In order to verify that an ABD is not already part of + * another gang ABD, we must lock the child ABD's abd_mtx + * to check its abd_gang_link status. We unlock the abd_mtx + * only after it has been added to a gang ABD, which + * will update the abd_gang_link's status. See comment below + * for how an ABD can be in multiple gang ABD's simultaneously. + */ + mutex_enter(&cabd->abd_mtx); + if (list_link_active(&cabd->abd_gang_link)) { + /* + * If the child ABD is already part of another + * gang ABD then we must allocate a new + * ABD to use a separate link. We mark the newly + * allocated ABD with ABD_FLAG_GANG_FREE, before + * adding it to the gang ABD's list, to make the + * gang ABD aware that it is responsible to call + * abd_free(). We use abd_get_offset() in order + * to just allocate a new ABD but avoid copying the + * data over into the newly allocated ABD. + * + * An ABD may become part of multiple gang ABD's. For + * example, when writing ditto blocks, the same ABD + * is used to write 2 or 3 locations with 2 or 3 + * zio_t's. Each of the zio's may be aggregated with + * different adjacent zio's. zio aggregation uses gang + * zio's, so the single ABD can become part of multiple + * gang zio's.
+ * + * The ASSERT below is to make sure that if + * free_on_free is passed as B_TRUE, the ABD can + * not be in multiple gang ABD's. The gang ABD + * can not be responsible for cleaning up the child + * ABD memory allocation if the ABD can be in + * multiple gang ABD's at one time. + */ + ASSERT3B(free_on_free, ==, B_FALSE); + child_abd = abd_get_offset(cabd, 0); + child_abd->abd_flags |= ABD_FLAG_GANG_FREE; + } else { + child_abd = cabd; + if (free_on_free) + child_abd->abd_flags |= ABD_FLAG_GANG_FREE; + } + ASSERT3P(child_abd, !=, NULL); + + list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd); + mutex_exit(&cabd->abd_mtx); + pabd->abd_size += child_abd->abd_size; +} + +/* + * Locate the ABD for the supplied offset in the gang ABD. + * Return a new offset relative to the returned ABD. + */ +abd_t * +abd_gang_get_offset(abd_t *abd, size_t *off) +{ + abd_t *cabd; + + ASSERT(abd_is_gang(abd)); + ASSERT3U(*off, <, abd->abd_size); + for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + if (*off >= cabd->abd_size) + *off -= cabd->abd_size; + else + return (cabd); + } + VERIFY3P(cabd, !=, NULL); + return (cabd); +} + +/* + * Allocate a new ABD, using the provided struct (if non-NULL, and if + * circumstances allow - otherwise allocate the struct). The returned ABD will + * point to offset off of sabd. It shares the underlying buffer data with sabd. + * Use abd_free() to free. sabd must not be freed while any derived ABDs exist. + */ +static abd_t * +abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) +{ abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); + ASSERT3U(off + size, <=, sabd->abd_size); if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(); - + if (abd == NULL) + abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. 
Therefore, we don't ever use ABD_FLAG_META here. */ - abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR; - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; - } else { - int i = 0; - struct scatterlist *sg = NULL; - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - - abd = abd_alloc_struct(); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = 0; - - abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { - if (new_offset < sg->length) - break; - new_offset -= sg->length; + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; + } else if (abd_is_gang(sabd)) { + size_t left = size; + if (abd == NULL) { + abd = abd_alloc_gang(); + } else { + abd->abd_flags |= ABD_FLAG_GANG; + list_create(&ABD_GANG(abd).abd_gang_chain, + sizeof (abd_t), offsetof(abd_t, abd_gang_link)); } - ABD_SCATTER(abd).abd_sgl = sg; - ABD_SCATTER(abd).abd_offset = new_offset; - ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + abd->abd_flags &= ~ABD_FLAG_OWNER; + for (abd_t *cabd = abd_gang_get_offset(sabd, &off); + cabd != NULL && left > 0; + cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) { + int csize = MIN(left, cabd->abd_size - off); + + abd_t *nabd = abd_get_offset_size(cabd, off, csize); + abd_gang_add(abd, nabd, B_TRUE); + left -= csize; + off = 0; + } + ASSERT3U(left, ==, 0); + } else { + abd = abd_get_offset_scatter(abd, sabd, off, size); } + ASSERT3P(abd, !=, NULL); abd->abd_size = size; +#ifdef ZFS_DEBUG abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - +#endif return (abd); } +/* + * Like abd_get_offset_size(), but memory for the abd_t is provided by the + * caller. 
Using this routine can improve performance by avoiding the cost + * of allocating memory for the abd_t struct, and updating the abd stats. + * Usually, the provided abd is returned, but in some circumstances (FreeBSD, + * if sabd is scatter and size is more than 2 pages) a new abd_t may need to + * be allocated. Therefore callers should be careful to use the returned + * abd_t*. + */ +abd_t * +abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size) +{ + abd_t *result; + abd_init_struct(abd); + result = abd_get_offset_impl(abd, sabd, off, size); + if (result != abd) + abd_fini_struct(abd); + return (result); +} + abd_t * abd_get_offset(abd_t *sabd, size_t off) { size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; - VERIFY3U(size, >, 0); - - return (abd_get_offset_impl(sabd, off, size)); + return (abd_get_offset_impl(NULL, sabd, off, size)); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); - - return (abd_get_offset_impl(sabd, off, size)); + return (abd_get_offset_impl(NULL, sabd, off, size)); } /* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. + * Return a size scatter ABD containing only zeros. + */ +abd_t * +abd_get_zeros(size_t size) +{ + ASSERT3P(abd_zero_scatter, !=, NULL); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + return (abd_get_offset_size(abd_zero_scatter, 0, size)); +} + +/* + * Allocate a linear ABD structure for buf. */ abd_t * abd_get_from_buf(void *buf, size_t size) { - abd_t *abd = abd_alloc_struct(); + abd_t *abd = abd_alloc_struct(0); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); @@ -838,35 +604,14 @@ abd_get_from_buf(void *buf, size_t size) * own the underlying data buffer, which is not true in this case. * Therefore, we don't ever use ABD_FLAG_META here. 
*/ - abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR; abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - abd->abd_u.abd_linear.abd_buf = buf; + ABD_LINEAR_BUF(abd) = buf; return (abd); } -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - /* * Get the raw buffer associated with a linear ABD. */ @@ -875,7 +620,7 @@ abd_to_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); + return (ABD_LINEAR_BUF(abd)); } /* @@ -895,8 +640,9 @@ abd_borrow_buf(abd_t *abd, size_t n) } else { buf = zio_buf_alloc(n); } +#ifdef ZFS_DEBUG (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - +#endif return (buf); } @@ -927,7 +673,9 @@ abd_return_buf(abd_t *abd, void *buf, size_t n) ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } +#ifdef ZFS_DEBUG (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif } void @@ -939,6 +687,31 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n) abd_return_buf(abd, buf, n); } +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + + /* + * abd_free() needs to handle LINEAR_PAGE ABD's specially. + * Since that flag does not survive the + * abd_release_ownership_of_buf() -> abd_get_from_buf() -> + * abd_take_ownership_of_buf() sequence, we don't allow releasing + * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. 
+ */ + ASSERT(!abd_is_linear_page(abd)); + + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + abd_update_linear_stats(abd, ABDSTAT_DECR); +} + + /* * Give this ABD ownership of the buffer that it's storing. Can only be used on * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated @@ -957,180 +730,73 @@ abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) abd->abd_flags |= ABD_FLAG_META; } - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + abd_update_linear_stats(abd, ABDSTAT_INCR); } -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -#ifndef HAVE_1ARG_KMAP_ATOMIC -#define NR_KM_TYPE (6) -#ifdef _KERNEL -int km_table[NR_KM_TYPE] = { - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, -}; -#endif -#endif - -struct abd_iter { - /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ - - /* private */ - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; - size_t iter_offset; /* offset in current sg/abd_buf, */ - /* abd_offset included */ - struct scatterlist *iter_sg; /* current sg */ -#ifndef HAVE_1ARG_KMAP_ATOMIC - int iter_km; /* KM_* for kmap_atomic */ -#endif -}; - /* - * Initialize the abd_iter. + * Initializes an abd_iter based on whether the abd is a gang ABD + * or just a single ABD. 
*/ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) +static inline abd_t * +abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off) { - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; + abd_t *cabd = NULL; + + if (abd_is_gang(abd)) { + cabd = abd_gang_get_offset(abd, &off); + if (cabd) { + abd_iter_init(aiter, cabd); + abd_iter_advance(aiter, off); + } } else { - aiter->iter_offset = ABD_SCATTER(abd).abd_offset; - aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + abd_iter_init(aiter, abd); + abd_iter_advance(aiter, off); } -#ifndef HAVE_1ARG_KMAP_ATOMIC - ASSERT3U(km_type, <, NR_KM_TYPE); - aiter->iter_km = km_type; -#endif + return (cabd); } /* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. + * Advances an abd_iter. We have to be careful with gang ABD as + * advancing could mean that we are at the end of a particular ABD and + * must grab the ABD in the gang ABD's list. 
*/ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) +static inline abd_t * +abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter, + size_t len) { - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; - aiter->iter_offset += amount; - if (!abd_is_linear(aiter->iter_abd)) { - while (aiter->iter_offset >= aiter->iter_sg->length) { - aiter->iter_offset -= aiter->iter_sg->length; - aiter->iter_sg = sg_next(aiter->iter_sg); - if (aiter->iter_sg == NULL) { - ASSERT0(aiter->iter_offset); - break; - } + abd_iter_advance(aiter, len); + if (abd_is_gang(abd) && abd_iter_at_end(aiter)) { + ASSERT3P(cabd, !=, NULL); + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd); + if (cabd) { + abd_iter_init(aiter, cabd); + abd_iter_advance(aiter, 0); } } -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); - offset = aiter->iter_offset; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - offset = aiter->iter_offset; - aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - - paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), - km_table[aiter->iter_km]); - } - - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. 
This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (!abd_is_linear(aiter->iter_abd)) { - /* LINTED E_FUNC_SET_NOT_USED */ - zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, - km_table[aiter->iter_km]); - } - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; + return (cabd); } int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { - int ret = 0; struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); + boolean_t gang = abd_is_gang(abd); + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { + /* If we are at the end of the gang ABD we are done */ + if (gang && !c_abd) + break; + abd_iter_map(&aiter); size_t len = MIN(aiter.iter_mapsize, size); @@ -1144,7 +810,7 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, break; size -= len; - abd_iter_advance(&aiter, len); + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); } return (ret); @@ -1251,6 +917,11 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; + boolean_t dabd_is_gang_abd, sabd_is_gang_abd; + abd_t *c_dabd, *c_sabd; + + if (size == 0) + return (0); abd_verify(dabd); abd_verify(sabd); @@ -1258,12 +929,17 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - abd_iter_init(&daiter, dabd, 0); - abd_iter_init(&saiter, sabd, 1); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); + 
dabd_is_gang_abd = abd_is_gang(dabd); + sabd_is_gang_abd = abd_is_gang(sabd); + c_dabd = abd_init_abd_iter(dabd, &daiter, doff); + c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { + /* if we are at the end of the gang ABD we are done */ + if ((dabd_is_gang_abd && !c_dabd) || + (sabd_is_gang_abd && !c_sabd)) + break; + abd_iter_map(&daiter); abd_iter_map(&saiter); @@ -1282,8 +958,10 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, break; size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); + c_dabd = + abd_advance_abd_iter(dabd, c_dabd, &daiter, len); + c_sabd = + abd_advance_abd_iter(sabd, c_sabd, &saiter, len); } return (ret); @@ -1343,35 +1021,55 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, struct abd_iter caiters[3]; struct abd_iter daiter = {0}; void *caddrs[3]; - unsigned long flags; + unsigned long flags __maybe_unused = 0; + abd_t *c_cabds[3]; + abd_t *c_dabd = NULL; + boolean_t cabds_is_gang_abd[3]; + boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) - abd_iter_init(&caiters[i], cabds[i], i); + for (i = 0; i < parity; i++) { + cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); + c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); + } - if (dabd) - abd_iter_init(&daiter, dabd, i); + if (dabd) { + dabd_is_gang_abd = abd_is_gang(dabd); + c_dabd = abd_init_abd_iter(dabd, &daiter, 0); + } ASSERT3S(dsize, >=, 0); - local_irq_save(flags); + abd_enter_critical(flags); while (csize > 0) { + /* if we are at the end of the gang ABD we are done */ + if (dabd_is_gang_abd && !c_dabd) + break; + + for (i = 0; i < parity; i++) { + /* + * If we are at the end of the gang ABD we are + * done. 
+ */ + if (cabds_is_gang_abd[i] && !c_cabds[i]) + break; + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + len = csize; if (dabd && dsize > 0) abd_iter_map(&daiter); - for (i = 0; i < parity; i++) { - abd_iter_map(&caiters[i]); - caddrs[i] = caiters[i].iter_mapaddr; - } - switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); + fallthrough; case 2: len = MIN(caiters[1].iter_mapsize, len); + fallthrough; case 1: len = MIN(caiters[0].iter_mapsize, len); } @@ -1398,12 +1096,16 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); - abd_iter_advance(&caiters[i], len); + c_cabds[i] = + abd_advance_abd_iter(cabds[i], c_cabds[i], + &caiters[i], len); } if (dabd && dsize > 0) { abd_iter_unmap(&daiter); - abd_iter_advance(&daiter, dlen); + c_dabd = + abd_advance_abd_iter(dabd, c_dabd, &daiter, + dlen); dsize -= dlen; } @@ -1412,7 +1114,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ASSERT3S(dsize, >=, 0); ASSERT3S(csize, >=, 0); } - local_irq_restore(flags); + abd_exit_critical(flags); } /* @@ -1437,19 +1139,35 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; - unsigned long flags; + unsigned long flags __maybe_unused = 0; + boolean_t cabds_is_gang_abd[3]; + boolean_t tabds_is_gang_abd[3]; + abd_t *c_cabds[3]; + abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - abd_iter_init(&citers[i], cabds[i], 2*i); - abd_iter_init(&xiters[i], tabds[i], 2*i+1); + cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); + tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + c_cabds[i] = + abd_init_abd_iter(cabds[i], &citers[i], 0); + c_tabds[i] = + abd_init_abd_iter(tabds[i], &xiters[i], 0); } - local_irq_save(flags); + abd_enter_critical(flags); while (tsize > 0) { for (i = 0; i < parity; i++) { + /* + * If we are at the end of the gang ABD we + * are done. 
+ */ + if (cabds_is_gang_abd[i] && !c_cabds[i]) + break; + if (tabds_is_gang_abd[i] && !c_tabds[i]) + break; abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; @@ -1461,9 +1179,11 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, case 3: len = MIN(xiters[2].iter_mapsize, len); len = MIN(citers[2].iter_mapsize, len); + fallthrough; case 2: len = MIN(xiters[1].iter_mapsize, len); len = MIN(citers[1].iter_mapsize, len); + fallthrough; case 1: len = MIN(xiters[0].iter_mapsize, len); len = MIN(citers[0].iter_mapsize, len); @@ -1481,87 +1201,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&xiters[i]); abd_iter_unmap(&citers[i]); - abd_iter_advance(&xiters[i], len); - abd_iter_advance(&citers[i], len); + c_tabds[i] = + abd_advance_abd_iter(tabds[i], c_tabds[i], + &xiters[i], len); + c_cabds[i] = + abd_advance_abd_iter(cabds[i], c_cabds[i], + &citers[i], len); } tsize -= len; ASSERT3S(tsize, >=, 0); } - local_irq_restore(flags); + abd_exit_critical(flags); } - -#if defined(_KERNEL) -/* - * bio_nr_pages for ABD. - * @off is the offset in @abd - */ -unsigned long -abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) -{ - unsigned long pos; - - if (abd_is_linear(abd)) - pos = (unsigned long)abd_to_buf(abd) + off; - else - pos = abd->abd_u.abd_scatter.abd_offset + off; - - return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT); -} - -/* - * bio_map for scatter ABD. 
- * @off is the offset in @abd - * Remaining IO size is returned - */ -unsigned int -abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, - unsigned int io_size, size_t off) -{ - int i; - struct abd_iter aiter; - - ASSERT(!abd_is_linear(abd)); - ASSERT3U(io_size, <=, abd->abd_size - off); - - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); - - for (i = 0; i < bio->bi_max_vecs; i++) { - struct page *pg; - size_t len, sgoff, pgoff; - struct scatterlist *sg; - - if (io_size <= 0) - break; - - sg = aiter.iter_sg; - sgoff = aiter.iter_offset; - pgoff = sgoff & (PAGESIZE - 1); - len = MIN(io_size, PAGESIZE - pgoff); - ASSERT(len > 0); - - pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); - if (bio_add_page(bio, pg, len, pgoff) != len) - break; - - io_size -= len; - abd_iter_advance(&aiter, len); - } - - return (io_size); -} - -/* Tunable Parameters */ -module_param(zfs_abd_scatter_enabled, int, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_enabled, - "Toggle whether ABD allocations must be linear."); -module_param(zfs_abd_scatter_min_size, int, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_min_size, - "Minimum size of scatter allocations."); -/* CSTYLED */ -module_param(zfs_abd_scatter_max_order, uint, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_max_order, - "Maximum order allocation used for a scatter ABD."); -#endif diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c index ace3a83a5d..c4ea4f86fc 100644 --- a/module/zfs/aggsum.c +++ b/module/zfs/aggsum.c @@ -70,14 +70,19 @@ * zeroing out the borrowed value (forcing that thread to borrow on its next * request, which will also be expensive). This is what makes aggsums well * suited for write-many read-rarely operations. + * + * Note that the aggsums do not expand if more CPUs are hot-added. In that + * case, we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. 
*/ /* - * We will borrow aggsum_borrow_multiplier times the current request, so we will - * have to get the as_lock approximately every aggsum_borrow_multiplier calls to - * aggsum_delta(). + * We will borrow 2^aggsum_borrow_shift times the current request, so we will + * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to + * aggsum_add(). */ -static uint_t aggsum_borrow_multiplier = 10; +static uint_t aggsum_borrow_shift = 4; void aggsum_init(aggsum_t *as, uint64_t value) @@ -85,9 +90,14 @@ aggsum_init(aggsum_t *as, uint64_t value) bzero(as, sizeof (*as)); as->as_lower_bound = as->as_upper_bound = value; mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); - as->as_numbuckets = boot_ncpus; - as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), - KM_SLEEP); + /* + * Too many buckets may hurt read performance without improving + * write. From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc. + */ + as->as_bucketshift = highbit64(boot_ncpus / 6) / 2; + as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1; + as->as_buckets = kmem_zalloc(as->as_numbuckets * + sizeof (aggsum_bucket_t), KM_SLEEP); for (int i = 0; i < as->as_numbuckets; i++) { mutex_init(&as->as_buckets[i].asc_lock, NULL, MUTEX_DEFAULT, NULL); @@ -106,100 +116,91 @@ aggsum_fini(aggsum_t *as) int64_t aggsum_lower_bound(aggsum_t *as) { - return (as->as_lower_bound); + return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound)); } -int64_t +uint64_t aggsum_upper_bound(aggsum_t *as) { - return (as->as_upper_bound); -} - -static void -aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) -{ - ASSERT(MUTEX_HELD(&as->as_lock)); - ASSERT(MUTEX_HELD(&asb->asc_lock)); - - /* - * We use atomic instructions for this because we read the upper and - * lower bounds without the lock, so we need stores to be atomic. 
- */ - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, asb->asc_delta); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, asb->asc_delta); - asb->asc_delta = 0; - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - -asb->asc_borrowed); - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - asb->asc_borrowed); - asb->asc_borrowed = 0; + return (atomic_load_64(&as->as_upper_bound)); } uint64_t aggsum_value(aggsum_t *as) { - int64_t rv; + int64_t lb; + uint64_t ub; mutex_enter(&as->as_lock); - if (as->as_lower_bound == as->as_upper_bound) { - rv = as->as_lower_bound; + lb = as->as_lower_bound; + ub = as->as_upper_bound; + if (lb == ub) { for (int i = 0; i < as->as_numbuckets; i++) { ASSERT0(as->as_buckets[i].asc_delta); ASSERT0(as->as_buckets[i].asc_borrowed); } mutex_exit(&as->as_lock); - return (rv); + return (lb); } for (int i = 0; i < as->as_numbuckets; i++) { struct aggsum_bucket *asb = &as->as_buckets[i]; + if (asb->asc_borrowed == 0) + continue; mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); + lb += asb->asc_delta + asb->asc_borrowed; + ub += asb->asc_delta - asb->asc_borrowed; + asb->asc_delta = 0; + asb->asc_borrowed = 0; mutex_exit(&asb->asc_lock); } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - rv = as->as_lower_bound; + ASSERT3U(lb, ==, ub); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb); + atomic_store_64(&as->as_upper_bound, lb); mutex_exit(&as->as_lock); - return (rv); -} - -static void -aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb) -{ - int64_t abs_delta = (delta < 0 ? 
-delta : delta); - mutex_enter(&as->as_lock); - mutex_enter(&asb->asc_lock); - - aggsum_flush_bucket(as, asb); - - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, abs_delta); - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, -abs_delta); - asb->asc_borrowed = abs_delta; - - mutex_exit(&asb->asc_lock); - mutex_exit(&as->as_lock); + return (lb); } void aggsum_add(aggsum_t *as, int64_t delta) { struct aggsum_bucket *asb; + int64_t borrow; - kpreempt_disable(); - asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets]; - kpreempt_enable(); + asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) % + as->as_numbuckets]; - for (;;) { - mutex_enter(&asb->asc_lock); - if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && - asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { - asb->asc_delta += delta; - mutex_exit(&asb->asc_lock); - return; - } + /* Try fast path if we already borrowed enough before. */ + mutex_enter(&asb->asc_lock); + if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && + asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { + asb->asc_delta += delta; mutex_exit(&asb->asc_lock); - aggsum_borrow(as, delta * aggsum_borrow_multiplier, asb); + return; } + mutex_exit(&asb->asc_lock); + + /* + * We haven't borrowed enough. Take the global lock and borrow + * considering what is requested now and what we borrowed before. + */ + borrow = (delta < 0 ? 
-delta : delta); + borrow <<= aggsum_borrow_shift + as->as_bucketshift; + mutex_enter(&as->as_lock); + if (borrow >= asb->asc_borrowed) + borrow -= asb->asc_borrowed; + else + borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; + mutex_enter(&asb->asc_lock); + delta += asb->asc_delta; + asb->asc_delta = 0; + asb->asc_borrowed += borrow; + mutex_exit(&asb->asc_lock); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, + as->as_lower_bound + delta - borrow); + atomic_store_64(&as->as_upper_bound, + as->as_upper_bound + delta + borrow); + mutex_exit(&as->as_lock); } /* @@ -210,27 +211,35 @@ aggsum_add(aggsum_t *as, int64_t delta) int aggsum_compare(aggsum_t *as, uint64_t target) { - if (as->as_upper_bound < target) + int64_t lb; + uint64_t ub; + int i; + + if (atomic_load_64(&as->as_upper_bound) < target) return (-1); - if (as->as_lower_bound > target) + lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound); + if (lb > 0 && (uint64_t)lb > target) return (1); mutex_enter(&as->as_lock); - for (int i = 0; i < as->as_numbuckets; i++) { + lb = as->as_lower_bound; + ub = as->as_upper_bound; + for (i = 0; i < as->as_numbuckets; i++) { struct aggsum_bucket *asb = &as->as_buckets[i]; + if (asb->asc_borrowed == 0) + continue; mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); + lb += asb->asc_delta + asb->asc_borrowed; + ub += asb->asc_delta - asb->asc_borrowed; + asb->asc_delta = 0; + asb->asc_borrowed = 0; mutex_exit(&asb->asc_lock); - if (as->as_upper_bound < target) { - mutex_exit(&as->as_lock); - return (-1); - } - if (as->as_lower_bound > target) { - mutex_exit(&as->as_lock); - return (1); - } + if (ub < target || (lb > 0 && (uint64_t)lb > target)) + break; } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - ASSERT3U(as->as_lower_bound, ==, target); + if (i >= as->as_numbuckets) + ASSERT3U(lb, ==, ub); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb); + atomic_store_64(&as->as_upper_bound, ub); mutex_exit(&as->as_lock); - 
return (0); + return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9b500352a4..f0330150f9 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,9 +21,17 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2020, Delphix. All rights reserved. + * Copyright (c) 2014, Saso Kiselkov. All rights reserved. + * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2019, loli10K . All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. */ /* @@ -62,7 +70,7 @@ * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we - * have variable sized cache blocks (rangeing from 512 bytes to + * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. @@ -262,7 +270,7 @@ * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not - * possible to decrypt encrypted data (or visa versa) if the keys aren't loaded. + * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. 
* The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a @@ -282,29 +290,26 @@ #include #include #include -#include +#include #include #include #include -#include #include #include #include #include -#ifdef _KERNEL -#include -#include -#include -#include -#endif #include #include #include #include #include -#include +#include #include -#include +#include +#include +#include +#include +#include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ @@ -316,17 +321,38 @@ boolean_t arc_watch = B_FALSE; * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ -static zthr_t *arc_reap_zthr; +static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling - * arc_adjust(), which improves arc_is_overflowing(). + * arc_evict(), which improves arc_is_overflowing(). */ -static zthr_t *arc_adjust_zthr; +static zthr_t *arc_evict_zthr; -static kmutex_t arc_adjust_lock; -static kcondvar_t arc_adjust_waiters_cv; -static boolean_t arc_adjust_needed = B_FALSE; +static kmutex_t arc_evict_lock; +static boolean_t arc_evict_needed = B_FALSE; + +/* + * Count of bytes evicted since boot. + */ +static uint64_t arc_evict_count; + +/* + * List of arc_evict_waiter_t's, representing threads waiting for the + * arc_evict_count to reach specific values. + */ +static list_t arc_evict_waiters; + +/* + * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of + * the requested amount of data to be evicted. For example, by default for + * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. + * Since this is above 100%, it ensures that progress is made towards getting + * arc_size under arc_c. 
Since this is finite, it ensures that allocations + * can still happen, even during the potentially long time that arc_size is + * more than arc_c. + */ +int zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before @@ -338,7 +364,7 @@ static boolean_t arc_adjust_needed = B_FALSE; int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ -static int arc_grow_retry = 5; +int arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). @@ -352,11 +378,11 @@ int zfs_arc_overflow_shift = 8; int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ -static int arc_shrink_shift = 7; +int arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL -static uint_t zfs_arc_pc_percent = 0; +uint_t zfs_arc_pc_percent = 0; #endif /* @@ -383,20 +409,10 @@ static int arc_min_prescient_prefetch_ms; */ int arc_lotsfree_percent = 10; -/* - * hdr_recl() uses this to determine if the arc is up and running. - */ -static boolean_t arc_initialized; - /* * The arc has filled available memory and has now warmed up. */ -static boolean_t arc_warm; - -/* - * log2 fraction of the zio arena to keep free. - */ -int arc_zio_arena_free_shift = 2; +boolean_t arc_warm; /* * These tunables are for performance analysis. 
@@ -448,289 +464,14 @@ int zfs_arc_meta_adjust_restarts = 4096; int zfs_arc_lotsfree_percent = 10; /* The 6 states: */ -static arc_state_t ARC_anon; -static arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -static arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; -static arc_state_t ARC_l2c_only; +arc_state_t ARC_anon; +arc_state_t ARC_mru; +arc_state_t ARC_mru_ghost; +arc_state_t ARC_mfu; +arc_state_t ARC_mfu_ghost; +arc_state_t ARC_l2c_only; -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_deleted; - /* - * Number of buffers that could not be evicted because the hash lock - * was held by another thread. The lock may not necessarily be held - * by something using the same buffer, since hash locks are shared - * by multiple buffers. - */ - kstat_named_t arcstat_mutex_miss; - /* - * Number of buffers skipped when updating the access state due to the - * header having already been released after acquiring the hash lock. - */ - kstat_named_t arcstat_access_skip; - /* - * Number of buffers skipped because they have I/O in progress, are - * indirect prefetch buffers that have not lived long enough, or are - * not from the spa we're trying to evict from. - */ - kstat_named_t arcstat_evict_skip; - /* - * Number of times arc_evict_state() was unable to evict enough - * buffers to reach its target amount. 
- */ - kstat_named_t arcstat_evict_not_enough; - kstat_named_t arcstat_evict_l2_cached; - kstat_named_t arcstat_evict_l2_eligible; - kstat_named_t arcstat_evict_l2_ineligible; - kstat_named_t arcstat_evict_l2_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_size; - /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. - * Note that the compressed bytes may match the uncompressed bytes - * if the block is either not compressed or compressed arc is disabled. - */ - kstat_named_t arcstat_compressed_size; - /* - * Uncompressed size of the data stored in b_pabd. If compressed - * arc is disabled then this value will be identical to the stat - * above. - */ - kstat_named_t arcstat_uncompressed_size; - /* - * Number of bytes stored in all the arc_buf_t's. This is classified - * as "overhead" since this data is typically short-lived and will - * be evicted from the arc when it becomes unreferenced unless the - * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level - * values have been set (see comment in dbuf.c for more information). - */ - kstat_named_t arcstat_overhead_size; - /* - * Number of bytes consumed by internal ARC structures necessary - * for tracking purposes; these structures are not actually - * backed by ARC buffers. This includes arc_buf_hdr_t structures - * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only - * caches), and arc_buf_t structures (allocated via arc_buf_t - * cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_hdr_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_DATA. 
This is generally consumed by buffers backing - * on disk user data (e.g. plain file contents). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_data_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_METADATA. This is generally consumed by buffers - * backing on disk data that is used for internal ZFS - * structures (e.g. ZAP, dnode, indirect blocks, etc). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_metadata_size; - /* - * Number of bytes consumed by dmu_buf_impl_t objects. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_dbuf_size; - /* - * Number of bytes consumed by dnode_t objects. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_dnode_size; - /* - * Number of bytes consumed by bonus buffers. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_bonus_size; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_anon state. This includes *all* buffers in the arc_anon - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_anon_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mru state. This includes *all* buffers in the arc_mru - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mru_ghost state. The key thing to note - * here, is the fact that this size doesn't actually indicate - * RAM consumption. The ghost lists only consist of headers and - * don't actually have ARC buffers linked off of these headers. - * Thus, *if* the headers had associated ARC buffers, these - * buffers *would have* consumed this number of bytes. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_mru_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mfu state. This includes *all* buffers in the arc_mfu - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_size; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu - * state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_data; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_METADATA, and reside in the - * arc_mfu state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mfu_ghost state. See the comment above - * arcstat_mru_ghost_size for more details. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
- * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_metadata; - kstat_named_t arcstat_l2_hits; - kstat_named_t arcstat_l2_misses; - kstat_named_t arcstat_l2_feeds; - kstat_named_t arcstat_l2_rw_clash; - kstat_named_t arcstat_l2_read_bytes; - kstat_named_t arcstat_l2_write_bytes; - kstat_named_t arcstat_l2_writes_sent; - kstat_named_t arcstat_l2_writes_done; - kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_evict_lock_retry; - kstat_named_t arcstat_l2_evict_reading; - kstat_named_t arcstat_l2_evict_l1cached; - kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_abort_lowmem; - kstat_named_t arcstat_l2_cksum_bad; - kstat_named_t arcstat_l2_io_error; - kstat_named_t arcstat_l2_lsize; - kstat_named_t arcstat_l2_psize; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_memory_direct_count; - kstat_named_t arcstat_memory_indirect_count; - kstat_named_t arcstat_memory_all_bytes; - kstat_named_t arcstat_memory_free_bytes; - kstat_named_t arcstat_memory_available_bytes; - kstat_named_t arcstat_no_grow; - kstat_named_t arcstat_tempreserve; - kstat_named_t arcstat_loaned_bytes; - kstat_named_t arcstat_prune; - /* Not updated directly; only synced in arc_kstat_update. 
*/ - kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; - kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_async_upgrade_sync; - kstat_named_t arcstat_demand_hit_predictive_prefetch; - kstat_named_t arcstat_demand_hit_prescient_prefetch; - kstat_named_t arcstat_need_free; - kstat_named_t arcstat_sys_free; - kstat_named_t arcstat_raw_size; -} arc_stats_t; - -static arc_stats_t arc_stats = { +arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, @@ -752,6 +493,8 @@ static arc_stats_t arc_stats = { { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, @@ -773,6 +516,9 @@ static arc_stats_t arc_stats = { { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, +#if defined(COMPAT_FREEBSD11) + { "other_size", KSTAT_DATA_UINT64 }, +#endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, @@ -790,6 +536,11 @@ static arc_stats_t arc_stats = { { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, + { "l2_mru_asize", KSTAT_DATA_UINT64 }, + { "l2_mfu_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, @@ -808,6 +559,22 @@ static arc_stats_t arc_stats = { { "l2_size", 
KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_success", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, + { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, @@ -828,16 +595,12 @@ static arc_stats_t arc_stats = { { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, - { "arc_raw_size", KSTAT_DATA_UINT64 } + { "arc_raw_size", KSTAT_DATA_UINT64 }, + { "cached_only_in_progress", KSTAT_DATA_UINT64 }, + { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) +arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ @@ -846,9 +609,6 @@ static arc_stats_t arc_stats = { continue; \ } -#define ARCSTAT_MAXSTAT(stat) \ - ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) - /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for 
@@ -869,13 +629,24 @@ static arc_stats_t arc_stats = { } \ } +/* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first factor it and the update value by + * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain). + */ +#define ARCSTAT_F_AVG_FACTOR 3 +#define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + } while (0) + kstat_t *arc_ksp; -static arc_state_t *arc_anon; -static arc_state_t *arc_mru; -static arc_state_t *arc_mru_ghost; -static arc_state_t *arc_mfu; -static arc_state_t *arc_mfu_ghost; -static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- @@ -885,51 +656,18 @@ static arc_state_t *arc_l2c_only; * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. 
*/ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ +/* max size for dnodes */ +#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ -#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ -#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ +#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ -/* size of all b_rabd's in entire arc */ -#define arc_raw_size ARCSTAT(arcstat_raw_size) -/* compressed size of entire arc */ -#define arc_compressed_size ARCSTAT(arcstat_compressed_size) -/* uncompressed size of entire arc */ -#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) -/* number of bytes in the arc from arc_buf_t's */ -#define arc_overhead_size ARCSTAT(arcstat_overhead_size) - -/* - * There are also some ARC variables that we want to export, but that are - * updated so often that having the canonical representation be the statistic - * variable causes a performance bottleneck. We want to use aggsum_t's for these - * instead, but still be able to export the kstat in the same way as before. - * The solution is to always use the aggsum version, except in the kstat update - * callback. 
- */ -aggsum_t arc_size; -aggsum_t arc_meta_used; -aggsum_t astat_data_size; -aggsum_t astat_metadata_size; -aggsum_t astat_dbuf_size; -aggsum_t astat_dnode_size; -aggsum_t astat_bonus_size; -aggsum_t astat_hdr_size; -aggsum_t astat_l2_hdr_size; - -static hrtime_t arc_growtime; -static list_t arc_prune_list; -static kmutex_t arc_prune_mtx; -static taskq_t *arc_prune_taskq; +hrtime_t arc_growtime; +list_t arc_prune_list; +kmutex_t arc_prune_mtx; +taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -994,29 +732,18 @@ static taskq_t *arc_prune_taskq; * Hash table routines */ -#define HT_LOCK_ALIGN 64 -#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) - -struct ht_lock { - kmutex_t ht_lock; -#ifdef _KERNEL - unsigned char pad[HT_LOCK_PAD]; -#endif -}; - -#define BUF_LOCKS 8192 +#define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS]; + kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) -#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) @@ -1043,9 +770,6 @@ uint64_t zfs_crc64_table[256]; */ #define L2ARC_FEED_TYPES 4 -#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) -#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) - /* L2ARC Performance Tunables */ unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ @@ -1056,6 +780,7 @@ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min 
interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ +int l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals @@ -1093,24 +818,36 @@ typedef enum arc_fill_flags { ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; +typedef enum arc_ovf_level { + ARC_OVF_NONE, /* ARC within target size. */ + ARC_OVF_SOME, /* ARC is slightly overflowed. */ + ARC_OVF_SEVERE /* ARC is severely overflowed. */ +} arc_ovf_level_t; + static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); +static kmutex_t l2arc_rebuild_thr_lock; +static kcondvar_t l2arc_rebuild_thr_cv; + +enum arc_hdr_alloc_flags { + ARC_HDR_ALLOC_RDATA = 0x1, + ARC_HDR_DO_ADAPT = 0x2, + ARC_HDR_USE_RESERVE = 0x4, +}; + + +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); -static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t); +static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_access(arc_buf_hdr_t *, kmutex_t *); -static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); -static void arc_tuning_update(void); -static void arc_prune_async(int64_t); -static uint64_t arc_all_memory(void); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t 
arc_bufc_to_flags(arc_buf_contents_t); @@ -1119,7 +856,93 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); +static void l2arc_do_free_on_write(void); +static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only); +#define l2arc_hdr_arcstats_increment(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) +#define l2arc_hdr_arcstats_decrement(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) +#define l2arc_hdr_arcstats_increment_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) +#define l2arc_hdr_arcstats_decrement_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) + +/* + * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU + * metadata and data are cached from ARC into L2ARC. + */ +int l2arc_mfuonly = 0; + +/* + * L2ARC TRIM + * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of + * the current write size (l2arc_write_max) we should TRIM if we + * have filled the device. It is defined as a percentage of the + * write size. If set to 100 we trim twice the space required to + * accommodate upcoming writes. A minimum of 64MB will be trimmed. + * It also enables TRIM of the whole L2ARC device upon creation or + * addition to an existing pool or if the header of the device is + * invalid upon importing a pool or onlining a cache device. The + * default is 0, which disables TRIM on L2ARC altogether as it can + * put significant stress on the underlying storage devices. This + * will vary depending of how well the specific device handles + * these commands. 
+ */ +unsigned long l2arc_trim_ahead = 0; + +/* + * Performance tuning of L2ARC persistence: + * + * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding + * an L2ARC device (either at pool import or later) will attempt + * to rebuild L2ARC buffer contents. + * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls + * whether log blocks are written to the L2ARC device. If the L2ARC + * device is less than 1GB, the amount of data l2arc_evict() + * evicts is significant compared to the amount of restored L2ARC + * data. In this case do not write log blocks in L2ARC in order + * not to waste space. + */ +int l2arc_rebuild_enabled = B_TRUE; +unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; + +/* L2ARC persistence rebuild control routines. */ +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +static void l2arc_dev_rebuild_thread(void *arg); +static int l2arc_rebuild(l2arc_dev_t *dev); + +/* L2ARC persistence read I/O routines. */ +static int l2arc_dev_hdr_read(l2arc_dev_t *dev); +static int l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io); +static zio_t *l2arc_log_blk_fetch(vdev_t *vd, + const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); +static void l2arc_log_blk_fetch_abort(zio_t *zio); + +/* L2ARC persistence block restoration routines. */ +static void l2arc_log_blk_restore(l2arc_dev_t *dev, + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); +static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, + l2arc_dev_t *dev); + +/* L2ARC persistence write I/O routines. */ +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, + l2arc_write_callback_t *cb); + +/* L2ARC persistence auxiliary routines. 
*/ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); +static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, + const arc_buf_hdr_t *ab); +boolean_t l2arc_range_check_overlap(uint64_t bottom, + uint64_t top, uint64_t check); +static void l2arc_blk_fetch_done(zio_t *zio); +static inline uint64_t + l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without @@ -1217,9 +1040,9 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) ARCSTAT_MAX(arcstat_hash_chain_max, i); } - - ARCSTAT_BUMP(arcstat_hash_elements); - ARCSTAT_MAXSTAT(arcstat_hash_elements); + uint64_t he = atomic_inc_64_nv( + &arc_stats.arcstat_hash_elements.value.ui64); + ARCSTAT_MAX(arcstat_hash_elements_max, he); return (NULL); } @@ -1243,7 +1066,7 @@ buf_hash_remove(arc_buf_hdr_t *hdr) arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ - ARCSTAT_BUMPDOWN(arcstat_hash_elements); + atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) @@ -1276,7 +1099,7 @@ buf_fini(void) (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (i = 0; i < BUF_LOCKS; i++) - mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); @@ -1376,7 +1199,7 @@ hdr_full_crypt_dest(void *vbuf, void *unused) static void hdr_l2only_dest(void *vbuf, void *unused) { - ASSERTV(arc_buf_hdr_t *hdr = vbuf); + arc_buf_hdr_t *hdr __maybe_unused = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); @@ -1392,22 +1215,6 @@ buf_dest(void *vbuf, void *unused) arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } -/* - * Reclaim callback -- invoked when memory is low. 
- */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - dprintf("hdr_recl called\n"); - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (arc_initialized) - zthr_wakeup(arc_reap_zthr); -} - static void buf_init(void) { @@ -1443,12 +1250,12 @@ retry: } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, - 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, - hdr_recl, NULL, NULL, 0); + NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", - HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -1457,10 +1264,8 @@ retry: for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); - for (i = 0; i < BUF_LOCKS; i++) { - mutex_init(&buf_hash_table.ht_locks[i].ht_lock, - NULL, MUTEX_DEFAULT, NULL); - } + for (i = 0; i < BUF_LOCKS; i++) + mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ @@ -1542,6 +1347,12 @@ arc_hdr_get_compress(arc_buf_hdr_t *hdr) HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } +uint8_t +arc_get_complevel(arc_buf_t *buf) +{ + return (buf->b_hdr->b_complevel); +} + static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { @@ -1872,11 +1683,51 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) * There were no decompressed bufs, so there should not be a * checksum on the hdr either. 
*/ - EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); + if (zfs_flags & ZFS_DEBUG_MODIFY) + EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } +/* + * Allocates an ARC buf header that's in an evicted & L2-cached state. + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc. + */ +static arc_buf_hdr_t * +arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, + dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, + enum zio_compress compress, uint8_t complevel, boolean_t protected, + boolean_t prefetch, arc_state_type_t arcs_state) +{ + arc_buf_hdr_t *hdr; + + ASSERT(size != 0); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); + hdr->b_birth = birth; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); + HDR_SET_LSIZE(hdr, size); + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + hdr->b_complevel = complevel; + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); + if (prefetch) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); + + hdr->b_dva = dva; + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = daddr; + hdr->b_l2hdr.b_arcs_state = arcs_state; + + return (hdr); +} + /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
*/ @@ -1922,9 +1773,8 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) tmpbuf = zio_buf_alloc(lsize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); - csize = zio_compress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmpbuf, lsize); + hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); ASSERT3U(csize, <=, psize); abd_zero_off(abd, csize, psize - csize); } @@ -1978,7 +1828,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); - arc_hdr_alloc_abd(hdr, B_FALSE); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, @@ -2005,12 +1855,13 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + ARC_HDR_DO_ADAPT); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); + HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; @@ -2151,7 +2002,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, } /* - * Adjust encrypted and authenticated headers to accomodate + * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. 
@@ -2220,7 +2071,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); - /* We need to give the buf it's own b_data */ + /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); @@ -2253,12 +2104,12 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), + &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should @@ -2317,8 +2168,8 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, */ ret = SET_ERROR(EIO); spa_log_error(spa, zb); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, zb, NULL, 0); } return (ret); @@ -2346,7 +2197,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) return; } - ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); @@ -2387,7 +2237,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) return; } - ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); @@ -2430,12 +2279,16 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) (state != arc_anon)) { /* We don't use the L2-only state list. 
*/ if (state != arc_l2c_only) { - multilist_remove(state->arcs_list[arc_buf_type(hdr)], + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } } @@ -2460,7 +2313,7 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } @@ -2563,7 +2416,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_remove(old_state->arcs_list[buftype], hdr); + multilist_remove(&old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); @@ -2580,7 +2433,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(new_state->arcs_list[buftype], hdr); + multilist_insert(&new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); @@ -2718,15 +2571,15 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } } - if (HDR_HAS_L1HDR(hdr)) + if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; - /* - * L2 headers should never be on the L2 state list since they don't - * have L1 headers allocated. 
- */ - ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); + if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { + l2arc_hdr_arcstats_decrement_state(hdr); + hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; + l2arc_hdr_arcstats_increment_state(hdr); + } + } } void @@ -2738,32 +2591,41 @@ arc_space_consume(uint64_t space, arc_space_type_t type) default: break; case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, space); + ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, space); + ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, space); + ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, space); + aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, space); + ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: - aggsum_add(&astat_hdr_size, space); + ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, space); + aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); + break; + case ARC_SPACE_ABD_CHUNK_WASTE: + /* + * Note: this includes space wasted by all scatter ABD's, not + * just those allocated by the ARC. But the vast majority of + * scatter ABD's come from the ARC, because other users are + * very short-lived. 
+ */ + ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } - if (type != ARC_SPACE_DATA) - aggsum_add(&arc_meta_used, space); + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) + aggsum_add(&arc_sums.arcstat_meta_used, space); - aggsum_add(&arc_size, space); + aggsum_add(&arc_sums.arcstat_size, space); } void @@ -2775,42 +2637,41 @@ arc_space_return(uint64_t space, arc_space_type_t type) default: break; case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, -space); + ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, -space); + ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, -space); + ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, -space); + aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, -space); + ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: - aggsum_add(&astat_hdr_size, -space); + ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, -space); + aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); + break; + case ARC_SPACE_ABD_CHUNK_WASTE: + ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } - if (type != ARC_SPACE_DATA) { - ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); - /* - * We use the upper bound here rather than the precise value - * because the arc_meta_max value doesn't need to be - * precise. It's only consumed by humans via arcstats. 
- */ - if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) - arc_meta_max = aggsum_upper_bound(&arc_meta_used); - aggsum_add(&arc_meta_used, -space); + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { + ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, + space) >= 0); + ARCSTAT_MAX(arcstat_meta_max, + aggsum_upper_bound(&arc_sums.arcstat_meta_used)); + aggsum_add(&arc_sums.arcstat_meta_used, -space); } - ASSERT(aggsum_compare(&arc_size, space) >= 0); - aggsum_add(&arc_size, -space); + ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); + aggsum_add(&arc_sums.arcstat_size, -space); } /* @@ -2837,7 +2698,7 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that - * is sharable, but wasn't at the time of its allocation. Rather than + * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. @@ -2874,12 +2735,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); - hdr->b_l1hdr.b_mru_hits = 0; - hdr->b_l1hdr.b_mru_ghost_hits = 0; - hdr->b_l1hdr.b_mfu_hits = 0; - hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_l2_hits = 0; - buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -2896,7 +2751,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, /* * Only honor requests for compressed bufs if the hdr is actually - * compressed. This must be overriden if the buffer is encrypted since + * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. 
*/ if (encrypted) { @@ -2917,7 +2772,8 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is - * allocate a new buffer to store the buf's data. + * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new + * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be @@ -2925,10 +2781,17 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't - * need to be ABD-aware. + * need to be ABD-aware. It must be allocated via + * zio_[data_]buf_alloc(), not as a page, because we need to be able + * to abd_release_ownership_of_buf(), which isn't allowed on "linear + * page" buffers because the ABD code needs to handle freeing them + * specially. 
*/ - boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && - hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd); + boolean_t can_share = arc_can_share(hdr, buf) && + !HDR_L2_WRITING(hdr) && + hdr->b_l1hdr.b_pabd != NULL && + abd_is_linear(hdr->b_l1hdr.b_pabd) && + !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { @@ -2989,10 +2852,10 @@ arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, - psize, lsize, compression_type); + psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); @@ -3003,10 +2866,11 @@ arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, - byteorder, salt, iv, mac, ot, psize, lsize, compression_type); + byteorder, salt, iv, mac, ot, psize, lsize, compression_type, + complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); @@ -3137,7 +3001,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); - abd_put(hdr->b_l1hdr.b_pabd); + abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; @@ -3192,7 +3056,7 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) } /* - * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's + * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. 
*/ static void @@ -3303,9 +3167,10 @@ arc_buf_destroy_impl(arc_buf_t *buf) } static void -arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata) +arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; + boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3315,13 +3180,15 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata) if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); - hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr); + hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, + alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, + alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } @@ -3367,10 +3234,32 @@ arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } +/* + * Allocate empty anonymous ARC header. The header will get its identity + * assigned and buffers attached later as part of read or write operations. + * + * In case of read arc_read() assigns header its identity (b_dva + b_birth), + * inserts it into ARC hash to become globally visible and allocates physical + * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read + * completion arc_read_done() allocates ARC buffer(s) as needed, potentially + * sharing one of them with the physical ABD buffer. + * + * In case of write arc_alloc_buf() allocates ARC buffer to be filled with + * data. Then after compression and/or encryption arc_write_ready() allocates + * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD + * buffer. 
On disk write completion arc_write_done() assigns the header its + * new identity (b_dva + b_birth) and inserts into ARC hash. + * + * In case of partial overwrite the old data is read first as described. Then + * arc_release() either allocates new anonymous ARC header and moves the ARC + * buffer to it, or reuses the old ARC header by discarding its identity and + * removing it from ARC hash. After buffer modification normal write process + * follows as described. + */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - boolean_t protected, enum zio_compress compression_type, - arc_buf_contents_t type, boolean_t alloc_rdata) + boolean_t protected, enum zio_compress compression_type, uint8_t complevel, + arc_buf_contents_t type) { arc_buf_hdr_t *hdr; @@ -3390,20 +3279,19 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); + hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; - /* - * Allocate the hdr's buffer. This will contain either - * the compressed or uncompressed data depending on the block - * it references and compressed arc enablement. - */ - arc_hdr_alloc_abd(hdr, alloc_rdata); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -3533,7 +3421,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) arc_buf_hdr_t *nhdr; arc_buf_t *buf; kmem_cache_t *ncache, *ocache; - unsigned nsize, osize; /* * This function requires that hdr is in the arc_anon state. 
@@ -3550,14 +3437,10 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) if (need_crypt) { ncache = hdr_full_crypt_cache; - nsize = sizeof (hdr->b_crypt_hdr); ocache = hdr_full_cache; - osize = HDR_FULL_SIZE; } else { ncache = hdr_full_cache; - nsize = HDR_FULL_SIZE; ocache = hdr_full_crypt_cache; - osize = sizeof (hdr->b_crypt_hdr); } nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); @@ -3583,7 +3466,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; - nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits; nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; @@ -3628,7 +3510,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_l2_hits = 0; hdr->b_l1hdr.b_acb = NULL; hdr->b_l1hdr.b_pabd = NULL; @@ -3651,7 +3532,7 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It - * is also used to allow the root objset block to be uupdated without altering + * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. 
*/ @@ -3692,7 +3573,7 @@ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); + B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3708,7 +3589,7 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); @@ -3716,7 +3597,7 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); + B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3724,17 +3605,12 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - if (!arc_buf_is_shared(buf)) { - /* - * To ensure that the hdr has the correct data in it if we call - * arc_untransform() on this buf before it's been written to - * disk, it's easiest if we just set up sharing between the - * buf and the hdr. - */ - ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); - arc_hdr_free_abd(hdr, B_FALSE); - arc_share_buf(hdr, buf); - } + /* + * To ensure that the hdr has the correct data in it if we call + * arc_untransform() on this buf before it's been written to disk, + * it's easiest if we just set up sharing between the buf and the hdr. 
+ */ + arc_share_buf(hdr, buf); return (buf); } @@ -3743,7 +3619,7 @@ arc_buf_t * arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; @@ -3756,7 +3632,7 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, - compression_type, type, B_TRUE); + compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -3780,6 +3656,76 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, return (buf); } +static void +l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + arc_buf_contents_t type = hdr->b_type; + int64_t lsize_s; + int64_t psize_s; + int64_t asize_s; + + if (incr) { + lsize_s = lsize; + psize_s = psize; + asize_s = asize; + } else { + lsize_s = -lsize; + psize_s = -psize; + asize_s = -asize; + } + + /* If the buffer is a prefetch, count it as such. */ + if (HDR_PREFETCH(hdr)) { + ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); + } else { + /* + * We use the value stored in the L2 header upon initial + * caching in L2ARC. This value will be updated in case + * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC + * metadata (log entry) cannot currently be updated. Having + * the ARC state in the L2 header solves the problem of a + * possibly absent L1 header (apparent in buffers restored + * from persistent L2ARC). 
+ */ + switch (hdr->b_l2hdr.b_arcs_state) { + case ARC_STATE_MRU_GHOST: + case ARC_STATE_MRU: + ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); + break; + case ARC_STATE_MFU_GHOST: + case ARC_STATE_MFU: + ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); + break; + default: + break; + } + } + + if (state_only) + return; + + ARCSTAT_INCR(arcstat_l2_psize, psize_s); + ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); + + switch (type) { + case ARC_BUFC_DATA: + ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); + break; + case ARC_BUFC_METADATA: + ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); + break; + default: + break; + } +} + + static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { @@ -3793,9 +3739,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) list_remove(&dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - + l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), @@ -3831,8 +3775,13 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ - if (HDR_HAS_L2HDR(hdr)) + if (HDR_HAS_L2HDR(hdr)) { + + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + arc_hdr_l2hdr_destroy(hdr); + } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); @@ -3913,9 +3862,18 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted + * + * Return total size of evicted data buffers for eviction progress tracking. + * When evicting from ghost states return logical buffer size to make eviction + * progress at the same (or at least comparable) rate as from non-ghost states. + * + * Return *real_evicted for actual ARC size reduction to wake up threads + * waiting for it. For non-ghost states it includes size of evicted data + * buffers (the headers are not freed there). 
For ghost states it includes + * only the evicted headers size. */ static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; @@ -3925,6 +3883,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); + *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); @@ -3961,9 +3920,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); + *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); + *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } @@ -3987,8 +3948,10 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ARCSTAT_BUMP(arcstat_mutex_miss); break; } - if (buf->b_data != NULL) + if (buf->b_data != NULL) { bytes_evicted += HDR_GET_LSIZE(hdr); + *real_evicted += HDR_GET_LSIZE(hdr); + } mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } @@ -3999,6 +3962,21 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); + + switch (state->arcs_state) { + case ARC_STATE_MRU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mru, + HDR_GET_LSIZE(hdr)); + break; + case ARC_STATE_MFU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mfu, + HDR_GET_LSIZE(hdr)); + break; + default: + break; + } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); @@ -4009,6 +3987,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); + *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed @@ -4031,25 +4010,37 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t 
*hash_lock) return (bytes_evicted); } +static void +arc_set_need_free(void) +{ + ASSERT(MUTEX_HELD(&arc_evict_lock)); + int64_t remaining = arc_free_memory() - arc_sys_free / 2; + arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); + if (aw == NULL) { + arc_need_free = MAX(-remaining, 0); + } else { + arc_need_free = + MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); + } +} + static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, int64_t bytes) + uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; - uint64_t bytes_evicted = 0; + uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - int evict_count = 0; + int evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); - for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { - if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || - (evict_count >= zfs_arc_evict_batch_limit)) + if ((evict_count <= 0) || (bytes_evicted >= bytes)) break; /* @@ -4097,10 +4088,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { - uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + uint64_t revicted; + uint64_t evicted = arc_evict_hdr(hdr, hash_lock, + &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; + real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have @@ -4108,31 +4102,8 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * evict_count in this case. */ if (evicted != 0) - evict_count++; + evict_count--; - /* - * If arc_size isn't overflowing, signal any - * threads that might happen to be waiting. - * - * For each header evicted, we wake up a single - * thread. 
If we used cv_broadcast, we could - * wake up "too many" threads causing arc_size - * to significantly overflow arc_c; since - * arc_get_data_impl() doesn't check for overflow - * when it's woken up (it doesn't because it's - * possible for the ARC to be overflowing while - * full of un-evictable buffers, and the - * function should proceed in this case). - * - * If threads are left sleeping, due to not - * using cv_broadcast here, they will be woken - * up via cv_broadcast in arc_adjust_cb() just - * before arc_adjust_zthr sleeps. - */ - mutex_enter(&arc_adjust_lock); - if (!arc_is_overflowing()) - cv_signal(&arc_adjust_waiters_cv); - mutex_exit(&arc_adjust_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } @@ -4140,6 +4111,41 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, multilist_sublist_unlock(mls); + /* + * Increment the count of evicted bytes, and wake up any threads that + * are waiting for the count to reach this value. Since the list is + * ordered by ascending aew_count, we pop off the beginning of the + * list until we reach the end, or a waiter that's past the current + * "count". Doing this outside the loop reduces the number of times + * we need to acquire the global arc_evict_lock. + * + * Only wake when there's sufficient free memory in the system + * (specifically, arc_sys_free/2, which by default is a bit more than + * 1/64th of RAM). See the comments in arc_wait_for_eviction(). + */ + mutex_enter(&arc_evict_lock); + arc_evict_count += real_evicted; + + if (arc_free_memory() > arc_sys_free / 2) { + arc_evict_waiter_t *aw; + while ((aw = list_head(&arc_evict_waiters)) != NULL && + aw->aew_count <= arc_evict_count) { + list_remove(&arc_evict_waiters, aw); + cv_broadcast(&aw->aew_cv); + } + } + arc_set_need_free(); + mutex_exit(&arc_evict_lock); + + /* + * If the ARC size is reduced from arc_c_max to arc_c_min (especially + * if the average cached block is small), eviction can be on-CPU for + * many seconds. 
To ensure that other threads that may be bound to + * this CPU are able to make progress, make a voluntary preemption + * call here. + */ + cond_resched(); + return (bytes_evicted); } @@ -4157,16 +4163,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * the given arc state; which is used by arc_flush(). */ static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, +arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; - multilist_t *ml = state->arcs_list[type]; + multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - num_sublists = multilist_get_num_sublists(ml); /* @@ -4184,7 +4188,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, /* * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_adjust_type() and + * a marker. This fact is used in arc_evict_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; @@ -4198,7 +4202,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ - while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; @@ -4207,10 +4211,11 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * Request that 10% of the LRUs be scanned by the superblock * shrinker. 
*/ - if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size, - arc_dnode_limit) > 0) { - arc_prune_async((aggsum_upper_bound(&astat_dnode_size) - - arc_dnode_limit) / sizeof (dnode_t) / + if (type == ARC_BUFC_DATA && aggsum_compare( + &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { + arc_prune_async((aggsum_upper_bound( + &arc_sums.arcstat_dnode_size) - + arc_dnode_size_limit) / sizeof (dnode_t) / zfs_arc_dnode_reduce_percent); } @@ -4225,9 +4230,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, uint64_t bytes_remaining; uint64_t bytes_evicted; - if (bytes == ARC_EVICT_ALL) - bytes_remaining = ARC_EVICT_ALL; - else if (total_evicted < bytes) + if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; @@ -4310,57 +4313,6 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, return (evicted); } -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *ptr) -{ - arc_prune_t *ap = (arc_prune_t *)ptr; - arc_prune_func_t *func = ap->p_pfunc; - - if (func != NULL) - func(ap->p_adjust, ap->p_private); - - zfs_refcount_remove(&ap->p_refcnt, func); -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. 
- */ -static void -arc_prune_async(int64_t adjust) -{ - arc_prune_t *ap; - - mutex_enter(&arc_prune_mtx); - for (ap = list_head(&arc_prune_list); ap != NULL; - ap = list_next(&arc_prune_list, ap)) { - - if (zfs_refcount_count(&ap->p_refcnt) >= 2) - continue; - - zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); - ap->p_adjust = adjust; - if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == TASKQID_INVALID) { - zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); - continue; - } - ARCSTAT_BUMP(arcstat_prune); - } - mutex_exit(&arc_prune_mtx); -} - /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function @@ -4370,10 +4322,10 @@ arc_prune_async(int64_t adjust) * evict everything it can, when passed a negative value for "bytes". */ static uint64_t -arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, +arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { - int64_t delta; + uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), @@ -4402,7 +4354,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, * available for reclaim. */ static uint64_t -arc_adjust_meta_balanced(uint64_t meta_used) +arc_evict_meta_balanced(uint64_t meta_used) { int64_t delta, prune = 0, adjustmnt; uint64_t total_evicted = 0; @@ -4412,7 +4364,7 @@ arc_adjust_meta_balanced(uint64_t meta_used) restart: /* * This slightly differs than the way we evict from the mru in - * arc_adjust because we don't have a "target" value (i.e. no + * arc_evict because we don't have a "target" value (i.e. no * "meta" arc_p). As a result, I think we can completely * cannibalize the metadata in the MRU before we evict the * metadata from the MFU. 
I think we probably need to implement a @@ -4424,7 +4376,7 @@ restart: zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), adjustmnt); - total_evicted += arc_adjust_impl(arc_mru, 0, delta, type); + total_evicted += arc_evict_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } @@ -4442,7 +4394,7 @@ restart: zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), adjustmnt); - total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); + total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); } adjustmnt = meta_used - arc_meta_limit; @@ -4451,7 +4403,7 @@ restart: zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); + total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } @@ -4459,7 +4411,7 @@ restart: zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); + total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); } /* @@ -4490,11 +4442,11 @@ restart: } /* - * Evict metadata buffers from the cache, such that arc_meta_used is + * Evict metadata buffers from the cache, such that arcstat_meta_used is * capped by the arc_meta_limit tunable. 
*/ static uint64_t -arc_adjust_meta_only(uint64_t meta_used) +arc_evict_meta_only(uint64_t meta_used) { uint64_t total_evicted = 0; int64_t target; @@ -4510,7 +4462,7 @@ arc_adjust_meta_only(uint64_t meta_used) (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us @@ -4521,18 +4473,18 @@ arc_adjust_meta_only(uint64_t meta_used) (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); - total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } static uint64_t -arc_adjust_meta(uint64_t meta_used) +arc_evict_meta(uint64_t meta_used) { if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_adjust_meta_only(meta_used)); + return (arc_evict_meta_only(meta_used)); else - return (arc_adjust_meta_balanced(meta_used)); + return (arc_evict_meta_balanced(meta_used)); } /* @@ -4544,10 +4496,10 @@ arc_adjust_meta(uint64_t meta_used) * returned. */ static arc_buf_contents_t -arc_adjust_type(arc_state_t *state) +arc_evict_type(arc_state_t *state) { - multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; + multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; + multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; @@ -4611,22 +4563,22 @@ arc_adjust_type(arc_state_t *state) } /* - * Evict buffers from the cache, such that arc_size is capped by arc_c. + * Evict buffers from the cache, such that arcstat_size is capped by arc_c. 
*/ static uint64_t -arc_adjust(void) +arc_evict(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; - uint64_t asize = aggsum_value(&arc_size); - uint64_t ameta = aggsum_value(&arc_meta_used); + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ - total_evicted += arc_adjust_meta(ameta); + total_evicted += arc_evict_meta(ameta); /* * Adjust MRU size @@ -4650,9 +4602,9 @@ arc_adjust(void) * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ - if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* @@ -4662,9 +4614,9 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* @@ -4674,14 +4626,14 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Re-sum ARC stats after the first round of evictions. 
*/ - asize = aggsum_value(&arc_size); - ameta = aggsum_value(&arc_meta_used); + asize = aggsum_value(&arc_sums.arcstat_size); + ameta = aggsum_value(&arc_sums.arcstat_meta_used); /* @@ -4693,9 +4645,9 @@ arc_adjust(void) */ target = asize - arc_c; - if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && + if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* @@ -4705,9 +4657,9 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* @@ -4717,7 +4669,7 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* @@ -4734,13 +4686,13 @@ arc_adjust(void) target = zfs_refcount_count(&arc_mru->arcs_size) + zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += - arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than @@ -4753,13 +4705,13 @@ arc_adjust(void) target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; - bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += - arc_adjust_impl(arc_mfu_ghost, 0, target, 
ARC_BUFC_METADATA); + arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } @@ -4792,17 +4744,24 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } -static void +void arc_reduce_target_size(int64_t to_free) { - uint64_t asize = aggsum_value(&arc_size); - uint64_t c = arc_c; + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + + /* + * All callers want the ARC to actually evict (at least) this much + * memory. Therefore we reduce from the lower of the current size and + * the target size. This way, even if arc_c is much higher than + * arc_size (as can be the case after many calls to arc_freed(), we will + * immediately have arc_c < arc_size and therefore the arc_evict_zthr + * will evict. + */ + uint64_t c = MIN(arc_c, asize); if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (asize < arc_c) - arc_c = MAX(asize, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); @@ -4812,216 +4771,26 @@ arc_reduce_target_size(int64_t to_free) } if (asize > arc_c) { - /* See comment in arc_adjust_cb_check() on why lock+flag */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); } } -/* - * Return maximum amount of memory that we could possibly use. Reduced - * to half of all memory in user space which is primarily used for testing. 
- */ -static uint64_t -arc_all_memory(void) -{ -#ifdef _KERNEL -#ifdef CONFIG_HIGHMEM - return (ptob(zfs_totalram_pages - zfs_totalhigh_pages)); -#else - return (ptob(zfs_totalram_pages)); -#endif /* CONFIG_HIGHMEM */ -#else - return (ptob(physmem) / 2); -#endif /* _KERNEL */ -} - -/* - * Return the amount of memory that is considered free. In user space - * which is primarily used for testing we pretend that free memory ranges - * from 0-20% of all memory. - */ -static uint64_t -arc_free_memory(void) -{ -#ifdef _KERNEL -#ifdef CONFIG_HIGHMEM - struct sysinfo si; - si_meminfo(&si); - return (ptob(si.freeram - si.freehigh)); -#else - return (ptob(nr_free_pages() + - nr_inactive_file_pages() + - nr_inactive_anon_pages() + - nr_slab_reclaimable_pages())); - -#endif /* CONFIG_HIGHMEM */ -#else - return (spa_get_random(arc_all_memory() * 20 / 100)); -#endif /* _KERNEL */ -} - -typedef enum free_memory_reason_t { - FMR_UNKNOWN, - FMR_NEEDFREE, - FMR_LOTSFREE, - FMR_SWAPFS_MINFREE, - FMR_PAGES_PP_MAXIMUM, - FMR_HEAP_ARENA, - FMR_ZIO_ARENA, -} free_memory_reason_t; - -int64_t last_free_memory; -free_memory_reason_t last_free_reason; - -#ifdef _KERNEL -/* - * Additional reserve of pages for pp_reserve. - */ -int64_t arc_pages_pp_reserve = 64; - -/* - * Additional reserve of pages for swapfs. - */ -int64_t arc_swapfs_reserve = 64; -#endif /* _KERNEL */ - -/* - * Return the amount of memory that can be consumed before reclaim will be - * needed. Positive if there is sufficient free memory, negative indicates - * the amount of memory that needs to be freed up. 
- */ -static int64_t -arc_available_memory(void) -{ - int64_t lowest = INT64_MAX; - free_memory_reason_t r = FMR_UNKNOWN; -#ifdef _KERNEL - int64_t n; -#ifdef __linux__ -#ifdef freemem -#undef freemem -#endif - pgcnt_t needfree = btop(arc_need_free); - pgcnt_t lotsfree = btop(arc_sys_free); - pgcnt_t desfree = 0; - pgcnt_t freemem = btop(arc_free_memory()); -#endif - - if (needfree > 0) { - n = PAGESIZE * (-needfree); - if (n < lowest) { - lowest = n; - r = FMR_NEEDFREE; - } - } - - /* - * check that we're out of range of the pageout scanner. It starts to - * schedule paging if freemem is less than lotsfree and needfree. - * lotsfree is the high-water mark for pageout, and needfree is the - * number of needed free pages. We add extra pages here to make sure - * the scanner doesn't start up while we're freeing memory. - */ - n = PAGESIZE * (freemem - lotsfree - needfree - desfree); - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - -#ifndef __linux__ - /* - * check to make sure that swapfs has enough space so that anon - * reservations can still succeed. anon_resvmem() checks that the - * availrmem is greater than swapfs_minfree, and the number of reserved - * swap pages. We also add a bit of extra here just to prevent - * circumstances from getting really dire. - */ - n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - - desfree - arc_swapfs_reserve); - if (n < lowest) { - lowest = n; - r = FMR_SWAPFS_MINFREE; - } - - /* - * Check that we have enough availrmem that memory locking (e.g., via - * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum - * stores the number of pages that cannot be locked; when availrmem - * drops below pages_pp_maximum, page locking mechanisms such as - * page_pp_lock() will fail.) 
- */ - n = PAGESIZE * (availrmem - pages_pp_maximum - - arc_pages_pp_reserve); - if (n < lowest) { - lowest = n; - r = FMR_PAGES_PP_MAXIMUM; - } -#endif - -#if defined(_ILP32) - /* - * If we're on a 32-bit platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) - */ - n = vmem_size(heap_arena, VMEM_FREE) - - (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); - if (n < lowest) { - lowest = n; - r = FMR_HEAP_ARENA; - } -#endif - - /* - * If zio data pages are being allocated out of a separate heap segment, - * then enforce that the size of available vmem for this arena remains - * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. - * - * Note that reducing the arc_zio_arena_free_shift keeps more virtual - * memory (in the zio_arena) free, which can avoid memory - * fragmentation issues. - */ - if (zio_arena != NULL) { - n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - - (vmem_size(zio_arena, VMEM_ALLOC) >> - arc_zio_arena_free_shift); - if (n < lowest) { - lowest = n; - r = FMR_ZIO_ARENA; - } - } -#else /* _KERNEL */ - /* Every 100 calls, free a small amount */ - if (spa_get_random(100) == 0) - lowest = -1024; -#endif /* _KERNEL */ - - last_free_memory = lowest; - last_free_reason = r; - - return (lowest); -} /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. 
*/ -static boolean_t +boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } -static void +void arc_kmem_reap_soon(void) { size_t i; @@ -5029,11 +4798,10 @@ arc_kmem_reap_soon(void) kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; - extern kmem_cache_t *range_seg_cache; #ifdef _KERNEL - if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) && - zfs_arc_meta_prune) { + if ((aggsum_compare(&arc_sums.arcstat_meta_used, + arc_meta_limit) >= 0) && zfs_arc_meta_prune) { /* * We are exceeding our meta-data cache limit. * Prune some entries to release holds on meta-data. @@ -5066,29 +4834,15 @@ arc_kmem_reap_soon(void) kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); - kmem_cache_reap_now(range_seg_cache); - - if (zio_arena != NULL) { - /* - * Ask the vmem arena to reclaim unused memory from its - * quantum caches. - */ - vmem_qcache_reap(zio_arena); - } + kmem_cache_reap_now(zfs_btree_leaf_cache); + abd_cache_reap_now(); } /* ARGSUSED */ static boolean_t -arc_adjust_cb_check(void *arg, zthr_t *zthr) +arc_evict_cb_check(void *arg, zthr_t *zthr) { - /* - * This is necessary so that any changes which may have been made to - * many of the zfs_arc_* module parameters will be propagated to - * their actual internal variable counterparts. Without this, - * changing those module params at runtime would have no effect. - */ - arc_tuning_update(); - +#ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the @@ -5096,71 +4850,74 @@ arc_adjust_cb_check(void *arg, zthr_t *zthr) * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. 
Even - * with this change, the data might be up to 1 second - * out of date(the arc_adjust_zthr has a maximum sleep - * time of 1 second); but that should suffice. The - * arc_state_t structures can be queried directly if more - * accurate information is needed. + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this call, the data might be out of date if the + * evict thread hasn't been woken recently; but that should + * suffice. The arc_state_t structures can be queried + * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); +#endif /* - * We have to rely on arc_get_data_impl() to tell us when to adjust, - * rather than checking if we are overflowing here, so that we are - * sure to not leave arc_get_data_impl() waiting on - * arc_adjust_waiters_cv. If we have become "not overflowing" since - * arc_get_data_impl() checked, we need to wake it up. We could - * broadcast the CV here, but arc_get_data_impl() may have not yet - * gone to sleep. We would need to use a mutex to ensure that this - * function doesn't broadcast until arc_get_data_impl() has gone to - * sleep (e.g. the arc_adjust_lock). However, the lock ordering of - * such a lock would necessarily be incorrect with respect to the - * zthr_lock, which is held before this function is called, and is - * held by arc_get_data_impl() when it calls zthr_wakeup(). + * We have to rely on arc_wait_for_eviction() to tell us when to + * evict, rather than checking if we are overflowing here, so that we + * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. + * If we have become "not overflowing" since arc_wait_for_eviction() + * checked, we need to wake it up. We could broadcast the CV here, + * but arc_wait_for_eviction() may have not yet gone to sleep. We + * would need to use a mutex to ensure that this function doesn't + * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. + * the arc_evict_lock). 
However, the lock ordering of such a lock + * would necessarily be incorrect with respect to the zthr_lock, + * which is held before this function is called, and is held by + * arc_wait_for_eviction() when it calls zthr_wakeup(). */ - return (arc_adjust_needed); + return (arc_evict_needed); } /* - * Keep arc_size under arc_c by running arc_adjust which evicts data + * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ /* ARGSUSED */ static void -arc_adjust_cb(void *arg, zthr_t *zthr) +arc_evict_cb(void *arg, zthr_t *zthr) { uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Evict from cache */ - evicted = arc_adjust(); + evicted = arc_evict(); /* * If evicted is zero, we couldn't evict anything - * via arc_adjust(). This could be due to hash lock + * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the - * broadcast will wake any remaining arc adjust waiters. + * broadcast will wake any remaining arc evict waiters. */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && - evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; - if (!arc_adjust_needed) { + mutex_enter(&arc_evict_lock); + arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; + if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. 
*/ - cv_broadcast(&arc_adjust_waiters_cv); - arc_need_free = 0; + arc_evict_waiter_t *aw; + while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { + cv_broadcast(&aw->aew_cv); + } + arc_set_need_free(); } - mutex_exit(&arc_adjust_lock); + mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } @@ -5169,6 +4926,7 @@ static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { int64_t free_memory = arc_available_memory(); + static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must @@ -5193,13 +4951,21 @@ arc_reap_cb_check(void *arg, zthr_t *zthr) arc_no_grow = B_FALSE; } + /* + * Called unconditionally every 60 seconds to reclaim unused + * zstd compression and decompression context. This is done + * here to avoid the need for an independent thread. + */ + if (!((reap_cb_check_counter++) % 60)) + zfs_zstd_cache_reap_now(); + return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the - * target size of the cache (arc_c), causing the arc_adjust_cb() + * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ /* ARGSUSED */ @@ -5230,7 +4996,7 @@ arc_reap_cb(void *arg, zthr_t *zthr) * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional - * amount. If free memory is positive but less then the fractional + * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. 
*/ free_memory = arc_available_memory(); @@ -5238,9 +5004,6 @@ arc_reap_cb(void *arg, zthr_t *zthr) int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { -#ifdef _KERNEL - to_free = MAX(to_free, arc_need_free); -#endif arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); @@ -5292,109 +5055,7 @@ arc_reap_cb(void *arg, zthr_t *zthr) * already below arc_c_min, evicting any more would only * increase this negative difference. */ -static uint64_t -arc_evictable_memory(void) -{ - int64_t asize = aggsum_value(&arc_size); - uint64_t arc_clean = - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); - uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0); - /* - * Scale reported evictable memory in proportion to page cache, cap - * at specified min/max. - */ - uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent; - min = MAX(arc_c_min, MIN(arc_c_max, min)); - - if (arc_dirty >= min) - return (arc_clean); - - return (MAX((int64_t)asize - (int64_t)min, 0)); -} - -/* - * If sc->nr_to_scan is zero, the caller is requesting a query of the - * number of objects which can potentially be freed. If it is nonzero, - * the request is to free that many objects. - * - * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks - * in struct shrinker and also require the shrinker to return the number - * of objects freed. - * - * Older kernels require the shrinker to return the number of freeable - * objects following the freeing of nr_to_free. 
- */ -static spl_shrinker_t -__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) -{ - int64_t pages; - - /* The arc is considered warm once reclaim has occurred */ - if (unlikely(arc_warm == B_FALSE)) - arc_warm = B_TRUE; - - /* Return the potential number of reclaimable pages */ - pages = btop((int64_t)arc_evictable_memory()); - if (sc->nr_to_scan == 0) - return (pages); - - /* Not allowed to perform filesystem reclaim */ - if (!(sc->gfp_mask & __GFP_FS)) - return (SHRINK_STOP); - - /* Reclaim in progress */ - if (mutex_tryenter(&arc_adjust_lock) == 0) { - ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan)); - return (0); - } - - mutex_exit(&arc_adjust_lock); - - /* - * Evict the requested number of pages by shrinking arc_c the - * requested amount. - */ - if (pages > 0) { - arc_reduce_target_size(ptob(sc->nr_to_scan)); - if (current_is_kswapd()) - arc_kmem_reap_soon(); -#ifdef HAVE_SPLIT_SHRINKER_CALLBACK - pages = MAX((int64_t)pages - - (int64_t)btop(arc_evictable_memory()), 0); -#else - pages = btop(arc_evictable_memory()); -#endif - /* - * We've shrunk what we can, wake up threads. - */ - cv_broadcast(&arc_adjust_waiters_cv); - } else - pages = SHRINK_STOP; - - /* - * When direct reclaim is observed it usually indicates a rapid - * increase in memory pressure. This occurs because the kswapd - * threads were unable to asynchronously keep enough free memory - * available. In this case set arc_no_grow to briefly pause arc - * growth to avoid compounding the memory pressure. 
- */ - if (current_is_kswapd()) { - ARCSTAT_BUMP(arcstat_memory_indirect_count); - } else { - arc_no_grow = B_TRUE; - arc_kmem_reap_soon(); - ARCSTAT_BUMP(arcstat_memory_direct_count); - } - - return (pages); -} -SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); - -SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS); #endif /* _KERNEL */ /* @@ -5410,9 +5071,6 @@ arc_adapt(int bytes, arc_state_t *state) int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - if (state == arc_l2c_only) - return; - ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: @@ -5459,8 +5117,8 @@ arc_adapt(int bytes, arc_state_t *state) * cache size, increment the target cache size */ ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); - if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >= - 0) { + if (aggsum_upper_bound(&arc_sums.arcstat_size) >= + arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; @@ -5476,11 +5134,11 @@ arc_adapt(int bytes, arc_state_t *state) * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ -static boolean_t -arc_is_overflowing(void) +static arc_ovf_level_t +arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ - uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, + int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* @@ -5492,15 +5150,21 @@ arc_is_overflowing(void) * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ - return (aggsum_lower_bound(&arc_size) >= arc_c + overflow); + int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - + arc_c - overflow / 2; + if (!use_reserve) + overflow /= 2; + return (over < 0 ? ARC_OVF_NONE : + over < overflow ? 
ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * -arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, + int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag); + arc_get_data_impl(hdr, size, tag, alloc_flags); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { @@ -5514,7 +5178,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag); + arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5523,6 +5187,85 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) } } +/* + * Wait for the specified amount of data (in bytes) to be evicted from the + * ARC, and for there to be sufficient free memory in the system. Waiting for + * eviction ensures that the memory used by the ARC decreases. Waiting for + * free memory ensures that the system won't run out of free pages, regardless + * of ARC behavior and settings. See arc_lowmem_init(). + */ +void +arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) +{ + switch (arc_is_overflowing(use_reserve)) { + case ARC_OVF_NONE: + return; + case ARC_OVF_SOME: + /* + * This is a bit racy without taking arc_evict_lock, but the + * worst that can happen is we either call zthr_wakeup() extra + * time due to a race with another thread here, or the set flag + * gets cleared by arc_evict_cb(), which is unlikely due to + * big hysteresis, but also not important since at this level + * of overflow the eviction is purely advisory. Meanwhile, + * taking the global lock here every time without waiting for + * the actual eviction creates significant lock contention.
+ */ + if (!arc_evict_needed) { + arc_evict_needed = B_TRUE; + zthr_wakeup(arc_evict_zthr); + } + return; + case ARC_OVF_SEVERE: + default: + { + arc_evict_waiter_t aw; + list_link_init(&aw.aew_node); + cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); + + uint64_t last_count = 0; + mutex_enter(&arc_evict_lock); + if (!list_is_empty(&arc_evict_waiters)) { + arc_evict_waiter_t *last = + list_tail(&arc_evict_waiters); + last_count = last->aew_count; + } else if (!arc_evict_needed) { + arc_evict_needed = B_TRUE; + zthr_wakeup(arc_evict_zthr); + } + /* + * Note, the last waiter's count may be less than + * arc_evict_count if we are low on memory in which + * case arc_evict_state_impl() may have deferred + * wakeups (but still incremented arc_evict_count). + */ + aw.aew_count = MAX(last_count, arc_evict_count) + amount; + + list_insert_tail(&arc_evict_waiters, &aw); + + arc_set_need_free(); + + DTRACE_PROBE3(arc__wait__for__eviction, + uint64_t, amount, + uint64_t, arc_evict_count, + uint64_t, aw.aew_count); + + /* + * We will be woken up either when arc_evict_count reaches + * aew_count, or when the ARC is no longer overflowing and + * eviction completes. + * In case of "false" wakeup, we will still be on the list. + */ + do { + cv_wait(&aw.aew_cv, &arc_evict_lock); + } while (list_link_active(&aw.aew_node)); + mutex_exit(&arc_evict_lock); + + cv_destroy(&aw.aew_cv); + } + } +} + /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction @@ -5530,49 +5273,30 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * limit, we'll only signal the reclaim thread and continue on. 
*/ static void -arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, + int alloc_flags) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); - arc_adapt(size, state); + if (alloc_flags & ARC_HDR_DO_ADAPT) + arc_adapt(size, state); /* - * If arc_size is currently overflowing, and has grown past our - * upper limit, we must be adding data faster than the evict - * thread can evict. Thus, to ensure we don't compound the + * If arc_size is currently overflowing, we must be adding data + * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even - * further past it's target size, we halt and wait for the - * eviction thread to catch up. + * further past its target size, we wait for the eviction thread to + * make some progress. We also wait for there to be sufficient free + * memory in the system, as measured by arc_free_memory(). * - * It's also possible that the reclaim thread is unable to evict - * enough buffers to get arc_size below the overflow limit (e.g. - * due to buffers being un-evictable, or hash lock collisions). - * In this case, we want to proceed regardless if we're - * overflowing; thus we don't use a while loop here. + * Specifically, we wait for zfs_arc_eviction_pct percent of the + * requested size to be evicted. This should be more than 100%, to + * ensure that progress is also made towards getting arc_size + * under arc_c. See the comment above zfs_arc_eviction_pct. */ - if (arc_is_overflowing()) { - mutex_enter(&arc_adjust_lock); - - /* - * Now that we've acquired the lock, we may no longer be - * over the overflow limit, lets check. - * - * We're ignoring the case of spurious wake ups. If that - * were to happen, it'd let this thread consume an ARC - * buffer before it should have (i.e.
before we're under - * the overflow limit and were signalled by the reclaim - * thread). As long as that is a rare occurrence, it - * shouldn't cause any harm. - */ - if (arc_is_overflowing()) { - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - (void) cv_wait(&arc_adjust_waiters_cv, - &arc_adjust_lock); - } - mutex_exit(&arc_adjust_lock); - } + arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, + alloc_flags & ARC_HDR_USE_RESERVE); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -5608,7 +5332,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ - if (aggsum_compare(&arc_size, arc_c) < 0 && + if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && hdr->b_l1hdr.b_state == arc_anon && (zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) @@ -5706,11 +5430,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); - atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); + hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } hdr->b_l1hdr.b_arc_access = now; return; @@ -5732,7 +5460,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } - atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); + hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; @@ -5741,13 +5469,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * was evicted from the cache. Move it to the * MFU state. 
*/ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { @@ -5758,7 +5489,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); + hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* @@ -5771,7 +5502,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * the head of the list now. */ - atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); + hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { @@ -5794,7 +5525,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); + hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* @@ -5969,6 +5700,9 @@ arc_read_done(zio_t *zio) } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } + if (!HDR_L2_READING(hdr)) { + hdr->b_complevel = zio->io_prop.zp_complevel; + } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); @@ -5997,7 +5731,7 @@ arc_read_done(zio_t *zio) */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { - if (!acb->acb_done) + if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; @@ -6026,8 +5760,9 @@ arc_read_done(zio_t *zio) error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { 
spa_log_error(zio->io_spa, &acb->acb_zb); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); + (void) zfs_ereport_post( + FM_EREPORT_ZFS_AUTHENTICATION, + zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } @@ -6161,12 +5896,39 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); + boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!BP_IS_REDACTED(bp)); + /* + * Normally SPL_FSTRANS will already be set since kernel threads which + * expect to call the DMU interfaces will set it when created. System + * calls are similarly handled by setting/cleaning the bit in the + * registered callback (module/os/.../zfs/zpl_*). + * + * External consumers such as Lustre which call the exported DMU + * interfaces may not have set SPL_FSTRANS. To avoid a deadlock + * on the hash_lock always set and clear the bit. + */ + fstrans_cookie_t cookie = spl_fstrans_mark(); top: + /* + * Verify the block pointer contents are reasonable. This should + * always be the case since the blkptr is protected by a checksum. + * However, if there is damage it's desirable to detect this early + * and treat it as a checksum error. This allows an alternate blkptr + * to be tried when one is available (e.g. ditto blocks). + */ + if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, + BLK_VERIFY_LOG)) { + rc = SET_ERROR(ECKSUM); + goto out; + } + if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". @@ -6177,7 +5939,7 @@ top: /* * Determine if we have an L1 cache hit or a cache miss. For simplicity - * we maintain encrypted data seperately from compressed / uncompressed + * we maintain encrypted data separately from compressed / uncompressed * data. 
If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. @@ -6190,6 +5952,13 @@ top: if (HDR_IO_IN_PROGRESS(hdr)) { zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; + if (*arc_flags & ARC_FLAG_CACHED_ONLY) { + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_cached_only_in_progress); + rc = SET_ERROR(ENOENT); + goto out; + } + ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { @@ -6225,6 +5994,7 @@ top: acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; + acb->acb_nobuf = no_buf; acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, @@ -6234,8 +6004,6 @@ top: acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - mutex_exit(hash_lock); - goto out; } mutex_exit(hash_lock); goto out; @@ -6244,7 +6012,7 @@ top: ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); - if (done) { + if (done && !no_buf) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to @@ -6282,9 +6050,9 @@ top: rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb); - zfs_ereport_post( + (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } } if (rc != 0) { @@ -6298,8 +6066,12 @@ top: ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, 
hash_lock); @@ -6324,13 +6096,12 @@ top: boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; + int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; - /* - * Gracefully handle a damaged logical block size as a - * checksum error. - */ - if (lsize > spa_maxblocksize(spa)) { - rc = SET_ERROR(ECKSUM); + if (*arc_flags & ARC_FLAG_CACHED_ONLY) { + rc = SET_ERROR(ENOENT); + if (hash_lock != NULL) + mutex_exit(hash_lock); goto out; } @@ -6342,8 +6113,7 @@ top: arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type, - encrypted_read); + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); @@ -6357,6 +6127,7 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } + alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6402,10 +6173,11 @@ top: * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). 
*/ + arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); - arc_hdr_alloc_abd(hdr, encrypted_read); } + arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); @@ -6432,8 +6204,13 @@ top: } if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) @@ -6500,9 +6277,14 @@ top: ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); + zfs_racct_read(size, 1); } - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { + /* Check if the spa even has l2 configured */ + const boolean_t spa_has_l2 = l2arc_ndev != 0 && + spa->spa_l2cache.sav_count > 0; + + if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. @@ -6510,7 +6292,7 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch and l2arc_noprefetch is set. + * 5. This isn't prefetch or l2arc_noprefetch is 0. 
*/ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && @@ -6521,7 +6303,7 @@ top: DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr.b_hits); + hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); @@ -6530,6 +6312,17 @@ top: cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; + /* + * When Compressed ARC is disabled, but the + * L2ARC block is compressed, arc_hdr_size() + * will have returned LSIZE rather than PSIZE. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr) && + HDR_GET_PSIZE(hdr) != 0) { + size = HDR_GET_PSIZE(hdr); + } + asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, @@ -6592,15 +6385,24 @@ top: } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); + /* - * Skip ARC stat bump for block pointers with - * embedded data. The data are read from the blkptr - * itself via decode_embedded_bp_compressed(). + * Only a spa with l2 should contribute to l2 + * miss stats. (Including the case of having a + * faulted cache device - that's also a miss.) */ - if (l2arc_ndev != 0 && !embedded_bp) { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); + if (spa_has_l2) { + /* + * Skip ARC stat bump for block pointers with + * embedded data. The data are read from the + * blkptr itself via + * decode_embedded_bp_compressed(). 
+ */ + if (!embedded_bp) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } } } @@ -6624,6 +6426,7 @@ out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); + spl_fstrans_unmark(cookie); return (rc); } @@ -6744,7 +6547,6 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(HDR_EMPTY(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); @@ -6846,7 +6648,7 @@ arc_release(arc_buf_t *buf, void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_abd(hdr, B_FALSE); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -6896,7 +6698,7 @@ arc_release(arc_buf_t *buf, void *tag) * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, - compress, type, HDR_HAS_RABD(hdr)); + compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); @@ -6907,11 +6709,6 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_l1hdr.b_bufcnt = 1; if (ARC_BUF_ENCRYPTED(buf)) nhdr->b_crypt_hdr.b_ebufcnt = 1; - nhdr->b_l1hdr.b_mru_hits = 0; - nhdr->b_l1hdr.b_mru_ghost_hits = 0; - nhdr->b_l1hdr.b_mfu_hits = 0; - nhdr->b_l1hdr.b_mfu_ghost_hits = 0; - nhdr->b_l1hdr.b_l2_hits = 0; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; @@ -6928,7 +6725,6 @@ arc_release(arc_buf_t *buf, void *tag) hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_l2_hits = 0; arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; @@ -7060,6 +6856,7 @@ arc_write_ready(zio_t *zio) } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, 
compress); + hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; @@ -7081,9 +6878,11 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, B_TRUE); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); - } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { + } else if (!abd_size_alloc_linear(arc_buf_size(buf)) || + !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the @@ -7091,16 +6890,19 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, B_TRUE); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, B_FALSE); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, B_FALSE); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -7218,7 +7020,7 @@ arc_write_done(zio_t *zio) ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); - abd_put(zio->io_abd); + abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } @@ -7248,6 +7050,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); + localprop.zp_complevel = hdr->b_complevel; 
localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; @@ -7266,6 +7069,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); + localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); @@ -7314,49 +7118,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, return (zio); } -static int -arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) -{ -#ifdef _KERNEL - uint64_t available_memory = arc_free_memory(); - -#if defined(_ILP32) - available_memory = - MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); -#endif - - if (available_memory > arc_all_memory() * arc_lotsfree_percent / 100) - return (0); - - if (txg > spa->spa_lowmem_last_txg) { - spa->spa_lowmem_last_txg = txg; - spa->spa_lowmem_page_load = 0; - } - /* - * If we are in pageout, we know that memory is already tight, - * the arc is already going to be evicting, so we just want to - * continue to let page writes occur as quickly as possible. 
- */ - if (current_is_kswapd()) { - if (spa->spa_lowmem_page_load > - MAX(arc_sys_free / 4, available_memory) / 4) { - DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); - return (SET_ERROR(ERESTART)); - } - /* Note: reserve is inflated, so we deflate */ - atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); - return (0); - } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { - /* memory is low, delay before restarting */ - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); - return (SET_ERROR(EAGAIN)); - } - spa->spa_lowmem_page_load = 0; -#endif /* _KERNEL */ - return (0); -} - void arc_tempreserve_clear(uint64_t reserve) { @@ -7423,9 +7184,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); - - if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && - anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && + uint64_t rarc_c = arc_warm ? 
arc_c : arc_c_max; + if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && + anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( @@ -7433,9 +7194,12 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " - "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve >> 10, meta_esize >> 10, - data_esize >> 10, reserve >> 10, arc_c >> 10); + "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", + (u_longlong_t)arc_tempreserve >> 10, + (u_longlong_t)meta_esize >> 10, + (u_longlong_t)data_esize >> 10, + (u_longlong_t)reserve >> 10, + (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); @@ -7460,48 +7224,219 @@ arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; - if (rw == KSTAT_WRITE) { + if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); - } else { - arc_kstat_update_state(arc_anon, - &as->arcstat_anon_size, - &as->arcstat_anon_evictable_data, - &as->arcstat_anon_evictable_metadata); - arc_kstat_update_state(arc_mru, - &as->arcstat_mru_size, - &as->arcstat_mru_evictable_data, - &as->arcstat_mru_evictable_metadata); - arc_kstat_update_state(arc_mru_ghost, - &as->arcstat_mru_ghost_size, - &as->arcstat_mru_ghost_evictable_data, - &as->arcstat_mru_ghost_evictable_metadata); - arc_kstat_update_state(arc_mfu, - &as->arcstat_mfu_size, - &as->arcstat_mfu_evictable_data, - &as->arcstat_mfu_evictable_metadata); - arc_kstat_update_state(arc_mfu_ghost, - &as->arcstat_mfu_ghost_size, - &as->arcstat_mfu_ghost_evictable_data, - &as->arcstat_mfu_ghost_evictable_metadata); - ARCSTAT(arcstat_size) = aggsum_value(&arc_size); - ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); - ARCSTAT(arcstat_data_size) = 
aggsum_value(&astat_data_size); - ARCSTAT(arcstat_metadata_size) = - aggsum_value(&astat_metadata_size); - ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); - ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); - ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); - ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); - ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); + as->arcstat_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_hits); + as->arcstat_misses.value.ui64 = + wmsum_value(&arc_sums.arcstat_misses); + as->arcstat_demand_data_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_data_hits); + as->arcstat_demand_data_misses.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_data_misses); + as->arcstat_demand_metadata_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_metadata_hits); + as->arcstat_demand_metadata_misses.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_metadata_misses); + as->arcstat_prefetch_data_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_data_hits); + as->arcstat_prefetch_data_misses.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_data_misses); + as->arcstat_prefetch_metadata_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); + as->arcstat_prefetch_metadata_misses.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); + as->arcstat_mru_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_mru_hits); + as->arcstat_mru_ghost_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_mru_ghost_hits); + as->arcstat_mfu_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_mfu_hits); + as->arcstat_mfu_ghost_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); + as->arcstat_deleted.value.ui64 = + wmsum_value(&arc_sums.arcstat_deleted); + as->arcstat_mutex_miss.value.ui64 = + wmsum_value(&arc_sums.arcstat_mutex_miss); + as->arcstat_access_skip.value.ui64 = + wmsum_value(&arc_sums.arcstat_access_skip); + 
as->arcstat_evict_skip.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_skip); + as->arcstat_evict_not_enough.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_not_enough); + as->arcstat_evict_l2_cached.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_l2_cached); + as->arcstat_evict_l2_eligible.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_l2_eligible); + as->arcstat_evict_l2_eligible_mfu.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); + as->arcstat_evict_l2_eligible_mru.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); + as->arcstat_evict_l2_ineligible.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); + as->arcstat_evict_l2_skip.value.ui64 = + wmsum_value(&arc_sums.arcstat_evict_l2_skip); + as->arcstat_hash_collisions.value.ui64 = + wmsum_value(&arc_sums.arcstat_hash_collisions); + as->arcstat_hash_chains.value.ui64 = + wmsum_value(&arc_sums.arcstat_hash_chains); + as->arcstat_size.value.ui64 = + aggsum_value(&arc_sums.arcstat_size); + as->arcstat_compressed_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_compressed_size); + as->arcstat_uncompressed_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_uncompressed_size); + as->arcstat_overhead_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_overhead_size); + as->arcstat_hdr_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_hdr_size); + as->arcstat_data_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_data_size); + as->arcstat_metadata_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_metadata_size); + as->arcstat_dbuf_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_dbuf_size); +#if defined(COMPAT_FREEBSD11) + as->arcstat_other_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_bonus_size) + + aggsum_value(&arc_sums.arcstat_dnode_size) + + wmsum_value(&arc_sums.arcstat_dbuf_size); +#endif - as->arcstat_memory_all_bytes.value.ui64 = - arc_all_memory(); - as->arcstat_memory_free_bytes.value.ui64 = - arc_free_memory(); - 
as->arcstat_memory_available_bytes.value.i64 = - arc_available_memory(); - } + arc_kstat_update_state(arc_anon, + &as->arcstat_anon_size, + &as->arcstat_anon_evictable_data, + &as->arcstat_anon_evictable_metadata); + arc_kstat_update_state(arc_mru, + &as->arcstat_mru_size, + &as->arcstat_mru_evictable_data, + &as->arcstat_mru_evictable_metadata); + arc_kstat_update_state(arc_mru_ghost, + &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_evictable_data, + &as->arcstat_mru_ghost_evictable_metadata); + arc_kstat_update_state(arc_mfu, + &as->arcstat_mfu_size, + &as->arcstat_mfu_evictable_data, + &as->arcstat_mfu_evictable_metadata); + arc_kstat_update_state(arc_mfu_ghost, + &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_evictable_data, + &as->arcstat_mfu_ghost_evictable_metadata); + + as->arcstat_dnode_size.value.ui64 = + aggsum_value(&arc_sums.arcstat_dnode_size); + as->arcstat_bonus_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_bonus_size); + as->arcstat_l2_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_hits); + as->arcstat_l2_misses.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_misses); + as->arcstat_l2_prefetch_asize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); + as->arcstat_l2_mru_asize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_mru_asize); + as->arcstat_l2_mfu_asize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_mfu_asize); + as->arcstat_l2_bufc_data_asize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); + as->arcstat_l2_bufc_metadata_asize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); + as->arcstat_l2_feeds.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_feeds); + as->arcstat_l2_rw_clash.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rw_clash); + as->arcstat_l2_read_bytes.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_read_bytes); + as->arcstat_l2_write_bytes.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_write_bytes); + as->arcstat_l2_writes_sent.value.ui64 = + 
wmsum_value(&arc_sums.arcstat_l2_writes_sent); + as->arcstat_l2_writes_done.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_writes_done); + as->arcstat_l2_writes_error.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_writes_error); + as->arcstat_l2_writes_lock_retry.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); + as->arcstat_l2_evict_lock_retry.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); + as->arcstat_l2_evict_reading.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_evict_reading); + as->arcstat_l2_evict_l1cached.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); + as->arcstat_l2_free_on_write.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_free_on_write); + as->arcstat_l2_abort_lowmem.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); + as->arcstat_l2_cksum_bad.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_cksum_bad); + as->arcstat_l2_io_error.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_io_error); + as->arcstat_l2_lsize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_lsize); + as->arcstat_l2_psize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_psize); + as->arcstat_l2_hdr_size.value.ui64 = + aggsum_value(&arc_sums.arcstat_l2_hdr_size); + as->arcstat_l2_log_blk_writes.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); + as->arcstat_l2_log_blk_asize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); + as->arcstat_l2_log_blk_count.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_log_blk_count); + as->arcstat_l2_rebuild_success.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_success); + as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); + as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); + as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); + 
as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); + as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); + as->arcstat_l2_rebuild_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_size); + as->arcstat_l2_rebuild_asize.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); + as->arcstat_l2_rebuild_bufs.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); + as->arcstat_l2_rebuild_bufs_precached.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); + as->arcstat_l2_rebuild_log_blks.value.ui64 = + wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); + as->arcstat_memory_throttle_count.value.ui64 = + wmsum_value(&arc_sums.arcstat_memory_throttle_count); + as->arcstat_memory_direct_count.value.ui64 = + wmsum_value(&arc_sums.arcstat_memory_direct_count); + as->arcstat_memory_indirect_count.value.ui64 = + wmsum_value(&arc_sums.arcstat_memory_indirect_count); + + as->arcstat_memory_all_bytes.value.ui64 = + arc_all_memory(); + as->arcstat_memory_free_bytes.value.ui64 = + arc_free_memory(); + as->arcstat_memory_available_bytes.value.i64 = + arc_available_memory(); + + as->arcstat_prune.value.ui64 = + wmsum_value(&arc_sums.arcstat_prune); + as->arcstat_meta_used.value.ui64 = + aggsum_value(&arc_sums.arcstat_meta_used); + as->arcstat_async_upgrade_sync.value.ui64 = + wmsum_value(&arc_sums.arcstat_async_upgrade_sync); + as->arcstat_demand_hit_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); + as->arcstat_demand_hit_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); + as->arcstat_raw_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_raw_size); + as->arcstat_cached_only_in_progress.value.ui64 = + wmsum_value(&arc_sums.arcstat_cached_only_in_progress); + as->arcstat_abd_chunk_waste_size.value.ui64 = + 
wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } @@ -7513,7 +7448,7 @@ arc_kstat_update(kstat_t *ksp, int rw) * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ -unsigned int +static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; @@ -7535,36 +7470,40 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. + * would not be evenly distributed. In this context full 64bit + * division would be a waste of time, so limit it to 32 bits. */ - return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % + return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } +static unsigned int +arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) +{ + panic("Header %p insert into arc_l2c_only %p", obj, ml); +} + +#define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ + if ((do_warn) && (tuning) && ((tuning) != (value))) { \ + cmn_err(CE_WARN, \ + "ignoring tunable %s (using %llu instead)", \ + (#tuning), (u_longlong_t)(value)); \ + } \ +} while (0) + /* * Called during module initialization and periodically thereafter to - * apply reasonable changes to the exposed performance tunings. Non-zero - * zfs_* values which differ from the currently set values will be applied. + * apply reasonable changes to the exposed performance tunings. Can also be + * called explicitly by param_set_arc_*() functions when ARC tunables are + * updated manually. Non-zero zfs_* values which differ from the currently set + * values will be applied. 
*/ -static void -arc_tuning_update(void) +void +arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); unsigned long limit; - /* Valid range: 64M - */ - if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && - (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) && - (zfs_arc_max > arc_c_min)) { - arc_c_max = zfs_arc_max; - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - if (arc_meta_limit > arc_c_max) - arc_meta_limit = arc_c_max; - if (arc_dnode_limit > arc_meta_limit) - arc_dnode_limit = arc_meta_limit; - } - /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && @@ -7572,6 +7511,21 @@ arc_tuning_update(void) arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } + WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); + + /* Valid range: 64M - */ + if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && + (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && + (zfs_arc_max > arc_c_min)) { + arc_c_max = zfs_arc_max; + arc_c = MIN(arc_c, arc_c_max); + arc_p = (arc_c >> 1); + if (arc_meta_limit > arc_c_max) + arc_meta_limit = arc_c_max; + if (arc_dnode_size_limit > arc_meta_limit) + arc_dnode_size_limit = arc_meta_limit; + } + WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 16M - */ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && @@ -7580,9 +7534,10 @@ arc_tuning_update(void) arc_meta_min = zfs_arc_meta_min; if (arc_meta_limit < arc_meta_min) arc_meta_limit = arc_meta_min; - if (arc_dnode_limit < arc_meta_min) - arc_dnode_limit = arc_meta_min; + if (arc_dnode_size_limit < arc_meta_min) + arc_dnode_size_limit = arc_meta_min; } + WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); /* Valid range: - */ limit = zfs_arc_meta_limit ? 
zfs_arc_meta_limit : @@ -7591,14 +7546,17 @@ arc_tuning_update(void) (limit >= arc_meta_min) && (limit <= arc_c_max)) arc_meta_limit = limit; + WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); /* Valid range: - */ limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; - if ((limit != arc_dnode_limit) && + if ((limit != arc_dnode_size_limit) && (limit >= arc_meta_min) && (limit <= arc_meta_limit)) - arc_dnode_limit = limit; + arc_dnode_size_limit = limit; + WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, + verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) @@ -7628,63 +7586,62 @@ arc_tuning_update(void) if ((zfs_arc_lotsfree_percent >= 0) && (zfs_arc_lotsfree_percent <= 100)) arc_lotsfree_percent = zfs_arc_lotsfree_percent; + WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, + verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem); - + WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_init(void) { - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - - arc_mru->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_mru->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof 
(arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. Special index function asserts that. 
+ */ + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), + arc_state_l2c_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); + arc_state_l2c_multilist_index_func); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); @@ -7706,15 +7663,93 @@ arc_state_init(void) zfs_refcount_create(&arc_mfu_ghost->arcs_size); zfs_refcount_create(&arc_l2c_only->arcs_size); - aggsum_init(&arc_meta_used, 0); - aggsum_init(&arc_size, 0); - aggsum_init(&astat_data_size, 0); - aggsum_init(&astat_metadata_size, 0); - aggsum_init(&astat_hdr_size, 0); - aggsum_init(&astat_l2_hdr_size, 0); - aggsum_init(&astat_bonus_size, 0); - aggsum_init(&astat_dnode_size, 0); - aggsum_init(&astat_dbuf_size, 0); + wmsum_init(&arc_sums.arcstat_hits, 0); + wmsum_init(&arc_sums.arcstat_misses, 0); + wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); + wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); + wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); + wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); + wmsum_init(&arc_sums.arcstat_mru_hits, 0); + wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); + wmsum_init(&arc_sums.arcstat_mfu_hits, 0); + wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); + wmsum_init(&arc_sums.arcstat_deleted, 0); + wmsum_init(&arc_sums.arcstat_mutex_miss, 0); + wmsum_init(&arc_sums.arcstat_access_skip, 0); + 
wmsum_init(&arc_sums.arcstat_evict_skip, 0); + wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); + wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); + wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); + wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); + wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); + wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); + wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); + wmsum_init(&arc_sums.arcstat_hash_collisions, 0); + wmsum_init(&arc_sums.arcstat_hash_chains, 0); + aggsum_init(&arc_sums.arcstat_size, 0); + wmsum_init(&arc_sums.arcstat_compressed_size, 0); + wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); + wmsum_init(&arc_sums.arcstat_overhead_size, 0); + wmsum_init(&arc_sums.arcstat_hdr_size, 0); + wmsum_init(&arc_sums.arcstat_data_size, 0); + wmsum_init(&arc_sums.arcstat_metadata_size, 0); + wmsum_init(&arc_sums.arcstat_dbuf_size, 0); + aggsum_init(&arc_sums.arcstat_dnode_size, 0); + wmsum_init(&arc_sums.arcstat_bonus_size, 0); + wmsum_init(&arc_sums.arcstat_l2_hits, 0); + wmsum_init(&arc_sums.arcstat_l2_misses, 0); + wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); + wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); + wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); + wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); + wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); + wmsum_init(&arc_sums.arcstat_l2_feeds, 0); + wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); + wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); + wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); + wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); + wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); + wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); + wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); + wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); + wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); + wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); + wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); + 
wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); + wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); + wmsum_init(&arc_sums.arcstat_l2_io_error, 0); + wmsum_init(&arc_sums.arcstat_l2_lsize, 0); + wmsum_init(&arc_sums.arcstat_l2_psize, 0); + aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); + wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); + wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); + wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); + wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); + wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); + wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); + wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); + wmsum_init(&arc_sums.arcstat_prune, 0); + aggsum_init(&arc_sums.arcstat_meta_used, 0); + wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); + wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); + wmsum_init(&arc_sums.arcstat_raw_size, 0); + wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); + wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; @@ -7747,26 +7782,104 @@ arc_state_fini(void) zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); zfs_refcount_destroy(&arc_l2c_only->arcs_size); - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); - 
multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); - aggsum_fini(&arc_meta_used); - aggsum_fini(&arc_size); - aggsum_fini(&astat_data_size); - aggsum_fini(&astat_metadata_size); - aggsum_fini(&astat_hdr_size); - aggsum_fini(&astat_l2_hdr_size); - aggsum_fini(&astat_bonus_size); - aggsum_fini(&astat_dnode_size); - aggsum_fini(&astat_dbuf_size); + wmsum_fini(&arc_sums.arcstat_hits); + wmsum_fini(&arc_sums.arcstat_misses); + wmsum_fini(&arc_sums.arcstat_demand_data_hits); + wmsum_fini(&arc_sums.arcstat_demand_data_misses); + wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); + wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); + wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); + wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); + 
wmsum_fini(&arc_sums.arcstat_mru_hits); + wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); + wmsum_fini(&arc_sums.arcstat_mfu_hits); + wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); + wmsum_fini(&arc_sums.arcstat_deleted); + wmsum_fini(&arc_sums.arcstat_mutex_miss); + wmsum_fini(&arc_sums.arcstat_access_skip); + wmsum_fini(&arc_sums.arcstat_evict_skip); + wmsum_fini(&arc_sums.arcstat_evict_not_enough); + wmsum_fini(&arc_sums.arcstat_evict_l2_cached); + wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); + wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); + wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); + wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); + wmsum_fini(&arc_sums.arcstat_evict_l2_skip); + wmsum_fini(&arc_sums.arcstat_hash_collisions); + wmsum_fini(&arc_sums.arcstat_hash_chains); + aggsum_fini(&arc_sums.arcstat_size); + wmsum_fini(&arc_sums.arcstat_compressed_size); + wmsum_fini(&arc_sums.arcstat_uncompressed_size); + wmsum_fini(&arc_sums.arcstat_overhead_size); + wmsum_fini(&arc_sums.arcstat_hdr_size); + wmsum_fini(&arc_sums.arcstat_data_size); + wmsum_fini(&arc_sums.arcstat_metadata_size); + wmsum_fini(&arc_sums.arcstat_dbuf_size); + aggsum_fini(&arc_sums.arcstat_dnode_size); + wmsum_fini(&arc_sums.arcstat_bonus_size); + wmsum_fini(&arc_sums.arcstat_l2_hits); + wmsum_fini(&arc_sums.arcstat_l2_misses); + wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); + wmsum_fini(&arc_sums.arcstat_l2_mru_asize); + wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); + wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); + wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); + wmsum_fini(&arc_sums.arcstat_l2_feeds); + wmsum_fini(&arc_sums.arcstat_l2_rw_clash); + wmsum_fini(&arc_sums.arcstat_l2_read_bytes); + wmsum_fini(&arc_sums.arcstat_l2_write_bytes); + wmsum_fini(&arc_sums.arcstat_l2_writes_sent); + wmsum_fini(&arc_sums.arcstat_l2_writes_done); + wmsum_fini(&arc_sums.arcstat_l2_writes_error); + wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); + 
wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); + wmsum_fini(&arc_sums.arcstat_l2_evict_reading); + wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); + wmsum_fini(&arc_sums.arcstat_l2_free_on_write); + wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); + wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); + wmsum_fini(&arc_sums.arcstat_l2_io_error); + wmsum_fini(&arc_sums.arcstat_l2_lsize); + wmsum_fini(&arc_sums.arcstat_l2_psize); + aggsum_fini(&arc_sums.arcstat_l2_hdr_size); + wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); + wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); + wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); + wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); + wmsum_fini(&arc_sums.arcstat_memory_throttle_count); + wmsum_fini(&arc_sums.arcstat_memory_direct_count); + wmsum_fini(&arc_sums.arcstat_memory_indirect_count); + wmsum_fini(&arc_sums.arcstat_prune); + aggsum_fini(&arc_sums.arcstat_meta_used); + wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); + wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); + wmsum_fini(&arc_sums.arcstat_raw_size); + wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); + wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t @@ -7775,35 +7888,48 @@ arc_target_bytes(void) return (arc_c); } +void +arc_set_limits(uint64_t allmem) +{ + /* Set min cache to 1/32 of all memory, or 32MB, whichever is 
more. */ + arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); + + /* How to set default max varies by platform. */ + arc_c_max = arc_default_max(arc_c_min, allmem); +} void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); - mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), + offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; -#ifdef _KERNEL - /* - * Register a shrinker to support synchronous (direct) memory - * reclaim from the arc. This is done to prevent kswapd from - * swapping out pages when it is preferable to shrink the arc. - */ - spl_register_shrinker(&arc_shrinker); - - /* Set to 1/64 of all memory or a minimum of 512K */ - arc_sys_free = MAX(allmem / 64, (512 * 1024)); - arc_need_free = 0; +#if defined(_KERNEL) + arc_lowmem_init(); #endif - /* Set max to 1/2 of all memory */ - arc_c_max = allmem / 2; + arc_set_limits(allmem); -#ifdef _KERNEL - /* Set min cache to 1/32 of all memory, or 32MB, whichever is more */ - arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); +#ifdef _KERNEL + /* + * If zfs_arc_max is non-zero at init, meaning it was set in the kernel + * environment before the module was loaded, don't block setting the + * maximum because it is less than arc_c_min, instead, reset arc_c_min + * to a lower value. + * zfs_arc_min will be handled by arc_tuning_update(). 
+ */ + if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && + zfs_arc_max < allmem) { + arc_c_max = zfs_arc_max; + if (arc_c_min >= arc_c_max) { + arc_c_min = MAX(zfs_arc_max / 2, + 2ULL << SPA_MAXBLOCKSHIFT); + } + } #else /* * In userland, there's only the memory pressure that we artificially @@ -7814,13 +7940,11 @@ arc_init(void) arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif - arc_c = arc_c_max; + arc_c = arc_c_min; arc_p = (arc_c >> 1); /* Set min to 1/2 of arc_c_min */ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; - /* Initialize maximum observed usage to zero */ - arc_meta_max = 0; /* * Set arc_meta_limit to a percent of arc_c_max with a floor of * arc_meta_min, and a ceiling of arc_c_max. @@ -7828,10 +7952,10 @@ arc_init(void) percent = MIN(zfs_arc_meta_limit_percent, 100); arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); percent = MIN(zfs_arc_dnode_limit_percent, 100); - arc_dnode_limit = (percent * arc_meta_limit) / 100; + arc_dnode_size_limit = (percent * arc_meta_limit) / 100; /* Apply user specified tunings */ - arc_tuning_update(); + arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) @@ -7839,22 +7963,19 @@ arc_init(void) if (arc_c < arc_c_min) arc_c = arc_c_min; + arc_register_hotplug(); + arc_state_init(); - /* - * The arc must be "uninitialized", so that hdr_recl() (which is - * registered by buf_init()) will not access arc_reap_zthr before - * it is created. 
- */ - ASSERT(!arc_initialized); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri, - max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri, + boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7865,12 +7986,11 @@ arc_init(void) kstat_install(arc_ksp); } - arc_adjust_zthr = zthr_create(arc_adjust_cb_check, - arc_adjust_cb, NULL); - arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, - arc_reap_cb, NULL, SEC2NSEC(1)); + arc_evict_zthr = zthr_create("arc_evict", + arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri); + arc_reap_zthr = zthr_create_timer("arc_reap", + arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); - arc_initialized = B_TRUE; arc_warm = B_FALSE; /* @@ -7881,9 +8001,15 @@ arc_init(void) * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ +#ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); +#else + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * @@ -7891,6 +8017,18 @@ arc_init(void) zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } + + if (zfs_wrlog_data_max == 0) { + + /* + * dp_wrlog_total is reduced for each txg at the end of + * spa_sync(). However, dp_dirty_total is reduced every time + * a block is written out. 
Thus under normal operation, + * dp_wrlog_total could grow 2 times as big as + * zfs_dirty_data_max. + */ + zfs_wrlog_data_max = zfs_dirty_data_max * 2; + } } void @@ -7899,14 +8037,12 @@ arc_fini(void) arc_prune_t *p; #ifdef _KERNEL - spl_unregister_shrinker(&arc_shrinker); + arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); - arc_initialized = B_FALSE; - if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; @@ -7926,14 +8062,19 @@ arc_fini(void) list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); - (void) zthr_cancel(arc_adjust_zthr); - zthr_destroy(arc_adjust_zthr); + (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); - zthr_destroy(arc_reap_zthr); - mutex_destroy(&arc_adjust_lock); - cv_destroy(&arc_adjust_waiters_cv); + mutex_destroy(&arc_evict_lock); + list_destroy(&arc_evict_waiters); + + /* + * Free any buffers that were tagged for destruction. This needs + * to occur before arc_state_fini() runs and destroys the aggsum + * values which are updated when freeing scatter ABDs. + */ + l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may @@ -7943,6 +8084,16 @@ arc_fini(void) buf_fini(); arc_state_fini(); + arc_unregister_hotplug(); + + /* + * We destroy the zthrs after all the ARC state has been + * torn down to avoid the case of them receiving any + * wakeup() signals after they are destroyed. + */ + zthr_destroy(arc_evict_zthr); + zthr_destroy(arc_reap_zthr); + ASSERT0(arc_loaned_bytes); } @@ -8089,6 +8240,103 @@ arc_fini(void) * * These three functions determine what to write, how much, and how quickly * to send writes. + * + * L2ARC persistence: + * + * When writing buffers to L2ARC, we periodically add some metadata to + * make sure we can pick them up after reboot, thus dramatically reducing + * the impact that any downtime has on the performance of storage systems + * with large caches. 
+ * + * The implementation works fairly simply by integrating the following two + * modifications: + * + * *) When writing to the L2ARC, we occasionally write a "l2arc log block", + * which is an additional piece of metadata which describes what's been + * written. This allows us to rebuild the arc_buf_hdr_t structures of the + * main ARC buffers. There are 2 linked-lists of log blocks headed by + * dh_start_lbps[2]. We alternate which chain we append to, so they are + * time-wise and offset-wise interleaved, but that is an optimization rather + * than for correctness. The log block also includes a pointer to the + * previous block in its chain. + * + * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device + * for our header bookkeeping purposes. This contains a device header, + * which contains our top-level reference structures. We update it each + * time we write a new log block, so that we're able to locate it in the + * L2ARC device. If this write results in an inconsistent device header + * (e.g. due to power failure), we detect this by verifying the header's + * checksum and simply fail to reconstruct the L2ARC after reboot. + * + * Implementation diagram: + * + * +=== L2ARC device (not to scale) ======================================+ + * | ___two newest log block pointers__.__________ | + * | / \dh_start_lbps[1] | + * | / \ \dh_start_lbps[0]| + * |.___/__. V V | + * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| + * || hdr| ^ /^ /^ / / | + * |+------+ ...--\-------/ \-----/--\------/ / | + * | \--------------/ \--------------/ | + * +======================================================================+ + * + * As can be seen on the diagram, rather than using a simple linked list, + * we use a pair of linked lists with alternating elements. 
This is a + * performance enhancement due to the fact that we only find out the + * address of the next log block access once the current block has been + * completely read in. Obviously, this hurts performance, because we'd be + * keeping the device's I/O queue at only a 1 operation deep, thus + * incurring a large amount of I/O round-trip latency. Having two lists + * allows us to fetch two log blocks ahead of where we are currently + * rebuilding L2ARC buffers. + * + * On-device data structures: + * + * L2ARC device header: l2arc_dev_hdr_phys_t + * L2ARC log block: l2arc_log_blk_phys_t + * + * L2ARC reconstruction: + * + * When writing data, we simply write in the standard rotary fashion, + * evicting buffers as we go and simply writing new data over them (writing + * a new log block every now and then). This obviously means that once we + * loop around the end of the device, we will start cutting into an already + * committed log block (and its referenced data buffers), like so: + * + * current write head__ __old tail + * \ / + * V V + * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> + * ^ ^^^^^^^^^___________________________________ + * | \ + * <> may overwrite this blk and/or its bufs --' + * + * When importing the pool, we detect this situation and use it to stop + * our scanning process (see l2arc_rebuild). + * + * There is one significant caveat to consider when rebuilding ARC contents + * from an L2ARC device: what about invalidated buffers? Given the above + * construction, we cannot update blocks which we've already written to amend + * them to remove buffers which were invalidated. Thus, during reconstruction, + * we might be populating the cache with buffers for data that's not on the + * main pool anymore, or may have been overwritten! + * + * As it turns out, this isn't a problem. Every arc_read request includes + * both the DVA and, crucially, the birth TXG of the BP the caller is + * looking for. 
So even if the cache were populated by completely rotten + * blocks for data that had been long deleted and/or overwritten, we'll + * never actually return bad data from the cache, since the DVA with the + * birth TXG uniquely identify a block in space and time - once created, + * a block is immutable on disk. The worst thing we have done is wasted + * some time and memory at l2arc rebuild to reconstruct outdated ARC + * entries that will get dropped from the l2arc as it is being updated + * with new blocks. + * + * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write + * hand are not restored. This is done by saving the offset (in bytes) + * l2arc_evict() has evicted to in the L2ARC device header and taking it + * into account when restoring buffers. */ static boolean_t @@ -8109,9 +8357,9 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) } static uint64_t -l2arc_write_size(void) +l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size; + uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user @@ -8128,6 +8376,30 @@ l2arc_write_size(void) if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. 
+ */ + dev_size = dev->l2ad_end - dev->l2ad_start; + tsize = size + l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + tsize += MAX(64 * 1024 * 1024, + (tsize * l2arc_trim_ahead) / 100); + + if (tsize >= dev_size) { + cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " + "plus the overhead of log blocks (persistent L2ARC, " + "%llu bytes) exceeds the size of the cache device " + "(guid %llu), resetting them to the default (%d)", + (u_longlong_t)l2arc_log_blk_overhead(size, dev), + (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + + if (arc_warm == B_FALSE) + size += l2arc_write_boost; + } + return (size); } @@ -8193,10 +8465,12 @@ l2arc_dev_get_next(void) else if (next == first) break; - } while (vdev_is_dead(next->l2ad_vdev)); + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev)) + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; @@ -8245,16 +8519,20 @@ l2arc_do_free_on_write(void) static void l2arc_write_done(zio_t *zio) { - l2arc_write_callback_t *cb; - l2arc_dev_t *dev; - list_t *buflist; - arc_buf_hdr_t *head, *hdr, *hdr_prev; - kmutex_t *hash_lock; - int64_t bytes_dropped = 0; + l2arc_write_callback_t *cb; + l2arc_lb_abd_buf_t *abd_buf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + l2arc_dev_t *dev; + l2arc_dev_hdr_phys_t *l2dhdr; + list_t *buflist; + arc_buf_hdr_t *head, *hdr, *hdr_prev; + kmutex_t *hash_lock; + int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; + l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); @@ -8263,9 +8541,6 @@ l2arc_write_done(zio_t *zio) DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); - if 
(zio->io_error != 0) - ARCSTAT_BUMP(arcstat_l2_writes_error); - /* * All writes completed, or an error was hit. */ @@ -8329,8 +8604,7 @@ top: arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); + l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); @@ -8347,12 +8621,74 @@ top: mutex_exit(hash_lock); } - atomic_inc_64(&l2arc_writes_done); + /* + * Free the allocated abd buffers for writing the log blocks. + * If the zio failed reclaim the allocated space and remove the + * pointers to these log blocks from the log block pointer list + * of the L2ARC device. + */ + while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { + abd_free(abd_buf->abd); + zio_buf_free(abd_buf, sizeof (*abd_buf)); + if (zio->io_error != 0) { + lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); + /* + * L2BLK_GET_PSIZE returns aligned size for log + * blocks. + */ + uint64_t asize = + L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + bytes_dropped += asize; + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + list_destroy(&cb->l2wcb_abd_list); + + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_writes_error); + + /* + * Restore the lbps array in the header to its previous state. + * If the list of log block pointers is empty, zero out the + * log block pointers in the device header. + */ + lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); + for (int i = 0; i < 2; i++) { + if (lb_ptr_buf == NULL) { + /* + * If the list is empty zero out the device + * header. 
Otherwise zero out the second log + * block pointer in the header. + */ + if (i == 0) { + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + } else { + bzero(&l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + } + break; + } + bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, + lb_ptr_buf); + } + } + + ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); + ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); @@ -8387,7 +8723,8 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { - abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8423,12 +8760,13 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); + HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); @@ -8527,6 +8865,7 @@ l2arc_read_done(zio_t *zio) (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_prop.zp_complevel = hdr->b_complevel; 
valid_cksum = arc_cksum_is_equal(hdr, zio); @@ -8544,7 +8883,6 @@ l2arc_read_done(zio_t *zio) zio->io_private = hdr; arc_read_done(zio); } else { - mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. @@ -8569,10 +8907,24 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb)); + &cb->l2rcb_zb); + + /* + * Original ZIO will be freed, so we need to update + * ARC header with the new ZIO pointer to be used + * by zio_change_priority() in arc_read(). + */ + for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; + acb != NULL; acb = acb->acb_next) + acb->acb_zio_head = zio; + + mutex_exit(hash_lock); + zio_nowait(zio); + } else { + mutex_exit(hash_lock); } } @@ -8599,16 +8951,16 @@ l2arc_sublist_lock(int list_num) switch (list_num) { case 0: - ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; + ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: - ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; + ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: - ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; + ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: - ml = arc_mru->arcs_list[ARC_BUFC_DATA]; + ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); @@ -8624,9 +8976,32 @@ l2arc_sublist_lock(int list_num) return (multilist_sublist_lock(ml, idx)); } +/* + * Calculates the maximum overhead of L2ARC metadata log blocks for a given + * L2ARC write size. l2arc_evict and l2arc_write_size need to include this + * overhead in processing to make sure there is enough headroom available + * when writing buffers. 
+ */ +static inline uint64_t +l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) +{ + if (dev->l2ad_log_entries == 0) { + return (0); + } else { + uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; + + uint64_t log_blocks = (log_entries + + dev->l2ad_log_entries - 1) / + dev->l2ad_log_entries; + + return (vdev_psize_to_asize(dev->l2ad_vdev, + sizeof (l2arc_log_blk_phys_t)) * log_blocks); + } +} + /* * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. + * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ @@ -8637,22 +9012,37 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; + l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; + vdev_t *vd = dev->l2ad_vdev; + boolean_t rerun; buflist = &dev->l2ad_buflist; - if (!all && dev->l2ad_first) { + /* + * We need to add in the worst case scenario of log block overhead. + */ + distance += l2arc_log_blk_overhead(distance, dev); + if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { /* - * This is the first sweep through the device. There is - * nothing to evict. + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the write size, whichever is greater. */ - return; + distance += MAX(64 * 1024 * 1024, + (distance * l2arc_trim_ahead) / 100); } - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { +top: + rerun = B_FALSE; + if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. + * When there is no space to accommodate upcoming writes, + * evict to the end. Then bump the write and evict hands + * to the start and iterate. 
This iteration does not + * happen indefinitely as we make sure in + * l2arc_write_size() that when the write hand is reset, + * the write size does not exceed the end of the device. */ + rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; @@ -8660,8 +9050,90 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); -top: + if (!all) { + /* + * This check has to be placed after deciding whether to + * iterate (rerun). + */ + if (dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. We have already trimmmed the + * whole device. + */ + goto out; + } else { + /* + * Trim the space to be evicted. + */ + if (vd->vdev_has_trim && dev->l2ad_evict < taddr && + l2arc_trim_ahead > 0) { + /* + * We have to drop the spa_config lock because + * vdev_trim_range() will acquire it. + * l2ad_evict already accounts for the label + * size. To prevent vdev_trim_ranges() from + * adding it again, we subtract it from + * l2ad_evict. + */ + spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); + vdev_trim_simple(vd, + dev->l2ad_evict - VDEV_LABEL_START_SIZE, + taddr - dev->l2ad_evict); + spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, + RW_READER); + } + + /* + * When rebuilding L2ARC we retrieve the evict hand + * from the header of the device. Of note, l2arc_evict() + * does not actually delete buffers from the cache + * device, but trimming may do so depending on the + * hardware implementation. Thus keeping track of the + * evict hand is useful. + */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + } + } + +retry: mutex_enter(&dev->l2ad_mtx); + /* + * We have to account for evicted log blocks. Run vdev_space_update() + * on log blocks whose offset (in bytes) is before the evicted offset + * (in bytes) by searching in the list of pointers to log blocks + * present in the L2ARC device. 
+ */ + for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; + lb_ptr_buf = lb_ptr_buf_prev) { + + lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop); + + /* + * We don't worry about log blocks left behind (ie + * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() + * will never write more than l2arc_evict() evicts. + */ + if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { + break; + } else { + vdev_space_update(vd, -asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); @@ -8681,7 +9153,7 @@ top: mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); - goto top; + goto retry; } /* @@ -8693,7 +9165,7 @@ top: ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || + if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, @@ -8730,6 +9202,33 @@ top: mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); + +out: + /* + * We need to check if we evict all buffers, otherwise we may iterate + * unnecessarily. + */ + if (!all && rerun) { + /* + * Bump device hand to the device start if it is approaching the + * end. l2arc_evict() has already evicted ahead for this case. 
+ */ + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + goto top; + } + + if (!all) { + /* + * In case of cache device removal (all) the following + * assertions may be violated without functional consequences + * as the device is about to be removed. + */ + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + } } /* @@ -8760,7 +9259,7 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, /* * If this data simply needs its own buffer, we simply allocate it - * and copy the data. This may be done to elimiate a depedency on a + * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { @@ -8786,7 +9285,18 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, cabd = abd_alloc_for_io(asize, ismd); tmp = abd_borrow_buf(cabd, asize); - psize = zio_compress_data(compress, to_write, tmp, size); + psize = zio_compress_data(compress, to_write, tmp, size, + hdr->b_complevel); + + if (psize >= size) { + abd_return_buf(cabd, tmp, asize); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + to_write = cabd; + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + if (size != asize) + abd_zero_off(to_write, size, asize - size); + goto encrypt; + } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) bzero((char *)tmp + psize, asize - psize); @@ -8795,6 +9305,7 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, to_write = cabd; } +encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); @@ -8849,6 +9360,17 @@ error: return (ret); } +static void +l2arc_blk_fetch_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + + cb = zio->io_private; + if (cb->l2rcb_abd != NULL) + abd_free(cb->l2rcb_abd); + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + /* * Find and write ARC 
buffers to the L2ARC device. * @@ -8858,17 +9380,19 @@ error: * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). + * the delta by which the device hand has changed due to alignment and the + * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); + arc_buf_hdr_t *hdr, *hdr_prev, *head; + uint64_t write_asize, write_psize, write_lsize, headroom; + boolean_t full; + l2arc_write_callback_t *cb = NULL; + zio_t *pio, *wzio; + uint64_t guid = spa_load_guid(spa); + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); @@ -8881,8 +9405,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) /* * Copy buffers for L2ARC writing. */ - for (int try = 0; try < L2ARC_FEED_TYPES; try++) { - multilist_sublist_t *mls = l2arc_sublist_lock(try); + for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { + /* + * If pass == 1 or 3, we cache MRU metadata and data + * respectively. + */ + if (l2arc_mfuonly) { + if (pass == 1 || pass == 3) + continue; + } + + multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); @@ -8920,7 +9453,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { + if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ @@ -8972,7 +9505,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on - * disk. 
Otherwise, the L2ARC can normally use the + * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the @@ -9020,6 +9553,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). + */ + list_create(&cb->l2wcb_abd_list, + sizeof (l2arc_lb_abd_buf_t), + offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } @@ -9028,6 +9568,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); @@ -9050,11 +9592,20 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; + l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); - (void) zio_nowait(wzio); + /* + * Append buf info to current log and commit if full. + * arcstat_l2_{size,asize} kstats are updated + * internally. + */ + if (l2arc_log_blk_insert(dev, hdr)) + l2arc_log_blk_commit(dev, pio, cb); + + zio_nowait(wzio); } multilist_sublist_unlock(mls); @@ -9068,31 +9619,47 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); + + /* + * Although we did not write any buffers l2ad_evict may + * have advanced. 
+ */ + if (dev->l2ad_evict != l2dhdr->dh_evict) + l2arc_dev_hdr_update(dev); + return (0); } + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); - ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); - ARCSTAT_INCR(arcstat_l2_psize, write_psize); - - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; + /* + * Update the device header after the zio completes as + * l2arc_write_done() may have updated the memory holding the log block + * pointers in the device header. + */ + l2arc_dev_hdr_update(dev); + return (write_asize); } +static boolean_t +l2arc_hdr_limit_reached(void) +{ + int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); + + return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); +} + /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. @@ -9115,7 +9682,7 @@ l2arc_feed_thread(void *unused) cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig(&l2arc_feed_thr_cv, + (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; @@ -9160,7 +9727,7 @@ l2arc_feed_thread(void *unused) /* * Avoid contributing to memory pressure. 
*/ - if (arc_reclaim_needed()) { + if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; @@ -9168,7 +9735,7 @@ l2arc_feed_thread(void *unused) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(); + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -9197,7 +9764,17 @@ l2arc_feed_thread(void *unused) boolean_t l2arc_vdev_present(vdev_t *vd) { - l2arc_dev_t *dev; + return (l2arc_vdev_get(vd) != NULL); +} + +/* + * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if + * the vdev_t isn't an L2ARC device. + */ +l2arc_dev_t * +l2arc_vdev_get(vdev_t *vd) +{ + l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; @@ -9207,7 +9784,81 @@ l2arc_vdev_present(vdev_t *vd) } mutex_exit(&l2arc_dev_mtx); - return (dev != NULL); + return (dev); +} + +static void +l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) +{ + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + spa_t *spa = dev->l2ad_spa; + + /* + * The L2ARC has to hold at least the payload of one log block for + * them to be restored (persistent L2ARC). The payload of a log block + * depends on the amount of its log entries. We always write log blocks + * with 1022 entries. How many of them are committed or restored depends + * on the size of the L2ARC device. Thus the maximum payload of + * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device + * is less than that, we reduce the amount of committed and restored + * log entries per block so as to enable persistence. + */ + if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { + dev->l2ad_log_entries = 0; + } else { + dev->l2ad_log_entries = MIN((dev->l2ad_end - + dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, + L2ARC_LOG_BLK_MAX_ENTRIES); + } + + /* + * Read the device header, if an error is returned do not rebuild L2ARC. 
+ */ + if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { + /* + * If we are onlining a cache device (vdev_reopen) that was + * still present (l2arc_vdev_present()) and rebuild is enabled, + * we should evict all ARC buffers and pointers to log blocks + * and reclaim their space before restoring its contents to + * L2ARC. + */ + if (reopen) { + if (!l2arc_rebuild_enabled) { + return; + } else { + l2arc_evict(dev, 0, B_TRUE); + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; + } + } + /* + * Just mark the device as pending for a rebuild. We won't + * be starting a rebuild in line here as it would block pool + * import. Instead spa_load_impl will hand that off to an + * async task which will call l2arc_spa_rebuild_start. + */ + dev->l2ad_rebuild = B_TRUE; + } else if (spa_writeable(spa)) { + /* + * In this case TRIM the whole device if l2arc_trim_ahead > 0, + * otherwise create a new header. We zero out the memory holding + * the header to reset dh_start_lbps. If we TRIM the whole + * device the new header will be written by + * vdev_trim_l2arc_thread() at the end of the TRIM to update the + * trim_state in the header too. When reading the header, if + * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 + * we opt to TRIM the whole device again. + */ + if (l2arc_trim_ahead > 0) { + dev->l2ad_trim_all = B_TRUE; + } else { + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } + } } /* @@ -9217,22 +9868,30 @@ l2arc_vdev_present(vdev_t *vd) void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { - l2arc_dev_t *adddev; + l2arc_dev_t *adddev; + uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. 
*/ - adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; + /* leave extra size for an l2arc device header */ + l2dhdr_asize = adddev->l2ad_dev_hdr_asize = + MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); + adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); + adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* @@ -9242,8 +9901,26 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); + /* + * This is a list of pointers to log blocks that are still present + * on the device. + */ + list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), + offsetof(l2arc_lb_ptr_buf_t, node)); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); + + /* + * Decide if dev is eligible for L2ARC rebuild or whole device + * trimming. This has to happen before the device is added in the + * cache device list and l2arc_dev_mtx is released. Otherwise + * l2arc_feed_thread() might already start writing on the + * device. 
+ */ + l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list @@ -9254,30 +9931,65 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) mutex_exit(&l2arc_dev_mtx); } +/* + * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() + * in case of onlining a cache device. + */ +void +l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) +{ + l2arc_dev_t *dev = NULL; + + dev = l2arc_vdev_get(vd); + ASSERT3P(dev, !=, NULL); + + /* + * In contrast to l2arc_add_vdev() we do not have to worry about + * l2arc_feed_thread() invalidating previous content when onlining a + * cache device. The device parameters (l2ad*) are not cleared when + * offlining the device and writing new buffers will not invalidate + * all previous content. In worst case only buffers that have not had + * their log block written to the device will be lost. + * When onlining the cache device (ie offline->online without exporting + * the pool in between) this happens: + * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() + * | | + * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE + * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild + * is set to B_TRUE we might write additional buffers to the device. + */ + l2arc_rebuild_dev(dev, reopen); +} + /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { - l2arc_dev_t *dev, *nextdev, *remdev = NULL; + l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } + remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); + /* + * Cancel any ongoing or scheduled rebuild. 
+ */ + mutex_enter(&l2arc_rebuild_thr_lock); + if (remdev->l2ad_rebuild_began == B_TRUE) { + remdev->l2ad_rebuild_cancel = B_TRUE; + while (remdev->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); + /* * Remove device from global list */ + mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); @@ -9288,9 +10000,14 @@ l2arc_remove_vdev(vdev_t *vd) */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); + ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); + list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); - kmem_free(remdev, sizeof (l2arc_dev_t)); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); + kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); + vmem_free(remdev, sizeof (l2arc_dev_t)); } void @@ -9298,11 +10015,11 @@ l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; - l2arc_writes_sent = 0; - l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -9317,16 +10034,10 @@ l2arc_init(void) void l2arc_fini(void) { - /* - * This is called from dmu_fini(), which is called from spa_fini(); - * Because of this, we can assume that all l2arc devices have - * already been removed when the pools themselves were removed. 
- */ - - l2arc_do_free_on_write(); - mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_rebuild_thr_lock); + cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); @@ -9337,7 +10048,7 @@ l2arc_fini(void) void l2arc_start(void) { - if (!(spa_mode_global & FWRITE)) + if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, @@ -9347,7 +10058,7 @@ l2arc_start(void) void l2arc_stop(void) { - if (!(spa_mode_global & FWRITE)) + if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); @@ -9358,7 +10069,930 @@ l2arc_stop(void) mutex_exit(&l2arc_feed_thr_lock); } -#if defined(_KERNEL) +/* + * Punches out rebuild threads for the L2ARC devices in a spa. This should + * be called after pool import from the spa async thread, since starting + * these threads directly from spa_import() will make them part of the + * "zpool import" context and delay process exit (and thus pool import). + */ +void +l2arc_spa_rebuild_start(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off rebuild threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* Don't attempt a rebuild if the vdev is UNAVAIL */ + continue; + } + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild_began = B_TRUE; + (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, + dev, 0, &p0, TS_RUN, minclsyspri); + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + +/* + * Main entry point for L2ARC rebuilding. 
+ */ +static void +l2arc_dev_rebuild_thread(void *arg) +{ + l2arc_dev_t *dev = arg; + + VERIFY(!dev->l2ad_rebuild_cancel); + VERIFY(dev->l2ad_rebuild); + (void) l2arc_rebuild(dev); + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_began = B_FALSE; + dev->l2ad_rebuild = B_FALSE; + mutex_exit(&l2arc_rebuild_thr_lock); + + thread_exit(); +} + +/* + * This function implements the actual L2ARC metadata rebuild. It: + * starts reading the log block chain and restores each block's contents + * to memory (reconstructing arc_buf_hdr_t's). + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the log block chain. + * 2) We encounter *any* error condition (cksum errors, io errors) + */ +static int +l2arc_rebuild(l2arc_dev_t *dev) +{ + vdev_t *vd = dev->l2ad_vdev; + spa_t *spa = vd->vdev_spa; + int err = 0; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + l2arc_log_blk_phys_t *this_lb, *next_lb; + zio_t *this_io = NULL, *next_io = NULL; + l2arc_log_blkptr_t lbps[2]; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + boolean_t lock_held; + + this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); + next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); + + /* + * We prevent device removal while issuing reads to the device, + * then during the rebuilding phases we drop this lock again so + * that a spa_unload or device remove can be initiated - this is + * safe, because the spa will signal us to stop before removing + * our device and wait for us to stop. + */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + lock_held = B_TRUE; + + /* + * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. 
+ */ + dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); + dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), + dev->l2ad_start); + dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; + vd->vdev_trim_state = l2dhdr->dh_trim_state; + + /* + * In case the zfs module parameter l2arc_rebuild_enabled is false + * we do not start the rebuild process. + */ + if (!l2arc_rebuild_enabled) + goto out; + + /* Prepare the rebuild process */ + bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + + /* Start the rebuild process */ + for (;;) { + if (!l2arc_log_blkptr_valid(dev, &lbps[0])) + break; + + if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], + this_lb, next_lb, this_io, &next_io)) != 0) + goto out; + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to rebuild the L2ARC. At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata log blocks, so the user may choose to offline/ + * online the L2ARC dev at a later time (or re-import the pool) + * to reconstruct it (when there's less memory pressure). + */ + if (l2arc_hdr_limit_reached()) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); + cmn_err(CE_NOTE, "System running low on memory, " + "aborting L2ARC rebuild."); + err = SET_ERROR(ENOMEM); + goto out; + } + + spa_config_exit(spa, SCL_L2ARC, vd); + lock_held = B_FALSE; + + /* + * Now that we know that the next_lb checks out alright, we + * can start reconstruction from this log block. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + l2arc_log_blk_restore(dev, this_lb, asize); + + /* + * log block restored, include its pointer in the list of + * pointers to log blocks present in the L2ARC device. 
+ */ + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), + KM_SLEEP); + bcopy(&lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(vd, asize, 0, 0); + + /* + * Protection against loops of log blocks: + * + * l2ad_hand l2ad_evict + * V V + * l2ad_start |=======================================| l2ad_end + * -----|||----|||---|||----||| + * (3) (2) (1) (0) + * ---|||---|||----|||---||| + * (7) (6) (5) (4) + * + * In this situation the pointer of log block (4) passes + * l2arc_log_blkptr_valid() but the log block should not be + * restored as it is overwritten by the payload of log block + * (0). Only log blocks (0)-(3) should be restored. We check + * whether l2ad_evict lies in between the payload starting + * offset of the next log block (lbps[1].lbp_payload_start) + * and the payload starting offset of the present log block + * (lbps[0].lbp_payload_start). If true and this isn't the + * first pass, we are looping from the beginning and we should + * stop. 
+ */ + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev->l2ad_evict) && + !dev->l2ad_first) + goto out; + + cond_resched(); + for (;;) { + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild = B_FALSE; + cv_signal(&l2arc_rebuild_thr_cv); + mutex_exit(&l2arc_rebuild_thr_lock); + err = SET_ERROR(ECANCELED); + goto out; + } + mutex_exit(&l2arc_rebuild_thr_lock); + if (spa_config_tryenter(spa, SCL_L2ARC, vd, + RW_READER)) { + lock_held = B_TRUE; + break; + } + /* + * L2ARC config lock held by somebody as writer, + * possibly due to them trying to remove us. They'll + * likely want us to shut down, so after a little + * delay, we check l2ad_rebuild_cancel and retry + * the lock again. + */ + delay(1); + } + + /* + * Continue with the next log block. + */ + lbps[0] = lbps[1]; + lbps[1] = this_lb->lb_prev_lbp; + PTR_SWAP(this_lb, next_lb); + this_io = next_io; + next_io = NULL; + } + + if (this_io != NULL) + l2arc_log_blk_fetch_abort(this_io); +out: + if (next_io != NULL) + l2arc_log_blk_fetch_abort(next_io); + vmem_free(this_lb, sizeof (*this_lb)); + vmem_free(next_lb, sizeof (*next_lb)); + + if (!l2arc_rebuild_enabled) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_success); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. 
+ */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + } else if (err == ECANCELED) { + /* + * In case the rebuild was canceled do not log to spa history + * log as the pool may be in the process of being removed. + */ + zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err != 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } + + if (lock_held) + spa_config_exit(spa, SCL_L2ARC, vd); + + return (err); +} + +/* + * Attempts to read the device header on the provided L2ARC device and stores + * it in dev->l2ad_dev_hdr. On success, this function returns 0, otherwise the + * appropriate error code is returned. + */ +static int +l2arc_dev_hdr_read(l2arc_dev_t *dev) +{ + int err; + uint64_t guid; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_SPECULATIVE, B_FALSE)); + + abd_free(abd); + + if (err != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " + "vdev guid: %llu", err, + (u_longlong_t)dev->l2ad_vdev->vdev_guid); + return (err); + } + + if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); + + if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || + l2dhdr->dh_spa_guid != guid || + l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || + l2dhdr->dh_version != 
L2ARC_PERSISTENT_VERSION || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || + l2dhdr->dh_end != dev->l2ad_end || + !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, + l2dhdr->dh_evict) || + (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && + l2arc_trim_ahead > 0)) { + /* + * Attempt to rebuild a device containing no actual dev hdr + * or containing a header from some other pool or from another + * version of persistent L2ARC. + */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +/* + * Reads L2ARC log blocks from storage and validates their contents. + * + * This function implements a simple fetcher to make sure that while + * we're processing one buffer the L2ARC is already fetching the next + * one in the chain. + * + * The arguments this_lbp and next_lbp point to the current and next log block + * address in the block chain. Similarly, this_lb and next_lb hold the + * l2arc_log_blk_phys_t's of the current and next L2ARC blk. + * + * The `this_io' and `next_io' arguments are used for block fetching. + * When issuing the first blk IO during rebuild, you should pass NULL for + * `this_io'. This function will then issue a sync IO to read the block and + * also issue an async IO to fetch the next block in the block chain. The + * fetched IO is returned in `next_io'. On subsequent calls to this + * function, pass the value returned in `next_io' from the previous call + * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. + * Prior to the call, you should initialize your `next_io' pointer to be + * NULL. If no fetch IO was issued, the pointer is left set at NULL. + * + * On success, this function returns 0, otherwise it returns an appropriate + * error code. On error the fetching IO is aborted and cleared before + * returning from this function. Therefore, if we return an error, the + * caller can assume that we have taken care of cleanup of fetch IOs. 
+ */ +static int +l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io) +{ + int err = 0; + zio_cksum_t cksum; + abd_t *abd = NULL; + uint64_t asize; + + ASSERT(this_lbp != NULL && next_lbp != NULL); + ASSERT(this_lb != NULL && next_lb != NULL); + ASSERT(next_io != NULL && *next_io == NULL); + ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); + + /* + * Check to see if we have issued the IO for this log block in a + * previous run. If not, this is the first call, so issue it now. + */ + if (this_io == NULL) { + this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, + this_lb); + } + + /* + * Peek to see if we can start issuing the next IO immediately. + */ + if (l2arc_log_blkptr_valid(dev, next_lbp)) { + /* + * Start issuing IO for the next log block early - this + * should help keep the L2ARC device busy while we + * decompress and restore this log block. + */ + *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, + next_lb); + } + + /* Wait for the IO to read this log block to complete */ + if ((err = zio_wait(this_io)) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " + "offset: %llu, vdev guid: %llu", err, + (u_longlong_t)this_lbp->lbp_daddr, + (u_longlong_t)dev->l2ad_vdev->vdev_guid); + goto cleanup; + } + + /* + * Make sure the buffer checks out. + * L2BLK_GET_PSIZE returns aligned size for log blocks. 
+ */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); + zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " + "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", + (u_longlong_t)this_lbp->lbp_daddr, + (u_longlong_t)dev->l2ad_vdev->vdev_guid, + (u_longlong_t)dev->l2ad_hand, + (u_longlong_t)dev->l2ad_evict); + err = SET_ERROR(ECKSUM); + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + case ZIO_COMPRESS_LZ4: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); + if ((err = zio_decompress_data( + L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), + abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + break; + default: + err = SET_ERROR(EINVAL); + goto cleanup; + } + if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(this_lb, sizeof (*this_lb)); + if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { + err = SET_ERROR(EINVAL); + goto cleanup; + } +cleanup: + /* Abort an in-flight fetch I/O in case of error */ + if (err != 0 && *next_io != NULL) { + l2arc_log_blk_fetch_abort(*next_io); + *next_io = NULL; + } + if (abd != NULL) + abd_free(abd); + return (err); +} + +/* + * Restores the payload of a log block to ARC. This creates empty ARC hdr + * entries which only contain an l2arc hdr, essentially restoring the + * buffers to their L2ARC evicted state. This function also updates space + * usage on the L2ARC vdev to make sure it tracks restored buffers. 
+ */ +static void +l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, + uint64_t lb_asize) +{ + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; + + /* + * Usually arc_adapt() is called only for data, not headers, but + * since we may allocate significant amount of memory here, let ARC + * grow its arc_c. + */ + arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + + for (int i = log_entries - 1; i >= 0; i--) { + /* + * Restore goes in the reverse temporal direction to preserve + * correct temporal ordering of buffers in the l2ad_buflist. + * l2arc_hdr_restore also does a list_insert_tail instead of + * list_insert_head on the l2ad_buflist: + * + * LIST l2ad_buflist LIST + * HEAD <------ (time) ------ TAIL + * direction +-----+-----+-----+-----+-----+ direction + * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild + * fill +-----+-----+-----+-----+-----+ + * ^ ^ + * | | + * | | + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here + * + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. + */ + size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); + l2arc_hdr_restore(&lb->lb_entries[i], dev); + } + + /* + * Record rebuild stats: + * size Logical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC + */ + ARCSTAT_INCR(arcstat_l2_rebuild_size, size); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); +} + +/* + * Restores a single ARC buf hdr from a log entry. The ARC buffer is put + * into a state indicating that it has been evicted to L2ARC. 
+ */ +static void +l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) +{ + arc_buf_hdr_t *hdr, *exists; + kmutex_t *hash_lock; + arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); + uint64_t asize; + + /* + * Do all the allocation before grabbing any locks, this lets us + * sleep if memory is full and we don't have to deal with failed + * allocations. + */ + hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, + dev, le->le_dva, le->le_daddr, + L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, + L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, + L2BLK_GET_PROTECTED((le)->le_prop), + L2BLK_GET_PREFETCH((le)->le_prop), + L2BLK_GET_STATE((le)->le_prop)); + asize = vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((le)->le_prop)); + + /* + * vdev_space_update() has to be called before arc_hdr_destroy() to + * avoid underflow since the latter also calls vdev_space_update(). + */ + l2arc_hdr_arcstats_increment(hdr); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, hdr); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + mutex_exit(&dev->l2ad_mtx); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* Buffer was already cached, no need to restore it. */ + arc_hdr_destroy(hdr); + /* + * If the buffer is already cached, check whether it has + * L2ARC metadata. If not, enter them and update the flag. + * This is important in case of onlining a cache device, since + * we previously evicted all L2ARC metadata from ARC. 
+ */ + if (!HDR_HAS_L2HDR(exists)) { + arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); + exists->b_l2hdr.b_dev = dev; + exists->b_l2hdr.b_daddr = le->le_daddr; + exists->b_l2hdr.b_arcs_state = + L2BLK_GET_STATE((le)->le_prop); + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, exists); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(exists), exists); + mutex_exit(&dev->l2ad_mtx); + l2arc_hdr_arcstats_increment(exists); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + } + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); + } + + mutex_exit(hash_lock); +} + +/* + * Starts an asynchronous read IO to read a log block. This is used in log + * block reconstruction to start reading the next block before we are done + * decoding and reconstructing the current block, to keep the l2arc device + * nice and hot with read IO to process. + * The returned zio will contain a newly allocated memory buffers for the IO + * data which should then be freed by the caller once the zio is no longer + * needed (i.e. due to it having completed). If you wish to abort this + * zio, you should do so using l2arc_log_blk_fetch_abort, which takes + * care of disposing of the allocated buffers correctly. 
+ */ +static zio_t * +l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, + l2arc_log_blk_phys_t *lb) +{ + uint32_t asize; + zio_t *pio; + l2arc_read_callback_t *cb; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); + pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, + cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); + + return (pio); +} + +/* + * Aborts a zio returned from l2arc_log_blk_fetch and frees the data + * buffers allocated for it. + */ +static void +l2arc_log_blk_fetch_abort(zio_t *zio) +{ + (void) zio_wait(zio); +} + +/* + * Creates a zio to update the device header on an l2arc device. 
+ */ +void +l2arc_dev_hdr_update(l2arc_dev_t *dev) +{ + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + int err; + + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; + l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; + l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); + l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; + l2dhdr->dh_evict = dev->l2ad_evict; + l2dhdr->dh_start = dev->l2ad_start; + l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); + l2dhdr->dh_flags = 0; + l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; + l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; + if (dev->l2ad_first) + l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, + NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); + + abd_free(abd); + + if (err != 0) { + zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " + "vdev guid: %llu", err, + (u_longlong_t)dev->l2ad_vdev->vdev_guid); + } +} + +/* + * Commits a log block to the L2ARC device. This routine is invoked from + * l2arc_write_buffers when the log block fills up. + * This function allocates some memory to temporarily hold the serialized + * buffer to be written. This is then released in l2arc_write_done. 
+ */ +static void +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + uint64_t psize, asize; + zio_t *wzio; + l2arc_lb_abd_buf_t *abd_buf; + uint8_t *tmpbuf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); + + tmpbuf = zio_buf_alloc(sizeof (*lb)); + abd_buf = zio_buf_alloc(sizeof (*abd_buf)); + abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); + + /* link the buffer into the block chain */ + lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; + lb->lb_magic = L2ARC_LOG_BLK_MAGIC; + + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ + list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ + psize = zio_compress_data(ZIO_COMPRESS_LZ4, + abd_buf->abd, tmpbuf, sizeof (*lb), 0); + + /* a log block is never entirely zero */ + ASSERT(psize != 0); + asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + ASSERT(asize <= sizeof (*lb)); + + /* + * Update the start log block pointer in the device header to point + * to the log block we're about to write. 
+ */ + l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; + l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; + l2dhdr->dh_start_lbps[0].lbp_payload_asize = + dev->l2ad_log_blk_payload_asize; + l2dhdr->dh_start_lbps[0].lbp_payload_start = + dev->l2ad_log_blk_payload_start; + L2BLK_SET_LSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); + L2BLK_SET_PSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); + L2BLK_SET_CHECKSUM( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_CHECKSUM_FLETCHER_4); + if (asize < sizeof (*lb)) { + /* compression succeeded */ + bzero(tmpbuf + psize, asize - psize); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_LZ4); + } else { + /* compression failed */ + bcopy(lb, tmpbuf, sizeof (*lb)); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_OFF); + } + + /* checksum what we're about to write */ + fletcher_4_native(tmpbuf, asize, NULL, + &l2dhdr->dh_start_lbps[0].lbp_cksum); + + abd_free(abd_buf->abd); + + /* perform the write itself */ + abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); + abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, + asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + (void) zio_nowait(wzio); + + dev->l2ad_hand += asize; + /* + * Include the committed log block's pointer in the list of pointers + * to log blocks present in the L2ARC device. 
+ */ + bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + /* bump the kstats */ + ARCSTAT_INCR(arcstat_l2_write_bytes, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_writes); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, + dev->l2ad_log_blk_payload_asize / asize); + + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; +} + +/* + * Validates an L2ARC log block address to make sure that it can be read + * from the provided L2ARC device. + */ +boolean_t +l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) +{ + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; + uint64_t start = lbp->lbp_payload_start; + boolean_t evicted = B_FALSE; + + /* + * A log block is valid if all of the following conditions are true: + * - it fits entirely (including its payload) between l2ad_start and + * l2ad_end + * - it has a valid size + * - neither the log block itself nor part of its payload was evicted + * by l2arc_evict(): + * + * l2ad_hand l2ad_evict + * | | lbp_daddr + * | start | | end + * | | | | | + * V V V V V + * l2ad_start ============================================ l2ad_end + * --------------------------|||| + * ^ ^ + * | log block + * payload + */ + + evicted = + l2arc_range_check_overlap(start, end, dev->l2ad_hand) || + l2arc_range_check_overlap(start, end, dev->l2ad_evict) || + 
l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); + + return (start >= dev->l2ad_start && end <= dev->l2ad_end && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && + (!evicted || dev->l2ad_first)); +} + +/* + * Inserts ARC buffer header `hdr' into the current L2ARC log block on + * the device. The buffer being inserted must be present in L2ARC. + * Returns B_TRUE if the L2ARC log block is full and needs to be committed + * to L2ARC, or B_FALSE if it still has room for more ARC buffers. + */ +static boolean_t +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_log_ent_phys_t *le; + + if (dev->l2ad_log_entries == 0) + return (B_FALSE); + + int index = dev->l2ad_log_ent_idx++; + + ASSERT3S(index, <, dev->l2ad_log_entries); + ASSERT(HDR_HAS_L2HDR(hdr)); + + le = &lb->lb_entries[index]; + bzero(le, sizeof (*le)); + le->le_dva = hdr->b_dva; + le->le_birth = hdr->b_birth; + le->le_daddr = hdr->b_l2hdr.b_daddr; + if (index == 0) + dev->l2ad_log_blk_payload_start = le->le_daddr; + L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); + L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); + L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); + le->le_complevel = hdr->b_complevel; + L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); + L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); + L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + + dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, + HDR_GET_PSIZE(hdr)); + + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); +} + +/* + * Checks whether a given L2ARC device address sits in a time-sequential + * range. 
The trick here is that the L2ARC is a rotary buffer, so we can't + * just do a range comparison, we need to handle the situation in which the + * range wraps around the end of the L2ARC device. Arguments: + * bottom -- Lower end of the range to check (written to earlier). + * top -- Upper end of the range to check (written to later). + * check -- The address for which we want to determine if it sits in + * between the top and bottom. + * + * The 3-way conditional below represents the following cases: + * + * bottom < top : Sequentially ordered case: + * --------+-------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |---------------============--------------| + * + * bottom > top: Looped-around case: + * --------+------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |===============---------------===========| + * ^ ^ + * | (or here?) | + * +---------------+--------- + * + * top == bottom : Just a single address comparison. + */ +boolean_t +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) +{ + if (bottom < top) + return (bottom <= check && check <= top); + else if (bottom > top) + return (check <= top || bottom <= check); + else + return (check == top); +} + EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); @@ -9368,104 +11002,120 @@ EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); /* BEGIN CSTYLED */ -module_param(zfs_arc_min, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, + param_get_long, ZMOD_RW, "Min arc size"); -module_param(zfs_arc_max, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, + param_get_long, ZMOD_RW, "Max arc size"); -module_param(zfs_arc_meta_limit, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, 
param_set_arc_long, + param_get_long, ZMOD_RW, "Metadata limit for arc size"); -module_param(zfs_arc_meta_limit_percent, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_meta_limit_percent, +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, + param_set_arc_long, param_get_long, ZMOD_RW, "Percent of arc size for arc meta limit"); -module_param(zfs_arc_meta_min, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, + param_get_long, ZMOD_RW, "Min arc metadata"); -module_param(zfs_arc_meta_prune, int, 0644); -MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, + "Meta objects to scan for prune"); -module_param(zfs_arc_meta_adjust_restarts, int, 0644); -MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts, - "Limit number of restarts in arc_adjust_meta"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW, + "Limit number of restarts in arc_evict_meta"); -module_param(zfs_arc_meta_strategy, int, 0644); -MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, + "Meta reclaim strategy"); -module_param(zfs_arc_grow_retry, int, 0644); -MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, + param_get_int, ZMOD_RW, "Seconds before growing arc size"); -module_param(zfs_arc_p_dampener_disable, int, 0644); -MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, + "Disable arc_p adapt dampener"); -module_param(zfs_arc_shrink_shift, int, 0644); -MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, + param_get_int, ZMOD_RW, "log2(fraction of 
arc to reclaim)"); -module_param(zfs_arc_pc_percent, uint, 0644); -MODULE_PARM_DESC(zfs_arc_pc_percent, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim arc to"); -module_param(zfs_arc_p_min_shift, int, 0644); -MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, + param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); -module_param(zfs_arc_average_blocksize, int, 0444); -MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, + "Target average block size"); -module_param(zfs_compressed_arc_enabled, int, 0644); -MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers"); +ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, + "Disable compressed arc buffers"); -module_param(zfs_arc_min_prefetch_ms, int, 0644); -MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, + param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); -module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644); -MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms, +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, + param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prescient prefetched block in ms"); -module_param(l2arc_write_max, ulong, 0644); -MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, + "Max write bytes per interval"); -module_param(l2arc_write_boost, ulong, 0644); -MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, + "Extra write bytes during device warmup"); -module_param(l2arc_headroom, ulong, 0644); 
-MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, + "Number of max device writes to precache"); -module_param(l2arc_headroom_boost, ulong, 0644); -MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, + "Compressed l2arc_headroom multiplier"); -module_param(l2arc_feed_secs, ulong, 0644); -MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, + "TRIM ahead L2ARC write size multiplier"); -module_param(l2arc_feed_min_ms, ulong, 0644); -MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, + "Seconds between L2ARC writing"); -module_param(l2arc_noprefetch, int, 0644); -MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, + "Min feed interval in milliseconds"); -module_param(l2arc_feed_again, int, 0644); -MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, + "Skip caching prefetched buffers"); -module_param(l2arc_norw, int, 0644); -MODULE_PARM_DESC(l2arc_norw, "No reads during writes"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, + "Turbo L2ARC warmup"); -module_param(zfs_arc_lotsfree_percent, int, 0644); -MODULE_PARM_DESC(zfs_arc_lotsfree_percent, - "System free memory I/O throttle in bytes"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, + "No reads during writes"); -module_param(zfs_arc_sys_free, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_sys_free, "System free memory target size in bytes"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW, + "Percent of ARC size allowed for L2ARC-only headers"); -module_param(zfs_arc_dnode_limit, 
ulong, 0644); -MODULE_PARM_DESC(zfs_arc_dnode_limit, "Minimum bytes of dnodes in arc"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, + "Rebuild the L2ARC when importing a pool"); -module_param(zfs_arc_dnode_limit_percent, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_dnode_limit_percent, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, + "Min size in bytes to write rebuild log blocks in L2ARC"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, + "Cache only MFU data from ARC into L2ARC"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, + param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, + param_get_long, ZMOD_RW, "System free memory target size in bytes"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, + param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, + param_set_arc_long, param_get_long, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); -module_param(zfs_arc_dnode_reduce_percent, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, + "When full, ARC allocation waits for eviction of this % of alloc size"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW, + "The number of headers to evict per sublist before moving to the next"); /* END CSTYLED */ -#endif diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c index ee24b1c312..aa09ded8db 100644 --- a/module/zfs/blkptr.c +++ b/module/zfs/blkptr.c @@ -17,6 +17,7 @@ * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 
*/ +#include #include #include #include @@ -142,7 +143,7 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) uint8_t dstbuf[BPE_PAYLOAD_SIZE]; decode_embedded_bp_compressed(bp, dstbuf); VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), - dstbuf, buf, psize, buflen)); + dstbuf, buf, psize, buflen, NULL)); } else { ASSERT3U(lsize, ==, psize); decode_embedded_bp_compressed(bp, buf); diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c index c81151e08a..47ea364ef2 100644 --- a/module/zfs/bplist.c +++ b/module/zfs/bplist.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include @@ -75,3 +75,17 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) } mutex_exit(&bpl->bpl_lock); } + +void +bplist_clear(bplist_t *bpl) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while ((bpe = list_head(&bpl->bpl_list))) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); + kmem_free(bpe, sizeof (*bpe)); + } + mutex_exit(&bpl->bpl_lock); +} diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 633801956e..e75ba5cccd 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -20,13 +20,13 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. 
*/ #include #include -#include +#include #include #include #include @@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) size = BPOBJ_SIZE_V0; else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) size = BPOBJ_SIZE_V1; + else if (!spa_feature_is_active(dmu_objset_spa(os), + SPA_FEATURE_LIVELIST)) + size = BPOBJ_SIZE_V2; else size = sizeof (bpobj_phys_t); @@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); bpo->bpo_phys = bpo->bpo_dbuf->db_data; return (0); } @@ -199,11 +203,21 @@ bpobj_close(bpobj_t *bpo) mutex_destroy(&bpo->bpo_lock); } +static boolean_t +bpobj_is_empty_impl(bpobj_t *bpo) +{ + ASSERT(MUTEX_HELD(&bpo->bpo_lock)); + return (bpo->bpo_phys->bpo_num_blkptrs == 0 && + (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); +} + boolean_t bpobj_is_empty(bpobj_t *bpo) { - return (bpo->bpo_phys->bpo_num_blkptrs == 0 && - (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); + mutex_enter(&bpo->bpo_lock); + boolean_t is_empty = bpobj_is_empty_impl(bpo); + mutex_exit(&bpo->bpo_lock); + return (is_empty); } /* @@ -245,8 +259,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) * Update bpobj and all of its parents with new space accounting. 
*/ static void -propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, - uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx) +propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, + int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) { for (; bpi != NULL; bpi = bpi->bpi_parent) { @@ -263,22 +277,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, static int bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, - dmu_tx_t *tx, boolean_t free) + int64_t start, dmu_tx_t *tx, boolean_t free) { int err = 0; - uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0; + int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); if (dbuf == NULL || dbuf->db_offset > offset) { if (dbuf) dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, - FTAG, &dbuf, 0); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, FTAG, &dbuf, 0); if (err) break; } @@ -288,18 +302,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, blkptr_t *bparray = dbuf->db_data; blkptr_t *bp = &bparray[blkoff]; - err = func(arg, bp, tx); + + boolean_t bp_freed = BP_GET_FREE(bp); + err = func(arg, bp, bp_freed, tx); if (err) break; if (free) { + int sign = bp_freed ? 
-1 : +1; spa_t *spa = dmu_objset_spa(bpo->bpo_os); - freed += bp_get_dsize_sync(spa, bp); - comp_freed += BP_GET_PSIZE(bp); - uncomp_freed += BP_GET_UCSIZE(bp); + freed += sign * bp_get_dsize_sync(spa, bp); + comp_freed += sign * BP_GET_PSIZE(bp); + uncomp_freed += sign * BP_GET_UCSIZE(bp); ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); bpo->bpo_phys->bpo_num_blkptrs--; ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed--; + ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); + } } } if (free) { @@ -328,7 +350,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, */ static int bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, - dmu_tx_t *tx, boolean_t free) + dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) { list_t stack; bpobj_info_t *bpi; @@ -341,6 +363,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, list_create(&stack, sizeof (bpobj_info_t), offsetof(bpobj_info_t, bpi_node)); mutex_enter(&initial_bpo->bpo_lock); + + if (bpobj_size != NULL) + *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; + list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); while ((bpi = list_head(&stack)) != NULL) { @@ -354,7 +380,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, dmu_buf_will_dirty(bpo->bpo_dbuf, tx); if (bpi->bpi_visited == B_FALSE) { - err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free); + err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, + free); bpi->bpi_visited = B_TRUE; if (err != 0) break; @@ -370,7 +397,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, * If there are no entries, there should * be no bytes. 
*/ - if (bpobj_is_empty(bpo)) { + if (bpobj_is_empty_impl(bpo)) { ASSERT0(bpo->bpo_phys->bpo_bytes); ASSERT0(bpo->bpo_phys->bpo_comp); ASSERT0(bpo->bpo_phys->bpo_uncomp); @@ -433,6 +460,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, * We have unprocessed subobjs. Process the next one. */ ASSERT(bpo->bpo_havecomp); + ASSERT3P(bpobj_size, ==, NULL); /* Add the last subobj to stack. */ int64_t i = bpi->bpi_unprocessed_subobjs - 1; @@ -489,16 +517,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) { - return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); } /* * Iterate the entries. If func returns nonzero, iteration will stop. + * + * If there are no subobjs: + * + * *bpobj_size can be used to return the number of block pointers in the + * bpobj. Note that this may be different from the number of block pointers + * that are iterated over, if iteration is terminated early (e.g. by the func + * returning nonzero). + * + * If there are concurrent (or subsequent) modifications to the bpobj then the + * returned *bpobj_size can be passed as "start" to + * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. */ int -bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + uint64_t *bpobj_size) { - return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); + return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); +} + +/* + * Iterate over the blkptrs in the bpobj beginning at index start. If func + * returns nonzero, iteration will stop. This is a livelist specific function + * since it assumes that there are no subobjs present. 
+ */ +int +livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + int64_t start) +{ + if (bpo->bpo_havesubobj) + VERIFY0(bpo->bpo_phys->bpo_subobjs); + bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); + int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); + kmem_free(bpi, sizeof (bpobj_info_t)); + return (err); } /* @@ -724,7 +781,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } void -bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { blkptr_t stored_bp = *bp; uint64_t offset; @@ -755,8 +813,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); } - /* We never need the fill count. */ stored_bp.blk_fill = 0; + BP_SET_FREE(&stored_bp, bp_freed); mutex_enter(&bpo->bpo_lock); @@ -779,11 +837,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) dmu_buf_will_dirty(bpo->bpo_dbuf, tx); bpo->bpo_phys->bpo_num_blkptrs++; - bpo->bpo_phys->bpo_bytes += + int sign = bp_freed ? 
-1 : +1; + bpo->bpo_phys->bpo_bytes += sign * bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); + } + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed++; } mutex_exit(&bpo->bpo_lock); } @@ -799,7 +862,7 @@ struct space_range_arg { /* ARGSUSED */ static int -space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct space_range_arg *sra = arg; @@ -863,3 +926,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, *uncompp = sra.uncomp; return (err); } + +/* + * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a + * bpobj are designated as free or allocated that information is not preserved + * in bplists. + */ +/* ARGSUSED */ +int +bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + bplist_t *bpl = arg; + bplist_append(bpl, bp); + return (0); +} diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index 8f78e8de59..1827a3c4e3 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #include @@ -33,7 +33,6 @@ #include #include #include -#include #include /* @@ -156,7 +155,8 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int err; struct bptree_args *ba = arg; - if (bp == NULL || BP_IS_HOLE(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_REDACTED(bp)) return (0); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c index f30253d24b..22539efc4e 100644 --- a/module/zfs/bqueue.c +++ b/module/zfs/bqueue.c @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2018 by Delphix. All rights reserved. */ #include @@ -27,13 +27,27 @@ obj2node(bqueue_t *q, void *data) /* * Initialize a blocking queue The maximum capacity of the queue is set to - * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, - * and offset should give its offset from the start of the struct. Return 0 on - * success, or -1 on failure. + * size. Types that are stored in a bqueue must contain a bqueue_node_t, + * and node_offset must be its offset from the start of the struct. + * fill_fraction is a performance tuning value; when the queue is full, any + * threads attempting to enqueue records will block. They will block until + * they're signaled, which will occur when the queue is at least 1/fill_fraction + * empty. Similar behavior occurs on dequeue; if the queue is empty, threads + * block. They will be signalled when the queue has 1/fill_fraction full, or + * when bqueue_flush is called. As a result, you must call bqueue_flush when + * you enqueue your final record on a thread, in case the dequeueing threads are + * currently blocked and that enqueue does not cause them to be awoken. + * Alternatively, this behavior can be disabled (causing signaling to happen + * immediately) by setting fill_fraction to any value larger than size. + * Return 0 on success, or -1 on failure. 
*/ int -bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) +bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, + size_t node_offset) { + if (fill_fraction == 0) { + return (-1); + } list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), node_offset + offsetof(bqueue_node_t, bqn_node)); cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); @@ -42,6 +56,7 @@ bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) q->bq_node_offset = node_offset; q->bq_size = 0; q->bq_maxsize = size; + q->bq_fill_fraction = fill_fraction; return (0); } @@ -53,11 +68,33 @@ bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) void bqueue_destroy(bqueue_t *q) { + mutex_enter(&q->bq_lock); ASSERT0(q->bq_size); cv_destroy(&q->bq_add_cv); cv_destroy(&q->bq_pop_cv); - mutex_destroy(&q->bq_lock); list_destroy(&q->bq_list); + mutex_exit(&q->bq_lock); + mutex_destroy(&q->bq_lock); +} + +static void +bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, + boolean_t flush) +{ + ASSERT3U(item_size, >, 0); + ASSERT3U(item_size, <=, q->bq_maxsize); + mutex_enter(&q->bq_lock); + obj2node(q, data)->bqn_size = item_size; + while (q->bq_size + item_size > q->bq_maxsize) { + cv_wait_sig(&q->bq_add_cv, &q->bq_lock); + } + q->bq_size += item_size; + list_insert_tail(&q->bq_list, data); + if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) + cv_signal(&q->bq_pop_cv); + if (flush) + cv_broadcast(&q->bq_pop_cv); + mutex_exit(&q->bq_lock); } /* @@ -68,18 +105,23 @@ bqueue_destroy(bqueue_t *q) void bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) { - ASSERT3U(item_size, >, 0); - ASSERT3U(item_size, <=, q->bq_maxsize); - mutex_enter(&q->bq_lock); - obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { - cv_wait(&q->bq_add_cv, &q->bq_lock); - } - q->bq_size += item_size; - list_insert_tail(&q->bq_list, data); - cv_signal(&q->bq_pop_cv); - mutex_exit(&q->bq_lock); + bqueue_enqueue_impl(q, data, item_size, B_FALSE); } 
+ +/* + * Enqueue an entry, and then flush the queue. This forces the popping threads + * to wake up, even if we're below the fill fraction. We have this in a single + * function, rather than having a separate call, because it prevents race + * conditions between the enqueuing thread and the dequeueing thread, where the + * enqueueing thread will wake up the dequeueing thread, that thread will + * destroy the condvar before the enqueuing thread is done. + */ +void +bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +{ + bqueue_enqueue_impl(q, data, item_size, B_TRUE); +} + /* * Take the first element off of q. If there are no elements on the queue, wait * until one is put there. Return the removed element. @@ -91,13 +133,14 @@ bqueue_dequeue(bqueue_t *q) uint64_t item_size; mutex_enter(&q->bq_lock); while (q->bq_size == 0) { - cv_wait(&q->bq_pop_cv, &q->bq_lock); + cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); } ret = list_remove_head(&q->bq_list); ASSERT3P(ret, !=, NULL); item_size = obj2node(q, ret)->bqn_size; q->bq_size -= item_size; - cv_signal(&q->bq_add_cv); + if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction)) + cv_signal(&q->bq_add_cv); mutex_exit(&q->bq_lock); return (ret); } diff --git a/module/zfs/btree.c b/module/zfs/btree.c new file mode 100644 index 0000000000..57b9dbbb2b --- /dev/null +++ b/module/zfs/btree.c @@ -0,0 +1,2124 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include + +kmem_cache_t *zfs_btree_leaf_cache; + +/* + * Control the extent of the verification that occurs when zfs_btree_verify is + * called. Primarily used for debugging when extending the btree logic and + * functionality. As the intensity is increased, new verification steps are + * added. These steps are cumulative; intensity = 3 includes the intensity = 1 + * and intensity = 2 steps as well. + * + * Intensity 1: Verify that the tree's height is consistent throughout. + * Intensity 2: Verify that a core node's children's parent pointers point + * to the core node. + * Intensity 3: Verify that the total number of elements in the tree matches the + * sum of the number of elements in each node. Also verifies that each node's + * count obeys the invariants (less than or equal to maximum value, greater than + * or equal to half the maximum minus one). + * Intensity 4: Verify that each element compares less than the element + * immediately after it and greater than the one immediately before it using the + * comparator function. For core nodes, also checks that each element is greater + * than the last element in the first of the two nodes it separates, and less + * than the first element in the second of the two nodes. + * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside + * of each node is poisoned appropriately. Note that poisoning always occurs if + * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal + * operation. + * + * Intensity 4 and 5 are particularly expensive to perform; the previous levels + * are a few memory operations per node, while these levels require multiple + * operations per element. In addition, when creating large btrees, these + * operations are called at every step, resulting in extremely slow operation + * (while the asymptotic complexity of the other steps is the same, the + * importance of the constant factors cannot be denied). 
+ */ +int zfs_btree_verify_intensity = 0; + +/* + * A convenience function to silence warnings from memmove's return value and + * change argument order to src, dest. + */ +static void +bmov(const void *src, void *dest, size_t size) +{ + (void) memmove(dest, src, size); +} + +#ifdef _ILP32 +#define BTREE_POISON 0xabadb10c +#else +#define BTREE_POISON 0xabadb10cdeadbeef +#endif + +static void +zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, + BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - + hdr->bth_count * size); + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + node->btc_children[i] = + (zfs_btree_hdr_t *)BTREE_POISON; + } + (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, + (BTREE_CORE_ELEMS - hdr->bth_count) * size); + } +#endif +} + +static inline void +zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + ASSERT3U(offset, >=, hdr->bth_count); + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + offset * size, 0x0f, size); + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + node->btc_children[offset + 1] = + (zfs_btree_hdr_t *)BTREE_POISON; + (void) memset(node->btc_elems + offset * size, 0x0f, size); + } +#endif +} + +static inline void +zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + uint8_t eval = 0x0f; + if (hdr->bth_core) { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; + VERIFY3P(node->btc_children[offset + 1], ==, cval); + for (int i = 0; i < 
size; i++) + VERIFY3U(node->btc_elems[offset * size + i], ==, eval); + } else { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + for (int i = 0; i < size; i++) + VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); + } +#endif +} + +void +zfs_btree_init(void) +{ + zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", + BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, + NULL, 0); +} + +void +zfs_btree_fini(void) +{ + kmem_cache_destroy(zfs_btree_leaf_cache); +} + +void +zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), + size_t size) +{ + /* + * We need a minimmum of 4 elements so that when we split a node we + * always have at least two elements in each node. This simplifies the + * logic in zfs_btree_bulk_finish, since it means the last leaf will + * always have a left sibling to share with (unless it's the root). + */ + ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); + + bzero(tree, sizeof (*tree)); + tree->bt_compar = compar; + tree->bt_elem_size = size; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +/* + * Find value in the array of elements provided. Uses a simple binary search. + */ +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, + const void *value, zfs_btree_index_t *where) +{ + uint64_t max = nelems; + uint64_t min = 0; + while (max > min) { + uint64_t idx = (min + max) / 2; + uint8_t *cur = buf + idx * tree->bt_elem_size; + int comp = tree->bt_compar(cur, value); + if (comp == -1) { + min = idx + 1; + } else if (comp == 1) { + max = idx; + } else { + ASSERT0(comp); + where->bti_offset = idx; + where->bti_before = B_FALSE; + return (cur); + } + } + + where->bti_offset = max; + where->bti_before = B_TRUE; + return (NULL); +} + +/* + * Find the given value in the tree. where may be passed as null to use as a + * membership test or if the btree is being used as a map. 
+ */ +void * +zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) +{ + if (tree->bt_height == -1) { + if (where != NULL) { + where->bti_node = NULL; + where->bti_offset = 0; + } + ASSERT0(tree->bt_num_elems); + return (NULL); + } + + /* + * If we're in bulk-insert mode, we check the last spot in the tree + * and the last leaf in the tree before doing the normal search, + * because for most workloads the vast majority of finds in + * bulk-insert mode are to insert new elements. + */ + zfs_btree_index_t idx; + if (tree->bt_bulk != NULL) { + zfs_btree_leaf_t *last_leaf = tree->bt_bulk; + int compar = tree->bt_compar(last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), + value); + if (compar < 0) { + /* + * If what they're looking for is after the last + * element, it's not in the tree. + */ + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count; + where->bti_before = B_TRUE; + } + return (NULL); + } else if (compar == 0) { + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count - 1; + where->bti_before = B_FALSE; + } + return (last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * + tree->bt_elem_size)); + } + if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { + /* + * If what they're looking for is after the first + * element in the last leaf, it's in the last leaf or + * it's not in the tree. + */ + void *d = zfs_btree_find_in_buf(tree, + last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, + value, &idx); + + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)last_leaf; + *where = idx; + } + return (d); + } + } + + zfs_btree_core_t *node = NULL; + uint64_t child = 0; + uint64_t depth = 0; + + /* + * Iterate down the tree, finding which child the value should be in + * by comparing with the separators. 
+ */ + for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; + node = (zfs_btree_core_t *)node->btc_children[child], depth++) { + ASSERT3P(node, !=, NULL); + void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + node->btc_hdr.bth_count, value, &idx); + EQUIV(d != NULL, !idx.bti_before); + if (d != NULL) { + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)node; + *where = idx; + } + return (d); + } + ASSERT(idx.bti_before); + child = idx.bti_offset; + } + + /* + * The value is in this leaf, or it would be if it were in the + * tree. Find its proper location and return it. + */ + zfs_btree_leaf_t *leaf = (depth == 0 ? + (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); + void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, + leaf->btl_hdr.bth_count, value, &idx); + + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)leaf; + *where = idx; + } + + return (d); +} + +/* + * To explain the following functions, it is useful to understand the four + * kinds of shifts used in btree operation. First, a shift is a movement of + * elements within a node. It is used to create gaps for inserting new + * elements and children, or cover gaps created when things are removed. A + * shift has two fundamental properties, each of which can be one of two + * values, making four types of shifts. There is the direction of the shift + * (left or right) and the shape of the shift (parallelogram or isoceles + * trapezoid (shortened to trapezoid hereafter)). The shape distinction only + * applies to shifts of core nodes. + * + * The names derive from the following imagining of the layout of a node: + * + * Elements: * * * * * * * ... * * * + * Children: * * * * * * * * ... * * * + * + * This layout follows from the fact that the elements act as separators + * between pairs of children, and that children root subtrees "below" the + * current node. 
A left and right shift are fairly self-explanatory; a left + * shift moves things to the left, while a right shift moves things to the + * right. A parallelogram shift is a shift with the same number of elements + * and children being moved, while a trapezoid shift is a shift that moves one + * more children than elements. An example follows: + * + * A parallelogram shift could contain the following: + * _______________ + * \* * * * \ * * * ... * * * + * * \ * * * *\ * * * ... * * * + * --------------- + * A trapezoid shift could contain the following: + * ___________ + * * / * * * \ * * * ... * * * + * * / * * * *\ * * * ... * * * + * --------------- + * + * Note that a parallelogram shift is always shaped like a "left-leaning" + * parallelogram, where the starting index of the children being moved is + * always one higher than the starting index of the elements being moved. No + * "right-leaning" parallelogram shifts are needed (shifts where the starting + * element index and starting child index being moved are the same) to achieve + * any btree operations, so we ignore them. + */ + +enum bt_shift_shape { + BSS_TRAPEZOID, + BSS_PARALLELOGRAM +}; + +enum bt_shift_direction { + BSD_LEFT, + BSD_RIGHT +}; + +/* + * Shift elements and children in the provided core node by off spots. The + * first element moved is idx, and count elements are moved. The shape of the + * shift is determined by shape. The direction is determined by dir. + */ +static inline void +bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_shape shape, + enum bt_shift_direction dir) +{ + size_t size = tree->bt_elem_size; + ASSERT(node->btc_hdr.bth_core); + + uint8_t *e_start = node->btc_elems + idx * size; + int sign = (dir == BSD_LEFT ? 
-1 : +1); + uint8_t *e_out = e_start + sign * off * size; + uint64_t e_count = count; + bmov(e_start, e_out, e_count * size); + + zfs_btree_hdr_t **c_start = node->btc_children + idx + + (shape == BSS_TRAPEZOID ? 0 : 1); + zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : + c_start + off); + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bmov(c_start, c_out, c_count * sizeof (*c_start)); +} + +/* + * Shift elements and children in the provided core node left by one spot. + * The first element moved is idx, and count elements are moved. The + * shape of the shift is determined by trap; true if the shift is a trapezoid, + * false if it is a parallelogram. + */ +static inline void +bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) +{ + bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); +} + +/* + * Shift elements and children in the provided core node right by one spot. + * Starts with elements[idx] and children[idx] and one more child than element. + */ +static inline void +bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) +{ + bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); +} + +/* + * Shift elements and children in the provided leaf node by off spots. + * The first element moved is idx, and count elements are moved. The direction + * is determined by left. + */ +static inline void +bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_direction dir) +{ + size_t size = tree->bt_elem_size; + ASSERT(!node->btl_hdr.bth_core); + + uint8_t *start = node->btl_elems + idx * size; + int sign = (dir == BSD_LEFT ? 
-1 : +1); + uint8_t *out = start + sign * off * size; + bmov(start, out, count * size); +} + +static inline void +bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); +} + +static inline void +bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); +} + +/* + * Move children and elements from one core node to another. The shape + * parameter behaves the same as it does in the shift logic. + */ +static inline void +bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, + uint64_t count, zfs_btree_core_t *dest, uint64_t didx, + enum bt_shift_shape shape) +{ + size_t size = tree->bt_elem_size; + ASSERT(source->btc_hdr.bth_core); + ASSERT(dest->btc_hdr.bth_core); + + bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size, + count * size); + + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), + dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), + c_count * sizeof (*source->btc_children)); +} + +static inline void +bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, + uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) +{ + size_t size = tree->bt_elem_size; + ASSERT(!source->btl_hdr.bth_core); + ASSERT(!dest->btl_hdr.bth_core); + + bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + count * size); +} + +/* + * Find the first element in the subtree rooted at hdr, return its value and + * put its location in where if non-null. 
+ */ +static void * +zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) +{ + zfs_btree_hdr_t *node; + + for (node = hdr; node->bth_core; node = + ((zfs_btree_core_t *)node)->btc_children[0]) + ; + + ASSERT(!node->bth_core); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; + if (where != NULL) { + where->bti_node = node; + where->bti_offset = 0; + where->bti_before = B_FALSE; + } + return (&leaf->btl_elems[0]); +} + +/* Insert an element and a child into a core node at the given offset. */ +static void +zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, + uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) +{ + uint64_t size = tree->bt_elem_size; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + ASSERT3P(par_hdr, ==, new_node->bth_parent); + ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, par_hdr, + par_hdr->bth_count); + } + /* Shift existing elements and children */ + uint64_t count = par_hdr->bth_count - offset; + bt_shift_core_right(tree, parent, offset, count, + BSS_PARALLELOGRAM); + + /* Insert new values */ + parent->btc_children[offset + 1] = new_node; + bmov(buf, parent->btc_elems + offset * size, size); + par_hdr->bth_count++; +} + +/* + * Insert new_node into the parent of old_node directly after old_node, with + * buf as the dividing element between the two. + */ +static void +zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, + zfs_btree_hdr_t *new_node, void *buf) +{ + ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); + uint64_t size = tree->bt_elem_size; + zfs_btree_core_t *parent = old_node->bth_parent; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + + /* + * If this is the root node we were splitting, we create a new root + * and increase the height of the tree. 
+ */ + if (parent == NULL) { + ASSERT3P(old_node, ==, tree->bt_root); + tree->bt_num_nodes++; + zfs_btree_core_t *new_root = + kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * + size, KM_SLEEP); + zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; + new_root_hdr->bth_parent = NULL; + new_root_hdr->bth_core = B_TRUE; + new_root_hdr->bth_count = 1; + + old_node->bth_parent = new_node->bth_parent = new_root; + new_root->btc_children[0] = old_node; + new_root->btc_children[1] = new_node; + bmov(buf, new_root->btc_elems, size); + + tree->bt_height++; + tree->bt_root = new_root_hdr; + zfs_btree_poison_node(tree, new_root_hdr); + return; + } + + /* + * Since we have the new separator, binary search for where to put + * new_node. + */ + zfs_btree_index_t idx; + ASSERT(par_hdr->bth_core); + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + par_hdr->bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + uint64_t offset = idx.bti_offset; + ASSERT3U(offset, <=, par_hdr->bth_count); + ASSERT3P(parent->btc_children[offset], ==, old_node); + + /* + * If the parent isn't full, shift things to accommodate our insertions + * and return. + */ + if (par_hdr->bth_count != BTREE_CORE_ELEMS) { + zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); + return; + } + + /* + * We need to split this core node into two. Currently there are + * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for + * BTREE_CORE_ELEMS + 2. Some of the children will be part of the + * current node, and the others will be moved to the new core node. + * There are BTREE_CORE_ELEMS + 1 elements including the new one. One + * will be used as the new separator in our parent, and the others + * will be split among the two core nodes. + * + * Usually we will split the node in half evenly, with + * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we + * instead move only about a quarter of the elements (and children) to + * the new node. 
Since the average state after a long time is a 3/4 + * full node, shortcutting directly to that state improves efficiency. + * + * We do this in two stages: first we split into two nodes, and then we + * reuse our existing logic to insert the new element and child. + */ + uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? + 2 : 4)) - 1, 2); + uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; + ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * size, KM_SLEEP); + zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; + new_par_hdr->bth_parent = par_hdr->bth_parent; + new_par_hdr->bth_core = B_TRUE; + new_par_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_par_hdr); + + par_hdr->bth_count = keep_count; + + bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, + 0, BSS_TRAPEZOID); + + /* Store the new separator in a buffer. */ + uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); + bmov(parent->btc_elems + keep_count * size, tmp_buf, + size); + zfs_btree_poison_node(tree, par_hdr); + + if (offset < keep_count) { + /* Insert the new node into the left half */ + zfs_btree_insert_core_impl(tree, parent, offset, new_node, + buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else if (offset > keep_count) { + /* Insert the new node into the right half */ + new_node->bth_parent = new_parent; + zfs_btree_insert_core_impl(tree, new_parent, + offset - keep_count - 1, new_node, buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else { + /* + * Move the new separator into the right half, and replace it + * with buf. We also need to shift back the elements in the + * right half to accommodate new_node. 
+ */ + bt_shift_core_right(tree, new_parent, 0, move_count, + BSS_TRAPEZOID); + new_parent->btc_children[0] = new_node; + bmov(tmp_buf, new_parent->btc_elems, size); + new_par_hdr->bth_count++; + } + kmem_free(tmp_buf, size); + zfs_btree_poison_node(tree, par_hdr); + + for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) + new_parent->btc_children[i]->bth_parent = new_parent; + + for (int i = 0; i <= parent->btc_hdr.bth_count; i++) + ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting. + */ + zfs_btree_insert_into_parent(tree, &parent->btc_hdr, + &new_parent->btc_hdr, buf); +} + +/* Insert an element into a leaf node at the given offset. */ +static void +zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + uint64_t idx, const void *value) +{ + uint64_t size = tree->bt_elem_size; + uint8_t *start = leaf->btl_elems + (idx * size); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + uint64_t capacity __maybe_unused = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + uint64_t count = leaf->btl_hdr.bth_count - idx; + ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, + leaf->btl_hdr.bth_count); + } + + bt_shift_leaf_right(tree, leaf, idx, count); + bmov(value, start, size); + hdr->bth_count++; +} + +/* Helper function for inserting a new value into leaf at the given index. */ +static void +zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + const void *value, uint64_t idx) +{ + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + /* + * If the leaf isn't full, shift the elements after idx and insert + * value. 
+ */ + if (leaf->btl_hdr.bth_count != capacity) { + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); + return; + } + + /* + * Otherwise, we split the leaf node into two nodes. If we're not bulk + * inserting, each is of size (capacity / 2). If we are bulk + * inserting, we move a quarter of the elements to the new node so + * inserts into the old node don't cause immediate splitting but the + * tree stays relatively dense. Since the average state after a long + * time is a 3/4 full node, shortcutting directly to that state + * improves efficiency. At the end of the bulk insertion process + * we'll need to go through and fix up any nodes (the last leaf and + * its ancestors, potentially) that are below the minimum. + * + * In either case, we're left with one extra element. The leftover + * element will become the new dividing element between the two nodes. + */ + uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - + 1, 2); + uint64_t keep_count = capacity - move_count - 1; + ASSERT3U(capacity - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; + new_hdr->bth_parent = leaf->btl_hdr.bth_parent; + new_hdr->bth_core = B_FALSE; + new_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_hdr); + + leaf->btl_hdr.bth_count = keep_count; + + if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) + tree->bt_bulk = new_leaf; + + /* Copy the back part to the new leaf. */ + bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, + 0); + + /* We store the new separator in a buffer we control for simplicity. */ + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(leaf->btl_elems + (keep_count * size), buf, size); + zfs_btree_poison_node(tree, &leaf->btl_hdr); + + if (idx < keep_count) { + /* Insert into the existing leaf. 
*/ + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); + } else if (idx > keep_count) { + /* Insert into the new leaf. */ + zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - + 1, value); + } else { + /* + * Shift the elements in the new leaf to make room for the + * separator, and use the new value as the new separator. + */ + bt_shift_leaf_right(tree, new_leaf, 0, move_count); + bmov(buf, new_leaf->btl_elems, size); + bmov(value, buf, size); + new_hdr->bth_count++; + } + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting, bur only of core nodes. + */ + zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, + buf); + kmem_free(buf, size); +} + +static uint64_t +zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + void *buf; + if (hdr->bth_core) { + buf = ((zfs_btree_core_t *)hdr)->btc_elems; + } else { + buf = ((zfs_btree_leaf_t *)hdr)->btl_elems; + } + zfs_btree_index_t idx; + zfs_btree_core_t *parent = hdr->bth_parent; + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + parent->btc_hdr.bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); + ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); + return (idx.bti_offset); +} + +/* + * Take the b-tree out of bulk insert mode. During bulk-insert mode, some + * nodes may violate the invariant that non-root nodes must be at least half + * full. All nodes violating this invariant should be the last node in their + * particular level. To correct the invariant, we take values from their left + * neighbor until they are half full. They must have a left neighbor at their + * level because the last node at a level is not the first node unless it's + * the root. 
+ */ +static void +zfs_btree_bulk_finish(zfs_btree_t *tree) +{ + ASSERT3P(tree->bt_bulk, !=, NULL); + ASSERT3P(tree->bt_root, !=, NULL); + zfs_btree_leaf_t *leaf = tree->bt_bulk; + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + /* + * The invariant doesn't apply to the root node, if that's the only + * node in the tree we're done. + */ + if (parent == NULL) { + tree->bt_bulk = NULL; + return; + } + + /* First, take elements to rebalance the leaf node. */ + if (hdr->bth_count < capacity / 2) { + /* + * First, find the left neighbor. The simplest way to do this + * is to call zfs_btree_prev twice; the first time finds some + * ancestor of this node, and the second time finds the left + * neighbor. The ancestor found is the lowest common ancestor + * of leaf and the neighbor. + */ + zfs_btree_index_t idx = { + .bti_node = hdr, + .bti_offset = 0 + }; + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(idx.bti_node->bth_core); + zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; + uint64_t common_idx = idx.bti_offset; + + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(!idx.bti_node->bth_core); + zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; + zfs_btree_hdr_t *l_hdr = idx.bti_node; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + leaf->btl_hdr.bth_count + i); + } + } + + /* First, shift elements in leaf back. */ + bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, + BSD_RIGHT); + + /* Next, move the separator from the common ancestor to leaf. 
*/ + uint8_t *separator = common->btc_elems + (common_idx * size); + uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); + bmov(separator, out, size); + move_count--; + + /* + * Now we move elements from the tail of the left neighbor to + * fill the remaining spots in leaf. + */ + bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - + move_count, move_count, leaf, 0); + + /* + * Finally, move the new last element in the left neighbor to + * the separator. + */ + bmov(l_neighbor->btl_elems + (l_hdr->bth_count - + move_count - 1) * size, separator, size); + + /* Adjust the node's counts, and we're done. */ + l_hdr->bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_hdr->bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + zfs_btree_poison_node(tree, l_hdr); + } + + /* + * Now we have to rebalance any ancestors of leaf that may also + * violate the invariant. + */ + capacity = BTREE_CORE_ELEMS; + while (parent->btc_hdr.bth_parent != NULL) { + zfs_btree_core_t *cur = parent; + zfs_btree_hdr_t *hdr = &cur->btc_hdr; + parent = hdr->bth_parent; + /* + * If the invariant isn't violated, move on to the next + * ancestor. + */ + if (hdr->bth_count >= capacity / 2) + continue; + + /* + * Because the smallest number of nodes we can move when + * splitting is 2, we never need to worry about not having a + * left sibling (a sibling is a neighbor with the same parent). + */ + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + ASSERT3U(parent_idx, >, 0); + zfs_btree_core_t *l_neighbor = + (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + hdr->bth_count + i); + } + } + /* First, shift things in the right node back. 
*/ + bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, + BSS_TRAPEZOID, BSD_RIGHT); + + /* Next, move the separator to the right node. */ + uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * + size); + uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); + bmov(separator, e_out, size); + + /* + * Now, move elements and children from the left node to the + * right. We move one more child than elements. + */ + move_count--; + uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; + bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, + BSS_TRAPEZOID); + + /* + * Finally, move the last element in the left node to the + * separator's position. + */ + move_idx--; + bmov(l_neighbor->btc_elems + move_idx * size, separator, size); + + l_neighbor->btc_hdr.bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + + zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); + + for (int i = 0; i <= hdr->bth_count; i++) + cur->btc_children[i]->bth_parent = cur; + } + + tree->bt_bulk = NULL; +} + +/* + * Insert value into tree at the location specified by where. + */ +void +zfs_btree_add_idx(zfs_btree_t *tree, const void *value, + const zfs_btree_index_t *where) +{ + zfs_btree_index_t idx = {0}; + + /* If we're not inserting in the last leaf, end bulk insert mode. */ + if (tree->bt_bulk != NULL) { + if (where->bti_node != &tree->bt_bulk->btl_hdr) { + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); + where = &idx; + } + } + + tree->bt_num_elems++; + /* + * If this is the first element in the tree, create a leaf root node + * and add the value to it. 
+ */ + if (where->bti_node == NULL) { + ASSERT3U(tree->bt_num_elems, ==, 1); + ASSERT3S(tree->bt_height, ==, -1); + ASSERT3P(tree->bt_root, ==, NULL); + ASSERT0(where->bti_offset); + + tree->bt_num_nodes++; + zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + tree->bt_root = &leaf->btl_hdr; + tree->bt_height++; + + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + hdr->bth_parent = NULL; + hdr->bth_core = B_FALSE; + hdr->bth_count = 0; + zfs_btree_poison_node(tree, hdr); + + zfs_btree_insert_into_leaf(tree, leaf, value, 0); + tree->bt_bulk = leaf; + } else if (!where->bti_node->bth_core) { + /* + * If we're inserting into a leaf, go directly to the helper + * function. + */ + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)where->bti_node, value, + where->bti_offset); + } else { + /* + * If we're inserting into a core node, we can't just shift + * the existing element in that slot in the same node without + * breaking our ordering invariants. Instead we place the new + * value in the node at that spot and then insert the old + * separator into the first slot in the subtree to the right. + */ + ASSERT(where->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; + + /* + * We can ignore bti_before, because either way the value + * should end up in bti_offset. + */ + uint64_t off = where->bti_offset; + zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; + size_t size = tree->bt_elem_size; + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(node->btc_elems + off * size, buf, size); + bmov(value, node->btc_elems + off * size, size); + + /* + * Find the first slot in the subtree to the right, insert + * there. 
+ */ + zfs_btree_index_t new_idx; + VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); + ASSERT0(new_idx.bti_offset); + ASSERT(!new_idx.bti_node->bth_core); + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); + kmem_free(buf, size); + } + zfs_btree_verify(tree); +} + +/* + * Return the first element in the tree, and put its location in where if + * non-null. + */ +void * +zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) +{ + if (tree->bt_height == -1) { + ASSERT0(tree->bt_num_elems); + return (NULL); + } + return (zfs_btree_first_helper(tree->bt_root, where)); +} + +/* + * Find the last element in the subtree rooted at hdr, return its value and + * put its location in where if non-null. + */ +static void * +zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, + zfs_btree_index_t *where) +{ + zfs_btree_hdr_t *node; + + for (node = hdr; node->bth_core; node = + ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) + ; + + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; + if (where != NULL) { + where->bti_node = node; + where->bti_offset = node->bth_count - 1; + where->bti_before = B_FALSE; + } + return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size); +} + +/* + * Return the last element in the tree, and put its location in where if + * non-null. + */ +void * +zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) +{ + if (tree->bt_height == -1) { + ASSERT0(tree->bt_num_elems); + return (NULL); + } + return (zfs_btree_last_helper(tree, tree->bt_root, where)); +} + +/* + * This function contains the logic to find the next node in the tree. A + * helper function is used because there are multiple internal consumers of + * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each + * node after we've finished with it.
+ */ +static void * +zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx, + void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) +{ + if (idx->bti_node == NULL) { + ASSERT3S(tree->bt_height, ==, -1); + return (NULL); + } + + uint64_t offset = idx->bti_offset; + if (!idx->bti_node->bth_core) { + /* + * When finding the next element of an element in a leaf, + * there are two cases. If the element isn't the last one in + * the leaf, in which case we just return the next element in + * the leaf. Otherwise, we need to traverse up our parents + * until we find one where our ancestor isn't the last child + * of its parent. Once we do, the next element is the + * separator after our ancestor in its parent. + */ + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + uint64_t new_off = offset + (idx->bti_before ? 0 : 1); + if (leaf->btl_hdr.bth_count > new_off) { + out_idx->bti_node = &leaf->btl_hdr; + out_idx->bti_offset = new_off; + out_idx->bti_before = B_FALSE; + return (leaf->btl_elems + new_off * tree->bt_elem_size); + } + + zfs_btree_hdr_t *prev = &leaf->btl_hdr; + for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; + node != NULL; node = node->btc_hdr.bth_parent) { + zfs_btree_hdr_t *hdr = &node->btc_hdr; + ASSERT(hdr->bth_core); + uint64_t i = zfs_btree_find_parent_idx(tree, prev); + if (done_func != NULL) + done_func(tree, prev); + if (i == hdr->bth_count) { + prev = hdr; + continue; + } + out_idx->bti_node = hdr; + out_idx->bti_offset = i; + out_idx->bti_before = B_FALSE; + return (node->btc_elems + i * tree->bt_elem_size); + } + if (done_func != NULL) + done_func(tree, prev); + /* + * We've traversed all the way up and been at the end of the + * node every time, so this was the last element in the tree. + */ + return (NULL); + } + + /* If we were before an element in a core node, return that element. 
*/ + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + if (idx->bti_before) { + out_idx->bti_before = B_FALSE; + return (node->btc_elems + offset * tree->bt_elem_size); + } + + /* + * The next element from one in a core node is the first element in + * the subtree just to the right of the separator. + */ + zfs_btree_hdr_t *child = node->btc_children[offset + 1]; + return (zfs_btree_first_helper(child, out_idx)); +} + +/* + * Return the next valued node in the tree. The same address can be safely + * passed for idx and out_idx. + */ +void * +zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) +{ + return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); +} + +/* + * Return the previous valued node in the tree. The same value can be safely + * passed for idx and out_idx. + */ +void * +zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) +{ + if (idx->bti_node == NULL) { + ASSERT3S(tree->bt_height, ==, -1); + return (NULL); + } + + uint64_t offset = idx->bti_offset; + if (!idx->bti_node->bth_core) { + /* + * When finding the previous element of an element in a leaf, + * there are two cases. If the element isn't the first one in + * the leaf, in which case we just return the previous element + * in the leaf. Otherwise, we need to traverse up our parents + * until we find one where our previous ancestor isn't the + * first child. Once we do, the previous element is the + * separator after our previous ancestor. 
+ */ + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + if (offset != 0) { + out_idx->bti_node = &leaf->btl_hdr; + out_idx->bti_offset = offset - 1; + out_idx->bti_before = B_FALSE; + return (leaf->btl_elems + (offset - 1) * + tree->bt_elem_size); + } + zfs_btree_hdr_t *prev = &leaf->btl_hdr; + for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; + node != NULL; node = node->btc_hdr.bth_parent) { + zfs_btree_hdr_t *hdr = &node->btc_hdr; + ASSERT(hdr->bth_core); + uint64_t i = zfs_btree_find_parent_idx(tree, prev); + if (i == 0) { + prev = hdr; + continue; + } + out_idx->bti_node = hdr; + out_idx->bti_offset = i - 1; + out_idx->bti_before = B_FALSE; + return (node->btc_elems + (i - 1) * tree->bt_elem_size); + } + /* + * We've traversed all the way up and been at the start of the + * node every time, so this was the first node in the tree. + */ + return (NULL); + } + + /* + * The previous element from one in a core node is the last element in + * the subtree just to the left of the separator. + */ + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + zfs_btree_hdr_t *child = node->btc_children[offset]; + return (zfs_btree_last_helper(tree, child, out_idx)); +} + +/* + * Get the value at the provided index in the tree. + * + * Note that the value returned from this function can be mutated, but only + * if it will not change the ordering of the element with respect to any other + * elements that could be in the tree. + */ +void * +zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) +{ + ASSERT(!idx->bti_before); + if (!idx->bti_node->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); + } + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); +} + +/* Add the given value to the tree. 
Must not already be in the tree. */ +void +zfs_btree_add(zfs_btree_t *tree, const void *node) +{ + zfs_btree_index_t where = {0}; + VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); + zfs_btree_add_idx(tree, node, &where); +} + +/* Helper function to free a tree node. */ +static void +zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) +{ + tree->bt_num_nodes--; + if (!node->bth_core) { + kmem_cache_free(zfs_btree_leaf_cache, node); + } else { + kmem_free(node, sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * tree->bt_elem_size); + } +} + +/* + * Remove the rm_hdr and the separator to its left from the parent node. The + * buffer that rm_hdr was stored in may already be freed, so its contents + * cannot be accessed. + */ +static void +zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, + zfs_btree_hdr_t *rm_hdr) +{ + size_t size = tree->bt_elem_size; + uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; + zfs_btree_hdr_t *hdr = &node->btc_hdr; + /* + * If the node is the root node and rm_hdr is one of two children, + * promote the other child to the root. + */ + if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { + ASSERT3U(hdr->bth_count, ==, 1); + ASSERT3P(tree->bt_root, ==, node); + ASSERT3P(node->btc_children[1], ==, rm_hdr); + tree->bt_root = node->btc_children[0]; + node->btc_children[0]->bth_parent = NULL; + zfs_btree_node_destroy(tree, hdr); + tree->bt_height--; + return; + } + + uint64_t idx; + for (idx = 0; idx <= hdr->bth_count; idx++) { + if (node->btc_children[idx] == rm_hdr) + break; + } + ASSERT3U(idx, <=, hdr->bth_count); + + /* + * If the node is the root or it has more than the minimum number of + * children, just remove the child and separator, and return. + */ + if (hdr->bth_parent == NULL || + hdr->bth_count > min_count) { + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. 
+ */ + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, + BSS_PARALLELOGRAM); + hdr->bth_count--; + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); + return; + } + + ASSERT3U(hdr->bth_count, ==, min_count); + + /* + * Now we try to take a node from a neighbor. We check left, then + * right. If the neighbor exists and has more than the minimum number + * of elements, we move the separator between us and them to our + * node, move their closest element (last for left, first for right) + * to the separator, and move their closest child to our node. Along + * the way we need to collapse the gap made by idx, and (for our right + * neighbor) the gap made by removing their first element and child. + * + * Note: this logic currently doesn't support taking from a neighbor + * that isn't a sibling (i.e. a neighbor with a different + * parent). This isn't critical functionality, but may be worth + * implementing in the future for completeness' sake. + */ + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + + zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + parent->btc_children[parent_idx - 1]); + if (l_hdr != NULL && l_hdr->bth_count > min_count) { + /* We can take a node from the left neighbor. */ + ASSERT(l_hdr->bth_core); + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; + + /* + * Start by shifting the elements and children in the current + * node to the right by one spot. + */ + bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); + + /* + * Move the separator between node and neighbor to the first + * element slot in the current node. + */ + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, node->btc_elems, size); + + /* Move the last child of neighbor to our first child slot. 
*/ + zfs_btree_hdr_t **take_child = neighbor->btc_children + + l_hdr->bth_count; + bmov(take_child, node->btc_children, sizeof (*take_child)); + node->btc_children[0]->bth_parent = node; + + /* Move the last element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems + + (l_hdr->bth_count - 1) * size; + bmov(take_elem, separator, size); + l_hdr->bth_count--; + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + return; + } + + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can take a node from the right neighbor. */ + ASSERT(r_hdr->bth_core); + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; + + /* + * Shift elements in node left by one spot to overwrite rm_hdr + * and the separator before it. + */ + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, + BSS_PARALLELOGRAM); + + /* + * Move the separator between node and neighbor to the last + * element spot in node. + */ + uint8_t *separator = parent->btc_elems + parent_idx * size; + bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, + size); + + /* + * Move the first child of neighbor to the last child spot in + * node. + */ + zfs_btree_hdr_t **take_child = neighbor->btc_children; + bmov(take_child, node->btc_children + hdr->bth_count, + sizeof (*take_child)); + node->btc_children[hdr->bth_count]->bth_parent = node; + + /* Move the first element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems; + bmov(take_elem, separator, size); + r_hdr->bth_count--; + + /* + * Shift the elements and children of neighbor to cover the + * stolen elements. 
+ */ + bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, + BSS_TRAPEZOID); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + return; + } + + /* + * In this case, neither of our neighbors can spare an element, so we + * need to merge with one of them. We prefer the left one, + * arbitrarily. Move the separator into the leftmost merging node + * (which may be us or the left neighbor), and then move the right + * merging node's elements. Once that's done, we go back and delete + * the element we're removing. Finally, go into the parent and delete + * the right merging node and the separator. This may cause further + * merging. + */ + zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; + uint64_t new_idx = idx; + if (l_hdr != NULL) { + keep_hdr = l_hdr; + new_rm_hdr = hdr; + new_idx += keep_hdr->bth_count + 1; + } else { + ASSERT3P(r_hdr, !=, NULL); + keep_hdr = hdr; + new_rm_hdr = r_hdr; + parent_idx++; + } + + ASSERT(keep_hdr->bth_core); + ASSERT(new_rm_hdr->bth_core); + + zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; + zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, keep_hdr, + keep_hdr->bth_count + i); + } + } + + /* Move the separator into the left node. */ + uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, e_out, size); + keep_hdr->bth_count++; + + /* Move all our elements and children into the left node. */ + bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, + keep_hdr->bth_count, BSS_TRAPEZOID); + + uint64_t old_count = keep_hdr->bth_count; + + /* Update bookkeeping */ + keep_hdr->bth_count += new_rm_hdr->bth_count; + ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); + + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. 
+ */ + ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); + bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, + BSS_PARALLELOGRAM); + keep_hdr->bth_count--; + + /* Reparent all our children to point to the left node. */ + zfs_btree_hdr_t **new_start = keep->btc_children + + old_count - 1; + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + new_start[i]->bth_parent = keep; + for (int i = 0; i <= keep_hdr->bth_count; i++) { + ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); + ASSERT3P(keep->btc_children[i], !=, rm_hdr); + } + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + + new_rm_hdr->bth_count = 0; + zfs_btree_node_destroy(tree, new_rm_hdr); + zfs_btree_remove_from_node(tree, parent, new_rm_hdr); +} + +/* Remove the element at the specific location. */ +void +zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) +{ + size_t size = tree->bt_elem_size; + zfs_btree_hdr_t *hdr = where->bti_node; + uint64_t idx = where->bti_offset; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + ASSERT(!where->bti_before); + if (tree->bt_bulk != NULL) { + /* + * Leave bulk insert mode. Note that our index would be + * invalid after we correct the tree, so we copy the value + * we're planning to remove and find it again after + * bulk_finish. + */ + uint8_t *value = zfs_btree_get(tree, where); + uint8_t *tmp = kmem_alloc(size, KM_SLEEP); + bmov(value, tmp, size); + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); + kmem_free(tmp, size); + hdr = where->bti_node; + idx = where->bti_offset; + } + + tree->bt_num_elems--; + /* + * If the element happens to be in a core node, we move a leaf node's + * element into its place and then remove the leaf node element. This + * makes the rebalance logic not need to be recursive both upwards and + * downwards. 
+ */ + if (hdr->bth_core) { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; + void *new_value = zfs_btree_last_helper(tree, left_subtree, + where); + ASSERT3P(new_value, !=, NULL); + + bmov(new_value, node->btc_elems + idx * size, size); + + hdr = where->bti_node; + idx = where->bti_offset; + ASSERT(!where->bti_before); + } + + /* + * First, we'll update the leaf's metadata. Then, we shift any + * elements after the idx to the left. After that, we rebalance if + * needed. + */ + ASSERT(!hdr->bth_core); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + ASSERT3U(hdr->bth_count, >, 0); + + uint64_t min_count = (capacity / 2) - 1; + + /* + * If we're over the minimum size or this is the root, just overwrite + * the value and return. + */ + if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { + hdr->bth_count--; + bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); + if (hdr->bth_parent == NULL) { + ASSERT0(tree->bt_height); + if (hdr->bth_count == 0) { + tree->bt_root = NULL; + tree->bt_height--; + zfs_btree_node_destroy(tree, &leaf->btl_hdr); + } + } + if (tree->bt_root != NULL) + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); + zfs_btree_verify(tree); + return; + } + ASSERT3U(hdr->bth_count, ==, min_count); + + /* + * Now we try to take a node from a sibling. We check left, then + * right. If they exist and have more than the minimum number of + * elements, we move the separator between us and them to our node + * and move their closest element (last for left, first for right) to + * the separator. Along the way we need to collapse the gap made by + * idx, and (for our right neighbor) the gap made by removing their + * first element. + * + * Note: this logic currently doesn't support taking from a neighbor + * that isn't a sibling. This isn't critical functionality, but may be + * worth implementing in the future for completeness' sake. 
+ */ + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + + zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + parent->btc_children[parent_idx - 1]); + if (l_hdr != NULL && l_hdr->bth_count > min_count) { + /* We can take a node from the left neighbor. */ + ASSERT(!l_hdr->bth_core); + + /* + * Move our elements back by one spot to make room for the + * stolen element and overwrite the element being removed. + */ + bt_shift_leaf_right(tree, leaf, 0, idx); + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems + + (l_hdr->bth_count - 1) * size; + /* Move the separator to our first spot. */ + bmov(separator, leaf->btl_elems, size); + + /* Move our neighbor's last element to the separator. */ + bmov(take_elem, separator, size); + + /* Update the bookkeeping. */ + l_hdr->bth_count--; + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + + zfs_btree_verify(tree); + return; + } + + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can take a node from the right neighbor. */ + ASSERT(!r_hdr->bth_core); + zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; + + /* + * Move our elements after the element being removed forwards + * by one spot to make room for the stolen element and + * overwrite the element being removed. + */ + bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx - + 1); + + uint8_t *separator = parent->btc_elems + parent_idx * size; + uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; + /* Move the separator between us to our last spot. */ + bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, + size); + + /* Move our neighbor's first element to the separator. */ + bmov(take_elem, separator, size); + + /* Update the bookkeeping. 
*/ + r_hdr->bth_count--; + + /* + * Move our neighbors elements forwards to overwrite the + * stolen element. + */ + bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + zfs_btree_verify(tree); + return; + } + + /* + * In this case, neither of our neighbors can spare an element, so we + * need to merge with one of them. We prefer the left one, + * arbitrarily. Move the separator into the leftmost merging node + * (which may be us or the left neighbor), and then move the right + * merging node's elements. Once that's done, we go back and delete + * the element we're removing. Finally, go into the parent and delete + * the right merging node and the separator. This may cause further + * merging. + */ + zfs_btree_hdr_t *rm_hdr, *keep_hdr; + uint64_t new_idx = idx; + if (l_hdr != NULL) { + keep_hdr = l_hdr; + rm_hdr = hdr; + new_idx += keep_hdr->bth_count + 1; // 449 + } else { + ASSERT3P(r_hdr, !=, NULL); + keep_hdr = hdr; + rm_hdr = r_hdr; + parent_idx++; + } + + ASSERT(!keep_hdr->bth_core); + ASSERT(!rm_hdr->bth_core); + ASSERT3U(keep_hdr->bth_count, ==, min_count); + ASSERT3U(rm_hdr->bth_count, ==, min_count); + + zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr; + zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, keep_hdr, + keep_hdr->bth_count + i); + } + } + /* + * Move the separator into the first open spot in the left + * neighbor. + */ + uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, out, size); + keep_hdr->bth_count++; + + /* Move our elements to the left neighbor. */ + bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, + keep_hdr->bth_count); + + /* Update the bookkeeping. 
*/ + keep_hdr->bth_count += rm_hdr->bth_count; + ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1); + + /* Remove the value from the node */ + keep_hdr->bth_count--; + bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count - + new_idx); + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + + rm_hdr->bth_count = 0; + zfs_btree_node_destroy(tree, rm_hdr); + /* Remove the emptied node from the parent. */ + zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_verify(tree); +} + +/* Remove the given value from the tree. */ +void +zfs_btree_remove(zfs_btree_t *tree, const void *value) +{ + zfs_btree_index_t where = {0}; + VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); + zfs_btree_remove_idx(tree, &where); +} + +/* Return the number of elements in the tree. */ +ulong_t +zfs_btree_numnodes(zfs_btree_t *tree) +{ + return (tree->bt_num_elems); +} + +/* + * This function is used to visit all the elements in the tree before + * destroying the tree. This allows the calling code to perform any cleanup it + * needs to do. This is more efficient than just removing the first element + * over and over, because it removes all rebalancing. Once the destroy_nodes() + * function has been called, no other btree operations are valid until it + * returns NULL, at which point the only valid operation is zfs_btree_destroy().
+ * + * example: + * + * zfs_btree_index_t *cookie = NULL; + * my_data_t *node; + * + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) + * free(node->ptr); + * zfs_btree_destroy(tree); + * + */ +void * +zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) +{ + if (*cookie == NULL) { + if (tree->bt_height == -1) + return (NULL); + *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); + return (zfs_btree_first(tree, *cookie)); + } + + void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, + zfs_btree_node_destroy); + if (rval == NULL) { + tree->bt_root = NULL; + tree->bt_height = -1; + tree->bt_num_elems = 0; + kmem_free(*cookie, sizeof (**cookie)); + tree->bt_bulk = NULL; + } + return (rval); +} + +static void +zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (hdr->bth_core) { + zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_clear_helper(tree, btc->btc_children[i]); + } + } + + zfs_btree_node_destroy(tree, hdr); +} + +void +zfs_btree_clear(zfs_btree_t *tree) +{ + if (tree->bt_root == NULL) { + ASSERT0(tree->bt_num_elems); + return; + } + + zfs_btree_clear_helper(tree, tree->bt_root); + tree->bt_num_elems = 0; + tree->bt_root = NULL; + tree->bt_num_nodes = 0; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +void +zfs_btree_destroy(zfs_btree_t *tree) +{ + ASSERT0(tree->bt_num_elems); + ASSERT3P(tree->bt_root, ==, NULL); +} + +/* Verify that every child of this node has the correct parent pointer. */ +static void +zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) + return; + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); + zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); + } +} + +/* Verify that every node has the correct parent pointer. 
*/ +static void +zfs_btree_verify_pointers(zfs_btree_t *tree) +{ + if (tree->bt_height == -1) { + VERIFY3P(tree->bt_root, ==, NULL); + return; + } + VERIFY3P(tree->bt_root->bth_parent, ==, NULL); + zfs_btree_verify_pointers_helper(tree, tree->bt_root); +} + +/* + * Verify that the current node and all its children satisfy the count + * invariants, and return the total count in the subtree rooted in this node. + */ +static uint64_t +zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) { + if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); + VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); + } + + return (hdr->bth_count); + } else { + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint64_t ret = hdr->bth_count; + if (tree->bt_root != hdr && tree->bt_bulk == NULL) + VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); + for (int i = 0; i <= hdr->bth_count; i++) { + ret += zfs_btree_verify_counts_helper(tree, + node->btc_children[i]); + } + + return (ret); + } +} + +/* + * Verify that all nodes satisfy the invariants and that the total number of + * elements is correct. + */ +static void +zfs_btree_verify_counts(zfs_btree_t *tree) +{ + EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); + if (tree->bt_height == -1) { + return; + } + VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, + tree->bt_num_elems); +} + +/* + * Check that the subtree rooted at this node has a uniform height. Returns + * the number of nodes under this node, to help verify bt_num_nodes. 
+ */ +static uint64_t +zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + int64_t height) +{ + if (!hdr->bth_core) { + VERIFY0(height); + return (1); + } + + VERIFY(hdr->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint64_t ret = 1; + for (int i = 0; i <= hdr->bth_count; i++) { + ret += zfs_btree_verify_height_helper(tree, + node->btc_children[i], height - 1); + } + return (ret); +} + +/* + * Check that the tree rooted at this node has a uniform height, and that the + * bt_height in the tree is correct. + */ +static void +zfs_btree_verify_height(zfs_btree_t *tree) +{ + EQUIV(tree->bt_height == -1, tree->bt_root == NULL); + if (tree->bt_height == -1) { + return; + } + + VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, + tree->bt_height), ==, tree->bt_num_nodes); +} + +/* + * Check that the elements in this node are sorted, and that if this is a core + * node, the separators are properly between the subtrees they separate and + * that the children also satisfy this requirement. 
+ */ +static void +zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + for (int i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) * + size, leaf->btl_elems + i * size), ==, -1); + } + return; + } + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, + node->btc_elems + i * size), ==, -1); + } + for (int i = 0; i < hdr->bth_count; i++) { + uint8_t *left_child_last = NULL; + zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; + if (left_child_hdr->bth_core) { + zfs_btree_core_t *left_child = + (zfs_btree_core_t *)left_child_hdr; + left_child_last = left_child->btc_elems + + (left_child_hdr->bth_count - 1) * size; + } else { + zfs_btree_leaf_t *left_child = + (zfs_btree_leaf_t *)left_child_hdr; + left_child_last = left_child->btl_elems + + (left_child_hdr->bth_count - 1) * size; + } + if (tree->bt_compar(node->btc_elems + i * size, + left_child_last) != 1) { + panic("btree: compar returned %d (expected 1) at " + "%px %d: compar(%px, %px)", tree->bt_compar( + node->btc_elems + i * size, left_child_last), + (void *)node, i, (void *)(node->btc_elems + i * + size), (void *)left_child_last); + } + + uint8_t *right_child_first = NULL; + zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; + if (right_child_hdr->bth_core) { + zfs_btree_core_t *right_child = + (zfs_btree_core_t *)right_child_hdr; + right_child_first = right_child->btc_elems; + } else { + zfs_btree_leaf_t *right_child = + (zfs_btree_leaf_t *)right_child_hdr; + right_child_first = right_child->btl_elems; + } + if (tree->bt_compar(node->btc_elems + i * size, + right_child_first) != -1) { + panic("btree: compar returned %d (expected -1) at " + "%px %d: compar(%px, %px)", tree->bt_compar( + node->btc_elems + i * size, 
right_child_first), + (void *)node, i, (void *)(node->btc_elems + i * + size), (void *)right_child_first); + } + } + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_verify_order_helper(tree, node->btc_children[i]); + } +} + +/* Check that all elements in the tree are in sorted order. */ +static void +zfs_btree_verify_order(zfs_btree_t *tree) +{ + EQUIV(tree->bt_height == -1, tree->bt_root == NULL); + if (tree->bt_height == -1) { + return; + } + + zfs_btree_verify_order_helper(tree, tree->bt_root); +} + +#ifdef ZFS_DEBUG +/* Check that all unused memory is poisoned correctly. */ +static void +zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t); i++) { + VERIFY3U(leaf->btl_elems[i], ==, val); + } + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; + i++) { + VERIFY3U(node->btc_elems[i], ==, val); + } + + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + VERIFY3P(node->btc_children[i], ==, + (zfs_btree_hdr_t *)BTREE_POISON); + } + + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_verify_poison_helper(tree, + node->btc_children[i]); + } + } +} +#endif + +/* Check that unused memory in the tree is still poisoned. 
*/ +static void +zfs_btree_verify_poison(zfs_btree_t *tree) +{ +#ifdef ZFS_DEBUG + if (tree->bt_height == -1) + return; + zfs_btree_verify_poison_helper(tree, tree->bt_root); +#endif +} + +void +zfs_btree_verify(zfs_btree_t *tree) +{ + if (zfs_btree_verify_intensity == 0) + return; + zfs_btree_verify_height(tree); + if (zfs_btree_verify_intensity == 1) + return; + zfs_btree_verify_pointers(tree); + if (zfs_btree_verify_intensity == 2) + return; + zfs_btree_verify_counts(tree); + if (zfs_btree_verify_intensity == 3) + return; + zfs_btree_verify_order(tree); + + if (zfs_btree_verify_intensity == 4) + return; + zfs_btree_verify_poison(tree); +} diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 522825c42c..3fbb24ddef 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -50,17 +50,17 @@ dataset_kstats_update(kstat_t *ksp, int rw) dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; dkv->dkv_writes.value.ui64 = - aggsum_value(&dk->dk_aggsums.das_writes); + wmsum_value(&dk->dk_sums.dss_writes); dkv->dkv_nwritten.value.ui64 = - aggsum_value(&dk->dk_aggsums.das_nwritten); + wmsum_value(&dk->dk_sums.dss_nwritten); dkv->dkv_reads.value.ui64 = - aggsum_value(&dk->dk_aggsums.das_reads); + wmsum_value(&dk->dk_sums.dss_reads); dkv->dkv_nread.value.ui64 = - aggsum_value(&dk->dk_aggsums.das_nread); + wmsum_value(&dk->dk_sums.dss_nread); dkv->dkv_nunlinks.value.ui64 = - aggsum_value(&dk->dk_aggsums.das_nunlinks); + wmsum_value(&dk->dk_sums.dss_nunlinks); dkv->dkv_nunlinked.value.ui64 = - aggsum_value(&dk->dk_aggsums.das_nunlinked); + wmsum_value(&dk->dk_sums.dss_nunlinked); return (0); } @@ -135,16 +135,17 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) kstat->ks_data = dk_kstats; kstat->ks_update = dataset_kstats_update; kstat->ks_private = dk; + kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN; kstat_install(kstat); dk->dk_kstats = kstat; - aggsum_init(&dk->dk_aggsums.das_writes, 0); - 
aggsum_init(&dk->dk_aggsums.das_nwritten, 0); - aggsum_init(&dk->dk_aggsums.das_reads, 0); - aggsum_init(&dk->dk_aggsums.das_nread, 0); - aggsum_init(&dk->dk_aggsums.das_nunlinks, 0); - aggsum_init(&dk->dk_aggsums.das_nunlinked, 0); + wmsum_init(&dk->dk_sums.dss_writes, 0); + wmsum_init(&dk->dk_sums.dss_nwritten, 0); + wmsum_init(&dk->dk_sums.dss_reads, 0); + wmsum_init(&dk->dk_sums.dss_nread, 0); + wmsum_init(&dk->dk_sums.dss_nunlinks, 0); + wmsum_init(&dk->dk_sums.dss_nunlinked, 0); } void @@ -161,12 +162,12 @@ dataset_kstats_destroy(dataset_kstats_t *dk) kstat_delete(dk->dk_kstats); dk->dk_kstats = NULL; - aggsum_fini(&dk->dk_aggsums.das_writes); - aggsum_fini(&dk->dk_aggsums.das_nwritten); - aggsum_fini(&dk->dk_aggsums.das_reads); - aggsum_fini(&dk->dk_aggsums.das_nread); - aggsum_fini(&dk->dk_aggsums.das_nunlinks); - aggsum_fini(&dk->dk_aggsums.das_nunlinked); + wmsum_fini(&dk->dk_sums.dss_writes); + wmsum_fini(&dk->dk_sums.dss_nwritten); + wmsum_fini(&dk->dk_sums.dss_reads); + wmsum_fini(&dk->dk_sums.dss_nread); + wmsum_fini(&dk->dk_sums.dss_nunlinks); + wmsum_fini(&dk->dk_sums.dss_nunlinked); } void @@ -178,8 +179,8 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk, if (dk->dk_kstats == NULL) return; - aggsum_add(&dk->dk_aggsums.das_writes, 1); - aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten); + wmsum_add(&dk->dk_sums.dss_writes, 1); + wmsum_add(&dk->dk_sums.dss_nwritten, nwritten); } void @@ -191,8 +192,8 @@ dataset_kstats_update_read_kstats(dataset_kstats_t *dk, if (dk->dk_kstats == NULL) return; - aggsum_add(&dk->dk_aggsums.das_reads, 1); - aggsum_add(&dk->dk_aggsums.das_nread, nread); + wmsum_add(&dk->dk_sums.dss_reads, 1); + wmsum_add(&dk->dk_sums.dss_nread, nread); } void @@ -201,7 +202,7 @@ dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta) if (dk->dk_kstats == NULL) return; - aggsum_add(&dk->dk_aggsums.das_nunlinks, delta); + wmsum_add(&dk->dk_sums.dss_nunlinks, delta); } void @@ -210,5 +211,5 @@ 
dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta) if (dk->dk_kstats == NULL) return; - aggsum_add(&dk->dk_aggsums.das_nunlinked, delta); + wmsum_add(&dk->dk_sums.dss_nunlinked, delta); } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 07e616f6f0..289247c6ed 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -21,9 +21,11 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #include @@ -44,12 +46,13 @@ #include #include #include -#include +#include #include #include #include -#include +#include #include +#include kstat_t *dbuf_ksp; @@ -133,8 +136,22 @@ dbuf_stats_t dbuf_stats = { { "metadata_cache_overflow", KSTAT_DATA_UINT64 } }; +struct { + wmsum_t cache_count; + wmsum_t cache_total_evicts; + wmsum_t cache_levels[DN_MAX_LEVELS]; + wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; + wmsum_t hash_hits; + wmsum_t hash_misses; + wmsum_t hash_collisions; + wmsum_t hash_chains; + wmsum_t hash_insert_race; + wmsum_t metadata_cache_count; + wmsum_t metadata_cache_overflow; +} dbuf_sums; + #define DBUF_STAT_INCR(stat, val) \ - atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); + wmsum_add(&dbuf_sums.stat, val); #define DBUF_STAT_DECR(stat, val) \ DBUF_STAT_INCR(stat, -(val)); #define DBUF_STAT_BUMP(stat) \ @@ -148,31 +165,10 @@ dbuf_stats_t dbuf_stats = { continue; \ } -typedef struct dbuf_hold_arg { - /* Function arguments */ - dnode_t *dh_dn; - uint8_t dh_level; - uint64_t dh_blkid; - boolean_t dh_fail_sparse; - boolean_t dh_fail_uncached; - void *dh_tag; - dmu_buf_impl_t **dh_dbp; - /* Local variables */ - dmu_buf_impl_t 
*dh_db; - dmu_buf_impl_t *dh_parent; - blkptr_t *dh_bp; - int dh_err; - dbuf_dirty_record_t *dh_dr; -} dbuf_hold_arg_t; - -static dbuf_hold_arg_t *dbuf_hold_arg_create(dnode_t *dn, uint8_t level, - uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp); -static int dbuf_hold_impl_arg(dbuf_hold_arg_t *dh); -static void dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh); - static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); +static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, @@ -222,18 +218,22 @@ static boolean_t dbuf_evict_thread_exit; * by those caches' matching enum values (from dbuf_cached_state_t). */ typedef struct dbuf_cache { - multilist_t *cache; - zfs_refcount_t size; + multilist_t cache; + zfs_refcount_t size ____cacheline_aligned; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ -unsigned long dbuf_cache_max_bytes = 0; -unsigned long dbuf_metadata_cache_max_bytes = 0; +unsigned long dbuf_cache_max_bytes = ULONG_MAX; +unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX; + /* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; int dbuf_metadata_cache_shift = 6; +static unsigned long dbuf_cache_target_bytes(void); +static unsigned long dbuf_metadata_cache_target_bytes(void); + /* * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread @@ -287,6 +287,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag) bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 
multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -300,6 +301,7 @@ dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); + rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); @@ -310,8 +312,6 @@ dbuf_dest(void *vdb, void *unused) */ static dbuf_hash_table_t dbuf_hash_table; -static uint64_t dbuf_hash_count; - /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. @@ -322,6 +322,10 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); } +#define DTRACE_SET_STATE(db, why) \ + DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \ + const char *, why) + #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ @@ -418,8 +422,8 @@ dbuf_hash_insert(dmu_buf_impl_t *db) db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_inc_64(&dbuf_hash_count); - DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); + uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); + DBUF_STAT_MAX(hash_elements_max, he); return (NULL); } @@ -447,7 +451,7 @@ dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) */ if (zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > - dbuf_metadata_cache_max_bytes) { + dbuf_metadata_cache_target_bytes()) { DBUF_STAT_BUMP(metadata_cache_overflow); return (B_FALSE); } @@ -492,7 +496,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_dec_64(&dbuf_hash_count); + atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } typedef enum { @@ -603,7 +607,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) * distributed between all sublists and 
uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ -unsigned int +static unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; @@ -618,18 +622,34 @@ dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. + * would not be evenly distributed. In this context full 64bit + * division would be a waste of time, so limit it to 32 bits. */ - return (dbuf_hash(db->db_objset, db->db.db_object, + return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } +/* + * The target size of the dbuf cache can grow with the ARC target, + * unless limited by the tunable dbuf_cache_max_bytes. + */ static inline unsigned long dbuf_cache_target_bytes(void) { - return MIN(dbuf_cache_max_bytes, - arc_target_bytes() >> dbuf_cache_shift); + return (MIN(dbuf_cache_max_bytes, + arc_target_bytes() >> dbuf_cache_shift)); +} + +/* + * The target size of the dbuf metadata cache can grow with the ARC target, + * unless limited by the tunable dbuf_metadata_cache_max_bytes. 
+ */ +static inline unsigned long +dbuf_metadata_cache_target_bytes(void) +{ + return (MIN(dbuf_metadata_cache_max_bytes, + arc_target_bytes() >> dbuf_metadata_cache_shift)); } static inline uint64_t @@ -648,13 +668,6 @@ dbuf_cache_lowater_bytes(void) (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); } -static inline boolean_t -dbuf_cache_above_hiwater(void) -{ - return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > - dbuf_cache_hiwater_bytes()); -} - static inline boolean_t dbuf_cache_above_lowater(void) { @@ -668,9 +681,9 @@ dbuf_cache_above_lowater(void) static void dbuf_evict_one(void) { - int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); + int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( - dbuf_caches[DB_DBUF_CACHE].cache, idx); + &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); @@ -694,8 +707,6 @@ dbuf_evict_one(void) ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); - DBUF_STAT_MAX(cache_size_bytes_max, - zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size)); DBUF_STAT_BUMP(cache_total_evicts); } else { multilist_sublist_unlock(mls); @@ -721,7 +732,7 @@ dbuf_evict_thread(void *unused) while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig_hires(&dbuf_evict_cv, + (void) cv_timedwait_idle_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } @@ -751,16 +762,15 @@ dbuf_evict_thread(void *unused) * dbuf cache using the callers context. */ static void -dbuf_evict_notify(void) +dbuf_evict_notify(uint64_t size) { /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. 
*/ - if (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > - dbuf_cache_target_bytes()) { - if (dbuf_cache_above_hiwater()) + if (size > dbuf_cache_target_bytes()) { + if (size > dbuf_cache_hiwater_bytes()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); } @@ -771,19 +781,40 @@ dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; - if (rw == KSTAT_WRITE) { + if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); - } else { - ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( - &dbuf_caches[DB_DBUF_METADATA_CACHE].size); - ds->cache_size_bytes.value.ui64 = - zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); - ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); - ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); - ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); - ds->hash_elements.value.ui64 = dbuf_hash_count; - } + ds->cache_count.value.ui64 = + wmsum_value(&dbuf_sums.cache_count); + ds->cache_size_bytes.value.ui64 = + zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); + ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); + ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); + ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); + ds->cache_total_evicts.value.ui64 = + wmsum_value(&dbuf_sums.cache_total_evicts); + for (int i = 0; i < DN_MAX_LEVELS; i++) { + ds->cache_levels[i].value.ui64 = + wmsum_value(&dbuf_sums.cache_levels[i]); + ds->cache_levels_bytes[i].value.ui64 = + wmsum_value(&dbuf_sums.cache_levels_bytes[i]); + } + ds->hash_hits.value.ui64 = + wmsum_value(&dbuf_sums.hash_hits); + ds->hash_misses.value.ui64 = + wmsum_value(&dbuf_sums.hash_misses); + ds->hash_collisions.value.ui64 = + wmsum_value(&dbuf_sums.hash_collisions); + ds->hash_chains.value.ui64 = + wmsum_value(&dbuf_sums.hash_chains); + ds->hash_insert_race.value.ui64 = + wmsum_value(&dbuf_sums.hash_insert_race); + ds->metadata_cache_count.value.ui64 = + 
wmsum_value(&dbuf_sums.metadata_cache_count); + ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( + &dbuf_caches[DB_DBUF_METADATA_CACHE].size); + ds->metadata_cache_overflow.value.ui64 = + wmsum_value(&dbuf_sums.metadata_cache_overflow); return (0); } @@ -795,12 +826,12 @@ dbuf_init(void) int i; /* - * The hash table is big enough to fill all of physical memory + * The hash table is big enough to fill one eighth of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ - while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) + while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; retry: @@ -830,23 +861,6 @@ retry: dbuf_stats_init(h); - /* - * Setup the parameters for the dbuf caches. We set the sizes of the - * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) - * of the target size of the ARC. If the values has been specified as - * a module option and they're not greater than the target size of the - * ARC, then we honor that value. - */ - if (dbuf_cache_max_bytes == 0 || - dbuf_cache_max_bytes >= arc_target_bytes()) { - dbuf_cache_max_bytes = arc_target_bytes() >> dbuf_cache_shift; - } - if (dbuf_metadata_cache_max_bytes == 0 || - dbuf_metadata_cache_max_bytes >= arc_target_bytes()) { - dbuf_metadata_cache_max_bytes = - arc_target_bytes() >> dbuf_metadata_cache_shift; - } - /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. 
@@ -854,8 +868,8 @@ retry: dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - dbuf_caches[dcs].cache = - multilist_create(sizeof (dmu_buf_impl_t), + multilist_create(&dbuf_caches[dcs].cache, + sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); zfs_refcount_create(&dbuf_caches[dcs].size); @@ -867,14 +881,24 @@ retry: dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); + wmsum_init(&dbuf_sums.cache_count, 0); + wmsum_init(&dbuf_sums.cache_total_evicts, 0); + for (i = 0; i < DN_MAX_LEVELS; i++) { + wmsum_init(&dbuf_sums.cache_levels[i], 0); + wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); + } + wmsum_init(&dbuf_sums.hash_hits, 0); + wmsum_init(&dbuf_sums.hash_misses, 0); + wmsum_init(&dbuf_sums.hash_collisions, 0); + wmsum_init(&dbuf_sums.hash_chains, 0); + wmsum_init(&dbuf_sums.hash_insert_race, 0); + wmsum_init(&dbuf_sums.metadata_cache_count, 0); + wmsum_init(&dbuf_sums.metadata_cache_overflow, 0); + dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { - dbuf_ksp->ks_data = &dbuf_stats; - dbuf_ksp->ks_update = dbuf_kstat_update; - kstat_install(dbuf_ksp); - for (i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); @@ -885,6 +909,9 @@ retry: dbuf_stats.cache_levels_bytes[i].data_type = KSTAT_DATA_UINT64; } + dbuf_ksp->ks_data = &dbuf_stats; + dbuf_ksp->ks_update = dbuf_kstat_update; + kstat_install(dbuf_ksp); } } @@ -923,13 +950,27 @@ dbuf_fini(void) for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { zfs_refcount_destroy(&dbuf_caches[dcs].size); - multilist_destroy(dbuf_caches[dcs].cache); + multilist_destroy(&dbuf_caches[dcs].cache); } if (dbuf_ksp != NULL) { kstat_delete(dbuf_ksp); dbuf_ksp = NULL; } + 
+ wmsum_fini(&dbuf_sums.cache_count); + wmsum_fini(&dbuf_sums.cache_total_evicts); + for (i = 0; i < DN_MAX_LEVELS; i++) { + wmsum_fini(&dbuf_sums.cache_levels[i]); + wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); + } + wmsum_fini(&dbuf_sums.hash_hits); + wmsum_fini(&dbuf_sums.hash_misses); + wmsum_fini(&dbuf_sums.hash_collisions); + wmsum_fini(&dbuf_sums.hash_chains); + wmsum_fini(&dbuf_sums.hash_insert_race); + wmsum_fini(&dbuf_sums.metadata_cache_count); + wmsum_fini(&dbuf_sums.metadata_cache_overflow); } /* @@ -942,6 +983,7 @@ dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; + uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -973,11 +1015,16 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } - for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + if ((dr = list_head(&db->db_dirty_records)) != NULL) { ASSERT(dr->dr_dbuf == db); + txg_prev = dr->dr_txg; + for (dr = list_next(&db->db_dirty_records, dr); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { + ASSERT(dr->dr_dbuf == db); + ASSERT(txg_prev > dr->dr_txg); + txg_prev = dr->dr_txg; + } + } /* * We can't assert that db_size matches dn_datablksz because it @@ -1007,17 +1054,17 @@ dbuf_verify(dmu_buf_impl_t *db) &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ - ASSERTV(int epb = db->db_parent->db.db_size >> - SPA_BLKPTRSHIFT); + int epb __maybe_unused = db->db_parent->db.db_size >> + SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer + * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? 
*/ - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -1089,8 +1136,10 @@ dbuf_clear_data(dmu_buf_impl_t *db) dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) + if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "clear data"); + } } static void @@ -1104,6 +1153,14 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) db->db.db_data = buf->b_data; } +static arc_buf_t * +dbuf_alloc_arcbuf(dmu_buf_impl_t *db) +{ + spa_t *spa = db->db_objset->os_spa; + + return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); +} + /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ @@ -1177,6 +1234,44 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) } } +/* + * This function is used to lock the parent of the provided dbuf. This should be + * used when modifying or reading db_blkptr. + */ +db_lock_type_t +dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) +{ + enum db_lock_type ret = DLT_NONE; + if (db->db_parent != NULL) { + rw_enter(&db->db_parent->db_rwlock, rw); + ret = DLT_PARENT; + } else if (dmu_objset_ds(db->db_objset) != NULL) { + rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, + tag); + ret = DLT_OBJSET; + } + /* + * We only return a DLT_NONE lock when it's the top-most indirect block + * of the meta-dnode of the MOS. + */ + return (ret); +} + +/* + * We need to pass the lock type in because it's possible that the block will + * move from being the topmost indirect block in a dnode (and thus, have no + * parent) to not the top-most via an indirection increase. This would cause a + * panic if we didn't pass the lock type in. 
+ */ +void +dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) +{ + if (type == DLT_PARENT) + rw_exit(&db->db_parent->db_rwlock); + else if (type == DLT_OBJSET) + rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); +} + static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) @@ -1197,6 +1292,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "i/o error"); } else if (db->db_level == 0 && db->db_freed_in_flight) { /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); @@ -1206,16 +1302,104 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "freed in flight"); } else { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "successful read"); } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL, B_FALSE); } +/* + * Shortcut for performing reads on bonus dbufs. Returns + * an error if we fail to verify the dnode associated with + * a decrypted block. Otherwise success. 
+ */ +static int +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +{ + int bonuslen, max_bonuslen, err; + + err = dbuf_read_verify_dnode_crypt(db, flags); + if (err) + return (err); + + bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); + max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(DB_DNODE_HELD(db)); + ASSERT3U(bonuslen, <=, db->db.db_size); + db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); + arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); + if (bonuslen < max_bonuslen) + bzero(db->db.db_data, max_bonuslen); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "bonus buffer filled"); + return (0); +} + +static void +dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) +{ + blkptr_t *bps = db->db.db_data; + uint32_t indbs = 1ULL << dn->dn_indblkshift; + int n_bps = indbs >> SPA_BLKPTRSHIFT; + + for (int i = 0; i < n_bps; i++) { + blkptr_t *bp = &bps[i]; + + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs); + BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? + dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); + BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); + BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); + BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + } +} + +/* + * Handle reads on dbufs that are holes, if necessary. This function + * requires that the dbuf's mutex is held. Returns success (0) if action + * was taken, ENOENT if no action was taken. + */ +static int +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); + /* + * For level 0 blocks only, if the above check fails: + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). 
+ */ + if (!is_hole && db->db_level == 0) { + is_hole = dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr); + } + + if (is_hole) { + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); + bzero(db->db.db_data, db->db.db_size); + + if (db->db_blkptr != NULL && db->db_level > 0 && + BP_IS_HOLE(db->db_blkptr) && + db->db_blkptr->blk_birth != 0) { + dbuf_handle_indirect_hole(db, dn); + } + db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "hole read satisfied"); + return (0); + } + return (ENOENT); +} /* * This function ensures that, when doing a decrypting read of a block, @@ -1273,93 +1457,51 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) return (err); } +/* + * Drops db_mtx and the parent lock specified by dblt and tag before + * returning. + */ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, + db_lock_type_t dblt, void *tag) { dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; - int err, zio_flags = 0; + int err, zio_flags; + err = zio_flags = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. */ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); + ASSERT(db->db_parent == NULL || + RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { - /* - * The bonus length stored in the dnode may be less than - * the maximum available space in the bonus buffer. 
- */ - int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); - int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - - /* if the underlying dnode block is encrypted, decrypt it */ - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err != 0) { - DB_DNODE_EXIT(db); - mutex_exit(&db->db_mtx); - return (err); - } - - ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); - arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); - if (bonuslen < max_bonuslen) - bzero(db->db.db_data, max_bonuslen); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return (0); + err = dbuf_read_bonus(db, dn, flags); + goto early_unlock; } + err = dbuf_read_hole(db, dn, flags); + if (err == 0) + goto early_unlock; + /* - * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() - * processes the delete record and clears the bp while we are waiting - * for the dn_mtx (resulting in a "no" from block_freed). + * Any attempt to read a redacted block should result in an error. This + * will never happen under normal conditions, but can be useful for + * debugging purposes. */ - if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || - (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr)))) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, - db->db.db_size)); - bzero(db->db.db_data, db->db.db_size); - - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { - blkptr_t *bps = db->db.db_data; - for (int i = 0; i < ((1 << - DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); - i++) { - blkptr_t *bp = &bps[i]; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - 1 << dn->dn_indblkshift); - BP_SET_LSIZE(bp, - BP_GET_LEVEL(db->db_blkptr) == 1 ? 
- dn->dn_datablksz : - BP_GET_LSIZE(db->db_blkptr)); - BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); - BP_SET_LEVEL(bp, - BP_GET_LEVEL(db->db_blkptr) - 1); - BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); - } - } - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return (0); + if (BP_IS_REDACTED(db->db_blkptr)) { + ASSERT(dsl_dataset_feature_is_active( + db->db_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + err = SET_ERROR(EIO); + goto early_unlock; } - SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); @@ -1371,21 +1513,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) spa_log_error(db->db_objset->os_spa, &zb); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); - DB_DNODE_EXIT(db); - mutex_exit(&db->db_mtx); - return (SET_ERROR(EIO)); + err = SET_ERROR(EIO); + goto early_unlock; } err = dbuf_read_verify_dnode_crypt(db, flags); - if (err != 0) { - DB_DNODE_EXIT(db); - mutex_exit(&db->db_mtx); - return (err); - } + if (err != 0) + goto early_unlock; DB_DNODE_EXIT(db); db->db_state = DB_READ; + DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) @@ -1398,11 +1537,23 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; - - err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, + /* + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on + * an l1 cache hit) we don't acquire the db_mtx while holding the + * parent's rwlock, which would be a lock ordering violation. 
+ */ + blkptr_t bp = *db->db_blkptr; + dmu_buf_unlock_parent(db, dblt, tag); + (void) arc_read(zio, db->db_objset->os_spa, &bp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - + return (err); +early_unlock: + DB_DNODE_EXIT(db); + mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (err); } @@ -1422,7 +1573,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { - dbuf_dirty_record_t *dr = db->db_last_dirty; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); @@ -1455,6 +1606,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); + uint8_t complevel = arc_get_complevel(db->db_buf); if (arc_is_encrypted(db->db_buf)) { boolean_t byteorder; @@ -1467,11 +1619,12 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, size, arc_buf_lsize(db->db_buf), - compress_type); + compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, - size, arc_buf_lsize(db->db_buf), compress_type); + size, arc_buf_lsize(db->db_buf), compress_type, + complevel); } else { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } @@ -1500,8 +1653,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && @@ -1538,30 +1689,34 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - 
if (err == 0 && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + B_FALSE, flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags); + err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + /* + * dbuf_read_impl has dropped db_mtx and our parent's rwlock + * for us + */ + if (!err && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + db->db_state != DB_CACHED, + flags & DB_RF_HAVESTRUCT); + } - /* dbuf_read_impl has dropped db_mtx for us */ - - if (!err && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); @@ -1586,16 +1741,16 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + B_TRUE, flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* Skip the wait per the caller's request. 
*/ - mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { + mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || @@ -1606,8 +1761,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); } - mutex_exit(&db->db_mtx); } return (err); @@ -1622,13 +1777,11 @@ dbuf_noread(dmu_buf_impl_t *db) while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); db->db_state = DB_FILL; + DTRACE_SET_STATE(db, "assigning filled buffer"); } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { @@ -1691,11 +1844,13 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; + dbuf_dirty_record_t *dr; if (end_blkid > dn->dn_maxblkid && !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); + dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid, + (u_longlong_t)end_blkid); db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); db_search->db_level = 0; @@ -1744,9 +1899,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } /* The dbuf is referenced */ - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - + dr = list_head(&db->db_dirty_records); + if (dr != NULL) { if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file @@ -1771,7 +1925,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, 
uint64_t end_blkid, if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); + rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } @@ -1785,7 +1941,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { - arc_buf_t *buf, *obuf; + arc_buf_t *buf, *old_buf; + dbuf_dirty_record_t *dr; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; @@ -1795,15 +1952,6 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. 
- */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers @@ -1814,21 +1962,25 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ - obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + old_buf = db->db_buf; + bcopy(old_buf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - arc_buf_destroy(obuf, db); + arc_buf_destroy(old_buf, db); db->db.db_size = size; - if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - db->db_last_dirty->dt.dl.dr_data = buf; - } + dr = list_head(&db->db_dirty_records); + /* dirty record added by dmu_buf_will_dirty() */ + VERIFY(dr != NULL); + if (db->db_level == 0) + dr->dt.dl.dr_data = buf; + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + ASSERT3U(dr->dr_accounted, ==, osize); + dr->dr_accounted = size; mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); @@ -1838,7 +1990,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) void dbuf_release_bp(dmu_buf_impl_t *db) { - ASSERTV(objset_t *os = db->db_objset); + objset_t *os __maybe_unused = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || @@ -1874,14 +2026,82 @@ dbuf_redirty(dbuf_dirty_record_t *dr) } } +dbuf_dirty_record_t * +dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid); + dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE); + ASSERT(dn->dn_maxblkid >= blkid); + + dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP); + list_link_init(&dr->dr_dirty_node); + list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; + 
dr->dr_txg = tx->tx_txg; + dr->dt.dll.dr_blkid = blkid; + dr->dr_accounted = dn->dn_datablksz; + + /* + * There should not be any dbuf for the block that we're dirtying. + * Otherwise the buffer contents could be inconsistent between the + * dbuf and the lightweight dirty record. + */ + ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)); + + mutex_enter(&dn->dn_mtx); + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] != NULL) { + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); + } + + if (dn->dn_nlevels == 1) { + ASSERT3U(blkid, <, dn->dn_nblkptr); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); + dnode_setdirty(dn, tx); + } else { + mutex_exit(&dn->dn_mtx); + + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent_db = dbuf_hold_level(dn, + 1, blkid >> epbs, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (parent_db == NULL) { + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + int err = dbuf_read(parent_db, NULL, + (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err != 0) { + dbuf_rele(parent_db, FTAG); + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + + dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx); + dbuf_rele(parent_db, FTAG); + mutex_enter(&parent_dr->dt.di.dr_mtx); + ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg); + list_insert_tail(&parent_dr->dt.di.dr_children, dr); + mutex_exit(&parent_dr->dt.di.dr_mtx); + dr->dr_parent = parent_dr; + } + + dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx); + + return (dr); +} + dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; - dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; + dbuf_dirty_record_t *dr, *dr_next, *dr_head; int txgoff = tx->tx_txg & TXG_MASK; + boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -1894,7 +2114,7 @@ 
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ -#ifdef DEBUG +#ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); @@ -1926,27 +2146,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - RW_READER, FTAG); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? - DN_DIRTY_SYNC : DN_DIRTY_OPEN); - ASSERT(dn->dn_dirtyctx_firstset == NULL); - dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); - } - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - FTAG); - } - } - + dnode_set_dirtyctx(dn, tx, db); if (tx->tx_txg > dn->dn_dirty_txg) dn->dn_dirty_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); @@ -1957,17 +2157,16 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is already dirty, we're done. 
*/ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + dr_head = list_head(&db->db_dirty_records); + ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) - drp = &dr->dr_next; - if (dr && dr->dr_txg == tx->tx_txg) { + dr_next = dbuf_find_dirty_lte(db, tx->tx_txg); + if (dr_next && dr_next->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); - dbuf_redirty(dr); + dbuf_redirty(dr_next); mutex_exit(&db->db_mtx); - return (dr); + return (dr_next); } /* @@ -1988,7 +2187,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) */ os = dn->dn_objset; VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); -#ifdef DEBUG +#ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || @@ -2011,6 +2210,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); + list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; if (db->db_level == 0) { void *data_old = db->db_buf; @@ -2041,12 +2242,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } - if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) + if (db->db_blkid != DMU_BONUS_BLKID) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; + list_insert_before(&db->db_dirty_records, dr_next, dr); /* * We could have been freed_in_flight between the dbuf_noread @@ -2084,15 +2284,21 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr); } - /* - * The dn_struct_rwlock prevents db_blkptr from changing - * due to a write from syncing context completing - * while we are running, so we want to acquire it before - * looking at db_blkptr. 
- */ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; + drop_struct_rwlock = B_TRUE; + } + + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. + */ + if (db->db_blkptr != NULL) { + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + ddt_prefetch(os->os_spa, db->db_blkptr); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* @@ -2105,19 +2311,12 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - /* - * If we are overwriting a dedup BP, then unless it is snapshotted, - * when we get to syncing context we will need to decrement its - * refcount in the DDT. Prefetch the relevant DDT block so that - * syncing context won't have to wait for the i/o. 
- */ - ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, - drop_struct_lock, B_FALSE); + drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } @@ -2128,15 +2327,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, + parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); + ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); @@ -2146,7 +2344,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. 
*/ - if (db->db_last_dirty == dr || + if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); @@ -2157,14 +2355,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } mutex_exit(&db->db_mtx); } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } @@ -2173,6 +2371,30 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr); } +static void +dbuf_undirty_bonus(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + if (dr->dt.dl.dr_data != db->db.db_data) { + struct dnode *dn = dr->dr_dnode; + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + + kmem_free(dr->dt.dl.dr_data, max_bonuslen); + arc_space_return(max_bonuslen, ARC_SPACE_BONUS); + } + db->db_data_pending = NULL; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); + if (dr->dr_dbuf->db_level != 0) { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT3U(db->db_dirtycnt, >, 0); + db->db_dirtycnt -= 1; +} + /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. @@ -2180,9 +2402,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); @@ -2202,16 +2422,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. 
*/ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg <= txg) - break; - if (dr == NULL || dr->dr_txg < txg) + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); + if (dr == NULL) return (B_FALSE); - ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); + dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -2220,7 +2436,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); - *drp = dr->dr_next; + list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() @@ -2239,7 +2455,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } - DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); @@ -2273,22 +2488,21 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* - * Quick check for dirtyness. For already dirty blocks, this + * Quick check for dirtiness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); - dbuf_dirty_record_t *dr; - for (dr = db->db_last_dirty; - dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ - if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { + if (dr != NULL) { /* This dbuf is already dirty and cached. 
*/ dbuf_redirty(dr); mutex_exit(&db->db_mtx); @@ -2316,17 +2530,12 @@ boolean_t dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dbuf_dirty_record_t *dr; mutex_enter(&db->db_mtx); - for (dbuf_dirty_record_t *dr = db->db_last_dirty; - dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { - if (dr->dr_txg == tx->tx_txg) { - mutex_exit(&db->db_mtx); - return (B_TRUE); - } - } + dr = dbuf_find_dirty_eq(db, tx->tx_txg); mutex_exit(&db->db_mtx); - return (B_FALSE); + return (dr != NULL); } void @@ -2335,7 +2544,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; - + DTRACE_SET_STATE(db, "allocating NOFILL buffer"); dmu_buf_will_fill(db_fake, tx); } @@ -2381,12 +2590,9 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); - dr = db->db_last_dirty; - while (dr != NULL && dr->dr_txg > tx->tx_txg) - dr = dr->dr_next; + dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); - ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; @@ -2395,23 +2601,43 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN); } -#pragma weak dmu_buf_fill_done = dbuf_fill_done +static void +dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct dirty_leaf *dl; + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; + dl->dr_overridden_by = *bp; + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = dr->dr_txg; +} + /* ARGSUSED */ void -dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + dbuf_states_t 
old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - if (db->db_state == DB_FILL) { + old_state = db->db_state; + db->db_state = DB_CACHED; + if (old_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; + DTRACE_SET_STATE(db, + "fill done handling freed in flight"); + } else { + DTRACE_SET_STATE(db, "fill done"); } - db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); @@ -2426,6 +2652,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; + dbuf_dirty_record_t *dr; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), @@ -2441,8 +2668,9 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, dmu_buf_will_not_fill(dbuf, tx); - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - dl = &db->db_last_dirty->dt.dl; + dr = list_head(&db->db_dirty_records); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); @@ -2451,7 +2679,32 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; + dl->dr_overridden_by.blk_birth = dr->dr_txg; +} + +void +dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + dmu_object_type_t type; + ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + dmu_buf_will_not_fill(dbuf, tx); + + blkptr_t bp = 
{ { { {0} } } }; + BP_SET_TYPE(&bp, type); + BP_SET_LEVEL(&bp, 0); + BP_SET_BIRTH(&bp, tx->tx_txg, 0); + BP_SET_REDACTED(&bp); + BPE_SET_LSIZE(&bp, dbuf->db_size); + + dbuf_override_impl(db, &bp, tx); } /* @@ -2492,13 +2745,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); arc_buf_destroy(buf, db); - xuio_stat_wbuf_copied(); return; } - xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = db->db_last_dirty; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { @@ -2520,6 +2771,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; + DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); @@ -2547,6 +2799,7 @@ dbuf_destroy(dmu_buf_impl_t *db) kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "buffer cleared"); } } @@ -2556,7 +2809,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); - multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); @@ -2574,8 +2827,10 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); + ASSERT(list_is_empty(&db->db_dirty_records)); db->db_state = DB_EVICTING; + DTRACE_SET_STATE(db, "buffer eviction started"); db->db_blkptr = NULL; /* @@ -2591,9 +2846,9 @@ dbuf_destroy(dmu_buf_impl_t *db) if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t 
needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) - mutex_enter(&dn->dn_dbufs_mtx); + mutex_enter_nested(&dn->dn_dbufs_mtx, + NESTED_SINGLE); avl_remove(&dn->dn_dbufs, db); - atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); if (needlock) @@ -2703,10 +2958,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; - dbuf_hold_arg_t *dh = dbuf_hold_arg_create(dn, level + 1, + + err = dbuf_hold_impl(dn, level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); - err = dbuf_hold_impl_arg(dh); - dbuf_hold_arg_destroy(dh); + if (err) return (err); err = dbuf_read(*parentp, NULL, @@ -2716,10 +2971,12 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, *parentp = NULL; return (err); } + rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); + rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ @@ -2747,11 +3004,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); + list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dbuf_node)); + db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; - db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; @@ -2769,6 +3028,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "bonus buffer created"); db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), 
ARC_SPACE_DBUF); @@ -2788,21 +3048,22 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone - * trying to look up this dbuf before its added to the + * trying to look up this dbuf before it's added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); - db->db_state = DB_EVICTING; + db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); + kmem_cache_free(dbuf_kmem_cache, db); DBUF_STAT_BUMP(hash_insert_race); return (odb); } avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "regular buffer created"); db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); @@ -2813,13 +3074,42 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || zfs_refcount_count(&dn->dn_holds) > 0); (void) zfs_refcount_add(&dn->dn_holds, db); - atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } +/* + * This function returns a block pointer and information about the object, + * given a dnode and a block. This is a publicly accessible version of + * dbuf_findbp that only returns some information, rather than the + * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock + * should be locked as (at least) a reader. 
+ */ +int +dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, + blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift) +{ + dmu_buf_impl_t *dbp = NULL; + blkptr_t *bp2; + int err = 0; + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); + if (err == 0) { + *bp = *bp2; + if (dbp != NULL) + dbuf_rele(dbp, NULL); + if (datablkszsec != NULL) + *datablkszsec = dn->dn_phys->dn_datablkszsec; + if (indblkshift != NULL) + *indblkshift = dn->dn_phys->dn_indblkshift; + } + + return (err); +} + typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ @@ -2829,20 +3119,47 @@ typedef struct dbuf_prefetch_arg { zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ + dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ + void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; +static void +dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) +{ + if (dpa->dpa_cb != NULL) + dpa->dpa_cb(dpa->dpa_arg, io_done); + kmem_free(dpa, sizeof (*dpa)); +} + +static void +dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + dbuf_prefetch_fini(dpa, B_TRUE); + if (abuf != NULL) + arc_buf_destroy(abuf, private); +} + /* * Actually issue the prefetch read for the block given. 
*/ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return; + ASSERT(!BP_IS_REDACTED(bp) || + dsl_dataset_feature_is_active( + dpa->dpa_dnode->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) + return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = - dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_NO_BUF; /* dnodes are always read as raw and then converted later */ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && @@ -2852,7 +3169,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, + dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } @@ -2872,8 +3190,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); - kmem_free(dpa, sizeof (*dpa)); - return; + return (dbuf_prefetch_fini(dpa, B_TRUE)); } ASSERT(zio == NULL || zio->io_error == 0); @@ -2905,11 +3222,9 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { - kmem_free(dpa, sizeof (*dpa)); arc_buf_destroy(abuf, private); - return; + return (dbuf_prefetch_fini(dpa, B_TRUE)); } - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); @@ -2921,12 +3236,15 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, blkptr_t *bp = ((blkptr_t 
*)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp)) { - kmem_free(dpa, sizeof (*dpa)); + ASSERT(!BP_IS_REDACTED(bp) || + dsl_dataset_feature_is_active( + dpa->dpa_dnode->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { + dbuf_prefetch_fini(dpa, B_TRUE); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -2956,9 +3274,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. */ -void -dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, - arc_flags_t aflags) +int +dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, + void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; @@ -2968,10 +3287,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) - return; + goto no_issue; - if (dnode_block_freed(dn, blkid)) - return; + if (level == 0 && dnode_block_freed(dn, blkid)) + goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to @@ -2979,11 +3298,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) - return; + goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) - return; + goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); @@ -2993,7 +3312,7 @@ dbuf_prefetch(dnode_t *dn, int64_t 
level, uint64_t blkid, zio_priority_t prio, * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ - return; + goto no_issue; } /* @@ -3025,8 +3344,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } - if (BP_IS_HOLE(&bp)) - return; + ASSERT(!BP_IS_REDACTED(&bp) || + dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) + goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); @@ -3044,6 +3366,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; + dpa->dpa_cb = cb; + dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) @@ -3059,7 +3383,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -3080,27 +3403,36 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, * dpa may have already been freed. */ zio_nowait(pio); + return (1); +no_issue: + if (cb != NULL) + cb(arg, B_FALSE); + return (0); } -#define DBUF_HOLD_IMPL_MAX_DEPTH 20 +int +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) +{ + + return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); +} /* - * Helper function for dbuf_hold_impl_arg() to copy a buffer. Handles + * Helper function for dbuf_hold_impl() to copy a buffer. 
Handles * the case of encrypted, compressed and uncompressed buffers by * allocating the new buffer, respectively, with arc_alloc_raw_buf(), * arc_alloc_compressed_buf() or arc_alloc_buf().* * - * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl_arg(). + * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl(). */ noinline static void -dbuf_hold_copy(struct dbuf_hold_arg *dh) +dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) { - dnode_t *dn = dh->dh_dn; - dmu_buf_impl_t *db = dh->dh_db; - dbuf_dirty_record_t *dr = dh->dh_dr; + dbuf_dirty_record_t *dr = db->db_data_pending; arc_buf_t *data = dr->dt.dl.dr_data; - enum zio_compress compress_type = arc_get_compression(data); + uint8_t complevel = arc_get_complevel(data); if (arc_is_encrypted(data)) { boolean_t byteorder; @@ -3112,178 +3444,130 @@ dbuf_hold_copy(struct dbuf_hold_arg *dh) dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, arc_buf_size(data), arc_buf_lsize(data), - compress_type)); + compress_type, complevel)); } else if (compress_type != ZIO_COMPRESS_OFF) { dbuf_set_data(db, arc_alloc_compressed_buf( dn->dn_objset->os_spa, db, arc_buf_size(data), - arc_buf_lsize(data), compress_type)); + arc_buf_lsize(data), compress_type, complevel)); } else { dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } + rw_enter(&db->db_rwlock, RW_WRITER); bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); + rw_exit(&db->db_rwlock); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. 
*/ -static int -dbuf_hold_impl_arg(struct dbuf_hold_arg *dh) -{ - dh->dh_parent = NULL; - - ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); - ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); - - *(dh->dh_dbp) = NULL; - - /* dbuf_find() returns with db_mtx held */ - dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, - dh->dh_level, dh->dh_blkid); - - if (dh->dh_db == NULL) { - dh->dh_bp = NULL; - - if (dh->dh_fail_uncached) - return (SET_ERROR(ENOENT)); - - ASSERT3P(dh->dh_parent, ==, NULL); - dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp); - if (dh->dh_fail_sparse) { - if (dh->dh_err == 0 && - dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) - dh->dh_err = SET_ERROR(ENOENT); - if (dh->dh_err) { - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - return (dh->dh_err); - } - } - if (dh->dh_err && dh->dh_err != ENOENT) - return (dh->dh_err); - dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_parent, dh->dh_bp); - } - - if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { - mutex_exit(&dh->dh_db->db_mtx); - return (SET_ERROR(ENOENT)); - } - - if (dh->dh_db->db_buf != NULL) { - arc_buf_access(dh->dh_db->db_buf); - ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); - } - - ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); - - /* - * If this buffer is currently syncing out, and we are are - * still referencing it from db_data, we need to make a copy - * of it in case we decide we want to dirty it again in this txg. 
- */ - if (dh->dh_db->db_level == 0 && - dh->dh_db->db_blkid != DMU_BONUS_BLKID && - dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && - dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { - dh->dh_dr = dh->dh_db->db_data_pending; - if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) - dbuf_hold_copy(dh); - } - - if (multilist_link_active(&dh->dh_db->db_cache_link)) { - ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds)); - ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || - dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); - - multilist_remove( - dbuf_caches[dh->dh_db->db_caching_status].cache, - dh->dh_db); - (void) zfs_refcount_remove_many( - &dbuf_caches[dh->dh_db->db_caching_status].size, - dh->dh_db->db.db_size, dh->dh_db); - - if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMPDOWN(metadata_cache_count); - } else { - DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], - dh->dh_db->db.db_size); - } - dh->dh_db->db_caching_status = DB_NO_CACHE; - } - (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag); - DBUF_VERIFY(dh->dh_db); - mutex_exit(&dh->dh_db->db_mtx); - - /* NOTE: we can't rele the parent until after we drop the db_mtx */ - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - - ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); - ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); - ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); - *(dh->dh_dbp) = dh->dh_db; - - return (0); -} - -/* - * dbuf_hold_impl_arg() is called recursively, via dbuf_findbp(). There can - * be as many recursive calls as there are levels of on-disk indirect blocks, - * but typically only 0-2 recursive calls. To minimize the stack frame size, - * the recursive function's arguments and "local variables" are allocated on - * the heap as the dbuf_hold_arg_t. 
- */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { - dbuf_hold_arg_t *dh = dbuf_hold_arg_create(dn, level, blkid, - fail_sparse, fail_uncached, tag, dbp); + dmu_buf_impl_t *db, *parent = NULL; - int error = dbuf_hold_impl_arg(dh); + /* If the pool has been created, verify the tx_sync_lock is not held */ + spa_t *spa = dn->dn_objset->os_spa; + dsl_pool_t *dp = spa->spa_dsl_pool; + if (dp != NULL) { + ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); + } - dbuf_hold_arg_destroy(dh); + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); - return (error); -} + *dbp = NULL; -static dbuf_hold_arg_t * -dbuf_hold_arg_create(dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) -{ - dbuf_hold_arg_t *dh = kmem_alloc(sizeof (*dh), KM_SLEEP); - dh->dh_dn = dn; - dh->dh_level = level; - dh->dh_blkid = blkid; + /* dbuf_find() returns with db_mtx held */ + db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); - dh->dh_fail_sparse = fail_sparse; - dh->dh_fail_uncached = fail_uncached; + if (db == NULL) { + blkptr_t *bp = NULL; + int err; - dh->dh_tag = tag; - dh->dh_dbp = dbp; + if (fail_uncached) + return (SET_ERROR(ENOENT)); - dh->dh_db = NULL; - dh->dh_parent = NULL; - dh->dh_bp = NULL; - dh->dh_err = 0; - dh->dh_dr = NULL; + ASSERT3P(parent, ==, NULL); + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = SET_ERROR(ENOENT); + if (err) { + if (parent) + dbuf_rele(parent, NULL); + return (err); + } + } + if (err && err != ENOENT) + return (err); + db = dbuf_create(dn, level, blkid, parent, bp); + } - return (dh); -} + if (fail_uncached && db->db_state != DB_CACHED) { + mutex_exit(&db->db_mtx); + return (SET_ERROR(ENOENT)); + } -static void 
-dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh) -{ - kmem_free(dh, sizeof (*dh)); + if (db->db_buf != NULL) { + arc_buf_access(db->db_buf); + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + } + + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); + + /* + * If this buffer is currently syncing out, and we are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + dn->dn_object != DMU_META_DNODE_OBJECT && + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + if (dr->dt.dl.dr_data == db->db_buf) + dbuf_hold_copy(dn, db); + } + + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(zfs_refcount_is_zero(&db->db_holds)); + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, + db->db.db_size, db); + + if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMPDOWN(metadata_cache_count); + } else { + DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + db->db.db_size); + } + db->db_caching_status = DB_NO_CACHE; + } + (void) zfs_refcount_add(&db->db_holds, tag); + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (parent) + dbuf_rele(parent, NULL); + + ASSERT3P(DB_DNODE(db), ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; + + return (0); } dmu_buf_impl_t * @@ -3313,7 +3597,6 @@ int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return 
(SET_ERROR(ENOTSUP)); @@ -3322,12 +3605,7 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); return (0); } @@ -3407,6 +3685,7 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) { int64_t holds; + uint64_t size; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); @@ -3502,34 +3781,30 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; db->db_caching_status = dcs; - multilist_insert(dbuf_caches[dcs].cache, db); - (void) zfs_refcount_add_many( - &dbuf_caches[dcs].size, - db->db.db_size, db); + multilist_insert(&dbuf_caches[dcs].cache, db); + uint64_t db_size = db->db.db_size; + size = zfs_refcount_add_many( + &dbuf_caches[dcs].size, db_size, db); + uint8_t db_level = db->db_level; + mutex_exit(&db->db_mtx); if (dcs == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMP(metadata_cache_count); DBUF_STAT_MAX( metadata_cache_size_bytes_max, - zfs_refcount_count( - &dbuf_caches[dcs].size)); + size); } else { - DBUF_STAT_BUMP( - cache_levels[db->db_level]); DBUF_STAT_BUMP(cache_count); - DBUF_STAT_INCR( - cache_levels_bytes[db->db_level], - db->db.db_size); DBUF_STAT_MAX(cache_size_bytes_max, - zfs_refcount_count( - &dbuf_caches[dcs].size)); + size); + DBUF_STAT_BUMP(cache_levels[db_level]); + DBUF_STAT_INCR( + cache_levels_bytes[db_level], + db_size); } - mutex_exit(&db->db_mtx); - if (db->db_caching_status == DB_DBUF_CACHE && - !evicting) { - dbuf_evict_notify(); - } + if (dcs == DB_DBUF_CACHE && !evicting) + dbuf_evict_notify(size); } if (do_arc_evict) @@ -3663,7 +3938,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) /* * This buffer was allocated at a time when there was * no 
available blkptrs from the dnode, or it was - * inappropriate to hook it in (i.e., nlevels mis-match). + * inappropriate to hook it in (i.e., nlevels mismatch). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); @@ -3690,6 +3965,28 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) } } +static void +dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + void *data = dr->dt.dl.dr_data; + + ASSERT0(db->db_level); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID); + ASSERT(data != NULL); + + dnode_t *dn = dr->dr_dnode; + ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, + DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); + bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); + + dbuf_sync_leaf_verify_bonus_dnode(dr); + + dbuf_undirty_bonus(dr); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); +} + /* * When syncing out a blocks of dnodes, adjust the block to deal with * encryption. Normally, we make sure the block is decrypted before writing @@ -3741,8 +4038,7 @@ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - zio_t *zio; + dnode_t *dn = dr->dr_dnode; ASSERT(dmu_tx_is_syncing(tx)); @@ -3762,12 +4058,9 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. 
*/ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); - DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; @@ -3776,7 +4069,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_write(dr, db->db_buf, tx); - zio = dr->dr_zio; + zio_t *zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -3784,6 +4077,193 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) zio_nowait(zio); } +/* + * Verify that the size of the data in our bonus buffer does not exceed + * its recorded size. + * + * The purpose of this verification is to catch any cases in development + * where the size of a phys structure (i.e space_map_phys_t) grows and, + * due to incorrect feature management, older pools expect to read more + * data even though they didn't actually write it to begin with. + * + * For a example, this would catch an error in the feature logic where we + * open an older pool and we expect to write the space map histogram of + * a space map with size SPACE_MAP_SIZE_V0. + */ +static void +dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) +{ +#ifdef ZFS_DEBUG + dnode_t *dn = dr->dr_dnode; + + /* + * Encrypted bonus buffers can have data past their bonuslen. + * Skip the verification of these blocks. 
+ */ + if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)) + return; + + uint16_t bonuslen = dn->dn_phys->dn_bonuslen; + uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + ASSERT3U(bonuslen, <=, maxbonuslen); + + arc_buf_t *datap = dr->dt.dl.dr_data; + char *datap_end = ((char *)datap) + bonuslen; + char *datap_max = ((char *)datap) + maxbonuslen; + + /* ensure that everything is zero after our data */ + for (; datap_end < datap_max; datap_end++) + ASSERT(*datap_end == 0); +#endif +} + +static blkptr_t * +dbuf_lightweight_bp(dbuf_dirty_record_t *dr) +{ + /* This must be a lightweight dirty record. */ + ASSERT3P(dr->dr_dbuf, ==, NULL); + dnode_t *dn = dr->dr_dnode; + + if (dn->dn_phys->dn_nlevels == 1) { + VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr); + return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); + } else { + dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + VERIFY3U(parent_db->db_level, ==, 1); + VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); + VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); + blkptr_t *bp = parent_db->db.db_data; + return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); + } +} + +static void +dbuf_lightweight_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + blkptr_t *bp = zio->io_bp; + + if (zio->io_error != 0) + return; + + dnode_t *dn = dr->dr_dnode; + + blkptr_t *bp_orig = dbuf_lightweight_bp(dr); + spa_t *spa = dmu_objset_spa(dn->dn_objset); + int64_t delta = bp_get_dsize_sync(spa, bp) - + bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta); + + uint64_t blkid = dr->dt.dll.dr_blkid; + mutex_enter(&dn->dn_mtx); + if (blkid > dn->dn_phys->dn_maxblkid) { + ASSERT0(dn->dn_objset->os_raw_receive); + dn->dn_phys->dn_maxblkid = blkid; + } + mutex_exit(&dn->dn_mtx); + + if (!BP_IS_EMBEDDED(bp)) { + uint64_t fill = BP_IS_HOLE(bp) ? 
0 : 1; + BP_SET_FILL(bp, fill); + } + + dmu_buf_impl_t *parent_db; + EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); + if (dr->dr_parent == NULL) { + parent_db = dn->dn_dbuf; + } else { + parent_db = dr->dr_parent->dr_dbuf; + } + rw_enter(&parent_db->db_rwlock, RW_WRITER); + *bp_orig = *bp; + rw_exit(&parent_db->db_rwlock); +} + +static void +dbuf_lightweight_physdone(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dsl_pool_t *dp = spa_get_dsl(zio->io_spa); + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dbuf_lightweight_done(). + */ + int delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + +static void +dbuf_lightweight_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + + VERIFY0(zio->io_error); + + objset_t *os = dr->dr_dnode->dn_objset; + dmu_tx_t *tx = os->os_synctx; + + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { + ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, zio->io_bp, tx); + } + + /* + * See comment in dbuf_write_done(). 
+ */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + abd_free(dr->dt.dll.dr_abd); + kmem_free(dr, sizeof (*dr)); +} + +noinline static void +dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dnode_t *dn = dr->dr_dnode; + zio_t *pio; + if (dn->dn_phys->dn_nlevels == 1) { + pio = dn->dn_zio; + } else { + pio = dr->dr_parent->dr_zio; + } + + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(dn->dn_objset), + .zb_object = dn->dn_object, + .zb_level = 0, + .zb_blkid = dr->dt.dll.dr_blkid, + }; + + /* + * See comment in dbuf_write(). This is so that zio->io_bp_orig + * will have the old BP in dbuf_lightweight_done(). + */ + dr->dr_bp_copy = *dbuf_lightweight_bp(dr); + + dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), + dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, + dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), + &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, + dbuf_lightweight_physdone, dbuf_lightweight_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); + + zio_nowait(dr->dr_zio); +} + /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to @@ -3794,7 +4274,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; uint64_t txg = tx->tx_txg; @@ -3818,9 +4298,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } DBUF_VERIFY(db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { @@ -3850,37 +4327,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 
* be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - - ASSERT(*datap != NULL); - ASSERT0(db->db_level); - ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, - DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - bcopy(*datap, DN_BONUS(dn->dn_phys), - DN_MAX_BONUS_LEN(dn->dn_phys)); - DB_DNODE_EXIT(db); - - if (*datap != db->db.db_data) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - kmem_free(*datap, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - } - db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); - *drp = dr->dr_next; - if (dr->dr_dbuf->db_level != 0) { - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); + dbuf_sync_bonus(dr, tx); return; } @@ -3931,6 +4379,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) int lsize = arc_buf_lsize(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); + uint8_t complevel = arc_get_complevel(*datap); if (arc_is_encrypted(*datap)) { boolean_t byteorder; @@ -3941,11 +4390,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) arc_get_raw_params(*datap, &byteorder, salt, iv, mac); *datap = arc_alloc_raw_buf(os->os_spa, db, dmu_objset_id(os), byteorder, salt, iv, mac, - dn->dn_type, psize, lsize, compress_type); + dn->dn_type, psize, lsize, compress_type, + complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); *datap = arc_alloc_compressed_buf(os->os_spa, db, - psize, lsize, compress_type); + psize, lsize, compress_type, complevel); } else { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } @@ -3960,16 +4410,7 @@ 
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr); - DB_DNODE_EXIT(db); } else { - /* - * Although zio_nowait() does not "wait for an IO", it does - * initiate the IO. If this is an empty write it seems plausible - * that the IO could actually be completed before the nowait - * returns. We need to DB_DNODE_EXIT() first in case - * zio_nowait() invalidates the dbuf. - */ - DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } @@ -3992,15 +4433,19 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) DMU_META_DNODE_OBJECT); break; } - if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - VERIFY3U(dr->dr_dbuf->db_level, ==, level); - } list_remove(list, dr); - if (dr->dr_dbuf->db_level > 0) - dbuf_sync_indirect(dr, tx); - else - dbuf_sync_leaf(dr, tx); + if (dr->dr_dbuf == NULL) { + dbuf_sync_lightweight(dr, tx); + } else { + if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + VERIFY3U(dr->dr_dbuf->db_level, ==, level); + } + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } } } @@ -4090,9 +4535,9 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; - rw_exit(&dn->dn_struct_rwlock); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* ARGSUSED */ @@ -4133,9 +4578,9 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) * anybody from reading the blocks we're about to * zero out. 
*/ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } @@ -4165,8 +4610,7 @@ dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dsl_pool_sync()'s call to - * dsl_pool_undirty_space(). + * error will be cleaned up by dbuf_write_done(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); @@ -4181,7 +4625,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; - dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); @@ -4202,24 +4645,18 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) DBUF_VERIFY(db); - drp = &db->db_last_dirty; - while ((dr = *drp) != db->db_data_pending) - drp = &dr->dr_next; + dbuf_dirty_record_t *dr = db->db_data_pending; + dnode_t *dn = dr->dr_dnode; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); - ASSERT(dr->dr_next == NULL); - *drp = dr->dr_next; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - DB_DNODE_EXIT(db); } #endif @@ -4231,31 +4668,49 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << 
dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { - ASSERTV(int epbs = dn->dn_phys->dn_indblkshift - - SPA_BLKPTRSHIFT); + int epbs __maybe_unused = dn->dn_phys->dn_indblkshift - + SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } - DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); + + /* + * If we didn't do a physical write in this ZIO and we + * still ended up here, it means that the space of the + * dbuf that we just released (and undirtied) above hasn't + * been marked as undirtied in the pool's accounting. + * + * Thus, we undirty that space in the pool's view of the + * world here. For physical writes this type of update + * happens in dbuf_write_physdone(). + * + * If we did a physical write, cleanup any rounding errors + * that came up due to writing multiple copies of a block + * on disk [see dbuf_write_physdone()]. 
+ */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + kmem_free(dr, sizeof (dbuf_dirty_record_t)); } static void @@ -4297,7 +4752,7 @@ dbuf_write_override_done(zio_t *zio) dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) - abd_put(zio->io_abd); + abd_free(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { @@ -4326,7 +4781,7 @@ dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, } static void -dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); @@ -4340,71 +4795,42 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* - * The struct_rwlock prevents dbuf_read_impl() from + * If the blkptr being remapped is tracked by a livelist, + * then we need to make sure the livelist reflects the update. + * First, cancel out the old blkptr by appending a 'FREE' + * entry. Next, add an 'ALLOC' to track the new version. This + * way we avoid trying to free an inaccurate blkptr at delete. + * Note that embedded blkptrs are not tracked in livelists. + */ + if (dn->dn_objset != spa_meta_objset(spa)) { + dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg) { + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, + bp); + bplist_append(&ds->ds_dir->dd_pending_allocs, + &bp_copy); + } + } + + /* + * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. 
To * avoid lock contention, only grab it when we are actually * changing the BP. */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (rw != NULL) + rw_enter(rw, RW_WRITER); *bp = bp_copy; - rw_exit(&dn->dn_struct_rwlock); + if (rw != NULL) + rw_exit(rw); } } -/* - * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting - * to remap a copy of every bp in the dbuf. - */ -boolean_t -dbuf_can_remap(const dmu_buf_impl_t *db) -{ - spa_t *spa = dmu_objset_spa(db->db_objset); - blkptr_t *bp = db->db.db_data; - boolean_t ret = B_FALSE; - - ASSERT3U(db->db_level, >, 0); - ASSERT3S(db->db_state, ==, DB_CACHED); - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - blkptr_t bp_copy = bp[i]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - -boolean_t -dnode_needs_remap(const dnode_t *dn) -{ - spa_t *spa = dmu_objset_spa(dn->dn_objset); - boolean_t ret = B_FALSE; - - if (dn->dn_phys->dn_nlevels == 0) { - return (B_FALSE); - } - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { - blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - /* * Remap any existing BP's to concrete vdevs, if possible. 
*/ @@ -4420,7 +4846,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - dbuf_remap_impl(dn, &bp[i], tx); + dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; @@ -4429,7 +4855,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { - dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); + krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : + &dn->dn_dbuf->db_rwlock); + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, + tx); } } } @@ -4441,19 +4870,17 @@ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; - zio_t *zio; + zio_t *pio; /* parent I/O */ int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { @@ -4484,7 +4911,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; + pio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. 
*/ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && @@ -4493,12 +4920,12 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; + pio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); + ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -4509,7 +4936,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); - DB_DNODE_EXIT(db); /* * We copy the blkptr now (rather than when we instantiate the dirty @@ -4528,9 +4954,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; - dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, - &zp, dbuf_write_override_ready, NULL, NULL, + dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, + contents, db->db.db_size, db->db.db_size, &zp, + dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); @@ -4541,7 +4967,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); - dr->dr_zio = zio_write(zio, os->os_spa, txg, + dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, @@ -4559,7 +4985,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; - 
dr->dr_zio = arc_write(zio, os->os_spa, txg, + dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, @@ -4568,7 +4994,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } } -#if defined(_KERNEL) EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); EXPORT_SYMBOL(dbuf_destroy); @@ -4606,31 +5031,24 @@ EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); /* BEGIN CSTYLED */ -module_param(dbuf_cache_max_bytes, ulong, 0644); -MODULE_PARM_DESC(dbuf_cache_max_bytes, +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); -module_param(dbuf_cache_hiwater_pct, uint, 0644); -MODULE_PARM_DESC(dbuf_cache_hiwater_pct, +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, "Percentage over dbuf_cache_max_bytes when dbufs must be evicted " "directly."); -module_param(dbuf_cache_lowater_pct, uint, 0644); -MODULE_PARM_DESC(dbuf_cache_lowater_pct, +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, "Percentage below dbuf_cache_max_bytes when the evict thread stops " "evicting dbufs."); -module_param(dbuf_metadata_cache_max_bytes, ulong, 0644); -MODULE_PARM_DESC(dbuf_metadata_cache_max_bytes, +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW, "Maximum size in bytes of the dbuf metadata cache."); -module_param(dbuf_cache_shift, int, 0644); -MODULE_PARM_DESC(dbuf_cache_shift, +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW, "Set the size of the dbuf cache to a log2 fraction of arc size."); -module_param(dbuf_metadata_cache_shift, int, 0644); -MODULE_PARM_DESC(dbuf_cache_shift, - "Set the size of the dbuf metadata cache to a log2 fraction of " - "arc size."); +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW, + "Set the size of the dbuf metadata cache to a log2 fraction of arc " + 
"size."); /* END CSTYLED */ -#endif diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index afe7c34cf4..12bb568a08 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -61,7 +61,7 @@ dbuf_stats_hash_table_headers(char *buf, size_t size) return (0); } -int +static int __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) { arc_buf_info_t abi = { 0 }; @@ -134,7 +134,8 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) ASSERT3S(dsh->idx, >=, 0); ASSERT3S(dsh->idx, <=, h->hash_table_mask); - memset(buf, 0, size); + if (size) + buf[0] = 0; mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { @@ -225,7 +226,7 @@ dbuf_stats_destroy(void) dbuf_stats_hash_table_destroy(); } -#if defined(_KERNEL) -module_param(zfs_dbuf_state_index, int, 0644); -MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index"); -#endif +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW, + "Calculate arc header index"); +/* END CSTYLED */ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index a38c2b24ea..fe5a188f4d 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -253,7 +253,7 @@ void ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, char *name) { - (void) sprintf(name, DMU_POOL_DDT, + (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, zio_checksum_table[ddt->ddt_checksum].ci_name, ddt_ops[type]->ddt_op_name, ddt_class_name[class]); } @@ -423,8 +423,8 @@ ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ - while (d < d_end) - *d++ += (*s++ ^ neg) - neg; + for (int i = 0; i < d_end - d; i++) + d[i] += (s[i] ^ neg) - neg; } static void @@ -503,7 +503,7 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < 
DDT_TYPES; type++) { + for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { ddt_histogram_add(ddh, @@ -552,65 +552,6 @@ ddt_get_pool_dedup_ratio(spa_t *spa) return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); } -int -ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) -{ - spa_t *spa = ddt->ddt_spa; - uint64_t total_refcnt = 0; - uint64_t ditto = spa->spa_dedup_ditto; - int total_copies = 0; - int desired_copies = 0; - int copies_needed = 0; - - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - zio_t *zio = dde->dde_lead_zio[p]; - uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ - if (zio != NULL) - refcnt += zio->io_parent_count; /* pending refs */ - if (ddp == ddp_willref) - refcnt++; /* caller's ref */ - if (refcnt != 0) { - total_refcnt += refcnt; - total_copies += p; - } - } - - if (ditto == 0 || ditto > UINT32_MAX) - ditto = UINT32_MAX; - - if (total_refcnt >= 1) - desired_copies++; - if (total_refcnt >= ditto) - desired_copies++; - if (total_refcnt >= ditto * ditto) - desired_copies++; - - copies_needed = MAX(desired_copies, total_copies) - total_copies; - - /* encrypted blocks store their IV in DVA[2] */ - if (DDK_GET_CRYPT(&dde->dde_key)) - copies_needed = MIN(copies_needed, SPA_DVAS_PER_BP - 1); - - return (copies_needed); -} - -int -ddt_ditto_copies_present(ddt_entry_t *dde) -{ - ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; - dva_t *dva = ddp->ddp_dva; - int copies = 0 - DVA_GET_GANG(dva); - - for (int d = 0; d < DDE_GET_NDVAS(dde); d++, dva++) - if (DVA_IS_VALID(dva)) - copies++; - - ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); - - return (copies); -} - size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) { @@ -629,7 +570,6 @@ ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) } *version = cpfunc; - /* CONSTCOND */ if (ZFS_HOST_BYTEORDER) 
*version |= DDT_COMPRESS_BYTEORDER_MASK; @@ -653,12 +593,6 @@ ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) byteswap_uint64_array(dst, d_len); } -ddt_t * -ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) -{ - return (spa->spa_ddt[c]); -} - ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { @@ -842,7 +776,7 @@ ddt_entry_compare(const void *x1, const void *x2) break; } - return (AVL_ISIGN(cmp)); + return (TREE_ISIGN(cmp)); } static ddt_t * @@ -1088,8 +1022,11 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) continue; } if (p == DDT_PHYS_DITTO) { - if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) - ddt_phys_free(ddt, ddk, ddp, txg); + /* + * Note, we no longer create DDT-DITTO blocks, but we + * don't want to leak any written by older software. + */ + ddt_phys_free(ddt, ddk, ddp, txg); continue; } if (ddp->ddp_refcnt == 0) @@ -1097,9 +1034,9 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) total_refcnt += ddp->ddp_refcnt; } - if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) - nclass = DDT_CLASS_DITTO; - else if (total_refcnt > 1) + /* We do not create new DDT-DITTO blocks. */ + ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth); + if (total_refcnt > 1) nclass = DDT_CLASS_DUPLICATE; else nclass = DDT_CLASS_UNIQUE; @@ -1243,7 +1180,7 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) return (SET_ERROR(ENOENT)); } -#if defined(_KERNEL) -module_param(zfs_dedup_prefetch, int, 0644); -MODULE_PARM_DESC(zfs_dedup_prefetch, "Enable prefetching dedup-ed blks"); -#endif +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, + "Enable prefetching dedup-ed blks"); +/* END CSTYLED */ diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 77c0784cca..c5c9eda0b2 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2018 by Delphix. All rights reserved. */ #include @@ -45,7 +46,7 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, DMU_OT_NONE, 0, tx); - return (*objectp == 0 ? ENOTSUP : 0); + return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0); } static int @@ -117,7 +118,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) zap_attribute_t za; int error; - zap_cursor_init_serialized(&zc, os, object, *walk); + if (*walk == 0) { + /* + * We don't want to prefetch the entire ZAP object, because + * it can be enormous. Also the primary use of DDT iteration + * is for scrubbing, in which case we will be issuing many + * scrub I/Os for each ZAP block that we read in, so + * reading the ZAP is unlikely to be the bottleneck. + */ + zap_cursor_init_noprefetch(&zc, os, object); + } else { + zap_cursor_init_serialized(&zc, os, object, *walk); + } if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize = za.za_num_integers; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 1697a63207..f12c5eda8b 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -20,12 +20,14 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude */ #include @@ -49,7 +51,8 @@ #include #include #include -#include +#include +#include #include #ifdef _KERNEL #include @@ -75,11 +78,11 @@ unsigned long zfs_per_txg_dirty_frees_percent = 5; int zfs_dmu_offset_next_sync = 0; /* - * This can be used for testing, to ensure that certain actions happen - * while in the middle of a remap (which might otherwise complete too - * quickly). Used by ztest(8). + * Limit the amount we can prefetch with one call to this amount. This + * helps to limit the amount of memory that can be used by prefetching. + * Larger objects should be prefetched a bit at a time. */ -int zfs_object_remap_one_indirect_delay_ms = 0; +int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, @@ -151,15 +154,15 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; -int +static int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; - blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); @@ -183,8 +186,8 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); - blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); @@ -489,15 +492,17 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. 
*/ -static int +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; + zstream_t *zs = NULL; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; - zio_t *zio; + zio_t *zio = NULL; + boolean_t missed = B_FALSE; ASSERT(length <= DMU_MAX_ACCESS); @@ -529,39 +534,64 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + if (read) + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); + if ((flags & DMU_READ_NO_PREFETCH) == 0 && + DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { + /* + * Prepare the zfetch before initiating the demand reads, so + * that if multiple threads block on same indirect block, we + * base predictions on the original less racy request order. + */ + zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, + read && DNODE_IS_CACHEABLE(dn), B_TRUE); + } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { + if (zs) + dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); - zio_nowait(zio); + if (read) + zio_nowait(zio); return (SET_ERROR(EIO)); } - /* initiate async i/o */ - if (read) + /* + * Initiate async demand data read. + * We check the db_state after calling dbuf_read() because + * (1) dbuf_read() may change the state to CACHED due to a + * hit in the ARC, and (2) on a cache miss, a child will + * have been added to "zio" but not yet completed, so the + * state will not yet be CACHED. 
+ */ + if (read) { (void) dbuf_read(db, zio, dbuf_flags); + if (db->db_state != DB_CACHED) + missed = B_TRUE; + } dbp[i] = &db->db; } - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { - dmu_zfetch(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn)); - } + if (!read) + zfs_racct_write(length, nblks); + + if (zs) + dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); - /* wait for async i/o */ - err = zio_wait(zio); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - - /* wait for other io to complete */ if (read) { + /* wait for async read i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); @@ -583,7 +613,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, return (0); } -static int +int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { @@ -639,11 +669,11 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the - * indirect blocks prefeteched will be those that point to the blocks containing + * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asychronously read in. + * in cache, they will be asynchronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, @@ -667,6 +697,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, return; } + /* + * See comment before the definition of dmu_prefetch_max. 
+ */ + len = MIN(len, dmu_prefetch_max); + /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the @@ -676,7 +711,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, if (err != 0) return; - rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the @@ -684,6 +718,7 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; @@ -696,7 +731,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } - rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); @@ -719,8 +753,8 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ - uint64_t iblkrange = - dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + uint64_t iblkrange = (uint64_t)dn->dn_datablksz * + EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); @@ -934,9 +968,7 @@ dmu_free_long_object(objset_t *os, uint64_t object) dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { - if (err == 0) - err = dmu_object_free(os, object, tx); - + err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); @@ -1086,6 +1118,9 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +/* + * Note: Lustre is an external consumer of 
this interface. + */ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) @@ -1102,137 +1137,6 @@ dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } -static int -dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, - uint64_t last_removal_txg, uint64_t offset) -{ - uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); - dnode_t *dn_tx; - int err = 0; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); - ASSERT3P(dbuf, !=, NULL); - - /* - * If the block hasn't been written yet, this default will ensure - * we don't try to remap it. - */ - uint64_t birth = UINT64_MAX; - ASSERT3U(last_removal_txg, !=, UINT64_MAX); - if (dbuf->db_blkptr != NULL) - birth = dbuf->db_blkptr->blk_birth; - rw_exit(&dn->dn_struct_rwlock); - - /* - * If this L1 was already written after the last removal, then we've - * already tried to remap it. An additional hold is taken after the - * dmu_tx_assign() to handle the case where the dnode is freed while - * waiting for the next open txg. - */ - if (birth <= last_removal_txg && - dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && - dbuf_can_remap(dbuf)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - err = dnode_hold(os, dn->dn_object, FTAG, &dn_tx); - if (err == 0) { - (void) dbuf_dirty(dbuf, tx); - dnode_rele(dn_tx, FTAG); - } - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - } - - dbuf_rele(dbuf, FTAG); - - delay(MSEC_TO_TICK(zfs_object_remap_one_indirect_delay_ms)); - - return (err); -} - -/* - * Remap all blockpointers in the object, if possible, so that they reference - * only concrete vdevs. - * - * To do this, iterate over the L0 blockpointers and remap any that reference - * an indirect vdev. 
Note that we only examine L0 blockpointers; since we - * cannot guarantee that we can remap all blockpointer anyways (due to split - * blocks), we do not want to make the code unnecessarily complicated to - * catch the unlikely case that there is an L1 block on an indirect vdev that - * contains no indirect blockpointers. - */ -int -dmu_object_remap_indirects(objset_t *os, uint64_t object, - uint64_t last_removal_txg) -{ - uint64_t offset, l1span; - int err; - dnode_t *dn, *dn_tx; - - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) { - return (err); - } - - if (dn->dn_nlevels <= 1) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - } - - /* - * If the dnode has no indirect blocks, we cannot dirty them. - * We still want to remap the blkptr(s) in the dnode if - * appropriate, so mark it as dirty. An additional hold is - * taken after the dmu_tx_assign() to handle the case where - * the dnode is freed while waiting for the next open txg. - */ - if (err == 0 && dnode_needs_remap(dn)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, object); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - err = dnode_hold(os, object, FTAG, &dn_tx); - if (err == 0) { - dnode_setdirty(dn_tx, tx); - dnode_rele(dn_tx, FTAG); - } - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - } - - dnode_rele(dn, FTAG); - return (err); - } - - offset = 0; - l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + - dn->dn_datablkshift); - /* - * Find the next L1 indirect that is not a hole. 
- */ - while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - break; - } - if ((err = dmu_object_remap_one_indirect(os, dn, - last_removal_txg, offset)) != 0) { - break; - } - offset += l1span; - } - - dnode_rele(dn, FTAG); - return (err); -} - void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) @@ -1273,171 +1177,32 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_rele(db, FTAG); } -/* - * DMU support for xuio - */ -kstat_t *xuio_ksp = NULL; - -typedef struct xuio_stats { - /* loaned yet not returned arc_buf */ - kstat_named_t xuiostat_onloan_rbuf; - kstat_named_t xuiostat_onloan_wbuf; - /* whether a copy is made when loaning out a read buffer */ - kstat_named_t xuiostat_rbuf_copied; - kstat_named_t xuiostat_rbuf_nocopy; - /* whether a copy is made when assigning a write buffer */ - kstat_named_t xuiostat_wbuf_copied; - kstat_named_t xuiostat_wbuf_nocopy; -} xuio_stats_t; - -static xuio_stats_t xuio_stats = { - { "onloan_read_buf", KSTAT_DATA_UINT64 }, - { "onloan_write_buf", KSTAT_DATA_UINT64 }, - { "read_buf_copied", KSTAT_DATA_UINT64 }, - { "read_buf_nocopy", KSTAT_DATA_UINT64 }, - { "write_buf_copied", KSTAT_DATA_UINT64 }, - { "write_buf_nocopy", KSTAT_DATA_UINT64 } -}; - -#define XUIOSTAT_INCR(stat, val) \ - atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -#ifdef HAVE_UIO_ZEROCOPY -int -dmu_xuio_init(xuio_t *xuio, int nblk) -{ - dmu_xuio_t *priv; - uio_t *uio = &xuio->xu_uio; - - uio->uio_iovcnt = nblk; - uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); - - priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); - priv->cnt = nblk; - priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); - priv->iovp = (iovec_t *)uio->uio_iov; - XUIO_XUZC_PRIV(xuio) = priv; - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, 
nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); - - return (0); -} - void -dmu_xuio_fini(xuio_t *xuio) +dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) { - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int nblk = priv->cnt; + int numbufs, i; + dmu_buf_t **dbp; - kmem_free(priv->iovp, nblk * sizeof (iovec_t)); - kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); - kmem_free(priv, sizeof (dmu_xuio_t)); - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); -} - -/* - * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } - * and increase priv->next by 1. - */ -int -dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) -{ - struct iovec *iov; - uio_t *uio = &xuio->xu_uio; - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int i = priv->next++; - - ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_lsize(abuf)); - iov = (iovec_t *)uio->uio_iov + i; - iov->iov_base = (char *)abuf->b_data + off; - iov->iov_len = n; - priv->bufs[i] = abuf; - return (0); -} - -int -dmu_xuio_cnt(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - return (priv->cnt); -} - -arc_buf_t * -dmu_xuio_arcbuf(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - return (priv->bufs[i]); -} - -void -dmu_xuio_clear(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - priv->bufs[i] = NULL; -} -#endif /* HAVE_UIO_ZEROCOPY */ - -static void -xuio_stat_init(void) -{ - xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (xuio_ksp != NULL) { - xuio_ksp->ks_data = &xuio_stats; - kstat_install(xuio_ksp); - } -} - -static void -xuio_stat_fini(void) -{ - if (xuio_ksp != NULL) { - kstat_delete(xuio_ksp); - xuio_ksp = NULL; - } -} - -void 
-xuio_stat_wbuf_copied(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_copied); -} - -void -xuio_stat_wbuf_nocopy(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); + VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, + &numbufs, &dbp)); + for (i = 0; i < numbufs; i++) + dmu_buf_redact(dbp[i], tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL int -dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) +dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; -#endif /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); @@ -1449,28 +1214,12 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) ASSERT(size > 0); - bufoff = uio->uio_loffset - db->db_offset; + bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); -#ifdef HAVE_UIO_ZEROCOPY - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) { - uio->uio_resid -= tocpy; - uio->uio_loffset += tocpy; - } + err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else -#endif - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); if (err) break; @@ -1484,14 +1233,14 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. - * Starting at offset uio->uio_loffset. + * Starting at zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. 
its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int -dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) +dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; @@ -1511,10 +1260,10 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) /* * Read 'size' bytes into the uio buffer. * From the specified object - * Starting at offset uio->uio_loffset. + * Starting at offset zfs_uio_offset(uio). */ int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) { dnode_t *dn; int err; @@ -1534,14 +1283,14 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) } int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) +dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); @@ -1553,7 +1302,7 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) ASSERT(size > 0); - bufoff = uio->uio_loffset - db->db_offset; + bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); @@ -1564,13 +1313,13 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_buf_will_dirty(db, tx); /* - * XXX uiomove could block forever (eg.nfs-backed + * XXX zfs_uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't + * to lock the pages in memory, so that zfs_uiomove won't * block. 
*/ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); + err = zfs_uio_fault_move((char *)db->db_data + bufoff, + tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -1588,14 +1337,14 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. - * Starting at offset uio->uio_loffset. + * Starting at offset zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int -dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, +dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; @@ -1616,10 +1365,10 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, /* * Write 'size' bytes from the uio buffer. * To the specified object. - * Starting at offset uio->uio_loffset. + * Starting at offset zfs_uio_offset(uio). */ int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, +dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; @@ -1661,54 +1410,30 @@ dmu_return_arcbuf(arc_buf_t *buf) arc_buf_destroy(buf, FTAG); } -void -dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, - dmu_buf_t *handle, dmu_tx_t *tx) +/* + * A "lightweight" write is faster than a regular write (e.g. + * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the + * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the + * data can not be read or overwritten until the transaction's txg has been + * synced. This makes it appropriate for workloads that are known to be + * (temporarily) write-only, like "zfs receive". + * + * A single block is written, starting at the specified offset in bytes. 
If + * the call is successful, it returns 0 and the provided abd has been + * consumed (the caller should not free it). + */ +int +dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, + const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) { - dmu_buf_t *dst_handle; - dmu_buf_impl_t *dstdb; - dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle; - dmu_object_type_t type; - arc_buf_t *abuf; - uint64_t datalen; - boolean_t byteorder; - uint8_t salt[ZIO_DATA_SALT_LEN]; - uint8_t iv[ZIO_DATA_IV_LEN]; - uint8_t mac[ZIO_DATA_MAC_LEN]; - - ASSERT3P(srcdb->db_buf, !=, NULL); - - /* hold the db that we want to write to */ - VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle, - DMU_READ_NO_DECRYPT)); - dstdb = (dmu_buf_impl_t *)dst_handle; - datalen = arc_buf_size(srcdb->db_buf); - - DB_DNODE_ENTER(dstdb); - type = DB_DNODE(dstdb)->dn_type; - DB_DNODE_EXIT(dstdb); - - /* allocated an arc buffer that matches the type of srcdb->db_buf */ - if (arc_is_encrypted(srcdb->db_buf)) { - arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac); - abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os), - byteorder, salt, iv, mac, type, - datalen, arc_buf_lsize(srcdb->db_buf), - arc_get_compression(srcdb->db_buf)); - } else { - /* we won't get a compressed db back from dmu_buf_hold() */ - ASSERT3U(arc_get_compression(srcdb->db_buf), - ==, ZIO_COMPRESS_OFF); - abuf = arc_loan_buf(os->os_spa, - DMU_OT_IS_METADATA(type), datalen); - } - - ASSERT3U(datalen, ==, arc_buf_size(abuf)); - - /* copy the data to the new buffer and assign it to the dstdb */ - bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen); - dbuf_assign_arcbuf(dstdb, abuf, tx); - dmu_buf_rele(dst_handle, FTAG); + dbuf_dirty_record_t *dr = + dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); + if (dr == NULL) + return (SET_ERROR(EIO)); + dr->dt.dll.dr_abd = abd; + dr->dt.dll.dr_props = *zp; + dr->dt.dll.dr_flags = flags; + return (0); } /* @@ -1734,10 +1459,11 @@ 
dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_exit(&dn->dn_struct_rwlock); /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. + * We can only assign if the offset is aligned and the arc buf is the + * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { + zfs_racct_write(blksz, 1); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { @@ -1748,7 +1474,6 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); - XUIOSTAT_BUMP(xuiostat_wbuf_copied); } return (0); @@ -1878,7 +1603,7 @@ dmu_sync_late_arrival_done(zio_t *zio) zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { - ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); + blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); @@ -1891,7 +1616,7 @@ dmu_sync_late_arrival_done(zio_t *zio) dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - abd_put(zio->io_abd); + abd_free(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } @@ -1986,7 +1711,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; - dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; @@ -2034,9 +1759,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } - dr = db->db_last_dirty; - while (dr && dr->dr_txg != txg) - dr = dr->dr_next; + dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* @@ -2047,7 +1770,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t 
*done, zgd_t *zgd) return (SET_ERROR(ENOENT)); } - ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); + dr_next = list_next(&db->db_dirty_records, dr); + ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* @@ -2088,7 +1812,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); @@ -2221,6 +1945,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; + uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; @@ -2277,6 +2002,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); + complevel = zio_complevel_select(os->os_spa, compress, + complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : @@ -2286,7 +2013,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the - * dedup checkum. If the checksum is not strong + * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ @@ -2335,6 +2062,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } zp->zp_compress = compress; + zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? 
dn->dn_bonustype : type; zp->zp_level = level; @@ -2365,67 +2093,41 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; - int i, err; - boolean_t clean = B_TRUE; + int err; +restart: err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); - /* - * Check if there are dirty data blocks or frees which have not been - * synced. Dirty spill and bonus blocks which are external to the - * object can ignored when reporting holes. - */ - mutex_enter(&dn->dn_mtx); - for (i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i])) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_free_ranges[i] != NULL) { - clean = B_FALSE; - break; - } - - list_t *list = &dn->dn_dirty_records[i]; - dbuf_dirty_record_t *dr; - - for (dr = list_head(list); dr != NULL; - dr = list_next(list, dr)) { - dmu_buf_impl_t *db = dr->dr_dbuf; - - if (db->db_blkid == DMU_SPILL_BLKID || - db->db_blkid == DMU_BONUS_BLKID) - continue; - - clean = B_FALSE; - break; - } + if (dnode_is_dirty(dn)) { + /* + * If the zfs_dmu_offset_next_sync module option is enabled + * then strict hole reporting has been requested. Dirty + * dnodes must be synced to disk to accurately report all + * holes. When disabled (the default) dirty dnodes are + * reported to not have any holes which is always safe. + * + * When called by zfs_holey_common() the zp->z_rangelock + * is held to prevent zfs_write() and mmap writeback from + * re-dirtying the dnode after txg_wait_synced(). + */ + if (zfs_dmu_offset_next_sync) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + txg_wait_synced(dmu_objset_pool(os), 0); + goto restart; } - if (clean == B_FALSE) - break; - } - mutex_exit(&dn->dn_mtx); - - /* - * If compatibility option is on, sync any current changes before - * we go trundling through the block pointers. 
- */ - if (!clean && zfs_dmu_offset_next_sync) { - clean = B_TRUE; - dnode_rele(dn, FTAG); - txg_wait_synced(dmu_objset_pool(os), 0); - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - } - - if (clean) - err = dnode_next_offset(dn, - (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); - else err = SET_ERROR(EBUSY); + } else { + err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK | + (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); + } + rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); @@ -2501,7 +2203,6 @@ dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) /* * Faster still when you only care about the size. - * This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, @@ -2583,7 +2284,6 @@ dmu_init(void) abd_init(); zfs_dbgmsg_init(); sa_cache_init(); - xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); @@ -2603,13 +2303,11 @@ dmu_fini(void) dbuf_fini(); dnode_fini(); dmu_objset_fini(); - xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } -#if defined(_KERNEL) EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); @@ -2633,6 +2331,7 @@ EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); +EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); @@ -2643,17 +2342,15 @@ EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); /* BEGIN CSTYLED */ -module_param(zfs_nopwrite_enabled, int, 0644); -MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes"); +ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, + "Enable NOP writes"); -module_param(zfs_per_txg_dirty_frees_percent, ulong, 0644); -MODULE_PARM_DESC(zfs_per_txg_dirty_frees_percent, - "percentage of dirtied blocks from frees in one TXG"); 
+ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, + "Percentage of dirtied blocks from frees in one TXG"); -module_param(zfs_dmu_offset_next_sync, int, 0644); -MODULE_PARM_DESC(zfs_dmu_offset_next_sync, +ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); +ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW, + "Limit one prefetch call to this size"); /* END CSTYLED */ - -#endif diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 76c32b1264..a573a2e1bd 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -20,7 +20,8 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2019, loli10K . All rights reserved. */ #include @@ -39,33 +40,36 @@ #include #include #include +#include -struct diffarg { - struct vnode *da_vp; /* file to which we are reporting */ + +typedef struct dmu_diffarg { + zfs_file_t *da_fp; /* file to which we are reporting */ offset_t *da_offp; int da_err; /* error that stopped diff search */ dmu_diff_record_t da_ddr; -}; +} dmu_diffarg_t; static int -write_record(struct diffarg *da) +write_record(dmu_diffarg_t *da) { - ssize_t resid; /* have to get resid to get detailed errno */ + zfs_file_t *fp; + ssize_t resid; if (da->da_ddr.ddr_type == DDR_NONE) { da->da_err = 0; return (0); } - da->da_err = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)&da->da_ddr, - sizeof (da->da_ddr), 0, UIO_SYSSPACE, FAPPEND, - RLIM64_INFINITY, CRED(), &resid); + fp = da->da_fp; + da->da_err = zfs_file_write(fp, (caddr_t)&da->da_ddr, + sizeof (da->da_ddr), &resid); *da->da_offp += sizeof (da->da_ddr); return (da->da_err); } static int -report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) +report_free_dnode_range(dmu_diffarg_t *da, uint64_t first, uint64_t last) { ASSERT(first <= last); if 
(da->da_ddr.ddr_type != DDR_FREE || @@ -82,7 +86,7 @@ report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) } static int -report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) +report_dnode(dmu_diffarg_t *da, uint64_t object, dnode_phys_t *dnp) { ASSERT(dnp != NULL); if (dnp->dn_type == DMU_OT_NONE) @@ -109,13 +113,14 @@ static int diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - struct diffarg *da = arg; + dmu_diffarg_t *da = arg; int err = 0; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); - if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) + if (zb->zb_level == ZB_DNODE_LEVEL || + zb->zb_object != DMU_META_DNODE_OBJECT) return (0); if (BP_IS_HOLE(bp)) { @@ -130,7 +135,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dnode_phys_t *blk; arc_buf_t *abuf; arc_flags_t aflags = ARC_FLAG_WAIT; - int blksz = BP_GET_LSIZE(bp); + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; int zio_flags = ZIO_FLAG_CANFAIL; int i; @@ -142,7 +147,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (SET_ERROR(EIO)); blk = abuf->b_data; - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = report_dnode(da, dnobj, blk+i); @@ -160,9 +165,9 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int dmu_diff(const char *tosnap_name, const char *fromsnap_name, - struct vnode *vp, offset_t *offp) + zfs_file_t *fp, offset_t *offp) { - struct diffarg da; + dmu_diffarg_t da; dsl_dataset_t *fromsnap; dsl_dataset_t *tosnap; dsl_pool_t *dp; @@ -203,7 +208,7 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name, dsl_dataset_long_hold(tosnap, FTAG); dsl_pool_rele(dp, FTAG); - da.da_vp = vp; + da.da_fp = fp; da.da_offp = offp; da.da_ddr.ddr_type = DDR_NONE; da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; 
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index ec78ebbdcb..12cdbd68b1 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -57,10 +58,8 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; int error; - kpreempt_disable(); - cpuobj = &os->os_obj_next_percpu[CPU_SEQID % + cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE % os->os_obj_next_percpu_len]; - kpreempt_enable(); if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; @@ -504,7 +503,6 @@ dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) VERIFY0(dmu_object_free(mos, object, tx)); } -#if defined(_KERNEL) EXPORT_SYMBOL(dmu_object_alloc); EXPORT_SYMBOL(dmu_object_alloc_ibs); EXPORT_SYMBOL(dmu_object_alloc_dnsize); @@ -520,8 +518,6 @@ EXPORT_SYMBOL(dmu_object_zapify); EXPORT_SYMBOL(dmu_object_free_zapified); /* BEGIN CSTYLED */ -module_param(dmu_object_alloc_chunk_shift, int, 0644); -MODULE_PARM_DESC(dmu_object_alloc_chunk_shift, +ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW, "CPU-specific allocator grabs 2^N objects at once"); /* END CSTYLED */ -#endif diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f95915b9e2..af107fb8ad 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -30,11 +30,12 @@ * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2018, loli10K . All rights reserved. 
+ * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ /* Portions Copyright 2010 Robert Milkowski */ -#include #include #include #include @@ -193,8 +194,10 @@ compression_changed_cb(void *arg, uint64_t newval) */ ASSERT(newval != ZIO_COMPRESS_INHERIT); - os->os_compress = zio_compress_select(os->os_spa, newval, - ZIO_COMPRESS_ON); + os->os_compress = zio_compress_select(os->os_spa, + ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON); + os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress, + ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT); } static void @@ -323,7 +326,7 @@ smallblk_changed_cb(void *arg, uint64_t newval) /* * Inheritance and range checking should have been done by now. */ - ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE); + ASSERT(newval <= SPA_MAXBLOCKSIZE); ASSERT(ISP2(newval)); os->os_zpl_special_smallblock = newval; @@ -392,11 +395,19 @@ dnode_hash(const objset_t *os, uint64_t obj) return (crc); } -unsigned int +static unsigned int dnode_multilist_index_func(multilist_t *ml, void *obj) { dnode_t *dn = obj; - return (dnode_hash(dn->dn_objset, dn->dn_object) % + + /* + * The low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. In this context full 64bit + * division would be a waste of time, so limit it to 32 bits. + */ + return ((unsigned int)dnode_hash(dn->dn_objset, dn->dn_object) % multilist_get_num_sublists(ml)); } @@ -412,6 +423,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + ASSERT(!BP_IS_REDACTED(bp)); + + /* + * We need the pool config lock to get properties. 
+ */ + ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool)); /* * The $ORIGIN dataset (if it exists) doesn't have an associated @@ -502,20 +519,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. */ if (ds != NULL) { - boolean_t needlock = B_FALSE; - os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0); - /* - * Note: it's valid to open the objset if the dataset is - * long-held, in which case the pool_config lock will not - * be held. - */ - if (!dsl_pool_config_held(dmu_objset_pool(os))) { - needlock = B_TRUE; - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - } - err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -578,8 +583,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, smallblk_changed_cb, os); } } - if (needlock) - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); @@ -589,6 +592,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_ON; + os->os_complevel = ZIO_COMPLEVEL_DEFAULT; os->os_encrypted = B_FALSE; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; @@ -605,7 +609,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { - os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), + multilist_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i]), dnode_multilist_index_func); } @@ -649,11 +653,11 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) int err = 0; /* - * We shouldn't be doing anything with dsl_dataset_t's unless the - * pool_config lock is held, or the dataset is long-held. 
+ * We need the pool_config lock to manipulate the dsl_dataset_t. + * Even if the dataset is long-held, we need the pool_config lock + * to open the objset, as it needs to get properties. */ - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || - dsl_dataset_long_held(ds)); + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { @@ -686,8 +690,9 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); @@ -759,8 +764,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); @@ -782,11 +788,15 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, * speed up pool import times and to keep this txg reserved * completely for recovery work. 
*/ - if ((dmu_objset_userobjspace_upgradable(*osp) || - dmu_objset_projectquota_upgradable(*osp)) && - !readonly && !dp->dp_spa->spa_claiming && - (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) - dmu_objset_id_quota_upgrade(*osp); + if (!readonly && !dp->dp_spa->spa_claiming && + (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) { + if (dmu_objset_userobjspace_upgradable(*osp) || + dmu_objset_projectquota_upgradable(*osp)) { + dmu_objset_id_quota_upgrade(*osp); + } else if (dmu_objset_userused_enabled(*osp)) { + dmu_objset_userspace_upgrade(*osp); + } + } dsl_pool_rele(dp, FTAG); return (0); @@ -798,8 +808,9 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, { dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds); if (err != 0) return (err); @@ -816,9 +827,10 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, void dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) { - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; - + ds_hold_flags_t flags; dsl_pool_t *dp = dmu_objset_pool(os); + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag); dsl_pool_rele(dp, tag); } @@ -846,7 +858,9 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); @@ -854,21 +868,22 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, dsl_dataset_name(ds, name); dp = ds->ds_dir->dd_pool; dsl_pool_config_enter(dp, FTAG); - dsl_dataset_disown(ds, decrypt, tag); - VERIFY0(dsl_dataset_own(dp, name, - (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : 0, tag, newds)); + dsl_dataset_disown(ds, flags, tag); + VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds)); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) { + ds_hold_flags_t flags; + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; /* * Stop upgrading thread */ dmu_objset_upgrade_stop(os); - dsl_dataset_disown(os->os_dsl_dataset, - (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag); + dsl_dataset_disown(os->os_dsl_dataset, flags, tag); } void @@ -988,9 +1003,8 @@ dmu_objset_evict_done(objset_t *os) mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); mutex_destroy(&os->os_upgrade_lock); - for (int i = 0; i < TXG_SIZE; i++) { - multilist_destroy(os->os_dirty_dnodes[i]); - } + for (int i = 0; i < TXG_SIZE; i++) + multilist_destroy(&os->os_dirty_dnodes[i]); spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -1027,7 +1041,7 @@ dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* * We don't want to have to increase the meta-dnode's nlevels - * later, because then we could do it in quescing context while + * later, because then we could do it in quiescing context while * we are also accessing it in open context. 
* * This precaution is not necessary for the MOS (ds == NULL), @@ -1104,6 +1118,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; + proc_t *doca_proc; void (*doca_userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *doca_userarg; @@ -1148,7 +1163,7 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx) } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, - doca->doca_cred); + doca->doca_cred, doca->doca_proc); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); @@ -1234,7 +1249,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) } VERIFY0(zio_wait(rzio)); - dmu_objset_do_userquota_updates(os, tx); + dmu_objset_sync_done(os, tx); taskq_wait(dp->dp_sync_taskq); if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { ASSERT3P(ds->ds_key_mapping, !=, NULL); @@ -1261,8 +1276,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) mutex_exit(&ds->ds_lock); } - spa_history_log_internal_ds(ds, "create", tx, ""); - zvol_create_minors(spa, doca->doca_name, B_TRUE); + spa_history_log_internal_ds(ds, "create", tx, " "); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_dir_rele(pdd, FTAG); @@ -1277,6 +1291,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, doca.doca_name = name; doca.doca_cred = CRED(); + doca.doca_proc = curproc; doca.doca_flags = flags; doca.doca_userfunc = func; doca.doca_userarg = arg; @@ -1292,15 +1307,20 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, */ doca.doca_dcp = (dcp != NULL) ? 
dcp : &tmp_dcp; - return (dsl_sync_task(name, + int rv = dsl_sync_task(name, dmu_objset_create_check, dmu_objset_create_sync, &doca, - 6, ZFS_SPACE_CHECK_NORMAL)); + 6, ZFS_SPACE_CHECK_NORMAL); + + if (rv == 0) + zvol_create_minor(name); + return (rv); } typedef struct dmu_objset_clone_arg { const char *doca_clone; const char *doca_origin; cred_t *doca_cred; + proc_t *doca_proc; } dmu_objset_clone_arg_t; /*ARGSUSED*/ @@ -1329,7 +1349,7 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, - doca->doca_cred); + doca->doca_cred, doca->doca_proc); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EDQUOT)); @@ -1348,13 +1368,6 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EINVAL)); } - error = dmu_objset_clone_crypt_check(pdd, origin->ds_dir); - if (error != 0) { - dsl_dataset_rele(origin, FTAG); - dsl_dir_rele(pdd, FTAG); - return (error); - } - dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); @@ -1381,8 +1394,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); dsl_dataset_name(origin, namebuf); spa_history_log_internal_ds(ds, "clone", tx, - "origin=%s (%llu)", namebuf, origin->ds_object); - zvol_create_minors(dp->dp_spa, doca->doca_clone, B_TRUE); + "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object); dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); @@ -1396,105 +1408,16 @@ dmu_objset_clone(const char *clone, const char *origin) doca.doca_clone = clone; doca.doca_origin = origin; doca.doca_cred = CRED(); + doca.doca_proc = curproc; - return (dsl_sync_task(clone, + int rv = dsl_sync_task(clone, dmu_objset_clone_check, dmu_objset_clone_sync, &doca, - 6, ZFS_SPACE_CHECK_NORMAL)); -} + 6, ZFS_SPACE_CHECK_NORMAL); -static int -dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg) -{ - int error = 0; - uint64_t object = 0; - while ((error 
= dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - error = dmu_object_remap_indirects(os, object, - last_removed_txg); - /* - * If the ZPL removed the object before we managed to dnode_hold - * it, we would get an ENOENT. If the ZPL declares its intent - * to remove the object (dnode_free) before we manage to - * dnode_hold it, we would get an EEXIST. In either case, we - * want to continue remapping the other objects in the objset; - * in all other cases, we want to break early. - */ - if (error != 0 && error != ENOENT && error != EEXIST) { - break; - } - } - if (error == ESRCH) { - error = 0; - } - return (error); -} + if (rv == 0) + zvol_create_minor(clone); -int -dmu_objset_remap_indirects(const char *fsname) -{ - int error = 0; - objset_t *os = NULL; - uint64_t last_removed_txg; - uint64_t remap_start_txg; - dsl_dir_t *dd; - - error = dmu_objset_hold(fsname, FTAG, &os); - if (error != 0) { - return (error); - } - dd = dmu_objset_ds(os)->ds_dir; - - if (!spa_feature_is_enabled(dmu_objset_spa(os), - SPA_FEATURE_OBSOLETE_COUNTS)) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * If there has not been a removal, we're done. - */ - last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os)); - if (last_removed_txg == -1ULL) { - dmu_objset_rele(os, FTAG); - return (0); - } - - /* - * If we have remapped since the last removal, we're done. 
- */ - if (dsl_dir_is_zapified(dd)) { - uint64_t last_remap_txg; - if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)), - dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (last_remap_txg), 1, &last_remap_txg) == 0 && - last_remap_txg > last_removed_txg) { - dmu_objset_rele(os, FTAG); - return (0); - } - } - - dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); - dsl_pool_rele(dmu_objset_pool(os), FTAG); - - remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os)); - error = dmu_objset_remap_indirects_impl(os, last_removed_txg); - if (error == 0) { - /* - * We update the last_remap_txg to be the start txg so that - * we can guarantee that every block older than last_remap_txg - * that can be remapped has been remapped. - */ - error = dsl_dir_update_last_remap_txg(dd, remap_start_txg); - } - - dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); - dsl_dataset_rele(dmu_objset_ds(os), FTAG); - - return (error); + return (rv); } int @@ -1505,7 +1428,7 @@ dmu_objset_snapshot_one(const char *fsname, const char *snapname) nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); - strfree(longsnap); + kmem_strfree(longsnap); err = dsl_dataset_snapshot(snaps, NULL, NULL); fnvlist_free(snaps); return (err); @@ -1519,10 +1442,15 @@ dmu_objset_upgrade_task_cb(void *data) mutex_enter(&os->os_upgrade_lock); os->os_upgrade_status = EINTR; if (!os->os_upgrade_exit) { + int status; + mutex_exit(&os->os_upgrade_lock); - os->os_upgrade_status = os->os_upgrade_cb(os); + status = os->os_upgrade_cb(os); + mutex_enter(&os->os_upgrade_lock); + + os->os_upgrade_status = status; } os->os_upgrade_exit = B_TRUE; os->os_upgrade_id = 0; @@ -1550,6 +1478,8 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb) dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); os->os_upgrade_status = ENOMEM; } + } else { + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } mutex_exit(&os->os_upgrade_lock); } @@ -1584,7 +1514,7 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t 
*tx) ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the - * meta-dnode needs to set it ouside dnode_sync(). + * meta-dnode needs to set it outside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); @@ -1593,23 +1523,13 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) multilist_sublist_remove(list, dn); /* - * If we are not doing useraccounting (os_synced_dnodes == NULL) - * we are done with this dnode for this txg. Unset dn_dirty_txg - * if later txgs aren't dirtying it so that future holders do - * not get a stale value. Otherwise, we will do this in - * userquota_updates_task() when processing has completely - * finished for this txg. + * See the comment above dnode_rele_task() for an explanation + * of why this dnode hold is always needed (even when not + * doing user accounting). */ - multilist_t *newlist = dn->dn_objset->os_synced_dnodes; - if (newlist != NULL) { - (void) dnode_add_ref(dn, newlist); - multilist_insert(newlist, dn); - } else { - mutex_enter(&dn->dn_mtx); - if (dn->dn_dirty_txg == tx->tx_txg) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - } + multilist_t *newlist = &dn->dn_objset->os_synced_dnodes; + (void) dnode_add_ref(dn, newlist); + multilist_insert(newlist, dn); dnode_sync(dn, tx); } @@ -1699,10 +1619,12 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; + int num_sublists; + multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); *blkptr_copy = *os->os_rootbp; - dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", (u_longlong_t)tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); /* XXX the write_done callback should really give us the tx... 
*/ @@ -1769,28 +1691,27 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) txgoff = tx->tx_txg & TXG_MASK; - if (dmu_objset_userused_enabled(os) && - (!os->os_encrypted || !dmu_objset_is_receiving(os))) { - /* - * We must create the list here because it uses the - * dn_dirty_link[] of this txg. But it may already - * exist because we call dsl_dataset_sync() twice per txg. - */ - if (os->os_synced_dnodes == NULL) { - os->os_synced_dnodes = - multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[txgoff]), - dnode_multilist_index_func); - } else { - ASSERT3U(os->os_synced_dnodes->ml_offset, ==, - offsetof(dnode_t, dn_dirty_link[txgoff])); - } + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. But it may already + * exist because we call dsl_dataset_sync() twice per txg. + */ + if (os->os_synced_dnodes.ml_sublists == NULL) { + multilist_create(&os->os_synced_dnodes, sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff]), + dnode_multilist_index_func); + } else { + ASSERT3U(os->os_synced_dnodes.ml_offset, ==, + offsetof(dnode_t, dn_dirty_link[txgoff])); } - for (int i = 0; - i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) { + ml = &os->os_dirty_dnodes[txgoff]; + num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = os->os_dirty_dnodes[txgoff]; + sda->sda_list = ml; sda->sda_sublist_idx = i; sda->sda_tx = tx; (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, @@ -1803,8 +1724,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) while ((dr = list_head(list)) != NULL) { ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); - if (dr->dr_zio) - zio_nowait(dr->dr_zio); + zio_nowait(dr->dr_zio); } /* Enable dnode backfill if enough objects have been freed. 
*/ @@ -1824,22 +1744,32 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { - return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); + return (!multilist_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK])); } -static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; +static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES]; void -dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb) { - used_cbs[ost] = cb; + file_cbs[ost] = cb; +} + +int +dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zfi) +{ + file_info_cb_t *cb = file_cbs[os->os_phys->os_type]; + if (cb == NULL) + return (EINVAL); + return (cb(bonustype, data, zfi)); } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] != NULL && + file_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } @@ -1853,7 +1783,7 @@ dmu_objset_userobjused_enabled(objset_t *os) boolean_t dmu_objset_projectquota_enabled(objset_t *os) { - return (used_cbs[os->os_phys->os_type] != NULL && + return (file_cbs[os->os_phys->os_type] != NULL && DMU_PROJECTUSED_DNODE(os) != NULL && spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA)); } @@ -1884,7 +1814,7 @@ userquota_compare(const void *l, const void *r) */ rv = strcmp(luqn->uqn_id, ruqn->uqn_id); - return (AVL_ISIGN(rv)); + return (TREE_ISIGN(rv)); } static void @@ -1969,14 +1899,15 @@ do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used, if (subtract) delta = -delta; - (void) sprintf(name, "%llx", (longlong_t)user); + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)user); userquota_update_cache(&cache->uqc_user_deltas, name, delta); - (void) sprintf(name, "%llx", (longlong_t)group); + (void) snprintf(name, sizeof (name), "%llx", 
(longlong_t)group); userquota_update_cache(&cache->uqc_group_deltas, name, delta); if (dmu_objset_projectquota_enabled(os)) { - (void) sprintf(name, "%llx", (longlong_t)project); + (void) snprintf(name, sizeof (name), "%llx", + (longlong_t)project); userquota_update_cache(&cache->uqc_project_deltas, name, delta); } @@ -2024,7 +1955,7 @@ userquota_updates_task(void *arg) userquota_cache_t cache = { { 0 } }; multilist_sublist_t *list = - multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2078,23 +2009,54 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); - if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) - dn->dn_dirty_txg = 0; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); - dnode_rele(dn, os->os_synced_dnodes); + dnode_rele(dn, &os->os_synced_dnodes); } do_userquota_cacheflush(os, &cache, tx); multilist_sublist_unlock(list); kmem_free(uua, sizeof (*uua)); } -void -dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +/* + * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being + * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be + * evicted because the block containing the dnode can't be evicted until it is + * written out. However, this hold is necessary to prevent the dnode_t from + * being moved (via dnode_move()) while it's still referenced by + * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for + * dirty_lightweight_leaf-type dirty records. + * + * If we are doing user-object accounting, the dnode_rele() happens from + * userquota_updates_task() instead. 
+ */ +static void +dnode_rele_task(void *arg) +{ + userquota_updates_arg_t *uua = arg; + objset_t *os = uua->uua_os; + + multilist_sublist_t *list = + multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + + dnode_t *dn; + while ((dn = multilist_sublist_head(list)) != NULL) { + multilist_sublist_remove(list, dn); + dnode_rele(dn, &os->os_synced_dnodes); + } + multilist_sublist_unlock(list); + kmem_free(uua, sizeof (*uua)); +} + +/* + * Return TRUE if userquota updates are needed. + */ +static boolean_t +dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx) { if (!dmu_objset_userused_enabled(os)) - return; + return (B_FALSE); /* * If this is a raw receive just return and handle accounting @@ -2104,10 +2066,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) * used for recovery. */ if (os->os_encrypted && dmu_objset_is_receiving(os)) - return; + return (B_FALSE); if (tx->tx_txg <= os->os_spa->spa_claim_max_txg) - return; + return (B_FALSE); /* Allocate the user/group/project used objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { @@ -2124,21 +2086,39 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } + return (B_TRUE); +} - for (int i = 0; - i < multilist_get_num_sublists(os->os_synced_dnodes); i++) { +/* + * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and + * also release the holds on the dnodes from dmu_objset_sync_dnodes(). + * The caller must taskq_wait(dp_sync_taskq). 
+ */ +void +dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx) +{ + boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx); + + int num_sublists = multilist_get_num_sublists(&os->os_synced_dnodes); + for (int i = 0; i < num_sublists; i++) { userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; uua->uua_sublist_idx = i; uua->uua_tx = tx; - /* note: caller does taskq_wait() */ + + /* + * If we don't need to update userquotas, use + * dnode_rele_task() to call dnode_rele() + */ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - userquota_updates_task, uua, 0); + need_userquota ? userquota_updates_task : dnode_rele_task, + uua, 0); /* callback frees uua */ } } + /* * Returns a pointer to data to find uid/gid from * @@ -2149,31 +2129,22 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dbuf_dirty_record_t *dr, **drp; + dbuf_dirty_record_t *dr; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg == tx->tx_txg) - break; + dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr == NULL) { data = NULL; } else { - dnode_t *dn; - - DB_DNODE_ENTER(dr->dr_dbuf); - dn = DB_DNODE(dr->dr_dbuf); - - if (dn->dn_bonuslen == 0 && + if (dr->dr_dnode->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; - - DB_DNODE_EXIT(dr->dr_dbuf); } return (data); @@ -2185,9 +2156,6 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; - uint64_t *user = NULL; - uint64_t *group = NULL; - uint64_t *project = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; @@ -2241,23 +2209,23 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t 
before, dmu_tx_t *tx) return; } - if (before) { - ASSERT(data); - user = &dn->dn_olduid; - group = &dn->dn_oldgid; - project = &dn->dn_oldprojid; - } else if (data) { - user = &dn->dn_newuid; - group = &dn->dn_newgid; - project = &dn->dn_newprojid; - } - /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ - error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, - user, group, project); + zfs_file_info_t zfi; + error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi); + + if (before) { + ASSERT(data); + dn->dn_olduid = zfi.zfi_user; + dn->dn_oldgid = zfi.zfi_group; + dn->dn_oldprojid = zfi.zfi_project; + } else if (data) { + dn->dn_newuid = zfi.zfi_user; + dn->dn_newgid = zfi.zfi_group; + dn->dn_newprojid = zfi.zfi_project; + } /* * Preserve existing uid/gid when the callback can't determine @@ -2366,8 +2334,8 @@ dmu_objset_space_upgrade(objset_t *os) return (0); } -int -dmu_objset_userspace_upgrade(objset_t *os) +static int +dmu_objset_userspace_upgrade_cb(objset_t *os) { int err = 0; @@ -2387,6 +2355,12 @@ dmu_objset_userspace_upgrade(objset_t *os) return (0); } +void +dmu_objset_userspace_upgrade(objset_t *os) +{ + dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb); +} + static int dmu_objset_id_quota_upgrade_cb(objset_t *os) { @@ -2397,14 +2371,15 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os) return (0); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); - if (!dmu_objset_userobjused_enabled(os)) + if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (!dmu_objset_projectquota_enabled(os) && dmu_objset_userobjspace_present(os)) return (SET_ERROR(ENOTSUP)); - dmu_objset_ds(os)->ds_feature_activation[ - SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; + if (dmu_objset_userobjused_enabled(os)) + dmu_objset_ds(os)->ds_feature_activation[ + SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; if (dmu_objset_projectquota_enabled(os)) 
dmu_objset_ds(os)->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; @@ -2413,7 +2388,9 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os) if (err) return (err); - os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + if (dmu_objset_userobjused_enabled(os)) + os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; if (dmu_objset_projectquota_enabled(os)) os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; @@ -2494,7 +2471,7 @@ dmu_objset_is_snapshot(objset_t *os) } int -dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, +dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; @@ -2535,7 +2512,7 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, return (SET_ERROR(ENAMETOOLONG)); } - (void) strcpy(name, attr.za_name); + (void) strlcpy(name, attr.za_name, namelen); if (idp) *idp = attr.za_first_integer; if (case_conflict) @@ -2580,7 +2557,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, return (SET_ERROR(ENAMETOOLONG)); } - (void) strcpy(name, attr.za_name); + (void) strlcpy(name, attr.za_name, namelen); if (idp) *idp = attr.za_first_integer; zap_cursor_advance(&cursor); @@ -2740,7 +2717,7 @@ dmu_objset_find_dp_cb(void *arg) /* * We need to get a pool_config_lock here, as there are several - * asssert(pool_config_held) down the stack. Getting a lock via + * assert(pool_config_held) down the stack. Getting a lock via * dsl_pool_config_enter is risky, as it might be stalled by a * pending writer. This would deadlock, as the write lock can * only be granted when our parent thread gives up the lock. 
@@ -2887,7 +2864,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name, err = dmu_objset_find_impl(spa, child, func, arg, flags); dsl_pool_config_enter(dp, FTAG); - strfree(child); + kmem_strfree(child); if (err != 0) break; } @@ -2925,7 +2902,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name, dsl_pool_config_exit(dp, FTAG); err = func(child, arg); dsl_pool_config_enter(dp, FTAG); - strfree(child); + kmem_strfree(child); if (err != 0) break; } @@ -2948,7 +2925,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name, * See comment above dmu_objset_find_impl(). */ int -dmu_objset_find(char *name, int func(const char *, void *), void *arg, +dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags) { spa_t *spa; @@ -3000,9 +2977,17 @@ dmu_fsname(const char *snapname, char *buf) } /* - * Call when we think we're going to write/free space in open context to track - * the amount of dirty data in the open txg, which is also the amount - * of memory that can not be evicted until this txg syncs. + * Call when we think we're going to write/free space in open context + * to track the amount of dirty data in the open txg, which is also the + * amount of memory that can not be evicted until this txg syncs. + * + * Note that there are two conditions where this can be called from + * syncing context: + * + * [1] When we just created the dataset, in which case we go on with + * updating any accounting of dirty data as usual. + * [2] When we are dirtying MOS data, in which case we only update the + * pool's accounting of dirty data. 
*/ void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) @@ -3012,8 +2997,9 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) if (ds != NULL) { dsl_dir_willuse_space(ds->ds_dir, aspace, tx); - dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } + + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } #if defined(_KERNEL) @@ -3049,7 +3035,7 @@ EXPORT_SYMBOL(dmu_objset_create_impl); EXPORT_SYMBOL(dmu_objset_open_impl); EXPORT_SYMBOL(dmu_objset_evict); EXPORT_SYMBOL(dmu_objset_register_type); -EXPORT_SYMBOL(dmu_objset_do_userquota_updates); +EXPORT_SYMBOL(dmu_objset_sync_done); EXPORT_SYMBOL(dmu_objset_userquota_get_ids); EXPORT_SYMBOL(dmu_objset_userused_enabled); EXPORT_SYMBOL(dmu_objset_userspace_upgrade); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 976b1bd464..0ec46bdb4f 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -21,16 +21,18 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. - * Copyright 2016 RackTop Systems. - * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude */ #include #include +#include +#include #include #include #include @@ -42,38 +44,301 @@ #include #include #include -#include #include #include +#include #include #include #include #include #include #include -#include #include #include #include #include #include -#include -#include +#include +#ifdef _KERNEL +#include +#endif +#include int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; +int zfs_recv_queue_ff = 20; +int zfs_recv_write_batch_size = 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; -static void byteswap_record(dmu_replay_record_t *drr); +static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, + void *buf); + +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a WRITE or SPILL, pointer to the abd containing the + * payload. + */ + abd_t *abd; + int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { + objset_t *os; + boolean_t byteswap; + bqueue_t q; + + /* + * These three members are used to signal to the main thread when + * we're done. 
+ */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + + int err; + boolean_t resumable; + boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ + boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ + boolean_t full; /* this is a full send stream */ + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ + uint64_t bytes_read; /* bytes read when current record created */ + + list_t write_batch; + + /* Encryption parameters for the last received DRR_OBJECT_RANGE */ + boolean_t or_crypt_params_present; + uint64_t or_firstobj; + uint64_t or_numslots; + uint8_t or_salt[ZIO_DATA_SALT_LEN]; + uint8_t or_iv[ZIO_DATA_IV_LEN]; + uint8_t or_mac[ZIO_DATA_MAC_LEN]; + boolean_t or_byteorder; +}; typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; + proc_t *drba_proc; dsl_crypto_params_t *drba_dcp; } dmu_recv_begin_arg_t; +static void +byteswap_record(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_versioninfo); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + DO32(drr_object.drr_raw_bonuslen); + DO64(drr_object.drr_toguid); + DO64(drr_object.drr_maxblkid); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + 
DO64(drr_write.drr_offset); + DO64(drr_write.drr_logical_size); + DO64(drr_write.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); + DO64(drr_write.drr_key.ddk_prop); + DO64(drr_write.drr_compressed_size); + break; + case DRR_WRITE_EMBEDDED: + DO64(drr_write_embedded.drr_object); + DO64(drr_write_embedded.drr_offset); + DO64(drr_write_embedded.drr_length); + DO64(drr_write_embedded.drr_toguid); + DO32(drr_write_embedded.drr_lsize); + DO32(drr_write_embedded.drr_psize); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); + break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); + DO64(drr_spill.drr_compressed_size); + DO32(drr_spill.drr_type); + break; + case DRR_OBJECT_RANGE: + DO64(drr_object_range.drr_firstobj); + DO64(drr_object_range.drr_numslots); + DO64(drr_object_range.drr_toguid); + break; + case DRR_REDACT: + DO64(drr_redact.drr_object); + DO64(drr_redact.drr_offset); + DO64(drr_redact.drr_length); + DO64(drr_redact.drr_toguid); + break; + case DRR_END: + DO64(drr_end.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); + break; + default: + break; + } + + if (drr->drr_type != DRR_BEGIN) { + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); + } + +#undef DO64 +#undef DO32 +} + +static boolean_t +redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Check that the new stream we're trying to receive is redacted with respect to + * a subset of the snapshots that the origin was redacted with respect to. For + * the reasons behind this, see the man page on redacted zfs sends and receives. 
+ */ +static boolean_t +compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps, + uint64_t *redact_snaps, uint64_t num_redact_snaps) +{ + /* + * Short circuit the comparison; if we are redacted with respect to + * more snapshots than the origin, we can't be redacted with respect + * to a subset. + */ + if (num_redact_snaps > origin_num_snaps) { + return (B_FALSE); + } + + for (int i = 0; i < num_redact_snaps; i++) { + if (!redact_snaps_contains(origin_snaps, origin_num_snaps, + redact_snaps[i])) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +static boolean_t +redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) +{ + uint64_t *origin_snaps; + uint64_t origin_num_snaps; + dmu_recv_cookie_t *drc = drba->drba_cookie; + struct drr_begin *drrb = drc->drc_drrb; + int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + int err = 0; + boolean_t ret = B_TRUE; + uint64_t *redact_snaps; + uint_t numredactsnaps; + + /* + * If this is a full send stream, we're safe no matter what. + */ + if (drrb->drr_fromguid == 0) + return (ret); + + VERIFY(dsl_dataset_get_uint64_array_feature(origin, + SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps)); + + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == + 0) { + /* + * If the send stream was sent from the redaction bookmark or + * the redacted version of the dataset, then we're safe. Verify + * that this is from the a compatible redaction bookmark or + * redacted dataset. + */ + if (!compatible_redact_snaps(origin_snaps, origin_num_snaps, + redact_snaps, numredactsnaps)) { + err = EINVAL; + } + } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { + /* + * If the stream is redacted, it must be redacted with respect + * to a subset of what the origin is redacted with respect to. + * See case number 2 in the zfs man page section on redacted zfs + * send. 
+ */ + err = nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps); + + if (err != 0 || !compatible_redact_snaps(origin_snaps, + origin_num_snaps, redact_snaps, numredactsnaps)) { + err = EINVAL; + } + } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps, + drrb->drr_toguid)) { + /* + * If the stream isn't redacted but the origin is, this must be + * one of the snapshots the origin is redacted with respect to. + * See case number 1 in the zfs man page section on redacted zfs + * send. + */ + err = EINVAL; + } + + if (err != 0) + ret = B_FALSE; + return (ret); +} + +/* + * If we previously received a stream with --large-block, we don't support + * receiving an incremental on top of it without --large-block. This avoids + * forcing a read-modify-write or trying to re-aggregate a string of WRITE + * records. + */ +static int +recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) +{ + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && + !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); + return (0); +} + static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) @@ -86,21 +351,25 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; - /* temporary clone name must not exist */ + /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &val); if (error != ENOENT) - return (error == 0 ? EBUSY : error); + return (error == 0 ? SET_ERROR(EBUSY) : error); - /* new snapshot name must not exist */ + /* Resume state must not be set. 
*/ + if (dsl_dataset_has_resume_receive_state(ds)) + return (SET_ERROR(EBUSY)); + + /* New snapshot name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &val); if (error != ENOENT) - return (error == 0 ? EEXIST : error); + return (error == 0 ? SET_ERROR(EEXIST) : error); - /* must not have children if receiving a ZVOL */ + /* Must not have children if receiving a ZVOL. */ error = zap_count(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); if (error != 0) @@ -119,7 +388,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, - NULL, drba->drba_cred); + NULL, drba->drba_cred, drba->drba_proc); if (error != 0) return (error); @@ -127,7 +396,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_t *snap; uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - /* Can't raw receive on top of an unencrypted dataset */ + /* Can't perform a raw receive on top of a non-raw receive */ if (!encrypted && raw) return (SET_ERROR(EINVAL)); @@ -158,9 +427,16 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, } else { /* * If we are not forcing, there must be no - * changes since fromsnap. + * changes since fromsnap. Raw sends have an + * additional constraint that requires that + * no "noop" snapshots exist between fromsnap + * and tosnap for the IVset checking code to + * work properly. 
*/ - if (dsl_dataset_modified_since_snap(ds, snap)) { + if (dsl_dataset_modified_since_snap(ds, snap) || + (raw && + dsl_dataset_phys(ds)->ds_prev_snap_obj != + snap->ds_object)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } @@ -168,6 +444,19 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, ds->ds_prev->ds_object; } + if (dsl_dataset_feature_is_active(snap, + SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba, + snap)) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = recv_check_large_blocks(snap, featureflags); + if (error != 0) { + dsl_dataset_rele(snap, FTAG); + return (error); + } + dsl_dataset_rele(snap, FTAG); } else { /* if full, then must be forced */ @@ -199,12 +488,67 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (will_encrypt && embed) return (SET_ERROR(EINVAL)); } - - drba->drba_cookie->drc_fromsnapobj = 0; } return (0); +} +/* + * Check that any feature flags used in the data stream we're receiving are + * supported by the pool we are receiving into. + * + * Note that some of the features we explicitly check here have additional + * (implicit) features they depend on, but those dependencies are enforced + * through the zfeature_register() calls declaring the features that we + * explicitly check. + */ +static int +recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) +{ + /* + * Check if there are any unsupported feature flags. 
+ */ + if (!DMU_STREAM_SUPPORTED(featureflags)) { + return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE)); + } + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks, + * and large_dnodes in the stream can only be used if those pool + * features are enabled because we don't attempt to decompress / + * un-embed / un-mooch / split up the blocks / dnodes during the + * receive process. + */ + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) && + !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); + + /* + * Receiving redacted streams requires that redacted datasets are + * enabled. 
+ */ + if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) && + !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS)) + return (SET_ERROR(ENOTSUP)); + + return (0); } static int @@ -215,9 +559,9 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; @@ -231,41 +575,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) return (SET_ERROR(EINVAL)); - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return (SET_ERROR(ENOTSUP)); + error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa); + if (error != 0) + return (error); + /* Resumable receives require extensible datasets */ if (drba->drba_cookie->drc_resumable && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) return (SET_ERROR(ENOTSUP)); - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 
- */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. - */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); - if (featureflags & DMU_BACKUP_FEATURE_RAW) { /* raw receives require the encryption feature */ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) @@ -304,7 +622,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || + if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) || drba->drba_origin)) return (SET_ERROR(ENOENT)); @@ -313,14 +631,14 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) * contain all the necessary free records and freeobject * records, reject it. 
*/ - if (fromguid == 0 && drba->drba_origin && + if (fromguid == 0 && drba->drba_origin != NULL && !(flags & DRR_FLAG_FREERECORDS)) return (SET_ERROR(EINVAL)); /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); - error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds); + error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); @@ -338,13 +656,13 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) error = dmu_objset_create_crypt_check(ds->ds_dir, drba->drba_dcp, &will_encrypt); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } if (will_encrypt && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } } @@ -355,61 +673,86 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) * filesystems and increment those counts during begin_sync). */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); + ZFS_PROP_FILESYSTEM_LIMIT, NULL, + drba->drba_cred, drba->drba_proc); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); + ZFS_PROP_SNAPSHOT_LIMIT, NULL, + drba->drba_cred, drba->drba_proc); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } /* can't recv below anything but filesystems (eg. 
no ZVOLs) */ error = dmu_objset_from_ds(ds, &os); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } if (dmu_objset_type(os) != DMU_OST_ZFS) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; - error = dsl_dataset_hold_flags(dp, drba->drba_origin, dsflags, FTAG, &origin); if (error != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } if (!origin->ds_is_snapshot) { dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid && fromguid != 0) { dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } + if (origin->ds_dir->dd_crypto_obj != 0 && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { dsl_dataset_rele_flags(origin, dsflags, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } - dsl_dataset_rele_flags(origin, - dsflags, FTAG); + + /* + * If the origin is redacted we need to verify that this + * send stream can safely be received on top of the + * origin. 
+ */ + if (dsl_dataset_feature_is_active(origin, + SPA_FEATURE_REDACTED_DATASETS)) { + if (!redact_check(drba, origin)) { + dsl_dataset_rele_flags(origin, dsflags, + FTAG); + dsl_dataset_rele_flags(ds, dsflags, + FTAG); + return (SET_ERROR(EINVAL)); + } + } + + error = recv_check_large_blocks(ds, featureflags); + if (error != 0) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + + dsl_dataset_rele_flags(origin, dsflags, FTAG); } - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); error = 0; } return (error); @@ -421,13 +764,14 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - const char *tofs = drba->drba_cookie->drc_tofs; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dmu_recv_cookie_t *drc = drba->drba_cookie; + struct drr_begin *drrb = drc->drc_drrb; + const char *tofs = drc->drc_tofs; + uint64_t featureflags = drc->drc_featureflags; dsl_dataset_t *ds, *newds; objset_t *os; uint64_t dsobj; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t crflags = 0; dsl_crypto_params_t dummy_dcp = { 0 }; @@ -444,7 +788,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) * the raw cmd set. Raw incremental recvs do not use a dcp * since the encryption parameters are already set in stone. 
*/ - if (dcp == NULL && drba->drba_cookie->drc_fromsnapobj == 0 && + if (dcp == NULL && drrb->drr_fromguid == 0 && drba->drba_origin == NULL) { ASSERT3P(dcp, ==, NULL); dcp = &dummy_dcp; @@ -463,7 +807,6 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } - dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, dcp, tx); if (drba->drba_cookie->drc_fromsnapobj != 0) @@ -488,13 +831,24 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); - drba->drba_cookie->drc_newfs = B_TRUE; + drc->drc_newfs = B_TRUE; + } + VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, + &newds)); + if (dsl_dataset_feature_is_active(newds, + SPA_FEATURE_REDACTED_DATASETS)) { + /* + * If the origin dataset is redacted, the child will be redacted + * when we create it. We clear the new dataset's + * redaction info; if it should be redacted, we'll fill + * in its information later. 
+ */ + dsl_dataset_deactivate_feature(newds, + SPA_FEATURE_REDACTED_DATASETS, tx); } - - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &newds)); VERIFY0(dmu_objset_from_ds(newds, &os)); - if (drba->drba_cookie->drc_resumable) { + if (drc->drc_resumable) { dsl_dataset_zapify(newds, tx); if (drrb->drr_fromguid != 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, @@ -528,6 +882,17 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, 8, 1, &one, tx)); } + + uint64_t *redact_snaps; + uint_t numredactsnaps; + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, + &numredactsnaps) == 0) { + VERIFY0(zap_add(mos, dsobj, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, + sizeof (*redact_snaps), numredactsnaps, + redact_snaps, tx)); + } } /* @@ -540,6 +905,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) drba->drba_cookie->drc_raw = B_TRUE; } + if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { + uint64_t *redact_snaps; + uint_t numredactsnaps; + VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps)); + dsl_dataset_activate_redaction(newds, redact_snaps, + numredactsnaps, tx); + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; @@ -558,68 +932,48 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) rrw_exit(&newds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = newds; + drba->drba_cookie->drc_os = os; - spa_history_log_internal_ds(newds, "receive", tx, ""); + spa_history_log_internal_ds(newds, "receive", tx, " "); } static int dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; + dmu_recv_cookie_t *drc = drba->drba_cookie; dsl_pool_t *dp = dmu_tx_pool(tx); - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + struct drr_begin *drrb = drc->drc_drrb; int error; - ds_hold_flags_t dsflags = 0; - uint64_t featureflags = 
DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; dsl_dataset_t *ds; - const char *tofs = drba->drba_cookie->drc_tofs; + const char *tofs = drc->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); + ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES) return (SET_ERROR(EINVAL)); - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return (SET_ERROR(ENOTSUP)); - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + * This is mostly a sanity check since we should have already done these + * checks during a previous attempt to receive the data. */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. 
- */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); + error = recv_begin_check_feature_flags_impl(drc->drc_featureflags, + dp->dp_spa); + if (error != 0) + return (error); /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); - if (featureflags & DMU_BACKUP_FEATURE_RAW) { + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { /* raw receives require spill block allocation flag */ if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); @@ -683,6 +1037,50 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EINVAL)); } + if (ds->ds_prev != NULL && drrb->drr_fromguid != 0) + drc->drc_fromsnapobj = ds->ds_prev->ds_object; + + /* + * If we're resuming, and the send is redacted, then the original send + * must have been redacted, and must have been redacted with respect to + * the same snapshots. 
+ */ + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) { + uint64_t num_ds_redact_snaps; + uint64_t *ds_redact_snaps; + + uint_t num_stream_redact_snaps; + uint64_t *stream_redact_snaps; + + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_SNAPS, &stream_redact_snaps, + &num_stream_redact_snaps) != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + if (!dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps, + &ds_redact_snaps)) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + for (int i = 0; i < num_ds_redact_snaps; i++) { + if (!redact_snaps_contains(ds_redact_snaps, + num_ds_redact_snaps, stream_redact_snaps[i])) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + } + } + + error = recv_check_large_blocks(ds, drc->drc_featureflags); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } @@ -693,17 +1091,14 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); const char *tofs = drba->drba_cookie->drc_tofs; - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; - objset_t *os; - ds_hold_flags_t dsflags = 0; - uint64_t dsobj; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - (void) snprintf(recvname, sizeof (recvname), "%s/%s", - tofs, recv_clone_name); + (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, + recv_clone_name); if (featureflags & DMU_BACKUP_FEATURE_RAW) { drba->drba_cookie->drc_raw = B_TRUE; @@ -711,33 +1106,25 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) dsflags |= 
DS_HOLD_FLAG_DECRYPT; } - if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { + if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds) + != 0) { /* %recv does not exist; continue in tofs */ - VERIFY0(dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds)); + VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag, + &ds)); drba->drba_cookie->drc_newfs = B_TRUE; } - /* clear the inconsistent flag so that we can own it */ ASSERT(DS_IS_INCONSISTENT(ds)); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; - dsobj = ds->ds_object; - dsl_dataset_rele_flags(ds, dsflags, FTAG); - - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &ds)); - VERIFY0(dmu_objset_from_ds(ds, &os)); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || drba->drba_cookie->drc_raw); rrw_exit(&ds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = ds; + VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); + drba->drba_cookie->drc_should_save = B_TRUE; - spa_history_log_internal_ds(ds, "resume receive", tx, ""); + spa_history_log_internal_ds(ds, "resume receive", tx, " "); } /* @@ -747,9 +1134,11 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) int dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, boolean_t force, boolean_t resumable, nvlist_t *localprops, - nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc) + nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, + zfs_file_t *fp, offset_t *voffp) { dmu_recv_begin_arg_t drba = { 0 }; + int err; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drr_begin = drr_begin; @@ -759,6 +1148,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, drc->drc_force = force; drc->drc_resumable = resumable; drc->drc_cred = CRED(); + drc->drc_proc = curproc; 
drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { @@ -773,20 +1163,46 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, return (SET_ERROR(EINVAL)); } + drc->drc_fp = fp; + drc->drc_voff = *voffp; + drc->drc_featureflags = + DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); + + uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; + void *payload = NULL; + if (payloadlen != 0) + payload = kmem_alloc(payloadlen, KM_SLEEP); + + err = receive_read_payload_and_next_header(drc, payloadlen, + payload); + if (err != 0) { + kmem_free(payload, payloadlen); + return (err); + } + if (payloadlen != 0) { + err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl, + KM_SLEEP); + kmem_free(payload, payloadlen); + if (err != 0) { + kmem_free(drc->drc_next_rrd, + sizeof (*drc->drc_next_rrd)); + return (err); + } + } + if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK) drc->drc_spill = B_TRUE; drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); + drba.drba_proc = curproc; - if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_RESUMING) { - return (dsl_sync_task(tofs, + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = dsl_sync_task(tofs, dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); - } else { - int err; + &drba, 5, ZFS_SPACE_CHECK_NORMAL); + } else { /* * For non-raw, non-incremental, non-resuming receives the @@ -803,143 +1219,25 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, origin == NULL && drc->drc_drrb->drr_fromguid == 0) { err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, localprops, hidden_args, &drba.drba_dcp); - if (err != 0) - return (err); } - err = dsl_sync_task(tofs, - dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL); - dsl_crypto_params_free(drba.drba_dcp, !!err); - - return (err); - } -} - -struct 
receive_record_arg { - dmu_replay_record_t header; - void *payload; /* Pointer to a buffer containing the payload */ - /* - * If the record is a write, pointer to the arc_buf_t containing the - * payload. - */ - arc_buf_t *arc_buf; - int payload_size; - uint64_t bytes_read; /* bytes read from stream when record created */ - boolean_t eos_marker; /* Marks the end of the stream */ - bqueue_node_t node; -}; - -struct receive_writer_arg { - objset_t *os; - boolean_t byteswap; - bqueue_t q; - - /* - * These three args are used to signal to the main thread that we're - * done. - */ - kmutex_t mutex; - kcondvar_t cv; - boolean_t done; - - int err; - /* A map from guid to dataset to help handle dedup'd streams. */ - avl_tree_t *guid_to_ds_map; - boolean_t resumable; - boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ - boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ - uint64_t last_object; - uint64_t last_offset; - uint64_t max_object; /* highest object ID referenced in stream */ - uint64_t bytes_read; /* bytes read when current record created */ - - /* Encryption parameters for the last received DRR_OBJECT_RANGE */ - boolean_t or_crypt_params_present; - uint64_t or_firstobj; - uint64_t or_numslots; - uint8_t or_salt[ZIO_DATA_SALT_LEN]; - uint8_t or_iv[ZIO_DATA_IV_LEN]; - uint8_t or_mac[ZIO_DATA_MAC_LEN]; - boolean_t or_byteorder; -}; - -struct objlist { - list_t list; /* List of struct receive_objnode. */ - /* - * Last object looked up. Used to assert that objects are being looked - * up in ascending order. - */ - uint64_t last_lookup; -}; - -struct receive_objnode { - list_node_t node; - uint64_t object; -}; - -struct receive_arg { - objset_t *os; - vnode_t *vp; /* The vnode to read the stream from */ - uint64_t voff; /* The current offset in the stream */ - uint64_t bytes_read; - /* - * A record that has had its payload read in, but hasn't yet been handed - * off to the worker thread. 
- */ - struct receive_record_arg *rrd; - /* A record that has had its header read in, but not its payload. */ - struct receive_record_arg *next_rrd; - zio_cksum_t cksum; - zio_cksum_t prev_cksum; - int err; - boolean_t byteswap; - boolean_t raw; - uint64_t featureflags; - /* Sorted list of objects not to issue prefetches for. */ - struct objlist ignore_objlist; -}; - -typedef struct guid_map_entry { - uint64_t guid; - boolean_t raw; - dsl_dataset_t *gme_ds; - avl_node_t avlnode; -} guid_map_entry_t; - -static int -guid_compare(const void *arg1, const void *arg2) -{ - const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; - const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - - return (AVL_CMP(gmep1->guid, gmep2->guid)); -} - -static void -free_guid_map_onexit(void *arg) -{ - avl_tree_t *ca = arg; - void *cookie = NULL; - guid_map_entry_t *gmep; - - while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { - ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT; - - if (gmep->raw) { - gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE; - dsflags &= ~DS_HOLD_FLAG_DECRYPT; + if (err == 0) { + err = dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL); + dsl_crypto_params_free(drba.drba_dcp, !!err); } - - dsl_dataset_disown(gmep->gme_ds, dsflags, gmep); - kmem_free(gmep, sizeof (guid_map_entry_t)); } - avl_destroy(ca); - kmem_free(ca, sizeof (avl_tree_t)); + + if (err != 0) { + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + nvlist_free(drc->drc_begin_nvl); + } + return (err); } static int -receive_read(struct receive_arg *ra, int len, void *buf) +receive_read(dmu_recv_cookie_t *drc, int len, void *buf) { int done = 0; @@ -948,132 +1246,33 @@ receive_read(struct receive_arg *ra, int len, void *buf) * comment in dump_bytes. 
*/ ASSERT(len % 8 == 0 || - (ra->featureflags & DMU_BACKUP_FEATURE_RAW) != 0); + (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); while (done < len) { ssize_t resid; - - ra->err = vn_rdwr(UIO_READ, ra->vp, - (char *)buf + done, len - done, - ra->voff, UIO_SYSSPACE, FAPPEND, - RLIM64_INFINITY, CRED(), &resid); - + zfs_file_t *fp = drc->drc_fp; + int err = zfs_file_read(fp, (char *)buf + done, + len - done, &resid); if (resid == len - done) { /* - * Note: ECKSUM indicates that the receive - * was interrupted and can potentially be resumed. + * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates + * that the receive was interrupted and can + * potentially be resumed. */ - ra->err = SET_ERROR(ECKSUM); + err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED); } - ra->voff += len - done - resid; + drc->drc_voff += len - done - resid; done = len - resid; - if (ra->err != 0) - return (ra->err); + if (err != 0) + return (err); } - ra->bytes_read += len; + drc->drc_bytes_read += len; ASSERT3U(done, ==, len); return (0); } -noinline static void -byteswap_record(dmu_replay_record_t *drr) -{ -#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) -#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) - drr->drr_type = BSWAP_32(drr->drr_type); - drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); - - switch (drr->drr_type) { - case DRR_BEGIN: - DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_versioninfo); - DO64(drr_begin.drr_creation_time); - DO32(drr_begin.drr_type); - DO32(drr_begin.drr_flags); - DO64(drr_begin.drr_toguid); - DO64(drr_begin.drr_fromguid); - break; - case DRR_OBJECT: - DO64(drr_object.drr_object); - DO32(drr_object.drr_type); - DO32(drr_object.drr_bonustype); - DO32(drr_object.drr_blksz); - DO32(drr_object.drr_bonuslen); - DO32(drr_object.drr_raw_bonuslen); - DO64(drr_object.drr_toguid); - DO64(drr_object.drr_maxblkid); - break; - case DRR_FREEOBJECTS: - DO64(drr_freeobjects.drr_firstobj); - DO64(drr_freeobjects.drr_numobjs); - DO64(drr_freeobjects.drr_toguid); - 
break; - case DRR_WRITE: - DO64(drr_write.drr_object); - DO32(drr_write.drr_type); - DO64(drr_write.drr_offset); - DO64(drr_write.drr_logical_size); - DO64(drr_write.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); - DO64(drr_write.drr_key.ddk_prop); - DO64(drr_write.drr_compressed_size); - break; - case DRR_WRITE_BYREF: - DO64(drr_write_byref.drr_object); - DO64(drr_write_byref.drr_offset); - DO64(drr_write_byref.drr_length); - DO64(drr_write_byref.drr_toguid); - DO64(drr_write_byref.drr_refguid); - DO64(drr_write_byref.drr_refobject); - DO64(drr_write_byref.drr_refoffset); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. - drr_key.ddk_cksum); - DO64(drr_write_byref.drr_key.ddk_prop); - break; - case DRR_WRITE_EMBEDDED: - DO64(drr_write_embedded.drr_object); - DO64(drr_write_embedded.drr_offset); - DO64(drr_write_embedded.drr_length); - DO64(drr_write_embedded.drr_toguid); - DO32(drr_write_embedded.drr_lsize); - DO32(drr_write_embedded.drr_psize); - break; - case DRR_FREE: - DO64(drr_free.drr_object); - DO64(drr_free.drr_offset); - DO64(drr_free.drr_length); - DO64(drr_free.drr_toguid); - break; - case DRR_SPILL: - DO64(drr_spill.drr_object); - DO64(drr_spill.drr_length); - DO64(drr_spill.drr_toguid); - DO64(drr_spill.drr_compressed_size); - DO32(drr_spill.drr_type); - break; - case DRR_OBJECT_RANGE: - DO64(drr_object_range.drr_firstobj); - DO64(drr_object_range.drr_numslots); - DO64(drr_object_range.drr_toguid); - break; - case DRR_END: - DO64(drr_end.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); - break; - default: - break; - } - - if (drr->drr_type != DRR_BEGIN) { - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); - } - -#undef DO64 -#undef DO32 -} - static inline uint8_t deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { @@ -1123,14 +1322,251 @@ save_resume_state(struct receive_writer_arg *rwa, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } +static int 
+receive_object_is_same_generation(objset_t *os, uint64_t object, + dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, + const void *new_bonus, boolean_t *samegenp) +{ + zfs_file_info_t zoi; + int err; + + dmu_buf_t *old_bonus_dbuf; + err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); + if (err != 0) + return (err); + err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, + &zoi); + dmu_buf_rele(old_bonus_dbuf, FTAG); + if (err != 0) + return (err); + uint64_t old_gen = zoi.zfi_generation; + + err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); + if (err != 0) + return (err); + uint64_t new_gen = zoi.zfi_generation; + + *samegenp = (old_gen == new_gen); + return (0); +} + +static int +receive_handle_existing_object(const struct receive_writer_arg *rwa, + const struct drr_object *drro, const dmu_object_info_t *doi, + const void *bonus_data, + uint64_t *object_to_hold, uint32_t *new_blksz) +{ + uint32_t indblksz = drro->drr_indblkshift ? + 1ULL << drro->drr_indblkshift : 0; + int nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + uint8_t dn_slots = drro->drr_dn_slots != 0 ? + drro->drr_dn_slots : DNODE_MIN_SLOTS; + boolean_t do_free_range = B_FALSE; + int err; + + *object_to_hold = drro->drr_object; + + /* nblkptr should be bounded by the bonus size and type */ + if (rwa->raw && nblkptr != drro->drr_nblkptr) + return (SET_ERROR(EINVAL)); + + /* + * After the previous send stream, the sending system may + * have freed this object, and then happened to re-allocate + * this object number in a later txg. In this case, we are + * receiving a different logical file, and the block size may + * appear to be different. i.e. we may have a different + * block size for this object than what the send stream says. + * In this case we need to remove the object's contents, + * so that its structure can be changed and then its contents + * entirely replaced by subsequent WRITE records. 
+ * + * If this is a -L (--large-block) incremental stream, and + * the previous stream was not -L, the block size may appear + * to increase. i.e. we may have a smaller block size for + * this object than what the send stream says. In this case + * we need to keep the object's contents and block size + * intact, so that we don't lose parts of the object's + * contents that are not changed by this incremental send + * stream. + * + * We can distinguish between the two above cases by using + * the ZPL's generation number (see + * receive_object_is_same_generation()). However, we only + * want to rely on the generation number when absolutely + * necessary, because with raw receives, the generation is + * encrypted. We also want to minimize dependence on the + * ZPL, so that other types of datasets can also be received + * (e.g. ZVOLs, although note that ZVOLS currently do not + * reallocate their objects or change their structure). + * Therefore, we check a number of different cases where we + * know it is safe to discard the object's contents, before + * using the ZPL's generation number to make the above + * distinction. + */ + if (drro->drr_blksz != doi->doi_data_block_size) { + if (rwa->raw) { + /* + * RAW streams always have large blocks, so + * we are sure that the data is not needed + * due to changing --large-block to be on. + * Which is fortunate since the bonus buffer + * (which contains the ZPL generation) is + * encrypted, and the key might not be + * loaded. + */ + do_free_range = B_TRUE; + } else if (rwa->full) { + /* + * This is a full send stream, so it always + * replaces what we have. Even if the + * generation numbers happen to match, this + * can not actually be the same logical file. + * This is relevant when receiving a full + * send as a clone. 
+ */ + do_free_range = B_TRUE; + } else if (drro->drr_type != + DMU_OT_PLAIN_FILE_CONTENTS || + doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { + /* + * PLAIN_FILE_CONTENTS are the only type of + * objects that have ever been stored with + * large blocks, so we don't need the special + * logic below. ZAP blocks can shrink (when + * there's only one block), so we don't want + * to hit the error below about block size + * only increasing. + */ + do_free_range = B_TRUE; + } else if (doi->doi_max_offset <= + doi->doi_data_block_size) { + /* + * There is only one block. We can free it, + * because its contents will be replaced by a + * WRITE record. This can not be the no-L -> + * -L case, because the no-L case would have + * resulted in multiple blocks. If we + * supported -L -> no-L, it would not be safe + * to free the file's contents. Fortunately, + * that is not allowed (see + * recv_check_large_blocks()). + */ + do_free_range = B_TRUE; + } else { + boolean_t is_same_gen; + err = receive_object_is_same_generation(rwa->os, + drro->drr_object, doi->doi_bonus_type, + drro->drr_bonustype, bonus_data, &is_same_gen); + if (err != 0) + return (SET_ERROR(EINVAL)); + + if (is_same_gen) { + /* + * This is the same logical file, and + * the block size must be increasing. + * It could only decrease if + * --large-block was changed to be + * off, which is checked in + * recv_check_large_blocks(). + */ + if (drro->drr_blksz <= + doi->doi_data_block_size) + return (SET_ERROR(EINVAL)); + /* + * We keep the existing blocksize and + * contents. 
+ */ + *new_blksz = + doi->doi_data_block_size; + } else { + do_free_range = B_TRUE; + } + } + } + + /* nblkptr can only decrease if the object was reallocated */ + if (nblkptr < doi->doi_nblkptr) + do_free_range = B_TRUE; + + /* number of slots can only change on reallocation */ + if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) + do_free_range = B_TRUE; + + /* + * For raw sends we also check a few other fields to + * ensure we are preserving the objset structure exactly + * as it was on the receive side: + * - A changed indirect block size + * - A smaller nlevels + */ + if (rwa->raw) { + if (indblksz != doi->doi_metadata_block_size) + do_free_range = B_TRUE; + if (drro->drr_nlevels < doi->doi_indirection) + do_free_range = B_TRUE; + } + + if (do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The dmu does not currently support decreasing nlevels + * or changing the number of dnode slots on an object. For + * non-raw sends, this does not matter and the new object + * can just use the previous one's nlevels. For raw sends, + * however, the structure of the received dnode (including + * nlevels and dnode slots) must match that of the send + * side. Therefore, instead of using dmu_object_reclaim(), + * we must free the object completely and call + * dmu_object_claim_dnsize() instead. + */ + if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { + err = dmu_free_long_object(rwa->os, drro->drr_object); + if (err != 0) + return (SET_ERROR(EINVAL)); + + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + *object_to_hold = DMU_NEW_OBJECT; + } + + /* + * For raw receives, free everything beyond the new incoming + * maxblkid. Normally this would be done with a DRR_FREE + * record that would come after this DRR_OBJECT record is + * processed. 
However, for raw receives we manually set the + * maxblkid from the drr_maxblkid and so we must first free + * everything above that blkid to ensure the DMU is always + * consistent with itself. We will never free the first block + * of the object here because a maxblkid of 0 could indicate + * an object with a single block or one with no blocks. This + * free may be skipped when dmu_free_long_range() was called + * above since it covers the entire object's contents. + */ + if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + (drro->drr_maxblkid + 1) * doi->doi_data_block_size, + DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + return (0); +} + noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; - uint64_t object; int err; + uint32_t new_blksz = drro->drr_blksz; uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; @@ -1145,7 +1581,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, drro->drr_bonuslen > DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || dn_slots > - (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { + (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { return (SET_ERROR(EINVAL)); } @@ -1180,6 +1616,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } err = dmu_object_info(rwa->os, drro->drr_object, &doi); + if (err != 0 && err != ENOENT && err != EEXIST) return (SET_ERROR(EINVAL)); @@ -1193,86 +1630,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * Raw receives will also check that the indirect structure of the * dnode hasn't changed. */ + uint64_t object_to_hold; if (err == 0) { - uint32_t indblksz = drro->drr_indblkshift ? 
- 1ULL << drro->drr_indblkshift : 0; - int nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); - boolean_t did_free = B_FALSE; - - object = drro->drr_object; - - /* nblkptr should be bounded by the bonus size and type */ - if (rwa->raw && nblkptr != drro->drr_nblkptr) - return (SET_ERROR(EINVAL)); - - /* - * Check for indicators that the object was freed and - * reallocated. For all sends, these indicators are: - * - A changed block size - * - A smaller nblkptr - * - A changed dnode size - * For raw sends we also check a few other fields to - * ensure we are preserving the objset structure exactly - * as it was on the receive side: - * - A changed indirect block size - * - A smaller nlevels - */ - if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || - (rwa->raw && - (indblksz != doi.doi_metadata_block_size || - drro->drr_nlevels < doi.doi_indirection))) { - err = dmu_free_long_range(rwa->os, - drro->drr_object, 0, DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - else - did_free = B_TRUE; - } - - /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. 
- */ - if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { - err = dmu_free_long_object(rwa->os, drro->drr_object); - if (err != 0) - return (SET_ERROR(EINVAL)); - - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = DMU_NEW_OBJECT; - } - - /* - * For raw receives, free everything beyond the new incoming - * maxblkid. Normally this would be done with a DRR_FREE - * record that would come after this DRR_OBJECT record is - * processed. However, for raw receives we manually set the - * maxblkid from the drr_maxblkid and so we must first free - * everything above that blkid to ensure the DMU is always - * consistent with itself. We will never free the first block - * of the object here because a maxblkid of 0 could indicate - * an object with a single block or one with no blocks. This - * free may be skipped when dmu_free_long_range() was called - * above since it covers the entire object's contents. - */ - if (rwa->raw && object != DMU_NEW_OBJECT && !did_free) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - (drro->drr_maxblkid + 1) * doi.doi_data_block_size, - DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - } + err = receive_handle_existing_object(rwa, drro, &doi, data, + &object_to_hold, &new_blksz); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a @@ -1287,10 +1648,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); /* object was freed and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } else { /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } /* @@ -1325,27 +1686,27 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } tx = dmu_tx_create(rwa->os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_write(tx, object, 0, 0); + 
dmu_tx_hold_bonus(tx, object_to_hold); + dmu_tx_hold_write(tx, object_to_hold, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - if (object == DMU_NEW_OBJECT) { + if (object_to_hold == DMU_NEW_OBJECT) { /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || - drro->drr_blksz != doi.doi_data_block_size || + new_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, rwa->spill ? DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); @@ -1411,6 +1772,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * For non-new objects block size and indirect block * shift cannot change and nlevels can only increase. */ + ASSERT3U(new_blksz, ==, drro->drr_blksz); VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, @@ -1470,7 +1832,8 @@ receive_freeobjects(struct receive_writer_arg *rwa, return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj == 0 ? 
1 : drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs && + obj < DN_MAX_OBJECT && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { dmu_object_info_t doi; int err; @@ -1485,22 +1848,196 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); - - if (obj > rwa->max_object) - rwa->max_object = obj; } if (next_err != ESRCH) return (next_err); return (0); } -noinline static int -receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, - arc_buf_t *abuf) +/* + * Note: if this fails, the caller will clean up any records left on the + * rwa->write_batch list. + */ +static int +flush_write_batch_impl(struct receive_writer_arg *rwa) { - int err; - dmu_tx_t *tx; dnode_t *dn; + int err; + + if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0) + return (SET_ERROR(EINVAL)); + + struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch); + struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write; + + struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); + struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; + + ASSERT3U(rwa->last_object, ==, last_drrw->drr_object); + ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset); + + dmu_tx_t *tx = dmu_tx_create(rwa->os); + dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset, + last_drrw->drr_offset - first_drrw->drr_offset + + last_drrw->drr_logical_size); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + dnode_rele(dn, FTAG); + return (err); + } + + struct receive_record_arg *rrd; + while ((rrd = list_head(&rwa->write_batch)) != NULL) { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + abd_t *abd = rrd->abd; + + ASSERT3U(drrw->drr_object, ==, rwa->last_object); + + if (drrw->drr_logical_size != dn->dn_datablksz) { + /* + * The WRITE record is larger than the object's block + * size. 
We must be receiving an incremental + * large-block stream into a dataset that previously did + * a non-large-block receive. Lightweight writes must + * be exactly one block, so we need to decompress the + * data (if compressed) and do a normal dmu_write(). + */ + ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); + if (DRR_WRITE_COMPRESSED(drrw)) { + abd_t *decomp_abd = + abd_alloc_linear(drrw->drr_logical_size, + B_FALSE); + + err = zio_decompress_data( + drrw->drr_compressiontype, + abd, abd_to_buf(decomp_abd), + abd_get_size(abd), + abd_get_size(decomp_abd), NULL); + + if (err == 0) { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(decomp_abd), tx); + } + abd_free(decomp_abd); + } else { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(abd), tx); + } + if (err == 0) + abd_free(abd); + } else { + zio_prop_t zp; + dmu_write_policy(rwa->os, dn, 0, 0, &zp); + + enum zio_flag zio_flags = 0; + + if (rwa->raw) { + zp.zp_encrypt = B_TRUE; + zp.zp_compress = drrw->drr_compressiontype; + zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ + rwa->byteswap; + bcopy(drrw->drr_salt, zp.zp_salt, + ZIO_DATA_SALT_LEN); + bcopy(drrw->drr_iv, zp.zp_iv, + ZIO_DATA_IV_LEN); + bcopy(drrw->drr_mac, zp.zp_mac, + ZIO_DATA_MAC_LEN); + if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { + zp.zp_nopwrite = B_FALSE; + zp.zp_copies = MIN(zp.zp_copies, + SPA_DVAS_PER_BP - 1); + } + zio_flags |= ZIO_FLAG_RAW; + } else if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + zp.zp_compress = drrw->drr_compressiontype; + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } else if (rwa->byteswap) { + /* + * Note: compressed blocks never need to be + * byteswapped, because WRITE records for + * metadata blocks are never compressed. 
The + * exception is raw streams, which are written + * in the original byteorder, and the byteorder + * bit is preserved in the BP by setting + * zp_byteorder above. + */ + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func( + abd_to_buf(abd), + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + /* + * Since this data can't be read until the receive + * completes, we can do a "lightweight" write for + * improved performance. + */ + err = dmu_lightweight_write_by_dnode(dn, + drrw->drr_offset, abd, &zp, zio_flags, tx); + } + + if (err != 0) { + /* + * This rrd is left on the list, so the caller will + * free it (and the abd). + */ + break; + } + + /* + * Note: If the receive fails, we want the resume stream to + * start with the same record that we last successfully + * received (as opposed to the next record), so that we can + * verify that we are resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); + + list_remove(&rwa->write_batch, rrd); + kmem_free(rrd, sizeof (*rrd)); + } + + dmu_tx_commit(tx); + dnode_rele(dn, FTAG); + return (err); +} + +noinline static int +flush_write_batch(struct receive_writer_arg *rwa) +{ + if (list_is_empty(&rwa->write_batch)) + return (0); + int err = rwa->err; + if (err == 0) + err = flush_write_batch_impl(rwa); + if (err != 0) { + struct receive_record_arg *rrd; + while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) { + abd_free(rrd->abd); + kmem_free(rrd, sizeof (*rrd)); + } + } + ASSERT(list_is_empty(&rwa->write_batch)); + return (err); +} + +noinline static int +receive_process_write_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) +{ + int err = 0; + + ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE); + struct drr_write *drrw = &rrd->header.drr_u.drr_write; if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) @@ -1515,127 +2052,31 @@ receive_write(struct 
receive_writer_arg *rwa, struct drr_write *drrw, drrw->drr_offset < rwa->last_offset)) { return (SET_ERROR(EINVAL)); } + + struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); + struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; + uint64_t batch_size = + MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2); + if (first_rrd != NULL && + (drrw->drr_object != first_drrw->drr_object || + drrw->drr_offset >= first_drrw->drr_offset + batch_size)) { + err = flush_write_batch(rwa); + if (err != 0) + return (err); + } + rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; if (rwa->last_object > rwa->max_object) rwa->max_object = rwa->last_object; - if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_logical_size); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - - VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn)); - err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); - if (err != 0) { - dnode_rele(dn, FTAG); - dmu_tx_commit(tx); - return (err); - } - dnode_rele(dn, FTAG); - + list_insert_tail(&rwa->write_batch, rrd); /* - * Note: If the receive fails, we want the resume stream to start - * with the same record that we last successfully received (as opposed - * to the next record), so that we can verify that we are - * resuming from the correct location. 
+ * Return EAGAIN to indicate that we will use this rrd again, + * so the caller should not free it */ - save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); - dmu_tx_commit(tx); - - return (0); -} - -/* - * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed - * streams to refer to a copy of the data that is already on the - * system because it came in earlier in the stream. This function - * finds the earlier copy of the data, and uses that copy instead of - * data from the stream to fulfill this write. - */ -static int -receive_write_byref(struct receive_writer_arg *rwa, - struct drr_write_byref *drrwbr) -{ - dmu_tx_t *tx; - int err; - guid_map_entry_t gmesrch; - guid_map_entry_t *gmep; - avl_index_t where; - objset_t *ref_os = NULL; - int flags = DMU_READ_PREFETCH; - dmu_buf_t *dbp; - - if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) - return (SET_ERROR(EINVAL)); - - /* - * If the GUID of the referenced dataset is different from the - * GUID of the target dataset, find the referenced dataset. 
- */ - if (drrwbr->drr_toguid != drrwbr->drr_refguid) { - gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, - &where)) == NULL) { - return (SET_ERROR(EINVAL)); - } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) - return (SET_ERROR(EINVAL)); - } else { - ref_os = rwa->os; - } - - if (drrwbr->drr_object > rwa->max_object) - rwa->max_object = drrwbr->drr_object; - - if (rwa->raw) - flags |= DMU_READ_NO_DECRYPT; - - /* may return either a regular db or an encrypted one */ - err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, - drrwbr->drr_refoffset, FTAG, &dbp, flags); - if (err != 0) - return (err); - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_write(tx, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (rwa->raw) { - dmu_copy_from_buf(rwa->os, drrwbr->drr_object, - drrwbr->drr_offset, dbp, tx); - } else { - dmu_write(rwa->os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); - } - dmu_buf_rele(dbp, FTAG); - - /* See comment in restore_write. */ - save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); - dmu_tx_commit(tx); - return (0); + return (EAGAIN); } static int @@ -1684,12 +2125,10 @@ receive_write_embedded(struct receive_writer_arg *rwa, static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - arc_buf_t *abuf) + abd_t *abd) { - dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; - uint32_t flags = 0; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) @@ -1702,7 +2141,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * the DRR_FLAG_SPILL_BLOCK flag. 
*/ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (0); } @@ -1711,8 +2150,6 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || drrs->drr_compressed_size == 0) return (SET_ERROR(EINVAL)); - - flags |= DMU_READ_NO_DECRYPT; } if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) @@ -1728,7 +2165,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, return (err); } - tx = dmu_tx_create(rwa->os); + dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); @@ -1747,18 +2184,35 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, */ if (db_spill->db_size != drrs->drr_length) { dmu_buf_will_fill(db_spill, tx); - VERIFY(0 == dbuf_spill_set_blksz(db_spill, + VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrs->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_SPILL_PAYLOAD_SIZE(drrs)); + arc_buf_t *abuf; + if (rwa->raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ + rwa->byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os), + drrs->drr_object, byteorder, drrs->drr_salt, + drrs->drr_iv, drrs->drr_mac, drrs->drr_type, + drrs->drr_compressed_size, drrs->drr_length, + drrs->drr_compressiontype, 0); + } else { + abuf = arc_loan_buf(dmu_objset_spa(rwa->os), + DMU_OT_IS_METADATA(drrs->drr_type), + drrs->drr_length); + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd), + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } } + bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs)); + abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, 
tx); dmu_buf_rele(db, FTAG); @@ -1774,7 +2228,7 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; - if (drrf->drr_length != DMU_OBJECT_END && + if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); @@ -1839,13 +2293,30 @@ receive_object_range(struct receive_writer_arg *rwa, return (0); } +/* + * Until we have the ability to redact large ranges of data efficiently, we + * process these records as frees. + */ +/* ARGSUSED */ +noinline static int +receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr) +{ + struct drr_free drrf = {0}; + drrf.drr_length = drrr->drr_length; + drrf.drr_object = drrr->drr_object; + drrf.drr_offset = drrr->drr_offset; + drrf.drr_toguid = drrr->drr_toguid; + return (receive_free(rwa, &drrf)); +} + /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; - ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; + dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. 
For * resumable receives, this ensures that our resume state has @@ -1857,7 +2328,8 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) ds->ds_objset->os_raw_receive = B_FALSE; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - if (drc->drc_resumable && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { + if (drc->drc_resumable && drc->drc_should_save && + !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); } else { @@ -1870,61 +2342,60 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) } static void -receive_cksum(struct receive_arg *ra, int len, void *buf) +receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf) { - if (ra->byteswap) { - (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + if (drc->drc_byteswap) { + (void) fletcher_4_incremental_byteswap(buf, len, + &drc->drc_cksum); } else { - (void) fletcher_4_incremental_native(buf, len, &ra->cksum); + (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum); } } /* * Read the payload into a buffer of size len, and update the current record's * payload field. - * Allocate ra->next_rrd and read the next record's header into - * ra->next_rrd->header. + * Allocate drc->drc_next_rrd and read the next record's header into + * drc->drc_next_rrd->header. * Verify checksum of payload and next record. 
*/ static int -receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) +receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) { int err; - zio_cksum_t cksum_orig; - zio_cksum_t *cksump; if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - err = receive_read(ra, len, buf); + err = receive_read(drc, len, buf); if (err != 0) return (err); - receive_cksum(ra, len, buf); + receive_cksum(drc, len, buf); /* note: rrd is NULL when reading the begin record's payload */ - if (ra->rrd != NULL) { - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - ra->rrd->bytes_read = ra->bytes_read; + if (drc->drc_rrd != NULL) { + drc->drc_rrd->payload = buf; + drc->drc_rrd->payload_size = len; + drc->drc_rrd->bytes_read = drc->drc_bytes_read; } } else { ASSERT3P(buf, ==, NULL); } - ra->prev_cksum = ra->cksum; + drc->drc_prev_cksum = drc->drc_cksum; - ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); - err = receive_read(ra, sizeof (ra->next_rrd->header), - &ra->next_rrd->header); - ra->next_rrd->bytes_read = ra->bytes_read; + drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP); + err = receive_read(drc, sizeof (drc->drc_next_rrd->header), + &drc->drc_next_rrd->header); + drc->drc_next_rrd->bytes_read = drc->drc_bytes_read; if (err != 0) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + drc->drc_next_rrd = NULL; return (err); } - if (ra->next_rrd->header.drr_type == DRR_BEGIN) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; + if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) { + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + drc->drc_next_rrd = NULL; return (SET_ERROR(EINVAL)); } @@ -1934,90 +2405,30 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof 
(dmu_replay_record_t) - sizeof (zio_cksum_t)); - receive_cksum(ra, + receive_cksum(drc, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &ra->next_rrd->header); + &drc->drc_next_rrd->header); - cksum_orig = ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; - cksump = &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + zio_cksum_t cksum_orig = + drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; + zio_cksum_t *cksump = + &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; - if (ra->byteswap) - byteswap_record(&ra->next_rrd->header); + if (drc->drc_byteswap) + byteswap_record(&drc->drc_next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; + !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) { + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + drc->drc_next_rrd = NULL; return (SET_ERROR(ECKSUM)); } - receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); + receive_cksum(drc, sizeof (cksum_orig), &cksum_orig); return (0); } -static void -objlist_create(struct objlist *list) -{ - list_create(&list->list, sizeof (struct receive_objnode), - offsetof(struct receive_objnode, node)); - list->last_lookup = 0; -} - -static void -objlist_destroy(struct objlist *list) -{ - for (struct receive_objnode *n = list_remove_head(&list->list); - n != NULL; n = list_remove_head(&list->list)) { - kmem_free(n, sizeof (*n)); - } - list_destroy(&list->list); -} - -/* - * This function looks through the objlist to see if the specified object number - * is contained in the objlist. In the process, it will remove all object - * numbers in the list that are smaller than the specified object number. Thus, - * any lookup of an object number smaller than a previously looked up object - * number will always return false; therefore, all lookups should be done in - * ascending order. 
- */ -static boolean_t -objlist_exists(struct objlist *list, uint64_t object) -{ - struct receive_objnode *node = list_head(&list->list); - ASSERT3U(object, >=, list->last_lookup); - list->last_lookup = object; - while (node != NULL && node->object < object) { - VERIFY3P(node, ==, list_remove_head(&list->list)); - kmem_free(node, sizeof (*node)); - node = list_head(&list->list); - } - return (node != NULL && node->object == object); -} - -/* - * The objlist is a list of object numbers stored in ascending order. However, - * the insertion of new object numbers does not seek out the correct location to - * store a new object number; instead, it appends it to the list for simplicity. - * Thus, any users must take care to only insert new object numbers in ascending - * order. - */ -static void -objlist_insert(struct objlist *list, uint64_t object) -{ - struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); - node->object = object; -#ifdef ZFS_DEBUG - { - struct receive_objnode *last_object = list_tail(&list->list); - uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); - ASSERT3U(node->object, >, last_objnum); - } -#endif - list_insert_tail(&list->list, node); -} - /* * Issue the prefetch reads for any necessary indirect blocks. * @@ -2037,11 +2448,11 @@ objlist_insert(struct objlist *list, uint64_t object) */ /* ARGSUSED */ static void -receive_read_prefetch(struct receive_arg *ra, - uint64_t object, uint64_t offset, uint64_t length) +receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset, + uint64_t length) { - if (!objlist_exists(&ra->ignore_objlist, object)) { - dmu_prefetch(ra->os, object, 1, offset, length, + if (!objlist_exists(drc->drc_ignore_objlist, object)) { + dmu_prefetch(drc->drc_os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } } @@ -2050,14 +2461,15 @@ receive_read_prefetch(struct receive_arg *ra, * Read records off the stream, issuing any necessary prefetches. 
*/ static int -receive_read_record(struct receive_arg *ra) +receive_read_record(dmu_recv_cookie_t *drc) { int err; - switch (ra->rrd->header.drr_type) { + switch (drc->drc_rrd->header.drr_type) { case DRR_OBJECT: { - struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; + struct drr_object *drro = + &drc->drc_rrd->header.drr_u.drr_object; uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); void *buf = NULL; dmu_object_info_t doi; @@ -2065,153 +2477,106 @@ receive_read_record(struct receive_arg *ra) if (size != 0) buf = kmem_zalloc(size, KM_SLEEP); - err = receive_read_payload_and_next_header(ra, size, buf); + err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } - err = dmu_object_info(ra->os, drro->drr_object, &doi); + err = dmu_object_info(drc->drc_os, drro->drr_object, &doi); /* * See receive_read_prefetch for an explanation why we're * storing this object in the ignore_obj_list. */ if (err == ENOENT || err == EEXIST || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { - objlist_insert(&ra->ignore_objlist, drro->drr_object); + objlist_insert(drc->drc_ignore_objlist, + drro->drr_object); err = 0; } return (err); } case DRR_FREEOBJECTS: { - err = receive_read_payload_and_next_header(ra, 0, NULL); + err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_WRITE: { - struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; - arc_buf_t *abuf; - boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - - if (ra->raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ - ra->byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os), - drrw->drr_object, byteorder, drrw->drr_salt, - drrw->drr_iv, drrw->drr_mac, drrw->drr_type, - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else if (DRR_WRITE_COMPRESSED(drrw)) { - ASSERT3U(drrw->drr_compressed_size, >, 0); - ASSERT3U(drrw->drr_logical_size, 
>=, - drrw->drr_compressed_size); - ASSERT(!is_meta); - abuf = arc_loan_compressed_buf( - dmu_objset_spa(ra->os), - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else { - abuf = arc_loan_buf(dmu_objset_spa(ra->os), - is_meta, drrw->drr_logical_size); - } - - err = receive_read_payload_and_next_header(ra, - DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); + struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; + int size = DRR_WRITE_PAYLOAD_SIZE(drrw); + abd_t *abd = abd_alloc_linear(size, B_FALSE); + err = receive_read_payload_and_next_header(drc, size, + abd_to_buf(abd)); if (err != 0) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (err); } - ra->rrd->arc_buf = abuf; - receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, + drc->drc_rrd->abd = abd; + receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwb = - &ra->rrd->header.drr_u.drr_write_byref; - err = receive_read_payload_and_next_header(ra, 0, NULL); - receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, - drrwb->drr_length); - return (err); - } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = - &ra->rrd->header.drr_u.drr_write_embedded; + &drc->drc_rrd->header.drr_u.drr_write_embedded; uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); void *buf = kmem_zalloc(size, KM_SLEEP); - err = receive_read_payload_and_next_header(ra, size, buf); + err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } - receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, + receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); return (err); } case DRR_FREE: + case DRR_REDACT: { /* * It might be beneficial to prefetch indirect blocks here, but * we don't really have the data to decide for sure. 
*/ - err = receive_read_payload_and_next_header(ra, 0, NULL); + err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_END: { - struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; - if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) + struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end; + if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum, + drre->drr_checksum)) return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: { - struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; - arc_buf_t *abuf; - int len = DRR_SPILL_PAYLOAD_SIZE(drrs); - - /* DRR_SPILL records are either raw or uncompressed */ - if (ra->raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ - ra->byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os), - dmu_objset_id(ra->os), byteorder, drrs->drr_salt, - drrs->drr_iv, drrs->drr_mac, drrs->drr_type, - drrs->drr_compressed_size, drrs->drr_length, - drrs->drr_compressiontype); - } else { - abuf = arc_loan_buf(dmu_objset_spa(ra->os), - DMU_OT_IS_METADATA(drrs->drr_type), - drrs->drr_length); - } - - err = receive_read_payload_and_next_header(ra, len, - abuf->b_data); - if (err != 0) { - dmu_return_arcbuf(abuf); - return (err); - } - ra->rrd->arc_buf = abuf; + struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; + int size = DRR_SPILL_PAYLOAD_SIZE(drrs); + abd_t *abd = abd_alloc_linear(size, B_FALSE); + err = receive_read_payload_and_next_header(drc, size, + abd_to_buf(abd)); + if (err != 0) + abd_free(abd); + else + drc->drc_rrd->abd = abd; return (err); } case DRR_OBJECT_RANGE: { - err = receive_read_payload_and_next_header(ra, 0, NULL); + err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); + } default: return (SET_ERROR(EINVAL)); } } + + static void dprintf_drr(struct receive_record_arg *rrd, int err) { @@ -2223,8 +2588,8 @@ dprintf_drr(struct receive_record_arg *rrd, int err) dprintf("drr_type = OBJECT obj = %llu type = %u " 
"bonustype = %u blksz = %u bonuslen = %u cksumtype = %u " "compress = %u dn_slots = %u err = %d\n", - drro->drr_object, drro->drr_type, drro->drr_bonustype, - drro->drr_blksz, drro->drr_bonuslen, + (u_longlong_t)drro->drr_object, drro->drr_type, + drro->drr_bonustype, drro->drr_blksz, drro->drr_bonuslen, drro->drr_checksumtype, drro->drr_compress, drro->drr_dn_slots, err); break; @@ -2235,7 +2600,8 @@ dprintf_drr(struct receive_record_arg *rrd, int err) &rrd->header.drr_u.drr_freeobjects; dprintf("drr_type = FREEOBJECTS firstobj = %llu " "numobjs = %llu err = %d\n", - drrfo->drr_firstobj, drrfo->drr_numobjs, err); + (u_longlong_t)drrfo->drr_firstobj, + (u_longlong_t)drrfo->drr_numobjs, err); break; } case DRR_WRITE: @@ -2244,10 +2610,12 @@ dprintf_drr(struct receive_record_arg *rrd, int err) dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu " "lsize = %llu cksumtype = %u flags = %u " "compress = %u psize = %llu err = %d\n", - drrw->drr_object, drrw->drr_type, drrw->drr_offset, - drrw->drr_logical_size, drrw->drr_checksumtype, - drrw->drr_flags, drrw->drr_compressiontype, - drrw->drr_compressed_size, err); + (u_longlong_t)drrw->drr_object, drrw->drr_type, + (u_longlong_t)drrw->drr_offset, + (u_longlong_t)drrw->drr_logical_size, + drrw->drr_checksumtype, drrw->drr_flags, + drrw->drr_compressiontype, + (u_longlong_t)drrw->drr_compressed_size, err); break; } case DRR_WRITE_BYREF: @@ -2258,11 +2626,14 @@ dprintf_drr(struct receive_record_arg *rrd, int err) "length = %llu toguid = %llx refguid = %llx " "refobject = %llu refoffset = %llu cksumtype = %u " "flags = %u err = %d\n", - drrwbr->drr_object, drrwbr->drr_offset, - drrwbr->drr_length, drrwbr->drr_toguid, - drrwbr->drr_refguid, drrwbr->drr_refobject, - drrwbr->drr_refoffset, drrwbr->drr_checksumtype, - drrwbr->drr_flags, err); + (u_longlong_t)drrwbr->drr_object, + (u_longlong_t)drrwbr->drr_offset, + (u_longlong_t)drrwbr->drr_length, + (u_longlong_t)drrwbr->drr_toguid, + (u_longlong_t)drrwbr->drr_refguid, 
+ (u_longlong_t)drrwbr->drr_refobject, + (u_longlong_t)drrwbr->drr_refoffset, + drrwbr->drr_checksumtype, drrwbr->drr_flags, err); break; } case DRR_WRITE_EMBEDDED: @@ -2272,7 +2643,9 @@ dprintf_drr(struct receive_record_arg *rrd, int err) dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu " "length = %llu compress = %u etype = %u lsize = %u " "psize = %u err = %d\n", - drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length, + (u_longlong_t)drrwe->drr_object, + (u_longlong_t)drrwe->drr_offset, + (u_longlong_t)drrwe->drr_length, drrwe->drr_compression, drrwe->drr_etype, drrwe->drr_lsize, drrwe->drr_psize, err); break; @@ -2282,7 +2655,9 @@ dprintf_drr(struct receive_record_arg *rrd, int err) struct drr_free *drrf = &rrd->header.drr_u.drr_free; dprintf("drr_type = FREE obj = %llu offset = %llu " "length = %lld err = %d\n", - drrf->drr_object, drrf->drr_offset, drrf->drr_length, + (u_longlong_t)drrf->drr_object, + (u_longlong_t)drrf->drr_offset, + (longlong_t)drrf->drr_length, err); break; } @@ -2290,7 +2665,8 @@ dprintf_drr(struct receive_record_arg *rrd, int err) { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; dprintf("drr_type = SPILL obj = %llu length = %llu " - "err = %d\n", drrs->drr_object, drrs->drr_length, err); + "err = %d\n", (u_longlong_t)drrs->drr_object, + (u_longlong_t)drrs->drr_length, err); break; } case DRR_OBJECT_RANGE: @@ -2299,7 +2675,8 @@ dprintf_drr(struct receive_record_arg *rrd, int err) &rrd->header.drr_u.drr_object_range; dprintf("drr_type = OBJECT_RANGE firstobj = %llu " "numslots = %llu flags = %u err = %d\n", - drror->drr_firstobj, drror->drr_numslots, + (u_longlong_t)drror->drr_firstobj, + (u_longlong_t)drror->drr_numslots, drror->drr_flags, err); break; } @@ -2322,6 +2699,22 @@ receive_process_record(struct receive_writer_arg *rwa, ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; + if (rrd->header.drr_type != DRR_WRITE) { + err = flush_write_batch(rwa); + if (err != 0) { + if 
(rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; + rrd->payload = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + + return (err); + } + } + switch (rrd->header.drr_type) { case DRR_OBJECT: { @@ -2340,20 +2733,17 @@ receive_process_record(struct receive_writer_arg *rwa, } case DRR_WRITE: { - struct drr_write *drrw = &rrd->header.drr_u.drr_write; - err = receive_write(rwa, drrw, rrd->arc_buf); - /* if receive_write() is successful, it consumes the arc_buf */ - if (err != 0) - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; - rrd->payload = NULL; - break; - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwbr = - &rrd->header.drr_u.drr_write_byref; - err = receive_write_byref(rwa, drrwbr); + err = receive_process_write_record(rwa, rrd); + if (err != EAGAIN) { + /* + * On success, receive_process_write_record() returns + * EAGAIN to indicate that we do not want to free + * the rrd or arc_buf. + */ + ASSERT(err != 0); + abd_free(rrd->abd); + rrd->abd = NULL; + } break; } case DRR_WRITE_EMBEDDED: @@ -2374,11 +2764,10 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - err = receive_spill(rwa, drrs, rrd->arc_buf); - /* if receive_spill() is successful, it consumes the arc_buf */ + err = receive_spill(rwa, drrs, rrd->abd); if (err != 0) - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; break; } @@ -2389,6 +2778,12 @@ receive_process_record(struct receive_writer_arg *rwa, err = receive_object_range(rwa, drror); break; } + case DRR_REDACT: + { + struct drr_redact *drrr = &rrd->header.drr_u.drr_redact; + err = receive_redact(rwa, drrr); + break; + } default: err = (SET_ERROR(EINVAL)); } @@ -2417,19 +2812,34 @@ receive_writer_thread(void *arg) * on the queue, but we need to clear everything in it before we * can exit. 
*/ + int err = 0; if (rwa->err == 0) { - rwa->err = receive_process_record(rwa, rrd); - } else if (rrd->arc_buf != NULL) { - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + err = receive_process_record(rwa, rrd); + } else if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } - kmem_free(rrd, sizeof (*rrd)); + /* + * EAGAIN indicates that this record has been saved (on + * raw->write_batch), and will be used again, so we don't + * free it. + */ + if (err != EAGAIN) { + if (rwa->err == 0) + rwa->err = err; + kmem_free(rrd, sizeof (*rrd)); + } } kmem_free(rrd, sizeof (*rrd)); + + int err = flush_write_batch(rwa); + if (rwa->err == 0) + rwa->err = err; + mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); @@ -2439,11 +2849,11 @@ receive_writer_thread(void *arg) } static int -resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) +resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl) { uint64_t val; - objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; - uint64_t dsobj = dmu_objset_id(ra->os); + objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset; + uint64_t dsobj = dmu_objset_id(drc->drc_os); uint64_t resume_obj, resume_off; if (nvlist_lookup_uint64(begin_nvl, @@ -2477,113 +2887,39 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) * NB: callers *must* call dmu_recv_end() if this succeeds. 
*/ int -dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - int cleanup_fd, uint64_t *action_handlep) +dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) { int err = 0; - struct receive_arg *ra; - struct receive_writer_arg *rwa; - int featureflags; - uint32_t payloadlen; - void *payload; - nvlist_t *begin_nvl = NULL; + struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); - ra = kmem_zalloc(sizeof (*ra), KM_SLEEP); - rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); - - ra->byteswap = drc->drc_byteswap; - ra->raw = drc->drc_raw; - ra->cksum = drc->drc_cksum; - ra->vp = vp; - ra->voff = *voffp; - - if (dsl_dataset_is_zapified(drc->drc_ds)) { + if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) { + uint64_t bytes = 0; (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, - sizeof (ra->bytes_read), 1, &ra->bytes_read); + sizeof (bytes), 1, &bytes); + drc->drc_bytes_read += bytes; } - objlist_create(&ra->ignore_objlist); + drc->drc_ignore_objlist = objlist_create(); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); - /* - * Open the objset we are modifying. 
- */ - VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra->os)); - ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); - - featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); - ra->featureflags = featureflags; - - ASSERT0(ra->os->os_encrypted && - (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); - - /* if this stream is dedup'ed, set up the avl tree for guid mapping */ - if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - minor_t minor; - - if (cleanup_fd == -1) { - err = SET_ERROR(EBADF); - goto out; - } - err = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (err != 0) { - cleanup_fd = -1; - goto out; - } - - if (*action_handlep == 0) { - rwa->guid_to_ds_map = - kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(rwa->guid_to_ds_map, guid_compare, - sizeof (guid_map_entry_t), - offsetof(guid_map_entry_t, avlnode)); - err = zfs_onexit_add_cb(minor, - free_guid_map_onexit, rwa->guid_to_ds_map, - action_handlep); - if (err != 0) - goto out; - } else { - err = zfs_onexit_cb_data(minor, *action_handlep, - (void **)&rwa->guid_to_ds_map); - if (err != 0) - goto out; - } - - drc->drc_guid_to_ds_map = rwa->guid_to_ds_map; - } - - payloadlen = drc->drc_drr_begin->drr_payloadlen; - payload = NULL; - if (payloadlen != 0) - payload = kmem_alloc(payloadlen, KM_SLEEP); - - err = receive_read_payload_and_next_header(ra, payloadlen, payload); - if (err != 0) { - if (payloadlen != 0) - kmem_free(payload, payloadlen); - goto out; - } - if (payloadlen != 0) { - err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); - kmem_free(payload, payloadlen); - if (err != 0) - goto out; - } + ASSERT0(drc->drc_os->os_encrypted && + (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); /* handle DSL encryption key payload */ - if (featureflags & DMU_BACKUP_FEATURE_RAW) { + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { nvlist_t *keynvl = NULL; - ASSERT(ra->os->os_encrypted); + ASSERT(drc->drc_os->os_encrypted); ASSERT(drc->drc_raw); - err = 
nvlist_lookup_nvlist(begin_nvl, "crypt_keydata", &keynvl); + err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata", + &keynvl); if (err != 0) goto out; @@ -2593,7 +2929,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, * are sure the rest of the receive succeeded so we stash * the keynvl away until then. */ - err = dsl_crypto_recv_raw(spa_name(ra->os->os_spa), + err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), drc->drc_ds->ds_object, drc->drc_fromsnapobj, drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); if (err != 0) @@ -2608,23 +2944,33 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, drc->drc_keynvl = fnvlist_dup(keynvl); } - if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { - err = resume_check(ra, begin_nvl); + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = resume_check(drc, drc->drc_begin_nvl); if (err != 0) goto out; } - (void) bqueue_init(&rwa->q, + /* + * If we failed before this point we will clean up any new resume + * state that was created. Now that we've gotten past the initial + * checks we are ok to retain that resume state. 
+ */ + drc->drc_should_save = B_TRUE; + + (void) bqueue_init(&rwa->q, zfs_recv_queue_ff, MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), offsetof(struct receive_record_arg, node)); cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL); mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); - rwa->os = ra->os; + rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; + rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; + list_create(&rwa->write_batch, sizeof (struct receive_record_arg), + offsetof(struct receive_record_arg, node.bqn_node)); (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, TS_RUN, minclsyspri); @@ -2638,10 +2984,10 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, * We can leave this loop in 3 ways: First, if rwa->err is * non-zero. In that case, the writer thread will free the rrd we just * pushed. Second, if we're interrupted; in that case, either it's the - * first loop and ra->rrd was never allocated, or it's later and ra->rrd - * has been handed off to the writer thread who will free it. Finally, - * if receive_read_record fails or we're at the end of the stream, then - * we free ra->rrd and exit. + * first loop and drc->drc_rrd was never allocated, or it's later, and + * drc->drc_rrd has been handed off to the writer thread who will free + * it. Finally, if receive_read_record fails or we're at the end of the + * stream, then we free drc->drc_rrd and exit. 
*/ while (rwa->err == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { @@ -2649,30 +2995,36 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, break; } - ASSERT3P(ra->rrd, ==, NULL); - ra->rrd = ra->next_rrd; - ra->next_rrd = NULL; - /* Allocates and loads header into ra->next_rrd */ - err = receive_read_record(ra); + ASSERT3P(drc->drc_rrd, ==, NULL); + drc->drc_rrd = drc->drc_next_rrd; + drc->drc_next_rrd = NULL; + /* Allocates and loads header into drc->drc_next_rrd */ + err = receive_read_record(drc); - if (ra->rrd->header.drr_type == DRR_END || err != 0) { - kmem_free(ra->rrd, sizeof (*ra->rrd)); - ra->rrd = NULL; + if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) { + kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd)); + drc->drc_rrd = NULL; break; } - bqueue_enqueue(&rwa->q, ra->rrd, - sizeof (struct receive_record_arg) + ra->rrd->payload_size); - ra->rrd = NULL; + bqueue_enqueue(&rwa->q, drc->drc_rrd, + sizeof (struct receive_record_arg) + + drc->drc_rrd->payload_size); + drc->drc_rrd = NULL; } - ASSERT3P(ra->rrd, ==, NULL); - ra->rrd = kmem_zalloc(sizeof (*ra->rrd), KM_SLEEP); - ra->rrd->eos_marker = B_TRUE; - bqueue_enqueue(&rwa->q, ra->rrd, 1); + + ASSERT3P(drc->drc_rrd, ==, NULL); + drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP); + drc->drc_rrd->eos_marker = B_TRUE; + bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1); mutex_enter(&rwa->mutex); while (!rwa->done) { - cv_wait(&rwa->cv, &rwa->mutex); + /* + * We need to use cv_wait_sig() so that any process that may + * be sleeping here can still fork. + */ + (void) cv_wait_sig(&rwa->cv, &rwa->mutex); } mutex_exit(&rwa->mutex); @@ -2705,6 +3057,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, cv_destroy(&rwa->cv); mutex_destroy(&rwa->mutex); bqueue_destroy(&rwa->q); + list_destroy(&rwa->write_batch); if (err == 0) err = rwa->err; @@ -2714,12 +3067,17 @@ out: * we need to clean up the next_rrd we create by processing the * DRR_BEGIN record. 
*/ - if (ra->next_rrd != NULL) - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + if (drc->drc_next_rrd != NULL) + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); - nvlist_free(begin_nvl); - if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) - zfs_onexit_fd_rele(cleanup_fd); + /* + * The objset will be invalidated by dmu_recv_end() when we do + * dsl_dataset_clone_swap_sync_impl(). + */ + drc->drc_os = NULL; + + kmem_free(rwa, sizeof (*rwa)); + nvlist_free(drc->drc_begin_nvl); if (err != 0) { /* @@ -2731,10 +3089,9 @@ out: nvlist_free(drc->drc_keynvl); } - *voffp = ra->voff; - objlist_destroy(&ra->ignore_objlist); - kmem_free(ra, sizeof (*ra)); - kmem_free(rwa, sizeof (*rwa)); + objlist_destroy(drc->drc_ignore_objlist); + drc->drc_ignore_objlist = NULL; + *voffp = drc->drc_voff; return (err); } @@ -2802,7 +3159,8 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); + drc->drc_tosnap, tx, B_TRUE, 1, + drc->drc_cred, drc->drc_proc); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); @@ -2810,7 +3168,8 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); + drc->drc_tosnap, tx, B_TRUE, 1, + drc->drc_cred, drc->drc_proc); } return (error); } @@ -2821,6 +3180,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; + uint64_t newsnapobj; spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); @@ -2859,10 +3219,17 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) drc->drc_keynvl = NULL; } - VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); + VERIFY3P(drc->drc_ds->ds_prev, ==, + 
origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); + /* + * The objset was evicted by dsl_dataset_clone_swap_sync_impl, + * so drc_os is no longer valid. + */ + drc->drc_os = NULL; + dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); @@ -2879,7 +3246,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; - drc->drc_newsnapobj = + newsnapobj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; dsl_dataset_rele(origin_head, FTAG); @@ -2916,8 +3283,10 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) DS_FIELD_RESUME_TOGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TONAME, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx); } - drc->drc_newsnapobj = + newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; } @@ -2932,15 +3301,13 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) * value. */ if (drc->drc_raw && drc->drc_ivset_guid != 0) { - dmu_object_zapify(dp->dp_meta_objset, drc->drc_newsnapobj, + dmu_object_zapify(dp->dp_meta_objset, newsnapobj, DMU_OT_DSL_DATASET, tx); - VERIFY0(zap_update(dp->dp_meta_objset, drc->drc_newsnapobj, + VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj, DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, &drc->drc_ivset_guid, tx)); } - zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE); - /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode @@ -2957,54 +3324,6 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) drc->drc_ds = NULL; } -static int -add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj, - boolean_t raw) -{ - dsl_pool_t *dp; - dsl_dataset_t *snapds; - guid_map_entry_t *gmep; - objset_t *os; - ds_hold_flags_t dsflags = (raw) ? 
0 : DS_HOLD_FLAG_DECRYPT; - int err; - - ASSERT(guid_map != NULL); - - err = dsl_pool_hold(name, FTAG, &dp); - if (err != 0) - return (err); - gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); - err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds); - if (err == 0) { - /* - * If this is a deduplicated raw send stream, we need - * to make sure that we can still read raw blocks from - * earlier datasets in the stream, so we set the - * os_raw_receive flag now. - */ - if (raw) { - err = dmu_objset_from_ds(snapds, &os); - if (err != 0) { - dsl_dataset_disown(snapds, dsflags, FTAG); - dsl_pool_rele(dp, FTAG); - kmem_free(gmep, sizeof (*gmep)); - return (err); - } - os->os_raw_receive = B_TRUE; - } - - gmep->raw = raw; - gmep->guid = dsl_dataset_phys(snapds)->ds_guid; - gmep->gme_ds = snapds; - avl_add(guid_map, gmep); - } else { - kmem_free(gmep, sizeof (*gmep)); - } - - dsl_pool_rele(dp, FTAG); - return (err); -} - static int dmu_recv_end_modified_blocks = 3; static int @@ -3048,9 +3367,14 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) if (error != 0) { dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); - } else if (drc->drc_guid_to_ds_map != NULL) { - (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map, - drc->drc_newsnapobj, drc->drc_raw); + } else { + if (drc->drc_newfs) { + zvol_create_minor(drc->drc_tofs); + } + char *snapname = kmem_asprintf("%s@%s", + drc->drc_tofs, drc->drc_tosnap); + zvol_create_minor(snapname); + kmem_strfree(snapname); } return (error); } @@ -3065,7 +3389,13 @@ dmu_objset_is_receiving(objset_t *os) os->os_dsl_dataset->ds_owner == dmu_recv_tag); } -#if defined(_KERNEL) -module_param(zfs_recv_queue_length, int, 0644); -MODULE_PARM_DESC(zfs_recv_queue_length, "Maximum receive queue length"); -#endif +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW, + "Maximum receive queue length"); + +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW, + "Receive queue fill fraction"); 
+ +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW, + "Maximum amount of writes to batch into one transaction"); +/* END CSTYLED */ diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c new file mode 100644 index 0000000000..fdbdf7d6e8 --- /dev/null +++ b/module/zfs/dmu_redact.c @@ -0,0 +1,1201 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017, 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#endif + +/* + * This controls the number of entries in the buffer the redaction_list_update + * synctask uses to buffer writes to the redaction list. + */ +int redact_sync_bufsize = 1024; + +/* + * Controls how often to update the redaction list when creating a redaction + * list. + */ +uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */ + +/* + * This tunable controls the length of the queues that zfs redact worker threads + * use to communicate. If the dmu_redact_snap thread is blocking on these + * queues, this variable may need to be increased. 
If there is a significant + * slowdown at the start of a redact operation as these threads consume all the + * available IO resources, or the queues are consuming too much memory, this + * variable may need to be decreased. + */ +int zfs_redact_queue_length = 1024 * 1024; + +/* + * These tunables control the fill fraction of the queues by zfs redact. The + * fill fraction controls the frequency with which threads have to be + * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these + * should be tuned down. If the queues empty before the signalled thread can + * catch up, then these should be tuned up. + */ +uint64_t zfs_redact_queue_ff = 20; + +struct redact_record { + bqueue_node_t ln; + boolean_t eos_marker; /* Marks the end of the stream */ + uint64_t start_object; + uint64_t start_blkid; + uint64_t end_object; + uint64_t end_blkid; + uint8_t indblkshift; + uint32_t datablksz; +}; + +struct redact_thread_arg { + bqueue_t q; + objset_t *os; /* Objset to traverse */ + dsl_dataset_t *ds; /* Dataset to traverse */ + struct redact_record *current_record; + int error_code; + boolean_t cancel; + zbookmark_phys_t resume; + objlist_t *deleted_objs; + uint64_t *num_blocks_visited; + uint64_t ignore_object; /* ignore further callbacks on this */ + uint64_t txg; /* txg to traverse since */ +}; + +/* + * The redaction node is a wrapper around the redaction record that is used + * by the redaction merging thread to sort the records and determine overlaps. + * + * It contains two nodes; one sorts the records by their start_zb, and the other + * sorts the records by their end_zb. 
+ */ +struct redact_node { + avl_node_t avl_node_start; + avl_node_t avl_node_end; + struct redact_record *record; + struct redact_thread_arg *rt_arg; + uint32_t thread_num; +}; + +struct merge_data { + list_t md_redact_block_pending; + redact_block_phys_t md_coalesce_block; + uint64_t md_last_time; + redact_block_phys_t md_furthest[TXG_SIZE]; + /* Lists of struct redact_block_list_node. */ + list_t md_blocks[TXG_SIZE]; + boolean_t md_synctask_txg[TXG_SIZE]; + uint64_t md_latest_synctask_txg; + redaction_list_t *md_redaction_list; +}; + +/* + * A wrapper around struct redact_block so it can be stored in a list_t. + */ +struct redact_block_list_node { + redact_block_phys_t block; + list_node_t node; +}; + +/* + * We've found a new redaction candidate. In order to improve performance, we + * coalesce these blocks when they're adjacent to each other. This function + * handles that. If the new candidate block range is immediately after the + * range we're building, coalesce it into the range we're building. Otherwise, + * put the record we're building on the queue, and update the build pointer to + * point to the new record. 
+ */ +static void +record_merge_enqueue(bqueue_t *q, struct redact_record **build, + struct redact_record *new) +{ + if (new->eos_marker) { + if (*build != NULL) + bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue_flush(q, new, sizeof (*new)); + return; + } + if (*build == NULL) { + *build = new; + return; + } + struct redact_record *curbuild = *build; + if ((curbuild->end_object == new->start_object && + curbuild->end_blkid + 1 == new->start_blkid && + curbuild->end_blkid != UINT64_MAX) || + (curbuild->end_object + 1 == new->start_object && + curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) { + curbuild->end_object = new->end_object; + curbuild->end_blkid = new->end_blkid; + kmem_free(new, sizeof (*new)); + } else { + bqueue_enqueue(q, curbuild, sizeof (*curbuild)); + *build = new; + } +} +#ifdef _KERNEL +struct objnode { + avl_node_t node; + uint64_t obj; +}; + +static int +objnode_compare(const void *o1, const void *o2) +{ + const struct objnode *obj1 = o1; + const struct objnode *obj2 = o2; + if (obj1->obj < obj2->obj) + return (-1); + if (obj1->obj > obj2->obj) + return (1); + return (0); +} + + +static objlist_t * +zfs_get_deleteq(objset_t *os) +{ + objlist_t *deleteq_objlist = objlist_create(); + uint64_t deleteq_obj; + zap_cursor_t zc; + zap_attribute_t za; + dmu_object_info_t doi; + + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); + + VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + + /* + * In order to insert objects into the objlist, they must be in sorted + * order. We don't know what order we'll get them out of the ZAP in, so + * we insert them into and remove them from an avl_tree_t to sort them. 
+ */ + avl_tree_t at; + avl_create(&at, objnode_compare, sizeof (struct objnode), + offsetof(struct objnode, node)); + + for (zap_cursor_init(&zc, os, deleteq_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); + obj->obj = za.za_first_integer; + avl_add(&at, obj); + } + zap_cursor_fini(&zc); + + struct objnode *next, *found = avl_first(&at); + while (found != NULL) { + next = AVL_NEXT(&at, found); + objlist_insert(deleteq_objlist, found->obj); + found = next; + } + + void *cookie = NULL; + while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) + kmem_free(found, sizeof (*found)); + avl_destroy(&at); + return (deleteq_objlist); +} +#endif + +/* + * This is the callback function to traverse_dataset for the redaction threads + * for dmu_redact_snap. This thread is responsible for creating redaction + * records for all the data that is modified by the snapshots we're redacting + * with respect to. Redaction records represent ranges of data that have been + * modified by one of the redaction snapshots, and are stored in the + * redact_record struct. We need to create redaction records for three + * cases: + * + * First, if there's a normal write, we need to create a redaction record for + * that block. + * + * Second, if there's a hole, we need to create a redaction record that covers + * the whole range of the hole. If the hole is in the meta-dnode, it must cover + * every block in all of the objects in the hole. + * + * Third, if there is a deleted object, we need to create a redaction record for + * all of the blocks in that object. 
+ *
+ * Returns 0 to let the traversal continue, or EINTR once rta->cancel has been
+ * set. Every record produced is handed to record_merge_enqueue(), which
+ * coalesces adjacent ranges before placing them on rta->q.
+ */
+/*ARGSUSED*/
+static int
+redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+	struct redact_thread_arg *rta = arg;
+	struct redact_record *record;
+
+	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+	    zb->zb_object >= rta->resume.zb_object);
+
+	if (rta->cancel)
+		return (SET_ERROR(EINTR));
+
+	if (rta->ignore_object == zb->zb_object)
+		return (0);
+
+	/*
+	 * If we're visiting a dnode, we need to handle the case where the
+	 * object has been deleted.
+	 */
+	if (zb->zb_level == ZB_DNODE_LEVEL) {
+		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+
+		if (zb->zb_object == 0)	/* object 0 is never redacted */
+			return (0);
+
+		/*
+		 * If the object has been deleted, redact all of the blocks in
+		 * it.
+		 *
+		 * NOTE(review): indblkshift and datablksz are not assigned on
+		 * this path, so they keep the zero from kmem_zalloc().
+		 */
+		if (dnp->dn_type == DMU_OT_NONE ||
+		    objlist_exists(rta->deleted_objs, zb->zb_object)) {
+			rta->ignore_object = zb->zb_object;
+			record = kmem_zalloc(sizeof (struct redact_record),
+			    KM_SLEEP);
+
+			record->eos_marker = B_FALSE;
+			record->start_object = record->end_object =
+			    zb->zb_object;
+			record->start_blkid = 0;
+			record->end_blkid = UINT64_MAX;
+			record_merge_enqueue(&rta->q,
+			    &rta->current_record, record);
+		}
+		return (0);
+	} else if (zb->zb_level < 0) {
+		return (0);
+	} else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) {
+		/*
+		 * If this is an indirect block, but not a hole, it doesn't
+		 * provide any useful information for redaction, so ignore it.
+		 */
+		return (0);
+	}
+
+	/*
+	 * At this point, there are two options left for the type of block we're
+	 * looking at. Either this is a hole (which could be in the dnode or
+	 * the meta-dnode), or it's a level 0 block of some sort. If it's a
+	 * hole, we create a redaction record that covers the whole range. If
+	 * the hole is in a dnode, we need to redact all the blocks in that
+	 * hole. If the hole is in the meta-dnode, we instead need to redact
+	 * all blocks in every object covered by that hole. If it's a level 0
+	 * block, we only need to redact that single block.
+	 *
+	 * For a meta-dnode hole, the block range is converted to an object
+	 * range by multiplying by the number of dnode_phys_t that fit in one
+	 * data block.
+	 */
+	record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP);
+	record->eos_marker = B_FALSE;
+
+	record->start_object = record->end_object = zb->zb_object;
+	if (BP_IS_HOLE(bp)) {
+		record->start_blkid = zb->zb_blkid *
+		    bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
+
+		record->end_blkid = ((zb->zb_blkid + 1) *
+		    bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1;
+
+		if (zb->zb_object == DMU_META_DNODE_OBJECT) {
+			record->start_object = record->start_blkid *
+			    ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) /
+			    sizeof (dnode_phys_t));
+			record->start_blkid = 0;
+			record->end_object = ((record->end_blkid +
+			    1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) /
+			    sizeof (dnode_phys_t))) - 1;
+			record->end_blkid = UINT64_MAX;
+		}
+	} else if (zb->zb_level != 0 ||
+	    zb->zb_object == DMU_META_DNODE_OBJECT) {
+		kmem_free(record, sizeof (*record));	/* nothing to redact */
+		return (0);
+	} else {
+		record->start_blkid = record->end_blkid = zb->zb_blkid;
+	}
+	record->indblkshift = dnp->dn_indblkshift;
+	record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	record_merge_enqueue(&rta->q, &rta->current_record, record);
+
+	return (0);
+}
+
+static void
+redact_traverse_thread(void *arg)	/* arg: struct redact_thread_arg */
+{
+	struct redact_thread_arg *rt_arg = arg;
+	int err;
+	struct redact_record *data;
+#ifdef _KERNEL
+	if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS)
+		rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os);
+	else
+		rt_arg->deleted_objs = objlist_create();
+#else
+	rt_arg->deleted_objs = objlist_create();
+#endif
+
+	err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg,
+	    &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+	    redact_cb, rt_arg);
+
+	if (err != EINTR)	/* redact_cb returns EINTR only when cancelled */
+		rt_arg->error_code = err;
+	objlist_destroy(rt_arg->deleted_objs);
+	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+	data->eos_marker = B_TRUE;	/* always terminate the queue */
+	record_merge_enqueue(&rt_arg->q, &rt_arg->current_record, data);
+	thread_exit();
+}
+
+static inline void
+create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object,
+    uint64_t blkid)
+{
+	zb->zb_object = object;
+	zb->zb_level = 0;	/* only level, object and blkid are written */
+	zb->zb_blkid = blkid;
+}
+
+/*
+ * This is a utility function that can do the comparison for the start or ends
+ * of the ranges in a redact_record. off1 and off2 are block ids; dbss1 and
+ * dbss2 are data block sizes in bytes, converted to sectors for
+ * zbookmark_compare().
+ */
+static int
+redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1,
+    uint64_t obj2, uint64_t off2, uint32_t dbss2)
+{
+	zbookmark_phys_t z1, z2;
+	create_zbookmark_from_obj_off(&z1, obj1, off1);
+	create_zbookmark_from_obj_off(&z2, obj2, off2);
+
+	return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0,
+	    dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2));
+}
+
+/*
+ * Compare two redaction records by their range's start location. Also makes
+ * eos records always compare last. We use the thread number in the redact_node
+ * to ensure that records do not compare equal (which is not allowed in our avl
+ * trees).
+ */
+static int
+redact_node_compare_start(const void *arg1, const void *arg2)
+{
+	const struct redact_node *rn1 = arg1;
+	const struct redact_node *rn2 = arg2;
+	const struct redact_record *rr1 = rn1->record;
+	const struct redact_record *rr2 = rn2->record;
+	if (rr1->eos_marker)	/* checked first: two eos records give 1 */
+		return (1);
+	if (rr2->eos_marker)
+		return (-1);
+
+	int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid,
+	    rr1->datablksz, rr2->start_object, rr2->start_blkid,
+	    rr2->datablksz);
+	if (cmp == 0)
+		cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1);
+	return (cmp);
+}
+
+/*
+ * Compare two redaction records by their range's end location. Also makes
+ * eos records always compare last. We use the thread number in the redact_node
+ * to ensure that records do not compare equal (which is not allowed in our avl
+ * trees).
+ */ +static int +redact_node_compare_end(const void *arg1, const void *arg2) +{ + const struct redact_node *rn1 = arg1; + const struct redact_node *rn2 = arg2; + const struct redact_record *srr1 = rn1->record; + const struct redact_record *srr2 = rn2->record; + if (srr1->eos_marker) + return (1); + if (srr2->eos_marker) + return (-1); + + int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid, + srr1->datablksz, srr2->end_object, srr2->end_blkid, + srr2->datablksz); + if (cmp == 0) + cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); + return (cmp); +} + +/* + * Utility function that compares two redaction records to determine if any part + * of the "from" record is before any part of the "to" record. Also causes End + * of Stream redaction records to compare after all others, so that the + * redaction merging logic can stay simple. + */ +static boolean_t +redact_record_before(const struct redact_record *from, + const struct redact_record *to) +{ + if (from->eos_marker == B_TRUE) + return (B_FALSE); + else if (to->eos_marker == B_TRUE) + return (B_TRUE); + return (redact_range_compare(from->start_object, from->start_blkid, + from->datablksz, to->end_object, to->end_blkid, + to->datablksz) <= 0); +} + +/* + * Pop a new redaction record off the queue, check that the records are in the + * right order, and free the old data. + */ +static struct redact_record * +get_next_redact_record(bqueue_t *bq, struct redact_record *prev) +{ + struct redact_record *next = bqueue_dequeue(bq); + ASSERT(redact_record_before(prev, next)); + kmem_free(prev, sizeof (*prev)); + return (next); +} + +/* + * Remove the given redaction node from both trees, pull a new redaction record + * off the queue, free the old redaction record, update the redaction node, and + * reinsert the node into the trees. 
+ */ +static int +update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree, + struct redact_node *redact_node) +{ + avl_remove(start_tree, redact_node); + avl_remove(end_tree, redact_node); + redact_node->record = get_next_redact_record(&redact_node->rt_arg->q, + redact_node->record); + avl_add(end_tree, redact_node); + avl_add(start_tree, redact_node); + return (redact_node->rt_arg->error_code); +} + +/* + * Synctask for updating redaction lists. We first take this txg's list of + * redacted blocks and append those to the redaction list. We then update the + * redaction list's bonus buffer. We store the furthest blocks we visited and + * the list of snapshots that we're redacting with respect to. We need these so + * that redacted sends and receives can be correctly resumed. + */ +static void +redaction_list_update_sync(void *arg, dmu_tx_t *tx) +{ + struct merge_data *md = arg; + uint64_t txg = dmu_tx_get_txg(tx); + list_t *list = &md->md_blocks[txg & TXG_MASK]; + redact_block_phys_t *furthest_visited = + &md->md_furthest[txg & TXG_MASK]; + objset_t *mos = tx->tx_pool->dp_meta_objset; + redaction_list_t *rl = md->md_redaction_list; + int bufsize = redact_sync_bufsize; + redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf), + KM_SLEEP); + int index = 0; + + dmu_buf_will_dirty(rl->rl_dbuf, tx); + + for (struct redact_block_list_node *rbln = list_remove_head(list); + rbln != NULL; rbln = list_remove_head(list)) { + ASSERT3U(rbln->block.rbp_object, <=, + furthest_visited->rbp_object); + ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object || + rbln->block.rbp_blkid <= furthest_visited->rbp_blkid); + buf[index] = rbln->block; + index++; + if (index == bufsize) { + dmu_write(mos, rl->rl_object, + rl->rl_phys->rlp_num_entries * sizeof (*buf), + bufsize * sizeof (*buf), buf, tx); + rl->rl_phys->rlp_num_entries += bufsize; + index = 0; + } + kmem_free(rbln, sizeof (*rbln)); + } + if (index > 0) { + dmu_write(mos, rl->rl_object, 
rl->rl_phys->rlp_num_entries * + sizeof (*buf), index * sizeof (*buf), buf, tx); + rl->rl_phys->rlp_num_entries += index; + } + kmem_free(buf, bufsize * sizeof (*buf)); + + md->md_synctask_txg[txg & TXG_MASK] = B_FALSE; + rl->rl_phys->rlp_last_object = furthest_visited->rbp_object; + rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid; +} + +static void +commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, + uint64_t blkid) +{ + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir); + dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node)); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + if (!md->md_synctask_txg[txg & TXG_MASK]) { + dsl_sync_task_nowait(dmu_tx_pool(tx), + redaction_list_update_sync, md, tx); + md->md_synctask_txg[txg & TXG_MASK] = B_TRUE; + md->md_latest_synctask_txg = txg; + } + md->md_furthest[txg & TXG_MASK].rbp_object = object; + md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid; + list_move_tail(&md->md_blocks[txg & TXG_MASK], + &md->md_redact_block_pending); + dmu_tx_commit(tx); + md->md_last_time = gethrtime(); +} + +/* + * We want to store the list of blocks that we're redacting in the bookmark's + * redaction list. However, this list is stored in the MOS, which means it can + * only be written to in syncing context. To get around this, we create a + * synctask that will write to the mos for us. We tell it what to write by + * a linked list for each current transaction group; every time we decide to + * redact a block, we append it to the transaction group that is currently in + * open context. We also update some progress information that the synctask + * will store to enable resumable redacted sends. 
+ */ +static void +update_redaction_list(struct merge_data *md, objset_t *os, + uint64_t object, uint64_t blkid, uint64_t endblkid, uint32_t blksz) +{ + boolean_t enqueue = B_FALSE; + redact_block_phys_t cur = {0}; + uint64_t count = endblkid - blkid + 1; + while (count > REDACT_BLOCK_MAX_COUNT) { + update_redaction_list(md, os, object, blkid, + blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz); + blkid += REDACT_BLOCK_MAX_COUNT; + count -= REDACT_BLOCK_MAX_COUNT; + } + redact_block_phys_t *coalesce = &md->md_coalesce_block; + boolean_t new; + if (coalesce->rbp_size_count == 0) { + new = B_TRUE; + enqueue = B_FALSE; + } else { + uint64_t old_count = redact_block_get_count(coalesce); + if (coalesce->rbp_object == object && + coalesce->rbp_blkid + old_count == blkid && + old_count + count <= REDACT_BLOCK_MAX_COUNT) { + ASSERT3U(redact_block_get_size(coalesce), ==, blksz); + redact_block_set_count(coalesce, old_count + count); + new = B_FALSE; + enqueue = B_FALSE; + } else { + new = B_TRUE; + enqueue = B_TRUE; + } + } + + if (new) { + cur = *coalesce; + coalesce->rbp_blkid = blkid; + coalesce->rbp_object = object; + + redact_block_set_count(coalesce, count); + redact_block_set_size(coalesce, blksz); + } + + if (enqueue && redact_block_get_size(&cur) != 0) { + struct redact_block_list_node *rbln = + kmem_alloc(sizeof (struct redact_block_list_node), + KM_SLEEP); + rbln->block = cur; + list_insert_tail(&md->md_redact_block_pending, rbln); + } + + if (gethrtime() > md->md_last_time + + redaction_list_update_interval_ns) { + commit_rl_updates(os, md, object, blkid); + } +} + +/* + * This thread merges all the redaction records provided by the worker threads, + * and determines which blocks are redacted by all the snapshots. The algorithm + * for doing so is similar to performing a merge in mergesort with n sub-lists + * instead of 2, with some added complexity due to the fact that the entries are + * ranges, not just single blocks. 
This algorithm relies on the fact that the
+ * queues are sorted, which is ensured by the fact that traverse_dataset
+ * traverses the dataset in a consistent order. We pull one entry off the front
+ * of the queues of each secure dataset traversal thread. Then we repeat the
+ * following: each record represents a range of blocks modified by one of the
+ * redaction snapshots, and each block in that range may need to be redacted in
+ * the send stream. Find the record with the latest start of its range, and the
+ * record with the earliest end of its range. If the last start is at or before
+ * the first end, then we know that the blocks in the range
+ * [last_start, first_end] are covered by all of the ranges at the front of the
+ * queues, which means every thread redacts that whole range. For example,
+ * let's say the ranges on each queue look like this:
+ *
+ * Block Id    1  2  3  4  5  6  7  8  9 10 11
+ * Thread 1 |     [====================]
+ * Thread 2 |        [========]
+ * Thread 3 |              [=================]
+ *
+ * Thread 3 has the last start (5), and Thread 2 has the first end (6). All
+ * three threads modified the range [5,6], so that data should not be sent over
+ * the wire. After we've determined whether or not to redact anything, we take
+ * the record with the first end. We discard that record, and pull a new one
+ * off the front of the queue it came from. In the above example, we would
+ * discard Thread 2's record, and pull a new one. Let's say the next record we
+ * pulled from Thread 2 covered range [10,11]. The new layout would look like
+ * this:
+ *
+ * Block Id    1  2  3  4  5  6  7  8  9 10 11
+ * Thread 1 |     [====================]
+ * Thread 2 |                             [==]
+ * Thread 3 |              [=================]
+ *
+ * When we compare the last start (10, from Thread 2) and the first end (9, from
+ * Thread 1), we see that the last start is greater than the first end.
+ * Therefore, we do not redact anything from these records. We'll iterate by
+ * replacing the record from Thread 1.
+ *
+ * We iterate by replacing the record with the lowest end because we know
+ * that the record with the lowest end has helped us as much as it can. All the
+ * ranges before it that we will ever redact have been redacted. In addition,
+ * by replacing the one with the lowest end, we guarantee we catch all ranges
+ * that need to be redacted. For example, if in the case above we had replaced
+ * the record from Thread 1 instead, we might have ended up with the following:
+ *
+ * Block Id    1  2  3  4  5  6  7  8  9 10 11 12
+ * Thread 1 |                                [==]
+ * Thread 2 |        [========]
+ * Thread 3 |              [=================]
+ *
+ * If the next record from Thread 2 had been [8,10], for example, we should have
+ * redacted part of that range, but because we updated Thread 1's record, we
+ * missed it.
+ *
+ * We implement this algorithm by using two trees. The first sorts the
+ * redaction records by their start_zb, and the second sorts them by their
+ * end_zb. We use these to find the record with the last start and the record
+ * with the first end. We create a record with that start and end, and send it
+ * on. The overall runtime of this implementation is O(n log m), where n is the
+ * total number of redaction records from all the different redaction snapshots,
+ * and m is the number of redaction snapshots.
+ *
+ * If we redact with respect to zero snapshots, we create a redaction
+ * record with the start object set to 1 (object 0 cannot be redacted), the
+ * start blkid set to 0, and the end object and blkid set to UINT64_MAX. This
+ * will result in us redacting every block.
+ *
+ * Merged records are placed on "q". Returns 0 on success, EINTR if *cancel was
+ * observed, or the error code reported by a traversal thread. On return every
+ * traversal queue in thread_args has been drained and destroyed.
+ */
+static int
+perform_thread_merge(bqueue_t *q, uint32_t num_threads,
+    struct redact_thread_arg *thread_args, boolean_t *cancel)
+{
+	struct redact_node *redact_nodes = NULL;
+	avl_tree_t start_tree, end_tree;
+	struct redact_record *record;
+	struct redact_record *current_record = NULL;
+	int err = 0;
+
+	/*
+	 * If we're redacting with respect to zero snapshots, then no data is
+	 * permitted to be sent. We enqueue a record that redacts all blocks;
+	 * the caller appends the eos marker.
+	 */
+	if (num_threads == 0) {
+		record = kmem_zalloc(sizeof (struct redact_record),
+		    KM_SLEEP);
+		/* We can't redact object 0, so don't try. */
+		record->start_object = 1;
+		record->start_blkid = 0;
+		record->end_object = record->end_blkid = UINT64_MAX;
+		bqueue_enqueue(q, record, sizeof (*record));
+		return (0);
+	}
+
+	/* num_threads is known to be non-zero from here on. */
+	redact_nodes = kmem_zalloc(num_threads * sizeof (*redact_nodes),
+	    KM_SLEEP);
+
+	avl_create(&start_tree, redact_node_compare_start,
+	    sizeof (struct redact_node),
+	    offsetof(struct redact_node, avl_node_start));
+	avl_create(&end_tree, redact_node_compare_end,
+	    sizeof (struct redact_node),
+	    offsetof(struct redact_node, avl_node_end));
+
+	/* Seed both trees with the first record from every thread. */
+	for (uint32_t i = 0; i < num_threads; i++) {
+		struct redact_node *node = &redact_nodes[i];
+		struct redact_thread_arg *targ = &thread_args[i];
+		node->record = bqueue_dequeue(&targ->q);
+		node->rt_arg = targ;
+		node->thread_num = i;
+		avl_add(&start_tree, node);
+		avl_add(&end_tree, node);
+	}
+
+	/*
+	 * Once the first record in the end tree has returned EOS, every record
+	 * must be an EOS record, so we should stop.
+	 */
+	while (err == 0 && !((struct redact_node *)avl_first(&end_tree))->
+	    record->eos_marker) {
+		if (*cancel) {
+			err = EINTR;
+			break;
+		}
+		struct redact_node *last_start = avl_last(&start_tree);
+		struct redact_node *first_end = avl_first(&end_tree);
+
+		/*
+		 * If the last start record is before the first end record,
+		 * then we have blocks that are redacted by all threads.
+		 * Therefore, we should redact them. Copy the record, and send
+		 * it to the main thread.
+		 */
+		if (redact_record_before(last_start->record,
+		    first_end->record)) {
+			record = kmem_zalloc(sizeof (struct redact_record),
+			    KM_SLEEP);
+			*record = *first_end->record;
+			record->start_object = last_start->record->start_object;
+			record->start_blkid = last_start->record->start_blkid;
+			record_merge_enqueue(q, &current_record,
+			    record);
+		}
+		err = update_avl_trees(&start_tree, &end_tree, first_end);
+	}
+
+	/*
+	 * We're done; if we were cancelled, we need to cancel our workers and
+	 * clear out their queues. Either way, we need to remove every thread's
+	 * redact_node struct from the avl trees.
+	 */
+	for (uint32_t i = 0; i < num_threads; i++) {
+		if (err != 0) {
+			thread_args[i].cancel = B_TRUE;
+			while (!redact_nodes[i].record->eos_marker) {
+				(void) update_avl_trees(&start_tree, &end_tree,
+				    &redact_nodes[i]);
+			}
+		}
+		avl_remove(&start_tree, &redact_nodes[i]);
+		avl_remove(&end_tree, &redact_nodes[i]);
+		kmem_free(redact_nodes[i].record,
+		    sizeof (struct redact_record));
+		bqueue_destroy(&thread_args[i].q);
+	}
+
+	avl_destroy(&start_tree);
+	avl_destroy(&end_tree);
+	kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
+	/*
+	 * Flush the record still being coalesced. Its size is that of the
+	 * record itself, not of the pointer to it.
+	 */
+	if (current_record != NULL)
+		bqueue_enqueue(q, current_record, sizeof (*current_record));
+	return (err);
+}
+
+/* Argument block for redact_merge_thread(). */
+struct redact_merge_thread_arg {
+	bqueue_t q;		/* merged records, read by perform_redaction */
+	spa_t *spa;
+	int numsnaps;		/* number of entries in thr_args */
+	struct redact_thread_arg *thr_args;	/* one per traversal thread */
+	boolean_t cancel;	/* set by the consumer to stop the merge */
+	int error_code;		/* result of perform_thread_merge */
+};
+
+/*
+ * Thread entry point: run the merge, record its result in the argument
+ * block, and terminate the output queue with an eos marker.
+ */
+static void
+redact_merge_thread(void *arg)
+{
+	struct redact_merge_thread_arg *rmta = arg;
+	rmta->error_code = perform_thread_merge(&rmta->q,
+	    rmta->numsnaps, rmta->thr_args, &rmta->cancel);
+	struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
+	rec->eos_marker = B_TRUE;
+	/* NOTE(review): the eos marker is charged to the queue as size 1. */
+	bqueue_enqueue_flush(&rmta->q, rec, 1);
+	thread_exit();
+}
+
+/*
+ * Find the next object in or after the redaction range passed in, and hold
+ * its dnode with the provided tag. Also update *object to contain the new
+ * object number.
+ */ +static int +hold_next_object(objset_t *os, struct redact_record *rec, void *tag, + uint64_t *object, dnode_t **dn) +{ + int err = 0; + if (*dn != NULL) + dnode_rele(*dn, tag); + *dn = NULL; + if (*object < rec->start_object) { + *object = rec->start_object - 1; + } + err = dmu_object_next(os, object, B_FALSE, 0); + if (err != 0) + return (err); + + err = dnode_hold(os, *object, tag, dn); + while (err == 0 && (*object < rec->start_object || + DMU_OT_IS_METADATA((*dn)->dn_type))) { + dnode_rele(*dn, tag); + *dn = NULL; + err = dmu_object_next(os, object, B_FALSE, 0); + if (err != 0) + break; + err = dnode_hold(os, *object, tag, dn); + } + return (err); +} + +static int +perform_redaction(objset_t *os, redaction_list_t *rl, + struct redact_merge_thread_arg *rmta) +{ + int err = 0; + bqueue_t *q = &rmta->q; + struct redact_record *rec = NULL; + struct merge_data md = { {0} }; + + list_create(&md.md_redact_block_pending, + sizeof (struct redact_block_list_node), + offsetof(struct redact_block_list_node, node)); + md.md_redaction_list = rl; + + for (int i = 0; i < TXG_SIZE; i++) { + list_create(&md.md_blocks[i], + sizeof (struct redact_block_list_node), + offsetof(struct redact_block_list_node, node)); + } + dnode_t *dn = NULL; + uint64_t prev_obj = 0; + for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0; + rec = get_next_redact_record(q, rec)) { + ASSERT3U(rec->start_object, !=, 0); + uint64_t object; + if (prev_obj != rec->start_object) { + object = rec->start_object - 1; + err = hold_next_object(os, rec, FTAG, &object, &dn); + } else { + object = prev_obj; + } + while (err == 0 && object <= rec->end_object) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = EINTR; + break; + } + /* + * Part of the current object is contained somewhere in + * the range covered by rec. 
+ */ + uint64_t startblkid; + uint64_t endblkid; + uint64_t maxblkid = dn->dn_phys->dn_maxblkid; + + if (rec->start_object < object) + startblkid = 0; + else if (rec->start_blkid > maxblkid) + break; + else + startblkid = rec->start_blkid; + + if (rec->end_object > object || rec->end_blkid > + maxblkid) { + endblkid = maxblkid; + } else { + endblkid = rec->end_blkid; + } + update_redaction_list(&md, os, object, startblkid, + endblkid, dn->dn_datablksz); + + if (object == rec->end_object) + break; + err = hold_next_object(os, rec, FTAG, &object, &dn); + } + if (err == ESRCH) + err = 0; + if (dn != NULL) + prev_obj = object; + } + if (err == 0 && dn != NULL) + dnode_rele(dn, FTAG); + + if (err == ESRCH) + err = 0; + rmta->cancel = B_TRUE; + while (!rec->eos_marker) + rec = get_next_redact_record(q, rec); + kmem_free(rec, sizeof (*rec)); + + /* + * There may be a block that's being coalesced, sync that out before we + * return. + */ + if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) { + struct redact_block_list_node *rbln = + kmem_alloc(sizeof (struct redact_block_list_node), + KM_SLEEP); + rbln->block = md.md_coalesce_block; + list_insert_tail(&md.md_redact_block_pending, rbln); + } + commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX); + + /* + * Wait for all the redaction info to sync out before we return, so that + * anyone who attempts to resume this redaction will have all the data + * they need. 
+ */ + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + if (md.md_latest_synctask_txg != 0) + txg_wait_synced(dp, md.md_latest_synctask_txg); + for (int i = 0; i < TXG_SIZE; i++) + list_destroy(&md.md_blocks[i]); + return (err); +} + +static boolean_t +redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +int +dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, + const char *redactbook) +{ + int err = 0; + dsl_pool_t *dp = NULL; + dsl_dataset_t *ds = NULL; + int numsnaps = 0; + objset_t *os; + struct redact_thread_arg *args = NULL; + redaction_list_t *new_rl = NULL; + char *newredactbook; + + if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0) + return (err); + + newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN, + KM_SLEEP); + + if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT, + FTAG, &ds)) != 0) { + goto out; + } + dsl_dataset_long_hold(ds, FTAG); + if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) { + err = EINVAL; + goto out; + } + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) { + err = EALREADY; + goto out; + } + + numsnaps = fnvlist_num_pairs(redactnvl); + if (numsnaps > 0) + args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + + nvpair_t *pair = NULL; + for (int i = 0; i < numsnaps; i++) { + pair = nvlist_next_nvpair(redactnvl, pair); + const char *name = nvpair_name(pair); + struct redact_thread_arg *rta = &args[i]; + err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT, + FTAG, &rta->ds); + if (err != 0) + break; + /* + * We want to do the long hold before we can get any other + * errors, because the cleanup code will release the long + * hold if rta->ds is filled in. 
+ */ + dsl_dataset_long_hold(rta->ds, FTAG); + + err = dmu_objset_from_ds(rta->ds, &rta->os); + if (err != 0) + break; + if (!dsl_dataset_is_before(rta->ds, ds, 0)) { + err = EINVAL; + break; + } + if (dsl_dataset_feature_is_active(rta->ds, + SPA_FEATURE_REDACTED_DATASETS)) { + err = EALREADY; + break; + + } + } + if (err != 0) + goto out; + VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL); + + boolean_t resuming = B_FALSE; + zfs_bookmark_phys_t bookmark; + + (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN); + char *c = strchr(newredactbook, '@'); + ASSERT3P(c, !=, NULL); + int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook), + "#%s", redactbook); + if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) { + dsl_pool_rele(dp, FTAG); + kmem_free(newredactbook, + sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); + if (args != NULL) + kmem_free(args, numsnaps * sizeof (*args)); + return (SET_ERROR(ENAMETOOLONG)); + } + err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); + if (err == 0) { + resuming = B_TRUE; + if (bookmark.zbm_redaction_obj == 0) { + err = EEXIST; + goto out; + } + err = dsl_redaction_list_hold_obj(dp, + bookmark.zbm_redaction_obj, FTAG, &new_rl); + if (err != 0) { + err = EIO; + goto out; + } + dsl_redaction_list_long_hold(dp, new_rl, FTAG); + if (new_rl->rl_phys->rlp_num_snaps != numsnaps) { + err = ESRCH; + goto out; + } + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps, + new_rl->rl_phys->rlp_num_snaps, + dsl_dataset_phys(rta->ds)->ds_guid)) { + err = ESRCH; + goto out; + } + } + if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX && + new_rl->rl_phys->rlp_last_object == UINT64_MAX) { + err = EEXIST; + goto out; + } + dsl_pool_rele(dp, FTAG); + dp = NULL; + } else { + uint64_t *guids = NULL; + if (numsnaps > 0) { + guids = kmem_zalloc(numsnaps * sizeof (uint64_t), + KM_SLEEP); + } + for (int i = 0; i < numsnaps; i++) { 
+ struct redact_thread_arg *rta = &args[i]; + guids[i] = dsl_dataset_phys(rta->ds)->ds_guid; + } + + dsl_pool_rele(dp, FTAG); + dp = NULL; + err = dsl_bookmark_create_redacted(newredactbook, snapname, + numsnaps, guids, FTAG, &new_rl); + kmem_free(guids, numsnaps * sizeof (uint64_t)); + if (err != 0) { + goto out; + } + } + + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + (void) bqueue_init(&rta->q, zfs_redact_queue_ff, + zfs_redact_queue_length, + offsetof(struct redact_record, ln)); + if (resuming) { + rta->resume.zb_blkid = + new_rl->rl_phys->rlp_last_blkid; + rta->resume.zb_object = + new_rl->rl_phys->rlp_last_object; + } + rta->txg = dsl_dataset_phys(ds)->ds_creation_txg; + (void) thread_create(NULL, 0, redact_traverse_thread, rta, + 0, curproc, TS_RUN, minclsyspri); + } + + struct redact_merge_thread_arg *rmta; + rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP); + + (void) bqueue_init(&rmta->q, zfs_redact_queue_ff, + zfs_redact_queue_length, offsetof(struct redact_record, ln)); + rmta->numsnaps = numsnaps; + rmta->spa = os->os_spa; + rmta->thr_args = args; + (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc, + TS_RUN, minclsyspri); + err = perform_redaction(os, new_rl, rmta); + bqueue_destroy(&rmta->q); + kmem_free(rmta, sizeof (struct redact_merge_thread_arg)); + +out: + kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); + + if (new_rl != NULL) { + dsl_redaction_list_long_rele(new_rl, FTAG); + dsl_redaction_list_rele(new_rl, FTAG); + } + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + /* + * rta->ds may be NULL if we got an error while filling + * it in. 
+ */ + if (rta->ds != NULL) { + dsl_dataset_long_rele(rta->ds, FTAG); + dsl_dataset_rele_flags(rta->ds, + DS_HOLD_FLAG_DECRYPT, FTAG); + } + } + + if (args != NULL) + kmem_free(args, numsnaps * sizeof (*args)); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + if (ds != NULL) { + dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + } + return (SET_ERROR(err)); + +} diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index a6ff5ce3e4..0658e13c2d 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -21,11 +21,13 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #include @@ -51,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -58,104 +61,191 @@ #include #include #include +#include +#ifdef _KERNEL +#include +#endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; +/* + * This tunable controls the amount of data (measured in bytes) that will be + * prefetched by zfs send. If the main thread is blocking on reads that haven't + * completed, this variable might need to be increased. If instead the main + * thread is issuing new reads because the prefetches have fallen out of the + * cache, this may need to be decreased. 
+ */ int zfs_send_queue_length = SPA_MAXBLOCKSIZE; -/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ -int zfs_send_set_freerecords_bit = B_TRUE; -/* Set this tunable to FALSE is disable sending unmodified spill blocks. */ -int zfs_send_unmodified_spill_blocks = B_TRUE; +/* + * This tunable controls the length of the queues that zfs send worker threads + * use to communicate. If the send_main_thread is blocking on these queues, + * this variable may need to be increased. If there is a significant slowdown + * at the start of a send as these threads consume all the available IO + * resources, this variable may need to be decreased. + */ +int zfs_send_no_prefetch_queue_length = 1024 * 1024; +/* + * These tunables control the fill fraction of the queues by zfs send. The fill + * fraction controls the frequency with which threads have to be cv_signaled. + * If a lot of cpu time is being spent on cv_signal, then these should be tuned + * down. If the queues empty before the signalled thread can catch up, then + * these should be tuned up. + */ +int zfs_send_queue_ff = 20; +int zfs_send_no_prefetch_queue_ff = 20; /* * Use this to override the recordsize calculation for fast zfs send estimates. */ -unsigned long zfs_override_estimate_recordsize = 0; +int zfs_override_estimate_recordsize = 0; -#define BP_SPAN(datablkszsec, indblkshift, level) \ - (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (indblkshift - SPA_BLKPTRSHIFT))) +/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ +int zfs_send_set_freerecords_bit = B_TRUE; + +/* Set this tunable to FALSE is disable sending unmodified spill blocks. 
*/ +int zfs_send_unmodified_spill_blocks = B_TRUE; + +static inline boolean_t +overflow_multiply(uint64_t a, uint64_t b, uint64_t *c) +{ + uint64_t temp = a * b; + if (b != 0 && temp / b != a) + return (B_FALSE); + *c = temp; + return (B_TRUE); +} struct send_thread_arg { bqueue_t q; - dsl_dataset_t *ds; /* Dataset to traverse */ + objset_t *os; /* Objset to traverse */ uint64_t fromtxg; /* Traverse from this txg */ int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; zbookmark_phys_t resume; + uint64_t *num_blocks_visited; }; -struct send_block_record { +struct redact_list_thread_arg { + boolean_t cancel; + bqueue_t q; + zbookmark_phys_t resume; + redaction_list_t *rl; + boolean_t mark_redact; + int error_code; + uint64_t *num_blocks_visited; +}; + +struct send_merge_thread_arg { + bqueue_t q; + objset_t *os; + struct redact_list_thread_arg *from_arg; + struct send_thread_arg *to_arg; + struct redact_list_thread_arg *redact_arg; + int error; + boolean_t cancel; +}; + +struct send_range { boolean_t eos_marker; /* Marks the end of the stream */ - blkptr_t bp; - zbookmark_phys_t zb; - uint8_t indblkshift; - uint16_t datablkszsec; + uint64_t object; + uint64_t start_blkid; + uint64_t end_blkid; bqueue_node_t ln; + enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT, + PREVIOUSLY_REDACTED} type; + union { + struct srd { + dmu_object_type_t obj_type; + uint32_t datablksz; // logical size + uint32_t datasz; // payload size + blkptr_t bp; + arc_buf_t *abuf; + abd_t *abd; + kmutex_t lock; + kcondvar_t cv; + boolean_t io_outstanding; + int io_err; + } data; + struct srh { + uint32_t datablksz; + } hole; + struct sro { + /* + * This is a pointer because embedding it in the + * struct causes these structures to be massively larger + * for all range types; this makes the code much less + * memory efficient. 
+ */ + dnode_phys_t *dnp; + blkptr_t bp; + } object; + struct srr { + uint32_t datablksz; + } redact; + struct sror { + blkptr_t bp; + } object_range; + } sru; }; -typedef struct dump_bytes_io { - dmu_sendarg_t *dbi_dsp; - void *dbi_buf; - int dbi_len; -} dump_bytes_io_t; +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free(), + * dump_freeobjects(), and dump_redact() can be aggregated into a single + * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record. + */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS, + PENDING_REDACT +} dmu_pendop_t; -static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data); +typedef struct dmu_send_cookie { + dmu_replay_record_t *dsc_drr; + dmu_send_outparams_t *dsc_dso; + offset_t *dsc_off; + objset_t *dsc_os; + zio_cksum_t dsc_zc; + uint64_t dsc_toguid; + uint64_t dsc_fromtxg; + int dsc_err; + dmu_pendop_t dsc_pending_op; + uint64_t dsc_featureflags; + uint64_t dsc_last_data_object; + uint64_t dsc_last_data_offset; + uint64_t dsc_resume_object; + uint64_t dsc_resume_offset; + boolean_t dsc_sent_begin; + boolean_t dsc_sent_end; +} dmu_send_cookie_t; + +static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range); static void -dump_bytes_cb(void *arg) +range_free(struct send_range *range) { - dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; - dmu_sendarg_t *dsp = dbi->dbi_dsp; - dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); - ssize_t resid; /* have to get resid to get detailed errno */ + if (range->type == OBJECT) { + size_t size = sizeof (dnode_phys_t) * + (range->sru.object.dnp->dn_extra_slots + 1); + kmem_free(range->sru.object.dnp, size); + } else if (range->type == DATA) { + mutex_enter(&range->sru.data.lock); + while (range->sru.data.io_outstanding) + cv_wait(&range->sru.data.cv, &range->sru.data.lock); + if (range->sru.data.abd != NULL) + abd_free(range->sru.data.abd); + if (range->sru.data.abuf 
!= NULL) { + arc_buf_destroy(range->sru.data.abuf, + &range->sru.data.abuf); + } + mutex_exit(&range->sru.data.lock); - /* - * The code does not rely on len being a multiple of 8. We keep - * this assertion because of the corresponding assertion in - * receive_read(). Keeping this assertion ensures that we do not - * inadvertently break backwards compatibility (causing the assertion - * in receive_read() to trigger on old software). Newer feature flags - * (such as raw send) may break this assertion since they were - * introduced after the requirement was made obsolete. - */ - - ASSERT(dbi->dbi_len % 8 == 0 || - (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); - - dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, - (caddr_t)dbi->dbi_buf, dbi->dbi_len, - 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); - - mutex_enter(&ds->ds_sendstream_lock); - *dsp->dsa_off += dbi->dbi_len; - mutex_exit(&ds->ds_sendstream_lock); -} - -static int -dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) -{ - dump_bytes_io_t dbi; - - dbi.dbi_dsp = dsp; - dbi.dbi_buf = buf; - dbi.dbi_len = len; - -#if defined(HAVE_LARGE_STACKS) - dump_bytes_cb(&dbi); -#else - /* - * The vn_rdwr() call is performed in a taskq to ensure that there is - * always enough stack space to write safely to the target filesystem. - * The ZIO_TYPE_FREE threads are used because there can be a lot of - * them and they are used in vdev_file.c for a similar purpose. - */ - spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE, - ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); -#endif /* HAVE_LARGE_STACKS */ - - return (dsp->dsa_err); + cv_destroy(&range->sru.data.cv); + mutex_destroy(&range->sru.data.lock); + } + kmem_free(range, sizeof (*range)); } /* @@ -164,32 +254,60 @@ dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) * up to the start of the checksum itself. 
*/ static int -dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) +dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len) { + dmu_send_outparams_t *dso = dscp->dsc_dso; ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - (void) fletcher_4_incremental_native(dsp->dsa_drr, + (void) fletcher_4_incremental_native(dscp->dsc_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &dsp->dsa_zc); - if (dsp->dsa_drr->drr_type == DRR_BEGIN) { - dsp->dsa_sent_begin = B_TRUE; + &dscp->dsc_zc); + if (dscp->dsc_drr->drr_type == DRR_BEGIN) { + dscp->dsc_sent_begin = B_TRUE; } else { - ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. + ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u. drr_checksum.drr_checksum)); - dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; + dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc; } - if (dsp->dsa_drr->drr_type == DRR_END) { - dsp->dsa_sent_end = B_TRUE; + if (dscp->dsc_drr->drr_type == DRR_END) { + dscp->dsc_sent_end = B_TRUE; } - (void) fletcher_4_incremental_native(&dsp->dsa_drr-> + (void) fletcher_4_incremental_native(&dscp->dsc_drr-> drr_u.drr_checksum.drr_checksum, - sizeof (zio_cksum_t), &dsp->dsa_zc); - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) + sizeof (zio_cksum_t), &dscp->dsc_zc); + *dscp->dsc_off += sizeof (dmu_replay_record_t); + dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr, + sizeof (dmu_replay_record_t), dso->dso_arg); + if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { - (void) fletcher_4_incremental_native(payload, payload_len, - &dsp->dsa_zc); - if (dump_bytes(dsp, payload, payload_len) != 0) + *dscp->dsc_off += payload_len; + /* + * payload is null when dso_dryrun == B_TRUE (i.e. 
when we're + * doing a send size calculation) + */ + if (payload != NULL) { + (void) fletcher_4_incremental_native( + payload, payload_len, &dscp->dsc_zc); + } + + /* + * The code does not rely on this (len being a multiple of 8). + * We keep this assertion because of the corresponding assertion + * in receive_read(). Keeping this assertion ensures that we do + * not inadvertently break backwards compatibility (causing the + * assertion in receive_read() to trigger on old software). + * + * Raw sends cannot be received on old software, and so can + * bypass this assertion. + */ + + ASSERT((payload_len % 8 == 0) || + (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)); + + dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload, + payload_len, dso->dso_arg); + if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); } return (0); @@ -204,10 +322,10 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) * and freeobject records that were generated on the source. */ static int -dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, +dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, uint64_t length) { - struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes @@ -222,87 +340,131 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, * another way to assert that the one-record constraint is still * satisfied. */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); + ASSERT(object > dscp->dsc_last_data_object || + (object == dscp->dsc_last_data_object && + offset > dscp->dsc_last_data_offset)); /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. 
DRR_FREEOBJECTS records can only be - * aggregated with other DRR_FREEOBJECTS records. + * aggregated with other DRR_FREEOBJECTS records). */ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_FREE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } - if (dsp->dsa_pending_op == PENDING_FREE) { - /* - * There should never be a PENDING_FREE if length is - * DMU_OBJECT_END (because dump_dnode is the only place where - * this function is called with a DMU_OBJECT_END, and only after - * flushing any pending record). - */ - ASSERT(length != DMU_OBJECT_END); + if (dscp->dsc_pending_op == PENDING_FREE) { /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { - if (offset + length < offset) - drrf->drr_length = DMU_OBJECT_END; + if (offset + length < offset || length == UINT64_MAX) + drrf->drr_length = UINT64_MAX; else drrf->drr_length += length; return (0); } else { /* not a continuation. 
Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREE; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; if (offset + length < offset) drrf->drr_length = DMU_OBJECT_END; else drrf->drr_length = length; - drrf->drr_toguid = dsp->dsa_toguid; + drrf->drr_toguid = dscp->dsc_toguid; if (length == DMU_OBJECT_END) { - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { - dsp->dsa_pending_op = PENDING_FREE; + dscp->dsc_pending_op = PENDING_FREE; } return (0); } +/* + * Fill in the drr_redact struct, or perform aggregation if the previous record + * is also a redaction record, and the two are adjacent. + */ static int -dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, +dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, + uint64_t length) +{ + struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact; + + /* + * If there is a pending op, but it's not PENDING_REDACT, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_REDACT records can only be aggregated with + * other DRR_REDACT records). + */ + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_REDACT) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + if (dscp->dsc_pending_op == PENDING_REDACT) { + /* + * Check to see whether this redacted block can be aggregated + * with pending one. 
+ */ + if (drrr->drr_object == object && drrr->drr_offset + + drrr->drr_length == offset) { + drrr->drr_length += length; + return (0); + } else { + /* not a continuation. Push out pending record */ + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + } + /* create a REDACT record and make it pending */ + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_REDACT; + drrr->drr_object = object; + drrr->drr_offset = offset; + drrr->drr_length = length; + drrr->drr_toguid = dscp->dsc_toguid; + dscp->dsc_pending_op = PENDING_REDACT; + + return (0); +} + +static int +dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) { uint64_t payload_size; - boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); - struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); + boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); + struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); - dsp->dsa_last_data_object = object; - dsp->dsa_last_data_offset = offset + lsize - 1; + ASSERT(object > dscp->dsc_last_data_object || + (object == dscp->dsc_last_data_object && + offset > dscp->dsc_last_data_offset)); + dscp->dsc_last_data_object = object; + dscp->dsc_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either @@ -310,22 +472,24 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, * the stream, since aggregation can't be done across operations * of different types. 
*/ - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } /* write a WRITE record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; - drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_toguid = dscp->dsc_toguid; drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed or raw */ if (raw || lsize != psize) { + ASSERT(raw || dscp->dsc_featureflags & + DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3S(psize, >, 0); @@ -345,7 +509,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, zio_crypt_decode_mac_bp(bp, drrw->drr_mac); } else { /* this is a compressed block */ - ASSERT(dsp->dsa_featureflags & + ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); @@ -365,7 +529,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, /* * There's no pre-computed checksum for partial-block writes, * embedded BP's, or encrypted BP's that are being sent as - * plaintext, so (like fletcher4-checkummed blocks) userland + * plaintext, so (like fletcher4-checksummed blocks) userland * will have to compute a dedup-capable checksum itself. 
*/ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; @@ -381,33 +545,33 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, drrw->drr_key.ddk_cksum = bp->blk_cksum; } - if (dump_record(dsp, data, payload_size) != 0) + if (dump_record(dscp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, +dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = - &(dsp->dsa_drr->drr_u.drr_write_embedded); + &(dscp->dsc_drr->drr_u.drr_write_embedded); - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; - drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_toguid = dscp->dsc_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); @@ -415,39 +579,40 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, decode_embedded_bp_compressed(bp, buf); - if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data) +dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, + void *data) { - struct drr_spill *drrs = 
&(dsp->dsa_drr->drr_u.drr_spill); + struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill); uint64_t blksz = BP_GET_LSIZE(bp); uint64_t payload_size = blksz; - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } /* write a SPILL record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_SPILL; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; - drrs->drr_toguid = dsp->dsa_toguid; + drrs->drr_toguid = dscp->dsc_toguid; /* See comment in dump_dnode() for full details */ if (zfs_send_unmodified_spill_blocks && - (bp->blk_birth <= dsp->dsa_fromtxg)) { + (bp->blk_birth <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; } /* handle raw send fields */ - if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { + if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) @@ -459,17 +624,17 @@ dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data) payload_size = drrs->drr_compressed_size; } - if (dump_record(dsp, data, payload_size) != 0) + if (dump_record(dscp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) +dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs) { - struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); + struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects); uint64_t maxobj = DNODES_PER_BLOCK * - (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1); + (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1); /* * ZoL < 0.7 does not handle large FREEOBJECTS records 
correctly, @@ -478,7 +643,7 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) * receiving side. */ if (maxobj > 0) { - if (maxobj < firstobj) + if (maxobj <= firstobj) return (0); if (maxobj < firstobj + numobjs) @@ -490,15 +655,16 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records - * can only be aggregated with other DRR_FREEOBJECTS records. + * can only be aggregated with other DRR_FREEOBJECTS records). */ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREEOBJECTS) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_FREEOBJECTS) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } - if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { + + if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one @@ -508,32 +674,32 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) return (0); } else { /* can't be aggregated. 
Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; - drrfo->drr_toguid = dsp->dsa_toguid; + drrfo->drr_toguid = dscp->dsc_toguid; - dsp->dsa_pending_op = PENDING_FREEOBJECTS; + dscp->dsc_pending_op = PENDING_FREEOBJECTS; return (0); } static int -dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, +dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, dnode_phys_t *dnp) { - struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object); int bonuslen; - if (object < dsp->dsa_resume_object) { + if (object < dscp->dsc_resume_object) { /* * Note: when resuming, we will visit all the dnodes in * the block of dnodes that we are resuming from. In @@ -541,23 +707,23 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, * the one we are resuming from. We should be at most one * block's worth of dnodes behind the resume point. 
*/ - ASSERT3U(dsp->dsa_resume_object - object, <, + ASSERT3U(dscp->dsc_resume_object - object, <, 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); return (0); } if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(dsp, object, 1)); + return (dump_freeobjects(dscp, object, 1)); - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } /* write an OBJECT record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_OBJECT; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; @@ -566,15 +732,15 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; - drro->drr_toguid = dsp->dsa_toguid; + drro->drr_toguid = dscp->dsc_toguid; - if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8); - if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) { + if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { ASSERT(BP_IS_ENCRYPTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) @@ -599,22 +765,22 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, /* * DRR_OBJECT_SPILL is set for every dnode which references a - * spill block. This allows the receiving pool to definitively + * spill block. This allows the receiving pool to definitively * determine when a spill block should be kept or freed. 
*/ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) drro->drr_flags |= DRR_OBJECT_SPILL; - if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0) + if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0) return (SET_ERROR(EINTR)); /* Free anything past the end of the file. */ - if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * + if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); /* - * Send DRR_SPILL records for unmodified spill blocks. This is useful + * Send DRR_SPILL records for unmodified spill blocks. This is useful * because changing certain attributes of the object (e.g. blocksize) * can cause old versions of ZFS to incorrectly remove a spill block. * Including these records in the stream forces an up to date version @@ -624,62 +790,67 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, */ if (zfs_send_unmodified_spill_blocks && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) { - struct send_block_record record; + (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) { + struct send_range record; + blkptr_t *bp = DN_SPILL_BLKPTR(dnp); - bzero(&record, sizeof (struct send_block_record)); + bzero(&record, sizeof (struct send_range)); + record.type = DATA; + record.object = object; record.eos_marker = B_FALSE; - record.bp = *DN_SPILL_BLKPTR(dnp); - SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os), - object, 0, DMU_SPILL_BLKID); + record.start_blkid = DMU_SPILL_BLKID; + record.end_blkid = record.start_blkid + 1; + record.sru.data.bp = *bp; + record.sru.data.obj_type = dnp->dn_type; + record.sru.data.datablksz = BP_GET_LSIZE(bp); - if (do_dump(dsp, &record) != 0) + if (do_dump(dscp, &record) != 0) return (SET_ERROR(EINTR)); } - if (dsp->dsa_err != 0) + if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); return (0); } static int -dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t 
firstobj, - uint64_t numslots) +dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp, + uint64_t firstobj, uint64_t numslots) { struct drr_object_range *drror = - &(dsp->dsa_drr->drr_u.drr_object_range); + &(dscp->dsc_drr->drr_u.drr_object_range); /* we only use this record type for raw sends */ ASSERT(BP_IS_PROTECTED(bp)); - ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); + ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE); ASSERT0(BP_GET_LEVEL(bp)); - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; + dscp->dsc_pending_op = PENDING_NONE; } - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE; + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE; drror->drr_firstobj = firstobj; drror->drr_numslots = numslots; - drror->drr_toguid = dsp->dsa_toguid; + drror->drr_toguid = dscp->dsc_toguid; if (BP_SHOULD_BYTESWAP(bp)) drror->drr_flags |= DRR_RAW_BYTESWAP; zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv); zio_crypt_decode_mac_bp(bp, drror->drr_mac); - if (dump_record(dsp, NULL, 0) != 0) + if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t -backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) +send_do_embed(const blkptr_t *bp, uint64_t featureflags) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); @@ -688,7 +859,15 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) * Compression function must be legacy, or explicitly enabled. 
*/ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && - !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) + !(featureflags & DMU_BACKUP_FEATURE_LZ4))) + return (B_FALSE); + + /* + * If we have not set the ZSTD feature flag, we can't send ZSTD + * compressed embedded blocks, as the receiver may not support them. + */ + if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD && + !(featureflags & DMU_BACKUP_FEATURE_ZSTD))) return (B_FALSE); /* @@ -696,7 +875,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) */ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: - if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: @@ -706,7 +885,208 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) } /* - * This is the callback function to traverse_dataset that acts as the worker + * This function actually handles figuring out what kind of record needs to be + * dumped, and calling the appropriate helper function. In most cases, + * the data has already been read by send_reader_thread(). 
+ */ +static int +do_dump(dmu_send_cookie_t *dscp, struct send_range *range) +{ + int err = 0; + switch (range->type) { + case OBJECT: + err = dump_dnode(dscp, &range->sru.object.bp, range->object, + range->sru.object.dnp); + return (err); + case OBJECT_RANGE: { + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { + return (0); + } + uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >> + DNODE_SHIFT; + uint64_t firstobj = range->start_blkid * epb; + err = dump_object_range(dscp, &range->sru.object_range.bp, + firstobj, epb); + break; + } + case REDACT: { + struct srr *srrp = &range->sru.redact; + err = dump_redact(dscp, range->object, range->start_blkid * + srrp->datablksz, (range->end_blkid - range->start_blkid) * + srrp->datablksz); + return (err); + } + case DATA: { + struct srd *srdp = &range->sru.data; + blkptr_t *bp = &srdp->bp; + spa_t *spa = + dmu_objset_spa(dscp->dsc_os); + + ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + if (BP_GET_TYPE(bp) == DMU_OT_SA) { + arc_flags_t aflags = ARC_FLAG_WAIT; + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(BP_IS_PROTECTED(bp)); + zioflags |= ZIO_FLAG_RAW; + } + + zbookmark_phys_t zb; + ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); + zb.zb_objset = dmu_objset_id(dscp->dsc_os); + zb.zb_object = range->object; + zb.zb_level = 0; + zb.zb_blkid = range->start_blkid; + + arc_buf_t *abuf = NULL; + if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, + bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + zioflags, &aflags, &zb) != 0) + return (SET_ERROR(EIO)); + + err = dump_spill(dscp, bp, zb.zb_object, + (abuf == NULL ? 
NULL : abuf->b_data)); + if (abuf != NULL) + arc_buf_destroy(abuf, &abuf); + return (err); + } + if (send_do_embed(bp, dscp->dsc_featureflags)) { + err = dump_write_embedded(dscp, range->object, + range->start_blkid * srdp->datablksz, + srdp->datablksz, bp); + return (err); + } + ASSERT(range->object > dscp->dsc_resume_object || + (range->object == dscp->dsc_resume_object && + range->start_blkid * srdp->datablksz >= + dscp->dsc_resume_offset)); + /* it's a level-0 block of a regular object */ + + mutex_enter(&srdp->lock); + while (srdp->io_outstanding) + cv_wait(&srdp->cv, &srdp->lock); + err = srdp->io_err; + mutex_exit(&srdp->lock); + + if (err != 0) { + if (zfs_send_corrupt_data && + !dscp->dsc_dso->dso_dryrun) { + /* + * Send a block filled with 0x"zfs badd bloc" + */ + srdp->abuf = arc_alloc_buf(spa, &srdp->abuf, + ARC_BUFC_DATA, srdp->datablksz); + uint64_t *ptr; + for (ptr = srdp->abuf->b_data; + (char *)ptr < (char *)srdp->abuf->b_data + + srdp->datablksz; ptr++) + *ptr = 0x2f5baddb10cULL; + } else { + return (SET_ERROR(EIO)); + } + } + + ASSERT(dscp->dsc_dso->dso_dryrun || + srdp->abuf != NULL || srdp->abd != NULL); + + uint64_t offset = range->start_blkid * srdp->datablksz; + + char *data = NULL; + if (srdp->abd != NULL) { + data = abd_to_buf(srdp->abd); + ASSERT3P(srdp->abuf, ==, NULL); + } else if (srdp->abuf != NULL) { + data = srdp->abuf->b_data; + } + + /* + * If we have large blocks stored on disk but the send flags + * don't allow us to send large blocks, we split the data from + * the arc buf into chunks. + */ + if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && + !(dscp->dsc_featureflags & + DMU_BACKUP_FEATURE_LARGE_BLOCKS)) { + while (srdp->datablksz > 0 && err == 0) { + int n = MIN(srdp->datablksz, + SPA_OLD_MAXBLOCKSIZE); + err = dmu_dump_write(dscp, srdp->obj_type, + range->object, offset, n, n, NULL, data); + offset += n; + /* + * When doing dry run, data==NULL is used as a + * sentinel value by + * dmu_dump_write()->dump_record(). 
+ */ + if (data != NULL) + data += n; + srdp->datablksz -= n; + } + } else { + err = dmu_dump_write(dscp, srdp->obj_type, + range->object, offset, + srdp->datablksz, srdp->datasz, bp, data); + } + return (err); + } + case HOLE: { + struct srh *srhp = &range->sru.hole; + if (range->object == DMU_META_DNODE_OBJECT) { + uint32_t span = srhp->datablksz >> DNODE_SHIFT; + uint64_t first_obj = range->start_blkid * span; + uint64_t numobj = range->end_blkid * span - first_obj; + return (dump_freeobjects(dscp, first_obj, numobj)); + } + uint64_t offset = 0; + + /* + * If this multiply overflows, we don't need to send this block. + * Even if it has a birth time, it can never not be a hole, so + * we don't need to send records for it. + */ + if (!overflow_multiply(range->start_blkid, srhp->datablksz, + &offset)) { + return (0); + } + uint64_t len = 0; + + if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len)) + len = UINT64_MAX; + len = len - offset; + return (dump_free(dscp, range->object, offset, len)); + } + default: + panic("Invalid range type in do_dump: %d", range->type); + } + return (err); +} + +static struct send_range * +range_alloc(enum type type, uint64_t object, uint64_t start_blkid, + uint64_t end_blkid, boolean_t eos) +{ + struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP); + range->type = type; + range->object = object; + range->start_blkid = start_blkid; + range->end_blkid = end_blkid; + range->eos_marker = eos; + if (type == DATA) { + range->sru.data.abd = NULL; + range->sru.data.abuf = NULL; + mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL); + range->sru.data.io_outstanding = 0; + range->sru.data.io_err = 0; + } + return (range); +} + +/* + * This is the callback function to traverse_dataset that acts as a worker * thread for dmu_send_impl. 
*/ /*ARGSUSED*/ @@ -715,304 +1095,1260 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { struct send_thread_arg *sta = arg; - struct send_block_record *record; - uint64_t record_size; - int err = 0; + struct send_range *record; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= sta->resume.zb_object); - ASSERT3P(sta->ds, !=, NULL); - - if (sta->cancel) - return (SET_ERROR(EINTR)); - - if (bp == NULL) { - ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); - return (0); - } else if (zb->zb_level < 0) { - return (0); - } - - record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); - record->eos_marker = B_FALSE; - record->bp = *bp; - record->zb = *zb; - record->indblkshift = dnp->dn_indblkshift; - record->datablkszsec = dnp->dn_datablkszsec; - record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - bqueue_enqueue(&sta->q, record, record_size); - - return (err); -} - -/* - * This function kicks off the traverse_dataset. It also handles setting the - * error code of the thread in case something goes wrong, and pushes the End of - * Stream record when the traverse_dataset call has finished. If there is no - * dataset to traverse, the thread immediately pushes End of Stream marker. 
- */ -static void -send_traverse_thread(void *arg) -{ - struct send_thread_arg *st_arg = arg; - int err; - struct send_block_record *data; - fstrans_cookie_t cookie = spl_fstrans_mark(); - - if (st_arg->ds != NULL) { - err = traverse_dataset_resume(st_arg->ds, - st_arg->fromtxg, &st_arg->resume, - st_arg->flags, send_cb, st_arg); - - if (err != EINTR) - st_arg->error_code = err; - } - data = kmem_zalloc(sizeof (*data), KM_SLEEP); - data->eos_marker = B_TRUE; - bqueue_enqueue(&st_arg->q, data, 1); - spl_fstrans_unmark(cookie); - thread_exit(); -} - -/* - * This function actually handles figuring out what kind of record needs to be - * dumped, reading the data (which has hopefully been prefetched), and calling - * the appropriate helper function. - */ -static int -do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) -{ - dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); - const blkptr_t *bp = &data->bp; - const zbookmark_phys_t *zb = &data->zb; - uint8_t indblkshift = data->indblkshift; - uint16_t dblkszsec = data->datablkszsec; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; - int err = 0; - - ASSERT3U(zb->zb_level, >=, 0); - - ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || - zb->zb_object >= dsa->dsa_resume_object); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. 
*/ - if (dsa->dsa_os->os_encrypted && + if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { spa_log_error(spa, zb); zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", ds->ds_object); + "object set %llu", dmu_objset_id(sta->os)); return (SET_ERROR(EIO)); } + if (sta->cancel) + return (SET_ERROR(EINTR)); if (zb->zb_object != DMU_META_DNODE_OBJECT && - DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) return (0); - } else if (BP_IS_HOLE(bp) && - zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); - } else if (BP_IS_HOLE(bp)) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t offset = zb->zb_blkid * span; - /* Don't dump free records for offsets > DMU_OBJECT_END */ - if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid) - err = dump_free(dsa, zb->zb_object, offset, span); - } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { + atomic_inc_64(sta->num_blocks_visited); + + if (zb->zb_level == ZB_DNODE_LEVEL) { + if (zb->zb_object == DMU_META_DNODE_OBJECT) + return (0); + record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE); + record->sru.object.bp = *bp; + size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1); + record->sru.object.dnp = kmem_alloc(size, KM_SLEEP); + bcopy(dnp, record->sru.object.dnp, size); + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); + } + if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT && + !BP_IS_HOLE(bp)) { + record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid, + zb->zb_blkid + 1, B_FALSE); + record->sru.object_range.bp = *bp; + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); + } + if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp))) + return (0); + if (zb->zb_object == DMU_META_DNODE_OBJECT && 
!BP_IS_HOLE(bp)) return (0); - } else if (type == DMU_OT_DNODE) { - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(BP_IS_ENCRYPTED(bp)); - ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); - zioflags |= ZIO_FLAG_RAW; - } + uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); + uint64_t start; - ASSERT0(zb->zb_level); - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - dnode_phys_t *blk = abuf->b_data; - uint64_t dnobj = zb->zb_blkid * epb; - - /* - * Raw sends require sending encryption parameters for the - * block of dnodes. Regular sends do not need to send this - * info. - */ - if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(arc_is_encrypted(abuf)); - err = dump_object_range(dsa, bp, dnobj, epb); - } - - if (err == 0) { - for (int i = 0; i < epb; - i += blk[i].dn_extra_slots + 1) { - err = dump_dnode(dsa, bp, dnobj + i, blk + i); - if (err != 0) - break; - } - } - arc_buf_destroy(abuf, &abuf); - } else if (type == DMU_OT_SA) { - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - - if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(BP_IS_PROTECTED(bp)); - zioflags |= ZIO_FLAG_RAW; - } - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data); - arc_buf_destroy(abuf, &abuf); - } else if (backup_do_embed(dsa, bp)) { - /* it's an embedded level-0 block of a regular object */ - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; - ASSERT0(zb->zb_level); - err = dump_write_embedded(dsa, zb->zb_object, - zb->zb_blkid * blksz, blksz, bp); - } else { - /* it's a level-0 block of a regular object 
*/ - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; - uint64_t offset; - - /* - * If we have large blocks stored on disk but the send flags - * don't allow us to send large blocks, we split the data from - * the arc buf into chunks. - */ - boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && - !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); - - /* - * Raw sends require that we always get raw data as it exists - * on disk, so we assert that we are not splitting blocks here. - */ - boolean_t request_raw = - (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0; - - /* - * We should only request compressed data from the ARC if all - * the following are true: - * - stream compression was requested - * - we aren't splitting large blocks into smaller chunks - * - the data won't need to be byteswapped before sending - * - this isn't an embedded block - * - this isn't metadata (if receiving on a different endian - * system it can be byteswapped more easily) - */ - boolean_t request_compressed = - (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && - !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && - !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); - - IMPLY(request_raw, !split_large_blocks); - IMPLY(request_raw, BP_IS_PROTECTED(bp)); - ASSERT0(zb->zb_level); - ASSERT(zb->zb_object > dsa->dsa_resume_object || - (zb->zb_object == dsa->dsa_resume_object && - zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); - - ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); - - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - if (request_raw) - zioflags |= ZIO_FLAG_RAW; - else if (request_compressed) - zioflags |= ZIO_FLAG_RAW_COMPRESS; - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { - if (zfs_send_corrupt_data) { - /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, - blksz); - uint64_t *ptr; - for 
(ptr = abuf->b_data; - (char *)ptr < (char *)abuf->b_data + blksz; - ptr++) - *ptr = 0x2f5baddb10cULL; - } else { - return (SET_ERROR(EIO)); - } - } - - offset = zb->zb_blkid * blksz; - - if (split_large_blocks) { - ASSERT0(arc_is_encrypted(abuf)); - ASSERT3U(arc_get_compression(abuf), ==, - ZIO_COMPRESS_OFF); - char *buf = abuf->b_data; - while (blksz > 0 && err == 0) { - int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); - err = dump_write(dsa, type, zb->zb_object, - offset, n, n, NULL, buf); - offset += n; - buf += n; - blksz -= n; - } - } else { - err = dump_write(dsa, type, zb->zb_object, offset, - blksz, arc_buf_size(abuf), bp, abuf->b_data); - } - arc_buf_destroy(abuf, &abuf); + /* + * If this multiply overflows, we don't need to send this block. + * Even if it has a birth time, it can never not be a hole, so + * we don't need to send records for it. + */ + if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid == + DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) && + span * zb->zb_blkid > dnp->dn_maxblkid)) { + ASSERT(BP_IS_HOLE(bp)); + return (0); } - ASSERT(err == 0 || err == EINTR); - return (err); + if (zb->zb_blkid == DMU_SPILL_BLKID) + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); + + enum type record_type = DATA; + if (BP_IS_HOLE(bp)) + record_type = HOLE; + else if (BP_IS_REDACTED(bp)) + record_type = REDACT; + else + record_type = DATA; + + record = range_alloc(record_type, zb->zb_object, start, + (start + span < start ? 0 : start + span), B_FALSE); + + uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ? 
+ BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + + if (BP_IS_HOLE(bp)) { + record->sru.hole.datablksz = datablksz; + } else if (BP_IS_REDACTED(bp)) { + record->sru.redact.datablksz = datablksz; + } else { + record->sru.data.datablksz = datablksz; + record->sru.data.obj_type = dnp->dn_type; + record->sru.data.bp = *bp; + } + + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); +} + +struct redact_list_cb_arg { + uint64_t *num_blocks_visited; + bqueue_t *q; + boolean_t *cancel; + boolean_t mark_redact; +}; + +static int +redact_list_cb(redact_block_phys_t *rb, void *arg) +{ + struct redact_list_cb_arg *rlcap = arg; + + atomic_inc_64(rlcap->num_blocks_visited); + if (*rlcap->cancel) + return (-1); + + struct send_range *data = range_alloc(REDACT, rb->rbp_object, + rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE); + ASSERT3U(data->end_blkid, >, rb->rbp_blkid); + if (rlcap->mark_redact) { + data->type = REDACT; + data->sru.redact.datablksz = redact_block_get_size(rb); + } else { + data->type = PREVIOUSLY_REDACTED; + } + bqueue_enqueue(rlcap->q, data, sizeof (*data)); + + return (0); } /* - * Pop the new data off the queue, and free the old data. + * This function kicks off the traverse_dataset. It also handles setting the + * error code of the thread in case something goes wrong, and pushes the End of + * Stream record when the traverse_dataset call has finished. 
*/ -static struct send_block_record * -get_next_record(bqueue_t *bq, struct send_block_record *data) +static void +send_traverse_thread(void *arg) { - struct send_block_record *tmp = bqueue_dequeue(bq); - kmem_free(data, sizeof (*data)); - return (tmp); + struct send_thread_arg *st_arg = arg; + int err = 0; + struct send_range *data; + fstrans_cookie_t cookie = spl_fstrans_mark(); + + err = traverse_dataset_resume(st_arg->os->os_dsl_dataset, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + + if (err != EINTR) + st_arg->error_code = err; + data = range_alloc(DATA, 0, 0, 0, B_TRUE); + bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data)); + spl_fstrans_unmark(cookie); + thread_exit(); +} + +/* + * Utility function that causes End of Stream records to compare after of all + * others, so that other threads' comparison logic can stay simple. + */ +static int __attribute__((unused)) +send_range_after(const struct send_range *from, const struct send_range *to) +{ + if (from->eos_marker == B_TRUE) + return (1); + if (to->eos_marker == B_TRUE) + return (-1); + + uint64_t from_obj = from->object; + uint64_t from_end_obj = from->object + 1; + uint64_t to_obj = to->object; + uint64_t to_end_obj = to->object + 1; + if (from_obj == 0) { + ASSERT(from->type == HOLE || from->type == OBJECT_RANGE); + from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT; + from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT; + } + if (to_obj == 0) { + ASSERT(to->type == HOLE || to->type == OBJECT_RANGE); + to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT; + to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT; + } + + if (from_end_obj <= to_obj) + return (-1); + if (from_obj >= to_end_obj) + return (1); + int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type == + OBJECT_RANGE); + if (unlikely(cmp)) + return (cmp); + cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT); + if (unlikely(cmp)) + return (cmp); + if (from->end_blkid <= to->start_blkid) + return 
(-1); + if (from->start_blkid >= to->end_blkid) + return (1); + return (0); +} + +/* + * Pop the new data off the queue, check that the records we receive are in + * the right order, but do not free the old data. This is used so that the + * records can be sent on to the main thread without copying the data. + */ +static struct send_range * +get_next_range_nofree(bqueue_t *bq, struct send_range *prev) +{ + struct send_range *next = bqueue_dequeue(bq); + ASSERT3S(send_range_after(prev, next), ==, -1); + return (next); +} + +/* + * Pop the new data off the queue, check that the records we receive are in + * the right order, and free the old data. + */ +static struct send_range * +get_next_range(bqueue_t *bq, struct send_range *prev) +{ + struct send_range *next = get_next_range_nofree(bq, prev); + range_free(prev); + return (next); +} + +static void +redact_list_thread(void *arg) +{ + struct redact_list_thread_arg *rlt_arg = arg; + struct send_range *record; + fstrans_cookie_t cookie = spl_fstrans_mark(); + if (rlt_arg->rl != NULL) { + struct redact_list_cb_arg rlcba = {0}; + rlcba.cancel = &rlt_arg->cancel; + rlcba.q = &rlt_arg->q; + rlcba.num_blocks_visited = rlt_arg->num_blocks_visited; + rlcba.mark_redact = rlt_arg->mark_redact; + int err = dsl_redaction_list_traverse(rlt_arg->rl, + &rlt_arg->resume, redact_list_cb, &rlcba); + if (err != EINTR) + rlt_arg->error_code = err; + } + record = range_alloc(DATA, 0, 0, 0, B_TRUE); + bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record)); + spl_fstrans_unmark(cookie); + + thread_exit(); +} + +/* + * Compare the start point of the two provided ranges. End of stream ranges + * compare last, objects compare before any data or hole inside that object and + * multi-object holes that start at the same object. 
+ */ +static int +send_range_start_compare(struct send_range *r1, struct send_range *r2) +{ + uint64_t r1_objequiv = r1->object; + uint64_t r1_l0equiv = r1->start_blkid; + uint64_t r2_objequiv = r2->object; + uint64_t r2_l0equiv = r2->start_blkid; + int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker); + if (unlikely(cmp)) + return (cmp); + if (r1->object == 0) { + r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK; + r1_l0equiv = 0; + } + if (r2->object == 0) { + r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK; + r2_l0equiv = 0; + } + + cmp = TREE_CMP(r1_objequiv, r2_objequiv); + if (likely(cmp)) + return (cmp); + cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE); + if (unlikely(cmp)) + return (cmp); + cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT); + if (unlikely(cmp)) + return (cmp); + + return (TREE_CMP(r1_l0equiv, r2_l0equiv)); +} + +enum q_idx { + REDACT_IDX = 0, + TO_IDX, + FROM_IDX, + NUM_THREADS +}; + +/* + * This function returns the next range the send_merge_thread should operate on. + * The inputs are two arrays; the first one stores the range at the front of the + * queues stored in the second one. The ranges are sorted in descending + * priority order; the metadata from earlier ranges overrules metadata from + * later ranges. out_mask is used to return which threads the ranges came from; + * bit i is set if ranges[i] started at the same place as the returned range. + * + * This code is not hardcoded to compare a specific number of threads; it could + * be used with any number, just by changing the q_idx enum. + * + * The "next range" is the one with the earliest start; if two starts are equal, + * the highest-priority range is the next to operate on. If a higher-priority + * range starts in the middle of the first range, then the first range will be + * truncated to end where the higher-priority range starts, and we will operate + * on that one next time. 
In this way, we make sure that each block covered by + * some range gets covered by a returned range, and each block covered is + * returned using the metadata of the highest-priority range it appears in. + * + * For example, if the three ranges at the front of the queues were [2,4), + * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata + * from the third range, [2,4) with the metadata from the first range, and then + * [4,5) with the metadata from the second. + */ +static struct send_range * +find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask) +{ + int idx = 0; // index of the range with the earliest start + int i; + uint64_t bmask = 0; + for (i = 1; i < NUM_THREADS; i++) { + if (send_range_start_compare(ranges[i], ranges[idx]) < 0) + idx = i; + } + if (ranges[idx]->eos_marker) { + struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE); + *out_mask = 0; + return (ret); + } + /* + * Find all the ranges that start at that same point. + */ + for (i = 0; i < NUM_THREADS; i++) { + if (send_range_start_compare(ranges[i], ranges[idx]) == 0) + bmask |= 1 << i; + } + *out_mask = bmask; + /* + * OBJECT_RANGE records only come from the TO thread, and should always + * be treated as overlapping with nothing and sent on immediately. They + * are only used in raw sends, and are never redacted. + */ + if (ranges[idx]->type == OBJECT_RANGE) { + ASSERT3U(idx, ==, TO_IDX); + ASSERT3U(*out_mask, ==, 1 << TO_IDX); + struct send_range *ret = ranges[idx]; + ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); + return (ret); + } + /* + * Find the first start or end point after the start of the first range. 
+ */ + uint64_t first_change = ranges[idx]->end_blkid; + for (i = 0; i < NUM_THREADS; i++) { + if (i == idx || ranges[i]->eos_marker || + ranges[i]->object > ranges[idx]->object || + ranges[i]->object == DMU_META_DNODE_OBJECT) + continue; + ASSERT3U(ranges[i]->object, ==, ranges[idx]->object); + if (first_change > ranges[i]->start_blkid && + (bmask & (1 << i)) == 0) + first_change = ranges[i]->start_blkid; + else if (first_change > ranges[i]->end_blkid) + first_change = ranges[i]->end_blkid; + } + /* + * Update all ranges to no longer overlap with the range we're + * returning. All such ranges must start at the same place as the range + * being returned, and end at or after first_change. Thus we update + * their start to first_change. If that makes them size 0, then free + * them and pull a new range from that thread. + */ + for (i = 0; i < NUM_THREADS; i++) { + if (i == idx || (bmask & (1 << i)) == 0) + continue; + ASSERT3U(first_change, >, ranges[i]->start_blkid); + ranges[i]->start_blkid = first_change; + ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid); + if (ranges[i]->start_blkid == ranges[i]->end_blkid) + ranges[i] = get_next_range(qs[i], ranges[i]); + } + /* + * Short-circuit the simple case; if the range doesn't overlap with + * anything else, or it only overlaps with things that start at the same + * place and are longer, send it on. + */ + if (first_change == ranges[idx]->end_blkid) { + struct send_range *ret = ranges[idx]; + ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); + return (ret); + } + + /* + * Otherwise, return a truncated copy of ranges[idx] and move the start + * of ranges[idx] back to first_change. 
+ */ + struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP); + *ret = *ranges[idx]; + ret->end_blkid = first_change; + ranges[idx]->start_blkid = first_change; + return (ret); +} + +#define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX)) + +/* + * Merge the results from the from thread and the to thread, and then hand the + * records off to send_prefetch_thread to prefetch them. If this is not a + * send from a redaction bookmark, the from thread will push an end of stream + * record and stop, and we'll just send everything that was changed in the + * to_ds since the ancestor's creation txg. If it is, then since + * traverse_dataset has a canonical order, we can compare each change as + * they're pulled off the queues. That will give us a stream that is + * appropriately sorted, and covers all records. In addition, we pull the + * data from the redact_list_thread and use that to determine which blocks + * should be redacted. + */ +static void +send_merge_thread(void *arg) +{ + struct send_merge_thread_arg *smt_arg = arg; + struct send_range *front_ranges[NUM_THREADS]; + bqueue_t *queues[NUM_THREADS]; + int err = 0; + fstrans_cookie_t cookie = spl_fstrans_mark(); + + if (smt_arg->redact_arg == NULL) { + front_ranges[REDACT_IDX] = + kmem_zalloc(sizeof (struct send_range), KM_SLEEP); + front_ranges[REDACT_IDX]->eos_marker = B_TRUE; + front_ranges[REDACT_IDX]->type = REDACT; + queues[REDACT_IDX] = NULL; + } else { + front_ranges[REDACT_IDX] = + bqueue_dequeue(&smt_arg->redact_arg->q); + queues[REDACT_IDX] = &smt_arg->redact_arg->q; + } + front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q); + queues[TO_IDX] = &smt_arg->to_arg->q; + front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q); + queues[FROM_IDX] = &smt_arg->from_arg->q; + uint64_t mask = 0; + struct send_range *range; + for (range = find_next_range(front_ranges, queues, &mask); + !range->eos_marker && err == 0 && !smt_arg->cancel; + range = find_next_range(front_ranges, 
queues, &mask)) { + /* + * If the range in question was in both the from redact bookmark + * and the bookmark we're using to redact, then don't send it. + * It's already redacted on the receiving system, so a redaction + * record would be redundant. + */ + if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) { + ASSERT3U(range->type, ==, REDACT); + range_free(range); + continue; + } + bqueue_enqueue(&smt_arg->q, range, sizeof (*range)); + + if (smt_arg->to_arg->error_code != 0) { + err = smt_arg->to_arg->error_code; + } else if (smt_arg->from_arg->error_code != 0) { + err = smt_arg->from_arg->error_code; + } else if (smt_arg->redact_arg != NULL && + smt_arg->redact_arg->error_code != 0) { + err = smt_arg->redact_arg->error_code; + } + } + if (smt_arg->cancel && err == 0) + err = SET_ERROR(EINTR); + smt_arg->error = err; + if (smt_arg->error != 0) { + smt_arg->to_arg->cancel = B_TRUE; + smt_arg->from_arg->cancel = B_TRUE; + if (smt_arg->redact_arg != NULL) + smt_arg->redact_arg->cancel = B_TRUE; + } + for (int i = 0; i < NUM_THREADS; i++) { + while (!front_ranges[i]->eos_marker) { + front_ranges[i] = get_next_range(queues[i], + front_ranges[i]); + } + range_free(front_ranges[i]); + } + if (range == NULL) + range = kmem_zalloc(sizeof (*range), KM_SLEEP); + range->eos_marker = B_TRUE; + bqueue_enqueue_flush(&smt_arg->q, range, 1); + spl_fstrans_unmark(cookie); + thread_exit(); +} + +struct send_reader_thread_arg { + struct send_merge_thread_arg *smta; + bqueue_t q; + boolean_t cancel; + boolean_t issue_reads; + uint64_t featureflags; + int error; +}; + +static void +dmu_send_read_done(zio_t *zio) +{ + struct send_range *range = zio->io_private; + + mutex_enter(&range->sru.data.lock); + if (zio->io_error != 0) { + abd_free(range->sru.data.abd); + range->sru.data.abd = NULL; + range->sru.data.io_err = zio->io_error; + } + + ASSERT(range->sru.data.io_outstanding); + range->sru.data.io_outstanding = B_FALSE; + cv_broadcast(&range->sru.data.cv); + 
mutex_exit(&range->sru.data.lock); +} + +static void +issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) +{ + struct srd *srdp = &range->sru.data; + blkptr_t *bp = &srdp->bp; + objset_t *os = srta->smta->os; + + ASSERT3U(range->type, ==, DATA); + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + /* + * If we have large blocks stored on disk but + * the send flags don't allow us to send large + * blocks, we split the data from the arc buf + * into chunks. + */ + boolean_t split_large_blocks = + srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && + !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); + /* + * We should only request compressed data from the ARC if all + * the following are true: + * - stream compression was requested + * - we aren't splitting large blocks into smaller chunks + * - the data won't need to be byteswapped before sending + * - this isn't an embedded block + * - this isn't metadata (if receiving on a different endian + * system it can be byteswapped more easily) + */ + boolean_t request_compressed = + (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && + !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && + !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); + + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) + zioflags |= ZIO_FLAG_RAW; + else if (request_compressed) + zioflags |= ZIO_FLAG_RAW_COMPRESS; + + srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ? 
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp); + + if (!srta->issue_reads) + return; + if (BP_IS_REDACTED(bp)) + return; + if (send_do_embed(bp, srta->featureflags)) + return; + + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(os), + .zb_object = range->object, + .zb_level = 0, + .zb_blkid = range->start_blkid, + }; + + arc_flags_t aflags = ARC_FLAG_CACHED_ONLY; + + int arc_err = arc_read(NULL, os->os_spa, bp, + arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ, + zioflags, &aflags, &zb); + /* + * If the data is not already cached in the ARC, we read directly + * from zio. This avoids the performance overhead of adding a new + * entry to the ARC, and we also avoid polluting the ARC cache with + * data that is not likely to be used in the future. + */ + if (arc_err != 0) { + srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE); + srdp->io_outstanding = B_TRUE; + zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd, + srdp->datasz, dmu_send_read_done, range, + ZIO_PRIORITY_ASYNC_READ, zioflags, &zb)); + } +} + +/* + * Create a new record with the given values. + */ +static void +enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, + uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz) +{ + enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE : + (BP_IS_REDACTED(bp) ? 
REDACT : DATA)); + + struct send_range *range = range_alloc(range_type, dn->dn_object, + blkid, blkid + count, B_FALSE); + + if (blkid == DMU_SPILL_BLKID) + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); + + switch (range_type) { + case HOLE: + range->sru.hole.datablksz = datablksz; + break; + case DATA: + ASSERT3U(count, ==, 1); + range->sru.data.datablksz = datablksz; + range->sru.data.obj_type = dn->dn_type; + range->sru.data.bp = *bp; + issue_data_read(srta, range); + break; + case REDACT: + range->sru.redact.datablksz = datablksz; + break; + default: + break; + } + bqueue_enqueue(q, range, datablksz); +} + +/* + * This thread is responsible for two things: First, it retrieves the correct + * blkptr in the to ds if we need to send the data because of something from + * the from thread. As a result of this, we're the first ones to discover that + * some indirect blocks can be discarded because they're not holes. Second, + * it issues prefetches for the data we need to send. + */ +static void +send_reader_thread(void *arg) +{ + struct send_reader_thread_arg *srta = arg; + struct send_merge_thread_arg *smta = srta->smta; + bqueue_t *inq = &smta->q; + bqueue_t *outq = &srta->q; + objset_t *os = smta->os; + fstrans_cookie_t cookie = spl_fstrans_mark(); + struct send_range *range = bqueue_dequeue(inq); + int err = 0; + + /* + * If the record we're analyzing is from a redaction bookmark from the + * fromds, then we need to know whether or not it exists in the tods so + * we know whether to create records for it or not. If it does, we need + * the datablksz so we can generate an appropriate record for it. + * Finally, if it isn't redacted, we need the blkptr so that we can send + * a WRITE record containing the actual data. 
+ */ + uint64_t last_obj = UINT64_MAX; + uint64_t last_obj_exists = B_TRUE; + while (!range->eos_marker && !srta->cancel && smta->error == 0 && + err == 0) { + switch (range->type) { + case DATA: + issue_data_read(srta, range); + bqueue_enqueue(outq, range, range->sru.data.datablksz); + range = get_next_range_nofree(inq, range); + break; + case HOLE: + case OBJECT: + case OBJECT_RANGE: + case REDACT: // Redacted blocks must exist + bqueue_enqueue(outq, range, sizeof (*range)); + range = get_next_range_nofree(inq, range); + break; + case PREVIOUSLY_REDACTED: { + /* + * This entry came from the "from bookmark" when + * sending from a bookmark that has a redaction + * list. We need to check if this object/blkid + * exists in the target ("to") dataset, and if + * not then we drop this entry. We also need + * to fill in the block pointer so that we know + * what to prefetch. + * + * To accomplish the above, we first cache whether or + * not the last object we examined exists. If it + * doesn't, we can drop this record. If it does, we hold + * the dnode and use it to call dbuf_dnode_findbp. We do + * this instead of dbuf_bookmark_findbp because we will + * often operate on large ranges, and holding the dnode + * once is more efficient. + */ + boolean_t object_exists = B_TRUE; + /* + * If the data is redacted, we only care if it exists, + * so that we don't send records for objects that have + * been deleted. + */ + dnode_t *dn; + if (range->object == last_obj && !last_obj_exists) { + /* + * If we're still examining the same object as + * previously, and it doesn't exist, we don't + * need to call dbuf_bookmark_findbp. 
+ */ + object_exists = B_FALSE; + } else { + err = dnode_hold(os, range->object, FTAG, &dn); + if (err == ENOENT) { + object_exists = B_FALSE; + err = 0; + } + last_obj = range->object; + last_obj_exists = object_exists; + } + + if (err != 0) { + break; + } else if (!object_exists) { + /* + * The block was modified, but doesn't + * exist in the to dataset; if it was + * deleted in the to dataset, then we'll + * visit the hole bp for it at some point. + */ + range = get_next_range(inq, range); + continue; + } + uint64_t file_max = + (dn->dn_maxblkid < range->end_blkid ? + dn->dn_maxblkid : range->end_blkid); + /* + * The object exists, so we need to try to find the + * blkptr for each block in the range we're processing. + */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + for (uint64_t blkid = range->start_blkid; + blkid < file_max; blkid++) { + blkptr_t bp; + uint32_t datablksz = + dn->dn_phys->dn_datablkszsec << + SPA_MINBLOCKSHIFT; + uint64_t offset = blkid * datablksz; + /* + * This call finds the next non-hole block in + * the object. This is to prevent a + * performance problem where we're unredacting + * a large hole. Using dnode_next_offset to + * skip over the large hole avoids iterating + * over every block in it. 
+ */ + err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, + &offset, 1, 1, 0); + if (err == ESRCH) { + offset = UINT64_MAX; + err = 0; + } else if (err != 0) { + break; + } + if (offset != blkid * datablksz) { + /* + * if there is a hole from here + * (blkid) to offset + */ + offset = MIN(offset, file_max * + datablksz); + uint64_t nblks = (offset / datablksz) - + blkid; + enqueue_range(srta, outq, dn, blkid, + nblks, NULL, datablksz); + blkid += nblks; + } + if (blkid >= file_max) + break; + err = dbuf_dnode_findbp(dn, 0, blkid, &bp, + NULL, NULL); + if (err != 0) + break; + ASSERT(!BP_IS_HOLE(&bp)); + enqueue_range(srta, outq, dn, blkid, 1, &bp, + datablksz); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + range = get_next_range(inq, range); + } + } + } + if (srta->cancel || err != 0) { + smta->cancel = B_TRUE; + srta->error = err; + } else if (smta->error != 0) { + srta->error = smta->error; + } + while (!range->eos_marker) + range = get_next_range(inq, range); + + bqueue_enqueue_flush(outq, range, 1); + spl_fstrans_unmark(cookie); + thread_exit(); +} + +#define NUM_SNAPS_NOT_REDACTED UINT64_MAX + +struct dmu_send_params { + /* Pool args */ + void *tag; // Tag that dp was held with, will be used to release dp. 
+ dsl_pool_t *dp; + /* To snapshot args */ + const char *tosnap; + dsl_dataset_t *to_ds; + /* From snapshot args */ + zfs_bookmark_phys_t ancestor_zb; + uint64_t *fromredactsnaps; + /* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */ + uint64_t numfromredactsnaps; + /* Stream params */ + boolean_t is_clone; + boolean_t embedok; + boolean_t large_block_ok; + boolean_t compressok; + boolean_t rawok; + boolean_t savedok; + uint64_t resumeobj; + uint64_t resumeoff; + uint64_t saved_guid; + zfs_bookmark_phys_t *redactbook; + /* Stream output params */ + dmu_send_outparams_t *dso; + + /* Stream progress params */ + offset_t *off; + int outfd; + char saved_toname[MAXNAMELEN]; +}; + +static int +setup_featureflags(struct dmu_send_params *dspp, objset_t *os, + uint64_t *featureflags) +{ + dsl_dataset_t *to_ds = dspp->to_ds; + dsl_pool_t *dp = dspp->dp; +#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { + uint64_t version; + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) + return (SET_ERROR(EINVAL)); + + if (version >= ZPL_VERSION_SA) + *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } +#endif + + /* raw sends imply large_block_ok */ + if ((dspp->rawok || dspp->large_block_ok) && + dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) { + *featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; + } + + /* encrypted datasets will not have embedded blocks */ + if ((dspp->embedok || dspp->rawok) && !os->os_encrypted && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { + *featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; + } + + /* raw send implies compressok */ + if (dspp->compressok || dspp->rawok) + *featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; + + if (dspp->rawok && os->os_encrypted) + *featureflags |= DMU_BACKUP_FEATURE_RAW; + + if ((*featureflags & + (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | + DMU_BACKUP_FEATURE_RAW)) != 0 && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { + 
*featureflags |= DMU_BACKUP_FEATURE_LZ4; + } + + /* + * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to + * allow sending ZSTD compressed datasets to a receiver that does not + * support ZSTD + */ + if ((*featureflags & + (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 && + dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) { + *featureflags |= DMU_BACKUP_FEATURE_ZSTD; + } + + if (dspp->resumeobj != 0 || dspp->resumeoff != 0) { + *featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + + if (dspp->redactbook != NULL) { + *featureflags |= DMU_BACKUP_FEATURE_REDACTED; + } + + if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) { + *featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; + } + return (0); +} + +static dmu_replay_record_t * +create_begin_record(struct dmu_send_params *dspp, objset_t *os, + uint64_t featureflags) +{ + dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t), + KM_SLEEP); + drr->drr_type = DRR_BEGIN; + + struct drr_begin *drrb = &drr->drr_u.drr_begin; + dsl_dataset_t *to_ds = dspp->to_ds; + + drrb->drr_magic = DMU_BACKUP_MAGIC; + drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time; + drrb->drr_type = dmu_objset_type(os); + drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid; + + DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM); + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags); + + if (dspp->is_clone) + drrb->drr_flags |= DRR_FLAG_CLONE; + if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET) + drrb->drr_flags |= DRR_FLAG_CI_DATA; + if (zfs_send_set_freerecords_bit) + drrb->drr_flags |= DRR_FLAG_FREERECORDS; + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK; + + if (dspp->savedok) { + drrb->drr_toguid = dspp->saved_guid; + strlcpy(drrb->drr_toname, dspp->saved_toname, + sizeof (drrb->drr_toname)); + } else { + dsl_dataset_name(to_ds, drrb->drr_toname); + if 
(!to_ds->ds_is_snapshot) { + (void) strlcat(drrb->drr_toname, "@--head--", + sizeof (drrb->drr_toname)); + } + } + return (drr); +} + +static void +setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os, + dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok) +{ + VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + to_arg->error_code = 0; + to_arg->cancel = B_FALSE; + to_arg->os = to_os; + to_arg->fromtxg = fromtxg; + to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA; + if (rawok) + to_arg->flags |= TRAVERSE_NO_DECRYPT; + if (zfs_send_corrupt_data) + to_arg->flags |= TRAVERSE_HARD; + to_arg->num_blocks_visited = &dssp->dss_blocks; + (void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static void +setup_from_thread(struct redact_list_thread_arg *from_arg, + redaction_list_t *from_rl, dmu_sendstatus_t *dssp) +{ + VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + from_arg->error_code = 0; + from_arg->cancel = B_FALSE; + from_arg->rl = from_rl; + from_arg->mark_redact = B_FALSE; + from_arg->num_blocks_visited = &dssp->dss_blocks; + /* + * If from_ds is null, send_traverse_thread just returns success and + * enqueues an eos marker. 
+ */ + (void) thread_create(NULL, 0, redact_list_thread, from_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static void +setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg, + struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp) +{ + if (dspp->redactbook == NULL) + return; + + rlt_arg->cancel = B_FALSE; + VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + rlt_arg->error_code = 0; + rlt_arg->mark_redact = B_TRUE; + rlt_arg->rl = rl; + rlt_arg->num_blocks_visited = &dssp->dss_blocks; + + (void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static void +setup_merge_thread(struct send_merge_thread_arg *smt_arg, + struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg, + struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg, + objset_t *os) +{ + VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + smt_arg->cancel = B_FALSE; + smt_arg->error = 0; + smt_arg->from_arg = from_arg; + smt_arg->to_arg = to_arg; + if (dspp->redactbook != NULL) + smt_arg->redact_arg = rlt_arg; + + smt_arg->os = os; + (void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc, + TS_RUN, minclsyspri); +} + +static void +setup_reader_thread(struct send_reader_thread_arg *srt_arg, + struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg, + uint64_t featureflags) +{ + VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff, + MAX(zfs_send_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + srt_arg->smta = smt_arg; + srt_arg->issue_reads = !dspp->dso->dso_dryrun; + srt_arg->featureflags = featureflags; + (void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static int 
+setup_resume_points(struct dmu_send_params *dspp, + struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg, + struct redact_list_thread_arg *rlt_arg, + struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os, + redaction_list_t *redact_rl, nvlist_t *nvl) +{ + dsl_dataset_t *to_ds = dspp->to_ds; + int err = 0; + + uint64_t obj = 0; + uint64_t blkid = 0; + if (resuming) { + obj = dspp->resumeobj; + dmu_object_info_t to_doi; + err = dmu_object_info(os, obj, &to_doi); + if (err != 0) + return (err); + + blkid = dspp->resumeoff / to_doi.doi_data_block_size; + } + /* + * If we're resuming a redacted send, we can skip to the appropriate + * point in the redaction bookmark by binary searching through it. + */ + if (redact_rl != NULL) { + SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid); + } + + SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid); + if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) { + uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj; + /* + * Note: If the resume point is in an object whose + * blocksize is different in the from vs to snapshots, + * we will have divided by the "wrong" blocksize. + * However, in this case fromsnap's send_cb() will + * detect that the blocksize has changed and therefore + * ignore this object. + * + * If we're resuming a send from a redaction bookmark, + * we still cannot accidentally suggest blocks behind + * the to_ds. In addition, we know that any blocks in + * the object in the to_ds will have to be sent, since + * the size changed. Therefore, we can't cause any harm + * this way either. 
+ */ + SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid); + } + if (resuming) { + fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj); + fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff); + } + return (0); +} + +static dmu_sendstatus_t * +setup_send_progress(struct dmu_send_params *dspp) +{ + dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP); + dssp->dss_outfd = dspp->outfd; + dssp->dss_off = dspp->off; + dssp->dss_proc = curproc; + mutex_enter(&dspp->to_ds->ds_sendstream_lock); + list_insert_head(&dspp->to_ds->ds_sendstreams, dssp); + mutex_exit(&dspp->to_ds->ds_sendstream_lock); + return (dssp); } /* * Actually do the bulk of the work in a zfs send. * + * The idea is that we want to do a send from ancestor_zb to to_ds. We also + * want to not send any data that has been modified by all the datasets in + * redactsnaparr, and store the list of blocks that are redacted in this way in + * a bookmark named redactbook, created on the to_ds. We do this by creating + * several worker threads, whose function is described below. + * + * There are three cases. + * The first case is a redacted zfs send. In this case there are 5 threads. + * The first thread is the to_ds traversal thread: it calls dataset_traverse on + * the to_ds and finds all the blocks that have changed since ancestor_zb (if + * it's a full send, that's all blocks in the dataset). It then sends those + * blocks on to the send merge thread. The redact list thread takes the data + * from the redaction bookmark and sends those blocks on to the send merge + * thread. The send merge thread takes the data from the to_ds traversal + * thread, and combines it with the redaction records from the redact list + * thread. If a block appears in both the to_ds's data and the redaction data, + * the send merge thread will mark it as redacted and send it on to the prefetch + * thread. Otherwise, the send merge thread will send the block on to the + * prefetch thread unchanged. 
The prefetch thread will issue prefetch reads for + * any data that isn't redacted, and then send the data on to the main thread. + * The main thread behaves the same as in a normal send case, issuing demand + * reads for data blocks and sending out records over the network. + * + * The graphic below diagrams the flow of data in the case of a redacted zfs + * send. Each box represents a thread, and each line represents the flow of + * data. + * + * Records from the | + * redaction bookmark | + * +--------------------+ | +---------------------------+ + * | | v | Send Merge Thread | + * | Redact List Thread +----------> Apply redaction marks to | + * | | | records as specified by | + * +--------------------+ | redaction ranges | + * +----^---------------+------+ + * | | Merged data + * | | + * | +------------v--------+ + * | | Prefetch Thread | + * +--------------------+ | | Issues prefetch | + * | to_ds Traversal | | | reads of data blocks| + * | Thread (finds +---------------+ +------------+--------+ + * | candidate blocks) | Blocks modified | Prefetched data + * +--------------------+ by to_ds since | + * ancestor_zb +------------v----+ + * | Main Thread | File Descriptor + * | Sends data over +->(to zfs receive) + * | wire | + * +-----------------+ + * + * The second case is an incremental send from a redaction bookmark. The to_ds + * traversal thread and the main thread behave the same as in the redacted + * send case. The new thread is the from bookmark traversal thread. It + * iterates over the redaction list in the redaction bookmark, and enqueues + * records for each block that was redacted in the original send. The send + * merge thread now has to merge the data from the two threads. For details + * about that process, see the header comment of send_merge_thread(). Any data + * it decides to send on will be prefetched by the prefetch thread. 
Note that + * you can perform a redacted send from a redaction bookmark; in that case, + * the data flow behaves very similarly to the flow in the redacted send case, + * except with the addition of the bookmark traversal thread iterating over the + * redaction bookmark. The send_merge_thread also has to take on the + * responsibility of merging the redact list thread's records, the bookmark + * traversal thread's records, and the to_ds records. + * + * +---------------------+ + * | | + * | Redact List Thread +--------------+ + * | | | + * +---------------------+ | + * Blocks in redaction list | Ranges modified by every secure snap + * of from bookmark | (or EOS if not redacted) + * | + * +---------------------+ | +----v----------------------+ + * | bookmark Traversal | v | Send Merge Thread | + * | Thread (finds +---------> Merges bookmark, rlt, and | + * | candidate blocks) | | to_ds send records | + * +---------------------+ +----^---------------+------+ + * | | Merged data + * | +------------v--------+ + * | | Prefetch Thread | + * +--------------------+ | | Issues prefetch | + * | to_ds Traversal | | | reads of data blocks| + * | Thread (finds +---------------+ +------------+--------+ + * | candidate blocks) | Blocks modified | Prefetched data + * +--------------------+ by to_ds since +------------v----+ + * ancestor_zb | Main Thread | File Descriptor + * | Sends data over +->(to zfs receive) + * | wire | + * +-----------------+ + * + * The final case is a simple zfs full or incremental send. The to_ds traversal + * thread behaves the same as always. The redact list thread is never started. + * The send merge thread takes all the blocks that the to_ds traversal thread + * sends it, prefetches the data, and sends the blocks on to the main thread. + * The main thread sends the data over the wire. + * + * To keep performance acceptable, we want to prefetch the data in the worker + * threads. 
While the to_ds thread could simply use the TRAVERSE_PREFETCH + * feature built into traverse_dataset, the combining and deletion of records + * due to redaction and sends from redaction bookmarks mean that we could + * issue many unnecessary prefetches. As a result, we only prefetch data + * after we've determined that the record is not going to be redacted. To + * prevent the prefetching from getting too far ahead of the main thread, the + * blocking queues that are used for communication are capped not by the + * number of entries in the queue, but by the sum of the size of the + * prefetches associated with them. The limit on the amount of data that the + * thread can prefetch beyond what the main thread has reached is controlled + * by the global variable zfs_send_queue_length. In addition, to prevent poor + * performance in the beginning of a send, we also limit the distance ahead + * that the traversal threads can be. That distance is controlled by the + * zfs_send_no_prefetch_queue_length tunable. + * * Note: Releases dp using the specified tag. 
*/ static int -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff, - vnode_t *vp, offset_t *off) +dmu_send_impl(struct dmu_send_params *dspp) { objset_t *os; dmu_replay_record_t *drr; - dmu_sendarg_t *dsp; + dmu_sendstatus_t *dssp; + dmu_send_cookie_t dsc = {0}; int err; - uint64_t fromtxg = 0; + uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg; uint64_t featureflags = 0; - struct send_thread_arg to_arg; - void *payload = NULL; - size_t payload_len = 0; - struct send_block_record *to_data; + struct redact_list_thread_arg *from_arg; + struct send_thread_arg *to_arg; + struct redact_list_thread_arg *rlt_arg; + struct send_merge_thread_arg *smt_arg; + struct send_reader_thread_arg *srt_arg; + struct send_range *range; + redaction_list_t *from_rl = NULL; + redaction_list_t *redact_rl = NULL; + boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0); + boolean_t book_resuming = resuming; + + dsl_dataset_t *to_ds = dspp->to_ds; + zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb; + dsl_pool_t *dp = dspp->dp; + void *tag = dspp->tag; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { @@ -1026,7 +2362,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, * either a snapshot or we have owned the dataset, ensuring that * it can't be modified. 
*/ - if (!rawok && os->os_encrypted && + if (!dspp->rawok && os->os_encrypted && arc_is_unauthenticated(os->os_phys_buf)) { zbookmark_phys_t zb; @@ -1042,225 +2378,244 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, ASSERT0(arc_is_unauthenticated(os->os_phys_buf)); } - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, - DMU_SUBSTREAM); + if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) { + dsl_pool_rele(dp, tag); + return (err); + } - bzero(&to_arg, sizeof (to_arg)); - -#ifdef _KERNEL - if (dmu_objset_type(os) == DMU_OST_ZFS) { - uint64_t version; - if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); + /* + * If we're doing a redacted send, hold the bookmark's redaction list. + */ + if (dspp->redactbook != NULL) { + err = dsl_redaction_list_hold_obj(dp, + dspp->redactbook->zbm_redaction_obj, FTAG, + &redact_rl); + if (err != 0) { dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } - if (version >= ZPL_VERSION_SA) { - featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + dsl_redaction_list_long_hold(dp, redact_rl, FTAG); + } + + /* + * If we're sending from a redaction bookmark, hold the redaction list + * so that we can consider sending the redacted blocks. 
+ */ + if (ancestor_zb->zbm_redaction_obj != 0) { + err = dsl_redaction_list_hold_obj(dp, + ancestor_zb->zbm_redaction_obj, FTAG, &from_rl); + if (err != 0) { + if (redact_rl != NULL) { + dsl_redaction_list_long_rele(redact_rl, FTAG); + dsl_redaction_list_rele(redact_rl, FTAG); + } + dsl_pool_rele(dp, tag); + return (SET_ERROR(EINVAL)); } + dsl_redaction_list_long_hold(dp, from_rl, FTAG); } -#endif - - /* raw sends imply large_block_ok */ - if ((large_block_ok || rawok) && - dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) - featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; - if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) - featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; - - /* encrypted datasets will not have embedded blocks */ - if ((embedok || rawok) && !os->os_encrypted && - spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { - featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; - } - - /* raw send implies compressok */ - if (compressok || rawok) - featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; - - if (rawok && os->os_encrypted) - featureflags |= DMU_BACKUP_FEATURE_RAW; - - if ((featureflags & - (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | - DMU_BACKUP_FEATURE_RAW)) != 0 && - spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { - featureflags |= DMU_BACKUP_FEATURE_LZ4; - } - - if (resumeobj != 0 || resumeoff != 0) { - featureflags |= DMU_BACKUP_FEATURE_RESUMING; - } - - DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, - featureflags); - - drr->drr_u.drr_begin.drr_creation_time = - dsl_dataset_phys(to_ds)->ds_creation_time; - drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); - if (is_clone) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; - if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (zfs_send_set_freerecords_bit) - 
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; - - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK; - - if (ancestor_zb != NULL) { - drr->drr_u.drr_begin.drr_fromguid = - ancestor_zb->zbm_guid; - fromtxg = ancestor_zb->zbm_creation_txg; - } - dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); - if (!to_ds->ds_is_snapshot) { - (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", - sizeof (drr->drr_u.drr_begin.drr_toname)); - } - - dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); - - dsp->dsa_drr = drr; - dsp->dsa_vp = vp; - dsp->dsa_outfd = outfd; - dsp->dsa_proc = curproc; - dsp->dsa_os = os; - dsp->dsa_off = off; - dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; - dsp->dsa_fromtxg = fromtxg; - dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_featureflags = featureflags; - dsp->dsa_resume_object = resumeobj; - dsp->dsa_resume_offset = resumeoff; - - mutex_enter(&to_ds->ds_sendstream_lock); - list_insert_head(&to_ds->ds_sendstreams, dsp); - mutex_exit(&to_ds->ds_sendstream_lock); dsl_dataset_long_hold(to_ds, FTAG); + + from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP); + to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP); + rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP); + smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP); + srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP); + + drr = create_begin_record(dspp, os, featureflags); + dssp = setup_send_progress(dspp); + + dsc.dsc_drr = drr; + dsc.dsc_dso = dspp->dso; + dsc.dsc_os = os; + dsc.dsc_off = dspp->off; + dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid; + dsc.dsc_fromtxg = fromtxg; + dsc.dsc_pending_op = PENDING_NONE; + dsc.dsc_featureflags = featureflags; + dsc.dsc_resume_object = dspp->resumeobj; + dsc.dsc_resume_offset = dspp->resumeoff; + dsl_pool_rele(dp, tag); - /* handle features that require a DRR_BEGIN payload */ - if (featureflags & - (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) { - nvlist_t *keynvl = NULL; - nvlist_t *nvl = fnvlist_alloc(); + 
void *payload = NULL; + size_t payload_len = 0; + nvlist_t *nvl = fnvlist_alloc(); - if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { - dmu_object_info_t to_doi; - err = dmu_object_info(os, resumeobj, &to_doi); - if (err != 0) { - fnvlist_free(nvl); - goto out; - } - - SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, - resumeobj, 0, - resumeoff / to_doi.doi_data_block_size); - - fnvlist_add_uint64(nvl, "resume_object", resumeobj); - fnvlist_add_uint64(nvl, "resume_offset", resumeoff); - } - - if (featureflags & DMU_BACKUP_FEATURE_RAW) { - uint64_t ivset_guid = (ancestor_zb != NULL) ? - ancestor_zb->zbm_ivset_guid : 0; - - ASSERT(os->os_encrypted); - - err = dsl_crypto_populate_key_nvlist(to_ds, - ivset_guid, &keynvl); - if (err != 0) { - fnvlist_free(nvl); - goto out; - } - - fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); - } - - payload = fnvlist_pack(nvl, &payload_len); - drr->drr_payloadlen = payload_len; - fnvlist_free(keynvl); - fnvlist_free(nvl); + /* + * If we're doing a redacted send, we include the snapshots we're + * redacted with respect to so that the target system knows what send + * streams can be correctly received on top of this dataset. If we're + * instead sending a redacted dataset, we include the snapshots that the + * dataset was created with respect to. + */ + if (dspp->redactbook != NULL) { + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, + redact_rl->rl_phys->rlp_snaps, + redact_rl->rl_phys->rlp_num_snaps); + } else if (dsl_dataset_feature_is_active(to_ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t *tods_guids; + uint64_t length; + VERIFY(dsl_dataset_get_uint64_array_feature(to_ds, + SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids)); + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids, + length); } - err = dump_record(dsp, payload, payload_len); + /* + * If we're sending from a redaction bookmark, then we should retrieve + * the guids of that bookmark so we can send them over the wire. 
+ */ + if (from_rl != NULL) { + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, + from_rl->rl_phys->rlp_snaps, + from_rl->rl_phys->rlp_num_snaps); + } + + /* + * If the snapshot we're sending from is redacted, include the redaction + * list in the stream. + */ + if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) { + ASSERT3P(from_rl, ==, NULL); + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, + dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps); + if (dspp->numfromredactsnaps > 0) { + kmem_free(dspp->fromredactsnaps, + dspp->numfromredactsnaps * sizeof (uint64_t)); + dspp->fromredactsnaps = NULL; + } + } + + if (resuming || book_resuming) { + err = setup_resume_points(dspp, to_arg, from_arg, + rlt_arg, smt_arg, resuming, os, redact_rl, nvl); + if (err != 0) + goto out; + } + + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + uint64_t ivset_guid = (ancestor_zb != NULL) ? + ancestor_zb->zbm_ivset_guid : 0; + nvlist_t *keynvl = NULL; + ASSERT(os->os_encrypted); + + err = dsl_crypto_populate_key_nvlist(os, ivset_guid, + &keynvl); + if (err != 0) { + fnvlist_free(nvl); + goto out; + } + + fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); + fnvlist_free(keynvl); + } + + if (!nvlist_empty(nvl)) { + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + } + + fnvlist_free(nvl); + err = dump_record(&dsc, payload, payload_len); fnvlist_pack_free(payload, payload_len); if (err != 0) { - err = dsp->dsa_err; + err = dsc.dsc_err; goto out; } - err = bqueue_init(&to_arg.q, - MAX(zfs_send_queue_length, 2 * zfs_max_recordsize), - offsetof(struct send_block_record, ln)); - to_arg.error_code = 0; - to_arg.cancel = B_FALSE; - to_arg.ds = to_ds; - to_arg.fromtxg = fromtxg; - to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; - if (rawok) - to_arg.flags |= TRAVERSE_NO_DECRYPT; - (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, - TS_RUN, minclsyspri); + setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok); + 
setup_from_thread(from_arg, from_rl, dssp); + setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp); + setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os); + setup_reader_thread(srt_arg, dspp, smt_arg, featureflags); - to_data = bqueue_dequeue(&to_arg.q); - - while (!to_data->eos_marker && err == 0) { - err = do_dump(dsp, to_data); - to_data = get_next_record(&to_arg.q, to_data); + range = bqueue_dequeue(&srt_arg->q); + while (err == 0 && !range->eos_marker) { + err = do_dump(&dsc, range); + range = get_next_range(&srt_arg->q, range); if (issig(JUSTLOOKING) && issig(FORREAL)) - err = EINTR; + err = SET_ERROR(EINTR); } + /* + * If we hit an error or are interrupted, cancel our worker threads and + * clear the queue of any pending records. The threads will pass the + * cancel up the tree of worker threads, and each one will clean up any + * pending records before exiting. + */ if (err != 0) { - to_arg.cancel = B_TRUE; - while (!to_data->eos_marker) { - to_data = get_next_record(&to_arg.q, to_data); + srt_arg->cancel = B_TRUE; + while (!range->eos_marker) { + range = get_next_range(&srt_arg->q, range); } } - kmem_free(to_data, sizeof (*to_data)); + range_free(range); - bqueue_destroy(&to_arg.q); + bqueue_destroy(&srt_arg->q); + bqueue_destroy(&smt_arg->q); + if (dspp->redactbook != NULL) + bqueue_destroy(&rlt_arg->q); + bqueue_destroy(&to_arg->q); + bqueue_destroy(&from_arg->q); - if (err == 0 && to_arg.error_code != 0) - err = to_arg.error_code; + if (err == 0 && srt_arg->error != 0) + err = srt_arg->error; if (err != 0) goto out; - if (dsp->dsa_pending_op != PENDING_NONE) - if (dump_record(dsp, NULL, 0) != 0) + if (dsc.dsc_pending_op != PENDING_NONE) + if (dump_record(&dsc, NULL, 0) != 0) err = SET_ERROR(EINTR); if (err != 0) { - if (err == EINTR && dsp->dsa_err != 0) - err = dsp->dsa_err; + if (err == EINTR && dsc.dsc_err != 0) + err = dsc.dsc_err; goto out; } - bzero(drr, sizeof (dmu_replay_record_t)); - drr->drr_type = DRR_END; - 
drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; - drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; + /* + * Send the DRR_END record if this is not a saved stream. + * Otherwise, the omitted DRR_END record will signal to + * the receive side that the stream is incomplete. + */ + if (!dspp->savedok) { + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc; + drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid; - if (dump_record(dsp, NULL, 0) != 0) - err = dsp->dsa_err; + if (dump_record(&dsc, NULL, 0) != 0) + err = dsc.dsc_err; + } out: mutex_enter(&to_ds->ds_sendstream_lock); - list_remove(&to_ds->ds_sendstreams, dsp); + list_remove(&to_ds->ds_sendstreams, dssp); mutex_exit(&to_ds->ds_sendstream_lock); - VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); + VERIFY(err != 0 || (dsc.dsc_sent_begin && + (dsc.dsc_sent_end || dspp->savedok))); kmem_free(drr, sizeof (dmu_replay_record_t)); - kmem_free(dsp, sizeof (dmu_sendarg_t)); + kmem_free(dssp, sizeof (dmu_sendstatus_t)); + kmem_free(from_arg, sizeof (*from_arg)); + kmem_free(to_arg, sizeof (*to_arg)); + kmem_free(rlt_arg, sizeof (*rlt_arg)); + kmem_free(smt_arg, sizeof (*smt_arg)); + kmem_free(srt_arg, sizeof (*srt_arg)); dsl_dataset_long_rele(to_ds, FTAG); + if (from_rl != NULL) { + dsl_redaction_list_long_rele(from_rl, FTAG); + dsl_redaction_list_rele(from_rl, FTAG); + } + if (redact_rl != NULL) { + dsl_redaction_list_long_rele(redact_rl, FTAG); + dsl_redaction_list_rele(redact_rl, FTAG); + } return (err); } @@ -1268,104 +2623,216 @@ out: int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - boolean_t rawok, int outfd, vnode_t *vp, offset_t *off) + boolean_t rawok, boolean_t savedok, int outfd, offset_t *off, + dmu_send_outparams_t *dsop) { - dsl_pool_t *dp; - dsl_dataset_t *ds; - dsl_dataset_t *fromds = NULL; - ds_hold_flags_t dsflags = (rawok) ? 
0 : DS_HOLD_FLAG_DECRYPT; int err; + dsl_dataset_t *fromds; + ds_hold_flags_t dsflags; + struct dmu_send_params dspp = {0}; + dspp.embedok = embedok; + dspp.large_block_ok = large_block_ok; + dspp.compressok = compressok; + dspp.outfd = outfd; + dspp.off = off; + dspp.dso = dsop; + dspp.tag = FTAG; + dspp.rawok = rawok; + dspp.savedok = savedok; - err = dsl_pool_hold(pool, FTAG, &dp); + dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; + err = dsl_pool_hold(pool, FTAG, &dspp.dp); if (err != 0) return (err); - err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds); + err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG, + &dspp.to_ds); if (err != 0) { - dsl_pool_rele(dp, FTAG); + dsl_pool_rele(dspp.dp, FTAG); return (err); } if (fromsnap != 0) { - zfs_bookmark_phys_t zb = { 0 }; - boolean_t is_clone; - - err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); + err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags, + FTAG, &fromds); if (err != 0) { - dsl_dataset_rele_flags(ds, dsflags, FTAG); - dsl_pool_rele(dp, FTAG); + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); + dsl_pool_rele(dspp.dp, FTAG); return (err); } - if (!dsl_dataset_is_before(ds, fromds, 0)) { - err = SET_ERROR(EXDEV); - dsl_dataset_rele(fromds, FTAG); - dsl_dataset_rele_flags(ds, dsflags, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); - } - - zb.zbm_creation_time = + dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + dspp.ancestor_zb.zbm_creation_txg = + dsl_dataset_phys(fromds)->ds_creation_txg; + dspp.ancestor_zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; - zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; if (dsl_dataset_is_zapified(fromds)) { - (void) zap_lookup(dp->dp_meta_objset, + (void) zap_lookup(dspp.dp->dp_meta_objset, fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, - &zb.zbm_ivset_guid); + &dspp.ancestor_zb.zbm_ivset_guid); } - is_clone = 
(fromds->ds_dir != ds->ds_dir); + /* See dmu_send for the reasons behind this. */ + uint64_t *fromredact; + + if (!dsl_dataset_get_uint64_array_feature(fromds, + SPA_FEATURE_REDACTED_DATASETS, + &dspp.numfromredactsnaps, + &fromredact)) { + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + } else if (dspp.numfromredactsnaps > 0) { + uint64_t size = dspp.numfromredactsnaps * + sizeof (uint64_t); + dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); + bcopy(fromredact, dspp.fromredactsnaps, size); + } + + boolean_t is_before = + dsl_dataset_is_before(dspp.to_ds, fromds, 0); + dspp.is_clone = (dspp.to_ds->ds_dir != + fromds->ds_dir); dsl_dataset_rele(fromds, FTAG); - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, rawok, outfd, - 0, 0, vp, off); + if (!is_before) { + dsl_pool_rele(dspp.dp, FTAG); + err = SET_ERROR(EXDEV); + } else { + err = dmu_send_impl(&dspp); + } } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, rawok, outfd, - 0, 0, vp, off); + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dmu_send_impl(&dspp); } - dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_dataset_rele(dspp.to_ds, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, - int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, - offset_t *off) + boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff, + const char *redactbook, int outfd, offset_t *off, + dmu_send_outparams_t *dsop) { - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + int err = 0; + ds_hold_flags_t dsflags; boolean_t owned = B_FALSE; + dsl_dataset_t *fromds = NULL; + zfs_bookmark_phys_t book = {0}; + struct dmu_send_params dspp = {0}; + + dsflags = (rawok) ? 
DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; + dspp.tosnap = tosnap; + dspp.embedok = embedok; + dspp.large_block_ok = large_block_ok; + dspp.compressok = compressok; + dspp.outfd = outfd; + dspp.off = off; + dspp.dso = dsop; + dspp.tag = FTAG; + dspp.resumeobj = resumeobj; + dspp.resumeoff = resumeoff; + dspp.rawok = rawok; + dspp.savedok = savedok; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); - err = dsl_pool_hold(tosnap, FTAG, &dp); + err = dsl_pool_hold(tosnap, FTAG, &dspp.dp); if (err != 0) return (err); - if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { + + if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) { /* * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ - err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds); + + if (savedok) { + /* + * We are looking for the dataset that represents the + * partially received send stream. If this stream was + * received as a new snapshot of an existing dataset, + * this will be saved in a hidden clone named + * "//%recv". Otherwise, the stream + * will be saved in the live dataset itself. In + * either case we need to use dsl_dataset_own_force() + * because the stream is marked as inconsistent, + * which would normally make it unavailable to be + * owned. 
+ */ + char *name = kmem_asprintf("%s/%s", tosnap, + recv_clone_name); + err = dsl_dataset_own_force(dspp.dp, name, dsflags, + FTAG, &dspp.to_ds); + if (err == ENOENT) { + err = dsl_dataset_own_force(dspp.dp, tosnap, + dsflags, FTAG, &dspp.to_ds); + } + + if (err == 0) { + err = zap_lookup(dspp.dp->dp_meta_objset, + dspp.to_ds->ds_object, + DS_FIELD_RESUME_TOGUID, 8, 1, + &dspp.saved_guid); + } + + if (err == 0) { + err = zap_lookup(dspp.dp->dp_meta_objset, + dspp.to_ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, + sizeof (dspp.saved_toname), + dspp.saved_toname); + } + if (err != 0) + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); + + kmem_strfree(name); + } else { + err = dsl_dataset_own(dspp.dp, tosnap, dsflags, + FTAG, &dspp.to_ds); + } owned = B_TRUE; } else { - err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds); + err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, + &dspp.to_ds); } + if (err != 0) { - dsl_pool_rele(dp, FTAG); + dsl_pool_rele(dspp.dp, FTAG); + return (err); + } + + if (redactbook != NULL) { + char path[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(path, tosnap, sizeof (path)); + char *at = strchr(path, '@'); + if (at == NULL) { + err = EINVAL; + } else { + (void) snprintf(at, sizeof (path) - (at - path), "#%s", + redactbook); + err = dsl_bookmark_lookup(dspp.dp, path, + NULL, &book); + dspp.redactbook = &book; + } + } + + if (err != 0) { + dsl_pool_rele(dspp.dp, FTAG); + if (owned) + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); + else + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); return (err); } if (fromsnap != NULL) { - zfs_bookmark_phys_t zb = { 0 }; - boolean_t is_clone = B_FALSE; - int fsnamelen = strchr(tosnap, '@') - tosnap; + zfs_bookmark_phys_t *zb = &dspp.ancestor_zb; + int fsnamelen; + if (strpbrk(tosnap, "@#") != NULL) + fsnamelen = strpbrk(tosnap, "@#") - tosnap; + else + fsnamelen = strlen(tosnap); /* * If the fromsnap is in a different filesystem, then @@ -1374,55 +2841,85 @@ dmu_send(const char *tosnap, 
const char *fromsnap, boolean_t embedok, if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || (fromsnap[fsnamelen] != '@' && fromsnap[fsnamelen] != '#')) { - is_clone = B_TRUE; + dspp.is_clone = B_TRUE; } - if (strchr(fromsnap, '@')) { - dsl_dataset_t *fromds; - err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); - if (err == 0) { - if (!dsl_dataset_is_before(ds, fromds, 0)) - err = SET_ERROR(EXDEV); - zb.zbm_creation_time = - dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = - dsl_dataset_phys(fromds)->ds_creation_txg; - zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; - is_clone = (ds->ds_dir != fromds->ds_dir); + if (strchr(fromsnap, '@') != NULL) { + err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG, + &fromds); - if (dsl_dataset_is_zapified(fromds)) { - (void) zap_lookup(dp->dp_meta_objset, - fromds->ds_object, - DS_FIELD_IVSET_GUID, 8, 1, - &zb.zbm_ivset_guid); + if (err != 0) { + ASSERT3P(fromds, ==, NULL); + } else { + /* + * We need to make a deep copy of the redact + * snapshots of the from snapshot, because the + * array will be freed when we evict from_ds. 
+ */ + uint64_t *fromredact; + if (!dsl_dataset_get_uint64_array_feature( + fromds, SPA_FEATURE_REDACTED_DATASETS, + &dspp.numfromredactsnaps, + &fromredact)) { + dspp.numfromredactsnaps = + NUM_SNAPS_NOT_REDACTED; + } else if (dspp.numfromredactsnaps > 0) { + uint64_t size = + dspp.numfromredactsnaps * + sizeof (uint64_t); + dspp.fromredactsnaps = kmem_zalloc(size, + KM_SLEEP); + bcopy(fromredact, dspp.fromredactsnaps, + size); + } + if (!dsl_dataset_is_before(dspp.to_ds, fromds, + 0)) { + err = SET_ERROR(EXDEV); + } else { + zb->zbm_creation_txg = + dsl_dataset_phys(fromds)-> + ds_creation_txg; + zb->zbm_creation_time = + dsl_dataset_phys(fromds)-> + ds_creation_time; + zb->zbm_guid = + dsl_dataset_phys(fromds)->ds_guid; + zb->zbm_redaction_obj = 0; + + if (dsl_dataset_is_zapified(fromds)) { + (void) zap_lookup( + dspp.dp->dp_meta_objset, + fromds->ds_object, + DS_FIELD_IVSET_GUID, 8, 1, + &zb->zbm_ivset_guid); + } } dsl_dataset_rele(fromds, FTAG); } } else { - err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds, + zb); + if (err == EXDEV && zb->zbm_redaction_obj != 0 && + zb->zbm_guid == + dsl_dataset_phys(dspp.to_ds)->ds_guid) + err = 0; } - if (err != 0) { - if (owned) - dsl_dataset_disown(ds, dsflags, FTAG); - else - dsl_dataset_rele_flags(ds, dsflags, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); + if (err == 0) { + /* dmu_send_impl will call dsl_pool_rele for us. 
*/ + err = dmu_send_impl(&dspp); + } else { + dsl_pool_rele(dspp.dp, FTAG); } - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, rawok, - outfd, resumeobj, resumeoff, vp, off); } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, rawok, - outfd, resumeobj, resumeoff, vp, off); + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dmu_send_impl(&dspp); } if (owned) - dsl_dataset_disown(ds, dsflags, FTAG); + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); else - dsl_dataset_rele_flags(ds, dsflags, FTAG); - + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); return (err); } @@ -1483,39 +2980,83 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, } int -dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, - boolean_t stream_compressed, uint64_t *sizep) +dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, + zfs_bookmark_phys_t *frombook, boolean_t stream_compressed, + boolean_t saved, uint64_t *sizep) { int err; + dsl_dataset_t *ds = origds; uint64_t uncomp, comp; - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* tosnap must be a snapshot */ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* fromsnap, if provided, must be a snapshot */ - if (fromds != NULL && !fromds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); + ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool)); + ASSERT(fromds == NULL || frombook == NULL); /* - * fromsnap must be an earlier snapshot from the same fs as tosnap, - * or the origin's fs. + * If this is a saved send we may actually be sending + * from the %recv clone used for resuming. */ - if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) - return (SET_ERROR(EXDEV)); + if (saved) { + objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset; + uint64_t guid; + char dsname[ZFS_MAX_DATASET_NAME_LEN + 6]; - /* Get compressed and uncompressed size estimates of changed data. 
*/ - if (fromds == NULL) { + dsl_dataset_name(origds, dsname); + (void) strcat(dsname, "/"); + (void) strcat(dsname, recv_clone_name); + + err = dsl_dataset_hold(origds->ds_dir->dd_pool, + dsname, FTAG, &ds); + if (err != ENOENT && err != 0) { + return (err); + } else if (err == ENOENT) { + ds = origds; + } + + /* check that this dataset has partially received data */ + err = zap_lookup(mos, ds->ds_object, + DS_FIELD_RESUME_TOGUID, 8, 1, &guid); + if (err != 0) { + err = SET_ERROR(err == ENOENT ? EINVAL : err); + goto out; + } + + err = zap_lookup(mos, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname); + if (err != 0) { + err = SET_ERROR(err == ENOENT ? EINVAL : err); + goto out; + } + } + + /* tosnap must be a snapshot or the target of a saved send */ + if (!ds->ds_is_snapshot && ds == origds) + return (SET_ERROR(EINVAL)); + + if (fromds != NULL) { + uint64_t used; + if (!fromds->ds_is_snapshot) { + err = SET_ERROR(EINVAL); + goto out; + } + + if (!dsl_dataset_is_before(ds, fromds, 0)) { + err = SET_ERROR(EXDEV); + goto out; + } + + err = dsl_dataset_space_written(fromds, ds, &used, &comp, + &uncomp); + if (err != 0) + goto out; + } else if (frombook != NULL) { + uint64_t used; + err = dsl_dataset_space_written_bookmark(frombook, ds, &used, + &comp, &uncomp); + if (err != 0) + goto out; + } else { uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; comp = dsl_dataset_phys(ds)->ds_compressed_bytes; - } else { - uint64_t used; - err = dsl_dataset_space_written(fromds, ds, - &used, &comp, &uncomp); - if (err != 0) - return (err); } err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, @@ -1524,84 +3065,32 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, * Add the size of the BEGIN and END records to the estimate. 
*/ *sizep += 2 * sizeof (dmu_replay_record_t); + +out: + if (ds != origds) + dsl_dataset_rele(ds, FTAG); return (err); } -struct calculate_send_arg { - uint64_t uncompressed; - uint64_t compressed; -}; - -/* - * Simple callback used to traverse the blocks of a snapshot and sum their - * uncompressed and compressed sizes. - */ -/* ARGSUSED */ -static int -dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct calculate_send_arg *space = arg; - if (bp != NULL && !BP_IS_HOLE(bp)) { - space->uncompressed += BP_GET_UCSIZE(bp); - space->compressed += BP_GET_PSIZE(bp); - } - return (0); -} - -/* - * Given a desination snapshot and a TXG, calculate the approximate size of a - * send stream sent from that TXG. from_txg may be zero, indicating that the - * whole snapshot will be sent. - */ -int -dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, - boolean_t stream_compressed, uint64_t *sizep) -{ - int err; - struct calculate_send_arg size = { 0 }; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* tosnap must be a snapshot */ - if (!dsl_dataset_is_snapshot(ds)) - return (SET_ERROR(EINVAL)); - - /* verify that from_txg is before the provided snapshot was taken */ - if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { - return (SET_ERROR(EXDEV)); - } - /* - * traverse the blocks of the snapshot with birth times after - * from_txg, summing their uncompressed size - */ - err = traverse_dataset(ds, from_txg, - TRAVERSE_POST | TRAVERSE_NO_DECRYPT, - dmu_calculate_send_traversal, &size); - - if (err) - return (err); - - err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, - size.compressed, stream_compressed, sizep); - return (err); -} - - -#if defined(_KERNEL) /* BEGIN CSTYLED */ -module_param(zfs_override_estimate_recordsize, ulong, 0644); -MODULE_PARM_DESC(zfs_override_estimate_recordsize, - "Record size calculation override for zfs send 
estimates"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW, + "Allow sending corrupt data"); -module_param(zfs_send_corrupt_data, int, 0644); -MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data"); +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW, + "Maximum send queue length"); -module_param(zfs_send_queue_length, int, 0644); -MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length"); - -module_param(zfs_send_unmodified_spill_blocks, int, 0644); -MODULE_PARM_DESC(zfs_send_unmodified_spill_blocks, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW, "Send unmodified spill blocks"); -#endif + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW, + "Maximum send queue length for non-prefetch queues"); + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW, + "Send queue fill fraction"); + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW, + "Send queue fill fraction for non-prefetch queues"); + +ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW, + "Override block size estimate with fixed size"); +/* END CSTYLED */ diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index f426520991..862c0bf404 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -41,6 +41,7 @@ int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ int32_t send_holes_without_birth_time = 1; +int32_t zfs_traverse_indirect_prefetch_limit = 32; typedef struct prefetch_data { kmutex_t pd_mtx; @@ -67,13 +68,14 @@ typedef struct traverse_data { boolean_t td_realloc_possible; } traverse_data_t; -static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object); +static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp, + const dnode_phys_t *dnp, uint64_t objset, uint64_t object); static void prefetch_dnode_metadata(traverse_data_t 
*td, const dnode_phys_t *, uint64_t objset, uint64_t object); static int -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, + uint64_t claim_txg) { traverse_data_t *td = arg; zbookmark_phys_t zb; @@ -93,7 +95,8 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) } static int -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, + uint64_t claim_txg) { traverse_data_t *td = arg; @@ -174,7 +177,10 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, return (RESUME_SKIP_NONE); } -static void +/* + * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE. + */ +static boolean_t traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { @@ -182,24 +188,26 @@ traverse_prefetch_metadata(traverse_data_t *td, int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) - return; + return (B_FALSE); /* * If we are in the process of resuming, don't prefetch, because * some children will not be needed (and in fact may have already * been freed). 
*/ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) - return; + return (B_FALSE); if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) - return; + return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) - return; + return (B_FALSE); + ASSERT(!BP_IS_REDACTED(bp)); if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + return (B_TRUE); } static boolean_t @@ -207,7 +215,7 @@ prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) { ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || - BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp)) return (B_FALSE); return (B_TRUE); } @@ -274,7 +282,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, mutex_exit(&pd->pd_mtx); } - if (BP_IS_HOLE(bp)) { + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err != 0) goto post; @@ -292,7 +300,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (BP_GET_LEVEL(bp) > 0) { uint32_t flags = ARC_FLAG_WAIT; - int32_t i; + int32_t i, ptidx, pidx; + uint32_t prefetchlimit; int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t *czb; @@ -305,16 +314,46 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); + /* + * When performing a traversal it is beneficial to + * asynchronously read-ahead the upcoming indirect + * blocks since they will be needed shortly. However, + * since a 128k indirect (non-L0) block may contain up + * to 1024 128-byte block pointers, its preferable to not + * prefetch them all at once. 
Issuing a large number of + * async reads may effect performance, and the earlier + * the indirect blocks are prefetched the less likely + * they are to still be resident in the ARC when needed. + * Therefore, prefetching indirect blocks is limited to + * zfs_traverse_indirect_prefetch_limit=32 blocks by + * default. + * + * pidx: Index for which next prefetch to be issued. + * ptidx: Index at which next prefetch to be triggered. + */ + ptidx = 0; + pidx = 1; + prefetchlimit = zfs_traverse_indirect_prefetch_limit; for (i = 0; i < epb; i++) { - SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - traverse_prefetch_metadata(td, - &((blkptr_t *)buf->b_data)[i], czb); - } + if (prefetchlimit && i == ptidx) { + ASSERT3S(ptidx, <=, pidx); + for (uint32_t prefetched = 0; pidx < epb && + prefetched < prefetchlimit; pidx++) { + SET_BOOKMARK(czb, zb->zb_objset, + zb->zb_object, zb->zb_level - 1, + zb->zb_blkid * epb + pidx); + if (traverse_prefetch_metadata(td, + &((blkptr_t *)buf->b_data)[pidx], + czb) == B_TRUE) { + prefetched++; + if (prefetched == + MAX(prefetchlimit / 2, 1)) + ptidx = pidx; + } + } + } - /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i++) { + /* recursively visitbp() blocks below this */ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); @@ -354,7 +393,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { - err = traverse_dnode(td, &child_dnp[i], + err = traverse_dnode(td, bp, &child_dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; @@ -395,19 +434,19 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_objset, DMU_USERUSED_OBJECT); } - err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, + err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if 
(err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) { if (OBJSET_BUF_HAS_PROJECTUSED(buf)) - err = traverse_dnode(td, + err = traverse_dnode(td, bp, &osp->os_projectused_dnode, zb->zb_objset, DMU_PROJECTUSED_OBJECT); if (err == 0) - err = traverse_dnode(td, + err = traverse_dnode(td, bp, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); if (err == 0) - err = traverse_dnode(td, + err = traverse_dnode(td, bp, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } @@ -475,7 +514,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, } static int -traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, +traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j, err = 0; @@ -488,7 +527,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (td->td_flags & TRAVERSE_PRE) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); @@ -511,7 +550,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (err == 0 && (td->td_flags & TRAVERSE_POST)) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); @@ -532,7 +571,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ARC_FLAG_PRESCIENT_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); - if (bp == NULL) + if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); @@ -635,6 +674,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; + ASSERT(!BP_IS_REDACTED(rootbp)); if ((td->td_flags & 
TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(rootbp)) @@ -766,18 +806,22 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, return (err); } -#if defined(_KERNEL) EXPORT_SYMBOL(traverse_dataset); EXPORT_SYMBOL(traverse_pool); -module_param(zfs_pd_bytes_max, int, 0644); -MODULE_PARM_DESC(zfs_pd_bytes_max, "Max number of bytes to prefetch"); +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW, + "Max number of bytes to prefetch"); +ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW, + "Traverse prefetch number of blocks pointed by indirect block"); + +#if defined(_KERNEL) module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644); -MODULE_PARM_DESC(ignore_hole_birth, "Alias for send_holes_without_birth_time"); - -module_param_named(send_holes_without_birth_time, - send_holes_without_birth_time, int, 0644); -MODULE_PARM_DESC(send_holes_without_birth_time, - "Ignore hole_birth txg for zfs send"); +MODULE_PARM_DESC(ignore_hole_birth, + "Alias for send_holes_without_birth_time"); #endif + +ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW, + "Ignore hole_birth txg for zfs send"); +/* END CSTYLED */ diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index cbadcc86fc..5fa5168666 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); @@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -230,9 +231,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); - if 
(zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) - err = SET_ERROR(EFBIG); - if (dn == NULL) return; @@ -316,23 +314,6 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) } } -void -dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_WRITE, 0, 0); - if (txh == NULL) - return; - - dnode_t *dn = txh->txh_dnode; - (void) zfs_refcount_add_many(&txh->txh_space_towrite, - 1ULL << dn->dn_indblkshift, FTAG); - dmu_tx_count_dnode(txh); -} - void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { @@ -633,7 +614,8 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", - txh->txh_type, beginblk, endblk); + txh->txh_type, (u_longlong_t)beginblk, + (u_longlong_t)endblk); switch (txh->txh_type) { case THT_WRITE: @@ -903,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) return (SET_ERROR(ERESTART)); } + if (!tx->tx_dirty_delayed && + dsl_pool_wrlog_over_max(tx->tx_pool)) { + DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + return (SET_ERROR(ERESTART)); + } + if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; @@ -925,6 +913,25 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { + /* + * This thread can't hold the dn_struct_rwlock + * while assigning the tx, because this can lead to + * deadlock. Specifically, if this dnode is already + * assigned to an earlier txg, this thread may need + * to wait for that txg to sync (the ERESTART case + * below). 
The other thread that has assigned this + * dnode to an earlier txg prevents this txg from + * syncing until its tx can complete (calling + * dmu_tx_commit()), but it may need to acquire the + * dn_struct_rwlock to do so (e.g. via + * dmu_buf_hold*()). + * + * Note that this thread can't hold the lock for + * read either, but the rwlock doesn't record + * enough information to make that assertion. + */ + ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock)); + mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); @@ -1013,6 +1020,22 @@ dmu_tx_unassign(dmu_tx_t *tx) * details on the throttle). This is used by the VFS operations, after * they have already called dmu_tx_wait() (though most likely on a * different tx). + * + * It is guaranteed that subsequent successful calls to dmu_tx_assign() + * will assign the tx to monotonically increasing txgs. Of course this is + * not strong monotonicity, because the same txg can be returned multiple + * times in a row. This guarantee holds both for subsequent calls from + * one thread and for multiple threads. For example, it is impossible to + * observe the following sequence of events: + * + * Thread 1 Thread 2 + * + * dmu_tx_assign(T1, ...) + * 1 <- dmu_tx_get_txg(T1) + * dmu_tx_assign(T2, ...) + * 2 <- dmu_tx_get_txg(T2) + * dmu_tx_assign(T3, ...) + * 1 <- dmu_tx_get_txg(T3) */ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) @@ -1181,7 +1204,7 @@ dmu_tx_abort(dmu_tx_t *tx) * Call any registered callbacks with an error code. 
*/ if (!list_is_empty(&tx->tx_callbacks)) - dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED)); dmu_tx_destroy(tx); } @@ -1319,7 +1342,10 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) object = sa_handle_object(hdl); - dmu_tx_hold_bonus(tx, object); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + DB_DNODE_ENTER(db); + dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db)); + DB_DNODE_EXIT(db); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; @@ -1341,7 +1367,6 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; DB_DNODE_ENTER(db); diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 364e4d7aa8..043344a137 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include @@ -34,6 +34,7 @@ #include #include #include +#include /* * This tunable disables predictive prefetch. 
Note that it leaves "prescient" @@ -59,28 +60,64 @@ typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; + kstat_named_t zfetchstat_io_issued; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, }; -#define ZFETCHSTAT_BUMP(stat) \ - atomic_inc_64(&zfetch_stats.stat.value.ui64); +struct { + wmsum_t zfetchstat_hits; + wmsum_t zfetchstat_misses; + wmsum_t zfetchstat_max_streams; + wmsum_t zfetchstat_io_issued; +} zfetch_sums; + +#define ZFETCHSTAT_BUMP(stat) \ + wmsum_add(&zfetch_sums.stat, 1) +#define ZFETCHSTAT_ADD(stat, val) \ + wmsum_add(&zfetch_sums.stat, val) + kstat_t *zfetch_ksp; +static int +zfetch_kstats_update(kstat_t *ksp, int rw) +{ + zfetch_stats_t *zs = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + zs->zfetchstat_hits.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_hits); + zs->zfetchstat_misses.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_misses); + zs->zfetchstat_max_streams.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_max_streams); + zs->zfetchstat_io_issued.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_io_issued); + return (0); +} + void zfetch_init(void) { + wmsum_init(&zfetch_sums.zfetchstat_hits, 0); + wmsum_init(&zfetch_sums.zfetchstat_misses, 0); + wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); + wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); + zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zfetch_ksp != NULL) { zfetch_ksp->ks_data = &zfetch_stats; + zfetch_ksp->ks_update = zfetch_kstats_update; kstat_install(zfetch_ksp); } } @@ -92,6 +129,11 @@ zfetch_fini(void) kstat_delete(zfetch_ksp); zfetch_ksp = NULL; } + + wmsum_fini(&zfetch_sums.zfetchstat_hits); + 
wmsum_fini(&zfetch_sums.zfetchstat_misses); + wmsum_fini(&zfetch_sums.zfetchstat_max_streams); + wmsum_fini(&zfetch_sums.zfetchstat_io_issued); } /* @@ -104,22 +146,33 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { if (zf == NULL) return; - zf->zf_dnode = dno; + zf->zf_numstreams = 0; list_create(&zf->zf_stream, sizeof (zstream_t), offsetof(zstream_t, zs_node)); - rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); +} + +static void +dmu_zfetch_stream_fini(zstream_t *zs) +{ + ASSERT(!list_link_active(&zs->zs_node)); + zfs_refcount_destroy(&zs->zs_callers); + zfs_refcount_destroy(&zs->zs_refs); + kmem_free(zs, sizeof (*zs)); } static void dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + ASSERT(MUTEX_HELD(&zf->zf_lock)); list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zs_lock); - kmem_free(zs, sizeof (*zs)); + zf->zf_numstreams--; + membar_producer(); + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) + dmu_zfetch_stream_fini(zs); } /* @@ -131,14 +184,12 @@ dmu_zfetch_fini(zfetch_t *zf) { zstream_t *zs; - ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); - - rw_enter(&zf->zf_rwlock, RW_WRITER); + mutex_enter(&zf->zf_lock); while ((zs = list_head(&zf->zf_stream)) != NULL) dmu_zfetch_stream_remove(zf, zs); - rw_exit(&zf->zf_rwlock); + mutex_exit(&zf->zf_lock); list_destroy(&zf->zf_stream); - rw_destroy(&zf->zf_rwlock); + mutex_destroy(&zf->zf_lock); zf->zf_dnode = NULL; } @@ -153,9 +204,9 @@ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs_next; - int numstreams = 0; + hrtime_t now = gethrtime(); - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + ASSERT(MUTEX_HELD(&zf->zf_lock)); /* * Clean up old streams. 
@@ -163,11 +214,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) for (zstream_t *zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); - if (((gethrtime() - zs->zs_atime) / NANOSEC) > + /* + * Skip if still active. 1 -- zf_stream reference. + */ + if (zfs_refcount_count(&zs->zs_refs) != 1) + continue; + if (((now - zs->zs_atime) / NANOSEC) > zfetch_min_sec_reap) dmu_zfetch_stream_remove(zf, zs); - else - numstreams++; } /* @@ -181,62 +235,94 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / zfetch_max_distance)); - if (numstreams >= max_streams) { + if (zf->zf_numstreams >= max_streams) { ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); zs->zs_blkid = blkid; + zs->zs_pf_blkid1 = blkid; zs->zs_pf_blkid = blkid; + zs->zs_ipf_blkid1 = blkid; zs->zs_ipf_blkid = blkid; - zs->zs_atime = gethrtime(); - mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); - + zs->zs_atime = now; + zs->zs_fetch = zf; + zs->zs_missed = B_FALSE; + zfs_refcount_create(&zs->zs_callers); + zfs_refcount_create(&zs->zs_refs); + /* One reference for zf_stream. */ + zfs_refcount_add(&zs->zs_refs, NULL); + zf->zf_numstreams++; list_insert_head(&zf->zf_stream, zs); } +static void +dmu_zfetch_stream_done(void *arg, boolean_t io_issued) +{ + zstream_t *zs = arg; + + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) + dmu_zfetch_stream_fini(zs); +} + /* - * This is the predictive prefetch entry point. It associates dnode access - * specified with blkid and nblks arguments with prefetch stream, predicts - * further accesses based on that stats and initiates speculative prefetch. + * This is the predictive prefetch entry point. 
dmu_zfetch_prepare() + * associates dnode access specified with blkid and nblks arguments with + * prefetch stream, predicts further accesses based on that stats and returns + * the stream pointer on success. That pointer must later be passed to + * dmu_zfetch_run() to initiate the speculative prefetch for the stream and + * release it. dmu_zfetch() is a wrapper for simple cases when window between + * prediction and prefetch initiation is not needed. * fetch_data argument specifies whether actual data blocks should be fetched: * FALSE -- prefetch only indirect blocks for predicted data blocks; * TRUE -- prefetch predicted data blocks plus following indirect blocks. */ -void -dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) +zstream_t * +dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, + boolean_t fetch_data, boolean_t have_lock) { zstream_t *zs; - int64_t pf_start, ipf_start, ipf_istart, ipf_iend; + int64_t pf_start, ipf_start; int64_t pf_ahead_blks, max_blks; - int epbs, max_dist_blks, pf_nblks, ipf_nblks; - uint64_t end_of_access_blkid; + int max_dist_blks, pf_nblks, ipf_nblks; + uint64_t end_of_access_blkid, maxblkid; end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; - krw_t rw = RW_READER; if (zfs_prefetch_disable) - return; + return (NULL); /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on * concrete vdevs (or previously-loaded indirect vdevs). So we * can't allow the predictive prefetcher to attempt reads of other - * blocks (e.g. of the MOS's dnode obejct). + * blocks (e.g. of the MOS's dnode object). */ if (!spa_indirect_vdevs_loaded(spa)) - return; + return (NULL); /* * As a fast path for small (single-block) files, ignore access * to the first block. 
*/ - if (blkid == 0) - return; + if (!have_lock && blkid == 0) + return (NULL); -retry: - rw_enter(&zf->zf_rwlock, rw); + if (!have_lock) + rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); + + /* + * A fast path for small files for which no prefetch will + * happen. + */ + maxblkid = zf->zf_dnode->dn_maxblkid; + if (maxblkid < 2) { + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return (NULL); + } + mutex_enter(&zf->zf_lock); /* * Find matching prefetch stream. Depending on whether the accesses @@ -245,44 +331,47 @@ retry: */ for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { - if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) { - mutex_enter(&zs->zs_lock); - /* - * zs_blkid could have changed before we - * acquired zs_lock; re-check them here. - */ - if (blkid == zs->zs_blkid) { - break; - } else if (blkid + 1 == zs->zs_blkid) { - blkid++; - nblks--; - if (nblks == 0) { - /* Already prefetched this before. */ - mutex_exit(&zs->zs_lock); - rw_exit(&zf->zf_rwlock); - return; - } - break; - } - mutex_exit(&zs->zs_lock); + if (blkid == zs->zs_blkid) { + break; + } else if (blkid + 1 == zs->zs_blkid) { + blkid++; + nblks--; + break; } } + /* + * If the file is ending, remove the matching stream if found. + * If not found then it is too late to create a new one now. + */ + if (end_of_access_blkid >= maxblkid) { + if (zs != NULL) + dmu_zfetch_stream_remove(zf, zs); + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return (NULL); + } + + /* Exit if we already prefetched this block before. */ + if (nblks == 0) { + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return (NULL); + } + if (zs == NULL) { /* * This access is not part of any existing stream. Create * a new stream for it. 
*/ - ZFETCHSTAT_BUMP(zfetchstat_misses); - if (rw == RW_READER && !rw_tryupgrade(&zf->zf_rwlock)) { - rw_exit(&zf->zf_rwlock); - rw = RW_WRITER; - goto retry; - } - dmu_zfetch_stream_create(zf, end_of_access_blkid); - rw_exit(&zf->zf_rwlock); - return; + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + ZFETCHSTAT_BUMP(zfetchstat_misses); + return (NULL); } /* @@ -296,6 +385,10 @@ retry: * start just after the block we just accessed. */ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); + if (zs->zs_pf_blkid1 < end_of_access_blkid) + zs->zs_pf_blkid1 = end_of_access_blkid; + if (zs->zs_ipf_blkid1 < end_of_access_blkid) + zs->zs_ipf_blkid1 = end_of_access_blkid; /* * Double our amount of prefetched data, but don't let the @@ -334,52 +427,125 @@ retry: * (i.e. the amount read now + the amount of data prefetched now). */ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; - max_blks = max_dist_blks - (ipf_start - end_of_access_blkid); + max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid); ipf_nblks = MIN(pf_ahead_blks, max_blks); zs->zs_ipf_blkid = ipf_start + ipf_nblks; - epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; - ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; - ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; - - zs->zs_atime = gethrtime(); zs->zs_blkid = end_of_access_blkid; - mutex_exit(&zs->zs_lock); - rw_exit(&zf->zf_rwlock); + /* Protect the stream from reclamation. */ + zs->zs_atime = gethrtime(); + zfs_refcount_add(&zs->zs_refs, NULL); + /* Count concurrent callers. */ + zfs_refcount_add(&zs->zs_callers, NULL); + mutex_exit(&zf->zf_lock); - /* - * dbuf_prefetch() is asynchronous (even when it needs to read - * indirect blocks), but we still prefer to drop our locks before - * calling it to reduce the time we hold them. 
- */ + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); - for (int i = 0; i < pf_nblks; i++) { - dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); - } - for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { - dbuf_prefetch(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); - } ZFETCHSTAT_BUMP(zfetchstat_hits); + return (zs); +} + +void +dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) +{ + zfetch_t *zf = zs->zs_fetch; + int64_t pf_start, pf_end, ipf_start, ipf_end; + int epbs, issued; + + if (missed) + zs->zs_missed = missed; + + /* + * Postpone the prefetch if there are more concurrent callers. + * It happens when multiple requests are waiting for the same + * indirect block. The last one will run the prefetch for all. + */ + if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { + /* Drop reference taken in dmu_zfetch_prepare(). */ + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) + dmu_zfetch_stream_fini(zs); + return; + } + + mutex_enter(&zf->zf_lock); + if (zs->zs_missed) { + pf_start = zs->zs_pf_blkid1; + pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid; + } else { + pf_start = pf_end = 0; + } + ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1); + ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid; + mutex_exit(&zf->zf_lock); + ASSERT3S(pf_start, <=, pf_end); + ASSERT3S(ipf_start, <=, ipf_end); + + epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; + ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; + ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; + ASSERT3S(ipf_start, <=, ipf_end); + issued = pf_end - pf_start + ipf_end - ipf_start; + if (issued > 1) { + /* More references on top of taken in dmu_zfetch_prepare(). */ + for (int i = 0; i < issued - 1; i++) + zfs_refcount_add(&zs->zs_refs, NULL); + } else if (issued == 0) { + /* Some other thread has done our work, so drop the ref. 
*/ + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) + dmu_zfetch_stream_fini(zs); + return; + } + + if (!have_lock) + rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); + + issued = 0; + for (int64_t blk = pf_start; blk < pf_end; blk++) { + issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); + } + for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { + issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); + } + + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + + if (issued) + ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); +} + +void +dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, + boolean_t missed, boolean_t have_lock) +{ + zstream_t *zs; + + zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); + if (zs) + dmu_zfetch_run(zs, missed, have_lock); } -#if defined(_KERNEL) /* BEGIN CSTYLED */ -module_param(zfs_prefetch_disable, int, 0644); -MODULE_PARM_DESC(zfs_prefetch_disable, "Disable all ZFS prefetching"); +ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, + "Disable all ZFS prefetching"); -module_param(zfetch_max_streams, uint, 0644); -MODULE_PARM_DESC(zfetch_max_streams, "Max number of streams per zfetch"); +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, + "Max number of streams per zfetch"); -module_param(zfetch_min_sec_reap, uint, 0644); -MODULE_PARM_DESC(zfetch_min_sec_reap, "Min time before stream reclaim"); +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, + "Min time before stream reclaim"); -module_param(zfetch_max_distance, uint, 0644); -MODULE_PARM_DESC(zfetch_max_distance, - "Max bytes to prefetch per stream (default 8MB)"); +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, + "Max bytes to prefetch per stream"); 
-module_param(zfetch_array_rd_sz, ulong, 0644); -MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read"); +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, + "Max bytes to prefetch indirects for per stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW, + "Number of bytes in a array_read"); /* END CSTYLED */ -#endif diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index c06f614e19..6f87f49f89 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include dnode_stats_t dnode_stats = { @@ -55,7 +55,6 @@ dnode_stats_t dnode_stats = { { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_allocate", KSTAT_DATA_UINT64 }, { "dnode_reallocate", KSTAT_DATA_UINT64 }, @@ -75,7 +74,7 @@ dnode_stats_t dnode_stats = { static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; -ASSERTV(static dnode_phys_t dnode_phys_zero); +static dnode_phys_t dnode_phys_zero __maybe_unused; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; @@ -90,11 +89,11 @@ dbuf_compare(const void *x1, const void *x2) const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; - int cmp = AVL_CMP(d1->db_level, d2->db_level); + int cmp = TREE_CMP(d1->db_level, d2->db_level); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + cmp = TREE_CMP(d1->db_blkid, d2->db_blkid); if (likely(cmp)) return (cmp); @@ -106,7 
+105,7 @@ dbuf_compare(const void *x1, const void *x2) return (1); } - return (AVL_PCMP(d1, d2)); + return (TREE_PCMP(d1, d2)); } /* ARGSUSED */ @@ -120,6 +119,7 @@ dnode_cons(void *arg, void *unused, int kmflag) mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is @@ -129,6 +129,7 @@ dnode_cons(void *arg, void *unused, int kmflag) zfs_refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); + bzero(&dn->dn_next_type[0], sizeof (dn->dn_next_type)); bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); @@ -184,6 +185,7 @@ dnode_dest(void *arg, void *unused) mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); + cv_destroy(&dn->dn_nodnholds); zfs_refcount_destroy(&dn->dn_holds); zfs_refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); @@ -390,6 +392,14 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + + if (newsize < dn->dn_bonuslen) { + /* clear any data after the end of the new size */ + size_t diff = dn->dn_bonuslen - newsize; + char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize; + bzero(data_end, diff); + } + dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; @@ -439,7 +449,6 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dnode_t *dn; dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); - ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; /* @@ -535,10 +544,7 @@ 
dnode_destroy(dnode_t *dn) dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; - if (dn->dn_dirtyctx_firstset != NULL) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } + dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); @@ -587,7 +593,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", - dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); + dn->dn_objset, (u_longlong_t)dn->dn_object, + (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots); DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); @@ -604,7 +611,6 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); @@ -643,10 +649,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; - if (dn->dn_dirtyctx_firstset) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } + dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirty_txg = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -752,7 +756,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); - ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); /* Copy fields. 
*/ ndn->dn_objset = odn->dn_objset; @@ -820,9 +823,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_newgid = odn->dn_newgid; ndn->dn_newprojid = odn->dn_newprojid; ndn->dn_id_flags = odn->dn_id_flags; - dmu_zfetch_init(&ndn->dn_zfetch, NULL); - list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); - ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + dmu_zfetch_init(&ndn->dn_zfetch, ndn); /* * Update back pointers. Updating the handle fixes the back pointer of @@ -830,9 +831,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) */ ASSERT(ndn->dn_handle->dnh_dnode == odn); ndn->dn_handle->dnh_dnode = ndn; - if (ndn->dn_zfetch.zf_dnode == odn) { - ndn->dn_zfetch.zf_dnode = ndn; - } /* * Invalidate the original dnode by clearing all of its back pointers. @@ -998,7 +996,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) */ refcount = zfs_refcount_count(&odn->dn_holds); ASSERT(refcount >= 0); - dbufs = odn->dn_dbufs_count; + dbufs = DN_DBUFS_COUNT(odn); /* We can't have more dbufs than dnode holds. */ ASSERT3U(dbufs, <=, refcount); @@ -1025,7 +1023,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) list_link_replace(&odn->dn_link, &ndn->dn_link); /* If the dnode was safe to move, the refcount cannot have changed. */ ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds)); - ASSERT(dbufs == ndn->dn_dbufs_count); + ASSERT(dbufs == DN_DBUFS_COUNT(ndn)); zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ mutex_exit(&os->os_lock); @@ -1171,13 +1169,15 @@ dnode_special_close(dnode_handle_t *dnh) dnode_t *dn = dnh->dnh_dnode; /* - * Wait for final references to the dnode to clear. This can - * only happen if the arc is asynchronously evicting state that - * has a hold on this dnode while we are trying to evict this - * dnode. 
+ * Ensure dnode_rele_and_unlock() has released dn_mtx, after final + * zfs_refcount_remove() */ - while (zfs_refcount_count(&dn->dn_holds) > 0) - delay(1); + mutex_enter(&dn->dn_mtx); + if (zfs_refcount_count(&dn->dn_holds) > 0) + cv_wait(&dn->dn_nodnholds, &dn->dn_mtx); + mutex_exit(&dn->dn_mtx); + ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0); + ASSERT(dn->dn_dbuf == NULL || dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); @@ -1193,7 +1193,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_t *dn; zrl_init(&dnh->dnh_zrlock); - zrl_tryenter(&dnh->dnh_zrlock); + VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock)); dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); @@ -1255,6 +1255,10 @@ dnode_buf_evict_async(void *dbu) * as an extra dnode slot by an large dnode, in which case it returns * ENOENT. * + * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just + * return whether the hold would succeed or not. tag and dnp should set to + * NULL in this case. + * * errors: * EINVAL - Invalid object number or flags. 
* ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) @@ -1283,6 +1287,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); + IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL)); /* * If you are holding the spa config lock as writer, you shouldn't @@ -1312,8 +1317,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); - (void) zfs_refcount_add(&dn->dn_holds, tag); - *dnp = dn; + /* Don't actually hold if dry run, just return 0 */ + if (!(flag & DNODE_DRY_RUN)) { + (void) zfs_refcount_add(&dn->dn_holds, tag); + *dnp = dn; + } return (0); } @@ -1331,7 +1339,6 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); - db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); @@ -1344,7 +1351,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * We do not need to decrypt to read the dnode so it doesn't matter * if we get the encrypted or decrypted version. 
*/ - err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT); + err = dbuf_read(db, NULL, DB_RF_CANFAIL | + DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); if (err) { DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); @@ -1455,6 +1463,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, return (SET_ERROR(ENOENT)); } + /* Don't actually hold if dry run, just return 0 */ + if (flag & DNODE_DRY_RUN) { + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (0); + } + DNODE_STAT_BUMP(dnode_hold_alloc_hits); } else if (flag & DNODE_MUST_BE_FREE) { @@ -1512,6 +1528,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, return (SET_ERROR(EEXIST)); } + /* Don't actually hold if dry run, just return 0 */ + if (flag & DNODE_DRY_RUN) { + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (0); + } + dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); DNODE_STAT_BUMP(dnode_hold_free_hits); } else { @@ -1519,15 +1543,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, return (SET_ERROR(EINVAL)); } - if (dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_txg); - type = dn->dn_type; - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? 
- ENOENT : EEXIST)); - } + ASSERT0(dn->dn_free_txg); if (zfs_refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); @@ -1538,6 +1554,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); DNODE_VERIFY(dn); + ASSERT3P(dnp, !=, NULL); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); dbuf_rele(db, FTAG); @@ -1590,7 +1607,10 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) dnode_handle_t *dnh = dn->dn_handle; refs = zfs_refcount_remove(&dn->dn_holds, tag); + if (refs == 0) + cv_broadcast(&dn->dn_nodnholds); mutex_exit(&dn->dn_mtx); + /* dnode could get destroyed at this point, so don't use it anymore */ /* * It's unsafe to release the last hold on a dnode by dnode_rele() or @@ -1618,6 +1638,36 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) } } +/* + * Test whether we can create a dnode at the specified location. + */ +int +dnode_try_claim(objset_t *os, uint64_t object, int slots) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN, + slots, NULL, NULL)); +} + +/* + * Checks if the dnode contains any uncommitted dirty records. 
+ */ +boolean_t +dnode_is_dirty(dnode_t *dn) +{ + mutex_enter(&dn->dn_mtx); + + for (int i = 0; i < TXG_SIZE; i++) { + if (list_head(&dn->dn_dirty_records[i]) != NULL) { + mutex_exit(&dn->dn_mtx); + return (B_TRUE); + } + } + + mutex_exit(&dn->dn_mtx); + + return (B_FALSE); +} + void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { @@ -1643,7 +1693,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); - multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; + multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK]; multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); /* @@ -1662,7 +1712,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", - dn->dn_object, txg); + (u_longlong_t)dn->dn_object, (u_longlong_t)txg); multilist_sublist_insert_head(mls, dn); @@ -1742,10 +1792,11 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) + if (err == 0) { dbuf_new_size(db, size, tx); - else if (err != ENOENT) + } else if (err != ENOENT) { goto fail; + } dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); @@ -1754,7 +1805,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } - /* rele after we have fixed the blocksize in the dnode */ + /* release after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); @@ -1777,6 +1828,7 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(new_nlevels, >, dn->dn_nlevels); dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); @@ -1794,10 +1846,12 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) list = &dn->dn_dirty_records[txgoff]; 
for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && + + IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1); + if (dr->dr_dbuf == NULL || + (dr->dr_dbuf->db_level == old_nlevels - 1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; @@ -1915,18 +1969,20 @@ static void dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { - dmu_buf_impl_t db_search; + dmu_buf_impl_t *db_search; dmu_buf_impl_t *db; avl_index_t where; + db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP); + mutex_enter(&dn->dn_dbufs_mtx); - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; + db_search->db_level = 1; + db_search->db_blkid = start_blkid + 1; + db_search->db_state = DB_SEARCH; for (;;) { - db = avl_find(&dn->dn_dbufs, &db_search, &where); + db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); @@ -1938,7 +1994,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, /* * Setup the next blkid we want to search for. */ - db_search.db_blkid = db->db_blkid + 1; + db_search->db_blkid = db->db_blkid + 1; ASSERT3U(db->db_blkid, >=, start_blkid); /* @@ -1958,10 +2014,10 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, /* * Walk all the in-core level-1 dbufs and verify they have been dirtied. 
*/ - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; - db = avl_find(&dn->dn_dbufs, &db_search, &where); + db_search->db_level = 1; + db_search->db_blkid = start_blkid + 1; + db_search->db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { @@ -1971,19 +2027,75 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, ASSERT(db->db_dirtycnt > 0); } #endif + kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } +void +dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag) +{ + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + + if (ds != NULL) { + rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); + } + if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + if (dmu_tx_is_syncing(tx)) + dn->dn_dirtyctx = DN_DIRTY_SYNC; + else + dn->dn_dirtyctx = DN_DIRTY_OPEN; + dn->dn_dirtyctx_firstset = tag; + } + if (ds != NULL) { + rrw_exit(&ds->ds_bp_rwlock, tag); + } + } +} + +static void +dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int res; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE, + FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { + db_lock_type_t dblt; + boolean_t dirty; + + dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + /* don't dirty if not on disk and not dirty */ + dirty = !list_is_empty(&db->db_dirty_records) || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, dblt, FTAG); + if (dirty) { + caddr_t data; + + dmu_buf_will_dirty(&db->db, tx); + data = db->db.db_data; + bzero(data + blkoff, len); + } + 
dbuf_rele(db, FTAG); + } +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { - dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; int blksz, blkshift, head, tail; int trunc = FALSE; int epbs; - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -2000,7 +2112,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) - goto out; + return; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { @@ -2009,12 +2121,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) */ blkid = 0; nblks = 1; - if (dn->dn_nlevels > 1) + if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_dirty_l1(dn, 0, tx); + rw_exit(&dn->dn_struct_rwlock); + } goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ - goto out; + return; } else { /* Freeing part of the block. 
*/ head = blksz - off; @@ -2027,32 +2142,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), - TRUE, FALSE, FTAG, &db) == 0) { - caddr_t data; - - /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - data = db->db.db_data; - bzero(data + blkoff, head); - } - dbuf_rele(db, FTAG); - } + dnode_partial_zero(dn, off, blkoff, head, tx); off += head; len -= head; } /* If the range was less than one block, we're done */ if (len == 0) - goto out; + return; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) - goto out; + return; ASSERT(ISP2(blksz)); if (trunc) @@ -2065,24 +2166,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (tail) { if (len < tail) tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), - TRUE, FALSE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); - } + dnode_partial_zero(dn, off + len, 0, tail, tx); len -= tail; } /* If the range did not include a full block, we are done */ if (len == 0) - goto out; + return; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); @@ -2112,6 +2202,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) * amount of space if we copy the freed BPs into deadlists. 
*/ if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); uint64_t first, last; first = blkid >> epbs; @@ -2156,6 +2247,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) dnode_dirty_l1(dn, i, tx); } + rw_exit(&dn->dn_struct_rwlock); } done: @@ -2165,22 +2257,21 @@ done: */ mutex_enter(&dn->dn_mtx); { - int txgoff = tx->tx_txg & TXG_MASK; - if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); - } - range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); - range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] == NULL) { + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); + } + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); + range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); } dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", - blkid, nblks, tx->tx_txg); + (u_longlong_t)blkid, (u_longlong_t)nblks, + (u_longlong_t)tx->tx_txg); mutex_exit(&dn->dn_mtx); dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); -out: - - rw_exit(&dn->dn_struct_rwlock); } static boolean_t @@ -2289,6 +2380,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, boolean_t hole; int i, inc, error, span; + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? 
-1 : 1; ASSERT(txg == 0 || !hole); @@ -2315,15 +2408,16 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, - DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT); + DB_RF_CANFAIL | DB_RF_HAVESTRUCT | + DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); if (error) { dbuf_rele(db, FTAG); return (error); } data = db->db.db_data; + rw_enter(&db->db_rwlock, RW_READER); } - if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { @@ -2396,8 +2490,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = SET_ERROR(ESRCH); } - if (db) + if (db != NULL) { + rw_exit(&db->db_rwlock); dbuf_rele(db, FTAG); + } return (error); } @@ -2483,3 +2579,13 @@ out: return (error); } + +#if defined(_KERNEL) +EXPORT_SYMBOL(dnode_hold); +EXPORT_SYMBOL(dnode_rele); +EXPORT_SYMBOL(dnode_set_nlevels); +EXPORT_SYMBOL(dnode_set_blksz); +EXPORT_SYMBOL(dnode_free_range); +EXPORT_SYMBOL(dnode_evict_dbufs); +EXPORT_SYMBOL(dnode_evict_bonus); +#endif diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 581f812a14..dd37e3af7e 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -21,8 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
+ * Copyright 2020 Oxide Computer Company */ #include @@ -51,7 +52,6 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); @@ -59,10 +59,26 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) dn->dn_phys->dn_nlevels = new_level; dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, - dn->dn_object, dn->dn_phys->dn_nlevels); + (u_longlong_t)dn->dn_object, dn->dn_phys->dn_nlevels); + + /* + * Lock ordering requires that we hold the children's db_mutexes (by + * calling dbuf_find()) before holding the parent's db_rwlock. The lock + * order is imposed by dbuf_read's steps of "grab the lock to protect + * db_parent, get db_parent, hold db_parent's db_rwlock". + */ + dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; + ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); + for (i = 0; i < nblkptr; i++) { + children[i] = + dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + } /* transfer dnode's block pointers to new indirect block */ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + if (dn->dn_dbuf != NULL) + rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); @@ -72,12 +88,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + dmu_buf_impl_t *child = children[i]; if (child == NULL) continue; -#ifdef DEBUG +#ifdef ZFS_DEBUG DB_DNODE_ENTER(child); ASSERT3P(DB_DNODE(child), ==, dn); DB_DNODE_EXIT(child); @@ -106,6 +121,10 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) 
bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + rw_exit(&db->db_rwlock); + if (dn->dn_dbuf != NULL) + rw_exit(&dn->dn_dbuf->db_rwlock); + dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); @@ -117,7 +136,8 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); + dprintf("ds=%p obj=%llx num=%d\n", ds, (u_longlong_t)dn->dn_object, + num); for (int i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) @@ -182,17 +202,14 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_level == 1); rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, + err = dbuf_hold_impl(dn, db->db_level - 1, (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); ASSERT(child->db_level == 0); - dr = child->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; - ASSERT(dr == NULL || dr->dr_txg == txg); + dr = dbuf_find_dirty_eq(child, txg); /* data_old better be zeroed */ if (dr) { @@ -213,7 +230,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && - child->db_last_dirty == NULL) { + list_is_empty(&child->db_dirty_records)) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -280,7 +297,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, * ancestor of the first or last block to be freed. The first and * last L1 indirect blocks are always dirtied by dnode_free_range(). 
*/ + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); + dmu_buf_unlock_parent(db, dblt, FTAG); dbuf_release_bp(db); bp = db->db.db_data; @@ -306,7 +325,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + rw_enter(&db->db_rwlock, RW_WRITER); + free_blocks(dn, bp, end - start + 1, tx); + rw_exit(&db->db_rwlock); } else { for (uint64_t id = start; id <= end; id++, bp++) { if (BP_IS_HOLE(bp)) @@ -323,10 +344,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, } if (free_indirects) { + rw_enter(&db->db_rwlock, RW_WRITER); for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); @@ -378,18 +401,31 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } - if (trunc) { - ASSERTV(uint64_t off); + /* + * Do not truncate the maxblkid if we are performing a raw + * receive. The raw receive sets the maxblkid manually and + * must not be overridden. Usually, the last DRR_FREE record + * will be at the maxblkid, because the source system sets + * the maxblkid when truncating. However, if the last block + * was freed by overwriting with zeros and being compressed + * away to a hole, the source system will generate a DRR_FREE + * record while leaving the maxblkid after the end of that + * record. In this case we need to leave the maxblkid as + * indicated in the DRR_OBJECT record, so that it matches the + * source system, ensuring that the cryptographic hashes will + * match. 
+ */ + if (trunc && !dn->dn_objset->os_raw_receive) { + uint64_t off __maybe_unused; dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; - ASSERTV(off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT)); + off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); @@ -428,7 +464,7 @@ dnode_evict_dbufs(dnode_t *dn) mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { -#ifdef DEBUG +#ifdef ZFS_DEBUG DB_DNODE_ENTER(db); ASSERT3P(DB_DNODE(db), ==, dn); DB_DNODE_EXIT(db); @@ -504,8 +540,9 @@ dnode_undirty_dbufs(list_t *list) mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? */ list_remove(list, dr); - ASSERT(db->db_last_dirty == dr); - db->db_last_dirty = NULL; + ASSERT(list_head(&db->db_dirty_records) == dr); + list_remove_head(&db->db_dirty_records); + ASSERT(list_is_empty(&db->db_dirty_records)); db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || @@ -591,7 +628,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; - ASSERTV(static const dnode_phys_t zerodn = { 0 }); + static const dnode_phys_t zerodn __maybe_unused = { 0 }; boolean_t kill_spill = B_FALSE; ASSERT(dmu_tx_is_syncing(tx)); @@ -727,13 +764,22 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; dsfra.dsfra_free_indirects = freeing_dnode; + mutex_enter(&dn->dn_mtx); if (freeing_dnode) { ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], 0, dn->dn_maxblkid + 1)); } - mutex_enter(&dn->dn_mtx); - range_tree_vacate(dn->dn_free_ranges[txgoff], + /* + * Because dnode_sync_free_range() must drop dn_mtx during its + * processing, using it as a callback to range_tree_vacate() is + * not safe. 
No other operations (besides destroy) are allowed + * once range_tree_vacate() has begun, and dropping dn_mtx + * would leave a window open for another thread to observe that + * invalid (and unsafe) state. + */ + range_tree_walk(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); + range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL); range_tree_destroy(dn->dn_free_ranges[txgoff]); dn->dn_free_ranges[txgoff] = NULL; mutex_exit(&dn->dn_mtx); @@ -806,6 +852,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet - * initiated the IO for the dnode's dbuf. + * initiated the IO for the dnode's dbuf. Additionally, the caller + * has already added a reference to the dnode because it's on the + * os_synced_dnodes list. */ } diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index a32198402f..bead7da223 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -14,8 +14,9 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright 2019, 2020 by Christian Schwarz. All rights reserved. */ #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +33,7 @@ #include #include #include +#include static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, @@ -53,14 +56,19 @@ dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, } /* + * When reading BOOKMARK_V1 bookmarks, the BOOKMARK_V2 fields are guaranteed + * to be zeroed. + * * Returns ESRCH if bookmark is not found. + * Note, we need to use the ZAP rather than the AVL to look up bookmarks + * by name, because only the ZAP honors the casesensitivity setting. 
*/ -static int -dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, +int +dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname, zfs_bookmark_phys_t *bmark_phys) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; int err; @@ -77,15 +85,16 @@ dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, bzero(bmark_phys, sizeof (*bmark_phys)); err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), - sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, - NULL, 0, NULL); + sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, + NULL); - return (err == ENOENT ? ESRCH : err); + return (err == ENOENT ? SET_ERROR(ESRCH) : err); } /* - * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark - * does not represents an earlier point in later_ds's timeline. + * If later_ds is non-NULL, this will return EXDEV if the specified bookmark + * does not represents an earlier point in later_ds's timeline. However, + * bmp will still be filled in if we return EXDEV. * * Returns ENOENT if the dataset containing the bookmark does not exist. * Returns ESRCH if the dataset exists but the bookmark was not found in it. 
@@ -102,7 +111,7 @@ dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, if (error != 0) return (error); - error = dsl_dataset_bmark_lookup(ds, shortname, bmp); + error = dsl_bookmark_lookup_impl(ds, shortname, bmp); if (error == 0 && later_ds != NULL) { if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) error = SET_ERROR(EXDEV); @@ -111,148 +120,491 @@ dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, return (error); } -typedef struct dsl_bookmark_create_arg { - nvlist_t *dbca_bmarks; - nvlist_t *dbca_errors; -} dsl_bookmark_create_arg_t; - +/* + * Validates that + * - bmark is a full dataset path of a bookmark (bookmark_namecheck) + * - source is a full path of a snapshot or bookmark + * ({bookmark,snapshot}_namecheck) + * + * Returns 0 if valid, -1 otherwise. + */ static int -dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, - dmu_tx_t *tx) +dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source) { - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *bmark_fs; - char *shortname; + if (bookmark_namecheck(bmark, NULL, NULL) != 0) + return (-1); + + int is_bmark, is_snap; + is_bmark = bookmark_namecheck(source, NULL, NULL) == 0; + is_snap = snapshot_namecheck(source, NULL, NULL) == 0; + if (!is_bmark && !is_snap) + return (-1); + + return (0); +} + +/* + * Check that the given nvlist corresponds to the following schema: + * { newbookmark -> source, ... } + * where + * - each pair passes dsl_bookmark_create_nvl_validate_pair + * - all newbookmarks are in the same pool + * - all newbookmarks have unique names + * + * Note that this function is only validates above schema. Callers must ensure + * that the bookmarks can be created, e.g. that sources exist. + * + * Returns 0 if the nvlist adheres to above schema. + * Returns -1 if it doesn't. 
+ */ +int +dsl_bookmark_create_nvl_validate(nvlist_t *bmarks) +{ + char *first; + size_t first_len; + + first = NULL; + for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { + + char *bmark = nvpair_name(pair); + char *source; + + /* list structure: values must be snapshots XOR bookmarks */ + if (nvpair_value_string(pair, &source) != 0) + return (-1); + if (dsl_bookmark_create_nvl_validate_pair(bmark, source) != 0) + return (-1); + + /* same pool check */ + if (first == NULL) { + char *cp = strpbrk(bmark, "/#"); + if (cp == NULL) + return (-1); + first = bmark; + first_len = cp - bmark; + } + if (strncmp(first, bmark, first_len) != 0) + return (-1); + switch (*(bmark + first_len)) { + case '/': /* fallthrough */ + case '#': + break; + default: + return (-1); + } + + /* unique newbookmark names; todo: O(n^2) */ + for (nvpair_t *pair2 = nvlist_next_nvpair(bmarks, pair); + pair2 != NULL; pair2 = nvlist_next_nvpair(bmarks, pair2)) { + if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) + return (-1); + } + + } + return (0); +} + +/* + * expects that newbm and source have been validated using + * dsl_bookmark_create_nvl_validate_pair + */ +static int +dsl_bookmark_create_check_impl(dsl_pool_t *dp, + const char *newbm, const char *source) +{ + ASSERT0(dsl_bookmark_create_nvl_validate_pair(newbm, source)); + /* defer source namecheck until we know it's a snapshot or bookmark */ + int error; + dsl_dataset_t *newbm_ds; + char *newbm_short; zfs_bookmark_phys_t bmark_phys; - if (!snapds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - error = dsl_bookmark_hold_ds(dp, bookmark_name, - &bmark_fs, FTAG, &shortname); + error = dsl_bookmark_hold_ds(dp, newbm, &newbm_ds, FTAG, &newbm_short); if (error != 0) return (error); - if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) { - dsl_dataset_rele(bmark_fs, FTAG); - return (SET_ERROR(EINVAL)); + /* Verify that the new bookmark does not already exist */ + error = 
dsl_bookmark_lookup_impl(newbm_ds, newbm_short, &bmark_phys); + switch (error) { + case ESRCH: + /* happy path: new bmark doesn't exist, proceed after switch */ + error = 0; + break; + case 0: + error = SET_ERROR(EEXIST); + goto eholdnewbmds; + default: + /* dsl_bookmark_lookup_impl already did SET_ERROR */ + goto eholdnewbmds; } - error = dsl_dataset_bmark_lookup(bmark_fs, shortname, - &bmark_phys); - dsl_dataset_rele(bmark_fs, FTAG); - if (error == 0) - return (SET_ERROR(EEXIST)); - if (error == ESRCH) - return (0); + /* error is retval of the following if-cascade */ + if (strchr(source, '@') != NULL) { + dsl_dataset_t *source_snap_ds; + ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0); + error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds); + if (error == 0) { + VERIFY(source_snap_ds->ds_is_snapshot); + /* + * Verify that source snapshot is an earlier point in + * newbm_ds's timeline (source may be newbm_ds's origin) + */ + if (!dsl_dataset_is_before(newbm_ds, source_snap_ds, 0)) + error = SET_ERROR( + ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR); + dsl_dataset_rele(source_snap_ds, FTAG); + } + } else if (strchr(source, '#') != NULL) { + zfs_bookmark_phys_t source_phys; + ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0); + /* + * Source must exists and be an earlier point in newbm_ds's + * timeline (newbm_ds's origin may be a snap of source's ds) + */ + error = dsl_bookmark_lookup(dp, source, newbm_ds, &source_phys); + switch (error) { + case 0: + break; /* happy path */ + case EXDEV: + error = SET_ERROR(ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR); + break; + default: + /* dsl_bookmark_lookup already did SET_ERROR */ + break; + } + } else { + /* + * dsl_bookmark_create_nvl_validate validates that source is + * either snapshot or bookmark + */ + panic("unreachable code: %s", source); + } + +eholdnewbmds: + dsl_dataset_rele(newbm_ds, FTAG); return (error); } -static int +int dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) { dsl_bookmark_create_arg_t 
*dbca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); int rv = 0; + int schema_err = 0; + ASSERT3P(dbca, !=, NULL); + ASSERT3P(dbca->dbca_bmarks, !=, NULL); + /* dbca->dbca_errors is allowed to be NULL */ + + dsl_pool_t *dp = dmu_tx_pool(tx); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); + if (dsl_bookmark_create_nvl_validate(dbca->dbca_bmarks) != 0) + rv = schema_err = SET_ERROR(EINVAL); + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t *snapds; - int error; + char *new = nvpair_name(pair); - /* note: validity of nvlist checked by ioctl layer */ - error = dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds); + int error = schema_err; if (error == 0) { - error = dsl_bookmark_create_check_impl(snapds, - nvpair_name(pair), tx); - dsl_dataset_rele(snapds, FTAG); + char *source = fnvpair_value_string(pair); + error = dsl_bookmark_create_check_impl(dp, new, source); + if (error != 0) + error = SET_ERROR(error); } + if (error != 0) { - fnvlist_add_int32(dbca->dbca_errors, - nvpair_name(pair), error); rv = error; + if (dbca->dbca_errors != NULL) + fnvlist_add_int32(dbca->dbca_errors, + new, error); } } return (rv); } -static void -dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +static dsl_bookmark_node_t * +dsl_bookmark_node_alloc(char *shortname) +{ + dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP); + dbn->dbn_name = spa_strdup(shortname); + dbn->dbn_dirty = B_FALSE; + mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL); + return (dbn); +} + +/* + * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot. 
+ */ +static void +dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) +{ + spa_t *spa = dsl_dataset_get_spa(snap); + objset_t *mos = spa_get_dsl(spa)->dp_meta_objset; + dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap); + zbm->zbm_guid = dsp->ds_guid; + zbm->zbm_creation_txg = dsp->ds_creation_txg; + zbm->zbm_creation_time = dsp->ds_creation_time; + zbm->zbm_redaction_obj = 0; + + /* + * If the dataset is encrypted create a larger bookmark to + * accommodate the IVset guid. The IVset guid was added + * after the encryption feature to prevent a problem with + * raw sends. If we encounter an encrypted dataset without + * an IVset guid we fall back to a normal bookmark. + */ + if (snap->ds_dir->dd_crypto_obj != 0 && + spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { + (void) zap_lookup(mos, snap->ds_object, + DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, + &zbm->zbm_ivset_guid); + } + + if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) { + zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN; + zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; + zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; + zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; + + dsl_dataset_t *nextds; + VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool, + dsp->ds_next_snap_obj, FTAG, &nextds)); + dsl_deadlist_space(&nextds->ds_deadlist, + &zbm->zbm_referenced_freed_before_next_snap, + &zbm->zbm_compressed_freed_before_next_snap, + &zbm->zbm_uncompressed_freed_before_next_snap); + dsl_dataset_rele(nextds, FTAG); + } else { + bzero(&zbm->zbm_flags, + sizeof (zfs_bookmark_phys_t) - + offsetof(zfs_bookmark_phys_t, zbm_flags)); + } +} + +/* + * Add dsl_bookmark_node_t `dbn` to the given dataset and increment appropriate + * SPA feature counters. 
+ */ +void +dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, + dmu_tx_t *tx) { - dsl_bookmark_create_arg_t *dbca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; - ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); + if (hds->ds_bookmarks_obj == 0) { + hds->ds_bookmarks_obj = zap_create_norm(mos, + U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, + tx); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + + dsl_dataset_zapify(hds, tx); + VERIFY0(zap_add(mos, hds->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (hds->ds_bookmarks_obj), 1, + &hds->ds_bookmarks_obj, tx)); + } + + avl_add(&hds->ds_bookmarks, dbn); + + /* + * To maintain backwards compatibility with software that doesn't + * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest + * possible bookmark size. + */ + uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1; + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) && + (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags & + ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) { + bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx); + } + + __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 }; + ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, + &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size)); + + VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name, + sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t), + &dbn->dbn_phys, tx)); +} + +/* + * If redaction_list is non-null, we create a redacted bookmark and redaction + * list, and store the object number of the redaction list in redact_obj. 
+ */ +static void +dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, + dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag, + redaction_list_t **redaction_list) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *snapds, *bmark_fs; + char *shortname; + boolean_t bookmark_redacted; + uint64_t *dsredactsnaps; + uint64_t dsnumsnaps; + + VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds)); + VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG, + &shortname)); + + dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname); + dsl_bookmark_set_phys(&dbn->dbn_phys, snapds); + + bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds, + SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); + if (redaction_list != NULL || bookmark_redacted) { + redaction_list_t *local_rl; + if (bookmark_redacted) { + redact_snaps = dsredactsnaps; + num_redact_snaps = dsnumsnaps; + } + dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t), tx); + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + + VERIFY0(dsl_redaction_list_hold_obj(dp, + dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); + dsl_redaction_list_long_hold(dp, local_rl, tag); + + ASSERT3U((local_rl)->rl_dbuf->db_size, >=, + sizeof (redaction_list_phys_t) + num_redact_snaps * + sizeof (uint64_t)); + dmu_buf_will_dirty(local_rl->rl_dbuf, tx); + bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps, + sizeof (uint64_t) * num_redact_snaps); + local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; + if (bookmark_redacted) { + ASSERT3P(redaction_list, ==, NULL); + local_rl->rl_phys->rlp_last_blkid = UINT64_MAX; + local_rl->rl_phys->rlp_last_object = UINT64_MAX; + dsl_redaction_list_long_rele(local_rl, tag); + dsl_redaction_list_rele(local_rl, tag); + } 
else { + *redaction_list = local_rl; + } + } + + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + + dsl_bookmark_node_add(bmark_fs, dbn, tx); + + spa_history_log_internal_ds(bmark_fs, "bookmark", tx, + "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu", + shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg, + (longlong_t)snapds->ds_object, + (longlong_t)dbn->dbn_phys.zbm_redaction_obj); + + dsl_dataset_rele(bmark_fs, FTAG); + dsl_dataset_rele(snapds, FTAG); +} + + +static void +dsl_bookmark_create_sync_impl_book( + const char *new_name, const char *source_name, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *bmark_fs_source, *bmark_fs_new; + char *source_shortname, *new_shortname; + zfs_bookmark_phys_t source_phys; + + VERIFY0(dsl_bookmark_hold_ds(dp, source_name, &bmark_fs_source, FTAG, + &source_shortname)); + VERIFY0(dsl_bookmark_hold_ds(dp, new_name, &bmark_fs_new, FTAG, + &new_shortname)); + + /* + * create a copy of the source bookmark by copying most of its members + * + * Caveat: bookmarking a redaction bookmark yields a normal bookmark + * ----------------------------------------------------------------- + * Reasoning: + * - The zbm_redaction_obj would be referred to by both source and new + * bookmark, but would be destroyed once either source or new is + * destroyed, resulting in use-after-free of the referred object. 
+ * - User expectation when issuing the `zfs bookmark` command is that + * a normal bookmark of the source is created + * + * Design Alternatives For Full Redaction Bookmark Copying: + * - reference-count the redaction object => would require on-disk + * format change for existing redaction objects + * - Copy the redaction object => cannot be done in syncing context + * because the redaction object might be too large + */ + + VERIFY0(dsl_bookmark_lookup_impl(bmark_fs_source, source_shortname, + &source_phys)); + dsl_bookmark_node_t *new_dbn = dsl_bookmark_node_alloc(new_shortname); + + memcpy(&new_dbn->dbn_phys, &source_phys, sizeof (source_phys)); + new_dbn->dbn_phys.zbm_redaction_obj = 0; + + /* update feature counters */ + if (new_dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + /* no need for redaction bookmark counter; nulled zbm_redaction_obj */ + /* dsl_bookmark_node_add bumps bookmarks and v2-bookmarks counter */ + + /* + * write new bookmark + * + * Note that dsl_bookmark_lookup_impl guarantees that, if source is a + * v1 bookmark, the v2-only fields are zeroed. + * And dsl_bookmark_node_add writes back a v1-sized bookmark if + * v2 bookmarks are disabled and/or v2-only fields are zeroed. 
+ * => bookmark copying works on pre-bookmark-v2 pools + */ + dsl_bookmark_node_add(bmark_fs_new, new_dbn, tx); + + spa_history_log_internal_ds(bmark_fs_source, "bookmark", tx, + "name=%s creation_txg=%llu source_guid=%llu", + new_shortname, (longlong_t)new_dbn->dbn_phys.zbm_creation_txg, + (longlong_t)source_phys.zbm_guid); + + dsl_dataset_rele(bmark_fs_source, FTAG); + dsl_dataset_rele(bmark_fs_new, FTAG); +} + +void +dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca = arg; + + ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa, + SPA_FEATURE_BOOKMARKS)); for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t *snapds, *bmark_fs; - zfs_bookmark_phys_t bmark_phys = { 0 }; - char *shortname; - uint32_t bmark_len = BOOKMARK_PHYS_SIZE_V1; - VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds)); - VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), - &bmark_fs, FTAG, &shortname)); - if (bmark_fs->ds_bookmarks == 0) { - bmark_fs->ds_bookmarks = - zap_create_norm(mos, U8_TEXTPREP_TOUPPER, - DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); - spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + char *new = nvpair_name(pair); + char *source = fnvpair_value_string(pair); - dsl_dataset_zapify(bmark_fs, tx); - VERIFY0(zap_add(mos, bmark_fs->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (bmark_fs->ds_bookmarks), 1, - &bmark_fs->ds_bookmarks, tx)); + if (strchr(source, '@') != NULL) { + dsl_bookmark_create_sync_impl_snap(new, source, tx, + 0, NULL, NULL, NULL); + } else if (strchr(source, '#') != NULL) { + dsl_bookmark_create_sync_impl_book(new, source, tx); + } else { + panic("unreachable code"); } - bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; - bmark_phys.zbm_creation_txg = - dsl_dataset_phys(snapds)->ds_creation_txg; - bmark_phys.zbm_creation_time = - dsl_dataset_phys(snapds)->ds_creation_time; - - /* - * If 
the dataset is encrypted create a larger bookmark to - * accommodate the IVset guid. The IVset guid was added - * after the encryption feature to prevent a problem with - * raw sends. If we encounter an encrypted dataset without - * an IVset guid we fall back to a normal bookmark. - */ - if (snapds->ds_dir->dd_crypto_obj != 0 && - spa_feature_is_enabled(dp->dp_spa, - SPA_FEATURE_BOOKMARK_V2)) { - int err = zap_lookup(mos, snapds->ds_object, - DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, - &bmark_phys.zbm_ivset_guid); - if (err == 0) { - bmark_len = BOOKMARK_PHYS_SIZE_V2; - spa_feature_incr(dp->dp_spa, - SPA_FEATURE_BOOKMARK_V2, tx); - } - } - - VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, - shortname, sizeof (uint64_t), - bmark_len / sizeof (uint64_t), &bmark_phys, tx)); - - spa_history_log_internal_ds(bmark_fs, "bookmark", tx, - "name=%s creation_txg=%llu target_snap=%llu", - shortname, - (longlong_t)bmark_phys.zbm_creation_txg, - (longlong_t)snapds->ds_object); - - dsl_dataset_rele(bmark_fs, FTAG); - dsl_dataset_rele(snapds, FTAG); } } @@ -277,58 +629,268 @@ dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); } +static int +dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int rv = 0; + + if (!spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS)) + return (SET_ERROR(ENOTSUP)); + /* + * If the list of redact snaps will not fit in the bonus buffer with + * the furthest reached object and offset, fail. 
+ */ + if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + return (SET_ERROR(E2BIG)); + + if (dsl_bookmark_create_nvl_validate_pair( + dbcra->dbcra_bmark, dbcra->dbcra_snap) != 0) + return (SET_ERROR(EINVAL)); + + rv = dsl_bookmark_create_check_impl(dp, + dbcra->dbcra_bmark, dbcra->dbcra_snap); + return (rv); +} + +static void +dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_bookmark_create_sync_impl_snap(dbcra->dbcra_bmark, + dbcra->dbcra_snap, tx, dbcra->dbcra_numsnaps, dbcra->dbcra_snaps, + dbcra->dbcra_tag, dbcra->dbcra_rl); +} + +int +dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot, + uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl) +{ + dsl_bookmark_create_redacted_arg_t dbcra; + + dbcra.dbcra_bmark = bookmark; + dbcra.dbcra_snap = snapshot; + dbcra.dbcra_rl = rl; + dbcra.dbcra_numsnaps = numsnaps; + dbcra.dbcra_snaps = snapguids; + dbcra.dbcra_tag = tag; + + return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check, + dsl_bookmark_create_redacted_sync, &dbcra, 5, + ZFS_SPACE_CHECK_NORMAL)); +} + +/* + * Retrieve the list of properties given in the 'props' nvlist for a bookmark. + * If 'props' is NULL, retrieves all properties. 
+ */ +static void +dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys, + nvlist_t *props, nvlist_t *out_props) +{ + ASSERT3P(dp, !=, NULL); + ASSERT3P(bmark_phys, !=, NULL); + ASSERT3P(out_props, !=, NULL); + ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock)); + + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_GUID, bmark_phys->zbm_guid); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATION))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATION, bmark_phys->zbm_creation_time); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid); + } + if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFERENCED))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_REFERENCED, + bmark_phys->zbm_referenced_bytes_refd); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_LOGICALREFERENCED, + bmark_phys->zbm_uncompressed_bytes_refd); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFRATIO))) { + uint64_t ratio = + bmark_phys->zbm_compressed_bytes_refd == 0 ? 
100 : + bmark_phys->zbm_uncompressed_bytes_refd * 100 / + bmark_phys->zbm_compressed_bytes_refd; + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_REFRATIO, ratio); + } + } + + if ((props == NULL || nvlist_exists(props, "redact_snaps") || + nvlist_exists(props, "redact_complete")) && + bmark_phys->zbm_redaction_obj != 0) { + redaction_list_t *rl; + int err = dsl_redaction_list_hold_obj(dp, + bmark_phys->zbm_redaction_obj, FTAG, &rl); + if (err == 0) { + if (nvlist_exists(props, "redact_snaps")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_uint64_array(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_snaps, + rl->rl_phys->rlp_num_snaps); + fnvlist_add_nvlist(out_props, "redact_snaps", + nvl); + nvlist_free(nvl); + } + if (nvlist_exists(props, "redact_complete")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_boolean_value(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_last_blkid == UINT64_MAX && + rl->rl_phys->rlp_last_object == UINT64_MAX); + fnvlist_add_nvlist(out_props, "redact_complete", + nvl); + nvlist_free(nvl); + } + dsl_redaction_list_rele(rl, FTAG); + } + } +} + int dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(dsl_pool_config_held(dp)); + + if (dsl_dataset_is_snapshot(ds)) + return (SET_ERROR(EINVAL)); + + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + nvlist_t *out_props = fnvlist_alloc(); + + dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props); + + fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props); + fnvlist_free(out_props); + } + return (0); +} + +/* + * Comparison func for ds_bookmarks AVL tree. We sort the bookmarks by + * their TXG, then by their FBN-ness. The "FBN-ness" component ensures + * that all bookmarks at the same TXG that HAS_FBN are adjacent, which + * dsl_bookmark_destroy_sync_impl() depends on. 
Note that there may be + * multiple bookmarks at the same TXG (with the same FBN-ness). In this + * case we differentiate them by an arbitrary metric (in this case, + * their names). + */ +static int +dsl_bookmark_compare(const void *l, const void *r) +{ + const dsl_bookmark_node_t *ldbn = l; + const dsl_bookmark_node_t *rdbn = r; + + int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg, + rdbn->dbn_phys.zbm_creation_txg); + if (likely(cmp)) + return (cmp); + cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), + (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); + if (likely(cmp)) + return (cmp); + cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name); + return (TREE_ISIGN(cmp)); +} + +/* + * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree. + */ +int +dsl_bookmark_init_ds(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + ASSERT(!ds->ds_is_snapshot); + + avl_create(&ds->ds_bookmarks, dsl_bookmark_compare, + sizeof (dsl_bookmark_node_t), + offsetof(dsl_bookmark_node_t, dbn_node)); + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, + sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj); + if (zaperr == ENOENT) + return (0); + if (zaperr != 0) + return (zaperr); + + if (ds->ds_bookmarks_obj == 0) + return (0); + int err = 0; zap_cursor_t zc; zap_attribute_t attr; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - uint64_t bmark_zapobj = ds->ds_bookmarks; - if (bmark_zapobj == 0) - return (0); - - for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); - zap_cursor_retrieve(&zc, &attr) == 0; + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + (err = zap_cursor_retrieve(&zc, &attr)) == 0; zap_cursor_advance(&zc)) { - char *bmark_name = attr.za_name; - zfs_bookmark_phys_t bmark_phys = { 0 }; + dsl_bookmark_node_t *dbn = + dsl_bookmark_node_alloc(attr.za_name); - err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); 
+ err = dsl_bookmark_lookup_impl(ds, + dbn->dbn_name, &dbn->dbn_phys); ASSERT3U(err, !=, ENOENT); - if (err != 0) + if (err != 0) { + kmem_free(dbn, sizeof (*dbn)); break; - - nvlist_t *out_props = fnvlist_alloc(); - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_GUID))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_GUID, bmark_phys.zbm_guid); } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATETXG))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); - } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATION))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); - } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_IVSET_GUID, bmark_phys.zbm_ivset_guid); - } - - fnvlist_add_nvlist(outnvl, bmark_name, out_props); - fnvlist_free(out_props); + avl_add(&ds->ds_bookmarks, dbn); } zap_cursor_fini(&zc); + if (err == ENOENT) + err = 0; return (err); } +void +dsl_bookmark_fini_ds(dsl_dataset_t *ds) +{ + void *cookie = NULL; + dsl_bookmark_node_t *dbn; + + if (ds->ds_is_snapshot) + return; + + while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + } + avl_destroy(&ds->ds_bookmarks); +} + /* * Retrieve the bookmarks that exist in the specified dataset, and the * requested properties of each bookmark. @@ -359,27 +921,69 @@ dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) return (err); } +/* + * Retrieve all properties for a single bookmark in the given dataset. 
+ */ +int +dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + zfs_bookmark_phys_t bmark_phys = { 0 }; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys); + if (err != 0) + goto out; + + dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props); +out: + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); +} + typedef struct dsl_bookmark_destroy_arg { nvlist_t *dbda_bmarks; nvlist_t *dbda_success; nvlist_t *dbda_errors; } dsl_bookmark_destroy_arg_t; -static int -dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) +static void +dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, + dmu_tx_t *tx) { - int err; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; uint64_t int_size, num_ints; + /* + * 'search' must be zeroed so that dbn_flags (which is used in + * dsl_bookmark_compare()) will be zeroed even if the on-disk + * (in ZAP) bookmark is shorter than offsetof(dbn_flags). + */ + dsl_bookmark_node_t search = { 0 }; + char realname[ZFS_MAX_DATASET_NAME_LEN]; + + /* + * Find the real name of this bookmark, which may be different + * from the given name if the dataset is case-insensitive. Then + * use the real name to find the node in the ds_bookmarks AVL tree. 
+ */ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; - err = zap_length(mos, bmark_zapobj, name, &int_size, &num_ints); - if (err != 0) - return (err); + VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints)); ASSERT3U(int_size, ==, sizeof (uint64_t)); @@ -387,8 +991,70 @@ dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_BOOKMARK_V2, tx); } + VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t), + num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL)); - return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); + search.dbn_name = realname; + dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL); + ASSERT(dbn != NULL); + + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + /* + * If this bookmark HAS_FBN, and it is before the most + * recent snapshot, then its TXG is a key in the head's + * deadlist (and all clones' heads' deadlists). If this is + * the last thing keeping the key (i.e. there are no more + * bookmarks with HAS_FBN at this TXG, and there is no + * snapshot at this TXG), then remove the key. + * + * Note that this algorithm depends on ds_bookmarks being + * sorted such that all bookmarks at the same TXG with + * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks + * at the same TXG in between them). If this were not + * the case, we would need to examine *all* bookmarks + * at this TXG, rather than just the adjacent ones. 
+ */ + + dsl_bookmark_node_t *dbn_prev = + AVL_PREV(&ds->ds_bookmarks, dbn); + dsl_bookmark_node_t *dbn_next = + AVL_NEXT(&ds->ds_bookmarks, dbn); + + boolean_t more_bookmarks_at_this_txg = + (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) || + (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); + + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) && + !more_bookmarks_at_this_txg && + dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_prev_snap_txg) { + dsl_dir_remove_clones_key(ds->ds_dir, + dbn->dbn_phys.zbm_creation_txg, tx); + dsl_deadlist_remove_key(&ds->ds_deadlist, + dbn->dbn_phys.zbm_creation_txg, tx); + } + + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + VERIFY0(dmu_object_free(mos, + dbn->dbn_phys.zbm_redaction_obj, tx)); + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + } + + avl_remove(&ds->ds_bookmarks, dbn); + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + + VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); } static int @@ -419,7 +1085,7 @@ dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) continue; } if (error == 0) { - error = dsl_dataset_bmark_lookup(ds, shortname, &bm); + error = dsl_bookmark_lookup_impl(ds, shortname, &bm); dsl_dataset_rele(ds, FTAG); if (error == ESRCH) { /* @@ -428,6 +1094,20 @@ dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) */ continue; } + if (error == 0 && bm.zbm_redaction_obj != 0) { + redaction_list_t *rl = NULL; + error = dsl_redaction_list_hold_obj(tx->tx_pool, + bm.zbm_redaction_obj, FTAG, &rl); + if (error == ENOENT) { + error = 0; + } else if (error == 0 && + dsl_redaction_list_long_held(rl)) { + error = SET_ERROR(EBUSY); + } + 
if (rl != NULL) { + dsl_redaction_list_rele(rl, FTAG); + } + } } if (error == 0) { if (dmu_tx_is_syncing(tx)) { @@ -457,18 +1137,17 @@ dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), &ds, FTAG, &shortname)); - VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); + dsl_bookmark_destroy_sync_impl(ds, shortname, tx); /* * If all of this dataset's bookmarks have been destroyed, * free the zap object and decrement the feature's use count. */ - VERIFY0(zap_count(mos, ds->ds_bookmarks, - &zap_cnt)); + VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt)); if (zap_cnt == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); - ds->ds_bookmarks = 0; + VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); + ds->ds_bookmarks_obj = 0; spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); VERIFY0(zap_remove(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, tx)); @@ -503,3 +1182,553 @@ dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) fnvlist_free(dbda.dbda_success); return (rv); } + +/* Return B_TRUE if there are any long holds on this dataset. 
*/ +boolean_t +dsl_redaction_list_long_held(redaction_list_t *rl) +{ + return (!zfs_refcount_is_zero(&rl->rl_longholds)); +} + +void +dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag) +{ + ASSERT(dsl_pool_config_held(dp)); + (void) zfs_refcount_add(&rl->rl_longholds, tag); +} + +void +dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag) +{ + (void) zfs_refcount_remove(&rl->rl_longholds, tag); +} + +/* ARGSUSED */ +static void +redaction_list_evict_sync(void *rlu) +{ + redaction_list_t *rl = rlu; + zfs_refcount_destroy(&rl->rl_longholds); + + kmem_free(rl, sizeof (redaction_list_t)); +} + +void +dsl_redaction_list_rele(redaction_list_t *rl, void *tag) +{ + dmu_buf_rele(rl->rl_dbuf, tag); +} + +int +dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, + redaction_list_t **rlp) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + redaction_list_t *rl; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(mos, rlobj, tag, &dbuf); + if (err != 0) + return (err); + + rl = dmu_buf_get_user(dbuf); + if (rl == NULL) { + redaction_list_t *winner = NULL; + + rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); + rl->rl_dbuf = dbuf; + rl->rl_object = rlobj; + rl->rl_phys = dbuf->db_data; + rl->rl_mos = dp->dp_meta_objset; + zfs_refcount_create(&rl->rl_longholds); + dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, + &rl->rl_dbuf); + if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { + kmem_free(rl, sizeof (*rl)); + rl = winner; + } + } + *rlp = rl; + return (0); +} + +/* + * Snapshot ds is being destroyed. + * + * Adjust the "freed_before_next" of any bookmarks between this snap + * and the previous snapshot, because their "next snapshot" is changing. + * + * If there are any bookmarks with HAS_FBN at this snapshot, remove + * their HAS_SNAP flag (note: there can be at most one snapshot of + * each filesystem at a given txg), and return B_TRUE. 
In this case + * the caller can not remove the key in the deadlist at this TXG, because + * the HAS_FBN bookmarks require the key be there. + * + * Returns B_FALSE if there are no bookmarks with HAS_FBN at this + * snapshot's TXG. In this case the caller can remove the key in the + * deadlist at this TXG. + */ +boolean_t +dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + dsl_dataset_t *head, *next; + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next)); + + /* + * Find the first bookmark that HAS_FBN at or after the + * previous snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at or after the previous + * snapshot, and before this (being deleted) snapshot. Adjust + * their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) + continue; + /* + * Increase our FBN by the amount of space that was live + * (referenced) at the time of this bookmark (i.e. + * birth <= zbm_creation_txg), and killed between this + * (being deleted) snapshot and the next snapshot (i.e. + * on the next snapshot's deadlist). (Space killed before + * this are already on our FBN.) 
+ */ + uint64_t referenced, compressed, uncompressed; + dsl_deadlist_space_range(&next->ds_deadlist, + 0, dbn->dbn_phys.zbm_creation_txg, + &referenced, &compressed, &uncompressed); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + referenced; + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + compressed; + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + uncompressed; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } + dsl_dataset_rele(next, FTAG); + + /* + * There may be several bookmarks at this txg (the TXG of the + * snapshot being deleted). We need to clear the SNAPSHOT_EXISTS + * flag on all of them, and return TRUE if there is at least 1 + * bookmark here with HAS_FBN (thus preventing the deadlist + * key from being removed). + */ + boolean_t rv = B_FALSE; + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + ASSERT(!(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS)); + continue; + } + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS); + dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + rv = B_TRUE; + } + dsl_dataset_rele(head, FTAG); + return (rv); +} + +/* + * A snapshot is being created of this (head) dataset. + * + * We don't keep keys in the deadlist for the most recent snapshot, or any + * bookmarks at or after it, because there can't be any blocks on the + * deadlist in this range. Now that the most recent snapshot is after + * all bookmarks, we need to add these keys. 
Note that the caller always + * adds a key at the previous snapshot, so we only add keys for bookmarks + * after that. + */ +void +dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t last_key_added = UINT64_MAX; + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg > + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg; + ASSERT3U(creation_txg, <=, last_key_added); + /* + * Note, there may be multiple bookmarks at this TXG, + * and we only want to add the key for this TXG once. + * The ds_bookmarks AVL is sorted by TXG, so we will visit + * these bookmarks in sequence. + */ + if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) && + creation_txg != last_key_added) { + dsl_deadlist_add_key(&ds->ds_deadlist, + creation_txg, tx); + last_key_added = creation_txg; + } + } +} + +/* + * The next snapshot of the origin dataset has changed, due to + * promote or clone swap. If there are any bookmarks at this dataset, + * we need to update their zbm_*_freed_before_next_snap to reflect this. + * The head dataset has the relevant bookmarks in ds_bookmarks. + */ +void +dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + /* + * Find the first bookmark that HAS_FBN at the origin snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(origin)->ds_creation_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at the origin txg. 
+ * Adjust their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(origin)->ds_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + + /* + * Bookmark is at the origin, therefore its + * "next dataset" is changing, so we need + * to reset its FBN by recomputing it in + * dsl_bookmark_set_phys(). + */ + ASSERT3U(dbn->dbn_phys.zbm_guid, ==, + dsl_dataset_phys(origin)->ds_guid); + ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==, + dsl_dataset_phys(origin)->ds_referenced_bytes); + ASSERT(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS); + /* + * Save and restore the zbm_redaction_obj, which + * is zeroed by dsl_bookmark_set_phys(). + */ + uint64_t redaction_obj = + dbn->dbn_phys.zbm_redaction_obj; + dsl_bookmark_set_phys(&dbn->dbn_phys, origin); + dbn->dbn_phys.zbm_redaction_obj = redaction_obj; + + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } +} + +/* + * This block is no longer referenced by this (head) dataset. + * + * Adjust the FBN of any bookmarks that reference this block, whose "next" + * is the head dataset. + */ +/* ARGSUSED */ +void +dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) +{ + /* + * Iterate over bookmarks whose "next" is the head dataset. + */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + /* + * If the block was live (referenced) at the time of this + * bookmark, add its space to the bookmark's FBN. 
+ */ + if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + mutex_enter(&dbn->dbn_lock); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp); + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + BP_GET_PSIZE(bp); + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + BP_GET_UCSIZE(bp); + /* + * Changing the ZAP object here would be too + * expensive. Also, we may be called from the zio + * interrupt thread, which can't block on i/o. + * Therefore, we mark this bookmark as dirty and + * modify the ZAP once per txg, in + * dsl_bookmark_sync_done(). + */ + dbn->dbn_dirty = B_TRUE; + mutex_exit(&dbn->dbn_lock); + } + } +} + +void +dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (dsl_dataset_is_snapshot(ds)) + return; + + /* + * We only dirty bookmarks that are at or after the most recent + * snapshot. We can't create snapshots between + * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we + * don't need to look at any bookmarks before ds_prev_snap_txg. + */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + if (dbn->dbn_dirty) { + /* + * We only dirty nodes with HAS_FBN, therefore + * we can always use the current bookmark struct size. 
+ */ + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + VERIFY0(zap_update(dp->dp_meta_objset, + ds->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + dbn->dbn_dirty = B_FALSE; + } + } +#ifdef ZFS_DEBUG + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + ASSERT(!dbn->dbn_dirty); + } +#endif +} + +/* + * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks). + */ +uint64_t +dsl_bookmark_latest_txg(dsl_dataset_t *ds) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + if (dbn == NULL) + return (0); + return (dbn->dbn_phys.zbm_creation_txg); +} + +/* + * Compare the redact_block_phys_t to the bookmark. If the last block in the + * redact_block_phys_t is before the bookmark, return -1. If the first block in + * the redact_block_phys_t is after the bookmark, return 1. Otherwise, the + * bookmark is inside the range of the redact_block_phys_t, and we return 0. + */ +static int +redact_block_zb_compare(redact_block_phys_t *first, + zbookmark_phys_t *second) +{ + /* + * If the block_phys is for a previous object, or the last block in the + * block_phys is strictly before the block in the bookmark, the + * block_phys is earlier. + */ + if (first->rbp_object < second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid + (redact_block_get_count(first) - 1) < + second->zb_blkid)) { + return (-1); + } + + /* + * If the bookmark is for a previous object, or the block in the + * bookmark is strictly before the first block in the block_phys, the + * bookmark is earlier. 
+ */ + if (first->rbp_object > second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid > second->zb_blkid)) { + return (1); + } + + return (0); +} + +/* + * Traverse the redaction list in the provided object, and call the callback for + * each entry we find. Don't call the callback for any records before resume. + */ +int +dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, + rl_traverse_callback_t cb, void *arg) +{ + objset_t *mos = rl->rl_mos; + int err = 0; + + if (rl->rl_phys->rlp_last_object != UINT64_MAX || + rl->rl_phys->rlp_last_blkid != UINT64_MAX) { + /* + * When we finish a send, we update the last object and offset + * to UINT64_MAX. If a send fails partway through, the last + * object and offset will have some other value, indicating how + * far the send got. The redaction list must be complete before + * it can be traversed, so return EINVAL if the last object and + * blkid are not set to UINT64_MAX. + */ + return (SET_ERROR(EINVAL)); + } + + /* + * This allows us to skip the binary search and resume checking logic + * below, if we're not resuming a redacted send. + */ + if (ZB_IS_ZERO(resume)) + resume = NULL; + + /* + * Binary search for the point to resume from. + */ + uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1; + uint64_t minidx = 0; + while (resume != NULL && maxidx > minidx) { + redact_block_phys_t rbp = { 0 }; + ASSERT3U(maxidx, >, minidx); + uint64_t mididx = minidx + ((maxidx - minidx) / 2); + err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp), + sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + int cmp = redact_block_zb_compare(&rbp, resume); + + if (cmp == 0) { + minidx = mididx; + break; + } else if (cmp > 0) { + maxidx = + (mididx == minidx ? 
minidx : mididx - 1); + } else { + minidx = mididx + 1; + } + } + + unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; + redact_block_phys_t *buf = zio_data_buf_alloc(bufsize); + + unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t); + uint64_t start_block = minidx / entries_per_buf; + err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf, + DMU_READ_PREFETCH); + + for (uint64_t curidx = minidx; + err == 0 && curidx < rl->rl_phys->rlp_num_entries; + curidx++) { + /* + * We read in the redaction list one block at a time. Once we + * finish with all the entries in a given block, we read in a + * new one. The predictive prefetcher will take care of any + * prefetching, and this code shouldn't be the bottleneck, so we + * don't need to do manual prefetching. + */ + if (curidx % entries_per_buf == 0) { + err = dmu_read(mos, rl->rl_object, curidx * + sizeof (*buf), bufsize, buf, + DMU_READ_PREFETCH); + if (err != 0) + break; + } + redact_block_phys_t *rb = &buf[curidx % entries_per_buf]; + /* + * If resume is non-null, we should either not send the data, or + * null out resume so we don't have to keep doing these + * comparisons. + */ + if (resume != NULL) { + /* + * It is possible that after the binary search we got + * a record before the resume point. There's two cases + * where this can occur. If the record is the last + * redaction record, and the resume point is after the + * end of the redacted data, curidx will be the last + * redaction record. In that case, the loop will end + * after this iteration. The second case is if the + * resume point is between two redaction records, the + * binary search can return either the record before + * or after the resume point. In that case, the next + * iteration will be greater than the resume point. 
+ */ + if (redact_block_zb_compare(rb, resume) < 0) { + ASSERT3U(curidx, ==, minidx); + continue; + } else { + /* + * If the place to resume is in the middle of + * the range described by this + * redact_block_phys, then modify the + * redact_block_phys in memory so we generate + * the right records. + */ + if (resume->zb_object == rb->rbp_object && + resume->zb_blkid > rb->rbp_blkid) { + uint64_t diff = resume->zb_blkid - + rb->rbp_blkid; + rb->rbp_blkid = resume->zb_blkid; + redact_block_set_count(rb, + redact_block_get_count(rb) - diff); + } + resume = NULL; + } + } + + if (cb(rb, arg) != 0) { + err = EINTR; + break; + } + } + + zio_data_buf_free(buf, bufsize); + return (err); +} diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 21db8e51ff..26d4c2fe7e 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -107,24 +107,17 @@ dsl_wrapping_key_free(dsl_wrapping_key_t *wkey) kmem_free(wkey, sizeof (dsl_wrapping_key_t)); } -static int +static void dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat, uint64_t salt, uint64_t iters, dsl_wrapping_key_t **wkey_out) { - int ret; dsl_wrapping_key_t *wkey; /* allocate the wrapping key */ wkey = kmem_alloc(sizeof (dsl_wrapping_key_t), KM_SLEEP); - if (!wkey) - return (SET_ERROR(ENOMEM)); /* allocate and initialize the underlying crypto key */ wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP); - if (!wkey->wk_key.ck_data) { - ret = ENOMEM; - goto error; - } wkey->wk_key.ck_format = CRYPTO_KEY_RAW; wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN); @@ -137,13 +130,6 @@ dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat, wkey->wk_iters = iters; *wkey_out = wkey; - return (0); - -error: - dsl_wrapping_key_free(wkey); - - *wkey_out = NULL; - return (ret); } int @@ -161,11 +147,6 @@ dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, char *keylocation = NULL; dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP); - if (!dcp) { - 
ret = SET_ERROR(ENOMEM); - goto error; - } - dcp->cp_cmd = cmd; /* get relevant arguments from the nvlists */ @@ -227,18 +208,15 @@ dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, goto error; } - /* if the user asked for the deault crypt, determine that now */ + /* if the user asked for the default crypt, determine that now */ if (dcp->cp_crypt == ZIO_CRYPT_ON) dcp->cp_crypt = ZIO_CRYPT_ON_VALUE; /* create the wrapping key from the raw data */ if (wkeydata != NULL) { /* create the wrapping key with the verified parameters */ - ret = dsl_wrapping_key_create(wkeydata, keyformat, salt, + dsl_wrapping_key_create(wkeydata, keyformat, salt, iters, &wkey); - if (ret != 0) - goto error; - dcp->cp_wkey = wkey; } @@ -257,11 +235,7 @@ dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, return (0); error: - if (wkey != NULL) - dsl_wrapping_key_free(wkey); - if (dcp != NULL) - kmem_free(dcp, sizeof (dsl_crypto_params_t)); - + kmem_free(dcp, sizeof (dsl_crypto_params_t)); *dcp_out = NULL; return (ret); } @@ -365,7 +339,7 @@ dsl_dir_get_encryption_root_ddobj(dsl_dir_t *dd, uint64_t *rddobj) DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, rddobj)); } -int +static int dsl_dir_get_encryption_version(dsl_dir_t *dd, uint64_t *version) { *version = 0; @@ -561,8 +535,6 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, /* allocate and initialize the key */ dck = kmem_zalloc(sizeof (dsl_crypto_key_t), KM_SLEEP); - if (!dck) - return (SET_ERROR(ENOMEM)); /* fetch all of the values we need from the ZAP */ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, @@ -854,7 +826,7 @@ spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp, dsl_pool_rele(dp, FTAG); /* create any zvols under this ds */ - zvol_create_minors(dp->dp_spa, dsname, B_TRUE); + zvol_create_minors_recursive(dsname); return (0); @@ -921,7 +893,7 @@ spa_keystore_unload_wkey(const char *dsname) * Wait for any outstanding txg IO to complete, releasing any * remaining references 
on the wkey. */ - if (spa_mode(spa) != FREAD) + if (spa_mode(spa) != SPA_MODE_READ) txg_wait_synced(spa->spa_dsl_pool, 0); spa_close(spa, FTAG); @@ -1008,6 +980,7 @@ key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag) rw_exit(&spa->spa_keystore.sk_km_lock); spa_keystore_dsl_key_rele(spa, km->km_key, km); + zfs_refcount_destroy(&km->km_refcnt); kmem_free(km, sizeof (dsl_key_mapping_t)); } @@ -1418,11 +1391,19 @@ error: return (ret); } - +/* + * This function deals with the intricacies of updating wrapping + * key references and encryption roots recursively in the event + * of a call to 'zfs change-key' or 'zfs promote'. The 'skip' + * parameter should always be set to B_FALSE when called + * externally. + */ static void spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, - uint64_t new_rddobj, dsl_wrapping_key_t *wkey, dmu_tx_t *tx) + uint64_t new_rddobj, dsl_wrapping_key_t *wkey, boolean_t skip, + dmu_tx_t *tx) { + int ret; zap_cursor_t *zc; zap_attribute_t *za; dsl_pool_t *dp = dmu_tx_pool(tx); @@ -1435,18 +1416,21 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, /* hold the dd */ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); - /* ignore hidden dsl dirs */ + /* ignore special dsl dirs */ if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') { dsl_dir_rele(dd, FTAG); return; } + ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); + VERIFY(ret == 0 || ret == ENOENT); + /* * Stop recursing if this dsl dir didn't inherit from the root * or if this dd is a clone. */ - VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj)); - if (curr_rddobj != rddobj || dsl_dir_is_clone(dd)) { + if (ret == ENOENT || + (!skip && (curr_rddobj != rddobj || dsl_dir_is_clone(dd)))) { dsl_dir_rele(dd, FTAG); return; } @@ -1454,19 +1438,23 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, /* * If we don't have a wrapping key just update the dck to reflect the * new encryption root. 
Otherwise rewrap the entire dck and re-sync it - * to disk. + * to disk. If skip is set, we don't do any of this work. */ - if (wkey == NULL) { - VERIFY0(zap_update(dp->dp_meta_objset, dd->dd_crypto_obj, - DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, &new_rddobj, tx)); - } else { - VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd, - FTAG, &dck)); - dsl_wrapping_key_hold(wkey, dck); - dsl_wrapping_key_rele(dck->dck_wkey, dck); - dck->dck_wkey = wkey; - dsl_crypto_key_sync(dck, tx); - spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG); + if (!skip) { + if (wkey == NULL) { + VERIFY0(zap_update(dp->dp_meta_objset, + dd->dd_crypto_obj, + DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, + &new_rddobj, tx)); + } else { + VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd, + FTAG, &dck)); + dsl_wrapping_key_hold(wkey, dck); + dsl_wrapping_key_rele(dck->dck_wkey, dck); + dck->dck_wkey = wkey; + dsl_crypto_key_sync(dck, tx); + spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG); + } } zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); @@ -1478,7 +1466,27 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { spa_keystore_change_key_sync_impl(rddobj, - za->za_first_integer, new_rddobj, wkey, tx); + za->za_first_integer, new_rddobj, wkey, B_FALSE, tx); + } + zap_cursor_fini(zc); + + /* + * Recurse into all dsl dirs of clones. We utilize the skip parameter + * here so that we don't attempt to process the clones directly. This + * is because the clone and its origin share the same dck, which has + * already been updated. 
+ */ + for (zap_cursor_init(zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(dp, za->za_first_integer, + FTAG, &clone)); + spa_keystore_change_key_sync_impl(rddobj, + clone->ds_dir->dd_object, new_rddobj, wkey, B_TRUE, tx); + dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(zc); @@ -1558,7 +1566,7 @@ spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx) /* recurse through all children and rewrap their keys */ spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object, - new_rddobj, wkey, tx); + new_rddobj, wkey, B_FALSE, tx); /* * All references to the old wkey should be released now (if it @@ -1596,7 +1604,7 @@ spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp) /* * Perform the actual work in syncing context. The blocks modified * here could be calculated but it would require holding the pool - * lock and tarversing all of the datasets that will have their keys + * lock and traversing all of the datasets that will have their keys * changed. */ return (dsl_sync_task(dsname, spa_keystore_change_key_check, @@ -1610,15 +1618,8 @@ dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent) int ret; uint64_t curr_rddobj, parent_rddobj; - if (dd->dd_crypto_obj == 0) { - /* children of encrypted parents must be encrypted */ - if (newparent->dd_crypto_obj != 0) { - ret = SET_ERROR(EACCES); - goto error; - } - + if (dd->dd_crypto_obj == 0) return (0); - } ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); if (ret != 0) @@ -1683,11 +1684,15 @@ dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin) * Check that the parent of the target has the same encryption root. 
*/ ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj); - if (ret != 0) + if (ret == ENOENT) + return (SET_ERROR(EACCES)); + else if (ret != 0) return (ret); ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj); - if (ret != 0) + if (ret == ENOENT) + return (SET_ERROR(EACCES)); + else if (ret != 0) return (ret); if (op_rddobj != tp_rddobj) @@ -1717,7 +1722,7 @@ dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, return; /* - * If the target is being promoted to the encyrption root update the + * If the target is being promoted to the encryption root update the * DSL Crypto Key and keylocation to reflect that. We also need to * update the DSL Crypto Keys of all children inheritting their * encryption root to point to the new target. Otherwise, the check @@ -1739,7 +1744,7 @@ dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER); spa_keystore_change_key_sync_impl(rddobj, origin->dd_object, - target->dd_object, NULL, tx); + target->dd_object, NULL, B_FALSE, tx); rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock); dsl_dataset_rele(targetds, FTAG); @@ -1747,34 +1752,6 @@ dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, kmem_free(keylocation, ZAP_MAXVALUELEN); } -int -dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd) -{ - int ret; - uint64_t pcrypt, crypt; - - /* - * Check that we are not making an unencrypted child of an - * encrypted parent. 
- */ - ret = dsl_dir_get_crypt(parentdd, &pcrypt); - if (ret != 0) - return (ret); - - ret = dsl_dir_get_crypt(origindd, &crypt); - if (ret != 0) - return (ret); - - ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); - ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); - - if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) - return (SET_ERROR(EINVAL)); - - return (0); -} - - int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, boolean_t *will_encrypt) @@ -1805,13 +1782,6 @@ dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); - /* - * We can't create an unencrypted child of an encrypted parent - * under any circumstances. - */ - if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) - return (SET_ERROR(EINVAL)); - /* check for valid dcp with no encryption (inherited or local) */ if (crypt == ZIO_CRYPT_OFF) { /* Must not specify encryption params */ @@ -2330,7 +2300,7 @@ dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) iters, tx); } -int +static int dsl_crypto_recv_key_check(void *arg, dmu_tx_t *tx) { int ret; @@ -2371,7 +2341,7 @@ out: return (ret); } -void +static void dsl_crypto_recv_key_sync(void *arg, dmu_tx_t *tx) { dsl_crypto_recv_key_arg_t *dcrka = arg; @@ -2408,11 +2378,11 @@ dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj, } int -dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, uint64_t from_ivset_guid, +dsl_crypto_populate_key_nvlist(objset_t *os, uint64_t from_ivset_guid, nvlist_t **nvl_out) { int ret; - objset_t *os; + dsl_dataset_t *ds = os->os_dsl_dataset; dnode_t *mdn; uint64_t rddobj; nvlist_t *nvl = NULL; @@ -2430,12 +2400,9 @@ dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, uint64_t from_ivset_guid, ASSERT(dckobj != 0); - VERIFY0(dmu_objset_from_ds(ds, &os)); mdn = DMU_META_DNODE(os); - ret = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); - if (ret != 0) - goto error; + nvl = 
fnvlist_alloc(); /* lookup values from the DSL Crypto Key */ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, @@ -2662,11 +2629,13 @@ dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv) } if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) { - VERIFY0(dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG, - &enc_root)); - dsl_dir_name(enc_root, buf); - dsl_dir_rele(enc_root, FTAG); - dsl_prop_nvlist_add_string(nv, ZFS_PROP_ENCRYPTION_ROOT, buf); + if (dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG, + &enc_root) == 0) { + dsl_dir_name(enc_root, buf); + dsl_dir_rele(enc_root, FTAG); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_ENCRYPTION_ROOT, buf); + } } } @@ -2895,8 +2864,5 @@ error: return (ret); } -#if defined(_KERNEL) -module_param(zfs_disable_ivset_guid_check, int, 0644); -MODULE_PARM_DESC(zfs_disable_ivset_guid_check, +ZFS_MODULE_PARAM(zfs, zfs_, disable_ivset_guid_check, INT, ZMOD_RW, "Set to allow raw receives without IVset guids"); -#endif diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 966c2cc93d..f99964511a 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -21,13 +21,19 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2020 The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. 
*/ #include @@ -57,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +79,7 @@ * of this setting. */ int zfs_max_recordsize = 1 * 1024 * 1024; +int zfs_allow_redacted_dataset_mount = 0; #define SWITCH64(x, y) \ { \ @@ -120,18 +128,18 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used, compressed, uncompressed; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int used = bp_get_dsize_sync(spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; - - used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - compressed = BP_GET_PSIZE(bp); - uncompressed = BP_GET_UCSIZE(bp); + spa_feature_t f; dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); @@ -155,17 +163,37 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) (void *)B_TRUE; } - spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); if (f != SPA_FEATURE_NONE) { ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); ds->ds_feature_activation[f] = (void *)B_TRUE; } + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); + if (f != SPA_FEATURE_NONE) { + ASSERT3S(spa_feature_table[f].fi_type, ==, + ZFEATURE_TYPE_BOOLEAN); + ds->ds_feature_activation[f] = (void *)B_TRUE; + } + + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. 
+ */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_allocs, bp); + } + mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, - compressed, uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, used - delta, + dsl_dir_diduse_transfer_space(ds->ds_dir, delta, + compressed, uncompressed, used, DD_USED_REFRSRV, DD_USED_HEAD, tx); } @@ -205,8 +233,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); - - dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); + dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE, + tx); } } @@ -220,7 +248,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); @@ -237,10 +265,23 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. 
+ */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, bp); + } + if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); + dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_lock); @@ -249,9 +290,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, delta = parent_delta(ds, -used); dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - delta, -compressed, -uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, -used - delta, + dsl_dir_diduse_transfer_space(ds->ds_dir, + delta, -compressed, -uncompressed, -used, DD_USED_REFRSRV, DD_USED_HEAD, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); @@ -265,7 +305,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, */ bplist_append(&ds->ds_pending_deadlist, bp); } else { - dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); + dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); @@ -284,6 +324,9 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, DD_USED_HEAD, DD_USED_SNAP, tx); } } + + dsl_bookmark_block_killed(ds, bp, tx); + mutex_enter(&ds->ds_lock); ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); dsl_dataset_phys(ds)->ds_referenced_bytes -= used; @@ -363,7 +406,7 @@ load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f) } /* - * We have to release the fsid syncronously or we risk that a subsequent + * We have to release the fsid synchronously or we risk that a subsequent * mount of the 
same dataset will fail to unique_insert the fsid. This * failure would manifest itself as the fsid of this dataset changing * between mounts which makes NFS clients quite unhappy. @@ -395,6 +438,8 @@ dsl_dataset_evict_async(void *dbu) ds->ds_prev = NULL; } + dsl_bookmark_fini_ds(ds); + bplist_destroy(&ds->ds_pending_deadlist); if (dsl_deadlist_is_open(&ds->ds_deadlist)) dsl_deadlist_close(&ds->ds_deadlist); @@ -564,8 +609,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, bplist_create(&ds->ds_pending_deadlist); - list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), - offsetof(dmu_sendarg_t, dsa_link)); + list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t), + offsetof(dmu_sendstatus_t, dss_link)); list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_ds_node)); @@ -588,14 +633,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } - if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - int zaperr = zap_lookup(mos, ds->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (ds->ds_bookmarks), 1, - &ds->ds_bookmarks); - if (zaperr != ENOENT) - VERIFY0(zaperr); - } + err = dsl_bookmark_init_ds(ds); } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); @@ -647,9 +685,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_deadlist_close(&ds->ds_deadlist); if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); + dsl_bookmark_fini_ds(ds); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (dsl_dataset_feature_is_active(ds, f)) + unload_zfeature(ds, f); + } + list_destroy(&ds->ds_prop_cbs); list_destroy(&ds->ds_sendstreams); mutex_destroy(&ds->ds_lock); @@ -675,7 +719,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_phys(ds)->ds_fsid_guid, 
(long long)ds->ds_fsid_guid, spa_name(dp->dp_spa), - dsobj); + (u_longlong_t)dsobj); } } } @@ -784,14 +828,14 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); } -int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) +static int +dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, + void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); if (err != 0) return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { + if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); *dsp = NULL; return (SET_ERROR(EBUSY)); @@ -799,20 +843,49 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, return (0); } + int -dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp)); +} + +int +dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp)); +} + +static int +dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); if (err != 0) return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { + if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); return (SET_ERROR(EBUSY)); } return (0); } +int +dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp)); +} + +int +dsl_dataset_own(dsl_pool_t *dp, const 
char *name, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp)); +} + /* * See the comment above dsl_pool_hold() for details. In summary, a long * hold is used to prevent destruction of a dataset while the pool hold @@ -846,7 +919,7 @@ void dsl_dataset_name(dsl_dataset_t *ds, char *name) { if (ds == NULL) { - (void) strcpy(name, "mos"); + (void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN); } else { dsl_dir_name(ds->ds_dir, name); VERIFY0(dsl_dataset_get_snapname(ds)); @@ -927,13 +1000,16 @@ dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) +dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override) { boolean_t gotit = FALSE; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); - if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { + if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) || + (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS) && + !zfs_allow_redacted_dataset_mount)))) { ds->ds_owner = tag; dsl_dataset_long_hold(ds, tag); gotit = TRUE; @@ -957,7 +1033,7 @@ zfeature_active(spa_feature_t f, void *arg) { switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: { - boolean_t val = (boolean_t)arg; + boolean_t val = (boolean_t)(uintptr_t)arg; ASSERT(val == B_FALSE || val == B_TRUE); return (val); } @@ -1013,7 +1089,7 @@ dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: - ASSERT3S((boolean_t)arg, ==, B_TRUE); + ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE); VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, sizeof (zero), 1, &zero, tx)); break; @@ -1029,7 +1105,7 @@ dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, } } -void +static void dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, 
spa_feature_t f, dmu_tx_t *tx) { @@ -1203,6 +1279,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); + /* + * Filesystems will eventually have their origin set to dp_origin_snap, + * but that's taken care of in dsl_dataset_create_sync_dd. When + * creating a filesystem, this function is called with origin equal to + * NULL. + */ + if (origin != NULL) + ASSERT3P(origin, !=, dp->dp_origin_snap); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); @@ -1212,6 +1296,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_deleg_set_create_perms(dd, tx, cr); + /* + * If we are creating a clone and the livelist feature is enabled, + * add the entry DD_FIELD_LIVELIST to ZAP. + */ + if (origin != NULL && + spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) { + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dir_zapify(dd, tx); + uint64_t obj = dsl_deadlist_alloc(mos, tx); + VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx); + } + /* * Since we're creating a new node we know it's a leaf, so we can * initialize the counts if the limit feature is active. 
@@ -1281,7 +1379,7 @@ dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - ASSERTV(uint64_t count); + uint64_t count __maybe_unused; int err; ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); @@ -1380,7 +1478,7 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr) + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc) { int error; uint64_t value; @@ -1425,7 +1523,7 @@ dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, */ if (cnt != 0 && cr != NULL) { error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr); + ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr, proc); if (error != 0) return (error); } @@ -1526,7 +1624,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) if (error == 0) { error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, ZFS_PROP_SNAPSHOT_LIMIT, NULL, - ddsa->ddsa_cr); + ddsa->ddsa_cr, ddsa->ddsa_proc); dsl_dataset_rele(ds, FTAG); } @@ -1564,7 +1662,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) if (error == 0) { /* passing 0/NULL skips dsl_fs_ss_limit_check */ error = dsl_dataset_snapshot_check_impl(ds, - atp + 1, tx, B_FALSE, 0, NULL); + atp + 1, tx, B_FALSE, 0, NULL, NULL); dsl_dataset_rele(ds, FTAG); } @@ -1589,8 +1687,8 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; - ASSERTV(static zil_header_t zero_zil); - ASSERTV(objset_t *os); + static zil_header_t zero_zil __maybe_unused; + objset_t *os __maybe_unused; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); @@ -1696,6 +1794,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, 
dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + dsl_bookmark_snapshotted(ds, tx); if (dsl_dataset_remap_deadlist_exists(ds)) { uint64_t remap_deadlist_obj = @@ -1757,7 +1856,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_dir_snap_cmtime_update(ds->ds_dir); - spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); + spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); } void @@ -1783,7 +1882,6 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) dsl_props_set_sync_impl(ds->ds_prev, ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); } - zvol_create_minors(dp->dp_spa, nvpair_name(pair), B_TRUE); dsl_dataset_rele(ds, FTAG); } } @@ -1842,6 +1940,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) ddsa.ddsa_props = props; ddsa.ddsa_errors = errors; ddsa.ddsa_cr = CRED(); + ddsa.ddsa_proc = curproc; if (error == 0) { error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, @@ -1858,6 +1957,13 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) fnvlist_free(suspended); } + if (error == 0) { + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + zvol_create_minor(nvpair_name(pair)); + } + } + return (error); } @@ -1882,7 +1988,7 @@ dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) /* NULL cred means no limit check for tmp snapshot */ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, - tx, B_FALSE, 0, NULL); + tx, B_FALSE, 0, NULL, NULL); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); @@ -1997,12 +2103,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) } } -static int -deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +/* + * Check if the percentage of blocks shared between the clone and the + * snapshot (as opposed to those that are clone only) is below a certain + * threshold + */ +static boolean_t +dsl_livelist_should_disable(dsl_dataset_t *ds) { - 
dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); - return (0); + uint64_t used, referenced; + int percent_shared; + + used = dsl_dir_get_usedds(ds->ds_dir); + referenced = dsl_get_referenced(ds); + ASSERT3U(referenced, >=, 0); + ASSERT3U(used, >=, 0); + if (referenced == 0) + return (B_FALSE); + percent_shared = (100 * (referenced - used)) / referenced; + if (percent_shared <= zfs_livelist_min_percent_shared) + return (B_TRUE); + return (B_FALSE); +} + +/* + * Check if it is possible to combine two livelist entries into one. + * This is the case if the combined number of 'live' blkptrs (ALLOCs that + * don't have a matching FREE) is under the maximum sublist size. + * We check this by subtracting twice the total number of frees from the total + * number of blkptrs. FREEs are counted twice because each FREE blkptr + * will cancel out an ALLOC blkptr when the livelist is processed. + */ +static boolean_t +dsl_livelist_should_condense(dsl_deadlist_entry_t *first, + dsl_deadlist_entry_t *next) +{ + uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed + + next->dle_bpobj.bpo_phys->bpo_num_freed; + uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs + + next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries) + return (B_TRUE); + return (B_FALSE); +} + +typedef struct try_condense_arg { + spa_t *spa; + dsl_dataset_t *ds; +} try_condense_arg_t; + +/* + * Iterate over the livelist entries, searching for a pair to condense. + * A nonzero return value means stop, 0 means keep looking. 
+ */ +static int +dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first) +{ + try_condense_arg_t *tca = arg; + spa_t *spa = tca->spa; + dsl_dataset_t *ds = tca->ds; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + dsl_deadlist_entry_t *next; + + /* The condense thread has not yet been created at import */ + if (spa->spa_livelist_condense_zthr == NULL) + return (1); + + /* A condense is already in progress */ + if (spa->spa_to_condense.ds != NULL) + return (1); + + next = AVL_NEXT(&ll->dl_tree, &first->dle_node); + /* The livelist has only one entry - don't condense it */ + if (next == NULL) + return (1); + + /* Next is the newest entry - don't condense it */ + if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL) + return (1); + + /* This pair is not ready to condense but keep looking */ + if (!dsl_livelist_should_condense(first, next)) + return (0); + + /* + * Add a ref to prevent the dataset from being evicted while + * the condense zthr or synctask are running. Ref will be + * released at the end of the condense synctask + */ + dmu_buf_add_ref(ds->ds_dbuf, spa); + + spa->spa_to_condense.ds = ds; + spa->spa_to_condense.first = first; + spa->spa_to_condense.next = next; + spa->spa_to_condense.syncing = B_FALSE; + spa->spa_to_condense.cancelled = B_FALSE; + + zthr_wakeup(spa->spa_livelist_condense_zthr); + return (1); +} + +static void +dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_dir_t *dd = ds->ds_dir; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist); + + /* Check if we need to add a new sub-livelist */ + if (last == NULL) { + /* The livelist is empty */ + dsl_deadlist_add_key(&dd->dd_livelist, + tx->tx_txg - 1, tx); + } else if (spa_sync_pass(spa) == 1) { + /* + * Check if the newest entry is full. If it is, make a new one. 
+ * We only do this once per sync because we could overfill a + * sublist in one sync pass and don't want to add another entry + * for a txg that is already represented. This ensures that + * blkptrs born in the same txg are stored in the same sublist. + */ + bpobj_t bpobj = last->dle_bpobj; + uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs; + uint64_t free = bpobj.bpo_phys->bpo_num_freed; + uint64_t alloc = all - free; + if (alloc > zfs_livelist_max_entries) { + dsl_deadlist_add_key(&dd->dd_livelist, + tx->tx_txg - 1, tx); + } + } + + /* Insert each entry into the on-disk livelist */ + bplist_iterate(&dd->dd_pending_allocs, + dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx); + bplist_iterate(&dd->dd_pending_frees, + dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx); + + /* Attempt to condense every pair of adjacent entries */ + try_condense_arg_t arg = { + .spa = spa, + .ds = ds + }; + dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense, + &arg); } void @@ -2011,13 +2254,19 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) objset_t *os = ds->ds_objset; bplist_iterate(&ds->ds_pending_deadlist, - deadlist_enqueue_cb, &ds->ds_deadlist, tx); + dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx); - if (os->os_synced_dnodes != NULL) { - multilist_destroy(os->os_synced_dnodes); - os->os_synced_dnodes = NULL; + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { + dsl_flush_pending_livelist(ds, tx); + if (dsl_livelist_should_disable(ds)) { + dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE); + } } + dsl_bookmark_sync_done(ds, tx); + + multilist_destroy(&os->os_synced_dnodes); + if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE; else @@ -2048,7 +2297,7 @@ get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val) &count)); } if (count != dsl_dataset_phys(ds)->ds_num_children - 1) { - return (ENOENT); + return (SET_ERROR(ENOENT)); } for (zap_cursor_init(&zc, mos, dsl_dataset_phys(ds)->ds_next_clones_obj); @@ -2070,18 +2319,7 @@ 
void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) { nvlist_t *propval = fnvlist_alloc(); - nvlist_t *val; - - /* - * We use nvlist_alloc() instead of fnvlist_alloc() because the - * latter would allocate the list with NV_UNIQUE_NAME flag. - * As a result, every time a clone name is appended to the list - * it would be (linearly) searched for for a duplicate name. - * We already know that all clone names must be unique and we - * want avoid the quadratic complexity of double-checking that - * because we can have a large number of clones. - */ - VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP)); + nvlist_t *val = fnvlist_alloc(); if (get_clones_stat_impl(ds, val) == 0) { fnvlist_add_nvlist(propval, ZPROP_VALUE, val); @@ -2151,6 +2389,34 @@ get_receive_resume_stats_impl(dsl_dataset_t *ds) DS_FIELD_RESUME_RAWOK) == 0) { fnvlist_add_boolean(token_nv, "rawok"); } + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t num_redact_snaps; + uint64_t *redact_snaps; + VERIFY(dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, + &redact_snaps)); + fnvlist_add_uint64_array(token_nv, "redact_snaps", + redact_snaps, num_redact_snaps); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { + uint64_t num_redact_snaps, int_size; + uint64_t *redact_snaps; + VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, + &num_redact_snaps)); + ASSERT3U(int_size, ==, sizeof (uint64_t)); + + redact_snaps = kmem_alloc(int_size * num_redact_snaps, + KM_SLEEP); + VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, + num_redact_snaps, redact_snaps)); + fnvlist_add_uint64_array(token_nv, "book_redact_snaps", + redact_snaps, num_redact_snaps); + kmem_free(redact_snaps, int_size * num_redact_snaps); + } packed = fnvlist_pack(token_nv, &packed_size); fnvlist_free(token_nv); 
compressed = kmem_alloc(packed_size, KM_SLEEP); @@ -2161,9 +2427,12 @@ get_receive_resume_stats_impl(dsl_dataset_t *ds) zio_cksum_t cksum; fletcher_4_native_varsize(compressed, compressed_size, &cksum); - str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); + size_t alloc_size = compressed_size * 2 + 1; + str = kmem_alloc(alloc_size, KM_SLEEP); for (int i = 0; i < compressed_size; i++) { - (void) sprintf(str + i * 2, "%02x", compressed[i]); + size_t offset = i * 2; + (void) snprintf(str + offset, alloc_size - offset, + "%02x", compressed[i]); } str[compressed_size * 2] = '\0'; char *propval = kmem_asprintf("%u-%llx-%llx-%s", @@ -2171,11 +2440,11 @@ get_receive_resume_stats_impl(dsl_dataset_t *ds) (longlong_t)cksum.zc_word[0], (longlong_t)packed_size, str); kmem_free(packed, packed_size); - kmem_free(str, compressed_size * 2 + 1); + kmem_free(str, alloc_size); kmem_free(compressed, packed_size); return (propval); } - return (strdup("")); + return (kmem_strdup("")); } /* @@ -2198,7 +2467,7 @@ get_child_receive_stats(dsl_dataset_t *ds) dsl_dataset_rele(recv_ds, FTAG); return (propval); } - return (strdup("")); + return (kmem_strdup("")); } static void @@ -2214,9 +2483,9 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_string(nv, ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); } - strfree(childval); + kmem_strfree(childval); } - strfree(propval); + kmem_strfree(propval); } uint64_t @@ -2336,6 +2605,13 @@ dsl_get_inconsistent(dsl_dataset_t *ds) 1 : 0); } +uint64_t +dsl_get_redacted(dsl_dataset_t *ds) +{ + return (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)); +} + uint64_t dsl_get_available(dsl_dataset_t *ds) { @@ -2387,7 +2663,19 @@ dsl_get_prev_snap(dsl_dataset_t *ds, char *snap) dsl_dataset_name(ds->ds_prev, snap); return (0); } else { - return (ENOENT); + return (SET_ERROR(ENOENT)); + } +} + +void +dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval) +{ + uint64_t nsnaps; + uint64_t *snaps; + if 
(dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) { + fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps, + nsnaps); } } @@ -2404,7 +2692,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, int error; dsl_pool_t *dp = ds->ds_dir->dd_pool; - /* Retrieve the mountpoint value stored in the zap opbject */ + /* Retrieve the mountpoint value stored in the zap object */ error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1, ZAP_MAXVALUELEN, value, source); if (error != 0) { @@ -2496,6 +2784,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_dir_stats(ds->ds_dir, nv); } + nvlist_t *propval = fnvlist_alloc(); + dsl_get_redact_snaps(ds, propval); + fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), + propval); + nvlist_free(propval); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, dsl_get_available(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, @@ -2558,12 +2852,13 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { - ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); + dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); stat->dds_creation_txg = dsl_get_creationtxg(ds); stat->dds_inconsistent = dsl_get_inconsistent(ds); stat->dds_guid = dsl_get_guid(ds); + stat->dds_redacted = dsl_get_redacted(ds); stat->dds_origin[0] = '\0'; if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; @@ -2613,7 +2908,7 @@ dsl_dataset_space(dsl_dataset_t *ds, boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) { - ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); + dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; uint64_t birth; ASSERT(dsl_pool_config_held(dp)); @@ -2785,20 +3080,26 @@ dsl_dataset_rename_snapshot(const char *fsname, static int dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) { - boolean_t held; + 
boolean_t held = B_FALSE; if (!dmu_tx_is_syncing(tx)) return (0); - if (owner != NULL) { - VERIFY3P(ds->ds_owner, ==, owner); - dsl_dataset_long_rele(ds, owner); - } - - held = dsl_dataset_long_held(ds); - - if (owner != NULL) - dsl_dataset_long_hold(ds, owner); + dsl_dir_t *dd = ds->ds_dir; + mutex_enter(&dd->dd_activity_lock); + uint64_t holds = zfs_refcount_count(&ds->ds_longholds) - + (owner != NULL ? 1 : 0); + /* + * The value of dd_activity_waiters can change as soon as we drop the + * lock, but we're fine with that; new waiters coming in or old + * waiters leaving doesn't cause problems, since we're going to cancel + * waiters later anyway. The goal of this check is to verify that no + * non-waiters have long-holds, and all new long-holds will be + * prevented because we're holding the pool config as writer. + */ + if (holds != dd->dd_activity_waiters) + held = B_TRUE; + mutex_exit(&dd->dd_activity_lock); if (held) return (SET_ERROR(EBUSY)); @@ -2891,28 +3192,11 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) } /* must not have any bookmarks after the most recent snapshot */ - nvlist_t *proprequest = fnvlist_alloc(); - fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG)); - nvlist_t *bookmarks = fnvlist_alloc(); - error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks); - fnvlist_free(proprequest); - if (error != 0) { + if (dsl_bookmark_latest_txg(ds) > + dsl_dataset_phys(ds)->ds_prev_snap_txg) { dsl_dataset_rele(ds, FTAG); - return (error); + return (SET_ERROR(EEXIST)); } - for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) { - nvlist_t *valuenv = - fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), - zfs_prop_to_name(ZFS_PROP_CREATETXG)); - uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); - if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) { - fnvlist_free(bookmarks); - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EEXIST)); - } - } - 
fnvlist_free(bookmarks); error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); if (error != 0) { @@ -3025,7 +3309,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; - dsl_dataset_t *origin_ds; + dsl_dataset_t *origin_ds, *origin_head; int err; uint64_t unused; uint64_t ss_mv_cnt; @@ -3045,6 +3329,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) } snap = list_head(&ddpa->shared_snaps); + origin_head = snap->ds; if (snap == NULL) { err = SET_ERROR(ENOENT); goto out; @@ -3141,6 +3426,32 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) ddpa->uncomp += dluncomp; } + /* + * Check that bookmarks that are being transferred don't have + * name conflicts. + */ + for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= + dsl_dataset_phys(origin_ds)->ds_creation_txg; + dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) { + if (strlen(dbn->dbn_name) >= max_snap_len) { + err = SET_ERROR(ENAMETOOLONG); + goto out; + } + zfs_bookmark_phys_t bm; + err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone, + dbn->dbn_name, &bm); + + if (err == 0) { + fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name); + conflicting_snaps = B_TRUE; + } else if (err == ESRCH) { + err = 0; + } else if (err != 0) { + goto out; + } + } + /* * In order to return the full list of conflicting snapshots, we check * whether there was a conflict after traversing all of them. 
@@ -3166,7 +3477,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) /* Check that there is enough space and limit headroom here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - 0, ss_mv_cnt, ddpa->used, ddpa->cr); + 0, ss_mv_cnt, ddpa->used, ddpa->cr, ddpa->proc); if (err != 0) goto out; @@ -3230,6 +3541,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) uint64_t oldnext_obj; int64_t delta; + ASSERT(nvlist_empty(ddpa->err_ds)); + VERIFY0(promote_hold(ddpa, dp, FTAG)); hds = ddpa->ddpa_clone; @@ -3298,6 +3611,25 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); } + /* + * Move bookmarks to this dir. + */ + dsl_bookmark_node_t *dbn_next; + for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= + dsl_dataset_phys(origin_ds)->ds_creation_txg; + dbn = dbn_next) { + dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn); + + avl_remove(&origin_head->ds_bookmarks, dbn); + VERIFY0(zap_remove(dp->dp_meta_objset, + origin_head->ds_bookmarks_obj, dbn->dbn_name, tx)); + + dsl_bookmark_node_add(hds, dbn, tx); + } + + dsl_bookmark_next_changed(hds, origin_ds, tx); + /* move snapshots to this dir */ for (snap = list_head(&ddpa->shared_snaps); snap; snap = list_next(&ddpa->shared_snaps, snap)) { @@ -3395,8 +3727,17 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; + /* + * Since livelists are specific to a clone's origin txg, they + * are no longer accurate. Destroy the livelist from the clone being + * promoted. If the origin dataset is a clone, destroy its livelist + * as well. 
+ */ + dsl_dir_remove_livelist(dd, tx, B_TRUE); + dsl_dir_remove_livelist(odd, tx, B_TRUE); + /* log history record */ - spa_history_log_internal_ds(hds, "promote", tx, ""); + spa_history_log_internal_ds(hds, "promote", tx, " "); dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); @@ -3563,6 +3904,7 @@ dsl_dataset_promote(const char *name, char *conflsnap) ddpa.ddpa_clonename = name; ddpa.err_ds = fnvlist_alloc(); ddpa.cr = CRED(); + ddpa.proc = curproc; error = dsl_sync_task(name, dsl_dataset_promote_check, dsl_dataset_promote_sync, &ddpa, @@ -3573,7 +3915,8 @@ dsl_dataset_promote(const char *name, char *conflsnap) */ snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL); if (snap_pair != NULL && conflsnap != NULL) - (void) strcpy(conflsnap, nvpair_name(snap_pair)); + (void) strlcpy(conflsnap, nvpair_name(snap_pair), + ZFS_MAX_DATASET_NAME_LEN); fnvlist_free(ddpa.err_ds); return (error); @@ -3635,7 +3978,7 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, * The clone can't be too much over the head's refquota. * * To ensure that the entire refquota can be used, we allow one - * transaction to exceed the the refquota. Therefore, this check + * transaction to exceed the refquota. Therefore, this check * needs to also allow for the space referenced to be more than the * refquota. The maximum amount of space that one transaction can use * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this @@ -3704,6 +4047,8 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, DMU_MAX_ACCESS * spa_asize_inflation); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); + dsl_dir_cancel_waiters(origin_head->ds_dir); + /* * Swap per-dataset feature flags. */ @@ -3758,9 +4103,9 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_phys(clone)->ds_unique_bytes); /* - * Reset origin's unique bytes, if it exists. + * Reset origin's unique bytes. 
*/ - if (clone->ds_prev) { + { dsl_dataset_t *origin = clone->ds_prev; uint64_t comp, uncomp; @@ -3858,8 +4203,22 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_phys(origin_head)->ds_deadlist_obj); dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); + /* + * If there is a bookmark at the origin, its "next dataset" is + * changing, so we need to reset its FBN. + */ + dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx); + dsl_scan_ds_clone_swapped(origin_head, clone, tx); + /* + * Destroy any livelists associated with the clone or the origin, + * since after the swap the corresponding livelists are no longer + * valid. + */ + dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE); + dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE); + spa_history_log_internal_ds(clone, "clone swap", tx, "parent=%s", origin_head->ds_dir->dd_myname); } @@ -4147,94 +4506,212 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } +typedef struct dsl_dataset_set_compression_arg { + const char *ddsca_name; + zprop_source_t ddsca_source; + uint64_t ddsca_value; +} dsl_dataset_set_compression_arg_t; + +/* ARGSUSED */ +static int +dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_compression_arg_t *ddsca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); + spa_feature_t f = zio_compress_to_feature(compval); + + if (f == SPA_FEATURE_NONE) + return (SET_ERROR(EINVAL)); + + if (!spa_feature_is_enabled(dp->dp_spa, f)) + return (SET_ERROR(ENOTSUP)); + + return (0); +} + +static void +dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_compression_arg_t *ddsca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds = NULL; + + uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); + spa_feature_t f = zio_compress_to_feature(compval); + ASSERT3S(spa_feature_table[f].fi_type, ==, 
ZFEATURE_TYPE_BOOLEAN); + + VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds)); + if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) { + ds->ds_feature_activation[f] = (void *)B_TRUE; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_set_compression(const char *dsname, zprop_source_t source, + uint64_t compression) +{ + dsl_dataset_set_compression_arg_t ddsca; + + /* + * The sync task is only required for zstd in order to activate + * the feature flag when the property is first set. + */ + if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD) + return (0); + + ddsca.ddsca_name = dsname; + ddsca.ddsca_source = source; + ddsca.ddsca_value = compression; + + return (dsl_sync_task(dsname, dsl_dataset_set_compression_check, + dsl_dataset_set_compression_sync, &ddsca, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +/* + * Return (in *usedp) the amount of space referenced by "new" that was not + * referenced at the time the bookmark corresponds to. "New" may be a + * snapshot or a head. The bookmark must be before new, in + * new's filesystem (or its origin) -- caller verifies this. + * + * The written space is calculated by considering two components: First, we + * ignore any freed space, and calculate the written as new's used space + * minus old's used space. Next, we add in the amount of space that was freed + * between the two time points, thus reducing new's used space relative to + * old's. Specifically, this is the space that was born before + * zbm_creation_txg, and freed before new (ie. on new's deadlist or a + * previous deadlist). 
+ * + * space freed [---------------------] + * snapshots ---O-------O--------O-------O------ + * bookmark new + * + * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN + * flag is not set, we will calculate the freed_before_next based on the + * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap. + */ +static int +dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp, + dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + dsl_pool_t *dp = new->ds_dir->dd_pool; + + ASSERT(dsl_pool_config_held(dp)); + if (dsl_dataset_is_snapshot(new)) { + ASSERT3U(bmp->zbm_creation_txg, <, + dsl_dataset_phys(new)->ds_creation_txg); + } + + *usedp = 0; + *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; + *usedp -= bmp->zbm_referenced_bytes_refd; + + *compp = 0; + *compp += dsl_dataset_phys(new)->ds_compressed_bytes; + *compp -= bmp->zbm_compressed_bytes_refd; + + *uncompp = 0; + *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; + *uncompp -= bmp->zbm_uncompressed_bytes_refd; + + dsl_dataset_t *snap = new; + + while (dsl_dataset_phys(snap)->ds_prev_snap_txg > + bmp->zbm_creation_txg) { + uint64_t used, comp, uncomp; + + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, bmp->zbm_creation_txg, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + if (snap != new) + dsl_dataset_rele(snap, FTAG); + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } + + /* + * We might not have the FBN if we are calculating written from + * a snapshot (because we didn't know the correct "next" snapshot + * until now). 
+ */ + if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) { + *usedp += bmp->zbm_referenced_freed_before_next_snap; + *compp += bmp->zbm_compressed_freed_before_next_snap; + *uncompp += bmp->zbm_uncompressed_freed_before_next_snap; + } else { + ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==, + bmp->zbm_creation_txg); + uint64_t used, comp, uncomp; + dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + } + if (snap != new) + dsl_dataset_rele(snap, FTAG); + return (err); +} + +/* + * Return (in *usedp) the amount of space written in new that was not + * present at the time the bookmark corresponds to. New may be a + * snapshot or the head. Old must be a bookmark before new, in + * new's filesystem (or its origin) -- caller verifies this. + */ +int +dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp, + dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN)) + return (SET_ERROR(ENOTSUP)); + return (dsl_dataset_space_written_impl(bmp, new, + usedp, compp, uncompp)); +} + /* * Return (in *usedp) the amount of space written in new that is not * present in oldsnap. New may be a snapshot or the head. Old must be * a snapshot before new, in new's filesystem (or its origin). If not then * fail and return EINVAL. - * - * The written space is calculated by considering two components: First, we - * ignore any freed space, and calculate the written as new's used space - * minus old's used space. Next, we add in the amount of space that was freed - * between the two snapshots, thus reducing new's used space relative to old's. - * Specifically, this is the space that was born before old->ds_creation_txg, - * and freed before new (ie. on new's deadlist or a previous deadlist). 
- * - * space freed [---------------------] - * snapshots ---O-------O--------O-------O------ - * oldsnap new */ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - int err = 0; - uint64_t snapobj; - dsl_pool_t *dp = new->ds_dir->dd_pool; + if (!dsl_dataset_is_before(new, oldsnap, 0)) + return (SET_ERROR(EINVAL)); - ASSERT(dsl_pool_config_held(dp)); + zfs_bookmark_phys_t zbm = { 0 }; + dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap); + zbm.zbm_guid = dsp->ds_guid; + zbm.zbm_creation_txg = dsp->ds_creation_txg; + zbm.zbm_creation_time = dsp->ds_creation_time; + zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; + zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; + zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; - *usedp = 0; - *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; - *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; - - *compp = 0; - *compp += dsl_dataset_phys(new)->ds_compressed_bytes; - *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; - - *uncompp = 0; - *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; - *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; - - snapobj = new->ds_object; - while (snapobj != oldsnap->ds_object) { - dsl_dataset_t *snap; - uint64_t used, comp, uncomp; - - if (snapobj == new->ds_object) { - snap = new; - } else { - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; - } - - if (dsl_dataset_phys(snap)->ds_prev_snap_txg == - dsl_dataset_phys(oldsnap)->ds_creation_txg) { - /* - * The blocks in the deadlist can not be born after - * ds_prev_snap_txg, so get the whole deadlist space, - * which is more efficient (especially for old-format - * deadlists). Unfortunately the deadlist code - * doesn't have enough information to make this - * optimization itself. 
- */ - dsl_deadlist_space(&snap->ds_deadlist, - &used, &comp, &uncomp); - } else { - dsl_deadlist_space_range(&snap->ds_deadlist, - 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, - &used, &comp, &uncomp); - } - *usedp += used; - *compp += comp; - *uncompp += uncomp; - - /* - * If we get to the beginning of the chain of snapshots - * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap - * was not a snapshot of/before new. - */ - snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - if (snap != new) - dsl_dataset_rele(snap, FTAG); - if (snapobj == 0) { - err = SET_ERROR(EINVAL); - break; - } - - } - return (err); + /* + * If oldsnap is the origin (or origin's origin, ...) of new, + * we can't easily calculate the effective FBN. Therefore, + * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate + * it relative to the correct "next": the next snapshot towards "new", + * rather than the next snapshot in oldsnap's dsl_dir. + */ + return (dsl_dataset_space_written_impl(&zbm, new, + usedp, compp, uncompp)); } /* @@ -4327,16 +4804,26 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, if (later->ds_dir == earlier->ds_dir) return (B_TRUE); - if (!dsl_dir_is_clone(later->ds_dir)) + + /* + * We check dd_origin_obj explicitly here rather than using + * dsl_dir_is_clone() so that we will return TRUE if "earlier" + * is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on + * this behavior. 
+ */ + if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0) return (B_FALSE); - if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) - return (B_TRUE); dsl_dataset_t *origin; error = dsl_dataset_hold_obj(dp, dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); if (error != 0) return (B_FALSE); + if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg && + origin->ds_dir == earlier->ds_dir) { + dsl_dataset_rele(origin, FTAG); + return (B_TRUE); + } ret = dsl_dataset_is_before(origin, earlier, earlier_txg); dsl_dataset_rele(origin, FTAG); return (ret); @@ -4453,15 +4940,38 @@ dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } -#if defined(_KERNEL) +void +dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, + uint64_t num_redact_snaps, dmu_tx_t *tx) +{ + uint64_t dsobj = ds->ds_object; + struct feature_type_uint64_array_arg *ftuaa = + kmem_zalloc(sizeof (*ftuaa), KM_SLEEP); + ftuaa->length = (int64_t)num_redact_snaps; + if (num_redact_snaps > 0) { + ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t), + KM_SLEEP); + bcopy(redact_snaps, ftuaa->array, num_redact_snaps * + sizeof (uint64_t)); + } + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS, + ftuaa, tx); + ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; +} + +/* BEGIN CSTYLED */ #if defined(_LP64) -module_param(zfs_max_recordsize, int, 0644); -MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size"); +#define RECORDSIZE_PERM ZMOD_RW #else /* Limited to 1M on 32-bit platforms due to lack of virtual address space */ -module_param(zfs_max_recordsize, int, 0444); -MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size"); +#define RECORDSIZE_PERM ZMOD_RD #endif +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, + "Max allowed record size"); + +ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, + "Allow mounting 
of redacted datasets"); +/* END CSTYLED */ EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_flags); @@ -4499,4 +5009,3 @@ EXPORT_SYMBOL(dsl_dsobj_to_dsname); EXPORT_SYMBOL(dsl_dataset_check_quota); EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); -#endif diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 10846a3249..a77e381520 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -20,16 +20,15 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ -#include #include -#include #include #include #include +#include /* * Deadlist concurrency: @@ -51,13 +50,83 @@ * provides its own locking, and dl_oldfmt is immutable. */ +/* + * Livelist Overview + * ================ + * + * Livelists use the same 'deadlist_t' struct as deadlists and are also used + * to track blkptrs over the lifetime of a dataset. Livelists however, belong + * to clones and track the blkptrs that are clone-specific (were born after + * the clone's creation). The exception is embedded block pointers which are + * not included in livelists because they do not need to be freed. + * + * When it comes time to delete the clone, the livelist provides a quick + * reference as to what needs to be freed. For this reason, livelists also track + * when clone-specific blkptrs are freed before deletion to prevent double + * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the + * deletion algorithm iterates backwards over the livelist, matching + * FREE/ALLOC pairs and then freeing those ALLOCs which remain. 
livelists + * are also updated in the case when blkptrs are remapped: the old version + * of the blkptr is cancelled out with a FREE and the new version is tracked + * with an ALLOC. + * + * To bound the amount of memory required for deletion, livelists over a + * certain size are spread over multiple entries. Entries are grouped by + * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will + * be in the same entry. This allows us to delete livelists incrementally + * over multiple syncs, one entry at a time. + * + * During the lifetime of the clone, livelists can get extremely large. + * Their size is managed by periodic condensing (preemptively cancelling out + * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when + * the shared space between the clone and its origin is so small that it + * doesn't make sense to use livelists anymore. + */ + +/* + * The threshold sublist size at which we create a new sub-livelist for the + * next txg. However, since blkptrs of the same transaction group must be in + * the same sub-list, the actual sublist size may exceed this. When picking the + * size we had to balance the fact that larger sublists mean fewer sublists + * (decreasing the cost of insertion) against the consideration that sublists + * will be loaded into memory and shouldn't take up an inordinate amount of + * space. We settled on ~500000 entries, corresponding to roughly 128M. + */ +unsigned long zfs_livelist_max_entries = 500000; + +/* + * We can approximate how much of a performance gain a livelist will give us + * based on the percentage of blocks shared between the clone and its origin. + * 0 percent shared means that the clone has completely diverged and that the + * old method is maximally effective: every read from the block tree will + * result in lots of frees. Livelists give us gains when they track blocks + * scattered across the tree, when one read in the old method might only + * result in a few frees. 
Once the clone has been overwritten enough, + * writes are no longer sparse and we'll no longer get much of a benefit from + * tracking them with a livelist. We chose a lower limit of 75 percent shared + * (25 percent overwritten). This means that 1/4 of all block pointers will be + * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists + * to make deletion 4x faster. Once the amount of shared space drops below this + * threshold, the clone will revert to the old deletion method. + */ +int zfs_livelist_min_percent_shared = 75; + static int dsl_deadlist_compare(const void *arg1, const void *arg2) { - const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; - const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; + const dsl_deadlist_entry_t *dle1 = arg1; + const dsl_deadlist_entry_t *dle2 = arg2; - return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); + return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); +} + +static int +dsl_deadlist_cache_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_cache_entry_t *dlce1 = arg1; + const dsl_deadlist_cache_entry_t *dlce2 = arg2; + + return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); } static void @@ -65,10 +134,28 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) { zap_cursor_t zc; zap_attribute_t za; + int error; ASSERT(MUTEX_HELD(&dl->dl_lock)); ASSERT(!dl->dl_oldfmt); + if (dl->dl_havecache) { + /* + * After loading the tree, the caller may modify the tree, + * e.g. to add or remove nodes, or to make a node no longer + * refer to the empty_bpobj. These changes would make the + * dl_cache incorrect. Therefore we discard the cache here, + * so that it can't become incorrect. 
+ */ + dsl_deadlist_cache_entry_t *dlce; + void *cookie = NULL; + while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) + != NULL) { + kmem_free(dlce, sizeof (*dlce)); + } + avl_destroy(&dl->dl_cache); + dl->dl_havecache = B_FALSE; + } if (dl->dl_havetree) return; @@ -76,18 +163,138 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) sizeof (dsl_deadlist_entry_t), offsetof(dsl_deadlist_entry_t, dle_node)); for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, - za.za_first_integer)); + + /* + * Prefetch all the bpobj's so that we do that i/o + * in parallel. Then open them all in a second pass. + */ + dle->dle_bpobj.bpo_object = za.za_first_integer; + dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + avl_add(&dl->dl_tree, dle); } + VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); + + for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); + dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, + dle->dle_bpobj.bpo_object)); + } dl->dl_havetree = B_TRUE; } +/* + * Load only the non-empty bpobj's into the dl_cache. The cache is an analog + * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It + * is used only for gathering space statistics. The dl_cache has two + * advantages over the dl_tree: + * + * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's + * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj + * many times and to inquire about its (zero) space stats many times. + * + * 2. The dl_cache uses less memory than the dl_tree. 
We only need to load + * the dl_tree of snapshots when deleting a snapshot, after which we free the + * dl_tree with dsl_deadlist_discard_tree + */ +static void +dsl_deadlist_load_cache(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havecache) + return; + + uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj; + + avl_create(&dl->dl_cache, dsl_deadlist_cache_compare, + sizeof (dsl_deadlist_cache_entry_t), + offsetof(dsl_deadlist_cache_entry_t, dlce_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if (za.za_first_integer == empty_bpobj) + continue; + dsl_deadlist_cache_entry_t *dlce = + kmem_zalloc(sizeof (*dlce), KM_SLEEP); + dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL); + + /* + * Prefetch all the bpobj's so that we do that i/o + * in parallel. Then open them all in a second pass. + */ + dlce->dlce_bpobj = za.za_first_integer; + dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + avl_add(&dl->dl_cache, dlce); + } + VERIFY3U(error, ==, ENOENT); + zap_cursor_fini(&zc); + + for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); + dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { + bpobj_t bpo; + VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj)); + + VERIFY0(bpobj_space(&bpo, + &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp)); + bpobj_close(&bpo); + } + dl->dl_havecache = B_TRUE; +} + +/* + * Discard the tree to save memory. 
+ */ +void +dsl_deadlist_discard_tree(dsl_deadlist_t *dl) +{ + mutex_enter(&dl->dl_lock); + + if (!dl->dl_havetree) { + mutex_exit(&dl->dl_lock); + return; + } + dsl_deadlist_entry_t *dle; + void *cookie = NULL; + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + } + avl_destroy(&dl->dl_tree); + + dl->dl_havetree = B_FALSE; + mutex_exit(&dl->dl_lock); +} + +void +dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) +{ + dsl_deadlist_entry_t *dle; + + ASSERT(dsl_deadlist_is_open(dl)); + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + mutex_exit(&dl->dl_lock); + for (dle = avl_first(&dl->dl_tree); dle != NULL; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + if (func(args, dle) != 0) + break; + } +} + void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { @@ -98,19 +305,20 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; - VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); dmu_object_info_from_db(dl->dl_dbuf, &doi); if (doi.doi_type == DMU_OT_BPOBJ) { dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_oldfmt = B_TRUE; - VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); + VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); return; } dl->dl_oldfmt = B_FALSE; dl->dl_phys = dl->dl_dbuf->db_data; dl->dl_havetree = B_FALSE; + dl->dl_havecache = B_FALSE; } boolean_t @@ -122,9 +330,6 @@ dsl_deadlist_is_open(dsl_deadlist_t *dl) void dsl_deadlist_close(dsl_deadlist_t *dl) { - void *cookie = NULL; - dsl_deadlist_entry_t *dle; - ASSERT(dsl_deadlist_is_open(dl)); mutex_destroy(&dl->dl_lock); @@ -137,6 +342,8 @@ dsl_deadlist_close(dsl_deadlist_t *dl) } if (dl->dl_havetree) { + dsl_deadlist_entry_t *dle; + void *cookie = NULL; while ((dle = 
avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { bpobj_close(&dle->dle_bpobj); @@ -144,6 +351,15 @@ dsl_deadlist_close(dsl_deadlist_t *dl) } avl_destroy(&dl->dl_tree); } + if (dl->dl_havecache) { + dsl_deadlist_cache_entry_t *dlce; + void *cookie = NULL; + while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) + != NULL) { + kmem_free(dlce, sizeof (*dlce)); + } + avl_destroy(&dl->dl_cache); + } dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_phys = NULL; @@ -166,15 +382,16 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) dmu_object_info_t doi; zap_cursor_t zc; zap_attribute_t za; + int error; - VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); + VERIFY0(dmu_object_info(os, dlobj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_free(os, dlobj, tx); return; } for (zap_cursor_init(&zc, os, dlobj); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t obj = za.za_first_integer; if (obj == dmu_objset_pool(os)->dp_empty_bpobj) @@ -182,13 +399,14 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) else bpobj_free(os, obj, tx); } + VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); - VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); + VERIFY0(dmu_object_free(os, dlobj, tx)); } static void dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, - const blkptr_t *bp, dmu_tx_t *tx) + const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object == @@ -196,11 +414,11 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, 
obj, tx)); } - bpobj_enqueue(&dle->dle_bpobj, bp, tx); + bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); } static void @@ -214,21 +432,22 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } else { bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } } void -dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; if (dl->dl_oldfmt) { - bpobj_enqueue(&dl->dl_bpobj, bp, tx); + bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); return; } @@ -236,10 +455,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); + + int sign = bp_freed ? 
-1 : +1; dl->dl_phys->dl_used += - bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); - dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); - dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); + sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); dle_tofind.dle_mintxg = bp->blk_birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); @@ -255,10 +476,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) } ASSERT3P(dle, !=, NULL); - dle_enqueue(dl, dle, bp, tx); + dle_enqueue(dl, dle, bp, bp_freed, tx); mutex_exit(&dl->dl_lock); } +int +dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_FALSE, tx); + return (0); +} + +int +dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_TRUE, tx); + return (0); +} + /* * Insert new key in deadlist, which must be > all current entries. * mintxg is not inclusive. 
@@ -279,10 +516,10 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dsl_deadlist_load_tree(dl); obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, + VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, mintxg, obj, tx)); mutex_exit(&dl->dl_lock); } @@ -298,12 +535,12 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) if (dl->dl_oldfmt) return; - mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + ASSERT3P(dle, !=, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); @@ -312,10 +549,114 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); + VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); mutex_exit(&dl->dl_lock); } +/* + * Remove a deadlist entry and all of its contents by removing the entry from + * the deadlist's avl tree, freeing the entry's bpobj and adjusting the + * deadlist's space accounting accordingly. 
+ */ +void +dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + objset_t *os = dl->dl_os; + + if (dl->dl_oldfmt) + return; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + VERIFY3P(dle, !=, NULL); + + avl_remove(&dl->dl_tree, dle); + VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { + bpobj_decr_empty(os, tx); + } else { + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + } + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + mutex_exit(&dl->dl_lock); +} + +/* + * Clear out the contents of a deadlist_entry by freeing its bpobj, + * replacing it with an empty bpobj and adjusting the deadlist's + * space accounting + */ +void +dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx) +{ + uint64_t new_obj, used, comp, uncomp; + objset_t *os = dl->dl_os; + + mutex_enter(&dl->dl_lock); + VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + bpobj_close(&dle->dle_bpobj); + new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); + VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, + new_obj, tx)); + 
ASSERT(bpobj_is_empty(&dle->dle_bpobj)); + mutex_exit(&dl->dl_lock); +} + +/* + * Return the first entry in deadlist's avl tree + */ +dsl_deadlist_entry_t * +dsl_deadlist_first(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle = avl_first(&dl->dl_tree); + mutex_exit(&dl->dl_lock); + + return (dle); +} + +/* + * Return the last entry in deadlist's avl tree + */ +dsl_deadlist_entry_t * +dsl_deadlist_last(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle = avl_last(&dl->dl_tree); + mutex_exit(&dl->dl_lock); + + return (dle); +} + /* * Walk ds's snapshots to regenerate generate ZAP & AVL. */ @@ -334,7 +675,7 @@ dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, while (mrs_obj != 0) { dsl_dataset_t *ds; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); dsl_deadlist_add_key(&dl, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; @@ -368,7 +709,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, break; obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, + VERIFY0(zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } mutex_exit(&dl->dl_lock); @@ -381,7 +722,7 @@ dsl_deadlist_space(dsl_deadlist_t *dl, { ASSERT(dsl_deadlist_is_open(dl)); if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, + VERIFY0(bpobj_space(&dl->dl_bpobj, usedp, compp, uncompp)); return; } @@ -397,18 +738,18 @@ dsl_deadlist_space(dsl_deadlist_t *dl, * return space used in the range (mintxg, maxtxg]. * Includes maxtxg, does not include mintxg. * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is - * larger than any bp in the deadlist (eg. UINT64_MAX)). + * UINT64_MAX). 
*/ void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - dsl_deadlist_entry_t *dle; - dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_cache_entry_t *dlce; + dsl_deadlist_cache_entry_t dlce_tofind; avl_index_t where; if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, + VERIFY0(bpobj_space_range(&dl->dl_bpobj, mintxg, maxtxg, usedp, compp, uncompp)); return; } @@ -416,27 +757,25 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, *usedp = *compp = *uncompp = 0; mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + dsl_deadlist_load_cache(dl); + dlce_tofind.dlce_mintxg = mintxg; + dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where); + /* - * If we don't find this mintxg, there shouldn't be anything - * after it either. + * If this mintxg doesn't exist, it may be an empty_bpobj which + * is omitted from the sparse tree. Start at the next non-empty + * entry. 
*/ - ASSERT(dle != NULL || - avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); + if (dlce == NULL) + dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER); - for (; dle && dle->dle_mintxg < maxtxg; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - uint64_t used, comp, uncomp; - - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, - &used, &comp, &uncomp)); - - *usedp += used; - *compp += comp; - *uncompp += uncomp; + for (; dlce && dlce->dlce_mintxg < maxtxg; + dlce = AVL_NEXT(&dl->dl_cache, dlce)) { + *usedp += dlce->dlce_bytes; + *compp += dlce->dlce_comp; + *uncompp += dlce->dlce_uncomp; } + mutex_exit(&dl->dl_lock); } @@ -452,8 +791,8 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, ASSERT(MUTEX_HELD(&dl->dl_lock)); - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); + VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); bpobj_close(&bpo); dsl_deadlist_load_tree(dl); @@ -471,10 +810,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, } static int -dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); + dsl_deadlist_insert(dl, bp, bp_freed, tx); return (0); } @@ -490,28 +830,29 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; + int error; - VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); + VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_t bpo; - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_iterate(&bpo, - dsl_deadlist_insert_cb, dl, tx)); + VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); bpobj_close(&bpo); return; }
mutex_enter(&dl->dl_lock); for (zap_cursor_init(&zc, dl->dl_os, obj); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t mintxg = zfs_strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); + VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); } + VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); - VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); + VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); @@ -520,7 +861,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) } /* - * Remove entries on dl that are >= mintxg, and put them on the bpobj. + * Remove entries on dl that are born > mintxg, and put them on the bpobj. */ void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, @@ -546,7 +887,7 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); ASSERT3U(dl->dl_phys->dl_used, >=, used); ASSERT3U(dl->dl_phys->dl_comp, >=, comp); @@ -555,7 +896,7 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, + VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, dle->dle_mintxg, tx)); dle_next = AVL_NEXT(&dl->dl_tree, dle); @@ -566,3 +907,137 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, } mutex_exit(&dl->dl_lock); } + +typedef struct livelist_entry { + blkptr_t le_bp; + uint32_t le_refcnt; + avl_node_t le_node; +} livelist_entry_t; + +static int +livelist_compare(const void *larg, const void *rarg) +{ + const 
blkptr_t *l = &((livelist_entry_t *)larg)->le_bp; + const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + + if (l_dva0_vdev != r_dva0_vdev) + return (TREE_CMP(l_dva0_vdev, r_dva0_vdev)); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset == r_dva0_offset) + ASSERT3U(l->blk_birth, ==, r->blk_birth); + return (TREE_CMP(l_dva0_offset, r_dva0_offset)); +} + +struct livelist_iter_arg { + avl_tree_t *avl; + bplist_t *to_free; + zthr_t *t; +}; + +/* + * Expects an AVL tree which is incrementally filled will FREE blkptrs + * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a + * corresponding FREE are stored in the supplied bplist. + * + * Note that multiple FREE and ALLOC entries for the same blkptr may + * be encountered when dedup is involved. For this reason we keep a + * refcount for all the FREE entries of each blkptr and ensure that + * each of those FREE entries has a corresponding ALLOC preceding it. 
+ */ +static int +dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + struct livelist_iter_arg *lia = arg; + avl_tree_t *avl = lia->avl; + bplist_t *to_free = lia->to_free; + zthr_t *t = lia->t; + ASSERT(tx == NULL); + + if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) + return (SET_ERROR(EINTR)); + + livelist_entry_t node; + node.le_bp = *bp; + livelist_entry_t *found = avl_find(avl, &node, NULL); + if (bp_freed) { + if (found == NULL) { + /* first free entry for this blkptr */ + livelist_entry_t *e = + kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); + e->le_bp = *bp; + e->le_refcnt = 1; + avl_add(avl, e); + } else { + /* dedup block free */ + ASSERT(BP_GET_DEDUP(bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); + found->le_refcnt++; + } + } else { + if (found == NULL) { + /* block is currently marked as allocated */ + bplist_append(to_free, bp); + } else { + /* alloc matches a free entry */ + ASSERT3U(found->le_refcnt, !=, 0); + found->le_refcnt--; + if (found->le_refcnt == 0) { + /* all tracked free pairs have been matched */ + avl_remove(avl, found); + kmem_free(found, sizeof (livelist_entry_t)); + } else { + /* + * This is definitely a deduped blkptr so + * let's validate it. + */ + ASSERT(BP_GET_DEDUP(bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + } + } + } + return (0); +} + +/* + * Accepts a bpobj and a bplist. 
Will insert into the bplist the blkptrs + * which have an ALLOC entry but no matching FREE + */ +int +dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, + uint64_t *size) +{ + avl_tree_t avl; + avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), + offsetof(livelist_entry_t, le_node)); + + /* process the sublist */ + struct livelist_iter_arg arg = { + .avl = &avl, + .to_free = to_free, + .t = t + }; + int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); + + VERIFY0(avl_numnodes(&avl)); + avl_destroy(&avl); + return (err); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW, + "Size to start the next sub-livelist in a livelist"); + +ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, + "Threshold at which livelist is disabled"); +/* END CSTYLED */ diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index cef460f020..cf8a3c9bbd 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -399,7 +399,7 @@ perm_set_compare(const void *arg1, const void *arg2) val = strcmp(node1->p_setname, node2->p_setname); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } /* diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 465b3dfac8..a2748197f2 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,9 @@ #include #include #include +#include +#include +#include int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) @@ -119,7 +123,7 @@ struct process_old_arg { }; static int -process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct process_old_arg *poa = arg; dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; @@ -127,7 +131,7 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); 
if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { - dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && bp->blk_birth > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { @@ -181,70 +185,86 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, dsl_dataset_phys(ds_next)->ds_deadlist_obj); } -struct removeclonesnode { - list_node_t link; - dsl_dataset_t *ds; -}; +typedef struct remaining_clones_key { + dsl_dataset_t *rck_clone; + list_node_t rck_node; +} remaining_clones_key_t; + +static remaining_clones_key_t * +rck_alloc(dsl_dataset_t *clone) +{ + remaining_clones_key_t *rck = kmem_alloc(sizeof (*rck), KM_SLEEP); + rck->rck_clone = clone; + return (rck); +} static void -dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx, + list_t *stack, void *tag) { - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - list_t clones; - struct removeclonesnode *rcn; + objset_t *mos = dd->dd_pool->dp_meta_objset; - list_create(&clones, sizeof (struct removeclonesnode), - offsetof(struct removeclonesnode, link)); + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but dsl_deadlist_remove_key() is a no-op so it + * doesn't matter. + */ + if (dsl_dir_phys(dd)->dd_clones == 0) + return; - rcn = kmem_zalloc(sizeof (struct removeclonesnode), KM_SLEEP); - rcn->ds = ds; - list_insert_head(&clones, rcn); + zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (; rcn != NULL; rcn = list_next(&clones, rcn)) { - zap_cursor_t zc; - zap_attribute_t za; - /* - * If it is the old version, dd_clones doesn't exist so we can't - * find the clones, but dsl_deadlist_remove_key() is a no-op so - * it doesn't matter. 
- */ - if (dsl_dir_phys(rcn->ds->ds_dir)->dd_clones == 0) - continue; + for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + dsl_dataset_t *clone; - for (zap_cursor_init(&zc, mos, - dsl_dir_phys(rcn->ds->ds_dir)->dd_clones); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + za->za_first_integer, tag, &clone)); - VERIFY0(dsl_dataset_hold_obj(rcn->ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - if (clone->ds_dir->dd_origin_txg > mintxg) { - dsl_deadlist_remove_key(&clone->ds_deadlist, - mintxg, tx); - if (dsl_dataset_remap_deadlist_exists(clone)) { - dsl_deadlist_remove_key( - &clone->ds_remap_deadlist, mintxg, - tx); - } - rcn = kmem_zalloc( - sizeof (struct removeclonesnode), KM_SLEEP); - rcn->ds = clone; - list_insert_tail(&clones, rcn); - } else { - dsl_dataset_rele(clone, FTAG); + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + + if (dsl_dataset_remap_deadlist_exists(clone)) { + dsl_deadlist_remove_key( + &clone->ds_remap_deadlist, mintxg, tx); } + + list_insert_head(stack, rck_alloc(clone)); + } else { + dsl_dataset_rele(clone, tag); } - zap_cursor_fini(&zc); + } + zap_cursor_fini(zc); + + kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(zc, sizeof (zap_cursor_t)); +} + +void +dsl_dir_remove_clones_key(dsl_dir_t *top_dd, uint64_t mintxg, dmu_tx_t *tx) +{ + list_t stack; + + list_create(&stack, sizeof (remaining_clones_key_t), + offsetof(remaining_clones_key_t, rck_node)); + + dsl_dir_remove_clones_key_impl(top_dd, mintxg, tx, &stack, FTAG); + for (remaining_clones_key_t *rck = list_remove_head(&stack); + rck != NULL; rck = list_remove_head(&stack)) { + dsl_dataset_t *clone = rck->rck_clone; + dsl_dir_t *clone_dir = clone->ds_dir; + + kmem_free(rck, sizeof (*rck)); + + dsl_dir_remove_clones_key_impl(clone_dir, mintxg, tx, + 
&stack, FTAG); + dsl_dataset_rele(clone, FTAG); } - rcn = list_remove_head(&clones); - kmem_free(rcn, sizeof (struct removeclonesnode)); - while ((rcn = list_remove_head(&clones)) != NULL) { - dsl_dataset_rele(rcn->ds, FTAG); - kmem_free(rcn, sizeof (struct removeclonesnode)); - } - list_destroy(&clones); + list_destroy(&stack); } static void @@ -301,19 +321,21 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); + spa_history_log_internal_ds(ds, "defer_destroy", tx, " "); return; } ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); /* We need to log before removing it from the namespace. */ - spa_history_log_internal_ds(ds, "destroy", tx, ""); + spa_history_log_internal_ds(ds, "destroy", tx, " "); dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; + boolean_t book_exists = dsl_bookmark_ds_destroyed(ds, tx); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (dsl_dataset_feature_is_active(ds, f)) dsl_dataset_deactivate_feature(ds, f, tx); @@ -391,6 +413,13 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + + /* + * We are done with the deadlist tree (generated/used + * by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()). + * Discard it to save memory. 
+ */ + dsl_deadlist_discard_tree(&ds_next->ds_deadlist); } dsl_deadlist_close(&ds->ds_deadlist); @@ -400,9 +429,11 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); - /* Collapse range in clone heads */ - dsl_dataset_remove_clones_key(ds, - dsl_dataset_phys(ds)->ds_creation_txg, tx); + if (!book_exists) { + /* Collapse range in clone heads */ + dsl_dir_remove_clones_key(ds->ds_dir, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } if (ds_next->ds_is_snapshot) { dsl_dataset_t *ds_nextnext; @@ -430,9 +461,13 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) /* Collapse range in this head. */ dsl_dataset_t *hds; VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); - dsl_deadlist_remove_key(&hds->ds_deadlist, - dsl_dataset_phys(ds)->ds_creation_txg, tx); + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, + FTAG, &hds)); + if (!book_exists) { + /* Collapse range in this head. */ + dsl_deadlist_remove_key(&hds->ds_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } if (dsl_dataset_remap_deadlist_exists(hds)) { dsl_deadlist_remove_key(&hds->ds_remap_deadlist, dsl_dataset_phys(ds)->ds_creation_txg, tx); @@ -505,7 +540,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { - ASSERTV(uint64_t count); + uint64_t count __maybe_unused; ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && count == 0); @@ -565,26 +600,21 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, /* * lzc_destroy_snaps() is documented to take an nvlist whose * values "don't matter". We need to convert that nvlist to - * one that we know can be converted to LUA. 
We also don't - * care about any duplicate entries because the nvlist will - * be converted to a LUA table which should take care of this. + * one that we know can be converted to LUA. */ - nvlist_t *snaps_normalized; - VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP)); + nvlist_t *snaps_normalized = fnvlist_alloc(); for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { fnvlist_add_boolean_value(snaps_normalized, nvpair_name(pair), B_TRUE); } - nvlist_t *arg; - VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP)); + nvlist_t *arg = fnvlist_alloc(); fnvlist_add_nvlist(arg, "snaps", snaps_normalized); fnvlist_free(snaps_normalized); fnvlist_add_boolean_value(arg, "defer", defer); - nvlist_t *wrapper; - VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP)); + nvlist_t *wrapper = fnvlist_alloc(); fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg); fnvlist_free(arg); @@ -619,20 +649,22 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, B_TRUE, 0, zfs_lua_max_memlimit, - nvlist_next_nvpair(wrapper, NULL), result); + fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result); if (error != 0) { char *errorstr = NULL; (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); if (errorstr != NULL) { - zfs_dbgmsg(errorstr); + zfs_dbgmsg("%s", errorstr); } + fnvlist_free(wrapper); + fnvlist_free(result); return (error); } fnvlist_free(wrapper); /* * lzc_destroy_snaps() is documented to fill the errlist with - * int32 values, so we need to covert the int64 values that are + * int32 values, so we need to convert the int64 values that are * returned from LUA. 
*/ int rv = 0; @@ -675,7 +707,8 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { @@ -700,6 +733,10 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) { struct killarg ka; + spa_history_log_internal_ds(ds, "destroy", tx, + "(synchronous, mintxg=%llu)", + (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg); + /* * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) @@ -730,6 +767,8 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) if (zfs_refcount_count(&ds->ds_longholds) != expected_holds) return (SET_ERROR(EBUSY)); + ASSERT0(ds->ds_dir->dd_activity_waiters); + mos = ds->ds_dir->dd_pool->dp_meta_objset; /* @@ -826,6 +865,139 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) dmu_object_free_zapified(mos, ddobj, tx); } +static void +dsl_clone_destroy_assert(dsl_dir_t *dd) +{ + uint64_t used, comp, uncomp; + + ASSERT(dsl_dir_is_clone(dd)); + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + + ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used); + ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp); + /* + * Greater than because we do not track embedded block pointers in + * the livelist + */ + ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp); + + ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list)); + ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list)); +} + +/* + * Start the delete process for a clone. Free its zil, verify the space usage + * and queue the blkptrs for deletion by adding the livelist to the pool-wide + * delete queue. 
+ */ +static void +dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t zap_obj, to_delete, used, comp, uncomp; + objset_t *os; + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + uint64_t mintxg = 0; + dsl_deadlist_entry_t *dle = dsl_deadlist_first(&dd->dd_livelist); + if (dle != NULL) + mintxg = dle->dle_mintxg; + + spa_history_log_internal_ds(ds, "destroy", tx, + "(livelist, mintxg=%llu)", (long long)mintxg); + + /* Check that the clone is in a correct state to be deleted */ + dsl_clone_destroy_assert(dd); + + /* Destroy the zil */ + zil_destroy_sync(dmu_objset_zil(os), tx); + + VERIFY0(zap_lookup(mos, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete)); + /* Initialize deleted_clones entry to track livelists to cleanup */ + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (error == ENOENT) { + zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA, + DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, + &(zap_obj), tx)); + spa->spa_livelists_to_delete = zap_obj; + } else if (error != 0) { + zfs_panic_recover("zfs: error %d was returned while looking " + "up DMU_POOL_DELETED_CLONES in the zap", error); + return; + } + VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx)); + + /* Clone is no longer using space, now tracked by dp_free_dir */ + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes, + tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + dsl_dir_remove_livelist(dd, tx, B_FALSE); + zthr_wakeup(spa->spa_livelist_delete_zthr); +} + +/* + * Move the bptree into the pool's list of trees to clean up, update space + * 
accounting information and destroy the zil. + */ +static void +dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + spa_history_log_internal_ds(ds, "destroy", tx, + "(bptree, mintxg=%llu)", + (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg); + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_scan_t *scn = dp->dp_scan; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, + tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = B_TRUE; + } + + used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; + comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; + uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == used); + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bptree_add(mos, dp->dp_bptree_obj, + &dsl_dataset_phys(ds)->ds_bp, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + used, comp, uncomp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); +} + void dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) { @@ -842,8 +1014,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - /* We need to log before removing it from the namespace. 
*/ - spa_history_log_internal_ds(ds, "destroy", tx, ""); + dsl_dir_cancel_waiters(ds->ds_dir); rmorigin = (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && @@ -885,7 +1056,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) } /* - * Destroy the deadlist. Unless it's a clone, the + * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty since the dataset has no snapshots. * (If it's a clone, it's safe to ignore the deadlist contents * since they are still referenced by the origin snapshot.) @@ -898,51 +1069,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) if (dsl_dataset_remap_deadlist_exists(ds)) dsl_dataset_destroy_remap_deadlist(ds, tx); - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { - old_synchronous_dataset_destroy(ds, tx); + /* + * Each destroy is responsible for both destroying (enqueuing + * to be destroyed) the blkptrs comprising the dataset as well as + * those belonging to the zil. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { + dsl_async_clone_destroy(ds, tx); + } else if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_async_dataset_destroy(ds, tx); } else { - /* - * Move the bptree into the pool's list of trees to - * clean up and update space accounting information. 
- */ - uint64_t used, comp, uncomp; - - zil_destroy_sync(dmu_objset_zil(os), tx); - - if (!spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_ASYNC_DESTROY)) { - dsl_scan_t *scn = dp->dp_scan; - spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, - tx); - dp->dp_bptree_obj = bptree_alloc(mos, tx); - VERIFY0(zap_add(mos, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj, tx)); - ASSERT(!scn->scn_async_destroying); - scn->scn_async_destroying = B_TRUE; - } - - used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; - comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; - uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - dsl_dataset_phys(ds)->ds_unique_bytes == used); - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - bptree_add(mos, dp->dp_bptree_obj, - &dsl_dataset_phys(ds)->ds_bp, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - used, comp, uncomp, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - -used, -comp, -uncomp, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - used, comp, uncomp, tx); + old_synchronous_dataset_destroy(ds, tx); } if (ds->ds_prev != NULL) { @@ -973,8 +1111,28 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); - if (ds->ds_bookmarks != 0) { - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); + if (ds->ds_bookmarks_obj != 0) { + void *cookie = NULL; + dsl_bookmark_node_t *dbn; + + while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != + NULL) { + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + VERIFY0(dmu_object_free(mos, + dbn->dbn_phys.zbm_redaction_obj, tx)); + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + } + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + spa_strfree(dbn->dbn_name); + 
mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + } + avl_destroy(&ds->ds_bookmarks); + VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); } @@ -1023,7 +1181,7 @@ dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; - spa_history_log_internal_ds(ds, "destroy begin", tx, ""); + spa_history_log_internal_ds(ds, "destroy begin", tx, " "); dsl_dataset_rele(ds, FTAG); } @@ -1059,9 +1217,10 @@ dsl_destroy_head(const char *name) /* * Head deletion is processed in one txg on old pools; * remove the objects from open context so that the txg sync - * is not too long. + * is not too long. This optimization can only work for + * encrypted datasets if the wrapping key is loaded. */ - error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_FALSE, + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE, FTAG, &os); if (error == 0) { uint64_t prev_snap_txg = @@ -1073,7 +1232,7 @@ dsl_destroy_head(const char *name) (void) dmu_free_long_object(os, obj); /* sync out all frees */ txg_wait_synced(dmu_objset_pool(os), 0); - dmu_objset_disown(os, B_FALSE, FTAG); + dmu_objset_disown(os, B_TRUE, FTAG); } } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 6fb711f592..84caace4db 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -46,8 +46,10 @@ #include #include #include +#include #include #include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -96,7 +98,7 @@ * limit set. 
If there is a limit at any initialized level up the tree, the * check must pass or the creation will fail. Likewise, when a filesystem or * snapshot is destroyed, the counts are recursively adjusted all the way up - * the initizized nodes in the tree. Renaming a filesystem into different point + * the initialized nodes in the tree. Renaming a filesystem into different point * in the tree will first validate, then update the counts on each branch up to * the common ancestor. A receive will also validate the counts and then update * them. @@ -117,13 +119,6 @@ * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by * dsl_dir_init_fs_ss_count(). - * - * There is a special case when we receive a filesystem that already exists. In - * this case a temporary clone name of %X is created (see dmu_recv_begin). We - * never update the filesystem counts for temporary clones. - * - * Likewise, we do not update the snapshot counts for temporary snapshots, - * such as those created by zfs diff. 
*/ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); @@ -140,7 +135,7 @@ dsl_dir_evict_async(void *dbu) { dsl_dir_t *dd = dbu; int t; - ASSERTV(dsl_pool_t *dp = dd->dd_pool); + dsl_pool_t *dp __maybe_unused = dd->dd_pool; dd->dd_dbuf = NULL; @@ -155,7 +150,12 @@ dsl_dir_evict_async(void *dbu) spa_async_close(dd->dd_pool->dp_spa, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); + dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } @@ -188,23 +188,27 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_dbuf = dbuf; dd->dd_pool = dp; - if (dsl_dir_is_zapified(dd) && - zap_contains(dp->dp_meta_objset, ddobj, - DD_FIELD_CRYPTO_KEY_OBJ) == 0) { - VERIFY0(zap_lookup(dp->dp_meta_objset, - ddobj, DD_FIELD_CRYPTO_KEY_OBJ, - sizeof (uint64_t), 1, &dd->dd_crypto_obj)); + mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); + dsl_prop_init(dd); - /* check for on-disk format errata */ - if (dsl_dir_incompatible_encryption_version(dd)) { - dp->dp_spa->spa_errata = - ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; + if (dsl_dir_is_zapified(dd)) { + err = zap_lookup(dp->dp_meta_objset, + ddobj, DD_FIELD_CRYPTO_KEY_OBJ, + sizeof (uint64_t), 1, &dd->dd_crypto_obj); + if (err == 0) { + /* check for on-disk format errata */ + if (dsl_dir_incompatible_encryption_version( + dd)) { + dp->dp_spa->spa_errata = + ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; + } + } else if (err != ENOENT) { + goto errout; } } - mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); - dsl_prop_init(dd); - dsl_dir_snap_cmtime_update(dd); if (dsl_dir_phys(dd)->dd_parent_obj) { @@ -234,7 +238,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (err != 0) goto errout; } else { - (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); + (void) strlcpy(dd->dd_myname, 
spa_name(dp->dp_spa), + sizeof (dd->dd_myname)); } if (dsl_dir_is_clone(dd)) { @@ -255,6 +260,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); + if (dsl_dir_is_zapified(dd)) { + uint64_t obj; + err = zap_lookup(dp->dp_meta_objset, + dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj); + if (err == 0) + dsl_dir_livelist_open(dd, obj); + else if (err != ENOENT) + goto errout; + } } dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, @@ -263,7 +278,11 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; @@ -291,7 +310,11 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); @@ -394,7 +417,7 @@ getcomponent(const char *path, char *component, const char **nextp) return (SET_ERROR(EINVAL)); if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); - (void) strcpy(component, path); + (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN); p = NULL; } else if (p[0] == '/') { if (p - path >= ZFS_MAX_DATASET_NAME_LEN) @@ -465,7 +488,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", - buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); + buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, 
dsl_dir_phys(dd)->dd_child_dir_zapobj, @@ -561,11 +584,9 @@ dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) &chld_dd)); /* - * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and - * temporary datasets. + * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets. */ - if (chld_dd->dd_myname[0] == '$' || - chld_dd->dd_myname[0] == '%') { + if (chld_dd->dd_myname[0] == '$') { dsl_dir_rele(chld_dd, FTAG); continue; } @@ -719,12 +740,14 @@ typedef enum { } enforce_res_t; static enforce_res_t -dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) +dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, + cred_t *cr, proc_t *proc) { enforce_res_t enforce = ENFORCE_ALWAYS; uint64_t obj; dsl_dataset_t *ds; uint64_t zoned; + const char *zonedstr; ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); @@ -733,7 +756,13 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) if (crgetzoneid(cr) != GLOBAL_ZONEID) return (ENFORCE_ALWAYS); - if (secpolicy_zfs(cr) == 0) + /* + * We are checking the saved credentials of the user process, which is + * not the current process. Note that we can't use secpolicy_zfs(), + * because it only works if the cred is that of the current process (on + * Linux). 
+ */ + if (secpolicy_zfs_proc(cr, proc) == 0) return (ENFORCE_NEVER); #endif @@ -745,7 +774,8 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) return (ENFORCE_ALWAYS); - if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { + zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED); + if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) { /* Only root can access zoned fs's from the GZ */ enforce = ENFORCE_ALWAYS; } else { @@ -757,35 +787,6 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) return (enforce); } -static void -dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx) -{ - ddulrt_arg_t *arg = varg; - uint64_t last_remap_txg; - dsl_dir_t *dd = arg->ddulrta_dd; - objset_t *mos = dd->dd_pool->dp_meta_objset; - - dsl_dir_zapify(dd, tx); - if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (last_remap_txg), 1, &last_remap_txg) != 0 || - last_remap_txg < arg->ddlrta_txg) { - VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx)); - } -} - -int -dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) -{ - ddulrt_arg_t arg; - arg.ddulrta_dd = dd; - arg.ddlrta_txg = txg; - - return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa), - NULL, dsl_dir_update_last_remap_txg_sync, &arg, - 1, ZFS_SPACE_CHECK_RESERVED)); -} - /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. 
@@ -796,7 +797,7 @@ dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) */ int dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, - dsl_dir_t *ancestor, cred_t *cr) + dsl_dir_t *ancestor, cred_t *cr, proc_t *proc) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; @@ -816,7 +817,7 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, * are allowed to change the limit on the current dataset, but there * is another limit in the tree above. */ - enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); + enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc); if (enforce == ENFORCE_NEVER) return (0); @@ -853,9 +854,14 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ - if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, - count_prop, sizeof (count), 1, &count) == ENOENT) + if (!dsl_dir_is_zapified(dd)) return (0); + err = zap_lookup(os, dd->dd_object, + count_prop, sizeof (count), 1, &count); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); @@ -868,7 +874,7 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, if (dd->dd_parent != NULL) err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, - ancestor, cr); + ancestor, cr, proc); return (err); } @@ -893,14 +899,12 @@ dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); /* - * When we receive an incremental stream into a filesystem that already - * exists, a temporary clone is created. We don't count this temporary - * clone, whose name begins with a '%'. We also ignore hidden ($FREE, - * $MOS & $ORIGIN) objsets. + * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets. 
*/ - if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && - strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) + if (dd->dd_myname[0] == '$' && strcmp(prop, + DD_FIELD_FILESYSTEM_COUNT) == 0) { return; + } /* * e.g. if renaming a dataset with no snapshots, count adjustment is 0 @@ -1067,7 +1071,7 @@ dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count)); } else { - return (ENOENT); + return (SET_ERROR(ENOENT)); } } @@ -1079,23 +1083,10 @@ dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (*count), 1, count)); } else { - return (ENOENT); + return (SET_ERROR(ENOENT)); } } -int -dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count) -{ - if (dsl_dir_is_zapified(dd)) { - objset_t *os = dd->dd_pool->dp_meta_objset; - return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (*count), 1, count)); - } else { - return (ENOENT); - } - -} - void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { @@ -1127,10 +1118,6 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } - if (dsl_dir_get_remaptxg(dd, &count) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG, - count); - } if (dsl_dir_is_clone(dd)) { char buf[ZFS_MAX_DATASET_NAME_LEN]; @@ -1169,8 +1156,8 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]); - dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, - dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024); + dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg, + (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0; mutex_exit(&dd->dd_lock); @@ -1341,7 +1328,7 @@ top_of_function: if (avail < quota) { quota = avail; - retval = ENOSPC; + retval = SET_ERROR(ENOSPC); } } @@ 
-1357,8 +1344,9 @@ top_of_function: retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", - used_on_disk>>10, est_inflight>>10, - quota>>10, asize>>10, retval); + (u_longlong_t)used_on_disk>>10, + (u_longlong_t)est_inflight>>10, + (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (SET_ERROR(retval)); @@ -1495,7 +1483,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) * less than the amount specified. * * NOTE: The behavior of this function is identical to the Illumos / FreeBSD - * version however it has been adjusted to use an iterative rather then + * version however it has been adjusted to use an iterative rather than * recursive algorithm to minimize stack usage. */ void @@ -1529,6 +1517,11 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, { int64_t accounted_delta; + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update @@ -1537,36 +1530,28 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, * consistently. 
*/ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(type < DD_USED_NUM); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - if (needlock) mutex_enter(&dd->dd_lock); - accounted_delta = - parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); - ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); - ASSERT(compressed >= 0 || - dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); + dsl_dir_phys_t *ddp = dsl_dir_phys(dd); + accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); + ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || - dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); - dsl_dir_phys(dd)->dd_used_bytes += used; - dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; - dsl_dir_phys(dd)->dd_compressed_bytes += compressed; + ddp->dd_uncompressed_bytes >= -uncompressed); + ddp->dd_used_bytes += used; + ddp->dd_uncompressed_bytes += uncompressed; + ddp->dd_compressed_bytes += compressed; - if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - ASSERT(used > 0 || - dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); - dsl_dir_phys(dd)->dd_used_breakdown[type] += used; -#ifdef DEBUG + if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used); + ddp->dd_used_breakdown[type] += used; +#ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) - u += dsl_dir_phys(dd)->dd_used_breakdown[t]; - ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); + u += ddp->dd_used_breakdown[t]; + ASSERT3U(u, ==, ddp->dd_used_bytes); } #endif } @@ -1574,11 +1559,9 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - accounted_delta, compressed, uncompressed, tx); - dsl_dir_transfer_space(dd->dd_parent, - used - accounted_delta, - 
DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + dsl_dir_diduse_transfer_space(dd->dd_parent, + accounted_delta, compressed, uncompressed, + used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } @@ -1590,21 +1573,72 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); + dsl_dir_phys_t *ddp = dsl_dir_phys(dd); if (delta == 0 || - !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) + !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : - dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); - ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; - dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; + ddp->dd_used_breakdown[oldtype] >= delta : + ddp->dd_used_breakdown[newtype] >= -delta); + ASSERT(ddp->dd_used_bytes >= ABS(delta)); + ddp->dd_used_breakdown[oldtype] -= delta; + ddp->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } +void +dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, + int64_t compressed, int64_t uncompressed, int64_t tonew, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + int64_t accounted_delta; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + dsl_dir_phys_t *ddp = dsl_dir_phys(dd); + accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); + ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); + ASSERT(uncompressed >= 0 || + ddp->dd_uncompressed_bytes >= -uncompressed); + ddp->dd_used_bytes += used; + ddp->dd_uncompressed_bytes += uncompressed; + ddp->dd_compressed_bytes += compressed; + + if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(tonew - used <= 0 
|| + ddp->dd_used_breakdown[oldtype] >= tonew - used); + ASSERT(tonew >= 0 || + ddp->dd_used_breakdown[newtype] >= -tonew); + ddp->dd_used_breakdown[oldtype] -= tonew - used; + ddp->dd_used_breakdown[newtype] += tonew; +#ifdef ZFS_DEBUG + { + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += ddp->dd_used_breakdown[t]; + ASSERT3U(u, ==, ddp->dd_used_bytes); + } +#endif + } + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent != NULL) { + dsl_dir_diduse_transfer_space(dd->dd_parent, + accounted_delta, compressed, uncompressed, + used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + } +} + typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; @@ -1698,7 +1732,7 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) ZFS_SPACE_CHECK_EXTRA_RESERVED)); } -int +static int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; @@ -1854,6 +1888,7 @@ typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; cred_t *ddra_cred; + proc_t *ddra_proc; } dsl_dir_rename_arg_t; typedef struct dsl_valid_rename_arg { @@ -2032,7 +2067,8 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) } error = dsl_dir_transfer_possible(dd->dd_parent, - newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); + newparent, fs_cnt, ss_cnt, myspace, + ddra->ddra_cred, ddra->ddra_proc); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); @@ -2052,7 +2088,6 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; - int error; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); @@ -2119,10 +2154,9 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ - error = zap_remove(mos, + VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, - 
dd->dd_myname, tx); - ASSERT0(error); + dd->dd_myname, tx)); (void) strlcpy(dd->dd_myname, mynewname, sizeof (dd->dd_myname)); @@ -2135,6 +2169,8 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); + /* TODO: A rename callback to avoid these layering violations. */ + zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname, B_TRUE); @@ -2152,6 +2188,7 @@ dsl_dir_rename(const char *oldname, const char *newname) ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; ddra.ddra_cred = CRED(); + ddra.ddra_proc = curproc; return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, @@ -2160,7 +2197,8 @@ dsl_dir_rename(const char *oldname, const char *newname) int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, - uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, + cred_t *cr, proc_t *proc) { dsl_dir_t *ancestor; int64_t adelta; @@ -2174,11 +2212,11 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, return (SET_ERROR(ENOSPC)); err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, - ancestor, cr); + ancestor, cr, proc); if (err != 0) return (err); err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, - ancestor, cr); + ancestor, cr, proc); if (err != 0) return (err); @@ -2224,6 +2262,188 @@ dsl_dir_is_zapified(dsl_dir_t *dd) return (doi.doi_type == DMU_OTN_ZAP_METADATA); } +void +dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, + SPA_FEATURE_LIVELIST)); + dsl_deadlist_open(&dd->dd_livelist, mos, obj); + bplist_create(&dd->dd_pending_allocs); + bplist_create(&dd->dd_pending_frees); +} + +void +dsl_dir_livelist_close(dsl_dir_t *dd) +{ + 
dsl_deadlist_close(&dd->dd_livelist); + bplist_destroy(&dd->dd_pending_allocs); + bplist_destroy(&dd->dd_pending_frees); +} + +void +dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) +{ + uint64_t obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + livelist_condense_entry_t to_condense = spa->spa_to_condense; + + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return; + + /* + * If the livelist being removed is set to be condensed, stop the + * condense zthr and indicate the cancellation in the spa_to_condense + * struct in case the condense no-wait synctask has already started + */ + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL && + (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { + /* + * We use zthr_wait_cycle_done instead of zthr_cancel + * because we don't want to destroy the zthr, just have + * it skip its current task. + */ + spa->spa_to_condense.cancelled = B_TRUE; + zthr_wait_cycle_done(ll_condense_thread); + /* + * If we've returned from zthr_wait_cycle_done without + * clearing the to_condense data structure it's either + * because the no-wait synctask has started (which is + * indicated by 'syncing' field of to_condense) and we + * can expect it to clear to_condense on its own. + * Otherwise, we returned before the zthr ran. The + * checkfunc will now fail as cancelled == B_TRUE so we + * can safely NULL out ds, allowing a different dir's + * livelist to be condensed. + * + * We can be sure that the to_condense struct will not + * be repopulated at this stage because both this + * function and dsl_livelist_try_condense execute in + * syncing context. 
+ */ + if ((spa->spa_to_condense.ds != NULL) && + !spa->spa_to_condense.syncing) { + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, + spa); + spa->spa_to_condense.ds = NULL; + } + } + + dsl_dir_livelist_close(dd); + VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj)); + VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, tx)); + if (total) { + dsl_deadlist_free(dp->dp_meta_objset, obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + } +} + +static int +dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, + zfs_wait_activity_t activity, boolean_t *in_progress) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&dd->dd_activity_lock)); + + switch (activity) { + case ZFS_WAIT_DELETEQ: { +#ifdef _KERNEL + objset_t *os; + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + break; + + mutex_enter(&os->os_user_ptr_lock); + void *user = dmu_objset_get_user(os); + mutex_exit(&os->os_user_ptr_lock); + if (dmu_objset_type(os) != DMU_OST_ZFS || + user == NULL || zfs_get_vfs_flag_unmounted(os)) { + *in_progress = B_FALSE; + return (0); + } + + uint64_t readonly = B_FALSE; + error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly, + NULL); + + if (error != 0) + break; + + if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) { + *in_progress = B_FALSE; + return (0); + } + + uint64_t count, unlinked_obj; + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &unlinked_obj); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + break; + } + error = zap_count(os, unlinked_obj, &count); + + if (error == 0) + *in_progress = (count != 0); + break; +#else + /* + * The delete queue is ZPL specific, and libzpool doesn't have + * it. It doesn't make sense to wait for it. 
+ */ + *in_progress = B_FALSE; + break; +#endif + } + default: + panic("unrecognized value for activity %d", activity); + } + + return (error); +} + +int +dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, + boolean_t *waited) +{ + int error = 0; + boolean_t in_progress; + dsl_pool_t *dp = dd->dd_pool; + for (;;) { + dsl_pool_config_enter(dp, FTAG); + error = dsl_dir_activity_in_progress(dd, ds, activity, + &in_progress); + dsl_pool_config_exit(dp, FTAG); + if (error != 0 || !in_progress) + break; + + *waited = B_TRUE; + + if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) == + 0 || dd->dd_activity_cancelled) { + error = SET_ERROR(EINTR); + break; + } + } + return (error); +} + +void +dsl_dir_cancel_waiters(dsl_dir_t *dd) +{ + mutex_enter(&dd->dd_activity_lock); + dd->dd_activity_cancelled = B_TRUE; + cv_broadcast(&dd->dd_activity_cv); + while (dd->dd_activity_waiters > 0) + cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock); + mutex_exit(&dd->dd_activity_lock); +} + #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 10e967ab91..1350f13295 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
@@ -42,14 +42,13 @@ #include #include #include -#include #include #include #include #include #include #include -#include +#include #include /* @@ -105,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; +/* + * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. + * Once it is reached, write operation is blocked, + * until log data is cleared out after txg sync. + * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. + */ +unsigned long zfs_wrlog_data_max = 0; + /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than @@ -221,11 +228,17 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri, - max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + aggsum_init(&dp->dp_wrlog_total, 0); + for (int i = 0; i < TXG_SIZE; i++) { + aggsum_init(&dp->dp_wrlog_pertxg[i], 0); + } + + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, + boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", - max_ncpus, defclsyspri, max_ncpus, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); return (dp); } @@ -416,8 +429,16 @@ dsl_pool_close(dsl_pool_t *dp) rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); + + ASSERT0(aggsum_value(&dp->dp_wrlog_total)); + aggsum_fini(&dp->dp_wrlog_total); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); + aggsum_fini(&dp->dp_wrlog_pertxg[i]); + } + taskq_destroy(dp->dp_unlinked_drain_taskq); - taskq_destroy(dp->dp_iput_taskq); + 
taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) { mutex_destroy(&dp->dp_blkstats->zab_lock); vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); @@ -566,6 +587,10 @@ dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); + dmu_objset_sync_done(dp->dp_meta_objset, tx); + taskq_wait(dp->dp_sync_taskq); + multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes); + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } @@ -588,6 +613,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +void +dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) +{ + ASSERT3S(size, >=, 0); + + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); + aggsum_add(&dp->dp_wrlog_total, size); + + /* Choose a value slightly bigger than min dirty sync bytes */ + uint64_t sync_min = + zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) + txg_kick(dp, txg); +} + +boolean_t +dsl_pool_wrlog_over_max(dsl_pool_t *dp) +{ + return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); +} + +static void +dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) +{ + int64_t delta; + delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); + aggsum_add(&dp->dp_wrlog_total, delta); +} + #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) @@ -659,15 +714,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } VERIFY0(zio_wait(zio)); - /* - * We have written all of the accounted dirty data, so our - * dp_space_towrite should now be zero. However, some seldom-used - * code paths do not adhere to this (e.g. 
dbuf_undirty(), also - * rounding error in dbuf_write_physdone). - * Shore up the accounting of any dirtied space now. - */ - dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); - /* * Update the long range free counter after * we're done syncing user data @@ -686,7 +732,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); + dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); @@ -722,7 +768,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * - * - move dead blocks from the pending deadlist to the on-disk deadlist + * - move dead blocks from the pending deadlist and livelists + * to the on-disk versions * - release hold from dsl_dataset_dirty() * - release key mapping hold from dsl_dataset_dirty() */ @@ -758,10 +805,25 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dp->dp_mos_uncompressed_delta = 0; } - if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) { + if (dmu_objset_is_dirty(mos, txg)) { dsl_pool_sync_mos(dp, tx); } + /* + * We have written all of the accounted dirty data, so our + * dp_space_towrite should now be zero. However, some seldom-used + * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up + * the accounting of any dirtied space now. + * + * Note that, besides any dirty data from datasets, the amount of + * dirty data in the MOS is also accounted by the pool. Therefore, + * we want to do this cleanup after dsl_pool_sync_mos() so we don't + * attempt to update the accounting for the same dirty data twice. + * (i.e. at this point we only update the accounting for the space + * that we know that we "leaked"). 
+ */ + dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); + /* * If we modify a dataset in the same txg that we want to destroy it, * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. @@ -805,6 +867,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } + + dsl_pool_wrlog_clear(dp, txg); + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } @@ -887,16 +952,24 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; - boolean_t rv; mutex_enter(&dp->dp_lock); - if (dp->dp_dirty_total > dirty_min_bytes) - txg_kick(dp); - rv = (dp->dp_dirty_total > delay_min_bytes); + uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - return (rv); + + return (dirty > delay_min_bytes); +} + +static boolean_t +dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + return (dirty > dirty_min_bytes); } void @@ -906,7 +979,12 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); + boolean_t needsync = !dmu_tx_is_syncing(tx) && + dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); + + if (needsync) + txg_kick(dp, tx->tx_txg); } } @@ -1096,9 +1174,9 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) } taskq_t * -dsl_pool_iput_taskq(dsl_pool_t *dp) +dsl_pool_zrele_taskq(dsl_pool_t *dp) { - return (dp->dp_iput_taskq); + return (dp->dp_zrele_taskq); } taskq_t * @@ -1152,7 +1230,7 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) /* * Create the pool-wide zap object for storing 
temporary snapshot holds. */ -void +static void dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; @@ -1194,7 +1272,7 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); - strfree(name); + kmem_strfree(name); return (error); } @@ -1258,8 +1336,16 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * - * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only - * or modifying operations. + * Operations generally fall somewhere into the following taxonomy: + * + * Read-Only Modifying + * + * Dataset Layer / MOS zfs get zfs destroy + * + * Individual Dataset read() write() + * + * + * Dataset Layer Operations * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the @@ -1269,6 +1355,25 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. + * + * + * Operations On Individual Datasets + * + * Objects _within_ an objset should only be modified by the current 'owner' + * of the objset to prevent incorrect concurrent modification. Thus, use + * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner, + * and fail with EBUSY if there is already an owner. The owner can then + * implement its own locking strategy, independent of the dataset layer's + * locking infrastructure. + * (E.g., the ZPL has its own set of locks to control concurrency. A regular + * vnop will not reach into the dataset layer). 
+ * + * Ideally, objects would also only be read by the objset’s owner, so that we + * don’t observe state mid-modification. + * (E.g. the ZPL is creating a new object and linking it into a directory; if + * you don’t coordinate with the ZPL to hold ZPL-level locks, you could see an + * intermediate state. The ioctl level violates this but in pretty benign + * ways, e.g. reading the zpl props object.) */ int @@ -1336,53 +1441,46 @@ dsl_pool_config_held_writer(dsl_pool_t *dp) return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); } -#if defined(_KERNEL) EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); /* BEGIN CSTYLED */ /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ -module_param(zfs_dirty_data_max_percent, int, 0444); -MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty"); +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD, + "Max percent of RAM allowed to be dirty"); /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ -module_param(zfs_dirty_data_max_max_percent, int, 0444); -MODULE_PARM_DESC(zfs_dirty_data_max_max_percent, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD, "zfs_dirty_data_max upper bound as % of RAM"); -module_param(zfs_delay_min_dirty_percent, int, 0644); -MODULE_PARM_DESC(zfs_delay_min_dirty_percent, "transaction delay threshold"); +ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, + "Transaction delay threshold"); -module_param(zfs_dirty_data_max, ulong, 0644); -MODULE_PARM_DESC(zfs_dirty_data_max, "determines the dirty space limit"); +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, + "Determines the dirty space limit"); + +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, + "The size limit of write-transaction zil log data"); /* zfs_dirty_data_max_max only applied at module load in arc_init(). 
*/ -module_param(zfs_dirty_data_max_max, ulong, 0444); -MODULE_PARM_DESC(zfs_dirty_data_max_max, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); -module_param(zfs_dirty_data_sync_percent, int, 0644); -MODULE_PARM_DESC(zfs_dirty_data_sync_percent, - "dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, + "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); -module_param(zfs_delay_scale, ulong, 0644); -MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity"); +ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, + "How quickly delay approaches infinity"); -module_param(zfs_sync_taskq_batch_pct, int, 0644); -MODULE_PARM_DESC(zfs_sync_taskq_batch_pct, - "max percent of CPUs that are used to sync dirty data"); +ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, + "Max percent of CPUs that are used to sync dirty data"); -module_param(zfs_zil_clean_taskq_nthr_pct, int, 0644); -MODULE_PARM_DESC(zfs_zil_clean_taskq_nthr_pct, - "max percent of CPUs that are used per dp_sync_taskq"); +ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, + "Max percent of CPUs that are used per dp_sync_taskq"); -module_param(zfs_zil_clean_taskq_minalloc, int, 0644); -MODULE_PARM_DESC(zfs_zil_clean_taskq_minalloc, - "number of taskq entries that are pre-populated"); - -module_param(zfs_zil_clean_taskq_maxalloc, int, 0644); -MODULE_PARM_DESC(zfs_zil_clean_taskq_maxalloc, - "max number of taskq entries that are cached"); +ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW, + "Number of taskq entries that are pre-populated"); +ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW, + "Max number of taskq entries that are cached"); /* END CSTYLED */ -#endif diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 9f892acdbf..dfa04d7681 
100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -73,7 +73,7 @@ int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) { - int err = ENOENT; + int err; dsl_dir_t *target = dd; objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; @@ -98,8 +98,10 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, */ for (; dd != NULL; dd = dd->dd_parent) { if (dd != target || snapshot) { - if (!inheritable) + if (!inheritable) { + err = SET_ERROR(ENOENT); break; + } inheriting = B_TRUE; } @@ -130,8 +132,9 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, if (inheriting) { dsl_dir_name(dd, setpoint); } else { - (void) strcpy(setpoint, - ZPROP_SOURCE_VAL_RECVD); + (void) strlcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD, + MAXNAMELEN); } } break; @@ -150,8 +153,8 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, if (err == ENOENT) err = dodefault(prop, intsz, numints, buf); - strfree(inheritstr); - strfree(recvdstr); + kmem_strfree(inheritstr); + kmem_strfree(recvdstr); return (err); } @@ -190,7 +193,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, char *inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); err = zap_contains(mos, zapobj, inheritstr); - strfree(inheritstr); + kmem_strfree(inheritstr); if (err != 0 && err != ENOENT) return (err); } @@ -201,11 +204,12 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, ZPROP_RECVD_SUFFIX); err = zap_lookup(mos, zapobj, recvdstr, intsz, numints, buf); - strfree(recvdstr); + kmem_strfree(recvdstr); if (err != ENOENT) { if (setpoint != NULL && err == 0) - (void) strcpy(setpoint, - ZPROP_SOURCE_VAL_RECVD); + 
(void) strlcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD, + MAXNAMELEN); return (err); } } @@ -283,7 +287,7 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; int err; - ASSERTV(dsl_pool_t *dp = dd->dd_pool); + dsl_pool_t *dp __maybe_unused = dd->dd_pool; ASSERT(dsl_pool_config_held(dp)); @@ -424,7 +428,7 @@ dsl_prop_predict(dsl_dir_t *dd, const char *propname, panic("unexpected property source: %d", source); } - strfree(recvdstr); + kmem_strfree(recvdstr); if (err == ENOENT) return (0); @@ -649,7 +653,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj, intval, dummy; + uint64_t zapobj, intval, dummy, count; int isint; char valbuf[32]; const char *valstr = NULL; @@ -663,7 +667,8 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, if (ds->ds_is_snapshot) { ASSERT(version >= SPA_VERSION_SNAP_PROPS); - if (dsl_dataset_phys(ds)->ds_props_obj == 0) { + if (dsl_dataset_phys(ds)->ds_props_obj == 0 && + (source & ZPROP_SRC_NONE) == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_props_obj = zap_create(mos, @@ -674,6 +679,10 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; } + /* If we are removing objects from a non-existent ZAP just return */ + if (zapobj == 0) + return; + if (version < SPA_VERSION_RECVD_PROPS) { if (source & ZPROP_SRC_NONE) source = ZPROP_SRC_NONE; @@ -740,7 +749,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, ASSERT(err == 0 || err == ENOENT); err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - /* FALLTHRU */ + fallthrough; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * remove propname$recvd @@ -752,8 +761,20 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, cmn_err(CE_PANIC, "unexpected property source: %d", source); } - 
strfree(inheritstr); - strfree(recvdstr); + kmem_strfree(inheritstr); + kmem_strfree(recvdstr); + + /* + * If we are left with an empty snap zap we can destroy it. + * This will prevent unnecessary calls to zap_lookup() in + * the "zfs list" and "zfs get" code paths. + */ + if (ds->ds_is_snapshot && + zap_count(mos, zapobj, &count) == 0 && count == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_props_obj = 0; + zap_destroy(mos, zapobj, tx); + } if (isint) { VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); @@ -839,13 +860,7 @@ dsl_prop_inherit(const char *dsname, const char *propname, return (error); } -typedef struct dsl_props_set_arg { - const char *dpsa_dsname; - zprop_source_t dpsa_source; - nvlist_t *dpsa_props; -} dsl_props_set_arg_t; - -static int +int dsl_props_set_check(void *arg, dmu_tx_t *tx) { dsl_props_set_arg_t *dpsa = arg; @@ -923,7 +938,7 @@ dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, } } -static void +void dsl_props_set_sync(void *arg, dmu_tx_t *tx) { dsl_props_set_arg_t *dpsa = arg; @@ -1020,7 +1035,7 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, valstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); err = zap_contains(mos, propobj, valstr); - strfree(valstr); + kmem_strfree(valstr); if (err == 0) continue; if (err != ENOENT) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index b15c39ac9c..d25c067dfb 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -20,10 +20,11 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2016 Gary Mills - * Copyright (c) 2017 Datto Inc. - * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #include @@ -125,7 +126,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_leaves(vdev_t *vd); +static uint64_t dsl_scan_count_data_disks(vdev_t *vd); extern int zfs_vdev_async_write_active_min_dirty_percent; @@ -175,7 +176,9 @@ int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ -unsigned long zfs_async_block_max_blocks = 100000; +unsigned long zfs_async_block_max_blocks = ULONG_MAX; +/* max number of dedup blocks to free in a single TXG */ +unsigned long zfs_max_async_dedup_frees = 100000; int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */ @@ -279,7 +282,7 @@ struct dsl_scan_io_queue { /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; - avl_tree_t q_exts_by_size; + zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; @@ -358,7 +361,7 @@ scan_init(void) for (int i = 0; i < SPA_DVAS_PER_BP; i++) { char name[36]; - (void) sprintf(name, "sio_cache_%d", i); + (void) snprintf(name, sizeof (name), "sio_cache_%d", i); sio_cache[i] = kmem_cache_create(name, (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -448,7 +451,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * phase are done per top-level vdev and are handled separately. 
*/ scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20); + dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); @@ -464,8 +467,9 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; - zfs_dbgmsg("old-style scrub was in progress; " + zfs_dbgmsg("old-style scrub was in progress for %s; " "restarting new-style scrub in txg %llu", + spa->spa_name, (longlong_t)scn->scn_restart_txg); /* @@ -537,9 +541,28 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * the meantime. */ scn->scn_restart_txg = txg; - zfs_dbgmsg("new-style scrub was modified " + zfs_dbgmsg("new-style scrub for %s was modified " "by old software; restarting in txg %llu", + spa->spa_name, (longlong_t)scn->scn_restart_txg); + } else if (dsl_scan_resilvering(dp)) { + /* + * If a resilver is in progress and there are already + * errors, restart it instead of finishing this scan and + * then restarting it. If there haven't been any errors + * then remember that the incore DTL is valid. 
+ */ + if (scn->scn_phys.scn_errors > 0) { + scn->scn_restart_txg = txg; + zfs_dbgmsg("resilver can't excise DTL_MISSING " + "when finished; restarting on %s in txg " + "%llu", + spa->spa_name, + (u_longlong_t)scn->scn_restart_txg); + } else { + /* it's safe to excise DTL when finished */ + spa->spa_scrub_started = B_TRUE; + } } } @@ -591,6 +614,13 @@ dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_restart_txg <= tx->tx_txg); } +boolean_t +dsl_scan_resilver_scheduled(dsl_pool_t *dp) +{ + return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || + (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); +} + boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { @@ -646,7 +676,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); - ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, + NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } @@ -661,7 +692,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) sizeof (scn->scn_phys)); if (scn->scn_checkpointing) - zfs_dbgmsg("finish scan checkpoint"); + zfs_dbgmsg("finish scan checkpoint for %s", + spa->spa_name); scn->scn_checkpointing = B_FALSE; scn->scn_last_checkpoint = ddi_get_lbolt(); @@ -674,18 +706,19 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) } /* ARGSUSED */ -static int +int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) return (SET_ERROR(EBUSY)); return (0); } -static void +void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; @@ -720,8 +753,12 @@ dsl_scan_setup_sync(void 
*arg, dmu_tx_t *tx) if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, NULL, + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_START); + nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } @@ -735,6 +772,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + /* + * When starting a resilver clear any existing rebuild state. + * This is required to prevent stale rebuild status from + * being reported when a rebuild is run, then a resilver and + * finally a scrub. In which case only the scrub status + * should be reported by 'zpool status'. + */ + if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + vdev_rebuild_clear_sync( + (void *)(uintptr_t)vd->vdev_id, tx); + } + } } /* back to the generic stuff */ @@ -759,7 +811,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", - *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); + *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg, + (u_longlong_t)scn->scn_phys.scn_max_txg); } /* @@ -786,7 +839,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { - dsl_resilver_restart(spa->spa_dsl_pool, 0); + dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); return (0); } @@ -796,7 +849,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) POOL_SCRUB_NORMAL); if (err == 0) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); - return (ECANCELED); + return (SET_ERROR(ECANCELED)); } return (SET_ERROR(err)); @@ -806,41 +859,6 @@ dsl_scan(dsl_pool_t 
*dp, pool_scan_func_t func) dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } -/* - * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns - * B_TRUE if we have devices that need to be resilvered and are available to - * accept resilver I/Os. - */ -static boolean_t -dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx) -{ - boolean_t resilver_needed = B_FALSE; - spa_t *spa = vd->vdev_spa; - - for (int c = 0; c < vd->vdev_children; c++) { - resilver_needed |= - dsl_scan_clear_deferred(vd->vdev_child[c], tx); - } - - if (vd == spa->spa_root_vdev && - spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { - spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); - vdev_config_dirty(vd); - spa->spa_resilver_deferred = B_FALSE; - return (resilver_needed); - } - - if (!vdev_is_concrete(vd) || vd->vdev_aux || - !vd->vdev_ops->vdev_op_leaf) - return (resilver_needed); - - if (vd->vdev_resilver_deferred) - vd->vdev_resilver_deferred = B_FALSE; - - return (!vdev_is_dead(vd) && !vd->vdev_offline && - vdev_resilver_needed(vd, NULL, NULL)); -} - /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) @@ -898,18 +916,19 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; + spa_notify_waiters(spa); + if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, - "errors=%llu", spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); else if (!complete) spa_history_log_internal(spa, "scan cancelled", tx, - "errors=%llu", spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); else spa_history_log_internal(spa, "scan done", tx, - "errors=%llu", spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* @@ -925,17 +944,31 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - scn->scn_phys.scn_max_txg, B_TRUE); + scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + if (scn->scn_phys.scn_min_txg) { + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, + ESC_ZFS_RESILVER_FINISH); + nvlist_free(aux); + } else { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_SCRUB_FINISH); + } } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - 0, B_TRUE); + 0, B_TRUE, B_FALSE); } spa_errlog_rotate(spa); + /* + * Don't clear flag until after vdev_dtl_reassess to ensure that + * DTL_MISSING will get updated when possible. + */ + spa->spa_scrub_started = B_FALSE; + /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. @@ -943,23 +976,26 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* - * Clear any deferred_resilver flags in the config. 
+ * Clear any resilver_deferred flags in the config. * If there are drives that need resilvering, kick * off an asynchronous request to start resilver. - * dsl_scan_clear_deferred() may update the config + * vdev_clear_resilver_deferred() may update the config * before the resilver can restart. In the event of * a crash during this period, the spa loading code * will find the drives that need to be resilvered - * when the machine reboots and start the resilver then. + * and start the resilver then. */ - boolean_t resilver_needed = - dsl_scan_clear_deferred(spa->spa_root_vdev, tx); - if (resilver_needed) { + if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && + vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { spa_history_log_internal(spa, - "starting deferred resilver", tx, - "errors=%llu", spa_get_errlog_size(spa)); + "starting deferred resilver", tx, "errors=%llu", + (u_longlong_t)spa_get_errlog_size(spa)); spa_async_request(spa, SPA_ASYNC_RESILVER); } + + /* Clear recent error events (i.e. duplicate events tracking) */ + if (complete) + zfs_ereport_clear(spa, NULL); } scn->scn_phys.scn_end_time = gethrestime_sec(); @@ -1036,6 +1072,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); + spa_notify_waiters(spa); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_scan_is_paused_scrub(scn)) { @@ -1068,7 +1105,7 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) /* start a new scan, or restart an existing one. 
*/ void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; @@ -1081,7 +1118,8 @@ dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) } else { dp->dp_scan->scn_restart_txg = txg; } - zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg); + zfs_dbgmsg("restarting resilver for %s at txg=%llu", + dp->dp_spa->spa_name, (longlong_t)txg); } void @@ -1216,10 +1254,13 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) static boolean_t dsl_scan_should_clear(dsl_scan_t *scn) { + spa_t *spa = scn->scn_dp->dp_spa; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - uint64_t mlim_hard, mlim_soft, mused; - uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( - scn->scn_dp->dp_spa)); + uint64_t alloc, mlim_hard, mlim_soft, mused; + + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, zfs_scan_mem_lim_min); @@ -1235,8 +1276,8 @@ dsl_scan_should_clear(dsl_scan_t *scn) queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* # extents in exts_by_size = # in exts_by_addr */ - mused += avl_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_t) + queue->q_sio_memused; + mused += zfs_btree_numnodes(&queue->q_exts_by_size) * + sizeof (range_seg_gap_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } @@ -1272,8 +1313,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ - /* We only know how to resume from level-0 blocks. */ - if (zb && zb->zb_level != 0) + /* We only know how to resume from level-0 and objset blocks. 
*/ + if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL)) return (B_FALSE); /* @@ -1304,7 +1345,16 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { - if (zb) { + if (zb && zb->zb_level == ZB_ROOT_LEVEL) { + dprintf("suspending at first available bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + zb->zb_objset, 0, 0, 0); + } else if (zb != NULL) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, @@ -1335,7 +1385,8 @@ typedef struct zil_scan_arg { /* ARGSUSED */ static int -dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, + uint64_t claim_txg) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; @@ -1343,6 +1394,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; + ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); @@ -1364,17 +1416,19 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) /* ARGSUSED */ static int -dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, + uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; + const lr_write_t *lr = (const lr_write_t *)lrc; + const blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; + 
ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); @@ -1519,7 +1573,7 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; - if (zfs_no_scrub_prefetch) + if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || @@ -1585,7 +1639,7 @@ dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, scan_prefetch_ctx_rele(spc, FTAG); } -void +static void dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *private) { @@ -1771,6 +1825,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; + ASSERT(!BP_IS_REDACTED(bp)); + if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; @@ -1908,7 +1964,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, /* * This debugging is commented out to conserve stack space. This - * function is called recursively and the debugging addes several + * function is called recursively and the debugging adds several * bytes to the stack for each call. It can be commented back in * if required to debug an issue in dsl_scan_visitbp(). 
* @@ -1924,6 +1980,12 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } + if (BP_IS_REDACTED(bp)) { + ASSERT(dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)); + return; + } + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; @@ -2009,18 +2071,20 @@ ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) */ scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; - zfs_dbgmsg("destroying ds %llu; currently traversing; " - "reset zb_objset to %llu", + zfs_dbgmsg("destroying ds %llu on %s; currently " + "traversing; reset zb_objset to %llu", (u_longlong_t)ds->ds_object, + ds->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); - zfs_dbgmsg("destroying ds %llu; currently traversing; " - "reset bookmark to -1,0,0,0", - (u_longlong_t)ds->ds_object); + zfs_dbgmsg("destroying ds %llu on %s; currently " + "traversing; reset bookmark to -1,0,0,0", + (u_longlong_t)ds->ds_object, + ds->ds_dir->dd_pool->dp_spa->spa_name); } } } @@ -2071,14 +2135,17 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); - zfs_dbgmsg("destroying ds %llu; in queue; " + zfs_dbgmsg("destroying ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, + dp->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { - zfs_dbgmsg("destroying ds %llu; in queue; removing", - (u_longlong_t)ds->ds_object); + zfs_dbgmsg("destroying ds %llu on %s; in queue; " + "removing", + (u_longlong_t)ds->ds_object, + dp->dp_spa->spa_name); } } @@ -2095,9 +2162,10 @@ ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) if (scn_bookmark->zb_objset == ds->ds_object) { scn_bookmark->zb_objset = 
dsl_dataset_phys(ds)->ds_prev_snap_obj; - zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, + ds->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } } @@ -2136,9 +2204,10 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); - zfs_dbgmsg("snapshotting ds %llu; in queue; " + zfs_dbgmsg("snapshotting ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, + dp->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } @@ -2151,30 +2220,33 @@ ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, { if (scn_bookmark->zb_objset == ds1->ds_object) { scn_bookmark->zb_objset = ds2->ds_object; - zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, + ds1->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)ds2->ds_object); } else if (scn_bookmark->zb_objset == ds2->ds_object) { scn_bookmark->zb_objset = ds1->ds_object; - zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, + ds2->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)ds1->ds_object); } } /* - * Called when a parent dataset and its clone are swapped. If we were + * Called when an origin dataset and its clone are swapped. If we were * currently traversing the dataset, we need to switch to traversing the - * newly promoted parent. + * newly promoted clone. 
*/ void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; + uint64_t mintxg1, mintxg2; + boolean_t ds1_queued, ds2_queued; if (!dsl_scan_is_running(scn)) return; @@ -2182,47 +2254,86 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); - if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds1->ds_object); - scan_ds_queue_insert(scn, ds2->ds_object, mintxg); + /* + * Handle the in-memory scan queue. + */ + ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); + ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } - if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds2->ds_object); - scan_ds_queue_insert(scn, ds1->ds_object, mintxg); + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds1->ds_object, &mintxg) == 0) { - int err; - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * The swapping code below would not handle this case correctly, + * since we can't insert ds2 if it is already there. That's + * because scan_ds_queue_insert() prohibits a duplicate insert + * and panics. 
+ */ + } else if (ds1_queued) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); + } else if (ds2_queued) { + scan_ds_queue_remove(scn, ds2->ds_object); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); + } + + /* + * Handle the on-disk scan queue. + * The on-disk state is an out-of-date version of the in-memory state, + * so the in-memory and on-disk values for ds1_queued and ds2_queued may + * be different. Therefore we need to apply the swap logic to the + * on-disk state independently of the in-memory state. + */ + ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; + ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * Alternatively, we could check for EEXIST from + * zap_add_int_key() and back out to the original state, but + * that would be more work than checking for this case upfront. 
+ */ + } else if (ds1_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); - err = zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); - VERIFY(err == 0 || err == EEXIST); - if (err == EEXIST) { - /* Both were there to begin with */ - VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, - ds1->ds_object, mintxg, tx)); - } - zfs_dbgmsg("clone_swap ds %llu; in queue; " + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); + zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, + dp->dp_spa->spa_name, (u_longlong_t)ds2->ds_object); - } - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds2->ds_object, &mintxg) == 0) { - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + } else if (ds2_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); - VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); - zfs_dbgmsg("clone_swap ds %llu; in queue; " + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); + zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, + dp->dp_spa->spa_name, (u_longlong_t)ds1->ds_object); } @@ -2357,7 +2468,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * If we did not completely visit this dataset, do another pass. 
*/ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { - zfs_dbgmsg("incomplete pass; visiting again"); + zfs_dbgmsg("incomplete pass on %s; visiting again", + dp->dp_spa->spa_name); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; scan_ds_queue_insert(scn, ds->ds_object, scn->scn_phys.scn_cur_max_txg); @@ -2556,8 +2668,8 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) break; } - zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " - "suspending=%u", (longlong_t)n, + zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; " + "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); ASSERT(error == 0 || error == ENOENT); @@ -2665,22 +2777,16 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) } static uint64_t -dsl_scan_count_leaves(vdev_t *vd) +dsl_scan_count_data_disks(vdev_t *rvd) { uint64_t i, leaves = 0; - /* we only count leaves that belong to the main pool and are readable */ - if (vd->vdev_islog || vd->vdev_isspare || - vd->vdev_isl2cache || !vdev_readable(vd)) - return (0); - - if (vd->vdev_ops->vdev_op_leaf) - return (1); - - for (i = 0; i < vd->vdev_children; i++) { - leaves += dsl_scan_count_leaves(vd->vdev_child[i]); + for (i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache) + continue; + leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd); } - return (leaves); } @@ -2783,7 +2889,7 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; - SIO_SET_OFFSET(srch_sio, rs->rs_start); + SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, @@ -2795,10 +2901,12 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != 
NULL && - SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { - ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); - ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); + while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr) && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, + queue->q_exts_by_addr)); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, + queue->q_exts_by_addr)); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); @@ -2816,16 +2924,19 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ - if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr)) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); + SIO_GET_OFFSET(sio), rs_get_end(rs, + queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { - range_tree_remove(queue->q_exts_by_addr, rs->rs_start, - rs->rs_end - rs->rs_start); + uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); + uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); + range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); return (B_FALSE); } } @@ -2845,6 +2956,7 @@ static range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; + range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); @@ -2852,9 +2964,26 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) /* handle tunable overrides */ if (scn->scn_checkpointing || scn->scn_clearing) { if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(queue->q_exts_by_addr)); 
+ return (range_tree_first(rt)); } else if (zfs_scan_issue_strategy == 2) { - return (avl_first(&queue->q_exts_by_size)); + /* + * We need to get the original entry in the by_addr + * tree so we can modify it. + */ + range_seg_t *size_rs = + zfs_btree_first(&queue->q_exts_by_size, NULL); + if (size_rs == NULL) + return (NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, + size); + ASSERT3P(addr_rs, !=, NULL); + ASSERT3U(rs_get_start(size_rs, rt), ==, + rs_get_start(addr_rs, rt)); + ASSERT3U(rs_get_end(size_rs, rt), ==, + rs_get_end(addr_rs, rt)); + return (addr_rs); } } @@ -2868,9 +2997,24 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) * In this case, we instead switch to issuing extents in LBA order. */ if (scn->scn_checkpointing) { - return (range_tree_first(queue->q_exts_by_addr)); + return (range_tree_first(rt)); } else if (scn->scn_clearing) { - return (avl_first(&queue->q_exts_by_size)); + /* + * We need to get the original entry in the by_addr + * tree so we can modify it. 
+ */ + range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, + NULL); + if (size_rs == NULL) + return (NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, size); + ASSERT3P(addr_rs, !=, NULL); + ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs, + rt)); + ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt)); + return (addr_rs); } else { return (NULL); } @@ -2885,8 +3029,6 @@ scan_io_queues_run_one(void *arg) range_seg_t *rs = NULL; scan_io_t *sio = NULL; list_t sio_list; - uint64_t bytes_per_leaf = zfs_scan_vdev_limit; - uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); ASSERT(queue->q_scn->scn_is_sorted); @@ -2894,9 +3036,9 @@ scan_io_queues_run_one(void *arg) offsetof(scan_io_t, sio_nodes.sio_list_node)); mutex_enter(q_lock); - /* calculate maximum in-flight bytes for this txg (min 1MB) */ - queue->q_maxinflight_bytes = - MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + /* Calculate maximum in-flight bytes for this vdev. 
*/ + queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit * + (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd))); /* reset per-queue scan statistics for this txg */ queue->q_total_seg_size_this_txg = 0; @@ -3025,8 +3167,15 @@ dsl_scan_async_block_should_pause(dsl_scan_t *scn) if (zfs_recover) return (B_FALSE); - if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) + if (zfs_async_block_max_blocks != 0 && + scn->scn_visited_this_txg >= zfs_async_block_max_blocks) { return (B_TRUE); + } + + if (zfs_max_async_dedup_frees != 0 && + scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) { + return (B_TRUE); + } elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || @@ -3052,6 +3201,8 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; + if (BP_GET_DEDUP(bp)) + scn->scn_dedup_frees_this_txg++; return (0); } @@ -3091,8 +3242,18 @@ dsl_scan_update_stats(dsl_scan_t *scn) } static int -dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { + ASSERT(!bp_freed); + return (dsl_scan_free_block_cb(arg, bp, tx)); +} + +static int +dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); dsl_scan_t *scn = arg; const dva_t *dva = &bp->blk_dva[0]; @@ -3111,6 +3272,7 @@ dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; + boolean_t clones_left; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); @@ -3124,7 +3286,8 @@ dsl_scan_active(dsl_scan_t *scn) (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } - return (used != 0); + clones_left = spa_livelist_delete_check(spa); + return ((used != 0) || (clones_left)); 
} static boolean_t @@ -3178,20 +3341,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, return (B_TRUE); } - /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. - */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); - /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* @@ -3221,7 +3377,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_block_cb, scn, tx); + bpobj_dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; @@ -3274,12 +3430,13 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj/bptree txg %llu; err=%u", + "free_bpobj/bptree on %s in txg %llu; err=%u", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), - (longlong_t)tx->tx_txg, err); + spa->spa_name, (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; + scn->scn_dedup_frees_this_txg = 0; /* * Write out changes to the DDT that may be required as a @@ -3318,13 +3475,16 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } - if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && + !spa_livelist_delete_check(spa)) { /* finished; verify that space accounting went to zero */ 
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } + spa_notify_waiters(spa); + EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ)); @@ -3348,7 +3508,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we - * cna guarantee that blocks we are currently scanning will not change out + * can guarantee that blocks we are currently scanning will not change out * from under us. While a scan is active, this function controls how quickly * transaction groups proceed, instead of the normal handling provided by * txg_sync_thread(). @@ -3378,8 +3538,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; - zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, (longlong_t)tx->tx_txg); + zfs_dbgmsg("restarting scan func=%u on %s txg=%llu", + func, dp->dp_spa->spa_name, (longlong_t)tx->tx_txg); dsl_scan_setup_sync(&func, tx); } @@ -3405,6 +3565,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) /* reset scan statistics */ scn->scn_visited_this_txg = 0; + scn->scn_dedup_frees_this_txg = 0; scn->scn_holes_this_txg = 0; scn->scn_lt_min_this_txg = 0; scn->scn_gt_max_this_txg = 0; @@ -3491,17 +3652,20 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) ddi_get_lbolt() - scn->scn_last_checkpoint > SEC_TO_TICK(zfs_scan_checkpoint_intval)) { if (!scn->scn_checkpointing) - zfs_dbgmsg("begin scan checkpoint"); + zfs_dbgmsg("begin scan checkpoint for %s", + spa->spa_name); scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; } else { boolean_t should_clear = dsl_scan_should_clear(scn); if (should_clear && 
!scn->scn_clearing) { - zfs_dbgmsg("begin scan clearing"); + zfs_dbgmsg("begin scan clearing for %s", + spa->spa_name); scn->scn_clearing = B_TRUE; } else if (!should_clear && scn->scn_clearing) { - zfs_dbgmsg("finish scan clearing"); + zfs_dbgmsg("finish scan clearing for %s", + spa->spa_name); scn->scn_clearing = B_FALSE; } } @@ -3514,30 +3678,30 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) /* Need to scan metadata for more blocks to scrub */ dsl_scan_phys_t *scnp = &scn->scn_phys; taskqid_t prefetch_tqid; - uint64_t bytes_per_leaf = zfs_scan_vdev_limit; - uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); /* * Recalculate the max number of in-flight bytes for pool-wide * scanning operations (minimum 1MB). Limits for the issuing * phase are done per top-level vdev and are handled separately. */ - scn->scn_maxinflight_bytes = - MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * + dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); - zfs_dbgmsg("doing scan sync txg %llu; " + zfs_dbgmsg("doing scan sync for %s txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", + spa->spa_name, (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } else { - zfs_dbgmsg("doing scan sync txg %llu; " + zfs_dbgmsg("doing scan sync for %s txg %llu; " "bm=%llu/%llu/%llu/%llu", + spa->spa_name, (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_bookmark.zb_objset, (longlong_t)scnp->scn_bookmark.zb_object, @@ -3566,10 +3730,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; - zfs_dbgmsg("scan visited %llu blocks in %llums " + zfs_dbgmsg("scan visited %llu blocks of %s in %llums " "(%llu os's, %llu holes, %llu < 
mintxg, " "%llu in ddt, %llu > maxtxg)", (longlong_t)scn->scn_visited_this_txg, + spa->spa_name, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)scn->scn_objsets_visited_this_txg, @@ -3585,7 +3750,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; } - zfs_dbgmsg("scan complete txg %llu", + zfs_dbgmsg("scan complete for %s txg %llu", + spa->spa_name, (longlong_t)tx->tx_txg); } } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { @@ -3602,9 +3768,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) (void) dsl_scan_should_clear(scn); dsl_scan_update_stats(scn); - zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums " - "(avg_block_size = %llu, avg_seg_size = %llu)", + zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) " + "in %llums (avg_block_size = %llu, avg_seg_size = %llu)", (longlong_t)scn->scn_zios_this_txg, + spa->spa_name, (longlong_t)scn->scn_segs_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), @@ -3612,8 +3779,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) (longlong_t)scn->scn_avg_seg_size_this_txg); } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { /* Finished with everything. Mark the scrub as complete */ - zfs_dbgmsg("scan issuing complete txg %llu", - (longlong_t)tx->tx_txg); + zfs_dbgmsg("scan issuing complete txg %llu for %s", + (longlong_t)tx->tx_txg, + spa->spa_name); ASSERT3U(scn->scn_done_txg, !=, 0); ASSERT0(spa->spa_scrub_inflight); ASSERT0(scn->scn_bytes_pending); @@ -3833,7 +4001,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, /* * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. + * zpool(8) status can make useful progress reports. 
*/ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); @@ -3899,9 +4067,8 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, size_t size = BP_GET_PSIZE(bp); abd_t *data = abd_alloc_for_io(size, B_FALSE); - ASSERT3U(scn->scn_maxinflight_bytes, >, 0); - if (queue == NULL) { + ASSERT3U(scn->scn_maxinflight_bytes, >, 0); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); @@ -3910,6 +4077,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, } else { kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; + ASSERT3U(queue->q_maxinflight_bytes, >, 0); mutex_enter(q_lock); while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) cv_wait(&queue->q_zio_cv, q_lock); @@ -3952,14 +4120,15 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards * extents that are more completely filled (in a 3:2 ratio) vs just larger. * Note that as an optimization, we replace multiplication and division by - * 100 with bitshifting by 7 (which effecitvely multiplies and divides by 128). + * 100 with bitshifting by 7 (which effectively multiplies and divides by 128). 
*/ static int ext_size_compare(const void *x, const void *y) { - const range_seg_t *rsa = x, *rsb = y; - uint64_t sa = rsa->rs_end - rsa->rs_start, - sb = rsb->rs_end - rsb->rs_start; + const range_seg_gap_t *rsa = x, *rsb = y; + + uint64_t sa = rsa->rs_end - rsa->rs_start; + uint64_t sb = rsb->rs_end - rsb->rs_start; uint64_t score_a, score_b; score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * @@ -3988,7 +4157,7 @@ sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; - return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); + return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. */ @@ -4002,8 +4171,8 @@ scan_io_queue_create(vdev_t *vd) q->q_vd = vd; q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, - &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); + q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, + &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); @@ -4180,74 +4349,95 @@ dsl_scan_freed(spa_t *spa, const blkptr_t *bp) dsl_scan_freed_dva(spa, bp, i); } -#if defined(_KERNEL) -/* CSTYLED */ -module_param(zfs_scan_vdev_limit, ulong, 0644); -MODULE_PARM_DESC(zfs_scan_vdev_limit, +/* + * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has + * not started, start it. Otherwise, only restart if max txg in DTL range is + * greater than the max txg in the current scan. If the DTL max is less than + * the scan max, then the vdev has not missed any new data since the resilver + * started, so a restart is not needed. 
+ */ +void +dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) +{ + uint64_t min, max; + + if (!vdev_resilver_needed(vd, &min, &max)) + return; + + if (!dsl_scan_resilvering(dp)) { + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); + return; + } + + if (max <= dp->dp_scan->scn_phys.scn_max_txg) + return; + + /* restart is needed, check if it can be deferred */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) + vdev_defer_resilver(vd); + else + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW, "Max bytes in flight per leaf vdev for scrubs and resilvers"); -module_param(zfs_scrub_min_time_ms, int, 0644); -MODULE_PARM_DESC(zfs_scrub_min_time_ms, "Min millisecs to scrub per txg"); +ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW, + "Min millisecs to scrub per txg"); -module_param(zfs_obsolete_min_time_ms, int, 0644); -MODULE_PARM_DESC(zfs_obsolete_min_time_ms, "Min millisecs to obsolete per txg"); +ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW, + "Min millisecs to obsolete per txg"); -module_param(zfs_free_min_time_ms, int, 0644); -MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg"); +ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW, + "Min millisecs to free per txg"); -module_param(zfs_resilver_min_time_ms, int, 0644); -MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg"); +ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW, + "Min millisecs to resilver per txg"); -module_param(zfs_scan_suspend_progress, int, 0644); -MODULE_PARM_DESC(zfs_scan_suspend_progress, +ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, "Set to prevent scans from progressing"); -module_param(zfs_no_scrub_io, int, 0644); -MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O"); +ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, + "Set to disable scrub I/O"); 
-module_param(zfs_no_scrub_prefetch, int, 0644); -MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching"); +ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, + "Set to disable scrub prefetching"); -/* CSTYLED */ -module_param(zfs_async_block_max_blocks, ulong, 0644); -MODULE_PARM_DESC(zfs_async_block_max_blocks, +ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW, "Max number of blocks freed in one txg"); -module_param(zfs_free_bpobj_enabled, int, 0644); -MODULE_PARM_DESC(zfs_free_bpobj_enabled, "Enable processing of the free_bpobj"); +ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW, + "Max number of dedup blocks freed in one txg"); -module_param(zfs_scan_mem_lim_fact, int, 0644); -MODULE_PARM_DESC(zfs_scan_mem_lim_fact, "Fraction of RAM for scan hard limit"); +ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, + "Enable processing of the free_bpobj"); -module_param(zfs_scan_issue_strategy, int, 0644); -MODULE_PARM_DESC(zfs_scan_issue_strategy, - "IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW, + "Fraction of RAM for scan hard limit"); -module_param(zfs_scan_legacy, int, 0644); -MODULE_PARM_DESC(zfs_scan_legacy, "Scrub using legacy non-sequential method"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW, + "IO issuing strategy during scrubbing. 
" + "0 = default, 1 = LBA, 2 = size"); -module_param(zfs_scan_checkpoint_intval, int, 0644); -MODULE_PARM_DESC(zfs_scan_checkpoint_intval, +ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, + "Scrub using legacy non-sequential method"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW, "Scan progress on-disk checkpointing interval"); -/* CSTYLED */ -module_param(zfs_scan_max_ext_gap, ulong, 0644); -MODULE_PARM_DESC(zfs_scan_max_ext_gap, +ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW, "Max gap in bytes between sequential scrub / resilver I/Os"); -module_param(zfs_scan_mem_lim_soft_fact, int, 0644); -MODULE_PARM_DESC(zfs_scan_mem_lim_soft_fact, +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW, "Fraction of hard limit used as soft limit"); -module_param(zfs_scan_strict_mem_lim, int, 0644); -MODULE_PARM_DESC(zfs_scan_strict_mem_lim, +ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, "Tunable to attempt to reduce lock contention"); -module_param(zfs_scan_fill_weight, int, 0644); -MODULE_PARM_DESC(zfs_scan_fill_weight, +ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); -module_param(zfs_resilver_disable_defer, int, 0644); -MODULE_PARM_DESC(zfs_resilver_disable_defer, +ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); -#endif +/* END CSTYLED */ diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index b63ce5cad9..148e8fff24 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -41,7 +41,7 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx) static int dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, void *arg, + dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, int blocks_modified, zfs_space_check_t space_check, boolean_t early) { spa_t *spa; @@ -85,6 +85,11 @@ top: dmu_tx_commit(tx); + if 
(sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) { + /* current contract is to call func once */ + sigfunc(arg, tx); + sigfunc = NULL; /* in case we're performing an EAGAIN retry */ + } txg_wait_synced(dp, dst.dst_txg); if (dst.dst_error == EAGAIN) { @@ -124,7 +129,7 @@ dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified, zfs_space_check_t space_check) { - return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg, + return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, blocks_modified, space_check, B_FALSE)); } @@ -138,7 +143,7 @@ dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, * For that reason, early synctasks can affect the process of writing dirty * changes to disk for the txg that they run and should be used with caution. * In addition, early synctasks should not dirty any metaslabs as this would - * invalidate the precodition/invariant for subsequent early synctasks. + * invalidate the precondition/invariant for subsequent early synctasks. * [see dsl_pool_sync() and dsl_early_sync_task_verify()] */ int @@ -146,21 +151,32 @@ dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified, zfs_space_check_t space_check) { - return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg, + return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, blocks_modified, space_check, B_TRUE)); } +/* + * A standard synctask that can be interrupted from a signal. The sigfunc + * is called once if a signal occurred while waiting for the task to sync. 
+ */ +int +dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg, + blocks_modified, space_check, B_FALSE)); +} + static void dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, - boolean_t early) + dmu_tx_t *tx, boolean_t early) { dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); dst->dst_pool = dp; dst->dst_txg = dmu_tx_get_txg(tx); - dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; - dst->dst_space_check = space_check; + dst->dst_space_check = ZFS_SPACE_CHECK_NONE; dst->dst_checkfunc = dsl_null_checkfunc; dst->dst_syncfunc = syncfunc; dst->dst_arg = arg; @@ -174,18 +190,16 @@ dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, void dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) + dmu_tx_t *tx) { - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_FALSE); + dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_FALSE); } void dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) + dmu_tx_t *tx) { - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_TRUE); + dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_TRUE); } /* diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index 638805d0b9..75d153194a 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -101,9 +101,9 @@ dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) size_t len = strlen(nvpair_name(pair)) + strlen(fnvpair_value_string(pair)); char *nameval = kmem_zalloc(len + 2, 
KM_SLEEP); - (void) strcpy(nameval, nvpair_name(pair)); - (void) strcat(nameval, "@"); - (void) strcat(nameval, fnvpair_value_string(pair)); + (void) strlcpy(nameval, nvpair_name(pair), len + 2); + (void) strlcat(nameval, "@", len + 2); + (void) strlcat(nameval, fnvpair_value_string(pair), len + 2); fnvlist_add_string(tmp_holds, nameval, ""); kmem_free(nameval, len + 2); } @@ -197,7 +197,7 @@ dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, spa_history_log_internal_ds(ds, "hold", tx, "tag=%s temp=%d refs=%llu", - htag, minor != 0, ds->ds_userrefs); + htag, minor != 0, (u_longlong_t)ds->ds_userrefs); } typedef struct zfs_hold_cleanup_arg { @@ -302,7 +302,7 @@ dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) * holds is nvl of snapname -> holdname * errlist will be filled in with snapname -> error * - * The snaphosts must all be in the same pool. + * The snapshots must all be in the same pool. * * Holds for snapshots that don't exist will be skipped. * @@ -406,7 +406,7 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, snapname, holdname); fnvlist_add_int32(ddura->ddura_errlist, errtag, ENOENT); - strfree(errtag); + kmem_strfree(errtag); } continue; } @@ -556,9 +556,9 @@ dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) * errlist will be filled in with snapname -> error * * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, - * otherwise they should be the names of shapshots. + * otherwise they should be the names of snapshots. * - * As a release may cause snapshots to be destroyed this trys to ensure they + * As a release may cause snapshots to be destroyed this tries to ensure they * aren't mounted. * * The release of non-existent holds are skipped. 
diff --git a/module/zfs/edonr_zfs.c b/module/zfs/edonr_zfs.c index e92da6d6c1..aa00e1c941 100644 --- a/module/zfs/edonr_zfs.c +++ b/module/zfs/edonr_zfs.c @@ -27,8 +27,8 @@ */ #include #include +#include #include -#include /* For CTASSERT() */ #include #define EDONR_MODE 512 diff --git a/module/zfs/fm.c b/module/zfs/fm.c index cc5225dcbb..b8a1c7c8a5 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -31,7 +31,7 @@ * Name-Value Pair Lists * * The embodiment of an FMA protocol element (event, fmri or authority) is a - * name-value pair list (nvlist_t). FMA-specific nvlist construtor and + * name-value pair list (nvlist_t). FMA-specific nvlist constructor and * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used * to create an nvpair list using custom allocators. Callers may choose to * allocate either from the kernel memory allocator, or from a preallocated @@ -66,14 +66,9 @@ #ifdef _KERNEL #include #include -#include -#include -#include #include -int zfs_zevent_len_max = 0; -int zfs_zevent_cols = 80; -int zfs_zevent_console = 0; +int zfs_zevent_len_max = 512; static int zevent_len_cur = 0; static int zevent_waiters = 0; @@ -105,320 +100,21 @@ struct erpt_kstat { kstat_named_t erpt_set_failed; /* num erpt set failures */ kstat_named_t fmri_set_failed; /* num fmri set failures */ kstat_named_t payload_set_failed; /* num payload set failures */ + kstat_named_t erpt_duplicates; /* num duplicate erpts */ }; static struct erpt_kstat erpt_kstat_data = { { "erpt-dropped", KSTAT_DATA_UINT64 }, { "erpt-set-failed", KSTAT_DATA_UINT64 }, { "fmri-set-failed", KSTAT_DATA_UINT64 }, - { "payload-set-failed", KSTAT_DATA_UINT64 } + { "payload-set-failed", KSTAT_DATA_UINT64 }, + { "erpt-duplicates", KSTAT_DATA_UINT64 } }; kstat_t *fm_ksp; #ifdef _KERNEL -/* - * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of - * output so they aren't split across console lines, and return the end column. 
- */ -/*PRINTFLIKE4*/ -static int -fm_printf(int depth, int c, int cols, const char *format, ...) -{ - va_list ap; - int width; - char c1; - - va_start(ap, format); - width = vsnprintf(&c1, sizeof (c1), format, ap); - va_end(ap); - - if (c + width >= cols) { - console_printf("\n"); - c = 0; - if (format[0] != ' ' && depth > 0) { - console_printf(" "); - c++; - } - } - - va_start(ap, format); - console_vprintf(format, ap); - va_end(ap); - - return ((c + width) % cols); -} - -/* - * Recursively print an nvlist in the specified column width and return the - * column we end up in. This function is called recursively by fm_nvprint(), - * below. We generically format the entire nvpair using hexadecimal - * integers and strings, and elide any integer arrays. Arrays are basically - * used for cache dumps right now, so we suppress them so as not to overwhelm - * the amount of console output we produce at panic time. This can be further - * enhanced as FMA technology grows based upon the needs of consumers. All - * FMA telemetry is logged using the dump device transport, so the console - * output serves only as a fallback in case this procedure is unsuccessful. - */ -static int -fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) -{ - nvpair_t *nvp; - - for (nvp = nvlist_next_nvpair(nvl, NULL); - nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { - - data_type_t type = nvpair_type(nvp); - const char *name = nvpair_name(nvp); - - boolean_t b; - uint8_t i8; - uint16_t i16; - uint32_t i32; - uint64_t i64; - char *str; - nvlist_t *cnv; - - if (strcmp(name, FM_CLASS) == 0) - continue; /* already printed by caller */ - - c = fm_printf(d, c, cols, " %s=", name); - - switch (type) { - case DATA_TYPE_BOOLEAN: - c = fm_printf(d + 1, c, cols, " 1"); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - (void) nvpair_value_boolean_value(nvp, &b); - c = fm_printf(d + 1, c, cols, b ? 
"1" : "0"); - break; - - case DATA_TYPE_BYTE: - (void) nvpair_value_byte(nvp, &i8); - c = fm_printf(d + 1, c, cols, "0x%x", i8); - break; - - case DATA_TYPE_INT8: - (void) nvpair_value_int8(nvp, (void *)&i8); - c = fm_printf(d + 1, c, cols, "0x%x", i8); - break; - - case DATA_TYPE_UINT8: - (void) nvpair_value_uint8(nvp, &i8); - c = fm_printf(d + 1, c, cols, "0x%x", i8); - break; - - case DATA_TYPE_INT16: - (void) nvpair_value_int16(nvp, (void *)&i16); - c = fm_printf(d + 1, c, cols, "0x%x", i16); - break; - - case DATA_TYPE_UINT16: - (void) nvpair_value_uint16(nvp, &i16); - c = fm_printf(d + 1, c, cols, "0x%x", i16); - break; - - case DATA_TYPE_INT32: - (void) nvpair_value_int32(nvp, (void *)&i32); - c = fm_printf(d + 1, c, cols, "0x%x", i32); - break; - - case DATA_TYPE_UINT32: - (void) nvpair_value_uint32(nvp, &i32); - c = fm_printf(d + 1, c, cols, "0x%x", i32); - break; - - case DATA_TYPE_INT64: - (void) nvpair_value_int64(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "0x%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_UINT64: - (void) nvpair_value_uint64(nvp, &i64); - c = fm_printf(d + 1, c, cols, "0x%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_HRTIME: - (void) nvpair_value_hrtime(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "0x%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_STRING: - (void) nvpair_value_string(nvp, &str); - c = fm_printf(d + 1, c, cols, "\"%s\"", - str ? 
str : ""); - break; - - case DATA_TYPE_NVLIST: - c = fm_printf(d + 1, c, cols, "["); - (void) nvpair_value_nvlist(nvp, &cnv); - c = fm_nvprintr(cnv, d + 1, c, cols); - c = fm_printf(d + 1, c, cols, " ]"); - break; - - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "["); - (void) nvpair_value_nvlist_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) { - c = fm_nvprintr(val[i], d + 1, c, cols); - } - c = fm_printf(d + 1, c, cols, " ]"); - } - break; - - case DATA_TYPE_INT8_ARRAY: { - int8_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_int8_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, "0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_UINT8_ARRAY: { - uint8_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_uint8_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, "0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_INT16_ARRAY: { - int16_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_int16_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, "0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_UINT16_ARRAY: { - uint16_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_uint16_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, "0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_INT32_ARRAY: { - int32_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_int32_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, 
"0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_UINT32_ARRAY: { - uint32_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_uint32_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, "0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_INT64_ARRAY: { - int64_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_int64_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, "0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_UINT64_ARRAY: { - uint64_t *val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "[ "); - (void) nvpair_value_uint64_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - c = fm_printf(d + 1, c, cols, "0x%llx ", - (u_longlong_t)val[i]); - - c = fm_printf(d + 1, c, cols, "]"); - break; - } - - case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_BYTE_ARRAY: - c = fm_printf(d + 1, c, cols, "[...]"); - break; - - case DATA_TYPE_UNKNOWN: - case DATA_TYPE_DONTCARE: - c = fm_printf(d + 1, c, cols, ""); - break; - } - } - - return (c); -} - -void -fm_nvprint(nvlist_t *nvl) -{ - char *class; - int c = 0; - - console_printf("\n"); - - if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0) - c = fm_printf(0, c, zfs_zevent_cols, "%s", class); - - if (fm_nvprintr(nvl, 0, c, zfs_zevent_cols) != 0) - console_printf("\n"); - - console_printf("\n"); -} - static zevent_t * zfs_zevent_alloc(void) { @@ -542,9 +238,6 @@ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) goto out; } - if (zfs_zevent_console) - fm_nvprint(nvl); - ev = zfs_zevent_alloc(); if (ev == NULL) { atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); @@ -569,6 +262,12 @@ out: return (error); } +void +zfs_zevent_track_duplicate(void) 
+{ + atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); +} + static int zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) { @@ -579,30 +278,29 @@ zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) return (0); } -int +zfs_file_t * zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) { - file_t *fp; - int error; - - fp = getf(fd); + zfs_file_t *fp = zfs_file_get(fd); if (fp == NULL) - return (SET_ERROR(EBADF)); + return (NULL); - error = zfsdev_getminor(fp->f_file, minorp); + int error = zfsdev_getminor(fp, minorp); if (error == 0) error = zfs_zevent_minor_to_state(*minorp, ze); - if (error) - zfs_zevent_fd_rele(fd); + if (error) { + zfs_zevent_fd_rele(fp); + fp = NULL; + } - return (error); + return (fp); } void -zfs_zevent_fd_rele(int fd) +zfs_zevent_fd_rele(zfs_file_t *fp) { - releasef(fd); + zfs_file_put(fp); } /* @@ -656,8 +354,7 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, #ifdef _KERNEL /* Include events dropped due to rate limiting */ - *dropped += ratelimit_dropped; - ratelimit_dropped = 0; + *dropped += atomic_swap_64(&ratelimit_dropped, 0); #endif ze->ze_dropped = 0; out: @@ -683,8 +380,7 @@ zfs_zevent_wait(zfs_zevent_t *ze) break; } - error = cv_timedwait_sig(&zevent_cv, &zevent_lock, - ddi_get_lbolt() + MSEC_TO_TICK(10)); + error = cv_wait_sig(&zevent_cv, &zevent_lock); if (signal_pending(current)) { error = SET_ERROR(EINTR); break; @@ -785,7 +481,7 @@ zfs_zevent_destroy(zfs_zevent_t *ze) #endif /* _KERNEL */ /* - * Wrapppers for FM nvlist allocators + * Wrappers for FM nvlist allocators */ /* ARGSUSED */ static void * @@ -1614,18 +1310,13 @@ fm_erpt_dropped_increment(void) { atomic_inc_64(&ratelimit_dropped); } -#endif -#ifdef _KERNEL void fm_init(void) { zevent_len_cur = 0; zevent_flags = 0; - if (zfs_zevent_len_max == 0) - zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4); - /* Initialize zevent allocation and generation kstats */ fm_ksp = kstat_create("zfs", 0, "fm", "misc", 
KSTAT_TYPE_NAMED, sizeof (struct erpt_kstat) / sizeof (kstat_named_t), @@ -1642,6 +1333,8 @@ fm_init(void) list_create(&zevent_list, sizeof (zevent_t), offsetof(zevent_t, ev_node)); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); + + zfs_ereport_init(); } void @@ -1649,6 +1342,8 @@ fm_fini(void) { int count; + zfs_ereport_fini(); + zfs_zevent_drain_all(&count); mutex_enter(&zevent_lock); @@ -1671,14 +1366,7 @@ fm_fini(void) fm_ksp = NULL; } } - -module_param(zfs_zevent_len_max, int, 0644); -MODULE_PARM_DESC(zfs_zevent_len_max, "Max event queue length"); - -module_param(zfs_zevent_cols, int, 0644); -MODULE_PARM_DESC(zfs_zevent_cols, "Max event column width"); - -module_param(zfs_zevent_console, int, 0644); -MODULE_PARM_DESC(zfs_zevent_console, "Log events to the console"); - #endif /* _KERNEL */ + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW, + "Max event queue length"); diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c index 5cac2a7de6..e2c6e59969 100644 --- a/module/zfs/gzip.c +++ b/module/zfs/gzip.c @@ -29,7 +29,8 @@ #include #include #include -#include "qat.h" +#include +#include #ifdef _KERNEL diff --git a/module/zfs/lz4.c b/module/zfs/lz4.c index c04cfa7ba7..4b46e69489 100644 --- a/module/zfs/lz4.c +++ b/module/zfs/lz4.c @@ -33,6 +33,7 @@ */ #include +#include static int real_LZ4_compress(const char *source, char *dest, int isize, int osize); @@ -207,7 +208,7 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, * Little Endian or Big Endian? * Note: overwrite the below #define if you know your architecture endianness. 
*/ -#if defined(_BIG_ENDIAN) +#if defined(_ZFS_BIG_ENDIAN) #define LZ4_BIG_ENDIAN 1 #else /* @@ -383,7 +384,7 @@ static inline int LZ4_NbCommonBytes(register U64 val) { #if defined(LZ4_BIG_ENDIAN) -#if defined(__GNUC__) && (GCC_VERSION >= 304) && \ +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clzll(val) >> 3); #else @@ -404,7 +405,7 @@ LZ4_NbCommonBytes(register U64 val) return (r); #endif #else -#if defined(__GNUC__) && (GCC_VERSION >= 304) && \ +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll(val) >> 3); #else @@ -426,7 +427,7 @@ static inline int LZ4_NbCommonBytes(register U32 val) { #if defined(LZ4_BIG_ENDIAN) -#if defined(__GNUC__) && (GCC_VERSION >= 304) && \ +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clz(val) >> 3); #else diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c index ae18467011..a478e64c51 100644 --- a/module/zfs/lzjb.c +++ b/module/zfs/lzjb.c @@ -37,6 +37,7 @@ */ #include +#include #define MATCH_BITS 6 #define MATCH_MIN 3 diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index ec89810b48..d1fee70f00 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -20,8 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. 
*/ @@ -31,11 +32,13 @@ #include #include #include +#include #include #include #include #include #include +#include #define WITH_DF_BLOCK_ALLOCATOR @@ -56,12 +59,21 @@ unsigned long metaslab_aliquot = 512 << 10; unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* - * Since we can touch multiple metaslabs (and their respective space maps) - * with each transaction group, we benefit from having a smaller space map + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered - * around the disk. + * around the disk. So a sane default for the space map block size + * is 8~16K. */ -int zfs_metaslab_sm_blksz = (1 << 12); +int zfs_metaslab_sm_blksz_no_log = (1 << 14); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. @@ -103,12 +115,27 @@ int zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their - * fragmenation metric (measured as a percentage) is less than or equal to - * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold - * then it will be skipped unless all metaslab groups within the metaslab - * class have also crossed this threshold. + * fragmentation metric (measured as a percentage) is less than or + * equal to zfs_mg_fragmentation_threshold. If a metaslab group + * exceeds this threshold then it will be skipped unless all metaslab + * groups within the metaslab class have also crossed this threshold. 
+ * + * This tunable was introduced to avoid edge cases where we continue + * allocating from very fragmented disks in our pool while other, less + * fragmented disks, exists. On the other hand, if all disks in the + * pool are uniformly approaching the threshold, the threshold can + * be a speed bump in performance, where we keep switching the disks + * that we allocate from (e.g. we allocate some segments from disk A + * making it bypassing the threshold while freeing segments from disk + * B getting its fragmentation below the threshold). + * + * Empirically, we've seen that our vdev selection for allocations is + * good enough that fragmentation increases uniformly across all vdevs + * the majority of the time. Thus we set the threshold percentage high + * enough to avoid hitting the speed bump on pools that are being pushed + * to the edge. */ -int zfs_mg_fragmentation_threshold = 85; +int zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation @@ -144,22 +171,57 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; */ int metaslab_df_free_pct = 4; +/* + * Maximum distance to search forward from the last offset. Without this + * limit, fragmented pools can see >100,000 iterations and + * metaslab_block_picker() becomes the performance limiting factor on + * high-performance storage. + * + * With the default setting of 16MB, we typically see less than 500 + * iterations, even with very fragmented, ashift=9 pools. The maximum number + * of iterations possible is: + * metaslab_df_max_search / (2 * (1<60KB (but fewer segments in this + * bucket, and therefore a lower weight). 
+ */ +int zfs_metaslab_find_max_tries = 100; + +static uint64_t metaslab_weight(metaslab_t *, boolean_t); +static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); -#ifdef _METASLAB_TRACING +static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); +static unsigned int metaslab_idx_func(multilist_t *, void *); +static void metaslab_evict(metaslab_t *, uint64_t); +static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); kmem_cache_t *metaslab_alloc_trace_cache; -#endif + +typedef struct metaslab_stats { + kstat_named_t metaslabstat_trace_over_limit; + kstat_named_t metaslabstat_reload_tree; + kstat_named_t metaslabstat_too_many_tries; + kstat_named_t metaslabstat_try_hard; +} metaslab_stats_t; + +static metaslab_stats_t metaslab_stats = { + { "trace_over_limit", KSTAT_DATA_UINT64 }, + { "reload_tree", KSTAT_DATA_UINT64 }, + { "too_many_tries", KSTAT_DATA_UINT64 }, + { "try_hard", KSTAT_DATA_UINT64 }, +}; + +#define METASLABSTAT_BUMP(stat) \ + atomic_inc_64(&metaslab_stats.stat.value.ui64); + + +kstat_t *metaslab_ksp; + +void +metaslab_stat_init(void) +{ + ASSERT(metaslab_alloc_trace_cache == NULL); + metaslab_alloc_trace_cache = kmem_cache_create( + "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", + "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (metaslab_ksp != NULL) { + metaslab_ksp->ks_data = &metaslab_stats; + kstat_install(metaslab_ksp); + } +} + +void +metaslab_stat_fini(void) +{ + if (metaslab_ksp != NULL) { + kstat_delete(metaslab_ksp); + metaslab_ksp = NULL; + } + + 
kmem_cache_destroy(metaslab_alloc_trace_cache); + metaslab_alloc_trace_cache = NULL; +} /* * ========================================================================== @@ -245,18 +410,19 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) { metaslab_class_t *mc; - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc = kmem_zalloc(offsetof(metaslab_class_t, + mc_allocator[spa->spa_alloc_count]), KM_SLEEP); mc->mc_spa = spa; - mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); - mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (zfs_refcount_t), KM_SLEEP); - mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (uint64_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) - zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); + multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), + offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); + for (int i = 0; i < spa->spa_alloc_count; i++) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + mca->mca_rotor = NULL; + zfs_refcount_create_tracked(&mca->mca_alloc_slots); + } return (mc); } @@ -264,20 +430,22 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) void metaslab_class_destroy(metaslab_class_t *mc) { - ASSERT(mc->mc_rotor == NULL); + spa_t *spa = mc->mc_spa; + ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); - for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) - zfs_refcount_destroy(&mc->mc_alloc_slots[i]); - kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * - sizeof (zfs_refcount_t)); - kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * - sizeof (uint64_t)); + for (int i = 0; i < spa->spa_alloc_count; i++) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + ASSERT(mca->mca_rotor == NULL); + zfs_refcount_destroy(&mca->mca_alloc_slots); + } mutex_destroy(&mc->mc_lock); - kmem_free(mc, sizeof 
(metaslab_class_t)); + multilist_destroy(&mc->mc_metaslab_txg_list); + kmem_free(mc, offsetof(metaslab_class_t, + mc_allocator[spa->spa_alloc_count])); } int @@ -292,7 +460,7 @@ metaslab_class_validate(metaslab_class_t *mc) ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mg = mc->mc_rotor) == NULL) + if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) return (0); do { @@ -301,7 +469,7 @@ metaslab_class_validate(metaslab_class_t *mc) ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); - } while ((mg = mg->mg_next) != mc->mc_rotor); + } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); return (0); } @@ -354,9 +522,10 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); + mutex_enter(&mc->mc_lock); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; + metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or @@ -367,13 +536,18 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) continue; } + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); + } + mutex_exit(&mc->mc_lock); kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } @@ -462,6 +636,51 @@ metaslab_class_expandable_space(metaslab_class_t *mc) return (space); } +void +metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) +{ + multilist_t *ml = &mc->mc_metaslab_txg_list; + for (int i = 0; i < multilist_get_num_sublists(ml); i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + metaslab_t 
*msp = multilist_sublist_head(mls); + multilist_sublist_unlock(mls); + while (msp != NULL) { + mutex_enter(&msp->ms_lock); + + /* + * If the metaslab has been removed from the list + * (which could happen if we were at the memory limit + * and it was evicted during this loop), then we can't + * proceed and we should restart the sublist. + */ + if (!multilist_link_active(&msp->ms_class_txg_node)) { + mutex_exit(&msp->ms_lock); + i--; + break; + } + mls = multilist_sublist_lock(ml, i); + metaslab_t *next_msp = multilist_sublist_next(mls, msp); + multilist_sublist_unlock(mls); + if (txg > + msp->ms_selected_txg + metaslab_unload_delay && + gethrtime() > msp->ms_selected_time + + (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + metaslab_evict(msp, txg); + } else { + /* + * Once we've hit a metaslab selected too + * recently to evict, we're done evicting for + * now. + */ + mutex_exit(&msp->ms_lock); + break; + } + mutex_exit(&msp->ms_lock); + msp = next_msp; + } + } +} + static int metaslab_compare(const void *x1, const void *x2) { @@ -492,74 +711,13 @@ metaslab_compare(const void *x1, const void *x2) if (sort1 > sort2) return (1); - int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); - IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); + IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); - return (AVL_CMP(m1->ms_start, m2->ms_start)); -} - -uint64_t -metaslab_allocated_space(metaslab_t *msp) -{ - return (msp->ms_allocated_space); -} - -/* - * Verify that the space accounting on disk matches the in-core range_trees. 
- */ -static void -metaslab_verify_space(metaslab_t *msp, uint64_t txg) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocating = 0; - uint64_t sm_free_space, msp_free_space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!msp->ms_condensing); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* - * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an - * allocated space map. Calling this in non-syncing context - * does not provide a consistent view of the metaslab since - * we're performing allocations in the future. - */ - if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || - !msp->ms_loaded) - return; - - /* - * Even though the smp_alloc field can get negative (e.g. - * see vdev_checkpoint_sm), that should never be the case - * when it come's to a metaslab's space map. - */ - ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); - - sm_free_space = msp->ms_size - metaslab_allocated_space(msp); - - /* - * Account for future allocations since we would have - * already deducted that space from the ms_allocatable. 
- */ - for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocating += - range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); - } - - ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); - - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); - - VERIFY3U(sm_free_space, ==, msp_free_space); + return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* @@ -650,21 +808,37 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mutex_exit(&mg->mg_lock); } +int +metaslab_sort_by_flushed(const void *va, const void *vb) +{ + const metaslab_t *a = va; + const metaslab_t *b = vb; + + int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + if (likely(cmp)) + return (cmp); + + uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; + uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; + cmp = TREE_CMP(a_vdev_id, b_vdev_id); + if (cmp) + return (cmp); + + return (TREE_CMP(a->ms_id, b->ms_id)); +} + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mg = kmem_zalloc(offsetof(metaslab_group_t, + mg_allocator[allocators]), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); - mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), - KM_SLEEP); - mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), - KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; @@ -672,13 +846,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg->mg_no_free_space = B_TRUE; 
mg->mg_allocators = allocators; - mg->mg_alloc_queue_depth = kmem_zalloc(allocators * - sizeof (zfs_refcount_t), KM_SLEEP); - mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * - sizeof (uint64_t), KM_SLEEP); for (int i = 0; i < allocators; i++) { - zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); - mg->mg_cur_max_alloc_queue_depth[i] = 0; + metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; + zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, @@ -701,34 +871,27 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); - kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); - kmem_free(mg->mg_secondaries, mg->mg_allocators * - sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); cv_destroy(&mg->mg_ms_disabled_cv); for (int i = 0; i < mg->mg_allocators; i++) { - zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); - mg->mg_cur_max_alloc_queue_depth[i] = 0; + metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; + zfs_refcount_destroy(&mga->mga_alloc_queue_depth); } - kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * - sizeof (zfs_refcount_t)); - kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * - sizeof (uint64_t)); - - kmem_free(mg, sizeof (metaslab_group_t)); + kmem_free(mg, offsetof(metaslab_group_t, + mg_allocator[mg->mg_allocators])); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; + spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; - ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); + ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); - ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); @@ -739,7 +902,7 @@ metaslab_group_activate(metaslab_group_t *mg) mg->mg_aliquot = 
metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); metaslab_group_alloc_update(mg); - if ((mgprev = mc->mc_rotor) == NULL) { + if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { @@ -749,7 +912,10 @@ metaslab_group_activate(metaslab_group_t *mg) mgprev->mg_next = mg; mgnext->mg_prev = mg; } - mc->mc_rotor = mg; + for (int i = 0; i < spa->spa_alloc_count; i++) { + mc->mc_allocator[i].mca_rotor = mg; + mg = mg->mg_next; + } } /* @@ -770,7 +936,8 @@ metaslab_group_passivate(metaslab_group_t *mg) (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { - ASSERT(mc->mc_rotor != mg); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); @@ -796,14 +963,15 @@ metaslab_group_passivate(metaslab_group_t *mg) spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { - metaslab_t *msp = mg->mg_primaries[i]; + metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; + metaslab_t *msp = mga->mga_primary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } - msp = mg->mg_secondaries[i]; + msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, @@ -816,12 +984,15 @@ metaslab_group_passivate(metaslab_group_t *mg) mgnext = mg->mg_next; if (mg == mgnext) { - mc->mc_rotor = NULL; + mgnext = NULL; } else { - mc->mc_rotor = mgnext; mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } + for (int i = 0; i < spa->spa_alloc_count; i++) { + if (mc->mc_allocator[i].mca_rotor == mg) + mc->mc_allocator[i].mca_rotor = mgnext; + } mg->mg_prev = NULL; mg->mg_next = NULL; @@ -839,16 +1010,22 @@ metaslab_group_initialized(metaslab_group_t *mg) uint64_t metaslab_group_get_space(metaslab_group_t *mg) { - 
return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); + /* + * Note that the number of nodes in mg_metaslab_tree may be one less + * than vdev_ms_count, due to the embedded log metaslab. + */ + mutex_enter(&mg->mg_lock); + uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); + mutex_exit(&mg->mg_lock); + return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; - vdev_t *vd = mg->mg_vd; - uint64_t ashift = vd->vdev_ashift; - int i; + avl_tree_t *t = &mg->mg_metaslab_tree; + uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; @@ -859,22 +1036,25 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT(msp != NULL); - - /* skip if not active or not a member */ - if (msp->ms_sm == NULL || msp->ms_group != mg) + mutex_enter(&mg->mg_lock); + for (metaslab_t *msp = avl_first(t); + msp != NULL; msp = AVL_NEXT(t, msp)) { + VERIFY3P(msp->ms_group, ==, mg); + /* skip if not active */ + if (msp->ms_sm == NULL) continue; - for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; + } } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + mutex_exit(&mg->mg_lock); + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } @@ -889,12 +1069,16 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) return; mutex_enter(&mg->mg_lock); + mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += 
msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } + mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } @@ -909,17 +1093,21 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) return; mutex_enter(&mg->mg_lock); + mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } + mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } @@ -948,6 +1136,14 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); + + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + multilist_sublist_unlock(mls); + msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } @@ -955,8 +1151,10 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); @@ -1046,7 +1244,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. 
However, if the allocation throttle is enabled then - * check if we have reached our allocation limit (mg_alloc_queue_depth) + * check if we have reached our allocation limit (mga_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest @@ -1054,9 +1252,9 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * regardless of the mg_allocatable or throttle settings. */ if (mg->mg_allocatable) { - metaslab_group_t *mgp; + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; int64_t qdepth; - uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; + uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); @@ -1075,8 +1273,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, */ qmax = qmax * (4 + d) / 4; - qdepth = zfs_refcount_count( - &mg->mg_alloc_queue_depth[allocator]); + qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); /* * If this metaslab group is below its qmax or it's @@ -1094,11 +1291,14 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * racy since we can't hold the locks for all metaslab * groups at the same time when we make this check. 
*/ - for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { - qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; + for (metaslab_group_t *mgp = mg->mg_next; + mgp != rotor; mgp = mgp->mg_next) { + metaslab_group_allocator_t *mgap = + &mgp->mg_allocator[allocator]; + qmax = mgap->mga_cur_max_alloc_queue_depth; qmax = qmax * (4 + d) / 4; - qdepth = zfs_refcount_count( - &mgp->mg_alloc_queue_depth[allocator]); + qdepth = + zfs_refcount_count(&mgap->mga_alloc_queue_depth); /* * If there is another metaslab group that @@ -1129,24 +1329,167 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, */ /* - * Comparison function for the private size-ordered tree. Tree is sorted - * by size, larger sizes at the end of the tree. + * Comparison function for the private size-ordered tree using 32-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ static int -metaslab_rangesize_compare(const void *x1, const void *x2) +metaslab_rangesize32_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = x1; - const range_seg_t *r2 = x2; + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - int cmp = AVL_CMP(rs_size1, rs_size2); + int cmp = TREE_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); - return (AVL_CMP(r1->rs_start, r2->rs_start)); + return (TREE_CMP(r1->rs_start, r2->rs_start)); } +/* + * Comparison function for the private size-ordered tree using 64-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
+ */ +static int +metaslab_rangesize64_compare(const void *x1, const void *x2) +{ + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + uint64_t rs_size1 = r1->rs_end - r1->rs_start; + uint64_t rs_size2 = r2->rs_end - r2->rs_start; + + int cmp = TREE_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(r1->rs_start, r2->rs_start)); +} +typedef struct metaslab_rt_arg { + zfs_btree_t *mra_bt; + uint32_t mra_floor_shift; +} metaslab_rt_arg_t; + +struct mssa_arg { + range_tree_t *rt; + metaslab_rt_arg_t *mra; +}; + +static void +metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) +{ + struct mssa_arg *mssap = arg; + range_tree_t *rt = mssap->rt; + metaslab_rt_arg_t *mrap = mssap->mra; + range_seg_max_t seg = {0}; + rs_set_start(&seg, rt, start); + rs_set_end(&seg, rt, start + size); + metaslab_rt_add(rt, &seg, mrap); +} + +static void +metaslab_size_tree_full_load(range_tree_t *rt) +{ + metaslab_rt_arg_t *mrap = rt->rt_arg; + METASLABSTAT_BUMP(metaslabstat_reload_tree); + ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); + mrap->mra_floor_shift = 0; + struct mssa_arg arg = {0}; + arg.rt = rt; + arg.mra = mrap; + range_tree_walk(rt, metaslab_size_sorted_add, &arg); +} + +/* + * Create any block allocator specific components. The current allocators + * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
+ */ +/* ARGSUSED */ +static void +metaslab_rt_create(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + size_t size; + int (*compare) (const void *, const void *); + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = metaslab_rangesize32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = metaslab_rangesize64_compare; + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, compare, size); + mrap->mra_floor_shift = metaslab_by_size_min_shift; +} + +/* ARGSUSED */ +static void +metaslab_rt_destroy(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + zfs_btree_destroy(size_tree); + kmem_free(mrap, sizeof (*mrap)); +} + +/* ARGSUSED */ +static void +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < + (1 << mrap->mra_floor_shift)) + return; + + zfs_btree_add(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << + mrap->mra_floor_shift)) + return; + + zfs_btree_remove(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_vacate(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + metaslab_rt_create(rt, arg); +} + +static range_tree_ops_t metaslab_rt_ops = { + .rtop_create = metaslab_rt_create, + .rtop_destroy = metaslab_rt_destroy, + .rtop_add = metaslab_rt_add, + .rtop_remove = metaslab_rt_remove, + .rtop_vacate = metaslab_rt_vacate +}; + /* * 
========================================================================== * Common allocator routines @@ -1157,107 +1500,166 @@ metaslab_rangesize_compare(const void *x1, const void *x2) * Return the maximum contiguous segment within the metaslab. */ uint64_t -metaslab_block_maxsize(metaslab_t *msp) +metaslab_largest_allocatable(metaslab_t *msp) { - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); + if (t == NULL) + return (0); + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); - return (rs->rs_end - rs->rs_start); + rs = zfs_btree_last(t, NULL); + if (rs == NULL) + return (0); + + return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, + msp->ms_allocatable)); +} + +/* + * Return the maximum contiguous segment within the unflushed frees of this + * metaslab. + */ +static uint64_t +metaslab_largest_unflushed_free(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (msp->ms_unflushed_frees == NULL) + return (0); + + if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_unflushed_frees); + range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, + NULL); + if (rs == NULL) + return (0); + + /* + * When a range is freed from the metaslab, that range is added to + * both the unflushed frees and the deferred frees. While the block + * will eventually be usable, if the metaslab were loaded the range + * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE + * txgs had passed. As a result, when attempting to estimate an upper + * bound for the largest currently-usable free segment in the + * metaslab, we need to not consider any ranges currently in the defer + * trees. This algorithm approximates the largest available chunk in + * the largest range in the unflushed_frees tree by taking the first + * chunk. 
While this may be a poor estimate, it should only remain so + * briefly and should eventually self-correct as frees are no longer + * deferred. Similar logic applies to the ms_freed tree. See + * metaslab_load() for more details. + * + * There are two primary sources of inaccuracy in this estimate. Both + * are tolerated for performance reasons. The first source is that we + * only check the largest segment for overlaps. Smaller segments may + * have more favorable overlaps with the other trees, resulting in + * larger usable chunks. Second, we only look at the first chunk in + * the largest segment; there may be other usable chunks in the + * largest segment, but we ignore them. + */ + uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); + uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, + rsize, &start, &size); + if (found) { + if (rstart == start) + return (0); + rsize = start - rstart; + } + } + + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + rsize, &start, &size); + if (found) + rsize = start - rstart; + + return (rsize); } static range_seg_t * -metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) +metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, + uint64_t size, zfs_btree_index_t *where) { - range_seg_t *rs, rsearch; - avl_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; - rsearch.rs_start = start; - rsearch.rs_end = start + size; + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, start + size); - rs = avl_find(t, &rsearch, &where); + rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { - rs = avl_nearest(t, where, AVL_AFTER); + rs = zfs_btree_next(t, where, where); } return (rs); } -#if defined(WITH_FF_BLOCK_ALLOCATOR) || \ - defined(WITH_DF_BLOCK_ALLOCATOR) || 
\ +#if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) + /* - * This is a helper function that can be used by the allocator to find - * a suitable block to allocate. This will search the specified AVL - * tree looking for a block that matches the specified criteria. + * This is a helper function that can be used by the allocator to find a + * suitable block to allocate. This will search the specified B-tree looking + * for a block that matches the specified criteria. */ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) +metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, + uint64_t max_search) { - range_seg_t *rs = metaslab_block_find(t, *cursor, size); + if (*cursor == 0) + *cursor = rt->rt_start; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; + range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); + uint64_t first_found; + int count_searched = 0; - while (rs != NULL) { - uint64_t offset = P2ROUNDUP(rs->rs_start, align); + if (rs != NULL) + first_found = rs_get_start(rs, rt); - if (offset + size <= rs->rs_end) { + while (rs != NULL && (rs_get_start(rs, rt) - first_found <= + max_search || count_searched < metaslab_min_search_count)) { + uint64_t offset = rs_get_start(rs, rt); + if (offset + size <= rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } - rs = AVL_NEXT(t, rs); + rs = zfs_btree_next(bt, &where, &where); + count_searched++; } - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. 
- */ - if (*cursor == 0) - return (-1ULL); - *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); + return (-1ULL); } -#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ - -#if defined(WITH_FF_BLOCK_ALLOCATOR) -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. - */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_allocatable->rt_root; - - return (metaslab_block_picker(t, cursor, size, align)); -} - -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; -#endif /* WITH_FF_BLOCK_ALLOCATOR */ +#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */ #if defined(WITH_DF_BLOCK_ALLOCATOR) /* * ========================================================================== - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * Dynamic Fit (df) block allocator + * + * Search for a free chunk of at least this size, starting from the last + * offset (for this alignment of block) looking for up to + * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not + * found within 16MB, then return a free chunk of exactly the requested size (or + * larger). 
+ * + * If it seems like searching from the last offset will be unproductive, skip + * that and just return a free chunk of exactly the requested size (or larger). + * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This + * mechanism is probably not very useful and may be removed in the future. + * + * The behavior when not searching can be changed to return the largest free + * chunk, instead of a free chunk of exactly the requested size, by setting + * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t @@ -1273,28 +1675,45 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &rt->rt_root; - uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); - - if (max_size < size) - return (-1ULL); /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). + * If we're running low on space, find a segment based on size, + * rather than iterating based on offset. 
*/ - if (max_size < metaslab_df_alloc_threshold || + if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = &msp->ms_allocatable_by_size; - *cursor = 0; + offset = -1; + } else { + offset = metaslab_block_picker(rt, + cursor, size, metaslab_df_max_search); } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + if (offset == -1) { + range_seg_t *rs; + if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + + if (metaslab_df_use_largest_segment) { + /* use largest free segment */ + rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); + } else { + zfs_btree_index_t where; + /* use segment of this size, or next largest */ + rs = metaslab_block_find(&msp->ms_allocatable_by_size, + rt, msp->ms_start, size, &where); + } + if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, + rt)) { + offset = rs_get_start(rs, rt); + *cursor = offset + size; + } + } + + return (offset); } static metaslab_ops_t metaslab_df_ops = { @@ -1318,25 +1737,27 @@ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + rs = zfs_btree_last(t, NULL); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < + size) return (-1ULL); - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; + *cursor = rs_get_start(rs, rt); + *cursor_end = 
rs_get_end(rs, rt); } offset = *cursor; @@ -1371,39 +1792,40 @@ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &msp->ms_allocatable->rt_root; - avl_index_t where; - range_seg_t *rs, rsearch; + zfs_btree_t *t = &msp->ms_allocatable->rt_root; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); + uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; + rs_set_start(&rsearch, rt, *cursor); + rs_set_end(&rsearch, rt, *cursor + size); - rs = avl_find(t, &rsearch, &where); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { + rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; - rsearch.rs_start = 0; - rsearch.rs_end = MIN(max_size, - 1ULL << (hbit + metaslab_ndf_clump_shift)); - rs = avl_find(t, &rsearch, &where); + rs_set_start(&rsearch, rt, 0); + rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + + metaslab_ndf_clump_shift))); + + rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); + rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } - if ((rs->rs_end - rs->rs_start) >= size) { - *cursor = rs->rs_start + size; - return (rs->rs_start); + if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { + *cursor = rs_get_start(rs, rt) + size; + return (rs_get_start(rs, rt)); } return (-1ULL); } @@ -1422,6 +1844,115 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; * 
========================================================================== */ +/* + * Wait for any in-progress metaslab loads to complete. + */ +static void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +/* + * Wait for any in-progress flushing to complete. + */ +static void +metaslab_flush_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_flushing) + cv_wait(&msp->ms_flush_cv, &msp->ms_lock); +} + +static unsigned int +metaslab_idx_func(multilist_t *ml, void *arg) +{ + metaslab_t *msp = arg; + + /* + * ms_id values are allocated sequentially, so full 64bit + * division would be a waste of time, so limit it to 32 bits. + */ + return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); +} + +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +static void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocating = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + /* + * Even though the smp_alloc field can get negative, + * when it comes to a metaslab's space map, that should + * never be the case. 
+ */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + ASSERT3U(space_map_allocated(msp->ms_sm), >=, + range_tree_space(msp->ms_unflushed_frees)); + + ASSERT3U(metaslab_allocated_space(msp), ==, + space_map_allocated(msp->ms_sm) + + range_tree_space(msp->ms_unflushed_allocs) - + range_tree_space(msp->ms_unflushed_frees)); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + + /* + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocating += + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); + } + ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, + msp->ms_allocating_total); + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + + msp->ms_deferspace + range_tree_space(msp->ms_freed); + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + static void metaslab_aux_histograms_clear(metaslab_t *msp) { @@ -1545,7 +2076,15 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; - /* see comment in metaslab_verify_unflushed_changes() */ + /* + * We can end up here from vdev_remove_complete(), in which case we + * cannot do these assertions because we hold spa config locks and + * thus we are not allowed to read from the DMU. + * + * We check if the metaslab group has been removed and if that's + * the case we return immediately as that would mean that we are + * here from the aforementioned code path. + */ if (msp->ms_group == NULL) return; @@ -1591,16 +2130,21 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) msp->ms_weight = 0; msp->ms_fragmentation = 0; - msp->ms_max_size = 0; /* - * This function is used for verification purposes. 
Regardless of - * whether metaslab_weight() thinks this metaslab should be active or - * not, we want to ensure that the actual weight (and therefore the - * value of ms_weight) would be the same if it was to be recalculated - * at this point. + * This function is used for verification purposes and thus should + * not introduce any side-effects/mutations on the system's state. + * + * Regardless of whether metaslab_weight() thinks this metaslab + * should be active or not, we want to ensure that the actual weight + * (and therefore the value of ms_weight) would be the same if it + * was to be recalculated at this point. + * + * In addition we set the nodirty flag so metaslab_weight() does + * not dirty the metaslab for future TXGs (e.g. when trying to + * force condensing to upgrade the metaslab spacemaps). */ - msp->ms_weight = metaslab_weight(msp) | was_active; + msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; VERIFY3U(max_segsize, ==, msp->ms_max_size); @@ -1620,17 +2164,83 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) } /* - * Wait for any in-progress metaslab loads to complete. + * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from + * this class that was used longest ago, and attempt to unload it. We don't + * want to spend too much time in this loop to prevent performance + * degradation, and we expect that most of the time this operation will + * succeed. Between that and the normal unloading processing during txg sync, + * we expect this to keep the metaslab memory usage under control. 
*/ static void -metaslab_load_wait(metaslab_t *msp) +metaslab_potentially_evict(metaslab_class_t *mc) { - ASSERT(MUTEX_HELD(&msp->ms_lock)); +#ifdef _KERNEL + uint64_t allmem = arc_all_memory(); + uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); + uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); + int tries = 0; + for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && + tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; + tries++) { + unsigned int idx = multilist_get_random_index( + &mc->mc_metaslab_txg_list); + multilist_sublist_t *mls = + multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + metaslab_t *msp = multilist_sublist_head(mls); + multilist_sublist_unlock(mls); + while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < + inuse * size) { + VERIFY3P(mls, ==, multilist_sublist_lock( + &mc->mc_metaslab_txg_list, idx)); + ASSERT3U(idx, ==, + metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); - while (msp->ms_loading) { - ASSERT(!msp->ms_loaded); - cv_wait(&msp->ms_load_cv, &msp->ms_lock); + if (!multilist_link_active(&msp->ms_class_txg_node)) { + multilist_sublist_unlock(mls); + break; + } + metaslab_t *next_msp = multilist_sublist_next(mls, msp); + multilist_sublist_unlock(mls); + /* + * If the metaslab is currently loading there are two + * cases. If it's the metaslab we're evicting, we + * can't continue on or we'll panic when we attempt to + * recursively lock the mutex. If it's another + * metaslab that's loading, it can be safely skipped, + * since we know it's very new and therefore not a + * good eviction candidate. We check later once the + * lock is held that the metaslab is fully loaded + * before actually unloading it. + */ + if (msp->ms_loading) { + msp = next_msp; + inuse = + spl_kmem_cache_inuse(zfs_btree_leaf_cache); + continue; + } + /* + * We can't unload metaslabs with no spacemap because + * they're not ready to be unloaded yet. 
We can't + * unload metaslabs with outstanding allocations + * because doing so could cause the metaslab's weight + * to decrease while it's unloaded, which violates an + * invariant that we use to prevent unnecessary + * loading. We also don't unload metaslabs that are + * currently active because they are high-weight + * metaslabs that are likely to be used in the near + * future. + */ + mutex_enter(&msp->ms_lock); + if (msp->ms_allocator == -1 && msp->ms_sm != NULL && + msp->ms_allocating_total == 0) { + metaslab_unload(msp); + } + mutex_exit(&msp->ms_lock); + msp = next_msp; + inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); + } } +#endif } static int @@ -1647,13 +2257,19 @@ metaslab_load_impl(metaslab_t *msp) * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * - * metaslab_sync() can append to the space map while we are loading. - * Therefore we load only entries that existed when we started the - * load. Additionally, metaslab_sync_done() has to wait for the load - * to complete because there are potential races like metaslab_load() - * loading parts of the space map that are currently being appended - * by metaslab_sync(). If we didn't, the ms_allocatable would have - * entries that metaslab_sync_done() would try to re-add later. + * If we are using the log space maps, metaslab_sync() can't write to + * the metaslab's space map while we are loading as we only write to + * it when we are flushing the metaslab, and that can't happen while + * we are loading it. + * + * If we are not using log space maps though, metaslab_sync() can + * append to the space map while we are loading. Therefore we load + * only entries that existed when we started the load. Additionally, + * metaslab_sync_done() has to wait for the load to complete because + * there are potential races like metaslab_load() loading parts of the + * space map that are currently being appended by metaslab_sync(). 
If + * we didn't, the ms_allocatable would have entries that + * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, @@ -1663,10 +2279,40 @@ metaslab_load_impl(metaslab_t *msp) uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); + hrtime_t load_start = gethrtime(); + metaslab_rt_arg_t *mrap; + if (msp->ms_allocatable->rt_arg == NULL) { + mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + } else { + mrap = msp->ms_allocatable->rt_arg; + msp->ms_allocatable->rt_ops = NULL; + msp->ms_allocatable->rt_arg = NULL; + } + mrap->mra_bt = &msp->ms_allocatable_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); + + /* Now, populate the size-sorted tree. */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; + + struct mssa_arg arg = {0}; + arg.rt = msp->ms_allocatable; + arg.mra = mrap; + range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, + &arg); } else { + /* + * Add the size-sorted tree first, since we don't need to load + * the metaslab from the spacemap. + */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the @@ -1674,18 +2320,32 @@ metaslab_load_impl(metaslab_t *msp) */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); + + if (msp->ms_new) { + /* + * If the ms_sm doesn't exist, this means that this + * metaslab hasn't gone through metaslab_sync() and + * thus has never been dirtied. So we shouldn't + * expect any unflushed allocs or frees from previous + * TXGs. 
+ */ + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from - * changing the ms_sm and the metaslab's range trees while we are - * about to use them and populate the ms_allocatable. The ms_lock - * is insufficient for this because metaslab_sync() doesn't hold - * the ms_lock while writing the ms_checkpointing tree to disk. + * changing the ms_sm (or log_sm) and the metaslab's range trees + * while we are about to use them and populate the ms_allocatable. + * The ms_lock is insufficient for this because metaslab_sync() doesn't + * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); + ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); @@ -1696,10 +2356,58 @@ metaslab_load_impl(metaslab_t *msp) msp->ms_loaded = B_TRUE; /* - * The ms_allocatable contains the segments that exist in the - * ms_defer trees [see ms_synced_length]. Thus we need to remove - * them from ms_allocatable as they will be added again in + * Apply all the unflushed changes to ms_allocatable right + * away so any manipulations we do below have a clear view + * of what is allocated and what is free. + */ + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_remove, msp->ms_allocatable); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_add, msp->ms_allocatable); + + ASSERT3P(msp->ms_group, !=, NULL); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + if (spa_syncing_log_sm(spa) != NULL) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LOG_SPACEMAP)); + + /* + * If we use a log space map we add all the segments + * that are in ms_unflushed_frees so they are available + * for allocation. 
+ * + * ms_allocatable needs to contain all free segments + * that are ready for allocations (thus not segments + * from ms_freeing, ms_freed, and the ms_defer trees). + * But if we grab the lock in this code path at a sync + * pass later that 1, then it also contains the + * segments of ms_freed (they were added to it earlier + * in this path through ms_unflushed_frees). So we + * need to remove all the segments that exist in + * ms_freed from ms_allocatable as they will be added + * later in metaslab_sync_done(). + * + * When there's no log space map, the ms_allocatable + * correctly doesn't contain any segments that exist + * in ms_freed [see ms_synced_length]. + */ + range_tree_walk(msp->ms_freed, + range_tree_remove, msp->ms_allocatable); + } + + /* + * If we are not using the log space map, ms_allocatable + * contains the segments that exist in the ms_defer trees + * [see ms_synced_length]. Thus we need to remove them + * from ms_allocatable as they will be added again in * metaslab_sync_done(). + * + * If we are using the log space map, ms_allocatable still + * contains the segments that exist in the ms_defer trees. + * Not because it read them through the ms_sm though. But + * because these segments are part of ms_unflushed_frees + * whose segments we add to ms_allocatable earlier in this + * code path. 
*/ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], @@ -1719,15 +2427,38 @@ metaslab_load_impl(metaslab_t *msp) * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; + uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); + msp->ms_max_size = metaslab_largest_allocatable(msp); + ASSERT3U(max_size, <=, msp->ms_max_size); + hrtime_t load_end = gethrtime(); + msp->ms_load_time = load_end; + zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, smp_length %llu, " + "unflushed_allocs %llu, unflushed_frees %llu, " + "freed %llu, defer %llu + %llu, unloaded time %llu ms, " + "loading_time %lld ms, ms_max_size %llu, " + "max size error %lld, " + "old_weight %llx, new_weight %llx", + (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), + (u_longlong_t)msp->ms_group->mg_vd->vdev_id, + (u_longlong_t)msp->ms_id, + (u_longlong_t)space_map_length(msp->ms_sm), + (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), + (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), + (u_longlong_t)range_tree_space(msp->ms_freed), + (u_longlong_t)range_tree_space(msp->ms_defer[0]), + (u_longlong_t)range_tree_space(msp->ms_defer[1]), + (longlong_t)((load_start - msp->ms_unload_time) / 1000000), + (longlong_t)((load_end - load_start) / 1000000), + (u_longlong_t)msp->ms_max_size, + (u_longlong_t)msp->ms_max_size - max_size, + (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); - return (0); } @@ -1746,8 +2477,42 @@ metaslab_load(metaslab_t *msp) VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); + /* + * We set the loading flag BEFORE potentially dropping the lock to + * wait for an ongoing flush (see ms_flushing below). 
This way other + * threads know that there is already a thread that is loading this + * metaslab. + */ msp->ms_loading = B_TRUE; + + /* + * Wait for any in-progress flushing to finish as we drop the ms_lock + * both here (during space_map_load()) and in metaslab_flush() (when + * we flush our changes to the ms_sm). + */ + if (msp->ms_flushing) + metaslab_flush_wait(msp); + + /* + * In the possibility that we were waiting for the metaslab to be + * flushed (where we temporarily dropped the ms_lock), ensure that + * no one else loaded the metaslab somehow. + */ + ASSERT(!msp->ms_loaded); + + /* + * If we're loading a metaslab in the normal class, consider evicting + * another one to keep our memory usage under the limit defined by the + * zfs_metaslab_mem_limit tunable. + */ + if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == + msp->ms_group->mg_class) { + metaslab_potentially_evict(msp->ms_group->mg_class); + } + int error = metaslab_load_impl(msp); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); @@ -1759,13 +2524,46 @@ metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - metaslab_verify_weight_and_frag(msp); + /* + * This can happen if a metaslab is selected for eviction (in + * metaslab_potentially_evict) and then unloaded during spa_sync (via + * metaslab_class_evict_old). 
+ */ + if (!msp->ms_loaded) + return; range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; + msp->ms_unload_time = gethrtime(); + msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; + + if (msp->ms_group != NULL) { + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + multilist_sublist_unlock(mls); + + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, weight %llx, " + "selected txg %llu (%llu ms ago), alloc_txg %llu, " + "loaded %llu ms ago, max_size %llu", + (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), + (u_longlong_t)msp->ms_group->mg_vd->vdev_id, + (u_longlong_t)msp->ms_id, + (u_longlong_t)msp->ms_weight, + (u_longlong_t)msp->ms_selected_txg, + (u_longlong_t)(msp->ms_unload_time - + msp->ms_selected_time) / 1000 / 1000, + (u_longlong_t)msp->ms_alloc_txg, + (u_longlong_t)(msp->ms_unload_time - + msp->ms_load_time) / 1000 / 1000, + (u_longlong_t)msp->ms_max_size); + } /* * We explicitly recalculate the metaslab's weight based on its space @@ -1773,7 +2571,7 @@ metaslab_unload(metaslab_t *msp) * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information - * available in-core, whether it is loaded or not + * available in-core, whether it is loaded or not. * * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation @@ -1783,7 +2581,45 @@ metaslab_unload(metaslab_t *msp) metaslab_recalculate_weight_and_sort(msp); } -static void +/* + * We want to optimize the memory use of the per-metaslab range + * trees. 
To do this, we store the segments in the range trees in + * units of sectors, zero-indexing from the start of the metaslab. If + * the vdev_ms_shift - the vdev_ashift is less than 32, we can store + * the ranges using two uint32_ts, rather than two uint64_ts. + */ +range_seg_type_t +metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, + uint64_t *start, uint64_t *shift) +{ + if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && + !zfs_metaslab_force_large_segs) { + *shift = vdev->vdev_ashift; + *start = msp->ms_start; + return (RANGE_SEG32); + } else { + *shift = 0; + *start = 0; + return (RANGE_SEG64); + } +} + +void +metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + msp->ms_selected_txg = txg; + msp->ms_selected_time = gethrtime(); + multilist_sublist_insert_tail(mls, msp); + multilist_sublist_unlock(mls); +} + +void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { @@ -1797,8 +2633,8 @@ metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, } int -metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, - metaslab_t **msp) +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, + uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; @@ -1810,6 +2646,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); + multilist_link_init(&ms->ms_class_txg_node); ms->ms_id = id; ms->ms_start = 
id << vd->vdev_ms_shift; @@ -1817,9 +2655,14 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms->ms_allocator = -1; ms->ms_new = B_TRUE; + vdev_ops_t *ops = vd->vdev_ops; + if (ops->vdev_op_metaslab_init != NULL) + ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); + /* * We only open space map objects that already exist. All others - * will be opened when we finally allocate an object for it. + * will be opened when we finally allocate an object for it. For + * readonly pools there is no need to open the space map object. * * Note: * When called from vdev_expand(), we can't call into the DMU as @@ -1828,7 +2671,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * that case, the object parameter is zero though, so we won't * call into the DMU. */ - if (object != 0) { + if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && + !spa->spa_read_spacemaps)) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); @@ -1841,21 +2685,36 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } - /* - * We create the ms_allocatable here, but we don't create the - * other range trees until metaslab_sync_done(). This serves - * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that - * we'd data fault on any attempt to use this metaslab before - * it's ready. 
- */ - ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, - &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); + uint64_t shift, start; + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, ms, &start, &shift); - ms->ms_trim = range_tree_create(NULL, NULL); + ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); + for (int t = 0; t < TXG_SIZE; t++) { + ms->ms_allocating[t] = range_tree_create(NULL, type, + NULL, start, shift); + } + ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); + ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + ms->ms_defer[t] = range_tree_create(NULL, type, NULL, + start, shift); + } + ms->ms_checkpointing = + range_tree_create(NULL, type, NULL, start, shift); + ms->ms_unflushed_allocs = + range_tree_create(NULL, type, NULL, start, shift); + + metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + mrap->mra_bt = &ms->ms_unflushed_frees_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + type, mrap, start, shift); + + ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); - metaslab_set_fragmentation(ms); + metaslab_set_fragmentation(ms, B_FALSE); /* * If we're opening an existing pool (txg == 0) or creating @@ -1872,17 +2731,6 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_allocated_space(ms), 0, 0); } - /* - * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the space map - * so that we can verify frees. 
- */ - if (metaslab_debug_load && ms->ms_sm != NULL) { - mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms)); - mutex_exit(&ms->ms_lock); - } - if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); @@ -1893,20 +2741,60 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, return (0); } +static void +metaslab_fini_flush_data(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (metaslab_unflushed_txg(msp) == 0) { + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), + ==, NULL); + return; + } + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); +} + +uint64_t +metaslab_unflushed_changes_memused(metaslab_t *ms) +{ + return ((range_tree_numsegs(ms->ms_unflushed_allocs) + + range_tree_numsegs(ms->ms_unflushed_frees)) * + ms->ms_unflushed_allocs->rt_root.bt_elem_size); +} + void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + + metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); - metaslab_space_update(vd, mg->mg_class, - -metaslab_allocated_space(msp), 0, -msp->ms_size); + /* + * If this metaslab hasn't been through metaslab_sync_done() yet its + * space hasn't been accounted for in its vdev and doesn't need to be + * subtracted. 
+ */ + if (!msp->ms_new) { + metaslab_space_update(vd, mg->mg_class, + -metaslab_allocated_space(msp), 0, -msp->ms_size); + + } space_map_close(msp->ms_sm); + msp->ms_sm = NULL; metaslab_unload(msp); @@ -1914,17 +2802,24 @@ metaslab_fini(metaslab_t *msp) range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_destroy(msp->ms_checkpointing); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); + for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); - range_tree_destroy(msp->ms_checkpointing); - for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); @@ -1933,6 +2828,7 @@ metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); + cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); @@ -1987,7 +2883,7 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { * value should be in the range [0, 100]. */ static void -metaslab_set_fragmentation(metaslab_t *msp) +metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; @@ -2022,14 +2918,17 @@ metaslab_set_fragmentation(metaslab_t *msp) * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool - * is imported. + * is imported. 
We also skip marking this metaslab for + * condensing if the caller has explicitly set nodirty. */ - if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { + if (!nodirty && + spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " - "ms_id %llu, vdev_id %llu", txg, msp->ms_id, - vd->vdev_id); + "ms_id %llu, vdev_id %llu", (u_longlong_t)txg, + (u_longlong_t)msp->ms_id, + (u_longlong_t)vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; @@ -2072,7 +2971,6 @@ metaslab_space_weight(metaslab_t *msp) uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. @@ -2174,9 +3072,9 @@ metaslab_weight_from_range_tree(metaslab_t *msp) } /* - * Calculate the weight based on the on-disk histogram. This should only - * be called after a sync pass has completely finished since the on-disk - * information is updated in metaslab_sync(). + * Calculate the weight based on the on-disk histogram. Should be applied + * only to unloaded metaslabs (i.e. no incoming allocations) in order to + * give results consistent with the on-disk state. */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) @@ -2250,7 +3148,6 @@ metaslab_segment_weight(metaslab_t *msp) } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - return (weight); } @@ -2284,21 +3181,29 @@ metaslab_segment_weight(metaslab_t *msp) /* * Determine if we should attempt to allocate from this metaslab. If the - * metaslab has a maximum size then we can quickly determine if the desired - * allocation size can be satisfied. Otherwise, if we're using segment-based - * weighting then we can determine the maximum allocation that this metaslab - * can accommodate based on the index encoded in the weight. 
If we're using - * space-based weights then rely on the entire weight (excluding the weight - * type bit). + * metaslab is loaded, then we can determine if the desired allocation + * can be satisfied by looking at the size of the maximum free segment + * on that metaslab. Otherwise, we make our decision based on the metaslab's + * weight. For segment-based weighting we can determine the maximum + * allocation based on the index encoded in its value. For space-based + * weights we rely on the entire weight (excluding the weight-type bit). */ -boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +static boolean_t +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { - boolean_t should_allocate; - - if (msp->ms_max_size != 0) + /* + * If the metaslab is loaded, ms_max_size is definitive and we can use + * the fast check. If it's not, the ms_max_size is a lower bound (once + * set), and we should use the fast check as long as we're not in + * try_hard and it's been less than zfs_metaslab_max_size_cache_sec + * seconds since the metaslab was unloaded. + */ + if (msp->ms_loaded || + (msp->ms_max_size != 0 && !try_hard && gethrtime() < + msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); + boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the @@ -2312,10 +3217,12 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize) should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } + return (should_allocate); } + static uint64_t -metaslab_weight(metaslab_t *msp) +metaslab_weight(metaslab_t *msp, boolean_t nodirty) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; @@ -2323,24 +3230,24 @@ metaslab_weight(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); - /* - * If this vdev is in the process of being removed, there is nothing - * for us to do here. 
- */ - if (vd->vdev_removing) - return (0); - - metaslab_set_fragmentation(msp); + metaslab_set_fragmentation(msp, nodirty); /* - * Update the maximum size if the metaslab is loaded. This will + * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. + * has been added back into the free tree. If the metaslab is + * unloaded, we check if there's a larger free segment in the + * unflushed frees. This is a lower bound on the largest allocatable + * segment size. Coalescing of adjacent entries may reveal larger + * allocatable segments, but we aren't aware of those until loading + * the space map into a range tree. */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); + if (msp->ms_loaded) { + msp->ms_max_size = metaslab_largest_allocatable(msp); + } else { + msp->ms_max_size = MAX(msp->ms_max_size, + metaslab_largest_unflushed_free(msp)); + } /* * Segment-based weighting requires space map histogram support. @@ -2359,36 +3266,51 @@ metaslab_weight(metaslab_t *msp) void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, - metaslab_weight(msp) | was_active); + metaslab_weight(msp, B_FALSE) | was_active); } static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. 
*/ - if (activation_weight == METASLAB_WEIGHT_CLAIM) + if (activation_weight == METASLAB_WEIGHT_CLAIM) { + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort(mg, msp, msp->ms_weight | + activation_weight); return (0); - metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? - mg->mg_primaries : mg->mg_secondaries); + } + + metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? + &mga->mga_primary : &mga->mga_secondary); - ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); - if (arr[allocator] != NULL) { + if (*mspp != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } - arr[allocator] = msp; + *mspp = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort_impl(mg, msp, + msp->ms_weight | activation_weight); mutex_exit(&mg->mg_lock); return (0); @@ -2399,28 +3321,72 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = metaslab_load(msp); - if (error != 0) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); - } - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { - /* - * The metaslab was activated for another allocator - * while we were waiting, we should reselect. - */ - return (SET_ERROR(EBUSY)); - } - if ((error = metaslab_activate_allocator(msp->ms_group, msp, - allocator, activation_weight)) != 0) { - return (error); - } - - msp->ms_activation_weight = msp->ms_weight; - metaslab_group_sort(msp->ms_group, msp, - msp->ms_weight | activation_weight); + /* + * The current metaslab is already activated for us so there + * is nothing to do. 
Already activated though, doesn't mean + * that this metaslab is activated for our allocator nor our + * requested activation weight. The metaslab could have started + * as an active one for our allocator but changed allocators + * while we were waiting to grab its ms_lock or we stole it + * [see find_valid_metaslab()]. This means that there is a + * possibility of passivating a metaslab of another allocator + * or from a different activation mask, from this thread. + */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + ASSERT(msp->ms_loaded); + return (0); } + + int error = metaslab_load(msp); + if (error != 0) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + + /* + * When entering metaslab_load() we may have dropped the + * ms_lock because we were loading this metaslab, or we + * were waiting for another thread to load it for us. In + * that scenario, we recheck the weight of the metaslab + * to see if it was activated by another thread. + * + * If the metaslab was activated for another allocator or + * it was activated with a different activation weight (e.g. + * we wanted to make it a primary but it was activated as + * secondary) we return error (EBUSY). + * + * If the metaslab was activated for the same allocator + * and requested activation mask, skip activating it. + */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + if (msp->ms_allocator != allocator) + return (EBUSY); + + if ((msp->ms_weight & activation_weight) == 0) + return (SET_ERROR(EBUSY)); + + EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), + msp->ms_primary); + return (0); + } + + /* + * If the metaslab has literally 0 space, it will have weight 0. In + * that case, don't bother activating it. This can happen if the + * metaslab had space during find_valid_metaslab, but another thread + * loaded it and used all that space while we were waiting to grab the + * lock. 
+ */ + if (msp->ms_weight == 0) { + ASSERT0(range_tree_space(msp->ms_allocatable)); + return (SET_ERROR(ENOSPC)); + } + + if ((error = metaslab_activate_allocator(msp->ms_group, msp, + allocator, activation_weight)) != 0) { + return (error); + } + ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); @@ -2432,6 +3398,8 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(msp->ms_loaded); + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; @@ -2439,16 +3407,18 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); + ASSERT3S(0, <=, msp->ms_allocator); + ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); + + metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; if (msp->ms_primary) { - ASSERT3U(0, <=, msp->ms_allocator); - ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); - ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); + ASSERT3P(mga->mga_primary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); - mg->mg_primaries[msp->ms_allocator] = NULL; + mga->mga_primary = NULL; } else { + ASSERT3P(mga->mga_secondary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); - ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); - mg->mg_secondaries[msp->ms_allocator] = NULL; + mga->mga_secondary = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); @@ -2458,7 +3428,7 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { - ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE); + uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from @@ -2470,9 +3440,10 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) 
range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); + ASSERT(msp->ms_activation_weight != 0); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); - ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); + ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); } /* @@ -2480,13 +3451,13 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) * we either fail an allocation attempt (similar to space-based metaslabs) * or have exhausted the free space in zfs_metaslab_switch_threshold * buckets since the metaslab was activated. This function checks to see - * if we've exhaused the zfs_metaslab_switch_threshold buckets in the + * if we've exhausted the zfs_metaslab_switch_threshold buckets in the * metaslab and passivates it proactively. This will allow us to select a * metaslab with a larger contiguous region, if any, remaining within this * metaslab group. If we're in sync pass > 1, then we continue using this * metaslab so that we don't dirty more block and cause more sync passes. */ -void +static void metaslab_segment_may_passivate(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; @@ -2511,14 +3482,15 @@ static void metaslab_preload(void *arg) { metaslab_t *msp = arg; - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + metaslab_class_t *mc = msp->ms_group->mg_class; + spa_t *spa = mc->mc_spa; fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); (void) metaslab_load(msp); - msp->ms_selected_txg = spa_syncing_txg(spa); + metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_lock); spl_fstrans_unmark(cookie); } @@ -2561,18 +3533,19 @@ metaslab_group_preload(metaslab_group_t *mg) } /* - * Determine if the space map's on-disk footprint is past our tolerance - * for inefficiency. 
We would like to use the following criteria to make - * our decision: + * Determine if the space map's on-disk footprint is past our tolerance for + * inefficiency. We would like to use the following criteria to make our + * decision: * - * 1. The size of the space map object should not dramatically increase as a - * result of writing out the free space range tree. + * 1. Do not condense if the size of the space map object would dramatically + * increase as a result of writing out the free space range tree. * - * 2. The minimal on-disk space map representation is zfs_condense_pct/100 - * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). + * 2. Condense if the on-disk space map representation is at least + * zfs_condense_pct/100 times the size of the optimal representation + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * - * 3. The on-disk size of the space map should actually decrease. + * 3. Do not condense if the on-disk size of the space map does not actually + * decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. @@ -2586,127 +3559,371 @@ metaslab_should_condense(metaslab_t *msp) space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; - uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); - - /* - * Allocations and frees in early passes are generally more space - * efficient (in terms of blocks described in space map entries) - * than the ones in later passes (e.g. we don't compress after - * sync pass 5) and condensing a metaslab multiple times in a txg - * could degrade performance. - * - * Thus we prefer condensing each metaslab at most once every txg at - * the earliest sync pass possible. 
If a metaslab is eligible for - * condensing again after being considered for condensing within the - * same txg, it will hopefully be dirty in the next txg where it will - * be condensed at an earlier pass. - */ - if (msp->ms_condense_checked_txg == current_txg) - return (B_FALSE); - msp->ms_condense_checked_txg = current_txg; + ASSERT(sm != NULL); + ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ - if (avl_is_empty(&msp->ms_allocatable_by_size) || + if (range_tree_numsegs(msp->ms_allocatable) == 0 || msp->ms_condense_wanted) return (B_TRUE); - uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); + uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); - dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); - uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. - * The minimized form consists of a small number of allocations followed by - * the entries of the free range tree. + * The minimized form consists of a small number of allocations followed + * by the entries of the free range tree (ms_allocatable). The condensed + * spacemap contains all the entries of previous TXGs (including those in + * the pool-wide log spacemaps; thus this is effectively a superset of + * metaslab_flush()), but this TXG's entries still need to be written. 
*/ static void -metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + ASSERT(msp->ms_sm != NULL); + /* + * In order to condense the space map, we need to change it so it + * only describes which segments are currently allocated and free. + * + * All the current free space resides in the ms_allocatable, all + * the ms_defer trees, and all the ms_allocating trees. We ignore + * ms_freed because it is empty because we're in sync pass 1. We + * ignore ms_freeing because these changes are not yet reflected + * in the spacemap (they will be written later this txg). + * + * So to truncate the space map to represent all the entries of + * previous TXGs we do the following: + * + * 1] We create a range tree (condense tree) that is 100% empty. + * 2] We add to it all segments found in the ms_defer trees + * as those segments are marked as free in the original space + * map. We do the same with the ms_allocating trees for the same + * reason. Adding these segments should be a relatively + * inexpensive operation since we expect these trees to have a + * small number of nodes. + * 3] We vacate any unflushed allocs, since they are not frees we + * need to add to the condense tree. Then we vacate any + * unflushed frees as they should already be part of ms_allocatable. + * 4] At this point, we would ideally like to add all segments + * in the ms_allocatable tree to the condense tree. This way + * we would write all the entries of the condense tree as the + * condensed space map, which would only contain freed + * segments with everything else assumed to be allocated. 
+ * + * Doing so can be prohibitively expensive as ms_allocatable can + * be large, and therefore computationally expensive to add to + * the condense_tree. Instead we first sync out an entry marking + * everything as allocated, then the condense_tree and then the + * ms_allocatable, in the condensed space map. While this is not + * optimal, it is typically close to optimal and more importantly + * much cheaper to compute. + * + * 5] Finally, as both of the unflushed trees were written to our + * new and condensed metaslab space map, we basically flushed + * all the unflushed changes to disk, thus we call + * metaslab_flush_update(). + */ + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " - "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, - msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, - msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_allocatable->rt_root), + "spa %s, smp size %llu, segments %llu, forcing condense=%s", + (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, + (u_longlong_t)msp->ms_group->mg_vd->vdev_id, + spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), + (u_longlong_t)range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - /* - * Create an range tree that is 100% allocated. We remove segments - * that have been freed in this txg, any deferred frees that exist, - * and any allocation in the future. Removing segments should be - * a relatively inexpensive operation since we expect these trees to - * have a small number of nodes. 
- */ - condense_tree = range_tree_create(NULL, NULL); - range_tree_add(condense_tree, msp->ms_start, msp->ms_size); + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, + &start, &shift); - range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); - range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); + condense_tree = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + /* - * We're about to drop the metaslab's lock thus allowing - * other consumers to change it's content. Set the - * metaslab's ms_condensing flag to ensure that - * allocations on this metaslab do not occur while we're - * in the middle of committing it to disk. This is only critical - * for ms_allocatable as all other range trees use per txg + * We're about to drop the metaslab's lock thus allowing other + * consumers to change it's content. Set the metaslab's ms_condensing + * flag to ensure that allocations on this metaslab do not occur + * while we're in the middle of committing it to disk. This is only + * critical for ms_allocatable as all other range trees use per TXG * views of their content. 
*/ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); + uint64_t object = space_map_object(msp->ms_sm); + space_map_truncate(sm, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? + zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* - * While we would ideally like to create a space map representation - * that consists only of allocation records, doing so can be - * prohibitively expensive because the in-core free tree can be - * large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we sync out two trees, a cheap - * allocation only tree followed by the in-core free tree. While not - * optimal, this is typically close to optimal, and much cheaper to - * compute. + * space_map_truncate() may have reallocated the spacemap object. + * If so, update the vdev_ms_array. */ - space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + if (space_map_object(msp->ms_sm) != object) { + object = space_map_object(msp->ms_sm); + dmu_write(spa->spa_meta_objset, + msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &object, tx); + } + + /* + * Note: + * When the log space map feature is enabled, each space map will + * always have ALLOCS followed by FREES for each sync pass. This is + * typically true even when the log space map feature is disabled, + * except from the case where a metaslab goes through metaslab_sync() + * and gets condensed. In that case the metaslab's space map will have + * ALLOCS followed by FREES (due to condensing) followed by ALLOCS + * followed by FREES (due to space_map_write() in metaslab_sync()) for + * sync pass 1. 
+ */ + range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, + shift); + range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); + space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); + range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + range_tree_vacate(tmp_tree, NULL, NULL); + range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); + msp->ms_condensing = B_FALSE; + metaslab_flush_update(msp, tx); +} + +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc..) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. 
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + + VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); + + /* update metaslab's position in our flushing tree */ + uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + /* update metaslab counts of spa_log_sm_t nodes */ + spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_sm_increment_current_mscount(spa); + + /* cleanup obsolete logs if any */ + uint64_t log_blocks_before = spa_log_sm_nblocks(spa); + spa_cleanup_old_sm_logs(spa, tx); + uint64_t log_blocks_after = spa_log_sm_nblocks(spa); + VERIFY3U(log_blocks_after, <=, log_blocks_before); + + /* update log space map summary */ + uint64_t blocks_gone = log_blocks_before - log_blocks_after; + spa_log_summary_add_flushed_metaslab(spa); + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_summary_decrement_blkcount(spa, blocks_gone); +} + +boolean_t +metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); + + /* + * There is nothing wrong with flushing the same metaslab twice, as + * this codepath should work on that case. 
However, the current + * flushing scheme makes sure to avoid this situation as we would be + * making all these calls without having anything meaningful to write + * to disk. We assert this behavior here. + */ + ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); + + /* + * We can not flush while loading, because then we would + * not load the ms_unflushed_{allocs,frees}. + */ + if (msp->ms_loading) + return (B_FALSE); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + /* + * Metaslab condensing is effectively flushing. Therefore if the + * metaslab can be condensed we can just condense it instead of + * flushing it. + * + * Note that metaslab_condense() does call metaslab_flush_update() + * so we can just return immediately after condensing. We also + * don't need to care about setting ms_flushing or broadcasting + * ms_flush_cv, even if we temporarily drop the ms_lock in + * metaslab_condense(), as the metaslab is already loaded. + */ + if (msp->ms_loaded && metaslab_should_condense(msp)) { + metaslab_group_t *mg = msp->ms_group; + + /* + * For all histogram operations below refer to the + * comments of metaslab_sync() where we follow a + * similar procedure. 
+ */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + metaslab_condense(msp, tx); + + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + ASSERT(range_tree_is_empty(msp->ms_freed)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + metaslab_aux_histograms_update(msp); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + + /* + * Since we recreated the histogram (and potentially + * the ms_sm too while condensing) ensure that the + * weight is updated too because we are not guaranteed + * that this metaslab is dirty and will go through + * metaslab_sync_done(). + */ + metaslab_recalculate_weight_and_sort(msp); + return (B_TRUE); + } + + msp->ms_flushing = B_TRUE; + uint64_t sm_len_before = space_map_length(msp->ms_sm); + + mutex_exit(&msp->ms_lock); + space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + + uint64_t sm_len_after = space_map_length(msp->ms_sm); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " + "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx), + spa_name(spa), + (u_longlong_t)msp->ms_group->mg_vd->vdev_id, + (u_longlong_t)msp->ms_id, + (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), + (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), + (u_longlong_t)(sm_len_after - sm_len_before)); + } + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + 
metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + metaslab_flush_update(msp, tx); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + msp->ms_flushing = B_FALSE; + cv_broadcast(&msp->ms_flush_cv); + return (B_TRUE); } /* @@ -2721,37 +3938,41 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; - uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ - if (msp->ms_freeing == NULL) { - ASSERT3P(alloctree, ==, NULL); + if (msp->ms_new) { + ASSERT0(range_tree_space(alloctree)); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + ASSERT0(range_tree_space(msp->ms_trim)); return; } - ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeing, !=, NULL); - ASSERT3P(msp->ms_freed, !=, NULL); - ASSERT3P(msp->ms_checkpointing, !=, NULL); - ASSERT3P(msp->ms_trim, !=, NULL); - /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being - * forced to condense and it's loaded, we need to let it through. + * forced to condense, it's loaded and we're not beyond the final + * dirty txg, we need to let it through. Not condensing beyond the + * final dirty txg prevents an issue where metaslabs that need to be + * condensed but were loaded for other reasons could cause a panic + * here. By only checking the txg in that branch of the conditional, + * we preserve the utility of the VERIFY statements in all other + * cases. 
*/ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && - !(msp->ms_loaded && msp->ms_condense_wanted)) + !(msp->ms_loaded && msp->ms_condense_wanted && + txg <= spa_final_dirty_txg(spa))) return; - VERIFY(txg <= spa_final_dirty_txg(spa)); + VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently @@ -2768,17 +3989,45 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - if (msp->ms_sm == NULL) { - uint64_t new_object; + /* + * Generate a log space map if one doesn't exist already. + */ + spa_generate_syncing_log_sm(spa, tx); - new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); + if (msp->ms_sm == NULL) { + uint64_t new_object = space_map_alloc(mos, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? + zfs_metaslab_sm_blksz_with_log : + zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &new_object, tx); + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); + ASSERT(msp->ms_sm != NULL); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + ASSERT0(metaslab_allocated_space(msp)); + } + + if (metaslab_unflushed_txg(msp) == 0 && + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT(spa_syncing_log_sm(spa) != NULL); + + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa); ASSERT(msp->ms_sm != NULL); - ASSERT0(metaslab_allocated_space(msp)); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + 
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); } if (!range_tree_is_empty(msp->ms_checkpointing) && @@ -2786,7 +4035,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, @@ -2815,10 +4064,39 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && metaslab_should_condense(msp)) { - metaslab_condense(msp, txg, tx); + if (spa->spa_sync_pass == 1 && msp->ms_loaded && + metaslab_should_condense(msp)) + metaslab_condense(msp, tx); + + /* + * We'll be going to disk to sync our space accounting, thus we + * drop the ms_lock during that time so allocations coming from + * open-context (ZIL) for future TXGs do not block. + */ + mutex_exit(&msp->ms_lock); + space_map_t *log_sm = spa_syncing_log_sm(spa); + if (log_sm != NULL) { + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + + space_map_write(log_sm, alloctree, SM_ALLOC, + vd->vdev_id, tx); + space_map_write(log_sm, msp->ms_freeing, SM_FREE, + vd->vdev_id, tx); + mutex_enter(&msp->ms_lock); + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_remove_xor_add(alloctree, + msp->ms_unflushed_frees, msp->ms_unflushed_allocs); + range_tree_remove_xor_add(msp->ms_freeing, + msp->ms_unflushed_allocs, msp->ms_unflushed_frees); + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(msp); } else { - mutex_exit(&msp->ms_lock); + ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, @@ -2838,7 +4116,8 
@@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the - * ms_lock while writing to the checkpoint space map. + * ms_lock while writing to the checkpoint space map, for the + * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, @@ -2906,6 +4185,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * and instead will just swap the pointers for freeing and freed. * We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. + * + * Keep in mind that even if we are currently using a log spacemap + * we want current frees to end up in the ms_allocatable (but not + * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); @@ -2925,15 +4208,36 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); - if (object != space_map_object(msp->ms_sm)) { - object = space_map_object(msp->ms_sm); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); - } + /* + * Verify that the space map object ID has been recorded in the + * vdev_ms_array. 
+ */ + uint64_t object; + VERIFY0(dmu_read(mos, vd->vdev_ms_array, + msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); + VERIFY3U(object, ==, space_map_object(msp->ms_sm)); + mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } +static void +metaslab_evict(metaslab_t *msp, uint64_t txg) +{ + if (!msp->ms_loaded || msp->ms_disabled != 0) + return; + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); + } + if (msp->ms_allocator != -1) + metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); + + if (!metaslab_debug_unload) + metaslab_unload(msp); +} + /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. @@ -2952,34 +4256,15 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); - /* - * If this metaslab is just becoming available, initialize its - * range trees and add its capacity to the vdev. - */ - if (msp->ms_freed == NULL) { - for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_allocating[t] == NULL); - - msp->ms_allocating[t] = range_tree_create(NULL, NULL); - } - - ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, NULL); - - ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, NULL); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defer[t] == NULL); - - msp->ms_defer[t] = range_tree_create(NULL, NULL); - } - - ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, NULL); - + if (msp->ms_new) { + /* this is a new metaslab, add its capacity to the vdev */ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); + + /* there should be no allocations nor frees at this point */ + VERIFY0(msp->ms_allocated_this_txg); + VERIFY0(range_tree_space(msp->ms_freed)); } + ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); @@ -2994,21 
+4279,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); + if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } - metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); - /* - * If there's a metaslab_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - */ - metaslab_load_wait(msp); + if (spa_syncing_log_sm(spa) == NULL) { + /* + * If there's a metaslab_load() in progress and we don't have + * a log space map, it means that we probably wrote to the + * metaslab's space map. If this is the case, we need to + * make sure that we wait for the load to complete so that we + * have a consistent view at the in-core side of the metaslab. + */ + metaslab_load_wait(msp); + } else { + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + } /* * When auto-trimming is enabled, free ranges which are added to @@ -3071,32 +4363,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) */ metaslab_recalculate_weight_and_sort(msp); - /* - * If the metaslab is loaded and we've not tried to load or allocate - * from it in 'metaslab_unload_delay' txgs, then unload it. 
- */ - if (msp->ms_loaded && - msp->ms_disabled == 0 && - msp->ms_selected_txg + metaslab_unload_delay < txg) { - - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( - msp->ms_allocating[(txg + t) & TXG_MASK])); - } - if (msp->ms_allocator != -1) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - } - - if (!metaslab_debug_unload) - metaslab_unload(msp); - } - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); - + msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } @@ -3149,37 +4420,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva) * Metaslab allocation tracing facility * ========================================================================== */ -#ifdef _METASLAB_TRACING -kstat_t *metaslab_trace_ksp; -kstat_named_t metaslab_trace_over_limit; - -void -metaslab_alloc_trace_init(void) -{ - ASSERT(metaslab_alloc_trace_cache == NULL); - metaslab_alloc_trace_cache = kmem_cache_create( - "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", - "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); - if (metaslab_trace_ksp != NULL) { - metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; - kstat_named_init(&metaslab_trace_over_limit, - "metaslab_trace_over_limit", KSTAT_DATA_UINT64); - kstat_install(metaslab_trace_ksp); - } -} - -void -metaslab_alloc_trace_fini(void) -{ - if (metaslab_trace_ksp != NULL) { - kstat_delete(metaslab_trace_ksp); - metaslab_trace_ksp = NULL; - } - kmem_cache_destroy(metaslab_alloc_trace_cache); - metaslab_alloc_trace_cache = NULL; -} /* * Add an allocation trace element to the allocation tracing list. 
@@ -3203,10 +4443,10 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; -#ifdef DEBUG +#ifdef ZFS_DEBUG panic("too many entries in allocation list"); #endif - atomic_inc_64(&metaslab_trace_over_limit.value.ui64); + METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); @@ -3254,31 +4494,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal) list_destroy(&zal->zal_list); zal->zal_size = 0; } -#else - -#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) - -void -metaslab_alloc_trace_init(void) -{ -} - -void -metaslab_alloc_trace_fini(void) -{ -} - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ -} - -#endif /* _METASLAB_TRACING */ /* * ========================================================================== @@ -3298,22 +4513,25 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); } static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + metaslab_class_allocator_t *mca = + &mg->mg_class->mc_allocator[allocator]; uint64_t max = mg->mg_max_alloc_queue_depth; - uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + uint64_t cur = mga->mga_cur_max_alloc_queue_depth; while (cur < max) { - if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], + if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, cur, cur + 1) == cur) { - atomic_inc_64( - &mg->mg_class->mc_alloc_max_slots[allocator]); + 
atomic_inc_64(&mca->mca_alloc_max_slots); return; } - cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + cur = mga->mga_cur_max_alloc_queue_depth; } } @@ -3329,7 +4547,8 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); if (io_complete) metaslab_group_increment_qdepth(mg, allocator); } @@ -3345,8 +4564,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - VERIFY(zfs_refcount_not_held( - &mg->mg_alloc_queue_depth[allocator], tag)); + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); } #endif } @@ -3358,6 +4577,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; + ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); @@ -3376,6 +4596,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); + msp->ms_allocating_total += size; /* Track the last successful allocation */ msp->ms_alloc_txg = txg; @@ -3386,7 +4607,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. 
*/ - msp->ms_max_size = metaslab_block_maxsize(msp); + msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } @@ -3399,12 +4620,13 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) * have selected, we may not try the newly-activated metaslab, and instead * activate another metaslab. This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full - * except for the the newly-activated metaslab which we fail to examine). + * except for the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) + boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, + boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -3412,9 +4634,17 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); + int tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; - if (!metaslab_should_allocate(msp, asize)) { + + if (!try_hard && tries > zfs_metaslab_find_max_tries) { + METASLABSTAT_BUMP(metaslabstat_too_many_tries); + return (NULL); + } + tries++; + + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; @@ -3456,17 +4686,51 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, return (msp); } +static void +metaslab_active_mask_verify(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) + return; + + if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY0(msp->ms_weight & 
METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(!msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY3S(msp->ms_allocator, ==, -1); + return; + } +} + /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; - uint64_t activation_weight; - activation_weight = METASLAB_WEIGHT_PRIMARY; + uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { @@ -3484,6 +4748,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, */ if (mg->mg_ms_ready < mg->mg_allocators * 3) allocator = 0; + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); @@ -3505,17 +4770,39 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && - mg->mg_primaries[allocator] != NULL) { - msp = mg->mg_primaries[allocator]; + mga->mga_primary != NULL) { + msp = mga->mga_primary; + + /* + * Even though we don't hold the ms_lock for the + * primary metaslab, those fields should not + * change while we hold the mg_lock. Thus it is + * safe to make assertions on them. 
+ */ + ASSERT(msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && - mg->mg_secondaries[allocator] != NULL) { - msp = mg->mg_secondaries[allocator]; + mga->mga_secondary != NULL) { + msp = mga->mga_secondary; + + /* + * See comment above about the similar assertions + * for the primary metaslab. + */ + ASSERT(!msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); + want_unique, asize, allocator, try_hard, zal, + search, &was_active); } mutex_exit(&mg->mg_lock); @@ -3523,68 +4810,115 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, kmem_free(search, sizeof (*search)); return (-1ULL); } - mutex_enter(&msp->ms_lock); + + metaslab_active_mask_verify(msp); + + /* + * This code is disabled because of issues with + * tracepoints in non-gpl kernel modules. + */ +#if 0 + DTRACE_PROBE3(ms__activation__attempt, + metaslab_t *, msp, uint64_t, activation_weight, + boolean_t, was_active); +#endif + /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the - * active status first to see if we need to reselect + * active status first to see if we need to reselect * a new metaslab.
*/ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { + ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* - * If the metaslab is freshly activated for an allocator that - * isn't the one we're allocating from, or if it's a primary and - * we're seeking a secondary (or vice versa), we go back and - * select a new metaslab. + * If the metaslab was activated for another allocator + * while we were waiting in the ms_lock above, or it's + * a primary and we're seeking a secondary (or vice versa), + * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { + ASSERT(msp->ms_loaded); + ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || + msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } + /* + * This metaslab was used for claiming regions allocated + * by the ZIL during pool import. Once these regions are + * claimed we don't need to keep the CLAIM bit set + * anymore. Passivate this metaslab to zero its activation + * mask. 
+ */ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { + ASSERT(msp->ms_loaded); + ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } - if (metaslab_activate(msp, allocator, activation_weight) != 0) { + metaslab_set_selected_txg(msp, txg); + + int activation_error = + metaslab_activate(msp, allocator, activation_weight); + metaslab_active_mask_verify(msp); + + /* + * If the metaslab was activated by another thread for + * another allocator or activation_weight (EBUSY), or it + * failed because another metaslab was assigned as primary + * for this allocator (EEXIST) we continue using this + * metaslab for our allocation, rather than going on to a + * worse metaslab (we waited for that metaslab to be loaded + * after all). + * + * If the activation failed due to an I/O error or ENOSPC we + * skip to the next metaslab. + */ + boolean_t activated; + if (activation_error == 0) { + activated = B_TRUE; + } else if (activation_error == EBUSY || + activation_error == EEXIST) { + activated = B_FALSE; + } else { mutex_exit(&msp->ms_lock); continue; } - - msp->ms_selected_txg = txg; + ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The - * the metaslab is now loaded so metaslab_should_allocate() can - * accurately determine if the allocation attempt should + * metaslab is now loaded so metaslab_should_allocate() + * can accurately determine if the allocation attempt should * proceed. */ - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one.
*/ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } - /* - * If this metaslab is currently condensing then pick again as - * we can't manipulate this metaslab until it's committed + * If this metaslab is currently condensing then pick again + * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. @@ -3592,15 +4926,19 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } mutex_exit(&msp->ms_lock); continue; } @@ -3610,12 +4948,22 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ - metaslab_segment_may_passivate(msp); + if (activated) + metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); + /* + * This code is disabled because of issues with + * tracepoints in non-gpl kernel modules. + */ +#if 0 + DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, + uint64_t, asize); +#endif + /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded @@ -3637,14 +4985,33 @@ next: * currently available for allocation and is accurate * even within a sync pass.
*/ + uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - uint64_t weight = metaslab_block_maxsize(msp); + weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); + } else { + weight = metaslab_weight_from_range_tree(msp); + } + + if (activated) { metaslab_passivate(msp, weight); } else { - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); + /* + * For the case where we use the metaslab that is + * active for another allocator we want to make + * sure that we retain the activation mask. + * + * Note that we could attempt to use something like + * metaslab_recalculate_weight_and_sort() that + * retains the activation mask here. That function + * uses metaslab_weight() to set the weight though + * which is not as accurate as the calculations + * above. + */ + weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; + metaslab_group_sort(mg, msp, weight); } + metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check @@ -3652,7 +5019,7 @@ next: * we may end up in an infinite loop retrying the same * metaslab. 
*/ - ASSERT(!metaslab_should_allocate(msp, asize)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } @@ -3663,14 +5030,14 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); + dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -3705,6 +5072,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -3718,7 +5086,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ - if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) { + if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); @@ -3726,7 +5094,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_aliquot because + * Note that there's no locking on mca_rotor or mca_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. 
* @@ -3756,29 +5124,29 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * all else fails. */ if (vd != NULL && vd->vdev_mg != NULL) { - mg = vd->vdev_mg; + mg = vdev_get_mg(vd, mc); if (flags & METASLAB_HINTBP_AVOID && mg->mg_next != NULL) mg = mg->mg_next; } else { - mg = mc->mc_rotor; + mg = mca->mca_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mc->mc_rotor; + mg = fast_mg = mca->mca_rotor; do { if (fast_mg->mg_vd->vdev_pending_fastwrite < mg->mg_vd->vdev_pending_fastwrite) mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); } else { - ASSERT(mc->mc_rotor != NULL); - mg = mc->mc_rotor; + ASSERT(mca->mca_rotor != NULL); + mg = mca->mca_rotor; } /* @@ -3786,7 +5154,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) - mg = mc->mc_rotor; + mg = mca->mca_rotor; rotor = mg; top: @@ -3847,12 +5215,12 @@ top: /* * If we don't need to try hard, then require that the - * block be on an different metaslab from any other DVAs + * block be on a different metaslab from any other DVAs * in this BP (unique=true). If we are trying hard, then * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); + !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* @@ -3864,7 +5232,7 @@ top: * Bias is also used to compensate for unequally * sized vdevs so that space is allocated fairly. 
*/ - if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { + if (mca->mca_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vs_free = vs->vs_space - vs->vs_alloc; int64_t mc_free = mc->mc_space - mc->mc_alloc; @@ -3902,10 +5270,10 @@ top: } if ((flags & METASLAB_FASTWRITE) || - atomic_add_64_nv(&mc->mc_aliquot, asize) >= + atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; + mca->mca_rotor = mg->mg_next; + mca->mca_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -3922,14 +5290,17 @@ top: return (0); } next: - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; + mca->mca_rotor = mg->mg_next; + mca->mca_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* - * If we haven't tried hard, do so now. + * If we haven't tried hard, perhaps do so now. */ - if (!try_hard) { + if (!try_hard && (zfs_metaslab_try_hard_before_gang || + GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || + psize <= 1 << spa->spa_min_ashift)) { + METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } @@ -4029,7 +5400,7 @@ typedef struct remap_blkptr_cb_arg { void *rbca_cb_arg; } remap_blkptr_cb_arg_t; -void +static void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { @@ -4189,13 +5560,14 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + size = vdev_gang_header_asize(vd); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); + msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); @@ -4223,7 +5595,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if 
(DVA_GET_GANG(dva)) { - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + size = vdev_gang_header_asize(vd); } metaslab_free_impl(vd, offset, size, checkpoint); @@ -4240,48 +5612,40 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { - uint64_t available_slots = 0; - boolean_t slot_reserved = B_FALSE; - uint64_t max = mc->mc_alloc_max_slots[allocator]; + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; + uint64_t max = mca->mca_alloc_max_slots; ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - - uint64_t reserved_slots = - zfs_refcount_count(&mc->mc_alloc_slots[allocator]); - if (reserved_slots < max) - available_slots = max - reserved_slots; - - if (slots <= available_slots || GANG_ALLOCATION(flags) || - flags & METASLAB_MUST_RESERVE) { + if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) || + zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { /* + * The potential race between _count() and _add() is covered + * by the allocator lock in most cases, or irrelevant due to + * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others. + * But even if we assume some other non-existing scenario, the + * worst that can happen is few more I/Os get to allocation + * earlier, that is not a problem. + * * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. 
*/ - for (int d = 0; d < slots; d++) { - reserved_slots = - zfs_refcount_add(&mc->mc_alloc_slots[allocator], - zio); - } + for (int d = 0; d < slots; d++) + zfs_refcount_add(&mca->mca_alloc_slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; - slot_reserved = B_TRUE; + return (B_TRUE); } - - mutex_exit(&mc->mc_lock); - return (slot_reserved); + return (B_FALSE); } void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; + ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - for (int d = 0; d < slots; d++) { - (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], - zio); - } - mutex_exit(&mc->mc_lock); + for (int d = 0; d < slots; d++) + zfs_refcount_remove(&mca->mca_alloc_slots, zio); } static int @@ -4326,11 +5690,21 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, range_tree_remove(msp->ms_allocatable, offset, size); range_tree_clear(msp->ms_trim, offset, size); - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); + if (!multilist_link_active(&msp->ms_class_txg_node)) { + msp->ms_selected_txg = txg; + multilist_sublist_insert_head(mls, msp); + } + multilist_sublist_unlock(mls); + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); + msp->ms_allocating_total += size; } mutex_exit(&msp->ms_lock); @@ -4363,7 +5737,7 @@ metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) metaslab_claim_cb_arg_t arg; /* - * Only zdb(1M) can claim on indirect vdevs. This is used + * Only zdb(8) can claim on indirect vdevs. 
This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ @@ -4405,7 +5779,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + size = vdev_gang_header_asize(vd); return (metaslab_claim_impl(vd, offset, size, txg)); } @@ -4424,7 +5798,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + if (mc->mc_allocator[allocator].mca_rotor == NULL) { + /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } @@ -4455,7 +5830,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } - } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); @@ -4614,7 +5988,7 @@ static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; - ASSERTV(spa_t *spa = vd->vdev_spa); + spa_t *spa __maybe_unused = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; @@ -4637,12 +6011,23 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) offset, size); } - range_tree_verify_not_present(msp->ms_trim, offset, size); + /* + * Check all segments that currently exist in the freeing pipeline. + * + * It would intuitively make sense to also check the current allocating + * tree since metaslab_unalloc_dva() exists for extents that are + * allocated and freed in the same sync pass within the same txg. + * Unfortunately there are places (e.g. the ZIL) where we allocate a + * segment but then we free part of it within the same txg + * [see zil_sync()]. Thus, we don't call range_tree_verify() in the + * current allocating tree. 
+ */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } @@ -4660,7 +6045,7 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp) uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + size = vdev_gang_header_asize(vd); ASSERT3P(vd, !=, NULL); @@ -4729,7 +6114,7 @@ metaslab_disable(metaslab_t *msp) } void -metaslab_enable(metaslab_t *msp, boolean_t sync) +metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; @@ -4747,64 +6132,128 @@ metaslab_enable(metaslab_t *msp, boolean_t sync) if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); + if (unload) + metaslab_unload(msp); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } -#if defined(_KERNEL) +static void +metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) +{ + vdev_t *vd = ms->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + metaslab_unflushed_phys_t entry = { + .msp_unflushed_txg = metaslab_unflushed_txg(ms), + }; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object); + if (err == ENOENT) { + object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, 
sizeof (uint64_t), 1, + &object, tx)); + } else { + VERIFY0(err); + } + + dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, + &entry, tx); +} + +void +metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = ms->ms_group->mg_vd->vdev_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ms->ms_unflushed_txg = txg; + metaslab_update_ondisk_flush_data(ms, tx); +} + +uint64_t +metaslab_unflushed_txg(metaslab_t *ms) +{ + return (ms->ms_unflushed_txg); +} + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW, + "Allocation granularity (a.k.a. stripe size)"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, + "Load all metaslabs when pool is first opened"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, + "Prevent metaslabs from being unloaded"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, + "Preload potential metaslabs during reassessment"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW, + "Delay in txgs after metaslab was last used before unloading"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW, + "Delay in milliseconds after metaslab was last used before unloading"); + /* BEGIN CSTYLED */ -module_param(metaslab_aliquot, ulong, 0644); -MODULE_PARM_DESC(metaslab_aliquot, - "allocation granularity (a.k.a. 
stripe size)"); +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW, + "Percentage of metaslab group size that should be free to make it " + "eligible for allocation"); -module_param(metaslab_debug_load, int, 0644); -MODULE_PARM_DESC(metaslab_debug_load, - "load all metaslabs when pool is first opened"); +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW, + "Percentage of metaslab group size that should be considered eligible " + "for allocations unless all metaslab groups within the metaslab class " + "have also crossed this threshold"); -module_param(metaslab_debug_unload, int, 0644); -MODULE_PARM_DESC(metaslab_debug_unload, - "prevent metaslabs from being unloaded"); +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT, + ZMOD_RW, "Fragmentation for metaslab to allow allocation"); -module_param(metaslab_preload_enabled, int, 0644); -MODULE_PARM_DESC(metaslab_preload_enabled, - "preload potential metaslabs during reassessment"); - -module_param(zfs_mg_noalloc_threshold, int, 0644); -MODULE_PARM_DESC(zfs_mg_noalloc_threshold, - "percentage of free space for metaslab group to allow allocation"); - -module_param(zfs_mg_fragmentation_threshold, int, 0644); -MODULE_PARM_DESC(zfs_mg_fragmentation_threshold, - "fragmentation for metaslab group to allow allocation"); - -module_param(zfs_metaslab_fragmentation_threshold, int, 0644); -MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold, - "fragmentation for metaslab to allow allocation"); - -module_param(metaslab_fragmentation_factor_enabled, int, 0644); -MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled, - "use the fragmentation metric to prefer less fragmented metaslabs"); - -module_param(metaslab_lba_weighting_enabled, int, 0644); -MODULE_PARM_DESC(metaslab_lba_weighting_enabled, - "prefer metaslabs with lower LBAs"); - -module_param(metaslab_bias_enabled, int, 0644); -MODULE_PARM_DESC(metaslab_bias_enabled, - "enable metaslab group biasing"); - 
-module_param(zfs_metaslab_segment_weight_enabled, int, 0644); -MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled, - "enable segment-based metaslab selection"); - -module_param(zfs_metaslab_switch_threshold, int, 0644); -MODULE_PARM_DESC(zfs_metaslab_switch_threshold, - "segment-based metaslab selection maximum buckets before switching"); - -module_param(metaslab_force_ganging, ulong, 0644); -MODULE_PARM_DESC(metaslab_force_ganging, - "blocks larger than this size are forced to be gang blocks"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, + "Use the fragmentation metric to prefer less fragmented metaslabs"); /* END CSTYLED */ -#endif +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, + "Prefer metaslabs with lower LBAs"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, + "Enable metaslab group biasing"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, + ZMOD_RW, "Enable segment-based metaslab selection"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, + "Segment-based metaslab selection maximum buckets before switching"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW, + "Blocks larger than this size are forced to be gang blocks"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW, + "Max distance (bytes) to search forward before using size tree"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, + "When looking in size tree, use largest segment instead of exact fit"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, + ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, + "Percentage of memory that can be used to store metaslab range trees"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, 
try_hard_before_gang, INT, + ZMOD_RW, "Try hard to allocate before ganging"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW, + "Normally only consider this many of the best metaslabs in each vdev"); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index cd5603a1a5..f67a4eb22a 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -87,12 +87,12 @@ * * In this case, a weak guarantee is provided. Since the host which last had * the pool imported will suspend the pool if no mmp writes land within - * fail_intervals * multihost_interval ms, the absense of writes during that + * fail_intervals * multihost_interval ms, the absence of writes during that * time means either the pool is not imported, or it is imported but the pool * is suspended and no further writes will occur. * * Note that resuming the suspended pool on the remote host would invalidate - * this gurantee, and so it is not allowed. + * this guarantee, and so it is not allowed. * * The factor of 2 provides a conservative safety factor and derives from * MMP_IMPORT_SAFETY_FACTOR; @@ -198,14 +198,6 @@ mmp_init(spa_t *spa) cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); mmp->mmp_kstat_id = 1; - - /* - * mmp_write_done() calculates mmp_delay based on prior mmp_delay and - * the elapsed time since the last write. For the first mmp write, - * there is no "last write", so we start with fake non-zero values. - */ - mmp->mmp_last_write = gethrtime(); - mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); } void @@ -315,8 +307,17 @@ mmp_next_leaf(spa_t *spa) if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); - if (!vdev_writeable(leaf)) { + /* + * We skip unwritable, offline, detached, and dRAID spare + * devices as they are either not legal targets or the write + * may fail or not be seen by other hosts. Skipped dRAID + * spares can never be written so the fail mask is not set. 
+ */ + if (!vdev_writeable(leaf) || leaf->vdev_offline || + leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; + } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { + continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { @@ -484,8 +485,9 @@ mmp_write_uberblock(spa_t *spa) if (mmp->mmp_skip_error != 0) { mmp->mmp_skip_error = 0; zfs_dbgmsg("MMP write after skipping due to unavailable " - "leaves, pool '%s' gethrtime %llu leaf %#llu", - spa_name(spa), gethrtime(), vd->vdev_guid); + "leaves, pool '%s' gethrtime %llu leaf %llu", + spa_name(spa), (u_longlong_t)gethrtime(), + (u_longlong_t)vd->vdev_guid); } if (mmp->mmp_zio_root == NULL) @@ -522,9 +524,9 @@ mmp_write_uberblock(spa_t *spa) mutex_exit(&mmp->mmp_io_lock); offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - - MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL)); + MMP_BLOCKS_PER_LABEL + random_in_range(MMP_BLOCKS_PER_LABEL)); - label = spa_get_random(VDEV_LABELS); + label = random_in_range(VDEV_LABELS); vdev_label_write(zio, vd, label, ub_abd, offset, VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, flags | ZIO_FLAG_DONT_PROPAGATE); @@ -557,6 +559,18 @@ mmp_thread(void *arg) mmp_thread_enter(mmp, &cpr); + /* + * There have been no MMP writes yet. Setting mmp_last_write here gives + * us one mmp_fail_ns period, which is consistent with the activity + * check duration, to try to land an MMP write before MMP suspends the + * pool (if so configured). 
+ */ + + mutex_enter(&mmp->mmp_io_lock); + mmp->mmp_last_write = gethrtime(); + mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); + mutex_exit(&mmp->mmp_io_lock); + while (!mmp->mmp_thread_exiting) { hrtime_t next_time = gethrtime() + MSEC2NSEC(MMP_DEFAULT_INTERVAL); @@ -604,10 +618,11 @@ mmp_thread(void *arg) "mmp_interval %llu last_mmp_fail_intervals %u " "mmp_fail_intervals %u mmp_fail_ns %llu " "skip_wait %d leaves %d next_time %llu", - spa_name(spa), gethrtime(), last_mmp_interval, - mmp_interval, last_mmp_fail_intervals, - mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves, - next_time); + spa_name(spa), (u_longlong_t)gethrtime(), + (u_longlong_t)last_mmp_interval, + (u_longlong_t)mmp_interval, last_mmp_fail_intervals, + mmp_fail_intervals, (u_longlong_t)mmp_fail_ns, + skip_wait, leaves, (u_longlong_t)next_time); } /* @@ -620,8 +635,9 @@ mmp_thread(void *arg) zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " "last_spa_multihost %u multihost %u " "last_spa_suspended %u suspended %u", - spa_name(spa), last_spa_multihost, multihost, - last_spa_suspended, suspended); + spa_name(spa), (u_longlong_t)gethrtime(), + last_spa_multihost, multihost, last_spa_suspended, + suspended); mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mmp->mmp_delay = mmp_interval; @@ -671,15 +687,14 @@ mmp_thread(void *arg) } CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, next_time, USEC2NSEC(1), + (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv, + &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), CALLOUT_FLAG_ABSOLUTE); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); } /* Outstanding writes are allowed to complete. 
*/ - if (mmp->mmp_zio_root) - zio_wait(mmp->mmp_zio_root); + zio_wait(mmp->mmp_zio_root); mmp->mmp_zio_root = NULL; mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); @@ -716,36 +731,14 @@ mmp_signal_all_threads(void) mutex_exit(&spa_namespace_lock); } -#if defined(_KERNEL) -#include - -static int -param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp) -{ - int ret; - - ret = param_set_ulong(val, kp); - if (ret < 0) - return (ret); - - if (spa_mode_global != 0) - mmp_signal_all_threads(); - - return (ret); -} - /* BEGIN CSTYLED */ -module_param(zfs_multihost_fail_intervals, uint, 0644); -MODULE_PARM_DESC(zfs_multihost_fail_intervals, +ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, + param_set_multihost_interval, param_get_ulong, ZMOD_RW, + "Milliseconds between mmp writes to each leaf"); +/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW, "Max allowed period without a successful mmp write"); -module_param_call(zfs_multihost_interval, param_set_multihost_interval, - param_get_ulong, &zfs_multihost_interval, 0644); -MODULE_PARM_DESC(zfs_multihost_interval, - "Milliseconds between mmp writes to each leaf"); - -module_param(zfs_multihost_import_intervals, uint, 0644); -MODULE_PARM_DESC(zfs_multihost_import_intervals, +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW, "Number of zfs_multihost_interval periods to wait for activity"); -/* END CSTYLED */ -#endif diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index 2a594c56cb..8bbc9b376a 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -18,10 +18,7 @@ #include #include -#include - -/* needed for spa_get_random() */ -#include +#include /* * This overrides the number of sublists in each multilist_t, which defaults @@ -33,7 +30,7 @@ int zfs_multilist_num_sublists = 0; * Given the object contained on the list, return a pointer to the * object's multilist_node_t structure it contains. 
*/ -#ifdef DEBUG +#ifdef ZFS_DEBUG static multilist_node_t * multilist_d2l(multilist_t *ml, void *obj) { @@ -68,8 +65,8 @@ multilist_d2l(multilist_t *ml, void *obj) * requirement, but a general rule of thumb in order to garner the * best multi-threaded performance out of the data structure. */ -static multilist_t * -multilist_create_impl(size_t size, size_t offset, +static void +multilist_create_impl(multilist_t *ml, size_t size, size_t offset, unsigned int num, multilist_sublist_index_func_t *index_func) { ASSERT3U(size, >, 0); @@ -77,7 +74,6 @@ multilist_create_impl(size_t size, size_t offset, ASSERT3U(num, >, 0); ASSERT3P(index_func, !=, NULL); - multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP); ml->ml_offset = offset; ml->ml_num_sublists = num; ml->ml_index_func = index_func; @@ -92,16 +88,18 @@ multilist_create_impl(size_t size, size_t offset, mutex_init(&mls->mls_lock, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&mls->mls_list, size, offset); } - return (ml); } /* - * Allocate a new multilist, using the default number of sublists - * (the number of CPUs, or at least 4, or the tunable - * zfs_multilist_num_sublists). + * Allocate a new multilist, using the default number of sublists (the number + * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note + * that the multilists do not expand if more CPUs are hot-added. In that case, + * we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. 
*/ -multilist_t * -multilist_create(size_t size, size_t offset, +void +multilist_create(multilist_t *ml, size_t size, size_t offset, multilist_sublist_index_func_t *index_func) { int num_sublists; @@ -112,7 +110,7 @@ multilist_create(size_t size, size_t offset, num_sublists = MAX(boot_ncpus, 4); } - return (multilist_create_impl(size, offset, num_sublists, index_func)); + multilist_create_impl(ml, size, offset, num_sublists, index_func); } /* @@ -138,7 +136,7 @@ multilist_destroy(multilist_t *ml) ml->ml_num_sublists = 0; ml->ml_offset = 0; - kmem_free(ml, sizeof (multilist_t)); + ml->ml_sublists = NULL; } /* @@ -274,7 +272,7 @@ multilist_get_num_sublists(multilist_t *ml) unsigned int multilist_get_random_index(multilist_t *ml) { - return (spa_get_random(ml->ml_num_sublists)); + return (random_in_range(ml->ml_num_sublists)); } /* Lock and return the sublist specified at the given index */ @@ -363,6 +361,28 @@ multilist_sublist_remove(multilist_sublist_t *mls, void *obj) list_remove(&mls->mls_list, obj); } +int +multilist_sublist_is_empty(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_is_empty(&mls->mls_list)); +} + +int +multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + int empty; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + ASSERT(!MUTEX_HELD(&mls->mls_lock)); + mutex_enter(&mls->mls_lock); + empty = list_is_empty(&mls->mls_list); + mutex_exit(&mls->mls_lock); + return (empty); +} + void * multilist_sublist_head(multilist_sublist_t *mls) { @@ -403,13 +423,7 @@ multilist_link_active(multilist_node_t *link) return (list_link_active(link)); } -#if defined(_KERNEL) - /* BEGIN CSTYLED */ - -module_param(zfs_multilist_num_sublists, int, 0644); -MODULE_PARM_DESC(zfs_multilist_num_sublists, +ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW, "Number of sublists used in each multilist"); - /* END CSTYLED */ -#endif diff 
--git a/module/zfs/objlist.c b/module/zfs/objlist.c new file mode 100644 index 0000000000..c80bab2a77 --- /dev/null +++ b/module/zfs/objlist.c @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include + +objlist_t * +objlist_create(void) +{ + objlist_t *list = kmem_alloc(sizeof (*list), KM_SLEEP); + list_create(&list->ol_list, sizeof (objlist_node_t), + offsetof(objlist_node_t, on_node)); + list->ol_last_lookup = 0; + return (list); +} + +void +objlist_destroy(objlist_t *list) +{ + for (objlist_node_t *n = list_remove_head(&list->ol_list); + n != NULL; n = list_remove_head(&list->ol_list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&list->ol_list); + kmem_free(list, sizeof (*list)); +} + +/* + * This function looks through the objlist to see if the specified object number + * is contained in the objlist. In the process, it will remove all object + * numbers in the list that are smaller than the specified object number. Thus, + * any lookup of an object number smaller than a previously looked up object + * number will always return false; therefore, all lookups should be done in + * ascending order. 
+ */ +boolean_t +objlist_exists(objlist_t *list, uint64_t object) +{ + objlist_node_t *node = list_head(&list->ol_list); + ASSERT3U(object, >=, list->ol_last_lookup); + list->ol_last_lookup = object; + while (node != NULL && node->on_object < object) { + VERIFY3P(node, ==, list_remove_head(&list->ol_list)); + kmem_free(node, sizeof (*node)); + node = list_head(&list->ol_list); + } + return (node != NULL && node->on_object == object); +} + +/* + * The objlist is a list of object numbers stored in ascending order. However, + * the insertion of new object numbers does not seek out the correct location to + * store a new object number; instead, it appends it to the list for simplicity. + * Thus, any users must take care to only insert new object numbers in ascending + * order. + */ +void +objlist_insert(objlist_t *list, uint64_t object) +{ + objlist_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP); + node->on_object = object; +#ifdef ZFS_DEBUG + objlist_node_t *last_object = list_tail(&list->ol_list); + uint64_t last_objnum = (last_object != NULL ? 
last_object->on_object : + 0); + ASSERT3U(node->on_object, >, last_objnum); +#endif + list_insert_tail(&list->ol_list, node); +} diff --git a/module/zfs/pathname.c b/module/zfs/pathname.c index e3e97c9bb3..84ab7b7e11 100644 --- a/module/zfs/pathname.c +++ b/module/zfs/pathname.c @@ -71,8 +71,7 @@ pn_alloc(struct pathname *pnp) void pn_alloc_sz(struct pathname *pnp, size_t sz) { - pnp->pn_path = pnp->pn_buf = kmem_alloc(sz, KM_SLEEP); - pnp->pn_pathlen = 0; + pnp->pn_buf = kmem_alloc(sz, KM_SLEEP); pnp->pn_bufsize = sz; } @@ -84,6 +83,6 @@ pn_free(struct pathname *pnp) { /* pn_bufsize is usually MAXPATHLEN, but may not be */ kmem_free(pnp->pn_buf, pnp->pn_bufsize); - pnp->pn_path = pnp->pn_buf = NULL; - pnp->pn_pathlen = pnp->pn_bufsize = 0; + pnp->pn_buf = NULL; + pnp->pn_bufsize = 0; } diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 391533b3f4..595918e5a7 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -23,7 +23,8 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. */ #include @@ -74,42 +75,38 @@ * support removing complete segments. 
*/ -kmem_cache_t *range_seg_cache; - -/* Generic ops for managing an AVL tree alongside a range tree */ -struct range_tree_ops rt_avl_ops = { - .rtop_create = rt_avl_create, - .rtop_destroy = rt_avl_destroy, - .rtop_add = rt_avl_add, - .rtop_remove = rt_avl_remove, - .rtop_vacate = rt_avl_vacate, -}; - -void -range_tree_init(void) +static inline void +rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) { - ASSERT(range_seg_cache == NULL); - range_seg_cache = kmem_cache_create("range_seg_cache", - sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -range_tree_fini(void) -{ - kmem_cache_destroy(range_seg_cache); - range_seg_cache = NULL; + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + size_t size = 0; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + VERIFY(0); + } + bcopy(src, dest, size); } void range_tree_stat_verify(range_tree_t *rt) { range_seg_t *rs; + zfs_btree_index_t where; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; - for (rs = avl_first(&rt->rt_root); rs != NULL; - rs = AVL_NEXT(&rt->rt_root, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; hist[idx]++; @@ -119,7 +116,8 @@ range_tree_stat_verify(range_tree_t *rt) for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { if (hist[i] != rt->rt_histogram[i]) { zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu", - i, hist, hist[i], rt->rt_histogram[i]); + i, hist, (u_longlong_t)hist[i], + (u_longlong_t)rt->rt_histogram[i]); } VERIFY3U(hist[i], ==, rt->rt_histogram[i]); } @@ -128,7 +126,7 @@ range_tree_stat_verify(range_tree_t *rt) static void range_tree_stat_incr(range_tree_t *rt, 
range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -142,7 +140,7 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) static void range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -153,14 +151,35 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } -/* - * NOTE: caller is responsible for all locking. - */ static int -range_tree_seg_compare(const void *x1, const void *x2) +range_tree_seg32_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = (const range_seg_t *)x1; - const range_seg_t *r2 = (const range_seg_t *)x2; + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg64_compare(const void *x1, const void *x2) +{ + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg_gap_compare(const void *x1, const void *x2) +{ + const range_seg_gap_t *r1 = x1; + const range_seg_gap_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); @@ -169,18 +188,42 @@ range_tree_seg_compare(const void *x1, const void *x2) } range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap) +range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, + uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, 
const void *), + uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - avl_create(&rt->rt_root, range_tree_seg_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); + ASSERT3U(shift, <, 64); + ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); + size_t size; + int (*compare) (const void *, const void *); + switch (type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = range_tree_seg32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = range_tree_seg64_compare; + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + compare = range_tree_seg_gap_compare; + break; + default: + panic("Invalid range seg type %d", type); + } + zfs_btree_create(&rt->rt_root, compare, size); rt->rt_ops = ops; rt->rt_gap = gap; rt->rt_arg = arg; - rt->rt_avl_compare = avl_compare; + rt->rt_type = type; + rt->rt_start = start; + rt->rt_shift = shift; + rt->rt_btree_compare = zfs_btree_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); @@ -189,9 +232,10 @@ range_tree_create_impl(range_tree_ops_t *ops, void *arg, } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) +range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift) { - return (range_tree_create_impl(ops, arg, NULL, 0)); + return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0)); } void @@ -202,19 +246,30 @@ range_tree_destroy(range_tree_t *rt) if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); - avl_destroy(&rt->rt_root); + zfs_btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) { - ASSERT3U(rs->rs_fill + delta, !=, 0); - ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); + if (delta < 0 && delta * -1 >= rs_get_fill(rs, rt)) { + zfs_panic_recover("zfs: 
attempting to decrease fill to or " + "below 0; probable double remove in segment [%llx:%llx]", + (longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt)); + } + if (rs_get_fill(rs, rt) + delta > rs_get_end(rs, rt) - + rs_get_start(rs, rt)) { + zfs_panic_recover("zfs: attempting to increase fill beyond " + "max; probable double add in segment [%llx:%llx]", + (longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt)); + } if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_fill += delta; + rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } @@ -223,28 +278,20 @@ static void range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; - avl_index_t where; - range_seg_t rsearch, *rs_before, *rs_after, *rs; + zfs_btree_index_t where; + range_seg_t *rs_before, *rs_after, *rs; + range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; boolean_t merge_before, merge_after; ASSERT3U(size, !=, 0); ASSERT3U(fill, <=, size); + ASSERT3U(start + size, >, start); - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); - - if (gap == 0 && rs != NULL && - rs->rs_start <= start && rs->rs_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* * If this is a gap-supporting range tree, it is possible that we @@ -255,27 +302,32 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * the normal code paths. 
*/ if (rs != NULL) { - ASSERT3U(gap, !=, 0); - if (rs->rs_start <= start && rs->rs_end >= end) { + if (gap == 0) { + zfs_panic_recover("zfs: adding existent segment to " + "range tree (offset=%llx size=%llx)", + (longlong_t)start, (longlong_t)size); + return; + } + uint64_t rstart = rs_get_start(rs, rt); + uint64_t rend = rs_get_end(rs, rt); + if (rstart <= start && rend >= end) { range_tree_adjust_fill(rt, rs, fill); return; } - avl_remove(&rt->rt_root, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); range_tree_stat_decr(rt, rs); - rt->rt_space -= rs->rs_end - rs->rs_start; + rt->rt_space -= rend - rstart; - fill += rs->rs_fill; - start = MIN(start, rs->rs_start); - end = MAX(end, rs->rs_end); + fill += rs_get_fill(rs, rt); + start = MIN(start, rstart); + end = MAX(end, rend); size = end - start; + zfs_btree_remove(&rt->rt_root, rs); range_tree_add_impl(rt, start, size, fill); - - kmem_cache_free(range_seg_cache, rs); return; } @@ -286,19 +338,21 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * If gap != 0, we might need to merge with our neighbors even if we * aren't directly touching. 
*/ - rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); - rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); + zfs_btree_index_t where_before, where_after; + rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); + rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); - merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); - merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= + start - gap); + merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end + + gap); if (merge_before && gap != 0) - bridge_size += start - rs_before->rs_end; + bridge_size += start - rs_get_end(rs_before, rt); if (merge_after && gap != 0) - bridge_size += rs_after->rs_start - end; + bridge_size += rs_get_start(rs_after, rt) - end; if (merge_before && merge_after) { - avl_remove(&rt->rt_root, rs_before); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); @@ -307,9 +361,19 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += rs_before->rs_fill + fill; - rs_after->rs_start = rs_before->rs_start; - kmem_cache_free(range_seg_cache, rs_before); + rs_copy(rs_after, &tmp, rt); + uint64_t before_start = rs_get_start_raw(rs_before, rt); + uint64_t before_fill = rs_get_fill(rs_before, rt); + uint64_t after_fill = rs_get_fill(rs_after, rt); + zfs_btree_remove_idx(&rt->rt_root, &where_before); + + /* + * We have to re-find the node because our old reference is + * invalid as soon as we do any mutating btree operations. 
+ */ + rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); + rs_set_start_raw(rs_after, rt, before_start); + rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -317,8 +381,9 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_before); - rs_before->rs_fill += fill; - rs_before->rs_end = end; + uint64_t before_fill = rs_get_fill(rs_before, rt); + rs_set_end(rs_before, rt, end); + rs_set_fill(rs_before, rt, before_fill + fill); rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -326,22 +391,26 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += fill; - rs_after->rs_start = start; + uint64_t after_fill = rs_get_fill(rs_after, rt); + rs_set_start(rs_after, rt, start); + rs_set_fill(rs_after, rt, after_fill + fill); rs = rs_after; } else { - rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + rs = &tmp; - rs->rs_fill = fill; - rs->rs_start = start; - rs->rs_end = end; - avl_insert(&rt->rt_root, rs, where); + rs_set_start(rs, rt, start); + rs_set_end(rs, rt, end); + rs_set_fill(rs, rt, fill); + zfs_btree_add_idx(&rt->rt_root, rs, &where); } - if (gap != 0) - ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); - else - ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + if (gap != 0) { + ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } else { + ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); @@ -360,22 +429,25 @@ static void range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { - avl_index_t where; - range_seg_t rsearch, *rs, *newseg; + 
zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", + zfs_panic_recover("zfs: removing nonexistent segment from " + "range tree (offset=%llx size=%llx)", (longlong_t)start, (longlong_t)size); return; } @@ -388,30 +460,32 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, */ if (rt->rt_gap != 0) { if (do_fill) { - if (rs->rs_fill == size) { - start = rs->rs_start; - end = rs->rs_end; + if (rs_get_fill(rs, rt) == size) { + start = rs_get_start(rs, rt); + end = rs_get_end(rs, rt); size = end - start; } else { range_tree_adjust_fill(rt, rs, -size); return; } - } else if (rs->rs_start != start || rs->rs_end != end) { + } else if (rs_get_start(rs, rt) != start || + rs_get_end(rs, rt) != end) { zfs_panic_recover("zfs: freeing partial segment of " - "gap tree (offset=%llu size=%llu) of " - "(offset=%llu size=%llu)", + "gap tree (offset=%llx size=%llx) of " + "(offset=%llx size=%llx)", (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); + (longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs, + rt)); return; } } - VERIFY3U(rs->rs_start, <=, start); - VERIFY3U(rs->rs_end, >=, end); + VERIFY3U(rs_get_start(rs, rt), <=, start); + VERIFY3U(rs_get_end(rs, rt), >=, end); - left_over = (rs->rs_start != start); - right_over = (rs->rs_end != end); + left_over = (rs_get_start(rs, rt) != start); + 
right_over = (rs_get_end(rs, rt) != end); range_tree_stat_decr(rt, rs); @@ -419,24 +493,33 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { - newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - newseg->rs_start = end; - newseg->rs_end = rs->rs_end; - newseg->rs_fill = newseg->rs_end - newseg->rs_start; - range_tree_stat_incr(rt, newseg); + range_seg_max_t newseg; + rs_set_start(&newseg, rt, end); + rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt)); + rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end); + range_tree_stat_incr(rt, &newseg); - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + + rs_copy(rs, &rs_tmp, rt); + if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) + zfs_btree_add_idx(&rt->rt_root, &newseg, &where); + else + zfs_btree_add(&rt->rt_root, &newseg); - avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); } else if (left_over) { - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + rs_copy(rs, &rs_tmp, rt); } else if (right_over) { - rs->rs_start = end; + // This modifies the buffer already inside the range tree + rs_set_start(rs, rt, end); + rs_copy(rs, &rs_tmp, rt); } else { - avl_remove(&rt->rt_root, rs); - kmem_cache_free(range_seg_cache, rs); + zfs_btree_remove_idx(&rt->rt_root, &where); rs = NULL; } @@ -446,11 +529,12 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. 
*/ - rs->rs_fill = rs->rs_end - rs->rs_start; - range_tree_stat_incr(rt, rs); + rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) - + rs_get_start_raw(rs, rt)); + range_tree_stat_incr(rt, &rs_tmp); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); } rt->rt_space -= size; @@ -472,14 +556,14 @@ void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize) { - int64_t delta = newsize - (rs->rs_end - rs->rs_start); + int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt)); range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_start = newstart; - rs->rs_end = newstart + newsize; + rs_set_start(rs, rt, newstart); + rs_set_end(rs, rt, newstart + newsize); range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) @@ -491,22 +575,27 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { - range_seg_t rsearch; + range_seg_max_t rsearch; uint64_t end = start + size; VERIFY(size != 0); - rsearch.rs_start = start; - rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, NULL)); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); } range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + range_seg_t *rs = range_tree_find_impl(rt, start, size); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) + if (rs != NULL && rs_get_start(rs, rt) <= start && + rs_get_end(rs, rt) >= start + size) { return (rs); + } return (NULL); } @@ -524,6 +613,40 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) return 
(range_tree_find(rt, start, size) != NULL); } +/* + * Returns the first subset of the given range which overlaps with the range + * tree. Returns true if there is a segment in the range, and false if there + * isn't. + */ +boolean_t +range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize) +{ + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + + range_seg_max_t rsearch; + rs_set_start(&rsearch, rt, start); + rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); + + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); + if (rs != NULL) { + *ostart = start; + *osize = MIN(size, rs_get_end(rs, rt) - start); + return (B_TRUE); + } + + rs = zfs_btree_next(&rt->rt_root, &where, &where); + if (rs == NULL || rs_get_start(rs, rt) > start + size) + return (B_FALSE); + + *ostart = rs_get_start(rs, rt); + *osize = MIN(start + size, rs_get_end(rs, rt)) - + rs_get_start(rs, rt); + return (B_TRUE); +} + /* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. 
@@ -536,9 +659,12 @@ range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) if (size == 0) return; + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { - uint64_t free_start = MAX(rs->rs_start, start); - uint64_t free_end = MIN(rs->rs_end, start + size); + uint64_t free_start = MAX(rs_get_start(rs, rt), start); + uint64_t free_end = MIN(rs_get_end(rs, rt), start + size); range_tree_remove(rt, free_start, free_end - free_start); } } @@ -549,7 +675,7 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) range_tree_t *rt; ASSERT0(range_tree_space(*rtdst)); - ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); + ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; @@ -559,16 +685,20 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) { - range_seg_t *rs; - void *cookie = NULL; - if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); - while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { - if (func != NULL) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); - kmem_cache_free(range_seg_cache, rs); + if (func != NULL) { + range_seg_t *rs; + zfs_btree_index_t *cookie = NULL; + + while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != + NULL) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } + } else { + zfs_btree_clear(&rt->rt_root); } bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); @@ -578,16 +708,18 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { - range_seg_t *rs; - - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + zfs_btree_index_t where; + for (range_seg_t *rs = 
zfs_btree_first(&rt->rt_root, &where); + rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } } range_seg_t * range_tree_first(range_tree_t *rt) { - return (avl_first(&rt->rt_root)); + return (zfs_btree_first(&rt->rt_root, NULL)); } uint64_t @@ -596,6 +728,12 @@ range_tree_space(range_tree_t *rt) return (rt->rt_space); } +uint64_t +range_tree_numsegs(range_tree_t *rt) +{ + return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root)); +} + boolean_t range_tree_is_empty(range_tree_t *rt) { @@ -603,63 +741,179 @@ range_tree_is_empty(range_tree_t *rt) return (range_tree_space(rt) == 0); } -/* Generic range tree functions for maintaining segments in an AVL tree. */ +/* ARGSUSED */ void -rt_avl_create(range_tree_t *rt, void *arg) +rt_btree_create(range_tree_t *rt, void *arg) { - avl_tree_t *tree = arg; + zfs_btree_t *size_tree = arg; - avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), - offsetof(range_seg_t, rs_pp_node)); + size_t size; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, rt->rt_btree_compare, size); } +/* ARGSUSED */ void -rt_avl_destroy(range_tree_t *rt, void *arg) +rt_btree_destroy(range_tree_t *rt, void *arg) { - avl_tree_t *tree = arg; + zfs_btree_t *size_tree = arg; + ASSERT0(zfs_btree_numnodes(size_tree)); - ASSERT0(avl_numnodes(tree)); - avl_destroy(tree); + zfs_btree_destroy(size_tree); } +/* ARGSUSED */ void -rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) +rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) { - avl_tree_t *tree = arg; - avl_add(tree, rs); + zfs_btree_t *size_tree = arg; + + zfs_btree_add(size_tree, rs); } +/* ARGSUSED */ void -rt_avl_remove(range_tree_t 
*rt, range_seg_t *rs, void *arg) +rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { - avl_tree_t *tree = arg; - avl_remove(tree, rs); + zfs_btree_t *size_tree = arg; + + zfs_btree_remove(size_tree, rs); } +/* ARGSUSED */ void -rt_avl_vacate(range_tree_t *rt, void *arg) +rt_btree_vacate(range_tree_t *rt, void *arg) { - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - rt_avl_create(rt, arg); + zfs_btree_t *size_tree = arg; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + rt_btree_create(rt, arg); +} + +range_tree_ops_t rt_btree_ops = { + .rtop_create = rt_btree_create, + .rtop_destroy = rt_btree_destroy, + .rtop_add = rt_btree_add, + .rtop_remove = rt_btree_remove, + .rtop_vacate = rt_btree_vacate +}; + +/* + * Remove any overlapping ranges between the given segment [start, end) + * from removefrom. Add non-overlapping leftovers to addto. 
+ */ +void +range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto) +{ + zfs_btree_index_t where; + range_seg_max_t starting_rs; + rs_set_start(&starting_rs, removefrom, start); + rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, + removefrom) + 1); + + range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, + &starting_rs, &where); + + if (curr == NULL) + curr = zfs_btree_next(&removefrom->rt_root, &where, &where); + + range_seg_t *next; + for (; curr != NULL; curr = next) { + if (start == end) + return; + VERIFY3U(start, <, end); + + /* there is no overlap */ + if (end <= rs_get_start(curr, removefrom)) { + range_tree_add(addto, start, end - start); + return; + } + + uint64_t overlap_start = MAX(rs_get_start(curr, removefrom), + start); + uint64_t overlap_end = MIN(rs_get_end(curr, removefrom), + end); + uint64_t overlap_size = overlap_end - overlap_start; + ASSERT3S(overlap_size, >, 0); + range_seg_max_t rs; + rs_copy(curr, &rs, removefrom); + + range_tree_remove(removefrom, overlap_start, overlap_size); + + if (start < overlap_start) + range_tree_add(addto, start, overlap_start - start); + + start = overlap_end; + next = zfs_btree_find(&removefrom->rt_root, &rs, &where); + /* + * If we find something here, we only removed part of the + * curr segment. Either there's some left at the end + * because we've reached the end of the range we're removing, + * or there's some left at the start because we started + * partway through the range. Either way, we continue with + * the loop. If it's the former, we'll return at the start of + * the loop, and if it's the latter we'll see if there is more + * area to process. 
+ */ + if (next != NULL) { + ASSERT(start == end || start == rs_get_end(&rs, + removefrom)); + } + + next = zfs_btree_next(&removefrom->rt_root, &where, &where); + } + VERIFY3P(curr, ==, NULL); + + if (start != end) { + VERIFY3U(start, <, end); + range_tree_add(addto, start, end - start); + } else { + VERIFY3U(start, ==, end); + } +} + +/* + * For each entry in rt, if it exists in removefrom, remove it + * from removefrom. Otherwise, add it to addto. + */ +void +range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto) +{ + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + range_tree_remove_xor_add_segment(rs_get_start(rs, rt), + rs_get_end(rs, rt), removefrom, addto); + } } uint64_t range_tree_min(range_tree_t *rt) { - range_seg_t *rs = avl_first(&rt->rt_root); - return (rs != NULL ? rs->rs_start : 0); + range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_start(rs, rt) : 0); } uint64_t range_tree_max(range_tree_t *rt) { - range_seg_t *rs = avl_last(&rt->rt_root); - return (rs != NULL ? rs->rs_end : 0); + range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_end(rs, rt) : 0); } uint64_t diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index bcaa6d3875..35a379dded 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -20,17 +20,18 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2021 by Delphix. All rights reserved. */ #include -#include +#include -#ifdef _KERNEL -int reference_tracking_enable = FALSE; /* runs out of memory too easily */ -#else -int reference_tracking_enable = TRUE; -#endif +/* + * Reference count tracking is disabled by default. 
It's memory requirements + * are reasonable, however as implemented it consumes a significant amount of + * cpu time. Until its performance is improved it should be manually enabled. + */ +int reference_tracking_enable = FALSE; int reference_history = 3; /* tunable */ #ifdef ZFS_DEBUG @@ -86,7 +87,7 @@ zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; - ASSERT(rc->rc_count == number); + ASSERT3U(rc->rc_count, ==, number); while ((ref = list_head(&rc->rc_list))) { list_remove(&rc->rc_list, ref); kmem_cache_free(reference_cache, ref); @@ -111,30 +112,33 @@ zfs_refcount_destroy(zfs_refcount_t *rc) int zfs_refcount_is_zero(zfs_refcount_t *rc) { - return (rc->rc_count == 0); + return (zfs_refcount_count(rc) == 0); } int64_t zfs_refcount_count(zfs_refcount_t *rc) { - return (rc->rc_count); + return (atomic_load_64(&rc->rc_count)); } int64_t -zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, void *holder) +zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { reference_t *ref = NULL; int64_t count; - if (rc->rc_tracked) { - ref = kmem_cache_alloc(reference_cache, KM_SLEEP); - ref->ref_holder = holder; - ref->ref_number = number; + if (!rc->rc_tracked) { + count = atomic_add_64_nv(&(rc)->rc_count, number); + ASSERT3U(count, >=, number); + return (count); } + + ref = kmem_cache_alloc(reference_cache, KM_SLEEP); + ref->ref_holder = holder; + ref->ref_number = number; mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= 0); - if (rc->rc_tracked) - list_insert_head(&rc->rc_list, ref); + ASSERT3U(rc->rc_count, >=, 0); + list_insert_head(&rc->rc_list, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -143,27 +147,26 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, void *holder) } int64_t -zfs_refcount_add(zfs_refcount_t *rc, void *holder) +zfs_refcount_add(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_add_many(rc, 1, holder)); } int64_t 
-zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, void *holder) +zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, + const void *holder) { reference_t *ref; int64_t count; - mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= number); - if (!rc->rc_tracked) { - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); + count = atomic_add_64_nv(&(rc)->rc_count, -number); + ASSERT3S(count, >=, 0); return (count); } + mutex_enter(&rc->rc_mtx); + ASSERT3U(rc->rc_count, >=, number); for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == holder && ref->ref_number == number) { @@ -197,7 +200,7 @@ zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, void *holder) } int64_t -zfs_refcount_remove(zfs_refcount_t *rc, void *holder) +zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_remove_many(rc, 1, holder)); } @@ -235,17 +238,15 @@ zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, - void *current_holder, void *new_holder) + const void *current_holder, const void *new_holder) { reference_t *ref; boolean_t found = B_FALSE; - mutex_enter(&rc->rc_mtx); - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); + if (!rc->rc_tracked) return; - } + mutex_enter(&rc->rc_mtx); for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == current_holder && @@ -260,8 +261,8 @@ zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, } void -zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, - void *new_holder) +zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, + const void *new_holder) { return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder, new_holder)); @@ -273,17 +274,14 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, * 
might be held. */ boolean_t -zfs_refcount_held(zfs_refcount_t *rc, void *holder) +zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref; + if (!rc->rc_tracked) + return (zfs_refcount_count(rc) > 0); + mutex_enter(&rc->rc_mtx); - - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); - return (rc->rc_count > 0); - } - for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == holder) { @@ -301,17 +299,14 @@ zfs_refcount_held(zfs_refcount_t *rc, void *holder) * since the reference might not be held. */ boolean_t -zfs_refcount_not_held(zfs_refcount_t *rc, void *holder) +zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref; - mutex_enter(&rc->rc_mtx); - - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); + if (!rc->rc_tracked) return (B_TRUE); - } + mutex_enter(&rc->rc_mtx); for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == holder) { @@ -322,4 +317,20 @@ zfs_refcount_not_held(zfs_refcount_t *rc, void *holder) mutex_exit(&rc->rc_mtx); return (B_TRUE); } + +EXPORT_SYMBOL(zfs_refcount_create); +EXPORT_SYMBOL(zfs_refcount_destroy); +EXPORT_SYMBOL(zfs_refcount_is_zero); +EXPORT_SYMBOL(zfs_refcount_count); +EXPORT_SYMBOL(zfs_refcount_add); +EXPORT_SYMBOL(zfs_refcount_remove); +EXPORT_SYMBOL(zfs_refcount_held); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, ,reference_tracking_enable, INT, ZMOD_RW, + "Track reference holders to refcount_t objects"); + +ZFS_MODULE_PARAM(zfs, ,reference_history, INT, ZMOD_RW, + "Maximum reference holders being tracked"); +/* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c index 582b40a583..d23fc3ad10 100644 --- a/module/zfs/rrwlock.c +++ b/module/zfs/rrwlock.c @@ -26,8 +26,8 @@ * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ -#include #include +#include /* * This file contains the implementation of a re-entrant read @@ -163,7 +163,7 @@ static void rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) { mutex_enter(&rrl->rr_lock); -#if !defined(DEBUG) && defined(_KERNEL) +#if !defined(ZFS_DEBUG) && defined(_KERNEL) if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && !rrl->rr_track_all) { rrl->rr_anon_rcount.rc_count++; @@ -240,7 +240,7 @@ void rrw_exit(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); -#if !defined(DEBUG) && defined(_KERNEL) +#if !defined(ZFS_DEBUG) && defined(_KERNEL) if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { rrl->rr_anon_rcount.rc_count--; if (rrl->rr_anon_rcount.rc_count == 0) diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 56a606962a..2604a7513e 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include @@ -83,7 +82,7 @@ * Layouts are simply an array of the attributes and their * ordering i.e. [0, 1, 4, 5, 2] * - * Each distinct layout is given a unique layout number and that is whats + * Each distinct layout is given a unique layout number and that is what's * stored in the header at the beginning of the SA data buffer. * * A layout only covers a single dbuf (bonus or spill). If a set of @@ -95,7 +94,7 @@ * Adding a single attribute will cause the entire set of attributes to * be rewritten and could result in a new layout number being constructed * as part of the rewrite if no such layout exists for the new set of - * attribues. The new attribute will be appended to the end of the already + * attributes. The new attribute will be appended to the end of the already * existing attributes. 
* * Both the attribute registration and attribute layout information are @@ -252,7 +251,7 @@ layout_num_compare(const void *arg1, const void *arg2) const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - return (AVL_CMP(node1->lot_num, node2->lot_num)); + return (TREE_CMP(node1->lot_num, node2->lot_num)); } static int @@ -261,14 +260,14 @@ layout_hash_compare(const void *arg1, const void *arg2) const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(node1->lot_instance, node2->lot_instance)); + return (TREE_CMP(node1->lot_instance, node2->lot_instance)); } -boolean_t +static boolean_t sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) { int i; @@ -318,7 +317,7 @@ sa_get_spill(sa_handle_t *hdl) * * Operates on bulk array, first failure will abort further processing */ -int +static int sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, sa_data_op_t data_op, dmu_tx_t *tx) { @@ -1014,7 +1013,7 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); - mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL); sa->sa_master_obj = sa_obj; os->os_sa = sa; @@ -1156,7 +1155,7 @@ sa_tear_down(objset_t *os) os->os_sa = NULL; } -void +static void sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t var_length, void *userp) { @@ -1220,7 +1219,7 @@ sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, } /*ARGSUSED*/ -void +static void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { @@ -1230,14 +1229,14 @@ sa_byteswap_cb(void *hdr, void 
*attr_addr, sa_attr_type_t attr, sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); } -void +static void sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); dmu_buf_impl_t *db; int num_lengths = 1; int i; - ASSERTV(sa_os_t *sa = hdl->sa_os->os_sa); + sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); if (sa_hdr_phys->sa_magic == SA_MAGIC) @@ -1293,7 +1292,7 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) mutex_exit(&sa->sa_lock); zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x " "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC, - db->db.db_object); + (u_longlong_t)db->db.db_object); return (SET_ERROR(EIO)); } sa_byteswap(hdl, buftype); @@ -1344,7 +1343,7 @@ sa_idx_tab_rele(objset_t *os, void *arg) static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) { - ASSERTV(sa_os_t *sa = os->os_sa); + sa_os_t *sa __maybe_unused = os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL); @@ -1380,7 +1379,7 @@ sa_handle_destroy(sa_handle_t *hdl) dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) - dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); + dmu_buf_rele(hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); @@ -1462,7 +1461,7 @@ sa_buf_rele(dmu_buf_t *db, void *tag) dmu_buf_rele(db, tag); } -int +static int sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) { ASSERT(hdl); @@ -1503,7 +1502,7 @@ sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) #ifdef _KERNEL int -sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) +sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, zfs_uio_t *uio) { int error; sa_bulk_attr_t bulk; @@ -1516,8 +1515,8 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { - error = uiomove((void 
*)bulk.sa_addr, MIN(bulk.sa_size, - uio->uio_resid), UIO_READ, uio); + error = zfs_uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, + zfs_uio_resid(uio)), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); @@ -1586,7 +1585,7 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); - if (S_ISBLK(ZTOI(zp)->i_mode) || S_ISCHR(ZTOI(zp)->i_mode)) + if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } else { @@ -1625,7 +1624,7 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) zp->z_projid = projid; zp->z_pflags |= ZFS_PROJID; - links = ZTOI(zp)->i_nlink; + links = ZTONLNK(zp); count = 0; err = 0; @@ -1646,7 +1645,7 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); - if (S_ISBLK(ZTOI(zp)->i_mode) || S_ISCHR(ZTOI(zp)->i_mode)) + if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); @@ -2028,7 +2027,7 @@ sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, hdl->sa_spill_tab = NULL; } - dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); + dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; } @@ -2131,13 +2130,13 @@ sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { - dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); + dmu_object_info_from_db(hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { - dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, + dmu_object_size_from_db(hdl->sa_bonus, blksize, nblocks); } @@ -2150,7 +2149,7 @@ sa_set_userp(sa_handle_t *hdl, void *ptr) dmu_buf_t * sa_get_db(sa_handle_t *hdl) { - 
return ((dmu_buf_t *)hdl->sa_bonus); + return (hdl->sa_bonus); } void * diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c index 2adadf56f9..d297768ead 100644 --- a/module/zfs/sha256.c +++ b/module/zfs/sha256.c @@ -28,9 +28,10 @@ */ #include #include +#include #include #include -#include "qat.h" +#include static int sha_incremental(void *buf, size_t size, void *arg) diff --git a/module/zfs/skein_zfs.c b/module/zfs/skein_zfs.c index 8deb84b266..11b9940e02 100644 --- a/module/zfs/skein_zfs.c +++ b/module/zfs/skein_zfs.c @@ -24,6 +24,7 @@ */ #include #include +#include #include #include diff --git a/module/zfs/spa.c b/module/zfs/spa.c index eb3ff91a07..1083b5a90d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -29,9 +29,10 @@ * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2021, Colm Buckley */ /* @@ -57,8 +58,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -90,6 +93,7 @@ #include #include #include +#include #endif /* _KERNEL */ #include "zfs_prop.h" @@ -104,6 +108,7 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ + ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
*/ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; @@ -111,6 +116,7 @@ typedef enum zti_modes { #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } +#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -137,7 +143,8 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. + * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, + * but with number of taskqs also scaling with number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that @@ -146,9 +153,9 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ - { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ - { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ + { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ + { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ @@ -160,7 +167,8 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); -uint_t 
zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ +uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ +uint_t zio_taskq_batch_tpq; /* threads per taskq */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ @@ -172,6 +180,12 @@ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ */ boolean_t spa_load_verify_dryrun = B_FALSE; +/* + * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). + * This is used by zdb for spacemaps verification. + */ +boolean_t spa_mode_readable_spacemaps = B_FALSE; + /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. @@ -232,6 +246,27 @@ uint64_t zfs_max_missing_tvds_scan = 0; */ boolean_t zfs_pause_spa_sync = B_FALSE; +/* + * Variables to indicate the livelist condense zthr func should wait at certain + * points for the livelist to be removed - used to test condense/destroy races + */ +int zfs_livelist_condense_zthr_pause = 0; +int zfs_livelist_condense_sync_pause = 0; + +/* + * Variables to track whether or not condense cancellation has been + * triggered in testing. 
+ */ +int zfs_livelist_condense_sync_cancel = 0; +int zfs_livelist_condense_zthr_cancel = 0; + +/* + * Variable to track whether or not extra ALLOC blkptrs were added to a + * livelist entry while it was being condensed (caused by the way we track + * remapped blkptrs in dbuf_remap_impl) + */ +int zfs_livelist_condense_new_alloc = 0; + /* * ========================================================================== * SPA properties routines @@ -248,15 +283,15 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + propval = fnvlist_alloc(); + fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + fnvlist_add_string(propval, ZPROP_VALUE, strval); else - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); + fnvlist_add_uint64(propval, ZPROP_VALUE, intval); - VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } @@ -279,10 +314,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); + size += metaslab_class_get_space(spa_embedded_log_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); @@ -297,7 +334,7 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); 
spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, - (spa_mode(spa) == FREAD), src); + (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); @@ -351,6 +388,11 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 0, ZPROP_SRC_LOCAL); } + if (spa->spa_compatibility != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, + spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); + } + if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); @@ -391,12 +433,15 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; + dsl_pool_t *dp; int err; err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); if (err) return (err); + dp = spa_get_dsl(spa); + dsl_pool_config_enter(dp, FTAG); mutex_enter(&spa->spa_props_lock); /* @@ -405,10 +450,8 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ - if (mos == NULL || spa->spa_pool_props_object == 0) { - mutex_exit(&spa->spa_props_lock); + if (mos == NULL || spa->spa_pool_props_object == 0) goto out; - } /* * Get properties from the MOS pool property object. 
@@ -432,23 +475,17 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { - dsl_pool_t *dp; dsl_dataset_t *ds = NULL; - dp = spa_get_dsl(spa); - dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); - if (err != 0) { - dsl_pool_config_exit(dp, FTAG); + if (err != 0) break; - } strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; @@ -479,8 +516,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) } } zap_cursor_fini(&zc); - mutex_exit(&spa->spa_props_lock); out: + mutex_exit(&spa->spa_props_lock); + dsl_pool_config_exit(dp, FTAG); if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; @@ -567,8 +605,13 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (!error && intval > 1) error = SET_ERROR(EINVAL); - if (!error && !spa_get_hostid()) - error = SET_ERROR(ENOTSUP); + if (!error) { + uint32_t hostid = zone_get_hostid(NULL); + if (hostid) + spa->spa_hostid = hostid; + else + error = SET_ERROR(ENOTSUP); + } break; @@ -597,7 +640,6 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (!error) { objset_t *os; - uint64_t propval; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( @@ -609,27 +651,9 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (error != 0) break; - /* - * Must be ZPL, and its property settings - * must be supported by GRUB (compression - * is not gzip, and large dnodes are not - * used). - */ - + /* Must be ZPL. 
*/ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); - } else if ((error = - dsl_prop_get_int_ds(dmu_objset_ds(os), - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - &propval)) == 0 && - !BOOTFS_COMPRESS_VALID(propval)) { - error = SET_ERROR(ENOTSUP); - } else if ((error = - dsl_prop_get_int_ds(dmu_objset_ds(os), - zfs_prop_to_name(ZFS_PROP_DNODESIZE), - &propval)) == 0 && - propval != ZFS_DNSIZE_LEGACY) { - error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } @@ -694,16 +718,6 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(E2BIG); break; - case ZPOOL_PROP_DEDUPDITTO: - if (spa_version(spa) < SPA_VERSION_DEDUP) - error = SET_ERROR(ENOTSUP); - else - error = nvpair_value_uint64(elem, &intval); - if (error == 0 && - intval != 0 && intval < ZIO_DEDUPDITTO_MIN) - error = SET_ERROR(EINVAL); - break; - default: break; } @@ -712,6 +726,9 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; } + (void) nvlist_remove_all(props, + zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); + if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); @@ -827,7 +844,7 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { - ASSERTV(uint64_t *newguid = arg); + uint64_t *newguid __maybe_unused = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; @@ -867,7 +884,7 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx) spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", - oldguid, *newguid); + (u_longlong_t)oldguid, (u_longlong_t)*newguid); } /* @@ -919,7 +936,7 @@ spa_error_entry_compare(const void *a, const void *b) ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - return (AVL_ISIGN(ret)); + return (TREE_ISIGN(ret)); } /* @@ -950,25 +967,12 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, 
zio_taskq_type_t q) uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - uint_t flags = 0; + uint_t cpus, flags = TASKQ_DYNAMIC; boolean_t batch = B_FALSE; - if (mode == ZTI_MODE_NULL) { - tqs->stqs_count = 0; - tqs->stqs_taskq = NULL; - return; - } - - ASSERT3U(count, >, 0); - - tqs->stqs_count = count; - tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); - switch (mode) { case ZTI_MODE_FIXED: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); - flags |= TASKQ_DYNAMIC; + ASSERT3U(value, >, 0); break; case ZTI_MODE_BATCH: @@ -977,6 +981,48 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) value = MIN(zio_taskq_batch_pct, 100); break; + case ZTI_MODE_SCALE: + flags |= TASKQ_THREADS_CPU_PCT; + /* + * We want more taskqs to reduce lock contention, but we want + * less for better request ordering and CPU utilization. + */ + cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + if (zio_taskq_batch_tpq > 0) { + count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / + zio_taskq_batch_tpq); + } else { + /* + * Prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. For 80%: + * + * taskq taskq total + * cpus taskqs percent threads threads + * ------- ------- ------- ------- ------- + * 1 1 80% 1 1 + * 2 1 80% 1 1 + * 4 1 80% 3 3 + * 8 2 40% 3 6 + * 16 3 27% 4 12 + * 32 5 16% 5 25 + * 64 7 11% 7 49 + * 128 10 8% 10 100 + * 256 14 6% 15 210 + */ + count = 1 + cpus / 6; + while (count * count > cpus) + count--; + } + /* Limit each taskq within 100% to not trigger assertion. 
*/ + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + value = (zio_taskq_batch_pct + count / 2) / count; + break; + + case ZTI_MODE_NULL: + tqs->stqs_count = 0; + tqs->stqs_taskq = NULL; + return; + default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", @@ -984,12 +1030,20 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) break; } + ASSERT3U(count, >, 0); + tqs->stqs_count = count; + tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); + for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; - (void) snprintf(name, sizeof (name), "%s_%s", - zio_type_name[t], zio_taskq_types[q]); + if (count > 1) + (void) snprintf(name, sizeof (name), "%s_%s_%u", + zio_type_name[t], zio_taskq_types[q], i); + else + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) @@ -1002,13 +1056,25 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important - * priority than the other taskqs. Under Linux this - * means incrementing the priority value on platforms - * like illumos it should be decremented. + * priority than the other taskqs. + * + * Under Linux and FreeBSD this means incrementing + * the priority value as opposed to platforms like + * illumos where it should be decremented. + * + * On FreeBSD, if priorities divided by four (RQ_PPQ) + * are equal then a difference between them is + * insignificant. */ - if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) + if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { +#if defined(__linux__) pri++; - +#elif defined(__FreeBSD__) + pri += 4; +#else +#error "unknown OS" +#endif + } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } @@ -1176,15 +1242,18 @@ spa_thread(void *arg) * Activate an uninitialized pool. 
*/ static void -spa_activate(spa_t *spa, int mode) +spa_activate(spa_t *spa, spa_mode_t mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; + spa->spa_read_spacemaps = spa_mode_readable_spacemaps; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_embedded_log_class = + metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); @@ -1250,7 +1319,7 @@ spa_activate(spa_t *spa, int mode) /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy - * resolution of various deadlocks (zfsonlinux bug #3681). + * resolution of various deadlocks. * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. @@ -1270,15 +1339,15 @@ spa_activate(spa_t *spa, int mode) * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ - spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 
*/ - spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); } /* @@ -1336,6 +1405,9 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; + metaslab_class_destroy(spa->spa_embedded_log_class); + spa->spa_embedded_log_class = NULL; + metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; @@ -1386,7 +1458,7 @@ spa_deactivate(spa_t *spa) * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ -static int +int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { @@ -1427,19 +1499,111 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (0); } +static boolean_t +spa_should_flush_logs_on_unload(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return (B_FALSE); + + if (!spa_writeable(spa)) + return (B_FALSE); + + if (!spa->spa_sync_on) + return (B_FALSE); + + if (spa_state(spa) != POOL_STATE_EXPORTED) + return (B_FALSE); + + if (zfs_keep_log_spacemaps_at_export) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Opens a transaction that will set the flag that will instruct + * spa_sync to attempt to flush all the metaslabs for that txg. 
+ */ +static void +spa_unload_log_sm_flush_all(spa_t *spa) +{ + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + ASSERT3U(spa->spa_log_flushall_txg, ==, 0); + spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); + + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); +} + +static void +spa_unload_log_sm_metadata(spa_t *spa) +{ + void *cookie = NULL; + spa_log_sm_t *sls; + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, + &cookie)) != NULL) { + VERIFY0(sls->sls_mscount); + kmem_free(sls, sizeof (spa_log_sm_t)); + } + + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { + VERIFY0(e->lse_mscount); + list_remove(&spa->spa_log_summary, e); + kmem_free(e, sizeof (log_summary_entry_t)); + } + + spa->spa_unflushed_stats.sus_nblocks = 0; + spa->spa_unflushed_stats.sus_memused = 0; + spa->spa_unflushed_stats.sus_blocklimit = 0; +} + +static void +spa_destroy_aux_threads(spa_t *spa) +{ + if (spa->spa_condense_zthr != NULL) { + zthr_destroy(spa->spa_condense_zthr); + spa->spa_condense_zthr = NULL; + } + if (spa->spa_checkpoint_discard_zthr != NULL) { + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + if (spa->spa_livelist_delete_zthr != NULL) { + zthr_destroy(spa->spa_livelist_delete_zthr); + spa->spa_livelist_delete_zthr = NULL; + } + if (spa->spa_livelist_condense_zthr != NULL) { + zthr_destroy(spa->spa_livelist_condense_zthr); + spa->spa_livelist_condense_zthr = NULL; + } +} + /* * Opposite of spa_load(). 
*/ static void spa_unload(spa_t *spa) { - int i; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); spa_load_note(spa, "UNLOADING"); + spa_wake_waiters(spa); + + /* + * If the log space map feature is enabled and the pool is getting + * exported (but not destroyed), we want to spend some time flushing + * as many metaslabs as we can in an attempt to destroy log space + * maps and save import time. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); + /* * Stop async tasks. */ @@ -1450,6 +1614,7 @@ spa_unload(spa_t *spa) vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -1461,16 +1626,15 @@ spa_unload(spa_t *spa) } /* - * Even though vdev_free() also calls vdev_metaslab_fini, we need - * to call it earlier, before we wait for async i/o to complete. - * This ensures that there is no async metaslab prefetching, by - * calling taskq_wait(mg_taskq). + * This ensures that there is no async metaslab prefetching + * while we attempt to unload the spa. 
*/ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) - vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, spa); + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; + if (vc->vdev_mg != NULL) + taskq_wait(vc->vdev_mg->mg_taskq); + } } if (spa->spa_mmp.mmp_thread) @@ -1491,15 +1655,7 @@ spa_unload(spa_t *spa) spa->spa_vdev_removal = NULL; } - if (spa->spa_condense_zthr != NULL) { - zthr_destroy(spa->spa_condense_zthr); - spa->spa_condense_zthr = NULL; - } - - if (spa->spa_checkpoint_discard_zthr != NULL) { - zthr_destroy(spa->spa_checkpoint_discard_zthr); - spa->spa_checkpoint_discard_zthr = NULL; - } + spa_destroy_aux_threads(spa); spa_condense_fini(spa); @@ -1524,13 +1680,14 @@ spa_unload(spa_t *spa) } ddt_unload(spa); + spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); - for (i = 0; i < spa->spa_spares.sav_count; i++) + for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, @@ -1543,7 +1700,7 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } @@ -1566,6 +1723,10 @@ spa_unload(spa_t *spa) spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } + if (spa->spa_compatibility != NULL) { + spa_strfree(spa->spa_compatibility); + spa->spa_compatibility = NULL; + } spa_config_exit(spa, SCL_ALL, spa); } @@ -1619,8 +1780,8 @@ spa_load_spares(spa_t *spa) if (spa->spa_spares.sav_config == NULL) nspares = 0; else - VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + 
VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares)); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; @@ -1682,16 +1843,15 @@ spa_load_spares(spa_t *spa) * Recompute the stashed list of spares, with status information * this time. */ - VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, - DATA_TYPE_NVLIST_ARRAY) == 0); + fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); + fnvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); @@ -1741,16 +1901,15 @@ spa_load_l2cache(spa_t *spa) goto out; } - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); /* * Process new nvlist of vdevs. 
*/ for (i = 0; i < nl2cache; i++) { - VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, - &guid) == 0); + guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { @@ -1792,6 +1951,15 @@ spa_load_l2cache(spa_t *spa) if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); + + /* + * Upon cache device addition to a pool or pool + * creation with a cache device or if the header + * of the device is invalid we issue an async + * TRIM command for the whole device which will + * execute if l2arc_trim_ahead > 0. + */ + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } @@ -1802,8 +1970,7 @@ spa_load_l2cache(spa_t *spa) * Recompute the stashed list of l2cache devices, with status * information this time. */ - VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, - DATA_TYPE_NVLIST_ARRAY) == 0); + fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); if (sav->sav_count > 0) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), @@ -1811,8 +1978,8 @@ spa_load_l2cache(spa_t *spa) for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); + fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, + sav->sav_count); out: /* @@ -1922,7 +2089,7 @@ spa_check_for_missing_logs(spa_t *spa) child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + nv = fnvlist_alloc(); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; @@ -1998,6 +2165,9 @@ spa_check_logs(spa_t *spa) return (rv); } +/* + * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 
+ */ static boolean_t spa_passivate_log(spa_t *spa) { @@ -2006,15 +2176,12 @@ spa_passivate_log(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - if (!spa_has_slogs(spa)) - return (B_FALSE); - for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { - metaslab_group_passivate(mg); + ASSERT3P(tvd->vdev_log_mg, ==, NULL); + metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } } @@ -2022,6 +2189,9 @@ spa_passivate_log(spa_t *spa) return (slog_found); } +/* + * Activate any log vdevs (note, does not apply to embedded log metaslabs). + */ static void spa_activate_log(spa_t *spa) { @@ -2031,10 +2201,11 @@ spa_activate_log(spa_t *spa) for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - if (tvd->vdev_islog) - metaslab_group_activate(mg); + if (tvd->vdev_islog) { + ASSERT3P(tvd->vdev_log_mg, ==, NULL); + metaslab_group_activate(tvd->vdev_mg); + } } } @@ -2101,16 +2272,16 @@ spa_load_verify_done(zio_t *zio) } mutex_enter(&spa->spa_scrub_lock); - spa->spa_load_verify_ios--; + spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* - * Maximum number of concurrent scrub i/os to create while verifying - * a pool while importing it. + * Maximum number of inflight bytes is the log2 fraction of the arc size. + * By default, we set it to 1/16th of the arc. 
*/ -int spa_load_verify_maxinflight = 10000; +int spa_load_verify_shift = 4; int spa_load_verify_metadata = B_TRUE; int spa_load_verify_data = B_TRUE; @@ -2119,7 +2290,8 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); /* * Note: normally this routine will not be called if @@ -2131,13 +2303,15 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); + uint64_t maxinflight_bytes = + arc_target_bytes() >> spa_load_verify_shift; zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) + while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_load_verify_ios++; + spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, @@ -2148,7 +2322,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } /* ARGSUSED */ -int +static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) @@ -2190,12 +2364,14 @@ spa_load_verify(spa_t *spa) "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } + error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); + ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; @@ -2216,12 +2392,12 @@ spa_load_verify(spa_t *spa) spa->spa_load_txg_ts = 
spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; - VERIFY(nvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); - VERIFY(nvlist_add_int64(spa->spa_load_info, - ZPOOL_CONFIG_REWIND_TIME, loss) == 0); - VERIFY(nvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, + spa->spa_load_txg_ts); + fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, + loss); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } @@ -2272,6 +2448,386 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) return (SET_ERROR(err)); } +boolean_t +spa_livelist_delete_check(spa_t *spa) +{ + return (spa->spa_livelists_to_delete != 0); +} + +/* ARGSUSED */ +static boolean_t +spa_livelist_delete_cb_check(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + return (spa_livelist_delete_check(spa)); +} + +static int +delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + spa_t *spa = arg; + zio_free(spa, tx->tx_txg, bp); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + return (0); +} + +static int +dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) +{ + int err; + zap_cursor_t zc; + zap_attribute_t za; + zap_cursor_init(&zc, os, zap_obj); + err = zap_cursor_retrieve(&zc, &za); + zap_cursor_fini(&zc); + if (err == 0) + *llp = za.za_first_integer; + return (err); +} + +/* + * Components of livelist deletion that must be performed in syncing + * context: freeing block pointers and updating the pool-wide data + * structures to indicate how much work is left to do + */ +typedef struct sublist_delete_arg { + spa_t *spa; + dsl_deadlist_t *ll; + uint64_t key; + bplist_t *to_free; +} 
sublist_delete_arg_t; + +static void +sublist_delete_sync(void *arg, dmu_tx_t *tx) +{ + sublist_delete_arg_t *sda = arg; + spa_t *spa = sda->spa; + dsl_deadlist_t *ll = sda->ll; + uint64_t key = sda->key; + bplist_t *to_free = sda->to_free; + + bplist_iterate(to_free, delete_blkptr_cb, spa, tx); + dsl_deadlist_remove_entry(ll, key, tx); +} + +typedef struct livelist_delete_arg { + spa_t *spa; + uint64_t ll_obj; + uint64_t zap_obj; +} livelist_delete_arg_t; + +static void +livelist_delete_sync(void *arg, dmu_tx_t *tx) +{ + livelist_delete_arg_t *lda = arg; + spa_t *spa = lda->spa; + uint64_t ll_obj = lda->ll_obj; + uint64_t zap_obj = lda->zap_obj; + objset_t *mos = spa->spa_meta_objset; + uint64_t count; + + /* free the livelist and decrement the feature count */ + VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); + dsl_deadlist_free(mos, ll_obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + VERIFY0(zap_count(mos, zap_obj, &count)); + if (count == 0) { + /* no more livelists to delete */ + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, tx)); + VERIFY0(zap_destroy(mos, zap_obj, tx)); + spa->spa_livelists_to_delete = 0; + spa_notify_waiters(spa); + } +} + +/* + * Load in the value for the livelist to be removed and open it. Then, + * load its first sublist and determine which block pointers should actually + * be freed. Then, call a synctask which performs the actual frees and updates + * the pool-wide livelist data. + */ +/* ARGSUSED */ +static void +spa_livelist_delete_cb(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + uint64_t ll_obj = 0, count; + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj = spa->spa_livelists_to_delete; + /* + * Determine the next livelist to delete. This function should only + * be called if there is at least one deleted clone. 
+ */ + VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); + VERIFY0(zap_count(mos, ll_obj, &count)); + if (count > 0) { + dsl_deadlist_t *ll; + dsl_deadlist_entry_t *dle; + bplist_t to_free; + ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); + dsl_deadlist_open(ll, mos, ll_obj); + dle = dsl_deadlist_first(ll); + ASSERT3P(dle, !=, NULL); + bplist_create(&to_free); + int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, + z, NULL); + if (err == 0) { + sublist_delete_arg_t sync_arg = { + .spa = spa, + .ll = ll, + .key = dle->dle_mintxg, + .to_free = &to_free + }; + zfs_dbgmsg("deleting sublist (id %llu) from" + " livelist %llu, %lld remaining", + (u_longlong_t)dle->dle_bpobj.bpo_object, + (u_longlong_t)ll_obj, (longlong_t)count - 1); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + sublist_delete_sync, &sync_arg, 0, + ZFS_SPACE_CHECK_DESTROY)); + } else { + VERIFY3U(err, ==, EINTR); + } + bplist_clear(&to_free); + bplist_destroy(&to_free); + dsl_deadlist_close(ll); + kmem_free(ll, sizeof (dsl_deadlist_t)); + } else { + livelist_delete_arg_t sync_arg = { + .spa = spa, + .ll_obj = ll_obj, + .zap_obj = zap_obj + }; + zfs_dbgmsg("deletion of livelist %llu completed", + (u_longlong_t)ll_obj); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, + &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); + } +} + +static void +spa_start_livelist_destroy_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); + spa->spa_livelist_delete_zthr = + zthr_create("z_livelist_destroy", + spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, + minclsyspri); +} + +typedef struct livelist_new_arg { + bplist_t *allocs; + bplist_t *frees; +} livelist_new_arg_t; + +static int +livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(tx == NULL); + livelist_new_arg_t *lna = arg; + if (bp_freed) { + bplist_append(lna->frees, bp); + } else { + bplist_append(lna->allocs, bp); + 
zfs_livelist_condense_new_alloc++; + } + return (0); +} + +typedef struct livelist_condense_arg { + spa_t *spa; + bplist_t to_keep; + uint64_t first_size; + uint64_t next_size; +} livelist_condense_arg_t; + +static void +spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) +{ + livelist_condense_arg_t *lca = arg; + spa_t *spa = lca->spa; + bplist_t new_frees; + dsl_dataset_t *ds = spa->spa_to_condense.ds; + + /* Have we been cancelled? */ + if (spa->spa_to_condense.cancelled) { + zfs_livelist_condense_sync_cancel++; + goto out; + } + + dsl_deadlist_entry_t *first = spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + + /* + * It's possible that the livelist was changed while the zthr was + * running. Therefore, we need to check for new blkptrs in the two + * entries being condensed and continue to track them in the livelist. + * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), + * it's possible that the newly added blkptrs are FREEs or ALLOCs so + * we need to sort them into two different bplists. 
+ */ + uint64_t first_obj = first->dle_bpobj.bpo_object; + uint64_t next_obj = next->dle_bpobj.bpo_object; + uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; + uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + + bplist_create(&new_frees); + livelist_new_arg_t new_bps = { + .allocs = &lca->to_keep, + .frees = &new_frees, + }; + + if (cur_first_size > lca->first_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->first_size)); + } + if (cur_next_size > lca->next_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->next_size)); + } + + dsl_deadlist_clear_entry(first, ll, tx); + ASSERT(bpobj_is_empty(&first->dle_bpobj)); + dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); + + bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); + bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); + bplist_destroy(&new_frees); + + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " + "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " + "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, + (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, + (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, + (u_longlong_t)cur_next_size, + (u_longlong_t)first->dle_bpobj.bpo_object, + (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); +out: + dmu_buf_rele(ds->ds_dbuf, spa); + spa->spa_to_condense.ds = NULL; + bplist_clear(&lca->to_keep); + bplist_destroy(&lca->to_keep); + kmem_free(lca, sizeof (livelist_condense_arg_t)); + spa->spa_to_condense.syncing = B_FALSE; +} + +static void +spa_livelist_condense_cb(void *arg, zthr_t *t) +{ + while (zfs_livelist_condense_zthr_pause && + !(zthr_has_waiters(t) || zthr_iscancelled(t))) + delay(1); + + spa_t *spa = arg; + dsl_deadlist_entry_t *first = 
spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + uint64_t first_size, next_size; + + livelist_condense_arg_t *lca = + kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); + bplist_create(&lca->to_keep); + + /* + * Process the livelists (matching FREEs and ALLOCs) in open context + * so we have minimal work in syncing context to condense. + * + * We save bpobj sizes (first_size and next_size) to use later in + * syncing context to determine if entries were added to these sublists + * while in open context. This is possible because the clone is still + * active and open for normal writes and we want to make sure the new, + * unprocessed blockpointers are inserted into the livelist normally. + * + * Note that dsl_process_sub_livelist() both stores the size number of + * blockpointers and iterates over them while the bpobj's lock held, so + * the sizes returned to us are consistent which what was actually + * processed. + */ + int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, + &first_size); + if (err == 0) + err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, + t, &next_size); + + if (err == 0) { + while (zfs_livelist_condense_sync_pause && + !(zthr_has_waiters(t) || zthr_iscancelled(t))) + delay(1); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + dmu_tx_mark_netfree(tx); + dmu_tx_hold_space(tx, 1); + err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); + if (err == 0) { + /* + * Prevent the condense zthr restarting before + * the synctask completes. + */ + spa->spa_to_condense.syncing = B_TRUE; + lca->spa = spa; + lca->first_size = first_size; + lca->next_size = next_size; + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_livelist_condense_sync, lca, tx); + dmu_tx_commit(tx); + return; + } + } + /* + * Condensing can not continue: either it was externally stopped or + * we were unable to assign to a tx because the pool has run out of + * space. 
In the second case, we'll just end up trying to condense + * again in a later txg. + */ + ASSERT(err != 0); + bplist_clear(&lca->to_keep); + bplist_destroy(&lca->to_keep); + kmem_free(lca, sizeof (livelist_condense_arg_t)); + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); + spa->spa_to_condense.ds = NULL; + if (err == EINTR) + zfs_livelist_condense_zthr_cancel++; +} + +/* ARGSUSED */ +/* + * Check that there is something to condense but that a condense is not + * already in progress and that condensing has not been cancelled. + */ +static boolean_t +spa_livelist_condense_cb_check(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + if ((spa->spa_to_condense.ds != NULL) && + (spa->spa_to_condense.syncing == B_FALSE) && + (spa->spa_to_condense.cancelled == B_FALSE)) { + return (B_TRUE); + } + return (B_FALSE); +} + +static void +spa_start_livelist_condensing_thread(spa_t *spa) +{ + spa->spa_to_condense.ds = NULL; + spa->spa_to_condense.first = NULL; + spa->spa_to_condense.next = NULL; + spa->spa_to_condense.syncing = B_FALSE; + spa->spa_to_condense.cancelled = B_FALSE; + + ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); + spa->spa_livelist_condense_zthr = + zthr_create("z_livelist_condense", + spa_livelist_condense_cb_check, + spa_livelist_condense_cb, spa, minclsyspri); +} + static void spa_spawn_aux_threads(spa_t *spa) { @@ -2280,11 +2836,14 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); + spa_start_livelist_destroy_thread(spa); + spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = - zthr_create(spa_checkpoint_discard_thread_check, - spa_checkpoint_discard_thread, spa); + zthr_create("z_checkpoint_discard", + spa_checkpoint_discard_thread_check, + spa_checkpoint_discard_thread, spa, minclsyspri); } /* @@ -2394,7 +2953,8 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 
spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { - zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); + (void) zfs_ereport_post(ereport, spa, + NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; @@ -2496,7 +3056,7 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); - if (hostid == spa_get_hostid()) + if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* @@ -2540,8 +3100,10 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " - "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), - MMP_INTERVAL(ub), import_intervals); + "import_intervals=%llu", (u_longlong_t)import_delay, + (u_longlong_t)MMP_FAIL_INT(ub), + (u_longlong_t)MMP_INTERVAL(ub), + (u_longlong_t)import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { @@ -2552,27 +3114,32 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " - "import_intervals=%u", import_delay, MMP_INTERVAL(ub), - ub->ub_mmp_delay, import_intervals); + "import_intervals=%llu", (u_longlong_t)import_delay, + (u_longlong_t)MMP_INTERVAL(ub), + (u_longlong_t)ub->ub_mmp_delay, + (u_longlong_t)import_intervals); } else if (MMP_VALID(ub)) { /* - * zfs-0.7 compatability case + * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " - "import_intervals=%u leaves=%u", import_delay, - ub->ub_mmp_delay, import_intervals, + "import_intervals=%llu leaves=%u", + (u_longlong_t)import_delay, + (u_longlong_t)ub->ub_mmp_delay, + (u_longlong_t)import_intervals, vdev_count_leaves(spa)); } else { /* Using local 
tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " - "import_intervals=%u", import_delay, multihost_interval, - import_intervals); + "import_intervals=%llu", (u_longlong_t)import_delay, + (u_longlong_t)multihost_interval, + (u_longlong_t)import_intervals); } return (import_delay); @@ -2624,7 +3191,7 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ - import_delay += import_delay * spa_get_random(250) / 1000; + import_delay += import_delay * random_in_range(250) / 1000; import_expire = gethrtime() + import_delay; @@ -2640,8 +3207,11 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", - txg, ub->ub_txg, timestamp, ub->ub_timestamp, - mmp_config, ub->ub_mmp_config); + (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, + (u_longlong_t)timestamp, + (u_longlong_t)ub->ub_timestamp, + (u_longlong_t)mmp_config, + (u_longlong_t)ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; @@ -2727,7 +3297,8 @@ spa_verify_host(spa_t *spa, nvlist_t *mos_config) cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). 
" - "See: http://illumos.org/msg/ZFS-8000-EY", + "See: https://openzfs.github.io/openzfs-docs/msg/" + "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", @@ -2748,6 +3319,7 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type) vdev_t *rvd; uint64_t pool_guid; char *comment; + char *compatibility; /* * Versioning wasn't explicitly added to the label until later, so if @@ -2796,6 +3368,11 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type) if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); + ASSERT(spa->spa_compatibility == NULL); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, + &compatibility) == 0) + spa->spa_compatibility = spa_strdup(compatibility); + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); @@ -2880,7 +3457,7 @@ spa_ld_open_vdevs(spa_t *spa) if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); - if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { + if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot @@ -3015,7 +3592,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && - spa_get_hostid() == 0) { + spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); @@ -3074,7 +3651,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) * from the label. 
*/ nvlist_free(spa->spa_label_features); - VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + spa->spa_label_features = fnvlist_dup(features); } nvlist_free(label); @@ -3087,21 +3664,20 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; - VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == - 0); + unsup_feat = fnvlist_alloc(); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { - VERIFY(nvlist_add_string(unsup_feat, - nvpair_name(nvp), "") == 0); + fnvlist_add_string(unsup_feat, + nvpair_name(nvp), ""); } } if (!nvlist_empty(unsup_feat)) { - VERIFY(nvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, @@ -3192,7 +3768,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, /* * Build a new vdev tree from the trusted config */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); + if (error != 0) { + nvlist_free(mos_config); + spa_config_exit(spa, SCL_ALL, FTAG); + spa_load_failed(spa, "spa_config_parse failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Vdev paths in the MOS may be obsolete. If the untrusted config was @@ -3540,6 +4123,15 @@ spa_ld_get_props(spa_t *spa) if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* + * Load the livelist deletion field. 
If a livelist is queued for + * deletion, indicate that in the spa + */ + error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, + &spa->spa_livelists_to_delete, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* * Load the history object. If we have an older pool, this * will not be present. @@ -3595,7 +4187,7 @@ spa_ld_get_props(spa_t *spa) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { - uint64_t autoreplace; + uint64_t autoreplace = 0; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); @@ -3603,8 +4195,6 @@ spa_ld_get_props(spa_t *spa) spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); - spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, - &spa->spa_dedup_ditto); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); } @@ -3695,7 +4285,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ - if (spa_multihost(spa) && spa_get_hostid() == 0 && + if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); @@ -3731,11 +4321,18 @@ spa_ld_load_vdev_metadata(spa_t *spa) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } + error = spa_ld_log_spacemaps(spa); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
*/ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); @@ -3852,7 +4449,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, need_update = B_TRUE; /* - * Update the config cache asychronously in case we're the + * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) @@ -3862,7 +4459,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, static void spa_ld_prepare_for_reload(spa_t *spa) { - int mode = spa->spa_mode; + spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); @@ -4040,7 +4637,7 @@ spa_ld_checkpoint_rewind(spa_t *spa) vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; - int c0 = spa_get_random(children); + int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; @@ -4165,7 +4762,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) return (error); /* - * Redo the loading process process again with the + * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); @@ -4324,11 +4921,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) update_config_cache); /* - * Check all DTLs to see if anything needs resilvering. + * Check if a rebuild was in progress and if so resume it. + * Then check all DTLs to see if anything needs resilvering. + * The resilver will be deferred if a rebuild was started. 
*/ - if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + if (vdev_rebuild_active(spa->spa_root_vdev)) { + vdev_rebuild_restart(spa); + } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); + } /* * Log the fact that we booted up (so that we can detect if @@ -4364,6 +4966,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) } spa_import_progress_remove(spa_guid(spa)); + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_load_note(spa, "LOADED"); return (0); @@ -4372,7 +4976,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) static int spa_load_retry(spa_t *spa, spa_load_state_t state) { - int mode = spa->spa_mode; + spa_mode_t mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); @@ -4580,11 +5184,10 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { - VERIFY(nvlist_dup(spa->spa_config, config, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist(*config, + *config = fnvlist_dup(spa->spa_config); + fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); + spa->spa_load_info); } spa_unload(spa); spa_deactivate(spa); @@ -4606,8 +5209,8 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * gathered while doing the load. 
*/ if (state == SPA_LOAD_RECOVER) { - VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); + fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info); } if (locked) { @@ -4618,7 +5221,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, } if (firstopen) - zvol_create_minors(spa, spa_name(spa), B_TRUE); + zvol_create_minors_recursive(spa_name(spa)); *spapp = spa; @@ -4685,15 +5288,14 @@ spa_add_spares(spa_t *spa, nvlist_t *config) if (spa->spa_spares.sav_count == 0) return; - VERIFY(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares)); if (nspares != 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, + nspares); + VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares)); /* * Go through and find any spares which have since been @@ -4701,13 +5303,13 @@ spa_add_spares(spa_t *spa, nvlist_t *config) * their status appropriately. 
*/ for (i = 0; i < nspares; i++) { - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &guid) == 0); + guid = fnvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { - VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0); + VERIFY0(nvlist_lookup_uint64_array(spares[i], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, + &vsc)); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } @@ -4734,23 +5336,22 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) if (spa->spa_l2cache.sav_count == 0) return; - VERIFY(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); if (nl2cache != 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, + nl2cache); + VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache)); /* * Update level 2 cache device stats. 
*/ for (i = 0; i < nl2cache; i++) { - VERIFY(nvlist_lookup_uint64(l2cache[i], - ZPOOL_CONFIG_GUID, &guid) == 0); + guid = fnvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { @@ -4762,9 +5363,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } ASSERT(vd != NULL); - VERIFY(nvlist_lookup_uint64_array(l2cache[i], - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) - == 0); + VERIFY0(nvlist_lookup_uint64_array(l2cache[i], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); vdev_get_stats(vd, vs); vdev_config_generate_stats(vd, l2cache[i]); @@ -4879,20 +5479,20 @@ spa_get_stats(const char *name, nvlist_t **config, loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; - VERIFY(nvlist_add_uint64_array(*config, - ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); + fnvlist_add_uint64_array(*config, + ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); - VERIFY(nvlist_add_uint64(*config, + fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); + spa_get_errlog_size(spa)); if (spa_suspended(spa)) { - VERIFY(nvlist_add_uint64(*config, + fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, - spa->spa_failmode) == 0); - VERIFY(nvlist_add_uint64(*config, + spa->spa_failmode); + fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, - spa->spa_suspended) == 0); + spa->spa_suspended); } spa_add_spares(spa, *config); @@ -4984,8 +5584,8 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { - VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); + fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, + vd->vdev_guid); } vdev_free(vd); @@ -5036,23 +5636,20 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, * Generate new dev list by concatenating with the * current dev list. 
*/ - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, - &olddevs, &oldndevs) == 0); + VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, + &olddevs, &oldndevs)); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) - VERIFY(nvlist_dup(olddevs[i], &newdevs[i], - KM_SLEEP) == 0); + newdevs[i] = fnvlist_dup(olddevs[i]); for (i = 0; i < ndevs; i++) - VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], - KM_SLEEP) == 0); + newdevs[i + oldndevs] = fnvlist_dup(devs[i]); - VERIFY(nvlist_remove(sav->sav_config, config, - DATA_TYPE_NVLIST_ARRAY) == 0); + fnvlist_remove(sav->sav_config, config); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, - config, newdevs, ndevs + oldndevs) == 0); + fnvlist_add_nvlist_array(sav->sav_config, config, newdevs, + ndevs + oldndevs); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); @@ -5060,10 +5657,8 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, /* * Generate a new dev list. 
*/ - VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, - devs, ndevs) == 0); + sav->sav_config = fnvlist_alloc(); + fnvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs); } } @@ -5121,9 +5716,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj; + uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; + boolean_t has_allocclass; spa_feature_t feat; char *feat_name; char *poolname; @@ -5168,6 +5764,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, has_features = B_FALSE; has_encryption = B_FALSE; + has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { @@ -5177,6 +5774,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; + if (feat == SPA_FEATURE_ALLOCATION_CLASSES) + has_allocclass = B_TRUE; } } @@ -5190,6 +5789,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (error); } } + if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (ENOTSUP); + } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { @@ -5233,8 +5838,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups 
(this will dirty the vdevs) * we can no longer error exit past this point @@ -5262,10 +5867,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa->spa_spares.sav_config = fnvlist_alloc(); + fnvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5277,10 +5881,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa->spa_l2cache.sav_config = fnvlist_alloc(); + fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5375,6 +5978,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + for (int i = 0; i < ndraid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -5409,7 +6015,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; - uint64_t mode = spa_mode_global; + spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; @@ -5433,7 +6039,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 
(void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) - mode = FREAD; + mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; @@ -5478,8 +6084,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); + fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -5497,8 +6102,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_load_l2cache(spa); } - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) @@ -5522,13 +6126,12 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) - VERIFY(nvlist_remove(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + fnvlist_remove(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES); else - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa->spa_spares.sav_config = fnvlist_alloc(); + fnvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5537,13 +6140,12 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { 
if (spa->spa_l2cache.sav_config) - VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + fnvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE); else - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa->spa_l2cache.sav_config = fnvlist_alloc(); + fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5575,10 +6177,10 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); - zvol_create_minors(spa, pool, B_TRUE); - mutex_exit(&spa_namespace_lock); + zvol_create_minors_recursive(pool); + return (0); } @@ -5603,7 +6205,7 @@ spa_tryimport(nvlist_t *tryconfig) */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); - spa_activate(spa, FREAD); + spa_activate(spa, SPA_MODE_READ); /* * Rewind pool if a max txg was provided. 
@@ -5633,16 +6235,14 @@ spa_tryimport(nvlist_t *tryconfig) */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, - poolname) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - state) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, - spa->spa_uberblock.ub_timestamp) == 0); - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, - spa->spa_errata) == 0); + fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); + fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, + spa->spa_uberblock.ub_timestamp); + fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info); + fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, + spa->spa_errata); /* * If the bootfs property exists on this pool then we @@ -5671,8 +6271,8 @@ spa_tryimport(nvlist_t *tryconfig) (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } - VERIFY(nvlist_add_string(config, - ZPOOL_CONFIG_BOOTFS, dsname) == 0); + fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, + dsname); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); @@ -5705,15 +6305,16 @@ spa_tryimport(nvlist_t *tryconfig) * we don't sync the labels or remove the configuration cache. 
*/ static int -spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, +spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { + int error; spa_t *spa; if (oldconfig) *oldconfig = NULL; - if (!(spa_mode_global & FWRITE)) + if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); @@ -5722,6 +6323,13 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, return (SET_ERROR(ENOENT)); } + if (spa->spa_is_exporting) { + /* the pool is being exported by another thread */ + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); + } + spa->spa_is_exporting = B_TRUE; + /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. @@ -5753,12 +6361,9 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * references. If we are resetting a pool, allow references by * fault injection handlers. 
*/ - if (!spa_refcount_zero(spa) || - (spa->spa_inject_ref != 0 && - new_state != POOL_STATE_UNINITIALIZED)) { - spa_async_resume(spa); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EBUSY)); + if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { + error = SET_ERROR(EBUSY); + goto fail; } if (spa->spa_sync_on) { @@ -5770,9 +6375,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { - spa_async_resume(spa); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EXDEV)); + error = SET_ERROR(EXDEV); + goto fail; } /* @@ -5787,6 +6391,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -5816,23 +6421,36 @@ export_spa: } if (oldconfig && spa->spa_config) - VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); + *oldconfig = fnvlist_dup(spa->spa_config); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); + } else { + /* + * If spa_remove() is not called for this spa_t and + * there is any possibility that it can be reused, + * we make sure to reset the exporting flag. + */ + spa->spa_is_exporting = B_FALSE; } - mutex_exit(&spa_namespace_lock); + mutex_exit(&spa_namespace_lock); return (0); + +fail: + spa->spa_is_exporting = B_FALSE; + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (error); } /* * Destroy a storage pool. */ int -spa_destroy(char *pool) +spa_destroy(const char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); @@ -5842,7 +6460,7 @@ spa_destroy(char *pool) * Export a storage pool. 
*/ int -spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, +spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, @@ -5854,7 +6472,7 @@ spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, * from the namespace in any way. */ int -spa_reset(char *pool) +spa_reset(const char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); @@ -5866,13 +6484,26 @@ spa_reset(char *pool) * ========================================================================== */ +/* + * This is called as a synctask to increment the draid feature flag + */ +static void +spa_draid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int draid = (int)(uintptr_t)arg; + + for (int c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); +} + /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg, id; + uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -5901,8 +6532,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) + (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); + } + + /* + * The virtual dRAID spares must be added after vdev tree is created + * and the vdev guids are generated. The guid of their associated + * dRAID is stored in the config and used when opening the spare. 
+ */ + if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, + rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) + nspares = 0; + } else { + return (spa_vdev_exit(spa, vd, txg, error)); + } /* * We must validate the spares and l2cache devices after checking the @@ -5915,7 +6561,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. + * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { @@ -5925,10 +6571,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { + /* Fail if top level vdev is raidz or a dRAID */ + if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } + /* * Need the top level mirror to be * a mirror of leaf vdevs only @@ -5947,19 +6593,9 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } for (int c = 0; c < vd->vdev_children; c++) { - - /* - * Set the vdev id to the first hole, if one exists. - */ - for (id = 0; id < rvd->vdev_children; id++) { - if (rvd->vdev_child[id]->vdev_ishole) { - vdev_free(rvd->vdev_child[id]); - break; - } - } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); - tvd->vdev_id = id; + tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } @@ -5978,6 +6614,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) spa->spa_l2cache.sav_sync = B_TRUE; } + /* + * We can't increment a feature while holding spa_vdev so we + * have to do it in a synctask. 
+ */ + if (ndraid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, + (void *)(uintptr_t)ndraid, tx); + dmu_tx_commit(tx); + } + /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we @@ -6013,12 +6662,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. + * + * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild) + * should be performed instead of traditional healing reconstruction. From + * an administrator's perspective these are both resilver operations. */ int -spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, + int rebuild) { uint64_t txg, dtl_max_txg; - ASSERTV(vdev_t *rvd = spa->spa_root_vdev); + vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; @@ -6038,6 +6692,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) return (spa_vdev_exit(spa, NULL, txg, error)); } + if (rebuild) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + if (dsl_scan_resilvering(spa_get_dsl(spa))) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RESILVER_IN_PROGRESS)); + } else { + if (vdev_rebuild_active(rvd)) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_REBUILD_IN_PROGRESS)); + } + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); @@ -6070,6 +6737,31 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa,
newrootvd, txg, ENOTSUP)); + /* + * A dRAID spare can only replace a child of its parent dRAID vdev. + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + + if (rebuild) { + /* + * For rebuilds, the top vdev must support reconstruction + * using only space maps. This means the only allowable + * vdev types are the root vdev, a mirror, or dRAID. + */ + tvd = pvd; + if (pvd->vdev_top != NULL) + tvd = pvd->vdev_top; + + if (tvd->vdev_ops != &vdev_mirror_ops && + tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_draid_ops) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + } + if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root @@ -6123,7 +6815,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid @@ -6133,17 +6825,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); - (void) sprintf(oldvd->vdev_path, "%s/%s", - newvd->vdev_path, "old"); + (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, + "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } - /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; - /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd.
@@ -6181,8 +6870,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -6199,16 +6888,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_dirty(tvd, VDD_DTL, newvd, txg); /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. We do not do this if resilvers have been - * deferred. + * Schedule the resilver or rebuild to restart in the future. We do + * this to ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. */ - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, newvd); - else - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); + } else { + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } + } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -6243,7 +6941,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; - ASSERTV(vdev_t *rvd = spa->spa_root_vdev); + vdev_t *rvd __maybe_unused = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; @@ -6251,7 +6949,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) ASSERT(spa_writeable(spa)); - txg = spa_vdev_enter(spa); + txg = 
spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -6352,14 +7050,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. + * If we are detaching the original disk from a normal spare, then it + * implies that the spare should become a real disk, and be removed + * from the active spare list for the pool. dRAID spares on the + * other hand are coupled to the pool and thus should never be removed + * from the spares list. */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; + if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { + vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + if (last_cvd->vdev_isspare && + last_cvd->vdev_ops != &vdev_draid_spare_ops) { + unspare = B_TRUE; + } + } /* * Erase the disk labels so the disk can be used for other things. 
@@ -6448,6 +7152,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); + spa_notify_waiters(spa); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); @@ -6782,7 +7487,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ - if (vd->vdev_islog || !vdev_is_concrete(vd)) { + if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && + !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; @@ -6818,6 +7524,11 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, } } + /* deal with indirect vdevs */ + if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == + &vdev_indirect_ops) + continue; + /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { @@ -6853,14 +7564,14 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, } /* we need certain info from the top level */ - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, - vml[c]->vdev_top->vdev_ms_array) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, - vml[c]->vdev_top->vdev_ms_shift) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, - vml[c]->vdev_top->vdev_asize) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, - vml[c]->vdev_top->vdev_ashift) == 0); + fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array); + fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift); + fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize); + fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); @@ -6890,28 +7601,24 @@ 
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ - VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, - glist, children) == 0); + nvl = fnvlist_alloc(); + fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); - VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, - nvl) == 0); + fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, - spa->spa_config_txg) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, - spa_generate_guid(NULL)) == 0); + fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); @@ -6945,7 +7652,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { - if (vml[c] != NULL) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); @@ -6964,6 +7671,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; + newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); @@ -6972,10 +7680,9 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { - VERIFY(nvlist_alloc(&newspa->spa_config_splitting, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, - ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + newspa->spa_config_splitting = fnvlist_alloc(); + fnvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } @@ -7005,7 +7712,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { - if (vml[c] != NULL) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; 
/* @@ -7041,6 +7748,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); + newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ @@ -7196,12 +7904,18 @@ spa_vdev_resilver_done(spa_t *spa) } spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * If a detach was not performed above replace waiters will not have + * been notified. In which case we must do so now. + */ + spa_notify_waiters(spa); } /* * Update the stored path or FRU for this vdev. */ -int +static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { @@ -7325,6 +8039,9 @@ spa_async_remove(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); + + /* Tell userspace that the vdev is gone. */ + zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) @@ -7384,12 +8101,16 @@ spa_async_thread(void *arg) old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); + old_space += metaslab_class_get_space( + spa_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); + new_space += metaslab_class_get_space( + spa_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* @@ -7399,7 +8120,8 @@ spa_async_thread(void *arg) if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", - spa_name(spa), new_space, new_space - old_space); + spa_name(spa), (u_longlong_t)new_space, + (u_longlong_t)(new_space - old_space)); } } @@ -7434,16 +8156,19 @@ spa_async_thread(void 
*arg) /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_RESILVER_DONE) + if (tasks & SPA_ASYNC_RESILVER_DONE || + tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); + } /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && + !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) - dsl_resilver_restart(dp, 0); + dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); @@ -7469,6 +8194,28 @@ spa_async_thread(void *arg) mutex_exit(&spa_namespace_lock); } + /* + * Kick off L2 cache whole device TRIM. + */ + if (tasks & SPA_ASYNC_L2CACHE_TRIM) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_trim_l2arc(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* + * Kick off L2 cache rebuilding. + */ + if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); + l2arc_spa_rebuild_start(spa); + spa_config_exit(spa, SCL_L2ARC, FTAG); + mutex_exit(&spa_namespace_lock); + } + /* * Let the world know that we're done. 
*/ @@ -7497,6 +8244,14 @@ spa_async_suspend(spa_t *spa) zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_cancel(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_cancel(ll_condense_thread); } void @@ -7515,6 +8270,14 @@ spa_async_resume(spa_t *spa) zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_resume(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_resume(ll_condense_thread); } static boolean_t @@ -7543,8 +8306,7 @@ spa_async_dispatch(spa_t *spa) mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && - spa->spa_async_thread == NULL && - rootdir != NULL) + spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); @@ -7559,30 +8321,58 @@ spa_async_request(spa_t *spa, int task) mutex_exit(&spa->spa_async_lock); } +int +spa_async_tasks(spa_t *spa) +{ + return (spa->spa_async_tasks); +} + /* * ========================================================================== * SPA syncing routines * ========================================================================== */ + static int -bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { bpobj_t *bpo = arg; - bpobj_enqueue(bpo, bp, tx); + bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } +int +bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, 
bp, B_FALSE, tx)); +} + +int +bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); +} + static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { - zio_t *zio = arg; + zio_t *pio = arg; - zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, - zio->io_flags)); + zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, + pio->io_flags)); return (0); } +static int +bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (spa_free_sync_cb(arg, bp, tx)); +} + /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. @@ -7605,9 +8395,21 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) if (spa_sync_pass(spa) != 1) return; + /* + * Note: + * If the log space map feature is active, we stop deferring + * frees to the next TXG and therefore running this function + * would be considered a no-op as spa_deferred_bpobj should + * not have any entries. + * + * That said we run this function anyway (instead of returning + * immediately) for the edge-case scenario where we just + * activated the log space map feature in this TXG but we have + * deferred frees from the previous TXG. 
+ */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, - spa_free_sync_cb, zio, tx), ==, 0); + bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } @@ -7668,16 +8470,15 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, &sav->sav_object, tx) == 0); } - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + nvroot = fnvlist_alloc(); if (sav->sav_count == 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); + fnvlist_add_nvlist_array(nvroot, config, NULL, 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); - VERIFY(nvlist_add_nvlist_array(nvroot, config, list, - sav->sav_count) == 0); + fnvlist_add_nvlist_array(nvroot, config, list, sav->sav_count); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); @@ -7839,7 +8640,8 @@ spa_sync_version(void *arg, dmu_tx_t *tx) spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); - spa_history_log_internal(spa, "set", tx, "version=%lld", version); + spa_history_log_internal(spa, "set", tx, "version=%lld", + (longlong_t)version); } /* @@ -7898,7 +8700,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* - * 'readonly' and 'cachefile' are also non-persisitent + * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; @@ -7909,15 +8711,36 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs - * so that their labels get updated. It's unnecessary - * to do this for pool creation since the vdev's - * configuration has already been dirtied. + * so that their labels get updated. We also need to + * update the cache file to keep it in sync with the + * MOS version. 
It's unnecessary to do this for pool + * creation since the vdev's configuration has already + * been dirtied. */ - if (tx->tx_txg != TXG_INITIAL) + if (tx->tx_txg != TXG_INITIAL) { vdev_config_dirty(spa->spa_root_vdev); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; + case ZPOOL_PROP_COMPATIBILITY: + strval = fnvpair_value_string(elem); + if (spa->spa_compatibility != NULL) + spa_strfree(spa->spa_compatibility); + spa->spa_compatibility = spa_strdup(strval); + /* + * Dirty the configuration on vdevs as above. + */ + if (tx->tx_txg != TXG_INITIAL) { + vdev_config_dirty(spa->spa_root_vdev); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); + break; + default: /* * Set pool property values in the poolprops mos object. @@ -7953,7 +8776,8 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, - "%s=%lld", nvpair_name(elem), intval); + "%s=%lld", nvpair_name(elem), + (longlong_t)intval); } else { ASSERT(0); /* not allowed */ } @@ -7982,9 +8806,6 @@ spa_sync_props(void *arg, dmu_tx_t *tx) case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; - case ZPOOL_PROP_DEDUPDITTO: - spa->spa_dedup_ditto = intval; - break; default: break; } @@ -8072,8 +8893,8 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) static void vdev_indirect_state_sync_verify(vdev_t *vd) { - ASSERTV(vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping); - ASSERTV(vdev_indirect_births_t *vib = vd->vdev_indirect_births); + vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; + vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); @@ -8137,25 +8958,32 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) * allocations look at 
mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ - for (int i = 0; i < spa->spa_alloc_count; i++) + for (int i = 0; i < mg->mg_allocators; i++) { ASSERT0(zfs_refcount_count( - &(mg->mg_alloc_queue_depth[i]))); + &(mg->mg_allocator[i].mga_alloc_queue_depth))); + } mg->mg_max_alloc_queue_depth = max_queue_depth; - for (int i = 0; i < spa->spa_alloc_count; i++) { - mg->mg_cur_max_alloc_queue_depth[i] = + for (int i = 0; i < mg->mg_allocators; i++) { + mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); - normal->mc_alloc_max_slots[i] = slots_per_allocator; - special->mc_alloc_max_slots[i] = slots_per_allocator; - dedup->mc_alloc_max_slots[i] = slots_per_allocator; + ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. + mca_alloc_slots)); + ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. + mca_alloc_slots)); + ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
+ mca_alloc_slots)); + normal->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; + special->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; + dedup->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; @@ -8198,7 +9026,14 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - if (pass < zfs_sync_pass_deferred_free) { + if (pass < zfs_sync_pass_deferred_free || + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + /* + * If the log space map feature is active we don't + * care about deferred frees and the deferred bpobj + * as the log space map should effectively have the + * same results (i.e. appending only to one object). + */ spa_sync_frees(spa, free_bpl, tx); } else { /* @@ -8206,7 +9041,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) * we sync the deferred frees later in pass 1. 
*/ ASSERT3U(pass, >, 1); - bplist_iterate(free_bpl, bpobj_enqueue_cb, + bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } @@ -8215,6 +9050,8 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) svr_sync(spa, tx); spa_sync_upgrades(spa, tx); + spa_flush_metaslabs(spa, tx); + vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) @@ -8276,7 +9113,7 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; - int c0 = spa_get_random(children); + int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = @@ -8341,9 +9178,9 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); + mutex_enter(&spa->spa_allocs[i].spaa_lock); + VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); + mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* @@ -8453,9 +9290,9 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); + mutex_enter(&spa->spa_allocs[i].spaa_lock); + VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); + mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* @@ -8465,6 +9302,11 @@ spa_sync(spa_t *spa, uint64_t txg) != NULL) vdev_sync_done(vd, txg); + metaslab_class_evict_old(spa->spa_normal_class, txg); + metaslab_class_evict_old(spa->spa_log_class, txg); + + spa_sync_close_syncing_log_sm(spa); + spa_update_dspace(spa); /* @@ -8650,6 +9492,308 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } +uint64_t +spa_total_metaslabs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + uint64_t m = 0; + for 
(uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + if (!vdev_is_concrete(vd)) + continue; + m += vd->vdev_ms_count; + } + return (m); +} + +/* + * Notify any waiting threads that some activity has switched from being in- + * progress to not-in-progress so that the thread can wake up and determine + * whether it is finished waiting. + */ +void +spa_notify_waiters(spa_t *spa) +{ + /* + * Acquiring spa_activities_lock here prevents the cv_broadcast from + * happening between the waiting thread's check and cv_wait. + */ + mutex_enter(&spa->spa_activities_lock); + cv_broadcast(&spa->spa_activities_cv); + mutex_exit(&spa->spa_activities_lock); +} + +/* + * Notify any waiting threads that the pool is exporting, and then block until + * they are finished using the spa_t. + */ +void +spa_wake_waiters(spa_t *spa) +{ + mutex_enter(&spa->spa_activities_lock); + spa->spa_waiters_cancel = B_TRUE; + cv_broadcast(&spa->spa_activities_cv); + while (spa->spa_waiters != 0) + cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); + spa->spa_waiters_cancel = B_FALSE; + mutex_exit(&spa->spa_activities_lock); +} + +/* Whether the vdev or any of its descendants are being initialized/trimmed. */ +static boolean_t +spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); + ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); + ASSERT(activity == ZPOOL_WAIT_INITIALIZE || + activity == ZPOOL_WAIT_TRIM); + + kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? + &vd->vdev_initialize_lock : &vd->vdev_trim_lock; + + mutex_exit(&spa->spa_activities_lock); + mutex_enter(lock); + mutex_enter(&spa->spa_activities_lock); + + boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 
+ (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : + (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); + mutex_exit(lock); + + if (in_progress) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], + activity)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * If use_guid is true, this checks whether the vdev specified by guid is + * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool + * is being initialized/trimmed. The caller must hold the config lock and + * spa_activities_lock. + */ +static int +spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, + zpool_wait_activity_t activity, boolean_t *in_progress) +{ + mutex_exit(&spa->spa_activities_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + mutex_enter(&spa->spa_activities_lock); + + vdev_t *vd; + if (use_guid) { + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (EINVAL); + } + } else { + vd = spa->spa_root_vdev; + } + + *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); + + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (0); +} + +/* + * Locking for waiting threads + * --------------------------- + * + * Waiting threads need a way to check whether a given activity is in progress, + * and then, if it is, wait for it to complete. Each activity will have some + * in-memory representation of the relevant on-disk state which can be used to + * determine whether or not the activity is in progress. The in-memory state and + * the locking used to protect it will be different for each activity, and may + * not be suitable for use with a cvar (e.g., some state is protected by the + * config lock). To allow waiting threads to wait without any races, another + * lock, spa_activities_lock, is used. 
+ * + * When the state is checked, both the activity-specific lock (if there is one) + * and spa_activities_lock are held. In some cases, the activity-specific lock + * is acquired explicitly (e.g. the config lock). In others, the locking is + * internal to some check (e.g. bpobj_is_empty). After checking, the waiting + * thread releases the activity-specific lock and, if the activity is in + * progress, then cv_waits using spa_activities_lock. + * + * The waiting thread is woken when another thread, one completing some + * activity, updates the state of the activity and then calls + * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only + * needs to hold its activity-specific lock when updating the state, and this + * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. + * + * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, + * and because it is held when the waiting thread checks the state of the + * activity, it can never be the case that the completing thread both updates + * the activity state and cv_broadcasts in between the waiting thread's check + * and cv_wait. Thus, a waiting thread can never miss a wakeup. + * + * In order to prevent deadlock, when the waiting thread does its check, in some + * cases it will temporarily drop spa_activities_lock in order to acquire the + * activity-specific lock. The order in which spa_activities_lock and the + * activity specific lock are acquired in the waiting thread is determined by + * the order in which they are acquired in the completing thread; if the + * completing thread calls spa_notify_waiters with the activity-specific lock + * held, then the waiting thread must also acquire the activity-specific lock + * first. 
+ */ + +static int +spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, + boolean_t use_tag, uint64_t tag, boolean_t *in_progress) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); + + switch (activity) { + case ZPOOL_WAIT_CKPT_DISCARD: + *in_progress = + (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && + zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == + ENOENT); + break; + case ZPOOL_WAIT_FREE: + *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && + !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || + spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || + spa_livelist_delete_check(spa)); + break; + case ZPOOL_WAIT_INITIALIZE: + case ZPOOL_WAIT_TRIM: + error = spa_vdev_activity_in_progress(spa, use_tag, tag, + activity, in_progress); + break; + case ZPOOL_WAIT_REPLACE: + mutex_exit(&spa->spa_activities_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + mutex_enter(&spa->spa_activities_lock); + + *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + break; + case ZPOOL_WAIT_REMOVE: + *in_progress = (spa->spa_removing_phys.sr_state == + DSS_SCANNING); + break; + case ZPOOL_WAIT_RESILVER: + if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + break; + fallthrough; + case ZPOOL_WAIT_SCRUB: + { + boolean_t scanning, paused, is_scrub; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + + is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); + scanning = (scn->scn_phys.scn_state == DSS_SCANNING); + paused = dsl_scan_is_paused_scrub(scn); + *in_progress = (scanning && !paused && + is_scrub == (activity == ZPOOL_WAIT_SCRUB)); + break; + } + default: + panic("unrecognized value for activity %d", activity); + } + + return (error); +} + +static int +spa_wait_common(const char *pool, zpool_wait_activity_t activity, + boolean_t use_tag, uint64_t tag, boolean_t *waited) +{ 
+ /* + * The tag is used to distinguish between instances of an activity. + * 'initialize' and 'trim' are the only activities that we use this for. + * The other activities can only have a single instance in progress in a + * pool at one time, making the tag unnecessary. + * + * There can be multiple devices being replaced at once, but since they + * all finish once resilvering finishes, we don't bother keeping track + * of them individually, we just wait for them all to finish. + */ + if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && + activity != ZPOOL_WAIT_TRIM) + return (EINVAL); + + if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) + return (EINVAL); + + spa_t *spa; + int error = spa_open(pool, &spa, FTAG); + if (error != 0) + return (error); + + /* + * Increment the spa's waiter count so that we can call spa_close and + * still ensure that the spa_t doesn't get freed before this thread is + * finished with it when the pool is exported. We want to call spa_close + * before we start waiting because otherwise the additional ref would + * prevent the pool from being exported or destroyed throughout the + * potentially long wait. 
+ */ + mutex_enter(&spa->spa_activities_lock); + spa->spa_waiters++; + spa_close(spa, FTAG); + + *waited = B_FALSE; + for (;;) { + boolean_t in_progress; + error = spa_activity_in_progress(spa, activity, use_tag, tag, + &in_progress); + + if (error || !in_progress || spa->spa_waiters_cancel) + break; + + *waited = B_TRUE; + + if (cv_wait_sig(&spa->spa_activities_cv, + &spa->spa_activities_lock) == 0) { + error = EINTR; + break; + } + } + + spa->spa_waiters--; + cv_signal(&spa->spa_waiters_cv); + mutex_exit(&spa->spa_activities_lock); + + return (error); +} + +/* + * Wait for a particular instance of the specified activity to complete, where + * the instance is identified by 'tag' + */ +int +spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, + boolean_t *waited) +{ + return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); +} + +/* + * Wait for all instances of the specified activity to complete + */ +int +spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) +{ + + return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); +} + sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { @@ -8690,7 +9834,6 @@ spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } -#if defined(_KERNEL) /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); @@ -8709,7 +9852,7 @@ EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); -/* device maniion */ +/* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); @@ -8745,35 +9888,44 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); -#endif - -#if defined(_KERNEL) -module_param(spa_load_verify_maxinflight, int, 0644);
-MODULE_PARM_DESC(spa_load_verify_maxinflight, - "Max concurrent traversal I/Os while verifying pool during import -X"); - -module_param(spa_load_verify_metadata, int, 0644); -MODULE_PARM_DESC(spa_load_verify_metadata, - "Set to traverse metadata on pool import"); - -module_param(spa_load_verify_data, int, 0644); -MODULE_PARM_DESC(spa_load_verify_data, - "Set to traverse data on pool import"); - -module_param(spa_load_print_vdev_tree, int, 0644); -MODULE_PARM_DESC(spa_load_print_vdev_tree, - "Print vdev tree to zfs_dbgmsg during pool import"); - -/* CSTYLED */ -module_param(zio_taskq_batch_pct, uint, 0444); -MODULE_PARM_DESC(zio_taskq_batch_pct, - "Percentage of CPUs to run an IO worker thread"); /* BEGIN CSTYLED */ -module_param(zfs_max_missing_tvds, ulong, 0644); -MODULE_PARM_DESC(zfs_max_missing_tvds, - "Allow importing pool with up to this number of missing top-level vdevs" - " (in read-only mode)"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, + "log2 fraction of arc that can be used by inflight I/Os when " + "verifying pool during import"); -#endif +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, + "Set to traverse metadata on pool import"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, + "Set to traverse data on pool import"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, + "Print vdev tree to zfs_dbgmsg during pool import"); + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, + "Percentage of CPUs to run an IO worker thread"); + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, + "Number of threads per IO worker taskqueue"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, + "Allow importing pool with up to this number of missing top-level " + "vdevs (in read-only mode)"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, + "Set the livelist condense zthr to 
pause"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, + "Set the livelist condense synctask to pause"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, + "Whether livelist condensing was canceled in the synctask"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, + "Whether livelist condensing was canceled in the zthr function"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, + "Whether extra ALLOC blkptrs were added to a livelist entry while it " + "was being condensed"); +/* END CSTYLED */ diff --git a/module/zfs/spa_boot.c b/module/zfs/spa_boot.c index be79542c90..674394650f 100644 --- a/module/zfs/spa_boot.c +++ b/module/zfs/spa_boot.c @@ -27,7 +27,7 @@ #ifdef _KERNEL #include -#include +#include #include char * diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index d6f68ceda5..09f6299685 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -102,7 +102,7 @@ * Once the synctask is done and the discarding zthr is awake, we discard * the checkpointed data over multiple TXGs by having the zthr prefetching * entries from vdev_checkpoint_sm and then starting a synctask that places - * them as free blocks in to their respective ms_allocatable and ms_sm + * them as free blocks into their respective ms_allocatable and ms_sm * structures. 
* [see spa_checkpoint_discard_thread()] * @@ -191,6 +191,7 @@ spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) spa->spa_checkpoint_info.sci_timestamp = 0; spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + spa_notify_waiters(spa); spa_history_log_internal(spa, "spa discard checkpoint", tx, "finished discarding checkpointed state from the pool"); @@ -211,7 +212,7 @@ spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) uint64_t end = sme->sme_offset + sme->sme_run; if (sdc->sdc_entry_limit == 0) - return (EINTR); + return (SET_ERROR(EINTR)); /* * Since the space map is not condensed, we know that @@ -336,17 +337,18 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) spa_checkpoint_accounting_verify(vd->vdev_spa); #endif - zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " + zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, " "deleted %llu words - %llu words are left", - tx->tx_txg, vd->vdev_id, (words_before - words_after), - words_after); + (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id, + (u_longlong_t)(words_before - words_after), + (u_longlong_t)words_after); if (error != EINTR) { if (error != 0) { - zfs_panic_recover("zfs: error %d was returned " + zfs_panic_recover("zfs: error %lld was returned " "while incrementally destroying the checkpoint " - "space map of vdev %llu\n", - error, vd->vdev_id); + "space map of vdev %u\n", + (longlong_t)error, vd->vdev_id); } ASSERT0(words_after); ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); @@ -524,7 +526,7 @@ spa_checkpoint_sync(void *arg, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); spa_history_log_internal(spa, "spa checkpoint", tx, - "checkpointed uberblock txg=%llu", checkpoint.ub_txg); + "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg); } /* @@ -624,15 +626,12 @@ spa_checkpoint_discard(const char *pool) ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); } -#if defined(_KERNEL) 
EXPORT_SYMBOL(spa_checkpoint_get_stats); EXPORT_SYMBOL(spa_checkpoint_discard_thread); EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); /* BEGIN CSTYLED */ -module_param(zfs_spa_discard_memory_limit, ulong, 0644); -MODULE_PARM_DESC(zfs_spa_discard_memory_limit, - "Maximum memory for prefetching checkpoint space " - "map per top-level vdev while discarding checkpoint"); +ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW, + "Limit for memory used in prefetching the checkpoint space map done " + "on each vdev while discarding the checkpoint"); /* END CSTYLED */ -#endif diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 8616abda37..ad82932ce5 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -22,23 +22,25 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. + * Copyright (c) 2021, Colm Buckley */ #include +#include #include #include #include -#include #include #include #include #include #include #include +#include +#include #ifdef _KERNEL -#include #include #endif @@ -80,8 +82,10 @@ spa_config_load(void) nvlist_t *nvlist, *child; nvpair_t *nvpair; char *pathname; - struct _buf *file; + zfs_file_t *fp; + zfs_file_attr_t zfa; uint64_t fsize; + int err; #ifdef _KERNEL if (zfs_autoimport_disable) @@ -93,25 +97,29 @@ spa_config_load(void) */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) snprintf(pathname, MAXPATHLEN, "%s%s", - (rootdir != NULL) ? 
"./" : "", spa_config_path); + (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); - file = kobj_open_file(pathname); + err = zfs_file_open(pathname, O_RDONLY, 0, &fp); +#ifdef __FreeBSD__ + if (err) + err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); +#endif kmem_free(pathname, MAXPATHLEN); - if (file == (struct _buf *)-1) + if (err) return; - if (kobj_get_filesize(file, &fsize) != 0) + if (zfs_file_getattr(fp, &zfa)) goto out; + fsize = zfa.zfa_size; buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. */ - if (kobj_read_file(file, buf, fsize, 0) < 0) + if (zfs_file_read(fp, buf, fsize, NULL) < 0) goto out; /* @@ -144,27 +152,32 @@ out: if (buf != NULL) kmem_free(buf, fsize); - kobj_close_file(file); + zfs_file_close(fp); } static int spa_config_remove(spa_config_dirent_t *dp) { -#if defined(__linux__) && defined(_KERNEL) - int error, flags = FWRITE | FTRUNC; - uio_seg_t seg = UIO_SYSSPACE; - vnode_t *vp; + int error = 0; - error = vn_open(dp->scd_path, seg, flags, 0644, &vp, 0, 0); - if (error == 0) { - (void) VOP_FSYNC(vp, FSYNC, kcred, NULL); - (void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL); + /* + * Remove the cache file. If zfs_file_unlink() in not supported by the + * platform fallback to truncating the file which is functionally + * equivalent. 
+ */ + error = zfs_file_unlink(dp->scd_path); + if (error == EOPNOTSUPP) { + int flags = O_RDWR | O_TRUNC; + zfs_file_t *fp; + + error = zfs_file_open(dp->scd_path, flags, 0644, &fp); + if (error == 0) { + (void) zfs_file_fsync(fp, O_SYNC); + (void) zfs_file_close(fp); + } } return (error); -#else - return (vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE)); -#endif } static int @@ -172,10 +185,10 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; - vnode_t *vp; - int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; + int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE; char *temp; int err; + zfs_file_t *fp; /* * If the nvlist is empty (NULL), then remove the old cachefile. @@ -194,46 +207,22 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); -#if defined(__linux__) && defined(_KERNEL) /* * Write the configuration to disk. Due to the complexity involved * in performing a rename and remove from within the kernel the file * is instead truncated and overwritten in place. This way we always * have a consistent view of the data or a zero length file. */ - err = vn_open(dp->scd_path, UIO_SYSSPACE, oflags, 0644, &vp, 0, 0); + err = zfs_file_open(dp->scd_path, oflags, 0644, &fp); if (err == 0) { - err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, - UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, NULL); + err = zfs_file_write(fp, buf, buflen, NULL); if (err == 0) - err = VOP_FSYNC(vp, FSYNC, kcred, NULL); + err = zfs_file_fsync(fp, O_SYNC); - (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); + zfs_file_close(fp); if (err) (void) spa_config_remove(dp); } -#else - /* - * Write the configuration to disk. We need to do the traditional - * 'write to temporary file, sync, move over original' to make sure we - * always have a consistent view of the data. 
- */ - (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); - - err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); - if (err == 0) { - err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL); - if (err == 0) - err = VOP_FSYNC(vp, FSYNC, kcred, NULL); - if (err == 0) - err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); - (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); - } - - (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); -#endif - fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); return (err); @@ -259,7 +248,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (rootdir == NULL || !(spa_mode_global & FWRITE)) + if (!(spa_mode_global & SPA_MODE_WRITE)) return; /* @@ -326,8 +315,9 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { - zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, NULL, 0, 0); + (void) zfs_ereport_post( + FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, + target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); @@ -457,8 +447,11 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); + if (spa->spa_compatibility != NULL) + fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY, + spa->spa_compatibility); - hostid = spa_get_hostid(); + hostid = spa_get_hostid(spa); if (hostid != 0) fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); @@ -612,17 +605,19 @@ spa_config_update(spa_t *spa, int what) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } -#if defined(_KERNEL) EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); 
EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); -module_param(spa_config_path, charp, 0444); -MODULE_PARM_DESC(spa_config_path, "SPA config file (/etc/zfs/zpool.cache)"); - -module_param(zfs_autoimport_disable, int, 0644); -MODULE_PARM_DESC(zfs_autoimport_disable, "Disable pool import at module load"); - +/* BEGIN CSTYLED */ +#ifdef __linux__ +/* string sysctls require a char array on FreeBSD */ +ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, + "SPA config file (/etc/zfs/zpool.cache)"); #endif + +ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, + "Disable pool import at module load"); +/* END CSTYLED */ diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index e42f8a0212..fa5120eb61 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -31,7 +31,7 @@ * and the current log. All errors seen are logged to the current log. When a * scrub completes, the current log becomes the last log, the last log is thrown * out, and the current log is reinitialized. This way, if an error is somehow - * corrected, a new scrub will show that that it no longer exists, and will be + * corrected, a new scrub will show that it no longer exists, and will be * deleted from the log when the scrub completes. * * The log is stored using a ZAP object whose key is a string form of the diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index b590a1d57b..dae06e46c3 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. */ @@ -63,7 +63,7 @@ * overwrite the original creation of the pool. 'sh_phys_max_off' is the * physical ending offset in bytes of the log. 
This tells you the length of * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record - * is added, 'sh_eof' is incremented by the the size of the record. + * is added, 'sh_eof' is incremented by the size of the record. * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). * This is where the consumer should start reading from after reading in * the 'zpool create' portion of the log. @@ -180,16 +180,6 @@ spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, return (0); } -static char * -spa_history_zone(void) -{ -#ifdef _KERNEL - return ("linux"); -#else - return (NULL); -#endif -} - /* * Post a history sysevent. * @@ -298,7 +288,6 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) } #endif - fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname()->nodename); if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { @@ -307,14 +296,17 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { zfs_dbgmsg("txg %lld %s %s (id %llu) %s", - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + (longlong_t)fnvlist_lookup_uint64(nvl, + ZPOOL_HIST_TXG), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), + (u_longlong_t)fnvlist_lookup_uint64(nvl, + ZPOOL_HIST_DSID), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); } else { zfs_dbgmsg("txg %lld %s %s", - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + (longlong_t)fnvlist_lookup_uint64(nvl, + ZPOOL_HIST_TXG), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); } @@ -331,7 +323,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) * posted as a result of the ZPOOL_HIST_CMD key being present * it would result in only one sysevent being posted with the * full command line arguments, requiring the consumer to know - * how 
to parse and understand zfs(1M) command invocations. + * how to parse and understand zfs(8) command invocations. */ spa_history_log_notify(spa, nvl); } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { @@ -406,14 +398,18 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) } fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); + /* + * Since the history is recorded asynchronously, the effective time is + * now, which may be considerably before the change is made on disk. + */ + fnvlist_add_uint64(nvarg, ZPOOL_HIST_TIME, gethrestime_sec()); + /* Kick this off asynchronously; errors are ignored. */ - dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, - nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx); dmu_tx_commit(tx); /* spa_history_log_sync will free nvl */ return (err); - } /* @@ -534,16 +530,17 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, msg = kmem_vasprintf(fmt, adx); fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); - strfree(msg); + kmem_strfree(msg); fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); + fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); if (dmu_tx_is_syncing(tx)) { spa_history_log_sync(nvl, tx); } else { dsl_sync_task_nowait(spa_get_dsl(spa), - spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); + spa_history_log_sync, nvl, tx); } /* spa_history_log_sync() will free nvl */ } @@ -623,6 +620,14 @@ spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx) u->nodename, u->release, u->version, u->machine); } +#ifndef _KERNEL +const char * +spa_history_zone(void) +{ + return (NULL); +} +#endif + #if defined(_KERNEL) EXPORT_SYMBOL(spa_history_create_obj); EXPORT_SYMBOL(spa_history_get); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c new file mode 100644 index 0000000000..6fd302b8df --- /dev/null +++ b/module/zfs/spa_log_spacemap.c @@ 
-0,0 +1,1322 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Log Space Maps + * + * Log space maps are an optimization in ZFS metadata allocations for pools + * whose workloads are primarily random-writes. Random-write workloads are also + * typically random-free, meaning that they are freeing from locations scattered + * throughout the pool. This means that each TXG we will have to append some + * FREE records to almost every metaslab. With log space maps, we hold their + * changes in memory and log them altogether in one pool-wide space map on-disk + * for persistence. As more blocks are accumulated in the log space maps and + * more unflushed changes are accounted in memory, we flush a selected group + * of metaslabs every TXG to relieve memory pressure and potential overheads + * when loading the pool. Flushing a metaslab to disk relieves memory as we + * flush any unflushed changes from memory to disk (i.e. 
the metaslab's space + * map) and saves import time by making old log space maps obsolete and + * eventually destroying them. [A log space map is said to be obsolete when all + * its entries have made it to their corresponding metaslab space maps]. + * + * == On disk data structures used == + * + * - The pool has a new feature flag and a new entry in the MOS. The feature + * is activated when we create the first log space map and remains active + * for the lifetime of the pool. The new entry in the MOS Directory [refer + * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value + * pairs are of the form (key: txg, value: log space map object for that txg). + * This entry is our on-disk reference of the log space maps that exist in + * the pool for each TXG and it is used during import to load all the + * metaslab unflushed changes in memory. To see how this structure is first + * created and later populated refer to spa_generate_syncing_log_sm(). To see + * how it is used during import time refer to spa_ld_log_sm_metadata(). + * + * - Each vdev has a new entry in its vdev_top_zap (see field + * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of + * each metaslab in this vdev. This field is the on-disk counterpart of the + * in-memory field ms_unflushed_txg which tells us from which TXG and onwards + * the metaslab hasn't had its changes flushed. During import, we use this + * to ignore any entries in the space map log that are for this metaslab but + * from a TXG before msp_unflushed_txg. At that point, we also populate its + * in-memory counterpart and from there both fields are updated every time + * we flush that metaslab. + * + * - A space map is created every TXG and, during that TXG, it is used to log + * all incoming changes (the log space map). When created, the log space map + * is referenced in memory by spa_syncing_log_sm and its object ID is inserted + * to the space map ZAP mentioned above.
The log space map is closed at the + * end of the TXG and will be destroyed when it becomes fully obsolete. We + * know when a log space map has become obsolete by looking at the oldest + * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger + * than the log space map's TXG, then it means that there is no metaslab that + * doesn't have the changes from that log and we can therefore destroy it. + * [see spa_cleanup_old_sm_logs()]. + * + * == Important in-memory structures == + * + * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in + * the pool by their ms_unflushed_txg field. It is primarily used for three + * reasons. First of all, it is used during flushing where we try to flush + * metaslabs in-order from the oldest-flushed to the most recently flushed + * every TXG. Secondly, it helps us to look up the ms_unflushed_txg of the + * oldest flushed metaslab to distinguish which log space maps have become + * obsolete and which ones are still relevant. Finally it tells us which + * metaslabs have unflushed changes in a pool where this feature was just + * enabled, as we don't immediately add all of the pool's metaslabs but we + * add them over time as they go through metaslab_sync(). The reason that + * we do that is to ease these pools into the behavior of the flushing + * algorithm (described later on). + * + * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory + * counterpart of the space map ZAP mentioned above. It's an AVL tree whose + * nodes represent the log space maps in the pool. This in-memory + * representation of log space maps in the pool sorts the log space maps by + * the TXG in which they were created (which is also the TXG of their unflushed + * changes). It also contains the following extra information for each + * space map: + * [1] The number of metaslabs that were last flushed on that TXG.
This is + * important because if that counter is zero and this is the oldest + * log then it means that it is also obsolete. + * [2] The number of blocks of that space map. This field is used by the + * block heuristic of our flushing algorithm (described later on). + * It represents how many blocks of metadata changes ZFS had to write + * to disk for that TXG. + * + * - The per-spa field spa_log_summary is a list of entries that summarizes + * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg + * AVL tree mentioned above. The reason this exists is that our flushing + * algorithm (described later) tries to estimate how many metaslabs to flush + * in each TXG by iterating over all the log space maps and looking at their + * block counts. Summarizing that information means that we don't have to + * iterate through each space map, minimizing the runtime overhead of the + * flushing algorithm which would be induced in syncing context. In terms of + * implementation the log summary is used as a queue: + * * we modify or pop entries from its head when we flush metaslabs + * * we modify or append entries to its tail when we sync changes. + * + * - Each metaslab has two new range trees that hold its unflushed changes, + * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint. + * + * == Flushing algorithm == + * + * The decision of how many metaslabs to flush on a given TXG is guided by + * two heuristics: + * + * [1] The memory heuristic - + * We keep track of the memory used by the unflushed trees from all the + * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it + * stays below a certain threshold which is determined by an arbitrary hard + * limit and an arbitrary percentage of the system's memory [see + * spa_log_exceeds_memlimit()].
When we see that the memory usage of the + * unflushed changes is passing that threshold, we flush metaslabs, which + * empties their unflushed range trees, reducing the memory used. + * + * [2] The block heuristic - + * We try to keep the total number of blocks in the log space maps in check + * so the log doesn't grow indefinitely and we don't induce a lot of overhead + * when loading the pool. At the same time we don't want to flush a lot of + * metaslabs too often as this would defeat the purpose of the log space map. + * As a result we set a limit on the number of blocks that we think is + * acceptable for the log space maps to have and try not to cross it. + * [see sus_blocklimit from spa_unflushed_stats]. + * + * In order to stay below the block limit every TXG we have to estimate how + * many metaslabs we need to flush based on the current rate of incoming blocks + * and our history of log space map blocks. The main idea here is to answer + * the question of how many metaslabs we need to flush in order to get rid + * of at least an X amount of log space map blocks. We can answer this question + * by iterating backwards from the oldest log space map to the newest one + * and looking at their metaslab and block counts. At this point the log summary + * mentioned above comes in handy as it reduces the amount of things that we + * have to iterate over (even though it may reduce the precision of our + * estimates due to its aggregation of data). So with that in mind, we project + * the incoming rate of the current TXG into the future and attempt to + * approximate how many metaslabs we would need to flush from now in order to + * avoid exceeding our block limit at different points in the future (granted + * that we would keep flushing the same number of metaslabs for every TXG). + * Then we take the maximum number from all these estimates to be on the safe + * side. For the exact implementation details of the algorithm refer to + * spa_estimate_metaslabs_to_flush(). 
+ */ + +/* + * This is used as the block size for the space maps used for the + * log space map feature. These space maps benefit from a bigger + * block size as we expect to be writing a lot of data to them at + * once. + */ +unsigned long zfs_log_sm_blksz = 1ULL << 17; + +/* + * Percentage of the overall system's memory that ZFS allows to be + * used for unflushed changes (e.g. the sum of the sizes of all the nodes + * in the unflushed trees). + * + * Note that this value is calculated over 1000000 for finer granularity + * (thus the _ppm suffix; reads as "parts per million"). As an example, + * the default of 1000 allows 0.1% of memory to be used. + */ +unsigned long zfs_unflushed_max_mem_ppm = 1000; + +/* + * Specific hard-limit in memory that ZFS allows to be used for + * unflushed changes. + */ +unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; + +/* + * The following tunable determines the number of blocks that can be used for + * the log space maps. It is expressed as a percentage of the total number of + * metaslabs in the pool (i.e. the default of 400 means that the number of log + * blocks is capped at 4 times the number of metaslabs). + * + * This value exists to tune our flushing algorithm, with higher values + * flushing metaslabs less often (doing fewer I/Os) per TXG versus lower values + * flushing metaslabs more aggressively with the upside of saving overheads + * when loading the pool. Another factor in this tradeoff is that flushing + * less often can potentially lead to better utilization of the metaslab space + * map's block size as we accumulate more changes per flush. + * + * This tunable indirectly controls the flush rate (metaslabs flushed per + * txg), which is why making it a percentage in terms of the number of + * metaslabs in the pool makes sense here. 
+ * + * As a rule of thumb we default this tunable to 400% based on the following: + * + * 1] Assuming a constant flush rate and a constant incoming rate of log blocks + * it is reasonable to expect that the amount of obsolete entries changes + * linearly from txg to txg (e.g. the oldest log should have the most + * obsolete entries, and the most recent one the least). With this we could + * say that, at any given time, about half of the entries in the whole space + * map log are obsolete. Thus for every two entries for a metaslab in the + * log space map, only one of them is valid and actually makes it to the + * metaslab's space map. + * [factor of 2] + * 2] Each entry in the log space map is guaranteed to be two words while + * entries in metaslab space maps are generally single-word. + * [an extra factor of 2 - 400% overall] + * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into + * account any consolidation of segments from the log space map to the + * unflushed range trees nor their history (e.g. a segment being allocated, + * then freed, then allocated again means 3 log space map entries but 0 + * metaslab space map entries). Depending on the workload, we've seen ~1.8 + * non-obsolete log space map entries per metaslab entry, for a total of + * ~600%. Since most of these estimates though are workload dependent, we + * default on 400% to be conservative. + * + * Thus we could say that even in the worst + * case of [1] and [2], the factor should end up being 4. + * + * That said, regardless of the number of metaslabs in the pool we need to + * provide upper and lower bounds for the log block limit. + * [see zfs_unflushed_log_block_{min,max}] + */ +unsigned long zfs_unflushed_log_block_pct = 400; + +/* + * If the number of metaslabs is small and our incoming rate is high, we could + * get into a situation that we are flushing all our metaslabs every TXG. Thus + * we always allow at least this many log blocks. 
+ */ +unsigned long zfs_unflushed_log_block_min = 1000; + +/* + * If the log becomes too big, the import time of the pool can take a hit in + * terms of performance. Thus we have a hard limit in the size of the log in + * terms of blocks. + */ +unsigned long zfs_unflushed_log_block_max = (1ULL << 18); + +/* + * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and + * stability of the flushing algorithm (longer summary) vs its runtime overhead + * (smaller summary is faster to traverse). + */ +unsigned long zfs_max_logsm_summary_length = 10; + +/* + * Tunable that sets the lower bound on the metaslabs to flush every TXG. + * + * Setting this to 0 has no effect since if the pool is idle we won't even be + * creating log space maps and therefore we won't be flushing. On the other + * hand if the pool has any incoming workload our block heuristic will start + * flushing metaslabs anyway. + * + * The point of this tunable is to be used in extreme cases where we really + * want to flush more metaslabs than our adaptable heuristic plans to flush. + */ +unsigned long zfs_min_metaslabs_to_flush = 1; + +/* + * Tunable that specifies how far in the past do we want to look when trying to + * estimate the incoming log blocks for the current TXG. + * + * Setting this too high may not only increase runtime but also minimize the + * effect of the incoming rates from the most recent TXGs as we take the + * average over all the blocks that we walk + * [see spa_estimate_incoming_log_blocks]. + */ +unsigned long zfs_max_log_walking = 5; + +/* + * This tunable exists solely for testing purposes. It ensures that the log + * spacemaps are not flushed and destroyed during export in order for the + * relevant log spacemap import code paths to be tested (effectively simulating + * a crash). 
+ */ +int zfs_keep_log_spacemaps_at_export = 0; + +static uint64_t +spa_estimate_incoming_log_blocks(spa_t *spa) +{ + ASSERT3U(spa_sync_pass(spa), ==, 1); + uint64_t steps = 0, sum = 0; + for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); + sls != NULL && steps < zfs_max_log_walking; + sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) { + if (sls->sls_txg == spa_syncing_txg(spa)) { + /* + * skip the log created in this TXG as this would + * make our estimations inaccurate. + */ + continue; + } + sum += sls->sls_nblocks; + steps++; + } + return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0); +} + +uint64_t +spa_log_sm_blocklimit(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_blocklimit); +} + +void +spa_log_sm_set_blocklimit(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT0(spa_log_sm_blocklimit(spa)); + return; + } + + uint64_t calculated_limit = + (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); +} + +uint64_t +spa_log_sm_nblocks(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_nblocks); +} + +/* + * Ensure that the in-memory log space map structures and the summary + * have the same block and metaslab counts. 
+ */ +static void +spa_log_summary_verify_counts(spa_t *spa) +{ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0) + return; + + uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed); + + uint64_t ms_in_summary = 0, blk_in_summary = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) { + ms_in_summary += e->lse_mscount; + blk_in_summary += e->lse_blkcount; + } + + uint64_t ms_in_logs = 0, blk_in_logs = 0; + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + ms_in_logs += sls->sls_mscount; + blk_in_logs += sls->sls_nblocks; + } + + VERIFY3U(ms_in_logs, ==, ms_in_summary); + VERIFY3U(ms_in_logs, ==, ms_in_avl); + VERIFY3U(blk_in_logs, ==, blk_in_summary); + VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa)); +} + +static boolean_t +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +{ + uint64_t blocks_per_row = MAX(1, + DIV_ROUND_UP(spa_log_sm_blocklimit(spa), + zfs_max_logsm_summary_length)); + return (blocks_per_row <= e->lse_blkcount); +} + +/* + * Update the log summary information to reflect the fact that a metaslab + * was flushed or destroyed (e.g due to device removal or pool export/destroy). + * + * We typically flush the oldest flushed metaslab so the first (and oldest) + * entry of the summary is updated. However if that metaslab is getting loaded + * we may flush the second oldest one which may be part of an entry later in + * the summary. Moreover, if we call into this function from metaslab_fini() + * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask + * for a txg as an argument so we can locate the appropriate summary entry for + * the metaslab. 
+ */ +void +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +{ + /* + * We don't track summary data for read-only pools and this function + * can be called from metaslab_fini(). In that case return immediately. + */ + if (!spa_writeable(spa)) + return; + + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + + if (target == NULL || target->lse_mscount == 0) { + /* + * We didn't find a summary entry for this metaslab. We must be + * at the teardown of a spa_load() attempt that got an error + * while reading the log space maps. + */ + VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); + return; + } + + target->lse_mscount--; +} + +/* + * Update the log summary information to reflect the fact that we destroyed + * old log space maps. Since we can only destroy the oldest log space maps, + * we decrement the block count of the oldest summary entry and potentially + * destroy it when that count hits 0. + * + * This function is called after a metaslab is flushed and typically that + * metaslab is the oldest flushed, which means that this function will + * typically decrement the block count of the first entry of the summary and + * potentially free it if the block count gets to zero (its metaslab count + * should be zero too at that point). + * + * There are certain scenarios though that don't work exactly like that so we + * need to account for them: + * + * Scenario [1]: It is possible that after we flushed the oldest flushed + * metaslab and we destroyed the oldest log space map, more recent logs had 0 + * metaslabs pointing to them so we got rid of them too. This can happen due + * to metaslabs being destroyed through device removal, or because the oldest + * flushed metaslab was loading but we kept flushing more recently flushed + * metaslabs due to the memory pressure of unflushed changes. 
Because of that, + * we always iterate from the beginning of the summary and if blocks_gone is + * at least as big as the block count of the current entry we free that entry + * (we expect its metaslab count to be zero), we decrement blocks_gone and + * move on to the next entry, repeating this procedure until blocks_gone gets + * decremented to 0. Doing this also works for the typical case mentioned + * above. + * + * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted for by + * the first (and oldest) entry in the summary. If the first few entries of + * the summary were only accounting for metaslabs from a device that was just + * removed, then the current oldest flushed metaslab could be accounted for by + * an entry somewhere in the middle of the summary. Moreover flushing that + * metaslab will destroy all the log space maps older than its ms_unflushed_txg + * because they became obsolete after the removal. Thus, iterating as we did + * for scenario [1] works out for this case too. + * + * Scenario [3]: At times we decide to flush all the metaslabs in the pool + * in one TXG (either because we are exporting the pool or because our flushing + * heuristics decided to do so). When that happens all the log space maps get + * destroyed except the one created for the current TXG which doesn't have + * any log blocks yet. As log space maps get destroyed with every metaslab that + * we flush, entries in the summary are also destroyed. This brings a weird + * corner-case when we flush the last metaslab and the log space map of the + * current TXG is in the same summary entry with other log space maps that + * are older. When that happens we are eventually left with this one last + * summary entry whose blocks are gone (blocks_gone equals the entry's block + * count) but its metaslab count is non-zero (because it accounts for all the + * metaslabs in the pool as they all got flushed). 
Under this scenario we can't + * free this last summary entry as it's referencing all the metaslabs in the + * pool and its block count will get incremented at the end of this sync (when + * we close the syncing log space map). Thus we just decrement its current + * block count and leave it alone. In the case that the pool gets exported, + * its metaslab count will be decremented over time as we call metaslab_fini() + * for all the metaslabs in the pool and the entry will be freed at + * spa_unload_log_sm_metadata(). + */ +void +spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) +{ + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { + if (e->lse_blkcount > blocks_gone) { + /* + * Assert that we stopped at an entry that is not + * obsolete. + */ + ASSERT(e->lse_mscount != 0); + + e->lse_blkcount -= blocks_gone; + blocks_gone = 0; + break; + } else if (e->lse_mscount == 0) { + /* remove obsolete entry */ + blocks_gone -= e->lse_blkcount; + list_remove(&spa->spa_log_summary, e); + kmem_free(e, sizeof (log_summary_entry_t)); + } else { + /* Verify that this is scenario [3] mentioned above. */ + VERIFY3U(blocks_gone, ==, e->lse_blkcount); + + /* + * Assert that this is scenario [3] further by ensuring + * that this is the only entry in the summary. + */ + VERIFY3P(e, ==, list_tail(&spa->spa_log_summary)); + ASSERT3P(e, ==, list_head(&spa->spa_log_summary)); + + blocks_gone = e->lse_blkcount = 0; + break; + } + } + + /* + * Ensure that there is no way we are trying to remove more blocks + * than the # of blocks in the summary. + */ + ASSERT0(blocks_gone); +} + +void +spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg) +{ + spa_log_sm_t target = { .sls_txg = txg }; + spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, + &target, NULL); + + if (sls == NULL) { + /* + * We must be at the teardown of a spa_load() attempt that + * got an error while reading the log space maps. 
+ */ + VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); + return; + } + + ASSERT(sls->sls_mscount > 0); + sls->sls_mscount--; +} + +void +spa_log_sm_increment_current_mscount(spa_t *spa) +{ + spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg); + ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa)); + last_sls->sls_mscount++; +} + +static void +summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, + uint64_t nblocks) +{ + log_summary_entry_t *e = list_tail(&spa->spa_log_summary); + + if (e == NULL || summary_entry_is_full(spa, e)) { + e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); + e->lse_start = txg; + list_insert_tail(&spa->spa_log_summary, e); + } + + ASSERT3U(e->lse_start, <=, txg); + e->lse_mscount += metaslabs_flushed; + e->lse_blkcount += nblocks; +} + +static void +spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) +{ + summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); +} + +void +spa_log_summary_add_flushed_metaslab(spa_t *spa) +{ + summary_add_data(spa, spa_syncing_txg(spa), 1, 0); +} + +/* + * This function attempts to estimate how many metaslabs should + * we flush to satisfy our block heuristic for the log spacemap + * for the upcoming TXGs. + * + * Specifically, it first tries to estimate the number of incoming + * blocks in this TXG. Then by projecting that incoming rate to + * future TXGs and using the log summary, it figures out how many + * flushes we would need to do for future TXGs individually to + * stay below our block limit and returns the maximum number of + * flushes from those estimates. + */ +static uint64_t +spa_estimate_metaslabs_to_flush(spa_t *spa) +{ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_log_sm_blocklimit(spa) != 0); + + /* + * This variable contains the incoming rate that will be projected + * and used for our flushing estimates in the future. 
+ */ + uint64_t incoming = spa_estimate_incoming_log_blocks(spa); + + /* + * At any point in time this variable tells us how many + * TXGs in the future we are so we can make our estimations. + */ + uint64_t txgs_in_future = 1; + + /* + * This variable tells us how much room do we have until we hit + * our limit. When it goes negative, it means that we've exceeded + * our limit and we need to flush. + * + * Note that since we start at the first TXG in the future (i.e. + * txgs_in_future starts from 1) we already decrement this + * variable by the incoming rate. + */ + int64_t available_blocks = + spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; + + /* + * This variable tells us the total number of flushes needed to + * keep the log size within the limit when we reach txgs_in_future. + */ + uint64_t total_flushes = 0; + + /* Holds the current maximum of our estimates so far. */ + uint64_t max_flushes_pertxg = + MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), + zfs_min_metaslabs_to_flush); + + /* + * For our estimations we only look as far in the future + * as the summary allows us. + */ + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) { + + /* + * If there is still room before we exceed our limit + * then keep skipping TXGs accumulating more blocks + * based on the incoming rate until we exceed it. + */ + if (available_blocks >= 0) { + uint64_t skip_txgs = (available_blocks / incoming) + 1; + available_blocks -= (skip_txgs * incoming); + txgs_in_future += skip_txgs; + ASSERT3S(available_blocks, >=, -incoming); + } + + /* + * At this point we're far enough into the future where + * the limit was just exceeded and we flush metaslabs + * based on the current entry in the summary, updating + * our available_blocks. 
+ */ + ASSERT3S(available_blocks, <, 0); + available_blocks += e->lse_blkcount; + total_flushes += e->lse_mscount; + + /* + * Keep the running maximum of the total_flushes that + * we've done so far over the number of TXGs in the + * future that we are. The idea here is to estimate + * the average number of flushes that we should do + * every TXG so that when we are that many TXGs in the + * future we stay under the limit. + */ + max_flushes_pertxg = MAX(max_flushes_pertxg, + DIV_ROUND_UP(total_flushes, txgs_in_future)); + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, + max_flushes_pertxg); + } + return (max_flushes_pertxg); +} + +uint64_t +spa_log_sm_memused(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_memused); +} + +static boolean_t +spa_log_exceeds_memlimit(spa_t *spa) +{ + if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt) + return (B_TRUE); + + uint64_t system_mem_allowed = ((physmem * PAGESIZE) * + zfs_unflushed_max_mem_ppm) / 1000000; + if (spa_log_sm_memused(spa) > system_mem_allowed) + return (B_TRUE); + + return (B_FALSE); +} + +boolean_t +spa_flush_all_logs_requested(spa_t *spa) +{ + return (spa->spa_log_flushall_txg != 0); +} + +void +spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + + if (spa_sync_pass(spa) != 1) + return; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + /* + * If we don't have any metaslabs with unflushed changes + * return immediately. + */ + if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0) + return; + + /* + * During SPA export we leave a few empty TXGs to go by [see + * spa_final_dirty_txg() to understand why]. For this specific + * case, it is important to not flush any metaslabs as that + * would dirty this TXG. 
+ * + * That said, during one of these dirty TXGs that is less or + * equal to spa_final_dirty(), spa_unload() will request that + * we try to flush all the metaslabs for that TXG before + * exporting the pool, thus we ensure that we didn't get a + * request of flushing everything before we attempt to return + * immediately. + */ + if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && + !spa_flush_all_logs_requested(spa)) + return; + + /* + * We need to generate a log space map before flushing because this + * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg) + * for this TXG's flushed metaslab count (aka sls_mscount which is + * manipulated in many ways down the metaslab_flush() codepath). + * + * That is not to say that we may generate a log space map when we + * don't need it. If we are flushing metaslabs, that means that we + * were going to write changes to disk anyway, so even if we were + * not flushing, a log space map would have been created anyway in + * metaslab_sync(). + */ + spa_generate_syncing_log_sm(spa, tx); + + /* + * This variable tells us how many metaslabs we want to flush based + * on the block-heuristic of our flushing algorithm (see block comment + * of log space map feature). We also decrement this as we flush + * metaslabs and attempt to destroy old log space maps. + */ + uint64_t want_to_flush; + if (spa_flush_all_logs_requested(spa)) { + ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); + want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed); + } else { + want_to_flush = spa_estimate_metaslabs_to_flush(spa); + } + + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, + want_to_flush); + + /* Used purely for verification purposes */ + uint64_t visited = 0; + + /* + * Ideally we would only iterate through spa_metaslabs_by_flushed + * using only one variable (curr). 
We can't do that because + * metaslab_flush() mutates position of curr in the AVL when + * it flushes that metaslab by moving it to the end of the tree. + * Thus we always keep track of the original next node of the + * current node (curr) in another variable (next). + */ + metaslab_t *next = NULL; + for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed); + curr != NULL; curr = next) { + next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr); + + /* + * If this metaslab has been flushed this txg then we've done + * a full circle over the metaslabs. + */ + if (metaslab_unflushed_txg(curr) == txg) + break; + + /* + * If we are done flushing for the block heuristic and the + * unflushed changes don't exceed the memory limit just stop. + */ + if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) + break; + + mutex_enter(&curr->ms_sync_lock); + mutex_enter(&curr->ms_lock); + boolean_t flushed = metaslab_flush(curr, tx); + mutex_exit(&curr->ms_lock); + mutex_exit(&curr->ms_sync_lock); + + /* + * If we failed to flush a metaslab (because it was loading), + * then we are done with the block heuristic as it's not + * possible to destroy any log space maps once you've skipped + * a metaslab. In that case we just set our counter to 0 but + * we continue looping in case there is still memory pressure + * due to unflushed changes. Note that, flushing a metaslab + * that is not the oldest flushed in the pool, will never + * destroy any log space maps [see spa_cleanup_old_sm_logs()]. + */ + if (!flushed) { + want_to_flush = 0; + } else if (want_to_flush > 0) { + want_to_flush--; + } + + visited++; + } + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); +} + +/* + * Close the log space map for this TXG and update the block counts + * for the log's in-memory structure and the summary. 
+ */ +void +spa_sync_close_syncing_log_sm(spa_t *spa) +{ + if (spa_syncing_log_sm(spa) == NULL) + return; + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); + ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa)); + + sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa)); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + + /* + * Note that we can't assert that sls_mscount is not 0, + * because there is the case where the first metaslab + * in spa_metaslabs_by_flushed is loading and we were + * not able to flush any metaslabs the current TXG. + */ + ASSERT(sls->sls_nblocks != 0); + + spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks); + spa_log_summary_verify_counts(spa); + + space_map_close(spa->spa_syncing_log_sm); + spa->spa_syncing_log_sm = NULL; + + /* + * At this point we tried to flush as many metaslabs as we + * can as the pool is getting exported. Reset the "flush all" + * so the last few TXGs before closing the pool can be empty + * (e.g. not dirty). + */ + if (spa_flush_all_logs_requested(spa)) { + ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); + spa->spa_log_flushall_txg = 0; + } +} + +void +spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) +{ + objset_t *mos = spa_meta_objset(spa); + + uint64_t spacemap_zap; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + return; + } + VERIFY0(error); + + metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed); + uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest); + + /* Free all log space maps older than the oldest_flushed_txg. 
*/ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls && sls->sls_txg < oldest_flushed_txg; + sls = avl_first(&spa->spa_sm_logs_by_txg)) { + ASSERT0(sls->sls_mscount); + avl_remove(&spa->spa_sm_logs_by_txg, sls); + space_map_free_obj(mos, sls->sls_sm_obj, tx); + VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; + kmem_free(sls, sizeof (spa_log_sm_t)); + } +} + +static spa_log_sm_t * +spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg) +{ + spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP); + sls->sls_sm_obj = sm_obj; + sls->sls_txg = txg; + return (sls); +} + +void +spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + objset_t *mos = spa_meta_objset(spa); + + if (spa_syncing_log_sm(spa) != NULL) + return; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + uint64_t spacemap_zap; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = 0; + spacemap_zap = zap_create(mos, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, + &spacemap_zap, tx)); + spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx); + } + VERIFY0(error); + + uint64_t sm_obj; + ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj), + ==, ENOENT); + sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx); + VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx)); + avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg)); + + /* + * We pass UINT64_MAX as the space map's representation size + * and SPA_MINBLOCKSHIFT as the shift, to make the space map + * accept any sorts of segments since there's no real advantage + * to being more restrictive (given that we're already going 
+ * to be using 2-word entries). + */ + VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, + 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + /* + * If the log space map feature was just enabled, the blocklimit + * has not yet been set. + */ + if (spa_log_sm_blocklimit(spa) == 0) + spa_log_sm_set_blocklimit(spa); +} + +/* + * Find all the log space maps stored in the space map ZAP and sort + * them by their TXG in spa_sm_logs_by_txg. + */ +static int +spa_ld_log_sm_metadata(spa_t *spa) +{ + int error; + uint64_t spacemap_zap; + + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + /* the space map ZAP doesn't exist yet */ + return (0); + } else if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]", + error); + return (error); + } + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t log_txg = zfs_strtonum(za.za_name, NULL); + spa_log_sm_t *sls = + spa_log_sm_alloc(za.za_first_integer, log_txg); + avl_add(&spa->spa_sm_logs_by_txg, sls); + } + zap_cursor_fini(&zc); + if (error != ENOENT) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_cursor_retrieve(spacemap_zap) [error %d]", + error); + return (error); + } + + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) }; + spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, + &target, NULL); + + /* + * At this point if sls is zero it means that a bug occurred + * in ZFS the last time the pool was open or earlier in the + * import code path. 
In general, we would have placed a + * VERIFY() here or in this case just let the kernel panic + * with NULL pointer dereference when incrementing sls_mscount, + * but since this is the import code path we can be a bit more + * lenient. Thus, for DEBUG bits we always cause a panic, while + * in production we log the error and just fail the import. + */ + ASSERT(sls != NULL); + if (sls == NULL) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug " + "encountered: could not find log spacemap for " + "TXG %llu [error %d]", + (u_longlong_t)metaslab_unflushed_txg(m), ENOENT); + return (ENOENT); + } + sls->sls_mscount++; + } + + return (0); +} + +typedef struct spa_ld_log_sm_arg { + spa_t *slls_spa; + uint64_t slls_txg; +} spa_ld_log_sm_arg_t; + +static int +spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) +{ + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint32_t vdev_id = sme->sme_vdev; + + spa_ld_log_sm_arg_t *slls = arg; + spa_t *spa = slls->slls_spa; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* + * If the vdev has been removed (i.e. it is indirect or a hole) + * skip this entry. The contents of this vdev have already moved + * elsewhere. + */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(!ms->ms_loaded); + + /* + * If we have already flushed entries for this TXG to this + * metaslab's space map, then ignore it. Note that we flush + * before processing any allocations/frees for that TXG, so + * the metaslab's space map only has entries from *before* + * the unflushed TXG. 
+ */ + if (slls->slls_txg < metaslab_unflushed_txg(ms)) + return (0); + + switch (sme->sme_type) { + case SM_ALLOC: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_frees, ms->ms_unflushed_allocs); + break; + case SM_FREE: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_allocs, ms->ms_unflushed_frees); + break; + default: + panic("invalid maptype_t"); + break; + } + return (0); +} + +static int +spa_ld_log_sm_data(spa_t *spa) +{ + int error = 0; + + /* + * If we are not going to do any writes there is no need + * to read the log space maps. + */ + if (!spa_writeable(spa)) + return (0); + + ASSERT0(spa->spa_unflushed_stats.sus_nblocks); + ASSERT0(spa->spa_unflushed_stats.sus_memused); + + hrtime_t read_logs_starttime = gethrtime(); + /* this is a no-op when we don't have space map logs */ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + error = space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " + "space_map_open(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + struct spa_ld_log_sm_arg vla = { + .slls_spa = spa, + .slls_txg = sls->sls_txg + }; + error = space_map_iterate(sm, space_map_length(sm), + spa_ld_log_sm_cb, &vla); + if (error != 0) { + space_map_close(sm); + spa_load_failed(spa, "spa_ld_log_sm_data(): failed " + "at space_map_iterate(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, sls->sls_nblocks); + + space_map_close(sm); + } + hrtime_t read_logs_endtime = gethrtime(); + spa_load_note(spa, + "read %llu log space maps (%llu 
total blocks - blksz = %llu bytes) " + "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + (u_longlong_t)spa_log_sm_nblocks(spa), + (u_longlong_t)zfs_log_sm_blksz, + (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + +out: + /* + * Now that the metaslabs contain their unflushed changes: + * [1] recalculate their actual allocated space + * [2] recalculate their weights + * [3] sum up the memory usage of their unflushed range trees + * [4] optionally load them, if debug_load is set + * + * Note that even in the case where we get here because of an + * error (e.g. error != 0), we still want to update the fields + * below in order to have a proper teardown in spa_unload(). + */ + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + mutex_enter(&m->ms_lock); + m->ms_allocated_space = space_map_allocated(m->ms_sm) + + range_tree_space(m->ms_unflushed_allocs) - + range_tree_space(m->ms_unflushed_frees); + + vdev_t *vd = m->ms_group->mg_vd; + metaslab_space_update(vd, m->ms_group->mg_class, + range_tree_space(m->ms_unflushed_allocs), 0, 0); + metaslab_space_update(vd, m->ms_group->mg_class, + -range_tree_space(m->ms_unflushed_frees), 0, 0); + + ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK); + metaslab_recalculate_weight_and_sort(m); + + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(m); + + if (metaslab_debug_load && m->ms_sm != NULL) { + VERIFY0(metaslab_load(m)); + metaslab_set_selected_txg(m, 0); + } + mutex_exit(&m->ms_lock); + } + + return (error); +} + +static int +spa_ld_unflushed_txgs(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + if (vd->vdev_top_zap == 0) + return (0); + + uint64_t object = 0; + int error = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &object); + if (error == ENOENT) + return (0); + else if (error != 0) { + 
spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at " + "zap_lookup(vdev_top_zap=%llu) [error %d]", + (u_longlong_t)vd->vdev_top_zap, error); + return (error); + } + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + ASSERT(ms != NULL); + + metaslab_unflushed_phys_t entry; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + error = dmu_read(mos, object, + entry_offset, entry_size, &entry, 0); + if (error != 0) { + spa_load_failed(spa, "spa_ld_unflushed_txgs(): " + "failed at dmu_read(obj=%llu) [error %d]", + (u_longlong_t)object, error); + return (error); + } + + ms->ms_unflushed_txg = entry.msp_unflushed_txg; + if (ms->ms_unflushed_txg != 0) { + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, ms); + mutex_exit(&spa->spa_flushed_ms_lock); + } + } + return (0); +} + +/* + * Read all the log space map entries into their respective + * metaslab unflushed trees and keep them sorted by TXG in the + * SPA's metadata. In addition, setup all the metadata for the + * memory and the block heuristics. + */ +int +spa_ld_log_spacemaps(spa_t *spa) +{ + int error; + + spa_log_sm_set_blocklimit(spa); + + for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + error = spa_ld_unflushed_txgs(vd); + if (error != 0) + return (error); + } + + error = spa_ld_log_sm_metadata(spa); + if (error != 0) + return (error); + + /* + * Note: we don't actually expect anything to change at this point + * but we grab the config lock so we don't fail any assertions + * when using vdev_lookup_top(). 
+ */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + error = spa_ld_log_sm_data(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + return (error); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW, + "Specific hard-limit in memory that ZFS allows to be used for " + "unflushed changes"); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW, + "Percentage of the overall system memory that ZFS allows to be " + "used for unflushed changes (value is calculated over 1000000 for " + "finer granularity)"); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW, + "Hard limit (upper-bound) in the size of the space map log " + "in terms of blocks."); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW, + "Lower-bound limit for the maximum amount of blocks allowed in " + "log spacemap (see zfs_unflushed_log_block_max)"); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW, + "Tunable used to determine the number of blocks that can be used for " + "the spacemap log, expressed as a percentage of the total number of " + "metaslabs in the pool (e.g. 
400 means the number of log blocks is " + "capped at 4 times the number of metaslabs)"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW, + "The number of past TXGs that the flushing algorithm of the log " + "spacemap feature uses to estimate incoming log blocks"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW, + "Maximum number of rows allowed in the summary of the spacemap log"); + +ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW, + "Minimum number of metaslabs to flush per dirty TXG"); + +ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, + "Prevent the log spacemaps from being flushed and destroyed " + "during pool export/destroy"); +/* END CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a111a9e4e6..1ecd2294db 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,12 +20,13 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, loli10K . All rights reserved. 
*/ #include @@ -58,13 +59,15 @@ #include #include #include "zfs_prop.h" +#include #include -#include "qat.h" +#include +#include /* * SPA locking * - * There are four basic locks for managing spa_t structures: + * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * @@ -240,7 +243,7 @@ static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; -int spa_mode_global; +spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* @@ -301,20 +304,20 @@ int zfs_free_leak_on_eio = B_FALSE; * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ -unsigned long zfs_deadman_synctime_ms = 600000ULL; +unsigned long zfs_deadman_synctime_ms = 600000UL; /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ -unsigned long zfs_deadman_ziotime_ms = 300000ULL; +unsigned long zfs_deadman_ziotime_ms = 300000UL; /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ -unsigned long zfs_deadman_checktime_ms = 60000ULL; +unsigned long zfs_deadman_checktime_ms = 60000UL; /* * By default the deadman is enabled. @@ -344,11 +347,14 @@ int spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in - * the pool to be consumed. This ensures that we don't run the pool - * completely out of space, due to unaccounted changes (e.g. to the MOS). - * It also limits the worst-case time to allocate space. If we have - * less than this amount of free space, most ZPL operations (e.g. write, - * create) will return ENOSPC. + * the pool to be consumed (bounded by spa_max_slop). 
This ensures that we + * don't run the pool completely out of space, due to unaccounted changes (e.g. + * to the MOS). It also limits the worst-case time to allocate space. If we + * have less than this amount of free space, most ZPL operations (e.g. write, + * create) will return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are + * also part of this 3.2% of space which can't be consumed by normal writes; + * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded + * log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half @@ -371,14 +377,18 @@ int spa_asize_inflation = 24; * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. * + * Further, on very large pools, the slop space will be smaller than + * 3.2%, to avoid reserving much more space than we actually need; bounded + * by spa_max_slop (128GB). + * * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; -uint64_t spa_min_slop = 128 * 1024 * 1024; +uint64_t spa_min_slop = 128ULL * 1024 * 1024; +uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; int spa_allocators = 4; -/*PRINTFLIKE2*/ void spa_load_failed(spa_t *spa, const char *fmt, ...) { @@ -393,7 +403,6 @@ spa_load_failed(spa_t *spa, const char *fmt, ...) spa->spa_trust_config ? "trusted" : "untrusted", buf); } -/*PRINTFLIKE2*/ void spa_load_note(spa_t *spa, const char *fmt, ...) 
{ @@ -433,9 +442,9 @@ spa_config_lock_init(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); - zfs_refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; + scl->scl_count = 0; } } @@ -446,9 +455,9 @@ spa_config_lock_destroy(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); - zfs_refcount_destroy(&scl->scl_count); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); + ASSERT(scl->scl_count == 0); } } @@ -469,7 +478,7 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) } } else { ASSERT(scl->scl_writer != curthread); - if (!zfs_refcount_is_zero(&scl->scl_count)) { + if (scl->scl_count != 0) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); @@ -477,14 +486,14 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) } scl->scl_writer = curthread; } - (void) zfs_refcount_add(&scl->scl_count, tag); + scl->scl_count++; mutex_exit(&scl->scl_lock); } return (1); } void -spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { int wlocks_held = 0; @@ -503,29 +512,29 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) } } else { ASSERT(scl->scl_writer != curthread); - while (!zfs_refcount_is_zero(&scl->scl_count)) { + while (scl->scl_count != 0) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } - (void) zfs_refcount_add(&scl->scl_count, tag); + scl->scl_count++; mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void -spa_config_exit(spa_t *spa, int locks, void *tag) +spa_config_exit(spa_t *spa, int locks, const void *tag) { for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = 
&spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); - ASSERT(!zfs_refcount_is_zero(&scl->scl_count)); - if (zfs_refcount_remove(&scl->scl_count, tag) == 0) { + ASSERT(scl->scl_count > 0); + if (--scl->scl_count == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ @@ -544,8 +553,7 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; - if ((rw == RW_READER && - !zfs_refcount_is_zero(&scl->scl_count)) || + if ((rw == RW_READER && scl->scl_count != 0) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } @@ -604,7 +612,7 @@ spa_deadman(void *arg) zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, - ++spa->spa_deadman_calls); + (u_longlong_t)++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev, FTAG); @@ -613,6 +621,15 @@ spa_deadman(void *arg) MSEC_TO_TICK(zfs_deadman_checktime_ms)); } +static int +spa_log_sm_sort_by_txg(const void *va, const void *vb) +{ + const spa_log_sm_t *a = va; + const spa_log_sm_t *b = vb; + + return (TREE_CMP(a->sls_txg, b->sls_txg)); +} + /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. 
The caller must ensure that the spa_t doesn't already @@ -640,12 +657,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); @@ -658,6 +679,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; + spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); @@ -676,15 +698,20 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_root = spa_strdup(altroot); spa->spa_alloc_count = spa_allocators; - spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * - sizeof (kmutex_t), KM_SLEEP); - spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * - sizeof (avl_tree_t), KM_SLEEP); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * + sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); - avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, + NULL); + 
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, + sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); + avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, + sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); + list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), + offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile @@ -718,6 +745,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; + spa->spa_min_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; @@ -748,8 +776,9 @@ spa_remove(spa_t *spa) spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); + ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); @@ -767,14 +796,15 @@ spa_remove(spa_t *spa) } for (int i = 0; i < spa->spa_alloc_count; i++) { - avl_destroy(&spa->spa_alloc_trees[i]); - mutex_destroy(&spa->spa_alloc_locks[i]); + avl_destroy(&spa->spa_allocs[i].spaa_tree); + mutex_destroy(&spa->spa_allocs[i].spaa_lock); } - kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * - sizeof (kmutex_t)); - kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * - sizeof (avl_tree_t)); + kmem_free(spa->spa_allocs, spa->spa_alloc_count * + sizeof (spa_alloc_t)); + avl_destroy(&spa->spa_metaslabs_by_flushed); + avl_destroy(&spa->spa_sm_logs_by_txg); + list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); @@ -798,7 +828,10 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + cv_destroy(&spa->spa_activities_cv); 
+ cv_destroy(&spa->spa_waiters_cv); + mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); @@ -811,6 +844,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); + mutex_destroy(&spa->spa_activities_lock); kmem_free(spa, sizeof (spa_t)); } @@ -911,10 +945,10 @@ spa_aux_compare(const void *a, const void *b) const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; - return (AVL_CMP(sa->aux_guid, sb->aux_guid)); + return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } -void +static void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; @@ -932,7 +966,7 @@ spa_aux_add(vdev_t *vd, avl_tree_t *avl) } } -void +static void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; @@ -952,7 +986,7 @@ spa_aux_remove(vdev_t *vd, avl_tree_t *avl) } } -boolean_t +static boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; @@ -977,7 +1011,7 @@ spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) return (found != NULL); } -void +static void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; @@ -994,10 +1028,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl) /* * Spares are tracked globally due to the following constraints: * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within * another pool. - * - A spare in use in any pool can only be the source of a replacement if + * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. 
* * We keep track of all spares on the system through the use of a reference @@ -1135,6 +1169,30 @@ spa_vdev_enter(spa_t *spa) return (spa_vdev_config_enter(spa)); } +/* + * The same as spa_vdev_enter() above but additionally takes the guid of + * the vdev being detached. When there is a rebuild in process it will be + * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). + * The rebuild is canceled if only a single child remains after the detach. + */ +uint64_t +spa_vdev_detach_enter(spa_t *spa, uint64_t guid) +{ + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + + vdev_autotrim_stop_all(spa); + + if (guid != 0) { + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd) { + vdev_rebuild_stop_wait(vd->vdev_top); + } + } + + return (spa_vdev_config_enter(spa)); +} + /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while @@ -1168,7 +1226,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) /* * Reassess the DTLs. 
*/ - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; @@ -1180,6 +1238,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); @@ -1219,9 +1278,9 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) */ vdev_autotrim_stop_wait(vd); - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); + spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); vdev_free(vd); - spa_config_exit(spa, SCL_ALL, spa); + spa_config_exit(spa, SCL_STATE_ALL, spa); } /* @@ -1241,6 +1300,7 @@ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); + vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); @@ -1292,7 +1352,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) } if (vd != NULL || error == 0) - vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE); + vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) @@ -1310,7 +1370,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) /* * If anything changed, wait for it to sync. This ensures that, - * from the system administrator's perspective, zpool(1M) commands + * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. 
*/ @@ -1428,32 +1488,21 @@ spa_strfree(char *s) kmem_free(s, strlen(s) + 1); } -uint64_t -spa_get_random(uint64_t range) -{ - uint64_t r; - - ASSERT(range != 0); - - if (range == 1) - return (0); - - (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); - - return (r % range); -} - uint64_t spa_generate_guid(spa_t *spa) { - uint64_t guid = spa_get_random(-1ULL); + uint64_t guid; if (spa != NULL) { - while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); + do { + (void) random_get_pseudo_bytes((void *)&guid, + sizeof (guid)); + } while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)); } else { - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); + do { + (void) random_get_pseudo_bytes((void *)&guid, + sizeof (guid)); + } while (guid == 0 || spa_guid_exists(guid, 0)); } return (guid); @@ -1719,17 +1768,52 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) } /* - * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), - * or at least 128MB, unless that would cause it to be more than half the - * pool size. + * Return the amount of slop space in bytes. It is typically 1/32 of the pool + * (3.2%), minus the embedded log space. On very small pools, it may be + * slightly larger than this. On very large pools, it will be capped to + * the value of spa_max_slop. The embedded log space is not included in + * spa_dspace. By subtracting it, the usable space (per "zfs list") is a + * constant 97% of the total space, regardless of metaslab size (assuming the + * default spa_slop_shift=5 and a non-tiny pool). * - * See the comment above spa_slop_shift for details. + * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { - uint64_t space = spa_get_dspace(spa); - return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); + uint64_t space = 0; + uint64_t slop = 0; + + /* + * Make sure spa_dedup_dspace has been set. 
+ */ + if (spa->spa_dedup_dspace == ~0ULL) + spa_update_dspace(spa); + + /* + * spa_get_dspace() includes the space only logically "used" by + * deduplicated data, so since it's not useful to reserve more + * space with more deduplicated data, we subtract that out here. + */ + space = spa_get_dspace(spa) - spa->spa_dedup_dspace; + slop = MIN(space >> spa_slop_shift, spa_max_slop); + + /* + * Subtract the embedded log space, but no more than half the (3.2%) + * unusable space. Note, the "no more than half" is only relevant if + * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by + * default. + */ + uint64_t embedded_log = + metaslab_class_get_dspace(spa_embedded_log_class(spa)); + slop -= MIN(embedded_log, slop >> 1); + + /* + * Slop space should be at least spa_min_slop, but no more than half + * the entire pool. + */ + slop = MAX(slop, MIN(space >> 1, spa_min_slop)); + return (slop); } uint64_t @@ -1751,10 +1835,11 @@ spa_update_dspace(spa_t *spa) ddt_get_dedup_dspace(spa); if (spa->spa_vdev_removal != NULL) { /* - * We can't allocate from the removing device, so - * subtract its size. This prevents the DMU/DSL from - * filling up the (now smaller) pool while we are in the - * middle of removing the device. + * We can't allocate from the removing device, so subtract + * its size if it was included in dspace (i.e. if this is a + * normal-class vdev, not special/dedup). This prevents the + * DMU/DSL from filling up the (now smaller) pool while we + * are in the middle of removing the device. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking @@ -1766,8 +1851,17 @@ spa_update_dspace(spa_t *spa) spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - spa->spa_dspace -= spa_deflate(spa) ? 
- vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + /* + * If the stars align, we can wind up here after + * vdev_remove_complete() has cleared vd->vdev_mg but before + * spa->spa_vdev_removal gets cleared, so we must check before + * we dereference. + */ + if (vd->vdev_mg && + vd->vdev_mg->mg_class == spa_normal_class(spa)) { + spa->spa_dspace -= spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + } spa_config_exit(spa, SCL_VDEV, FTAG); } } @@ -1812,6 +1906,12 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } +metaslab_class_t * +spa_embedded_log_class(spa_t *spa) +{ + return (spa->spa_embedded_log_class); +} + metaslab_class_t * spa_special_class(spa_t *spa) { @@ -1831,12 +1931,10 @@ metaslab_class_t * spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk) { - if (DMU_OT_IS_ZIL(objtype)) { - if (spa->spa_log_class->mc_groups != 0) - return (spa_log_class(spa)); - else - return (spa_normal_class(spa)); - } + /* + * ZIL allocations determine their class in zio_alloc_zil(). 
+ */ + ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; @@ -1969,6 +2067,32 @@ spa_set_deadman_failmode(spa_t *spa, const char *failmode) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; } +void +spa_set_deadman_ziotime(hrtime_t ns) +{ + spa_t *spa = NULL; + + if (spa_mode_global != SPA_MODE_UNINIT) { + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + spa->spa_deadman_ziotime = ns; + mutex_exit(&spa_namespace_lock); + } +} + +void +spa_set_deadman_synctime(hrtime_t ns) +{ + spa_t *spa = NULL; + + if (spa_mode_global != SPA_MODE_UNINIT) { + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + spa->spa_deadman_synctime = ns; + mutex_exit(&spa_namespace_lock); + } +} + uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { @@ -2087,6 +2211,7 @@ spa_import_progress_init(void) spa_import_progress_list; procfs_list_install("zfs", + NULL, "import_progress", 0644, &spa_import_progress_list->procfs_list, @@ -2242,7 +2367,7 @@ spa_name_compare(const void *a1, const void *a2) s = strcmp(s1->spa_name, s2->spa_name); - return (AVL_ISIGN(s)); + return (TREE_ISIGN(s)); } void @@ -2252,7 +2377,7 @@ spa_boot_init(void) } void -spa_init(int mode) +spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); @@ -2271,7 +2396,7 @@ spa_init(int mode) spa_mode_global = mode; #ifndef _KERNEL - if (spa_mode_global != FREAD && dprintf_find_string("watch")) { + if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; @@ -2290,8 +2415,8 @@ spa_init(int mode) fm_init(); zfs_refcount_init(); unique_init(); - range_tree_init(); - metaslab_alloc_trace_init(); + zfs_btree_init(); + metaslab_stat_init(); ddt_init(); zio_init(); dmu_init(); @@ -2325,8 +2450,8 @@ spa_fini(void) dmu_fini(); zio_fini(); ddt_fini(); - metaslab_alloc_trace_fini(); 
- range_tree_fini(); + metaslab_stat_fini(); + zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); @@ -2345,14 +2470,14 @@ spa_fini(void) } /* - * Return whether this pool has slogs. No locking needed. + * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for - * performance and not correctness + * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) { - return (spa->spa_log_class->mc_rotor != NULL); + return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t @@ -2376,7 +2501,7 @@ spa_is_root(spa_t *spa) boolean_t spa_writeable(spa_t *spa) { - return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config); + return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); } /* @@ -2390,7 +2515,7 @@ spa_has_pending_synctask(spa_t *spa) !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } -int +spa_mode_t spa_mode(spa_t *spa) { return (spa->spa_mode); @@ -2540,22 +2665,10 @@ spa_multihost(spa_t *spa) return (spa->spa_multihost ? B_TRUE : B_FALSE); } -unsigned long -spa_get_hostid(void) +uint32_t +spa_get_hostid(spa_t *spa) { - unsigned long myhostid; - -#ifdef _KERNEL - myhostid = zone_get_hostid(NULL); -#else /* _KERNEL */ - /* - * We're emulating the system's hostid in userland, so - * we can't use zone_get_hostid(). 
- */ - (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); -#endif /* _KERNEL */ - - return (myhostid); + return (spa->spa_hostid); } boolean_t @@ -2570,6 +2683,12 @@ spa_missing_tvds_allowed(spa_t *spa) return (spa->spa_missing_tvds_allowed); } +space_map_t * +spa_syncing_log_sm(spa_t *spa) +{ + return (spa->spa_syncing_log_sm); +} + void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { @@ -2646,7 +2765,7 @@ boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && - spa->spa_mode == FREAD); + spa->spa_mode == SPA_MODE_READ); } uint64_t @@ -2684,95 +2803,32 @@ spa_suspend_async_destroy(spa_t *spa) #if defined(_KERNEL) -#include - -static int -param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp) +int +param_set_deadman_failmode_common(const char *val) { spa_t *spa = NULL; char *p; if (val == NULL) - return (SET_ERROR(-EINVAL)); + return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && strcmp(val, "panic")) - return (SET_ERROR(-EINVAL)); + return (SET_ERROR(EINVAL)); - if (spa_mode_global != 0) { + if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); } - return (param_set_charp(val, kp)); -} - -static int -param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp) -{ - spa_t *spa = NULL; - int error; - - error = param_set_ulong(val, kp); - if (error < 0) - return (SET_ERROR(error)); - - if (spa_mode_global != 0) { - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) - spa->spa_deadman_ziotime = - MSEC2NSEC(zfs_deadman_ziotime_ms); - mutex_exit(&spa_namespace_lock); - } - - return (0); -} - -static int -param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp) -{ - spa_t *spa = NULL; - int error; - - error = param_set_ulong(val, kp); - if 
(error < 0) - return (SET_ERROR(error)); - - if (spa_mode_global != 0) { - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) - spa->spa_deadman_synctime = - MSEC2NSEC(zfs_deadman_synctime_ms); - mutex_exit(&spa_namespace_lock); - } - - return (0); -} - -static int -param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp) -{ - unsigned long val; - int error; - - error = kstrtoul(buf, 0, &val); - if (error) - return (SET_ERROR(error)); - - if (val < 1 || val > 31) - return (SET_ERROR(-EINVAL)); - - error = param_set_int(buf, kp); - if (error < 0) - return (SET_ERROR(error)); - return (0); } +#endif /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); @@ -2836,7 +2892,6 @@ EXPORT_SYMBOL(spa_maxdnodesize); EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); -EXPORT_SYMBOL(spa_get_random); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); @@ -2862,57 +2917,47 @@ EXPORT_SYMBOL(spa_suspend_async_destroy); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); -/* BEGIN CSTYLED */ -module_param(zfs_flags, uint, 0644); -MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags"); +ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, + "Set additional debugging flags"); -module_param(zfs_recover, int, 0644); -MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors"); +ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, + "Set to attempt to recover from fatal errors"); -module_param(zfs_free_leak_on_eio, int, 0644); -MODULE_PARM_DESC(zfs_free_leak_on_eio, +ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); -module_param_call(zfs_deadman_synctime_ms, param_set_deadman_synctime, - param_get_ulong, &zfs_deadman_synctime_ms, 0644); -MODULE_PARM_DESC(zfs_deadman_synctime_ms, - "Pool sync expiration time in milliseconds"); - 
-module_param_call(zfs_deadman_ziotime_ms, param_set_deadman_ziotime, - param_get_ulong, &zfs_deadman_ziotime_ms, 0644); -MODULE_PARM_DESC(zfs_deadman_ziotime_ms, - "IO expiration time in milliseconds"); - -module_param(zfs_deadman_checktime_ms, ulong, 0644); -MODULE_PARM_DESC(zfs_deadman_checktime_ms, +ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, ULONG, ZMOD_RW, "Dead I/O check interval in milliseconds"); -module_param(zfs_deadman_enabled, int, 0644); -MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer"); +ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, + "Enable deadman timer"); -module_param_call(zfs_deadman_failmode, param_set_deadman_failmode, - param_get_charp, &zfs_deadman_failmode, 0644); -MODULE_PARM_DESC(zfs_deadman_failmode, "Failmode for deadman timer"); - -module_param(spa_asize_inflation, int, 0644); -MODULE_PARM_DESC(spa_asize_inflation, +ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW, "SPA size estimate multiplication factor"); -module_param_call(spa_slop_shift, param_set_slop_shift, param_get_int, - &spa_slop_shift, 0644); -MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool"); - -module_param(zfs_ddt_data_is_special, int, 0644); -MODULE_PARM_DESC(zfs_ddt_data_is_special, +ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, "Place DDT data into the special class"); -module_param(zfs_user_indirect_is_special, int, 0644); -MODULE_PARM_DESC(zfs_user_indirect_is_special, +ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); -module_param(zfs_special_class_metadata_reserve_pct, int, 0644); -MODULE_PARM_DESC(zfs_special_class_metadata_reserve_pct, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, + param_set_deadman_failmode, param_get_charp, ZMOD_RW, + "Failmode for deadman timer"); + +ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, + 
param_set_deadman_synctime, param_get_ulong, ZMOD_RW, + "Pool sync expiration time in milliseconds"); + +ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, + param_set_deadman_ziotime, param_get_ulong, ZMOD_RW, + "IO expiration time in milliseconds"); + +ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); /* END CSTYLED */ -#endif + +ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, + param_get_int, ZMOD_RW, "Reserved free space in pool"); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 6895428f4f..534ac72fee 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -122,14 +122,11 @@ static void spa_read_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.read_history; - char *module; shl->size = 0; - - module = kmem_asprintf("zfs/%s", spa_name(spa)); - shl->procfs_list.pl_private = shl; - procfs_list_install(module, + procfs_list_install("zfs", + spa_name(spa), "reads", 0600, &shl->procfs_list, @@ -137,8 +134,6 @@ spa_read_history_init(spa_t *spa) spa_read_history_show_header, spa_read_history_clear, offsetof(spa_read_history_t, srh_node)); - - strfree(module); } static void @@ -293,14 +288,11 @@ static void spa_txg_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.txg_history; - char *module; shl->size = 0; - - module = kmem_asprintf("zfs/%s", spa_name(spa)); - shl->procfs_list.pl_private = shl; - procfs_list_install(module, + procfs_list_install("zfs", + spa_name(spa), "txgs", 0644, &shl->procfs_list, @@ -308,8 +300,6 @@ spa_txg_history_init(spa_t *spa) spa_txg_history_show_header, spa_txg_history_clear, offsetof(spa_txg_history_t, sth_node)); - - strfree(module); } static void @@ -478,11 +468,11 @@ spa_tx_assign_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) { for (i = 0; i < shk->count; i++) - ((kstat_named_t *)shk->private)[i].value.ui64 = 0; + 
((kstat_named_t *)shk->priv)[i].value.ui64 = 0; } for (i = shk->count; i > 0; i--) - if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0) + if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0) break; ksp->ks_ndata = i; @@ -504,12 +494,12 @@ spa_tx_assign_init(spa_t *spa) shk->count = 42; /* power of two buckets for 1ns to 2,199s */ shk->size = shk->count * sizeof (kstat_named_t); - shk->private = kmem_alloc(shk->size, KM_SLEEP); + shk->priv = kmem_alloc(shk->size, KM_SLEEP); name = kmem_asprintf("zfs/%s", spa_name(spa)); for (i = 0; i < shk->count; i++) { - ks = &((kstat_named_t *)shk->private)[i]; + ks = &((kstat_named_t *)shk->priv)[i]; ks->data_type = KSTAT_DATA_UINT64; ks->value.ui64 = 0; (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", @@ -522,14 +512,14 @@ spa_tx_assign_init(spa_t *spa) if (ksp) { ksp->ks_lock = &shk->lock; - ksp->ks_data = shk->private; + ksp->ks_data = shk->priv; ksp->ks_ndata = shk->count; ksp->ks_data_size = shk->size; ksp->ks_private = spa; ksp->ks_update = spa_tx_assign_update; kstat_install(ksp); } - strfree(name); + kmem_strfree(name); } static void @@ -542,7 +532,7 @@ spa_tx_assign_destroy(spa_t *spa) if (ksp) kstat_delete(ksp); - kmem_free(shk->private, shk->size); + kmem_free(shk->priv, shk->size); mutex_destroy(&shk->lock); } @@ -555,55 +545,7 @@ spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) idx++; - atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64); -} - -/* - * ========================================================================== - * SPA IO History Routines - * ========================================================================== - */ -static int -spa_io_history_update(kstat_t *ksp, int rw) -{ - if (rw == KSTAT_WRITE) - memset(ksp->ks_data, 0, ksp->ks_data_size); - - return (0); -} - -static void -spa_io_history_init(spa_t *spa) -{ - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - char *name; - kstat_t *ksp; - - 
mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); - - name = kmem_asprintf("zfs/%s", spa_name(spa)); - - ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); - shk->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &shk->lock; - ksp->ks_private = spa; - ksp->ks_update = spa_io_history_update; - kstat_install(ksp); - } - strfree(name); -} - -static void -spa_io_history_destroy(spa_t *spa) -{ - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - - if (shk->kstat) - kstat_delete(shk->kstat); - - mutex_destroy(&shk->lock); + atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64); } /* @@ -675,7 +617,7 @@ spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) while (shl->size > size) { smh = list_remove_head(&shl->procfs_list.pl_list); if (smh->vdev_path) - strfree(smh->vdev_path); + kmem_strfree(smh->vdev_path); kmem_free(smh, sizeof (spa_mmp_history_t)); shl->size--; } @@ -699,14 +641,12 @@ static void spa_mmp_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; - char *module; shl->size = 0; - module = kmem_asprintf("zfs/%s", spa_name(spa)); - shl->procfs_list.pl_private = shl; - procfs_list_install(module, + procfs_list_install("zfs", + spa_name(spa), "multihost", 0644, &shl->procfs_list, @@ -714,8 +654,6 @@ spa_mmp_history_init(spa_t *spa) spa_mmp_history_show_header, spa_mmp_history_clear, offsetof(spa_mmp_history_t, smh_node)); - - strfree(module); } static void @@ -814,7 +752,7 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, if (vd) { smh->vdev_guid = vd->vdev_guid; if (vd->vdev_path) - smh->vdev_path = strdup(vd->vdev_path); + smh->vdev_path = kmem_strdup(vd->vdev_path); } smh->vdev_label = label; smh->mmp_node_id = mmp_node_id; @@ -835,7 +773,9 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, static void * spa_state_addr(kstat_t *ksp, loff_t n) { - return (ksp->ks_private); /* return the spa_t */ + if (n == 0) + return (ksp->ks_private); /* return the spa_t */ 
+ return (NULL); } static int @@ -876,7 +816,7 @@ spa_state_init(spa_t *spa) kstat_install(ksp); } - strfree(name); + kmem_strfree(name); } static void @@ -903,6 +843,12 @@ static spa_iostats_t spa_iostats_template = { { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, }; #define SPA_IOSTATS_ADD(stat, val) \ @@ -929,17 +875,24 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type, SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); - } else { + } else if (type == TRIM_TYPE_AUTO) { SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); + } else { + SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); + SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); } } -int +static int spa_iostats_update(kstat_t *ksp, int rw) { if (rw == KSTAT_WRITE) { @@ -973,7 +926,7 @@ spa_iostats_init(spa_t *spa) kstat_install(ksp); } - strfree(name); + kmem_strfree(name); } static void @@ -995,7 +948,6 @@ spa_stats_init(spa_t 
*spa) spa_read_history_init(spa); spa_txg_history_init(spa); spa_tx_assign_init(spa); - spa_io_history_init(spa); spa_mmp_history_init(spa); spa_state_init(spa); spa_iostats_init(spa); @@ -1009,26 +961,19 @@ spa_stats_destroy(spa_t *spa) spa_tx_assign_destroy(spa); spa_txg_history_destroy(spa); spa_read_history_destroy(spa); - spa_io_history_destroy(spa); spa_mmp_history_destroy(spa); } -#if defined(_KERNEL) -/* CSTYLED */ -module_param(zfs_read_history, int, 0644); -MODULE_PARM_DESC(zfs_read_history, - "Historical statistics for the last N reads"); +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW, + "Historical statistics for the last N reads"); -module_param(zfs_read_history_hits, int, 0644); -MODULE_PARM_DESC(zfs_read_history_hits, - "Include cache hits in read history"); +ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW, + "Include cache hits in read history"); -module_param(zfs_txg_history, int, 0644); -MODULE_PARM_DESC(zfs_txg_history, - "Historical statistics for the last N txgs"); +ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW, + "Historical statistics for the last N txgs"); -module_param(zfs_multihost_history, int, 0644); -MODULE_PARM_DESC(zfs_multihost_history, - "Historical statistics for last N multihost writes"); +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW, + "Historical statistics for last N multihost writes"); /* END CSTYLED */ -#endif diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index d9cd8767e0..11d4798925 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
*/ #include @@ -34,7 +34,6 @@ #include #include #include -#include #include /* @@ -96,6 +95,7 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) ZIO_PRIORITY_SYNC_READ); int error = 0; + uint64_t txg = 0, sync_pass = 0; for (uint64_t block_base = 0; block_base < end && error == 0; block_base += blksz) { dmu_buf_t *db; @@ -117,8 +117,29 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) block_cursor < block_end && error == 0; block_cursor++) { uint64_t e = *block_cursor; - if (sm_entry_is_debug(e)) /* Skip debug entries */ + if (sm_entry_is_debug(e)) { + /* + * Debug entries are only needed to record the + * current TXG and sync pass if available. + * + * Note though that sometimes there can be + * debug entries that are used as padding + * at the end of space map blocks in-order + * to not split a double-word entry in the + * middle between two blocks. These entries + * have their TXG field set to 0 and we + * skip them without recording the TXG. + * [see comment in space_map_write_seg()] + */ + uint64_t e_txg = SM_DEBUG_TXG_DECODE(e); + if (e_txg != 0) { + txg = e_txg; + sync_pass = SM_DEBUG_SYNCPASS_DECODE(e); + } else { + ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e)); + } continue; + } uint64_t raw_offset, raw_run, vdev_id; maptype_t type; @@ -158,7 +179,9 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) .sme_type = type, .sme_vdev = vdev_id, .sme_offset = entry_offset, - .sme_run = entry_run + .sme_run = entry_run, + .sme_txg = txg, + .sme_sync_pass = sync_pass }; error = callback(&sme, arg); } @@ -523,8 +546,9 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) * dbuf must be dirty for the changes in sm_phys to take effect. 
*/ static void -space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, - uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, + maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, + void *tag, dmu_tx_t *tx) { ASSERT3U(words, !=, 0); ASSERT3U(words, <=, 2); @@ -548,14 +572,14 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, ASSERT3P(block_cursor, <=, block_end); - uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t size = (rend - rstart) >> sm->sm_shift; + uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift; uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; - ASSERT3U(rs->rs_start, >=, sm->sm_start); - ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); - ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); - ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + ASSERT3U(rstart, >=, sm->sm_start); + ASSERT3U(rstart, <, sm->sm_start + sm->sm_size); + ASSERT3U(rend - rstart, <=, sm->sm_size); + ASSERT3U(rend, <=, sm->sm_start + sm->sm_size); while (size != 0) { ASSERT3P(block_cursor, <=, block_end); @@ -650,7 +674,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, space_map_write_intro_debug(sm, maptype, tx); -#ifdef DEBUG +#ifdef ZFS_DEBUG /* * We do this right after we write the intro debug entry * because the estimate does not take it into account. 
@@ -673,10 +697,14 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, dmu_buf_will_dirty(db, tx); - avl_tree_t *t = &rt->rt_root; - for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + zfs_btree_t *t = &rt->rt_root; + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL; + rs = zfs_btree_next(t, &where, &where)) { + uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >> + sm->sm_shift; + uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >> + sm->sm_shift; uint8_t words = 1; /* @@ -698,16 +726,16 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, length > SM_RUN_MAX || vdev_id != SM_NO_VDEVID || (zfs_force_some_double_word_sm_entries && - spa_get_random(100) == 0))) + random_in_range(100) == 0))) words = 2; - space_map_write_seg(sm, rs, maptype, vdev_id, words, - &db, FTAG, tx); + space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs, + rt), maptype, vdev_id, words, &db, FTAG, tx); } dmu_buf_rele(db, FTAG); -#ifdef DEBUG +#ifdef ZFS_DEBUG /* * We expect our estimation to be based on the worst case * scenario [see comment in space_map_estimate_optimal_size()]. @@ -749,7 +777,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, else sm->sm_phys->smp_alloc -= range_tree_space(rt); - uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t nodes = zfs_btree_numnodes(&rt->rt_root); uint64_t rt_space = range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); @@ -758,7 +786,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. 
*/ - VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); + VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); } @@ -849,9 +877,11 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) doi.doi_data_block_size != blocksize || doi.doi_metadata_block_size != 1 << space_map_ibs) { zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating " - "object[%llu]: old bonus %u, old blocksz %u", - dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, - doi.doi_bonus_size, doi.doi_data_block_size); + "object[%llu]: old bonus %llu, old blocksz %u", + (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), sm, + (u_longlong_t)sm->sm_object, + (u_longlong_t)doi.doi_bonus_size, + doi.doi_data_block_size); space_map_free(sm, tx); dmu_buf_rele(sm->sm_dbuf, sm); @@ -1067,3 +1097,11 @@ space_map_length(space_map_t *sm) { return (sm != NULL ? sm->sm_phys->smp_length : 0); } + +uint64_t +space_map_nblocks(space_map_t *sm) +{ + if (sm == NULL) + return (0); + return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz)); +} diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c index aa289ba106..080fc66465 100644 --- a/module/zfs/space_reftree.c +++ b/module/zfs/space_reftree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. 
*/ #include @@ -57,11 +57,11 @@ space_reftree_compare(const void *x1, const void *x2) const space_ref_t *sr1 = (const space_ref_t *)x1; const space_ref_t *sr2 = (const space_ref_t *)x2; - int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(sr1, sr2)); + return (TREE_PCMP(sr1, sr2)); } void @@ -109,10 +109,13 @@ space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) { - range_seg_t *rs; + zfs_btree_index_t where; - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = + zfs_btree_next(&rt->rt_root, &where, &where)) { + space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs, + rt), refcnt); + } } /* diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 0fcd569e3b..c9eb84bbdb 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #include @@ -33,7 +33,7 @@ #include #include #include -#include +#include /* * ZFS Transaction Groups @@ -242,16 +242,11 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) { CALLB_CPR_SAFE_BEGIN(cpr); - /* - * cv_wait_sig() is used instead of cv_wait() in order to prevent - * this process from incorrectly contributing to the system load - * average when idle. 
- */ if (time) { - (void) cv_timedwait_sig(cv, &tx->tx_sync_lock, + (void) cv_timedwait_idle(cv, &tx->tx_sync_lock, ddi_get_lbolt() + time); } else { - cv_wait_sig(cv, &tx->tx_sync_lock); + cv_wait_idle(cv, &tx->tx_sync_lock); } CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); @@ -272,7 +267,7 @@ txg_sync_stop(dsl_pool_t *dp) ASSERT3U(tx->tx_threads, ==, 2); /* - * We need to ensure that we've vacated the deferred space_maps. + * We need to ensure that we've vacated the deferred metaslab trees. */ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); @@ -297,6 +292,27 @@ txg_sync_stop(dsl_pool_t *dp) mutex_exit(&tx->tx_sync_lock); } +/* + * Get a handle on the currently open txg and keep it open. + * + * The txg is guaranteed to stay open until txg_rele_to_quiesce() is called for + * the handle. Once txg_rele_to_quiesce() has been called, the txg stays + * in quiescing state until txg_rele_to_sync() is called for the handle. + * + * It is guaranteed that subsequent calls return monotonically increasing + * txgs for the same dsl_pool_t. Of course this is not strong monotonicity, + * because the same txg can be returned multiple times in a row. This + * guarantee holds both for subsequent calls from one thread and for multiple + * threads. For example, it is impossible to observe the following sequence + * of events: + * + * Thread 1 Thread 2 + * + * 1 <- txg_hold_open(P, ...) + * 2 <- txg_hold_open(P, ...) + * 1 <- txg_hold_open(P, ...) + * + */ uint64_t txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) { @@ -310,9 +326,7 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) * significance to the chosen tx_cpu. Because.. Why not use * the current cpu to index into the array? 
*/ - kpreempt_disable(); - tc = &tx->tx_cpu[CPU_SEQID]; - kpreempt_enable(); + tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE]; mutex_enter(&tc->tc_open_lock); txg = tx->tx_open_txg; @@ -400,7 +414,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time); /* - * Quiesce the transaction group by waiting for everyone to txg_exit(). + * Quiesce the transaction group by waiting for everyone to + * call txg_rele_to_sync() for their open transaction handles. */ for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; @@ -453,8 +468,9 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) * Commit callback taskq hasn't been created yet. */ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - max_ncpus, defclsyspri, max_ncpus, max_ncpus * 2, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, boot_ncpus * 2, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); } cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); @@ -482,14 +498,6 @@ txg_wait_callbacks(dsl_pool_t *dp) taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - static boolean_t txg_is_quiescing(dsl_pool_t *dp) { @@ -523,8 +531,6 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -535,10 +541,10 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { + !txg_has_quiesced_to_sync(dp)) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + 
(u_longlong_t)tx->tx_synced_txg, + (u_longlong_t)tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); delta = ddi_get_lbolt() - start; timer = (delta > timeout ? 0 : timeout - delta); @@ -549,6 +555,11 @@ txg_sync_thread(void *arg) * prompting it to do so if necessary. */ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { + if (txg_is_quiescing(dp)) { + txg_thread_wait(tx, &cpr, + &tx->tx_quiesce_done_cv, 0); + continue; + } if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -571,7 +582,8 @@ txg_sync_thread(void *arg) cv_broadcast(&tx->tx_quiesce_more_cv); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting, + (u_longlong_t)tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp); @@ -622,8 +634,9 @@ txg_quiesce_thread(void *arg) txg = tx->tx_open_txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, - tx->tx_sync_txg_waiting); + (u_longlong_t)txg, + (u_longlong_t)tx->tx_quiesce_txg_waiting, + (u_longlong_t)tx->tx_sync_txg_waiting); tx->tx_quiescing_txg = txg; mutex_exit(&tx->tx_sync_lock); @@ -633,7 +646,8 @@ txg_quiesce_thread(void *arg) /* * Hand this txg off to the sync thread. */ - dprintf("quiesce done, handing off txg %llu\n", txg); + dprintf("quiesce done, handing off txg %llu\n", + (u_longlong_t)txg); tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); @@ -644,8 +658,8 @@ txg_quiesce_thread(void *arg) /* * Delay this thread by delay nanoseconds if we are still in the open - * transaction group and there is already a waiting txg quiesing or quiesced. - * Abort the delay if this txg stalls or enters the quiesing state. 
+ * transaction group and there is already a waiting txg quiescing or quiesced. + * Abort the delay if this txg stalls or enters the quiescing state. */ void txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) @@ -675,8 +689,8 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) mutex_exit(&tx->tx_sync_lock); } -void -txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +static boolean_t +txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig) { tx_state_t *tx = &dp->dp_tx; @@ -689,15 +703,47 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) if (tx->tx_sync_txg_waiting < txg) tx->tx_sync_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting, + (u_longlong_t)tx->tx_sync_txg_waiting); while (tx->tx_synced_txg < txg) { dprintf("broadcasting sync more " - "tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + "tx_synced=%llu waiting=%llu dp=%px\n", + (u_longlong_t)tx->tx_synced_txg, + (u_longlong_t)tx->tx_sync_txg_waiting, dp); cv_broadcast(&tx->tx_sync_more_cv); - cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + if (wait_sig) { + /* + * Condition wait here but stop if the thread receives a + * signal. The caller may call txg_wait_synced*() again + * to resume waiting for this txg. + */ + if (cv_wait_io_sig(&tx->tx_sync_done_cv, + &tx->tx_sync_lock) == 0) { + mutex_exit(&tx->tx_sync_lock); + return (B_TRUE); + } + } else { + cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + } } mutex_exit(&tx->tx_sync_lock); + return (B_FALSE); +} + +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +{ + VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE)); +} + +/* + * Similar to a txg_wait_synced but it can be interrupted from a signal. + * Returns B_TRUE if the thread was signaled while waiting. 
+ */ +boolean_t +txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg) +{ + return (txg_wait_synced_impl(dp, txg, B_TRUE)); } /* @@ -718,7 +764,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) if (tx->tx_quiesce_txg_waiting < txg && should_quiesce) tx->tx_quiesce_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting, + (u_longlong_t)tx->tx_sync_txg_waiting); while (tx->tx_open_txg < txg) { cv_broadcast(&tx->tx_quiesce_more_cv); /* @@ -730,31 +777,30 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) if (should_quiesce == B_TRUE) { cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); } else { - cv_wait_sig(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); + cv_wait_idle(&tx->tx_quiesce_done_cv, + &tx->tx_sync_lock); } } mutex_exit(&tx->tx_sync_lock); } /* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by queiscing the open txg. + * Pass in the txg number that should be synced. 
*/ void -txg_kick(dsl_pool_t *dp) +txg_kick(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); + if (tx->tx_sync_txg_waiting >= txg) + return; + mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && - !txg_is_quiescing(dp) && - tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { - tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; - cv_broadcast(&tx->tx_quiesce_more_cv); + if (tx->tx_sync_txg_waiting < txg) { + tx->tx_sync_txg_waiting = txg; + cv_broadcast(&tx->tx_sync_more_cv); } mutex_exit(&tx->tx_sync_lock); } @@ -783,7 +829,7 @@ txg_sync_waiting(dsl_pool_t *dp) void txg_verify(spa_t *spa, uint64_t txg) { - ASSERTV(dsl_pool_t *dp = spa_get_dsl(spa)); + dsl_pool_t *dp __maybe_unused = spa_get_dsl(spa); if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) return; ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); @@ -1008,7 +1054,6 @@ txg_list_next(txg_list_t *tl, void *p, uint64_t txg) return (tn == NULL ? 
NULL : (char *)tn - tl->tl_offset); } -#if defined(_KERNEL) EXPORT_SYMBOL(txg_init); EXPORT_SYMBOL(txg_fini); EXPORT_SYMBOL(txg_sync_start); @@ -1024,6 +1069,7 @@ EXPORT_SYMBOL(txg_wait_callbacks); EXPORT_SYMBOL(txg_stalled); EXPORT_SYMBOL(txg_sync_waiting); -module_param(zfs_txg_timeout, int, 0644); -MODULE_PARM_DESC(zfs_txg_timeout, "Max seconds worth of delta per txg"); -#endif +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, INT, ZMOD_RW, + "Max seconds worth of delta per txg"); +/* END CSTYLED */ diff --git a/module/zfs/unique.c b/module/zfs/unique.c index 5cdd025f49..0e076797a0 100644 --- a/module/zfs/unique.c +++ b/module/zfs/unique.c @@ -45,7 +45,7 @@ unique_compare(const void *a, const void *b) const unique_t *una = (const unique_t *)a; const unique_t *unb = (const unique_t *)b; - return (AVL_CMP(una->un_value, unb->un_value)); + return (TREE_CMP(una->un_value, unb->un_value)); } void diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 085ae68731..4a67ba85f5 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -21,12 +21,14 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Datto Inc. All rights reserved. + * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include @@ -38,6 +40,8 @@ #include #include #include +#include +#include #include #include #include @@ -49,12 +53,34 @@ #include #include #include +#include #include #include #include #include #include +/* + * One metaslab from each (normal-class) vdev is used by the ZIL. These are + * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are + * part of the spa_embedded_log_class. 
The metaslab with the most free space + * in each vdev is selected for this purpose when the pool is opened (or a + * vdev is added). See vdev_metaslab_init(). + * + * Log blocks can be allocated from the following locations. Each one is tried + * in order until the allocation succeeds: + * 1. dedicated log vdevs, aka "slog" (spa_log_class) + * 2. embedded slog metaslabs (spa_embedded_log_class) + * 3. other metaslabs in normal vdevs (spa_normal_class) + * + * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer + * than this number of metaslabs in the vdev. This ensures that we don't set + * aside an unreasonable amount of space for the ZIL. If set to less than + * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced + * (by more than 1<vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { - zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, + zfs_dbgmsg("%*svdev %llu: %s", indent, "", + (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } @@ -188,6 +217,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -214,17 +245,34 @@ vdev_getops(const char *type) return (ops); } +/* + * Given a vdev and a metaslab class, find which metaslab group we're + * interested in. All vdevs may belong to two different metaslab classes. + * Dedicated slog devices use only the primary metaslab group, rather than a + * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. 
+ */ +metaslab_group_t * +vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) +{ + if (mc == spa_embedded_log_class(vd->vdev_spa) && + vd->vdev_log_mg != NULL) + return (vd->vdev_log_mg); + else + return (vd->vdev_mg); +} + /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { - res->rs_start = in->rs_start; - res->rs_end = in->rs_end; + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; } /* - * Derive the enumerated alloction bias from string input. - * String origin is either the per-vdev zap or zpool(1M). + * Derive the enumerated allocation bias from string input. + * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) @@ -259,6 +307,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) return (asize); } +uint64_t +vdev_default_min_asize(vdev_t *vd) +{ + return (vd->vdev_min_asize); +} + /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to @@ -284,15 +338,7 @@ vdev_get_min_asize(vdev_t *vd) if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - /* - * The allocatable space for a raidz vdev is N * sizeof(smallest child), - * so each child must provide at least 1/Nth of its asize. - */ - if (pvd->vdev_ops == &vdev_raidz_ops) - return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / - pvd->vdev_children); - - return (pvd->vdev_min_asize); + return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void @@ -304,6 +350,48 @@ vdev_set_min_asize(vdev_t *vd) vdev_set_min_asize(vd->vdev_child[c]); } +/* + * Get the minimal allocation size for the top-level vdev. 
+ */ +uint64_t +vdev_get_min_alloc(vdev_t *vd) +{ + uint64_t min_alloc = 1ULL << vd->vdev_ashift; + + if (vd->vdev_ops->vdev_op_min_alloc != NULL) + min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); + + return (min_alloc); +} + +/* + * Get the parity level for a top-level vdev. + */ +uint64_t +vdev_get_nparity(vdev_t *vd) +{ + uint64_t nparity = 0; + + if (vd->vdev_ops->vdev_op_nparity != NULL) + nparity = vd->vdev_ops->vdev_op_nparity(vd); + + return (nparity); +} + +/* + * Get the number of data disks for a top-level vdev. + */ +uint64_t +vdev_get_ndisks(vdev_t *vd) +{ + uint64_t ndisks = 1; + + if (vd->vdev_ops->vdev_op_ndisks != NULL) + ndisks = vd->vdev_ops->vdev_op_ndisks(vd); + + return (ndisks); +} + vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { @@ -528,7 +616,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); /* * Initialize rate limit structs for events. 
We rate limit ZIO delay @@ -537,6 +626,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); + zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, + 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); @@ -545,14 +636,17 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); @@ -560,9 +654,14 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL); + vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); } + txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, @@ -585,7 +684,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, { vdev_ops_t *ops; 
char *type; - uint64_t guid = 0, islog, nparity; + uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; @@ -642,48 +741,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; - } - ASSERT(nparity != -1ULL); - - /* - * If creating a top-level vdev, check for allocation classes input - */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; + /* + * If creating a top-level vdev, check for allocation + * classes input. + */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); @@ -695,13 +759,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); } } + + /* spa_vdev_add() expects feature to be enabled */ + if (ops == &vdev_draid_ops && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { + return (SET_ERROR(ENOTSUP)); + } + } + + /* + * Initialize the vdev specific data. 
This is done before calling + * vdev_alloc_common() since it may fail and this simplifies the + * error reporting and cleanup code paths. + */ + void *tsd = NULL; + if (ops->vdev_op_init != NULL) { + rc = ops->vdev_op_init(spa, nv, &tsd); + if (rc != 0) { + return (rc); + } } vd = vdev_alloc_common(spa, id, guid, ops); - vic = &vd->vdev_indirect_config; - + vd->vdev_tsd = tsd; vd->vdev_islog = islog; - vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; @@ -741,6 +824,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; + vic = &vd->vdev_indirect_config; + ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); @@ -832,8 +917,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + &vd->vdev_rebuild_txg); + if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, vd); + vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the @@ -887,6 +975,7 @@ vdev_free(vdev_t *vd) ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -918,12 +1007,21 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + if (vd->vdev_ops->vdev_op_fini != NULL) + vd->vdev_ops->vdev_op_fini(vd); + /* * Discard allocation state. 
*/ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); + vd->vdev_mg = NULL; + } + if (vd->vdev_log_mg != NULL) { + ASSERT0(vd->vdev_ms_count); + metaslab_group_destroy(vd->vdev_log_mg); + vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); @@ -994,10 +1092,12 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); + mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); @@ -1005,7 +1105,11 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); + mutex_destroy(&vd->vdev_rebuild_lock); + cv_destroy(&vd->vdev_rebuild_cv); + zfs_ratelimit_fini(&vd->vdev_delay_rl); + zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) @@ -1040,14 +1144,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); + if (tvd->vdev_log_mg) + ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; + svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; + if (tvd->vdev_log_mg != NULL) + tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; @@ -1074,7 +1184,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); + ASSERT0(tvd->vdev_rebuilding); tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_rebuilding = svd->vdev_rebuilding; + tvd->vdev_rebuild_config = 
svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; @@ -1088,6 +1201,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; + svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) @@ -1130,7 +1244,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd) } /* - * Add a mirror/replacing vdev above an existing vdev. + * Add a mirror/replacing vdev above an existing vdev. There is no need to + * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) @@ -1148,6 +1263,8 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; + mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; @@ -1179,7 +1296,8 @@ vdev_remove_parent(vdev_t *cvd) mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; - + cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; + cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); @@ -1217,7 +1335,7 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } -static void +void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; @@ -1251,10 +1369,15 @@ vdev_metaslab_group_create(vdev_t *vd) vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); + if (!vd->vdev_islog) { + vd->vdev_log_mg = metaslab_group_create( + spa_embedded_log_class(spa), vd, 1); + } + /* - * The spa ashift values currently only reflect the - * general vdev classes. 
Class destination is late - * binding so ashift checking had to wait until now + * The spa ashift min/max only apply for the normal metaslab + * class. Class destination is late binding so ashift boundary + * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { @@ -1262,6 +1385,10 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; + + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; } } } @@ -1270,8 +1397,6 @@ int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; @@ -1299,16 +1424,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { - uint64_t object = 0; + for (uint64_t m = oldc; m < newc; m++) { + uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { - error = dmu_read(mos, vd->vdev_ms_array, + error = dmu_read(spa->spa_meta_objset, + vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { @@ -1318,17 +1444,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } -#ifndef _KERNEL - /* - * To accomodate zdb_leak_init() fake indirect - * metaslabs, we allocate a metaslab group for - * indirect vdevs which normally don't have one. 
- */ - if (vd->vdev_mg == NULL) { - ASSERT0(vdev_is_concrete(vd)); - vdev_metaslab_group_create(vd); - } -#endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { @@ -1338,6 +1453,47 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } + /* + * Find the emptiest metaslab on the vdev and mark it for use for + * embedded slog by moving it from the regular to the log metaslab + * group. + */ + if (vd->vdev_mg->mg_class == spa_normal_class(spa) && + vd->vdev_ms_count > zfs_embedded_slog_min_ms && + avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { + uint64_t slog_msid = 0; + uint64_t smallest = UINT64_MAX; + + /* + * Note, we only search the new metaslabs, because the old + * (pre-existing) ones may be active (e.g. have non-empty + * range_tree's), and we don't move them to the new + * metaslab_t. + */ + for (uint64_t m = oldc; m < newc; m++) { + uint64_t alloc = + space_map_allocated(vd->vdev_ms[m]->ms_sm); + if (alloc < smallest) { + slog_msid = m; + smallest = alloc; + } + } + metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; + /* + * The metaslab was marked as dirty at the end of + * metaslab_init(). Remove it from the dirty list so that we + * can uninitialize and reinitialize it to the new class. + */ + if (txg != 0) { + (void) txg_list_remove_this(&vd->vdev_ms_list, + slog_ms, txg); + } + uint64_t sm_obj = space_map_object(slog_ms->ms_sm); + metaslab_fini(slog_ms); + VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, + &vd->vdev_ms[slog_msid])); + } + if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); @@ -1348,11 +1504,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); + if (vd->vdev_log_mg != NULL) + metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); + /* + * Regardless whether this vdev was just added or it is being + * expanded, the metaslab count has changed. 
Recalculate the + * block limit. + */ + spa_log_sm_set_blocklimit(spa); + return (0); } @@ -1376,7 +1541,12 @@ vdev_metaslab_fini(vdev_t *vd) if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + if (vd->vdev_log_mg != NULL) { + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); + } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { @@ -1386,11 +1556,13 @@ vdev_metaslab_fini(vdev_t *vd) } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; - vd->vdev_ms_count = 0; - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); + if (vd->vdev_log_mg != NULL) + ASSERT0(vd->vdev_log_mg->mg_histogram[i]); + } } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); @@ -1439,8 +1611,8 @@ vdev_probe_done(zio_t *zio) } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); - zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, NULL, 0, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, + spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } @@ -1543,7 +1715,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); @@ -1556,6 +1728,14 @@ vdev_probe(vdev_t *vd, zio_t *zio) return (NULL); } +static void +vdev_load_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_load_error = vdev_load(vd); +} + static void vdev_open_child(void *arg) { @@ -1581,39 +1761,67 @@ vdev_uses_zvols(vdev_t *vd) return (B_FALSE); } +/* + * Returns B_TRUE if the passed child should be opened. 
+ */ +static boolean_t +vdev_default_open_children_func(vdev_t *vd) +{ + return (B_TRUE); +} + +/* + * Open the requested child vdevs. If any of the leaf vdevs are using + * a ZFS volume then do the opens in a single thread. This avoids a + * deadlock when the current thread is holding the spa_namespace_lock. + */ +static void +vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + int children = vd->vdev_children; + + taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + vd->vdev_nonrot = B_TRUE; + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (open_func(cvd) == B_FALSE) + continue; + + if (tq == NULL || vdev_uses_zvols(vd)) { + cvd->vdev_open_error = vdev_open(cvd); + } else { + VERIFY(taskq_dispatch(tq, vdev_open_child, + cvd, TQ_SLEEP) != TASKQID_INVALID); + } + + vd->vdev_nonrot &= cvd->vdev_nonrot; + } + + if (tq != NULL) { + taskq_wait(tq); + taskq_destroy(tq); + } +} + +/* + * Open all child vdevs. + */ void vdev_open_children(vdev_t *vd) { - taskq_t *tq; - int children = vd->vdev_children; + vdev_open_children_impl(vd, vdev_default_open_children_func); +} - /* - * in order to handle pools on top of zvols, do the opens - * in a single thread so that the same thread holds the - * spa_namespace_lock - */ - if (vdev_uses_zvols(vd)) { -retry_sync: - for (int c = 0; c < children; c++) - vd->vdev_child[c]->vdev_open_error = - vdev_open(vd->vdev_child[c]); - } else { - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - if (tq == NULL) - goto retry_sync; - - for (int c = 0; c < children; c++) - VERIFY(taskq_dispatch(tq, vdev_open_child, - vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); - - taskq_destroy(tq); - } - - vd->vdev_nonrot = B_TRUE; - - for (int c = 0; c < children; c++) - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; +/* + * Conditionally open a subset of child vdevs. 
+ */ +void +vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + vdev_open_children_impl(vd, open_func); } /* @@ -1631,6 +1839,38 @@ vdev_set_deflate_ratio(vdev_t *vd) } } +/* + * Maximize performance by inflating the configured ashift for top level + * vdevs to be as close to the physical ashift as possible while maintaining + * administrator defined limits and ensuring it doesn't go below the + * logical ashift. + */ +static void +vdev_ashift_optimize(vdev_t *vd) +{ + ASSERT(vd == vd->vdev_top); + + if (vd->vdev_ashift < vd->vdev_physical_ashift) { + vd->vdev_ashift = MIN( + MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), + MAX(zfs_vdev_min_auto_ashift, + vd->vdev_physical_ashift)); + } else { + /* + * If the logical and physical ashifts are the same, then + * we ensure that the top-level vdev's ashift is not smaller + * than our minimum ashift value. For the unusual case + * where logical ashift > physical ashift, we can't cap + * the calculated ashift based on max ashift as that + * would cause failures. + * We still check if we need to increase it to match + * the min ashift. + */ + vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, + vd->vdev_ashift); + } +} + /* * Prepare a virtual device for access. 
*/ @@ -1642,7 +1882,8 @@ vdev_open(vdev_t *vd) uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; - uint64_t ashift = 0; + uint64_t logical_ashift = 0; + uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -1672,8 +1913,8 @@ vdev_open(vdev_t *vd) return (SET_ERROR(ENXIO)); } - error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); - + error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, + &logical_ashift, &physical_ashift); /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy @@ -1691,7 +1932,7 @@ vdev_open(vdev_t *vd) */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, NULL, ENXIO); + error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && @@ -1788,6 +2029,18 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EINVAL)); } + /* + * We can always set the logical/physical ashift members since + * their values are only used to calculate the vdev_ashift when + * the device is first added to the config. These values should + * not be used for anything else since they may change whenever + * the device is reopened and we don't store them in the label. + */ + vd->vdev_physical_ashift = + MAX(physical_ashift, vd->vdev_physical_ashift); + vd->vdev_logical_ashift = MAX(logical_ashift, + vd->vdev_logical_ashift); + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1795,8 +2048,23 @@ vdev_open(vdev_t *vd) */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; + + /* + * If the vdev_ashift was not overridden at creation time, + * then set it the logical ashift and optimize the ashift. 
+ */ if (vd->vdev_ashift == 0) { - vd->vdev_ashift = ashift; /* use detected value */ + vd->vdev_ashift = vd->vdev_logical_ashift; + + if (vd->vdev_logical_ashift > ASHIFT_MAX) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_ASHIFT_TOO_BIG); + return (SET_ERROR(EDOM)); + } + + if (vd->vdev_top == vd) { + vdev_ashift_optimize(vd); + } } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { @@ -1806,16 +2074,17 @@ vdev_open(vdev_t *vd) } } else { /* - * Detect if the alignment requirement has increased. - * We don't want to make the pool unavailable, just - * post an event instead. + * Make sure the alignment required hasn't increased. */ - if (ashift > vd->vdev_top->vdev_ashift && + if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { - zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, - spa, vd, NULL, NULL, 0, 0); + (void) zfs_ereport_post( + FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, + spa, vd, NULL, NULL, 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (SET_ERROR(EDOM)); } - vd->vdev_max_asize = max_asize; } @@ -1851,37 +2120,36 @@ vdev_open(vdev_t *vd) } /* - * Track the min and max ashift values for normal data devices. - * - * DJB - TBD these should perhaps be tracked per allocation class - * (e.g. spa_min_ashift is used to round up post compression buffers) + * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - vd->vdev_alloc_bias == VDEV_BIAS_NONE && - vd->vdev_aux == NULL) { - if (vd->vdev_ashift > spa->spa_max_ashift) - spa->spa_max_ashift = vd->vdev_ashift; - if (vd->vdev_ashift < spa->spa_min_ashift) - spa->spa_min_ashift = vd->vdev_ashift; + vd->vdev_islog == 0 && vd->vdev_aux == NULL) { + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; } /* - * If a leaf vdev has a DTL, and seems healthy, then kick off a - * resilver. 
But don't do this if we are doing a reopen for a scrub, - * since this would just restart the scrub we are already doing. + * If this is a leaf vdev, assess whether a resilver is needed. + * But don't do this if we are doing a reopen for a scrub, since + * this would just restart the scrub we are already doing. */ - if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && - vdev_resilver_needed(vd, NULL, NULL)) { - if (dsl_scan_resilvering(spa->spa_dsl_pool) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, vd); - else - spa_async_request(spa, SPA_ASYNC_RESILVER); - } + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) + dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } +static void +vdev_validate_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_validate_thread = curthread; + vd->vdev_validate_error = vdev_validate(vd); + vd->vdev_validate_thread = NULL; +} + /* * Called once the vdevs are all opened, this routine validates the label * contents. 
This needs to be done before vdev_load() so that we don't @@ -1896,18 +2164,43 @@ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; + int children = vd->vdev_children; if (vdev_validate_skip) return (0); - for (uint64_t c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c]) != 0) + if (children > 0) { + tq = taskq_create("vdev_validate", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + } + + for (uint64_t c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (tq == NULL || vdev_uses_zvols(cvd)) { + vdev_validate_child(cvd); + } else { + VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, + TQ_SLEEP) != TASKQID_INVALID); + } + } + if (tq != NULL) { + taskq_wait(tq); + taskq_destroy(tq); + } + for (int c = 0; c < children; c++) { + int error = vd->vdev_child[c]->vdev_validate_error; + + if (error != 0) return (SET_ERROR(EBADF)); + } + /* * If the device has already failed, or was marked offline, don't do @@ -1930,7 +2223,7 @@ vdev_validate(vdev_t *vd) txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); @@ -2080,6 +2373,7 @@ vdev_validate(vdev_t *vd) static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { + char *old, *new; if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " @@ -2093,6 +2387,29 @@ vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } + + /* + * Our enclosure sysfs path may have changed between imports + */ + 
old = dvd->vdev_enc_sysfs_path; + new = svd->vdev_enc_sysfs_path; + if ((old != NULL && new == NULL) || + (old == NULL && new != NULL) || + ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { + zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " + "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, + old, new); + + if (dvd->vdev_enc_sysfs_path) + spa_strfree(dvd->vdev_enc_sysfs_path); + + if (svd->vdev_enc_sysfs_path) { + dvd->vdev_enc_sysfs_path = spa_strdup( + svd->vdev_enc_sysfs_path); + } else { + dvd->vdev_enc_sysfs_path = NULL; + } + } } /* @@ -2195,9 +2512,11 @@ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; - ASSERTV(spa_t *spa = vd->vdev_spa); + spa_t *spa __maybe_unused = vd->vdev_spa; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd != NULL); + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are @@ -2236,7 +2555,7 @@ vdev_hold(vdev_t *vd) for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } @@ -2247,7 +2566,7 @@ vdev_rele(vdev_t *vd) for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } @@ -2277,9 +2596,20 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && - vd->vdev_aux == &spa->spa_l2cache && - !l2arc_vdev_present(vd)) - l2arc_add_vdev(spa, vd); + vd->vdev_aux == &spa->spa_l2cache) { + /* + * In case the vdev is present we should evict all ARC + * buffers and pointers to log blocks and reclaim their + * space before restoring its contents to L2ARC. 
+ */ + if (l2arc_vdev_present(vd)) { + l2arc_rebuild_vdev(vd, B_TRUE); + } else { + l2arc_add_vdev(spa, vd); + } + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); + } } else { (void) vdev_validate(vd); } @@ -2304,7 +2634,7 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); - return (error ? error : ENXIO); + return (error ? error : SET_ERROR(ENXIO)); } /* @@ -2483,15 +2813,12 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) /* * While we are loading the pool, the DTLs have not been loaded yet. - * Ignore the DTLs and try all devices. This avoids a recursive - * mutex enter on the vdev_dtl_lock, and also makes us try hard - * when loading the pool (relying on the checksum to ensure that - * we get the right data -- note that we while loading, we are - * only reading the MOS, which is always checksummed). + * This isn't a problem but it can result in devices being tried + * which are known to not have the data. In which case, the import + * is relying on the checksum to ensure that we get the right data. + * Note that while importing we are only reading the MOS, which is + * always checksummed. */ - if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) - return (B_FALSE); - mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); @@ -2514,10 +2841,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) } /* - * Returns B_TRUE if vdev determines offset needs to be resilvered. + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. */ boolean_t -vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + /* Set by sequential resilver. 
*/ + if (phys_birth == TXG_UNKNOWN) + return (B_TRUE); + + return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); +} + +/* + * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. + */ +boolean_t +vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); @@ -2525,7 +2868,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) vd->vdev_ops->vdev_op_leaf) return (B_TRUE); - return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); + return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, + phys_birth)); } /* @@ -2534,14 +2878,11 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) static uint64_t vdev_dtl_min(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_start - 1); + return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* @@ -2550,14 +2891,11 @@ vdev_dtl_min(vdev_t *vd) static uint64_t vdev_dtl_max(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_end); + return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* @@ -2569,12 +2907,8 @@ vdev_dtl_max(vdev_t *vd) * excise the DTLs. 
*/ static boolean_t -vdev_dtl_should_excise(vdev_t *vd) +vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { - spa_t *spa = vd->vdev_spa; - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - - ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2583,23 +2917,52 @@ vdev_dtl_should_excise(vdev_t *vd) if (vd->vdev_resilver_deferred) return (B_FALSE); - if (vd->vdev_resilver_txg == 0 || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) + if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); - /* - * When a resilver is initiated the scan will assign the scn_max_txg - * value to the highest txg value that exists in all DTLs. If this - * device's max DTL is not part of this scan (i.e. it is not in - * the range (scn_min_txg, scn_max_txg] then it is not eligible - * for excision. - */ - if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { - ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); - ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); - ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); - return (B_TRUE); + if (rebuild_done) { + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + /* Rebuild not initiated by attach */ + if (vd->vdev_rebuild_txg == 0) + return (B_TRUE); + + /* + * When a rebuild completes without error then all missing data + * up to the rebuild max txg has been reconstructed and the DTL + * is eligible for excision. 
+ */ + if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && + vdev_dtl_max(vd) <= vrp->vrp_max_txg) { + ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); + ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); + return (B_TRUE); + } + } else { + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; + + /* Resilver not initiated by attach */ + if (vd->vdev_resilver_txg == 0) + return (B_TRUE); + + /* + * When a resilver is initiated the scan will assign the + * scn_max_txg value to the highest txg value that exists + * in all DTLs. If this device's max DTL is not part of this + * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] + * then it is not eligible for excision. + */ + if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { + ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); + ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); + return (B_TRUE); + } } + return (B_FALSE); } @@ -2608,7 +2971,8 @@ vdev_dtl_should_excise(vdev_t *vd) * write operations will be issued to the pool. 
*/ void -vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) +vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; @@ -2618,38 +2982,65 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, - scrub_txg, scrub_done); + scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + boolean_t check_excise = B_FALSE; + boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* - * If requested, pretend the scan completed cleanly. + * If requested, pretend the scan or rebuild completed cleanly. */ - if (zfs_scan_ignore_errors && scn) - scn->scn_phys.scn_errors = 0; + if (zfs_scan_ignore_errors) { + if (scn != NULL) + scn->scn_phys.scn_errors = 0; + if (vr != NULL) + vr->vr_rebuild_phys.vrp_errors = 0; + } + + if (scrub_txg != 0 && + !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + wasempty = B_FALSE; + zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " + "dtl:%llu/%llu errors:%llu", + (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, + (u_longlong_t)scrub_txg, spa->spa_scrub_started, + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd), + (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); + } /* - * If we've completed a scan cleanly then determine - * if this vdev should remove any DTLs. We only want to - * excise regions on vdevs that were available during - * the entire duration of this scan. + * If we've completed a scrub/resilver or a rebuild cleanly + * then determine if this vdev should remove any DTLs. 
We + * only want to excise regions on vdevs that were available + * during the entire duration of this scan. */ - if (scrub_txg != 0 && - (spa->spa_scrub_started || - (scn != NULL && scn->scn_phys.scn_errors == 0)) && - vdev_dtl_should_excise(vd)) { + if (rebuild_done && + vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { + check_excise = B_TRUE; + } else { + if (spa->spa_scrub_started || + (scn != NULL && scn->scn_phys.scn_errors == 0)) { + check_excise = B_TRUE; + } + } + + if (scrub_txg && check_excise && + vdev_dtl_should_excise(vd, rebuild_done)) { /* - * We completed a scrub up to scrub_txg. If we - * did it without rebooting, then the scrub dtl - * will be valid, so excise the old region and - * fold in the scrub dtl. Otherwise, leave the - * dtl as-is if there was an error. + * We completed a scrub, resilver or rebuild up to + * scrub_txg. If we did it without rebooting, then + * the scrub dtl will be valid, so excise the old + * region and fold in the scrub dtl. Otherwise, + * leave the dtl as-is if there was an error. 
* * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference @@ -2670,6 +3061,14 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); + + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + zfs_dbgmsg("update DTL_MISSING:%llu/%llu", + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd)); + } else if (!wasempty) { + zfs_dbgmsg("DTL_MISSING is now empty"); + } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], @@ -2684,15 +3083,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* - * If the vdev was resilvering and no longer has any - * DTLs then reset its resilvering flag and dirty + * If the vdev was resilvering or rebuilding and no longer + * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ - if (txg != 0 && vd->vdev_resilver_txg != 0 && + if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { - vd->vdev_resilver_txg = 0; - vdev_config_dirty(vd->vdev_top); + if (vd->vdev_rebuild_txg != 0) { + vd->vdev_rebuild_txg = 0; + vdev_config_dirty(vd->vdev_top); + } else if (vd->vdev_resilver_txg != 0) { + vd->vdev_resilver_txg = 0; + vdev_config_dirty(vd->vdev_top); + } } mutex_exit(&vd->vdev_dtl_lock); @@ -2710,8 +3114,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. 
non-zero */ - else if (vd->vdev_nparity != 0) - minref = vd->vdev_nparity + 1; /* RAID-Z */ + else if (vdev_get_nparity(vd) != 0) + minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); @@ -2732,21 +3136,35 @@ vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; + range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); + /* + * If the dtl cannot be sync'd there is no need to open it. + */ + if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) + return (0); + error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); - mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(vd->vdev_dtl_sm, - vd->vdev_dtl[DTL_MISSING], SM_ALLOC); - mutex_exit(&vd->vdev_dtl_lock); + rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); + if (error == 0) { + mutex_enter(&vd->vdev_dtl_lock); + range_tree_walk(rt, range_tree_add, + vd->vdev_dtl[DTL_MISSING]); + mutex_exit(&vd->vdev_dtl_lock); + } + + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); return (error); } @@ -2830,7 +3248,7 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) } } -void +static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; @@ -2870,7 +3288,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); + new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, @@ -2878,13 +3296,13 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = range_tree_create(NULL, NULL); + rtsync = range_tree_create(NULL, RANGE_SEG64, 
NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); - space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); + space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); @@ -2928,13 +3346,15 @@ vdev_dtl_required(vdev_t *vd) * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); - if (!required && zio_injection_enabled) - required = !!zio_handle_device_injection(vd, NULL, ECHILD); + if (!required && zio_injection_enabled) { + required = !!zio_handle_device_injection(vd, NULL, + SET_ERROR(ECHILD)); + } return (required); } @@ -3007,18 +3427,46 @@ vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) int vdev_load(vdev_t *vd) { + int children = vd->vdev_children; int error = 0; + taskq_t *tq = NULL; + + /* + * It's only worthwhile to use the taskq for the root vdev, because the + * slow part is metaslab_init, and that only happens for top-level + * vdevs. + */ + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { + tq = taskq_create("vdev_load", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + } /* * Recursively load all children. 
*/ for (int c = 0; c < vd->vdev_children; c++) { - error = vdev_load(vd->vdev_child[c]); - if (error != 0) { - return (error); + vdev_t *cvd = vd->vdev_child[c]; + + if (tq == NULL || vdev_uses_zvols(cvd)) { + cvd->vdev_load_error = vdev_load(cvd); + } else { + VERIFY(taskq_dispatch(tq, vdev_load_child, + cvd, TQ_SLEEP) != TASKQID_INVALID); } } + if (tq != NULL) { + taskq_wait(tq); + taskq_destroy(tq); + } + + for (int c = 0; c < vd->vdev_children; c++) { + int error = vd->vdev_child[c]->vdev_load_error; + + if (error != 0) + return (error); + } + vdev_set_deflate_ratio(vd); /* @@ -3028,11 +3476,33 @@ vdev_load(vdev_t *vd) spa_t *spa = vd->vdev_spa; char bias_str[64]; - if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), - bias_str) == 0) { + bias_str); + if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); + } else if (error != ENOENT) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + return (error); + } + } + + /* + * Load any rebuild state from the top-level vdev zap. 
+ */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + error = vdev_rebuild_load(vd); + if (error && error != ENOTSUP) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " + "failed [error=%d]", error); + return (error); } } @@ -3175,6 +3645,26 @@ vdev_validate_aux(vdev_t *vd) return (0); } +static void +vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) +{ + objset_t *mos = spa_meta_objset(vd->vdev_spa); + + if (vd->vdev_top_zap == 0) + return; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); + if (err == ENOENT) + return; + VERIFY0(err); + + VERIFY0(dmu_object_free(mos, object, tx)); + VERIFY0(zap_remove(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); +} + /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. @@ -3202,6 +3692,7 @@ vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); + vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } @@ -3237,8 +3728,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) != NULL) metaslab_sync_done(msp, txg); - if (reassess) + if (reassess) { metaslab_sync_reassess(vd->vdev_mg); + if (vd->vdev_log_mg != NULL) + metaslab_sync_reassess(vd->vdev_log_mg); + } } void @@ -3317,10 +3811,10 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; @@ -3399,10 +3893,10 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 
spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. @@ -3436,10 +3930,10 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -3491,9 +3985,14 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) } mutex_exit(&vd->vdev_initialize_lock); - /* Restart trimming if necessary */ + /* + * Restart trimming if necessary. We do not restart trimming for cache + * devices here. This is triggered by l2arc_rebuild_vdev() + * asynchronously for the whole device or in l2arc_evict() as it evicts + * space for upcoming writes. 
+ */ mutex_enter(&vd->vdev_trim_lock); - if (vdev_writeable(vd) && + if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, @@ -3521,9 +4020,12 @@ top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + + if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; @@ -3541,7 +4043,8 @@ top: */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) - return (spa_vdev_state_exit(spa, NULL, EBUSY)); + return (spa_vdev_state_exit(spa, NULL, + SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations @@ -3553,6 +4056,7 @@ top: /* * Prevent any future allocations. 
*/ + ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); @@ -3598,7 +4102,8 @@ top: vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); - return (spa_vdev_state_exit(spa, NULL, EBUSY)); + return (spa_vdev_state_exit(spa, NULL, + SET_ERROR(EBUSY))); } /* @@ -3682,14 +4187,11 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); - if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) { - if (dsl_scan_resilvering(spa->spa_dsl_pool) && - spa_feature_is_enabled(spa, - SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, vd); - else - spa_async_request(spa, SPA_ASYNC_RESILVER); - } + /* If a resilver isn't required, check if vdevs can be culled */ + if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } @@ -3703,6 +4205,9 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; + + /* Clear recent error events cache (i.e. duplicate events tracking) */ + zfs_ereport_clear(spa, vd); } boolean_t @@ -3771,6 +4276,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio) static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { + /* + * Exclude the dRAID spare when aggregating to avoid double counting + * the ops and bytes. These IOs are counted by the physical leaves. 
+ */ + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return; + for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; @@ -3863,7 +4375,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); - } } else { /* @@ -3894,6 +4405,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; @@ -3920,7 +4432,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; + + /* Set when there is a deferred resilver. */ + vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } + /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab @@ -3932,13 +4448,27 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } + + vs->vs_configured_ashift = vd->vdev_top != NULL + ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; + vs->vs_logical_ashift = vd->vdev_logical_ashift; + vs->vs_physical_ashift = vd->vdev_physical_ashift; + + /* + * Report fragmentation and rebuild progress for top-level, + * non-auxiliary, concrete devices. + */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { + /* + * The vdev fragmentation rating doesn't take into + * account the embedded slog metaslab (vdev_log_mg). + * Since it's only one metaslab, it would have a tiny + * impact on the overall fragmentation. + */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? 
vd->vdev_mg->mg_fragmentation : 0; } - if (vd->vdev_ops->vdev_op_leaf) - vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } vdev_get_stats_ex_impl(vd, vs, vsx); @@ -4019,17 +4549,39 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { + /* + * Repair is the result of a resilver issued by the + * scan thread (spa_sync). + */ if (flags & ZIO_FLAG_SCAN_THREAD) { - dsl_scan_phys_t *scn_phys = - &spa->spa_dsl_pool->dp_scan->scn_phys; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; - /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } + /* + * Repair is the result of a rebuild issued by the + * rebuild thread (vdev_rebuild_thread). To avoid + * double counting repaired bytes the virtual dRAID + * spare vdev is excluded from the processed bytes. + */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD) { + vdev_t *tvd = vd->vdev_top; + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; + + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { + atomic_add_64(rebuilt, psize); + } + vs->vs_rebuild_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -4041,6 +4593,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; + zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as @@ -4050,19 +4603,41 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; + /* + * Solely for the purposes of 'zpool iostat -lqrw' + * reporting use the priority to categorize the IO. 
+ * Only the following are reported to user space: + * + * ZIO_PRIORITY_SYNC_READ, + * ZIO_PRIORITY_SYNC_WRITE, + * ZIO_PRIORITY_ASYNC_READ, + * ZIO_PRIORITY_ASYNC_WRITE, + * ZIO_PRIORITY_SCRUB, + * ZIO_PRIORITY_TRIM, + * ZIO_PRIORITY_REBUILD. + */ + if (priority == ZIO_PRIORITY_INITIALIZING) { + ASSERT3U(type, ==, ZIO_TYPE_WRITE); + priority = ZIO_PRIORITY_ASYNC_WRITE; + } else if (priority == ZIO_PRIORITY_REMOVAL) { + priority = ((type == ZIO_TYPE_WRITE) ? + ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_ASYNC_READ); + } + vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { - vsx->vsx_agg_histo[zio->io_priority] + vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { - vsx->vsx_ind_histo[zio->io_priority] + vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { - vsx->vsx_queue_histo[zio->io_priority] + vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; @@ -4096,8 +4671,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; - if (spa->spa_load_state == SPA_LOAD_NONE && - type == ZIO_TYPE_WRITE && txg != 0 && + if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { @@ -4166,7 +4740,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its - * childrens', thus not accurate enough for us. + * children's, thus not accurate enough for us. 
*/ dspace_delta = vdev_deflated_space(vd, space_delta); @@ -4510,8 +5084,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } - zfs_ereport_post(class, spa, vd, NULL, NULL, - save_state, 0); + (void) zfs_ereport_post(class, spa, vd, NULL, NULL, + save_state); } /* Erase any notion of persistent removed state */ @@ -4563,10 +5137,8 @@ vdev_is_bootable(vdev_t *vd) if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || - strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { + if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); - } } for (int c = 0; c < vd->vdev_children; c++) { @@ -4664,7 +5236,7 @@ vdev_deadman(vdev_t *vd, char *tag) zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %d active IOs", + zfs_dbgmsg("slow vdev: %s has %lu active IOs", vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); /* @@ -4682,44 +5254,84 @@ vdev_deadman(vdev_t *vd, char *tag) } void -vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) +vdev_defer_resilver(vdev_t *vd) { - for (uint64_t i = 0; i < vd->vdev_children; i++) - vdev_set_deferred_resilver(spa, vd->vdev_child[i]); - - if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { - return; - } + ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; - spa->spa_resilver_deferred = B_TRUE; + vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* - * Translate a logical range to the physical range for the specified vdev_t. - * This function is initially called with a leaf vdev and will walk each - * parent vdev until it reaches a top-level vdev. Once the top-level is - * reached the physical range is initialized and the recursive function - * begins to unwind. As it unwinds it calls the parent's vdev specific - * translation function to do the real conversion. 
+ * Clears the resilver deferred flag on all leaf devs under vd. Returns + * B_TRUE if we have devices that need to be resilvered and are available to + * accept resilver I/Os. + */ +boolean_t +vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) +{ + boolean_t resilver_needed = B_FALSE; + spa_t *spa = vd->vdev_spa; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); + } + + if (vd == spa->spa_root_vdev && + spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { + spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); + vdev_config_dirty(vd); + spa->spa_resilver_deferred = B_FALSE; + return (resilver_needed); + } + + if (!vdev_is_concrete(vd) || vd->vdev_aux || + !vd->vdev_ops->vdev_op_leaf) + return (resilver_needed); + + vd->vdev_resilver_deferred = B_FALSE; + + return (!vdev_is_dead(vd) && !vd->vdev_offline && + vdev_resilver_needed(vd, NULL, NULL)); +} + +boolean_t +vdev_xlate_is_empty(range_seg64_t *rs) +{ + return (rs->rs_start == rs->rs_end); +} + +/* + * Translate a logical range to the first contiguous physical range for the + * specified vdev_t. This function is initially called with a leaf vdev and + * will walk each parent vdev until it reaches a top-level vdev. Once the + * top-level is reached the physical range is initialized and the recursive + * function begins to unwind. As it unwinds it calls the parent's vdev + * specific translation function to do the real conversion. 
*/ void -vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { - vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, + remain_rs); } else { /* - * We've reached the top-level vdev, initialize the - * physical range to the logical range and start to - * unwind. + * We've reached the top-level vdev, initialize the physical + * range to the logical range and set an empty remaining + * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; + + remain_rs->rs_start = logical_rs->rs_start; + remain_rs->rs_end = logical_rs->rs_start; + return; } @@ -4729,17 +5341,69 @@ vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) /* * As this recursive function unwinds, translate the logical - * range into its physical components by calling the - * vdev specific translate function. + * range into its physical and any remaining components by calling + * the vdev specific translate function. */ - range_seg_t intermediate = { { { 0, 0 } } }; - pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + range_seg64_t intermediate = { 0 }; + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } -#if defined(_KERNEL) +void +vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg) +{ + range_seg64_t iter_rs = *logical_rs; + range_seg64_t physical_rs; + range_seg64_t remain_rs; + + while (!vdev_xlate_is_empty(&iter_rs)) { + + vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); + + /* + * With raidz and dRAID, it's possible that the logical range + * does not live on this leaf vdev. 
Only when there is a non- + * zero physical size call the provided function. + */ + if (!vdev_xlate_is_empty(&physical_rs)) + func(arg, &physical_rs); + + iter_rs = remain_rs; + } +} + +/* + * Look at the vdev tree and determine whether any devices are currently being + * replaced. + */ +boolean_t +vdev_replace_in_progress(vdev_t *vdev) +{ + ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); + + if (vdev->vdev_ops == &vdev_replacing_ops) + return (B_TRUE); + + /* + * A 'spare' vdev indicates that we have a replace in progress, unless + * it has exactly two children, and the second, the hot spare, has + * finished being resilvered. + */ + if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || + !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) + return (B_TRUE); + + for (int i = 0; i < vdev->vdev_children; i++) { + if (vdev_replace_in_progress(vdev->vdev_child[i])) + return (B_TRUE); + } + + return (B_FALSE); +} + EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); @@ -4747,36 +5411,43 @@ EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ -module_param(zfs_vdev_default_ms_count, int, 0644); -MODULE_PARM_DESC(zfs_vdev_default_ms_count, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); -module_param(zfs_vdev_min_ms_count, int, 0644); -MODULE_PARM_DESC(zfs_vdev_min_ms_count, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW, + "Default limit for metaslab size"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); -module_param(zfs_vdev_ms_count_limit, int, 0644); -MODULE_PARM_DESC(zfs_vdev_ms_count_limit, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); -module_param(zfs_slow_io_events_per_second, uint, 0644); 
-MODULE_PARM_DESC(zfs_slow_io_events_per_second, +ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); -module_param(zfs_checksum_events_per_second, uint, 0644); -MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events " - "to this many checksum errors per second (do not set below zed" - "threshold)."); +ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, + "Rate limit checksum events to this many checksum errors per second " + "(do not set below zed threshold)."); -module_param(zfs_scan_ignore_errors, int, 0644); -MODULE_PARM_DESC(zfs_scan_ignore_errors, +ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); -module_param(vdev_validate_skip, int, 0644); -MODULE_PARM_DESC(vdev_validate_skip, +ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); -module_param(zfs_nocacheflush, int, 0644); -MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes"); +ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, + "Disable cache flushes"); + +ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW, + "Minimum number of metaslabs required to dedicate one for log blocks"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, + param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, + "Minimum ashift used when creating new top-level vdevs"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, + param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, + "Maximum ashift used when optimizing for logical -> physical sector " + "size on new top-level vdevs"); /* END CSTYLED */ -#endif diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 0f1d9448b5..6e82184b80 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -46,7 +46,7 @@ * terribly wasteful of bandwidth. 
A more intelligent version of the cache * could keep track of access patterns and not do read-ahead unless it sees * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A futher enhancement could take advantage of + * metadata I/O is inflated. A further enhancement could take advantage of * more semantic information about the I/O. And it could use something * faster than an AVL tree; that was chosen solely for convenience. * @@ -111,7 +111,7 @@ vdev_cache_offset_compare(const void *a1, const void *a2) const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - return (AVL_CMP(ve1->ve_offset, ve2->ve_offset)); + return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); } static int @@ -120,7 +120,7 @@ vdev_cache_lastused_compare(const void *a1, const void *a2) const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused); + int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); if (likely(cmp)) return (cmp); @@ -254,7 +254,7 @@ vdev_cache_read(zio_t *zio) vdev_cache_entry_t *ve, *ve_search; uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); zio_t *fio; - ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); + uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); @@ -425,13 +425,13 @@ vdev_cache_stat_fini(void) } } -#if defined(_KERNEL) -module_param(zfs_vdev_cache_max, int, 0644); -MODULE_PARM_DESC(zfs_vdev_cache_max, "Inflate reads small than max"); +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, INT, ZMOD_RW, + "Inflate reads small than max"); -module_param(zfs_vdev_cache_size, int, 0444); -MODULE_PARM_DESC(zfs_vdev_cache_size, "Total size of the per-disk cache"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, INT, ZMOD_RD, + "Total size 
of the per-disk cache"); -module_param(zfs_vdev_cache_bshift, int, 0644); -MODULE_PARM_DESC(zfs_vdev_cache_bshift, "Shift size to inflate reads too"); -#endif +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, INT, ZMOD_RW, + "Shift size to inflate reads too"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c new file mode 100644 index 0000000000..b8f82d52e8 --- /dev/null +++ b/module/zfs/vdev_draid.c @@ -0,0 +1,2784 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef ZFS_DEBUG +#include /* For vdev_xlate() in vdev_draid_io_verify() */ +#endif + +/* + * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is + * comprised of multiple raidz redundancy groups which are spread over the + * dRAID children. To ensure an even distribution, and avoid hot spots, a + * permutation mapping is applied to the order of the dRAID children. 
+ * This mixing effectively distributes the parity columns evenly over all + * of the disks in the dRAID. + * + * This is beneficial because it means when resilvering all of the disks + * can participate thereby increasing the available IOPs and bandwidth. + * Furthermore, by reserving a small fraction of each child's total capacity + * virtual distributed spare disks can be created. These spares similarly + * benefit from the performance gains of spanning all of the children. The + * consequence of which is that resilvering to a distributed spare can + * substantially reduce the time required to restore full parity to a pool + * with failed disks. + * + * === dRAID group layout === + * + * First, let's define a "row" in the configuration to be a 16M chunk from + * each physical drive at the same offset. This is the minimum allowable + * size since it must be possible to store a full 16M block when there is + * only a single data column. Next, we define a "group" to be a set of + * sequential disks containing both the parity and data columns. We allow + * groups to span multiple rows in order to align any group size to any + * number of physical drives. Finally, a "slice" is comprised of the rows + * which contain the target number of groups. The permutation mappings + * are applied in a round robin fashion to each slice. + * + * Given D+P drives in a group (including parity drives) and C-S physical + * drives (not including the spare drives), we can distribute the groups + * across R rows without remainder by sizing a slice to the least common + * multiple of D+P and C-S drives; i.e. ngroups = LCM(D+P, C-S) / (D+P). + * + * In the example below, there are C=14 physical drives in the configuration + * with S=2 drives worth of spare capacity. Each group has a width of 9 + * which includes D=8 data and P=1 parity drive. There are 4 groups and + * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice + * size is 576M (144M * 4). 
When allocating from a dRAID each group is + * filled before moving on to the next as shown in slice0 below. + * + * data disks (8 data + 1 parity) spares (2) + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0 + * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | | group 0 | group 1..| | + * | +-----------------------------------+-----------+-------| + * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r + * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o + * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w + * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0 + * s +-----------------------+-----------------------+-------+ + * l | ..group 1 | group 2.. | | + * i +-----------------------+-----------------------+-------+ + * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r + * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o + * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w + * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1 + * | +-----------+-----------+-----------------------+-------+ + * | |..group 2 | group 3 | | + * | +-----------+-----------+-----------------------+-------+ + * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r + * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o + * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w + * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 4 | group 5..| | row 3 + * i +-----------------------+-----------+-----------+-------| + * c | ..group 5 | group 6.. 
| | row 4 + * e +-----------+-----------+-----------------------+-------+ + * 1 |..group 6 | group 7 | | row 5 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 8 | group 9..| | row 6 + * i +-----------------------------------------------+-------| + * c | ..group 9 | group 10.. | | row 7 + * e +-----------------------+-----------------------+-------+ + * 2 |..group 10 | group 11 | | row 8 + * +-----------+-----------------------------------+-------+ + * + * This layout has several advantages over requiring that each row contain + * a whole number of groups. + * + * 1. The group count is not a relevant parameter when defining a dRAID + * layout. Only the group width is needed, and *all* groups will have + * the desired size. + * + * 2. All possible group widths (<= physical disk count) can be supported. + * + * 3. The logic within vdev_draid.c is simplified when the group width is + * the same for all groups (although some of the logic around computing + * permutation numbers and drive offsets is more complicated). + * + * N.B. The following array describes all valid dRAID permutation maps. + * Each row is used to generate a permutation map for a different number + * of children from a unique seed. The seeds were generated and carefully + * evaluated by the 'draid' utility in order to provide balanced mappings. + * In addition to the seed a checksum of the in-memory mapping is stored + * for verification. + * + * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed, + * with a given permutation map) is the ratio of the amounts of I/O that will + * be sent to the least and most busy disks when resilvering. The average + * imbalance ratio (of a given number of disks and permutation map) is the + * average of the ratios of all possible single and double disk failures. 
+ * + * In order to achieve a low imbalance ratio the number of permutations in + * the mapping must be significantly larger than the number of children. + * For dRAID the number of permutations has been limited to 512 to minimize + * the map size. This does result in a gradually increasing imbalance ratio + * as seen in the table below. Increasing the number of permutations for + * larger child counts would reduce the imbalance ratio. However, in practice + * when there are a large number of children each child is responsible for + * fewer total IOs so it's less of a concern. + * + * Note these values are hard coded and must never be changed. Existing + * pools depend on the same mapping always being generated in order to + * read and write from the correct locations. Any change would make + * existing pools completely inaccessible. + */ +static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = { + { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */ + { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */ + { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */ + { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */ + { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */ + { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */ + { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */ + { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */ + { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */ + { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */ + { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */ + { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */ + { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */ + { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */ + { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */ + { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */ + { 18, 256, 
0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */ + { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */ + { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */ + { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */ + { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */ + { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */ + { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */ + { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */ + { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */ + { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */ + { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */ + { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */ + { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */ + { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */ + { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */ + { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */ + { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */ + { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */ + { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */ + { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */ + { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */ + { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */ + { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */ + { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */ + { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */ + { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */ + { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */ + { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */ + { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */ + { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */ + 
{ 48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */ + { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */ + { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */ + { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */ + { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */ + { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */ + { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */ + { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */ + { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */ + { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */ + { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */ + { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */ + { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */ + { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */ + { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */ + { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */ + { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */ + { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */ + { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */ + { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */ + { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */ + { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */ + { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */ + { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */ + { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */ + { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */ + { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */ + { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */ + { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */ + { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 
1.527 */ + { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */ + { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */ + { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */ + { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */ + { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */ + { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */ + { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */ + { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */ + { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */ + { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */ + { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */ + { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */ + { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */ + { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */ + { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */ + { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */ + { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */ + { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */ + { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */ + { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */ + { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */ + { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */ + { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */ + { 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */ + { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */ + { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */ + { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */ + { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */ + { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */ + { 107, 512, 0xca09c50ccee2d054, 
0x00000b1c359c047d }, /* 1.788 */ + { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */ + { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */ + { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */ + { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */ + { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */ + { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */ + { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */ + { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */ + { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */ + { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */ + { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */ + { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */ + { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */ + { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */ + { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */ + { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */ + { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */ + { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */ + { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */ + { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */ + { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */ + { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */ + { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */ + { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */ + { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */ + { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */ + { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */ + { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */ + { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 
2.005 */ + { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */ + { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */ + { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */ + { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */ + { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */ + { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */ + { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */ + { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */ + { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */ + { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */ + { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */ + { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */ + { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */ + { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */ + { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */ + { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */ + { 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */ + { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */ + { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */ + { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */ + { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */ + { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */ + { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */ + { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */ + { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */ + { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */ + { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */ + { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */ + { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */ + { 166, 512, 
0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */ + { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */ + { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */ + { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */ + { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */ + { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */ + { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */ + { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */ + { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */ + { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */ + { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */ + { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */ + { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */ + { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */ + { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */ + { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */ + { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */ + { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */ + { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */ + { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */ + { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */ + { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */ + { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */ + { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */ + { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */ + { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */ + { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */ + { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */ + { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */ + { 195, 512, 0x08d903b4daedc2e0, 
0x0000250d1e15886c }, /* 2.363 */ + { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */ + { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */ + { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */ + { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */ + { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */ + { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */ + { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */ + { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */ + { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */ + { 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */ + { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */ + { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */ + { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */ + { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */ + { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */ + { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */ + { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */ + { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */ + { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */ + { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */ + { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */ + { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */ + { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */ + { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */ + { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */ + { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */ + { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */ + { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */ + { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 
3.259 */ + { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */ + { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */ + { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */ + { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */ + { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */ + { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */ + { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */ + { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */ + { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */ + { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */ + { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */ + { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */ + { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */ + { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */ + { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */ + { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */ + { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */ + { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */ + { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */ + { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */ + { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */ + { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */ + { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */ + { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */ + { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */ + { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */ + { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */ + { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */ + { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */ + { 254, 512, 
0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */ + { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */ +}; + +/* + * Verify the map is valid. Each device index must appear exactly + * once in every row, and the permutation array checksum must match. + */ +static int +verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms, + uint64_t checksum) +{ + int countssz = sizeof (uint16_t) * children; + uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP); + + for (int i = 0; i < nperms; i++) { + for (int j = 0; j < children; j++) { + uint8_t val = perms[(i * children) + j]; + + if (val >= children || counts[val] != i) { + kmem_free(counts, countssz); + return (EINVAL); + } + + counts[val]++; + } + } + + if (checksum != 0) { + int permssz = sizeof (uint8_t) * children * nperms; + zio_cksum_t cksum; + + fletcher_4_native_varsize(perms, permssz, &cksum); + + if (checksum != cksum.zc_word[0]) { + kmem_free(counts, countssz); + return (ECKSUM); + } + } + + kmem_free(counts, countssz); + + return (0); +} + +/* + * Generate the permutation array for the draid_map_t. These maps control + * the placement of all data in a dRAID. Therefore it's critical that the + * seed always generates the same mapping. We provide our own pseudo-random + * number generator for this purpose. + */ +int +vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) +{ + VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN); + VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); + VERIFY3U(map->dm_seed, !=, 0); + VERIFY3U(map->dm_nperms, !=, 0); + VERIFY3P(map->dm_perms, ==, NULL); + +#ifdef _KERNEL + /* + * The kernel code always provides both a map_seed and checksum. + * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide + * a zero checksum when generating new candidate maps. 
+ */ + VERIFY3U(map->dm_checksum, !=, 0); +#endif + uint64_t children = map->dm_children; + uint64_t nperms = map->dm_nperms; + int rowsz = sizeof (uint8_t) * children; + int permssz = rowsz * nperms; + uint8_t *perms; + + /* Allocate the permutation array */ + perms = vmem_alloc(permssz, KM_SLEEP); + + /* Setup an initial row with a known pattern */ + uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP); + for (int i = 0; i < children; i++) + initial_row[i] = i; + + uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; + uint8_t *current_row, *previous_row = initial_row; + + /* + * Perform a Fisher-Yates shuffle of each row using the previous + * row as the starting point. An initial_row with known pattern + * is used as the input for the first row. + */ + for (int i = 0; i < nperms; i++) { + current_row = &perms[i * children]; + memcpy(current_row, previous_row, rowsz); + + for (int j = children - 1; j > 0; j--) { + uint64_t k = vdev_draid_rand(draid_seed) % (j + 1); + uint8_t val = current_row[j]; + current_row[j] = current_row[k]; + current_row[k] = val; + } + + previous_row = current_row; + } + + kmem_free(initial_row, rowsz); + + int error = verify_perms(perms, children, nperms, map->dm_checksum); + if (error) { + vmem_free(perms, permssz); + return (error); + } + + *permsp = perms; + + return (0); +} + +/* + * Lookup the fixed draid_map_t for the requested number of children. + */ +int +vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) +{ + for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + if (draid_maps[i].dm_children == children) { + *mapp = &draid_maps[i]; + return (0); + } + } + + return (ENOENT); +} + +/* + * Lookup the permutation array and iteration id for the provided offset. 
+ */ +static void +vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, + uint8_t **base, uint64_t *iter) +{ + uint64_t ncols = vdc->vdc_children; + uint64_t poff = pindex % (vdc->vdc_nperms * ncols); + + *base = vdc->vdc_perms + (poff / ncols) * ncols; + *iter = poff % ncols; +} + +static inline uint64_t +vdev_draid_permute_id(vdev_draid_config_t *vdc, + uint8_t *base, uint64_t iter, uint64_t index) +{ + return ((base[index] + iter) % vdc->vdc_children); +} + +/* + * Return the asize which is the psize rounded up to a full group width. + * i.e. vdev_draid_psize_to_asize(). + */ +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_ashift; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1; + uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; + + ASSERT3U(asize, !=, 0); + ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); + + return (asize); +} + +/* + * Deflate the asize to the psize, this includes stripping parity. + */ +uint64_t +vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT0(asize % vdc->vdc_groupwidth); + + return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata); +} + +/* + * Convert a logical offset to the corresponding group number. + */ +static uint64_t +vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (offset / vdc->vdc_groupsz); +} + +/* + * Convert a group number to the logical starting offset for that group. + */ +static uint64_t +vdev_draid_group_to_offset(vdev_t *vd, uint64_t group) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (group * vdc->vdc_groupsz); +} + +/* + * Full stripe writes. When writing, all columns (D+P) are required. 
Parity + * is calculated over all the columns, including empty zero filled sectors, + * and each is written to disk. While only the data columns are needed for + * a normal read, all of the columns are required for reconstruction when + * performing a sequential resilver. + * + * For "big columns" it's sufficient to map the correct range of the zio ABD. + * Partial columns require allocating a gang ABD in order to zero fill the + * empty sectors. When the column is empty a zero filled sector must be + * mapped. In all cases the data ABDs must be the same size as the parity + * ABDs (e.g. rc->rc_size == parity_size). + */ +static void +vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t abd_off = abd_offset; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small write), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + rc->rc_abd = abd_get_zeros(skip_size); + } else if (rc->rc_size == parity_size) { + /* this is a "big column" */ + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, abd_off, rc->rc_size); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + rc->rc_abd = abd_alloc_gang(); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + zio->io_abd, abd_off, rc->rc_size), B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size), + B_TRUE); + } + + ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size); + + abd_off += rc->rc_size; + rc->rc_size = parity_size; + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); +} + +/* + * Scrub/resilver reads. 
In order to store the contents of the skip sectors + * an additional ABD is allocated. The columns are handled in the same way + * as a full stripe write except instead of using the zero ABD the newly + * allocated skip ABD is used to back the skip sectors. In all cases the + * data ABD must be the same size as the parity ABDs. + */ +static void +vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t abd_off = abd_offset; + uint64_t skip_off = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, ==, NULL); + + if (rr->rr_nempty > 0) { + rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, + B_FALSE); + } + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small read), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, + skip_off, skip_size); + skip_off += skip_size; + } else if (rc->rc_size == parity_size) { + /* this is a "big column" */ + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, abd_off, rc->rc_size); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + rc->rc_abd = abd_alloc_gang(); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + zio->io_abd, abd_off, rc->rc_size), B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + rr->rr_abd_empty, skip_off, skip_size), B_TRUE); + skip_off += skip_size; + } + + uint64_t abd_size = abd_get_size(rc->rc_abd); + ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); + + /* + * Increase rc_size so the skip ABD is included in subsequent + * parity calculations. 
+ */ + abd_off += rc->rc_size; + rc->rc_size = abd_size; + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); + ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); +} + +/* + * Normal reads. In this common case only the columns containing data + * are read in to the zio ABDs. Neither the parity columns or empty skip + * sectors are read unless the checksum fails verification. In which case + * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand + * the raid map in order to allow reconstruction using the parity data and + * skip sectors. + */ +static void +vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t abd_off = abd_offset; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size > 0) { + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, abd_off, rc->rc_size); + abd_off += rc->rc_size; + } + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); +} + +/* + * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key + * difference is that an ABD is allocated to back skip sectors so they may + * be read in to memory, verified, and repaired if needed. 
+ */ +void +vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t skip_off = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, ==, NULL); + + if (rr->rr_nempty > 0) { + rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, + B_FALSE); + } + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small read), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, ==, NULL); + rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, + skip_off, skip_size); + skip_off += skip_size; + } else if (rc->rc_size == parity_size) { + /* this is a "big column", nothing to add */ + ASSERT3P(rc->rc_abd, !=, NULL); + } else { + /* + * short data column, add a skip sector and clear + * rc_tried to force the entire column to be re-read + * thereby including the missing skip sector data + * which is needed for reconstruction. + */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, !=, NULL); + ASSERT(!abd_is_gang(rc->rc_abd)); + abd_t *read_abd = rc->rc_abd; + rc->rc_abd = abd_alloc_gang(); + abd_gang_add(rc->rc_abd, read_abd, B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + rr->rr_abd_empty, skip_off, skip_size), B_TRUE); + skip_off += skip_size; + rc->rc_tried = 0; + } + + /* + * Increase rc_size so the empty ABD is included in subsequent + * parity calculations. + */ + rc->rc_size = parity_size; + } + + ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); +} + +/* + * Given a logical address within a dRAID configuration, return the physical + * address on the first drive in the group that this address maps to + * (at position 'start' in permutation number 'perm'). 
+ */ +static uint64_t +vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset, + uint64_t *perm, uint64_t *start) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + /* b is the dRAID (parent) sector offset. */ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t b_offset = logical_offset >> ashift; + + /* + * The height of a row in units of the vdev's minimum sector size. + * This is the amount of data written to each disk of each group + * in a given permutation. + */ + uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift; + + /* + * We cycle through a disk permutation every groupsz * ngroups chunk + * of address space. Note that ngroups * groupsz must be a multiple + * of the number of data drives (ndisks) in order to guarantee + * alignment. So, for example, if our row height is 16MB, our group + * size is 10, and there are 13 data drives in the draid, then ngroups + * will be 13, we will change permutation every 2.08GB and each + * disk will have 160MB of data per chunk. + */ + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t ngroups = vdc->vdc_ngroups; + uint64_t ndisks = vdc->vdc_ndisks; + + /* + * groupstart is where the group this IO will land in "starts" in + * the permutation array. 
+ */ + uint64_t group = logical_offset / vdc->vdc_groupsz; + uint64_t groupstart = (group * groupwidth) % ndisks; + ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart); + *start = groupstart; + + /* b_offset is the sector offset within a group chunk */ + b_offset = b_offset % (rowheight_sectors * groupwidth); + ASSERT0(b_offset % groupwidth); + + /* + * Find the starting byte offset on each child vdev: + * - within a permutation there are ngroups groups spread over the + * rows, where each row covers a slice portion of the disk + * - each permutation has (groupwidth * ngroups) / ndisks rows + * - so each permutation covers rows * slice portion of the disk + * - so we need to find the row where this IO group target begins + */ + *perm = group / ngroups; + uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) + + (((group % ngroups) * groupwidth) / ndisks); + + return (((rowheight_sectors * row) + + (b_offset / groupwidth)) << ashift); +} + +static uint64_t +vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, + uint64_t abd_offset, uint64_t abd_size) +{ + vdev_t *vd = zio->io_vd; + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t io_size = abd_size; + uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t group = vdev_draid_offset_to_group(vd, io_offset); + uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); + + /* + * Limit the io_size to the space remaining in the group. A second + * row in the raidz_map_t is created for the remainder. + */ + if (io_offset + io_asize > start_offset) { + io_size = vdev_draid_asize_to_psize(vd, + start_offset - io_offset); + } + + /* + * At most a block may span the logical end of one group and the start + * of the next group. Therefore, at the end of a group the io_size must + * span the group width evenly and the remainder must be aligned to the + * start of the next group. 
+ */ + IMPLY(abd_offset == 0 && io_size < zio->io_size, + (io_asize >> ashift) % vdc->vdc_groupwidth == 0); + IMPLY(abd_offset != 0, + vdev_draid_group_to_offset(vd, group) == io_offset); + + /* Lookup starting byte offset on each child vdev */ + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + io_offset, &perm, &groupstart); + + /* + * If there is less than groupwidth drives available after the group + * start, the group is going to wrap onto the next row. 'wrap' is the + * group disk number that starts on the next row. + */ + uint64_t ndisks = vdc->vdc_ndisks; + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t wrap = groupwidth; + + if (groupstart + groupwidth > ndisks) + wrap = ndisks - groupstart; + + /* The io size in units of the vdev's minimum sector size. */ + const uint64_t psize = io_size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + uint64_t q = psize / vdc->vdc_ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + uint64_t r = psize - q * vdc->vdc_ndata; + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity); + ASSERT3U(bc, <, groupwidth); + + /* The total number of data and parity sectors for this I/O. */ + uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 
0 : 1))); + + raidz_row_t *rr; + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); + rr->rr_cols = groupwidth; + rr->rr_scols = groupwidth; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = vdc->vdc_nparity; + rr->rr_abd_empty = NULL; +#ifdef ZFS_DEBUG + rr->rr_offset = io_offset; + rr->rr_size = io_size; +#endif + *rrp = rr; + + uint8_t *base; + uint64_t iter, asize = 0; + vdev_draid_get_perm(vdc, perm, &base, &iter); + for (uint64_t i = 0; i < groupwidth; i++) { + raidz_col_t *rc = &rr->rr_col[i]; + uint64_t c = (groupstart + i) % ndisks; + + /* increment the offset if we wrap to the next row */ + if (i == wrap) + physical_offset += VDEV_DRAID_ROWHEIGHT; + + rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); + rc->rc_offset = physical_offset; + rc->rc_abd = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_force_repair = 0; + rc->rc_allow_repair = 1; + rc->rc_need_orig_restore = B_FALSE; + + if (q == 0 && i >= bc) + rc->rc_size = 0; + else if (i < bc) + rc->rc_size = (q + 1) << ashift; + else + rc->rc_size = q << ashift; + + asize += rc->rc_size; + } + + ASSERT3U(asize, ==, tot << ashift); + rr->rr_nempty = roundup(tot, groupwidth) - tot; + IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc); + + /* Allocate buffers for the parity columns */ + for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); + } + + /* + * Map buffers for data columns and allocate/map buffers for skip + * sectors. There are three distinct cases for dRAID which are + * required to support sequential rebuild. 
+ */ + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_draid_map_alloc_write(zio, abd_offset, rr); + } else if ((rr->rr_nempty > 0) && + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + vdev_draid_map_alloc_scrub(zio, abd_offset, rr); + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + vdev_draid_map_alloc_read(zio, abd_offset, rr); + } + + return (io_size); +} + +/* + * Allocate the raidz mapping to be applied to the dRAID I/O. The parity + * calculations for dRAID are identical to raidz however there are a few + * differences in the layout. + * + * - dRAID always allocates a full stripe width. Any extra sectors due + * this padding are zero filled and written to disk. They will be read + * back during a scrub or repair operation since they are included in + * the parity calculation. This property enables sequential resilvering. + * + * - When the block at the logical offset spans redundancy groups then two + * rows are allocated in the raidz_map_t. One row resides at the end of + * the first group and the other at the start of the following group. 
+ */ +static raidz_map_t * +vdev_draid_map_alloc(zio_t *zio) +{ + raidz_row_t *rr[2]; + uint64_t abd_offset = 0; + uint64_t abd_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + uint64_t size; + int nrows = 1; + + size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset, + abd_offset, abd_size); + if (size < abd_size) { + vdev_t *vd = zio->io_vd; + + io_offset += vdev_draid_asize(vd, size); + abd_offset += size; + abd_size -= size; + nrows++; + + ASSERT3U(io_offset, ==, vdev_draid_group_to_offset( + vd, vdev_draid_offset_to_group(vd, io_offset))); + ASSERT3U(abd_offset, <, zio->io_size); + ASSERT3U(abd_size, !=, 0); + + size = vdev_draid_map_alloc_row(zio, &rr[1], + io_offset, abd_offset, abd_size); + VERIFY3U(size, ==, abd_size); + } + + raidz_map_t *rm; + rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP); + rm->rm_ops = vdev_raidz_math_get_ops(); + rm->rm_nrows = nrows; + rm->rm_row[0] = rr[0]; + if (nrows == 2) + rm->rm_row[1] = rr[1]; + + return (rm); +} + +/* + * Given an offset into a dRAID return the next group width aligned offset + * which can be used to start an allocation. + */ +static uint64_t +vdev_draid_get_astart(vdev_t *vd, const uint64_t start) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift)); +} + +/* + * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child) + * rounded down to the last full slice. So each child must provide at least + * 1 / (children - nspares) of its asize. + */ +static uint64_t +vdev_draid_min_asize(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (VDEV_DRAID_REFLOW_RESERVE + + (vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); +} + +/* + * When using dRAID the minimum allocation size is determined by the number + * of data disks in the redundancy group. Full stripes are always used. 
+ */ +static uint64_t +vdev_draid_min_alloc(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (vdc->vdc_ndata << vd->vdev_ashift); +} + +/* + * Returns true if the txg range does not exist on any leaf vdev. + * + * A dRAID spare does not fit into the DTL model. While it has child vdevs + * there is no redundancy among them, and the effective child vdev is + * determined by offset. Essentially we do a vdev_dtl_reassess() on the + * fly by replacing a dRAID spare with the child vdev under the offset. + * Note that it is a recursive process because the child vdev can be + * another dRAID spare and so on. + */ +boolean_t +vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg, + uint64_t size) +{ + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + /* + * Check all of the readable children, if any child + * contains the txg range the data it is not missing. + */ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (!vdev_draid_missing(cvd, physical_offset, + txg, size)) + return (B_FALSE); + } + + return (B_TRUE); + } + + if (vd->vdev_ops == &vdev_draid_spare_ops) { + /* + * When sequentially resilvering we don't have a proper + * txg range so instead we must presume all txgs are + * missing on this vdev until the resilver completes. + */ + if (vd->vdev_rebuild_txg != 0) + return (B_TRUE); + + /* + * DTL_MISSING is set for all prior txgs when a resilver + * is started in spa_vdev_attach(). + */ + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + /* + * Consult the DTL on the relevant vdev. Either a vdev + * leaf or spare/replace mirror child may be returned so + * we must recursively call vdev_draid_missing_impl(). 
+ */ + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_TRUE); + + return (vdev_draid_missing(vd, physical_offset, + txg, size)); + } + + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + +/* + * Returns true if the txg is only partially replicated on the leaf vdevs. + */ +static boolean_t +vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg, + uint64_t size) +{ + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + /* + * Check all of the readable children, if any child is + * missing the txg range then it is partially replicated. + */ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (vdev_draid_partial(cvd, physical_offset, txg, size)) + return (B_TRUE); + } + + return (B_FALSE); + } + + if (vd->vdev_ops == &vdev_draid_spare_ops) { + /* + * When sequentially resilvering we don't have a proper + * txg range so instead we must presume all txgs are + * missing on this vdev until the resilver completes. + */ + if (vd->vdev_rebuild_txg != 0) + return (B_TRUE); + + /* + * DTL_MISSING is set for all prior txgs when a resilver + * is started in spa_vdev_attach(). + */ + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + /* + * Consult the DTL on the relevant vdev. Either a vdev + * leaf or spare/replace mirror child may be returned so + * we must recursively call vdev_draid_missing_impl(). + */ + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_TRUE); + + return (vdev_draid_partial(vd, physical_offset, txg, size)); + } + + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + +/* + * Determine if the vdev is readable at the given offset. 
+ */ +boolean_t +vdev_draid_readable(vdev_t *vd, uint64_t physical_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) { + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_FALSE); + } + + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (vdev_draid_readable(cvd, physical_offset)) + return (B_TRUE); + } + + return (B_FALSE); + } + + return (vdev_readable(vd)); +} + +/* + * Returns the first distributed spare found under the provided vdev tree. + */ +static vdev_t * +vdev_draid_find_spare(vdev_t *vd) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]); + if (svd != NULL) + return (svd); + } + + return (NULL); +} + +/* + * Returns B_TRUE if the passed in vdev is currently "faulted". + * Faulted, in this context, means that the vdev represents a + * replacing or sparing vdev tree. + */ +static boolean_t +vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) { + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_FALSE); + + /* + * After resolving the distributed spare to a leaf vdev + * check the parent to determine if it's "faulted". + */ + vd = vd->vdev_parent; + } + + return (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); +} + +/* + * Determine if the dRAID block at the logical offset is degraded. + * Used by sequential resilver. 
+ */ +static boolean_t +vdev_draid_group_degraded(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); + + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + offset, &perm, &groupstart); + + uint8_t *base; + uint64_t iter; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); + vdev_t *cvd = vd->vdev_child[cid]; + + /* Group contains a faulted vdev. */ + if (vdev_draid_faulted(cvd, physical_offset)) + return (B_TRUE); + + /* + * Always check groups with active distributed spares + * because any vdev failure in the pool will affect them. + */ + if (vdev_draid_find_spare(cvd) != NULL) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Determine if the txg is missing. Used by healing resilver. + */ +static boolean_t +vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg, + uint64_t size) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); + + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + offset, &perm, &groupstart); + + uint8_t *base; + uint64_t iter; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); + vdev_t *cvd = vd->vdev_child[cid]; + + /* Transaction group is known to be partially replicated. */ + if (vdev_draid_partial(cvd, physical_offset, txg, size)) + return (B_TRUE); + + /* + * Always check groups with active distributed spares + * because any vdev failure in the pool will affect them. 
+ */ + if (vdev_draid_find_spare(cvd) != NULL) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Find the smallest child asize and largest sector size to calculate the + * available capacity. Distributed spares are ignored since their capacity + * is also based of the minimum child size in the top-level dRAID. + */ +static void +vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, + uint64_t *logical_ashiftp, uint64_t *physical_ashiftp) +{ + uint64_t logical_ashift = 0, physical_ashift = 0; + uint64_t asize = 0, max_asize = 0; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + + asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; + max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; + logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); + physical_ashift = MAX(physical_ashift, + cvd->vdev_physical_ashift); + } + + *asizep = asize; + *max_asizep = max_asize; + *logical_ashiftp = logical_ashift; + *physical_ashiftp = physical_ashift; +} + +/* + * Open spare vdevs. + */ +static boolean_t +vdev_draid_open_spares(vdev_t *vd) +{ + return (vd->vdev_ops == &vdev_draid_spare_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); +} + +/* + * Open all children, excluding spares. + */ +static boolean_t +vdev_draid_open_children(vdev_t *vd) +{ + return (!vdev_draid_open_spares(vd)); +} + +/* + * Open a top-level dRAID vdev. 
+ */
+static int
+vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	uint64_t nparity = vdc->vdc_nparity;
+	int open_errors = 0;	/* children with a nonzero vdev_open_error */
+
+	if (nparity > VDEV_DRAID_MAXPARITY ||
+	    vd->vdev_children < nparity + 1) {	/* unusable configuration */
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * First open the normal children then the distributed spares. This
+	 * ordering is important to ensure the distributed spares calculate
+	 * the correct psize in the event that the dRAID vdevs were expanded.
+	 */
+	vdev_open_children_subset(vd, vdev_draid_open_children);
+	vdev_open_children_subset(vd, vdev_draid_open_spares);
+
+	/*
+	 * Verify enough of the children are available to continue. More
+	 * than nparity children with an open error fails the open with
+	 * ENXIO and VDEV_AUX_NO_REPLICAS.
+	 */
+	for (int c = 0; c < vd->vdev_children; c++) {
+		if (vd->vdev_child[c]->vdev_open_error != 0) {
+			if ((++open_errors) > nparity) {
+				vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+				return (SET_ERROR(ENXIO));
+			}
+		}
+	}
+
+	/*
+	 * Allocatable capacity is the sum of the space on all children less
+	 * the number of distributed spares rounded down to last full row
+	 * and then to the last full group. An additional 32MB of scratch
+	 * space is reserved at the end of each child for use by the dRAID
+	 * expansion feature.
+	 *
+	 * The ashift outputs are filled in directly by
+	 * vdev_draid_calculate_asize().
+	 */
+	uint64_t child_asize, child_max_asize;
+	vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize,
+	    logical_ashift, physical_ashift);
+
+	/*
+	 * Should be unreachable since the minimum child size is 64MB, but
+	 * we want to make sure an underflow absolutely cannot occur here.
+	 *
+	 * Below, each per-child size has VDEV_DRAID_REFLOW_RESERVE removed
+	 * and is rounded down to a multiple of VDEV_DRAID_ROWHEIGHT. It is
+	 * then multiplied by vdc_ndisks and rounded down to a multiple of
+	 * vdc_groupsz to produce *asize and *max_asize.
+	 */
+	if (child_asize < VDEV_DRAID_REFLOW_RESERVE ||
+	    child_max_asize < VDEV_DRAID_REFLOW_RESERVE) {
+		return (SET_ERROR(ENXIO));
+	}
+
+	child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) /
+	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+	child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) /
+	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+
+	*asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+	    vdc->vdc_groupsz);
+	*max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+	    vdc->vdc_groupsz);
+
+	return (0);
+}
+
+/*
+ * Close a top-level dRAID vdev by closing each of its children. NULL
+ * child slots are skipped.
+ */
+static void
+vdev_draid_close(vdev_t *vd)
+{
+	for (int c = 0; c < vd->vdev_children; c++) {
+		if (vd->vdev_child[c] != NULL)
+			vdev_close(vd->vdev_child[c]);
+	}
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range
+ * given the following constraints. A dRAID chunk may not:
+ *
+ * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
+ * - Span dRAID redundancy groups.
+ *
+ * The caller must pass a start for which vdev_draid_get_astart() returns
+ * start itself, and an asize that is a multiple of the group width in
+ * bytes (vdc_groupwidth << ashift); both are asserted. max_segment is
+ * multiplied by the number of data columns to form the candidate psize.
+ * The returned chunk size is also a multiple of the group width in bytes.
+ */
+static uint64_t
+vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+    uint64_t max_segment)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	uint64_t ashift = vd->vdev_ashift;
+	uint64_t ndata = vdc->vdc_ndata;
+	uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
+	    SPA_MAXBLOCKSIZE);
+
+	ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
+	ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
+
+	/*
+	 * Chunks must evenly span all data columns in the group, so round
+	 * the sector count down to a multiple of ndata.
+	 */
+	psize = (((psize >> ashift) / ndata) * ndata) << ashift;
+	uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
+
+	/*
+	 * Reduce the chunk size to the group space remaining, measured from
+	 * start to the offset at which the next group begins.
+	 */
+	uint64_t group = vdev_draid_offset_to_group(vd, start);
+	uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
+	chunk_size = MIN(chunk_size, left);
+
+	ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
+	ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
+	    vdev_draid_offset_to_group(vd, start + chunk_size - 1));
+
+	return (chunk_size);
+}
+
+/*
+ * Align the start of the metaslab to the group width and slightly reduce
+ * its size to a multiple of the group width. Since full stripe writes are
+ * required by dRAID this space is unallocable. Furthermore, aligning the
+ * metaslab start is important for vdev initialize and TRIM which both operate
+ * on metaslab boundaries which vdev_xlate() expects to be aligned.
+ *
+ * Both *ms_start and *ms_size are updated in place. The new start is the
+ * value returned by vdev_draid_get_astart() and the new size is what remains
+ * after the start adjustment, rounded down to a multiple of
+ * (vdc_groupwidth << vdev_ashift).
+ */
+static void
+vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift;
+	uint64_t astart = vdev_draid_get_astart(vd, *ms_start);
+	uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz;
+
+	*ms_start = astart;
+	*ms_size = asize;
+
+	ASSERT0(*ms_start % sz);
+	ASSERT0(*ms_size % sz);
+}
+
+/*
+ * Add virtual dRAID spares to the list of valid spares. In order to accomplish
+ * this the existing array must be freed and reallocated with the additional
+ * entries.
+ */ +int +vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, + uint64_t next_vdev_id) +{ + uint64_t draid_nspares = 0; + uint64_t ndraid = 0; + int error; + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + + if (cvd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = cvd->vdev_tsd; + draid_nspares += vdc->vdc_nspares; + ndraid++; + } + } + + if (draid_nspares == 0) { + *ndraidp = ndraid; + return (0); + } + + nvlist_t **old_spares, **new_spares; + uint_t old_nspares; + error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &old_spares, &old_nspares); + if (error) + old_nspares = 0; + + /* Allocate memory and copy of the existing spares. */ + new_spares = kmem_alloc(sizeof (nvlist_t *) * + (draid_nspares + old_nspares), KM_SLEEP); + for (uint_t i = 0; i < old_nspares; i++) + new_spares[i] = fnvlist_dup(old_spares[i]); + + /* Add new distributed spares to ZPOOL_CONFIG_SPARES. */ + uint64_t n = old_nspares; + for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) { + vdev_t *cvd = vd->vdev_child[vdev_id]; + char path[64]; + + if (cvd->vdev_ops != &vdev_draid_ops) + continue; + + vdev_draid_config_t *vdc = cvd->vdev_tsd; + uint64_t nspares = vdc->vdc_nspares; + uint64_t nparity = vdc->vdc_nparity; + + for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { + bzero(path, sizeof (path)); + (void) snprintf(path, sizeof (path) - 1, + "%s%llu-%llu-%llu", VDEV_TYPE_DRAID, + (u_longlong_t)nparity, + (u_longlong_t)next_vdev_id + vdev_id, + (u_longlong_t)spare_id); + + nvlist_t *spare = fnvlist_alloc(); + fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID, + cvd->vdev_guid); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID, + spare_id); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1); + 
fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT, + cvd->vdev_ashift); + + new_spares[n] = spare; + n++; + } + } + + if (n > 0) { + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + new_spares, n); + } + + for (int i = 0; i < n; i++) + nvlist_free(new_spares[i]); + + kmem_free(new_spares, sizeof (*new_spares) * n); + *ndraidp = ndraid; + + return (0); +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. + */ +static boolean_t +vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = vdev_draid_asize(vd, psize); + + if (phys_birth == TXG_UNKNOWN) { + /* + * Sequential resilver. There is no meaningful phys_birth + * for this block, we can only determine if block resides + * in a degraded group in which case it must be resilvered. + */ + ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==, + vdev_draid_offset_to_group(vd, offset + asize - 1)); + + return (vdev_draid_group_degraded(vd, offset)); + } else { + /* + * Healing resilver. TXGs not in DTL_PARTIAL are intact, + * as are blocks in non-degraded groups. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + if (vdev_draid_group_missing(vd, offset, phys_birth, 1)) + return (B_TRUE); + + /* The block may span groups in which case check both. 
*/ + if (vdev_draid_offset_to_group(vd, offset) != + vdev_draid_offset_to_group(vd, offset + asize - 1)) { + if (vdev_draid_group_missing(vd, + offset + asize, phys_birth, 1)) + return (B_TRUE); + } + + return (B_FALSE); + } +} + +static boolean_t +vdev_draid_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_draid_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static void +vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +{ +#ifdef ZFS_DEBUG + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_draid_asize(vd, rr->rr_size); + + raidz_col_t *rc = &rr->rr_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end); +#endif +} + +/* + * For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. A gang ABD is allocated by vdev_draid_map_alloc() + * if a skip sector needs to be added to a column. + */ +static void +vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + + vdev_raidz_generate_parity_row(rm, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * Empty columns are zero filled and included in the parity + * calculation and therefore must be written. 
+ */ + ASSERT3U(rc->rc_size, !=, 0); + + /* Verify physical to logical translation */ + vdev_draid_io_verify(vd, rr, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } +} + +/* + * For read operations: + * 1. The vdev_draid_map_alloc() function will create a minimal raidz + * mapping for the read based on the zio->io_flags. There are two + * possible mappings either 1) a normal read, or 2) a scrub/resilver. + * 2. Create the zio read operations. This will include all parity + * columns and skip sectors for a scrub/resilver. + */ +static void +vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + + /* Sequential rebuild must do IO at redundancy group boundary. */ + IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last. Any errors along the way will force us to read the parity. + * For scrub/resilver IOs which verify skip sectors, a gang ABD will + * have been allocated to store them and rc->rc_size is increased. + */ + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (!vdev_draid_readable(cvd, rc->rc_offset)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; + rc->rc_skipped = 1; + continue; + } + + if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + + /* + * Empty columns may be read during vdev_draid_io_done(). + * Only skip them after the readable and missing checks + * verify they are available. 
+ */ + if (rc->rc_size == 0) { + rc->rc_skipped = 1; + continue; + } + + if (zio->io_flags & ZIO_FLAG_RESILVER) { + vdev_t *svd; + + /* + * Sequential rebuilds need to always consider the data + * on the child being rebuilt to be stale. This is + * important when all columns are available to aid + * known reconstruction in identifing which columns + * contain incorrect data. + * + * Furthermore, all repairs need to be constrained to + * the devices being rebuilt because without a checksum + * we cannot verify the data is actually correct and + * performing an incorrect repair could result in + * locking in damage and making the data unrecoverable. + */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD) { + if (vdev_draid_rebuilding(cvd)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + rc->rc_allow_repair = 1; + continue; + } else { + rc->rc_allow_repair = 0; + } + } else { + rc->rc_allow_repair = 1; + } + + /* + * If this child is a distributed spare then the + * offset might reside on the vdev being replaced. + * In which case this data must be written to the + * new device. Failure to do so would result in + * checksum errors when the old device is detached + * and the pool is scrubbed. + */ + if ((svd = vdev_draid_find_spare(cvd)) != NULL) { + svd = vdev_draid_spare_get_child(svd, + rc->rc_offset); + if (svd && (svd->vdev_ops == &vdev_spare_ops || + svd->vdev_ops == &vdev_replacing_ops)) { + rc->rc_force_repair = 1; + + if (vdev_draid_rebuilding(svd)) + rc->rc_allow_repair = 1; + } + } + + /* + * Always issue a repair IO to this child when its + * a spare or replacing vdev with an active rebuild. 
+ */ + if ((cvd->vdev_ops == &vdev_spare_ops || + cvd->vdev_ops == &vdev_replacing_ops) && + vdev_draid_rebuilding(cvd)) { + rc->rc_force_repair = 1; + rc->rc_allow_repair = 1; + } + } + } + + /* + * Either a parity or data column is missing this means a repair + * may be attempted by vdev_draid_io_done(). Expand the raid map + * to read in empty columns which are needed along with the parity + * during reconstruction. + */ + if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) && + rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) { + vdev_draid_map_alloc_empty(zio, rr); + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } +} + +/* + * Start an IO operation to a dRAID vdev. + */ +static void +vdev_draid_io_start(zio_t *zio) +{ + vdev_t *vd __maybe_unused = zio->io_vd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset)); + + raidz_map_t *rm = vdev_draid_map_alloc(zio); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_draid_io_start_write(zio, rm->rm_row[i]); + } + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_draid_io_start_read(zio, rm->rm_row[i]); + } + } + + zio_execute(zio); +} + +/* + * Complete an IO operation on a dRAID vdev. The raidz logic can be applied + * to dRAID since the layout is fully described by the raidz_map_t. 
+ */ +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_raidz_io_done(zio); +} + +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + ASSERT(vd->vdev_ops == &vdev_draid_ops); + + if (faulted > vdc->vdc_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +static void +vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_draid_ops); + + vdev_draid_config_t *vdc = raidvd->vdev_tsd; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* Make sure the offsets are block-aligned */ + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + + uint64_t logical_start = logical_rs->rs_start; + uint64_t logical_end = logical_rs->rs_end; + + /* + * Unaligned ranges must be skipped. All metaslabs are correctly + * aligned so this should not happen, but this case is handled in + * case it's needed by future callers. + */ + uint64_t astart = vdev_draid_get_astart(raidvd, logical_start); + if (astart != logical_start) { + physical_rs->rs_start = logical_start; + physical_rs->rs_end = logical_start; + remain_rs->rs_start = MIN(astart, logical_end); + remain_rs->rs_end = logical_end; + return; + } + + /* + * Unlike with mirrors and raidz a dRAID logical range can map + * to multiple non-contiguous physical ranges. This is handled by + * limiting the size of the logical range to a single group and + * setting the remain argument such that it describes the remaining + * unmapped logical range. This is stricter than absolutely + * necessary but helps simplify the logic below. 
+ */ + uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start); + uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1); + if (logical_end > nextstart) + logical_end = nextstart; + + /* Find the starting offset for each vdev in the group */ + uint64_t perm, groupstart; + uint64_t start = vdev_draid_logical_to_physical(raidvd, + logical_start, &perm, &groupstart); + uint64_t end = start; + + uint8_t *base; + uint64_t iter, id; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + /* + * Check if the passed child falls within the group. If it does + * update the start and end to reflect the physical range. + * Otherwise, leave them unmodified which will result in an empty + * (zero-length) physical range being returned. + */ + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + + if (c == 0 && i != 0) { + /* the group wrapped, increment the start */ + start += VDEV_DRAID_ROWHEIGHT; + end = start; + } + + id = vdev_draid_permute_id(vdc, base, iter, c); + if (id == cvd->vdev_id) { + uint64_t b_size = (logical_end >> ashift) - + (logical_start >> ashift); + ASSERT3U(b_size, >, 0); + end = start + ((((b_size - 1) / + vdc->vdc_groupwidth) + 1) << ashift); + break; + } + } + physical_rs->rs_start = start; + physical_rs->rs_end = end; + + /* + * Only top-level vdevs are allowed to set remain_rs because + * when .vdev_op_xlate() is called for their children the full + * logical range is not provided by vdev_xlate(). + */ + remain_rs->rs_start = logical_end; + remain_rs->rs_end = logical_rs->rs_end; + + ASSERT3U(physical_rs->rs_start, <=, logical_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_end - logical_start); +} + +/* + * Add dRAID specific fields to the config nvlist. 
+ */ +static void +vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + vdev_draid_config_t *vdc = vd->vdev_tsd; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups); +} + +/* + * Initialize private dRAID specific fields from the nvlist. + */ +static int +vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + uint64_t ndata, nparity, nspares, ngroups; + int error; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata)) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) || + nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + return (SET_ERROR(EINVAL)); + } + + uint_t children; + nvlist_t **child; + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0 || children == 0 || + children > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || + nspares > 100 || nspares > (children - (ndata + nparity))) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || + ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + /* + * Validate the minimum number of children exist per group for the + * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). + */ + if (children < (ndata + nparity + nspares)) + return (SET_ERROR(EINVAL)); + + /* + * Create the dRAID configuration using the pool nvlist configuration + * and the fixed mapping for the correct number of children. 
+ */ + vdev_draid_config_t *vdc; + const draid_map_t *map; + + error = vdev_draid_lookup_map(children, &map); + if (error) + return (SET_ERROR(EINVAL)); + + vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP); + vdc->vdc_ndata = ndata; + vdc->vdc_nparity = nparity; + vdc->vdc_nspares = nspares; + vdc->vdc_children = children; + vdc->vdc_ngroups = ngroups; + vdc->vdc_nperms = map->dm_nperms; + + error = vdev_draid_generate_perms(map, &vdc->vdc_perms); + if (error) { + kmem_free(vdc, sizeof (*vdc)); + return (SET_ERROR(EINVAL)); + } + + /* + * Derived constants. + */ + vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity; + vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares; + vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT; + vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) / + vdc->vdc_ndisks; + + ASSERT3U(vdc->vdc_groupwidth, >=, 2); + ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks); + ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0); + ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) % + vdc->vdc_ndisks, ==, 0); + + *tsd = vdc; + + return (0); +} + +static void +vdev_draid_fini(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + vmem_free(vdc->vdc_perms, sizeof (uint8_t) * + vdc->vdc_children * vdc->vdc_nperms); + kmem_free(vdc, sizeof (*vdc)); +} + +static uint64_t +vdev_draid_nparity(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_nparity); +} + +static uint64_t +vdev_draid_ndisks(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_ndisks); +} + +vdev_ops_t vdev_draid_ops = { + .vdev_op_init = vdev_draid_init, + .vdev_op_fini = vdev_draid_fini, + .vdev_op_open = vdev_draid_open, + .vdev_op_close = vdev_draid_close, + .vdev_op_asize = vdev_draid_asize, + .vdev_op_min_asize = vdev_draid_min_asize, + .vdev_op_min_alloc = vdev_draid_min_alloc, 
+ .vdev_op_io_start = vdev_draid_io_start, + .vdev_op_io_done = vdev_draid_io_done, + .vdev_op_state_change = vdev_draid_state_change, + .vdev_op_need_resilver = vdev_draid_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_draid_xlate, + .vdev_op_rebuild_asize = vdev_draid_rebuild_asize, + .vdev_op_metaslab_init = vdev_draid_metaslab_init, + .vdev_op_config_generate = vdev_draid_config_generate, + .vdev_op_nparity = vdev_draid_nparity, + .vdev_op_ndisks = vdev_draid_ndisks, + .vdev_op_type = VDEV_TYPE_DRAID, + .vdev_op_leaf = B_FALSE, +}; + + +/* + * A dRAID distributed spare is a virtual leaf vdev which is included in the + * parent dRAID configuration. The last N columns of the dRAID permutation + * table are used to determine on which dRAID children a specific offset + * should be written. These spare leaf vdevs can only be used to replace + * faulted children in the same dRAID configuration. + */ + +/* + * Distributed spare state. All fields are set when the distributed spare is + * first opened and are immutable. + */ +typedef struct { + vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */ + uint64_t vds_top_guid; /* top-level parent dRAID guid */ + uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */ +} vdev_draid_spare_t; + +/* + * Returns the parent dRAID vdev to which the distributed spare belongs. + * This may be safely called even when the vdev is not open. + */ +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + if (vds->vds_draid_vdev != NULL) + return (vds->vds_draid_vdev); + + return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev, + vds->vds_top_guid)); +} + +/* + * A dRAID space is active when it's the child of a vdev using the + * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops. 
+ */ +static boolean_t +vdev_draid_spare_is_active(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + + if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops || + pvd->vdev_ops == &vdev_replacing_ops || + pvd->vdev_ops == &vdev_draid_ops)) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* + * Given a dRAID distribute spare vdev, returns the physical child vdev + * on which the provided offset resides. This may involve recursing through + * multiple layers of distributed spares. Note that offset is relative to + * this vdev. + */ +vdev_t * +vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + /* The vdev is closed */ + if (vds->vds_draid_vdev == NULL) + return (NULL); + + vdev_t *tvd = vds->vds_draid_vdev; + vdev_draid_config_t *vdc = tvd->vdev_tsd; + + ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares); + + uint8_t *base; + uint64_t iter; + uint64_t perm = physical_offset / vdc->vdc_devslicesz; + + vdev_draid_get_perm(vdc, perm, &base, &iter); + + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, + (tvd->vdev_children - 1) - vds->vds_spare_id); + vdev_t *cvd = tvd->vdev_child[cid]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_spare_get_child(cvd, physical_offset)); + + return (cvd); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_close(vdev_t *vd) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + vds->vds_draid_vdev = NULL; +} + +/* + * Opening a dRAID spare device is done by looking up the associated dRAID + * top-level vdev guid from the spare configuration. 
+ */
+static int
+vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	uint64_t asize, max_asize;
+
+	vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
+	if (tvd == NULL) {
+		/*
+		 * When spa_vdev_add() is labeling new spares the
+		 * associated dRAID is not attached to the root vdev
+		 * nor does this spare have a parent. Simulate a valid
+		 * device in order to allow the label to be initialized
+		 * and the distributed spare added to the configuration.
+		 *
+		 * In this case vds_draid_vdev is left unset. A spare
+		 * which does have a parent fails with EINVAL instead.
+		 */
+		if (vd->vdev_parent == NULL) {
+			*psize = *max_psize = SPA_MINDEVSIZE;
+			*logical_ashift = *physical_ashift = ASHIFT_MIN;
+			return (0);
+		}
+
+		return (SET_ERROR(EINVAL));
+	}
+
+	vdev_draid_config_t *vdc = tvd->vdev_tsd;
+	if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
+		return (SET_ERROR(EINVAL));	/* guid is not a usable dRAID */
+
+	if (vds->vds_spare_id >= vdc->vdc_nspares)
+		return (SET_ERROR(EINVAL));	/* spare id out of range */
+
+	/*
+	 * Neither tvd->vdev_asize or tvd->vdev_max_asize can be used here
+	 * because the caller may be vdev_draid_open() in which case the
+	 * values are stale as they haven't yet been updated by vdev_open().
+	 * To avoid this always recalculate the dRAID asize and max_asize.
+	 *
+	 * The reported psize and max_psize are those values plus
+	 * VDEV_LABEL_START_SIZE and VDEV_LABEL_END_SIZE.
+	 */
+	vdev_draid_calculate_asize(tvd, &asize, &max_asize,
+	    logical_ashift, physical_ashift);
+
+	*psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+
+	vds->vds_draid_vdev = tvd;	/* marks the spare as open */
+
+	return (0);
+}
+
+/*
+ * Completed distributed spare IO. Store the result in the parent zio
+ * as if it had performed the operation itself. Only the first error is
+ * preserved if there are multiple errors. The parent zio is taken from
+ * zio->io_private.
+ */
+static void
+vdev_draid_spare_child_done(zio_t *zio)
+{
+	zio_t *pio = zio->io_private;
+
+	/*
+	 * IOs are issued to non-writable vdevs in order to keep their
+	 * DTLs accurate. However, we don't want to propagate the
+	 * error in to the distributed spare's DTL. When resilvering
+	 * vdev_draid_need_resilver() will consult the relevant DTL
+	 * to determine if the data is missing and must be repaired.
+	 */
+	if (!vdev_writeable(zio->io_vd))
+		return;
+
+	if (pio->io_error == 0)
+		pio->io_error = zio->io_error;
+}
+
+/*
+ * Returns a valid label nvlist for the distributed spare vdev. This is
+ * used to bypass the IO pipeline to avoid the complexity of constructing
+ * a complete label with valid checksum to return when read.
+ *
+ * The nvlist is newly allocated with fnvlist_alloc() and returned to the
+ * caller, which presumably is responsible for freeing it (the caller is
+ * not visible here).
+ */
+nvlist_t *
+vdev_draid_read_config_spare(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	spa_aux_vdev_t *sav = &spa->spa_spares;
+	uint64_t guid = vd->vdev_guid;
+
+	nvlist_t *nv = fnvlist_alloc();
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
+	fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE,
+	    vdev_draid_spare_is_active(vd) ?
+	    POOL_STATE_ACTIVE : POOL_STATE_SPARE);
+
+	/*
+	 * Set the vdev guid based on the vdev list in sav_count. The guid
+	 * of the first distributed spare in spa_spares with the same path
+	 * is used; when none matches vd->vdev_guid is kept.
+	 */
+	for (int i = 0; i < sav->sav_count; i++) {
+		if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops &&
+		    strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) {
+			guid = sav->sav_vdevs[i]->vdev_guid;
+			break;
+		}
+	}
+
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
+
+	return (nv);
+}
+
+/*
+ * Handle any ioctl requested of the distributed spare. Only flushes
+ * are supported in which case all children must be flushed.
+ */ +static int +vdev_draid_spare_ioctl(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + int error = 0; + + if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } else { + error = SET_ERROR(ENOTSUP); + } + + return (error); +} + +/* + * Initiate an IO to the distributed spare. For normal IOs this entails using + * the zio->io_offset and permutation table to calculate which child dRAID vdev + * is responsible for the data. Then passing along the zio to that child to + * perform the actual IO. The label ranges are not stored on disk and require + * some special handling which is described below. + */ +static void +vdev_draid_spare_io_start(zio_t *zio) +{ + vdev_t *cvd = NULL, *vd = zio->io_vd; + vdev_draid_spare_t *vds = vd->vdev_tsd; + uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vds == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + zio->io_error = vdev_draid_spare_ioctl(zio); + break; + + case ZIO_TYPE_WRITE: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs and config writers to simulate the + * existence of an on disk label. vdev_label_sync(), + * vdev_uberblock_sync() and vdev_copy_uberblocks() + * skip the distributed spares. This only leaves + * vdev_label_init() which is allowed to succeed to + * avoid adding special cases the function. 
+ */ + if (zio->io_flags & ZIO_FLAG_PROBE || + zio->io_flags & ZIO_FLAG_CONFIG_WRITER) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_READ: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs to simulate the existence of a + * label. vdev_label_read_config() bypasses the + * pipeline to read the label configuration and + * vdev_uberblock_load() skips distributed spares + * when attempting to locate the best uberblock. + */ + if (zio->io_flags & ZIO_FLAG_PROBE) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_TRIM: + /* The vdev label ranges are never trimmed */ + ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)); + + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !cvd->vdev_has_trim) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio_execute(zio); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_io_done(zio_t *zio) +{ +} + +/* + * Lookup the full spare config in spa->spa_spares.sav_config and + * return the top_guid and spare_id for the named spare. 
+ */ +static int +vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, + uint64_t *spare_idp) +{ + nvlist_t **spares; + uint_t nspares; + int error; + + if ((spa->spa_spares.sav_config == NULL) || + (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) { + return (SET_ERROR(ENOENT)); + } + + char *spare_name; + error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name); + if (error != 0) + return (SET_ERROR(EINVAL)); + + for (int i = 0; i < nspares; i++) { + nvlist_t *spare = spares[i]; + uint64_t top_guid, spare_id; + char *type, *path; + + /* Skip non-distributed spares */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type); + if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0) + continue; + + /* Skip spares with the wrong name */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path); + if (error != 0 || strcmp(path, spare_name) != 0) + continue; + + /* Found the matching spare */ + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_TOP_GUID, &top_guid); + if (error == 0) { + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_SPARE_ID, &spare_id); + } + + if (error != 0) { + return (SET_ERROR(EINVAL)); + } else { + *top_guidp = top_guid; + *spare_idp = spare_id; + return (0); + } + } + + return (SET_ERROR(ENOENT)); +} + +/* + * Initialize private dRAID spare specific fields from the nvlist. + */ +static int +vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + vdev_draid_spare_t *vds; + uint64_t top_guid = 0; + uint64_t spare_id; + + /* + * In the normal case check the list of spares stored in the spa + * to lookup the top_guid and spare_id for provided spare config. + * When creating a new pool or adding vdevs the spare list is not + * yet populated and the values are provided in the passed config. 
+ */ + if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID, + &spare_id) != 0) + return (SET_ERROR(EINVAL)); + } + + vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP); + vds->vds_draid_vdev = NULL; + vds->vds_top_guid = top_guid; + vds->vds_spare_id = spare_id; + + *tsd = vds; + + return (0); +} + +static void +vdev_draid_spare_fini(vdev_t *vd) +{ + kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t)); +} + +static void +vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id); +} + +vdev_ops_t vdev_draid_spare_ops = { + .vdev_op_init = vdev_draid_spare_init, + .vdev_op_fini = vdev_draid_spare_fini, + .vdev_op_open = vdev_draid_spare_open, + .vdev_op_close = vdev_draid_spare_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_draid_spare_io_start, + .vdev_op_io_done = vdev_draid_spare_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_draid_spare_config_generate, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DRAID_SPARE, + .vdev_op_leaf = B_TRUE, +}; diff --git a/module/zfs/vdev_draid_rand.c b/module/zfs/vdev_draid_rand.c new file mode 100644 index 0000000000..fe1a75c113 --- /dev/null +++ b/module/zfs/vdev_draid_rand.c @@ -0,0 +1,40 @@ +/* + * Xorshift Pseudo Random Number Generator based on 
work by David Blackman + * and Sebastiano Vigna (vigna@acm.org). + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * http://prng.di.unimi.it/xoroshiro128plusplus.c + * + * To the extent possible under law, the author has dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * See <http://creativecommons.org/publicdomain/zero/1.0/>. + * + * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid, + * small-state generators. It is extremely (sub-ns) fast and it passes all + * tests we are aware of, but its state space is large enough only for + * mild parallelism. + */ + +#include + +static inline uint64_t rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +uint64_t +vdev_draid_rand(uint64_t *s) +{ + const uint64_t s0 = s[0]; + uint64_t s1 = s[1]; + const uint64_t result = rotl(s0 + s1, 17) + s0; + + s1 ^= s0; + s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b + s[1] = rotl(s1, 28); // c + + return (result); +} diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 4d18e33c0a..14ebf55146 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -16,6 +16,7 @@ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. */ #include @@ -26,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -181,7 +181,7 @@ int zfs_condense_indirect_vdevs_enable = B_TRUE; * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. 
*/ -int zfs_indirect_condense_obsolete_pct = 25; +int zfs_condense_indirect_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of @@ -239,6 +239,7 @@ typedef struct indirect_child { */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ + int ic_error; /* set when a child does not contain the data */ } indirect_child_t; /* @@ -314,7 +315,6 @@ vdev_indirect_map_free(zio_t *zio) static const zio_vsd_ops_t vdev_indirect_vsd_ops = { .vsd_free = vdev_indirect_map_free, - .vsd_cksum_report = zio_vsd_default_cksum_report }; /* @@ -420,7 +420,7 @@ vdev_indirect_should_condense(vdev_t *vd) * If nothing new has been marked obsolete, there is no * point in condensing. */ - ASSERTV(uint64_t obsolete_sm_obj); + uint64_t obsolete_sm_obj __maybe_unused; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); if (vd->vdev_obsolete_sm == NULL) { ASSERT0(obsolete_sm_obj); @@ -445,7 +445,7 @@ vdev_indirect_should_condense(vdev_t *vd) * by the mapping. 
*/ if (bytes_obsolete * 100 / bytes_mapped >= - zfs_indirect_condense_obsolete_pct && + zfs_condense_indirect_obsolete_pct && mapping_size > zfs_condense_min_mapping_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete " "spacemap covers %d%% of %lluMB mapping", @@ -529,8 +529,9 @@ spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", - vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, - new_count, old_count); + (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), + (u_longlong_t)vic->vic_mapping_object, + (u_longlong_t)new_count, (u_longlong_t)old_count); vdev_config_dirty(spa->spa_root_vdev); } @@ -543,7 +544,7 @@ spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; uint64_t txg = dmu_tx_get_txg(tx); - ASSERTV(spa_t *spa = dmu_tx_pool(tx)->dp_spa); + spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(sci, ==, spa->spa_condensing_indirect); @@ -576,8 +577,7 @@ spa_condense_indirect_commit_entry(spa_t *spa, */ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), - spa_condense_indirect_commit_sync, sci, - 0, ZFS_SPACE_CHECK_NONE, tx); + spa_condense_indirect_commit_sync, sci, tx); } vdev_indirect_mapping_entry_t *vime = @@ -797,7 +797,7 @@ spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " "posm=%llu nm=%llu", - vd->vdev_id, dmu_tx_get_txg(tx), + (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)scip->scip_prev_obsolete_sm_object, (u_longlong_t)scip->scip_next_mapping_object); @@ -814,7 +814,7 @@ void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; - ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config); + vdev_indirect_config_t *vic 
__maybe_unused = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); @@ -825,7 +825,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object == 0) { obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, @@ -883,8 +883,9 @@ void spa_start_indirect_condensing_thread(spa_t *spa) { ASSERT3P(spa->spa_condense_zthr, ==, NULL); - spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check, - spa_condense_indirect_thread, spa); + spa->spa_condense_zthr = zthr_create("z_indirect_condense", + spa_condense_indirect_thread_check, + spa_condense_indirect_thread, spa, minclsyspri); } /* @@ -949,11 +950,12 @@ vdev_indirect_close(vdev_t *vd) /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - *ashift = vd->vdev_ashift; + *logical_ashift = vd->vdev_ashift; + *physical_ashift = vd->vdev_physical_ashift; return (0); } @@ -965,7 +967,7 @@ typedef struct remap_segment { list_node_t rs_node; } remap_segment_t; -remap_segment_t * +static remap_segment_t * rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) { remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); @@ -989,7 +991,7 @@ rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) * Finally, since we are doing an allocation, it is up to the caller to * free the array allocated in this function. 
*/ -vdev_indirect_mapping_entry_phys_t * +static vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t *copied_entries) { @@ -1185,7 +1187,7 @@ vdev_indirect_child_io_done(zio_t *zio) pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); - abd_put(zio->io_abd); + abd_free(zio->io_abd); } /* @@ -1271,15 +1273,14 @@ vdev_indirect_read_all(zio_t *zio) continue; /* - * Note, we may read from a child whose DTL - * indicates that the data may not be present here. - * While this might result in a few i/os that will - * likely return incorrect data, it simplifies the - * code since we can treat scrub and resilver - * identically. (The incorrect data will be - * detected and ignored when we verify the - * checksum.) + * If a child is missing the data, set ic_error. Used + * in vdev_indirect_repair(). We perform the read + * nevertheless which provides the opportunity to + * reconstruct the split block if at all possible. 
*/ + if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, + zio->io_txg, 1)) + ic->ic_error = SET_ERROR(ESTALE); ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); @@ -1297,7 +1298,7 @@ vdev_indirect_read_all(zio_t *zio) static void vdev_indirect_io_start(zio_t *zio) { - ASSERTV(spa_t *spa = zio->io_spa); + spa_t *spa __maybe_unused = zio->io_spa; indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); @@ -1401,7 +1402,7 @@ vdev_indirect_checksum_error(zio_t *zio, zio_bad_cksum_t zbc = {{{ 0 }}}; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; - zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); } @@ -1409,7 +1410,11 @@ vdev_indirect_checksum_error(zio_t *zio, * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data - * with the good copy. Note that we do this without regard for the DTL's, + * with the good copy. The DTL is checked in vdev_indirect_read_all() and + * if a vdev is missing a copy of the data we set ic_error and the read is + * performed. This provides the opportunity to reconstruct the split block + * if at all possible. ic_error is checked here and if set it suppresses + * incrementing the checksum counter. Aside from this DTLs are not checked, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). 
For the same reason, we always use @@ -1420,11 +1425,6 @@ vdev_indirect_repair(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; - enum zio_flag flags = ZIO_FLAG_IO_REPAIR; - - if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) - flags |= ZIO_FLAG_SELF_HEAL; - if (!spa_writeable(zio->io_spa)) return; @@ -1446,6 +1446,14 @@ vdev_indirect_repair(zio_t *zio) ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); + /* + * If ic_error is set the current child does not have + * a copy of the data, so suppress incrementing the + * checksum counter. + */ + if (ic->ic_error == ESTALE) + continue; + vdev_indirect_checksum_error(zio, is, ic); } } @@ -1472,13 +1480,12 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vdev_t *vd = ic->ic_vdev; + (void) zfs_ereport_post_checksum(zio->io_spa, vd, + NULL, zio, is->is_target_offset, is->is_size, + NULL, NULL, NULL); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - - zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, - is->is_target_offset, is->is_size, - NULL, NULL, NULL); } } } @@ -1566,7 +1573,7 @@ vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) indirect_child_t *ic = list_head(&is->is_unique_child); int children = is->is_unique_children; - for (int i = spa_get_random(children); i > 0; i--) + for (int i = random_in_range(children); i > 0; i--) ic = list_next(&is->is_unique_child, ic); ASSERT3P(ic, !=, NULL); @@ -1637,7 +1644,7 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) if (ic->ic_data == NULL) continue; - abd_zero(ic->ic_data, ic->ic_data->abd_size); + abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; @@ -1730,7 +1737,7 @@ vdev_indirect_reconstruct_io_done(zio_t *zio) * Known_good will be TRUE when reconstruction is known to be possible. 
*/ if (zfs_reconstruct_indirect_damage_fraction != 0 && - spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) + random_in_range(zfs_reconstruct_indirect_damage_fraction) == 0) known_good = (vdev_indirect_splits_damage(iv, zio) == 0); /* @@ -1842,23 +1849,30 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { - vdev_indirect_open, - vdev_indirect_close, - vdev_default_asize, - vdev_indirect_io_start, - vdev_indirect_io_done, - NULL, - NULL, - NULL, - NULL, - vdev_indirect_remap, - NULL, - VDEV_TYPE_INDIRECT, /* name of this vdev type */ - B_FALSE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_indirect_open, + .vdev_op_close = vdev_indirect_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_indirect_io_start, + .vdev_op_io_done = vdev_indirect_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = vdev_indirect_remap, + .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* leaf vdev */ }; -#if defined(_KERNEL) -EXPORT_SYMBOL(rs_alloc); EXPORT_SYMBOL(spa_condense_fini); EXPORT_SYMBOL(spa_start_indirect_condensing_thread); EXPORT_SYMBOL(spa_condense_indirect_start_sync); @@ -1870,25 +1884,24 @@ EXPORT_SYMBOL(vdev_indirect_sync_obsolete); EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); -module_param(zfs_condense_indirect_vdevs_enable, int, 0644); -MODULE_PARM_DESC(zfs_condense_indirect_vdevs_enable, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); -/* 
CSTYLED */ -module_param(zfs_condense_min_mapping_bytes, ulong, 0644); -MODULE_PARM_DESC(zfs_condense_min_mapping_bytes, - "Minimum size of vdev mapping to condense"); +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, INT, ZMOD_RW, + "Minimum obsolete percent of bytes in the mapping to attempt condensing"); -/* CSTYLED */ -module_param(zfs_condense_max_obsolete_bytes, ulong, 0644); -MODULE_PARM_DESC(zfs_condense_max_obsolete_bytes, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, + "Don't bother condensing if the mapping uses less than this amount of " + "memory"); + +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); -module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644); -MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms, - "Delay while condensing vdev mapping"); +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW, + "Used by tests to ensure certain actions happen in the middle of a " + "condense. 
A maximum value of 1 should be sufficient."); -module_param(zfs_reconstruct_indirect_combinations_max, int, 0644); -MODULE_PARM_DESC(zfs_reconstruct_indirect_combinations_max, +ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); -#endif +/* END CSTYLED */ diff --git a/module/zfs/vdev_indirect_births.c b/module/zfs/vdev_indirect_births.c index 1c44a64287..99b83c3922 100644 --- a/module/zfs/vdev_indirect_births.c +++ b/module/zfs/vdev_indirect_births.c @@ -70,7 +70,7 @@ vdev_indirect_births_close(vdev_indirect_births_t *vib) if (vib->vib_phys->vib_count > 0) { uint64_t births_size = vdev_indirect_births_size_impl(vib); - kmem_free(vib->vib_entries, births_size); + vmem_free(vib->vib_entries, births_size); vib->vib_entries = NULL; } @@ -108,7 +108,7 @@ vdev_indirect_births_open(objset_t *os, uint64_t births_object) if (vib->vib_phys->vib_count > 0) { uint64_t births_size = vdev_indirect_births_size_impl(vib); - vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); + vib->vib_entries = vmem_alloc(births_size, KM_SLEEP); VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, births_size, vib->vib_entries, DMU_READ_PREFETCH)); } @@ -148,10 +148,10 @@ vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, vib->vib_phys->vib_count++; new_size = vdev_indirect_births_size_impl(vib); - new_entries = kmem_alloc(new_size, KM_SLEEP); + new_entries = vmem_alloc(new_size, KM_SLEEP); if (old_size > 0) { bcopy(vib->vib_entries, new_entries, old_size); - kmem_free(vib->vib_entries, old_size); + vmem_free(vib->vib_entries, old_size); } new_entries[vib->vib_phys->vib_count - 1] = vibe; vib->vib_entries = new_entries; diff --git a/module/zfs/vdev_indirect_mapping.c b/module/zfs/vdev_indirect_mapping.c index e4d998f09b..bb484a401b 100644 --- a/module/zfs/vdev_indirect_mapping.c +++ b/module/zfs/vdev_indirect_mapping.c @@ -39,11 +39,12 @@ 
vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) EQUIV(vim->vim_phys->vimp_num_entries > 0, vim->vim_entries != NULL); if (vim->vim_phys->vimp_num_entries > 0) { - ASSERTV(vdev_indirect_mapping_entry_phys_t *last_entry = - &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]); - ASSERTV(uint64_t offset = - DVA_MAPPING_GET_SRC_OFFSET(last_entry)); - ASSERTV(uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst)); + vdev_indirect_mapping_entry_phys_t *last_entry __maybe_unused = + &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; + uint64_t offset __maybe_unused = + DVA_MAPPING_GET_SRC_OFFSET(last_entry); + uint64_t size __maybe_unused = + DVA_GET_ASIZE(&last_entry->vimep_dst); ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); } diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index b159013263..e9156c32f3 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -20,18 +20,18 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. */ #include #include #include #include -#include #include #include #include #include +#include /* * Value that is written to disk during initialization. 
@@ -46,7 +46,7 @@ unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL; int zfs_initialize_limit = 1; /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ -uint64_t zfs_initialize_chunk_size = 1024 * 1024; +unsigned long zfs_initialize_chunk_size = 1024 * 1024; static boolean_t vdev_initialize_should_stop(vdev_t *vd) @@ -121,12 +121,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { vd->vdev_initialize_action_time = gethrestime_sec(); } + + vdev_initializing_state_t old_state = vd->vdev_initialize_state; vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_NONE, tx); + guid, tx); switch (new_state) { case VDEV_INITIALIZE_ACTIVE: @@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) "vdev=%s suspended", vd->vdev_path); break; case VDEV_INITIALIZE_CANCELED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_INITIALIZE_ACTIVE || + old_state == VDEV_INITIALIZE_SUSPENDED) + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); break; case VDEV_INITIALIZE_COMPLETE: spa_history_log_internal(spa, "initialize", tx, @@ -150,6 +154,9 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) } dmu_tx_commit(tx); + + if (new_state != VDEV_INITIALIZE_ACTIVE) + spa_notify_waiters(spa); } static void @@ -213,8 +220,7 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) /* This is the first write of this txg. 
*/ dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_initialize_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + vdev_initialize_zap_update_sync, guid, tx); } /* @@ -288,11 +294,13 @@ vdev_initialize_block_free(abd_t *data) static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { - avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + range_tree_t *rt = vd->vdev_initialize_tree; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; + rs = zfs_btree_next(bt, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); /* Split range into legally-sized physical chunks */ uint64_t writes_required = @@ -302,7 +310,7 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) int error; error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + rs->rs_start + + VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); @@ -313,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) return (0); } +static void +vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end = physical_rs->rs_end; +} + +static void +vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_initialize_bytes_est += size; + + if (vd->vdev_initialize_last_offset > physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start && + vd->vdev_initialize_last_offset < physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - 
physical_rs->rs_start; + } +} + static void vdev_initialize_calculate_progress(vdev_t *vd) { @@ -327,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been initialized */ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been initialized */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_initialize_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_initialize_last_offset > last_rs_end) { vd->vdev_initialize_bytes_done += ms_free; vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -362,26 +403,17 @@ vdev_initialize_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; - vdev_xlate(vd, &logical_rs, &physical_rs); + zfs_btree_index_t where; + 
range_tree_t *rt = msp->ms_allocatable; + for (range_seg_t *rs = + zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, + &where)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_initialize_bytes_est += size; - if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += size; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_start && - vd->vdev_initialize_last_offset < - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += - vd->vdev_initialize_last_offset - - physical_rs.rs_start; - } + vdev_xlate_walk(vd, &logical_rs, + vdev_initialize_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -411,55 +443,48 @@ vdev_initialize_load(vdev_t *vd) return (err); } +static void +vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = arg; + + /* Only add segments that we have not visited yet */ + if (physical_rs->rs_end <= vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs->rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs->rs_start, + (u_longlong_t)physical_rs->rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs->rs_end); + ASSERT3U(physical_rs->rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs->rs_start = vd->vdev_initialize_last_offset; + } + + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + /* * Convert the logical range into a physical range and add it to our * avl tree. 
*/ -void +static void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) - return; - - /* Pick up where we left off mid-range. */ - if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { - zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " - "(%llu, %llu)", vd->vdev_path, - (u_longlong_t)physical_rs.rs_start, - (u_longlong_t)physical_rs.rs_end, - (u_longlong_t)vd->vdev_initialize_last_offset, - (u_longlong_t)physical_rs.rs_end); - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_initialize_last_offset); - physical_rs.rs_start = vd->vdev_initialize_last_offset; - } - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. 
- */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); } static void @@ -478,11 +503,13 @@ vdev_initialize_thread(void *arg) abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + boolean_t unload_when_done = B_FALSE; /* * If we've expanded the top-level vdev or it's our @@ -496,6 +523,8 @@ vdev_initialize_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); + if (!msp->ms_loaded && !msp->ms_loading) + unload_when_done = B_TRUE; VERIFY0(metaslab_load(msp)); range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, @@ -503,7 +532,7 @@ vdev_initialize_thread(void *arg) mutex_exit(&msp->ms_lock); error = vdev_initialize_ranges(vd, deadbeef); - metaslab_enable(msp, B_TRUE); + metaslab_enable(msp, B_TRUE, unload_when_done); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); @@ -524,8 +553,14 @@ vdev_initialize_thread(void *arg) vd->vdev_initialize_tree = NULL; mutex_enter(&vd->vdev_initialize_lock); - if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { - vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); + if (!vd->vdev_initialize_exit_wanted) { + if (vdev_writeable(vd)) { + vdev_initialize_change_state(vd, + VDEV_INITIALIZE_COMPLETE); + } else if (vd->vdev_faulted) { + vdev_initialize_change_state(vd, + VDEV_INITIALIZE_CANCELED); + } } ASSERT(vd->vdev_initialize_thread != NULL || vd->vdev_initialize_inflight == 0); @@ 
-544,6 +579,8 @@ vdev_initialize_thread(void *arg) vd->vdev_initialize_thread = NULL; cv_broadcast(&vd->vdev_initialize_cv); mutex_exit(&vd->vdev_initialize_lock); + + thread_exit(); } /* @@ -599,7 +636,7 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) } /* - * Stop initializing a device, with the resultant initialing state being + * Stop initializing a device, with the resultant initializing state being * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when * a list_t is provided the stopping vdev is inserted in to the list. Callers * are then required to call vdev_initialize_stop_wait() to block for all the @@ -699,7 +736,7 @@ vdev_initialize_restart(vdev_t *vd) vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (timestamp), 1, ×tamp); ASSERT(err == 0 || err == ENOENT); - vd->vdev_initialize_action_time = (time_t)timestamp; + vd->vdev_initialize_action_time = timestamp; if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || vd->vdev_offline) { @@ -720,15 +757,16 @@ vdev_initialize_restart(vdev_t *vd) } } -#if defined(_KERNEL) EXPORT_SYMBOL(vdev_initialize); EXPORT_SYMBOL(vdev_initialize_stop); EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); EXPORT_SYMBOL(vdev_initialize_restart); -/* CSTYLED */ -module_param(zfs_initialize_value, ulong, 0644); -MODULE_PARM_DESC(zfs_initialize_value, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW, "Value written during zpool initialize"); -#endif + +ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW, + "Size in bytes of writes by zpool initialize"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a0e373b3df..f03ae0873f 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -21,8 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -143,6 +142,7 @@ #include #include #include +#include #include #include #include @@ -150,6 +150,8 @@ #include #include #include +#include +#include /* * Basic routines to read and write from a vdev label. @@ -254,6 +256,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]); + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_REBUILD]); + /* ZIOs pending */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); @@ -273,6 +278,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]); + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_REBUILD]); + /* Histograms */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_READ], @@ -314,6 +322,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM])); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD])); + /* Request sizes */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], @@ -339,6 +351,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM])); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD], + 
ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD])); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); @@ -363,6 +379,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM])); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD])); + /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); @@ -405,6 +425,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) } } +static void +top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + if (vd == vd->vdev_top) { + vdev_rebuild_stat_t vrs; + if (vdev_rebuild_get_stats(vd, &vrs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, + sizeof (vrs) / sizeof (uint64_t)); + } + } +} + /* * Generate the nvlist representing this vdev's config. */ @@ -439,31 +472,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + if (vd->vdev_ops->vdev_op_config_generate != NULL) + vd->vdev_ops->vdev_op_config_generate(vd, nv); - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. 
- */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) + if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); + } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); @@ -560,6 +575,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); + top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context @@ -598,7 +614,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * as a single mapping. */ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - if (1ULL << (i + 1) < vdev_removal_max_span) { + if (i + 1 < highbit64(vdev_removal_max_span) + - 1) { to_alloc += vd->vdev_mg->mg_histogram[i] << (i + 1); @@ -613,7 +630,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. 
*/ - seg_count += to_alloc / zfs_remove_max_segment; + seg_count += to_alloc / spa_remove_max_segment(spa); fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * @@ -664,6 +681,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); + if (vd->vdev_rebuild_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) @@ -752,35 +772,47 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; - vdev_phys_t *vp; - abd_t *vp_abd; - zio_t *zio; + vdev_phys_t *vp[VDEV_LABELS]; + abd_t *vp_abd[VDEV_LABELS]; + zio_t *zio[VDEV_LABELS]; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd->vdev_validate_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - vp = abd_to_buf(vp_abd); + /* + * The label for a dRAID distributed spare is not stored on disk. + * Instead it is generated when needed which allows us to bypass + * the pipeline when reading the config from the label. 
+ */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_read_config_spare(vd)); + + for (int l = 0; l < VDEV_LABELS; l++) { + vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp[l] = abd_to_buf(vp_abd[l]); + } retry: + for (int l = 0; l < VDEV_LABELS; l++) { + zio[l] = zio_root(spa, NULL, NULL, flags); + + vdev_label_read(zio[l], vd, l, vp_abd[l], + offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), + NULL, NULL, flags); + } for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *label = NULL; - zio = zio_root(spa, NULL, NULL, flags); - - vdev_label_read(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL, flags); - - if (zio_wait(zio) == 0 && - nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), + if (zio_wait(zio[l]) == 0 && + nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist), &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their @@ -793,6 +825,8 @@ retry: ZPOOL_CONFIG_POOL_TXG, &label_txg); if ((error || label_txg == 0) && !config) { config = label; + for (l++; l < VDEV_LABELS; l++) + zio_wait(zio[l]); break; } else if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; @@ -821,7 +855,9 @@ retry: (u_longlong_t)txg); } - abd_free(vp_abd); + for (int l = 0; l < VDEV_LABELS; l++) { + abd_free(vp_abd[l]); + } return (config); } @@ -933,7 +969,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (spa = spa_by_guid(pool_guid, device_guid)) != NULL && - spa_mode(spa) == FREAD) + spa_mode(spa) == SPA_MODE_READ) state = POOL_STATE_ACTIVE; /* @@ -958,7 +994,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; - abd_t *pad2; + abd_t *bootenv; uberblock_t *ub; abd_t *ub_abd; zio_t *zio; @@ -1119,8 +1155,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ub->ub_txg = 0; /* Initialize the 
2nd padding area. */ - pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); + bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(bootenv, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -1139,8 +1175,8 @@ retry: * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ - vdev_label_write(zio, vd, l, pad2, - offsetof(vdev_label_t, vl_pad2), + vdev_label_write(zio, vd, l, bootenv, + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); vdev_label_write(zio, vd, l, ub_abd, @@ -1156,7 +1192,7 @@ retry: } nvlist_free(label); - abd_free(pad2); + abd_free(bootenv); abd_free(ub_abd); abd_free(vp_abd); @@ -1179,6 +1215,212 @@ retry: return (error); } +/* + * Done callback for vdev_label_read_bootenv_impl. If this is the first + * callback to finish, store our abd in the callback pointer. Otherwise, we + * just free our abd and return. + */ +static void +vdev_label_read_bootenv_done(zio_t *zio) +{ + zio_t *rio = zio->io_private; + abd_t **cbp = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); + + if (zio->io_error == 0) { + mutex_enter(&rio->io_lock); + if (*cbp == NULL) { + /* Will free this buffer in vdev_label_read_bootenv. */ + *cbp = zio->io_abd; + } else { + abd_free(zio->io_abd); + } + mutex_exit(&rio->io_lock); + } else { + abd_free(zio->io_abd); + } +} + +static void +vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); + + /* + * We just use the first label that has a correct checksum; the + * bootloader should have rewritten them all to be the same on boot, + * and any changes we made since boot have been the same across all + * labels. 
+ */ + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS; l++) { + vdev_label_read(zio, vd, l, + abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, + vdev_label_read_bootenv_done, zio, flags); + } + } +} + +int +vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) +{ + nvlist_t *config; + spa_t *spa = rvd->vdev_spa; + abd_t *abd = NULL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(bootenv); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + zio_t *zio = zio_root(spa, NULL, &abd, flags); + vdev_label_read_bootenv_impl(zio, rvd, flags); + int err = zio_wait(zio); + + if (abd != NULL) { + char *buf; + vdev_boot_envblock_t *vbe = abd_to_buf(abd); + + vbe->vbe_version = ntohll(vbe->vbe_version); + switch (vbe->vbe_version) { + case VB_RAW: + /* + * if we have textual data in vbe_bootenv, create nvlist + * with key "envmap". + */ + fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW); + vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; + fnvlist_add_string(bootenv, GRUB_ENVMAP, + vbe->vbe_bootenv); + break; + + case VB_NVLIST: + err = nvlist_unpack(vbe->vbe_bootenv, + sizeof (vbe->vbe_bootenv), &config, 0); + if (err == 0) { + fnvlist_merge(bootenv, config); + nvlist_free(config); + break; + } + fallthrough; + default: + /* Check for FreeBSD zfs bootonce command string */ + buf = abd_to_buf(abd); + if (*buf == '\0') { + fnvlist_add_uint64(bootenv, BOOTENV_VERSION, + VB_NVLIST); + break; + } + fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf); + } + + /* + * abd was allocated in vdev_label_read_bootenv_impl() + */ + abd_free(abd); + /* + * If we managed to read any successfully, + * return success. 
+ */ + return (0); + } + return (err); +} + +int +vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) +{ + zio_t *zio; + spa_t *spa = vd->vdev_spa; + vdev_boot_envblock_t *bootenv; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error; + size_t nvsize; + char *nvbuf; + + error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); + if (error != 0) + return (SET_ERROR(error)); + + if (nvsize >= sizeof (bootenv->vbe_bootenv)) { + return (SET_ERROR(E2BIG)); + } + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + error = ENXIO; + for (int c = 0; c < vd->vdev_children; c++) { + int child_err; + + child_err = vdev_label_write_bootenv(vd->vdev_child[c], env); + /* + * As long as any of the disks managed to write all of their + * labels successfully, return success. + */ + if (child_err == 0) + error = child_err; + } + + if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || + !vdev_writeable(vd)) { + return (error); + } + ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); + abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(abd, VDEV_PAD_SIZE); + + bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); + nvbuf = bootenv->vbe_bootenv; + nvsize = sizeof (bootenv->vbe_bootenv); + + bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); + switch (bootenv->vbe_version) { + case VB_RAW: + if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) { + (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize); + } + error = 0; + break; + + case VB_NVLIST: + error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR, + KM_SLEEP); + break; + + default: + error = EINVAL; + break; + } + + if (error == 0) { + bootenv->vbe_version = htonll(bootenv->vbe_version); + abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + } else { + abd_free(abd); + return (SET_ERROR(error)); + } + +retry: + zio = zio_root(spa, NULL, NULL, flags); + for (int l = 0; l < VDEV_LABELS; l++) { + vdev_label_write(zio, vd, l, abd, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, 
NULL, flags); + } + + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(abd); + return (error); +} + /* * ========================================================================== * uberblock load/sync @@ -1198,18 +1440,18 @@ retry: static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); /* * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware - * ZFS, e.g. zfsonlinux >= 0.7. + * ZFS, e.g. OpenZFS >= 0.7. * * If one ub has MMP and the other does not, they were written by * different hosts, which matters for MMP. So we treat no MMP/no SEQ as @@ -1227,7 +1469,7 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) seq2 = MMP_SEQ(ub2); - return (AVL_CMP(seq1, seq2)); + return (TREE_CMP(seq1, seq2)); } struct ubl_cbdata { @@ -1272,7 +1514,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1361,6 +1604,13 @@ vdev_copy_uberblocks(vdev_t *vd) SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); + /* + * No uberblocks are stored on distributed spares, they may be + * safely skipped when expanding a leaf vdev. 
+ */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1422,6 +1672,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * There's no need to write uberblocks to a distributed spare, they + * are already stored on all the leaves of the parent dRAID. For + * this same reason vdev_uberblock_load_impl() skips distributed + * spares when reading uberblocks. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { @@ -1447,7 +1706,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -int +static int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; @@ -1538,6 +1797,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * The top-level config never needs to be written to a distributed + * spare. When read vdev_dspare_label_read_config() will generate + * the config for the vdev_label_read_config(). + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* * Generate a label describing the top-level config to which we belong. */ @@ -1564,7 +1831,7 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, nvlist_free(label); } -int +static int vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) { list_t *dl = &spa->spa_config_dirty_list; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 59cc2dcdd2..5eb3310469 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void) /* * Virtual device vector for mirroring. 
*/ - typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; @@ -108,6 +108,7 @@ typedef struct mirror_child { uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; + uint8_t mc_rebuilding; } mirror_child_t; typedef struct mirror_map { @@ -115,6 +116,7 @@ typedef struct mirror_map { int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; + boolean_t mm_rebuilding; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; @@ -172,7 +174,6 @@ vdev_mirror_map_free(zio_t *zio) static const zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, - .vsd_cksum_report = zio_vsd_default_cksum_report }; static int @@ -239,6 +240,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) return (load + zfs_vdev_mirror_rotating_seek_inc); } +static boolean_t +vdev_mirror_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_mirror_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this functions only caller, as small as possible on the stack. @@ -282,10 +298,11 @@ vdev_mirror_map_init(zio_t *zio) } /* - * If we do not trust the pool config, some DVAs might be - * invalid or point to vdevs that do not exist. We skip them. + * If the pool cannot be written to, then infer that some + * DVAs might be invalid or point to vdevs that do not exist. + * We skip them. 
*/ - if (!spa_trust_config(spa)) { + if (!spa_writeable(spa)) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); int j = 0; for (int i = 0; i < c; i++) { @@ -309,6 +326,13 @@ vdev_mirror_map_init(zio_t *zio) mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); + if (mc->mc_vd == NULL) { + kmem_free(mm, vdev_mirror_map_size( + mm->mm_children)); + zio->io_vsd = NULL; + zio->io_error = ENXIO; + return (NULL); + } } } else { /* @@ -348,17 +372,18 @@ vdev_mirror_map_init(zio_t *zio) mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (vdev_mirror_rebuilding(mc->mc_vd)) + mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE; } } - zio->io_vsd = mm; - zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); } static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { int numerrors = 0; int lasterror = 0; @@ -381,7 +406,9 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { @@ -469,7 +496,7 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) int p; if (mm->mm_root) { - p = spa_get_random(mm->mm_preferred_cnt); + p = random_in_range(mm->mm_preferred_cnt); return (vdev_mirror_dva_select(zio, p)); } @@ -483,12 +510,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_readable(vd, mc->mc_offset)); + else + return (vdev_readable(vd)); +} + +static 
boolean_t +vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); + else + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read - * prefering vdevs based on determined load. + * preferring vdevs based on determined load. If we can't, try the read on + * any vdev we haven't already tried. * - * Try to find a child whose DTL doesn't contain the block we want to read. - * If we can't, try the read on any vdev we haven't already tried. + * Distributed spares are an exception to the above load rule. They are + * always preferred in order to detect gaps in the distributed spare which + * are created when another disk in the dRAID fails. In order to restore + * redundancy those gaps must be read to trigger the required repair IO. 
*/ static int vdev_mirror_child_select(zio_t *zio) @@ -508,20 +560,27 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } + if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) { + mm->mm_preferred[0] = c; + mm->mm_preferred_cnt = 1; + break; + } + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; @@ -567,6 +626,8 @@ vdev_mirror_io_start(zio_t *zio) int c, children; mm = vdev_mirror_map_init(zio); + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); @@ -588,6 +649,15 @@ vdev_mirror_io_start(zio_t *zio) */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; + + /* Don't issue ZIOs to offline children */ + if (!vdev_mirror_child_readable(mc)) { + mc->mc_error = SET_ERROR(ENXIO); + mc->mc_tried = 1; + mc->mc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, abd_alloc_sametype(zio->io_abd, @@ -615,11 +685,25 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; + c++; + + /* + * When sequentially resilvering only issue write repair + * IOs to the vdev which is being rebuilt since performance + * is limited by the slowest child. This is an issue for + * faster replacement devices such as distributed spares. 
+ */ + if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && + (zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SCRUB) && + mm->mm_rebuilding && !mc->mc_rebuilding) { + continue; + } + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); - c++; } zio_execute(zio); @@ -734,6 +818,8 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_error == 0) { + vdev_ops_t *ops = mc->mc_vd->vdev_ops; + if (mc->mc_tried) continue; /* @@ -742,15 +828,16 @@ vdev_mirror_io_done(zio_t *zio) * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - - * 2. it's an indirect vdev (in which case - * it could point to any other vdev, which - * might have a bad DTL) + * 2. it's an indirect or distributed spare + * vdev (in which case it could point to any + * other vdev, which might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - mc->mc_vd->vdev_ops != &vdev_indirect_ops && + ops != &vdev_indirect_ops && + ops != &vdev_draid_spare_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; @@ -759,8 +846,9 @@ vdev_mirror_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_abd, zio->io_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } @@ -785,76 +873,108 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } } +/* + * Return the maximum asize for a rebuild zio in the provided range. 
+ */ +static uint64_t +vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), + SPA_MAXBLOCKSIZE); + + return (MIN(asize, vdev_psize_to_asize(vd, psize))); +} + vdev_ops_t vdev_mirror_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_MIRROR, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = vdev_default_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_REPLACING, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_mirror_io_start, 
+ .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = vdev_default_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_SPARE, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = vdev_default_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; -#if defined(_KERNEL) /* BEGIN CSTYLED */ -module_param(zfs_vdev_mirror_rotating_inc, int, 0644); -MODULE_PARM_DESC(zfs_vdev_mirror_rotating_inc, +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW, "Rotating media load increment for non-seeking I/O's"); 
-module_param(zfs_vdev_mirror_rotating_seek_inc, int, 0644); -MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_inc, +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW, "Rotating media load increment for seeking I/O's"); -module_param(zfs_vdev_mirror_rotating_seek_offset, int, 0644); +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW, + "Offset in bytes from the last I/O which triggers " + "a reduced rotating media seek increment"); -MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_offset, - "Offset in bytes from the last I/O which " - "triggers a reduced rotating media seek increment"); - -module_param(zfs_vdev_mirror_non_rotating_inc, int, 0644); -MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_inc, +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW, "Non-rotating media load increment for non-seeking I/O's"); -module_param(zfs_vdev_mirror_non_rotating_seek_inc, int, 0644); -MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_seek_inc, +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW, "Non-rotating media load increment for seeking I/O's"); /* END CSTYLED */ -#endif diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index d85993bff0..e9145fd012 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -45,7 +45,7 @@ /* ARGSUSED */ static int vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *ashift, uint64_t *pshift) { /* * Really this should just fail. 
But then the root vdev will be in the @@ -56,6 +56,7 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *psize = 0; *max_psize = 0; *ashift = 0; + *pshift = 0; return (0); } @@ -80,33 +81,51 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_MISSING, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_HOLE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, 
+ .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e74df76b75..cc5b15b8c0 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -35,8 +35,6 @@ #include #include #include -#include -#include #include /* @@ -121,16 +119,17 @@ /* * The maximum number of i/os active to each device. Ideally, this will be >= - * the sum of each queue's max_active. It must be at least the sum of each - * queue's min_active. + * the sum of each queue's max_active. */ uint32_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the * number of active i/os is < zfs_vdev_max_active, then the min_active comes - * into play. We will send min_active from each queue, and then select from - * queues in the order defined by zio_priority_t. + * into play. We will send min_active from each queue round-robin, and then + * send from queues in the order defined by zio_priority_t up to max_active. + * Some queues have additional mechanisms to limit number of active I/Os in + * addition to min_active and max_active, see below. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. 
Larger max_active's may lead to higher overall throughput, @@ -151,13 +150,15 @@ uint32_t zfs_vdev_async_read_max_active = 3; uint32_t zfs_vdev_async_write_min_active = 2; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; -uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_scrub_max_active = 3; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; uint32_t zfs_vdev_initializing_min_active = 1; uint32_t zfs_vdev_initializing_max_active = 1; uint32_t zfs_vdev_trim_min_active = 1; uint32_t zfs_vdev_trim_max_active = 2; +uint32_t zfs_vdev_rebuild_min_active = 1; +uint32_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -169,6 +170,28 @@ uint32_t zfs_vdev_trim_max_active = 2; int zfs_vdev_async_write_active_min_dirty_percent = 30; int zfs_vdev_async_write_active_max_dirty_percent = 60; +/* + * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), + * the number of concurrently-active I/O's is limited to *_min_active, unless + * the vdev is "idle". When there are no interactive I/Os active (sync or + * async), and zfs_vdev_nia_delay I/Os have completed since the last + * interactive I/O, then the vdev is considered to be "idle", and the number + * of concurrently-active non-interactive I/O's is increased to *_max_active. + */ +uint_t zfs_vdev_nia_delay = 5; + +/* + * Some HDDs tend to prioritize sequential I/O so high that concurrent + * random I/O latency reaches several seconds. On some HDDs it happens + * even if sequential I/Os are submitted one at a time, and so setting + * *_max_active to 1 does not help. To prevent non-interactive I/Os, like + * scrub, from monopolizing the device no more than zfs_vdev_nia_credit + * I/Os can be sent while there are outstanding incomplete interactive + * I/Os. This enforced wait ensures the HDD services the interactive I/O + * within a reasonable amount of time. 
+ */ +uint_t zfs_vdev_nia_credit = 5; + /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes @@ -212,18 +235,18 @@ int zfs_vdev_def_queue_depth = 32; */ int zfs_vdev_aggregate_trim = 0; -int +static int vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_offset, z2->io_offset); + int cmp = TREE_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } static inline avl_tree_t * @@ -244,22 +267,22 @@ vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) return (&vq->vq_trim_offset_tree); } -int +static int vdev_queue_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp); + int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } static int -vdev_queue_class_min_active(zio_priority_t p) +vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -271,13 +294,19 @@ vdev_queue_class_min_active(zio_priority_t p) case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: - return (zfs_vdev_scrub_min_active); + return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); case ZIO_PRIORITY_REMOVAL: - return (zfs_vdev_removal_min_active); + return (vq->vq_ia_active == 0 ? 
zfs_vdev_removal_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); case ZIO_PRIORITY_INITIALIZING: - return (zfs_vdev_initializing_min_active); + return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active: + MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); + case ZIO_PRIORITY_REBUILD: + return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); default: panic("invalid priority %u", p); return (0); @@ -307,14 +336,12 @@ vdev_queue_max_async_writes(spa_t *spa) * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. */ - if (spa_has_pending_synctask(spa)) + dirty = dp->dp_dirty_total; + if (dirty > max_bytes || spa_has_pending_synctask(spa)) return (zfs_vdev_async_write_max_active); - dirty = dp->dp_dirty_total; if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); - if (dirty > max_bytes) - return (zfs_vdev_async_write_max_active); /* * linear interpolation: @@ -333,7 +360,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static int -vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) +vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -345,13 +372,35 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_scrub_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_scrub_min_active)); return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_removal_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_removal_min_active)); 
return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_initializing_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_initializing_min_active)); return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); + case ZIO_PRIORITY_REBUILD: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_rebuild_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_rebuild_min_active)); + return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); return (0); @@ -366,17 +415,24 @@ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p; + zio_priority_t p, n; if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); - /* find a queue that has not reached its minimum # outstanding i/os */ - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + /* + * Find a queue that has not reached its minimum # outstanding i/os. + * Do round-robin to reduce starvation due to zfs_vdev_max_active + * and vq_nia_credit limits. 
+ */ + for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { + p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(p)) + vdev_queue_class_min_active(vq, p)) { + vq->vq_last_prio = p; return (p); + } } /* @@ -386,8 +442,10 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, p)) + vdev_queue_class_max_active(spa, vq, p)) { + vq->vq_last_prio = p; return (p); + } } /* No eligible queued i/os */ @@ -456,94 +514,67 @@ vdev_queue_fini(vdev_t *vd) static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); - - if (shk->kstat != NULL) { - mutex_enter(&shk->lock); - kstat_waitq_enter(shk->kstat->ks_data); - mutex_exit(&shk->lock); - } } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); +} - if (shk->kstat != NULL) { - mutex_enter(&shk->lock); - kstat_waitq_exit(shk->kstat->ks_data); - mutex_exit(&shk->lock); +static boolean_t +vdev_queue_is_interactive(zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SCRUB: + case ZIO_PRIORITY_REMOVAL: + case ZIO_PRIORITY_INITIALIZING: + case ZIO_PRIORITY_REBUILD: + return (B_FALSE); + default: + return (B_TRUE); } } static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - 
spa_history_kstat_t *shk = &spa->spa_stats.io_history; - ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; - avl_add(&vq->vq_active_tree, zio); - - if (shk->kstat != NULL) { - mutex_enter(&shk->lock); - kstat_runq_enter(shk->kstat->ks_data); - mutex_exit(&shk->lock); + if (vdev_queue_is_interactive(zio->io_priority)) { + if (++vq->vq_ia_active == 1) + vq->vq_nia_credit = 1; + } else if (vq->vq_ia_active > 0) { + vq->vq_nia_credit--; } + avl_add(&vq->vq_active_tree, zio); } static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; + if (vdev_queue_is_interactive(zio->io_priority)) { + if (--vq->vq_ia_active == 0) + vq->vq_nia_credit = 0; + else + vq->vq_nia_credit = zfs_vdev_nia_credit; + } else if (vq->vq_ia_active == 0) + vq->vq_nia_credit++; avl_remove(&vq->vq_active_tree, zio); - - if (shk->kstat != NULL) { - kstat_io_t *ksio = shk->kstat->ks_data; - - mutex_enter(&shk->lock); - kstat_runq_exit(ksio); - if (zio->io_type == ZIO_TYPE_READ) { - ksio->reads++; - ksio->nread += zio->io_size; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - ksio->writes++; - ksio->nwritten += zio->io_size; - } - mutex_exit(&shk->lock); - } } static void vdev_queue_agg_io_done(zio_t *aio) { - if (aio->io_type == ZIO_TYPE_READ) { - zio_t *pio; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - abd_copy_off(pio->io_abd, aio->io_abd, - 0, pio->io_offset - aio->io_offset, pio->io_size); - } - } - abd_free(aio->io_abd); } @@ -556,11 +587,18 @@ vdev_queue_agg_io_done(zio_t *aio) #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) +/* + * Sufficiently adjacent 
io_offset's in ZIOs will be aggregated. We do this + * by creating a gang ABD from the adjacent ZIOs io_abd's. By using + * a gang ABD we avoid doing memory copies to and from the parent, + * child ZIOs. The gang ABD also accounts for gaps between adjacent + * io_offsets by simply getting the zero ABD for writes or allocating + * a new ABD for reads and placing them in the gang ABD as well. + */ static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; - zio_link_t *zl = NULL; uint64_t maxgap = 0; uint64_t size; uint64_t limit; @@ -568,6 +606,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + uint64_t next_offset; abd_t *abd; maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); @@ -587,6 +626,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) return (NULL); + /* + * I/Os to distributed spares are directly dispatched to the dRAID + * leaf vdevs for aggregation. See the comment at the end of the + * zio_vdev_io_start() function. 
+ */ + ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); + first = last = zio; if (zio->io_type == ZIO_TYPE_READ) @@ -695,7 +741,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) size = IO_SPAN(first, last); ASSERT3U(size, <=, maxblocksize); - abd = abd_alloc_for_io(size, B_TRUE); + abd = abd_alloc_gang(); if (abd == NULL) return (NULL); @@ -706,35 +752,56 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio->io_timestamp = first->io_timestamp; nio = first; + next_offset = first->io_offset; do { dio = nio; nio = AVL_NEXT(t, dio); - ASSERT3U(dio->io_type, ==, aio->io_type); - - if (dio->io_flags & ZIO_FLAG_NODATA) { - ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - abd_zero_off(aio->io_abd, - dio->io_offset - aio->io_offset, dio->io_size); - } else if (dio->io_type == ZIO_TYPE_WRITE) { - abd_copy_off(aio->io_abd, dio->io_abd, - dio->io_offset - aio->io_offset, 0, dio->io_size); - } - zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); + + if (dio->io_offset != next_offset) { + /* allocate a buffer for a read gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); + ASSERT3U(dio->io_offset, >, next_offset); + abd = abd_alloc_for_io( + dio->io_offset - next_offset, B_TRUE); + abd_gang_add(aio->io_abd, abd, B_TRUE); + } + if (dio->io_abd && + (dio->io_size != abd_get_size(dio->io_abd))) { + /* abd size not the same as IO size */ + ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size); + abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); + abd_gang_add(aio->io_abd, abd, B_TRUE); + } else { + if (dio->io_flags & ZIO_FLAG_NODATA) { + /* allocate a buffer for a write gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3P(dio->io_abd, ==, NULL); + abd_gang_add(aio->io_abd, + abd_get_zeros(dio->io_size), B_TRUE); + } else { + /* + * We pass B_FALSE to abd_gang_add() + * because we did not allocate a new + * ABD, so it is assumed the caller + * will free this ABD. 
+ */ + abd_gang_add(aio->io_abd, dio->io_abd, + B_FALSE); + } + } + next_offset = dio->io_offset + dio->io_size; } while (dio != last); + ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size); /* - * We need to drop the vdev queue's lock to avoid a deadlock that we - * could encounter since this I/O will complete immediately. + * Callers must call zio_vdev_io_bypass() and zio_execute() for + * aggregated (parent) I/Os so that we could avoid dropping the + * queue's lock here to avoid a deadlock that we could encounter + * due to lock order reversal between vq_lock and io_lock in + * zio_change_priority(). */ - mutex_exit(&vq->vq_lock); - while ((dio = zio_walk_parents(aio, &zl)) != NULL) { - zio_vdev_io_bypass(dio); - zio_execute(dio); - } - mutex_enter(&vq->vq_lock); - return (aio); } @@ -772,23 +839,24 @@ again: ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); - if (aio != NULL) + if (aio != NULL) { zio = aio; - else + } else { vdev_queue_io_remove(vq, zio); - /* - * If the I/O is or was optional and therefore has no data, we need to - * simply discard it. We need to drop the vdev queue's lock to avoid a - * deadlock that we could encounter since this I/O will complete - * immediately. - */ - if (zio->io_flags & ZIO_FLAG_NODATA) { - mutex_exit(&vq->vq_lock); - zio_vdev_io_bypass(zio); - zio_execute(zio); - mutex_enter(&vq->vq_lock); - goto again; + /* + * If the I/O is or was optional and therefore has no data, we + * need to simply discard it. We need to drop the vdev queue's + * lock to avoid a deadlock that we could encounter since this + * I/O will complete immediately. 
+ */ + if (zio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(zio); + zio_execute(zio); + mutex_enter(&vq->vq_lock); + goto again; + } } vdev_queue_pending_add(vq, zio); @@ -801,7 +869,8 @@ zio_t * vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; + zio_t *dio, *nio; + zio_link_t *zl = NULL; if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); @@ -817,7 +886,8 @@ vdev_queue_io(zio_t *zio) zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } } else if (zio->io_type == ZIO_TYPE_WRITE) { @@ -826,7 +896,8 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } } else { @@ -835,9 +906,9 @@ vdev_queue_io(zio_t *zio) } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); - zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); @@ -846,6 +917,11 @@ vdev_queue_io(zio_t *zio) return (NULL); if (nio->io_done == vdev_queue_agg_io_done) { + while ((dio = zio_walk_parents(nio, &zl)) != NULL) { + ASSERT3U(dio->io_type, ==, nio->io_type); + zio_vdev_io_bypass(dio); + zio_execute(dio); + } zio_nowait(nio); return (NULL); } @@ -857,19 +933,24 @@ void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; + zio_t *dio, *nio; + zio_link_t *zl = NULL; + + hrtime_t now = 
gethrtime(); + vq->vq_io_complete_ts = now; + vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; mutex_enter(&vq->vq_lock); - vdev_queue_pending_remove(vq, zio); - zio->io_delta = gethrtime() - zio->io_timestamp; - vq->vq_io_complete_ts = gethrtime(); - vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; - while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { + while ((dio = zio_walk_parents(nio, &zl)) != NULL) { + ASSERT3U(dio->io_type, ==, nio->io_type); + zio_vdev_io_bypass(dio); + zio_execute(dio); + } zio_nowait(nio); } else { zio_vdev_io_reissue(nio); @@ -891,7 +972,7 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio * code to issue IOs without adding them to the vdev queue. In this * case, the zio is already going to be issued as quickly as possible - * and so it doesn't need any reprioitization to help. + * and so it doesn't need any reprioritization to help. 
*/ if (zio->io_priority == ZIO_PRIORITY_NOW) return; @@ -950,99 +1031,91 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } -#if defined(_KERNEL) -module_param(zfs_vdev_aggregation_limit, int, 0644); -MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size"); +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW, + "Max vdev I/O aggregation size"); -module_param(zfs_vdev_aggregation_limit_non_rotating, int, 0644); -MODULE_PARM_DESC(zfs_vdev_aggregation_limit_non_rotating, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -module_param(zfs_vdev_aggregate_trim, int, 0644); -MODULE_PARM_DESC(zfs_vdev_aggregate_trim, "Allow TRIM I/O to be aggregated"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW, + "Allow TRIM I/O to be aggregated"); -module_param(zfs_vdev_read_gap_limit, int, 0644); -MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW, + "Aggregate read I/O over gap"); -module_param(zfs_vdev_write_gap_limit, int, 0644); -MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW, + "Aggregate write I/O over gap"); -module_param(zfs_vdev_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_max_active, "Maximum number of active I/Os per vdev"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW, + "Maximum number of active I/Os per vdev"); -module_param(zfs_vdev_async_write_active_max_dirty_percent, int, 0644); -MODULE_PARM_DESC(zfs_vdev_async_write_active_max_dirty_percent, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW, "Async write concurrency max threshold"); -module_param(zfs_vdev_async_write_active_min_dirty_percent, int, 0644); 
-MODULE_PARM_DESC(zfs_vdev_async_write_active_min_dirty_percent, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW, "Async write concurrency min threshold"); -module_param(zfs_vdev_async_read_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_async_read_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW, "Max active async read I/Os per vdev"); -module_param(zfs_vdev_async_read_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_async_read_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW, "Min active async read I/Os per vdev"); -module_param(zfs_vdev_async_write_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_async_write_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW, "Max active async write I/Os per vdev"); -module_param(zfs_vdev_async_write_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_async_write_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW, "Min active async write I/Os per vdev"); -module_param(zfs_vdev_initializing_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_initializing_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW, "Max active initializing I/Os per vdev"); -module_param(zfs_vdev_initializing_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_initializing_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW, "Min active initializing I/Os per vdev"); -module_param(zfs_vdev_removal_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_removal_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW, "Max active removal I/Os per vdev"); -module_param(zfs_vdev_removal_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_removal_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW, "Min active removal I/Os per vdev"); 
-module_param(zfs_vdev_scrub_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_scrub_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW, "Max active scrub I/Os per vdev"); -module_param(zfs_vdev_scrub_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_scrub_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW, "Min active scrub I/Os per vdev"); -module_param(zfs_vdev_sync_read_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_sync_read_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW, "Max active sync read I/Os per vdev"); -module_param(zfs_vdev_sync_read_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_sync_read_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW, "Min active sync read I/Os per vdev"); -module_param(zfs_vdev_sync_write_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_sync_write_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW, "Max active sync write I/Os per vdev"); -module_param(zfs_vdev_sync_write_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_sync_write_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW, "Min active sync write I/Os per vdev"); -module_param(zfs_vdev_trim_max_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_trim_max_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, "Max active trim/discard I/Os per vdev"); -module_param(zfs_vdev_trim_min_active, int, 0644); -MODULE_PARM_DESC(zfs_vdev_trim_min_active, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); -module_param(zfs_vdev_queue_depth_pct, int, 0644); -MODULE_PARM_DESC(zfs_vdev_queue_depth_pct, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, + "Max active rebuild I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, + 
"Min active rebuild I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW, + "Number of non-interactive I/Os to allow in sequence"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW, + "Number of non-interactive I/Os before _max_active"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); -#endif +/* END CSTYLED */ diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 215cd1c120..7e7202ec1e 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -98,7 +99,7 @@ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * - * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. (There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) 
@@ -134,25 +135,31 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } +static void +vdev_raidz_row_free(raidz_row_t *rr) +{ + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size != 0) + abd_free(rc->rc_abd); + if (rc->rc_orig_data != NULL) + abd_free(rc->rc_orig_data); + } + + if (rr->rr_abd_empty != NULL) + abd_free(rr->rr_abd_empty); + + kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); +} + void vdev_raidz_map_free(raidz_map_t *rm) { - int c; + for (int i = 0; i < rm->rm_nrows; i++) + vdev_raidz_row_free(rm->rm_row[i]); - for (c = 0; c < rm->rm_firstdatacol; c++) { - abd_free(rm->rm_col[c].rc_abd); - - if (rm->rm_col[c].rc_gdata != NULL) - abd_free(rm->rm_col[c].rc_gdata); - } - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - abd_put(rm->rm_col[c].rc_abd); - - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); - - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); + kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void @@ -160,171 +167,120 @@ vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; - ASSERT0(rm->rm_freed); - rm->rm_freed = 1; - - if (rm->rm_reports == 0) - vdev_raidz_map_free(rm); + vdev_raidz_map_free(rm); } -/*ARGSUSED*/ -static void -vdev_raidz_cksum_free(void *arg, size_t ignored) -{ - raidz_map_t *rm = arg; - - ASSERT3U(rm->rm_reports, >, 0); - - if (--rm->rm_reports == 0 && rm->rm_freed != 0) - vdev_raidz_map_free(rm); -} +const zio_vsd_ops_t vdev_raidz_vsd_ops = { + .vsd_free = vdev_raidz_map_free_vsd, +}; static void -vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) +vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { - raidz_map_t *rm = zcr->zcr_cbdata; - const size_t c = zcr->zcr_cbinfo; - size_t x, offset; + int c; + int nwrapped = 0; + uint64_t off = 0; + raidz_row_t *rr = rm->rm_row[0]; - const abd_t *good = NULL; - const abd_t *bad = rm->rm_col[c].rc_abd; - - if (good_data == NULL) { - 
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); - return; - } - - if (c < rm->rm_firstdatacol) { - /* - * The first time through, calculate the parity blocks for - * the good data (this relies on the fact that the good - * data never changes for a given logical ZIO) - */ - if (rm->rm_col[0].rc_gdata == NULL) { - abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; - - /* - * Set up the rm_col[]s to generate the parity for - * good_data, first saving the parity bufs and - * replacing them with buffers to hold the result. - */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_abd = - rm->rm_col[x].rc_gdata = - abd_alloc_sametype(rm->rm_col[x].rc_abd, - rm->rm_col[x].rc_size); - } - - /* fill in the data columns from good_data */ - offset = 0; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - - rm->rm_col[x].rc_abd = - abd_get_offset_size((abd_t *)good_data, - offset, rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; - } - - /* - * Construct the parity from the good data. 
- */ - vdev_raidz_generate_parity(rm); - - /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_abd = bad_parity[x]; - - offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset_size( - rm->rm_abd_copy, offset, - rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; - } - } - - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, - rm->rm_col[c].rc_size); - } else { - /* adjust good_data to point at the start of our column */ - offset = 0; - for (x = rm->rm_firstdatacol; x < c; x++) - offset += rm->rm_col[x].rc_size; - - good = abd_get_offset_size((abd_t *)good_data, offset, - rm->rm_col[c].rc_size); - } - - /* we drop the ereport if it ends up that the data was good */ - zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_put((abd_t *)good); -} - -/* - * Invoked indirectly by zfs_ereport_start_checksum(), called - * below when our read operation fails completely. The main point - * is to keep a copy of everything we read from disk, so that at - * vdev_raidz_cksum_finish() time we can compare it with the good data. - */ -static void -vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) -{ - size_t c = (size_t)(uintptr_t)arg; - size_t offset; - - raidz_map_t *rm = zio->io_vsd; - size_t size; - - /* set up the report and bump the refcount */ - zcr->zcr_cbdata = rm; - zcr->zcr_cbinfo = c; - zcr->zcr_finish = vdev_raidz_cksum_finish; - zcr->zcr_free = vdev_raidz_cksum_free; - - rm->rm_reports++; - ASSERT3U(rm->rm_reports, >, 0); - - if (rm->rm_abd_copy != NULL) - return; + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(rm->rm_nrows, ==, 1); /* - * It's the first time we're called for this raidz_map_t, so we need - * to copy the data aside; there's no guarantee that our zio's buffer - * won't be re-used for something else. 
- * - * Our parity data is already in separate buffers, so there's no need - * to copy them. + * Pad any parity columns with additional space to account for skip + * sectors. */ - - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; - - rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); - - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, - col->rc_size); - - abd_copy(tmp, col->rc_abd, col->rc_size); - - abd_put(col->rc_abd); - col->rc_abd = tmp; - - offset += col->rc_size; + if (rm->rm_skipstart < rr->rr_firstdatacol) { + ASSERT0(rm->rm_skipstart); + nwrapped = rm->rm_nskip; + } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { + nwrapped = + (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; } - ASSERT3U(offset, ==, size); + + /* + * Optional single skip sectors (rc_size == 0) will be handled in + * vdev_raidz_io_start_write(). + */ + int skipped = rr->rr_scols - rr->rr_cols; + + /* Allocate buffers for the parity columns */ + for (c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * Parity columns will pad out a linear ABD to account for + * the skip sector. A linear ABD is used here because + * parity calculations use the ABD buffer directly to calculate + * parity. This avoids doing a memcpy back to the ABD after the + * parity has been calculated. By issuing the parity column + * with the skip sector we can reduce contention on the child + * VDEV queue locks (vq_lock). 
+ */ + if (c < nwrapped) { + rc->rc_abd = abd_alloc_linear( + rc->rc_size + (1ULL << ashift), B_FALSE); + abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); + skipped++; + } else { + rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); + } + } + + for (off = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, off, rc->rc_size); + + /* + * Generate I/O for skip sectors to improve aggregation + * continuity. We will use gang ABD's to reduce contention + * on the child VDEV queue locks (vq_lock) by issuing + * a single I/O that contains the data and skip sector. + * + * It is important to make sure that rc_size is not updated + * even though we are adding a skip sector to the ABD. When + * calculating the parity in vdev_raidz_generate_parity_row() + * the rc_size is used to iterate through the ABD's. We can + * not have zero'd out skip sectors used for calculating + * parity for raidz, because those same sectors are not used + * during reconstruction. 
+ */ + if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { + rc->rc_abd = abd_alloc_gang(); + abd_gang_add(rc->rc_abd, abd, B_TRUE); + abd_gang_add(rc->rc_abd, + abd_get_zeros(1ULL << ashift), B_TRUE); + skipped++; + } else { + rc->rc_abd = abd; + } + off += rc->rc_size; + } + + ASSERT3U(off, ==, zio->io_size); + ASSERT3S(skipped, ==, rm->rm_nskip); } -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { - .vsd_free = vdev_raidz_map_free_vsd, - .vsd_cksum_report = vdev_raidz_cksum_report -}; +static void +vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) +{ + int c; + raidz_row_t *rr = rm->rm_row[0]; + + ASSERT3U(rm->rm_nrows, ==, 1); + + /* Allocate buffers for the parity columns */ + for (c = 0; c < rr->rr_firstdatacol; c++) + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); + + for (uint64_t off = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, off, rc->rc_size); + off += rc->rc_size; + } +} /* * Divides the IO evenly across all child vdevs; usually, dcols is @@ -337,7 +293,7 @@ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { - raidz_map_t *rm; + raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ @@ -347,7 +303,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; - uint64_t off = 0; + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); + rm->rm_nrows = 1; /* * "Quotient": The number of data sectors for this stripe on all but @@ -370,8 +329,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, */ tot = s + nparity * (q + (r == 0 ? 
0 : 1)); - /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ + /* + * acols: The columns that will be accessed. + * scols: The columns that will be accessed or skipped. + */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; @@ -383,71 +344,61 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rm->rm_row[0] = rr; - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; + rr->rr_cols = acols; + rr->rr_scols = scols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; +#ifdef ZFS_DEBUG + rr->rr_offset = zio->io_offset; + rr->rr_size = zio->io_size; +#endif asize = 0; for (c = 0; c < scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; + rc->rc_devidx = col; + rc->rc_offset = coff; + rc->rc_abd = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_force_repair = 0; + rc->rc_allow_repair = 1; + rc->rc_need_orig_restore = B_FALSE; if (c >= acols) - rm->rm_col[c].rc_size = 0; + rc->rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << ashift; + rc->rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << ashift; + 
rc->rc_size = q << ashift; - asize += rm->rm_col[c].rc_size; + asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); - rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); - ASSERT3U(rm->rm_nskip, <=, nparity); - - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, - rm->rm_col[c].rc_size); - off = rm->rm_col[c].rc_size; - - for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, - rm->rm_col[c].rc_size); - off += rm->rm_col[c].rc_size; - } + rm->rm_skipstart = bc; /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read - * during normal operation, that that device's I/O bandwidth won't be + * during normal operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical @@ -464,23 +415,26 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * skip the first column since at least one data and one parity * column must appear in each row. 
*/ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; + if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rr->rr_col[0].rc_devidx; + o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_map_alloc_write(zio, rm, ashift); + } else { + vdev_raidz_map_alloc_read(zio, rm); + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -550,50 +504,43 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) } static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p(raidz_row_t *rr) { - uint64_t *p; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } 
} } static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq(raidz_row_t *rr) { - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } @@ -601,14 +548,15 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } @@ -616,33 +564,29 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_R].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); + (void) memcpy(r, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; @@ -651,14 +595,15 @@ 
vdev_raidz_generate_parity_pqr(raidz_map_t *rm) struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } @@ -671,27 +616,38 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) * parity columns available. */ void -vdev_raidz_generate_parity(raidz_map_t *rm) +vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { + ASSERT3U(rr->rr_cols, !=, 0); + /* Generate using the new math implementation */ - if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; - switch (rm->rm_firstdatacol) { + switch (rr->rr_firstdatacol) { case 1: - vdev_raidz_generate_parity_p(rm); + vdev_raidz_generate_parity_p(rr); break; case 2: - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); break; case 3: - vdev_raidz_generate_parity_pqr(rm); + vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } +void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_generate_parity_row(rm, rr); + } +} + /* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) @@ -808,31 +764,28 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) return (0); } -static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +static void +vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; - int c; abd_t *dst, *src; - ASSERT(ntgts == 1); - ASSERT(x >= 
rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); + ASSERT3U(ntgts, ==, 1); + ASSERT3U(x, >=, rr->rr_firstdatacol); + ASSERT3U(x, <, rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); + ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + dst = rr->rr_col[x].rc_abd; - abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); + abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; if (c == x) continue; @@ -840,12 +793,10 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_p_func, NULL); } - - return (1 << VDEV_RAIDZ_P); } -static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +static void +vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; @@ -853,44 +804,42 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 1); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = (c == x) ? 
0 : MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) + if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); - + rr->rr_col[x].rc_size - size); + } } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); + ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, + size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); + src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + dst = rr->rr_col[x].rc_abd; + exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); - - return (1 << VDEV_RAIDZ_Q); } -static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +static void +vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; @@ -901,10 +850,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 2); ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); + ASSERT(x >= rr->rr_firstdatacol); + ASSERT(y < rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as @@ -913,29 +862,29 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we 
make those columns appear to be full of zeros by * setting their lengths to zero. */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; + pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rr->rr_col[x].rc_size; + ysize = rr->rr_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rr->rr_col[x].rc_size = 0; + rr->rr_col[y].rc_size = 0; - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; + rr->rr_col[x].rc_size = xsize; + rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; + pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + xd = rr->rr_col[x].rc_abd; + yd = rr->rr_col[y].rc_abd; /* * We now have: @@ -953,7 +902,7 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) */ a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; @@ -967,22 +916,20 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); - 
abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; - - return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); + rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; } /* BEGIN CSTYLED */ /* * In the general case of reconstruction, we must solve the system of linear - * equations defined by the coeffecients used to generate parity as well as + * equations defined by the coefficients used to generate parity as well as * the contents of the data and parity disks. This can be expressed with * vectors for the original data (D) and the actual data (d) and parity (p) * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): @@ -996,7 +943,7 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * ~~ ~~ ~~ ~~ * * I is simply a square identity matrix of size n, and V is a vandermonde - * matrix defined by the coeffecients we chose for the various parity columns + * matrix defined by the coefficients we chose for the various parity columns * (1, 2, 4). Note that these values were chosen both for simplicity, speedy * computation as well as linear separability. * @@ -1134,13 +1081,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) /* END CSTYLED */ static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, +vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. 
@@ -1164,7 +1111,7 @@ vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, } static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, +vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; @@ -1176,10 +1123,10 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, * correspond to data columns. */ for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); + ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); + ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* @@ -1196,8 +1143,8 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; + ASSERT3U(used[j], >=, rr->rr_firstdatacol); + jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; @@ -1258,7 +1205,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, } static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, +vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; @@ -1290,22 +1237,24 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, for (i = 0; i < n; i++) { c = used[i]; - ASSERT3U(c, <, rm->rm_cols); + ASSERT3U(c, <, rr->rr_cols); - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; + ccount = rr->rr_col[c].rc_size; + ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); + if (ccount == 0) + continue; + src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - 
ASSERT3U(cc, <, rm->rm_cols); + cc = missing[j] + rr->rr_firstdatacol; + ASSERT3U(cc, >=, rr->rr_firstdatacol); + ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; + dcount[j] = rr->rr_col[cc].rc_size; + if (dcount[j] != 0) + dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; @@ -1333,51 +1282,56 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, kmem_free(p, psize); } -static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +static void +vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; - uint8_t *p, *pp; size_t psize; - uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; abd_t **bufs = NULL; - int code = 0; - /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate - * temporary linear ABDs. + * temporary linear ABDs if any non-linear ABDs are found. 
*/ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + if (!abd_is_linear(rr->rr_col[i].rc_abd)) { + bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), + KM_PUSHPAGE); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + bufs[c] = col->rc_abd; + if (bufs[c] != NULL) { + col->rc_abd = abd_alloc_linear( + col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], + col->rc_size); + } + } + + break; } } - n = rm->rm_cols - rm->rm_firstdatacol; + n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { + if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; + tgts[t] - rr->rr_firstdatacol; } } @@ -1387,7 +1341,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); + ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. 
@@ -1397,15 +1351,10 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) continue; } - code |= 1 << c; - parity_map[i] = c; i++; } - ASSERT(code != 0); - ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); - psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; p = kmem_alloc(psize, KM_SLEEP); @@ -1422,9 +1371,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) used[i] = parity_map[i]; } - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { + c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } @@ -1437,18 +1386,18 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Initialize the interesting rows of the matrix. */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. 
*/ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); @@ -1457,49 +1406,42 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) * copy back from temporary linear abds and free them */ if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); + if (bufs[c] != NULL) { + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + } col->rc_abd = bufs[c]; } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } - - return (code); } -int -vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +static void +vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, + const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; int i, c, ret; - int code; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; - /* - * The tgts list must already be sorted. 
- */ - for (i = 1; i < nt; i++) { - ASSERT(t[i] > t[i - 1]); - } - - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; + nbadparity = rr->rr_firstdatacol; + nbaddata = rr->rr_cols - nbadparity; ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) + for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; - } else if (rm->rm_col[c].rc_error != 0) { + } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { + } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; @@ -1514,50 +1456,53 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) - return (ret); + return; /* * See if we can use any of our optimized reconstruction routines. 
*/ switch (nbaddata) { case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_P]) { + vdev_raidz_reconstruct_p(rr, dt, 1); + return; + } - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_Q]) { + vdev_raidz_reconstruct_q(rr, dt, 1); + return; + } - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; case 2: - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + parity_valid[VDEV_RAIDZ_Q]) { + vdev_raidz_reconstruct_pq(rr, dt, 2); + return; + } - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; } - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); - ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); - ASSERT(code > 0); - return (code); + vdev_raidz_reconstruct_general(rr, tgts, ntgts); } static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; + vdev_raidz_t *vdrz = vd->vdev_tsd; + uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; @@ -1573,7 +1518,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; @@ -1583,7 +1528,9 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + 
*physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; @@ -1600,19 +1547,20 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, static void vdev_raidz_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t cols = vdrz->vd_logical_width; + uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -1621,32 +1569,44 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void +/* + * The allocatable space for a raidz vdev is N * sizeof(smallest child) + * so each child must provide at least 1/Nth of its asize. 
+ */ +static uint64_t +vdev_raidz_min_asize(vdev_t *vd) +{ + return ((vd->vdev_min_asize + vd->vdev_children - 1) / + vd->vdev_children); +} + +void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; + ASSERT3P(rc->rc_abd, !=, NULL); rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static void -vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG - vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg_t logical_rs, physical_rs; - logical_rs.rs_start = zio->io_offset; + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, zio->io_size); + vdev_raidz_asize(vd, rr->rr_size); - raidz_col_t *rc = &rm->rm_col[col]; + raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - vdev_xlate(cvd, &logical_rs, &physical_rs); + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1664,6 +1624,85 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) #endif } +static void +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +{ + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + + vdev_raidz_generate_parity_row(rm, rr); + + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + /* Verify physical to logical translation */ + vdev_raidz_io_verify(vd, rr, c); + + if (rc->rc_size > 0) { + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + } else { + /* + * Generate optional write for skip sector to improve + * 
aggregation contiguity. + */ + ASSERT3P(rc->rc_abd, ==, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, NULL, 1ULL << ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, + NULL)); + } + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_readable(cvd)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1686,96 +1725,30 @@ vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, i; + vdev_raidz_t *vdrz = vd->vdev_tsd; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); - - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = 
vd->vdev_child[rc->rc_devidx]; - - /* - * Verify physical to logical translation. - */ - vdev_raidz_io_verify(zio, rm, c); - - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } - - zio_execute(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); + raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, + vdrz->vd_logical_width, vdrz->vd_nparity); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; /* - * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity. + * Until raidz expansion is implemented all maps for a raidz vdev + * contain a single row. 
*/ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ENXIO); - rc->rc_tried = 1; /* don't even try */ - rc->rc_skipped = 1; - continue; - } - if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ESTALE); - rc->rc_skipped = 1; - continue; - } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + vdev_raidz_io_start_read(zio, rr); } zio_execute(zio); } - /* * Report a checksum error for a child of a RAID-Z device. 
*/ @@ -1784,20 +1757,20 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - zfs_ereport_post_checksum(zio->io_spa, vd, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); } } @@ -1824,13 +1797,14 @@ raidz_checksum_verify(zio_t *zio) * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. + * number of such failures. */ static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; + raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; @@ -1840,8 +1814,8 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; @@ -1849,12 +1823,19 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_copy(orig[c], rc->rc_abd, rc->rc_size); } - vdev_raidz_generate_parity(rm); + /* + * Regenerates parity even for !tried||rc_error!=0 columns. 
This + * isn't harmful but it does have the side effect of fixing stuff + * we didn't realize was necessary (i.e. even if we return 0). + */ + vdev_raidz_generate_parity_row(rm, rr); + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; + if (abd_cmp(orig[c], rc->rc_abd) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); @@ -1867,464 +1848,596 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) } static int -vdev_raidz_worst_error(raidz_map_t *rm) +vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); + for (int c = 0; c < rr->rr_cols; c++) + error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } -/* - * Iterate over all combinations of bad data and attempt a reconstruction. - * Note that the algorithm below is non-optimal because it doesn't take into - * account how reconstruction is actually performed. For example, with - * triple-parity RAID-Z the reconstruction procedure is the same if column 4 - * is targeted as invalid as if columns 1 and 4 are targeted since in both - * cases we'd only use parity information in column 0. - */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) -{ - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - abd_t *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int curr, next, i, c, n; - int code, ret = 0; - - ASSERT(total_errors < rm->rm_firstdatacol); - - /* - * This simplifies one edge condition. - */ - tgts[-1] = -1; - - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { - /* - * Initialize the targets array by finding the first n columns - * that contain no error. 
- * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. - */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; - } - - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); - } - - tgts[i] = c++; - } - - /* - * Setting tgts[n] simplifies the other edge condition. - */ - tgts[n] = rm->rm_cols; - - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); - } - - orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, - rm->rm_col[0].rc_size); - - curr = 0; - next = tgts[curr]; - - while (curr != n) { - tgts[curr] = next; - curr = 0; - - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy(orig[i], rc->rc_abd, rc->rc_size); - } - - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } - - ret = code; - goto done; - } - - /* - * Restore the original data. - */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy(rc->rc_abd, orig[i], rc->rc_size); - } - - do { - /* - * Find the next valid column after the curr - * position.. - */ - for (next = tgts[curr] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; - - ASSERT(next <= tgts[curr + 1]); - - /* - * If that spot is available, we're done here. 
- */ - if (next != tgts[curr + 1]) - break; - - /* - * Otherwise, find the next valid column after - * the previous position. - */ - for (c = tgts[curr - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[curr] = c; - curr++; - - } while (curr != n); - } - } - n--; -done: - for (i = 0; i < n; i++) - abd_free(orig[i]); - - return (ret); -} - -/* - * Complete an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Check for errors on the child IOs. - * 2. Return, setting an error code if too few child VDevs were written - * to reconstruct the data later. Note that partial writes are - * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. 
- */ static void -vdev_raidz_io_done(zio_t *zio) +vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc = NULL; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; - int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - if (c < rm->rm_firstdatacol) + if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; - - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. - */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); - - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. 
perform combinatorial reconstruction + * If we read more parity disks than were used for + * reconstruction, confirm that the other parity disks produced + * correct data. * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. + * Note that we also regenerate parity when resilvering so we + * can write it out to failed devices later. */ - - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; - } - } else { - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - - /* - * Identify the data columns that reported an error. 
- */ - n = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) { - ASSERT(n < VDEV_RAIDZ_MAXPARITY); - tgts[n++] = c; - } - } - - ASSERT(rm->rm_firstdatacol >= n); - - code = vdev_raidz_reconstruct(rm, tgts, n); - - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read more parity disks than were used - * for reconstruction, confirm that the other - * parity disks produced correct data. This - * routine is suboptimal in that it regenerates - * the parity that we already used in addition - * to the parity that we're attempting to - * verify, but this should be a relatively - * uncommon case, and can be optimized if it - * becomes a problem. Note that we regenerate - * parity when resilvering so we can write it - * out to failed devices later. - */ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - - goto done; - } - } + if (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + int n = raidz_parity_verify(zio, rr); + unexpected_errors += n; + ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); } - /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. 
- */ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; - - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); - - return; - } - - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. - */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); - - } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { - /* - * If we didn't use all the available parity for the - * combinatorial reconstruction, verify that the remaining - * parity is correct. - */ - if (code != (1 << rm->rm_firstdatacol) - 1) - (void) raidz_parity_verify(zio, rm); - } else { - /* - * We're here because either: - * - * total_errors == rm_first_datacol, or - * vdev_raidz_combrec() failed - * - * In either case, there is enough bad data to prevent - * reconstruction. - * - * Start checksum ereports for all children which haven't - * failed, and the IO wasn't speculative. 
- */ - zio->io_error = SET_ERROR(ECKSUM); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - vdev_t *cvd; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) { - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = - rm->rm_ecksuminjected; - - mutex_enter(&cvd->vdev_stat_lock); - cvd->vdev_stat.vs_checksum_errors++; - mutex_exit(&cvd->vdev_stat_lock); - - zfs_ereport_start_checksum( - zio->io_spa, cvd, - &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); - } - } - } - } - -done: - zio_checksum_verified(zio); - if (zio->io_error == 0 && spa_writeable(zio->io_spa) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) + if (!rc->rc_allow_repair) { continue; + } else if (!rc->rc_force_repair && + (rc->rc_error == 0 || rc->rc_size == 0)) { + continue; + } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } +static void +raidz_restore_orig_data(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + abd_copy(rc->rc_abd, + rc->rc_orig_data, rc->rc_size); + rc->rc_need_orig_restore = B_FALSE; + } + } + } +} + +/* + * returns EINVAL if reconstruction of the block will not be possible + * returns ECKSUM if this specific reconstruction failed + * returns 0 on successful reconstruction + */ +static int +raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) +{ + raidz_map_t *rm = zio->io_vsd; + + /* Reconstruct each row */ + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ + int t = 0; + int dead = 0; + int dead_data = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT0(rc->rc_need_orig_restore); + if (rc->rc_error != 0) { + dead++; + if (c >= nparity) + dead_data++; + continue; + } + if (rc->rc_size == 0) + continue; + for (int lt = 0; lt < ntgts; lt++) { + if (rc->rc_devidx == ltgts[lt]) { + if (rc->rc_orig_data == NULL) { + rc->rc_orig_data = + abd_alloc_linear( + rc->rc_size, B_TRUE); + abd_copy(rc->rc_orig_data, + rc->rc_abd, rc->rc_size); + } + rc->rc_need_orig_restore = B_TRUE; + + dead++; + if (c >= nparity) + dead_data++; + my_tgts[t++] = c; + break; + } + } + } + if (dead > nparity) { + /* reconstruction not possible */ + raidz_restore_orig_data(rm); + return (EINVAL); + } + if (dead_data > 0) + vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); + } + + /* Check for success */ + if (raidz_checksum_verify(zio) == 0) { + + /* Reconstruction succeeded - report errors */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if 
(rc->rc_need_orig_restore) { + /* + * Note: if this is a parity column, + * we don't really know if it's wrong. + * We need to let + * vdev_raidz_io_done_verified() check + * it, and if we set rc_error, it will + * think that it is a "known" error + * that doesn't need to be checked + * or corrected. + */ + if (rc->rc_error == 0 && + c >= rr->rr_firstdatacol) { + raidz_checksum_error(zio, + rc, rc->rc_orig_data); + rc->rc_error = + SET_ERROR(ECKSUM); + } + rc->rc_need_orig_restore = B_FALSE; + } + } + + vdev_raidz_io_done_verified(zio, rr); + } + + zio_checksum_verified(zio); + + return (0); + } + + /* Reconstruction failed - restore original data */ + raidz_restore_orig_data(rm); + return (ECKSUM); +} + +/* + * Iterate over all combinations of N bad vdevs and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. 
+ * + * The order that we find the various possible combinations of failed + * disks is dictated by these rules: + * - Examine each "slot" (the "i" in tgts[i]) + * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - if we can't increment because it runs into the next slot, + * reset our slot to the minimum, and examine the next slot + * + * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose + * 3 columns to reconstruct), we will generate the following sequence: + * + * STATE ACTION + * 0 1 2 special case: skip since these are all parity + * 0 1 3 first slot: reset to 0; middle slot: increment to 2 + * 0 2 3 first slot: increment to 1 + * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 + * 0 1 4 first: reset to 0; middle: increment to 2 + * 0 2 4 first: increment to 1 + * 1 2 4 first: reset to 0; middle: increment to 3 + * 0 3 4 first: increment to 1 + * 1 3 4 first: increment to 2 + * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 + * 0 1 5 first: reset to 0; middle: increment to 2 + * 0 2 5 first: increment to 1 + * 1 2 5 first: reset to 0; middle: increment to 3 + * 0 3 5 first: increment to 1 + * 1 3 5 first: increment to 2 + * 2 3 5 first: reset to 0; middle: increment to 4 + * 0 4 5 first: increment to 1 + * 1 4 5 first: increment to 2 + * 2 4 5 first: increment to 3 + * 3 4 5 done + * + * This strategy works for dRAID but is less efficient when there are a large + * number of child vdevs and therefore permutations to check. Furthermore, + * since the raidz_map_t rows likely do not overlap reconstruction would be + * possible as long as there are no more than nparity data errors per row. + * These additional permutations are not currently checked but could be as + * a future improvement. + */ +static int +vdev_raidz_combrec(zio_t *zio) +{ + int nparity = vdev_get_nparity(zio->io_vd); + raidz_map_t *rm = zio->io_vsd; + + /* Check if there's enough data to attempt reconstruction.
*/ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + int total_errors = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_error) + total_errors++; + } + + if (total_errors > nparity) + return (vdev_raidz_worst_error(rr)); + } + + for (int num_failures = 1; num_failures <= nparity; num_failures++) { + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *ltgts = &tstore[1]; /* value is logical child ID */ + + /* Determine number of logical children, n */ + int n = zio->io_vd->vdev_children; + + ASSERT3U(num_failures, <=, nparity); + ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); + + /* Handle corner cases in combrec logic */ + ltgts[-1] = -1; + for (int i = 0; i < num_failures; i++) { + ltgts[i] = i; + } + ltgts[num_failures] = n; + + for (;;) { + int err = raidz_reconstruct(zio, ltgts, num_failures, + nparity); + if (err == EINVAL) { + /* + * Reconstruction not possible with this # + * failures; try more failures. + */ + break; + } else if (err == 0) + return (0); + + /* Compute next targets to try */ + for (int t = 0; ; t++) { + ASSERT3U(t, <, num_failures); + ltgts[t]++; + if (ltgts[t] == n) { + /* try more failures */ + ASSERT3U(t, ==, num_failures - 1); + break; + } + + ASSERT3U(ltgts[t], <, n); + ASSERT3U(ltgts[t], <=, ltgts[t + 1]); + + /* + * If that spot is available, we're done here. + * Try the next combination. + */ + if (ltgts[t] != ltgts[t + 1]) + break; + + /* + * Otherwise, reset this tgt to the minimum, + * and move on to the next tgt. + */ + ltgts[t] = ltgts[t - 1] + 1; + ASSERT3U(ltgts[t], ==, t); + } + + /* Increase the number of failures and keep trying. 
*/ + if (ltgts[num_failures - 1] == n) + break; + } + } + + return (ECKSUM); +} + +void +vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +{ + for (uint64_t row = 0; row < rm->rm_nrows; row++) { + raidz_row_t *rr = rm->rm_row[row]; + vdev_raidz_reconstruct_row(rm, rr, t, nt); + } +} + +/* + * Complete a write IO operation on a RAIDZ VDev + * + * Outline: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + */ +static void +vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) +{ + int total_errors = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + total_errors++; + } + } + + /* + * Treat partial writes as a success. If we couldn't write enough + * columns to reconstruct the data, the I/O failed. Otherwise, + * good enough. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. 
+ */ + if (total_errors > rr->rr_firstdatacol) { + zio->io_error = zio_worst_error(zio->io_error, + vdev_raidz_worst_error(rr)); + } +} + +static void +vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, + raidz_row_t *rr) +{ + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + int total_errors = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + if (c < rr->rr_firstdatacol) + parity_errors++; + else + data_errors++; + + total_errors++; + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + /* + * If there were data errors and the number of errors we saw was + * correctable -- less than or equal to the number of parity disks read + * -- reconstruct based on the missing data. + */ + if (data_errors != 0 && + total_errors <= rr->rr_firstdatacol - parity_untried) { + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rr->rr_firstdatacol); + + /* + * Identify the data columns that reported an error. + */ + int n = 0; + int tgts[VDEV_RAIDZ_MAXPARITY]; + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } + } + + ASSERT(rr->rr_firstdatacol >= n); + + vdev_raidz_reconstruct_row(rm, rr, tgts, n); + } +} + +/* + * Return the number of reads issued. 
+ */ +static int +vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + int nread = 0; + + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + + /* + * If this row contains empty sectors which are not required + * for a normal read then allocate an ABD for them now so they + * may be read, verified, and any needed repairs performed. + */ + if (rr->rr_nempty && rr->rr_abd_empty == NULL) + vdev_draid_map_alloc_empty(zio, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + nread++; + } + return (nread); +} + +/* + * We're here because either there were too many errors to even attempt + * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() + * failed. In either case, there is enough bad data to prevent reconstruction. + * Start checksum ereports for all children which haven't failed.
+ */ +static void +vdev_raidz_io_done_unrecoverable(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error != 0) + continue; + + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + (void) zfs_ereport_start_checksum(zio->io_spa, + cvd, &zio->io_bookmark, zio, rc->rc_offset, + rc->rc_size, &zbc); + mutex_enter(&cvd->vdev_stat_lock); + cvd->vdev_stat.vs_checksum_errors++; + mutex_exit(&cvd->vdev_stat_lock); + } + } +} + +void +vdev_raidz_io_done(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); + } + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_done_reconstruct_known_missing(zio, + rm, rr); + } + + if (raidz_checksum_verify(zio) == 0) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_done_verified(zio, rr); + } + zio_checksum_verified(zio); + } else { + /* + * A sequential resilver has no checksum which makes + * combinatorial reconstruction impossible. This code + * path is unreachable since raidz_checksum_verify() + * has no checksum to verify and must succeed. + */ + ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); + + /* + * This isn't a typical situation -- either we got a + * read error or a child silently returned bad data. + * Read every block so we can try again with as much + * data and parity as we can track down. If we've + * already been through once before, all children will + * be marked as tried so we'll proceed to combinatorial + * reconstruction.
+ */ + int nread = 0; + for (int i = 0; i < rm->rm_nrows; i++) { + nread += vdev_raidz_read_all(zio, + rm->rm_row[i]); + } + if (nread != 0) { + /* + * Normally our stage is VDEV_IO_DONE, but if + * we've already called redone(), it will have + * changed to VDEV_IO_START, in which case we + * don't want to call redone() again. + */ + if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_redone(zio); + return; + } + + zio->io_error = vdev_raidz_combrec(zio); + if (zio->io_error == ECKSUM && + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + vdev_raidz_io_done_unrecoverable(zio); + } + } + } +} + static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { - if (faulted > vd->vdev_nparity) + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) @@ -2336,22 +2449,30 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function - * assumes that at least one DTL is dirty which imples that full stripe + * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t -vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t dcols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> ashift; + uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. 
*/ uint64_t f = b % dcols; + /* Unreachable by sequential resilver. */ + ASSERT3U(phys_birth, !=, TXG_UNKNOWN); + + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + if (s + nparity >= dcols) return (B_TRUE); @@ -2372,7 +2493,8 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); @@ -2382,10 +2504,10 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ - ASSERT0(in->rs_start % (1 << ashift)); - ASSERT0(in->rs_end % (1 << ashift)); - uint64_t b_start = in->rs_start >> ashift; - uint64_t b_end = in->rs_end >> ashift; + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + uint64_t b_start = logical_rs->rs_start >> ashift; + uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ @@ -2395,25 +2517,132 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; - res->rs_start = start_row << ashift; - res->rs_end = end_row << ashift; + physical_rs->rs_start = start_row << ashift; + physical_rs->rs_end = end_row << ashift; - ASSERT3U(res->rs_start, <=, in->rs_start); - ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); + ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_rs->rs_end - logical_rs->rs_start); +} + +/* + * Initialize private RAIDZ specific fields from the nvlist. 
+ */ +static int +vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + vdev_raidz_t *vdrz; + uint64_t nparity; + + uint_t children; + nvlist_t **child; + int error = nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children); + if (error != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) + return (SET_ERROR(EINVAL)); + + /* + * Previous versions could only support 1 or 2 parity + * devices. + */ + if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) + return (SET_ERROR(EINVAL)); + } else { + /* + * We require the parity to be specified for SPAs that + * support multiple parity levels. + */ + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + + /* + * Otherwise, we default to 1 parity device for RAID-Z. + */ + nparity = 1; + } + + vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vd_logical_width = children; + vdrz->vd_nparity = nparity; + + *tsd = vdrz; + + return (0); +} + +static void +vdev_raidz_fini(vdev_t *vd) +{ + kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); +} + +/* + * Add RAIDZ specific fields to the config nvlist. + */ +static void +vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); + vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vdrz->vd_nparity == 1 || + (vdrz->vd_nparity <= 2 && + spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || + (vdrz->vd_nparity <= 3 && + spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); + + /* + * Note that we'll add these even on storage pools where they + * aren't strictly required -- older software will just ignore + * it.
+ */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_nparity(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + return (vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_ndisks(vdev_t *vd) +{ + return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { - vdev_raidz_open, - vdev_raidz_close, - vdev_raidz_asize, - vdev_raidz_io_start, - vdev_raidz_io_done, - vdev_raidz_state_change, - vdev_raidz_need_resilver, - NULL, - NULL, - NULL, - vdev_raidz_xlate, - VDEV_TYPE_RAIDZ, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_init = vdev_raidz_init, + .vdev_op_fini = vdev_raidz_fini, + .vdev_op_open = vdev_raidz_open, + .vdev_op_close = vdev_raidz_close, + .vdev_op_asize = vdev_raidz_asize, + .vdev_op_min_asize = vdev_raidz_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_raidz_io_start, + .vdev_op_io_done = vdev_raidz_io_done, + .vdev_op_state_change = vdev_raidz_state_change, + .vdev_op_need_resilver = vdev_raidz_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_raidz_config_generate, + .vdev_op_nparity = vdev_raidz_nparity, + .vdev_op_ndisks = vdev_raidz_ndisks, + .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index e6112bc021..03df2df5ad 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -27,11 +27,9 @@ #include #include #include - #include #include - -extern boolean_t raidz_will_scalar_work(void); +#include /* Opaque implementation with NULL methods to represent original methods */ static const raidz_impl_ops_t vdev_raidz_original_impl = { @@ -63,10 +61,13 @@ const raidz_impl_ops_t *raidz_all_maths[] = { #if defined(__x86_64) && 
defined(HAVE_AVX512BW) /* only x86_64 for now */ &vdev_raidz_avx512bw_impl, #endif -#if defined(__aarch64__) +#if defined(__aarch64__) && !defined(__FreeBSD__) &vdev_raidz_aarch64_neon_impl, &vdev_raidz_aarch64_neonx2_impl, #endif +#if defined(__powerpc__) && defined(__altivec__) + &vdev_raidz_powerpc_altivec_impl, +#endif }; /* Indicate that benchmark has been completed */ @@ -87,6 +88,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST; static size_t raidz_supp_impl_cnt = 0; static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; +#if defined(_KERNEL) /* * kstats values for supported implementations * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] @@ -95,14 +97,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; /* kstat for benchmarked implementations */ static kstat_t *raidz_math_kstat = NULL; +#endif /* - * Selects the raidz operation for raidz_map - * If rm_ops is set to NULL original raidz implementation will be used + * Returns the RAIDZ operations for raidz_map() parity calculations. When + * a SIMD implementation is not allowed in the current context, then fallback + * to the fastest generic implementation. 
*/ -raidz_impl_ops_t * -vdev_raidz_math_get_ops() +const raidz_impl_ops_t * +vdev_raidz_math_get_ops(void) { + if (!kfpu_allowed()) + return (&vdev_raidz_scalar_impl); + raidz_impl_ops_t *ops = NULL; const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); @@ -111,18 +118,14 @@ vdev_raidz_math_get_ops() ASSERT(raidz_math_initialized); ops = &vdev_raidz_fastest_impl; break; -#if !defined(_KERNEL) case IMPL_CYCLE: - { + /* Cycle through all supported implementations */ ASSERT(raidz_math_initialized); ASSERT3U(raidz_supp_impl_cnt, >, 0); - /* Cycle through all supported implementations */ static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; ops = raidz_supp_impl[idx]; - } - break; -#endif + break; case IMPL_ORIGINAL: ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; break; @@ -146,7 +149,7 @@ vdev_raidz_math_get_ops() * Select parity generation method for raidz_map */ int -vdev_raidz_math_generate(raidz_map_t *rm) +vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr) { raidz_gen_f gen_parity = NULL; @@ -162,8 +165,8 @@ vdev_raidz_math_generate(raidz_map_t *rm) break; default: gen_parity = NULL; - cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", - raidz_parity(rm)); + cmn_err(CE_PANIC, "invalid RAID-Z configuration %llu", + (u_longlong_t)raidz_parity(rm)); break; } @@ -171,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm) if (gen_parity == NULL) return (RAIDZ_ORIGINAL_IMPL); - gen_parity(rm); + gen_parity(rr); return (0); } @@ -238,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, * @nbaddata - Number of failed data columns */ int -vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, - const int *dt, const int nbaddata) +vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr, + const int *parity_valid, const int *dt, const int nbaddata) { raidz_rec_f rec_fn = NULL; @@ -254,15 +257,15 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, rec_fn = 
reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); break; default: - cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", - raidz_parity(rm)); + cmn_err(CE_PANIC, "invalid RAID-Z configuration %llu", + (u_longlong_t)raidz_parity(rm)); break; } if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else - return (rec_fn(rm, dt)); + return (rec_fn(rr, dt)); } const char *raidz_gen_name[] = { @@ -273,6 +276,8 @@ const char *raidz_rec_name[] = { "rec_pq", "rec_pr", "rec_qr", "rec_pqr" }; +#if defined(_KERNEL) + #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) static int @@ -355,7 +360,7 @@ raidz_math_kstat_addr(kstat_t *ksp, loff_t n) #define BENCH_D_COLS (8ULL) #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ -#define BENCH_NS MSEC2NSEC(25) /* 25ms */ +#define BENCH_NS MSEC2NSEC(1) /* 1ms */ typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); @@ -405,7 +410,7 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) t_start = gethrtime(); do { - for (i = 0; i < 25; i++, run_cnt++) + for (i = 0; i < 5; i++, run_cnt++) bench_fn(bench_rm, fn); t_diff = gethrtime() - t_start; @@ -435,21 +440,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) } } } +#endif -void -vdev_raidz_math_init(void) +/* + * Initialize and benchmark all supported implementations. 
+ */ +static void +benchmark_raidz(void) { raidz_impl_ops_t *curr_impl; - zio_t *bench_zio = NULL; - raidz_map_t *bench_rm = NULL; - uint64_t bench_parity; - int i, c, fn; + int i, c; - /* move supported impl into raidz_supp_impl */ + /* Move supported impl into raidz_supp_impl */ for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; - /* initialize impl */ if (curr_impl->init) curr_impl->init(); @@ -459,20 +464,13 @@ vdev_raidz_math_init(void) membar_producer(); /* complete raidz_supp_impl[] init */ raidz_supp_impl_cnt = c; /* number of supported impl */ -#if !defined(_KERNEL) - /* Skip benchmarking and use last implementation as fastest */ - memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1], - sizeof (vdev_raidz_fastest_impl)); - strcpy(vdev_raidz_fastest_impl.name, "fastest"); +#if defined(_KERNEL) + abd_t *pabd; + zio_t *bench_zio = NULL; + raidz_map_t *bench_rm = NULL; + uint64_t bench_parity; - raidz_math_initialized = B_TRUE; - - /* Use 'cycle' math selection method for userspace */ - VERIFY0(vdev_raidz_impl_set("cycle")); - return; -#endif - - /* Fake an zio and run the benchmark on a warmed up buffer */ + /* Fake a zio and run the benchmark on a warmed up buffer */ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); bench_zio->io_offset = 0; bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ @@ -480,7 +478,7 @@ vdev_raidz_math_init(void) memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); /* Benchmark parity generation methods */ - for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { bench_parity = fn + 1; /* New raidz_map is needed for each generate_p/q/r */ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, @@ -495,7 +493,13 @@ vdev_raidz_math_init(void) bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, BENCH_COLS, PARITY_PQR); - for (fn = 0; fn < RAIDZ_REC_NUM; fn++) + /* Ensure that fake parity 
blocks are initialized */ + for (c = 0; c < bench_rm->rm_row[0]->rr_firstdatacol; c++) { + pabd = bench_rm->rm_row[0]->rr_col[c].rc_abd; + memset(abd_to_buf(pabd), 0xAA, abd_get_size(pabd)); + } + + for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); vdev_raidz_map_free(bench_rm); @@ -503,11 +507,29 @@ vdev_raidz_math_init(void) /* cleanup the bench zio */ abd_free(bench_zio->io_abd); kmem_free(bench_zio, sizeof (zio_t)); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. + */ + memcpy(&vdev_raidz_fastest_impl, + raidz_supp_impl[raidz_supp_impl_cnt - 1], + sizeof (vdev_raidz_fastest_impl)); + strcpy(vdev_raidz_fastest_impl.name, "fastest"); +#endif /* _KERNEL */ +} - /* install kstats for all impl */ +void +vdev_raidz_math_init(void) +{ + /* Determine the fastest available implementation. */ + benchmark_raidz(); + +#if defined(_KERNEL) + /* Install kstats for all implementations */ raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - if (raidz_math_kstat != NULL) { raidz_math_kstat->ks_data = NULL; raidz_math_kstat->ks_ndata = UINT32_MAX; @@ -517,6 +539,7 @@ vdev_raidz_math_init(void) raidz_math_kstat_addr); kstat_install(raidz_math_kstat); } +#endif /* Finish initialization */ atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); @@ -527,15 +550,15 @@ void vdev_raidz_math_fini(void) { raidz_impl_ops_t const *curr_impl; - int i; +#if defined(_KERNEL) if (raidz_math_kstat != NULL) { kstat_delete(raidz_math_kstat); raidz_math_kstat = NULL; } +#endif - /* fini impl */ - for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { curr_impl = raidz_all_maths[i]; if (curr_impl->fini) curr_impl->fini(); @@ -546,9 +569,7 @@ static const struct { char *name; uint32_t sel; } 
math_impl_opts[] = { -#if !defined(_KERNEL) { "cycle", IMPL_CYCLE }, -#endif { "fastest", IMPL_FASTEST }, { "original", IMPL_ORIGINAL }, { "scalar", IMPL_SCALAR } @@ -614,8 +635,7 @@ vdev_raidz_impl_set(const char *val) return (err); } -#if defined(_KERNEL) -#include +#if defined(_KERNEL) && defined(__linux__) static int zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c index e3ad067765..0a67ceb849 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon.c +++ b/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon); static boolean_t raidz_will_aarch64_neon_work(void) { - return (B_TRUE); // __arch64__ requires NEON + return (kfpu_allowed()); } const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { diff --git a/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/module/zfs/vdev_raidz_math_aarch64_neon_common.h index 024917417a..e46b253654 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon_common.h +++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h @@ -23,9 +23,11 @@ */ #include -#include +#include +#ifdef __linux__ #define __asm __asm__ __volatile__ +#endif #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -42,7 +44,7 @@ /* * Here we need registers not used otherwise. * They will be used in unused ASM for the case - * with more registers than required... but GGC + * with more registers than required... but GCC * will still need to make sure the constraints * are correct, and duplicate constraints are illegal * ... 
and we use the "register" number as a name @@ -479,10 +481,8 @@ typedef struct v { /* upper part */ \ "and v14.16b," VR0(r) ".16b,v15.16b\n" \ "and v13.16b," VR1(r) ".16b,v15.16b\n" \ - "sshr " VR0(r) ".8h," VR0(r) ".8h,#4\n" \ - "sshr " VR1(r) ".8h," VR1(r) ".8h,#4\n" \ - "and " VR0(r) ".16b," VR0(r) ".16b,v15.16b\n" \ - "and " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \ + "ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n" \ + "ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n" \ \ "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n" \ "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n" \ diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c index f8688a06a8..e072f51cd6 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c +++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2); static boolean_t raidz_will_aarch64_neonx2_work(void) { - return (B_TRUE); // __arch64__ requires NEON + return (kfpu_allowed()); } const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = { diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c index 063d29bcd8..65e4bebce8 100644 --- a/module/zfs/vdev_raidz_math_avx2.c +++ b/module/zfs/vdev_raidz_math_avx2.c @@ -26,9 +26,11 @@ #if defined(__x86_64) && defined(HAVE_AVX2) #include -#include +#include +#ifdef __linux__ #define __asm __asm__ __volatile__ +#endif #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) 
_REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -396,7 +398,7 @@ DEFINE_REC_METHODS(avx2); static boolean_t raidz_will_avx2_work(void) { - return (zfs_avx_available() && zfs_avx2_available()); + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); } const raidz_impl_ops_t vdev_raidz_avx2_impl = { diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c index d605653db3..f06b469023 100644 --- a/module/zfs/vdev_raidz_math_avx512bw.c +++ b/module/zfs/vdev_raidz_math_avx512bw.c @@ -27,10 +27,14 @@ #if defined(__x86_64) && defined(HAVE_AVX512BW) +#include #include -#include +#include + +#ifdef __linux__ #define __asm __asm__ __volatile__ +#endif #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -393,9 +397,8 @@ DEFINE_REC_METHODS(avx512bw); static boolean_t raidz_will_avx512bw_work(void) { - return (zfs_avx_available() && - zfs_avx512f_available() && - zfs_avx512bw_available()); + return (kfpu_allowed() && zfs_avx_available() && + zfs_avx512f_available() && zfs_avx512bw_available()); } const raidz_impl_ops_t vdev_raidz_avx512bw_impl = { diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c index f4e4560ced..aab653b774 100644 --- a/module/zfs/vdev_raidz_math_avx512f.c +++ b/module/zfs/vdev_raidz_math_avx512f.c @@ -28,9 +28,12 @@ #if defined(__x86_64) && defined(HAVE_AVX512F) #include -#include +#include +#include +#ifdef __linux__ #define __asm __asm__ __volatile__ +#endif #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) 
_REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -194,6 +197,8 @@ typedef struct v { "vpternlogd $0x6c,%zmm29, %zmm26, %" VR0(r) "\n" \ "vpternlogd $0x6c,%zmm29, %zmm25, %" VR1(r)); \ break; \ + default: \ + VERIFY(0); \ } \ } @@ -370,6 +375,9 @@ gf_x2_mul_fns[256] = { COPY(R_23(r), _mul_x2_in); \ gf_x2_mul_fns[c](); \ COPY(_mul_x2_acc, R_23(r)); \ + break; \ + default: \ + VERIFY(0); \ } \ } @@ -470,9 +478,8 @@ DEFINE_REC_METHODS(avx512f); static boolean_t raidz_will_avx512f_work(void) { - return (zfs_avx_available() && - zfs_avx2_available() && - zfs_avx512f_available()); + return (kfpu_allowed() && zfs_avx_available() && + zfs_avx2_available() && zfs_avx512f_available()); } const raidz_impl_ops_t vdev_raidz_avx512f_impl = { diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index ea592c0f12..35e016fc65 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -26,6 +26,7 @@ #define _VDEV_RAIDZ_MATH_IMPL_H #include +#include #define raidz_inline inline __attribute__((always_inline)) #ifndef noinline @@ -36,33 +37,33 @@ * Functions calculate multiplication constants for data reconstruction. * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and * used parity columns for reconstruction. - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes * @coeff output array of coefficients. Array must be provided by * user and must hold minimum MUL_CNT values. 
*/ static noinline void -raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)); } static noinline void -raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1)); } static noinline void -raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; gf_t a, b, e; @@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, 
unsigned *coeff) } static noinline void -raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; const unsigned z = tgtidx[TARGET_Z]; @@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private) /* * Generate P parity (RAIDZ1) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_p_impl(raidz_map_t * const rm) +raidz_generate_p_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t psize = rm->rm_col[CODE_P].rc_size; - abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + const size_t ncols = rr->rr_cols; + const size_t psize = rr->rr_col[CODE_P].rc_size; + abd_t *pabd = rr->rr_col[CODE_P].rc_abd; size_t size; abd_t *dabd; raidz_math_begin(); /* start with first data column */ - raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + raidz_copy(pabd, rr->rr_col[1].rc_abd, psize); for (c = 2; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - size = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + size = rr->rr_col[c].rc_size; /* add data column */ raidz_add(pabd, dabd, size); @@ -391,7 +392,7 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize, { v_t *p = (v_t *)c[0]; v_t *q = (v_t *)c[1]; - const v_t *d = (v_t *)dc; + const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const qend = q + (csize / sizeof (v_t)); @@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize, /* * Generate PQ parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pq_impl(raidz_map_t * const rm) +raidz_generate_pq_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = 
rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize); for (c = 3; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, raidz_gen_pq_add); @@ -462,7 +463,7 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, v_t *p = (v_t *)c[0]; v_t *q = (v_t *)c[1]; v_t *r = (v_t *)c[CODE_R]; - const v_t *d = (v_t *)dc; + const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const qend = q + (csize / sizeof (v_t)); @@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, /* * Generate PQR parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pqr_impl(raidz_map_t * const rm) +raidz_generate_pqr_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_P], 
rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize); for (c = 4; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, raidz_gen_pqr_add); @@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm) * @syn_method raidz_add_abd() * @rec_method not applicable * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; size_t size; abd_t *dabd; + if (xabd == NULL) + return (1 << CODE_P); + raidz_math_begin(); /* copy P into target */ - raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize); /* generate p_syndrome */ for (c = firstdc; c < ncols; c++) { if (c == x) continue; - dabd = rm->rm_col[c].rc_abd; - size = MIN(rm->rm_col[c].rc_size, xsize); + dabd = rr->rr_col[c].rc_abd; + size = MIN(rr->rr_col[c].rc_size, xsize); raidz_add(xabd, dabd, size); } @@ -629,7 +633,7 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, const size_t dsize) { v_t *x = (v_t *)xc[TARGET_X]; - const v_t *d = (v_t *)dc; + const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const xend = x + (xsize / sizeof (v_t)); @@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, * 
@syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - abd_t *xabd = rm->rm_col[x].rc_abd; - const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_Q); + unsigned coeff[MUL_CNT]; - raidz_rec_q_coeff(rm, tgtidx, coeff); + raidz_rec_q_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) } /* add Q to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); @@ -720,7 +727,7 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, const size_t dsize) { v_t *x = (v_t *)xc[TARGET_X]; - const v_t *d = (v_t *)dc; + const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const xend = x + 
(tsize / sizeof (v_t)); @@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - * @rm RAIDZ map + * @rr RAIDZ rr * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_R); + unsigned coeff[MUL_CNT]; - raidz_rec_r_coeff(rm, tgtidx, coeff); + raidz_rec_r_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) } /* add R to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); @@ -813,7 +823,7 @@ raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, { v_t *x = (v_t *)tc[TARGET_X]; v_t *y = (v_t *)tc[TARGET_Y]; - const v_t *d = (v_t *)dc; 
+ const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const yend = y + (tsize / sizeof (v_t)); @@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, * @syn_method raidz_syn_pq_abd() * @rec_method raidz_rec_pq_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q)); + unsigned coeff[MUL_CNT]; - raidz_rec_pq_coeff(rm, tgtidx, coeff); + raidz_rec_pq_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -934,8 +947,8 @@ 
raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -971,7 +984,7 @@ raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, { v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; - const v_t *d = (v_t *)dc; + const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const yend = y + (tsize / sizeof (v_t)); @@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pr_abd() * @rec_method raidz_rec_pr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[0]; const size_t y = tgtidx[1]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + 
rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_pr_coeff(rm, tgtidx, coeff); + raidz_rec_pr_coeff(rr, tgtidx, coeff); /* * Check if some of targets are shorter then others. @@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); - return ((1 << CODE_P) | (1 << CODE_Q)); + return ((1 << CODE_P) | (1 << CODE_R)); } @@ -1130,7 +1147,7 @@ raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); - const v_t *d = (v_t *)dc; + const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); SYN_QR_DEFINE(); @@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_qr_abd() * @rec_method raidz_rec_qr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int 
-raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_Q) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_qr_coeff(rm, tgtidx, coeff); + raidz_rec_qr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer 
*/ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -1295,7 +1316,7 @@ raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, v_t *y = (v_t *)c[TARGET_Y]; v_t *z = (v_t *)c[TARGET_Z]; const v_t * const yend = y + (tsize / sizeof (v_t)); - const v_t *d = (v_t *)dc; + const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); SYN_PQR_DEFINE(); @@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pqr_abd() * @rec_method raidz_rec_pqr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t z = tgtidx[TARGET_Z]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - const size_t zsize = rm->rm_col[z].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; - abd_t *zabd = rm->rm_col[z].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + const size_t zsize = rr->rr_col[z].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; + abd_t *zabd = rr->rr_col[z].rc_abd; abd_t *tabds[] = { xabd, yabd, zabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); + unsigned 
coeff[MUL_CNT]; - raidz_rec_pqr_coeff(rm, tgtidx, coeff); + raidz_rec_pqr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, @@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); if (zsize < xsize) - raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); + raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize); raidz_math_end(); diff --git a/module/zfs/vdev_raidz_math_powerpc_altivec.c b/module/zfs/vdev_raidz_math_powerpc_altivec.c new file mode 100644 index 0000000000..1db2c4cd3a --- /dev/null +++ b/module/zfs/vdev_raidz_math_powerpc_altivec.c @@ -0,0 +1,4337 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Romain Dolbeau. All rights reserved. + * + */ + +#include +#include + +#if defined(__powerpc__) +#pragma GCC target("altivec") + +#include "vdev_raidz_math_powerpc_altivec_common.h" + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define GEN_P_STRIDE 4 +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() 
+#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQ_STRIDE 2 +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PR_STRIDE 2 +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_QR_STRIDE 2 +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 
+#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQR_STRIDE 2 +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(powerpc_altivec); +DEFINE_REC_METHODS(powerpc_altivec); + +static boolean_t +raidz_will_powerpc_altivec_work(void) +{ + return (kfpu_allowed()) && zfs_altivec_available(); +} + +const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(powerpc_altivec), + .rec = RAIDZ_REC_METHODS(powerpc_altivec), + .is_supported = &raidz_will_powerpc_altivec_work, + .name = "powerpc_altivec" +}; + +#endif /* defined(__powerpc__) */ + + +#if defined(__powerpc__) +#if defined(_ZFS_LITTLE_ENDIAN) && _LITTLE_ENDIAN +/* BEGIN CSTYLED */ +const uint8_t +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x11, 0x12, 0x17, 0x14, 0x1d, 0x1e, 0x1b, 0x18, + 0x09, 0x0a, 0x0f, 0x0c, 0x05, 0x06, 0x03, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3c, 0x38, 0x34, 0x30, 0x2c, 0x28, 0x24, 0x20, + 0x1c, 0x18, 0x14, 0x10, 0x0c, 0x08, 0x04, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x33, 0x36, 0x39, 0x3c, 0x27, 0x22, 0x2d, 0x28, + 0x1b, 0x1e, 0x11, 0x14, 0x0f, 0x0a, 0x05, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x22, 0x24, 0x2e, 0x28, 0x3a, 0x3c, 0x36, 0x30, + 0x12, 0x14, 0x1e, 0x18, 0x0a, 0x0c, 
0x06, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x2d, 0x2a, 0x23, 0x24, 0x31, 0x36, 0x3f, 0x38, + 0x15, 0x12, 0x1b, 0x1c, 0x09, 0x0e, 0x07, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x77, 0x7e, 0x65, 0x6c, 0x53, 0x5a, 0x41, 0x48, + 0x3f, 0x36, 0x2d, 0x24, 0x1b, 0x12, 0x09, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x66, 0x6c, 0x72, 0x78, 0x4e, 0x44, 0x5a, 0x50, + 0x36, 0x3c, 0x22, 0x28, 0x1e, 0x14, 0x0a, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x69, 
0x62, 0x7f, 0x74, 0x45, 0x4e, 0x53, 0x58, + 0x31, 0x3a, 0x27, 0x2c, 0x1d, 0x16, 0x0b, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x44, 0x48, 0x5c, 0x50, 0x74, 0x78, 0x6c, 0x60, + 0x24, 0x28, 0x3c, 0x30, 0x14, 0x18, 0x0c, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4b, 0x46, 0x51, 0x5c, 0x7f, 0x72, 0x65, 0x68, + 0x23, 0x2e, 0x39, 0x34, 0x17, 0x1a, 0x0d, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x5a, 0x54, 0x46, 0x48, 0x62, 0x6c, 0x7e, 0x70, + 0x2a, 0x24, 0x36, 0x38, 0x12, 0x1c, 0x0e, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x55, 0x5a, 0x4b, 0x44, 0x69, 0x66, 0x77, 0x78, + 0x2d, 0x22, 0x33, 0x3c, 0x11, 0x1e, 0x0f, 0x00 }, + { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88, + 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00 }, + { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xee, 0xfc, 0xca, 0xd8, 0xa6, 0xb4, 0x82, 0x90, + 0x7e, 0x6c, 0x5a, 0x48, 0x36, 0x24, 0x12, 0x00 }, + { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xe1, 0xf2, 0xc7, 0xd4, 0xad, 0xbe, 0x8b, 0x98, + 0x79, 0x6a, 0x5f, 0x4c, 0x35, 0x26, 0x13, 0x00 }, + { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xcc, 0xd8, 0xe4, 0xf0, 0x9c, 0x88, 0xb4, 0xa0, + 0x6c, 0x78, 0x44, 0x50, 0x3c, 0x28, 0x14, 0x00 }, + { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 
0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xc3, 0xd6, 0xe9, 0xfc, 0x97, 0x82, 0xbd, 0xa8, + 0x6b, 0x7e, 0x41, 0x54, 0x3f, 0x2a, 0x15, 0x00 }, + { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xd2, 0xc4, 0xfe, 0xe8, 0x8a, 0x9c, 0xa6, 0xb0, + 0x62, 0x74, 0x4e, 0x58, 0x3a, 0x2c, 0x16, 0x00 }, + { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xdd, 0xca, 0xf3, 0xe4, 0x81, 0x96, 0xaf, 0xb8, + 0x65, 0x72, 0x4b, 0x5c, 0x39, 0x2e, 0x17, 0x00 }, + { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c, + 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x88, 0x90, 0xb8, 0xa0, 0xe8, 0xf0, 0xd8, 0xc0, + 0x48, 0x50, 0x78, 0x60, 0x28, 0x30, 0x18, 0x00 }, + { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c, + 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x87, 0x9e, 0xb5, 0xac, 0xe3, 0xfa, 0xd1, 0xc8, + 0x4f, 0x56, 0x7d, 0x64, 0x2b, 0x32, 0x19, 0x00 }, + { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81, + 0x74, 0x69, 0x53, 0x4e, 
0x3a, 0x27, 0x1d, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x96, 0x8c, 0xa2, 0xb8, 0xfe, 0xe4, 0xca, 0xd0, + 0x46, 0x5c, 0x72, 0x68, 0x2e, 0x34, 0x1a, 0x00 }, + { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81, + 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x99, 0x82, 0xaf, 0xb4, 0xf5, 0xee, 0xc3, 0xd8, + 0x41, 0x5a, 0x77, 0x6c, 0x2d, 0x36, 0x1b, 0x00 }, + { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xb4, 0xa8, 0x8c, 0x90, 0xc4, 0xd8, 0xfc, 0xe0, + 0x54, 0x48, 0x6c, 0x70, 0x24, 0x38, 0x1c, 0x00 }, + { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xaa, 0xb4, 0x96, 0x88, 0xd2, 0xcc, 0xee, 0xf0, + 0x5a, 0x44, 0x66, 0x78, 0x22, 0x3c, 0x1e, 0x00 
}, + { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xa5, 0xba, 0x9b, 0x84, 0xd9, 0xc6, 0xe7, 0xf8, + 0x5d, 0x42, 0x63, 0x7c, 0x21, 0x3e, 0x1f, 0x00 }, + { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xef, 0xce, 0xad, 0x8c, 0x6b, 0x4a, 0x29, 0x08, + 0xe7, 0xc6, 0xa5, 0x84, 0x63, 0x42, 0x21, 0x00 }, + { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, + 0xee, 0xcc, 0xaa, 0x88, 0x66, 0x44, 0x22, 0x00 }, + { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xf1, 0xd2, 
0xb7, 0x94, 0x7d, 0x5e, 0x3b, 0x18, + 0xe9, 0xca, 0xaf, 0x8c, 0x65, 0x46, 0x23, 0x00 }, + { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xdc, 0xf8, 0x94, 0xb0, 0x4c, 0x68, 0x04, 0x20, + 0xfc, 0xd8, 0xb4, 0x90, 0x6c, 0x48, 0x24, 0x00 }, + { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xd3, 0xf6, 0x99, 0xbc, 0x47, 0x62, 0x0d, 0x28, + 0xfb, 0xde, 0xb1, 0x94, 0x6f, 0x4a, 0x25, 0x00 }, + { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xc2, 0xe4, 0x8e, 0xa8, 0x5a, 0x7c, 0x16, 0x30, + 0xf2, 0xd4, 0xbe, 0x98, 0x6a, 0x4c, 0x26, 0x00 }, + { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 
0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x98, 0xb0, 0xc8, 0xe0, 0x38, 0x10, 0x68, 0x40, + 0xd8, 0xf0, 0x88, 0xa0, 0x78, 0x50, 0x28, 0x00 }, + { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x97, 0xbe, 0xc5, 0xec, 0x33, 0x1a, 0x61, 0x48, + 0xdf, 0xf6, 0x8d, 0xa4, 0x7b, 0x52, 0x29, 0x00 }, + { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x86, 0xac, 0xd2, 0xf8, 0x2e, 0x04, 0x7a, 0x50, + 0xd6, 0xfc, 0x82, 0xa8, 0x7e, 0x54, 0x2a, 0x00 }, + { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x89, 0xa2, 0xdf, 0xf4, 0x25, 0x0e, 0x73, 0x58, + 0xd1, 0xfa, 0x87, 0xac, 0x7d, 0x56, 0x2b, 0x00 }, + { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xa4, 0x88, 0xfc, 0xd0, 0x14, 0x38, 0x4c, 0x60, + 0xc4, 0xe8, 0x9c, 0xb0, 0x74, 0x58, 0x2c, 0x00 }, + { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 
0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xab, 0x86, 0xf1, 0xdc, 0x1f, 0x32, 0x45, 0x68, + 0xc3, 0xee, 0x99, 0xb4, 0x77, 0x5a, 0x2d, 0x00 }, + { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xba, 0x94, 0xe6, 0xc8, 0x02, 0x2c, 0x5e, 0x70, + 0xca, 0xe4, 0x96, 0xb8, 0x72, 0x5c, 0x2e, 0x00 }, + { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xb5, 0x9a, 0xeb, 0xc4, 0x09, 0x26, 0x57, 0x78, + 0xcd, 0xe2, 0x93, 0xbc, 0x71, 0x5e, 0x2f, 0x00 }, + { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1f, 0x2e, 0x7d, 0x4c, 0xdb, 0xea, 0xb9, 0x88, + 0x97, 0xa6, 0xf5, 0xc4, 0x53, 0x62, 0x31, 0x00 }, + { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 
0x4e, 0x27, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x0e, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90, + 0x9e, 0xac, 0xfa, 0xc8, 0x56, 0x64, 0x32, 0x00 }, + { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x01, 0x32, 0x67, 0x54, 0xcd, 0xfe, 0xab, 0x98, + 0x99, 0xaa, 0xff, 0xcc, 0x55, 0x66, 0x33, 0x00 }, + { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x2c, 0x18, 0x44, 0x70, 0xfc, 0xc8, 0x94, 0xa0, + 0x8c, 0xb8, 0xe4, 0xd0, 0x5c, 0x68, 0x34, 0x00 }, + { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x23, 0x16, 0x49, 0x7c, 0xf7, 0xc2, 0x9d, 0xa8, + 0x8b, 0xbe, 0xe1, 0xd4, 0x5f, 0x6a, 0x35, 0x00 }, + { 0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x32, 0x04, 0x5e, 0x68, 0xea, 0xdc, 0x86, 0xb0, + 0x82, 0xb4, 0xee, 0xd8, 0x5a, 0x6c, 0x36, 0x00 }, + { 
0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3d, 0x0a, 0x53, 0x64, 0xe1, 0xd6, 0x8f, 0xb8, + 0x85, 0xb2, 0xeb, 0xdc, 0x59, 0x6e, 0x37, 0x00 }, + { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x68, 0x50, 0x18, 0x20, 0x88, 0xb0, 0xf8, 0xc0, + 0xa8, 0x90, 0xd8, 0xe0, 0x48, 0x70, 0x38, 0x00 }, + { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x67, 0x5e, 0x15, 0x2c, 0x83, 0xba, 0xf1, 0xc8, + 0xaf, 0x96, 0xdd, 0xe4, 0x4b, 0x72, 0x39, 0x00 }, + { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x79, 0x42, 0x0f, 0x34, 
0x95, 0xae, 0xe3, 0xd8, + 0xa1, 0x9a, 0xd7, 0xec, 0x4d, 0x76, 0x3b, 0x00 }, + { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x54, 0x68, 0x2c, 0x10, 0xa4, 0x98, 0xdc, 0xe0, + 0xb4, 0x88, 0xcc, 0xf0, 0x44, 0x78, 0x3c, 0x00 }, + { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x5b, 0x66, 0x21, 0x1c, 0xaf, 0x92, 0xd5, 0xe8, + 0xb3, 0x8e, 0xc9, 0xf4, 0x47, 0x7a, 0x3d, 0x00 }, + { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4a, 0x74, 0x36, 0x08, 0xb2, 0x8c, 0xce, 0xf0, + 0xba, 0x84, 0xc6, 0xf8, 0x42, 0x7c, 0x3e, 0x00 }, + { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x45, 0x7a, 0x3b, 0x04, 0xb9, 0x86, 0xc7, 0xf8, + 0xbd, 0x82, 0xc3, 0xfc, 0x41, 0x7e, 0x3f, 0x00 }, + { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, 
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xcf, 0x8e, 0x4d, 0x0c, 0xcb, 0x8a, 0x49, 0x08, + 0xc7, 0x86, 0x45, 0x04, 0xc3, 0x82, 0x41, 0x00 }, + { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xde, 0x9c, 0x5a, 0x18, 0xd6, 0x94, 0x52, 0x10, + 0xce, 0x8c, 0x4a, 0x08, 0xc6, 0x84, 0x42, 0x00 }, + { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xd1, 0x92, 0x57, 0x14, 0xdd, 0x9e, 0x5b, 0x18, + 0xc9, 0x8a, 0x4f, 0x0c, 0xc5, 0x86, 0x43, 0x00 }, + { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xfc, 0xb8, 0x74, 0x30, 0xec, 0xa8, 0x64, 0x20, + 0xdc, 0x98, 0x54, 0x10, 0xcc, 0x88, 0x44, 0x00 }, + { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 
0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xf3, 0xb6, 0x79, 0x3c, 0xe7, 0xa2, 0x6d, 0x28, + 0xdb, 0x9e, 0x51, 0x14, 0xcf, 0x8a, 0x45, 0x00 }, + { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xe2, 0xa4, 0x6e, 0x28, 0xfa, 0xbc, 0x76, 0x30, + 0xd2, 0x94, 0x5e, 0x18, 0xca, 0x8c, 0x46, 0x00 }, + { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xed, 0xaa, 0x63, 0x24, 0xf1, 0xb6, 0x7f, 0x38, + 0xd5, 0x92, 0x5b, 0x1c, 0xc9, 0x8e, 0x47, 0x00 }, + { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb8, 0xf0, 0x28, 0x60, 0x98, 0xd0, 0x08, 0x40, + 0xf8, 0xb0, 0x68, 0x20, 0xd8, 0x90, 0x48, 0x00 }, + { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb7, 0xfe, 0x25, 0x6c, 0x93, 0xda, 0x01, 0x48, + 0xff, 0xb6, 0x6d, 0x24, 0xdb, 0x92, 0x49, 0x00 }, + { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 
0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa6, 0xec, 0x32, 0x78, 0x8e, 0xc4, 0x1a, 0x50, + 0xf6, 0xbc, 0x62, 0x28, 0xde, 0x94, 0x4a, 0x00 }, + { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa9, 0xe2, 0x3f, 0x74, 0x85, 0xce, 0x13, 0x58, + 0xf1, 0xba, 0x67, 0x2c, 0xdd, 0x96, 0x4b, 0x00 }, + { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x84, 0xc8, 0x1c, 0x50, 0xb4, 0xf8, 0x2c, 0x60, + 0xe4, 0xa8, 0x7c, 0x30, 0xd4, 0x98, 0x4c, 0x00 }, + { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x8b, 0xc6, 0x11, 0x5c, 0xbf, 0xf2, 0x25, 0x68, + 0xe3, 0xae, 0x79, 0x34, 0xd7, 0x9a, 0x4d, 0x00 }, + { 0xbf, 0xcb, 0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xbf, 0xcb, 
0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x95, 0xda, 0x0b, 0x44, 0xa9, 0xe6, 0x37, 0x78, + 0xed, 0xa2, 0x73, 0x3c, 0xd1, 0x9e, 0x4f, 0x00 }, + { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x3f, 0x6e, 0x9d, 0xcc, 0x7b, 0x2a, 0xd9, 0x88, + 0xb7, 0xe6, 0x15, 0x44, 0xf3, 0xa2, 0x51, 0x00 }, + { 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x2e, 0x7c, 0x8a, 0xd8, 0x66, 0x34, 0xc2, 0x90, + 0xbe, 0xec, 0x1a, 0x48, 0xf6, 0xa4, 0x52, 0x00 }, + { 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 
0xcb, 0x98, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x0c, 0x58, 0xa4, 0xf0, 0x5c, 0x08, 0xf4, 0xa0, + 0xac, 0xf8, 0x04, 0x50, 0xfc, 0xa8, 0x54, 0x00 }, + { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x03, 0x56, 0xa9, 0xfc, 0x57, 0x02, 0xfd, 0xa8, + 0xab, 0xfe, 0x01, 0x54, 0xff, 0xaa, 0x55, 0x00 }, + { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x12, 0x44, 0xbe, 0xe8, 0x4a, 0x1c, 0xe6, 0xb0, + 0xa2, 0xf4, 0x0e, 0x58, 0xfa, 0xac, 0x56, 0x00 }, + { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x1d, 0x4a, 0xb3, 0xe4, 0x41, 0x16, 0xef, 0xb8, + 0xa5, 0xf2, 0x0b, 0x5c, 0xf9, 0xae, 0x57, 0x00 }, + { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 
0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x48, 0x10, 0xf8, 0xa0, 0x28, 0x70, 0x98, 0xc0, + 0x88, 0xd0, 0x38, 0x60, 0xe8, 0xb0, 0x58, 0x00 }, + { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x47, 0x1e, 0xf5, 0xac, 0x23, 0x7a, 0x91, 0xc8, + 0x8f, 0xd6, 0x3d, 0x64, 0xeb, 0xb2, 0x59, 0x00 }, + { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x56, 0x0c, 0xe2, 0xb8, 0x3e, 0x64, 0x8a, 0xd0, + 0x86, 0xdc, 0x32, 0x68, 0xee, 0xb4, 0x5a, 0x00 }, + { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x59, 0x02, 0xef, 0xb4, 0x35, 0x6e, 0x83, 0xd8, + 0x81, 0xda, 0x37, 0x6c, 0xed, 0xb6, 0x5b, 0x00 }, + { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x74, 0x28, 0xcc, 0x90, 0x04, 0x58, 0xbc, 0xe0, + 0x94, 0xc8, 0x2c, 0x70, 0xe4, 0xb8, 0x5c, 0x00 }, + { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 
0xa0, 0xd0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x7b, 0x26, 0xc1, 0x9c, 0x0f, 0x52, 0xb5, 0xe8, + 0x93, 0xce, 0x29, 0x74, 0xe7, 0xba, 0x5d, 0x00 }, + { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x6a, 0x34, 0xd6, 0x88, 0x12, 0x4c, 0xae, 0xf0, + 0x9a, 0xc4, 0x26, 0x78, 0xe2, 0xbc, 0x5e, 0x00 }, + { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x65, 0x3a, 0xdb, 0x84, 0x19, 0x46, 0xa7, 0xf8, + 0x9d, 0xc2, 0x23, 0x7c, 0xe1, 0xbe, 0x5f, 0x00 }, + { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x2f, 0x4e, 0xed, 0x8c, 0xab, 0xca, 0x69, 0x08, + 0x27, 0x46, 0xe5, 0x84, 0xa3, 0xc2, 0x61, 0x00 }, + { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 
0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x3e, 0x5c, 0xfa, 0x98, 0xb6, 0xd4, 0x72, 0x10, + 0x2e, 0x4c, 0xea, 0x88, 0xa6, 0xc4, 0x62, 0x00 }, + { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x31, 0x52, 0xf7, 0x94, 0xbd, 0xde, 0x7b, 0x18, + 0x29, 0x4a, 0xef, 0x8c, 0xa5, 0xc6, 0x63, 0x00 }, + { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x1c, 0x78, 0xd4, 0xb0, 0x8c, 0xe8, 0x44, 0x20, + 0x3c, 0x58, 0xf4, 0x90, 0xac, 0xc8, 0x64, 0x00 }, + { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x13, 0x76, 0xd9, 0xbc, 0x87, 0xe2, 0x4d, 0x28, + 0x3b, 0x5e, 0xf1, 0x94, 0xaf, 0xca, 0x65, 0x00 }, + { 0x87, 0xc9, 0x1b, 0x55, 0xbf, 0xf1, 0x23, 0x6d, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x02, 0x64, 0xce, 0xa8, 0x9a, 0xfc, 0x56, 0x30, + 0x32, 0x54, 0xfe, 0x98, 0xaa, 0xcc, 0x66, 0x00 }, + { 0x87, 0xc9, 0x1b, 0x55, 
0xbf, 0xf1, 0x23, 0x6d, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x0d, 0x6a, 0xc3, 0xa4, 0x91, 0xf6, 0x5f, 0x38, + 0x35, 0x52, 0xfb, 0x9c, 0xa9, 0xce, 0x67, 0x00 }, + { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x58, 0x30, 0x88, 0xe0, 0xf8, 0x90, 0x28, 0x40, + 0x18, 0x70, 0xc8, 0xa0, 0xb8, 0xd0, 0x68, 0x00 }, + { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x46, 0x2c, 0x92, 0xf8, 0xee, 0x84, 0x3a, 0x50, + 0x16, 0x7c, 0xc2, 0xa8, 0xbe, 0xd4, 0x6a, 0x00 }, + { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x49, 0x22, 0x9f, 0xf4, 0xe5, 0x8e, 0x33, 0x58, 
+ 0x11, 0x7a, 0xc7, 0xac, 0xbd, 0xd6, 0x6b, 0x00 }, + { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x64, 0x08, 0xbc, 0xd0, 0xd4, 0xb8, 0x0c, 0x60, + 0x04, 0x68, 0xdc, 0xb0, 0xb4, 0xd8, 0x6c, 0x00 }, + { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x6b, 0x06, 0xb1, 0xdc, 0xdf, 0xb2, 0x05, 0x68, + 0x03, 0x6e, 0xd9, 0xb4, 0xb7, 0xda, 0x6d, 0x00 }, + { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x7a, 0x14, 0xa6, 0xc8, 0xc2, 0xac, 0x1e, 0x70, + 0x0a, 0x64, 0xd6, 0xb8, 0xb2, 0xdc, 0x6e, 0x00 }, + { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x75, 0x1a, 0xab, 0xc4, 0xc9, 0xa6, 0x17, 0x78, + 0x0d, 0x62, 0xd3, 0xbc, 0xb1, 0xde, 0x6f, 0x00 }, + { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 
0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xdf, 0xae, 0x3d, 0x4c, 0x1b, 0x6a, 0xf9, 0x88, + 0x57, 0x26, 0xb5, 0xc4, 0x93, 0xe2, 0x71, 0x00 }, + { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xce, 0xbc, 0x2a, 0x58, 0x06, 0x74, 0xe2, 0x90, + 0x5e, 0x2c, 0xba, 0xc8, 0x96, 0xe4, 0x72, 0x00 }, + { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xc1, 0xb2, 0x27, 0x54, 0x0d, 0x7e, 0xeb, 0x98, + 0x59, 0x2a, 0xbf, 0xcc, 0x95, 0xe6, 0x73, 0x00 }, + { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 
0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xe3, 0x96, 0x09, 0x7c, 0x37, 0x42, 0xdd, 0xa8, + 0x4b, 0x3e, 0xa1, 0xd4, 0x9f, 0xea, 0x75, 0x00 }, + { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xf2, 0x84, 0x1e, 0x68, 0x2a, 0x5c, 0xc6, 0xb0, + 0x42, 0x34, 0xae, 0xd8, 0x9a, 0xec, 0x76, 0x00 }, + { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xfd, 0x8a, 0x13, 0x64, 0x21, 0x56, 0xcf, 0xb8, + 0x45, 0x32, 0xab, 0xdc, 0x99, 0xee, 0x77, 0x00 }, + { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa8, 0xd0, 0x58, 0x20, 0x48, 0x30, 0xb8, 0xc0, + 0x68, 0x10, 0x98, 0xe0, 0x88, 0xf0, 0x78, 0x00 }, + { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa7, 0xde, 0x55, 0x2c, 0x43, 0x3a, 0xb1, 0xc8, + 0x6f, 0x16, 0x9d, 0xe4, 0x8b, 0xf2, 0x79, 0x00 }, + { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x60, 0xc0, 
0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb6, 0xcc, 0x42, 0x38, 0x5e, 0x24, 0xaa, 0xd0, + 0x66, 0x1c, 0x92, 0xe8, 0x8e, 0xf4, 0x7a, 0x00 }, + { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb9, 0xc2, 0x4f, 0x34, 0x55, 0x2e, 0xa3, 0xd8, + 0x61, 0x1a, 0x97, 0xec, 0x8d, 0xf6, 0x7b, 0x00 }, + { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x94, 0xe8, 0x6c, 0x10, 0x64, 0x18, 0x9c, 0xe0, + 0x74, 0x08, 0x8c, 0xf0, 0x84, 0xf8, 0x7c, 0x00 }, + { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x9b, 0xe6, 0x61, 0x1c, 0x6f, 0x12, 0x95, 0xe8, + 0x73, 0x0e, 0x89, 0xf4, 0x87, 0xfa, 0x7d, 0x00 }, + { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 0xa2, 0xf1, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x8a, 0xf4, 0x76, 0x08, 0x72, 0x0c, 0x8e, 0xf0, + 0x7a, 0x04, 0x86, 0xf8, 0x82, 0xfc, 0x7e, 0x00 }, + { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 
0xa2, 0xf1, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x85, 0xfa, 0x7b, 0x04, 0x79, 0x06, 0x87, 0xf8, + 0x7d, 0x02, 0x83, 0xfc, 0x81, 0xfe, 0x7f, 0x00 }, + { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9e, 0x1c, 0x9a, 0x18, 0x96, 0x14, 0x92, 0x10, + 0x8e, 0x0c, 0x8a, 0x08, 0x86, 0x04, 0x82, 0x00 }, + { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x91, 0x12, 0x97, 0x14, 0x9d, 0x1e, 0x9b, 0x18, + 0x89, 
0x0a, 0x8f, 0x0c, 0x85, 0x06, 0x83, 0x00 }, + { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbc, 0x38, 0xb4, 0x30, 0xac, 0x28, 0xa4, 0x20, + 0x9c, 0x18, 0x94, 0x10, 0x8c, 0x08, 0x84, 0x00 }, + { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb3, 0x36, 0xb9, 0x3c, 0xa7, 0x22, 0xad, 0x28, + 0x9b, 0x1e, 0x91, 0x14, 0x8f, 0x0a, 0x85, 0x00 }, + { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa2, 0x24, 0xae, 0x28, 0xba, 0x3c, 0xb6, 0x30, + 0x92, 0x14, 0x9e, 0x18, 0x8a, 0x0c, 0x86, 0x00 }, + { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xad, 0x2a, 0xa3, 0x24, 0xb1, 0x36, 0xbf, 0x38, + 0x95, 0x12, 0x9b, 0x1c, 0x89, 0x0e, 0x87, 0x00 }, + { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 
0x1d, 0x00, 0x00 }, + { 0xf8, 0x70, 0xe8, 0x60, 0xd8, 0x50, 0xc8, 0x40, + 0xb8, 0x30, 0xa8, 0x20, 0x98, 0x10, 0x88, 0x00 }, + { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf7, 0x7e, 0xe5, 0x6c, 0xd3, 0x5a, 0xc1, 0x48, + 0xbf, 0x36, 0xad, 0x24, 0x9b, 0x12, 0x89, 0x00 }, + { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe6, 0x6c, 0xf2, 0x78, 0xce, 0x44, 0xda, 0x50, + 0xb6, 0x3c, 0xa2, 0x28, 0x9e, 0x14, 0x8a, 0x00 }, + { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe9, 0x62, 0xff, 0x74, 0xc5, 0x4e, 0xd3, 0x58, + 0xb1, 0x3a, 0xa7, 0x2c, 0x9d, 0x16, 0x8b, 0x00 }, + { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc4, 0x48, 0xdc, 0x50, 0xf4, 0x78, 0xec, 0x60, + 0xa4, 0x28, 0xbc, 0x30, 0x94, 0x18, 0x8c, 0x00 }, + { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 
0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xcb, 0x46, 0xd1, 0x5c, 0xff, 0x72, 0xe5, 0x68, + 0xa3, 0x2e, 0xb9, 0x34, 0x97, 0x1a, 0x8d, 0x00 }, + { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xda, 0x54, 0xc6, 0x48, 0xe2, 0x6c, 0xfe, 0x70, + 0xaa, 0x24, 0xb6, 0x38, 0x92, 0x1c, 0x8e, 0x00 }, + { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd5, 0x5a, 0xcb, 0x44, 0xe9, 0x66, 0xf7, 0x78, + 0xad, 0x22, 0xb3, 0x3c, 0x91, 0x1e, 0x8f, 0x00 }, + { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7f, 0xee, 0x5d, 0xcc, 0x3b, 0xaa, 0x19, 0x88, + 0xf7, 0x66, 0xd5, 0x44, 0xb3, 0x22, 0x91, 0x00 }, + { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 
0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6e, 0xfc, 0x4a, 0xd8, 0x26, 0xb4, 0x02, 0x90, + 0xfe, 0x6c, 0xda, 0x48, 0xb6, 0x24, 0x92, 0x00 }, + { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x61, 0xf2, 0x47, 0xd4, 0x2d, 0xbe, 0x0b, 0x98, + 0xf9, 0x6a, 0xdf, 0x4c, 0xb5, 0x26, 0x93, 0x00 }, + { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1, + 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4c, 0xd8, 0x64, 0xf0, 0x1c, 0x88, 0x34, 0xa0, + 0xec, 0x78, 0xc4, 0x50, 0xbc, 0x28, 0x94, 0x00 }, + { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1, + 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x43, 0xd6, 0x69, 0xfc, 0x17, 0x82, 0x3d, 0xa8, + 0xeb, 0x7e, 0xc1, 0x54, 0xbf, 0x2a, 0x95, 0x00 }, + { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc, + 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x52, 0xc4, 0x7e, 0xe8, 0x0a, 0x9c, 0x26, 0xb0, + 0xe2, 0x74, 0xce, 0x58, 0xba, 0x2c, 0x96, 0x00 }, + { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc, 
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5d, 0xca, 0x73, 0xe4, 0x01, 0x96, 0x2f, 0xb8, + 0xe5, 0x72, 0xcb, 0x5c, 0xb9, 0x2e, 0x97, 0x00 }, + { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x08, 0x90, 0x38, 0xa0, 0x68, 0xf0, 0x58, 0xc0, + 0xc8, 0x50, 0xf8, 0x60, 0xa8, 0x30, 0x98, 0x00 }, + { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x07, 0x9e, 0x35, 0xac, 0x63, 0xfa, 0x51, 0xc8, + 0xcf, 0x56, 0xfd, 0x64, 0xab, 0x32, 0x99, 0x00 }, + { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x16, 0x8c, 0x22, 0xb8, 0x7e, 0xe4, 0x4a, 0xd0, + 0xc6, 0x5c, 0xf2, 0x68, 0xae, 0x34, 0x9a, 0x00 }, + { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x19, 0x82, 0x2f, 0xb4, 0x75, 0xee, 0x43, 0xd8, + 0xc1, 0x5a, 0xf7, 
0x6c, 0xad, 0x36, 0x9b, 0x00 }, + { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x3b, 0xa6, 0x01, 0x9c, 0x4f, 0xd2, 0x75, 0xe8, + 0xd3, 0x4e, 0xe9, 0x74, 0xa7, 0x3a, 0x9d, 0x00 }, + { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x2a, 0xb4, 0x16, 0x88, 0x52, 0xcc, 0x6e, 0xf0, + 0xda, 0x44, 0xe6, 0x78, 0xa2, 0x3c, 0x9e, 0x00 }, + { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x25, 0xba, 0x1b, 0x84, 0x59, 0xc6, 0x67, 0xf8, + 0xdd, 0x42, 0xe3, 0x7c, 0xa1, 0x3e, 0x9f, 0x00 }, + { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 
0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6f, 0xce, 0x2d, 0x8c, 0xeb, 0x4a, 0xa9, 0x08, + 0x67, 0xc6, 0x25, 0x84, 0xe3, 0x42, 0xa1, 0x00 }, + { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7e, 0xdc, 0x3a, 0x98, 0xf6, 0x54, 0xb2, 0x10, + 0x6e, 0xcc, 0x2a, 0x88, 0xe6, 0x44, 0xa2, 0x00 }, + { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x71, 0xd2, 0x37, 0x94, 0xfd, 0x5e, 0xbb, 0x18, + 0x69, 0xca, 0x2f, 0x8c, 0xe5, 0x46, 0xa3, 0x00 }, + { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5c, 0xf8, 0x14, 0xb0, 0xcc, 0x68, 0x84, 0x20, + 0x7c, 0xd8, 0x34, 0x90, 0xec, 0x48, 0xa4, 0x00 }, + { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x4e, 0x4e, 
0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x53, 0xf6, 0x19, 0xbc, 0xc7, 0x62, 0x8d, 0x28, + 0x7b, 0xde, 0x31, 0x94, 0xef, 0x4a, 0xa5, 0x00 }, + { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4d, 0xea, 0x03, 0xa4, 0xd1, 0x76, 0x9f, 0x38, + 0x75, 0xd2, 0x3b, 0x9c, 0xe9, 0x4e, 0xa7, 0x00 }, + { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x18, 0xb0, 0x48, 0xe0, 0xb8, 0x10, 0xe8, 0x40, + 0x58, 0xf0, 0x08, 0xa0, 0xf8, 0x50, 0xa8, 0x00 }, + { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x17, 0xbe, 0x45, 0xec, 0xb3, 0x1a, 0xe1, 0x48, + 0x5f, 0xf6, 0x0d, 0xa4, 0xfb, 0x52, 0xa9, 0x00 }, + { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 
0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x06, 0xac, 0x52, 0xf8, 0xae, 0x04, 0xfa, 0x50, + 0x56, 0xfc, 0x02, 0xa8, 0xfe, 0x54, 0xaa, 0x00 }, + { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x09, 0xa2, 0x5f, 0xf4, 0xa5, 0x0e, 0xf3, 0x58, + 0x51, 0xfa, 0x07, 0xac, 0xfd, 0x56, 0xab, 0x00 }, + { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x24, 0x88, 0x7c, 0xd0, 0x94, 0x38, 0xcc, 0x60, + 0x44, 0xe8, 0x1c, 0xb0, 0xf4, 0x58, 0xac, 0x00 }, + { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x2b, 0x86, 0x71, 0xdc, 0x9f, 0x32, 0xc5, 0x68, + 0x43, 0xee, 0x19, 0xb4, 0xf7, 0x5a, 0xad, 0x00 }, + { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x3a, 0x94, 0x66, 0xc8, 0x82, 0x2c, 0xde, 0x70, + 0x4a, 0xe4, 0x16, 0xb8, 0xf2, 0x5c, 0xae, 0x00 }, + { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d, + 0x3e, 
0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x35, 0x9a, 0x6b, 0xc4, 0x89, 0x26, 0xd7, 0x78, + 0x4d, 0xe2, 0x13, 0xbc, 0xf1, 0x5e, 0xaf, 0x00 }, + { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9f, 0x2e, 0xfd, 0x4c, 0x5b, 0xea, 0x39, 0x88, + 0x17, 0xa6, 0x75, 0xc4, 0xd3, 0x62, 0xb1, 0x00 }, + { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8e, 0x3c, 0xea, 0x58, 0x46, 0xf4, 0x22, 0x90, + 0x1e, 0xac, 0x7a, 0xc8, 0xd6, 0x64, 0xb2, 0x00 }, + { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x81, 0x32, 0xe7, 0x54, 0x4d, 0xfe, 0x2b, 0x98, + 0x19, 0xaa, 0x7f, 0xcc, 0xd5, 
0x66, 0xb3, 0x00 }, + { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xac, 0x18, 0xc4, 0x70, 0x7c, 0xc8, 0x14, 0xa0, + 0x0c, 0xb8, 0x64, 0xd0, 0xdc, 0x68, 0xb4, 0x00 }, + { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa3, 0x16, 0xc9, 0x7c, 0x77, 0xc2, 0x1d, 0xa8, + 0x0b, 0xbe, 0x61, 0xd4, 0xdf, 0x6a, 0xb5, 0x00 }, + { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb2, 0x04, 0xde, 0x68, 0x6a, 0xdc, 0x06, 0xb0, + 0x02, 0xb4, 0x6e, 0xd8, 0xda, 0x6c, 0xb6, 0x00 }, + { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbd, 0x0a, 0xd3, 0x64, 0x61, 0xd6, 0x0f, 0xb8, + 0x05, 0xb2, 0x6b, 0xdc, 0xd9, 0x6e, 0xb7, 0x00 }, + { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 
0xe8, 0x50, 0x98, 0x20, 0x08, 0xb0, 0x78, 0xc0, + 0x28, 0x90, 0x58, 0xe0, 0xc8, 0x70, 0xb8, 0x00 }, + { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe7, 0x5e, 0x95, 0x2c, 0x03, 0xba, 0x71, 0xc8, + 0x2f, 0x96, 0x5d, 0xe4, 0xcb, 0x72, 0xb9, 0x00 }, + { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf6, 0x4c, 0x82, 0x38, 0x1e, 0xa4, 0x6a, 0xd0, + 0x26, 0x9c, 0x52, 0xe8, 0xce, 0x74, 0xba, 0x00 }, + { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd4, 0x68, 0xac, 0x10, 0x24, 0x98, 0x5c, 0xe0, + 0x34, 0x88, 0x4c, 0xf0, 0xc4, 0x78, 0xbc, 0x00 }, + { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 
0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xdb, 0x66, 0xa1, 0x1c, 0x2f, 0x92, 0x55, 0xe8, + 0x33, 0x8e, 0x49, 0xf4, 0xc7, 0x7a, 0xbd, 0x00 }, + { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xca, 0x74, 0xb6, 0x08, 0x32, 0x8c, 0x4e, 0xf0, + 0x3a, 0x84, 0x46, 0xf8, 0xc2, 0x7c, 0xbe, 0x00 }, + { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc5, 0x7a, 0xbb, 0x04, 0x39, 0x86, 0x47, 0xf8, + 0x3d, 0x82, 0x43, 0xfc, 0xc1, 0x7e, 0xbf, 0x00 }, + { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4f, 0x8e, 0xcd, 0x0c, 0x4b, 0x8a, 0xc9, 0x08, + 0x47, 0x86, 0xc5, 0x04, 0x43, 0x82, 0xc1, 0x00 }, + { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 0xac, 0x15, 0x89, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, 
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5e, 0x9c, 0xda, 0x18, 0x56, 0x94, 0xd2, 0x10, + 0x4e, 0x8c, 0xca, 0x08, 0x46, 0x84, 0xc2, 0x00 }, + { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 0xac, 0x15, 0x89, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x51, 0x92, 0xd7, 0x14, 0x5d, 0x9e, 0xdb, 0x18, + 0x49, 0x8a, 0xcf, 0x0c, 0x45, 0x86, 0xc3, 0x00 }, + { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae, + 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7c, 0xb8, 0xf4, 0x30, 0x6c, 0xa8, 0xe4, 0x20, + 0x5c, 0x98, 0xd4, 0x10, 0x4c, 0x88, 0xc4, 0x00 }, + { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae, + 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x73, 0xb6, 0xf9, 0x3c, 0x67, 0xa2, 0xed, 0x28, + 0x5b, 0x9e, 0xd1, 0x14, 0x4f, 0x8a, 0xc5, 0x00 }, + { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3, + 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x62, 0xa4, 0xee, 0x28, 0x7a, 0xbc, 0xf6, 0x30, + 0x52, 0x94, 0xde, 0x18, 0x4a, 0x8c, 0xc6, 0x00 }, + { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3, + 0xee, 0x72, 0xcb, 
0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6d, 0xaa, 0xe3, 0x24, 0x71, 0xb6, 0xff, 0x38, + 0x55, 0x92, 0xdb, 0x1c, 0x49, 0x8e, 0xc7, 0x00 }, + { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x38, 0xf0, 0xa8, 0x60, 0x18, 0xd0, 0x88, 0x40, + 0x78, 0xb0, 0xe8, 0x20, 0x58, 0x90, 0xc8, 0x00 }, + { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x37, 0xfe, 0xa5, 0x6c, 0x13, 0xda, 0x81, 0x48, + 0x7f, 0xb6, 0xed, 0x24, 0x5b, 0x92, 0xc9, 0x00 }, + { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x26, 0xec, 0xb2, 0x78, 0x0e, 0xc4, 0x9a, 0x50, + 0x76, 0xbc, 0xe2, 0x28, 0x5e, 0x94, 0xca, 0x00 }, + { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x29, 0xe2, 0xbf, 0x74, 0x05, 0xce, 0x93, 0x58, + 0x71, 0xba, 0xe7, 0x2c, 0x5d, 0x96, 0xcb, 
0x00 }, + { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x04, 0xc8, 0x9c, 0x50, 0x34, 0xf8, 0xac, 0x60, + 0x64, 0xa8, 0xfc, 0x30, 0x54, 0x98, 0xcc, 0x00 }, + { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x0b, 0xc6, 0x91, 0x5c, 0x3f, 0xf2, 0xa5, 0x68, + 0x63, 0xae, 0xf9, 0x34, 0x57, 0x9a, 0xcd, 0x00 }, + { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x1a, 0xd4, 0x86, 0x48, 0x22, 0xec, 0xbe, 0x70, + 0x6a, 0xa4, 0xf6, 0x38, 0x52, 0x9c, 0xce, 0x00 }, + { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c, + 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb0, 0x60, 
0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c, + 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbf, 0x6e, 0x1d, 0xcc, 0xfb, 0x2a, 0x59, 0x88, + 0x37, 0xe6, 0x95, 0x44, 0x73, 0xa2, 0xd1, 0x00 }, + { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61, + 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61, + 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa1, 0x72, 0x07, 0xd4, 0xed, 0x3e, 0x4b, 0x98, + 0x39, 0xea, 0x9f, 0x4c, 0x75, 0xa6, 0xd3, 0x00 }, + { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8c, 0x58, 0x24, 0xf0, 0xdc, 0x08, 0x74, 0xa0, + 0x2c, 0xf8, 0x84, 0x50, 0x7c, 0xa8, 0xd4, 0x00 }, + { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 
0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x83, 0x56, 0x29, 0xfc, 0xd7, 0x02, 0x7d, 0xa8, + 0x2b, 0xfe, 0x81, 0x54, 0x7f, 0xaa, 0xd5, 0x00 }, + { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x92, 0x44, 0x3e, 0xe8, 0xca, 0x1c, 0x66, 0xb0, + 0x22, 0xf4, 0x8e, 0x58, 0x7a, 0xac, 0xd6, 0x00 }, + { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9d, 0x4a, 0x33, 0xe4, 0xc1, 0x16, 0x6f, 0xb8, + 0x25, 0xf2, 0x8b, 0x5c, 0x79, 0xae, 0xd7, 0x00 }, + { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc8, 0x10, 0x78, 0xa0, 0xa8, 0x70, 0x18, 0xc0, + 0x08, 0xd0, 0xb8, 0x60, 0x68, 0xb0, 0xd8, 0x00 }, + { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc7, 0x1e, 0x75, 0xac, 0xa3, 0x7a, 0x11, 0xc8, + 0x0f, 0xd6, 0xbd, 0x64, 0x6b, 0xb2, 0xd9, 0x00 }, + { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 
0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd6, 0x0c, 0x62, 0xb8, 0xbe, 0x64, 0x0a, 0xd0, + 0x06, 0xdc, 0xb2, 0x68, 0x6e, 0xb4, 0xda, 0x00 }, + { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd9, 0x02, 0x6f, 0xb4, 0xb5, 0x6e, 0x03, 0xd8, + 0x01, 0xda, 0xb7, 0x6c, 0x6d, 0xb6, 0xdb, 0x00 }, + { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf4, 0x28, 0x4c, 0x90, 0x84, 0x58, 0x3c, 0xe0, + 0x14, 0xc8, 0xac, 0x70, 0x64, 0xb8, 0xdc, 0x00 }, + { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xfb, 0x26, 0x41, 0x9c, 0x8f, 0x52, 0x35, 0xe8, + 0x13, 0xce, 0xa9, 0x74, 0x67, 0xba, 0xdd, 0x00 }, + { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xea, 0x34, 0x56, 0x88, 0x92, 0x4c, 0x2e, 0xf0, + 0x1a, 0xc4, 0xa6, 0x78, 0x62, 0xbc, 0xde, 0x00 }, + { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 
0x02, 0x81, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe5, 0x3a, 0x5b, 0x84, 0x99, 0x46, 0x27, 0xf8, + 0x1d, 0xc2, 0xa3, 0x7c, 0x61, 0xbe, 0xdf, 0x00 }, + { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xaf, 0x4e, 0x6d, 0x8c, 0x2b, 0xca, 0xe9, 0x08, + 0xa7, 0x46, 0x65, 0x84, 0x23, 0xc2, 0xe1, 0x00 }, + { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbe, 0x5c, 0x7a, 0x98, 0x36, 0xd4, 0xf2, 0x10, + 0xae, 0x4c, 0x6a, 0x88, 0x26, 0xc4, 0xe2, 0x00 }, + { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb1, 0x52, 0x77, 0x94, 0x3d, 0xde, 0xfb, 0x18, + 0xa9, 0x4a, 0x6f, 0x8c, 0x25, 0xc6, 0xe3, 0x00 }, + { 
0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9c, 0x78, 0x54, 0xb0, 0x0c, 0xe8, 0xc4, 0x20, + 0xbc, 0x58, 0x74, 0x90, 0x2c, 0xc8, 0xe4, 0x00 }, + { 0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x93, 0x76, 0x59, 0xbc, 0x07, 0xe2, 0xcd, 0x28, + 0xbb, 0x5e, 0x71, 0x94, 0x2f, 0xca, 0xe5, 0x00 }, + { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x82, 0x64, 0x4e, 0xa8, 0x1a, 0xfc, 0xd6, 0x30, + 0xb2, 0x54, 0x7e, 0x98, 0x2a, 0xcc, 0xe6, 0x00 }, + { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8d, 0x6a, 0x43, 0xa4, 0x11, 0xf6, 0xdf, 0x38, + 0xb5, 0x52, 0x7b, 0x9c, 0x29, 0xce, 0xe7, 0x00 }, + { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd8, 0x30, 0x08, 0xe0, 
0x78, 0x90, 0xa8, 0x40, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd7, 0x3e, 0x05, 0xec, 0x73, 0x9a, 0xa1, 0x48, + 0x9f, 0x76, 0x4d, 0xa4, 0x3b, 0xd2, 0xe9, 0x00 }, + { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc6, 0x2c, 0x12, 0xf8, 0x6e, 0x84, 0xba, 0x50, + 0x96, 0x7c, 0x42, 0xa8, 0x3e, 0xd4, 0xea, 0x00 }, + { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc9, 0x22, 0x1f, 0xf4, 0x65, 0x8e, 0xb3, 0x58, + 0x91, 0x7a, 0x47, 0xac, 0x3d, 0xd6, 0xeb, 0x00 }, + { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe4, 0x08, 0x3c, 0xd0, 0x54, 0xb8, 0x8c, 0x60, + 0x84, 0x68, 0x5c, 0xb0, 0x34, 0xd8, 0xec, 0x00 }, + { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, 
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xeb, 0x06, 0x31, 0xdc, 0x5f, 0xb2, 0x85, 0x68, + 0x83, 0x6e, 0x59, 0xb4, 0x37, 0xda, 0xed, 0x00 }, + { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xfa, 0x14, 0x26, 0xc8, 0x42, 0xac, 0x9e, 0x70, + 0x8a, 0x64, 0x56, 0xb8, 0x32, 0xdc, 0xee, 0x00 }, + { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf5, 0x1a, 0x2b, 0xc4, 0x49, 0xa6, 0x97, 0x78, + 0x8d, 0x62, 0x53, 0xbc, 0x31, 0xde, 0xef, 0x00 }, + { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5f, 0xae, 0xbd, 0x4c, 0x9b, 0x6a, 0x79, 0x88, + 0xd7, 0x26, 0x35, 0xc4, 0x13, 0xe2, 0xf1, 0x00 }, + { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 
0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4e, 0xbc, 0xaa, 0x58, 0x86, 0x74, 0x62, 0x90, + 0xde, 0x2c, 0x3a, 0xc8, 0x16, 0xe4, 0xf2, 0x00 }, + { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x41, 0xb2, 0xa7, 0x54, 0x8d, 0x7e, 0x6b, 0x98, + 0xd9, 0x2a, 0x3f, 0xcc, 0x15, 0xe6, 0xf3, 0x00 }, + { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6c, 0x98, 0x84, 0x70, 0xbc, 0x48, 0x54, 0xa0, + 0xcc, 0x38, 0x24, 0xd0, 0x1c, 0xe8, 0xf4, 0x00 }, + { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x72, 0x84, 0x9e, 0x68, 0xaa, 0x5c, 0x46, 0xb0, + 0xc2, 0x34, 0x2e, 0xd8, 0x1a, 0xec, 0xf6, 0x00 }, + { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 
0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7d, 0x8a, 0x93, 0x64, 0xa1, 0x56, 0x4f, 0xb8, + 0xc5, 0x32, 0x2b, 0xdc, 0x19, 0xee, 0xf7, 0x00 }, + { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x28, 0xd0, 0xd8, 0x20, 0xc8, 0x30, 0x38, 0xc0, + 0xe8, 0x10, 0x18, 0xe0, 0x08, 0xf0, 0xf8, 0x00 }, + { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x27, 0xde, 0xd5, 0x2c, 0xc3, 0x3a, 0x31, 0xc8, + 0xef, 0x16, 0x1d, 0xe4, 0x0b, 0xf2, 0xf9, 0x00 }, + { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x36, 0xcc, 0xc2, 0x38, 0xde, 0x24, 0x2a, 0xd0, + 0xe6, 0x1c, 0x12, 0xe8, 0x0e, 0xf4, 0xfa, 0x00 }, + { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x39, 0xc2, 0xcf, 0x34, 0xd5, 0x2e, 0x23, 0xd8, + 0xe1, 0x1a, 0x17, 0xec, 0x0d, 0xf6, 0xfb, 0x00 }, + { 0xc3, 0x78, 
0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x14, 0xe8, 0xec, 0x10, 0xe4, 0x18, 0x1c, 0xe0, + 0xf4, 0x08, 0x0c, 0xf0, 0x04, 0xf8, 0xfc, 0x00 }, + { 0xc3, 0x78, 0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x1b, 0xe6, 0xe1, 0x1c, 0xef, 0x12, 0x15, 0xe8, + 0xf3, 0x0e, 0x09, 0xf4, 0x07, 0xfa, 0xfd, 0x00 }, + { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x0a, 0xf4, 0xf6, 0x08, 0xf2, 0x0c, 0x0e, 0xf0, + 0xfa, 0x04, 0x06, 0xf8, 0x02, 0xfc, 0xfe, 0x00 }, + { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x05, 0xfa, 0xfb, 0x04, 0xf9, 0x06, 0x07, 0xf8, + 0xfd, 0x02, 0x03, 0xfc, 0x01, 0xfe, 0xff, 0x00 } +}; +/* END CSTYLED */ +#else +/* BEGIN CSTYLED */ +const uint8_t +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, + 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, + 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 
0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, + 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, + 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, + 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, + 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, + 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 
0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, + 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, + 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, + 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, + 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, + 0x70, 0x7e, 0x6c, 0x62, 0x48, 
0x46, 0x54, 0x5a }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, + 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, + 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 
0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, + 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, + 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, + 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, + 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, + 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, + 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, + 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, + 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, + 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, + 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, + 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, + 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, + 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 
0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, + 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, + 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, + 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, + 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 
0xc2 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, + 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, + 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, + 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2b, 
0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, + 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, + 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, + 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, + 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, + 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, + 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, + 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, + 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, + 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 
0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, + 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, + 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, + 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, + 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, + 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x4c, 0x6b, 0x1f, 0x38, 0xea, 
0xcd, 0xb9, 0x9e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, + 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, + 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, + 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, + 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a }, + { 
0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, + 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7, + 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce, + 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x43, 0x86, 0xc5, 
0x0c, 0x4f, 0x8a, 0xc9, + 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc, + 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb, + 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2, + 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5, + 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8, + 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff, + 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6, + 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1, + 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4, + 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 
0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3, + 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed, + 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7, + 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 
0x70 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe, + 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac, + 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab, + 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2, + 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 }, + { 0x00, 0x69, 
0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5, + 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88, + 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f, + 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86, + 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 
0xda, 0x81, + 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94, + 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93, + 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a, + 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d, + 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 
0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27, + 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e, + 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29, + 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c, + 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 
0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b, + 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32, + 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35, + 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18, + 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, + { 
0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16, + 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11, + 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04, + 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03, + 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a, + 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a }, + { 0x00, 0x4e, 0x81, 0xcf, 
0x02, 0x4c, 0x83, 0xcd, + 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d, + 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57, + 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e, + 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59, 
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b, + 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42, + 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45, + 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 
0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68, + 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f, + 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66, + 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61, + 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74, + 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 
0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73, + 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a, + 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d, + 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, + { 0x00, 0x20, 
0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e, + 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89, + 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c, + 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b, + 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92, + 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 
0x57, 0xbf, + 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95, + 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8, + 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf, + 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6, + 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1, + 0x58, 
0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4, + 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3, + 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa, + 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad, + 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 
0x4e, 0x53, 0x53 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7, + 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe, + 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9, + 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec, + 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 
0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb, + 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2, + 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5, + 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8, + 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf, + 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, + { 0x00, 0xa0, 0x40, 0xe0, 
0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6, + 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1, + 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3, + 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda, + 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd, + 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67, + 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e, + 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69, + 0x18, 0xbb, 0x5e, 
0xfd, 0x94, 0x37, 0xd2, 0x71 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c, + 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b, + 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75, + 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 
0x4e }, + { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58, + 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f, + 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56, + 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51, + 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44, + 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 
0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43, + 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a, + 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d, + 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17, + 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 
0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e, + 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19, + 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c, + 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b, + 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02, + 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 
0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05, + 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28, + 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f, + 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26, + 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 
0x8f, 0x42, 0xf9 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34, + 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33, + 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a, + 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d, + 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 
0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47, + 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e, + 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49, + 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c, + 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 
0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b, + 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52, + 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55, + 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78, + 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f, + 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76, + 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71, + 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64, + 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63, + 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a, + 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 
0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37, + 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39, + 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 
0xa1 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c, + 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b, + 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22, + 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25, + 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd8, 
0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08, + 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f, + 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06, + 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01, + 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14, + 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 
0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13, + 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a, + 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d, + 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7, + 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 
0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae, + 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9, + 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc, + 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb, + 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2, + 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 
0x67, 0x90, 0x36 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5, + 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f, + 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96, + 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91, + 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 }, + { 
0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84, + 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83, + 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a, + 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d, + 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 
0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7, + 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde, + 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9, + 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc, + 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2, + 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5, + 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8, + 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef, + 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 
0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6, + 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1, + 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4, + 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3, + 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa, + 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 
0xde }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd, + 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } +}; +/* END CSTYLED */ +#endif // ENDIANNESS +#endif /* defined(__powerpc__) */ diff --git a/module/zfs/vdev_raidz_math_powerpc_altivec_common.h b/module/zfs/vdev_raidz_math_powerpc_altivec_common.h new file mode 100644 index 0000000000..3842f5fd63 --- /dev/null +++ b/module/zfs/vdev_raidz_math_powerpc_altivec_common.h @@ -0,0 +1,690 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Romain Dolbeau. All rights reserved. + * + */ + +#include +#include + +#ifdef __linux__ +#define __asm __asm__ __volatile__ +#endif + +#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N +#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) + +#define VR0_(REG, ...) "%[w"#REG"]" +#define VR1_(_1, REG, ...) "%[w"#REG"]" +#define VR2_(_1, _2, REG, ...) "%[w"#REG"]" +#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]" +#define VR4_(_1, _2, _3, _4, REG, ...) 
"%[w"#REG"]" +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]" +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]" +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]" + +/* + * Here we need registers not used otherwise. + * They will be used in unused ASM for the case + * with more registers than required... but GCC + * will still need to make sure the constraints + * are correct, and duplicate constraints are illegal + * ... and we use the "register" number as a name + */ + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 36) +#define VR3(r...) VR3_(r, 36, 35) +#define VR4(r...) VR4_(r, 36, 35, 34, 33) +#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32) +#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31) +#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define VR(X) "%[w"#X"]" + +#define RVR0_(REG, ...) [w##REG] "v" (w##REG) +#define RVR1_(_1, REG, ...) [w##REG] "v" (w##REG) +#define RVR2_(_1, _2, REG, ...) [w##REG] "v" (w##REG) +#define RVR3_(_1, _2, _3, REG, ...) [w##REG] "v" (w##REG) +#define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "v" (w##REG) +#define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "v" (w##REG) +#define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "v" (w##REG) +#define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "v" (w##REG) + +#define RVR0(r...) RVR0_(r) +#define RVR1(r...) RVR1_(r) +#define RVR2(r...) RVR2_(r, 36) +#define RVR3(r...) RVR3_(r, 36, 35) +#define RVR4(r...) RVR4_(r, 36, 35, 34, 33) +#define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32) +#define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31) +#define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define RVR(X) [w##X] "v" (w##X) + +#define WVR0_(REG, ...) [w##REG] "=v" (w##REG) +#define WVR1_(_1, REG, ...) [w##REG] "=v" (w##REG) +#define WVR2_(_1, _2, REG, ...) [w##REG] "=v" (w##REG) +#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=v" (w##REG) +#define WVR4_(_1, _2, _3, _4, REG, ...) 
[w##REG] "=v" (w##REG) +#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=v" (w##REG) +#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=v" (w##REG) +#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=v" (w##REG) + +#define WVR0(r...) WVR0_(r) +#define WVR1(r...) WVR1_(r) +#define WVR2(r...) WVR2_(r, 36) +#define WVR3(r...) WVR3_(r, 36, 35) +#define WVR4(r...) WVR4_(r, 36, 35, 34, 33) +#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32) +#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31) +#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define WVR(X) [w##X] "=v" (w##X) + +#define UVR0_(REG, ...) [w##REG] "+&v" (w##REG) +#define UVR1_(_1, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR2_(_1, _2, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&v" (w##REG) + +#define UVR0(r...) UVR0_(r) +#define UVR1(r...) UVR1_(r) +#define UVR2(r...) UVR2_(r, 36) +#define UVR3(r...) UVR3_(r, 36, 35) +#define UVR4(r...) UVR4_(r, 36, 35, 34, 33) +#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32) +#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31) +#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define UVR(X) [w##X] "+&v" (w##X) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ZFS_ASM_BUG() ASSERT(0) + +#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val) + +extern const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 16 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + +#define XOR_ACC(src, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "lvx 21,0,%[SRC0]\n" \ + "lvx 20,0,%[SRC1]\n" \ + "lvx 19,0,%[SRC2]\n" \ + "lvx 18,0,%[SRC3]\n" \ + "vxor " VR0(r) "," VR0(r) ",21\n" \ + "vxor " VR1(r) "," VR1(r) ",20\n" \ + "vxor " VR2(r) "," VR2(r) ",19\n" \ + "vxor " VR3(r) "," VR3(r) ",18\n" \ + "lvx 21,0,%[SRC4]\n" \ + "lvx 20,0,%[SRC5]\n" \ + "lvx 19,0,%[SRC6]\n" \ + "lvx 18,0,%[SRC7]\n" \ + "vxor " VR4(r) "," VR4(r) ",21\n" \ + "vxor " VR5(r) "," VR5(r) ",20\n" \ + "vxor " VR6(r) "," VR6(r) ",19\n" \ + "vxor " VR7(r) "," VR7(r) ",18\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \ + UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48))), \ + [SRC4] "r" ((OFFSET(src, 64))), \ + [SRC5] "r" ((OFFSET(src, 80))), \ + [SRC6] "r" ((OFFSET(src, 96))), \ + [SRC7] "r" ((OFFSET(src, 112))) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 4: \ + __asm( \ + "lvx 21,0,%[SRC0]\n" \ + "lvx 20,0,%[SRC1]\n" \ + "lvx 19,0,%[SRC2]\n" \ + "lvx 18,0,%[SRC3]\n" \ + "vxor " VR0(r) "," VR0(r) ",21\n" \ + "vxor " VR1(r) "," VR1(r) ",20\n" \ + "vxor " VR2(r) "," VR2(r) ",19\n" \ + "vxor " VR3(r) "," VR3(r) ",18\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48))) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 2: \ + __asm( \ + "lvx 21,0,%[SRC0]\n" \ + "lvx 20,0,%[SRC1]\n" \ + "vxor " VR0(r) "," VR0(r) ",21\n" \ + "vxor " VR1(r) "," VR1(r) ",20\n" \ + : UVR0(r), UVR1(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))) \ + : "v20", "v21"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define XOR(r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vxor " VR4(r) "," VR4(r) "," VR0(r) "\n" \ + "vxor " VR5(r) "," VR5(r) "," VR1(r) "\n" \ + "vxor " VR6(r) "," VR6(r) "," VR2(r) "\n" \ + "vxor " VR7(r) "," VR7(r) "," VR3(r) "\n" \ + : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ + break; \ + case 4: \ + __asm( \ + "vxor " VR2(r) "," VR2(r) "," VR0(r) "\n" \ + "vxor " VR3(r) "," VR3(r) "," VR1(r) "\n" \ + : UVR2(r), UVR3(r) \ + : RVR0(r), RVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ + "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \ + "vxor " VR4(r) "," VR4(r) "," VR4(r) "\n" \ + "vxor " VR5(r) "," VR5(r) "," VR5(r) "\n" \ + "vxor " VR6(r) "," VR6(r) "," VR6(r) "\n" \ + "vxor " VR7(r) "," VR7(r) "," VR7(r) "\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ + "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \ + break; \ + case 2: \ + __asm( \ + "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + : WVR0(r), WVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define COPY(r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vor " VR4(r) "," VR0(r) "," VR0(r) "\n" \ + "vor " VR5(r) "," VR1(r) "," VR1(r) "\n" \ + "vor " VR6(r) "," VR2(r) "," VR2(r) "\n" \ + "vor " VR7(r) "," VR3(r) "," VR3(r) "\n" \ + : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ + break; \ + case 4: \ + __asm( \ + "vor " VR2(r) "," VR0(r) "," VR0(r) "\n" \ + "vor " VR3(r) "," VR1(r) "," VR1(r) "\n" \ + : WVR2(r), WVR3(r) \ + : RVR0(r), RVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "lvx " VR0(r) " ,0,%[SRC0]\n" \ + "lvx " VR1(r) " ,0,%[SRC1]\n" \ + "lvx " VR2(r) " ,0,%[SRC2]\n" \ + "lvx " VR3(r) " ,0,%[SRC3]\n" \ + "lvx " VR4(r) " ,0,%[SRC4]\n" \ + "lvx " VR5(r) " ,0,%[SRC5]\n" \ + "lvx " VR6(r) " ,0,%[SRC6]\n" \ + "lvx " VR7(r) " ,0,%[SRC7]\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48))), \ + [SRC4] "r" ((OFFSET(src, 64))), \ + [SRC5] "r" ((OFFSET(src, 80))), \ + [SRC6] "r" ((OFFSET(src, 96))), \ + [SRC7] "r" ((OFFSET(src, 112)))); \ + break; \ + case 4: \ + __asm( \ + "lvx " VR0(r) " ,0,%[SRC0]\n" \ + "lvx " VR1(r) " ,0,%[SRC1]\n" \ + "lvx " VR2(r) " ,0,%[SRC2]\n" \ + "lvx " VR3(r) " ,0,%[SRC3]\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48)))); \ + break; \ + case 2: \ + __asm( \ + "lvx " VR0(r) " ,0,%[SRC0]\n" \ + "lvx " VR1(r) " ,0,%[SRC1]\n" \ + : WVR0(r), WVR1(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16)))); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "stvx " VR0(r) " ,0,%[DST0]\n" \ + "stvx " VR1(r) " ,0,%[DST1]\n" \ + "stvx " VR2(r) " ,0,%[DST2]\n" \ + "stvx " VR3(r) " ,0,%[DST3]\n" \ + "stvx " VR4(r) " ,0,%[DST4]\n" \ + "stvx " VR5(r) " ,0,%[DST5]\n" \ + "stvx " VR6(r) " ,0,%[DST6]\n" \ + "stvx " VR7(r) " ,0,%[DST7]\n" \ + : : [DST0] "r" ((OFFSET(dst, 0))), \ + [DST1] "r" ((OFFSET(dst, 16))), \ + [DST2] "r" ((OFFSET(dst, 32))), \ + [DST3] "r" ((OFFSET(dst, 48))), \ + [DST4] "r" ((OFFSET(dst, 64))), \ + [DST5] "r" ((OFFSET(dst, 80))), \ + [DST6] "r" ((OFFSET(dst, 96))), \ + [DST7] "r" ((OFFSET(dst, 112))), \ + RVR0(r), RVR1(r), RVR2(r), RVR3(r), \ + RVR4(r), RVR5(r), RVR6(r), RVR7(r) \ + : "memory"); \ + break; \ + case 4: \ + __asm( \ + "stvx " VR0(r) " ,0,%[DST0]\n" \ + "stvx " VR1(r) " ,0,%[DST1]\n" \ + "stvx " VR2(r) " ,0,%[DST2]\n" \ + "stvx " VR3(r) " ,0,%[DST3]\n" \ + : : [DST0] "r" ((OFFSET(dst, 0))), \ + [DST1] "r" ((OFFSET(dst, 16))), \ + [DST2] "r" ((OFFSET(dst, 32))), \ + [DST3] "r" ((OFFSET(dst, 48))), \ + RVR0(r), RVR1(r), RVR2(r), RVR3(r) \ + : "memory"); \ + break; \ + case 2: \ + __asm( \ + "stvx " VR0(r) " ,0,%[DST0]\n" \ + "stvx " VR1(r) " ,0,%[DST1]\n" \ + : : [DST0] "r" ((OFFSET(dst, 0))), \ + [DST1] "r" ((OFFSET(dst, 16))), \ + RVR0(r), RVR1(r) : "memory"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +/* + * Unfortunately cannot use the macro, because GCC + * will try to use the macro name and not value + * later on... + * Kept as a reference to what a numbered variable is + */ +#define _00 "17" +#define _1d "16" +#define _temp0 "19" +#define _temp1 "18" + +#define MUL2_SETUP() \ +{ \ + __asm( \ + "vspltisb " VR(16) ",14\n" \ + "vspltisb " VR(17) ",15\n" \ + "vaddubm " VR(16) "," VR(17) "," VR(16) "\n" \ + "vxor " VR(17) "," VR(17) "," VR(17) "\n" \ + : WVR(16), WVR(17)); \ +} + +#define MUL2(r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ + "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ + "vcmpgtsb 21," VR(17) "," VR2(r) "\n" \ + "vcmpgtsb 20," VR(17) "," VR3(r) "\n" \ + "vand 19,19," VR(16) "\n" \ + "vand 18,18," VR(16) "\n" \ + "vand 21,21," VR(16) "\n" \ + "vand 20,20," VR(16) "\n" \ + "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vaddubm " VR2(r) "," VR2(r) "," VR2(r) "\n" \ + "vaddubm " VR3(r) "," VR3(r) "," VR3(r) "\n" \ + "vxor " VR0(r) ",19," VR0(r) "\n" \ + "vxor " VR1(r) ",18," VR1(r) "\n" \ + "vxor " VR2(r) ",21," VR2(r) "\n" \ + "vxor " VR3(r) ",20," VR3(r) "\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ + : RVR(17), RVR(16) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 2: \ + __asm( \ + "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ + "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ + "vand 19,19," VR(16) "\n" \ + "vand 18,18," VR(16) "\n" \ + "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vxor " VR0(r) ",19," VR0(r) "\n" \ + "vxor " VR1(r) ",18," VR1(r) "\n" \ + : UVR0(r), UVR1(r) \ + : RVR(17), RVR(16) \ + : "v18", "v19"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +/* + * Unfortunately cannot use the macro, because GCC + * will try to use the macro name and not value + * later on... + * Kept as a reference to what a register is + * (here we're using actual registers for the + * clobbered ones) + */ +#define _0f "15" +#define _a_save "14" +#define _b_save "13" +#define _lt_mod_a "12" +#define _lt_clmul_a "11" +#define _lt_mod_b "10" +#define _lt_clmul_b "15" + +#define _MULx2(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + /* lts for upper part */ \ + "vspltisb 15,15\n" \ + "lvx 10,0,%[lt0]\n" \ + "lvx 11,0,%[lt1]\n" \ + /* upper part */ \ + "vand 14," VR0(r) ",15\n" \ + "vand 13," VR1(r) ",15\n" \ + "vspltisb 15,4\n" \ + "vsrab " VR0(r) "," VR0(r) ",15\n" \ + "vsrab " VR1(r) "," VR1(r) ",15\n" \ + \ + "vperm 12,10,10," VR0(r) "\n" \ + "vperm 10,10,10," VR1(r) "\n" \ + "vperm 15,11,11," VR0(r) "\n" \ + "vperm 11,11,11," VR1(r) "\n" \ + \ + "vxor " VR0(r) ",15,12\n" \ + "vxor " VR1(r) ",11,10\n" \ + /* lts for lower part */ \ + "lvx 10,0,%[lt2]\n" \ + "lvx 15,0,%[lt3]\n" \ + /* lower part */ \ + "vperm 12,10,10,14\n" \ + "vperm 10,10,10,13\n" \ + "vperm 11,15,15,14\n" \ + "vperm 15,15,15,13\n" \ + \ + "vxor " VR0(r) "," VR0(r) ",12\n" \ + "vxor " VR1(r) "," VR1(r) ",10\n" \ + "vxor " VR0(r) "," VR0(r) ",11\n" \ + "vxor " VR1(r) "," VR1(r) ",15\n" \ + : UVR0(r), UVR1(r) \ + : [lt0] "r" (&(gf_clmul_mod_lt[4*(c)+0][0])), \ + [lt1] "r" (&(gf_clmul_mod_lt[4*(c)+1][0])), \ + [lt2] "r" (&(gf_clmul_mod_lt[4*(c)+2][0])), \ + [lt3] "r" (&(gf_clmul_mod_lt[4*(c)+3][0])) \ + : "v10", "v11", "v12", "v13", "v14", "v15"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_23(r)); \ + _MULx2(c, R_01(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + +/* Overkill... 
*/ +#if 0 // defined(_KERNEL) +#define GEN_X_DEFINE_0_3() \ +register unsigned char w0 asm("0") __attribute__((vector_size(16))); \ +register unsigned char w1 asm("1") __attribute__((vector_size(16))); \ +register unsigned char w2 asm("2") __attribute__((vector_size(16))); \ +register unsigned char w3 asm("3") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_4_5() \ +register unsigned char w4 asm("4") __attribute__((vector_size(16))); \ +register unsigned char w5 asm("5") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_6_7() \ +register unsigned char w6 asm("6") __attribute__((vector_size(16))); \ +register unsigned char w7 asm("7") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_8_9() \ +register unsigned char w8 asm("8") __attribute__((vector_size(16))); \ +register unsigned char w9 asm("9") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_10_11() \ +register unsigned char w10 asm("10") __attribute__((vector_size(16))); \ +register unsigned char w11 asm("11") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_12_15() \ +register unsigned char w12 asm("12") __attribute__((vector_size(16))); \ +register unsigned char w13 asm("13") __attribute__((vector_size(16))); \ +register unsigned char w14 asm("14") __attribute__((vector_size(16))); \ +register unsigned char w15 asm("15") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_16() \ +register unsigned char w16 asm("16") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_17() \ +register unsigned char w17 asm("17") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_18_21() \ +register unsigned char w18 asm("18") __attribute__((vector_size(16))); \ +register unsigned char w19 asm("19") __attribute__((vector_size(16))); \ +register unsigned char w20 asm("20") __attribute__((vector_size(16))); \ +register unsigned char w21 asm("21") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_22_23() \ +register unsigned char w22 asm("22") __attribute__((vector_size(16))); \ 
+register unsigned char w23 asm("23") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_24_27() \ +register unsigned char w24 asm("24") __attribute__((vector_size(16))); \ +register unsigned char w25 asm("25") __attribute__((vector_size(16))); \ +register unsigned char w26 asm("26") __attribute__((vector_size(16))); \ +register unsigned char w27 asm("27") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_28_30() \ +register unsigned char w28 asm("28") __attribute__((vector_size(16))); \ +register unsigned char w29 asm("29") __attribute__((vector_size(16))); \ +register unsigned char w30 asm("30") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_31() \ +register unsigned char w31 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_32() \ +register unsigned char w32 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_33_36() \ +register unsigned char w33 asm("31") __attribute__((vector_size(16))); \ +register unsigned char w34 asm("31") __attribute__((vector_size(16))); \ +register unsigned char w35 asm("31") __attribute__((vector_size(16))); \ +register unsigned char w36 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_37_38() \ +register unsigned char w37 asm("31") __attribute__((vector_size(16))); \ +register unsigned char w38 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_ALL() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_10_11() \ + GEN_X_DEFINE_12_15() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_18_21() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_24_27() \ + GEN_X_DEFINE_28_30() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() \ + GEN_X_DEFINE_37_38() +#else +#define GEN_X_DEFINE_0_3() \ + unsigned char w0 __attribute__((vector_size(16))); \ + unsigned char w1 __attribute__((vector_size(16))); \ + unsigned char w2 __attribute__((vector_size(16))); \ + unsigned char w3 
__attribute__((vector_size(16))); +#define GEN_X_DEFINE_4_5() \ + unsigned char w4 __attribute__((vector_size(16))); \ + unsigned char w5 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_6_7() \ + unsigned char w6 __attribute__((vector_size(16))); \ + unsigned char w7 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_8_9() \ + unsigned char w8 __attribute__((vector_size(16))); \ + unsigned char w9 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_10_11() \ + unsigned char w10 __attribute__((vector_size(16))); \ + unsigned char w11 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_12_15() \ + unsigned char w12 __attribute__((vector_size(16))); \ + unsigned char w13 __attribute__((vector_size(16))); \ + unsigned char w14 __attribute__((vector_size(16))); \ + unsigned char w15 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_16() \ + unsigned char w16 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_17() \ + unsigned char w17 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_18_21() \ + unsigned char w18 __attribute__((vector_size(16))); \ + unsigned char w19 __attribute__((vector_size(16))); \ + unsigned char w20 __attribute__((vector_size(16))); \ + unsigned char w21 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_22_23() \ + unsigned char w22 __attribute__((vector_size(16))); \ + unsigned char w23 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_24_27() \ + unsigned char w24 __attribute__((vector_size(16))); \ + unsigned char w25 __attribute__((vector_size(16))); \ + unsigned char w26 __attribute__((vector_size(16))); \ + unsigned char w27 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_28_30() \ + unsigned char w28 __attribute__((vector_size(16))); \ + unsigned char w29 __attribute__((vector_size(16))); \ + unsigned char w30 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_31() \ + unsigned char w31 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_32() \ + unsigned char w32 
__attribute__((vector_size(16))); +#define GEN_X_DEFINE_33_36() \ + unsigned char w33 __attribute__((vector_size(16))); \ + unsigned char w34 __attribute__((vector_size(16))); \ + unsigned char w35 __attribute__((vector_size(16))); \ + unsigned char w36 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_37_38() \ + unsigned char w37 __attribute__((vector_size(16))); \ + unsigned char w38 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_ALL() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_10_11() \ + GEN_X_DEFINE_12_15() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_18_21() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_24_27() \ + GEN_X_DEFINE_28_30() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() \ + GEN_X_DEFINE_37_38() +#endif diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c index a693bff63f..9e9c15ff4b 100644 --- a/module/zfs/vdev_raidz_math_scalar.c +++ b/module/zfs/vdev_raidz_math_scalar.c @@ -142,6 +142,7 @@ static const struct { a.b[6] = mul_lt[a.b[6]]; \ a.b[5] = mul_lt[a.b[5]]; \ a.b[4] = mul_lt[a.b[4]]; \ + fallthrough; \ case 4: \ a.b[3] = mul_lt[a.b[3]]; \ a.b[2] = mul_lt[a.b[2]]; \ diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c index 9985da2736..56a0b123d9 100644 --- a/module/zfs/vdev_raidz_math_sse2.c +++ b/module/zfs/vdev_raidz_math_sse2.c @@ -27,9 +27,12 @@ #if defined(__x86_64) && defined(HAVE_SSE2) #include -#include +#include +#include +#ifdef __linux__ #define __asm __asm__ __volatile__ +#endif #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) 
_REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -125,6 +128,8 @@ typedef struct v { __asm( \ "movdqa %" VR0(r) ", %" VR1(r)); \ break; \ + default: \ + VERIFY(0); \ } \ } @@ -175,6 +180,8 @@ typedef struct v { "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ : : [DST] "r" (dst)); \ break; \ + default: \ + VERIFY(0); \ } \ } @@ -508,6 +515,8 @@ gf_x2_mul_fns[256] = { gf_x1_mul_fns[c](); \ COPY(_mul_x1_acc, r); \ break; \ + default: \ + VERIFY(0); \ } \ } @@ -607,7 +616,7 @@ DEFINE_REC_METHODS(sse2); static boolean_t raidz_will_sse2_work(void) { - return (zfs_sse_available() && zfs_sse2_available()); + return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available()); } const raidz_impl_ops_t vdev_raidz_sse2_impl = { diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c index 047a48d544..5ddc079a4f 100644 --- a/module/zfs/vdev_raidz_math_ssse3.c +++ b/module/zfs/vdev_raidz_math_ssse3.c @@ -27,9 +27,11 @@ #if defined(__x86_64) && defined(HAVE_SSSE3) #include -#include +#include +#ifdef __linux__ #define __asm __asm__ __volatile__ +#endif #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -399,8 +401,8 @@ DEFINE_REC_METHODS(ssse3); static boolean_t raidz_will_ssse3_work(void) { - return (zfs_sse_available() && zfs_sse2_available() && - zfs_ssse3_available()); + return (kfpu_allowed() && zfs_sse_available() && + zfs_sse2_available() && zfs_ssse3_available()); } const raidz_impl_ops_t vdev_raidz_ssse3_impl = { diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c new file mode 100644 index 0000000000..4d7de0c6c4 --- /dev/null +++ b/module/zfs/vdev_rebuild.c @@ -0,0 +1,1150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains the sequential reconstruction implementation for + * resilvering. This form of resilvering is internally referred to as device + * rebuild to avoid conflating it with the traditional healing reconstruction + * performed by the dsl scan code. + * + * When replacing a device, or scrubbing the pool, ZFS has historically used + * a process called resilvering which is a form of healing reconstruction. + * This approach has the advantage that as blocks are read from disk their + * checksums can be immediately verified and the data repaired. Unfortunately, + * it also results in a random IO pattern to the disk even when extra care + * is taken to sequentialize the IO as much as possible. This substantially + * increases the time required to resilver the pool and restore redundancy. + * + * For mirrored devices it's possible to implement an alternate sequential + * reconstruction strategy when resilvering. Sequential reconstruction + * behaves like a traditional RAID rebuild and reconstructs a device in LBA + * order without verifying the checksum. 
After this phase completes a second + * scrub phase is started to verify all of the checksums. This two phase + * process will take longer than the healing reconstruction described above. + * However, it has the advantage that after the first reconstruction phase + * completes redundancy has been restored. At this point the pool can incur + * another device failure without risking data loss. + * + * There are a few noteworthy limitations and other advantages of resilvering + * using sequential reconstruction vs healing reconstruction. + * + * Limitations: + * + * - Sequential reconstruction is not possible on RAIDZ due to its + * variable stripe width. Note dRAID uses a fixed stripe width which + * avoids this issue, but comes at the expense of some usable capacity. + * + * - Block checksums are not verified during sequential reconstruction. + * Similar to traditional RAID the parity/mirror data is reconstructed + * but cannot be immediately double checked. For this reason when the + * last active resilver completes the pool is automatically scrubbed + * by default. + * + * - Deferred resilvers using sequential reconstruction are not currently + * supported. When adding another vdev to an active top-level resilver + * it must be restarted. + * + * Advantages: + * + * - Sequential reconstruction is performed in LBA order which may be faster + * than healing reconstruction particularly when using HDDs (or + * especially with SMR devices). Only allocated capacity is resilvered. + * + * - Sequential reconstruction is not constrained by ZFS block boundaries. + * This allows it to issue larger IOs to disk which span multiple blocks + * allowing all of these logical blocks to be repaired with a single IO. + * + * - Unlike a healing resilver or scrub which are pool wide operations, + * sequential reconstruction is handled by the top-level vdevs.
This + * allows for it to be started or canceled on a top-level vdev without + * impacting any other top-level vdevs in the pool. + * + * - Data only referenced by a pool checkpoint will be repaired because + * that space is reflected in the space maps. This differs for a + * healing resilver or scrub which will not repair that data. + */ + + +/* + * Size of rebuild reads; defaults to 1MiB per data disk and is capped at + * SPA_MAXBLOCKSIZE. + */ +unsigned long zfs_rebuild_max_segment = 1024 * 1024; + +/* + * Maximum number of parallelly executed bytes per leaf vdev caused by a + * sequential resilver. We attempt to strike a balance here between keeping + * the vdev queues full of I/Os at all times and not overflowing the queues + * to cause long latency, which would cause long txg sync times. + * + * A large default value can be safely used here because the default target + * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep + * the queue depth short. + * + * 32MB was selected as the default value to achieve good performance with + * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential + * rebuild was unable to saturate all of the drives using smaller values. + * With a value of 32MB the sequential resilver write rate was measured at + * 800MB/s sustained while rebuilding to a distributed spare. + */ +unsigned long zfs_rebuild_vdev_limit = 32 << 20; + +/* + * Automatically start a pool scrub when the last active sequential resilver + * completes in order to verify the checksums of all blocks which have been + * resilvered. This option is enabled by default and is strongly recommended. + */ +int zfs_rebuild_scrub_enabled = 1; + +/* + * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). + */ +static void vdev_rebuild_thread(void *arg); + +/* + * Clear the per-vdev rebuild bytes value for a vdev tree. 
+ */ +static void +clear_rebuild_bytes(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (uint64_t i = 0; i < vd->vdev_children; i++) + clear_rebuild_bytes(vd->vdev_child[i]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_rebuild_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Determines whether a vdev_rebuild_thread() should be stopped. + */ +static boolean_t +vdev_rebuild_should_stop(vdev_t *vd) +{ + return (!vdev_writeable(vd) || vd->vdev_removing || + vd->vdev_rebuild_exit_wanted || + vd->vdev_rebuild_cancel_wanted || + vd->vdev_rebuild_reset_wanted); +} + +/* + * Determine if the rebuild should be canceled. This may happen when all + * vdevs with MISSING DTLs are detached. + */ +static boolean_t +vdev_rebuild_should_cancel(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The sync task for updating the on-disk state of a rebuild. This is + * scheduled by vdev_rebuild_range(). 
+ */ +static void +vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t txg = dmu_tx_get_txg(tx); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { + vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; + vr->vr_scan_offset[txg & TXG_MASK] = 0; + } + + vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Initialize the on-disk state for a new rebuild, start the rebuild thread. + */ +static void +vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_rebuilding); + + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + mutex_enter(&vd->vdev_rebuild_lock); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_start_time = gethrestime_sec(); + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* + * Rebuilds are currently only used when replacing a device, in which + * case there must be DTL_MISSING entries. In the future, we could + * allow rebuilds to be used in a way similar to a scrub. This would + * be useful because it would allow us to rebuild the space used by + * pool checkpoints. 
+ */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu started", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) +{ + nvlist_t *aux = fnvlist_alloc(); + + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); + spa_event_notify(spa, vd, aux, name); + nvlist_free(aux); +} + +/* + * Called to request that a new rebuild be started. The feature will remain + * active for the duration of the rebuild, then revert to the enabled state. + */ +static void +vdev_rebuild_initiate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_top == vd); + ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); + ASSERT(!vd->vdev_rebuilding); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + vd->vdev_rebuilding = B_TRUE; + + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, + (void *)(uintptr_t)vd->vdev_id, tx); + dmu_tx_commit(tx); + + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); +} + +/* + * Update the on-disk state to completed when a rebuild finishes. 
+ */ +static void +vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu complete", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + /* Handles detaching of spares */ + spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + /* + * While we're in syncing context take the opportunity to + * setup the scrub when there are no more active rebuilds. + */ + pool_scan_func_t func = POOL_SCAN_SCRUB; + if (dsl_scan_setup_check(&func, tx) == 0 && + zfs_rebuild_scrub_enabled) { + dsl_scan_setup_sync(&func, tx); + } + + cv_broadcast(&vd->vdev_rebuild_cv); + + /* Clear recent error events (i.e. duplicate events tracking) */ + zfs_ereport_clear(spa, NULL); +} + +/* + * Update the on-disk state to canceled when a rebuild is canceled.
+ */ +static void +vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu canceled", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + vd->vdev_rebuild_cancel_wanted = B_FALSE; + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Resets the progress of a running rebuild. This will occur when a new + * vdev is added to rebuild.
+ */ +static void +vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + + vrp->vrp_last_offset = 0; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_bytes_scanned = 0; + vrp->vrp_bytes_issued = 0; + vrp->vrp_bytes_rebuilt = 0; + vrp->vrp_bytes_est = 0; + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* See vdev_rebuild_initiate_sync comment */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu reset", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + vd->vdev_rebuild_reset_wanted = B_FALSE; + ASSERT(vd->vdev_rebuilding); + + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Clear the last rebuild status. 
+ */ +void +vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + objset_t *mos = spa_meta_objset(spa); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || + vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { + mutex_exit(&vd->vdev_rebuild_lock); + return; + } + + clear_rebuild_bytes(vd); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + + if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { + VERIFY0(zap_update(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + } + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * The zio_done_func_t callback for each rebuild I/O issued. It's responsible + * for updating the rebuild stats and limiting the number of in flight I/Os. + */ +static void +vdev_rebuild_cb(zio_t *zio) +{ + vdev_rebuild_t *vr = zio->io_private; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vdev_t *vd = vr->vr_top_vdev; + + mutex_enter(&vr->vr_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the top-level vdev was unavailable. + * Attempt to roll back to the last completed offset, in order + * to resume from the correct location if the pool is resumed. + * (This works because spa_sync waits on spa_txg_zio before + * it runs sync tasks.)
+ */ + uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else if (zio->io_error) { + vrp->vrp_errors++; + } + + abd_free(zio->io_abd); + + ASSERT3U(vr->vr_bytes_inflight, >, 0); + vr->vr_bytes_inflight -= zio->io_size; + cv_broadcast(&vr->vr_io_cv); + mutex_exit(&vr->vr_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* + * Initialize a block pointer that can be used to read the given segment + * for sequential rebuild. + */ +static void +vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, + uint64_t asize) +{ + ASSERT(vd->vdev_ops == &vdev_draid_ops || + vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? + vdev_draid_asize_to_psize(vd, asize) : asize; + + BP_ZERO(bp); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], start); + DVA_SET_GANG(&bp->blk_dva[0], 0); + DVA_SET_ASIZE(&bp->blk_dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +/* + * Issues a rebuild I/O and takes care of rate limiting the number of queued + * rebuild I/Os. The provided start and size must be properly aligned for the + * top-level vdev type being rebuilt.
+ */ +static int +vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) +{ + uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + blkptr_t blk; + + ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); + ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); + + vr->vr_pass_bytes_scanned += size; + vr->vr_rebuild_phys.vrp_bytes_scanned += size; + + /* + * Rebuild the data in this range by constructing a special block + * pointer. It has no relation to any existing blocks in the pool. + * However, by disabling checksum verification and issuing a scrub IO + * we can reconstruct and repair any children with missing data. + */ + vdev_rebuild_blkptr_init(&blk, vd, start, size); + uint64_t psize = BP_GET_PSIZE(&blk); + + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + return (0); + + mutex_enter(&vr->vr_io_lock); + + /* Limit in flight rebuild I/Os */ + while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); + + vr->vr_bytes_inflight += psize; + mutex_exit(&vr->vr_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + /* This is the first I/O for this txg. */ + if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { + vr->vr_scan_offset[txg & TXG_MASK] = start; + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_rebuild_update_sync, + (void *)(uintptr_t)vd->vdev_id, tx); + } + + /* When exiting write out our progress.
*/ + if (vdev_rebuild_should_stop(vd)) { + mutex_enter(&vr->vr_io_lock); + vr->vr_bytes_inflight -= psize; + mutex_exit(&vr->vr_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_rebuild_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_rebuild_lock); + dmu_tx_commit(tx); + + vr->vr_scan_offset[txg & TXG_MASK] = start + size; + vr->vr_pass_bytes_issued += size; + vr->vr_rebuild_phys.vrp_bytes_issued += size; + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); + + return (0); +} + +/* + * Issues rebuild I/Os for all ranges in the vr->vr_scan_tree range tree. + */ +static int +vdev_rebuild_ranges(vdev_rebuild_t *vr) +{ + vdev_t *vd = vr->vr_top_vdev; + zfs_btree_t *t = &vr->vr_scan_tree->rt_root; + zfs_btree_index_t idx; + int error; + + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t start = rs_get_start(rs, vr->vr_scan_tree); + uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; + + /* + * zfs_scan_suspend_progress can be set to disable rebuild + * progress for testing. See comment in dsl_scan_sync(). + */ + while (zfs_scan_suspend_progress && + !vdev_rebuild_should_stop(vd)) { + delay(hz); + } + + while (size > 0) { + uint64_t chunk_size; + + /* + * Split range into legally-sized logical chunks + * given the constraints of the top-level vdev + * being rebuilt (dRAID or mirror). + */ + ASSERT3P(vd->vdev_ops, !=, NULL); + chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, + start, size, zfs_rebuild_max_segment); + + error = vdev_rebuild_range(vr, start, chunk_size); + if (error != 0) + return (error); + + size -= chunk_size; + start += chunk_size; + } + } + + return (0); +} + +/* + * Calculates the estimated capacity which remains to be scanned.
Since + * we traverse the pool in metaslab order only allocated capacity beyond + * the vrp_last_offset need be considered. All lower offsets must have + * already been rebuilt and are thus already included in vrp_bytes_scanned. + */ +static void +vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t bytes_est = vrp->vrp_bytes_scanned; + + if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) + return; + + for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + + mutex_enter(&msp->ms_lock); + bytes_est += metaslab_allocated_space(msp); + mutex_exit(&msp->ms_lock); + } + + vrp->vrp_bytes_est = bytes_est; +} + +/* + * Load from disk the top-level vdev's rebuild information. + */ +int +vdev_rebuild_load(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + spa_t *spa = vd->vdev_spa; + int err = 0; + + mutex_enter(&vd->vdev_rebuild_lock); + vd->vdev_rebuilding = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + mutex_exit(&vd->vdev_rebuild_lock); + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(vd->vdev_top == vd); + + err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp); + + /* + * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should + * not prevent a pool from being imported. Clear the rebuild + * status allowing a new resilver/rebuild to be started. 
+ */ + if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + } else if (err) { + mutex_exit(&vd->vdev_rebuild_lock); + return (err); + } + + vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; + vr->vr_top_vdev = vd; + + mutex_exit(&vd->vdev_rebuild_lock); + + return (0); +} + +/* + * Each scan thread is responsible for rebuilding a top-level vdev. The + * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. + */ +static void +vdev_rebuild_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + /* + * If there's a scrub in progress, request that it be stopped. This + * is not required for a correct rebuild, but we do want rebuilds to + * emulate the resilver behavior as much as possible. + */ + dsl_pool_t *dsl = spa_get_dsl(spa); + if (dsl_scan_scrubbing(dsl)) + dsl_scan_cancel(dsl); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); + ASSERT(vd->vdev_rebuilding); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); + ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); + ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); + + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vr->vr_top_vdev = vd; + vr->vr_scan_msp = NULL; + vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); + + vr->vr_pass_start_time = gethrtime(); + vr->vr_pass_bytes_scanned = 0; + vr->vr_pass_bytes_issued = 0; + + vr->vr_bytes_inflight_max = MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children); + + uint64_t update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, 0); + + clear_rebuild_bytes(vr->vr_top_vdev); + + mutex_exit(&vd->vdev_rebuild_lock); + + /* 
+ * Systematically walk the metaslabs and issue rebuild I/Os for + * all ranges in the allocated space map. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + vr->vr_scan_msp = msp; + + /* + * Removal of vdevs from the vdev tree may eliminate the need + * for the rebuild, in which case it should be canceled. The + * vdev_rebuild_cancel_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (vdev_rebuild_should_cancel(vd)) { + vd->vdev_rebuild_cancel_wanted = B_TRUE; + error = EINTR; + break; + } + + ASSERT0(range_tree_space(vr->vr_scan_tree)); + + /* Disable any new allocations to this metaslab */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + metaslab_disable(msp); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * If there are outstanding allocations, wait for them to be + * synced. This is needed to ensure all allocated ranges are + * on disk and therefore will be rebuilt. + */ + for (int j = 0; j < TXG_SIZE; j++) { + if (range_tree_space(msp->ms_allocating[j])) { + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + txg_wait_synced(dsl, 0); + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + break; + } + } + + /* + * When a metaslab has been allocated from, read its allocated + * ranges from the space map object into the vr_scan_tree. + * Then add inflight / unflushed ranges and remove inflight / + * unflushed frees. This is the minimum range to be rebuilt. + */ + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + vr->vr_scan_tree, SM_ALLOC)); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space( + msp->ms_allocating[i])); + } + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, vr->vr_scan_tree); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, vr->vr_scan_tree); + + /* + * Remove ranges which have already been rebuilt based + * on the last offset. 
This can happen when restarting + * a scan after exporting and re-importing the pool. + */ + range_tree_clear(vr->vr_scan_tree, 0, + vrp->vrp_last_offset); + } + + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + /* + * To provide an accurate estimate re-calculate the estimated + * size every 5 minutes to account for recent allocations and + * frees made to space maps which have not yet been rebuilt. + */ + if (gethrtime() > update_est_time + SEC2NSEC(300)) { + update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, i); + } + + /* + * Walk the allocated space map and issue the rebuild I/O. + */ + error = vdev_rebuild_ranges(vr); + range_tree_vacate(vr->vr_scan_tree, NULL, NULL); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + metaslab_enable(msp, B_FALSE, B_FALSE); + + if (error != 0) + break; + } + + range_tree_destroy(vr->vr_scan_tree); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* Wait for any remaining rebuild I/O to complete */ + mutex_enter(&vr->vr_io_lock); + while (vr->vr_bytes_inflight > 0) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); + + mutex_exit(&vr->vr_io_lock); + + mutex_destroy(&vr->vr_io_lock); + cv_destroy(&vr->vr_io_cv); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + dsl_pool_t *dp = spa_get_dsl(spa); + dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (error == 0) { + /* + * After a successful rebuild clear the DTLs of all ranges + * which were missing when the rebuild was started. These + * ranges must have been rebuilt as a consequence of rebuilding + * all allocated space. Note that unlike a scrub or resilver + * the rebuild operation will reconstruct data only referenced + * by a pool checkpoint. See the dsl_scan_done() comments. 
+ */ + dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, + (void *)(uintptr_t)vd->vdev_id, tx); + } else if (vd->vdev_rebuild_cancel_wanted) { + /* + * The rebuild operation was canceled. This will occur when + * a device participating in the rebuild is detached. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, + (void *)(uintptr_t)vd->vdev_id, tx); + } else if (vd->vdev_rebuild_reset_wanted) { + /* + * Reset the running rebuild without canceling and restarting + * it. This will occur when a new device is attached and must + * participate in the rebuild. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, + (void *)(uintptr_t)vd->vdev_id, tx); + } else { + /* + * The rebuild operation should be suspended. This may occur + * when detaching a child vdev or when exporting the pool. The + * rebuild is left in the active state so it will be resumed. + */ + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + vd->vdev_rebuilding = B_FALSE; + } + + dmu_tx_commit(tx); + + vd->vdev_rebuild_thread = NULL; + mutex_exit(&vd->vdev_rebuild_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + cv_broadcast(&vd->vdev_rebuild_cv); + + thread_exit(); +} + +/* + * Returns B_TRUE if any top-level vdevs are rebuilding. + */ +boolean_t +vdev_rebuild_active(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + boolean_t ret = B_FALSE; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) { + ret = vdev_rebuild_active(vd->vdev_child[i]); + if (ret) + return (ret); + } + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + mutex_exit(&vd->vdev_rebuild_lock); + } + + return (ret); +} + +/* + * Start a rebuild operation. The rebuild may be restarted when the + * top-level vdev is currently actively rebuilding. 
+ */ +void +vdev_rebuild(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_top == vd); + ASSERT(vdev_is_concrete(vd)); + ASSERT(!vd->vdev_removing); + ASSERT(spa_feature_is_enabled(vd->vdev_spa, + SPA_FEATURE_DEVICE_REBUILD)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuilding) { + ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); + + /* + * Signal a running rebuild operation that it should restart + * from the beginning because a new device was attached. The + * vdev_rebuild_reset_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (!vd->vdev_rebuild_reset_wanted) + vd->vdev_rebuild_reset_wanted = B_TRUE; + } else { + vdev_rebuild_initiate(vd); + } + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_restart_impl(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_restart_impl(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && + vdev_writeable(vd) && !vd->vdev_rebuilding) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DEVICE_REBUILD)); + vd->vdev_rebuilding = B_TRUE; + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, + maxclsyspri); + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Conditionally restart all of the vdev_rebuild_thread's for a pool. The + * feature flag must be active and the rebuild in the active state. This + * cannot be used to start a new rebuild. 
+ */ +void +vdev_rebuild_restart(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vdev_rebuild_restart_impl(spa->spa_root_vdev); +} + +/* + * Stop and wait for all of the vdev_rebuild_thread's associated with the + * vdev tree provided to be terminated (canceled or stopped). + */ +void +vdev_rebuild_stop_wait(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_stop_wait(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuild_thread != NULL) { + vd->vdev_rebuild_exit_wanted = B_TRUE; + while (vd->vdev_rebuilding) { + cv_wait(&vd->vdev_rebuild_cv, + &vd->vdev_rebuild_lock); + } + vd->vdev_rebuild_exit_wanted = B_FALSE; + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Stop all rebuild operations but leave them in the active state so they + * will be resumed when importing the pool. + */ +void +vdev_rebuild_stop_all(spa_t *spa) +{ + vdev_rebuild_stop_wait(spa->spa_root_vdev); +} + +/* + * Rebuild statistics reported per top-level vdev. 
+ */ +int +vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) +{ + spa_t *spa = tvd->vdev_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (SET_ERROR(ENOTSUP)); + + if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) + return (SET_ERROR(EINVAL)); + + int error = zap_contains(spa_meta_objset(spa), + tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); + + if (error == ENOENT) { + bzero(vrs, sizeof (vdev_rebuild_stat_t)); + vrs->vrs_state = VDEV_REBUILD_NONE; + error = 0; + } else if (error == 0) { + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&tvd->vdev_rebuild_lock); + vrs->vrs_state = vrp->vrp_rebuild_state; + vrs->vrs_start_time = vrp->vrp_start_time; + vrs->vrs_end_time = vrp->vrp_end_time; + vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; + vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; + vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; + vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; + vrs->vrs_bytes_est = vrp->vrp_bytes_est; + vrs->vrs_errors = vrp->vrp_errors; + vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - + vr->vr_pass_start_time); + vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; + vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + mutex_exit(&tvd->vdev_rebuild_lock); + } + + return (error); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, + "Max segment size in bytes of rebuild reads"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, + "Max bytes in flight per leaf vdev for sequential resilvers"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, + "Automatically scrub after sequential resilver completes"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index f2d18d9257..f762c1df96 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -21,7 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle 
and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2019, loli10K . All rights reserved. */ #include @@ -46,7 +47,7 @@ #include #include #include -#include +#include /* * This file contains the necessary logic to remove vdevs from a @@ -100,6 +101,8 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. + * + * See also the accessor function spa_remove_max_segment(). */ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; @@ -195,11 +198,12 @@ spa_vdev_removal_create(vdev_t *vd) spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = range_tree_create(NULL, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = range_tree_create(NULL, NULL); + svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -244,9 +248,9 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; - ASSERTV(uint64_t txg = dmu_tx_get_txg(tx)); + uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); @@ -264,7 +268,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, 
VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, &one, tx)); - ASSERTV(boolean_t are_precise); + boolean_t are_precise __maybe_unused; ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise)); ASSERT3B(are_precise, ==, B_TRUE); } @@ -341,11 +345,12 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) vdev_config_dirty(vd); zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu " - "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), - vic->vic_mapping_object); + "im_obj=%llu", (u_longlong_t)vd->vdev_id, vd, + (u_longlong_t)dmu_tx_get_txg(tx), + (u_longlong_t)vic->vic_mapping_object); spa_history_log_internal(spa, "vdev remove started", tx, - "%s vdev %llu %s", spa_name(spa), vd->vdev_id, + "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); /* * Setting spa_vdev_removal causes subsequent frees to call @@ -470,7 +475,8 @@ spa_restart_removal(spa_t *spa) if (!spa_writeable(spa)) return; - zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); + zfs_dbgmsg("restarting removal of %llu", + (u_longlong_t)svr->svr_vdev_id); svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } @@ -695,6 +701,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) spa_vdev_removal_destroy(svr); spa_sync_removing_state(spa, tx); + spa_notify_waiters(spa); vdev_config_dirty(spa->spa_root_vdev); } @@ -720,7 +727,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config); + vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; @@ -951,8 +958,10 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; 
uint64_t start = range_tree_min(segs); + ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); uint64_t size = range_tree_span(segs); if (range_tree_span(segs) > maxalloc) { @@ -961,18 +970,15 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * the allocation at the end of a segment, thus avoiding * additional split blocks. */ - range_seg_t search; - avl_index_t where; - search.rs_start = start + maxalloc; - search.rs_end = search.rs_start; - range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); - if (rs == NULL) { - rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); - } else { - rs = AVL_PREV(&segs->rt_root, rs); - } + range_seg_max_t search; + zfs_btree_index_t where; + rs_set_start(&search, segs, start + maxalloc); + rs_set_end(&search, segs, start + maxalloc); + (void) zfs_btree_find(&segs->rt_root, &search, &where); + range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, + &where); if (rs != NULL) { - size = rs->rs_end - start; + size = rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. @@ -983,12 +989,13 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, } } ASSERT3U(size, <=, maxalloc); + ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); /* * An allocation class might not have any remaining vdevs or space */ metaslab_class_t *mc = mg->mg_class; - if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) + if (mc->mc_groups == 0) mc = spa_normal_class(spa); int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, zal, 0); @@ -1004,20 +1011,22 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). 
*/ - range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); + range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); - range_seg_t *rs = avl_first(&segs->rt_root); - ASSERT3U(rs->rs_start, ==, start); - uint64_t prev_seg_end = rs->rs_end; - while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { - if (rs->rs_start >= start + size) { + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); + ASSERT3U(rs_get_start(rs, segs), ==, start); + uint64_t prev_seg_end = rs_get_end(rs, segs); + while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { + if (rs_get_start(rs, segs) >= start + size) { break; } else { range_tree_add(obsolete_segs, prev_seg_end - start, - rs->rs_start - prev_seg_end); + rs_get_start(rs, segs) - prev_seg_end); } - prev_seg_end = rs->rs_end; + prev_seg_end = rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); @@ -1026,11 +1035,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, /* * We can't have any padding of the allocated size, otherwise we will - * misunderstand what's allocated, and the size of the mapping. - * The caller ensures this will be true by passing in a size that is - * aligned to the worst (highest) ashift in the pool. + * misunderstand what's allocated, and the size of the mapping. We + * prevent padding by ensuring that all devices in the pool have the + * same ashift, and the allocation size is a multiple of the ashift. 
*/ - ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); + VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); @@ -1106,14 +1115,14 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); /* vd->vdev_path is not available here */ spa_history_log_internal(spa, "vdev remove completed", tx, - "%s vdev %llu", spa_name(spa), vd->vdev_id); + "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id); } static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); if (vd->vdev_leaf_zap != 0) { char zkey[32]; @@ -1160,8 +1169,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) /* After this, we can not use svr. */ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, - 0, ZFS_SPACE_CHECK_NONE, tx); + dsl_sync_task_nowait(spa->spa_dsl_pool, + vdev_remove_complete_sync, svr, tx); dmu_tx_commit(tx); } @@ -1189,7 +1198,7 @@ vdev_remove_complete(spa_t *spa) ESC_ZFS_VDEV_REMOVE_DEV); zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", - vd->vdev_id, txg); + (u_longlong_t)vd->vdev_id, (u_longlong_t)txg); /* * Discard allocation state. @@ -1198,6 +1207,12 @@ vdev_remove_complete(spa_t *spa) vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; + spa_log_sm_set_blocklimit(spa); + } + if (vd->vdev_log_mg != NULL) { + ASSERT0(vd->vdev_ms_count); + metaslab_group_destroy(vd->vdev_log_mg); + vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); @@ -1260,9 +1275,10 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, * allocated segments that we are copying. 
We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ - range_tree_t *segs = range_tree_create(NULL, NULL); + range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (;;) { - range_seg_t *rs = range_tree_first(svr->svr_allocd_segs); + range_tree_t *rt = svr->svr_allocd_segs; + range_seg_t *rs = range_tree_first(rt); if (rs == NULL) break; @@ -1271,17 +1287,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, if (range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ - seg_length = - MIN(rs->rs_end - rs->rs_start, *max_alloc); + seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, + rt), *max_alloc); } else { - if (rs->rs_start - range_tree_max(segs) > + if (rs_get_start(rs, rt) - range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. */ break; - } else if (rs->rs_end - range_tree_min(segs) > + } else if (rs_get_end(rs, rt) - range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past @@ -1290,13 +1306,14 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, */ break; } else { - seg_length = rs->rs_end - rs->rs_start; + seg_length = rs_get_end(rs, rt) - + rs_get_start(rs, rt); } } - range_tree_add(segs, rs->rs_start, seg_length); + range_tree_add(segs, rs_get_start(rs, rt), seg_length); range_tree_remove(svr->svr_allocd_segs, - rs->rs_start, seg_length); + rs_get_start(rs, rt), seg_length); } if (range_tree_is_empty(segs)) { @@ -1307,7 +1324,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, - svr, 0, ZFS_SPACE_CHECK_NONE, tx); + svr, tx); } svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); @@ -1363,6 +1380,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t 
*svr, vdev_copy_arg_t *vca, range_tree_destroy(segs); } +/* + * The size of each removal mapping is limited by the tunable + * zfs_remove_max_segment, but we must adjust this to be a multiple of the + * pool's ashift, so that we don't try to split individual sectors regardless + * of the tunable value. (Note that device removal requires that all devices + * have the same ashift, so there's no difference between spa_min_ashift and + * spa_max_ashift.) The raw tunable should not be used elsewhere. + */ +uint64_t +spa_remove_max_segment(spa_t *spa) +{ + return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); +} + /* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. @@ -1385,7 +1416,7 @@ spa_vdev_remove_thread(void *arg) spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; - uint64_t max_alloc = zfs_remove_max_segment; + uint64_t max_alloc = spa_remove_max_segment(spa); uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1442,6 +1473,10 @@ spa_vdev_remove_thread(void *arg) VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); @@ -1457,8 +1492,9 @@ spa_vdev_remove_thread(void *arg) vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", - avl_numnodes(&svr->svr_allocd_segs->rt_root), - msp->ms_id); + (u_longlong_t)zfs_btree_numnodes( + &svr->svr_allocd_segs->rt_root), + (u_longlong_t)msp->ms_id); while (!svr->svr_thread_exit && !range_tree_is_empty(svr->svr_allocd_segs)) { @@ -1481,10 +1517,6 @@ spa_vdev_remove_thread(void *arg) * specified by zfs_removal_suspend_progress. We do this * solely from the test suite or during debugging. 
*/ - uint64_t bytes_copied = - spa->spa_removing_phys.sr_copied; - for (int i = 0; i < TXG_SIZE; i++) - bytes_copied += svr->svr_bytes_done[i]; while (zfs_removal_suspend_progress && !svr->svr_thread_exit) delay(hz); @@ -1498,7 +1530,7 @@ spa_vdev_remove_thread(void *arg) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - dmu_tx_hold_space(tx, SPA_MAXBLOCKSIZE); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); @@ -1511,7 +1543,7 @@ spa_vdev_remove_thread(void *arg) vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) - max_alloc = zfs_remove_max_segment; + max_alloc = spa_remove_max_segment(spa); last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); @@ -1559,14 +1591,16 @@ spa_vdev_remove_thread(void *arg) vca.vca_write_error_bytes > 0)) { zfs_dbgmsg("canceling removal due to IO errors: " "[read_error_bytes=%llu] [write_error_bytes=%llu]", - vca.vca_read_error_bytes, - vca.vca_write_error_bytes); + (u_longlong_t)vca.vca_read_error_bytes, + (u_longlong_t)vca.vca_write_error_bytes); spa_vdev_remove_cancel_impl(spa); } } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); } + + thread_exit(); } void @@ -1666,6 +1700,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); @@ -1725,10 +1764,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", - vd->vdev_id, dmu_tx_get_txg(tx)); + (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx)); spa_history_log_internal(spa, "vdev remove canceled", tx, "%s vdev %llu %s", spa_name(spa), - 
vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + (u_longlong_t)vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); } static int @@ -1744,6 +1784,8 @@ spa_vdev_remove_cancel_impl(spa_t *spa) spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_t *vd = vdev_lookup_top(spa, vdid); metaslab_group_activate(vd->vdev_mg); + ASSERT(!vd->vdev_islog); + metaslab_group_activate(vd->vdev_log_mg); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } @@ -1794,19 +1836,14 @@ vdev_remove_make_hole_and_free(vdev_t *vd) uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); vdev_config_dirty(rvd); /* @@ -1827,6 +1864,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); + ASSERT3P(vd->vdev_log_mg, ==, NULL); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* @@ -1841,6 +1879,13 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + /* + * Cancel any initialize or TRIM which was in progress. + */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED); + vdev_autotrim_stop_wait(vd); + /* * Evacuate the device. 
We don't hold the config lock as * writer since we need to do I/O but we do keep the @@ -1855,6 +1900,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + ASSERT3P(vd->vdev_log_mg, ==, NULL); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); @@ -1868,15 +1914,30 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); + /* + * When the log space map feature is enabled we look at + * the vdev's top_zap to find the on-disk flush data of + * the metaslab we just flushed. Thus, while removing a + * log vdev we make sure to call vdev_metaslab_fini() + * first, which removes all metaslabs of this vdev from + * spa_metaslabs_by_flushed before vdev_remove_empty() + * destroys the top_zap of this log vdev. + * + * This avoids the scenario where we flush a metaslab + * from the log vdev being removed that doesn't have a + * top_zap and end up failing to lookup its on-disk flush + * data. + * + * We don't call metaslab_group_destroy() right away + * though (it will be called in vdev_free() later) as + * during metaslab_sync() of metaslabs from other vdevs + * we may touch the metaslab group of this vdev through + * metaslab_class_histogram_verify() + */ vdev_metaslab_fini(vd); + spa_log_sm_set_blocklimit(spa); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); - - /* Stop initializing and TRIM */ - vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); - vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED); - vdev_autotrim_stop_wait(vd); - *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, @@ -1917,35 +1978,44 @@ spa_vdev_remove_top_check(vdev_t *vd) if (vd != vd->vdev_top) return (SET_ERROR(ENOTSUP)); + if (!vdev_is_concrete(vd)) + return (SET_ERROR(ENOTSUP)); + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); - /* available space in the pool's normal class */ - uint64_t available = dsl_dir_space_available( - 
spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); metaslab_class_t *mc = vd->vdev_mg->mg_class; - - /* - * When removing a vdev from an allocation class that has - * remaining vdevs, include available space from the class. - */ - if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { - uint64_t class_avail = metaslab_class_get_space(mc) - - metaslab_class_get_alloc(mc); - - /* add class space, adjusted for overhead */ - available += (class_avail * 94) / 100; - } - - /* - * There has to be enough free space to remove the - * device and leave double the "slop" space (i.e. we - * must leave at least 3% of the pool free, in addition to - * the normal slop space). - */ - if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { - return (SET_ERROR(ENOSPC)); + metaslab_class_t *normal = spa_normal_class(spa); + if (mc != normal) { + /* + * Space allocated from the special (or dedup) class is + * included in the DMU's space usage, but it's not included + * in spa_dspace (or dsl_pool_adjustedsize()). Therefore + * there is always at least as much free space in the normal + * class, as is allocated from the special (and dedup) class. + * As a backup check, we will return ENOSPC if this is + * violated. See also spa_update_dspace(). + */ + uint64_t available = metaslab_class_get_space(normal) - + metaslab_class_get_alloc(normal); + ASSERT3U(available, >=, vd->vdev_stat.vs_alloc); + if (available < vd->vdev_stat.vs_alloc) + return (SET_ERROR(ENOSPC)); + } else { + /* available space in the pool's normal class */ + uint64_t available = dsl_dir_space_available( + spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); + if (available < + vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + /* + * This is a normal device. There has to be enough free + * space to remove the device and leave double the + * "slop" space (i.e. we must leave at least 3% of the + * pool free, in addition to the normal slop space). 
+ */ + return (SET_ERROR(ENOSPC)); + } } /* @@ -1974,21 +2044,41 @@ spa_vdev_remove_top_check(vdev_t *vd) return (SET_ERROR(EINVAL)); } + /* + * A removed special/dedup vdev must have same ashift as normal class. + */ + ASSERT(!vd->vdev_islog); + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && + vd->vdev_ashift != spa->spa_max_ashift) { + return (SET_ERROR(EINVAL)); + } + /* * All vdevs in normal class must have the same ashift - * and not be raidz. + * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; int num_indirect = 0; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; - if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) + + /* + * A removed special/dedup vdev must have the same ashift + * across all vdevs in its class. + */ + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && + cvd->vdev_alloc_bias == vd->vdev_alloc_bias && + cvd->vdev_ashift != vd->vdev_ashift) { + return (SET_ERROR(EINVAL)); + } + if (cvd->vdev_ashift != 0 && + cvd->vdev_alloc_bias == VDEV_BIAS_NONE) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); if (cvd->vdev_ops == &vdev_indirect_ops) num_indirect++; if (!vdev_is_concrete(cvd)) continue; - if (cvd->vdev_ops == &vdev_raidz_ops) + if (vdev_get_nparity(cvd) != 0) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only @@ -2039,6 +2129,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); /* * Wait for the youngest allocations and frees to sync, @@ -2075,6 +2167,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_activate(vd->vdev_log_mg); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); @@ -2087,8 +2181,7 @@ spa_vdev_remove_top(vdev_t *vd, 
uint64_t *txg) vdev_config_dirty(vd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, - vdev_remove_initiate_sync, - (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); return (0); @@ -2113,7 +2206,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) int error = 0, error_log; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; - char *vd_type = NULL, *vd_path = NULL, *vd_path_log = NULL; + char *vd_type = NULL, *vd_path = NULL; ASSERT(spa_writeable(spa)); @@ -2142,17 +2235,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. */ if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); + char *type; + boolean_t draid_spare = B_FALSE; - vd_type = VDEV_TYPE_SPARE; - vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) + == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + draid_spare = B_TRUE; + + if (vd == NULL && draid_spare) { + error = SET_ERROR(ENOTSUP); + } else { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, + guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + vd_type = VDEV_TYPE_SPARE; + vd_path = spa_strdup(fnvlist_lookup_string( + nv, ZPOOL_CONFIG_PATH)); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } } else { error = SET_ERROR(EBUSY); } @@ -2161,11 +2267,26 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = 
spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { vd_type = VDEV_TYPE_L2CACHE; - vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + vd_path = spa_strdup(fnvlist_lookup_string( + nv, ZPOOL_CONFIG_PATH)); /* * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); + + /* + * Stop trimming the cache device. We need to release the + * config lock to allow the syncing of TRIM transactions + * without releasing the spa_namespace_lock. The same + * strategy is employed in spa_vdev_remove_top(). + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); + mutex_exit(&vd->vdev_trim_lock); + txg = spa_vdev_config_enter(spa); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); @@ -2174,7 +2295,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); vd_type = VDEV_TYPE_LOG; - vd_path = (vd->vdev_path != NULL) ? vd->vdev_path : "-"; + vd_path = spa_strdup((vd->vdev_path != NULL) ? + vd->vdev_path : "-"); error = spa_vdev_remove_log(vd, &txg); } else if (vd != NULL) { ASSERT(!locked); @@ -2186,9 +2308,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) error = SET_ERROR(ENOENT); } - if (vd_path != NULL) - vd_path_log = spa_strdup(vd_path); - error_log = error; if (!locked) @@ -2201,12 +2320,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * Doing that would prevent the txg sync from actually happening, * causing a deadlock. 
*/ - if (error_log == 0 && vd_type != NULL && vd_path_log != NULL) { + if (error_log == 0 && vd_type != NULL && vd_path != NULL) { spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path_log); + "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path); } - if (vd_path_log != NULL) - spa_strfree(vd_path_log); + if (vd_path != NULL) + spa_strfree(vd_path); if (ev != NULL) spa_event_post(ev); @@ -2244,22 +2363,17 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) return (0); } -#if defined(_KERNEL) -module_param(zfs_removal_ignore_errors, int, 0644); -MODULE_PARM_DESC(zfs_removal_ignore_errors, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, "Ignore hard IO errors when removing device"); -module_param(zfs_remove_max_segment, int, 0644); -MODULE_PARM_DESC(zfs_remove_max_segment, +ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW, "Largest contiguous segment to allocate when removing device"); -module_param(vdev_removal_max_span, int, 0644); -MODULE_PARM_DESC(vdev_removal_max_span, +ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); -/* BEGIN CSTYLED */ -module_param(zfs_removal_suspend_progress, int, 0644); -MODULE_PARM_DESC(zfs_removal_suspend_progress, +ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); /* END CSTYLED */ @@ -2273,4 +2387,3 @@ EXPORT_SYMBOL(spa_vdev_remove); EXPORT_SYMBOL(spa_vdev_remove_cancel); EXPORT_SYMBOL(spa_vdev_remove_suspend); EXPORT_SYMBOL(svr_sync); -#endif diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index e40b7ce8e4..45ddc2f719 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -82,7 +82,7 @@ too_many_errors(vdev_t *vd, uint64_t numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, 
uint64_t *max_asize, - uint64_t *ashift) + uint64_t *ashift, uint64_t *pshift) { spa_t *spa = vd->vdev_spa; int lasterror = 0; @@ -98,7 +98,8 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - if (cvd->vdev_open_error && !cvd->vdev_islog) { + if (cvd->vdev_open_error && !cvd->vdev_islog && + cvd->vdev_ops != &vdev_indirect_ops) { lasterror = cvd->vdev_open_error; numerrors++; } @@ -115,6 +116,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = 0; *max_asize = 0; *ashift = 0; + *pshift = 0; return (0); } @@ -140,17 +142,26 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_root_ops = { - vdev_root_open, - vdev_root_close, - vdev_default_asize, - NULL, /* io_start - not applicable to the root */ - NULL, /* io_done - not applicable to the root */ - vdev_root_state_change, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_ROOT, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_root_open, + .vdev_op_close = vdev_root_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = NULL, /* not applicable to the root */ + .vdev_op_io_done = NULL, /* not applicable to the root */ + .vdev_op_state_change = vdev_root_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 5ad47cccda..deea7fedd7 100644 --- a/module/zfs/vdev_trim.c +++ 
b/module/zfs/vdev_trim.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP */ #include @@ -29,11 +30,11 @@ #include #include #include -#include #include #include #include #include +#include /* * TRIM is a feature which is used to notify a SSD that some previously @@ -311,13 +312,14 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, vd->vdev_trim_secure = secure; } - boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED); + vdev_trim_state_t old_state = vd->vdev_trim_state; + boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED); vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_NONE, tx); + guid, tx); switch (new_state) { case VDEV_TRIM_ACTIVE: @@ -332,9 +334,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, "vdev=%s suspended", vd->vdev_path); break; case VDEV_TRIM_CANCELED: - spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); - spa_history_log_internal(spa, "trim", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_TRIM_ACTIVE || + old_state == VDEV_TRIM_SUSPENDED) { + spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s canceled", vd->vdev_path); + } break; case VDEV_TRIM_COMPLETE: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH); @@ -346,6 +351,9 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, } dmu_tx_commit(tx); + + if (new_state != VDEV_TRIM_ACTIVE) + spa_notify_waiters(spa); } /* @@ -419,6 +427,35 @@ vdev_autotrim_cb(zio_t *zio) spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } +/* + * The zio_done_func_t done callback for each TRIM issued via + * vdev_trim_simple(). 
It is responsible for updating the TRIM stats and + * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best + * effort and are never reissued on failure. + */ +static void +vdev_trim_simple_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + mutex_enter(&vd->vdev_trim_io_lock); + + if (zio->io_error != 0) { + vd->vdev_stat.vs_trim_errors++; + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 0, 0, 0, 0, 1, zio->io_orig_size); + } else { + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 1, zio->io_orig_size, 0, 0, 0, 0); + } + + ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0); + vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--; + cv_broadcast(&vd->vdev_trim_io_cv); + mutex_exit(&vd->vdev_trim_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} /* * Returns the average trim rate in bytes/sec for the ta->trim_vdev. */ @@ -438,6 +475,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) { vdev_t *vd = ta->trim_vdev; spa_t *spa = vd->vdev_spa; + void *cb; mutex_enter(&vd->vdev_trim_io_lock); @@ -448,7 +486,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) if (ta->trim_type == TRIM_TYPE_MANUAL) { while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) && vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) { - cv_timedwait_sig(&vd->vdev_trim_io_cv, + cv_timedwait_idle(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock, ddi_get_lbolt() + MSEC_TO_TICK(10)); } @@ -456,8 +494,8 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) ta->trim_bytes_done += size; /* Limit in flight trimming I/Os */ - while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >= - zfs_trim_queue_limit) { + while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] + + vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) { cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); } vd->vdev_trim_inflight[ta->trim_type]++; @@ -477,8 +515,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) /* This 
is the first write of this txg. */ dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_trim_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + vdev_trim_zap_update_sync, guid, tx); } /* @@ -502,10 +539,17 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) if (ta->trim_type == TRIM_TYPE_MANUAL) vd->vdev_trim_offset[txg & TXG_MASK] = start + size; + if (ta->trim_type == TRIM_TYPE_MANUAL) { + cb = vdev_trim_cb; + } else if (ta->trim_type == TRIM_TYPE_AUTO) { + cb = vdev_autotrim_cb; + } else { + cb = vdev_trim_simple_cb; + } + zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd, - start, size, ta->trim_type == TRIM_TYPE_MANUAL ? - vdev_trim_cb : vdev_autotrim_cb, NULL, - ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags)); + start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, + ta->trim_flags)); /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */ dmu_tx_commit(tx); @@ -523,7 +567,8 @@ static int vdev_trim_ranges(trim_args_t *ta) { vdev_t *vd = ta->trim_vdev; - avl_tree_t *rt = &ta->trim_tree->rt_root; + zfs_btree_t *t = &ta->trim_tree->rt_root; + zfs_btree_index_t idx; uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; @@ -531,9 +576,10 @@ vdev_trim_ranges(trim_args_t *ta) ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs, + ta->trim_tree); if (extent_bytes_min && size < extent_bytes_min) { spa_iostats_trim_add(spa, ta->trim_type, @@ -548,9 +594,9 @@ vdev_trim_ranges(trim_args_t *ta) int error; error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + - rs->rs_start + (w * extent_bytes_max), - MIN(size - (w * extent_bytes_max), - 
extent_bytes_max)); + rs_get_start(rs, ta->trim_tree) + + (w *extent_bytes_max), MIN(size - + (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { return (error); } @@ -560,6 +606,32 @@ vdev_trim_ranges(trim_args_t *ta) return (0); } +static void +vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end = physical_rs->rs_end; +} + +static void +vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_trim_bytes_est += size; + + if (vd->vdev_trim_last_offset >= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += size; + } else if (vd->vdev_trim_last_offset > physical_rs->rs_start && + vd->vdev_trim_last_offset <= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += + vd->vdev_trim_last_offset - physical_rs->rs_start; + } +} + /* * Calculates the completion percentage of a manual TRIM. */ @@ -577,27 +649,35 @@ vdev_trim_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been trimmed. 
*/ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_trim_last_offset <= physical_rs.rs_start) { vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been trimmed */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_trim_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_trim_last_offset > last_rs_end) { vd->vdev_trim_bytes_done += ms_free; vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -611,25 +691,16 @@ vdev_trim_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; - vdev_xlate(vd, &logical_rs, &physical_rs); + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t idx; + for (range_seg_t *rs = zfs_btree_first(bt, &idx); + rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_trim_bytes_est += size; - if (vd->vdev_trim_last_offset >= physical_rs.rs_end) { - vd->vdev_trim_bytes_done += size; - } else if (vd->vdev_trim_last_offset > - physical_rs.rs_start && - vd->vdev_trim_last_offset <= - physical_rs.rs_end) { - vd->vdev_trim_bytes_done += - vd->vdev_trim_last_offset - - physical_rs.rs_start; - } + vdev_xlate_walk(vd, &logical_rs, + vdev_trim_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -697,8 +768,38 @@ vdev_trim_load(vdev_t *vd) return (err); } +static void +vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs) +{ + trim_args_t *ta = arg; + vdev_t *vd = ta->trim_vdev; + + /* + * Only a manual 
trim will be traversing the vdev sequentially. + * For an auto trim all valid ranges should be added. + */ + if (ta->trim_type == TRIM_TYPE_MANUAL) { + + /* Only add segments that we have not visited yet */ + if (physical_rs->rs_end <= vd->vdev_trim_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_trim_last_offset > physical_rs->rs_start) { + ASSERT3U(physical_rs->rs_end, >, + vd->vdev_trim_last_offset); + physical_rs->rs_start = vd->vdev_trim_last_offset; + } + } + + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(ta->trim_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + /* - * Convert the logical range into a physical range and add it to the + * Convert the logical range into physical ranges and add them to the * range tree passed in the trim_args_t. */ static void @@ -706,7 +807,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -719,48 +820,11 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) metaslab_t *msp = ta->trim_msp; VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); - VERIFY(range_tree_find(msp->ms_allocatable, start, size)); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* - * Only a manual trim will be traversing the vdev sequentially. - * For an auto trim all valid ranges should be added. 
- */ - if (ta->trim_type == TRIM_TYPE_MANUAL) { - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_trim_last_offset) - return; - - /* Pick up where we left off mid-range. */ - if (vd->vdev_trim_last_offset > physical_rs.rs_start) { - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_trim_last_offset); - physical_rs.rs_start = vd->vdev_trim_last_offset; - } - } - - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. - */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(ta->trim_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg); } /* @@ -798,7 +862,7 @@ vdev_trim_thread(void *arg) ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = range_tree_create(NULL, NULL); + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -837,7 +901,7 @@ vdev_trim_thread(void *arg) */ if (msp->ms_sm == NULL && vd->vdev_trim_partial) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_calculate_progress(vd); continue; @@ -849,7 +913,7 @@ vdev_trim_thread(void *arg) mutex_exit(&msp->ms_lock); error = vdev_trim_ranges(&ta); - metaslab_enable(msp, B_TRUE); + metaslab_enable(msp, B_TRUE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(ta.trim_tree, NULL, NULL); @@ -867,10 +931,16 @@ vdev_trim_thread(void *arg) range_tree_destroy(ta.trim_tree); mutex_enter(&vd->vdev_trim_lock); - if 
(!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { - vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, - vd->vdev_trim_rate, vd->vdev_trim_partial, - vd->vdev_trim_secure); + if (!vd->vdev_trim_exit_wanted) { + if (vdev_writeable(vd)) { + vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, + vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } else if (vd->vdev_faulted) { + vdev_trim_change_state(vd, VDEV_TRIM_CANCELED, + vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } } ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0); @@ -888,6 +958,8 @@ vdev_trim_thread(void *arg) vd->vdev_trim_thread = NULL; cv_broadcast(&vd->vdev_trim_cv); mutex_exit(&vd->vdev_trim_lock); + + thread_exit(); } /* @@ -1006,6 +1078,7 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) { spa_t *spa = vd->vdev_spa; list_t vd_list; + vdev_t *vd_l2cache; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -1013,6 +1086,17 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) offsetof(vdev_t, vdev_trim_node)); vdev_trim_stop_all_impl(vd, tgt_state, &vd_list); + + /* + * Iterate over cache devices and request stop trimming the + * whole device in case we export the pool or remove the cache + * device prematurely. 
+ */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd_l2cache = spa->spa_l2cache.sav_vdevs[i]; + vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list); + } + vdev_trim_stop_wait(spa, &vd_list); if (vd->vdev_spa->spa_sync_on) { @@ -1046,7 +1130,7 @@ vdev_trim_restart(vdev_t *vd) vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (timestamp), 1, &timestamp); ASSERT(err == 0 || err == ENOENT); - vd->vdev_trim_action_time = (time_t)timestamp; + vd->vdev_trim_action_time = timestamp; if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || vd->vdev_offline) { @@ -1080,7 +1164,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY3U(msp->ms_disabled, >, 0); - VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } /* @@ -1154,7 +1238,7 @@ vdev_autotrim_thread(void *arg) if (msp->ms_sm == NULL || range_tree_is_empty(msp->ms_trim)) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); continue; } @@ -1170,7 +1254,7 @@ vdev_autotrim_thread(void *arg) */ if (msp->ms_disabled > 1) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); continue; } @@ -1178,7 +1262,8 @@ vdev_autotrim_thread(void *arg) * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. 
*/ - trim_tree = range_tree_create(NULL, NULL); + trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(range_tree_is_empty(msp->ms_trim)); @@ -1232,7 +1317,8 @@ vdev_autotrim_thread(void *arg) if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = range_tree_create(NULL, NULL); + ta->trim_tree = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); range_tree_walk(trim_tree, vdev_trim_range_add, ta); } @@ -1288,7 +1374,7 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim); + metaslab_enable(msp, issued_trim, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { @@ -1350,6 +1436,8 @@ vdev_autotrim_thread(void *arg) vd->vdev_autotrim_thread = NULL; cv_broadcast(&vd->vdev_autotrim_cv); mutex_exit(&vd->vdev_autotrim_lock); + + thread_exit(); } /* @@ -1425,7 +1513,189 @@ vdev_autotrim_restart(spa_t *spa) vdev_autotrim(spa); } -#if defined(_KERNEL) +static void +vdev_trim_l2arc_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + trim_args_t ta; + range_seg64_t physical_rs; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_trim_last_offset = 0; + vd->vdev_trim_rate = 0; + vd->vdev_trim_partial = 0; + vd->vdev_trim_secure = 0; + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = TRIM_TYPE_MANUAL; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + physical_rs.rs_start = vd->vdev_trim_bytes_done = 0; + physical_rs.rs_end = vd->vdev_trim_bytes_est = + vdev_get_min_asize(vd); + + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + + 
mutex_enter(&vd->vdev_trim_lock); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + mutex_exit(&vd->vdev_trim_lock); + + (void) vdev_trim_ranges(&ta); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + mutex_enter(&vd->vdev_trim_lock); + if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { + vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, + vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } + ASSERT(vd->vdev_trim_thread != NULL || + vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0); + + /* + * Drop the vdev_trim_lock while we sync out the txg since it's + * possible that a device might be trying to come online and + * must check to see if it needs to restart a trim. That thread + * will be holding the spa_config_lock which would prevent the + * txg_wait_synced from completing. Same strategy as in + * vdev_trim_thread(). + */ + mutex_exit(&vd->vdev_trim_lock); + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + mutex_enter(&vd->vdev_trim_lock); + + /* + * Update the header of the cache device here, before + * broadcasting vdev_trim_cv which may lead to the removal + * of the device. The same applies for setting l2ad_trim_all to + * false. + */ + spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, + RW_READER); + bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd); + + vd->vdev_trim_thread = NULL; + if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE) + dev->l2ad_trim_all = B_FALSE; + + cv_broadcast(&vd->vdev_trim_cv); + mutex_exit(&vd->vdev_trim_lock); + + thread_exit(); +} + +/* + * Punches out TRIM threads for the L2ARC devices in a spa and assigns them + * to vd->vdev_trim_thread variable. 
This facilitates the management of + * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition + * to a pool or pool creation or when the header of the device is invalid. + */ +void +vdev_trim_l2arc(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off TRIM threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_t *vd = spa->spa_l2cache.sav_vdevs[i]; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + + if (dev == NULL || !dev->l2ad_trim_all) { + /* + * Don't attempt TRIM if the vdev is UNAVAIL or if the + * cache device was not marked for whole device TRIM + * (ie l2arc_trim_ahead = 0, or the L2ARC device header + * is valid with trim_state = VDEV_TRIM_COMPLETE and + * l2ad_log_entries > 0). + */ + continue; + } + + mutex_enter(&vd->vdev_trim_lock); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_trim_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + vd->vdev_trim_thread = thread_create(NULL, 0, + vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&vd->vdev_trim_lock); + } +} + +/* + * A wrapper which calls vdev_trim_ranges(). It is intended to be called + * on leaf vdevs. 
+ */ +int +vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) +{ + trim_args_t ta; + range_seg64_t physical_rs; + int error; + physical_rs.rs_start = start; + physical_rs.rs_end = start + size; + + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_top->vdev_removing); + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = TRIM_TYPE_SIMPLE; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } + + error = vdev_trim_ranges(&ta); + + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + return (error); +} + EXPORT_SYMBOL(vdev_trim); EXPORT_SYMBOL(vdev_trim_stop); EXPORT_SYMBOL(vdev_trim_stop_all); @@ -1435,26 +1705,22 @@ EXPORT_SYMBOL(vdev_autotrim); EXPORT_SYMBOL(vdev_autotrim_stop_all); EXPORT_SYMBOL(vdev_autotrim_stop_wait); EXPORT_SYMBOL(vdev_autotrim_restart); +EXPORT_SYMBOL(vdev_trim_l2arc); +EXPORT_SYMBOL(vdev_trim_simple); /* BEGIN CSTYLED */ -module_param(zfs_trim_extent_bytes_max, uint, 0644); -MODULE_PARM_DESC(zfs_trim_extent_bytes_max, +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, "Max size of TRIM commands, larger will be split"); -module_param(zfs_trim_extent_bytes_min, uint, 0644); -MODULE_PARM_DESC(zfs_trim_extent_bytes_min, +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW, "Min 
size of TRIM commands, smaller will be skipped"); -module_param(zfs_trim_metaslab_skip, uint, 0644); -MODULE_PARM_DESC(zfs_trim_metaslab_skip, +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW, "Skip metaslabs which have never been initialized"); -module_param(zfs_trim_txg_batch, uint, 0644); -MODULE_PARM_DESC(zfs_trim_txg_batch, +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW, "Min number of txgs to aggregate frees before issuing TRIM"); -module_param(zfs_trim_queue_limit, uint, 0644); -MODULE_PARM_DESC(zfs_trim_queue_limit, +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW, "Max queued TRIMs outstanding per leaf vdev"); /* END CSTYLED */ -#endif diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 6d8c498042..6f03beef3b 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -45,10 +45,39 @@ #include #include #include -#include #include #include +/* + * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object + * (all leaf blocks) when we start iterating over it. + * + * For zap_cursor_init(), the callers all intend to iterate through all the + * entries. There are a few cases where an error (typically i/o error) could + * cause it to bail out early. + * + * For zap_cursor_init_serialized(), there are callers that do the iteration + * outside of ZFS. Typically they would iterate over everything, but we + * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), + * zcp_snapshots_iter(), and other iterators over things in the MOS - these + * are called by /sbin/zfs and channel programs. The other example is + * zfs_readdir() which iterates over directory entries for the getdents() + * syscall. 
/sbin/ls iterates to the end (unless it receives a signal), but + * userland doesn't have to. + * + * Given that the ZAP entries aren't returned in a specific order, the only + * legitimate use cases for partial iteration would be: + * + * 1. Pagination: e.g. you only want to display 100 entries at a time, so you + * get the first 100 and then wait for the user to hit "next page", which + * they may never do. + * + * 2. You want to know if there are more than X entries, without relying on + * the zfs-specific implementation of the directory's st_size (which is + * the number of entries). + */ +int zap_iterate_prefetch = B_TRUE; + int fzap_default_block_shift = 14; /* 16k blocksize */ extern inline zap_phys_t *zap_f_phys(zap_t *zap); @@ -192,7 +221,8 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, tbl->zt_blks_copied++; dprintf("copied block %llu of %llu\n", - tbl->zt_blks_copied, tbl->zt_numblks); + (u_longlong_t)tbl->zt_blks_copied, + (u_longlong_t)tbl->zt_numblks); if (tbl->zt_blks_copied == tbl->zt_numblks) { (void) dmu_free_range(zap->zap_objset, zap->zap_object, @@ -205,7 +235,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, tbl->zt_blks_copied = 0; dprintf("finished; numblocks now %llu (%uk entries)\n", - tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10)); } return (0); @@ -220,7 +250,8 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); - dprintf("storing %llx at index %llx\n", val, idx); + dprintf("storing %llx at index %llx\n", (u_longlong_t)val, + (u_longlong_t)idx); uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); @@ -1000,7 +1031,7 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { if ((za->za_first_integer & mask) == (value & mask)) { - (void) 
strlcpy(name, za->za_name, MAXNAMELEN); break; } } @@ -1189,6 +1220,21 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) /* retrieve the next entry at or after zc_hash/zc_cd */ /* if no entry, return ENOENT */ + /* + * If we are reading from the beginning, we're almost certain to + * iterate over the entire ZAP object. If there are multiple leaf + * blocks (freeblk > 2), prefetch the whole object (up to + * dmu_prefetch_max bytes), so that we read the leaf blocks + * concurrently. (Unless noprefetch was requested via + * zap_cursor_init_noprefetch()). + */ + if (zc->zc_hash == 0 && zap_iterate_prefetch && + zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { + dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), + ZIO_PRIORITY_ASYNC_READ); + } + if (zc->zc_leaf && (ZAP_HASH_IDX(zc->zc_hash, zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != @@ -1333,3 +1379,8 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, + "When iterating ZAP object, prefetch it"); +/* END CSTYLED */ diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index b421dd5038..aa6c298c3b 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -467,7 +467,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l, } } - return (bestcd == -1U ? ENOENT : 0); + return (bestcd == -1U ? SET_ERROR(ENOENT) : 0); } int diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index fa369f7975..b4611685b2 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2017 Nexenta Systems, Inc. 
*/ @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -230,7 +229,7 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) return (zn); } -zap_name_t * +static zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); @@ -280,11 +279,11 @@ mze_compare(const void *arg1, const void *arg2) const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); } static void @@ -564,7 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", - obj, zap->zap_m.zap_num_entries); + (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; int err = mzap_upgrade(zapp, tag, tx, 0); if (err != 0) @@ -657,7 +656,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) } dprintf("upgrading obj=%llu with %u chunks\n", - zap->zap_object, nchunks); + (u_longlong_t)zap->zap_object, nchunks); /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); @@ -668,7 +667,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", - mze->mze_name, mze->mze_value); + mze->mze_name, (u_longlong_t)mze->mze_value); zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, @@ -1340,7 +1339,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u 
numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); + (u_longlong_t)zapobj, integer_size, + (u_longlong_t)num_integers, name); err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); if (err == 0) { err = fzap_update(zn, integer_size, num_integers, @@ -1472,9 +1472,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, * Routines for iterating over the attributes. */ -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) +static void +zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized, boolean_t prefetch) { zc->zc_objset = os; zc->zc_zap = NULL; @@ -1483,12 +1483,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; + zc->zc_prefetch = prefetch; +} +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); } +/* + * Initialize a cursor at the beginning of the ZAP object. The entire + * ZAP object will be prefetched. + */ void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { - zap_cursor_init_serialized(zc, os, zapobj, 0); + zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); +} + +/* + * Initialize a cursor at the beginning, but request that we not prefetch + * the entire ZAP object. 
+ */ +void +zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); } void @@ -1581,7 +1602,8 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; - (void) strcpy(za->za_name, mzep->mze_name); + (void) strlcpy(za->za_name, mzep->mze_name, + sizeof (za->za_name)); zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; err = 0; diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c index 4894df11d5..f724b44baf 100644 --- a/module/zfs/zcp.c +++ b/module/zfs/zcp.c @@ -66,7 +66,7 @@ * consuming excessive system or running forever. If one of these limits is * hit, the channel program will be stopped immediately and return from * zcp_eval() with an error code. No attempt will be made to roll back or undo - * any changes made by the channel program before the error occured. + * any changes made by the channel program before the error occurred. * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time * limit of 0, disabling the time limit. * @@ -77,7 +77,7 @@ * In place of a return value, an error message will also be returned in the * 'result' nvlist containing information about the error. No attempt will be * made to roll back or undo any changes made by the channel program before the - * error occured. + * error occurred. * * 3. If an error occurs inside a ZFS library call which returns an error code, * the error is returned to the Lua script to be handled as desired. 
@@ -100,6 +100,7 @@ #include #include #include +#include #ifndef KM_NORMALPRI #define KM_NORMALPRI 0 @@ -118,21 +119,6 @@ static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int); static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *, int); -typedef struct zcp_alloc_arg { - boolean_t aa_must_succeed; - int64_t aa_alloc_remaining; - int64_t aa_alloc_limit; -} zcp_alloc_arg_t; - -typedef struct zcp_eval_arg { - lua_State *ea_state; - zcp_alloc_arg_t *ea_allocargs; - cred_t *ea_cred; - nvlist_t *ea_outnvl; - int ea_result; - uint64_t ea_instrlimit; -} zcp_eval_arg_t; - /* * The outer-most error callback handler for use with lua_pcall(). On * error Lua will call this callback with a single argument that @@ -175,7 +161,7 @@ zcp_argerror(lua_State *state, int narg, const char *msg, ...) * of a function call. * * If an error occurs, the cleanup function will be invoked exactly once and - * then unreigstered. + * then unregistered. * * Returns the registered cleanup handler so the caller can deregister it * if no error occurs. 
@@ -410,7 +396,7 @@ zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, case LUA_TTABLE: { nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth); if (value_nvl == NULL) - return (EINVAL); + return (SET_ERROR(EINVAL)); fnvlist_add_nvlist(nvl, key, value_nvl); fnvlist_free(value_nvl); @@ -420,7 +406,7 @@ zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, (void) lua_pushfstring(state, "Invalid value type '%s' for key '%s'", lua_typename(state, lua_type(state, index)), key); - return (EINVAL); + return (SET_ERROR(EINVAL)); } return (0); @@ -452,7 +438,7 @@ zcp_lua_to_nvlist_helper(lua_State *state) static void zcp_convert_return_values(lua_State *state, nvlist_t *nvl, - const char *key, zcp_eval_arg_t *evalargs) + const char *key, int *result) { int err; VERIFY3U(1, ==, lua_gettop(state)); @@ -464,7 +450,7 @@ zcp_convert_return_values(lua_State *state, nvlist_t *nvl, err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */ if (err != 0) { zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR); - evalargs->ea_result = SET_ERROR(ECHRNG); + *result = SET_ERROR(ECHRNG); } } @@ -599,7 +585,7 @@ zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair, "Unhandled nvpair type %d for key '%s'", nvpair_type(pair), nvpair_name(pair)); } - return (EINVAL); + return (SET_ERROR(EINVAL)); } } return (err); @@ -668,7 +654,8 @@ zcp_debug(lua_State *state) dbgstring = lua_tostring(state, 1); - zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring); + zfs_dbgmsg("txg %lld ZCP: %s", (longlong_t)ri->zri_tx->tx_txg, + dbgstring); return (0); } @@ -736,8 +723,6 @@ static void * zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) { zcp_alloc_arg_t *allocargs = ud; - int flags = (allocargs->aa_must_succeed) ? 
- KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI); if (nsize == 0) { if (ptr != NULL) { @@ -760,10 +745,7 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) return (NULL); } - allocbuf = vmem_alloc(allocsize, flags); - if (allocbuf == NULL) { - return (NULL); - } + allocbuf = vmem_alloc(allocsize, KM_SLEEP); allocargs->aa_alloc_remaining -= allocsize; *allocbuf = allocsize; @@ -791,19 +773,32 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) static void zcp_lua_counthook(lua_State *state, lua_Debug *ar) { - /* - * If we're called, check how many instructions the channel program has - * executed so far, and compare against the limit. - */ lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); zcp_run_info_t *ri = lua_touserdata(state, -1); + /* + * Check if we were canceled while waiting for the + * txg to sync or from our open context thread + */ + if (ri->zri_canceled || + (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { + ri->zri_canceled = B_TRUE; + (void) lua_pushstring(state, "Channel program was canceled."); + (void) lua_error(state); + /* Unreachable */ + } + + /* + * Check how many instructions the channel program has + * executed so far, and compare against the limit. 
+ */ ri->zri_curinstrs += zfs_lua_check_instrlimit_interval; if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) { ri->zri_timed_out = B_TRUE; (void) lua_pushstring(state, "Channel program timed out."); (void) lua_error(state); + /* Unreachable */ } } @@ -816,31 +811,25 @@ zcp_panic_cb(lua_State *state) } static void -zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) +zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri) { int err; - zcp_run_info_t ri; - lua_State *state = evalargs->ea_state; + lua_State *state = ri->zri_state; VERIFY3U(3, ==, lua_gettop(state)); + /* finish initializing our runtime state */ + ri->zri_pool = dmu_tx_pool(tx); + ri->zri_tx = tx; + list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t), + offsetof(zcp_cleanup_handler_t, zch_node)); + /* * Store the zcp_run_info_t struct for this run in the Lua registry. * Registry entries are not directly accessible by the Lua scripts but * can be accessed by our callbacks. */ - ri.zri_space_used = 0; - ri.zri_pool = dmu_tx_pool(tx); - ri.zri_cred = evalargs->ea_cred; - ri.zri_tx = tx; - ri.zri_timed_out = B_FALSE; - ri.zri_sync = sync; - list_create(&ri.zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t), - offsetof(zcp_cleanup_handler_t, zch_node)); - ri.zri_curinstrs = 0; - ri.zri_maxinstrs = evalargs->ea_instrlimit; - - lua_pushlightuserdata(state, &ri); + lua_pushlightuserdata(state, ri); lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); VERIFY3U(3, ==, lua_gettop(state)); @@ -857,7 +846,7 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) * off control to the channel program. Channel programs that use too * much memory should die with ENOSPC. */ - evalargs->ea_allocargs->aa_must_succeed = B_FALSE; + ri->zri_allocargs->aa_must_succeed = B_FALSE; /* * Call the Lua function that open-context passed us. 
This pops the @@ -869,14 +858,14 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) /* * Let Lua use KM_SLEEP while we interpret the return values. */ - evalargs->ea_allocargs->aa_must_succeed = B_TRUE; + ri->zri_allocargs->aa_must_succeed = B_TRUE; /* * Remove the error handler callback from the stack. At this point, * there shouldn't be any cleanup handler registered in the handler * list (zri_cleanup_handlers), regardless of whether it ran or not. */ - list_destroy(&ri.zri_cleanup_handlers); + list_destroy(&ri->zri_cleanup_handlers); lua_remove(state, 1); switch (err) { @@ -896,16 +885,16 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) int return_count = lua_gettop(state); if (return_count == 1) { - evalargs->ea_result = 0; - zcp_convert_return_values(state, evalargs->ea_outnvl, - ZCP_RET_RETURN, evalargs); + ri->zri_result = 0; + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_RETURN, &ri->zri_result); } else if (return_count > 1) { - evalargs->ea_result = SET_ERROR(ECHRNG); + ri->zri_result = SET_ERROR(ECHRNG); lua_settop(state, 0); (void) lua_pushfstring(state, "Multiple return " "values not supported"); - zcp_convert_return_values(state, evalargs->ea_outnvl, - ZCP_RET_ERROR, evalargs); + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); } break; } @@ -919,19 +908,20 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) * stack. 
*/ VERIFY3U(1, ==, lua_gettop(state)); - if (ri.zri_timed_out) { - evalargs->ea_result = SET_ERROR(ETIME); + if (ri->zri_timed_out) { + ri->zri_result = SET_ERROR(ETIME); + } else if (ri->zri_canceled) { + ri->zri_result = SET_ERROR(EINTR); } else { - evalargs->ea_result = SET_ERROR(ECHRNG); + ri->zri_result = SET_ERROR(ECHRNG); } - zcp_convert_return_values(state, evalargs->ea_outnvl, - ZCP_RET_ERROR, evalargs); + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); - if (evalargs->ea_result == ETIME && - evalargs->ea_outnvl != NULL) { - (void) nvlist_add_uint64(evalargs->ea_outnvl, - ZCP_ARG_INSTRLIMIT, ri.zri_curinstrs); + if (ri->zri_result == ETIME && ri->zri_outnvl != NULL) { + (void) nvlist_add_uint64(ri->zri_outnvl, + ZCP_ARG_INSTRLIMIT, ri->zri_curinstrs); } break; } @@ -943,14 +933,16 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) * return the error message. */ VERIFY3U(1, ==, lua_gettop(state)); - if (ri.zri_timed_out) { - evalargs->ea_result = SET_ERROR(ETIME); + if (ri->zri_timed_out) { + ri->zri_result = SET_ERROR(ETIME); + } else if (ri->zri_canceled) { + ri->zri_result = SET_ERROR(EINTR); } else { - evalargs->ea_result = SET_ERROR(ECHRNG); + ri->zri_result = SET_ERROR(ECHRNG); } - zcp_convert_return_values(state, evalargs->ea_outnvl, - ZCP_RET_ERROR, evalargs); + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); break; } case LUA_ERRMEM: @@ -958,7 +950,7 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) * Lua ran out of memory while running the channel program. * There's not much we can do. 
*/ - evalargs->ea_result = SET_ERROR(ENOSPC); + ri->zri_result = SET_ERROR(ENOSPC); break; default: VERIFY0(err); @@ -966,21 +958,35 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) } static void -zcp_pool_error(zcp_eval_arg_t *evalargs, const char *poolname) +zcp_pool_error(zcp_run_info_t *ri, const char *poolname) { - evalargs->ea_result = SET_ERROR(ECHRNG); - lua_settop(evalargs->ea_state, 0); - (void) lua_pushfstring(evalargs->ea_state, "Could not open pool: %s", + ri->zri_result = SET_ERROR(ECHRNG); + lua_settop(ri->zri_state, 0); + (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s", poolname); - zcp_convert_return_values(evalargs->ea_state, evalargs->ea_outnvl, - ZCP_RET_ERROR, evalargs); + zcp_convert_return_values(ri->zri_state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); } +/* + * This callback is called when txg_wait_synced_sig encountered a signal. + * The txg_wait_synced_sig will continue to wait for the txg to complete + * after calling this callback. + */ +/* ARGSUSED */ +static void +zcp_eval_sig(void *arg, dmu_tx_t *tx) +{ + zcp_run_info_t *ri = arg; + + ri->zri_canceled = B_TRUE; +} + static void zcp_eval_sync(void *arg, dmu_tx_t *tx) { - zcp_eval_arg_t *evalargs = arg; + zcp_run_info_t *ri = arg; /* * Open context should have setup the stack to contain: @@ -988,15 +994,14 @@ zcp_eval_sync(void *arg, dmu_tx_t *tx) * 2: Script to run (converted to a Lua function) * 3: nvlist input to function (converted to Lua table or nil) */ - VERIFY3U(3, ==, lua_gettop(evalargs->ea_state)); + VERIFY3U(3, ==, lua_gettop(ri->zri_state)); - zcp_eval_impl(tx, B_TRUE, evalargs); + zcp_eval_impl(tx, ri); } static void -zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname) +zcp_eval_open(zcp_run_info_t *ri, const char *poolname) { - int error; dsl_pool_t *dp; dmu_tx_t *tx; @@ -1004,11 +1009,11 @@ zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname) /* * See comment from the same assertion in zcp_eval_sync(). 
*/ - VERIFY3U(3, ==, lua_gettop(evalargs->ea_state)); + VERIFY3U(3, ==, lua_gettop(ri->zri_state)); error = dsl_pool_hold(poolname, FTAG, &dp); if (error != 0) { - zcp_pool_error(evalargs, poolname); + zcp_pool_error(ri, poolname); return; } @@ -1023,7 +1028,7 @@ zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname) */ tx = dmu_tx_create_dd(dp->dp_mos_dir); - zcp_eval_impl(tx, B_FALSE, evalargs); + zcp_eval_impl(tx, ri); dmu_tx_abort(tx); @@ -1036,7 +1041,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, { int err; lua_State *state; - zcp_eval_arg_t evalargs; + zcp_run_info_t runinfo; if (instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); @@ -1136,24 +1141,41 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, } VERIFY3U(3, ==, lua_gettop(state)); - evalargs.ea_state = state; - evalargs.ea_allocargs = &allocargs; - evalargs.ea_instrlimit = instrlimit; - evalargs.ea_cred = CRED(); - evalargs.ea_outnvl = outnvl; - evalargs.ea_result = 0; + runinfo.zri_state = state; + runinfo.zri_allocargs = &allocargs; + runinfo.zri_outnvl = outnvl; + runinfo.zri_result = 0; + runinfo.zri_cred = CRED(); + runinfo.zri_proc = curproc; + runinfo.zri_timed_out = B_FALSE; + runinfo.zri_canceled = B_FALSE; + runinfo.zri_sync = sync; + runinfo.zri_space_used = 0; + runinfo.zri_curinstrs = 0; + runinfo.zri_maxinstrs = instrlimit; + runinfo.zri_new_zvols = fnvlist_alloc(); if (sync) { - err = dsl_sync_task(poolname, NULL, - zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL); + err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync, + zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL); if (err != 0) - zcp_pool_error(&evalargs, poolname); + zcp_pool_error(&runinfo, poolname); } else { - zcp_eval_open(&evalargs, poolname); + zcp_eval_open(&runinfo, poolname); } lua_close(state); - return (evalargs.ea_result); + /* + * Create device minor nodes for any new zvols. 
+ */ + for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL); + pair != NULL; + pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) { + zvol_create_minor(nvpair_name(pair)); + } + fnvlist_free(runinfo.zri_new_zvols); + + return (runinfo.zri_result); } /* @@ -1421,14 +1443,10 @@ zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, } } -#if defined(_KERNEL) /* BEGIN CSTYLED */ -module_param(zfs_lua_max_instrlimit, ulong, 0644); -MODULE_PARM_DESC(zfs_lua_max_instrlimit, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW, "Max instruction limit that can be specified for a channel program"); -module_param(zfs_lua_max_memlimit, ulong, 0644); -MODULE_PARM_DESC(zfs_lua_max_memlimit, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW, "Max memory limit that can be specified for a channel program"); /* END CSTYLED */ -#endif diff --git a/module/zfs/zcp_get.c b/module/zfs/zcp_get.c index ed98f0d102..7256e4de19 100644 --- a/module/zfs/zcp_get.c +++ b/module/zfs/zcp_get.c @@ -34,11 +34,13 @@ #include #include #include +#include #include #include #include #ifdef _KERNEL +#include #include #endif @@ -81,13 +83,13 @@ get_objset_type_name(dsl_dataset_t *ds, char *str) return (error); switch (type) { case ZFS_TYPE_SNAPSHOT: - (void) strcpy(str, "snapshot"); + (void) strlcpy(str, "snapshot", ZAP_MAXVALUELEN); break; case ZFS_TYPE_FILESYSTEM: - (void) strcpy(str, "filesystem"); + (void) strlcpy(str, "filesystem", ZAP_MAXVALUELEN); break; case ZFS_TYPE_VOLUME: - (void) strcpy(str, "volume"); + (void) strlcpy(str, "volume", ZAP_MAXVALUELEN); break; default: return (EINVAL); @@ -206,91 +208,12 @@ get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, break; default: mutex_exit(&dd->dd_lock); - return (ENOENT); + return (SET_ERROR(ENOENT)); } mutex_exit(&dd->dd_lock); return (0); } -/* - * Takes a dataset, a property, a value and that value's setpoint as - * found in the ZAP. 
Checks if the property has been changed in the vfs. - * If so, val and setpoint will be overwritten with updated content. - * Otherwise, they are left unchanged. - */ -static int -get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, - char *setpoint) -{ -#if !defined(_KERNEL) - return (0); -#else - int error; - zfsvfs_t *zfvp; - vfs_t *vfsp; - objset_t *os; - uint64_t tmp = *val; - - error = dmu_objset_from_ds(ds, &os); - if (error != 0) - return (error); - - if (dmu_objset_type(os) != DMU_OST_ZFS) - return (EINVAL); - - mutex_enter(&os->os_user_ptr_lock); - zfvp = dmu_objset_get_user(os); - mutex_exit(&os->os_user_ptr_lock); - if (zfvp == NULL) - return (ESRCH); - - vfsp = zfvp->z_vfs; - - switch (zfs_prop) { - case ZFS_PROP_ATIME: - if (vfsp->vfs_do_atime) - tmp = vfsp->vfs_atime; - break; - case ZFS_PROP_RELATIME: - if (vfsp->vfs_do_relatime) - tmp = vfsp->vfs_relatime; - break; - case ZFS_PROP_DEVICES: - if (vfsp->vfs_do_devices) - tmp = vfsp->vfs_devices; - break; - case ZFS_PROP_EXEC: - if (vfsp->vfs_do_exec) - tmp = vfsp->vfs_exec; - break; - case ZFS_PROP_SETUID: - if (vfsp->vfs_do_setuid) - tmp = vfsp->vfs_setuid; - break; - case ZFS_PROP_READONLY: - if (vfsp->vfs_do_readonly) - tmp = vfsp->vfs_readonly; - break; - case ZFS_PROP_XATTR: - if (vfsp->vfs_do_xattr) - tmp = vfsp->vfs_xattr; - break; - case ZFS_PROP_NBMAND: - if (vfsp->vfs_do_nbmand) - tmp = vfsp->vfs_nbmand; - break; - default: - return (ENOENT); - } - - if (tmp != *val) { - (void) strcpy(setpoint, "temporary"); - *val = tmp; - } - return (0); -#endif -} - /* * Check if the property we're looking for is stored at the dsl_dataset or * dsl_dir level. 
If so, push the property value and source onto the lua stack @@ -399,11 +322,11 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, break; case ZFS_PROP_FILESYSTEM_COUNT: error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); - (void) strcpy(setpoint, ""); + (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_SNAPSHOT_COUNT: error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); - (void) strcpy(setpoint, ""); + (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_NUMCLONES: numval = dsl_get_numclones(ds); @@ -423,19 +346,17 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, case ZFS_PROP_RECEIVE_RESUME_TOKEN: { char *token = get_receive_resume_stats_impl(ds); - VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), - <, ZAP_MAXVALUELEN); + (void) strlcpy(strval, token, ZAP_MAXVALUELEN); if (strcmp(strval, "") == 0) { char *childval = get_child_receive_stats(ds); - VERIFY3U(strlcpy(strval, childval, ZAP_MAXVALUELEN), - <, ZAP_MAXVALUELEN); + (void) strlcpy(strval, childval, ZAP_MAXVALUELEN); if (strcmp(strval, "") == 0) error = ENOENT; - strfree(childval); + kmem_strfree(childval); } - strfree(token); + kmem_strfree(token); break; } case ZFS_PROP_VOLSIZE: @@ -447,7 +368,8 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, sizeof (numval), 1, &numval); } if (error == 0) - (void) strcpy(setpoint, dsname); + (void) strlcpy(setpoint, dsname, + ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_VOLBLOCKSIZE: { @@ -549,9 +471,14 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), 1, &numval, setpoint); - /* Fill in temorary value for prop, if applicable */ - (void) get_temporary_prop(ds, zfs_prop, &numval, setpoint); - +#ifdef _KERNEL + /* Fill in temporary value for prop, if applicable */ + (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint); +#else + return 
(luaL_error(state, + "temporary properties only supported in kernel mode", + prop_name)); +#endif /* Push value to lua stack */ if (prop_type == PROP_TYPE_INDEX) { const char *propval; @@ -663,7 +590,7 @@ get_userquota_prop(const char *prop_name) * prop type as well as the numeric group/user ids based on the string * following the '@' in the property name. On success, returns 0. On failure, * returns a non-zero error. - * 'domain' must be free'd by caller using strfree() + * 'domain' must be free'd by caller using kmem_strfree() */ static int parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, @@ -680,7 +607,7 @@ parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, if (strncmp(cp, "S-1-", 4) == 0) { /* * It's a numeric SID (eg "S-1-234-567-89") and we want to - * seperate the domain id and the rid + * separate the domain id and the rid */ int domain_len = strrchr(cp, '-') - cp; domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); @@ -690,7 +617,7 @@ parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); if (*end != '\0') { - strfree(domain_val); + kmem_strfree(domain_val); return (EINVAL); } } else { @@ -738,13 +665,13 @@ zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, } } if (domain != NULL) - strfree(domain); + kmem_strfree(domain); } dsl_dataset_rele(ds, FTAG); if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) || (type == ZFS_PROP_GROUPQUOTA))) - error = ENOENT; + error = SET_ERROR(ENOENT); if (error != 0) { return (zcp_handle_error(state, dataset_name, prop_name, error)); @@ -768,9 +695,10 @@ parse_written_prop(const char *dataset_name, const char *prop_name, ASSERT(zfs_prop_written(prop_name)); const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; if (strchr(name, '@') == NULL) { - (void) sprintf(snap_name, "%s@%s", dataset_name, name); + (void) snprintf(snap_name, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", + dataset_name, name); } else { - (void) 
strcpy(snap_name, name); + (void) strlcpy(snap_name, name, ZFS_MAX_DATASET_NAME_LEN); } } diff --git a/module/zfs/zcp_iter.c b/module/zfs/zcp_iter.c index f264455207..f727c56f21 100644 --- a/module/zfs/zcp_iter.c +++ b/module/zfs/zcp_iter.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2018 by Delphix. All rights reserved. */ #include @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -124,8 +125,6 @@ zcp_clones_list(lua_State *state) { const char *snapname = lua_tostring(state, 1); dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - boolean_t issnap; - uint64_t dsobj, cursor; /* * zcp_dataset_hold will either successfully return the requested @@ -135,9 +134,9 @@ zcp_clones_list(lua_State *state) dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - cursor = 0; - issnap = ds->ds_is_snapshot; - dsobj = ds->ds_object; + boolean_t issnap = ds->ds_is_snapshot; + uint64_t cursor = 0; + uint64_t dsobj = ds->ds_object; dsl_dataset_rele(ds, FTAG); if (!issnap) { @@ -323,7 +322,7 @@ zcp_children_list(lua_State *state) } static int -zcp_props_list_gc(lua_State *state) +zcp_user_props_list_gc(lua_State *state) { nvlist_t **props = lua_touserdata(state, 1); if (*props != NULL) @@ -332,7 +331,7 @@ zcp_props_list_gc(lua_State *state) } static int -zcp_props_iter(lua_State *state) +zcp_user_props_iter(lua_State *state) { char *source, *val; nvlist_t *nvprop; @@ -361,11 +360,33 @@ zcp_props_iter(lua_State *state) return (3); } -static int zcp_props_list(lua_State *); +static int zcp_user_props_list(lua_State *); +static zcp_list_info_t zcp_user_props_list_info = { + .name = "user_properties", + .func = zcp_user_props_list, + .gc = zcp_user_props_list_gc, + .pargs = { + { .za_name = "filesystem | snapshot | volume", + .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +/* + * 
'properties' was the initial name for 'user_properties' seen + * above. 'user_properties' is a better name as it distinguishes + * these properties from 'system_properties' which are different. + * In order to avoid breaking compatibility between different + * versions of ZFS, we declare 'properties' as an alias for + * 'user_properties'. + */ static zcp_list_info_t zcp_props_list_info = { .name = "properties", - .func = zcp_props_list, - .gc = zcp_props_list_gc, + .func = zcp_user_props_list, + .gc = zcp_user_props_list_gc, .pargs = { { .za_name = "filesystem | snapshot | volume", .za_lua_type = LUA_TSTRING}, @@ -377,7 +398,7 @@ static zcp_list_info_t zcp_props_list_info = { }; static int -zcp_props_list(lua_State *state) +zcp_user_props_list(lua_State *state) { const char *dsname = lua_tostring(state, 1); dsl_pool_t *dp = zcp_run_info(state)->zri_pool; @@ -392,23 +413,24 @@ zcp_props_list(lua_State *state) dsl_dataset_rele(ds, FTAG); /* - * Set the metatable for the properties list to free it on completion. + * Set the metatable for the properties list to free it on + * completion. */ - luaL_getmetatable(state, zcp_props_list_info.name); + luaL_getmetatable(state, zcp_user_props_list_info.name); (void) lua_setmetatable(state, -2); lua_pushlightuserdata(state, NULL); - lua_pushcclosure(state, &zcp_props_iter, 2); + lua_pushcclosure(state, &zcp_user_props_iter, 2); return (1); } /* - * Populate nv with all valid properties and their values for the given + * Populate nv with all valid system properties and their values for the given * dataset. */ static void -zcp_dataset_props(dsl_dataset_t *ds, nvlist_t *nv) +zcp_dataset_system_props(dsl_dataset_t *ds, nvlist_t *nv) { for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) { /* Do not display hidden props */ @@ -435,8 +457,8 @@ static zcp_list_info_t zcp_system_props_list_info = { }; /* - * Get a list of all visble properties and their values for a given dataset. - * Returned on the stack as a Lua table. 
+ * Get a list of all visible system properties and their values for a given + * dataset. Returned on the stack as a Lua table. */ static int zcp_system_props_list(lua_State *state) @@ -454,8 +476,8 @@ zcp_system_props_list(lua_State *state) if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - /* Get the names of all valid properties for this dataset */ - zcp_dataset_props(ds, nv); + /* Get the names of all valid system properties for this dataset */ + zcp_dataset_system_props(ds, nv); dsl_dataset_rele(ds, FTAG); /* push list as lua table */ @@ -468,6 +490,213 @@ zcp_system_props_list(lua_State *state) return (1); } +static int +zcp_bookmarks_iter(lua_State *state) +{ + char ds_name[ZFS_MAX_DATASET_NAME_LEN]; + char bookmark_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + dsl_dataset_t *ds; + zap_attribute_t za; + zap_cursor_t zc; + + int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + err = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_BOOKMARK_NAMES, sizeof (ds->ds_bookmarks_obj), 1, + &ds->ds_bookmarks_obj); + if (err != 0 && err != ENOENT) { + dsl_dataset_rele(ds, FTAG); + return (luaL_error(state, + "unexpected error %d from zap_lookup()", err)); + } + if (ds->ds_bookmarks_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + /* Store the dataset's name so we can append the bookmark's name */ + dsl_dataset_name(ds, ds_name); + + zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_bookmarks_obj, cursor); + dsl_dataset_rele(ds, FTAG); + + err = zap_cursor_retrieve(&zc, &za); + if (err 
!= 0) { + zap_cursor_fini(&zc); + if (err != ENOENT) { + return (luaL_error(state, + "unexpected error %d from zap_cursor_retrieve()", + err)); + } + return (0); + } + zap_cursor_advance(&zc); + cursor = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + + /* Create the full "pool/fs#bookmark" string to return */ + int n = snprintf(bookmark_name, ZFS_MAX_DATASET_NAME_LEN, "%s#%s", + ds_name, za.za_name); + if (n >= ZFS_MAX_DATASET_NAME_LEN) { + return (luaL_error(state, + "unexpected error %d from snprintf()", ENAMETOOLONG)); + } + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, bookmark_name); + return (1); +} + +static int zcp_bookmarks_list(lua_State *); +static zcp_list_info_t zcp_bookmarks_list_info = { + .name = "bookmarks", + .func = zcp_bookmarks_list, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_bookmarks_list(lua_State *state) +{ + const char *dsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + boolean_t issnap = ds->ds_is_snapshot; + uint64_t dsobj = ds->ds_object; + uint64_t cursor = 0; + dsl_dataset_rele(ds, FTAG); + + if (issnap) { + return (zcp_argerror(state, 1, "%s is a snapshot", dsname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, cursor); + lua_pushcclosure(state, &zcp_bookmarks_iter, 2); + return (1); +} + +static int +zcp_holds_iter(lua_State *state) +{ + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + dsl_dataset_t *ds; + zap_attribute_t za; + zap_cursor_t zc; + + int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err == ENOENT) { + return (0); + } else if (err 
!= 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_userrefs_obj, cursor); + dsl_dataset_rele(ds, FTAG); + + err = zap_cursor_retrieve(&zc, &za); + if (err != 0) { + zap_cursor_fini(&zc); + if (err != ENOENT) { + return (luaL_error(state, + "unexpected error %d from zap_cursor_retrieve()", + err)); + } + return (0); + } + zap_cursor_advance(&zc); + cursor = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, za.za_name); + (void) lua_pushnumber(state, za.za_first_integer); + return (2); +} + +static int zcp_holds_list(lua_State *); +static zcp_list_info_t zcp_holds_list_info = { + .name = "holds", + .func = zcp_holds_list, + .gc = NULL, + .pargs = { + { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +/* + * Iterate over all the holds for a given dataset. Each iteration returns + * a hold's tag and its timestamp as an integer. 
+ */ +static int +zcp_holds_list(lua_State *state) +{ + const char *snapname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + boolean_t issnap = ds->ds_is_snapshot; + uint64_t dsobj = ds->ds_object; + uint64_t cursor = 0; + dsl_dataset_rele(ds, FTAG); + + if (!issnap) { + return (zcp_argerror(state, 1, "%s is not a snapshot", + snapname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, cursor); + lua_pushcclosure(state, &zcp_holds_iter, 2); + return (1); +} + static int zcp_list_func(lua_State *state) { @@ -485,9 +714,12 @@ zcp_load_list_lib(lua_State *state) zcp_list_info_t *zcp_list_funcs[] = { &zcp_children_list_info, &zcp_snapshots_list_info, + &zcp_user_props_list_info, &zcp_props_list_info, &zcp_clones_list_info, &zcp_system_props_list_info, + &zcp_bookmarks_list_info, + &zcp_holds_list_info, NULL }; diff --git a/module/zfs/zcp_set.c b/module/zfs/zcp_set.c new file mode 100644 index 0000000000..cebb56a5f1 --- /dev/null +++ b/module/zfs/zcp_set.c @@ -0,0 +1,100 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2020 Joyent, Inc. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void +zcp_set_user_prop(lua_State *state, dsl_pool_t *dp, const char *dsname, + const char *prop_name, const char *prop_val, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG); + if (ds == NULL) + return; /* not reached; zcp_dataset_hold() longjmp'd */ + + nvlist_t *nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, prop_name, prop_val); + + dsl_props_set_sync_impl(ds, ZPROP_SRC_LOCAL, nvl, tx); + + fnvlist_free(nvl); + dsl_dataset_rele(ds, FTAG); +} + +int +zcp_set_prop_check(void *arg, dmu_tx_t *tx) +{ + zcp_set_prop_arg_t *args = arg; + const char *prop_name = args->prop; + dsl_props_set_arg_t dpsa = { + .dpsa_dsname = args->dsname, + .dpsa_source = ZPROP_SRC_LOCAL, + }; + nvlist_t *nvl = NULL; + int ret = 0; + + /* + * Only user properties are currently supported. When non-user + * properties are supported, we will want to use + * zfs_valid_proplist() to verify the properties. + */ + if (!zfs_prop_user(prop_name)) { + return (EINVAL); + } + + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, args->prop, args->val); + dpsa.dpsa_props = nvl; + + ret = dsl_props_set_check(&dpsa, tx); + nvlist_free(nvl); + + return (ret); +} + +void +zcp_set_prop_sync(void *arg, dmu_tx_t *tx) +{ + zcp_set_prop_arg_t *args = arg; + zcp_run_info_t *ri = zcp_run_info(args->state); + dsl_pool_t *dp = ri->zri_pool; + + const char *dsname = args->dsname; + const char *prop_name = args->prop; + const char *prop_val = args->val; + + if (zfs_prop_user(prop_name)) { + zcp_set_user_prop(args->state, dp, dsname, prop_name, + prop_val, tx); + } +} diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c index e089666f20..c6ade59b9c 100644 --- a/module/zfs/zcp_synctask.c +++ b/module/zfs/zcp_synctask.c @@ -15,12 +15,15 @@ /* * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. + * Copyright 2020 Joyent, Inc. */ #include #include #include +#include #include #include #include @@ -35,6 +38,12 @@ #define DST_AVG_BLKSHIFT 14 +typedef struct zcp_inherit_prop_arg { + lua_State *zipa_state; + const char *zipa_prop; + dsl_props_set_arg_t zipa_dpsa; +} zcp_inherit_prop_arg_t; + typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *); typedef struct zcp_synctask_info { const char *name; @@ -45,6 +54,12 @@ typedef struct zcp_synctask_info { int blocks_modified; } zcp_synctask_info_t; +static void +zcp_synctask_cleanup(void *arg) +{ + fnvlist_free(arg); +} + /* * Generic synctask interface for channel program syncfuncs. * @@ -177,6 +192,7 @@ zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details) ddpa.ddpa_clonename = dsname; ddpa.err_ds = err_details; ddpa.cr = ri->zri_cred; + ddpa.proc = ri->zri_proc; /* * If there was a snapshot name conflict, then err_ds will be filled @@ -250,7 +266,7 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) * context. */ if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) { - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } /* @@ -260,21 +276,191 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) ddsa.ddsa_errors = NULL; ddsa.ddsa_props = NULL; ddsa.ddsa_cr = ri->zri_cred; + ddsa.ddsa_proc = ri->zri_proc; ddsa.ddsa_snaps = fnvlist_alloc(); fnvlist_add_boolean(ddsa.ddsa_snaps, dsname); zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps); + zcp_synctask_cleanup, ddsa.ddsa_snaps); err = zcp_sync_task(state, dsl_dataset_snapshot_check, dsl_dataset_snapshot_sync, &ddsa, sync, dsname); + if (err == 0) { + /* + * We may need to create a new device minor node for this + * dataset (if it is a zvol and the "snapdev" property is set). 
+ * Save it in the nvlist so that it can be processed in open + * context. + */ + fnvlist_add_boolean(ri->zri_new_zvols, dsname); + } + zcp_deregister_cleanup(state, zch); fnvlist_free(ddsa.ddsa_snaps); return (err); } +static int zcp_synctask_inherit_prop(lua_State *, boolean_t, + nvlist_t *err_details); +static zcp_synctask_info_t zcp_synctask_inherit_prop_info = { + .name = "inherit", + .func = zcp_synctask_inherit_prop, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 2, /* 2 * numprops */ + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, + { .za_name = "property", .za_lua_type = LUA_TSTRING }, + { NULL, 0 } + }, + .kwargs = { + { NULL, 0 } + }, +}; + +static int +zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx) +{ + zcp_inherit_prop_arg_t *args = arg; + zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop); + + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(args->zipa_prop)) + return (0); + + return (EINVAL); + } + + if (zfs_prop_readonly(prop)) + return (EINVAL); + + if (!zfs_prop_inheritable(prop)) + return (EINVAL); + + return (dsl_props_set_check(&args->zipa_dpsa, tx)); +} + +static void +zcp_synctask_inherit_prop_sync(void *arg, dmu_tx_t *tx) +{ + zcp_inherit_prop_arg_t *args = arg; + dsl_props_set_arg_t *dpsa = &args->zipa_dpsa; + + dsl_props_set_sync(dpsa, tx); +} + +static int +zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, + nvlist_t *err_details) +{ + int err; + zcp_inherit_prop_arg_t zipa = { 0 }; + dsl_props_set_arg_t *dpsa = &zipa.zipa_dpsa; + + const char *dsname = lua_tostring(state, 1); + const char *prop = lua_tostring(state, 2); + + zipa.zipa_state = state; + zipa.zipa_prop = prop; + dpsa->dpsa_dsname = dsname; + dpsa->dpsa_source = ZPROP_SRC_INHERITED; + dpsa->dpsa_props = fnvlist_alloc(); + fnvlist_add_boolean(dpsa->dpsa_props, prop); + + zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, + zcp_synctask_cleanup, dpsa->dpsa_props); + + err = zcp_sync_task(state, 
zcp_synctask_inherit_prop_check, + zcp_synctask_inherit_prop_sync, &zipa, sync, dsname); + + zcp_deregister_cleanup(state, zch); + fnvlist_free(dpsa->dpsa_props); + + return (err); +} + +static int zcp_synctask_bookmark(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_bookmark_info = { + .name = "bookmark", + .func = zcp_synctask_bookmark, + .pargs = { + {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING}, + {.za_name = "bookmark", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_NORMAL, + .blocks_modified = 1, +}; + +/* ARGSUSED */ +static int +zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + const char *source = lua_tostring(state, 1); + const char *new = lua_tostring(state, 2); + + nvlist_t *bmarks = fnvlist_alloc(); + fnvlist_add_string(bmarks, new, source); + + zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, + zcp_synctask_cleanup, bmarks); + + dsl_bookmark_create_arg_t dbca = { + .dbca_bmarks = bmarks, + .dbca_errors = NULL, + }; + err = zcp_sync_task(state, dsl_bookmark_create_check, + dsl_bookmark_create_sync, &dbca, sync, source); + + zcp_deregister_cleanup(state, zch); + fnvlist_free(bmarks); + + return (err); +} + +static int zcp_synctask_set_prop(lua_State *, boolean_t, nvlist_t *err_details); +static zcp_synctask_info_t zcp_synctask_set_prop_info = { + .name = "set_prop", + .func = zcp_synctask_set_prop, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 2, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "property", .za_lua_type = LUA_TSTRING}, + { .za_name = "value", .za_lua_type = LUA_TSTRING}, + { NULL, 0 } + }, + .kwargs = { + { NULL, 0 } + } +}; + +static int +zcp_synctask_set_prop(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + zcp_set_prop_arg_t args = { 0 }; + + const char *dsname = lua_tostring(state, 1); + const char 
*prop = lua_tostring(state, 2); + const char *val = lua_tostring(state, 3); + + args.state = state; + args.dsname = dsname; + args.prop = prop; + args.val = val; + + err = zcp_sync_task(state, zcp_set_prop_check, zcp_set_prop_sync, + &args, sync, dsname); + + return (err); +} + static int zcp_synctask_wrapper(lua_State *state) { @@ -287,8 +473,7 @@ zcp_synctask_wrapper(lua_State *state) * Make sure err_details is properly freed, even if a fatal error is * thrown during the synctask. */ - zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, err_details); + zch = zcp_register_cleanup(state, zcp_synctask_cleanup, err_details); zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); boolean_t sync = lua_toboolean(state, lua_upvalueindex(2)); @@ -343,6 +528,9 @@ zcp_load_synctask_lib(lua_State *state, boolean_t sync) &zcp_synctask_promote_info, &zcp_synctask_rollback_info, &zcp_synctask_snapshot_info, + &zcp_synctask_inherit_prop_info, + &zcp_synctask_bookmark_info, + &zcp_synctask_set_prop_info, NULL }; diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c index ed6ebcfc9d..9d16fff81d 100644 --- a/module/zfs/zfeature.c +++ b/module/zfs/zfeature.c @@ -203,7 +203,7 @@ spa_features_check(spa_t *spa, boolean_t for_write, supported = B_FALSE; if (NULL != unsup_feat) { - char *desc = ""; + const char *desc = ""; if (zap_lookup(os, spa->spa_feat_desc_obj, za->za_name, 1, MAXPATHLEN, buf) == 0) @@ -279,7 +279,7 @@ feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, static int feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { - ASSERTV(uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj); + uint64_t enabled_txg_obj __maybe_unused = spa->spa_feat_enabled_txg_obj; ASSERT(zfeature_depends_on(feature->fi_feature, SPA_FEATURE_ENABLED_TXG)); @@ -397,9 +397,9 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, { uint64_t refcount = 0; zfeature_info_t *feature = 
&spa_feature_table[fid]; - ASSERTV(uint64_t zapobj = + uint64_t zapobj __maybe_unused = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj); + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; ASSERT(VALID_FEATURE_FID(fid)); ASSERT(0 != zapobj); diff --git a/module/zfs/zfs_byteswap.c b/module/zfs/zfs_byteswap.c index 7893bde4e2..cd35849c3f 100644 --- a/module/zfs/zfs_byteswap.c +++ b/module/zfs/zfs_byteswap.c @@ -30,6 +30,9 @@ #include #include +#ifndef _KERNEL +static +#endif void zfs_oldace_byteswap(ace_t *ace, int ace_cnt) { @@ -44,8 +47,11 @@ zfs_oldace_byteswap(ace_t *ace, int ace_cnt) } /* - * swap ace_t and ace_oject_t + * swap ace_t and ace_object_t */ +#ifndef _KERNEL +static +#endif void zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) { @@ -70,7 +76,7 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) * larger than needed to hold the aces * present. As long as we do not do any * swapping beyond the end of our block we are - * okay. It it safe to swap any non-ace data + * okay. It is safe to swap any non-ace data * within the block since it is just zeros. */ if (ptr + sizeof (zfs_ace_hdr_t) > end) { diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 579aa03804..007f31b4e7 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012,2021 by Delphix. All rights reserved. */ #include @@ -101,7 +101,289 @@ * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ + #ifdef _KERNEL +/* + * Duplicate ereport Detection + * + * Some ereports are retained momentarily for detecting duplicates. These + * are kept in a recent_events_node_t in both a time-ordered list and an AVL + * tree of recent unique ereports. 
+ * + * The lifespan of these recent ereports is bounded (15 mins) and a cleaner + * task is used to purge stale entries. + */ +static list_t recent_events_list; +static avl_tree_t recent_events_tree; +static kmutex_t recent_events_lock; +static taskqid_t recent_events_cleaner_tqid; + +/* + * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. + * + * This setting can be changed dynamically and setting it to zero + * disables duplicate detection. + */ +unsigned int zfs_zevent_retain_max = 2000; + +/* + * The lifespan for a recent ereport entry. The default of 15 minutes is + * intended to outlive the zfs diagnosis engine's threshold of 10 errors + * over a period of 10 minutes. + */ +unsigned int zfs_zevent_retain_expire_secs = 900; + +typedef enum zfs_subclass { + ZSC_IO, + ZSC_DATA, + ZSC_CHECKSUM +} zfs_subclass_t; + +typedef struct { + /* common criteria */ + uint64_t re_pool_guid; + uint64_t re_vdev_guid; + int re_io_error; + uint64_t re_io_size; + uint64_t re_io_offset; + zfs_subclass_t re_subclass; + zio_priority_t re_io_priority; + + /* logical zio criteria (optional) */ + zbookmark_phys_t re_io_bookmark; + + /* internal state */ + avl_node_t re_tree_link; + list_node_t re_list_link; + uint64_t re_timestamp; +} recent_events_node_t; + +static int +recent_events_compare(const void *a, const void *b) +{ + const recent_events_node_t *node1 = a; + const recent_events_node_t *node2 = b; + int cmp; + + /* + * The comparison order here is somewhat arbitrary. + * What's important is that if every criterion matches, then it + * is a duplicate (i.e. 
compare returns 0) + */ + if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) + return (cmp); + + const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; + const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; + + if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) + return (cmp); + + return (0); +} + +static void zfs_ereport_schedule_cleaner(void); + +/* + * background task to clean stale recent event nodes. 
+ */ +/*ARGSUSED*/ +static void +zfs_ereport_cleaner(void *arg) +{ + recent_events_node_t *entry; + uint64_t now = gethrtime(); + + /* + * purge expired entries + */ + mutex_enter(&recent_events_lock); + while ((entry = list_tail(&recent_events_list)) != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + if (age <= zfs_zevent_retain_expire_secs) + break; + + /* remove expired node */ + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + + /* Restart the cleaner if more entries remain */ + recent_events_cleaner_tqid = 0; + if (!list_is_empty(&recent_events_list)) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); +} + +static void +zfs_ereport_schedule_cleaner(void) +{ + ASSERT(MUTEX_HELD(&recent_events_lock)); + + uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); + + recent_events_cleaner_tqid = taskq_dispatch_delay( + system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, + ddi_get_lbolt() + NSEC_TO_TICK(timeout)); +} + +/* + * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL + */ +void +zfs_ereport_clear(spa_t *spa, vdev_t *vd) +{ + uint64_t vdev_guid, pool_guid; + int cnt = 0; + + ASSERT(vd != NULL || spa != NULL); + if (vd == NULL) { + vdev_guid = 0; + pool_guid = spa_guid(spa); + } else { + vdev_guid = vd->vdev_guid; + pool_guid = 0; + } + + mutex_enter(&recent_events_lock); + + recent_events_node_t *next = list_head(&recent_events_list); + while (next != NULL) { + recent_events_node_t *entry = next; + + next = list_next(&recent_events_list, next); + + if (entry->re_vdev_guid == vdev_guid || + entry->re_pool_guid == pool_guid) { + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + cnt++; + } + } + + mutex_exit(&recent_events_lock); +} + +/* + * Check if an ereport would be a duplicate of one recently posted. 
+ * + * An ereport is considered a duplicate if the set of criteria in + * recent_events_node_t all match. + * + * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM + * are candidates for duplicate checking. + */ +static boolean_t +zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) +{ + recent_events_node_t search = {0}, *entry; + + if (vd == NULL || zio == NULL) + return (B_FALSE); + + if (zfs_zevent_retain_max == 0) + return (B_FALSE); + + if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) + search.re_subclass = ZSC_IO; + else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) + search.re_subclass = ZSC_DATA; + else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) + search.re_subclass = ZSC_CHECKSUM; + else + return (B_FALSE); + + search.re_pool_guid = spa_guid(spa); + search.re_vdev_guid = vd->vdev_guid; + search.re_io_error = zio->io_error; + search.re_io_priority = zio->io_priority; + /* if size is supplied use it over what's in zio */ + if (size) { + search.re_io_size = size; + search.re_io_offset = offset; + } else { + search.re_io_size = zio->io_size; + search.re_io_offset = zio->io_offset; + } + + /* grab optional logical zio criteria */ + if (zb != NULL) { + search.re_io_bookmark.zb_objset = zb->zb_objset; + search.re_io_bookmark.zb_object = zb->zb_object; + search.re_io_bookmark.zb_level = zb->zb_level; + search.re_io_bookmark.zb_blkid = zb->zb_blkid; + } + + uint64_t now = gethrtime(); + + mutex_enter(&recent_events_lock); + + /* check if we have seen this one recently */ + entry = avl_find(&recent_events_tree, &search, NULL); + if (entry != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + + /* + * There is still an active cleaner (since we're here). + * Reset the last seen time for this duplicate entry + * so that its lifespan gets extended. 
+ */ + list_remove(&recent_events_list, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + zfs_zevent_track_duplicate(); + mutex_exit(&recent_events_lock); + + return (age <= zfs_zevent_retain_expire_secs); + } + + if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { + /* recycle oldest node */ + entry = list_tail(&recent_events_list); + ASSERT(entry != NULL); + list_remove(&recent_events_list, entry); + avl_remove(&recent_events_tree, entry); + } else { + entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); + } + + /* record this as a recent ereport */ + *entry = search; + avl_add(&recent_events_tree, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + /* Start a cleaner if not already scheduled */ + if (recent_events_cleaner_tqid == 0) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); + return (B_FALSE); +} + void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { @@ -113,8 +395,8 @@ zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) } /* - * We want to rate limit ZIO delay and checksum events so as to not - * flood ZED when a disk is acting up. + * We want to rate limit ZIO delay, deadman, and checksum events so as to not + * flood zevent consumers when a disk is acting up. * * Returns 1 if we're ratelimiting, 0 if not. */ @@ -123,11 +405,13 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) { int rc = 0; /* - * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we + * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we * are. Invert it to get our return value. 
*/ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { rc = !zfs_ratelimit(&vd->vdev_delay_rl); + } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) { + rc = !zfs_ratelimit(&vd->vdev_deadman_rl); } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { rc = !zfs_ratelimit(&vd->vdev_checksum_rl); } @@ -153,9 +437,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) - return (B_FALSE); - if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); @@ -336,6 +617,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, + DATA_TYPE_UINT32, zio->io_priority, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a @@ -708,6 +991,12 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, } return (eip); } +#else +/*ARGSUSED*/ +void +zfs_ereport_clear(spa_t *spa, vdev_t *vd) +{ +} #endif /* @@ -788,24 +1077,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) } /* - * Return 0 if event was posted, EINVAL if there was a problem posting it or - * EBUSY if the event was rate limited. 
+ * Post an ereport for the given subclass + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t size) + const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (EINVAL); + + if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size)) + zb, zio, state, 0)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) @@ -817,24 +1116,36 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, return (rc); } -void +/* + * Prepare a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ +int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, - struct zio *zio, uint64_t offset, uint64_t length, void *arg, - zio_bad_cksum_t *info) + struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info) { zio_cksum_report_t *report; #ifdef _KERNEL + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return; + return (SET_ERROR(EBUSY)); #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); - if 
(zio->io_vsd != NULL) - zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); - else - zio_vsd_default_cksum_report(zio, report, arg); + zio_vsd_default_cksum_report(zio, report); /* copy the checksum failure information if it was provided */ if (info != NULL) { @@ -842,16 +1153,18 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, bcopy(info, report->zcr_ckinfo, sizeof (*info)); } - report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_align = + vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); report->zcr_length = length; #ifdef _KERNEL - zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); - return; + return (0); } #endif @@ -859,6 +1172,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); + return (0); } void @@ -901,7 +1215,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt) kmem_free(rpt, sizeof (*rpt)); } - +/* + * Post a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, @@ -913,8 +1235,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, nvlist_t *detector = NULL; zfs_ecksum_info_t *info; + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, 
zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return (EBUSY); + return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { @@ -1073,11 +1402,109 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) #endif } -#if defined(_KERNEL) +#ifdef _KERNEL +void +zfs_ereport_init(void) +{ + mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&recent_events_list, sizeof (recent_events_node_t), + offsetof(recent_events_node_t, re_list_link)); + avl_create(&recent_events_tree, recent_events_compare, + sizeof (recent_events_node_t), offsetof(recent_events_node_t, + re_tree_link)); +} + +/* + * This 'early' fini needs to run before zfs_fini() which on Linux waits + * for the system_delay_taskq to drain. + */ +void +zfs_ereport_taskq_fini(void) +{ + mutex_enter(&recent_events_lock); + if (recent_events_cleaner_tqid != 0) { + taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); + recent_events_cleaner_tqid = 0; + } + mutex_exit(&recent_events_lock); +} + +void +zfs_ereport_fini(void) +{ + recent_events_node_t *entry; + + while ((entry = list_head(&recent_events_list)) != NULL) { + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + avl_destroy(&recent_events_tree); + list_destroy(&recent_events_list); + mutex_destroy(&recent_events_lock); +} + +void +zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) +{ + nvlist_t *aux; + + aux = fm_nvlist_create(NULL); + nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); + + zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); + fm_nvlist_destroy(aux, FM_NVA_FREE); +} + +/* + * Post an event when a zvol is created or removed + * + * This is currently only used by macOS, since it uses the event to create + 
* symlinks between the volume name (mypool/myvol) and the actual /dev + * device (/dev/disk3). For example: + * + * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3 + * + * name: The full name of the zvol ("mypool/myvol") + * dev_name: The full /dev name for the zvol ("/dev/disk3") + * raw_name: The raw /dev name for the zvol ("/dev/rdisk3") + */ +void +zfs_ereport_zvol_post(const char *subclass, const char *name, + const char *dev_name, const char *raw_name) +{ + nvlist_t *aux; + char *r; + + boolean_t locked = mutex_owned(&spa_namespace_lock); + if (!locked) mutex_enter(&spa_namespace_lock); + spa_t *spa = spa_lookup(name); + if (!locked) mutex_exit(&spa_namespace_lock); + + if (spa == NULL) + return; + + aux = fm_nvlist_create(NULL); + nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); + nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, + raw_name); + r = strchr(name, '/'); + if (r && r[1]) + nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); + + zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); + fm_nvlist_destroy(aux, FM_NVA_FREE); +} + EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, + "Maximum recent zevents records to retain for duplicate checking"); +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, + "Expiration time for recent zevents records"); #endif /* _KERNEL */ diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index e57753593c..a90bf5feee 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #ifdef _KERNEL #include @@ -73,7 +72,7 @@ idx_compare(const void *arg1, const void *arg2) const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; const fuid_domain_t 
*node2 = (const fuid_domain_t *)arg2; - return (AVL_CMP(node1->f_idx, node2->f_idx)); + return (TREE_CMP(node1->f_idx, node2->f_idx)); } /* @@ -88,7 +87,7 @@ domain_compare(const void *arg1, const void *arg2) val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } void @@ -382,17 +381,40 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) void zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) { - *uidp = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), + *uidp = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOUID(zp)), cr, ZFS_OWNER); - *gidp = zfs_fuid_map_id(ZTOZSB(zp), KGID_TO_SGID(ZTOI(zp)->i_gid), + *gidp = zfs_fuid_map_id(ZTOZSB(zp), KGID_TO_SGID(ZTOGID(zp)), cr, ZFS_GROUP); } +#ifdef __FreeBSD__ +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + + if (index == 0) + return (fuid); + + return (UID_NOBODY); +} +#elif defined(__linux__) +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + /* + * The Linux port only supports POSIX IDs, use the passed id. + */ + return (fuid); +} + +#else uid_t zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, cred_t *cr, zfs_fuid_type_t type) { -#ifdef HAVE_KSID uint32_t index = FUID_INDEX(fuid); const char *domain; uid_t id; @@ -411,13 +433,8 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, FUID_RID(fuid), &id); } return (id); -#else - /* - * The Linux port only supports POSIX IDs, use the passed id. 
- */ - return (fuid); -#endif /* HAVE_KSID */ } +#endif /* * Add a FUID node to the list of fuid's being created for this @@ -560,9 +577,9 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, const char *domain; char *kdomain; uint32_t fuid_idx = FUID_INDEX(id); - uint32_t rid; + uint32_t rid = 0; idmap_stat status; - uint64_t idx = 0; + uint64_t idx = UID_NOBODY; zfs_fuid_t *zfuid = NULL; zfs_fuid_info_t *fuidp = NULL; @@ -711,10 +728,11 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp) boolean_t zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) { -#ifdef HAVE_KSID + uid_t gid; + +#ifdef illumos ksid_t *ksid = crgetsid(cr, KSID_GROUP); ksidlist_t *ksidlist = crgetsidlist(cr); - uid_t gid; if (ksid && ksidlist) { int i; @@ -747,15 +765,13 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) } } } +#endif /* illumos */ /* * Not found in ksidlist, check posix groups */ gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); return (groupmember(gid, cr)); -#else - return (B_TRUE); -#endif } void @@ -772,4 +788,24 @@ zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) FUID_SIZE_ESTIMATE(zfsvfs)); } } + +/* + * buf must be big enough (eg, 32 bytes) + */ +int +zfs_id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, size_t len, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if (domainid == -1) + return (SET_ERROR(ENOENT)); + } + fuid = FUID_ENCODE(domainid, rid); + (void) snprintf(buf, len, "%llx", (longlong_t)fuid); + return (0); +} #endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index f30d0a8944..96a021acbc 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -27,7 +27,7 @@ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. 
All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -37,6 +37,9 @@ * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ /* @@ -156,12 +159,13 @@ #include #include #include -#include +#include #include #include #include #include #include +#include #include #include #include @@ -176,13 +180,13 @@ #include #include #include +#include #include #include #include #include #include #include -#include #include #include #include @@ -191,9 +195,12 @@ #include #include #include +#include +#include #include #include +#include #include #include #include @@ -201,14 +208,10 @@ #include #include #include -#include #include #include #include -#include -#include - #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" @@ -216,70 +219,31 @@ #include #include - -/* - * Limit maximum nvlist size. We don't want users passing in insane values - * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. - */ -#define MAX_NVLIST_SRC_SIZE KMALLOC_MAX_SIZE +#include kmutex_t zfsdev_state_lock; zfsdev_state_t *zfsdev_state_list; -extern void zfs_init(void); -extern void zfs_fini(void); - -uint_t zfs_fsyncer_key; -extern uint_t rrw_tsd_key; -static uint_t zfs_allow_log_key; - -typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); -typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); -typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); +/* + * Limit maximum nvlist size. We don't want users passing in insane values + * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. 
+ * Defaults to 0=auto which is handled by platform code. + */ +unsigned long zfs_max_nvlist_src_size = 0; /* - * IOC Keys are used to document and validate user->kernel interface inputs. - * See zfs_keys_recv_new for an example declaration. Any key name that is not - * listed will be rejected as input. - * - * The keyname 'optional' is always allowed, and must be an nvlist if present. - * Arguments which older kernels can safely ignore can be placed under the - * "optional" key. - * - * When adding new keys to an existing ioc for new functionality, consider: - * - adding an entry into zfs_sysfs.c zfs_features[] list - * - updating the libzfs_input_check.c test utility - * - * Note: in the ZK_WILDCARDLIST case, the name serves as documentation - * for the expected name (bookmark, snapshot, property, etc) but there - * is no validation in the preflight zfs_check_input_nvpairs() check. + * When logging the output nvlist of an ioctl in the on-disk history, limit + * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. + * This applies primarily to zfs_ioc_channel_program(). */ -typedef enum { - ZK_OPTIONAL = 1 << 0, /* pair is optional */ - ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */ -} ioc_key_flag_t; +unsigned long zfs_history_output_max = 1024 * 1024; + +uint_t zfs_fsyncer_key; +uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. 
*/ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN -typedef struct zfs_ioc_key { - const char *zkey_name; - data_type_t zkey_type; - ioc_key_flag_t zkey_flags; -} zfs_ioc_key_t; - -typedef enum { - NO_NAME, - POOL_NAME, - DATASET_NAME -} zfs_ioc_namecheck_t; - -typedef enum { - POOL_CHECK_NONE = 1 << 0, - POOL_CHECK_SUSPENDED = 1 << 1, - POOL_CHECK_READONLY = 1 << 2, -} zfs_ioc_poolcheck_t; - typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; @@ -313,7 +277,7 @@ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); -static int zfs_check_clearable(char *dataset, nvlist_t *props, +static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); @@ -346,23 +310,6 @@ history_str_get(zfs_cmd_t *zc) return (buf); } -/* - * Check to see if the named dataset is currently defined as bootable - */ -static boolean_t -zfs_is_bootfs(const char *name) -{ - objset_t *os; - - if (dmu_objset_hold(name, FTAG, &os) == 0) { - boolean_t ret; - ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); - dmu_objset_rele(os, FTAG); - return (ret); - } - return (B_FALSE); -} - /* * Return non-zero if the spa version is less than requested version. 
*/ @@ -487,7 +434,8 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; - if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) + if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), + &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); @@ -498,7 +446,7 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; - if (dsl_prop_get_int_ds(ds, "zoned", &zoned)) + if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); @@ -557,7 +505,7 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) * Returns 0 for success, non-zero for access and other errors. */ static int -zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) +zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; @@ -612,7 +560,7 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; - static char *setsl_tag = "setsl_tag"; + static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, @@ -683,8 +631,8 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * limit on things *under* (ie. contained by) * the thing they own. 
*/ - if (dsl_prop_get_integer(dsname, "zoned", &zoned, - setpoint)) + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); @@ -739,7 +687,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; - char *cp; + const char *cp; int error; /* @@ -777,13 +725,13 @@ zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) ZFS_DELEG_PERM_SEND, cr)); } -int +static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } -int +static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); @@ -1042,14 +990,6 @@ zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ -static int -zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_REMAP, cr)); -} - /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) @@ -1133,7 +1073,7 @@ zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) * SYS_CONFIG privilege, which is not available in a local zone. 
*/ /* ARGSUSED */ -static int +int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) @@ -1440,10 +1380,7 @@ getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ - if (*zfvp == NULL || (*zfvp)->z_sb == NULL || - !atomic_inc_not_zero(&((*zfvp)->z_sb->s_active))) { - error = SET_ERROR(ESRCH); - } + error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } @@ -1477,15 +1414,17 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { - rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : - RW_READER, tag); + if (writer) + ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); + else + ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. 
*/ - rrm_exit(&(*zfvp)->z_teardown_lock, tag); + ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } @@ -1495,10 +1434,10 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { - rrm_exit(&zfsvfs->z_teardown_lock, tag); + ZFS_TEARDOWN_EXIT(zfsvfs, tag); - if (zfsvfs->z_sb) { - deactivate_super(zfsvfs->z_sb); + if (zfs_vfs_held(zfsvfs)) { + zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); @@ -1513,7 +1452,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; - char *spa_name = zc->zc_name; + const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, @@ -1994,8 +1933,9 @@ static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; - int replacing = zc->zc_cookie; nvlist_t *config; + int replacing = zc->zc_cookie; + int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) @@ -2003,7 +1943,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, + rebuild); nvlist_free(config); } @@ -2065,7 +2006,7 @@ static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_value; + const char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; @@ -2082,7 +2023,7 @@ static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; - char *fru = zc->zc_value; + const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; @@ -2111,7 +2052,7 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. 
So this is a bit of a workaround... - * XXX reading with out owning + * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { @@ -2345,7 +2286,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { - return (error == ENOENT ? ESRCH : error); + return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* @@ -2419,8 +2360,7 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; - const char *domain; - char *dash; + const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; @@ -2473,7 +2413,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; - char *strval = NULL; + const char *strval = NULL; int err = -1; if (prop == ZPROP_INVAL) { @@ -2534,6 +2474,15 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; + case ZFS_PROP_COMPRESSION: + err = dsl_dataset_set_compression(dsname, source, intval); + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. 
+ */ + if (err == 0) + err = -1; + break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; @@ -2557,7 +2506,8 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - (void) strcpy(zc->zc_name, dsname); + (void) strlcpy(zc->zc_name, dsname, + sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); @@ -2571,6 +2521,26 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, return (err); } +static boolean_t +zfs_is_namespace_prop(zfs_prop_t prop) +{ + switch (prop) { + + case ZFS_PROP_ATIME: + case ZFS_PROP_RELATIME: + case ZFS_PROP_DEVICES: + case ZFS_PROP_EXEC: + case ZFS_PROP_SETUID: + case ZFS_PROP_READONLY: + case ZFS_PROP_XATTR: + case ZFS_PROP_NBMAND: + return (B_TRUE); + + default: + return (B_FALSE); + } +} + /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error @@ -2589,7 +2559,8 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvpair_t *propval; int rv = 0; uint64_t intval; - char *strval; + const char *strval; + boolean_t should_update_mount_cache = B_FALSE; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); @@ -2644,7 +2615,8 @@ retry: case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) - err = SET_ERROR(EINVAL); + err = + SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, @@ -2686,6 +2658,9 @@ retry: fnvlist_add_int32(errlist, propname, err); rv = err; } + + if (zfs_is_namespace_prop(prop)) + should_update_mount_cache = B_TRUE; } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { @@ -2734,6 +2709,9 @@ retry: } } } + if (should_update_mount_cache) + zfs_ioctl_update_mount_cache(dsname); + nvlist_free(genericnvl); nvlist_free(retrynvl); @@ -2744,10 +2722,9 @@ retry: * Check that all 
the properties are valid user properties. */ static int -zfs_check_userprops(const char *fsname, nvlist_t *nvl) +zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; - int error = 0; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); @@ -2756,10 +2733,6 @@ zfs_check_userprops(const char *fsname, nvlist_t *nvl) nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); - if ((error = zfs_secpolicy_write_perms(fsname, - ZFS_DELEG_PERM_USERPROP, CRED()))) - return (error); - if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); @@ -3371,8 +3344,9 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) /* * Volumes will return EBUSY and cannot be destroyed - * until all asynchronous minor handling has completed. - * Wait for the spa_zvol_taskq to drain then retry. + * until all asynchronous minor handling (e.g. from + * setting the volmode property) has completed. Wait for + * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { @@ -3410,7 +3384,7 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; - char *origin_name; + const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); @@ -3444,11 +3418,8 @@ static const zfs_ioc_key_t zfs_keys_remap[] = { static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { - if (strchr(fsname, '@') || - strchr(fsname, '%')) - return (SET_ERROR(EINVAL)); - - return (dmu_objset_remap_indirects(fsname)); + /* This IOCTL is no longer supported. 
*/ + return (0); } /* @@ -3473,19 +3444,18 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); - if ((error = zfs_check_userprops(poolname, props)) != 0) - return (error); - if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); + if ((error = zfs_check_userprops(props)) != 0) + return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); - const char *cp = strchr(name, '@'); + char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must @@ -3502,6 +3472,18 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); + /* + * Check for permission to set the properties on the fs. + */ + if (!nvlist_empty(props)) { + *cp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED()); + *cp = '@'; + if (error != 0) + return (error); + } + /* This must be the only snap of this fs. 
*/ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { @@ -3528,10 +3510,10 @@ static const zfs_ioc_key_t zfs_keys_log_history[] = { static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { - char *message; + const char *message; + char *poolname; spa_t *spa; int error; - char *poolname; /* * The poolname in the ioctl is not set, we get it from the TSD, @@ -3545,7 +3527,7 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); - strfree(poolname); + kmem_strfree(poolname); if (error != 0) return (error); @@ -3561,6 +3543,56 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +/* + * This ioctl is used to set the bootenv configuration on the current + * pool. This configuration is stored in the second padding area of the label, + * and it is used by the bootloader(s) to store the bootloader and/or system + * specific data. + * The data is stored as nvlist data stream, and is protected by + * an embedded checksum. + * The version can have two possible values: + * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. + * VB_NVLIST: nvlist with arbitrary pairs. 
+ */ +static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { + {"version", DATA_TYPE_UINT64, 0}, + {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, +}; + +static int +zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error; + spa_t *spa; + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + +static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { + /* no nvl keys */ +}; + +static int +zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. 
@@ -3577,7 +3609,7 @@ zfs_unmount_snap(const char *snapname) if (strchr(snapname, '@') == NULL) return; - (void) zfsctl_snapshot_unmount((char *)snapname, MNT_FORCE); + (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } /* ARGSUSED */ @@ -3624,34 +3656,53 @@ zfs_destroy_unmount_origin(const char *fsname) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, - {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { + int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; + spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); + poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { + const char *name = nvpair_name(pair); + + /* + * The snap must be in the specified pool to prevent the + * invalid removal of zvol minors below. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '@')) + return (SET_ERROR(EXDEV)); + zfs_unmount_snap(nvpair_name(pair)); + if (spa_open(name, &spa, FTAG) == 0) { + zvol_remove_minors(spa, name, B_TRUE); + spa_close(spa, FTAG); + } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* - * Create bookmarks. Bookmark names are of the form #. - * All bookmarks must be in the same pool. + * Create bookmarks. The bookmark names are of the form #. + * All bookmarks and snapshots must be in the same pool. + * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. 
* * innvl: { - * bookmark1 -> snapshot1, bookmark2 -> snapshot2 + * new_bookmark1 -> existing_snapshot, + * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) @@ -3665,25 +3716,6 @@ static const zfs_ioc_key_t zfs_keys_bookmark[] = { static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *snap_name; - - /* - * Verify the snapshot argument. - */ - if (nvpair_value_string(pair, &snap_name) != 0) - return (SET_ERROR(EINVAL)); - - - /* Verify that the keys (bookmarks) are unique */ - for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); - pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { - if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) - return (SET_ERROR(EINVAL)); - } - } - return (dsl_bookmark_create(innvl, outnvl)); } @@ -3708,6 +3740,37 @@ zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (dsl_get_bookmarks(fsname, innvl, outnvl)); } +/* + * innvl is not used. + * + * outnvl: { + * property 1, property 2, ... 
+ * } + * + */ +static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, + nvlist_t *outnvl) +{ + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *bmname; + + bmname = strchr(bookmark, '#'); + if (bmname == NULL) + return (SET_ERROR(EINVAL)); + bmname++; + + (void) strlcpy(fsname, bookmark, sizeof (fsname)); + *(strchr(fsname, '#')) = '\0'; + + return (dsl_get_bookmark_props(fsname, bmname, outnvl)); +} + /* * innvl: { * bookmark name 1, bookmark name 2 @@ -3782,9 +3845,9 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) - return (EINVAL); + return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) - return (EINVAL); + return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); @@ -3952,7 +4015,7 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) fnvlist_free(vdev_errlist); spa_close(spa, FTAG); - return (total_errors > 0 ? EINVAL : 0); + return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* @@ -4037,7 +4100,134 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) fnvlist_free(vdev_errlist); spa_close(spa, FTAG); - return (total_errors > 0 ? EINVAL : 0); + return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); +} + +/* + * This ioctl waits for activity of a particular type to complete. If there is + * no activity of that type in progress, it returns immediately, and the + * returned value "waited" is false. If there is activity in progress, and no + * tag is passed in, the ioctl blocks until all activity of that type is + * complete, and then returns with "waited" set to true. + * + * If a tag is provided, it identifies a particular instance of an activity to + * wait for. 
Currently, this is only valid for use with 'initialize', because + * that is the only activity for which there can be multiple instances running + * concurrently. In the case of 'initialize', the tag corresponds to the guid of + * the vdev on which to wait. + * + * If a thread waiting in the ioctl receives a signal, the call will return + * immediately, and the return value will be EINTR. + * + * innvl: { + * "wait_activity" -> int32_t + * (optional) "wait_tag" -> uint64_t + * } + * + * outnvl: "waited" -> boolean_t + */ +static const zfs_ioc_key_t zfs_keys_pool_wait[] = { + {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, + {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t activity; + uint64_t tag; + boolean_t waited; + int error; + + if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) + return (EINVAL); + + if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) + error = spa_wait_tag(name, activity, tag, &waited); + else + error = spa_wait(name, activity, &waited); + + if (error == 0) + fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); + + return (error); +} + +/* + * This ioctl waits for activity of a particular type to complete. If there is + * no activity of that type in progress, it returns immediately, and the + * returned value "waited" is false. If there is activity in progress, and no + * tag is passed in, the ioctl blocks until all activity of that type is + * complete, and then returns with "waited" set to true. + * + * If a thread waiting in the ioctl receives a signal, the call will return + * immediately, and the return value will be EINTR. 
+ * + * innvl: { + * "wait_activity" -> int32_t + * } + * + * outnvl: "waited" -> boolean_t + */ +static const zfs_ioc_key_t zfs_keys_fs_wait[] = { + {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, +}; + +static int +zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t activity; + boolean_t waited = B_FALSE; + int error; + dsl_pool_t *dp; + dsl_dir_t *dd; + dsl_dataset_t *ds; + + if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) + return (SET_ERROR(EINVAL)); + + if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) + return (SET_ERROR(EINVAL)); + + if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) + return (error); + + if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + dd = ds->ds_dir; + mutex_enter(&dd->dd_activity_lock); + dd->dd_activity_waiters++; + + /* + * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t + * aren't evicted while we're waiting. Normally this is prevented by + * holding the pool, but we can't do that while we're waiting since + * that would prevent TXGs from syncing out. Some of the functionality + * of long-holds (e.g. preventing deletion) is unnecessary for this + * case, since we would cancel the waiters before proceeding with a + * deletion. An alternative mechanism for keeping the dataset around + * could be developed but this is simpler. 
+ */ + dsl_dataset_long_hold(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + error = dsl_dir_wait(dd, ds, activity, &waited); + + dsl_dataset_long_rele(ds, FTAG); + dd->dd_activity_waiters--; + if (dd->dd_activity_waiters == 0) + cv_signal(&dd->dd_activity_cv); + mutex_exit(&dd->dd_activity_lock); + + dsl_dataset_rele(ds, FTAG); + + if (error == 0) + fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); + + return (error); } /* @@ -4057,7 +4247,7 @@ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; - zvol_state_t *zv; + zvol_state_handle_t *zv; char *target = NULL; int error; @@ -4087,7 +4277,7 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } - deactivate_super(zfsvfs->z_sb); + zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); @@ -4106,11 +4296,45 @@ recursive_unmount(const char *fsname, void *arg) fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); - strfree(fullname); + kmem_strfree(fullname); return (0); } +/* + * + * snapname is the snapshot to redact. 
+ * innvl: { + * "bookname" -> (string) + * shortname of the redaction bookmark to generate + * "snapnv" -> (nvlist, values ignored) + * snapshots to redact snapname with respect to + * } + * + * outnvl is unused + */ + +/* ARGSUSED */ +static const zfs_ioc_key_t zfs_keys_redact[] = { + {"bookname", DATA_TYPE_STRING, 0}, + {"snapnv", DATA_TYPE_NVLIST, 0}, +}; +static int +zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) +{ + nvlist_t *redactnvl = NULL; + char *redactbook = NULL; + + if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) + return (SET_ERROR(EINVAL)); + if (fnvlist_num_pairs(redactnvl) == 0) + return (SET_ERROR(ENXIO)); + if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) + return (SET_ERROR(EINVAL)); + + return (dmu_redact_snap(snapname, redactnvl, redactbook)); +} + /* * inputs: * zc_name old name of dataset @@ -4125,6 +4349,7 @@ zfs_ioc_rename(zfs_cmd_t *zc) objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; + boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; @@ -4150,7 +4375,7 @@ zfs_ioc_rename(zfs_cmd_t *zc) if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; - if (ost == DMU_OST_ZFS) { + if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); @@ -4175,7 +4400,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); - uint64_t intval; + uint64_t intval, compval; int err; if (prop == ZPROP_INVAL) { @@ -4257,19 +4482,20 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) * we'll catch them later. 
*/ if (nvpair_value_uint64(pair, &intval) == 0) { - if (intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9 && + compval = ZIO_COMPRESS_ALGO(intval); + if (compval >= ZIO_COMPRESS_GZIP_1 && + compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } - if (intval == ZIO_COMPRESS_ZLE && + if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); - if (intval == ZIO_COMPRESS_LZ4) { + if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) @@ -4283,16 +4509,18 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) spa_close(spa, FTAG); } - /* - * If this is a bootable dataset then - * verify that the compression algorithm - * is supported for booting. We must return - * something other than ENOTSUP since it - * implies a downrev pool version. - */ - if (zfs_is_bootfs(dsname) && - !BOOTFS_COMPRESS_VALID(intval)) { - return (SET_ERROR(ERANGE)); + if (compval == ZIO_COMPRESS_ZSTD) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_ZSTD_COMPRESS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); } } break; @@ -4335,16 +4563,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; - /* - * If this is a bootable dataset then - * we don't allow large (>512B) dnodes, - * because GRUB doesn't support them. - */ - if (zfs_is_bootfs(dsname) && - intval != ZFS_DNSIZE_LEGACY) { - return (SET_ERROR(EDOM)); - } - if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); @@ -4434,7 +4652,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) * pointed at by errlist is NULL. 
*/ static int -zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) +zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; @@ -4602,7 +4820,16 @@ extract_delay_props(nvlist_t *props) return (delayprops); } -#ifdef DEBUG +static void +zfs_allow_log_destroy(void *arg) +{ + char *poolname = arg; + + if (poolname != NULL) + kmem_strfree(poolname); +} + +#ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif @@ -4613,33 +4840,37 @@ static boolean_t zfs_ioc_recv_inject_err; static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, - boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, - int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, - uint64_t *action_handle, nvlist_t **errors) + boolean_t resumable, int input_fd, + dmu_replay_record_t *begin_record, uint64_t *read_bytes, + uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; - offset_t off; + offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; - file_t *input_fp; + boolean_t tofs_was_redacted; + zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); + off = 0; - input_fp = getf(input_fd); - if (input_fp == NULL) + if ((input_fp = zfs_file_get(input_fd)) == NULL) return (SET_ERROR(EBADF)); + noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, - resumable, localprops, hidden_args, origin, &drc); + resumable, localprops, hidden_args, origin, &drc, input_fp, + &off); if (error != 0) goto out; + tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they 
are applied @@ -4740,18 +4971,19 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_free(xprops); } - off = input_fp->f_offset; - error = dmu_recv_stream(&drc, input_fp->f_vnode, &off, cleanup_fd, - action_handle); + error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; - zvol_state_t *zv = NULL; + zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; + boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( + begin_record->drr_u.drr_begin. + drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); @@ -4760,10 +4992,19 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); - if (error == 0) + /* + * If the dataset was not redacted, but we received a + * redacted stream onto it, we need to unmount the + * dataset. Otherwise, resume the filesystem. + */ + if (error == 0 && !drc.drc_newfs && + stream_is_redacted && !tofs_was_redacted) { + error = zfs_end_fs(zfsvfs, ds); + } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); + } error = error ? 
error : end_err; - deactivate_super(zfsvfs->z_sb); + zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); @@ -4799,12 +5040,9 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } + *read_bytes = off - noff; - *read_bytes = off - input_fp->f_offset; - if (VOP_SEEK(input_fp->f_vnode, input_fp->f_offset, &off, NULL) == 0) - input_fp->f_offset = off; - -#ifdef DEBUG +#ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; @@ -4904,7 +5142,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_free(inheritprops); } out: - releasef(input_fd); + zfs_file_put(input_fp); nvlist_free(origrecvd); nvlist_free(origprops); @@ -4925,13 +5163,10 @@ out: * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag - * zc_cleanup_fd cleanup-on-exit file descriptor - * zc_action_handle handle for this guid/ds mapping (or zero on first call) * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t - * zc_action_handle handle for this guid/ds mapping * zc_nvlist_dst{_size} error for each unapplied received property */ static int @@ -4974,8 +5209,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, - zc->zc_cleanup_fd, &zc->zc_cookie, &zc->zc_obj, - &zc->zc_action_handle, &errors); + &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); @@ -5008,15 +5242,14 @@ zfs_ioc_recv(zfs_cmd_t *zc) * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "resumable" -> resumable flag (value ignored) - * (optional) "cleanup_fd" -> 
cleanup-on-exit file descriptor - * (optional) "action_handle" -> handle for this guid/ds mapping + * (optional) "cleanup_fd" -> unused + * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t - * "action_handle" -> handle for this guid/ds mapping * "errors" -> error for each unapplied received property (nvlist) * } */ @@ -5049,11 +5282,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t resumable; - uint64_t action_handle = 0; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; - int cleanup_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); @@ -5063,7 +5294,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) strchr(snapname, '%')) return (SET_ERROR(EINVAL)); - (void) strcpy(tofs, snapname); + (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; @@ -5081,14 +5312,6 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) force = nvlist_exists(innvl, "force"); resumable = nvlist_exists(innvl, "resumable"); - error = nvlist_lookup_int32(innvl, "cleanup_fd", &cleanup_fd); - if (error && error != ENOENT) - return (error); - - error = nvlist_lookup_uint64(innvl, "action_handle", &action_handle); - if (error && error != ENOENT) - return (error); - /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) @@ -5103,12 +5326,11 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, - hidden_args, force, resumable, input_fd, begin_record, cleanup_fd, - &read_bytes, &errflags, &action_handle, &errors); + hidden_args, force, resumable, input_fd, 
begin_record, + &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); - fnvlist_add_uint64(outnvl, "action_handle", action_handle); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); @@ -5118,6 +5340,51 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +typedef struct dump_bytes_io { + zfs_file_t *dbi_fp; + caddr_t dbi_buf; + int dbi_len; + int dbi_err; +} dump_bytes_io_t; + +static void +dump_bytes_cb(void *arg) +{ + dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; + zfs_file_t *fp; + caddr_t buf; + + fp = dbi->dbi_fp; + buf = dbi->dbi_buf; + + dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); +} + +static int +dump_bytes(objset_t *os, void *buf, int len, void *arg) +{ + dump_bytes_io_t dbi; + + dbi.dbi_fp = arg; + dbi.dbi_buf = buf; + dbi.dbi_len = len; + +#if defined(HAVE_LARGE_STACKS) + dump_bytes_cb(&dbi); +#else + /* + * The vn_rdwr() call is performed in a taskq to ensure that there is + * always enough stack space to write safely to the target filesystem. + * The ZIO_TYPE_FREE threads are used because there can be a lot of + * them and they are used in vdev_file.c for a similar purpose. 
+ */ + spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, + ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); +#endif /* HAVE_LARGE_STACKS */ + + return (dbi.dbi_err); +} + /* * inputs: * zc_name name of snapshot to send @@ -5145,6 +5412,7 @@ zfs_ioc_send(zfs_cmd_t *zc) boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); + boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; @@ -5193,44 +5461,48 @@ zfs_ioc_send(zfs_cmd_t *zc) } } - error = dmu_send_estimate(tosnap, fromsnap, compressok || rawok, - &zc->zc_objset_type); + error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, + compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { - file_t *fp = getf(zc->zc_cookie); - if (fp == NULL) + zfs_file_t *fp; + dmu_send_outparams_t out = {0}; + + if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); - off = fp->f_offset; + off = zfs_file_off(fp); + out.dso_outfunc = dump_bytes; + out.dso_arg = fp; + out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, - zc->zc_cookie, fp->f_vnode, &off); + zc->zc_fromobj, embedok, large_block_ok, compressok, + rawok, savedok, zc->zc_cookie, &off, &out); - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; - releasef(zc->zc_cookie); + zfs_file_put(fp); } return (error); } /* * inputs: - * zc_name name of snapshot on which to report progress - * zc_cookie file descriptor of send stream + * zc_name name of snapshot on which to report progress + * zc_cookie file descriptor of send stream * * outputs: - * zc_cookie number of bytes written in send stream thus far + * zc_cookie number of bytes written in send stream thus far + * zc_objset_type logical size of data traversed by send 
thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; - dmu_sendarg_t *dsp = NULL; + dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); @@ -5254,15 +5526,19 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { - if (dsp->dsa_outfd == zc->zc_cookie && - dsp->dsa_proc->group_leader == curproc->group_leader) + if (dsp->dss_outfd == zc->zc_cookie && + zfs_proc_is_caller(dsp->dss_proc)) break; } - if (dsp != NULL) - zc->zc_cookie = *(dsp->dsa_off); - else + if (dsp != NULL) { + zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, + 0, 0); + /* This is the closest thing we have to atomic_read_64. */ + zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); + } else { error = SET_ERROR(ENOENT); + } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); @@ -5390,9 +5666,10 @@ zfs_ioc_clear(zfs_cmd_t *zc) } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { - (void) spa_vdev_state_exit(spa, NULL, ENODEV); + error = SET_ERROR(ENODEV); + (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); - return (SET_ERROR(ENODEV)); + return (error); } } @@ -5423,7 +5700,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { - {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, 0}, + {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; /* ARGSUSED */ @@ -5432,11 +5709,13 @@ zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; - boolean_t scrub_restart = B_TRUE; + boolean_t rc, scrub_restart = B_TRUE; if (innvl) { - scrub_restart = fnvlist_lookup_boolean_value(innvl, - "scrub_restart"); + error = nvlist_lookup_boolean_value(innvl, + "scrub_restart", &rc); + if (error == 0) + scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); @@ -5606,7 +5885,6 @@ 
zfs_ioc_userspace_many(zfs_cmd_t *zc) static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { - objset_t *os; int error = 0; zfsvfs_t *zfsvfs; @@ -5627,19 +5905,54 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) error = zfs_resume_fs(zfsvfs, newds); } } - if (error == 0) - error = dmu_objset_userspace_upgrade(zfsvfs->z_os); - deactivate_super(zfsvfs->z_sb); + if (error == 0) { + mutex_enter(&zfsvfs->z_os->os_upgrade_lock); + if (zfsvfs->z_os->os_upgrade_id == 0) { + /* clear potential error code and retry */ + zfsvfs->z_os->os_upgrade_status = 0; + mutex_exit(&zfsvfs->z_os->os_upgrade_lock); + + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_userspace_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } else { + mutex_exit(&zfsvfs->z_os->os_upgrade_lock); + } + + taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, + zfsvfs->z_os->os_upgrade_id); + error = zfsvfs->z_os->os_upgrade_status; + } + zfs_vfs_rele(zfsvfs); } else { + objset_t *os; + /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); - error = dmu_objset_userspace_upgrade(os); - dmu_objset_rele_flags(os, B_TRUE, FTAG); - } + mutex_enter(&os->os_upgrade_lock); + if (os->os_upgrade_id == 0) { + /* clear potential error code and retry */ + os->os_upgrade_status = 0; + mutex_exit(&os->os_upgrade_lock); + dmu_objset_userspace_upgrade(os); + } else { + mutex_exit(&os->os_upgrade_lock); + } + + dsl_pool_rele(dmu_objset_pool(os), FTAG); + + taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); + error = os->os_upgrade_status; + + dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, + FTAG); + } return (error); } @@ -5734,25 +6047,24 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; - int error; minor_t minor; - error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); - if (error != 0) - return (error); + zfs_file_t 
*fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (fp == NULL) + return (SET_ERROR(EBADF)); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); - error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, + int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); - strfree(snap_name); - strfree(hold_name); - zfs_onexit_fd_rele(zc->zc_cleanup_fd); + kmem_strfree(snap_name); + kmem_strfree(hold_name); + zfs_onexit_fd_rele(fp); return (error); } @@ -5768,21 +6080,17 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) static int zfs_ioc_diff(zfs_cmd_t *zc) { - file_t *fp; + zfs_file_t *fp; offset_t off; int error; - fp = getf(zc->zc_cookie); - if (fp == NULL) + if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); - off = fp->f_offset; + off = zfs_file_off(fp); + error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); - error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); - - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; - releasef(zc->zc_cookie); + zfs_file_put(fp); return (error); } @@ -5818,6 +6126,7 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) int cleanup_fd = -1; int error; minor_t minor = 0; + zfs_file_t *fp = NULL; holds = fnvlist_lookup_nvlist(args, "holds"); @@ -5835,15 +6144,17 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { - error = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (error != 0) - return (error); + fp = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (fp == NULL) + return (SET_ERROR(EBADF)); } error = dsl_dataset_user_hold(holds, minor, errlist); - if (minor != 0) - zfs_onexit_fd_rele(cleanup_fd); - return (error); + if (fp != NULL) { + ASSERT3U(minor, !=, 0); + zfs_onexit_fd_rele(fp); + } + 
return (SET_ERROR(error)); } /* @@ -5905,9 +6216,9 @@ zfs_ioc_events_next(zfs_cmd_t *zc) uint64_t dropped = 0; int error; - error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); - if (error != 0) - return (error); + zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); + if (fp == NULL) + return (SET_ERROR(EBADF)); do { error = zfs_zevent_next(ze, &event, @@ -5929,7 +6240,7 @@ zfs_ioc_events_next(zfs_cmd_t *zc) break; } while (1); - zfs_zevent_fd_rele(zc->zc_cleanup_fd); + zfs_zevent_fd_rele(fp); return (error); } @@ -5961,20 +6272,20 @@ zfs_ioc_events_seek(zfs_cmd_t *zc) minor_t minor; int error; - error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); - if (error != 0) - return (error); + zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); + if (fp == NULL) + return (SET_ERROR(EBADF)); error = zfs_zevent_seek(ze, zc->zc_guid); - zfs_zevent_fd_rele(zc->zc_cleanup_fd); + zfs_zevent_fd_rele(fp); return (error); } /* * inputs: - * zc_name name of new filesystem or snapshot - * zc_value full name of old snapshot + * zc_name name of later filesystem or snapshot + * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes @@ -5986,7 +6297,7 @@ zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; - dsl_dataset_t *new, *old; + dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) @@ -5996,16 +6307,26 @@ zfs_ioc_space_written(zfs_cmd_t *zc) dsl_pool_rele(dp, FTAG); return (error); } - error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } + if (strchr(zc->zc_value, '#') != NULL) { + zfs_bookmark_phys_t bmp; + error = dsl_bookmark_lookup(dp, zc->zc_value, + new, &bmp); + if (error == 0) { + error = dsl_dataset_space_written_bookmark(&bmp, new, + &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + } + } else { + dsl_dataset_t *old; + error = 
dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - error = dsl_dataset_space_written(old, new, &zc->zc_cookie, - &zc->zc_objset_type, &zc->zc_perm_action); - dsl_dataset_rele(old, FTAG); + if (error == 0) { + error = dsl_dataset_space_written(old, new, + &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + } + } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); @@ -6083,8 +6404,13 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. + * (optional) "savedok" -> (value ignored) + * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. + * (optional) "redactbook" -> (string) + * if present, use this bookmark's redaction list to generate a redacted + * send stream * } * * outnvl is unused @@ -6096,8 +6422,10 @@ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ @@ -6108,13 +6436,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) offset_t off; char *fromname = NULL; int fd; - file_t *fp; + zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; + boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; + char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); @@ -6124,24 +6454,39 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) 
embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); + savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); - if ((fp = getf(fd)) == NULL) + (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); + + if ((fp = zfs_file_get(fd)) == NULL) return (SET_ERROR(EBADF)); - off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, - rawok, fd, resumeobj, resumeoff, fp->f_vnode, &off); + off = zfs_file_off(fp); - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; + dmu_send_outparams_t out = {0}; + out.dso_outfunc = dump_bytes; + out.dso_arg = fp; + out.dso_dryrun = B_FALSE; + error = dmu_send(snapname, fromname, embedok, largeblockok, + compressok, rawok, savedok, resumeobj, resumeoff, + redactbook, fd, &off, &out); - releasef(fd); + zfs_file_put(fp); return (error); } +/* ARGSUSED */ +static int +send_space_sum(objset_t *os, void *buf, int len, void *arg) +{ + uint64_t *size = arg; + *size += len; + return (0); +} + /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). @@ -6155,8 +6500,12 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted - * (optional) "rawok" -> (value ignored) + * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. + * (optional) "resume_object" and "resume_offset" -> (uint64) + * if present, resume send stream from specified object and offset. 
+ * (optional) "fd" -> file descriptor to use as a cookie for progress + * tracking (int32) * } * * outnvl: { @@ -6170,6 +6519,11 @@ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, + {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int @@ -6177,11 +6531,22 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; + dsl_dataset_t *fromsnap = NULL; int error; - char *fromname; + char *fromname = NULL; + char *redactlist_book = NULL; + boolean_t largeblockok; + boolean_t embedok; boolean_t compressok; boolean_t rawok; - uint64_t space; + boolean_t savedok; + uint64_t space = 0; + boolean_t full_estimate = B_FALSE; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; + uint64_t resume_bytes = 0; + int32_t fd = -1; + zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) @@ -6192,61 +6557,102 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) dsl_pool_rele(dp, FTAG); return (error); } + (void) nvlist_lookup_int32(innvl, "fd", &fd); + largeblockok = nvlist_exists(innvl, "largeblockok"); + embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); + savedok = nvlist_exists(innvl, "savedok"); + boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); + boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", + &redactlist_book) == 0); + + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); 
+ + if (altbook) { + full_estimate = B_TRUE; + } else if (from) { + if (strchr(fromname, '#')) { + error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); - error = nvlist_lookup_string(innvl, "from", &fromname); - if (error == 0) { - if (strchr(fromname, '@') != NULL) { /* - * If from is a snapshot, hold it and use the more - * efficient dmu_send_estimate to estimate send space - * size using deadlists. + * dsl_bookmark_lookup() will fail with EXDEV if + * the from-bookmark and tosnap are at the same txg. + * However, it's valid to do a send (and therefore, + * a send estimate) from and to the same time point, + * if the bookmark is redacted (the incremental send + * can change what's redacted on the target). In + * this case, dsl_bookmark_lookup() fills in zbm + * but returns EXDEV. Ignore this error. */ - dsl_dataset_t *fromsnap; + if (error == EXDEV && zbm.zbm_redaction_obj != 0 && + zbm.zbm_guid == + dsl_dataset_phys(tosnap)->ds_guid) + error = 0; + + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & + ZBM_FLAG_HAS_FBN)) { + full_estimate = B_TRUE; + } + } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); - if (error != 0) - goto out; - error = dmu_send_estimate(tosnap, fromsnap, - compressok || rawok, &space); - dsl_dataset_rele(fromsnap, FTAG); - } else if (strchr(fromname, '#') != NULL) { - /* - * If from is a bookmark, fetch the creation TXG of the - * snapshot it was created from and use that to find - * blocks that were born after it. 
- */ - zfs_bookmark_phys_t frombm; + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } - error = dsl_bookmark_lookup(dp, fromname, tosnap, - &frombm); - if (error != 0) - goto out; - error = dmu_send_estimate_from_txg(tosnap, - frombm.zbm_creation_txg, compressok || rawok, - &space); + if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { + full_estimate = B_TRUE; + dsl_dataset_rele(fromsnap, FTAG); + } } else { /* * from is not properly formatted as a snapshot or * bookmark */ - error = SET_ERROR(EINVAL); - goto out; + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(EINVAL)); } - } else { + } + + if (full_estimate) { + dmu_send_outparams_t out = {0}; + offset_t off = 0; + out.dso_outfunc = send_space_sum; + out.dso_arg = &space; + out.dso_dryrun = B_TRUE; /* - * If estimating the size of a full send, use dmu_send_estimate. + * We have to release these holds so dmu_send can take them. It + * will do all the error checking we need. */ - error = dmu_send_estimate(tosnap, NULL, compressok || rawok, - &space); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + error = dmu_send(snapname, fromname, embedok, largeblockok, + compressok, rawok, savedok, resumeobj, resumeoff, + redactlist_book, fd, &off, &out); + } else { + error = dmu_send_estimate_fast(tosnap, fromsnap, + (from && strchr(fromname, '#') != NULL ? 
&zbm : NULL), + compressok || rawok, savedok, &space); + space -= resume_bytes; + if (fromsnap != NULL) + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); -out: - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); return (error); } @@ -6273,14 +6679,17 @@ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { int err; - boolean_t force = B_FALSE; + boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); - if (innvl) - force = fnvlist_lookup_boolean_value(innvl, "force"); + if (innvl) { + err = nvlist_lookup_boolean_value(innvl, "force", &rc); + if (err == 0) + force = rc; + } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); @@ -6291,7 +6700,7 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) spa_close(spa, FTAG); - return (err); + return (0); } /* @@ -6448,7 +6857,7 @@ zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, * See the block comment at the beginning of this file for details on * each argument to this function. 
*/ -static void +void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, @@ -6484,7 +6893,7 @@ zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, POOL_NAME, log_history, pool_check); } -static void +void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { @@ -6569,7 +6978,7 @@ zfs_ioctl_init(void) zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, - zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, + zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); @@ -6607,6 +7016,11 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); + zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, + zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, + ARRAY_SIZE(zfs_keys_get_bookmark_props)); + zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, @@ -6646,6 +7060,11 @@ zfs_ioctl_init(void) B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); + zfs_ioctl_register("redact", ZFS_IOC_REDACT, + zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); + zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, @@ -6668,6 +7087,26 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, 
B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); + zfs_ioctl_register("wait", ZFS_IOC_WAIT, + zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); + + zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, + zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); + + zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, + zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); + + zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, + zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, + zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, @@ -6799,15 +7238,14 @@ zfs_ioctl_init(void) zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); - /* - * ZoL functions - */ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); + + zfs_ioctl_init_os(); } /* @@ -6871,7 +7309,7 @@ zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { - /* at least one non-optionial key is expected here */ + /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; @@ -6884,14 +7322,15 
@@ zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) return (0); } -int +static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; - ASSERT(type == POOL_NAME || type == DATASET_NAME); + ASSERT(type == POOL_NAME || type == DATASET_NAME || + type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); @@ -6907,47 +7346,14 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type, return (error); } -static void * -zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) -{ - zfsdev_state_t *zs; - - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { - if (zs->zs_minor == minor) { - smp_rmb(); - switch (which) { - case ZST_ONEXIT: - return (zs->zs_onexit); - case ZST_ZEVENT: - return (zs->zs_zevent); - case ZST_ALL: - return (zs); - } - } - } - - return (NULL); -} - -void * -zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) -{ - void *ptr; - - ptr = zfsdev_get_state_impl(minor, which); - - return (ptr); -} - int -zfsdev_getminor(struct file *filp, minor_t *minorp) +zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) { zfsdev_state_t *zs, *fpd; - ASSERT(filp != NULL); ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); - fpd = filp->private_data; + fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); @@ -6970,11 +7376,33 @@ zfsdev_getminor(struct file *filp, minor_t *minorp) return (SET_ERROR(EBADF)); } +void * +zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) +{ + zfsdev_state_t *zs; + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs->zs_minor == minor) { + smp_rmb(); + switch (which) { + case ZST_ONEXIT: + return (zs->zs_onexit); + case ZST_ZEVENT: + return (zs->zs_zevent); + case ZST_ALL: + return (zs); + } + } + } + + return (NULL); +} + /* * Find a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. 
*/ -minor_t +static minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; @@ -6985,7 +7413,7 @@ zfsdev_minor_alloc(void) for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; - if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) { + if (zfsdev_get_state(m, ZST_ALL) == NULL) { last_minor = m; return (m); } @@ -6994,8 +7422,8 @@ zfsdev_minor_alloc(void) return (0); } -static int -zfsdev_state_init(struct file *filp) +int +zfsdev_state_init(void *priv) { zfsdev_state_t *zs, *zsprev = NULL; minor_t minor; @@ -7018,16 +7446,14 @@ zfsdev_state_init(struct file *filp) newzs = B_TRUE; } - zs->zs_file = filp; - filp->private_data = zs; + zfsdev_private_set_state(priv, zs); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); - /* * In order to provide for lock-free concurrent read access - * to the minor list in zfsdev_get_state_impl(), new entries + * to the minor list in zfsdev_get_state(), new entries * must be completely written before linking them into the * list whereas existing entries are already linked; the last * operation must be updating zs_minor (from -1 to the new @@ -7035,70 +7461,57 @@ zfsdev_state_init(struct file *filp) */ if (newzs) { zs->zs_minor = minor; - smp_wmb(); + membar_producer(); zsprev->zs_next = zs; } else { - smp_wmb(); + membar_producer(); zs->zs_minor = minor; } return (0); } -static int -zfsdev_state_destroy(struct file *filp) +void +zfsdev_state_destroy(void *priv) { - zfsdev_state_t *zs; + zfsdev_state_t *zs = zfsdev_private_get_state(priv); - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - ASSERT(filp->private_data != NULL); + ASSERT(zs != NULL); + ASSERT3S(zs->zs_minor, >, 0); - zs = filp->private_data; - zs->zs_minor = -1; + /* + * The last reference to this zfsdev file descriptor is being dropped. 
+ * We don't have to worry about lookup grabbing this state object, and + * zfsdev_state_init() will not try to reuse this object until it is + * invalidated by setting zs_minor to -1. Invalidation must be done + * last, with a memory barrier to ensure ordering. This lets us avoid + * taking the global zfsdev state lock around destruction. + */ zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); - - return (0); + zs->zs_onexit = NULL; + zs->zs_zevent = NULL; + membar_producer(); + zs->zs_minor = -1; } -static int -zfsdev_open(struct inode *ino, struct file *filp) +long +zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { - int error; - - mutex_enter(&zfsdev_state_lock); - error = zfsdev_state_init(filp); - mutex_exit(&zfsdev_state_lock); - - return (-error); -} - -static int -zfsdev_release(struct inode *ino, struct file *filp) -{ - int error; - - mutex_enter(&zfsdev_state_lock); - error = zfsdev_state_destroy(filp); - mutex_exit(&zfsdev_state_lock); - - return (-error); -} - -static long -zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) -{ - zfs_cmd_t *zc; - uint_t vecnum; - int error, rc, flag = 0; + int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; + uint64_t max_nvlist_src_size; + size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; + hrtime_t start_time = gethrtime(); - vecnum = cmd - ZFS_IOC_FIRST; + cmd = vecnum; + error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (-SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); + return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); + vec = &zfs_ioc_vec[vecnum]; /* @@ -7106,18 +7519,11 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) * a normal or legacy handler are registered. 
*/ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) - return (-SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); - - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - - error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); - if (error != 0) { - error = SET_ERROR(EFAULT); - goto out; - } + return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; - if (zc->zc_nvlist_src_size > MAX_NVLIST_SRC_SIZE) { + max_nvlist_src_size = zfs_max_nvlist_src_size_os(); + if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end @@ -7162,10 +7568,18 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) vec->zvec_namecheck, vec->zvec_pool_check); break; + case ENTITY_NAME: + if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { + error = SET_ERROR(EINVAL); + } else { + error = pool_status_check(zc->zc_name, + vec->zvec_namecheck, vec->zvec_pool_check); + } + break; + case NO_NAME: break; } - /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. @@ -7190,13 +7604,15 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) goto out; /* legacy ioctls can modify zc_name */ - saved_poolname = strdup(zc->zc_name); - if (saved_poolname == NULL) { - error = SET_ERROR(ENOMEM); - goto out; - } else { - saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; - } + /* + * Can't use kmem_strdup() as we might truncate the string and + * kmem_strfree() would then free with incorrect size. 
+ */ + saved_poolname_len = strlen(zc->zc_name) + 1; + saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); + + strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); + saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; @@ -7235,13 +7651,21 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { - fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, - outnvl); + size_t out_size = fnvlist_size(outnvl); + if (out_size > zfs_history_output_max) { + fnvlist_add_int64(lognv, + ZPOOL_HIST_OUTPUT_SIZE, out_size); + } else { + fnvlist_add_nvlist(lognv, + ZPOOL_HIST_OUTPUT_NVL, outnvl); + } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } + fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, + gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } @@ -7269,167 +7693,70 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) out: nvlist_free(innvl); - rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); - if (error == 0 && rc != 0) - error = SET_ERROR(EFAULT); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) - strfree(s); - (void) tsd_set(zfs_allow_log_key, saved_poolname); - } else { - if (saved_poolname != NULL) - strfree(saved_poolname); + kmem_strfree(s); + (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } + if (saved_poolname != NULL) + kmem_free(saved_poolname, saved_poolname_len); - kmem_free(zc, sizeof (zfs_cmd_t)); - return (-error); + return (error); } -#ifdef CONFIG_COMPAT -static long -zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) -{ - return (zfsdev_ioctl(filp, cmd, arg)); -} -#else -#define zfsdev_compat_ioctl NULL -#endif - -static const struct file_operations zfsdev_fops = { - .open = zfsdev_open, - .release = zfsdev_release, - 
.unlocked_ioctl = zfsdev_ioctl, - .compat_ioctl = zfsdev_compat_ioctl, - .owner = THIS_MODULE, -}; - -static struct miscdevice zfs_misc = { - .minor = ZFS_DEVICE_MINOR, - .name = ZFS_DRIVER, - .fops = &zfsdev_fops, -}; - -MODULE_ALIAS_MISCDEV(ZFS_DEVICE_MINOR); -MODULE_ALIAS("devname:zfs"); - -static int -zfs_attach(void) +int +zfs_kmod_init(void) { int error; + if ((error = zvol_init()) != 0) + return (error); + + spa_init(SPA_MODE_READ | SPA_MODE_WRITE); + zfs_init(); + + zfs_ioctl_init(); + mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); zfsdev_state_list->zs_minor = -1; - error = misc_register(&zfs_misc); - if (error == -EBUSY) { - /* - * Fallback to dynamic minor allocation in the event of a - * collision with a reserved minor in linux/miscdevice.h. - * In this case the kernel modules must be manually loaded. - */ - printk(KERN_INFO "ZFS: misc_register() with static minor %d " - "failed %d, retrying with MISC_DYNAMIC_MINOR\n", - ZFS_DEVICE_MINOR, error); - - zfs_misc.minor = MISC_DYNAMIC_MINOR; - error = misc_register(&zfs_misc); - } - - if (error) - printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); - - return (error); -} - -static void -zfs_detach(void) -{ - zfsdev_state_t *zs, *zsprev = NULL; - - misc_deregister(&zfs_misc); - mutex_destroy(&zfsdev_state_lock); - - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { - if (zsprev) - kmem_free(zsprev, sizeof (zfsdev_state_t)); - zsprev = zs; - } - if (zsprev) - kmem_free(zsprev, sizeof (zfsdev_state_t)); -} - -static void -zfs_allow_log_destroy(void *arg) -{ - char *poolname = arg; - - if (poolname != NULL) - strfree(poolname); -} - -#ifdef DEBUG -#define ZFS_DEBUG_STR " (DEBUG mode)" -#else -#define ZFS_DEBUG_STR "" -#endif - -static int __init -_init(void) -{ - int error; - - error = -vn_set_pwd("/"); - if (error) { - printk(KERN_NOTICE - "ZFS: Warning unable to set pwd to '/': %d\n", error); - return (error); - 
} - - if ((error = -zvol_init()) != 0) - return (error); - - spa_init(FREAD | FWRITE); - zfs_init(); - - zfs_ioctl_init(); - zfs_sysfs_init(); - - if ((error = zfs_attach()) != 0) + if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); - printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, " - "ZFS pool version %s, ZFS filesystem version %s\n", - ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, - SPA_VERSION_STRING, ZPL_VERSION_STRING); -#ifndef CONFIG_FS_POSIX_ACL - printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n"); -#endif /* CONFIG_FS_POSIX_ACL */ - return (0); - out: - zfs_sysfs_fini(); zfs_fini(); spa_fini(); - (void) zvol_fini(); - printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s" - ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, - ZFS_DEBUG_STR, error); + zvol_fini(); return (error); } -static void __exit -_fini(void) +void +zfs_kmod_fini(void) { - zfs_detach(); - zfs_sysfs_fini(); + zfsdev_state_t *zs, *zsnext = NULL; + + zfsdev_detach(); + + mutex_destroy(&zfsdev_state_lock); + + for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { + zsnext = zs->zs_next; + if (zs->zs_onexit) + zfs_onexit_destroy(zs->zs_onexit); + if (zs->zs_zevent) + zfs_zevent_destroy(zs->zs_zevent); + kmem_free(zs, sizeof (zfsdev_state_t)); + } + + zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); @@ -7437,17 +7764,12 @@ _fini(void) tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); - - printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n", - ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); } -#if defined(_KERNEL) -module_init(_init); -module_exit(_fini); +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, + "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); -MODULE_DESCRIPTION("ZFS"); 
-MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE(ZFS_META_LICENSE); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); -#endif +ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, + "Maximum size in bytes of ZFS ioctl output that will be logged"); +/* END CSTYLED */ diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 15c396ce03..e248dc3cc4 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2018 by Delphix. All rights reserved. */ @@ -39,9 +39,9 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -79,7 +79,6 @@ zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) return (TX_CREATE_ACL); else return (TX_CREATE_ATTR); - /*NOTREACHED*/ case Z_DIR: if (vsecp == NULL && !isxvattr) return (TX_MKDIR); @@ -126,9 +125,11 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) /* Now pack the attributes up in a single uint64_t */ attrs = (uint64_t *)bitmap; - crtime = attrs + 1; - scanstamp = (caddr_t)(crtime + 2); *attrs = 0; + crtime = attrs + 1; + bzero(crtime, 2 * sizeof (uint64_t)); + scanstamp = (caddr_t)(crtime + 2); + bzero(scanstamp, AV_SCANSTAMP_SZ); if (XVA_ISSET_REQ(xvap, XAT_READONLY)) *attrs |= (xoap->xoa_readonly == 0) ? 0 : XAT0_READONLY; @@ -231,7 +232,33 @@ zfs_xattr_owner_unlinked(znode_t *zp) { int unlinked = 0; znode_t *dzp; - igrab(ZTOI(zp)); +#ifdef __FreeBSD__ + znode_t *tzp = zp; + + /* + * zrele drops the vnode lock which violates the VOP locking contract + * on FreeBSD. See comment at the top of zfs_replay.c for more detail. 
+ */ + /* + * if zp is XATTR node, keep walking up via z_xattr_parent until we + * get the owner + */ + while (tzp->z_pflags & ZFS_XATTR) { + ASSERT3U(zp->z_xattr_parent, !=, 0); + if (zfs_zget(ZTOZSB(tzp), tzp->z_xattr_parent, &dzp) != 0) { + unlinked = 1; + break; + } + + if (tzp != zp) + zrele(tzp); + tzp = dzp; + unlinked = tzp->z_unlinked; + } + if (tzp != zp) + zrele(tzp); +#else + zhold(zp); /* * if zp is XATTR node, keep walking up via z_xattr_parent until we * get the owner @@ -242,11 +269,13 @@ zfs_xattr_owner_unlinked(znode_t *zp) unlinked = 1; break; } - iput(ZTOI(zp)); + + zrele(zp); zp = dzp; unlinked = zp->z_unlinked; } - iput(ZTOI(zp)); + zrele(zp); +#endif return (unlinked); } @@ -271,7 +300,7 @@ zfs_xattr_owner_unlinked(znode_t *zp) */ void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, + znode_t *dzp, znode_t *zp, const char *name, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp, vattr_t *vap) { itx_t *itx; @@ -321,13 +350,13 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, /* Store dnode slot count in 8 bits above object id. 
*/ LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); lr->lr_mode = zp->z_mode; - if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOI(zp)->i_uid))) { - lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOI(zp)->i_uid); + if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp)))) { + lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOUID(zp)); } else { lr->lr_uid = fuidp->z_fuid_owner; } - if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOI(zp)->i_gid))) { - lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOI(zp)->i_gid); + if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp)))) { + lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOGID(zp)); } else { lr->lr_gid = fuidp->z_fuid_group; } @@ -385,7 +414,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid) + znode_t *dzp, const char *name, uint64_t foid, boolean_t unlinked) { itx_t *itx; lr_remove_t *lr; @@ -401,6 +430,17 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, itx->itx_oid = foid; + /* + * Object ids can be re-instantiated in the next txg so + * remove any async transactions to avoid future leaks. + * This can happen if a fsync occurs on the re-instantiated + * object for a WR_INDIRECT or WR_NEED_COPY write, which gets + * the new file data and flushes a write record for the old object. 
+ */ + if (unlinked) { + ASSERT((txtype & ~TX_CI) == TX_REMOVE); + zil_remove_async(zilog, foid); + } zil_itx_assign(zilog, itx, tx); } @@ -409,7 +449,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name) + znode_t *dzp, znode_t *zp, const char *name) { itx_t *itx; lr_link_t *lr; @@ -432,7 +472,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, char *link) + znode_t *dzp, znode_t *zp, const char *name, const char *link) { itx_t *itx; lr_create_t *lr; @@ -446,8 +486,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; - lr->lr_uid = KUID_TO_SUID(ZTOI(zp)->i_uid); - lr->lr_gid = KGID_TO_SGID(ZTOI(zp)->i_gid); + lr->lr_uid = KUID_TO_SUID(ZTOUID(zp)); + lr->lr_gid = KGID_TO_SGID(ZTOGID(zp)); lr->lr_mode = zp->z_mode; (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, sizeof (uint64_t)); @@ -463,8 +503,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, * Handles TX_RENAME transactions. 
*/ void -zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, + const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) { itx_t *itx; lr_rename_t *lr; @@ -497,9 +537,12 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag, zil_callback_t callback, void *callback_data) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); uint32_t blocksize = zp->z_blksz; itx_wr_state_t write_state; uintptr_t fsync_cnt; + uint64_t gen = 0; + ssize_t size = resid; if (zil_replaying(zilog, tx) || zp->z_unlinked || zfs_xattr_owner_unlinked(zp)) { @@ -513,7 +556,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, else if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz) write_state = WR_INDIRECT; - else if (ioflag & (FSYNC | FDSYNC)) + else if (ioflag & (O_SYNC | O_DSYNC)) write_state = WR_COPIED; else write_state = WR_NEED_COPY; @@ -522,13 +565,23 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen, + sizeof (gen)); + while (resid) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = resid; - if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA) + /* + * A WR_COPIED record must fit entirely in one log block. + * Large writes can use WR_NEED_COPY, which the ZIL will + * split into multiple records across several log blocks + * if necessary. 
+ */ + if (wr_state == WR_COPIED && + resid > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(off, blocksize), resid); @@ -536,12 +589,22 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx = zil_itx_create(txtype, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os, - zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - wr_state = WR_NEED_COPY; + + /* + * For WR_COPIED records, copy the data into the lr_write_t. + */ + if (wr_state == WR_COPIED) { + int err; + DB_DNODE_ENTER(db); + err = dmu_read_by_dnode(DB_DNODE(db), off, len, lr + 1, + DMU_READ_NO_PREFETCH); + if (err != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + wr_state = WR_NEED_COPY; + } + DB_DNODE_EXIT(db); } itx->itx_wr_state = wr_state; @@ -552,8 +615,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, BP_ZERO(&lr->lr_blkptr); itx->itx_private = ZTOZSB(zp); + itx->itx_gen = gen; - if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && + if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) && (fsync_cnt == 0)) itx->itx_sync = B_FALSE; @@ -564,6 +628,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, off += len; resid -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg); + } } /* @@ -718,7 +786,7 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, zil_itx_assign(zilog, itx, tx); } -#if defined(_KERNEL) -module_param(zfs_immediate_write_sz, long, 0644); -MODULE_PARM_DESC(zfs_immediate_write_sz, "Largest data block to write to zil"); -#endif +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW, + "Largest data 
block to write to zil"); +/* END CSTYLED */ diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c index 31f77ce81b..7c56dd9c97 100644 --- a/module/zfs/zfs_onexit.c +++ b/module/zfs/zfs_onexit.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2020 by Delphix. All rights reserved. */ #include @@ -101,6 +101,41 @@ zfs_onexit_destroy(zfs_onexit_t *zo) kmem_free(zo, sizeof (zfs_onexit_t)); } +/* + * Consumers might need to operate by minor number instead of fd, since + * they might be running in another thread (e.g. txg_sync_thread). Callers + * of this function must call zfs_onexit_fd_rele() when they're finished + * using the minor number. + */ +zfs_file_t * +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + zfs_onexit_t *zo = NULL; + + zfs_file_t *fp = zfs_file_get(fd); + if (fp == NULL) + return (NULL); + + int error = zfsdev_getminor(fp, minorp); + if (error) { + zfs_onexit_fd_rele(fp); + return (NULL); + } + + zo = zfsdev_get_state(*minorp, ZST_ONEXIT); + if (zo == NULL) { + zfs_onexit_fd_rele(fp); + return (NULL); + } + return (fp); +} + +void +zfs_onexit_fd_rele(zfs_file_t *fp) +{ + zfs_file_put(fp); +} + static int zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) { @@ -111,39 +146,6 @@ zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) return (0); } -/* - * Consumers might need to operate by minor number instead of fd, since - * they might be running in another thread (e.g. txg_sync_thread). Callers - * of this function must call zfs_onexit_fd_rele() when they're finished - * using the minor number. 
- */ -int -zfs_onexit_fd_hold(int fd, minor_t *minorp) -{ - file_t *fp; - zfs_onexit_t *zo; - int error; - - fp = getf(fd); - if (fp == NULL) - return (SET_ERROR(EBADF)); - - error = zfsdev_getminor(fp->f_file, minorp); - if (error == 0) - error = zfs_onexit_minor_to_state(*minorp, &zo); - - if (error) - zfs_onexit_fd_rele(fd); - - return (error); -} - -void -zfs_onexit_fd_rele(int fd) -{ - releasef(fd); -} - /* * Add a callback to be invoked when the calling process exits. */ @@ -172,80 +174,3 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, return (0); } - -static zfs_onexit_action_node_t * -zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) -{ - zfs_onexit_action_node_t *match; - zfs_onexit_action_node_t *ap; - list_t *l; - - ASSERT(MUTEX_HELD(&zo->zo_lock)); - - match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; - l = &zo->zo_actions; - for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { - if (match == ap) - break; - } - return (ap); -} - -/* - * Delete the callback, triggering it first if 'fire' is set. - */ -int -zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - mutex_enter(&zo->zo_lock); - ap = zfs_onexit_find_cb(zo, action_handle); - if (ap != NULL) { - list_remove(&zo->zo_actions, ap); - mutex_exit(&zo->zo_lock); - if (fire) - ap->za_func(ap->za_data); - kmem_free(ap, sizeof (zfs_onexit_action_node_t)); - } else { - mutex_exit(&zo->zo_lock); - error = SET_ERROR(ENOENT); - } - - return (error); -} - -/* - * Return the data associated with this callback. This allows consumers - * of the cleanup-on-exit interfaces to stash kernel data across system - * calls, knowing that it will be cleaned up if the calling process exits. 
- */ -int -zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - *data = NULL; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - mutex_enter(&zo->zo_lock); - ap = zfs_onexit_find_cb(zo, action_handle); - if (ap != NULL) - *data = ap->za_data; - else - error = SET_ERROR(ENOENT); - mutex_exit(&zo->zo_lock); - - return (error); -} diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c new file mode 100644 index 0000000000..e61db5c7ab --- /dev/null +++ b/module/zfs/zfs_quota.c @@ -0,0 +1,476 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+ */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +zpl_get_file_info(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) +{ + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (SET_ERROR(ENOENT)); + + zoi->zfi_project = ZFS_DEFAULT_PROJID; + + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (SET_ERROR(EEXIST)); + + if (bonustype == DMU_OT_ZNODE) { + const znode_phys_t *znp = data; + zoi->zfi_user = znp->zp_uid; + zoi->zfi_group = znp->zp_gid; + zoi->zfi_generation = znp->zp_gen; + return (0); + } + + const sa_hdr_phys_t *sap = data; + if (sap->sa_magic == 0) { + /* + * This should only happen for newly created files + * that haven't had the znode data filled in yet. + */ + zoi->zfi_user = 0; + zoi->zfi_group = 0; + zoi->zfi_generation = 0; + return (0); + } + + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + + int hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + + uintptr_t data_after_hdr = (uintptr_t)data + hdrsize; + zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET)); + zoi->zfi_group = *((uint64_t *)(data_after_hdr + SA_GID_OFFSET)); + zoi->zfi_generation = *((uint64_t *)(data_after_hdr + SA_GEN_OFFSET)); + uint64_t flags = *((uint64_t *)(data_after_hdr + SA_FLAGS_OFFSET)); + if (swap) + flags = BSWAP_64(flags); + + if (flags & ZFS_PROJID) { + zoi->zfi_project = + *((uint64_t *)(data_after_hdr + SA_PROJID_OFFSET)); + } + + if (swap) { + zoi->zfi_user = BSWAP_64(zoi->zfi_user); + zoi->zfi_group = BSWAP_64(zoi->zfi_group); + 
zoi->zfi_project = BSWAP_64(zoi->zfi_project); + zoi->zfi_generation = BSWAP_64(zoi->zfi_generation); + } + return (0); +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = zfs_strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + case ZFS_PROP_USEROBJUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + case ZFS_PROP_GROUPOBJUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_PROJECTUSED: + case ZFS_PROP_PROJECTOBJUSED: + return (DMU_PROJECTUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + case ZFS_PROP_USEROBJQUOTA: + return (zfsvfs->z_userobjquota_obj); + case ZFS_PROP_GROUPOBJQUOTA: + return (zfsvfs->z_groupobjquota_obj); + case ZFS_PROP_PROJECTQUOTA: + return (zfsvfs->z_projectquota_obj); + case ZFS_PROP_PROJECTOBJQUOTA: + return (zfsvfs->z_projectobjquota_obj); + default: + return (ZFS_NO_OBJECT); + } +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + int offset = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || + type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED) && + !dmu_objset_projectquota_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == 
ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED || + type == ZFS_PROP_PROJECTOBJQUOTA) && + !dmu_objset_userobjspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == ZFS_NO_OBJECT) { + *bufsizep = 0; + return (0); + } + + if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_PROJECTOBJUSED) + offset = DMU_OBJACCT_PREFIX_LEN; + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + /* + * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) + * when dealing with block quota and vice versa. + */ + if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, + DMU_OBJACCT_PREFIX_LEN) == 0)) + continue; + + fuidstr_to_sid(zfsvfs, za.za_name + offset, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[20 + DMU_OBJACCT_PREFIX_LEN]; + int offset = 0; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED || + type == ZFS_PROP_PROJECTOBJQUOTA) && + !dmu_objset_userobjspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if (type == ZFS_PROP_PROJECTQUOTA || type == 
ZFS_PROP_PROJECTUSED || + type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + } + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == ZFS_NO_OBJECT) + return (0); + + if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_PROJECTOBJUSED) { + strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); + offset = DMU_OBJACCT_PREFIX_LEN; + } + + err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf + offset, + sizeof (buf) - offset, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (SET_ERROR(ENOTSUP)); + + switch (type) { + case ZFS_PROP_USERQUOTA: + objp = &zfsvfs->z_userquota_obj; + break; + case ZFS_PROP_GROUPQUOTA: + objp = &zfsvfs->z_groupquota_obj; + break; + case ZFS_PROP_USEROBJQUOTA: + objp = &zfsvfs->z_userobjquota_obj; + break; + case ZFS_PROP_GROUPOBJQUOTA: + objp = &zfsvfs->z_groupobjquota_obj; + break; + case ZFS_PROP_PROJECTQUOTA: + if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + + objp = &zfsvfs->z_projectquota_obj; + break; + case ZFS_PROP_PROJECTOBJQUOTA: + if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + + objp = &zfsvfs->z_projectobjquota_obj; + break; + default: + return (SET_ERROR(EINVAL)); + } + + err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof (buf), B_TRUE); + if (err) 
+ return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + char buf[20 + DMU_OBJACCT_PREFIX_LEN]; + uint64_t used, quota, quotaobj; + int err; + + if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { + if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + + if (usedobj == DMU_PROJECTUSED_OBJECT) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { + if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + quotaobj = zfsvfs->z_projectobjquota_obj; + } else if (usedobj == DMU_USERUSED_OBJECT) { + quotaobj = zfsvfs->z_userobjquota_obj; + } else if (usedobj == 
DMU_GROUPUSED_OBJECT) { + quotaobj = zfsvfs->z_groupobjquota_obj; + } else { + return (B_FALSE); + } + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota); + if (err != 0) + return (B_FALSE); + + (void) snprintf(buf, sizeof (buf), DMU_OBJACCT_PREFIX "%llx", + (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +boolean_t +zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + char buf[20]; + uint64_t used, quota, quotaobj; + int err; + + if (usedobj == DMU_PROJECTUSED_OBJECT) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { + if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + quotaobj = zfsvfs->z_projectquota_obj; + } else if (usedobj == DMU_USERUSED_OBJECT) { + quotaobj = zfsvfs->z_userquota_obj; + } else if (usedobj == DMU_GROUPUSED_OBJECT) { + quotaobj = zfsvfs->z_groupquota_obj; + } else { + return (B_FALSE); + } + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +boolean_t +zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + return (zfs_id_overblockquota(zfsvfs, usedobj, id) || + zfs_id_overobjquota(zfsvfs, usedobj, id)); +} + +EXPORT_SYMBOL(zpl_get_file_info); +EXPORT_SYMBOL(zfs_userspace_one); +EXPORT_SYMBOL(zfs_userspace_many); +EXPORT_SYMBOL(zfs_set_userquota); 
+EXPORT_SYMBOL(zfs_id_overblockquota); +EXPORT_SYMBOL(zfs_id_overobjquota); +EXPORT_SYMBOL(zfs_id_overquota); diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 1443817690..e6ed3e738e 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -43,12 +43,21 @@ #include #include #include -#include #include #include #include #include +/* + * NB: FreeBSD expects to be able to do vnode locking in lookup and + * hold the locks across all subsequent VOPs until vput is called. + * This means that its zfs vnops routines can't do any internal locking. + * In order to have the same contract as the Linux vnops there would + * need to be duplicate locked vnops. If the vnops were used more widely + * in common code this would likely be preferable. However, currently + * this is the only file where this is the case. + */ + /* * Functions to replay ZFS intent log (ZIL) records * The functions are called through a function vector (zfs_replay_vector) @@ -61,11 +70,13 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, { bzero(vap, sizeof (*vap)); vap->va_mask = (uint_t)mask; - vap->va_type = IFTOVT(mode); vap->va_mode = mode; +#if defined(__FreeBSD__) || defined(__APPLE__) + vap->va_type = IFTOVT(mode); +#endif vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; - vap->va_rdev = rdev; + vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } @@ -282,7 +293,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) char *name = NULL; /* location determined later */ lr_create_t *lr = (lr_create_t *)lracl; znode_t *dzp; - struct inode *ip = NULL; + znode_t *zp; xvattr_t xva; int vflg = 0; vsecattr_t vsec = { 0 }; @@ -337,8 +348,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; - error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); - if (error != ENOENT) + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) goto bail; if (lr->lr_common.lrc_txtype & TX_CI) @@ -351,7 +362,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ + fallthrough; case TX_CREATE_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -373,8 +384,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) lr->lr_uid, lr->lr_gid); } - error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr, - 0, 0, &ip, kcred, vflg, &vsec); + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, &vsec); break; case TX_MKDIR_ACL: aclstart = (caddr_t)(lracl + 1); @@ -383,7 +394,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ + fallthrough; case TX_MKDIR_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -403,18 +414,21 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } - error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr, - 
&ip, kcred, vflg, &vsec); + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, &vsec); break; default: error = SET_ERROR(ENOTSUP); } bail: - if (error == 0 && ip != NULL) - iput(ip); - - iput(ZTOI(dzp)); + if (error == 0 && zp != NULL) { +#ifdef __FreeBSD__ + VOP_UNLOCK1(ZTOV(zp)); +#endif + zrele(zp); + } + zrele(dzp); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); @@ -431,7 +445,7 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) char *name = NULL; /* location determined later */ char *link; /* symlink content follows name */ znode_t *dzp; - struct inode *ip = NULL; + znode_t *zp = NULL; xvattr_t xva; int vflg = 0; size_t lrsize = sizeof (lr_create_t); @@ -473,8 +487,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; - error = dmu_object_info(zfsvfs->z_os, objid, NULL); - if (error != ENOENT) + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) goto out; if (lr->lr_common.lrc_txtype & TX_CI) @@ -505,14 +519,14 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; + fallthrough; - /*FALLTHROUGH*/ case TX_CREATE: if (name == NULL) name = (char *)start; - error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr, - 0, 0, &ip, kcred, vflg, NULL); + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, NULL); break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); @@ -523,33 +537,36 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; + fallthrough; - /*FALLTHROUGH*/ case TX_MKDIR: if (name == NULL) name = (char *)(lr + 1); - error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr, - &ip, kcred, vflg, NULL); + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, NULL); break; 
case TX_MKXATTR: - error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &ip, kcred); + error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); break; case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; - error = zfs_symlink(ZTOI(dzp), name, &xva.xva_vattr, - link, &ip, kcred, vflg); + error = zfs_symlink(dzp, name, &xva.xva_vattr, + link, &zp, kcred, vflg); break; default: error = SET_ERROR(ENOTSUP); } out: - if (error == 0 && ip != NULL) - iput(ip); - - iput(ZTOI(dzp)); + if (error == 0 && zp != NULL) { +#ifdef __FreeBSD__ + VOP_UNLOCK1(ZTOV(zp)); +#endif + zrele(zp); + } + zrele(dzp); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); @@ -578,16 +595,16 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: - error = zfs_remove(ZTOI(dzp), name, kcred, vflg); + error = zfs_remove(dzp, name, kcred, vflg); break; case TX_RMDIR: - error = zfs_rmdir(ZTOI(dzp), name, NULL, kcred, vflg); + error = zfs_rmdir(dzp, name, NULL, kcred, vflg); break; default: error = SET_ERROR(ENOTSUP); } - iput(ZTOI(dzp)); + zrele(dzp); return (error); } @@ -609,17 +626,16 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { - iput(ZTOI(dzp)); + zrele(dzp); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - error = zfs_link(ZTOI(dzp), ZTOI(zp), name, kcred, vflg); - - iput(ZTOI(zp)); - iput(ZTOI(dzp)); + error = zfs_link(dzp, zp, name, kcred, vflg); + zrele(zp); + zrele(dzp); return (error); } @@ -642,18 +658,17 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { - iput(ZTOI(sdzp)); + zrele(sdzp); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - error = zfs_rename(ZTOI(sdzp), sname, ZTOI(tdzp), tname, kcred, vflg); - - iput(ZTOI(tdzp)); - iput(ZTOI(sdzp)); + 
error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg); + zrele(tdzp); + zrele(sdzp); return (error); } @@ -664,7 +679,7 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) lr_write_t *lr = arg2; char *data = (char *)(lr + 1); /* data follows lr_write_t */ znode_t *zp; - int error, written; + int error; uint64_t eod, offset, length; if (byteswap) @@ -708,15 +723,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (zp->z_size < eod) zfsvfs->z_replay_eof = eod; } - - written = zpl_write_common(ZTOI(zp), data, length, &offset, - UIO_SYSSPACE, 0, kcred); - if (written < 0) - error = -written; - else if (written < length) - error = SET_ERROR(EIO); /* short write */ - - iput(ZTOI(zp)); + error = zfs_write_simple(zp, data, length, offset, NULL); + zrele(zp); zfsvfs->z_replay_eof = 0; /* safety */ return (error); @@ -752,7 +760,7 @@ top: dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - iput(ZTOI(zp)); + zrele(zp); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); @@ -770,7 +778,7 @@ top: dmu_tx_commit(tx); } - iput(ZTOI(zp)); + zrele(zp); return (error); } @@ -796,10 +804,10 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) fl.l_start = lr->lr_offset; fl.l_len = lr->lr_length; - error = zfs_space(ZTOI(zp), F_FREESP, &fl, FWRITE | FOFFMAX, + error = zfs_space(zp, F_FREESP, &fl, O_RDWR | O_LARGEFILE, lr->lr_offset, kcred); - iput(ZTOI(zp)); + zrele(zp); return (error); } @@ -851,11 +859,11 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); - error = zfs_setattr(ZTOI(zp), vap, 0, kcred); + error = zfs_setattr(zp, vap, 0, kcred); zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; - iput(ZTOI(zp)); + zrele(zp); return (error); } @@ -885,9 +893,9 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) vsa.vsa_aclflags = 0; vsa.vsa_aclentp = ace; 
- error = zfs_setsecattr(ZTOI(zp), &vsa, 0, kcred); + error = zfs_setsecattr(zp, &vsa, 0, kcred); - iput(ZTOI(zp)); + zrele(zp); return (error); } @@ -945,13 +953,13 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); } - error = zfs_setsecattr(ZTOI(zp), &vsa, 0, kcred); + error = zfs_setsecattr(zp, &vsa, 0, kcred); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; - iput(ZTOI(zp)); + zrele(zp); return (error); } diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c index d514a4fc77..06a5e031a7 100644 --- a/module/zfs/zfs_rlock.c +++ b/module/zfs/zfs_rlock.c @@ -38,6 +38,20 @@ * rangelock_reduce(lr, off, len); // optional * rangelock_exit(lr); * + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. + * * AVL tree * -------- * An AVL tree is used to maintain the state of the existing ranges @@ -99,17 +113,18 @@ #include #include + /* * AVL comparison function used to order range locks * Locks are ordered on the start offset of the range. 
*/ static int -rangelock_compare(const void *arg1, const void *arg2) +zfs_rangelock_compare(const void *arg1, const void *arg2) { - const locked_range_t *rl1 = (const locked_range_t *)arg1; - const locked_range_t *rl2 = (const locked_range_t *)arg2; + const zfs_locked_range_t *rl1 = (const zfs_locked_range_t *)arg1; + const zfs_locked_range_t *rl2 = (const zfs_locked_range_t *)arg2; - return (AVL_CMP(rl1->lr_offset, rl2->lr_offset)); + return (TREE_CMP(rl1->lr_offset, rl2->lr_offset)); } /* @@ -118,34 +133,36 @@ rangelock_compare(const void *arg1, const void *arg2) * and may increase the range that's locked for RL_WRITER. */ void -rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg) +zfs_rangelock_init(zfs_rangelock_t *rl, zfs_rangelock_cb_t *cb, void *arg) { mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&rl->rl_tree, rangelock_compare, - sizeof (locked_range_t), offsetof(locked_range_t, lr_node)); + avl_create(&rl->rl_tree, zfs_rangelock_compare, + sizeof (zfs_locked_range_t), offsetof(zfs_locked_range_t, lr_node)); rl->rl_cb = cb; rl->rl_arg = arg; } void -rangelock_fini(rangelock_t *rl) +zfs_rangelock_fini(zfs_rangelock_t *rl) { mutex_destroy(&rl->rl_lock); avl_destroy(&rl->rl_tree); } /* - * Check if a write lock can be grabbed, or wait and recheck until available. + * Check if a write lock can be grabbed. If not, fail immediately or sleep and + * recheck until available, depending on the value of the "nonblock" parameter. 
*/ -static void -rangelock_enter_writer(rangelock_t *rl, locked_range_t *new) +static boolean_t +zfs_rangelock_enter_writer(zfs_rangelock_t *rl, zfs_locked_range_t *new, + boolean_t nonblock) { avl_tree_t *tree = &rl->rl_tree; - locked_range_t *lr; + zfs_locked_range_t *lr; avl_index_t where; uint64_t orig_off = new->lr_offset; uint64_t orig_len = new->lr_length; - rangelock_type_t orig_type = new->lr_type; + zfs_rangelock_type_t orig_type = new->lr_type; for (;;) { /* @@ -168,7 +185,7 @@ rangelock_enter_writer(rangelock_t *rl, locked_range_t *new) */ if (avl_numnodes(tree) == 0) { avl_add(tree, new); - return; + return (B_TRUE); } /* @@ -178,19 +195,21 @@ rangelock_enter_writer(rangelock_t *rl, locked_range_t *new) if (lr != NULL) goto wait; /* already locked at same offset */ - lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); + lr = avl_nearest(tree, where, AVL_AFTER); if (lr != NULL && lr->lr_offset < new->lr_offset + new->lr_length) goto wait; - lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); + lr = avl_nearest(tree, where, AVL_BEFORE); if (lr != NULL && lr->lr_offset + lr->lr_length > new->lr_offset) goto wait; avl_insert(tree, new, where); - return; + return (B_TRUE); wait: + if (nonblock) + return (B_FALSE); if (!lr->lr_write_wanted) { cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL); lr->lr_write_wanted = B_TRUE; @@ -208,10 +227,10 @@ wait: * If this is an original (non-proxy) lock then replace it by * a proxy and return the proxy. 
*/ -static locked_range_t * -rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) +static zfs_locked_range_t * +zfs_rangelock_proxify(avl_tree_t *tree, zfs_locked_range_t *lr) { - locked_range_t *proxy; + zfs_locked_range_t *proxy; if (lr->lr_proxy) return (lr); /* already a proxy */ @@ -223,7 +242,7 @@ rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) lr->lr_count = 0; /* create a proxy range lock */ - proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + proxy = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); proxy->lr_offset = lr->lr_offset; proxy->lr_length = lr->lr_length; proxy->lr_count = 1; @@ -240,9 +259,11 @@ rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) * Split the range lock at the supplied offset * returning the *front* proxy. */ -static locked_range_t * -rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) +static zfs_locked_range_t * +zfs_rangelock_split(avl_tree_t *tree, zfs_locked_range_t *lr, uint64_t off) { + zfs_locked_range_t *rear; + ASSERT3U(lr->lr_length, >, 1); ASSERT3U(off, >, lr->lr_offset); ASSERT3U(off, <, lr->lr_offset + lr->lr_length); @@ -250,7 +271,7 @@ rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) ASSERT(lr->lr_read_wanted == B_FALSE); /* create the rear proxy range lock */ - locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + rear = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); rear->lr_offset = off; rear->lr_length = lr->lr_offset + lr->lr_length - off; rear->lr_count = lr->lr_count; @@ -259,7 +280,7 @@ rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) rear->lr_write_wanted = B_FALSE; rear->lr_read_wanted = B_FALSE; - locked_range_t *front = rangelock_proxify(tree, lr); + zfs_locked_range_t *front = zfs_rangelock_proxify(tree, lr); front->lr_length = off - lr->lr_offset; avl_insert_here(tree, rear, front, AVL_AFTER); @@ -270,10 +291,12 @@ rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) * Create and add a 
new proxy range lock for the supplied range. */ static void -rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +zfs_rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) { + zfs_locked_range_t *lr; + ASSERT(len != 0); - locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + lr = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); lr->lr_offset = off; lr->lr_length = len; lr->lr_count = 1; @@ -285,10 +308,10 @@ rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) } static void -rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, - locked_range_t *prev, avl_index_t where) +zfs_rangelock_add_reader(avl_tree_t *tree, zfs_locked_range_t *new, + zfs_locked_range_t *prev, avl_index_t where) { - locked_range_t *next; + zfs_locked_range_t *next; uint64_t off = new->lr_offset; uint64_t len = new->lr_length; @@ -307,7 +330,7 @@ rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, * convert to proxy if needed then * split this entry and bump ref count */ - prev = rangelock_split(tree, prev, off); + prev = zfs_rangelock_split(tree, prev, off); prev = AVL_NEXT(tree, prev); /* move to rear range */ } } @@ -326,7 +349,7 @@ rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, if (off < next->lr_offset) { /* Add a proxy for initial range before the overlap */ - rangelock_new_proxy(tree, off, next->lr_offset - off); + zfs_rangelock_new_proxy(tree, off, next->lr_offset - off); } new->lr_count = 0; /* will use proxies in tree */ @@ -344,41 +367,43 @@ rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, /* there's a gap */ ASSERT3U(next->lr_offset, >, prev->lr_offset + prev->lr_length); - rangelock_new_proxy(tree, + zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, next->lr_offset - (prev->lr_offset + prev->lr_length)); } if (off + len == next->lr_offset + next->lr_length) { /* exact overlap with end */ - next = rangelock_proxify(tree, next); + next = zfs_rangelock_proxify(tree, next); 
next->lr_count++; return; } if (off + len < next->lr_offset + next->lr_length) { /* new range ends in the middle of this block */ - next = rangelock_split(tree, next, off + len); + next = zfs_rangelock_split(tree, next, off + len); next->lr_count++; return; } ASSERT3U(off + len, >, next->lr_offset + next->lr_length); - next = rangelock_proxify(tree, next); + next = zfs_rangelock_proxify(tree, next); next->lr_count++; } /* Add the remaining end range. */ - rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, + zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, (off + len) - (prev->lr_offset + prev->lr_length)); } /* - * Check if a reader lock can be grabbed, or wait and recheck until available. + * Check if a reader lock can be grabbed. If not, fail immediately or sleep and + * recheck until available, depending on the value of the "nonblock" parameter. */ -static void -rangelock_enter_reader(rangelock_t *rl, locked_range_t *new) +static boolean_t +zfs_rangelock_enter_reader(zfs_rangelock_t *rl, zfs_locked_range_t *new, + boolean_t nonblock) { avl_tree_t *tree = &rl->rl_tree; - locked_range_t *prev, *next; + zfs_locked_range_t *prev, *next; avl_index_t where; uint64_t off = new->lr_offset; uint64_t len = new->lr_length; @@ -389,13 +414,15 @@ rangelock_enter_reader(rangelock_t *rl, locked_range_t *new) retry: prev = avl_find(tree, new, &where); if (prev == NULL) - prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); + prev = avl_nearest(tree, where, AVL_BEFORE); /* * Check the previous range for a writer lock overlap. 
*/ if (prev && (off < prev->lr_offset + prev->lr_length)) { if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) { + if (nonblock) + return (B_FALSE); if (!prev->lr_read_wanted) { cv_init(&prev->lr_read_cv, NULL, CV_DEFAULT, NULL); @@ -415,11 +442,13 @@ retry: if (prev != NULL) next = AVL_NEXT(tree, prev); else - next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); + next = avl_nearest(tree, where, AVL_AFTER); for (; next != NULL; next = AVL_NEXT(tree, next)) { if (off + len <= next->lr_offset) goto got_lock; if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) { + if (nonblock) + return (B_FALSE); if (!next->lr_read_wanted) { cv_init(&next->lr_read_cv, NULL, CV_DEFAULT, NULL); @@ -437,7 +466,8 @@ got_lock: * Add the read lock, which may involve splitting existing * locks and bumping ref counts (r_count). */ - rangelock_add_reader(tree, new, prev, where); + zfs_rangelock_add_reader(tree, new, prev, where); + return (B_TRUE); } /* @@ -445,15 +475,18 @@ got_lock: * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert * it to a RL_WRITER lock (with the offset at the end of the file). Returns * the range lock structure for later unlocking (or reduce range if the - * entire file is locked as RL_WRITER). + * entire file is locked as RL_WRITER), or NULL if nonblock is true and the + * lock could not be acquired immediately. 
*/ -locked_range_t * -rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, - rangelock_type_t type) +static zfs_locked_range_t * +zfs_rangelock_enter_impl(zfs_rangelock_t *rl, uint64_t off, uint64_t len, + zfs_rangelock_type_t type, boolean_t nonblock) { + zfs_locked_range_t *new; + ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + new = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); new->lr_rangelock = rl; new->lr_offset = off; if (len + off < off) /* overflow */ @@ -470,21 +503,39 @@ rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, /* * First check for the usual case of no locks */ - if (avl_numnodes(&rl->rl_tree) == 0) + if (avl_numnodes(&rl->rl_tree) == 0) { avl_add(&rl->rl_tree, new); - else - rangelock_enter_reader(rl, new); - } else - rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */ + } else if (!zfs_rangelock_enter_reader(rl, new, nonblock)) { + kmem_free(new, sizeof (*new)); + new = NULL; + } + } else if (!zfs_rangelock_enter_writer(rl, new, nonblock)) { + kmem_free(new, sizeof (*new)); + new = NULL; + } mutex_exit(&rl->rl_lock); return (new); } +zfs_locked_range_t * +zfs_rangelock_enter(zfs_rangelock_t *rl, uint64_t off, uint64_t len, + zfs_rangelock_type_t type) +{ + return (zfs_rangelock_enter_impl(rl, off, len, type, B_FALSE)); +} + +zfs_locked_range_t * +zfs_rangelock_tryenter(zfs_rangelock_t *rl, uint64_t off, uint64_t len, + zfs_rangelock_type_t type) +{ + return (zfs_rangelock_enter_impl(rl, off, len, type, B_TRUE)); +} + /* - * Safely free the locked_range_t. + * Safely free the zfs_locked_range_t. 
*/ static void -rangelock_free(locked_range_t *lr) +zfs_rangelock_free(zfs_locked_range_t *lr) { if (lr->lr_write_wanted) cv_destroy(&lr->lr_write_cv); @@ -492,14 +543,14 @@ rangelock_free(locked_range_t *lr) if (lr->lr_read_wanted) cv_destroy(&lr->lr_read_cv); - kmem_free(lr, sizeof (locked_range_t)); + kmem_free(lr, sizeof (zfs_locked_range_t)); } /* * Unlock a reader lock */ static void -rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove, +zfs_rangelock_exit_reader(zfs_rangelock_t *rl, zfs_locked_range_t *remove, list_t *free_list) { avl_tree_t *tree = &rl->rl_tree; @@ -528,11 +579,11 @@ rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove, * then decrement ref count on all proxies * that make up this range, freeing them as needed. */ - locked_range_t *lr = avl_find(tree, remove, NULL); + zfs_locked_range_t *lr = avl_find(tree, remove, NULL); ASSERT3P(lr, !=, NULL); ASSERT3U(lr->lr_count, !=, 0); ASSERT3U(lr->lr_type, ==, RL_READER); - locked_range_t *next = NULL; + zfs_locked_range_t *next = NULL; for (len = remove->lr_length; len != 0; lr = next) { len -= lr->lr_length; if (len != 0) { @@ -553,7 +604,7 @@ rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove, list_insert_tail(free_list, lr); } } - kmem_free(remove, sizeof (locked_range_t)); + kmem_free(remove, sizeof (zfs_locked_range_t)); } } @@ -561,11 +612,11 @@ rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove, * Unlock range and destroy range lock structure. 
*/ void -rangelock_exit(locked_range_t *lr) +zfs_rangelock_exit(zfs_locked_range_t *lr) { - rangelock_t *rl = lr->lr_rangelock; + zfs_rangelock_t *rl = lr->lr_rangelock; list_t free_list; - locked_range_t *free_lr; + zfs_locked_range_t *free_lr; ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER); ASSERT(lr->lr_count == 1 || lr->lr_count == 0); @@ -575,8 +626,8 @@ rangelock_exit(locked_range_t *lr) * The free list is used to defer the cv_destroy() and * subsequent kmem_free until after the mutex is dropped. */ - list_create(&free_list, sizeof (locked_range_t), - offsetof(locked_range_t, lr_node)); + list_create(&free_list, sizeof (zfs_locked_range_t), + offsetof(zfs_locked_range_t, lr_node)); mutex_enter(&rl->rl_lock); if (lr->lr_type == RL_WRITER) { @@ -590,14 +641,14 @@ rangelock_exit(locked_range_t *lr) } else { /* * lock may be shared, let rangelock_exit_reader() - * release the lock and free the locked_range_t. + * release the lock and free the zfs_locked_range_t. */ - rangelock_exit_reader(rl, lr, &free_list); + zfs_rangelock_exit_reader(rl, lr, &free_list); } mutex_exit(&rl->rl_lock); while ((free_lr = list_remove_head(&free_list)) != NULL) - rangelock_free(free_lr); + zfs_rangelock_free(free_lr); list_destroy(&free_list); } @@ -608,9 +659,9 @@ rangelock_exit(locked_range_t *lr) * entry in the tree. 
*/ void -rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len) +zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len) { - rangelock_t *rl = lr->lr_rangelock; + zfs_rangelock_t *rl = lr->lr_rangelock; /* Ensure there are no other locks */ ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); @@ -631,9 +682,10 @@ rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len) } #if defined(_KERNEL) -EXPORT_SYMBOL(rangelock_init); -EXPORT_SYMBOL(rangelock_fini); -EXPORT_SYMBOL(rangelock_enter); -EXPORT_SYMBOL(rangelock_exit); -EXPORT_SYMBOL(rangelock_reduce); +EXPORT_SYMBOL(zfs_rangelock_init); +EXPORT_SYMBOL(zfs_rangelock_fini); +EXPORT_SYMBOL(zfs_rangelock_enter); +EXPORT_SYMBOL(zfs_rangelock_tryenter); +EXPORT_SYMBOL(zfs_rangelock_exit); +EXPORT_SYMBOL(zfs_rangelock_reduce); #endif diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index bd21ba896c..67be131da6 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -71,7 +71,7 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { #ifdef _KERNEL int -zfs_sa_readlink(znode_t *zp, uio_t *uio) +zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio) { dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); size_t bufsz; @@ -79,15 +79,16 @@ zfs_sa_readlink(znode_t *zp, uio_t *uio) bufsz = zp->z_size; if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { - error = uiomove((caddr_t)db->db_data + + error = zfs_uiomove((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ, uio); } else { dmu_buf_t *dbp; if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + error = zfs_uiomove(dbp->db_data, + MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ, + uio); dmu_buf_rele(dbp, FTAG); } } @@ -300,7 +301,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) * and ready the ACL would require special "locked" * 
interfaces that would be messy */ - if (zp->z_acl_cached == NULL || S_ISLNK(ZTOI(zp)->i_mode)) + if (zp->z_acl_cached == NULL || Z_ISLNK(ZTOTYPE(zp))) return; /* @@ -369,13 +370,13 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); - links = ZTOI(zp)->i_nlink; + links = ZTONLNK(zp); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (dmu_objset_projectquota_enabled(hdl->sa_os)) SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 8); - if (S_ISBLK(ZTOI(zp)->i_mode) || S_ISCHR(ZTOI(zp)->i_mode)) + if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 9d8a9cbc54..7cbb70f499 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -29,20 +29,15 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ - #include #include #include #include #include +#include #include #include #include -#include -#include -#include -#include -#include #include #include #include @@ -54,191 +49,33 @@ #include #include #include -#include -#include #include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include +#include +#include +#include -/* - * Programming rules. - * - * Each vnode op performs some logical unit of work. To do this, the ZPL must - * properly lock its in-core state, create a DMU transaction, do the work, - * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait for the intent log to commit if it is a synchronous operation. - * Moreover, the vnode ops must work in both normal and log replay context. - * The ordering of events is important to avoid deadlocks and references - * to freed memory. 
The example below illustrates the following Big Rules: - * - * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros - * can return EIO from the calling function. - * - * (2) iput() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * If you must call iput() within a tx then use zfs_iput_async(). - * - * (3) All range locks must be grabbed before calling dmu_tx_assign(), - * as they can span dmu_tx_assign() calls. - * - * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to - * dmu_tx_assign(). This is critical because we don't want to block - * while holding locks. - * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This - * reduces lock contention and CPU usage when we must wait (note that if - * throughput is constrained by the storage, nearly every transaction - * must wait). - * - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing - * to use a non-blocking assign can deadlock the system. The scenario: - * - * Thread A has grabbed a lock before calling dmu_tx_assign(). - * Thread B is in an already-assigned tx, and blocks for this lock. 
- * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() - * forever, because the previous txg can't quiesce until B's tx commits. - * - * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. On subsequent - * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, - * to indicate that this operation has already called dmu_tx_wait(). - * This will ensure that we don't retry forever, waiting a short bit - * each time. - * - * (5) If the operation succeeded, generate the intent log entry for it - * before dropping locks. This ensures that the ordering of events - * in the intent log matches the order in which they actually occurred. - * During ZIL replay the zfs_log_* functions will update the sequence - * number to indicate the zil transaction has replayed. - * - * (6) At the end of each vnode op, the DMU tx must always commit, - * regardless of whether there were any errors. - * - * (7) After dropping all locks, invoke zil_commit(zilog, foid) - * to ensure that synchronous semantics are provided when necessary. - * - * In general, this is how things should be ordered in each vnode op: - * - * ZFS_ENTER(zfsvfs); // exit if unmounted - * top: - * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) - * rw_enter(...); // grab any other locks you need - * tx = dmu_tx_create(...); // get DMU tx - * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - * if (error) { - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * iput(...); // release held vnodes - * if (error == ERESTART) { - * waited = B_TRUE; - * dmu_tx_wait(tx); - * dmu_tx_abort(tx); - * goto top; - * } - * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // really out of space - * } - * error = do_real_work(); // do whatever this VOP does - * if (error == 0) - * zfs_log_*(...); // on success, make ZIL entry - * dmu_tx_commit(tx); // commit DMU tx -- error or not - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * iput(...); // release held vnodes - * zil_commit(zilog, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // done, report error - */ -/* - * Virus scanning is unsupported. It would be possible to add a hook - * here to performance the required virus scan. This could be done - * entirely in the kernel or potentially as an update to invoke a - * scanning utility. 
- */ -static int -zfs_vscan(struct inode *ip, cred_t *cr, int async) -{ - return (0); -} +static ulong_t zfs_fsync_sync_cnt = 4; -/* ARGSUSED */ int -zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) +zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - /* Honor ZFS_APPENDONLY file attribute */ - if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && - ((flag & O_APPEND) == 0)) { + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); } + tsd_set(zfs_fsyncer_key, NULL); - /* Virus scan eligible files on open */ - if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { - if (zfs_vscan(ip, cr, 0) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - } - - /* Keep a count of the synchronous opens in the znode */ - if (flag & O_SYNC) - atomic_inc_32(&zp->z_sync_cnt); - - ZFS_EXIT(zfsvfs); return (0); } -/* ARGSUSED */ -int -zfs_close(struct inode *ip, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* Decrement the synchronous opens in the znode */ - if (flag & O_SYNC) - atomic_dec_32(&zp->z_sync_cnt); - - if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) - VERIFY(zfs_vscan(ip, cr, 1) == 0); - - ZFS_EXIT(zfsvfs); - return (0); -} #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* @@ -246,9 +83,9 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) * data (cmd == SEEK_DATA). "off" is an in/out parameter. 
*/ static int -zfs_holey_common(struct inode *ip, int cmd, loff_t *off) +zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) { - znode_t *zp = ITOZ(ip); + zfs_locked_range_t *lr; uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; @@ -259,17 +96,23 @@ zfs_holey_common(struct inode *ip, int cmd, loff_t *off) return (SET_ERROR(ENXIO)); } - if (cmd == SEEK_HOLE) + if (cmd == F_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; + /* Flush any mmap()'d data to disk */ + if (zn_has_cached_data(zp)) + zn_flush_cached_data(zp, B_FALSE); + + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); + zfs_rangelock_exit(lr); if (error == ESRCH) return (SET_ERROR(ENXIO)); - /* file was dirty, so fall back to using generic logic */ + /* File was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; @@ -296,134 +139,49 @@ zfs_holey_common(struct inode *ip, int cmd, loff_t *off) } int -zfs_holey(struct inode *ip, int cmd, loff_t *off) +zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) { - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - error = zfs_holey_common(ip, cmd, off); + error = zfs_holey_common(zp, cmd, off); ZFS_EXIT(zfsvfs); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ -#if defined(_KERNEL) -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. 
- */ -static void -update_pages(struct inode *ip, int64_t start, int len, - objset_t *os, uint64_t oid) +/*ARGSUSED*/ +int +zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { - struct address_space *mp = ip->i_mapping; - struct page *pp; - uint64_t nbytes; - int64_t off; - void *pb; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; - off = start & (PAGE_SIZE-1); - for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - nbytes = MIN(PAGE_SIZE - off, len); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); - pp = find_lock_page(mp, start >> PAGE_SHIFT); - if (pp) { - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); - pb = kmap(pp); - (void) dmu_read(os, oid, start+off, nbytes, pb+off, - DMU_READ_PREFETCH); - kunmap(pp); - - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); - unlock_page(pp); - put_page(pp); - } - - len -= nbytes; - off = 0; - } -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. 
- */ -static int -mappedread(struct inode *ip, int nbytes, uio_t *uio) -{ - struct address_space *mp = ip->i_mapping; - struct page *pp; - znode_t *zp = ITOZ(ip); - int64_t start, off; - uint64_t bytes; - int len = nbytes; - int error = 0; - void *pb; - - start = uio->uio_loffset; - off = start & (PAGE_SIZE-1); - for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - bytes = MIN(PAGE_SIZE - off, len); - - pp = find_lock_page(mp, start >> PAGE_SHIFT); - if (pp) { - ASSERT(PageUptodate(pp)); - unlock_page(pp); - - pb = kmap(pp); - error = uiomove(pb + off, bytes, UIO_READ, uio); - kunmap(pp); - - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - mark_page_accessed(pp); - put_page(pp); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, bytes); - } - - len -= bytes; - off = 0; - if (error) - break; - } + ZFS_EXIT(zfsvfs); return (error); } -#endif /* _KERNEL */ -unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ -unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; +static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * - * IN: ip - inode of file to be read from. + * IN: zp - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. - * ioflag - FSYNC flags; used to provide FRSYNC semantics. + * ioflag - O_SYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. 
* @@ -436,13 +194,12 @@ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; */ /* ARGSUSED */ int -zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) +zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; boolean_t frsync = B_FALSE; - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -451,10 +208,16 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EACCES)); } + /* We don't copy out anything useful for directories. */ + if (Z_ISDIR(ZTOTYPE(zp))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + /* * Validate file offset */ - if (uio->uio_loffset < (offset_t)0) { + if (zfs_uio_offset(uio) < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } @@ -462,7 +225,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) /* * Fasttrack empty reads */ - if (uio->uio_resid == 0) { + if (zfs_uio_resid(uio) == 0) { ZFS_EXIT(zfsvfs); return (0); } @@ -473,7 +236,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it - * to FSYNC, which results in unnecessary calls to zil_commit. We + * to O_SYNC, which results in unnecessary calls to zil_commit. We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); @@ -485,60 +248,35 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) /* * Lock the range against changes. */ - locked_range_t *lr = rangelock_enter(&zp->z_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); + zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, + zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. 
*/ - if (uio->uio_loffset >= zp->z_size) { + if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } - ASSERT(uio->uio_loffset < zp->z_size); - ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + ASSERT(zfs_uio_offset(uio) < zp->z_size); +#if defined(__linux__) + ssize_t start_offset = zfs_uio_offset(uio); +#endif + ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { - int nblk; - int blksz = zp->z_blksz; - uint64_t offset = uio->uio_loffset; - - xuio = (xuio_t *)uio; - if ((ISP2(blksz))) { - nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, - blksz)) / blksz; - } else { - ASSERT(offset + n <= blksz); - nblk = 1; - } - (void) dmu_xuio_init(xuio, nblk); - - if (vn_has_cached_data(ip)) { - /* - * For simplicity, we always allocate a full buffer - * even if we only expect to read a portion of a block. 
- */ - while (--nblk >= 0) { - (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz), 0, blksz); - } - } - } -#endif /* HAVE_UIO_ZEROCOPY */ - while (n > 0) { - ssize_t nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { - error = mappedread(ip, nbytes, uio); + ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - + P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); +#ifdef UIO_NOCOPY + if (zfs_uio_segflg(uio) == UIO_NOCOPY) + error = mappedread_sf(zp, nbytes, uio); + else +#endif + if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { + error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); @@ -548,6 +286,18 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); + +#if defined(__linux__) + /* + * if we actually read some bytes, bubbling EFAULT + * up to become EAGAIN isn't what we want here... + * + * ...on Linux, at least. On FBSD, doing this breaks. + */ + if (error == EFAULT && + (zfs_uio_offset(uio) - start_offset) != 0) + error = 0; +#endif break; } @@ -558,8 +308,9 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: - rangelock_exit(lr); + zfs_rangelock_exit(lr); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } @@ -567,10 +318,10 @@ out: /* * Write the bytes to a file. * - * IN: ip - inode of file to be written to. + * IN: zp - znode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. - * ioflag - FAPPEND flag set if in append mode. + * ioflag - O_APPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. 
* @@ -585,10 +336,10 @@ out: /* ARGSUSED */ int -zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) +zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; - ssize_t start_resid = uio->uio_resid; + ssize_t start_resid = zfs_uio_resid(uio); /* * Fasttrack empty write @@ -597,11 +348,6 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) if (n == 0) return (0); - rlim64_t limit = uio->uio_limit; - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -626,11 +372,13 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) } /* - * If immutable or not appending then return EPERM + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() */ - if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_size))) { + if ((zp->z_pflags & ZFS_IMMUTABLE) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && + (zfs_uio_offset(uio) < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } @@ -638,41 +386,34 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) /* * Validate file offset */ - offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; + offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } - int max_blksz = zfsvfs->z_max_blksz; - xuio_t *xuio = NULL; + const uint64_t max_blksz = zfsvfs->z_max_blksz; /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. 
*/ -#ifdef HAVE_UIO_ZEROCOPY - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else -#endif - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFAULT)); - } + if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFAULT)); + } /* * If in append mode, set the io offset pointer to eof. */ - locked_range_t *lr; - if (ioflag & FAPPEND) { + zfs_locked_range_t *lr; + if (ioflag & O_APPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ - lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* @@ -682,36 +423,39 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) */ woff = zp->z_size; } - uio->uio_loffset = woff; + zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ - lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } - if (woff >= limit) { - rangelock_exit(lr); + if (zn_rlimit_fsize(zp, uio)) { + zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; + const rlim64_t limit = MAXOFFSET_T; - /* Will this write extend the file length? 
*/ - int write_eof = (woff + n > zp->z_size); + if (woff >= limit) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if (n > limit - woff) + n = limit - woff; uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; -#ifdef HAVE_UIO_ZEROCOPY - int i_iov = 0; - const iovec_t *iovp = uio->uio_iov; - ASSERTV(int iovcnt = uio->uio_iovcnt); -#endif + const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); + const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); + const uint64_t projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written @@ -719,34 +463,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * and allows us to do more fine-grained space accounting. */ while (n > 0) { - woff = uio->uio_loffset; + woff = zfs_uio_offset(uio); - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, - KUID_TO_SUID(ip->i_uid)) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, - KGID_TO_SGID(ip->i_gid)) || - (zp->z_projid != ZFS_DEFAULT_PROJID && + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || + (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - zp->z_projid))) { + projid))) { error = SET_ERROR(EDQUOT); break; } arc_buf_t *abuf = NULL; - const iovec_t *aiov = NULL; - if (xuio) { -#ifdef HAVE_UIO_ZEROCOPY - ASSERT(i_iov < iovcnt); - ASSERT3U(uio->uio_segflg, !=, UIO_BVEC); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; -#endif - } else if (n >= max_blksz && woff >= zp->z_size && + if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* @@ -762,12 +491,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) 
max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, + if ((error = zfs_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; } - ASSERT(cbytes == max_blksz); + ASSERT3S(cbytes, ==, max_blksz); } /* @@ -775,7 +504,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -807,66 +540,78 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); - rangelock_reduce(lr, woff, n); + zfs_rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ - ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + const ssize_t nbytes = + MIN(n, max_blksz - P2PHASE(woff, max_blksz)); ssize_t tx_bytes; if (abuf == NULL) { - tx_bytes = uio->uio_resid; - uio->uio_fault_disable = B_TRUE; + tx_bytes = zfs_uio_resid(uio); + zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); - uio->uio_fault_disable = B_FALSE; + zfs_uio_fault_disable(uio, B_FALSE); +#ifdef __linux__ if (error == EFAULT) { dmu_tx_commit(tx); - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + /* + * Account for partial writes before + * continuing the loop. + * Update needs to occur before the next + * zfs_uio_prefaultpages, or prefaultpages may + * error, and we may break the loop early. 
+ */ + if (tx_bytes != zfs_uio_resid(uio)) + n -= tx_bytes - zfs_uio_resid(uio); + if (zfs_uio_prefaultpages(MIN(n, max_blksz), + uio)) { break; } continue; - } else if (error != 0) { + } +#endif + if (error != 0) { dmu_tx_commit(tx); break; } - tx_bytes -= uio->uio_resid; + tx_bytes -= zfs_uio_resid(uio); } else { - tx_bytes = nbytes; - ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* Implied by abuf != NULL: */ + ASSERT3S(n, >=, max_blksz); + ASSERT0(P2PHASE(woff, max_blksz)); /* - * If this is not a full block write, but we are - * extending the file past EOF and this data starts - * block-aligned, use assign_arcbuf(). Otherwise, - * write via dmu_write(). + * We can simplify nbytes to MIN(n, max_blksz) since + * P2PHASE(woff, max_blksz) is 0, and knowing + * n >= max_blksz lets us simplify further: */ - if (tx_bytes < max_blksz && (!write_eof || - aiov->iov_base != abuf->b_data)) { - ASSERT(xuio); - dmu_write(zfsvfs->z_os, zp->z_id, woff, - /* cppcheck-suppress nullPointer */ - aiov->iov_len, aiov->iov_base, tx); + ASSERT3S(nbytes, ==, max_blksz); + /* + * Thus, we're writing a full block at a block-aligned + * offset and extending the file past EOF. + * + * dmu_assign_arcbuf_by_dbuf() will directly assign the + * arc buffer to a dbuf. 
+ */ + error = dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); + if (error != 0) { dmu_return_arcbuf(abuf); - xuio_stat_wbuf_copied(); - } else { - ASSERT(xuio || tx_bytes == max_blksz); - error = dmu_assign_arcbuf_by_dbuf( - sa_get_db(zp->z_sa_hdl), woff, abuf, tx); - if (error != 0) { - dmu_return_arcbuf(abuf); - dmu_tx_commit(tx); - break; - } + dmu_tx_commit(tx); + break; } - ASSERT(tx_bytes <= uio->uio_resid); - uioskip(uio, tx_bytes); + ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); + zfs_uioskip(uio, nbytes); + tx_bytes = nbytes; } - if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { - update_pages(ip, woff, - tx_bytes, zfsvfs->z_os, zp->z_id); + if (tx_bytes && zn_has_cached_data(zp) && + !(ioflag & O_DIRECT)) { + update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } /* @@ -885,7 +630,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * - * It would be nice to to this after all writes have + * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * @@ -893,15 +638,14 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * user 0 is not an ephemeral uid. 
*/ mutex_enter(&zp->z_acl_lock); - uint32_t uid = KUID_TO_SUID(ip->i_uid); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, + secpolicy_vnode_setid_retain(zp, cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); - ip->i_mode = newmode = zp->z_mode; + newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } @@ -913,9 +657,9 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ - while ((end_size = zp->z_size) < uio->uio_loffset) { + while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); + zfs_uio_offset(uio)); ASSERT(error == 0); } /* @@ -934,34 +678,36 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) if (error != 0) break; - ASSERT(tx_bytes == nbytes); + ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; - if (!xuio && n > 0) { - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = EFAULT; + if (n > 0) { + if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { + error = SET_ERROR(EFAULT); break; } } } - zfs_inode_update(zp); - rangelock_exit(lr); + zfs_znode_update_vfs(zp); + zfs_rangelock_exit(lr); /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. + * If we're in replay mode, or we made no progress, or the + * uio data is inaccessible return an error. Otherwise, it's + * at least a partial write, so it's successful. 
*/ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { + if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || + error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } - if (ioflag & (FSYNC | FDSYNC) || + if (ioflag & (O_SYNC | O_DSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); - int64_t nwritten = start_resid - uio->uio_resid; + const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); @@ -969,56 +715,55 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) return (0); } -/* - * Drop a reference on the passed inode asynchronously. This ensures - * that the caller will never drop the last reference on an inode in - * the current context. Doing so while holding open a tx could result - * in a deadlock if iput_final() re-enters the filesystem code. - */ -void -zfs_iput_async(struct inode *ip) +/*ARGSUSED*/ +int +zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { - objset_t *os = ITOZSB(ip)->z_os; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - ASSERT(atomic_read(&ip->i_count) > 0); - ASSERT(os != NULL); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); - if (atomic_read(&ip->i_count) == 1) - VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)), - (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); - else - iput(ip); + return (error); } -/* ARGSUSED */ -void -zfs_get_done(zgd_t *zgd, int error) +/*ARGSUSED*/ +int +zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { - znode_t *zp = zgd->zgd_private; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; + zilog_t *zilog = zfsvfs->z_log; - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); - rangelock_exit(zgd->zgd_lr); + error = zfs_setacl(zp, vsecp, skipaclchk, cr); - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - zfs_iput_async(ZTOI(zp)); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); - kmem_free(zgd, sizeof (zgd_t)); + ZFS_EXIT(zfsvfs); + return (error); } -#ifdef DEBUG +#ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif +static void zfs_get_done(zgd_t *zgd, int error); + /* * Get data to generate a TX_WRITE intent log record. */ int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, + struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; @@ -1029,6 +774,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) dmu_buf_t *db; zgd_t *zgd; int error = 0; + uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); @@ -1044,7 +790,17 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ - zfs_iput_async(ZTOI(zp)); + zfs_zrele_async(zp); + return (SET_ERROR(ENOENT)); + } + /* check if generation number matches */ + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (zp_gen)) != 0) { + zfs_zrele_async(zp); + return (SET_ERROR(EIO)); + } + if (zp_gen != gen) { + zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } @@ -1060,7 +816,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) * we don't have to write the data twice. 
*/ if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { @@ -1082,17 +838,17 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; - rangelock_exit(zgd->zgd_lr); + zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); -#ifdef DEBUG +#ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; @@ -1144,4124 +900,34 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) return (error); } -/*ARGSUSED*/ -int -zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Lookup an entry in a directory, or an extended attribute directory. - * If it exists, return a held inode reference for it. - * - * IN: dip - inode of directory to search. - * nm - name of entry to lookup. - * flags - LOOKUP_XATTR set if looking for an attribute. - * cr - credentials of caller. - * direntflags - directory lookup flags - * realpnp - returned pathname. - * - * OUT: ipp - inode of located entry, NULL if not found. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * NA - */ -/* ARGSUSED */ -int -zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - cred_t *cr, int *direntflags, pathname_t *realpnp) -{ - znode_t *zdp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - int error = 0; - - /* - * Fast path lookup, however we must skip DNLC lookup - * for case folding or normalizing lookups because the - * DNLC code only stores the passed in name. This means - * creating 'a' and removing 'A' on a case insensitive - * file system would work, but DNLC still thinks 'a' - * exists and won't let you create it again on the next - * pass through fast path. - */ - if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { - - if (!S_ISDIR(dip->i_mode)) { - return (SET_ERROR(ENOTDIR)); - } else if (zdp->z_sa_hdl == NULL) { - return (SET_ERROR(EIO)); - } - - if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { - error = zfs_fastaccesschk_execute(zdp, cr); - if (!error) { - *ipp = dip; - igrab(*ipp); - return (0); - } - return (error); - } - } - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); - - *ipp = NULL; - - if (flags & LOOKUP_XATTR) { - /* - * We don't allow recursive attributes.. - * Maybe someday we will. - */ - if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Do we have permission to get into attribute directory? - */ - - if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0, - B_FALSE, cr))) { - iput(*ipp); - *ipp = NULL; - } - - ZFS_EXIT(zfsvfs); - return (error); - } - - if (!S_ISDIR(dip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTDIR)); - } - - /* - * Check accessibility of directory. 
- */ - - if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp); - if ((error == 0) && (*ipp)) - zfs_inode_update(ITOZ(*ipp)); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Attempt to create a new entry in a directory. If the entry - * already exists, truncate the file if permissible, else return - * an error. Return the ip of the created or trunc'd file. - * - * IN: dip - inode of directory to put new file entry in. - * name - name of new file entry. - * vap - attributes of new file. - * excl - flag indicating exclusive or non-exclusive mode. - * mode - mode to open file with. - * cr - credentials of caller. - * flag - file flag. - * vsecp - ACL to be set - * - * OUT: ipp - inode of created or trunc'd entry. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dip - ctime|mtime updated if new entry created - * ip - ctime|mtime always, atime if new - */ /* ARGSUSED */ -int -zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) -{ - znode_t *zp, *dzp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - objset_t *os; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - uid_t uid; - gid_t gid; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - gid = crgetgid(cr); - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - -top: - *ipp = NULL; - if (*name == '\0') { - /* - * Null component name refers to the directory itself. - */ - igrab(dip); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible igrab(zp) */ - int zflg = 0; - - if (flag & FIGNORECASE) - zflg |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL); - if (error) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - if (strcmp(name, "..") == 0) - error = SET_ERROR(EISDIR); - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if (zp == NULL) { - uint64_t txtype; - uint64_t projid = ZFS_DEFAULT_PROJID; - - /* - * Create a new file object and update the directory - * to reference it. 
- */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. - */ - - if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EINVAL); - goto out; - } - - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; - - if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) - projid = zfs_inherit_projid(dzp); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - - error = dmu_tx_assign(tx, - (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - /* - * Since, we failed to add the directory entry for it, - * delete the newly created dnode. 
- */ - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - goto out; - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - if (flag & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - } else { - int aflags = (flag & FAPPEND) ? V_APPEND : 0; - - if (have_acl) - zfs_acl_ids_free(&acl_ids); - have_acl = B_FALSE; - - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. - */ - if (excl) { - error = SET_ERROR(EEXIST); - goto out; - } - /* - * Can't open a directory for writing. - */ - if (S_ISDIR(ZTOI(zp)->i_mode)) { - error = SET_ERROR(EISDIR); - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { - goto out; - } - - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); - - /* - * Truncate regular files if requested. 
- */ - if (S_ISREG(ZTOI(zp)->i_mode) && - (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { - /* we can't hold any locks when calling zfs_freesp() */ - if (dl) { - zfs_dirent_unlock(dl); - dl = NULL; - } - error = zfs_freesp(zp, 0, 0, mode, TRUE); - } - } -out: - - if (dl) - zfs_dirent_unlock(dl); - - if (error) { - if (zp) - iput(ZTOI(zp)); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - *ipp = ZTOI(zp); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* ARGSUSED */ -int -zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) -{ - znode_t *zp = NULL, *dzp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - objset_t *os; - dmu_tx_t *tx; - int error; - uid_t uid; - gid_t gid; - zfs_acl_ids_t acl_ids; - uint64_t projid = ZFS_DEFAULT_PROJID; - boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - gid = crgetgid(cr); - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - -top: - *ipp = NULL; - - /* - * Create a new file object and update the directory - * to reference it. 
- */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; - - if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) - projid = zfs_inherit_projid(dzp); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - /* Add to unlinked set */ - zp->z_unlinked = 1; - zfs_unlinked_add(zp, tx); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); -out: - - if (error) { - if (zp) - iput(ZTOI(zp)); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - *ipp = ZTOI(zp); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove an entry from a directory. - * - * IN: dip - inode of directory to remove entry from. - * name - name of entry to remove. - * cr - credentials of caller. - * flags - case flags. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dip - ctime|mtime - * ip - ctime (if nlink > 0) - */ - -uint64_t null_xattr = 0; - -/*ARGSUSED*/ -int -zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags) -{ - znode_t *zp, *dzp = ITOZ(dip); - znode_t *xzp; - struct inode *ip; - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - uint64_t acl_obj, xattr_obj; - uint64_t xattr_obj_unlinked = 0; - uint64_t obj = 0; - uint64_t links; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked, toobig = FALSE; - uint64_t txtype; - pathname_t *realnmp = NULL; - pathname_t realnm; - int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) { - zflg |= ZCILOOK; - pn_alloc(&realnm); - realnmp = &realnm; - } - -top: - xattr_obj = 0; - xzp = NULL; - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, realnmp))) { - if (realnmp) - pn_free(realnmp); - ZFS_EXIT(zfsvfs); - return (error); - } - - ip = ZTOI(zp); - - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { - goto out; - } - - /* - * Need to use rmdir for removing directories. - */ - if (S_ISDIR(ip->i_mode)) { - error = SET_ERROR(EPERM); - goto out; - } - - mutex_enter(&zp->z_lock); - may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped); - mutex_exit(&zp->z_lock); - - /* - * We may delete the znode now, or we may put it in the unlinked set; - * it depends on whether we're the last link, and on whether there are - * other holds on the inode. So we dmu_tx_hold() the right things to - * allow for either case. 
- */ - obj = zp->z_id; - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - if (may_delete_now) { - toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; - /* if the file is too big, only hold_free a token amount */ - dmu_tx_hold_free(tx, zp->z_id, 0, - (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); - } - - /* are there any extended attributes? */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT0(error); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - } - - mutex_enter(&zp->z_lock); - if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - mutex_exit(&zp->z_lock); - - /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - /* - * Mark this transaction as typically resulting in a net free of space - */ - dmu_tx_mark_netfree(tx); - - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - iput(ip); - if (xzp) - iput(ZTOI(xzp)); - goto top; - } - if (realnmp) - pn_free(realnmp); - dmu_tx_abort(tx); - iput(ip); - if (xzp) - iput(ZTOI(xzp)); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Remove the directory entry. - */ - error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); - - if (error) { - dmu_tx_commit(tx); - goto out; - } - - if (unlinked) { - /* - * Hold z_lock so that we can make sure that the ACL obj - * hasn't changed. Could have been deleted due to - * zfs_sa_upgrade(). 
- */ - mutex_enter(&zp->z_lock); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); - delete_now = may_delete_now && !toobig && - atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) && - xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == - acl_obj; - } - - if (delete_now) { - if (xattr_obj_unlinked) { - ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = 1; - clear_nlink(ZTOI(xzp)); - links = 0; - error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &links, sizeof (links), tx); - ASSERT3U(error, ==, 0); - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - - if (zp->z_is_sa) - error = sa_remove(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), tx); - else - error = sa_update(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), &null_xattr, - sizeof (uint64_t), tx); - ASSERT0(error); - } - /* - * Add to the unlinked set because a new reference could be - * taken concurrently resulting in a deferred destruction. - */ - zfs_unlinked_add(zp, tx); - mutex_exit(&zp->z_lock); - } else if (unlinked) { - mutex_exit(&zp->z_lock); - zfs_unlinked_add(zp, tx); - } - - txtype = TX_REMOVE; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, obj); - - dmu_tx_commit(tx); -out: - if (realnmp) - pn_free(realnmp); - - zfs_dirent_unlock(dl); - zfs_inode_update(dzp); - zfs_inode_update(zp); - - if (delete_now) - iput(ip); - else - zfs_iput_async(ip); - - if (xzp) { - zfs_inode_update(xzp); - zfs_iput_async(ZTOI(xzp)); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Create a new directory and insert it into dip using the name - * provided. Return a pointer to the inserted directory. - * - * IN: dip - inode of directory to add subdir to. - * dirname - name of new directory. - * vap - attributes of new directory. - * cr - credentials of caller. - * flags - case flags. 
- * vsecp - ACL to be set - * - * OUT: ipp - inode of created directory. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dip - ctime|mtime updated - * ipp - ctime|mtime|atime updated - */ -/*ARGSUSED*/ -int -zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - cred_t *cr, int flags, vsecattr_t *vsecp) -{ - znode_t *zp, *dzp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - zfs_dirlock_t *dl; - uint64_t txtype; - dmu_tx_t *tx; - int error; - int zf = ZNEW; - uid_t uid; - gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - boolean_t waited = B_FALSE; - - ASSERT(S_ISDIR(vap->va_mode)); - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - uid = crgetuid(cr); - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - if (dirname == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (zfsvfs->z_utf8 && u8_validate(dirname, - strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - vsecp, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - /* - * First make sure the new directory doesn't exist. - * - * Existence is checked first to make sure we don't return - * EACCES instead of EEXIST which can cause some applications - * to fail. 
- */ -top: - *ipp = NULL; - - if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, - NULL, NULL))) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - - /* - * Add a new entry to the directory. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create new node. - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - /* - * Now put new name in parent dir. 
- */ - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - goto out; - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - *ipp = ZTOI(zp); - - txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, - acl_ids.z_fuidp, vap); - -out: - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (error != 0) { - iput(ZTOI(zp)); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - } - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove a directory subdir entry. If the current working - * directory is the same as the subdir to be removed, the - * remove will fail. - * - * IN: dip - inode of directory to remove from. - * name - name of directory to be removed. - * cwd - inode of current working directory. - * cr - credentials of caller. - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dip - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, - int flags) -{ - znode_t *dzp = ITOZ(dip); - znode_t *zp; - struct inode *ip; - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. 
- */ - if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - ip = ZTOI(zp); - - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { - goto out; - } - - if (!S_ISDIR(ip->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - - if (ip == cwd) { - error = SET_ERROR(EINVAL); - goto out; - } - - /* - * Grab a lock on the directory to make sure that no one is - * trying to add (or lookup) entries while we are removing it. - */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. - */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - iput(ip); - goto top; - } - dmu_tx_abort(tx); - iput(ip); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_destroy(dl, zp, tx, zflg, NULL); - - if (error == 0) { - uint64_t txtype = TX_RMDIR; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); - } - - dmu_tx_commit(tx); - - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); -out: - zfs_dirent_unlock(dl); - - zfs_inode_update(dzp); - zfs_inode_update(zp); - iput(ip); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Read directory entries from the given directory cursor position and emit - * name and position for each entry. 
- * - * IN: ip - inode of directory to read. - * ctx - directory entry context. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - atime updated - * - * Note that the low 4 bits of the cookie returned by zap is always zero. - * This allows us to use the low range for "special" directory entries: - * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, - * we use the offset 2 for the '.zfs' directory. - */ -/* ARGSUSED */ -int -zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - zap_cursor_t zc; - zap_attribute_t zap; - int error; - uint8_t prefetch; - uint8_t type; - int done = 0; - uint64_t parent; - uint64_t offset; /* must be unsigned; checks for < 1 */ - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent))) != 0) - goto out; - - /* - * Quit if directory has been removed (posix) - */ - if (zp->z_unlinked) - goto out; - - error = 0; - os = zfsvfs->z_os; - offset = ctx->pos; - prefetch = zp->z_zn_prefetch; - - /* - * Initialize the iterator cursor. - */ - if (offset <= 3) { - /* - * Start iteration from the beginning of the directory. - */ - zap_cursor_init(&zc, os, zp->z_id); - } else { - /* - * The offset is a serialized cursor. - */ - zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } - - /* - * Transform to file-system independent format - */ - while (!done) { - uint64_t objnum; - /* - * Special case `.', `..', and `.zfs'. 
- */ - if (offset == 0) { - (void) strcpy(zap.za_name, "."); - zap.za_normalization_conflict = 0; - objnum = zp->z_id; - type = DT_DIR; - } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); - zap.za_normalization_conflict = 0; - objnum = parent; - type = DT_DIR; - } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); - zap.za_normalization_conflict = 0; - objnum = ZFSCTL_INO_ROOT; - type = DT_DIR; - } else { - /* - * Grab next entry. - */ - if ((error = zap_cursor_retrieve(&zc, &zap))) { - if (error == ENOENT) - break; - else - goto update; - } - - /* - * Allow multiple entries provided the first entry is - * the object id. Non-zpl consumers may safely make - * use of the additional space. - * - * XXX: This should be a feature flag for compatibility - */ - if (zap.za_integer_length != 8 || - zap.za_num_integers == 0) { - cmn_err(CE_WARN, "zap_readdir: bad directory " - "entry, obj = %lld, offset = %lld, " - "length = %d, num = %lld\n", - (u_longlong_t)zp->z_id, - (u_longlong_t)offset, - zap.za_integer_length, - (u_longlong_t)zap.za_num_integers); - error = SET_ERROR(ENXIO); - goto update; - } - - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); - type = ZFS_DIRENT_TYPE(zap.za_first_integer); - } - - done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), - objnum, type); - if (done) - break; - - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } - - /* - * Move to the next entry, fill in the previous offset. 
- */ - if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); - offset = zap_cursor_serialize(&zc); - } else { - offset += 1; - } - ctx->pos = offset; - } - zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ - -update: - zap_cursor_fini(&zc); - if (error == ENOENT) - error = 0; -out: - ZFS_EXIT(zfsvfs); - - return (error); -} - -ulong_t zfs_fsync_sync_cnt = 4; - -int -zfs_fsync(struct inode *ip, int syncflag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - tsd_set(zfs_fsyncer_key, NULL); - - return (0); -} - - -/* - * Get the requested file attributes and place them in the provided - * vattr structure. - * - * IN: ip - inode of file. - * vap - va_mask identifies requested attributes. - * If ATTR_XVATTR set, then optional attrs are requested - * flags - ATTR_NOACLCHECK (CIFS server context) - * cr - credentials of caller. - * - * OUT: vap - attribute values. - * - * RETURN: 0 (always succeeds) - */ -/* ARGSUSED */ -int -zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error = 0; - uint64_t links; - uint64_t atime[2], mtime[2], ctime[2]; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap = NULL; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - sa_bulk_attr_t bulk[3]; - int count = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - - if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. - */ - if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && - (vap->va_uid != crgetuid(cr))) { - if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, - skipaclchk, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * Return all attributes. It's cheaper to provide the answer - * than to determine whether we were asked the question. - */ - - mutex_enter(&zp->z_lock); - vap->va_type = vn_mode_to_vtype(zp->z_mode); - vap->va_mode = zp->z_mode; - vap->va_fsid = ZTOI(zp)->i_sb->s_dev; - vap->va_nodeid = zp->z_id; - if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) - links = ZTOI(zp)->i_nlink + 1; - else - links = ZTOI(zp)->i_nlink; - vap->va_nlink = MIN(links, ZFS_LINK_MAX); - vap->va_size = i_size_read(ip); - vap->va_rdev = ip->i_rdev; - vap->va_seq = ip->i_generation; - - /* - * Add in any requested optional attributes and the create time. - * Also set the corresponding bits in the returned attribute bitmap. 
- */ - if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - xoap->xoa_archive = - ((zp->z_pflags & ZFS_ARCHIVE) != 0); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - xoap->xoa_readonly = - ((zp->z_pflags & ZFS_READONLY) != 0); - XVA_SET_RTN(xvap, XAT_READONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - xoap->xoa_system = - ((zp->z_pflags & ZFS_SYSTEM) != 0); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - xoap->xoa_hidden = - ((zp->z_pflags & ZFS_HIDDEN) != 0); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - xoap->xoa_nounlink = - ((zp->z_pflags & ZFS_NOUNLINK) != 0); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - xoap->xoa_immutable = - ((zp->z_pflags & ZFS_IMMUTABLE) != 0); - XVA_SET_RTN(xvap, XAT_IMMUTABLE); - } - - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - xoap->xoa_appendonly = - ((zp->z_pflags & ZFS_APPENDONLY) != 0); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - xoap->xoa_nodump = - ((zp->z_pflags & ZFS_NODUMP) != 0); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - xoap->xoa_opaque = - ((zp->z_pflags & ZFS_OPAQUE) != 0); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - xoap->xoa_av_quarantined = - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - xoap->xoa_av_modified = - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - S_ISREG(ip->i_mode)) { - zfs_sa_get_scanstamp(zp, xvap); - } - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - uint64_t times[2]; - - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), - times, sizeof (times)); - 
ZFS_TIME_DECODE(&xoap->xoa_createtime, times); - XVA_SET_RTN(xvap, XAT_CREATETIME); - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); - XVA_SET_RTN(xvap, XAT_REPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_GEN)) { - xoap->xoa_generation = ip->i_generation; - XVA_SET_RTN(xvap, XAT_GEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { - xoap->xoa_offline = - ((zp->z_pflags & ZFS_OFFLINE) != 0); - XVA_SET_RTN(xvap, XAT_OFFLINE); - } - - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { - xoap->xoa_sparse = - ((zp->z_pflags & ZFS_SPARSE) != 0); - XVA_SET_RTN(xvap, XAT_SPARSE); - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { - xoap->xoa_projinherit = - ((zp->z_pflags & ZFS_PROJINHERIT) != 0); - XVA_SET_RTN(xvap, XAT_PROJINHERIT); - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { - xoap->xoa_projid = zp->z_projid; - XVA_SET_RTN(xvap, XAT_PROJID); - } - } - - ZFS_TIME_DECODE(&vap->va_atime, atime); - ZFS_TIME_DECODE(&vap->va_mtime, mtime); - ZFS_TIME_DECODE(&vap->va_ctime, ctime); - - mutex_exit(&zp->z_lock); - - sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); - - if (zp->z_blksz == 0) { - /* - * Block size hasn't been set; suggest maximal I/O transfers. - */ - vap->va_blksize = zfsvfs->z_max_blksz; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Get the basic file attributes and place them in the provided kstat - * structure. The inode is assumed to be the authoritative source - * for most of the attributes. However, the znode currently has the - * authoritative atime, blksize, and block count. - * - * IN: ip - inode of file. - * - * OUT: sp - kstat values. 
- * - * RETURN: 0 (always succeeds) - */ -/* ARGSUSED */ -int -zfs_getattr_fast(struct inode *ip, struct kstat *sp) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint32_t blksize; - u_longlong_t nblocks; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - mutex_enter(&zp->z_lock); - - generic_fillattr(ip, sp); - /* - * +1 link count for root inode with visible '.zfs' directory. - */ - if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) - if (sp->nlink < ZFS_LINK_MAX) - sp->nlink++; - - sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); - sp->blksize = blksize; - sp->blocks = nblocks; - - if (unlikely(zp->z_blksz == 0)) { - /* - * Block size hasn't been set; suggest maximal I/O transfers. - */ - sp->blksize = zfsvfs->z_max_blksz; - } - - mutex_exit(&zp->z_lock); - - /* - * Required to prevent NFS client from detecting different inode - * numbers of snapshot root dentry before and after snapshot mount. - */ - if (zfsvfs->z_issnap) { - if (ip->i_sb->s_root->d_inode == ip) - sp->ino = ZFSCTL_INO_SNAPDIRS - - dmu_objset_id(zfsvfs->z_os); - } - - ZFS_EXIT(zfsvfs); - - return (0); -} - -/* - * For the operation of changing file's user/group/project, we need to - * handle not only the main object that is assigned to the file directly, - * but also the ones that are used by the file via hidden xattr directory. - * - * Because the xattr directory may contains many EA entries, as to it may - * be impossible to change all of them via the transaction of changing the - * main object's user/group/project attributes. Then we have to change them - * via other multiple independent transactions one by one. It may be not good - * solution, but we have no better idea yet. 
- */ -static int -zfs_setattr_dir(znode_t *dzp) -{ - struct inode *dxip = ZTOI(dzp); - struct inode *xip = NULL; - zfsvfs_t *zfsvfs = ITOZSB(dxip); - objset_t *os = zfsvfs->z_os; - zap_cursor_t zc; - zap_attribute_t zap; - zfs_dirlock_t *dl; - znode_t *zp; - dmu_tx_t *tx = NULL; - uint64_t uid, gid; - sa_bulk_attr_t bulk[4]; - int count; - int err; - - zap_cursor_init(&zc, os, dzp->z_id); - while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { - count = 0; - if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { - err = ENXIO; - break; - } - - err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, - ZEXISTS, NULL, NULL); - if (err == ENOENT) - goto next; - if (err) - break; - - xip = ZTOI(zp); - if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && - KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && - zp->z_projid == dzp->z_projid) - goto next; - - tx = dmu_tx_create(os); - if (!(zp->z_pflags & ZFS_PROJID)) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - break; - - mutex_enter(&dzp->z_lock); - - if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { - xip->i_uid = dxip->i_uid; - uid = zfs_uid_read(dxip); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &uid, sizeof (uid)); - } - - if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { - xip->i_gid = dxip->i_gid; - gid = zfs_gid_read(dxip); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &gid, sizeof (gid)); - } - - if (zp->z_projid != dzp->z_projid) { - if (!(zp->z_pflags & ZFS_PROJID)) { - zp->z_pflags |= ZFS_PROJID; - SA_ADD_BULK_ATTR(bulk, count, - SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, - sizeof (zp->z_pflags)); - } - - zp->z_projid = dzp->z_projid; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), - NULL, &zp->z_projid, sizeof (zp->z_projid)); - } - - mutex_exit(&dzp->z_lock); - - if (likely(count > 0)) { - err = sa_bulk_update(zp->z_sa_hdl, 
bulk, count, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - tx = NULL; - if (err != 0 && err != ENOENT) - break; - -next: - if (xip) { - iput(xip); - xip = NULL; - zfs_dirent_unlock(dl); - } - zap_cursor_advance(&zc); - } - - if (tx) - dmu_tx_abort(tx); - if (xip) { - iput(xip); - zfs_dirent_unlock(dl); - } - zap_cursor_fini(&zc); - - return (err == ENOENT ? 0 : err); -} - -/* - * Set the file attributes to the values contained in the - * vattr structure. - * - * IN: ip - inode of file to be modified. - * vap - new attribute values. - * If ATTR_XVATTR set, then optional attrs are being set - * flags - ATTR_UTIME set if non-default time values provided. - * - ATTR_NOACLCHECK (CIFS context only). - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime updated, mtime updated if size changed. - */ -/* ARGSUSED */ -int -zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os = zfsvfs->z_os; - zilog_t *zilog; - dmu_tx_t *tx; - vattr_t oldva; - xvattr_t *tmpxvattr; - uint_t mask = vap->va_mask; - uint_t saved_mask = 0; - int trim_mask = 0; - uint64_t new_mode; - uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; - uint64_t xattr_obj; - uint64_t mtime[2], ctime[2], atime[2]; - uint64_t projid = ZFS_INVALID_PROJID; - znode_t *attrzp; - int need_policy = FALSE; - int err, err2 = 0; - zfs_fuid_info_t *fuidp = NULL; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap; - zfs_acl_t *aclp; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; - boolean_t handle_eadir = B_FALSE; - sa_bulk_attr_t *bulk, *xattr_bulk; - int count = 0, xattr_count = 0, bulks = 8; - - if (mask == 0) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * If this is a xvattr_t, then get a pointer to the structure of - * optional attributes. If this is NULL, then we have a vattr_t. - */ - xoap = xva_getxoptattr(xvap); - if (xoap != NULL && (mask & ATTR_XVATTR)) { - if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { - if (!dmu_objset_projectquota_enabled(os) || - (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTSUP)); - } - - projid = xoap->xoa_projid; - if (unlikely(projid == ZFS_INVALID_PROJID)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) - projid = ZFS_INVALID_PROJID; - else - need_policy = TRUE; - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && - (xoap->xoa_projinherit != - ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && - (!dmu_objset_projectquota_enabled(os) || - (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTSUP)); - } - } - - zilog = zfsvfs->z_log; - - /* - * Make sure that if we have ephemeral uid/gid or xvattr specified - * that file system is at proper version level - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || - (mask & ATTR_XVATTR))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); - xva_init(tmpxvattr); - - bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); - xattr_bulk = 
kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); - - /* - * Immutable files can only alter immutable bit and atime - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) && - ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || - ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { - err = SET_ERROR(EPERM); - goto out3; - } - - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } - - /* - * Verify timestamps doesn't overflow 32 bits. - * ZFS can handle large timestamps, but 32bit syscalls can't - * handle times greater than 2039. This check should be removed - * once large timestamps are fully supported. - */ - if (mask & (ATTR_ATIME | ATTR_MTIME)) { - if (((mask & ATTR_ATIME) && - TIMESPEC_OVERFLOW(&vap->va_atime)) || - ((mask & ATTR_MTIME) && - TIMESPEC_OVERFLOW(&vap->va_mtime))) { - err = SET_ERROR(EOVERFLOW); - goto out3; - } - } - -top: - attrzp = NULL; - aclp = NULL; - - /* Can this be moved to before the top label? */ - if (zfs_is_readonly(zfsvfs)) { - err = SET_ERROR(EROFS); - goto out3; - } - - /* - * First validate permissions - */ - - if (mask & ATTR_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); - if (err) - goto out3; - - /* - * XXX - Note, we are not providing any open - * mode flags here (like FNDELAY), so we may - * block if there are locks present... this - * should be addressed in openat(). - */ - /* XXX - would it be OK to generate a log record here? 
*/ - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - if (err) - goto out3; - } - - if (mask & (ATTR_ATIME|ATTR_MTIME) || - ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || - XVA_ISSET_REQ(xvap, XAT_READONLY) || - XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || - XVA_ISSET_REQ(xvap, XAT_OFFLINE) || - XVA_ISSET_REQ(xvap, XAT_SPARSE) || - XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { - need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); - } - - if (mask & (ATTR_UID|ATTR_GID)) { - int idmask = (mask & (ATTR_UID|ATTR_GID)); - int take_owner; - int take_group; - - /* - * NOTE: even if a new mode is being set, - * we may clear S_ISUID/S_ISGID bits. - */ - - if (!(mask & ATTR_MODE)) - vap->va_mode = zp->z_mode; - - /* - * Take ownership or chgrp to group we are a member of - */ - - take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & ATTR_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); - - /* - * If both ATTR_UID and ATTR_GID are set then take_owner and - * take_group must both be set in order to allow taking - * ownership. - * - * Otherwise, send the check through secpolicy_vnode_setattr() - * - */ - - if (((idmask == (ATTR_UID|ATTR_GID)) && - take_owner && take_group) || - ((idmask == ATTR_UID) && take_owner) || - ((idmask == ATTR_GID) && take_group)) { - if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { - /* - * Remove setuid/setgid for non-privileged users - */ - (void) secpolicy_setid_clear(vap, cr); - trim_mask = (mask & (ATTR_UID|ATTR_GID)); - } else { - need_policy = TRUE; - } - } else { - need_policy = TRUE; - } - } - - mutex_enter(&zp->z_lock); - oldva.va_mode = zp->z_mode; - zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); - if (mask & ATTR_XVATTR) { - /* - * Update xvattr mask to include only those attributes - * that are actually changing. 
- * - * the bits will be restored prior to actually setting - * the attributes so the caller thinks they were set. - */ - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - if (xoap->xoa_appendonly != - ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_APPENDONLY); - XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { - if (xoap->xoa_projinherit != - ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_PROJINHERIT); - XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - if (xoap->xoa_nounlink != - ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NOUNLINK); - XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - if (xoap->xoa_immutable != - ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_IMMUTABLE); - XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - if (xoap->xoa_nodump != - ((zp->z_pflags & ZFS_NODUMP) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NODUMP); - XVA_SET_REQ(tmpxvattr, XAT_NODUMP); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - if (xoap->xoa_av_modified != - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); - XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - if ((!S_ISREG(ip->i_mode) && - xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); - XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - mutex_exit(&zp->z_lock); - err = SET_ERROR(EPERM); - goto out3; - } - - if (need_policy == 
FALSE && - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || - XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; - } - } - - mutex_exit(&zp->z_lock); - - if (mask & ATTR_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(ip, vap, - &oldva, cr); - if (err) - goto out3; - - trim_mask |= ATTR_MODE; - } else { - need_policy = TRUE; - } - } - - if (need_policy) { - /* - * If trim_mask is set then take ownership - * has been granted or write_acl is present and user - * has the ability to modify mode. In that case remove - * UID|GID and or MODE from mask so that - * secpolicy_vnode_setattr() doesn't revoke it. - */ - - if (trim_mask) { - saved_mask = vap->va_mask; - vap->va_mask &= ~trim_mask; - } - err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); - if (err) - goto out3; - - if (trim_mask) - vap->va_mask |= saved_mask; - } - - /* - * secpolicy_vnode_setattr, or take ownership may have - * changed va_mask - */ - mask = vap->va_mask; - - if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { - handle_eadir = B_TRUE; - err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - - if (err == 0 && xattr_obj) { - err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); - if (err) - goto out2; - } - if (mask & ATTR_UID) { - new_kuid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && - zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, - new_kuid)) { - if (attrzp) - iput(ZTOI(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if (mask & ATTR_GID) { - new_kgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); - if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && - zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, - new_kgid)) { - if (attrzp) - iput(ZTOI(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if 
(projid != ZFS_INVALID_PROJID && - zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { - if (attrzp) - iput(ZTOI(attrzp)); - err = EDQUOT; - goto out2; - } - } - tx = dmu_tx_create(os); - - if (mask & ATTR_MODE) { - uint64_t pmode = zp->z_mode; - uint64_t acl_obj; - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - zfs_acl_chmod_setattr(zp, &aclp, new_mode); - - mutex_enter(&zp->z_lock); - if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { - /* - * Are we upgrading ACL from old V0 format - * to V1 format? - */ - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, - aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - mutex_exit(&zp->z_lock); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - } else { - if (((mask & ATTR_XVATTR) && - XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (projid != ZFS_INVALID_PROJID && - !(zp->z_pflags & ZFS_PROJID))) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - } - - if (attrzp) { - dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); - } - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - goto out; - - count = 0; - /* - * Set each attribute requested. - * We group settings according to the locks they need to acquire. - * - * Note: you cannot set ctime directly, although it will be - * updated as a side-effect of calling this function. 
- */ - - if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { - /* - * For the existed object that is upgraded from old system, - * its on-disk layout has no slot for the project ID attribute. - * But quota accounting logic needs to access related slots by - * offset directly. So we need to adjust old objects' layout - * to make the project ID to some unified and fixed offset. - */ - if (attrzp) - err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); - if (err == 0) - err = sa_add_projid(zp->z_sa_hdl, tx, projid); - - if (unlikely(err == EEXIST)) - err = 0; - else if (err != 0) - goto out; - else - projid = ZFS_INVALID_PROJID; - } - - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (attrzp) { - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_enter(&attrzp->z_acl_lock); - mutex_enter(&attrzp->z_lock); - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, - sizeof (attrzp->z_pflags)); - if (projid != ZFS_INVALID_PROJID) { - attrzp->z_projid = projid; - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, - sizeof (attrzp->z_projid)); - } - } - - if (mask & (ATTR_UID|ATTR_GID)) { - - if (mask & ATTR_UID) { - ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); - new_uid = zfs_uid_read(ZTOI(zp)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &new_uid, sizeof (new_uid)); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_UID(zfsvfs), NULL, &new_uid, - sizeof (new_uid)); - ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); - } - } - - if (mask & ATTR_GID) { - ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); - new_gid = zfs_gid_read(ZTOI(zp)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), - NULL, &new_gid, sizeof (new_gid)); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_GID(zfsvfs), 
NULL, &new_gid, - sizeof (new_gid)); - ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); - } - } - if (!(mask & ATTR_MODE)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), - NULL, &new_mode, sizeof (new_mode)); - new_mode = zp->z_mode; - } - err = zfs_acl_chown_setattr(zp); - ASSERT(err == 0); - if (attrzp) { - err = zfs_acl_chown_setattr(attrzp); - ASSERT(err == 0); - } - } - - if (mask & ATTR_MODE) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &new_mode, sizeof (new_mode)); - zp->z_mode = ZTOI(zp)->i_mode = new_mode; - ASSERT3P(aclp, !=, NULL); - err = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT0(err); - if (zp->z_acl_cached) - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = aclp; - aclp = NULL; - } - - if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { - zp->z_atime_dirty = 0; - ZFS_TIME_ENCODE(&ip->i_atime, atime); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &atime, sizeof (atime)); - } - - if (mask & (ATTR_MTIME | ATTR_SIZE)) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime, - ZTOI(zp)->i_sb->s_time_gran); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - } - - if (mask & (ATTR_CTIME | ATTR_SIZE)) { - ZFS_TIME_ENCODE(&vap->va_ctime, ctime); - ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime, - ZTOI(zp)->i_sb->s_time_gran); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - } - - if (projid != ZFS_INVALID_PROJID) { - zp->z_projid = projid; - SA_ADD_BULK_ATTR(bulk, count, - SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, - sizeof (zp->z_projid)); - } - - if (attrzp && mask) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_CTIME(zfsvfs), NULL, &ctime, - sizeof (ctime)); - } - - /* - * Do this after setting timestamps to prevent timestamp - * update from toggling bit - */ - - if (xoap && (mask & ATTR_XVATTR)) { - - /* - * restore trimmed off masks - * so that return masks can be 
set for caller. - */ - - if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { - XVA_SET_REQ(xvap, XAT_APPENDONLY); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { - XVA_SET_REQ(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { - XVA_SET_REQ(xvap, XAT_IMMUTABLE); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { - XVA_SET_REQ(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { - XVA_SET_REQ(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { - XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { - XVA_SET_REQ(xvap, XAT_PROJINHERIT); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - ASSERT(S_ISREG(ip->i_mode)); - - zfs_xvattr_set(zp, xvap, tx); - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - - mutex_exit(&zp->z_lock); - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_exit(&zp->z_acl_lock); - - if (attrzp) { - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_exit(&attrzp->z_acl_lock); - mutex_exit(&attrzp->z_lock); - } -out: - if (err == 0 && xattr_count > 0) { - err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, - xattr_count, tx); - ASSERT(err2 == 0); - } - - if (aclp) - zfs_acl_free(aclp); - - if (fuidp) { - zfs_fuid_info_free(fuidp); - fuidp = NULL; - } - - if (err) { - dmu_tx_abort(tx); - if (attrzp) - iput(ZTOI(attrzp)); - if (err == ERESTART) - goto top; - } else { - if (count > 0) - err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - dmu_tx_commit(tx); - if (attrzp) { - if (err2 == 0 && handle_eadir) - err2 = zfs_setattr_dir(attrzp); - iput(ZTOI(attrzp)); - } - zfs_inode_update(zp); - } - -out2: - if (os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - -out3: - kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); - kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); - kmem_free(tmpxvattr, sizeof (xvattr_t)); - ZFS_EXIT(zfsvfs); - return 
(err); -} - -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - -/* - * Drop locks and release vnodes that were held by zfs_rename_lock(). - */ static void -zfs_rename_unlock(zfs_zlock_t **zlpp) +zfs_get_done(zgd_t *zgd, int error) { - zfs_zlock_t *zl; + znode_t *zp = zgd->zgd_private; - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - zfs_iput_async(ZTOI(zl->zl_znode)); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); - } + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + + kmem_free(zgd, sizeof (zgd_t)); } -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. - * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = ZTOZSB(zp)->z_root; - uint64_t oidp = zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; - - /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. - */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. - * Note that if we are a WRITER, we don't have any - * parent_locks held yet. 
- */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); - } - } - - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; - - if (oidp == szp->z_id) /* We're a descendant of szp */ - return (SET_ERROR(EINVAL)); - - if (oidp == rootid) /* We've hit the top */ - return (0); - - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(ZTOZSB(zp), oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), - &oidp, sizeof (oidp)); - rwlp = &zp->z_parent_lock; - rw = RW_READER; - - } while (zp->z_id != sdzp->z_id); - - return (0); -} - -/* - * Move an entry from the provided source directory to the target - * directory. Change the entry name as indicated. - * - * IN: sdip - Source directory containing the "old entry". - * snm - Old entry name. - * tdip - Target directory to contain the "new entry". - * tnm - New entry name. - * cr - credentials of caller. - * flags - case flags - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * sdip,tdip - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - cred_t *cr, int flags) -{ - znode_t *tdzp, *szp, *tzp; - znode_t *sdzp = ITOZ(sdip); - zfsvfs_t *zfsvfs = ITOZSB(sdip); - zilog_t *zilog; - zfs_dirlock_t *sdl, *tdl; - dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr; - int error = 0; - int zflg = 0; - boolean_t waited = B_FALSE; - - if (snm == NULL || tnm == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); - zilog = zfsvfs->z_log; - - tdzp = ITOZ(tdip); - ZFS_VERIFY_ZP(tdzp); - - /* - * We check i_sb because snapshots and the ctldir must have different - * super blocks. - */ - if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - if (zfsvfs->z_utf8 && u8_validate(tnm, - strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - -top: - szp = NULL; - tzp = NULL; - zl = NULL; - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. - */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - /* - * First compare the two name arguments without - * considering any case folding. 
- */ - int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); - - cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); - ASSERT(error == 0 || !zfsvfs->z_utf8); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." - */ - ZFS_EXIT(zfsvfs); - return (0); - } - /* - * If the file system is case-folding, then we may - * have some more checking to do. A case-folding file - * system is either supporting mixed case sensitivity - * access or is completely case-insensitive. Note - * that the file system is always case preserving. - * - * In mixed sensitivity mode case sensitive behavior - * is the default. FIGNORECASE must be used to - * explicitly request case insensitive behavior. - * - * If the source and target names provided differ only - * by case (e.g., a request to rename 'tim' to 'Tim'), - * we will treat this as a special case in the - * case-insensitive mode: as long as the source name - * is an exact match, we will allow this to proceed as - * a name-change request. - */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - (zfsvfs->z_case == ZFS_CASE_MIXED && - flags & FIGNORECASE)) && - u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, - &error) == 0) { - /* - * case preserving rename request, require exact - * name matches - */ - zflg |= ZCIEXACT; - zflg &= ~ZCILOOK; - } - } - - /* - * If the source and destination directories are the same, we should - * grab the z_name_lock of that directory only once. 
- */ - if (sdzp == tdzp) { - zflg |= ZHAVELOCK; - rw_enter(&sdzp->z_name_lock, RW_READER); - } - - if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, - ZEXISTS | zflg, NULL, NULL); - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); - } else { - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, zflg, NULL, NULL); - serr = zfs_dirent_lock(&sdl, - sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, - NULL, NULL); - } - - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - iput(ZTOI(tzp)); - } - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(snm, "..") == 0) - serr = EINVAL; - ZFS_EXIT(zfsvfs); - return (serr); - } - if (terr) { - zfs_dirent_unlock(sdl); - iput(ZTOI(szp)); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(tnm, "..") == 0) - terr = EINVAL; - ZFS_EXIT(zfsvfs); - return (terr); - } - - /* - * If we are using project inheritance, means if the directory has - * ZFS_PROJINHERIT set, then its descendant directories will inherit - * not only the project ID, but also the ZFS_PROJINHERIT flag. Under - * such case, we only allow renames into our tree when the project - * IDs are the same. - */ - if (tdzp->z_pflags & ZFS_PROJINHERIT && - tdzp->z_projid != szp->z_projid) { - error = SET_ERROR(EXDEV); - goto out; - } - - /* - * Must have write access at the source to remove the old entry - * and write access at the target to create the new entry. - * Note that if target and source are the same, this can be - * done in a single check. - */ - - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) - goto out; - - if (S_ISDIR(ZTOI(szp)->i_mode)) { - /* - * Check to make sure rename is valid. - * Can't do a move like this: /usr/a/b to /usr/a/b/c/d - */ - if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) - goto out; - } - - /* - * Does target exist? - */ - if (tzp) { - /* - * Source and target must be the same type. 
- */ - if (S_ISDIR(ZTOI(szp)->i_mode)) { - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - } else { - if (S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(EISDIR); - goto out; - } - } - /* - * POSIX dictates that when the source and target - * entries refer to the same file object, rename - * must do nothing and exit without error. - */ - if (szp->z_id == tzp->z_id) { - error = 0; - goto out; - } - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) { - dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tdzp); - } - if (tzp) { - dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tzp); - } - - zfs_sa_upgrade_txholds(tx, szp); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - iput(ZTOI(szp)); - if (tzp) - iput(ZTOI(tzp)); - goto top; - } - dmu_tx_abort(tx); - iput(ZTOI(szp)); - if (tzp) - iput(ZTOI(tzp)); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); - - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - if (tdzp->z_pflags & ZFS_PROJINHERIT) - szp->z_pflags |= ZFS_PROJINHERIT; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); - ASSERT0(error); - - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. - */ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } else { - /* - * If we had removed the existing target, subsequent - * call to zfs_link_create() to add back the same entry - * but, the new dnode (szp) should not fail. 
- */ - ASSERT(tzp == NULL); - } - } - - dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - zfs_inode_update(sdzp); - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (sdzp != tdzp) - zfs_inode_update(tdzp); - - zfs_inode_update(szp); - iput(ZTOI(szp)); - if (tzp) { - zfs_inode_update(tzp); - iput(ZTOI(tzp)); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert the indicated symbolic reference entry into the directory. - * - * IN: dip - Directory to contain new symbolic link. - * name - Name of directory entry in dip. - * vap - Attributes of new entry. - * link - Name for new symlink entry. - * cr - credentials of caller. - * flags - case flags - * - * OUT: ipp - Inode for new symbolic link. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dip - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - struct inode **ipp, cred_t *cr, int flags) -{ - znode_t *zp, *dzp = ITOZ(dip); - zfs_dirlock_t *dl; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - uint64_t len = strlen(link); - int error; - int zflg = ZNEW; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - uint64_t txtype = TX_SYMLINK; - boolean_t waited = B_FALSE; - - ASSERT(S_ISLNK(vap->va_mode)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - - if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENAMETOOLONG)); - } - - if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } -top: - *ipp = NULL; 
- - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); - if (error) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - tx = dmu_tx_create(zfsvfs->z_os); - fuid_dirtied = zfsvfs->z_fuid_dirty; - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE + len); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create a new object for the symlink. - * for version 4 ZPL datsets the symlink will be an SA attribute - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - mutex_enter(&zp->z_lock); - if (zp->z_is_sa) - error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), - link, len, tx); - else - zfs_sa_symlink(zp, link, len, tx); - mutex_exit(&zp->z_lock); - - zp->z_size = len; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx); - /* - * Insert the new object into the directory. 
- */ - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - } else { - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - - zfs_inode_update(dzp); - zfs_inode_update(zp); - } - - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (error == 0) { - *ipp = ZTOI(zp); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - } else { - iput(ZTOI(zp)); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Return, in the buffer contained in the provided uio structure, - * the symbolic path referred to by ip. - * - * IN: ip - inode of symbolic link - * uio - structure to contain the link path. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - atime updated - */ -/* ARGSUSED */ -int -zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - mutex_enter(&zp->z_lock); - if (zp->z_is_sa) - error = sa_lookup_uio(zp->z_sa_hdl, - SA_ZPL_SYMLINK(zfsvfs), uio); - else - error = zfs_sa_readlink(zp, uio); - mutex_exit(&zp->z_lock); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert a new entry into directory tdip referencing sip. - * - * IN: tdip - Directory to contain new entry. - * sip - inode of new entry. - * name - name of new entry. - * cr - credentials of caller. - * flags - case flags. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * tdip - ctime|mtime updated - * sip - ctime updated - */ -/* ARGSUSED */ -int -zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr, - int flags) -{ - znode_t *dzp = ITOZ(tdip); - znode_t *tzp, *szp; - zfsvfs_t *zfsvfs = ITOZSB(tdip); - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - int zf = ZNEW; - uint64_t parent; - uid_t owner; - boolean_t waited = B_FALSE; - boolean_t is_tmpfile = 0; - uint64_t txg; -#ifdef HAVE_TMPFILE - is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); -#endif - ASSERT(S_ISDIR(tdip->i_mode)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. - */ - if (S_ISDIR(sip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - szp = ITOZ(sip); - ZFS_VERIFY_ZP(szp); - - /* - * If we are using project inheritance, means if the directory has - * ZFS_PROJINHERIT set, then its descendant directories will inherit - * not only the project ID, but also the ZFS_PROJINHERIT flag. Under - * such case, we only allow hard link creation in our tree when the - * project IDs are the same. - */ - if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - /* - * We check i_sb because snapshots and the ctldir must have different - * super blocks. 
- */ - if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - /* Prevent links to .zfs/shares files */ - - if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (zfsvfs->z_utf8 && u8_validate(name, - strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. - */ - if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), - cr, ZFS_OWNER); - if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - -top: - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (is_tmpfile) - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - zfs_sa_upgrade_txholds(tx, szp); - zfs_sa_upgrade_txholds(tx, dzp); - error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - /* unmark z_unlinked so zfs_link_create will not reject */ - if (is_tmpfile) - szp->z_unlinked = 0; - error = zfs_link_create(dl, szp, tx, 0); - - if (error == 0) { - uint64_t txtype = TX_LINK; - /* - * tmpfile is created to be in z_unlinkedobj, so remove it. - * Also, we don't log in ZIL, be cause all previous file - * operation on the tmpfile are ignored by ZIL. Instead we - * always wait for txg to sync to make sure all previous - * operation are sync safe. - */ - if (is_tmpfile) { - VERIFY(zap_remove_int(zfsvfs->z_os, - zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); - } else { - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_link(zilog, tx, txtype, dzp, szp, name); - } - } else if (is_tmpfile) { - /* restore z_unlinked since when linking failed */ - szp->z_unlinked = 1; - } - txg = dmu_tx_get_txg(tx); - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (is_tmpfile) - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); - - zfs_inode_update(dzp); - zfs_inode_update(szp); - ZFS_EXIT(zfsvfs); - return (error); -} - -static void -zfs_putpage_commit_cb(void *arg) -{ - struct page *pp = arg; - - ClearPageError(pp); - end_page_writeback(pp); -} - -/* - * Push a page out to disk, once the page is on stable storage the - * registered commit callback will be run as notification of completion. - * - * IN: ip - page mapped for inode. 
- * pp - page to push (page is locked) - * wbc - writeback control data - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime|mtime updated - */ -/* ARGSUSED */ -int -zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - loff_t offset; - loff_t pgoff; - unsigned int pglen; - dmu_tx_t *tx; - caddr_t va; - int err = 0; - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int cnt = 0; - struct address_space *mapping; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - ASSERT(PageLocked(pp)); - - pgoff = page_offset(pp); /* Page byte-offset in file */ - offset = i_size_read(ip); /* File length in bytes */ - pglen = MIN(PAGE_SIZE, /* Page length in bytes */ - P2ROUNDUP(offset, PAGE_SIZE)-pgoff); - - /* Page is beyond end of file */ - if (pgoff >= offset) { - unlock_page(pp); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Truncate page length to end of file */ - if (pgoff + pglen > offset) - pglen = offset - pgoff; - -#if 0 - /* - * FIXME: Allow mmap writes past its quota. The correct fix - * is to register a page_mkwrite() handler to count the page - * against its quota when it is about to be dirtied. - */ - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, - KUID_TO_SUID(ip->i_uid)) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, - KGID_TO_SGID(ip->i_gid)) || - (zp->z_projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - zp->z_projid))) { - err = EDQUOT; - } -#endif - - /* - * The ordering here is critical and must adhere to the following - * rules in order to avoid deadlocking in either zfs_read() or - * zfs_free_range() due to a lock inversion. - * - * 1) The page must be unlocked prior to acquiring the range lock. - * This is critical because zfs_read() calls find_lock_page() - * which may block on the page lock while holding the range lock. 
- * - * 2) Before setting or clearing write back on a page the range lock - * must be held in order to prevent a lock inversion with the - * zfs_free_range() function. - * - * This presents a problem because upon entering this function the - * page lock is already held. To safely acquire the range lock the - * page lock must be dropped. This creates a window where another - * process could truncate, invalidate, dirty, or write out the page. - * - * Therefore, after successfully reacquiring the range and page locks - * the current page state is checked. In the common case everything - * will be as is expected and it can be written out. However, if - * the page state has changed it must be handled accordingly. - */ - mapping = pp->mapping; - redirty_page_for_writepage(wbc, pp); - unlock_page(pp); - - locked_range_t *lr = rangelock_enter(&zp->z_rangelock, - pgoff, pglen, RL_WRITER); - lock_page(pp); - - /* Page mapping changed or it was no longer dirty, we're done */ - if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { - unlock_page(pp); - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Another process started write block if required */ - if (PageWriteback(pp)) { - unlock_page(pp); - rangelock_exit(lr); - - if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(pp)) - wait_on_page_bit(pp, PG_writeback); - } - - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Clear the dirty flag the required locks are held */ - if (!clear_page_dirty_for_io(pp)) { - unlock_page(pp); - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * Counterpart for redirty_page_for_writepage() above. This page - * was in fact not skipped and should not be counted as if it were. 
- */ - wbc->pages_skipped--; - set_page_writeback(pp); - unlock_page(pp); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_NOWAIT); - if (err != 0) { - if (err == ERESTART) - dmu_tx_wait(tx); - - dmu_tx_abort(tx); - __set_page_dirty_nobuffers(pp); - ClearPageError(pp); - end_page_writeback(pp); - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (err); - } - - va = kmap(pp); - ASSERT3U(pglen, <=, PAGE_SIZE); - dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); - kunmap(pp); - - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* Preserve the mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); - zp->z_atime_dirty = 0; - zp->z_seq++; - - err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, - zfs_putpage_commit_cb, pp); - dmu_tx_commit(tx); - - rangelock_exit(lr); - - if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Note that this is rarely called under writepages(), because - * writepages() normally handles the entire commit for - * performance reasons. - */ - zil_commit(zfsvfs->z_log, zp->z_id); - } - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Update the system attributes when the inode has been dirtied. For the - * moment we only update the mode, atime, mtime, and ctime. 
- */ -int -zfs_dirty_inode(struct inode *ip, int flags) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - dmu_tx_t *tx; - uint64_t mode, atime[2], mtime[2], ctime[2]; - sa_bulk_attr_t bulk[4]; - int error = 0; - int cnt = 0; - - if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - -#ifdef I_DIRTY_TIME - /* - * This is the lazytime semantic indroduced in Linux 4.0 - * This flag will only be called from update_time when lazytime is set. - * (Note, I_DIRTY_SYNC will also set if not lazytime) - * Fortunately mtime and ctime are managed within ZFS itself, so we - * only need to dirty atime. - */ - if (flags == I_DIRTY_TIME) { - zp->z_atime_dirty = 1; - goto out; - } -#endif - - tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto out; - } - - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = 0; - - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - - /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); - mode = ip->i_mode; - - zp->z_mode = mode; - - error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - mutex_exit(&zp->z_lock); - - dmu_tx_commit(tx); -out: - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -void -zfs_inactive(struct inode *ip) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint64_t atime[2]; - int error; - int need_unlock = 0; - - /* Only read lock if we haven't already write locked, e.g. 
rollback */ - if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { - need_unlock = 1; - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - } - if (zp->z_sa_hdl == NULL) { - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); - return; - } - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - ZFS_TIME_ENCODE(&ip->i_atime, atime); - mutex_enter(&zp->z_lock); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&atime, sizeof (atime), tx); - zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } - - zfs_zinactive(zp); - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); -} - -/* - * Bounds-check the seek operation. - * - * IN: ip - inode seeking within - * ooff - old file offset - * noffp - pointer to new file offset - * - * RETURN: 0 if success - * EINVAL if new offset invalid - */ -/* ARGSUSED */ -int -zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp) -{ - if (S_ISDIR(ip->i_mode)) - return (0); - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); -} - -/* - * Fill pages with data from the disk. - */ -static int -zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - struct page *cur_pp; - u_offset_t io_off, total; - size_t io_len; - loff_t i_size; - unsigned page_idx; - int err; - - os = zfsvfs->z_os; - io_len = nr_pages << PAGE_SHIFT; - i_size = i_size_read(ip); - io_off = page_offset(pl[0]); - - if (io_off + io_len > i_size) - io_len = i_size - io_off; - - /* - * Iterate over list of pages and read each page individually. 
- */ - page_idx = 0; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; - - cur_pp = pl[page_idx++]; - va = kmap(cur_pp); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - kunmap(cur_pp); - if (err) { - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } - } - - return (0); -} - -/* - * Uses zfs_fillpage to read data from the file and fill the pages. - * - * IN: ip - inode of file to get data from. - * pl - list of pages to read - * nr_pages - number of pages to read - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -int -zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int err; - - if (pl == NULL) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - err = zfs_fillpage(ip, pl, nr_pages); - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Check ZFS specific permissions to memory map a section of a file. 
- * - * IN: ip - inode of the file to mmap - * off - file offset - * addrp - start address in memory region - * len - length of memory region - * vm_flags- address flags - * - * RETURN: 0 if success - * error code if failure - */ -/*ARGSUSED*/ -int -zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - unsigned long vm_flags) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((vm_flags & VM_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((vm_flags & (VM_READ | VM_EXEC)) && - (zp->z_pflags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENXIO)); - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * convoff - converts the given data (start, whence) to the - * given whence. - */ -int -convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) -{ - vattr_t vap; - int error; - - if ((lckdat->l_whence == SEEK_END) || (whence == SEEK_END)) { - if ((error = zfs_getattr(ip, &vap, 0, CRED()))) - return (error); - } - - switch (lckdat->l_whence) { - case SEEK_CUR: - lckdat->l_start += offset; - break; - case SEEK_END: - lckdat->l_start += vap.va_size; - /* FALLTHRU */ - case SEEK_SET: - break; - default: - return (SET_ERROR(EINVAL)); - } - - if (lckdat->l_start < 0) - return (SET_ERROR(EINVAL)); - - switch (whence) { - case SEEK_CUR: - lckdat->l_start -= offset; - break; - case SEEK_END: - lckdat->l_start -= vap.va_size; - /* FALLTHRU */ - case SEEK_SET: - break; - default: - return (SET_ERROR(EINVAL)); - } - - lckdat->l_whence = (short)whence; - return (0); -} - -/* - * Free or allocate space in a file. Currently, this function only - * supports the `F_FREESP' command. 
However, this command is somewhat - * misnamed, as its functionality includes the ability to allocate as - * well as free space. - * - * IN: ip - inode of file to free data in. - * cmd - action to take (only F_FREESP supported). - * bfp - section of file to free/alloc. - * flag - current file open mode flags. - * offset - current file offset. - * cr - credentials of caller. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * ip - ctime|mtime updated - */ -/* ARGSUSED */ -int -zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, - offset_t offset, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint64_t off, len; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - if ((error = convoff(ip, bfp, SEEK_SET, offset))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Permissions aren't checked on Solaris because on this OS - * zfs_space() can only be called with an opened file handle. - * On Linux we can get here through truncate_range() which - * operates directly on inodes, so we need to check access rights. 
- */ - if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - off = bfp->l_start; - len = bfp->l_len; /* 0 means from off to end of file */ - - error = zfs_freesp(zp, off, len, flag, TRUE); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -int -zfs_fid(struct inode *ip, fid_t *fidp) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint32_t gen; - uint64_t gen64; - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int size, i, error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), - &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - gen = (uint32_t)gen64; - - size = SHORT_FID_LEN; - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = size; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* Must have a non-zero generation number to distinguish from .zfs */ - if (gen == 0) - gen = 1; - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -int -zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -#ifdef HAVE_UIO_ZEROCOPY -/* - * Tunable, both must be a power of 2. - * - * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf - * zcr_blksz_max: if set to less than the file block size, allow loaning out of - * an arcbuf for a partial block read - */ -int zcr_blksz_min = (1 << 10); /* 1K */ -int zcr_blksz_max = (1 << 17); /* 128K */ - -/*ARGSUSED*/ -static int -zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int max_blksz = zfsvfs->z_max_blksz; - uio_t *uio = &xuio->xu_uio; - ssize_t size = uio->uio_resid; - offset_t offset = uio->uio_loffset; - int blksz; - int fullblk, i; - arc_buf_t *abuf; - ssize_t maxsize; - int preamble, postamble; - - if (xuio->xu_type != UIOTYPE_ZEROCOPY) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - switch (ioflag) { - case UIO_WRITE: - /* - * Loan out an arc_buf for write if write size is bigger than - * max_blksz, and the file's block size is also max_blksz. - */ - blksz = max_blksz; - if (size < blksz || zp->z_blksz != blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - /* - * Caller requests buffers for write before knowing where the - * write offset might be (e.g. NFS TCP write). - */ - if (offset == -1) { - preamble = 0; - } else { - preamble = P2PHASE(offset, blksz); - if (preamble) { - preamble = blksz - preamble; - size -= preamble; - } - } - - postamble = P2PHASE(size, blksz); - size -= postamble; - - fullblk = size / blksz; - (void) dmu_xuio_init(xuio, - (preamble != 0) + fullblk + (postamble != 0)); - - /* - * Have to fix iov base/len for partial buffers. They - * currently represent full arc_buf's. 
- */ - if (preamble) { - /* data begins in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, - blksz - preamble, preamble); - } - - for (i = 0; i < fullblk; i++) { - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, blksz); - } - - if (postamble) { - /* data ends in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, postamble); - } - break; - case UIO_READ: - /* - * Loan out an arc_buf for read if the read size is larger than - * the current file block size. Block alignment is not - * considered. Partial arc_buf will be loaned out for read. - */ - blksz = zp->z_blksz; - if (blksz < zcr_blksz_min) - blksz = zcr_blksz_min; - if (blksz > zcr_blksz_max) - blksz = zcr_blksz_max; - /* avoid potential complexity of dealing with it */ - if (blksz > max_blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - maxsize = zp->z_size - uio->uio_loffset; - if (size > maxsize) - size = maxsize; - - if (size < blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - break; - default: - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - uio->uio_extflg = UIO_XUIO; - XUIO_XUZC_RW(xuio) = ioflag; - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -static int -zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) -{ - int i; - arc_buf_t *abuf; - int ioflag = XUIO_XUZC_RW(xuio); - - ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); - - i = dmu_xuio_cnt(xuio); - while (i-- > 0) { - abuf = dmu_xuio_arcbuf(xuio, i); - /* - * if abuf == NULL, it must be a write buffer - * that has been returned in zfs_write(). 
- */ - if (abuf) - dmu_return_arcbuf(abuf); - ASSERT(abuf || ioflag == UIO_WRITE); - } - - dmu_xuio_fini(xuio); - return (0); -} -#endif /* HAVE_UIO_ZEROCOPY */ - -#if defined(_KERNEL) -EXPORT_SYMBOL(zfs_open); -EXPORT_SYMBOL(zfs_close); +EXPORT_SYMBOL(zfs_access); +EXPORT_SYMBOL(zfs_fsync); +EXPORT_SYMBOL(zfs_holey); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); -EXPORT_SYMBOL(zfs_access); -EXPORT_SYMBOL(zfs_lookup); -EXPORT_SYMBOL(zfs_create); -EXPORT_SYMBOL(zfs_tmpfile); -EXPORT_SYMBOL(zfs_remove); -EXPORT_SYMBOL(zfs_mkdir); -EXPORT_SYMBOL(zfs_rmdir); -EXPORT_SYMBOL(zfs_readdir); -EXPORT_SYMBOL(zfs_fsync); -EXPORT_SYMBOL(zfs_getattr); -EXPORT_SYMBOL(zfs_getattr_fast); -EXPORT_SYMBOL(zfs_setattr); -EXPORT_SYMBOL(zfs_rename); -EXPORT_SYMBOL(zfs_symlink); -EXPORT_SYMBOL(zfs_readlink); -EXPORT_SYMBOL(zfs_link); -EXPORT_SYMBOL(zfs_inactive); -EXPORT_SYMBOL(zfs_space); -EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); -EXPORT_SYMBOL(zfs_getpage); -EXPORT_SYMBOL(zfs_putpage); -EXPORT_SYMBOL(zfs_dirty_inode); -EXPORT_SYMBOL(zfs_map); -/* CSTYLED */ -module_param(zfs_delete_blocks, ulong, 0644); -MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); -module_param(zfs_read_chunk_size, long, 0644); -MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); -#endif +ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, + "Bytes to read per chunk"); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ff14a98b6b..640e805d09 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include /* @@ -58,7 +58,7 @@ * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first - * instantiated (e.g. if the dataset is a normal fileystem, when it is + * instantiated (e.g. if the dataset is a normal filesystem, when it is * first mounted). 
* * As hinted at above, there is one ZIL per dataset (both the in-memory @@ -135,8 +135,6 @@ unsigned long zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; -static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); - #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) @@ -146,11 +144,11 @@ zil_bp_compare(const void *x1, const void *x2) const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); - return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); + return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void @@ -207,8 +205,10 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; - zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); - zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); + (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0], + sizeof (zc->zc_word[ZIL_ZC_GUID_0])); + (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1], + sizeof (zc->zc_word[ZIL_ZC_GUID_1])); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } @@ -434,7 +434,8 @@ done: /* ARGSUSED */ static int -zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, + uint64_t first_txg) { ASSERT(!BP_IS_HOLE(bp)); @@ -456,13 +457,15 @@ zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) /* ARGSUSED */ static int -zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) { return (0); } static int -zil_claim_log_block(zilog_t *zilog, blkptr_t 
*bp, void *tx, uint64_t first_txg) +zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, + uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. @@ -478,7 +481,8 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) } static int -zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; @@ -505,7 +509,8 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) /* ARGSUSED */ static int -zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) +zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, + uint64_t claim_txg) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); @@ -513,7 +518,8 @@ zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) } static int -zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) +zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; @@ -535,7 +541,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - return (AVL_CMP(v1, v2)); + return (TREE_CMP(v1, v2)); } static lwb_t * @@ -604,7 +610,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ -void +static void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; @@ -630,7 +636,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. 
*/ -boolean_t +static boolean_t __maybe_unused zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; @@ -644,7 +650,7 @@ zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ -boolean_t +static boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; @@ -1172,6 +1178,20 @@ zil_lwb_flush_vdevs_done(zio_t *zio) ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; + /* + * We expect any ZIO errors from child ZIOs to have been + * propagated "up" to this specific LWB's root ZIO, in + * order for this error handling to work correctly. This + * includes ZIO errors from either this LWB's write or + * flush, as well as any errors from other dependent LWBs + * (e.g. a root LWB ZIO that might be a child of this LWB). + * + * With that said, it's important to note that LWB flush + * errors are not propagated up to the LWB root ZIO. + * This is incorrect behavior, and results in VDEV flush + * errors not being handled correctly here. See the + * comment above the call to "zio_flush" for details. + */ zcw->zcw_zio_error = zio->io_error; @@ -1226,7 +1246,7 @@ zil_lwb_write_done(zio_t *zio) ASSERT(!BP_IS_HOLE(zio->io_bp)); ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_put(zio->io_abd); + abd_free(zio->io_abd); mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); @@ -1245,6 +1265,12 @@ zil_lwb_write_done(zio_t *zio) * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. + * + * Additionally, we don't perform any further error handling at + * this point (e.g. setting "zcw_zio_error" appropriately), as + * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, + * we expect any error seen here, to have been propagated to + * that function). 
*/ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) @@ -1275,8 +1301,17 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) + if (vd != NULL) { + /* + * The "ZIO_FLAG_DONT_PROPAGATE" is currently + * always used within "zio_flush". This means, + * any errors when flushing the vdev(s), will + * (unfortunately) not be handled correctly, + * since these "zio_flush" errors will not be + * propagated up to "zil_lwb_flush_vdevs_done". + */ zio_flush(lwb->lwb_root_zio, vd); + } kmem_free(zv, sizeof (*zv)); } } @@ -1393,8 +1428,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_FASTWRITE, &zb); + prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); ASSERT3P(lwb->lwb_write_zio, !=, NULL); lwb->lwb_state = LWB_STATE_OPENED; @@ -1416,13 +1450,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ -uint64_t zil_block_buckets[] = { - 4096, /* non TX_WRITE */ - 8192+4096, /* data base */ - 32*1024 + 4096, /* NFS writes */ - UINT64_MAX +struct { + uint64_t limit; + uint64_t blksz; +} zil_block_buckets[] = { + { 4096, 4096 }, /* non TX_WRITE */ + { 8192 + 4096, 8192 + 4096 }, /* database */ + { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ + { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ + { 131072, 131072 }, /* < 128KB writes */ + { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ + { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; +/* + * Maximum block size used by the ZIL. This is picked up when the ZIL is + * initialized. 
Otherwise this should not be used directly; see + * zl_max_block_size instead. + */ +int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; + /* * Start a log block write and advance to the next log block. * Calls are serialized. @@ -1497,11 +1544,9 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) * pool log space. */ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i]; i++) + for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; - zil_blksz = zil_block_buckets[i]; - if (zil_blksz == UINT64_MAX) - zil_blksz = SPA_OLD_MAXBLOCKSIZE; + zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); @@ -1562,13 +1607,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) return (nlwb); } +/* + * Maximum amount of write data that can be put into single log block. + */ +uint64_t +zil_max_log_data(zilog_t *zilog) +{ + return (zilog->zl_max_block_size - + sizeof (zil_chain_t) - sizeof (lr_write_t)); +} + +/* + * Maximum amount of log space we agree to waste to reduce number of + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + */ +static inline uint64_t +zil_max_waste_space(zilog_t *zilog) +{ + return (zil_max_log_data(zilog) / 8); +} + +/* + * Maximum amount of write data for WR_COPIED. For correctness, consumers + * must fall back to WR_NEED_COPY if we can't fit the entire record into one + * maximum sized log block, because each WR_COPIED record must fit in a + * single log block. For space efficiency, we want to fit two records into a + * max-sized log block. 
+ */ +uint64_t +zil_max_copied_data(zilog_t *zilog) +{ + return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - + sizeof (lr_write_t)); +} + static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { lr_t *lrcb, *lrc; lr_write_t *lrwb, *lrw; char *lr_buf; - uint64_t dlen, dnow, lwb_sp, reclen, txg; + uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1602,8 +1681,9 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); + dpad = dlen - lrw->lr_length; } else { - dlen = 0; + dlen = dpad = 0; } reclen = lrc->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); @@ -1617,15 +1697,27 @@ cont: * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + max_log_data = zil_max_log_data(zilog); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && - lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || - lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { + lwb_sp < zil_max_waste_space(zilog) && + (dlen % max_log_data == 0 || + lwb_sp < reclen + dlen % max_log_data))) { lwb = zil_lwb_write_issue(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); ASSERT(LWB_EMPTY(lwb)); lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + + /* + * There must be enough space in the new, empty log block to + * hold reclen. For WR_COPIED, we need to fit the whole + * record in one block, and reclen is the header size + the + * data size. For WR_NEED_COPY, we can create multiple + * records, splitting the data into multiple blocks, so we + * only need to fit one word of data per block; in this case + * reclen is just the header size (no data). + */ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } @@ -1683,7 +1775,11 @@ cont: * completed after "lwb_write_zio" completed. 
*/ error = zilog->zl_get_data(itx->itx_private, - lrwb, dbuf, lwb, lwb->lwb_write_zio); + itx->itx_gen, lrwb, dbuf, lwb, + lwb->lwb_write_zio); + if (dbuf != NULL && error == 0 && dnow == dlen) + /* Zero any padding bytes in the last block. */ + bzero((char *)dbuf + lrwb->lr_length, dpad); if (error == EIO) { txg_wait_synced(zilog->zl_dmu_pool, txg); @@ -1721,18 +1817,19 @@ cont: } itx_t * -zil_itx_create(uint64_t txtype, size_t lrsize) +zil_itx_create(uint64_t txtype, size_t olrsize) { - size_t itxsize; + size_t itxsize, lrsize; itx_t *itx; - lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); + lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ + bzero((char *)&itx->itx_lr + olrsize, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; @@ -1758,12 +1855,13 @@ zil_itx_destroy(itx_t *itx) * so no locks are needed. */ static void -zil_itxg_clean(itxs_t *itxs) +zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; + itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; @@ -1818,13 +1916,13 @@ zil_aitx_compare(const void *x1, const void *x2) const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - return (AVL_CMP(o1, o2)); + return (TREE_CMP(o1, o2)); } /* * Remove all async itx with the given oid. */ -static void +void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; @@ -1876,16 +1974,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) itxg_t *itxg; itxs_t *itxs, *clean = NULL; - /* - * Object ids can be re-instantiated in the next txg so - * remove any async transactions to avoid future leaks. 
- * This can happen if a fsync occurs on the re-instantiated - * object for a WR_INDIRECT or WR_NEED_COPY write, which gets - * the new file data and flushes a write record for the old object. - */ - if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) - zil_remove_async(zilog, itx->itx_oid); - /* * Ensure the data of a renamed file is committed before the rename. */ @@ -1908,7 +1996,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) * This should be rare. */ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " - "txg %llu", itxg->itxg_txg); + "txg %llu", (u_longlong_t)itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; @@ -1961,7 +2049,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we - * have written out the uberblocks (i.e. txg has been comitted) so that + * have written out the uberblocks (i.e. txg has been committed) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). */ @@ -1993,7 +2081,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, - (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP); + zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } @@ -2048,7 +2136,7 @@ zil_get_commit_list(zilog_t *zilog) /* * Move the async itxs for a specified object to commit into sync lists. */ -static void +void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; @@ -2642,11 +2730,11 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * timeout is reached; responsibility (2) from * the comment above this function. 
*/ - clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, + int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); - if (timeleft >= 0 || zcw->zcw_done) + if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE; @@ -3124,6 +3212,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; + zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3232,7 +3321,8 @@ zil_close(zilog_t *zilog) txg_wait_synced(zilog->zl_dmu_pool, txg); if (zilog_is_dirty(zilog)) - zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, txg); + zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, + (u_longlong_t)txg); if (txg < spa_freeze_txg(zilog->zl_spa)) VERIFY(!zilog_is_dirty(zilog)); @@ -3425,7 +3515,7 @@ typedef struct zil_replay_arg { } zil_replay_arg_t; static int -zil_replay_error(zilog_t *zilog, lr_t *lr, int error) +zil_replay_error(zilog_t *zilog, const lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -3443,7 +3533,8 @@ zil_replay_error(zilog_t *zilog, lr_t *lr, int error) } static int -zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) +zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, + uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; @@ -3526,7 +3617,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) /* ARGSUSED */ static int -zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { zilog->zl_replay_blks++; @@ -3601,7 +3692,6 @@ zil_reset(const char *osname, void *arg) return (0); } -#if defined(_KERNEL) EXPORT_SYMBOL(zil_alloc); EXPORT_SYMBOL(zil_free); EXPORT_SYMBOL(zil_open); @@ -3626,16 +3716,18 @@ 
EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); /* BEGIN CSTYLED */ -module_param(zfs_commit_timeout_pct, int, 0644); -MODULE_PARM_DESC(zfs_commit_timeout_pct, "ZIL block open timeout percentage"); +ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW, + "ZIL block open timeout percentage"); -module_param(zil_replay_disable, int, 0644); -MODULE_PARM_DESC(zil_replay_disable, "Disable intent logging replay"); +ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, + "Disable intent logging replay"); -module_param(zil_nocacheflush, int, 0644); -MODULE_PARM_DESC(zil_nocacheflush, "Disable ZIL cache flushes"); +ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, + "Disable ZIL cache flushes"); -module_param(zil_slog_bulk, ulong, 0644); -MODULE_PARM_DESC(zil_slog_bulk, "Limit in bytes slog sync writes per commit"); +ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW, + "Limit in bytes slog sync writes per commit"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, INT, ZMOD_RW, + "Limit in bytes of ZIL log block size"); /* END CSTYLED */ -#endif diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 016ac07eab..c016fa323b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,9 +20,12 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2021, Datto, Inc. 
*/ #include @@ -44,10 +47,10 @@ #include #include #include -#include +#include #include #include -#include +#include /* * ========================================================================== @@ -96,9 +99,23 @@ int zio_slow_io_ms = (30 * MILLISEC); * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. + * + * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable + * compression (including of metadata). In practice, we don't have this + * many sync passes, so this has no effect. + * + * The original intent was that disabling compression would help the sync + * passes to converge. However, in practice disabling compression increases + * the average number of sync passes, because when we turn compression off, a + * lot of block's size will change and thus we have to re-allocate (not + * overwrite) them. It also increases the number of 128KB allocations (e.g. + * for indirect blocks and spacemaps) because these will not be compressed. + * The 128K allocations are especially detrimental to performance on highly + * fragmented systems, which may have very few free segments of this size, + * and may need to load new metaslabs to satisfy 128K allocations. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ -int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ +int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* @@ -107,6 +124,11 @@ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) +/* + * Enable smaller cores by excluding metadata + * allocations as well. 
+ */ +int zio_exclude_metadata = 0; int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG @@ -123,7 +145,6 @@ void zio_init(void) { size_t c; - vmem_t *data_alloc_arena = NULL; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -139,7 +160,11 @@ zio_init(void) size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; - size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; + size_t data_cflags, cflags; + + data_cflags = KMC_NODEBUG; + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; #if defined(_ILP32) && defined(_KERNEL) /* @@ -180,14 +205,28 @@ zio_init(void) if (align != 0) { char name[36]; - (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. + */ + (void) snprintf(name, sizeof (name), + "zio_buf_comb_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, + size, align, NULL, NULL, NULL, NULL, NULL, + cflags); + zio_data_buf_cache[c] = zio_buf_cache[c]; + continue; + } + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); - (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, - data_alloc_arena, cflags); + align, NULL, NULL, NULL, NULL, NULL, data_cflags); } } @@ -209,37 +248,50 @@ zio_init(void) void zio_fini(void) { - size_t c; - kmem_cache_t *last_cache = NULL; - kmem_cache_t *last_data_cache = NULL; + size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; - for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { -#ifdef _ILP32 - /* - * Cache size limited to 1M on 32-bit platforms until ARC - * buffers no longer require virtual address 
space. - */ - if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) - break; -#endif #if defined(ZFS_DEBUG) && !defined(_KERNEL) - if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) + for (size_t i = 0; i < n; i++) { + if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", - (int)((c + 1) << SPA_MINBLOCKSHIFT), - (long long unsigned)zio_buf_cache_allocs[c], - (long long unsigned)zio_buf_cache_frees[c]); + (int)((i + 1) << SPA_MINBLOCKSHIFT), + (long long unsigned)zio_buf_cache_allocs[i], + (long long unsigned)zio_buf_cache_frees[i]); + } #endif - if (zio_buf_cache[c] != last_cache) { - last_cache = zio_buf_cache[c]; - kmem_cache_destroy(zio_buf_cache[c]); - } - zio_buf_cache[c] = NULL; - if (zio_data_buf_cache[c] != last_data_cache) { - last_data_cache = zio_data_buf_cache[c]; - kmem_cache_destroy(zio_data_buf_cache[c]); + /* + * The same kmem cache can show up multiple times in both zio_buf_cache + * and zio_data_buf_cache. Do a wasteful but trivially correct scan to + * sort it out. 
+ */ + for (size_t i = 0; i < n; i++) { + kmem_cache_t *cache = zio_buf_cache[i]; + if (cache == NULL) + continue; + for (size_t j = i; j < n; j++) { + if (cache == zio_buf_cache[j]) + zio_buf_cache[j] = NULL; + if (cache == zio_data_buf_cache[j]) + zio_data_buf_cache[j] = NULL; } - zio_data_buf_cache[c] = NULL; + kmem_cache_destroy(cache); + } + + for (size_t i = 0; i < n; i++) { + kmem_cache_t *cache = zio_data_buf_cache[i]; + if (cache == NULL) + continue; + for (size_t j = i; j < n; j++) { + if (cache == zio_data_buf_cache[j]) + zio_data_buf_cache[j] = NULL; + } + kmem_cache_destroy(cache); + } + + for (size_t i = 0; i < n; i++) { + VERIFY3P(zio_buf_cache[i], ==, NULL); + VERIFY3P(zio_data_buf_cache[i], ==, NULL); } kmem_cache_destroy(zio_link_cache); @@ -331,12 +383,6 @@ zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - /* - * Ensure that anyone expecting this zio to contain a linear ABD isn't - * going to get a nasty surprise when they try to access the data. 
- */ - IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); - zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; @@ -390,7 +436,8 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size) if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_abd, tmp, zio->io_size, size); + zio->io_abd, tmp, zio->io_size, size, + &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) @@ -440,7 +487,8 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), - zio->io_abd, tmp, zio->io_size, lsize); + zio->io_abd, tmp, zio->io_size, lsize, + &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; @@ -523,8 +571,8 @@ error: zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, &zio->io_bookmark, zio, 0, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; @@ -829,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { - if (zio->io_metaslab_class == NULL) - zio->io_metaslab_class = pio->io_metaslab_class; + zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) @@ -873,35 +920,83 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) return (zio_null(NULL, spa, NULL, done, private, flags)); } -void -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) +static int +zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, + enum blk_verify_flag blk_verify, const char *fmt, ...) 
{ + va_list adx; + char buf[256]; + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + switch (blk_verify) { + case BLK_VERIFY_HALT: + dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); + zfs_panic_recover("%s: %s", spa_name(spa), buf); + break; + case BLK_VERIFY_LOG: + zfs_dbgmsg("%s: %s", spa_name(spa), buf); + break; + case BLK_VERIFY_ONLY: + break; + } + + return (1); +} + +/* + * Verify the block pointer fields contain reasonable values. This means + * it only contains known object types, checksum/compression identifiers, + * block sizes within the maximum allowed limits, valid DVAs, etc. + * + * If everything checks out B_TRUE is returned. The zfs_blkptr_verify + * argument controls the behavior when an invalid field is detected. + * + * Modes for zfs_blkptr_verify: + * 1) BLK_VERIFY_ONLY (evaluate the block) + * 2) BLK_VERIFY_LOG (evaluate the block and log problems) + * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) + */ +boolean_t +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, + enum blk_verify_flag blk_verify) +{ + int errors = 0; + if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { - zfs_panic_recover("blkptr at %p has invalid TYPE %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { - zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { - zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { - 
zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { - zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { - if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { - zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", + if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } @@ -911,8 +1006,12 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (!spa->spa_trust_config) - return; + return (errors == 0); + if (!config_held) + spa_config_enter(spa, SCL_VDEV, bp, RW_READER); + else + ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); /* * Pool-specific checks. * @@ -922,24 +1021,25 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) * that are in the log) to be arbitrarily large. 
*/ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { - uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); + const dva_t *dva = &bp->blk_dva[i]; + uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "VDEV %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "VDEV %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { - zfs_panic_recover("blkptr at %p DVA %u has hole " - "VDEV %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } @@ -951,16 +1051,22 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) */ continue; } - uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); - uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); - if (BP_IS_GANG(bp)) - asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = DVA_GET_ASIZE(dva); + if (DVA_GET_GANG(dva)) + asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "OFFSET %llu", + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } + if (errors > 0) + dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); + if (!config_held) + spa_config_exit(spa, SCL_VDEV, bp); + + return (errors == 0); } boolean_t @@ -985,8 +1091,8 @@ zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); - if (BP_IS_GANG(bp)) - asize = 
vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + if (DVA_GET_GANG(dva)) + asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) return (B_FALSE); @@ -1000,8 +1106,6 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, { zio_t *zio; - zfs_blkptr_verify(spa, bp); - zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, @@ -1093,7 +1197,7 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { - zfs_blkptr_verify(spa, bp); + (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We @@ -1109,47 +1213,58 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. + * + * Note that we only defer frees after zfs_sync_pass_deferred_free + * when the log space map feature is disabled. [see relevant comment + * in spa_sync_iterate_to_convergence()] */ - if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || + if (BP_IS_GANG(bp) || + BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || - spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { + (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && + !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { - VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0))); + VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } +/* + * To improve performance, this function may return NULL if we were able + * to do the free immediately. This avoids the cost of creating a zio + * (and linking it to the parent, etc). 
+ */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags) { - zio_t *zio; - enum zio_stage stage = ZIO_FREE_PIPELINE; - ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); - ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) - return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); - /* - * GANG and DEDUP blocks can induce a read (for the gang block header, - * or the DDT), so issue them asynchronously so that this thread is - * not tied up. - */ - if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) - stage |= ZIO_STAGE_ISSUE_ASYNC; + if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) { + /* + * GANG and DEDUP blocks can induce a read (for the gang block + * header, or the DDT), so issue them asynchronously so that + * this thread is not tied up. + */ + enum zio_stage stage = + ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, - flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); - - return (zio); + return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + BP_GET_PSIZE(bp), NULL, NULL, + ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); + } else { + metaslab_free(spa, bp, txg, B_FALSE); + return (NULL); + } } zio_t * @@ -1158,7 +1273,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - zfs_blkptr_verify(spa, bp); + (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, + BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); @@ -1178,7 +1294,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); - ASSERT(!BP_GET_DEDUP(bp) || 
!spa_writeable(spa)); /* zdb(1M) */ + ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, @@ -1588,8 +1704,9 @@ zio_write_compress(zio_t *zio) if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); - if (psize == 0 || psize == lsize) { + psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, + zp->zp_complevel); + if (psize == 0 || psize >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && @@ -1609,16 +1726,16 @@ zio_write_compress(zio_t *zio) return (zio); } else { /* - * Round up compressed size up to the ashift - * of the smallest-ashift device, and zero the tail. - * This ensures that the compressed size of the BP - * (and thus compressratio property) are correct, + * Round compressed size up to the minimum allocation + * size of the smallest-ashift device, and zero the + * tail. This ensures that the compressed size of the + * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)P2ROUNDUP(psize, - 1ULL << spa->spa_min_ashift); + ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); + size_t rounded = (size_t)roundup(psize, + spa->spa_min_alloc); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1651,9 +1768,21 @@ zio_write_compress(zio_t *zio) * to a hole. 
*/ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, - zio->io_abd, NULL, lsize); - if (psize == 0) + zio->io_abd, NULL, lsize, zp->zp_complevel); + if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; + } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { + size_t rounded = MIN((size_t)roundup(psize, + spa->spa_min_alloc), lsize); + + if (rounded != psize) { + abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); + abd_zero_off(cdata, psize, rounded - psize); + abd_copy_off(cdata, zio->io_abd, 0, 0, psize); + psize = rounded; + zio_push_transform(zio, cdata, + psize, rounded, NULL); + } } else { ASSERT3U(psize, !=, 0); } @@ -1773,21 +1902,22 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) * to dispatch the zio to another taskq at the same time. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, - flags, &zio->io_tqent); + spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, + &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { - kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; + taskq_t *tq = taskq_of_curthread(); + for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { - if (taskq_member(tqs->stqs_taskq[i], executor)) + if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } @@ -1804,7 +1934,7 @@ zio_issue_async(zio_t *zio) } void -zio_interrupt(zio_t *zio) +zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } @@ -1862,8 +1992,8 @@ zio_delay_interrupt(zio_t *zio) * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, - (task_func_t *)zio_interrupt, - zio, TQ_NOSLEEP, expire_at_tick); + zio_interrupt, zio, TQ_NOSLEEP, + expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. 
Just @@ -1895,20 +2025,26 @@ zio_deadman_impl(zio_t *pio, int ziodepth) zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " - "path=%s last=%llu " - "type=%d priority=%d flags=0x%x " - "stage=0x%x pipeline=0x%x pipeline-trace=0x%x " - "objset=%llu object=%llu level=%llu blkid=%llu " - "offset=%llu size=%llu error=%d", + "path=%s " + "last=%llu type=%d " + "priority=%d flags=0x%x stage=0x%x " + "pipeline=0x%x pipeline-trace=0x%x " + "objset=%llu object=%llu " + "level=%llu blkid=%llu " + "offset=%llu size=%llu " + "error=%d", ziodepth, pio, pio->io_timestamp, - delta, pio->io_delta, pio->io_delay, - vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, - pio->io_type, pio->io_priority, pio->io_flags, - pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, - pio->io_offset, pio->io_size, pio->io_error); - zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, - pio->io_spa, vd, zb, pio, 0, 0); + (u_longlong_t)delta, pio->io_delta, pio->io_delay, + vd ? vd->vdev_path : "NULL", + vq ? vq->vq_io_complete_ts : 0, pio->io_type, + pio->io_priority, pio->io_flags, pio->io_stage, + pio->io_pipeline, pio->io_pipeline_trace, + (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, + (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, + pio->io_error); + (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, + pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { @@ -1978,7 +2114,7 @@ static zio_pipe_stage_t *zio_pipeline[]; * it is externally visible. */ void -zio_execute(zio_t *zio) +zio_execute(void *zio) { fstrans_cookie_t cookie; @@ -1992,7 +2128,7 @@ zio_execute(zio_t *zio) * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. 
*/ -boolean_t +static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) @@ -2086,6 +2222,15 @@ __zio_execute(zio_t *zio) int zio_wait(zio_t *zio) { + /* + * Some routines, like zio_free_sync(), may return a NULL zio + * to avoid the performance overhead of creating and then destroying + * an unneeded zio. For the callers' simplicity, we accept a NULL + * zio and ignore it. + */ + if (zio == NULL) + return (0); + long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; @@ -2123,6 +2268,12 @@ zio_wait(zio_t *zio) void zio_nowait(zio_t *zio) { + /* + * See comment in zio_wait(). + */ + if (zio == NULL) + return; + ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && @@ -2135,9 +2286,7 @@ zio_nowait(zio_t *zio) * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; - kpreempt_disable(); - pio = spa->spa_async_zio_root[CPU_SEQID]; - kpreempt_enable(); + pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } @@ -2154,8 +2303,9 @@ zio_nowait(zio_t *zio) */ static void -zio_reexecute(zio_t *pio) +zio_reexecute(void *arg) { + zio_t *pio = arg; zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); @@ -2219,8 +2369,8 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); - zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, - NULL, NULL, 0, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, + NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); @@ -2343,7 +2493,7 @@ zio_resume_wait(spa_t *spa) static void zio_gang_issue_func_done(zio_t *zio) { - abd_put(zio->io_abd); + abd_free(zio->io_abd); } static zio_t * @@ -2387,7 +2537,7 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); - 
abd_put(buf); + abd_free(buf); } /* * If we are here to damage data for testing purposes, @@ -2410,8 +2560,13 @@ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { - return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, - ZIO_GANG_CHILD_FLAGS(pio))); + zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + ZIO_GANG_CHILD_FLAGS(pio)); + if (zio == NULL) { + zio = zio_null(pio, pio->io_spa, + NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); + } + return (zio); } /* ARGSUSED */ @@ -2510,7 +2665,7 @@ zio_gang_tree_assemble_done(zio_t *zio) ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - abd_put(zio->io_abd); + abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; @@ -2602,7 +2757,7 @@ zio_write_gang_member_ready(zio_t *zio) dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; - ASSERTV(zio_t *gio = zio->io_gang_leader); + zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; @@ -2634,14 +2789,13 @@ zio_write_gang_done(zio_t *zio) * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) - abd_put(zio->io_abd); + abd_free(zio->io_abd); } static zio_t * -zio_write_gang_block(zio_t *pio) +zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; - metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -2671,8 +2825,8 @@ zio_write_gang_block(zio_t *pio) ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; - VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], - pio)); + VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. 
+ mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for @@ -2738,6 +2892,7 @@ zio_write_gang_block(zio_t *pio) zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; @@ -2848,6 +3003,20 @@ zio_nop_write(zio_t *zio) ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); + /* + * If we're overwriting a block that is currently on an + * indirect vdev, then ignore the nopwrite request and + * allow a new block to be allocated on a concrete vdev. + */ + spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *tvd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(&bp->blk_dva[0])); + if (tvd->vdev_ops == &vdev_indirect_ops) { + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + return (zio); + } + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; @@ -3110,35 +3279,6 @@ zio_ddt_child_write_done(zio_t *zio) ddt_exit(ddt); } -static void -zio_ddt_ditto_write_done(zio_t *zio) -{ - int p = DDT_PHYS_DITTO; - ASSERTV(zio_prop_t *zp = &zio->io_prop); - blkptr_t *bp = zio->io_bp; - ddt_t *ddt = ddt_select(zio->io_spa, bp); - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - ddt_key_t *ddk = &dde->dde_key; - - ddt_enter(ddt); - - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; - - if (zio->io_error == 0) { - ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); - ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); - ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); - if (ddp->ddp_phys_birth != 0) - ddt_phys_free(ddt, ddk, ddp, zio->io_txg); - ddt_phys_fill(ddp, bp); - } - - ddt_exit(ddt); -} - static zio_t * zio_ddt_write(zio_t *zio) { @@ -3147,9 +3287,7 @@ zio_ddt_write(zio_t *zio) uint64_t txg = zio->io_txg; 
zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; - int ditto_copies; zio_t *cio = NULL; - zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; @@ -3178,47 +3316,14 @@ zio_ddt_write(zio_t *zio) BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; + BP_SET_DEDUP(bp, B_FALSE); } + ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } - ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); - ASSERT(ditto_copies < SPA_DVAS_PER_BP); - - if (ditto_copies > ddt_ditto_copies_present(dde) && - dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { - zio_prop_t czp = *zp; - - czp.zp_copies = ditto_copies; - - /* - * If we arrived here with an override bp, we won't have run - * the transform stack, so we won't have the data we need to - * generate a child i/o. So, toss the override bp and restart. - * This is safe, because using the override bp is just an - * optimization; and it's rare, so the cost doesn't matter. - */ - if (zio->io_bp_override) { - zio_pop_transforms(zio); - zio->io_stage = ZIO_STAGE_OPEN; - zio->io_pipeline = ZIO_WRITE_PIPELINE; - zio->io_bp_override = NULL; - BP_ZERO(bp); - ddt_exit(ddt); - return (zio); - } - - dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, - NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - - zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; - } - if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); @@ -3244,10 +3349,7 @@ zio_ddt_write(zio_t *zio) ddt_exit(ddt); - if (cio) - zio_nowait(cio); - if (dio) - zio_nowait(dio); + zio_nowait(cio); return (zio); } @@ -3289,9 +3391,9 @@ zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); + 
ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); - zio = avl_first(&spa->spa_alloc_trees[allocator]); + zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); @@ -3303,11 +3405,11 @@ zio_io_to_allocate(spa_t *spa, int allocator) */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { + zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } - avl_remove(&spa->spa_alloc_trees[allocator], zio); + avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); @@ -3331,8 +3433,8 @@ zio_dva_throttle(zio_t *zio) return (zio); } + ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); @@ -3344,14 +3446,14 @@ zio_dva_throttle(zio_t *zio) * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. 
*/ - zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio->io_allocator = allocator; zio->io_metaslab_class = mc; - avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(spa, zio->io_allocator); - mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); + mutex_enter(&spa->spa_allocs[allocator].spaa_lock); + avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); + nio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } @@ -3360,9 +3462,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; - mutex_enter(&spa->spa_alloc_locks[allocator]); + mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); - mutex_exit(&spa->spa_alloc_locks[allocator]); + mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; @@ -3410,6 +3512,17 @@ zio_dva_allocate(zio_t *zio) zio->io_metaslab_class = mc; } + /* + * Try allocating the block in the usual metaslab class. + * If that's full, allocate it in the normal class. + * If that's full, allocate as a gang block, + * and if all are full, the allocation fails (which shouldn't happen). + * + * Note that we do not fall back on embedded slog (ZIL) space, to + * preserve unfragmented slog space, which is critical for decent + * sync write performance. If a log allocation fails, we will fall + * back to spa_sync() which is abysmal for performance. 
+ */ error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -3429,26 +3542,41 @@ zio_dva_allocate(zio_t *zio) zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; - mc = spa_normal_class(spa); - VERIFY(metaslab_class_throttle_reserve(mc, + VERIFY(metaslab_class_throttle_reserve( + spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); - } else { - mc = spa_normal_class(spa); } - zio->io_metaslab_class = mc; + zio->io_metaslab_class = mc = spa_normal_class(spa); + if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { + zfs_dbgmsg("%s: metaslab allocation failure, " + "trying normal class: zio %px, size %llu, error %d", + spa_name(spa), zio, (u_longlong_t)zio->io_size, + error); + } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { + if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { + zfs_dbgmsg("%s: metaslab allocation failure, " + "trying ganging: zio %px, size %llu, error %d", + spa_name(spa), zio, (u_longlong_t)zio->io_size, + error); + } + return (zio_write_gang_block(zio, mc)); + } if (error != 0) { - zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " - "size %llu, error %d", spa_name(spa), zio, zio->io_size, - error); - if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) - return (zio_write_gang_block(zio)); + if (error != ENOSPC || + (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { + zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " + "size %llu, error %d", + spa_name(spa), zio, (u_longlong_t)zio->io_size, + error); + } zio->io_error = error; } @@ -3525,19 +3653,21 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. 
*/ + int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int allocator = (uint_t)cityhash4(0, 0, 0, + os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, - cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % - spa->spa_alloc_count); - if (error == 0) { - *slog = TRUE; - } else { + txg, NULL, flags, &io_alloc_list, NULL, allocator); + *slog = (error == 0); + if (error != 0) { + error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, + new_bp, 1, txg, NULL, flags, + &io_alloc_list, NULL, allocator); + } + if (error != 0) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, METASLAB_FASTWRITE, - &io_alloc_list, NULL, cityhash4(0, 0, 0, - os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); - if (error == 0) - *slog = FALSE; + new_bp, 1, txg, NULL, flags, + &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); @@ -3571,7 +3701,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, } } else { zfs_dbgmsg("%s: zil block allocation failure: " - "size %llu, error %d", spa_name(spa), size, error); + "size %llu, error %d", spa_name(spa), (u_longlong_t)size, + error); } return (error); @@ -3694,19 +3825,37 @@ zio_vdev_io_start(zio_t *zio) * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. + * + * Leaf DTL_PARTIAL can be empty when a legitimate write comes from + * a dRAID spare vdev. For example, when a dRAID spare is first + * used, its spare blocks need to be written to but the leaf vdev's + * of such blocks can have empty DTL_PARTIAL. + * + * There seemed no clean way to allow such writes while bypassing + * spurious ones. At this point, just avoid all bypassing for dRAID + * for correctness. 
*/ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } - if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { + /* + * Select the next best leaf I/O to process. Distributed spares are + * excluded since they dispatch the I/O directly to a leaf vdev after + * applying the dRAID mapping. + */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && + (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_TRIM)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (zio); @@ -3743,8 +3892,8 @@ zio_vdev_io_done(zio_t *zio) if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { - + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -3817,7 +3966,7 @@ zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, /*ARGSUSED*/ void -zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) +zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); @@ -3882,6 +4031,9 @@ zio_vdev_io_assess(zio_t *zio) */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { + vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " + "cant_write=TRUE due to write failure with ENXIO", + zio); vd->vdev_cant_write = B_TRUE; } @@ -4146,20 +4298,19 @@ zio_checksum_verify(zio_t *zio) if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); - 
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); + ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + (void) zfs_ereport_start_checksum(zio->io_spa, + zio->io_vd, &zio->io_bookmark, zio, + zio->io_offset, zio->io_size, &info); mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); - - zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, - zio->io_offset, zio->io_size, NULL, &info); } } @@ -4288,7 +4439,7 @@ zio_ready(zio_t *zio) static void zio_dva_throttle_done(zio_t *zio) { - ASSERTV(zio_t *lio = zio->io_logical); + zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; @@ -4388,9 +4539,8 @@ zio_done(zio_t *zio) metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); - VERIFY(zfs_refcount_not_held( - &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], - zio)); + VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> + mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } @@ -4435,7 +4585,7 @@ zio_done(zio_t *zio) uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; - if (asize != psize) { + if (adata != NULL && asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); @@ -4446,7 +4596,7 @@ zio_done(zio_t *zio) zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); - if (asize != psize) + if (adata != NULL && asize != psize) abd_free(adata); } } @@ -4477,9 +4627,9 @@ zio_done(zio_t *zio) zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, - 
zio, 0, 0); + zio, 0); } } } @@ -4493,16 +4643,16 @@ zio_done(zio_t *zio) */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - if (zio->io_type == ZIO_TYPE_READ) { - zio->io_vd->vdev_stat.vs_read_errors++; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - zio->io_vd->vdev_stat.vs_write_errors++; + int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vd->vdev_stat.vs_read_errors++; + else if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_vd->vdev_stat.vs_write_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); } - mutex_exit(&zio->io_vd->vdev_stat_lock); - - zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, 0, 0); } if ((zio->io_error == EIO || !(zio->io_flags & @@ -4513,8 +4663,8 @@ zio_done(zio_t *zio) * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, - NULL, &zio->io_bookmark, zio, 0, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, + zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } @@ -4650,8 +4800,7 @@ zio_done(zio_t *zio) ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - (task_func_t *)zio_reexecute, zio, 0, - &zio->io_tqent); + zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } @@ -4791,6 +4940,9 @@ zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, zb1->zb_blkid == zb2->zb_blkid) return (0); + IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); + IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); + /* * BP_SPANB calculates the span in blocks. 
*/ @@ -4872,37 +5024,31 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp, last_block) <= 0); } -#if defined(_KERNEL) EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); -module_param(zio_slow_io_ms, int, 0644); -MODULE_PARM_DESC(zio_slow_io_ms, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); -module_param(zio_requeue_io_start_cut_in_line, int, 0644); -MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O"); +ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, + "Prioritize requeued I/O"); -module_param(zfs_sync_pass_deferred_free, int, 0644); -MODULE_PARM_DESC(zfs_sync_pass_deferred_free, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW, "Defer frees starting in this pass"); -module_param(zfs_sync_pass_dont_compress, int, 0644); -MODULE_PARM_DESC(zfs_sync_pass_dont_compress, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW, "Don't compress starting in this pass"); -module_param(zfs_sync_pass_rewrite, int, 0644); -MODULE_PARM_DESC(zfs_sync_pass_rewrite, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW, "Rewrite new bps starting in this pass"); -module_param(zio_dva_throttle_enabled, int, 0644); -MODULE_PARM_DESC(zio_dva_throttle_enabled, +ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); -module_param(zio_deadman_log_all, int, 0644); -MODULE_PARM_DESC(zio_deadman_log_all, +ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); -#endif +/* END CSTYLED */ diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 7b148375d0..f8fee78c60 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -100,7 +100,7 @@ 
abd_checksum_off(abd_t *abd, uint64_t size, } /*ARGSUSED*/ -void +static void abd_fletcher_2_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { @@ -110,7 +110,7 @@ abd_fletcher_2_native(abd_t *abd, uint64_t size, } /*ARGSUSED*/ -void +static void abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { @@ -191,10 +191,12 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, +#if !defined(__FreeBSD__) {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, +#endif }; /* @@ -211,8 +213,10 @@ zio_checksum_to_feature(enum zio_checksum cksum) return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: return (SPA_FEATURE_SKEIN); +#if !defined(__FreeBSD__) case ZIO_CHECKSUM_EDONR: return (SPA_FEATURE_EDONR); +#endif default: return (SPA_FEATURE_NONE); } @@ -308,7 +312,7 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) mutex_exit(&spa->spa_cksum_tmpls_lock); } -/* convenience function to update a checksum to accomodate an encryption MAC */ +/* convenience function to update a checksum to accommodate an encryption MAC */ static void zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor) { diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index f5cbc3e821..1ff1e76d7f 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -29,6 +29,8 @@ /* * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude */ #include @@ -36,6 +38,7 @@ #include #include #include +#include /* * If nonzero, every 1/X decompression attempts will fail, simulating @@ -47,24 +50,42 @@ unsigned long zio_decompress_fail_fraction = 0; * Compression vectors. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {"inherit", 0, NULL, NULL}, - {"on", 0, NULL, NULL}, - {"uncompressed", 0, NULL, NULL}, - {"lzjb", 0, lzjb_compress, lzjb_decompress}, - {"empty", 0, NULL, NULL}, - {"gzip-1", 1, gzip_compress, gzip_decompress}, - {"gzip-2", 2, gzip_compress, gzip_decompress}, - {"gzip-3", 3, gzip_compress, gzip_decompress}, - {"gzip-4", 4, gzip_compress, gzip_decompress}, - {"gzip-5", 5, gzip_compress, gzip_decompress}, - {"gzip-6", 6, gzip_compress, gzip_decompress}, - {"gzip-7", 7, gzip_compress, gzip_decompress}, - {"gzip-8", 8, gzip_compress, gzip_decompress}, - {"gzip-9", 9, gzip_compress, gzip_decompress}, - {"zle", 64, zle_compress, zle_decompress}, - {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs} + {"inherit", 0, NULL, NULL, NULL}, + {"on", 0, NULL, NULL, NULL}, + {"uncompressed", 0, NULL, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress, NULL}, + {"empty", 0, NULL, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress, NULL}, + {"gzip-2", 2, gzip_compress, gzip_decompress, NULL}, + {"gzip-3", 3, gzip_compress, gzip_decompress, NULL}, + {"gzip-4", 4, gzip_compress, gzip_decompress, NULL}, + {"gzip-5", 5, gzip_compress, gzip_decompress, NULL}, + {"gzip-6", 6, gzip_compress, gzip_decompress, NULL}, + {"gzip-7", 7, gzip_compress, gzip_decompress, NULL}, + {"gzip-8", 8, gzip_compress, gzip_decompress, NULL}, + {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, + {"zle", 64, zle_compress, zle_decompress, NULL}, + {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress, + zfs_zstd_decompress, zfs_zstd_decompress_level}, }; +uint8_t +zio_complevel_select(spa_t *spa, enum 
zio_compress compress, uint8_t child, + uint8_t parent) +{ + uint8_t result; + + if (!ZIO_COMPRESS_HASLEVEL(compress)) + return (0); + + result = child; + if (result == ZIO_COMPLEVEL_INHERIT) + result = parent; + + return (result); +} + enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent) @@ -102,9 +123,11 @@ zio_compress_zeroed_cb(void *data, size_t len, void *private) } size_t -zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, + uint8_t level) { size_t c_len, d_len; + uint8_t complevel; zio_compress_info_t *ci = &zio_compress_table[c]; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); @@ -123,9 +146,24 @@ zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) /* Compress at least 12.5% */ d_len = s_len - (s_len >> 3); + complevel = ci->ci_level; + + if (c == ZIO_COMPRESS_ZSTD) { + /* If we don't know the level, we can't compress it */ + if (level == ZIO_COMPLEVEL_INHERIT) + return (s_len); + + if (level == ZIO_COMPLEVEL_DEFAULT) + complevel = ZIO_ZSTD_LEVEL_DEFAULT; + else + complevel = level; + + ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); + } + /* No compression algorithms can read from ABDs directly */ void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); + c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel); abd_return_buf(src, tmp, s_len); if (c_len > d_len) @@ -137,32 +175,46 @@ zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len) + size_t s_len, size_t d_len, uint8_t *level) { zio_compress_info_t *ci = &zio_compress_table[c]; if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); + if (ci->ci_decompress_level != NULL && level != NULL) + return 
(ci->ci_decompress_level(src, dst, s_len, d_len, level)); + return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len) + size_t s_len, size_t d_len, uint8_t *level) { void *tmp = abd_borrow_buf_copy(src, s_len); - int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); + int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level); abd_return_buf(src, tmp, s_len); /* - * Decompression shouldn't fail, because we've already verifyied + * Decompression shouldn't fail, because we've already verified * the checksum. However, for extra protection (e.g. against bitflips * in non-ECC RAM), we handle this error (and test it). */ - ASSERT0(ret); if (zio_decompress_fail_fraction != 0 && - spa_get_random(zio_decompress_fail_fraction) == 0) + random_in_range(zio_decompress_fail_fraction) == 0) ret = SET_ERROR(EINVAL); return (ret); } + +int +zio_compress_to_feature(enum zio_compress comp) +{ + switch (comp) { + case ZIO_COMPRESS_ZSTD: + return (SPA_FEATURE_ZSTD_COMPRESS); + default: + break; + } + return (SPA_FEATURE_NONE); +} diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 78896d3dc3..feaf41dc65 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -113,11 +113,11 @@ freq_triggered(uint32_t frequency) return (B_TRUE); /* - * Note: we still handle legacy (unscaled) frequecy values + * Note: we still handle legacy (unscaled) frequency values */ uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX; - return (spa_get_random(maximum) < frequency); + return (random_in_range(maximum) < frequency); } /* @@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error) if (zio->io_type != ZIO_TYPE_READ) return (0); + /* + * A rebuild I/O has no checksum to verify. 
+ */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM) + return (0); + rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; @@ -339,14 +345,14 @@ zio_handle_label_injection(zio_t *zio, int error) static int zio_inject_bitflip_cb(void *data, size_t len, void *private) { - ASSERTV(zio_t *zio = private); + zio_t *zio __maybe_unused = private; uint8_t *buffer = data; - uint_t byte = spa_get_random(len); + uint_t byte = random_in_range(len); ASSERT(zio->io_type == ZIO_TYPE_READ); /* flip a single random bit in an abd data buffer */ - buffer[byte] ^= 1 << spa_get_random(8); + buffer[byte] ^= 1 << random_in_range(8); return (1); /* stop after first flip */ } @@ -487,7 +493,7 @@ zio_handle_ignored_writes(zio_t *zio) } /* Have a "problem" writing 60% of the time */ - if (spa_get_random(100) < 60) + if (random_in_range(100) < 60) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; break; } diff --git a/module/zfs/zle.c b/module/zfs/zle.c index 613607faaa..0decebb13c 100644 --- a/module/zfs/zle.c +++ b/module/zfs/zle.c @@ -32,6 +32,7 @@ */ #include #include +#include size_t zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) diff --git a/module/zfs/zrlock.c b/module/zfs/zrlock.c index 014a5cc6c7..a4def60536 100644 --- a/module/zfs/zrlock.c +++ b/module/zfs/zrlock.c @@ -39,7 +39,7 @@ * function calls. */ #include -#include +#include /* * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is @@ -156,15 +156,6 @@ zrl_exit(zrlock_t *zrl) mutex_exit(&zrl->zr_mtx); } -int -zrl_refcount(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); - - int n = (int)zrl->zr_refcount; - return (n <= 0 ? 0 : n); -} - int zrl_is_zero(zrlock_t *zrl) { diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c index 532e8ce0f8..33fdda7b68 100644 --- a/module/zfs/zthr.c +++ b/module/zfs/zthr.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2017, 2019 by Delphix. All rights reserved. 
+ * Copyright (c) 2017, 2020 by Delphix. All rights reserved. */ /* @@ -56,7 +56,7 @@ * * == ZTHR creation * - * Every zthr needs three inputs to start running: + * Every zthr needs four inputs to start running: * * 1] A user-defined checker function (checkfunc) that decides whether * the zthr should start working or go to sleep. The function should @@ -72,6 +72,9 @@ * 3] A void args pointer that will be passed to checkfunc and func * implicitly by the infrastructure. * + * 4] A name for the thread. This string must be valid for the lifetime + * of the zthr. + * * The reason why the above API needs two different functions, * instead of one that both checks and does the work, has to do with * the zthr's internal state lock (zthr_state_lock) and the allowed @@ -80,10 +83,11 @@ * can be cancelled while doing work and not while checking for work. * * To start a zthr: - * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args); + * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args, + * pri); * or * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func, - * args, max_sleep); + * args, max_sleep, pri); * * After that you should be able to wakeup, cancel, and resume the * zthr from another thread using the zthr_pointer. @@ -207,17 +211,24 @@ struct zthr { /* flag set to true if we are canceling the zthr */ boolean_t zthr_cancel; + /* flag set to true if we are waiting for the zthr to finish */ + boolean_t zthr_haswaiters; + kcondvar_t zthr_wait_cv; /* * maximum amount of time that the zthr is spent sleeping; * if this is 0, the thread doesn't wake up until it gets * signaled. 
*/ - hrtime_t zthr_wait_time; + hrtime_t zthr_sleep_timeout; + + /* Thread priority */ + pri_t zthr_pri; /* consumer-provided callbacks & data */ zthr_checkfunc_t *zthr_checkfunc; zthr_func_t *zthr_func; void *zthr_arg; + const char *zthr_name; }; static void @@ -234,19 +245,18 @@ zthr_procedure(void *arg) t->zthr_func(t->zthr_arg, t); mutex_enter(&t->zthr_state_lock); } else { - /* - * cv_wait_sig() is used instead of cv_wait() in - * order to prevent this process from incorrectly - * contributing to the system load average when idle. - */ - if (t->zthr_wait_time == 0) { - cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock); + if (t->zthr_sleep_timeout == 0) { + cv_wait_idle(&t->zthr_cv, &t->zthr_state_lock); } else { - (void) cv_timedwait_sig_hires(&t->zthr_cv, - &t->zthr_state_lock, t->zthr_wait_time, + (void) cv_timedwait_idle_hires(&t->zthr_cv, + &t->zthr_state_lock, t->zthr_sleep_timeout, MSEC2NSEC(1), 0); } } + if (t->zthr_haswaiters) { + t->zthr_haswaiters = B_FALSE; + cv_broadcast(&t->zthr_wait_cv); + } } /* @@ -262,9 +272,11 @@ zthr_procedure(void *arg) } zthr_t * -zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) +zthr_create(const char *zthr_name, zthr_checkfunc_t *checkfunc, + zthr_func_t *func, void *arg, pri_t pri) { - return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0)); + return (zthr_create_timer(zthr_name, checkfunc, + func, arg, (hrtime_t)0, pri)); } /* @@ -273,22 +285,26 @@ zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) * start working if required) will be triggered. 
*/ zthr_t * -zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, - void *arg, hrtime_t max_sleep) +zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc, + zthr_func_t *func, void *arg, hrtime_t max_sleep, pri_t pri) { zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP); mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); + cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL); mutex_enter(&t->zthr_state_lock); t->zthr_checkfunc = checkfunc; t->zthr_func = func; t->zthr_arg = arg; - t->zthr_wait_time = max_sleep; + t->zthr_sleep_timeout = max_sleep; + t->zthr_name = zthr_name; + t->zthr_pri = pri; + + t->zthr_thread = thread_create_named(zthr_name, NULL, 0, + zthr_procedure, t, 0, &p0, TS_RUN, pri); - t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, - 0, &p0, TS_RUN, minclsyspri); mutex_exit(&t->zthr_state_lock); return (t); @@ -303,6 +319,7 @@ zthr_destroy(zthr_t *t) mutex_destroy(&t->zthr_request_lock); mutex_destroy(&t->zthr_state_lock); cv_destroy(&t->zthr_cv); + cv_destroy(&t->zthr_wait_cv); kmem_free(t, sizeof (*t)); } @@ -355,9 +372,8 @@ zthr_cancel(zthr_t *t) * * [1] The thread has already been cancelled, therefore * there is nothing for us to do. - * [2] The thread is sleeping, so we broadcast the CV first - * to wake it up and then we set the flag and we are - * waiting for it to exit. + * [2] The thread is sleeping so we set the flag, broadcast + * the CV and wait for it to exit. * [3] The thread is doing work, in which case we just set * the flag and wait for it to finish. 
* [4] The thread was just created/resumed, in which case @@ -397,6 +413,7 @@ zthr_resume(zthr_t *t) ASSERT3P(&t->zthr_checkfunc, !=, NULL); ASSERT3P(&t->zthr_func, !=, NULL); ASSERT(!t->zthr_cancel); + ASSERT(!t->zthr_haswaiters); /* * There are 4 states that we find the zthr in at this point @@ -410,8 +427,8 @@ zthr_resume(zthr_t *t) * no-op. */ if (t->zthr_thread == NULL) { - t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, - 0, &p0, TS_RUN, minclsyspri); + t->zthr_thread = thread_create_named(t->zthr_name, NULL, 0, + zthr_procedure, t, 0, &p0, TS_RUN, t->zthr_pri); } mutex_exit(&t->zthr_state_lock); @@ -451,3 +468,74 @@ zthr_iscancelled(zthr_t *t) mutex_exit(&t->zthr_state_lock); return (cancelled); } + +/* + * Wait for the zthr to finish its current function. Similar to + * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end + * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was + * sleeping or cancelled, return immediately. + */ +void +zthr_wait_cycle_done(zthr_t *t) +{ + mutex_enter(&t->zthr_state_lock); + + /* + * Since we are holding the zthr_state_lock at this point + * we can find the state in one of the following 5 states: + * + * [1] The thread has already cancelled, therefore + * there is nothing for us to do. + * [2] The thread is sleeping so we set the flag, broadcast + * the CV and wait for it to exit. + * [3] The thread is doing work, in which case we just set + * the flag and wait for it to finish. + * [4] The thread was just created/resumed, in which case + * the behavior is similar to [3]. + * [5] The thread is the middle of being cancelled, which is + * similar to [3]. We'll wait for the cancel, which is + * waiting for the zthr func. + * + * Since requests are serialized, by the time that we get + * control back we expect that the zthr has completed it's + * zthr_func. 
+ */ + if (t->zthr_thread != NULL) { + t->zthr_haswaiters = B_TRUE; + + /* broadcast in case the zthr is sleeping */ + cv_broadcast(&t->zthr_cv); + + while ((t->zthr_haswaiters) && (t->zthr_thread != NULL)) + cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock); + + ASSERT(!t->zthr_haswaiters); + } + + mutex_exit(&t->zthr_state_lock); +} + +/* + * This function is intended to be used by the zthr itself + * to check if another thread is waiting on it to finish + * + * returns TRUE if we have been asked to finish. + * + * returns FALSE otherwise. + */ +boolean_t +zthr_has_waiters(zthr_t *t) +{ + ASSERT3P(t->zthr_thread, ==, curthread); + + mutex_enter(&t->zthr_state_lock); + + /* + * Similarly to zthr_iscancelled(), we only grab the + * zthr_state_lock so that the zthr itself can use this + * to check for the request. + */ + boolean_t has_waiters = t->zthr_haswaiters; + mutex_exit(&t->zthr_state_lock); + return (has_waiters); +} diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index c29f65f676..d50cce7d73 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -68,10 +68,6 @@ * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * - * It is also worth keeping in mind that once add_disk() is called, the zvol is - * announced to the world, and zvol_open()/zvol_release() can be called at any - * time. Incidentally, add_disk() itself calls zvol_open()->zvol_first_open() - * and zvol_release()->zvol_last_close() directly as well. 
*/ #include @@ -88,56 +84,17 @@ #include #include #include - -#include -#include +#include unsigned int zvol_inhibit_dev = 0; -unsigned int zvol_major = ZVOL_MAJOR; -unsigned int zvol_threads = 32; -unsigned int zvol_request_sync = 0; -unsigned int zvol_prefetch_bytes = (128 * 1024); -unsigned long zvol_max_discard_blocks = 16384; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; -static taskq_t *zvol_taskq; -static krwlock_t zvol_state_lock; -static list_t zvol_state_list; - -#define ZVOL_HT_SIZE 1024 -static struct hlist_head *zvol_htable; -#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) - -static struct ida zvol_ida; - -/* - * The in-core state of each volume. - */ -struct zvol_state { - char zv_name[MAXNAMELEN]; /* name */ - uint64_t zv_volsize; /* advertised space */ - uint64_t zv_volblocksize; /* volume block size */ - objset_t *zv_objset; /* objset handle */ - uint32_t zv_flags; /* ZVOL_* flags */ - uint32_t zv_open_count; /* open counts */ - uint32_t zv_changed; /* disk changed */ - zilog_t *zv_zilog; /* ZIL handle */ - rangelock_t zv_rangelock; /* for range locking */ - dnode_t *zv_dn; /* dnode hold */ - dev_t zv_dev; /* device id */ - struct gendisk *zv_disk; /* generic disk */ - struct request_queue *zv_queue; /* request queue */ - dataset_kstats_t zv_kstat; /* zvol kstats */ - list_node_t zv_next; /* next zvol_state_t linkage */ - uint64_t zv_hash; /* name hash */ - struct hlist_node zv_hlink; /* hash link */ - kmutex_t zv_state_lock; /* protects zvol_state_t */ - atomic_t zv_suspend_ref; /* refcount for suspend */ - krwlock_t zv_suspend_lock; /* suspend lock */ -}; +struct hlist_head *zvol_htable; +list_t zvol_state_list; +krwlock_t zvol_state_lock; +const zvol_platform_ops_t *ops; typedef enum { - ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, @@ -147,26 +104,17 @@ typedef enum { typedef struct { zvol_async_op_t op; - char pool[MAXNAMELEN]; char name1[MAXNAMELEN]; char 
name2[MAXNAMELEN]; - zprop_source_t source; uint64_t value; } zvol_task_t; -#define ZVOL_RDONLY 0x1 -/* - * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which - * specifies whether or not the zvol _can_ be written to) - */ -#define ZVOL_WRITTEN_TO 0x2 - -static uint64_t +uint64_t zvol_name_hash(const char *name) { int i; uint64_t crc = -1ULL; - uint8_t *p = (uint8_t *)name; + const uint8_t *p = (const uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; @@ -174,31 +122,6 @@ zvol_name_hash(const char *name) return (crc); } -/* - * Find a zvol_state_t given the full major+minor dev_t. If found, - * return with zv_state_lock taken, otherwise, return (NULL) without - * taking zv_state_lock. - */ -static zvol_state_t * -zvol_find_by_dev(dev_t dev) -{ - zvol_state_t *zv; - - rw_enter(&zvol_state_lock, RW_READER); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv)) { - mutex_enter(&zv->zv_state_lock); - if (zv->zv_dev == dev) { - rw_exit(&zvol_state_lock); - return (zv); - } - mutex_exit(&zv->zv_state_lock); - } - rw_exit(&zvol_state_lock); - - return (NULL); -} - /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, @@ -206,7 +129,7 @@ zvol_find_by_dev(dev_t dev) * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ -static zvol_state_t * +zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; @@ -258,29 +181,6 @@ zvol_find_by_name(const char *name, int mode) return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } - -/* - * Given a path, return TRUE if path is a ZVOL. 
- */ -boolean_t -zvol_is_zvol(const char *device) -{ - struct block_device *bdev; - unsigned int major; - - bdev = vdev_lookup_bdev(device); - if (IS_ERR(bdev)) - return (B_FALSE); - - major = MAJOR(bdev->bd_dev); - bdput(bdev); - - if (major == zvol_major) - return (B_TRUE); - - return (B_FALSE); -} - /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ @@ -407,7 +307,6 @@ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; - struct gendisk *disk = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; @@ -450,7 +349,6 @@ zvol_set_volsize(const char *name, uint64_t volsize) if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; - disk = zv->zv_disk; } out: kmem_free(doi, sizeof (dmu_object_info_t)); @@ -466,8 +364,8 @@ out: if (zv != NULL) mutex_exit(&zv->zv_state_lock); - if (disk != NULL) - revalidate_disk(disk); + if (error == 0 && zv != NULL) + ops->zv_update_volsize(zv, volsize); return (SET_ERROR(error)); } @@ -509,51 +407,6 @@ zvol_check_volblocksize(const char *name, uint64_t volblocksize) return (0); } -/* - * Set ZFS_PROP_VOLBLOCKSIZE set entry point. 
- */ -int -zvol_set_volblocksize(const char *name, uint64_t volblocksize) -{ - zvol_state_t *zv; - dmu_tx_t *tx; - int error; - - zv = zvol_find_by_name(name, RW_READER); - - if (zv == NULL) - return (SET_ERROR(ENXIO)); - - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); - - if (zv->zv_flags & ZVOL_RDONLY) { - mutex_exit(&zv->zv_state_lock); - rw_exit(&zv->zv_suspend_lock); - return (SET_ERROR(EROFS)); - } - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, - volblocksize, 0, tx); - if (error == ENOTSUP) - error = SET_ERROR(EBUSY); - dmu_tx_commit(tx); - if (error == 0) - zv->zv_volblocksize = volblocksize; - } - - mutex_exit(&zv->zv_state_lock); - rw_exit(&zv->zv_suspend_lock); - - return (SET_ERROR(error)); -} - /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. 
@@ -571,7 +424,19 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) offset = lr->lr_offset; length = lr->lr_length; - return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + dmu_tx_mark_netfree(tx); + int error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zil_replaying(zv->zv_zilog, tx); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, + length); + } + + return (error); } /* @@ -611,6 +476,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); + zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } @@ -657,13 +523,14 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { */ ssize_t zvol_immediate_write_sz = 32768; -static void +void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, int sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; + uint64_t sz = size; if (zil_replaying(zilog, tx)) return; @@ -684,7 +551,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, itx_wr_state_t wr_state = write_state; ssize_t len = size; - if (wr_state == WR_COPIED && size > ZIL_MAX_COPIED_DATA) + if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); @@ -715,94 +582,16 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, offset += len; size -= len; } -} -typedef struct zv_request { - zvol_state_t *zv; - struct bio *bio; - locked_range_t *lr; -} zv_request_t; - -static void -uio_from_bio(uio_t *uio, struct bio *bio) -{ - uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; - uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); - uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; - uio->uio_segflg = UIO_BVEC; - 
uio->uio_limit = MAXOFFSET_T; - uio->uio_resid = BIO_BI_SIZE(bio); - uio->uio_skip = BIO_BI_SKIP(bio); -} - -static void -zvol_write(void *arg) -{ - int error = 0; - - zv_request_t *zvr = arg; - struct bio *bio = zvr->bio; - uio_t uio = { { 0 }, 0 }; - uio_from_bio(&uio, bio); - - zvol_state_t *zv = zvr->zv; - ASSERT(zv && zv->zv_open_count > 0); - ASSERT(zv->zv_zilog != NULL); - - ssize_t start_resid = uio.uio_resid; - unsigned long start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio), - &zv->zv_disk->part0); - - boolean_t sync = - bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; - - uint64_t volsize = zv->zv_volsize; - while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { - uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio.uio_loffset; - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - - if (bytes > volsize - off) /* don't write past the end */ - bytes = volsize - off; - - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - - /* This will only fail for ENOSPC */ - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - break; - } - error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); - if (error == 0) { - zvol_log_write(zv, tx, off, bytes, sync); - } - dmu_tx_commit(tx); - - if (error) - break; + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); } - rangelock_exit(zvr->lr); - - int64_t nwritten = start_resid - uio.uio_resid; - dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); - task_io_account_write(nwritten); - - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - - rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0, - start_jif); - BIO_END_IO(bio, -error); - kmem_free(zvr, sizeof (zv_request_t)); } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. 
*/ -static void +void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync) { @@ -823,119 +612,6 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, zil_itx_assign(zilog, itx, tx); } -static void -zvol_discard(void *arg) -{ - zv_request_t *zvr = arg; - struct bio *bio = zvr->bio; - zvol_state_t *zv = zvr->zv; - uint64_t start = BIO_BI_SECTOR(bio) << 9; - uint64_t size = BIO_BI_SIZE(bio); - uint64_t end = start + size; - boolean_t sync; - int error = 0; - dmu_tx_t *tx; - unsigned long start_jif; - - ASSERT(zv && zv->zv_open_count > 0); - ASSERT(zv->zv_zilog != NULL); - - start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio), - &zv->zv_disk->part0); - - sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; - - if (end > zv->zv_volsize) { - error = SET_ERROR(EIO); - goto unlock; - } - - /* - * Align the request to volume block boundaries when a secure erase is - * not required. This will prevent dnode_free_range() from zeroing out - * the unaligned parts which is slow (read-modify-write) and useless - * since we are not freeing any space by doing so. 
- */ - if (!bio_is_secure_erase(bio)) { - start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN(end, zv->zv_volblocksize); - size = end - start; - } - - if (start >= end) - goto unlock; - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - } else { - zvol_log_truncate(zv, tx, start, size, B_TRUE); - dmu_tx_commit(tx); - error = dmu_free_long_range(zv->zv_objset, - ZVOL_OBJ, start, size); - } -unlock: - rangelock_exit(zvr->lr); - - if (error == 0 && sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - - rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0, - start_jif); - BIO_END_IO(bio, -error); - kmem_free(zvr, sizeof (zv_request_t)); -} - -static void -zvol_read(void *arg) -{ - int error = 0; - - zv_request_t *zvr = arg; - struct bio *bio = zvr->bio; - uio_t uio = { { 0 }, 0 }; - uio_from_bio(&uio, bio); - - zvol_state_t *zv = zvr->zv; - ASSERT(zv && zv->zv_open_count > 0); - - ssize_t start_resid = uio.uio_resid; - unsigned long start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio), - &zv->zv_disk->part0); - - uint64_t volsize = zv->zv_volsize; - while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { - uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); - - /* don't read past the end */ - if (bytes > volsize - uio.uio_loffset) - bytes = volsize - uio.uio_loffset; - - error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - } - rangelock_exit(zvr->lr); - - int64_t nread = start_resid - uio.uio_resid; - dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); - task_io_account_read(nread); - - rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0, - start_jif); - BIO_END_IO(bio, -error); - kmem_free(zvr, sizeof 
(zv_request_t)); -} /* ARGSUSED */ static void @@ -944,7 +620,7 @@ zvol_get_done(zgd_t *zgd, int error) if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - rangelock_exit(zgd->zgd_lr); + zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } @@ -952,8 +628,9 @@ zvol_get_done(zgd_t *zgd, int error) /* * Get data to generate a TX_WRITE intent log record. */ -static int -zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +int +zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, + struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; @@ -977,8 +654,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); + zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, + size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ @@ -990,8 +667,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); - zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); + zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, + size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { @@ -1017,141 +694,14 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) return (SET_ERROR(error)); } -static MAKE_REQUEST_FN_RET -zvol_request(struct request_queue *q, struct bio *bio) -{ - zvol_state_t *zv = q->queuedata; - fstrans_cookie_t cookie = spl_fstrans_mark(); - uint64_t offset = BIO_BI_SECTOR(bio) << 9; - uint64_t size = BIO_BI_SIZE(bio); - int rw = bio_data_dir(bio); - zv_request_t *zvr; - - if (bio_has_data(bio) && offset + 
size > zv->zv_volsize) { - printk(KERN_INFO - "%s: bad access: offset=%llu, size=%lu\n", - zv->zv_disk->disk_name, - (long long unsigned)offset, - (long unsigned)size); - - BIO_END_IO(bio, -SET_ERROR(EIO)); - goto out; - } - - if (rw == WRITE) { - boolean_t need_sync = B_FALSE; - - if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - BIO_END_IO(bio, -SET_ERROR(EROFS)); - goto out; - } - - /* - * To be released in the I/O function. See the comment on - * rangelock_enter() below. - */ - rw_enter(&zv->zv_suspend_lock, RW_READER); - - /* - * Open a ZIL if this is the first time we have written to this - * zvol. We protect zv->zv_zilog with zv_suspend_lock rather - * than zv_state_lock so that we don't need to acquire an - * additional lock in this path. - */ - if (zv->zv_zilog == NULL) { - rw_exit(&zv->zv_suspend_lock); - rw_enter(&zv->zv_suspend_lock, RW_WRITER); - if (zv->zv_zilog == NULL) { - zv->zv_zilog = zil_open(zv->zv_objset, - zvol_get_data); - zv->zv_flags |= ZVOL_WRITTEN_TO; - } - rw_downgrade(&zv->zv_suspend_lock); - } - - /* bio marked as FLUSH need to flush before write */ - if (bio_is_flush(bio)) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - - /* Some requests are just for flush and nothing else. */ - if (size == 0) { - rw_exit(&zv->zv_suspend_lock); - BIO_END_IO(bio, 0); - goto out; - } - - zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); - zvr->zv = zv; - zvr->bio = bio; - - /* - * To be released in the I/O function. Since the I/O functions - * are asynchronous, we take it here synchronously to make - * sure overlapped I/Os are properly ordered. - */ - zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_WRITER); - /* - * Sync writes and discards execute zil_commit() which may need - * to take a RL_READER lock on the whole block being modified - * via its zillog->zl_get_data(): to avoid circular dependency - * issues with taskq threads execute these requests - * synchronously here in zvol_request(). 
- */ - need_sync = bio_is_fua(bio) || - zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; - if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { - if (zvol_request_sync || need_sync || - taskq_dispatch(zvol_taskq, zvol_discard, zvr, - TQ_SLEEP) == TASKQID_INVALID) - zvol_discard(zvr); - } else { - if (zvol_request_sync || need_sync || - taskq_dispatch(zvol_taskq, zvol_write, zvr, - TQ_SLEEP) == TASKQID_INVALID) - zvol_write(zvr); - } - } else { - /* - * The SCST driver, and possibly others, may issue READ I/Os - * with a length of zero bytes. These empty I/Os contain no - * data and require no additional handling. - */ - if (size == 0) { - BIO_END_IO(bio, 0); - goto out; - } - - zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); - zvr->zv = zv; - zvr->bio = bio; - - rw_enter(&zv->zv_suspend_lock, RW_READER); - - zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); - if (zvol_request_sync || taskq_dispatch(zvol_taskq, - zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID) - zvol_read(zvr); - } - -out: - spl_fstrans_unmark(cookie); -#ifdef HAVE_MAKE_REQUEST_FN_RET_INT - return (0); -#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC) - return (BLK_QC_T_NONE); -#endif -} - /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. 
*/ -static void + +void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); - ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } @@ -1192,19 +742,19 @@ zvol_setup_zv(zvol_state_t *zv) if (error) return (SET_ERROR(error)); - error = dnode_hold(os, ZVOL_OBJ, FTAG, &zv->zv_dn); + error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (SET_ERROR(error)); - set_capacity(zv->zv_disk, volsize >> 9); + ops->zv_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { - set_disk_ro(zv->zv_disk, 1); + ops->zv_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { - set_disk_ro(zv->zv_disk, 0); + ops->zv_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); @@ -1227,7 +777,7 @@ zvol_shutdown_zv(zvol_state_t *zv) zv->zv_zilog = NULL; - dnode_rele(zv->zv_dn, FTAG); + dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* @@ -1313,7 +863,7 @@ zvol_resume(zvol_state_t *zv) return (SET_ERROR(error)); } -static int +int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; @@ -1344,7 +894,7 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly) if (!mutex_owned(&spa_namespace_lock)) { locked = mutex_tryenter(&spa_namespace_lock); if (!locked) - return (-SET_ERROR(ERESTARTSYS)); + return (SET_ERROR(EINTR)); } ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); @@ -1364,10 +914,10 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly) out_mutex: if (locked) mutex_exit(&spa_namespace_lock); - return (SET_ERROR(-error)); + return (SET_ERROR(error)); } -static void +void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); @@ -1379,574 +929,6 @@ zvol_last_close(zvol_state_t *zv) zv->zv_objset = NULL; } -static int -zvol_open(struct block_device *bdev, fmode_t flag) -{ - zvol_state_t *zv; - int error = 0; - boolean_t 
drop_suspend = B_TRUE; - - rw_enter(&zvol_state_lock, RW_READER); - /* - * Obtain a copy of private_data under the zvol_state_lock to make - * sure that either the result of zvol free code path setting - * bdev->bd_disk->private_data to NULL is observed, or zvol_free() - * is not called on this zv because of the positive zv_open_count. - */ - zv = bdev->bd_disk->private_data; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - return (SET_ERROR(-ENXIO)); - } - - mutex_enter(&zv->zv_state_lock); - /* - * make sure zvol is not suspended during first open - * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock - */ - if (zv->zv_open_count == 0) { - if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { - mutex_exit(&zv->zv_state_lock); - rw_enter(&zv->zv_suspend_lock, RW_READER); - mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ - if (zv->zv_open_count != 0) { - rw_exit(&zv->zv_suspend_lock); - drop_suspend = B_FALSE; - } - } - } else { - drop_suspend = B_FALSE; - } - rw_exit(&zvol_state_lock); - - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock)); - - if (zv->zv_open_count == 0) { - error = zvol_first_open(zv, !(flag & FMODE_WRITE)); - if (error) - goto out_mutex; - } - - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - error = -EROFS; - goto out_open_count; - } - - zv->zv_open_count++; - - mutex_exit(&zv->zv_state_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - - check_disk_change(bdev); - - return (0); - -out_open_count: - if (zv->zv_open_count == 0) - zvol_last_close(zv); - -out_mutex: - mutex_exit(&zv->zv_state_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - if (error == -ERESTARTSYS) - schedule(); - - return (SET_ERROR(error)); -} - -#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID -static void -#else -static int -#endif -zvol_release(struct gendisk *disk, fmode_t mode) -{ 
- zvol_state_t *zv; - boolean_t drop_suspend = B_TRUE; - - rw_enter(&zvol_state_lock, RW_READER); - zv = disk->private_data; - - mutex_enter(&zv->zv_state_lock); - ASSERT(zv->zv_open_count > 0); - /* - * make sure zvol is not suspended during last close - * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock - */ - if (zv->zv_open_count == 1) { - if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { - mutex_exit(&zv->zv_state_lock); - rw_enter(&zv->zv_suspend_lock, RW_READER); - mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ - if (zv->zv_open_count != 1) { - rw_exit(&zv->zv_suspend_lock); - drop_suspend = B_FALSE; - } - } - } else { - drop_suspend = B_FALSE; - } - rw_exit(&zvol_state_lock); - - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock)); - - zv->zv_open_count--; - if (zv->zv_open_count == 0) - zvol_last_close(zv); - - mutex_exit(&zv->zv_state_lock); - - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - -#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID - return (0); -#endif -} - -static int -zvol_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - zvol_state_t *zv = bdev->bd_disk->private_data; - int error = 0; - - ASSERT3U(zv->zv_open_count, >, 0); - - switch (cmd) { - case BLKFLSBUF: - fsync_bdev(bdev); - invalidate_bdev(bdev); - rw_enter(&zv->zv_suspend_lock, RW_READER); - - if (!(zv->zv_flags & ZVOL_RDONLY)) - txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - - rw_exit(&zv->zv_suspend_lock); - break; - - case BLKZNAME: - mutex_enter(&zv->zv_state_lock); - error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); - mutex_exit(&zv->zv_state_lock); - break; - - default: - error = -ENOTTY; - break; - } - - return (SET_ERROR(error)); -} - -#ifdef CONFIG_COMPAT -static int -zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long 
arg) -{ - return (zvol_ioctl(bdev, mode, cmd, arg)); -} -#else -#define zvol_compat_ioctl NULL -#endif - -/* - * Linux 2.6.38 preferred interface. - */ -#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS -static unsigned int -zvol_check_events(struct gendisk *disk, unsigned int clearing) -{ - unsigned int mask = 0; - - rw_enter(&zvol_state_lock, RW_READER); - - zvol_state_t *zv = disk->private_data; - if (zv != NULL) { - mutex_enter(&zv->zv_state_lock); - mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; - zv->zv_changed = 0; - mutex_exit(&zv->zv_state_lock); - } - - rw_exit(&zvol_state_lock); - - return (mask); -} -#else -static int zvol_media_changed(struct gendisk *disk) -{ - int changed = 0; - - rw_enter(&zvol_state_lock, RW_READER); - - zvol_state_t *zv = disk->private_data; - if (zv != NULL) { - mutex_enter(&zv->zv_state_lock); - changed = zv->zv_changed; - zv->zv_changed = 0; - mutex_exit(&zv->zv_state_lock); - } - - rw_exit(&zvol_state_lock); - - return (changed); -} -#endif - -static int zvol_revalidate_disk(struct gendisk *disk) -{ - rw_enter(&zvol_state_lock, RW_READER); - - zvol_state_t *zv = disk->private_data; - if (zv != NULL) { - mutex_enter(&zv->zv_state_lock); - set_capacity(zv->zv_disk, zv->zv_volsize >> SECTOR_BITS); - mutex_exit(&zv->zv_state_lock); - } - - rw_exit(&zvol_state_lock); - - return (0); -} - -/* - * Provide a simple virtual geometry for legacy compatibility. For devices - * smaller than 1 MiB a small head and sector count is used to allow very - * tiny devices. For devices over 1 Mib a standard head and sector count - * is used to keep the cylinders count reasonable. 
- */ -static int -zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - zvol_state_t *zv = bdev->bd_disk->private_data; - sector_t sectors; - - ASSERT3U(zv->zv_open_count, >, 0); - - sectors = get_capacity(zv->zv_disk); - - if (sectors > 2048) { - geo->heads = 16; - geo->sectors = 63; - } else { - geo->heads = 2; - geo->sectors = 4; - } - - geo->start = 0; - geo->cylinders = sectors / (geo->heads * geo->sectors); - - return (0); -} - -static struct kobject * -zvol_probe(dev_t dev, int *part, void *arg) -{ - zvol_state_t *zv; - struct kobject *kobj; - - zv = zvol_find_by_dev(dev); - kobj = zv ? get_disk_and_module(zv->zv_disk) : NULL; - ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock)); - if (zv) - mutex_exit(&zv->zv_state_lock); - - return (kobj); -} - -static struct block_device_operations zvol_ops = { - .open = zvol_open, - .release = zvol_release, - .ioctl = zvol_ioctl, - .compat_ioctl = zvol_compat_ioctl, -#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS - .check_events = zvol_check_events, -#else - .media_changed = zvol_media_changed, -#endif - .revalidate_disk = zvol_revalidate_disk, - .getgeo = zvol_getgeo, - .owner = THIS_MODULE, -}; - -/* - * Allocate memory for a new zvol_state_t and setup the required - * request queue and generic disk structures for the block device. 
- */ -static zvol_state_t * -zvol_alloc(dev_t dev, const char *name) -{ - zvol_state_t *zv; - uint64_t volmode; - - if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) - return (NULL); - - if (volmode == ZFS_VOLMODE_DEFAULT) - volmode = zvol_volmode; - - if (volmode == ZFS_VOLMODE_NONE) - return (NULL); - - zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); - - list_link_init(&zv->zv_next); - - mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); - - zv->zv_queue = blk_alloc_queue(GFP_ATOMIC); - if (zv->zv_queue == NULL) - goto out_kmem; - - blk_queue_make_request(zv->zv_queue, zvol_request); - blk_queue_set_write_cache(zv->zv_queue, B_TRUE, B_TRUE); - - /* Limit read-ahead to a single page to prevent over-prefetching. */ - blk_queue_set_read_ahead(zv->zv_queue, 1); - - /* Disable write merging in favor of the ZIO pipeline. */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zv->zv_queue); - - zv->zv_disk = alloc_disk(ZVOL_MINORS); - if (zv->zv_disk == NULL) - goto out_queue; - - zv->zv_queue->queuedata = zv; - zv->zv_dev = dev; - zv->zv_open_count = 0; - strlcpy(zv->zv_name, name, MAXNAMELEN); - - rangelock_init(&zv->zv_rangelock, NULL, NULL); - rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); - - zv->zv_disk->major = zvol_major; -#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS - zv->zv_disk->events = DISK_EVENT_MEDIA_CHANGE; -#endif - - if (volmode == ZFS_VOLMODE_DEV) { - /* - * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set - * gendisk->minors = 1 as noted in include/linux/genhd.h. - * Also disable extended partition numbers (GENHD_FL_EXT_DEVT) - * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN) - * setting gendisk->flags accordingly. 
- */ - zv->zv_disk->minors = 1; -#if defined(GENHD_FL_EXT_DEVT) - zv->zv_disk->flags &= ~GENHD_FL_EXT_DEVT; -#endif -#if defined(GENHD_FL_NO_PART_SCAN) - zv->zv_disk->flags |= GENHD_FL_NO_PART_SCAN; -#endif - } - zv->zv_disk->first_minor = (dev & MINORMASK); - zv->zv_disk->fops = &zvol_ops; - zv->zv_disk->private_data = zv; - zv->zv_disk->queue = zv->zv_queue; - snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d", - ZVOL_DEV_NAME, (dev & MINORMASK)); - - return (zv); - -out_queue: - blk_cleanup_queue(zv->zv_queue); -out_kmem: - kmem_free(zv, sizeof (zvol_state_t)); - - return (NULL); -} - -/* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). - * At this time, the structure is not opened by anyone, is taken off - * the zvol_state_list, and has its private data set to NULL. - * The zvol_state_lock is dropped. - */ -static void -zvol_free(void *arg) -{ - zvol_state_t *zv = arg; - - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count == 0); - ASSERT(zv->zv_disk->private_data == NULL); - - rw_destroy(&zv->zv_suspend_lock); - rangelock_fini(&zv->zv_rangelock); - - del_gendisk(zv->zv_disk); - blk_cleanup_queue(zv->zv_queue); - put_disk(zv->zv_disk); - - ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS); - - mutex_destroy(&zv->zv_state_lock); - dataset_kstats_destroy(&zv->zv_kstat); - - kmem_free(zv, sizeof (zvol_state_t)); -} - -/* - * Create a block device minor node and setup the linkage between it - * and the specified volume. Once this function returns the block - * device is live and ready for use. 
- */ -static int -zvol_create_minor_impl(const char *name) -{ - zvol_state_t *zv; - objset_t *os; - dmu_object_info_t *doi; - uint64_t volsize; - uint64_t len; - unsigned minor = 0; - int error = 0; - int idx; - uint64_t hash = zvol_name_hash(name); - - if (zvol_inhibit_dev) - return (0); - - idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); - if (idx < 0) - return (SET_ERROR(-idx)); - minor = idx << ZVOL_MINOR_BITS; - - zv = zvol_find_by_name_hash(name, hash, RW_NONE); - if (zv) { - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - mutex_exit(&zv->zv_state_lock); - ida_simple_remove(&zvol_ida, idx); - return (SET_ERROR(EEXIST)); - } - - doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); - if (error) - goto out_doi; - - error = dmu_object_info(os, ZVOL_OBJ, doi); - if (error) - goto out_dmu_objset_disown; - - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - if (error) - goto out_dmu_objset_disown; - - zv = zvol_alloc(MKDEV(zvol_major, minor), name); - if (zv == NULL) { - error = SET_ERROR(EAGAIN); - goto out_dmu_objset_disown; - } - zv->zv_hash = hash; - - if (dmu_objset_is_snapshot(os)) - zv->zv_flags |= ZVOL_RDONLY; - - zv->zv_volblocksize = doi->doi_data_block_size; - zv->zv_volsize = volsize; - zv->zv_objset = os; - - set_capacity(zv->zv_disk, zv->zv_volsize >> 9); - - blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9); - blk_queue_max_segments(zv->zv_queue, UINT16_MAX); - blk_queue_max_segment_size(zv->zv_queue, UINT_MAX); - blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize); - blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize); - blk_queue_max_discard_sectors(zv->zv_queue, - (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); - blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_queue); -#ifdef QUEUE_FLAG_NONROT - blk_queue_flag_set(QUEUE_FLAG_NONROT, 
zv->zv_queue); -#endif -#ifdef QUEUE_FLAG_ADD_RANDOM - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue); -#endif - - if (spa_writeable(dmu_objset_spa(os))) { - if (zil_replay_disable) - zil_destroy(dmu_objset_zil(os), B_FALSE); - else - zil_replay(os, zv, zvol_replay_vector); - } - ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); - dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); - - /* - * When udev detects the addition of the device it will immediately - * invoke blkid(8) to determine the type of content on the device. - * Prefetching the blocks commonly scanned by blkid(8) will speed - * up this process. - */ - len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); - if (len > 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); - dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, - ZIO_PRIORITY_SYNC_READ); - } - - zv->zv_objset = NULL; -out_dmu_objset_disown: - dmu_objset_disown(os, B_TRUE, FTAG); -out_doi: - kmem_free(doi, sizeof (dmu_object_info_t)); - - if (error == 0) { - rw_enter(&zvol_state_lock, RW_WRITER); - zvol_insert(zv); - rw_exit(&zvol_state_lock); - add_disk(zv->zv_disk); - } else { - ida_simple_remove(&zvol_ida, idx); - } - - return (SET_ERROR(error)); -} - -/* - * Rename a block device minor mode for the specified volume. - */ -static void -zvol_rename_minor(zvol_state_t *zv, const char *newname) -{ - int readonly = get_disk_ro(zv->zv_disk); - - ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - - strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); - - /* move to new hashtable entry */ - zv->zv_hash = zvol_name_hash(zv->zv_name); - hlist_del(&zv->zv_hlink); - hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); - - /* - * The block device's read-only state is briefly changed causing - * a KOBJ_CHANGE uevent to be issued. This ensures udev detects - * the name change and fixes the symlinks. 
This does not change - * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never - * changes. This would normally be done using kobject_uevent() but - * that is a GPL-only symbol which is why we need this workaround. - */ - set_disk_ro(zv->zv_disk, !readonly); - set_disk_ro(zv->zv_disk, readonly); -} - typedef struct minors_job { list_t *list; list_node_t link; @@ -1993,10 +975,10 @@ zvol_create_snap_minor_cb(const char *dsname, void *arg) /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " - "%s is not a shapshot name\n", dsname); + "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; - char *n = strdup(dsname); + char *n = kmem_strdup(dsname); if (n == NULL) return (0); @@ -2013,6 +995,68 @@ zvol_create_snap_minor_cb(const char *dsname, void *arg) return (0); } +/* + * If spa_keystore_load_wkey() is called for an encrypted zvol, + * we need to look for any clones also using the key. This function + * is "best effort" - so we just skip over it if there are failures. 
+ */ +static void +zvol_add_clones(const char *dsname, list_t *minors_list) +{ + /* Also check if it has clones */ + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = NULL; + + if (dsl_pool_hold(dsname, FTAG, &dp) != 0) + return; + + if (!spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_ENCRYPTION)) + goto out; + + if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0) + goto out; + + if (dsl_dir_phys(dd)->dd_clones == 0) + goto out; + + zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + objset_t *mos = dd->dd_pool->dp_meta_objset; + + for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + dsl_dataset_t *clone; + minors_job_t *job; + + if (dsl_dataset_hold_obj(dd->dd_pool, + za->za_first_integer, FTAG, &clone) == 0) { + + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(clone, name); + + char *n = kmem_strdup(name); + job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); + job->name = n; + job->list = minors_list; + job->error = 0; + list_insert_tail(minors_list, job); + + dsl_dataset_rele(clone, FTAG); + } + } + zap_cursor_fini(zc); + kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(zc, sizeof (zap_cursor_t)); + +out: + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); +} + /* * Mask errors to continue dmu_objset_find() traversal */ @@ -2038,7 +1082,7 @@ zvol_create_minors_cb(const char *dsname, void *arg) */ if (strchr(dsname, '@') == 0) { minors_job_t *job; - char *n = strdup(dsname); + char *n = kmem_strdup(dsname); if (n == NULL) return (0); @@ -2051,12 +1095,14 @@ zvol_create_minors_cb(const char *dsname, void *arg) taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); + zvol_add_clones(dsname, minors_list); + if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ - error = 
dmu_objset_find((char *)dsname, + error = dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } @@ -2085,17 +1131,14 @@ zvol_create_minors_cb(const char *dsname, void *arg) * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ -static int -zvol_create_minors_impl(const char *name) +void +zvol_create_minors_recursive(const char *name) { - int error = 0; - fstrans_cookie_t cookie; - char *atp, *parent; list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) - return (0); + return; /* * This is the list for prefetch jobs. Whenever we found a match @@ -2109,26 +1152,22 @@ zvol_create_minors_impl(const char *name) list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); - parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strlcpy(parent, name, MAXPATHLEN); - if ((atp = strrchr(parent, '@')) != NULL) { + if (strchr(name, '@') != NULL) { uint64_t snapdev; - *atp = '\0'; - error = dsl_prop_get_integer(parent, "snapdev", + int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - error = zvol_create_minor_impl(name); + (void) ops->zv_create_minor(name); } else { - cookie = spl_fstrans_mark(); - error = dmu_objset_find(parent, zvol_create_minors_cb, + fstrans_cookie_t cookie = spl_fstrans_mark(); + (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } - kmem_free(parent, MAXPATHLEN); taskq_wait_outstanding(system_taskq, 0); /* @@ -2138,25 +1177,58 @@ zvol_create_minors_impl(const char *name) while ((job = list_head(&minors_list)) != NULL) { list_remove(&minors_list, job); if (!job->error) - zvol_create_minor_impl(job->name); - strfree(job->name); + (void) ops->zv_create_minor(job->name); + kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); +} - return (SET_ERROR(error)); +void 
+zvol_create_minor(const char *name) +{ + /* + * Note: the dsl_pool_config_lock must not be held. + * Minor node creation needs to obtain the zvol_state_lock. + * zvol_open() obtains the zvol_state_lock and then the dsl pool + * config lock. Therefore, we can't have the config lock now if + * we are going to wait for the zvol_state_lock, because it + * would be a lock order inversion which could lead to deadlock. + */ + + if (zvol_inhibit_dev) + return; + + if (strchr(name, '@') != NULL) { + uint64_t snapdev; + + int error = dsl_prop_get_integer(name, + "snapdev", &snapdev, NULL); + + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) + (void) ops->zv_create_minor(name); + } else { + (void) ops->zv_create_minor(name); + } } /* * Remove minors for specified dataset including children and snapshots. */ + static void +zvol_free_task(void *arg) +{ + ops->zv_free(arg); +} + +void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); - taskqid_t t, tid = TASKQID_INVALID; + taskqid_t t; list_t free_list; if (zvol_inhibit_dev) @@ -2193,18 +1265,16 @@ zvol_remove_minors_impl(const char *name) * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. 
*/ - zv->zv_disk->private_data = NULL; + ops->zv_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ - t = taskq_dispatch(system_taskq, zvol_free, zv, + t = taskq_dispatch(system_taskq, zvol_free_task, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); - else - tid = t; } else { mutex_exit(&zv->zv_state_lock); } @@ -2214,11 +1284,8 @@ zvol_remove_minors_impl(const char *name) /* Drop zvol_state_lock before calling zvol_free() */ while ((zv = list_head(&free_list)) != NULL) { list_remove(&free_list, zv); - zvol_free(zv); + ops->zv_free(zv); } - - if (tid != TASKQID_INVALID) - taskq_wait_outstanding(system_taskq, tid); } /* Remove minor for this specific volume only */ @@ -2250,12 +1317,7 @@ zvol_remove_minor_impl(const char *name) } zvol_remove(zv); - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zv->zv_disk->private_data = NULL; - + ops->zv_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { @@ -2267,7 +1329,7 @@ zvol_remove_minor_impl(const char *name) rw_exit(&zvol_state_lock); if (zv != NULL) - zvol_free(zv); + ops->zv_free(zv); } /* @@ -2293,15 +1355,15 @@ zvol_rename_minors_impl(const char *oldname, const char *newname) mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { - zvol_rename_minor(zv, newname); + ops->zv_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); - zvol_rename_minor(zv, name); - strfree(name); + ops->zv_rename_minor(zv, name); + kmem_strfree(name); } mutex_exit(&zv->zv_state_lock); @@ -2324,7 +1386,7 @@ zvol_set_snapdev_cb(const char *dsname, void *param) switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: - (void) 
zvol_create_minor_impl(dsname); + (void) ops->zv_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); @@ -2347,14 +1409,12 @@ zvol_set_snapdev_impl(char *name, uint64_t snapdev) spl_fstrans_unmark(cookie); } -typedef struct zvol_volmode_cb_arg { - uint64_t volmode; -} zvol_volmode_cb_arg_t; - static void zvol_set_volmode_impl(char *name, uint64_t volmode) { - fstrans_cookie_t cookie = spl_fstrans_mark(); + fstrans_cookie_t cookie; + uint64_t old_volmode; + zvol_state_t *zv; if (strchr(name, '@') != NULL) return; @@ -2362,11 +1422,20 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) - * coule be different when we set, for instance, volmode from "geom" + * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). - * A possible optimization is to modify our consumers so we don't get - * called when "volmode" does not change. 
*/ + zv = zvol_find_by_name(name, RW_NONE); + if (zv == NULL && volmode == ZFS_VOLMODE_NONE) + return; + if (zv != NULL) { + old_volmode = zv->zv_volmode; + mutex_exit(&zv->zv_state_lock); + if (old_volmode == volmode) + return; + zvol_wait_close(zv); + } + cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); @@ -2374,17 +1443,16 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); - (void) zvol_create_minor_impl(name); + (void) ops->zv_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ - (void) zvol_create_minor_impl(name); + (void) ops->zv_create_minor(name); break; } - spl_fstrans_unmark(cookie); } @@ -2393,7 +1461,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; - char *delim; /* Never allow tasks on hidden names. */ if (name1[0] == '$') @@ -2402,8 +1469,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; - delim = strchr(name1, '/'); - strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) @@ -2422,14 +1487,11 @@ zvol_task_free(zvol_task_t *task) * The worker thread function performed asynchronously. 
*/ static void -zvol_task_cb(void *param) +zvol_task_cb(void *arg) { - zvol_task_t *task = (zvol_task_t *)param; + zvol_task_t *task = arg; switch (task->op) { - case ZVOL_ASYNC_CREATE_MINORS: - (void) zvol_create_minors_impl(task->name1); - break; case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; @@ -2630,21 +1692,6 @@ zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } -void -zvol_create_minors(spa_t *spa, const char *name, boolean_t async) -{ - zvol_task_t *task; - taskqid_t id; - - task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL); - if (task == NULL) - return; - - id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); - if ((async == B_FALSE) && (id != TASKQID_INVALID)) - taskq_wait_id(spa->spa_zvol_taskq, id); -} - void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { @@ -2676,92 +1723,50 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, taskq_wait_id(spa->spa_zvol_taskq, id); } -int -zvol_init(void) +boolean_t +zvol_is_zvol(const char *name) { - int threads = MIN(MAX(zvol_threads, 1), 1024); - int i, error; + + return (ops->zv_is_zvol(name)); +} + +void +zvol_register_ops(const zvol_platform_ops_t *zvol_ops) +{ + ops = zvol_ops; +} + +int +zvol_init_impl(void) +{ + int i; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); - ida_init(&zvol_ida); - - zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, - threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (zvol_taskq == NULL) { - printk(KERN_INFO "ZFS: taskq_create() failed\n"); - error = -ENOMEM; - goto out; - } zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); - if (!zvol_htable) { - error = -ENOMEM; - goto out_taskq; - } for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); - 
error = register_blkdev(zvol_major, ZVOL_DRIVER); - if (error) { - printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); - goto out_free; - } - - blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, - THIS_MODULE, zvol_probe, NULL, NULL); - return (0); - -out_free: - kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); -out_taskq: - taskq_destroy(zvol_taskq); -out: - ida_destroy(&zvol_ida); - rw_destroy(&zvol_state_lock); - list_destroy(&zvol_state_list); - - return (SET_ERROR(error)); } void -zvol_fini(void) +zvol_fini_impl(void) { zvol_remove_minors_impl(NULL); - blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); - unregister_blkdev(zvol_major, ZVOL_DRIVER); - kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); + /* + * The call to "zvol_remove_minors_impl" may dispatch entries to + * the system_taskq, but it doesn't wait for those entries to + * complete before it returns. Thus, we must wait for all of the + * removals to finish, before we can continue. 
+ */ + taskq_wait_outstanding(system_taskq, 0); - taskq_destroy(zvol_taskq); + kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); - - ida_destroy(&zvol_ida); } - -/* BEGIN CSTYLED */ -module_param(zvol_inhibit_dev, uint, 0644); -MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); - -module_param(zvol_major, uint, 0444); -MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); - -module_param(zvol_threads, uint, 0444); -MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); - -module_param(zvol_request_sync, uint, 0644); -MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); - -module_param(zvol_max_discard_blocks, ulong, 0444); -MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); - -module_param(zvol_prefetch_bytes, uint, 0644); -MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); - -module_param(zvol_volmode, uint, 0644); -MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); -/* END CSTYLED */ diff --git a/module/zstd/Makefile.in b/module/zstd/Makefile.in new file mode 100644 index 0000000000..091f7cea36 --- /dev/null +++ b/module/zstd/Makefile.in @@ -0,0 +1,39 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +zstd_include = $(src)/include +else +zstd_include = $(srctree)/$(src)/include +endif + +MODULE := zzstd + +obj-$(CONFIG_ZFS) := $(MODULE).o + +asflags-y := -I$(zstd_include) +ccflags-y := -I$(zstd_include) + +# Zstd uses -O3 by default, so we should follow +ccflags-y += -O3 + +# -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h +# Set it for other compilers, too. 
+$(obj)/lib/zstd.o: c_flags += -fno-tree-vectorize + +# SSE register return with SSE disabled if -march=znverX is passed +$(obj)/lib/zstd.o: c_flags += -U__BMI__ + +# Quiet warnings about frame size due to unused code in unmodified zstd lib +$(obj)/lib/zstd.o: c_flags += -Wframe-larger-than=20480 + +# Disable aarch64 neon SIMD instructions for kernel mode +$(obj)/lib/zstd.o: c_flags += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w + +$(obj)/zfs_zstd.o: c_flags += -include $(zstd_include)/zstd_compat_wrapper.h + +$(MODULE)-objs += zfs_zstd.o +$(MODULE)-objs += lib/zstd.o +$(MODULE)-objs += zstd_sparc.o + +all: + mkdir -p lib diff --git a/module/zstd/README.md b/module/zstd/README.md new file mode 100644 index 0000000000..eed229e2f7 --- /dev/null +++ b/module/zstd/README.md @@ -0,0 +1,65 @@ +# ZSTD-On-ZFS Library Manual + +## Introduction + +This subtree contains the ZSTD library used in ZFS. It is heavily cut-down by +dropping any unneeded files, and combined into a single file, but otherwise is +intentionally unmodified. Please do not alter the file containing the zstd +library, besides upgrading to a newer ZSTD release. + +Tree structure: + +* `zfs_zstd.c` is the actual `zzstd` kernel module. +* `lib/` contains the unmodified, [_"amalgamated"_](https://github.com/facebook/zstd/blob/dev/contrib/single_file_libs/README.md) + version of the `Zstandard` library, generated from our template file +* `zstd-in.c` is our template file for generating the library +* `include/`: This directory contains supplemental includes for platform + compatibility, which are not expected to be used by ZFS elsewhere in the + future. Thus we keep them private to ZSTD. + +## Updating ZSTD + +To update ZSTD the following steps need to be taken: + +1. Grab the latest release of [ZSTD](https://github.com/facebook/zstd/releases). +2. Update `module/zstd/zstd-in.c` if required. 
(see + `zstd/contrib/single_file_libs/zstd-in.c` in the zstd repository) +3. Generate the "single-file-library" and put it in `module/zstd/lib/`. +4. Copy the following files to `module/zstd/lib/`: + - `zstd/lib/zstd.h` + - `zstd/lib/common/zstd_errors.h` + +This can be done using a few shell commands from inside the zfs repo: + +~~~sh +cd PATH/TO/ZFS + +url="https://github.com/facebook/zstd" +release="$(curl -s "${url}"/releases/latest | grep -oP '(?<=v)[\d\.]+')" +zstd="/tmp/zstd-${release}/" + +wget -O /tmp/zstd.tar.gz \ + "${url}/releases/download/v${release}/zstd-${release}.tar.gz" +tar -C /tmp -xzf /tmp/zstd.tar.gz + +cp ${zstd}/lib/zstd.h module/zstd/lib/ +cp ${zstd}/lib/zstd_errors.h module/zstd/lib/ +${zstd}/contrib/single_file_libs/combine.sh \ + -r ${zstd}/lib -o module/zstd/lib/zstd.c module/zstd/zstd-in.c +~~~ + +Note: if the zstd library for zfs is updated to a newer version, +the macro list in include/zstd_compat_wrapper.h usually needs to be updated. +This can be done with some hand crafting of the output of the following +script: nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable + + +## Altering ZSTD and breaking changes + +If ZSTD made changes that break compatibility or you need to make breaking +changes to the way we handle ZSTD, it is required to maintain backwards +compatibility. + +We already save the ZSTD version number within the block header to be used +to add future compatibility checks and/or fixes. However, currently it is +not actually used in such a way. diff --git a/module/zstd/include/aarch64_compat.h b/module/zstd/include/aarch64_compat.h new file mode 100644 index 0000000000..088517d3d2 --- /dev/null +++ b/module/zstd/include/aarch64_compat.h @@ -0,0 +1,37 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2018-2020, Sebastian Gottschall + */ + +#ifdef _KERNEL +#undef __aarch64__ +#endif diff --git a/module/zstd/include/limits.h b/module/zstd/include/limits.h new file mode 100644 index 0000000000..3bf5b67765 --- /dev/null +++ b/module/zstd/include/limits.h @@ -0,0 +1,63 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_LIMITS_H +#define _ZSTD_LIMITS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include +#elif defined(__linux__) +#include +#include +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_LIMITS_H */ diff --git a/module/zstd/include/sparc_compat.h b/module/zstd/include/sparc_compat.h new file mode 100644 index 0000000000..14c1bdde91 --- /dev/null +++ b/module/zstd/include/sparc_compat.h @@ -0,0 +1,4 @@ +#if defined(__sparc) +uint64_t __bswapdi2(uint64_t in); +uint32_t __bswapsi2(uint32_t in); +#endif diff --git a/module/zstd/include/stddef.h b/module/zstd/include/stddef.h new file mode 100644 index 0000000000..3f46fb8b03 --- /dev/null +++ b/module/zstd/include/stddef.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDDEF_H +#define _ZSTD_STDDEF_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include +#elif defined(__linux__) +#include +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDDEF_H */ diff --git a/module/zstd/include/stdint.h b/module/zstd/include/stdint.h new file mode 100644 index 0000000000..2d98a556c2 --- /dev/null +++ b/module/zstd/include/stdint.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDINT_H +#define _ZSTD_STDINT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include +#elif defined(__linux__) +#include +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDINT_H */ diff --git a/module/zstd/include/stdio.h b/module/zstd/include/stdio.h new file mode 100644 index 0000000000..5a7c6ec699 --- /dev/null +++ b/module/zstd/include/stdio.h @@ -0,0 +1,54 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDIO_H +#define _ZSTD_STDIO_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#include_next + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDIO_H */ diff --git a/module/zstd/include/stdlib.h b/module/zstd/include/stdlib.h new file mode 100644 index 0000000000..c341a0c848 --- /dev/null +++ b/module/zstd/include/stdlib.h @@ -0,0 +1,58 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDLIB_H +#define _ZSTD_STDLIB_H + +#ifdef __cplusplus +extern "C" { +#endif + +#undef GCC_VERSION + +/* + * Define calloc, malloc, free to make building work. They are never really used + * in zstdlib.c since allocation is done in zstd.c. + */ +#define calloc(n, sz) NULL +#define malloc(sz) NULL +#define free(ptr) + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDLIB_H */ diff --git a/module/zstd/include/string.h b/module/zstd/include/string.h new file mode 100644 index 0000000000..78998d3c46 --- /dev/null +++ b/module/zstd/include/string.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STRING_H +#define _ZSTD_STRING_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include /* memcpy, memset */ +#elif defined(__linux__) +#include /* memcpy, memset */ +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STRING_H */ diff --git a/module/zstd/include/zstd_compat_wrapper.h b/module/zstd/include/zstd_compat_wrapper.h new file mode 100644 index 0000000000..71adc78040 --- /dev/null +++ b/module/zstd/include/zstd_compat_wrapper.h @@ -0,0 +1,460 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2020, Sebastian Gottschall + */ + +/* + * This wrapper fixes a problem that arises when the ZFS filesystem driver is + * compiled statically into the kernel. + * That causes a symbol collision with the older in-kernel zstd library. + * The following macros simply rename all local zstd symbols and references. + * + * Note: if the zstd library for zfs is updated to a newer version, this macro + * list usually needs to be updated. 
+ * this can be done with some hand crafting of the output of the following + * script + * nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable + */ + +#define BIT_initDStream zfs_BIT_initDStream +#define BIT_mask zfs_BIT_mask +#define BIT_reloadDStream zfs_BIT_reloadDStream +#define ERR_getErrorString zfs_ERR_getErrorString +#define FSE_NCountWriteBound zfs_FSE_NCountWriteBound +#define FSE_buildCTable zfs_FSE_buildCTable +#define FSE_buildCTable_raw zfs_FSE_buildCTable_raw +#define FSE_buildCTable_rle zfs_FSE_buildCTable_rle +#define FSE_buildCTable_wksp zfs_FSE_buildCTable_wksp +#define FSE_buildDTable zfs_FSE_buildDTable +#define FSE_buildDTable_raw zfs_FSE_buildDTable_raw +#define FSE_buildDTable_rle zfs_FSE_buildDTable_rle +#define FSE_compress zfs_FSE_compress +#define FSE_compress2 zfs_FSE_compress2 +#define FSE_compressBound zfs_FSE_compressBound +#define FSE_compress_usingCTable zfs_FSE_compress_usingCTable +#define FSE_compress_usingCTable_generic zfs_FSE_compress_usingCTable_generic +#define FSE_compress_wksp zfs_FSE_compress_wksp +#define FSE_createCTable zfs_FSE_createCTable +#define FSE_createDTable zfs_FSE_createDTable +#define FSE_decompress zfs_FSE_decompress +#define FSE_decompress_usingDTable zfs_FSE_decompress_usingDTable +#define FSE_decompress_wksp zfs_FSE_decompress_wksp +#define FSE_freeCTable zfs_FSE_freeCTable +#define FSE_freeDTable zfs_FSE_freeDTable +#define FSE_getErrorName zfs_FSE_getErrorName +#define FSE_normalizeCount zfs_FSE_normalizeCount +#define FSE_optimalTableLog zfs_FSE_optimalTableLog +#define FSE_optimalTableLog_internal zfs_FSE_optimalTableLog_internal +#define FSE_readNCount zfs_FSE_readNCount +#define FSE_versionNumber zfs_FSE_versionNumber +#define FSE_writeNCount zfs_FSE_writeNCount +#define HIST_count zfs_HIST_count +#define HIST_countFast zfs_HIST_countFast +#define HIST_countFast_wksp zfs_HIST_countFast_wksp +#define HIST_count_parallel_wksp zfs_HIST_count_parallel_wksp +#define HIST_count_simple 
zfs_HIST_count_simple +#define HIST_count_wksp zfs_HIST_count_wksp +#define HUF_buildCTable zfs_HUF_buildCTable +#define HUF_buildCTable_wksp zfs_HUF_buildCTable_wksp +#define HUF_compress zfs_HUF_compress +#define HUF_compress1X zfs_HUF_compress1X +#define HUF_compress1X_repeat zfs_HUF_compress1X_repeat +#define HUF_compress1X_usingCTable zfs_HUF_compress1X_usingCTable +#define HUF_compress1X_wksp zfs_HUF_compress1X_wksp +#define HUF_compress2 zfs_HUF_compress2 +#define HUF_compress4X_repeat zfs_HUF_compress4X_repeat +#define HUF_compress4X_usingCTable zfs_HUF_compress4X_usingCTable +#define HUF_compress4X_wksp zfs_HUF_compress4X_wksp +#define HUF_compressBound zfs_HUF_compressBound +#define HUF_compressWeights zfs_HUF_compressWeights +#define HUF_decompress zfs_HUF_decompress +#define HUF_decompress1X1 zfs_HUF_decompress1X1 +#define HUF_decompress1X1_DCtx zfs_HUF_decompress1X1_DCtx +#define HUF_decompress1X1_DCtx_wksp zfs_HUF_decompress1X1_DCtx_wksp +#define HUF_decompress1X1_DCtx_wksp_bmi2 zfs_HUF_decompress1X1_DCtx_wksp_bmi2 +#define HUF_decompress1X1_usingDTable zfs_HUF_decompress1X1_usingDTable +#define HUF_decompress1X2 zfs_HUF_decompress1X2 +#define HUF_decompress1X2_DCtx zfs_HUF_decompress1X2_DCtx +#define HUF_decompress1X2_DCtx_wksp zfs_HUF_decompress1X2_DCtx_wksp +#define HUF_decompress1X2_usingDTable zfs_HUF_decompress1X2_usingDTable +#define HUF_decompress1X_DCtx zfs_HUF_decompress1X_DCtx +#define HUF_decompress1X_DCtx_wksp zfs_HUF_decompress1X_DCtx_wksp +#define HUF_decompress1X_usingDTable zfs_HUF_decompress1X_usingDTable +#define HUF_decompress1X_usingDTable_bmi2 zfs_HUF_decompress1X_usingDTable_bmi2 +#define HUF_decompress4X1 zfs_HUF_decompress4X1 +#define HUF_decompress4X1_DCtx zfs_HUF_decompress4X1_DCtx +#define HUF_decompress4X1_DCtx_wksp zfs_HUF_decompress4X1_DCtx_wksp +#define HUF_decompress4X1_usingDTable zfs_HUF_decompress4X1_usingDTable +#define HUF_decompress4X2 zfs_HUF_decompress4X2 +#define HUF_decompress4X2_DCtx 
zfs_HUF_decompress4X2_DCtx +#define HUF_decompress4X2_DCtx_wksp zfs_HUF_decompress4X2_DCtx_wksp +#define HUF_decompress4X2_usingDTable zfs_HUF_decompress4X2_usingDTable +#define HUF_decompress4X_DCtx zfs_HUF_decompress4X_DCtx +#define HUF_decompress4X_hufOnly zfs_HUF_decompress4X_hufOnly +#define HUF_decompress4X_hufOnly_wksp zfs_HUF_decompress4X_hufOnly_wksp +#define HUF_decompress4X_hufOnly_wksp_bmi2 \ + zfs_HUF_decompress4X_hufOnly_wksp_bmi2 +#define HUF_decompress4X_usingDTable zfs_HUF_decompress4X_usingDTable +#define HUF_decompress4X_usingDTable_bmi2 zfs_HUF_decompress4X_usingDTable_bmi2 +#define HUF_estimateCompressedSize zfs_HUF_estimateCompressedSize +#define HUF_fillDTableX2Level2 zfs_HUF_fillDTableX2Level2 +#define HUF_getErrorName zfs_HUF_getErrorName +#define HUF_getNbBits zfs_HUF_getNbBits +#define HUF_optimalTableLog zfs_HUF_optimalTableLog +#define HUF_readCTable zfs_HUF_readCTable +#define HUF_readDTableX1 zfs_HUF_readDTableX1 +#define HUF_readDTableX1_wksp zfs_HUF_readDTableX1_wksp +#define HUF_readDTableX2 zfs_HUF_readDTableX2 +#define HUF_readDTableX2_wksp zfs_HUF_readDTableX2_wksp +#define HUF_readStats zfs_HUF_readStats +#define HUF_selectDecoder zfs_HUF_selectDecoder +#define HUF_setMaxHeight zfs_HUF_setMaxHeight +#define HUF_validateCTable zfs_HUF_validateCTable +#define HUF_writeCTable zfs_HUF_writeCTable +#define LL_base zfs_LL_base +#define LL_bits zfs_LL_bits +#define LL_defaultDTable zfs_LL_defaultDTable +#define LL_defaultNorm zfs_LL_defaultNorm +#define ML_base zfs_ML_base +#define ML_bits zfs_ML_bits +#define ML_defaultDTable zfs_ML_defaultDTable +#define ML_defaultNorm zfs_ML_defaultNorm +#define OF_base zfs_OF_base +#define OF_bits zfs_OF_bits +#define OF_defaultDTable zfs_OF_defaultDTable +#define OF_defaultNorm zfs_OF_defaultNorm +#define POOL_add zfs_POOL_add +#define POOL_create zfs_POOL_create +#define POOL_create_advanced zfs_POOL_create_advanced +#define POOL_free zfs_POOL_free +#define POOL_resize zfs_POOL_resize +#define 
POOL_sizeof zfs_POOL_sizeof +#define POOL_tryAdd zfs_POOL_tryAdd +#define ZSTD_CCtxParams_getParameter zfs_ZSTD_CCtxParams_getParameter +#define ZSTD_CCtxParams_init zfs_ZSTD_CCtxParams_init +#define ZSTD_CCtxParams_init_advanced zfs_ZSTD_CCtxParams_init_advanced +#define ZSTD_CCtxParams_reset zfs_ZSTD_CCtxParams_reset +#define ZSTD_CCtxParams_setParameter zfs_ZSTD_CCtxParams_setParameter +#define ZSTD_CCtx_getParameter zfs_ZSTD_CCtx_getParameter +#define ZSTD_CCtx_loadDictionary zfs_ZSTD_CCtx_loadDictionary +#define ZSTD_CCtx_loadDictionary_advanced zfs_ZSTD_CCtx_loadDictionary_advanced +#define ZSTD_CCtx_loadDictionary_byReference \ + zfs_ZSTD_CCtx_loadDictionary_byReference +#define ZSTD_CCtx_refCDict zfs_ZSTD_CCtx_refCDict +#define ZSTD_CCtx_refPrefix zfs_ZSTD_CCtx_refPrefix +#define ZSTD_CCtx_refPrefix_advanced zfs_ZSTD_CCtx_refPrefix_advanced +#define ZSTD_CCtx_reset zfs_ZSTD_CCtx_reset +#define ZSTD_CCtx_setParameter zfs_ZSTD_CCtx_setParameter +#define ZSTD_CCtx_setParametersUsingCCtxParams \ + zfs_ZSTD_CCtx_setParametersUsingCCtxParams +#define ZSTD_CCtx_setPledgedSrcSize zfs_ZSTD_CCtx_setPledgedSrcSize +#define ZSTD_CStreamInSize zfs_ZSTD_CStreamInSize +#define ZSTD_CStreamOutSize zfs_ZSTD_CStreamOutSize +#define ZSTD_DCtx_loadDictionary zfs_ZSTD_DCtx_loadDictionary +#define ZSTD_DCtx_loadDictionary_advanced zfs_ZSTD_DCtx_loadDictionary_advanced +#define ZSTD_DCtx_loadDictionary_byReference \ + zfs_ZSTD_DCtx_loadDictionary_byReference +#define ZSTD_DCtx_refDDict zfs_ZSTD_DCtx_refDDict +#define ZSTD_DCtx_refPrefix zfs_ZSTD_DCtx_refPrefix +#define ZSTD_DCtx_refPrefix_advanced zfs_ZSTD_DCtx_refPrefix_advanced +#define ZSTD_DCtx_reset zfs_ZSTD_DCtx_reset +#define ZSTD_DCtx_setFormat zfs_ZSTD_DCtx_setFormat +#define ZSTD_DCtx_setMaxWindowSize zfs_ZSTD_DCtx_setMaxWindowSize +#define ZSTD_DCtx_setParameter zfs_ZSTD_DCtx_setParameter +#define ZSTD_DDict_dictContent zfs_ZSTD_DDict_dictContent +#define ZSTD_DDict_dictSize zfs_ZSTD_DDict_dictSize +#define 
ZSTD_DStreamInSize zfs_ZSTD_DStreamInSize +#define ZSTD_DStreamOutSize zfs_ZSTD_DStreamOutSize +#define ZSTD_DUBT_findBestMatch zfs_ZSTD_DUBT_findBestMatch +#define ZSTD_NCountCost zfs_ZSTD_NCountCost +#define ZSTD_XXH64_digest zfs_ZSTD_XXH64_digest +#define ZSTD_adjustCParams zfs_ZSTD_adjustCParams +#define ZSTD_assignParamsToCCtxParams zfs_ZSTD_assignParamsToCCtxParams +#define ZSTD_buildCTable zfs_ZSTD_buildCTable +#define ZSTD_buildFSETable zfs_ZSTD_buildFSETable +#define ZSTD_buildSeqStore zfs_ZSTD_buildSeqStore +#define ZSTD_buildSeqTable zfs_ZSTD_buildSeqTable +#define ZSTD_cParam_getBounds zfs_ZSTD_cParam_getBounds +#define ZSTD_cParam_withinBounds zfs_ZSTD_cParam_withinBounds +#define ZSTD_calloc zfs_ZSTD_calloc +#define ZSTD_checkCParams zfs_ZSTD_checkCParams +#define ZSTD_checkContinuity zfs_ZSTD_checkContinuity +#define ZSTD_compress zfs_ZSTD_compress +#define ZSTD_compress2 zfs_ZSTD_compress2 +#define ZSTD_compressBegin zfs_ZSTD_compressBegin +#define ZSTD_compressBegin_advanced zfs_ZSTD_compressBegin_advanced +#define ZSTD_compressBegin_advanced_internal \ + zfs_ZSTD_compressBegin_advanced_internal +#define ZSTD_compressBegin_usingCDict zfs_ZSTD_compressBegin_usingCDict +#define ZSTD_compressBegin_usingCDict_advanced \ + zfs_ZSTD_compressBegin_usingCDict_advanced +#define ZSTD_compressBegin_usingDict zfs_ZSTD_compressBegin_usingDict +#define ZSTD_compressBlock zfs_ZSTD_compressBlock +#define ZSTD_compressBlock_btlazy2 zfs_ZSTD_compressBlock_btlazy2 +#define ZSTD_compressBlock_btlazy2_dictMatchState \ + zfs_ZSTD_compressBlock_btlazy2_dictMatchState +#define ZSTD_compressBlock_btlazy2_extDict \ + zfs_ZSTD_compressBlock_btlazy2_extDict +#define ZSTD_compressBlock_btopt zfs_ZSTD_compressBlock_btopt +#define ZSTD_compressBlock_btopt_dictMatchState \ + zfs_ZSTD_compressBlock_btopt_dictMatchState +#define ZSTD_compressBlock_btopt_extDict zfs_ZSTD_compressBlock_btopt_extDict +#define ZSTD_compressBlock_btultra zfs_ZSTD_compressBlock_btultra +#define 
ZSTD_compressBlock_btultra2 zfs_ZSTD_compressBlock_btultra2 +#define ZSTD_compressBlock_btultra_dictMatchState \ + zfs_ZSTD_compressBlock_btultra_dictMatchState +#define ZSTD_compressBlock_btultra_extDict \ + zfs_ZSTD_compressBlock_btultra_extDict +#define ZSTD_compressBlock_doubleFast zfs_ZSTD_compressBlock_doubleFast +#define ZSTD_compressBlock_doubleFast_dictMatchState \ + zfs_ZSTD_compressBlock_doubleFast_dictMatchState +#define ZSTD_compressBlock_doubleFast_extDict \ + zfs_ZSTD_compressBlock_doubleFast_extDict +#define ZSTD_compressBlock_doubleFast_extDict_generic \ + zfs_ZSTD_compressBlock_doubleFast_extDict_generic +#define ZSTD_compressBlock_fast zfs_ZSTD_compressBlock_fast +#define ZSTD_compressBlock_fast_dictMatchState \ + zfs_ZSTD_compressBlock_fast_dictMatchState +#define ZSTD_compressBlock_fast_extDict zfs_ZSTD_compressBlock_fast_extDict +#define ZSTD_compressBlock_fast_extDict_generic \ + zfs_ZSTD_compressBlock_fast_extDict_generic +#define ZSTD_compressBlock_greedy zfs_ZSTD_compressBlock_greedy +#define ZSTD_compressBlock_greedy_dictMatchState \ + zfs_ZSTD_compressBlock_greedy_dictMatchState +#define ZSTD_compressBlock_greedy_extDict zfs_ZSTD_compressBlock_greedy_extDict +#define ZSTD_compressBlock_internal zfs_ZSTD_compressBlock_internal +#define ZSTD_compressBlock_lazy zfs_ZSTD_compressBlock_lazy +#define ZSTD_compressBlock_lazy2 zfs_ZSTD_compressBlock_lazy2 +#define ZSTD_compressBlock_lazy2_dictMatchState \ + zfs_ZSTD_compressBlock_lazy2_dictMatchState +#define ZSTD_compressBlock_lazy2_extDict zfs_ZSTD_compressBlock_lazy2_extDict +#define ZSTD_compressBlock_lazy_dictMatchState \ + zfs_ZSTD_compressBlock_lazy_dictMatchState +#define ZSTD_compressBlock_lazy_extDict zfs_ZSTD_compressBlock_lazy_extDict +#define ZSTD_compressBound zfs_ZSTD_compressBound +#define ZSTD_compressCCtx zfs_ZSTD_compressCCtx +#define ZSTD_compressContinue zfs_ZSTD_compressContinue +#define ZSTD_compressContinue_internal zfs_ZSTD_compressContinue_internal +#define 
ZSTD_compressEnd zfs_ZSTD_compressEnd +#define ZSTD_compressLiterals zfs_ZSTD_compressLiterals +#define ZSTD_compressRleLiteralsBlock zfs_ZSTD_compressRleLiteralsBlock +#define ZSTD_compressStream zfs_ZSTD_compressStream +#define ZSTD_compressStream2 zfs_ZSTD_compressStream2 +#define ZSTD_compressStream2_simpleArgs zfs_ZSTD_compressStream2_simpleArgs +#define ZSTD_compressSuperBlock zfs_ZSTD_compressSuperBlock +#define ZSTD_compress_advanced zfs_ZSTD_compress_advanced +#define ZSTD_compress_advanced_internal zfs_ZSTD_compress_advanced_internal +#define ZSTD_compress_internal zfs_ZSTD_compress_internal +#define ZSTD_compress_usingCDict zfs_ZSTD_compress_usingCDict +#define ZSTD_compress_usingCDict_advanced zfs_ZSTD_compress_usingCDict_advanced +#define ZSTD_compress_usingDict zfs_ZSTD_compress_usingDict +#define ZSTD_copyCCtx zfs_ZSTD_copyCCtx +#define ZSTD_copyDCtx zfs_ZSTD_copyDCtx +#define ZSTD_copyDDictParameters zfs_ZSTD_copyDDictParameters +#define ZSTD_count zfs_ZSTD_count +#define ZSTD_count_2segments zfs_ZSTD_count_2segments +#define ZSTD_createCCtx zfs_ZSTD_createCCtx +#define ZSTD_createCCtxParams zfs_ZSTD_createCCtxParams +#define ZSTD_createCCtx_advanced zfs_ZSTD_createCCtx_advanced +#define ZSTD_createCDict zfs_ZSTD_createCDict +#define ZSTD_createCDict_advanced zfs_ZSTD_createCDict_advanced +#define ZSTD_createCDict_byReference zfs_ZSTD_createCDict_byReference +#define ZSTD_createCStream zfs_ZSTD_createCStream +#define ZSTD_createCStream_advanced zfs_ZSTD_createCStream_advanced +#define ZSTD_createDCtx zfs_ZSTD_createDCtx +#define ZSTD_createDCtx_advanced zfs_ZSTD_createDCtx_advanced +#define ZSTD_createDDict zfs_ZSTD_createDDict +#define ZSTD_createDDict_advanced zfs_ZSTD_createDDict_advanced +#define ZSTD_createDDict_byReference zfs_ZSTD_createDDict_byReference +#define ZSTD_createDStream zfs_ZSTD_createDStream +#define ZSTD_createDStream_advanced zfs_ZSTD_createDStream_advanced +#define ZSTD_crossEntropyCost zfs_ZSTD_crossEntropyCost +#define 
ZSTD_cycleLog zfs_ZSTD_cycleLog +#define ZSTD_dParam_getBounds zfs_ZSTD_dParam_getBounds +#define ZSTD_decodeLiteralsBlock zfs_ZSTD_decodeLiteralsBlock +#define ZSTD_decodeSeqHeaders zfs_ZSTD_decodeSeqHeaders +#define ZSTD_decodingBufferSize_min zfs_ZSTD_decodingBufferSize_min +#define ZSTD_decompress zfs_ZSTD_decompress +#define ZSTD_decompressBegin zfs_ZSTD_decompressBegin +#define ZSTD_decompressBegin_usingDDict zfs_ZSTD_decompressBegin_usingDDict +#define ZSTD_decompressBegin_usingDict zfs_ZSTD_decompressBegin_usingDict +#define ZSTD_decompressBlock zfs_ZSTD_decompressBlock +#define ZSTD_decompressBlock_internal zfs_ZSTD_decompressBlock_internal +#define ZSTD_decompressBound zfs_ZSTD_decompressBound +#define ZSTD_decompressContinue zfs_ZSTD_decompressContinue +#define ZSTD_decompressContinueStream zfs_ZSTD_decompressContinueStream +#define ZSTD_decompressDCtx zfs_ZSTD_decompressDCtx +#define ZSTD_decompressMultiFrame zfs_ZSTD_decompressMultiFrame +#define ZSTD_decompressStream zfs_ZSTD_decompressStream +#define ZSTD_decompressStream_simpleArgs zfs_ZSTD_decompressStream_simpleArgs +#define ZSTD_decompress_usingDDict zfs_ZSTD_decompress_usingDDict +#define ZSTD_decompress_usingDict zfs_ZSTD_decompress_usingDict +#define ZSTD_defaultCParameters zfs_ZSTD_defaultCParameters +#define ZSTD_did_fieldSize zfs_ZSTD_did_fieldSize +#define ZSTD_encodeSequences zfs_ZSTD_encodeSequences +#define ZSTD_encodeSequences_default zfs_ZSTD_encodeSequences_default +#define ZSTD_endStream zfs_ZSTD_endStream +#define ZSTD_estimateCCtxSize zfs_ZSTD_estimateCCtxSize +#define ZSTD_estimateCCtxSize_usingCCtxParams \ + zfs_ZSTD_estimateCCtxSize_usingCCtxParams +#define ZSTD_estimateCCtxSize_usingCParams \ + zfs_ZSTD_estimateCCtxSize_usingCParams +#define ZSTD_estimateCDictSize zfs_ZSTD_estimateCDictSize +#define ZSTD_estimateCDictSize_advanced zfs_ZSTD_estimateCDictSize_advanced +#define ZSTD_estimateCStreamSize zfs_ZSTD_estimateCStreamSize +#define ZSTD_estimateCStreamSize_usingCCtxParams 
\ + zfs_ZSTD_estimateCStreamSize_usingCCtxParams +#define ZSTD_estimateCStreamSize_usingCParams \ + zfs_ZSTD_estimateCStreamSize_usingCParams +#define ZSTD_estimateDCtxSize zfs_ZSTD_estimateDCtxSize +#define ZSTD_estimateDDictSize zfs_ZSTD_estimateDDictSize +#define ZSTD_estimateDStreamSize zfs_ZSTD_estimateDStreamSize +#define ZSTD_estimateDStreamSize_fromFrame \ + zfs_ZSTD_estimateDStreamSize_fromFrame +#define ZSTD_fcs_fieldSize zfs_ZSTD_fcs_fieldSize +#define ZSTD_fillDoubleHashTable zfs_ZSTD_fillDoubleHashTable +#define ZSTD_fillHashTable zfs_ZSTD_fillHashTable +#define ZSTD_findDecompressedSize zfs_ZSTD_findDecompressedSize +#define ZSTD_findFrameCompressedSize zfs_ZSTD_findFrameCompressedSize +#define ZSTD_findFrameSizeInfo zfs_ZSTD_findFrameSizeInfo +#define ZSTD_flushStream zfs_ZSTD_flushStream +#define ZSTD_frameHeaderSize zfs_ZSTD_frameHeaderSize +#define ZSTD_free zfs_ZSTD_free +#define ZSTD_freeCCtx zfs_ZSTD_freeCCtx +#define ZSTD_freeCCtxParams zfs_ZSTD_freeCCtxParams +#define ZSTD_freeCDict zfs_ZSTD_freeCDict +#define ZSTD_freeCStream zfs_ZSTD_freeCStream +#define ZSTD_freeDCtx zfs_ZSTD_freeDCtx +#define ZSTD_freeDDict zfs_ZSTD_freeDDict +#define ZSTD_freeDStream zfs_ZSTD_freeDStream +#define ZSTD_fseBitCost zfs_ZSTD_fseBitCost +#define ZSTD_getBlockSize zfs_ZSTD_getBlockSize +#define ZSTD_getCParams zfs_ZSTD_getCParams +#define ZSTD_getCParamsFromCCtxParams zfs_ZSTD_getCParamsFromCCtxParams +#define ZSTD_getCParamsFromCDict zfs_ZSTD_getCParamsFromCDict +#define ZSTD_getCParams_internal zfs_ZSTD_getCParams_internal +#define ZSTD_getDDict zfs_ZSTD_getDDict +#define ZSTD_getDecompressedSize zfs_ZSTD_getDecompressedSize +#define ZSTD_getDictID_fromDDict zfs_ZSTD_getDictID_fromDDict +#define ZSTD_getDictID_fromDict zfs_ZSTD_getDictID_fromDict +#define ZSTD_getDictID_fromFrame zfs_ZSTD_getDictID_fromFrame +#define ZSTD_getErrorCode zfs_ZSTD_getErrorCode +#define ZSTD_getErrorName zfs_ZSTD_getErrorName +#define ZSTD_getErrorString zfs_ZSTD_getErrorString 
+#define ZSTD_getFrameContentSize zfs_ZSTD_getFrameContentSize +#define ZSTD_getFrameHeader zfs_ZSTD_getFrameHeader +#define ZSTD_getFrameHeader_advanced zfs_ZSTD_getFrameHeader_advanced +#define ZSTD_getFrameProgression zfs_ZSTD_getFrameProgression +#define ZSTD_getParams zfs_ZSTD_getParams +#define ZSTD_getSeqStore zfs_ZSTD_getSeqStore +#define ZSTD_getSequences zfs_ZSTD_getSequences +#define ZSTD_getcBlockSize zfs_ZSTD_getcBlockSize +#define ZSTD_hashPtr zfs_ZSTD_hashPtr +#define ZSTD_initCDict_internal zfs_ZSTD_initCDict_internal +#define ZSTD_initCStream zfs_ZSTD_initCStream +#define ZSTD_initCStream_advanced zfs_ZSTD_initCStream_advanced +#define ZSTD_initCStream_internal zfs_ZSTD_initCStream_internal +#define ZSTD_initCStream_srcSize zfs_ZSTD_initCStream_srcSize +#define ZSTD_initCStream_usingCDict zfs_ZSTD_initCStream_usingCDict +#define ZSTD_initCStream_usingCDict_advanced \ + zfs_ZSTD_initCStream_usingCDict_advanced +#define ZSTD_initCStream_usingDict zfs_ZSTD_initCStream_usingDict +#define ZSTD_initDDict_internal zfs_ZSTD_initDDict_internal +#define ZSTD_initDStream zfs_ZSTD_initDStream +#define ZSTD_initDStream_usingDDict zfs_ZSTD_initDStream_usingDDict +#define ZSTD_initDStream_usingDict zfs_ZSTD_initDStream_usingDict +#define ZSTD_initFseState zfs_ZSTD_initFseState +#define ZSTD_initStaticCCtx zfs_ZSTD_initStaticCCtx +#define ZSTD_initStaticCDict zfs_ZSTD_initStaticCDict +#define ZSTD_initStaticCStream zfs_ZSTD_initStaticCStream +#define ZSTD_initStaticDCtx zfs_ZSTD_initStaticDCtx +#define ZSTD_initStaticDDict zfs_ZSTD_initStaticDDict +#define ZSTD_initStaticDStream zfs_ZSTD_initStaticDStream +#define ZSTD_initStats_ultra zfs_ZSTD_initStats_ultra +#define ZSTD_insertAndFindFirstIndex zfs_ZSTD_insertAndFindFirstIndex +#define ZSTD_insertAndFindFirstIndexHash3 zfs_ZSTD_insertAndFindFirstIndexHash3 +#define ZSTD_insertAndFindFirstIndex_internal \ + zfs_ZSTD_insertAndFindFirstIndex_internal +#define ZSTD_insertBlock zfs_ZSTD_insertBlock +#define 
ZSTD_invalidateRepCodes zfs_ZSTD_invalidateRepCodes +#define ZSTD_isFrame zfs_ZSTD_isFrame +#define ZSTD_ldm_adjustParameters zfs_ZSTD_ldm_adjustParameters +#define ZSTD_ldm_blockCompress zfs_ZSTD_ldm_blockCompress +#define ZSTD_ldm_fillHashTable zfs_ZSTD_ldm_fillHashTable +#define ZSTD_ldm_generateSequences zfs_ZSTD_ldm_generateSequences +#define ZSTD_ldm_getMaxNbSeq zfs_ZSTD_ldm_getMaxNbSeq +#define ZSTD_ldm_getTableSize zfs_ZSTD_ldm_getTableSize +#define ZSTD_ldm_skipSequences zfs_ZSTD_ldm_skipSequences +#define ZSTD_loadCEntropy zfs_ZSTD_loadCEntropy +#define ZSTD_loadDEntropy zfs_ZSTD_loadDEntropy +#define ZSTD_loadDictionaryContent zfs_ZSTD_loadDictionaryContent +#define ZSTD_makeCCtxParamsFromCParams zfs_ZSTD_makeCCtxParamsFromCParams +#define ZSTD_malloc zfs_ZSTD_malloc +#define ZSTD_maxCLevel zfs_ZSTD_maxCLevel +#define ZSTD_minCLevel zfs_ZSTD_minCLevel +#define ZSTD_nextInputType zfs_ZSTD_nextInputType +#define ZSTD_nextSrcSizeToDecompress zfs_ZSTD_nextSrcSizeToDecompress +#define ZSTD_noCompressLiterals zfs_ZSTD_noCompressLiterals +#define ZSTD_referenceExternalSequences zfs_ZSTD_referenceExternalSequences +#define ZSTD_rescaleFreqs zfs_ZSTD_rescaleFreqs +#define ZSTD_resetCCtx_internal zfs_ZSTD_resetCCtx_internal +#define ZSTD_resetCCtx_usingCDict zfs_ZSTD_resetCCtx_usingCDict +#define ZSTD_resetCStream zfs_ZSTD_resetCStream +#define ZSTD_resetDStream zfs_ZSTD_resetDStream +#define ZSTD_resetSeqStore zfs_ZSTD_resetSeqStore +#define ZSTD_reset_compressedBlockState zfs_ZSTD_reset_compressedBlockState +#define ZSTD_safecopy zfs_ZSTD_safecopy +#define ZSTD_selectBlockCompressor zfs_ZSTD_selectBlockCompressor +#define ZSTD_selectEncodingType zfs_ZSTD_selectEncodingType +#define ZSTD_seqToCodes zfs_ZSTD_seqToCodes +#define ZSTD_sizeof_CCtx zfs_ZSTD_sizeof_CCtx +#define ZSTD_sizeof_CDict zfs_ZSTD_sizeof_CDict +#define ZSTD_sizeof_CStream zfs_ZSTD_sizeof_CStream +#define ZSTD_sizeof_DCtx zfs_ZSTD_sizeof_DCtx +#define ZSTD_sizeof_DDict zfs_ZSTD_sizeof_DDict 
+#define ZSTD_sizeof_DStream zfs_ZSTD_sizeof_DStream +#define ZSTD_toFlushNow zfs_ZSTD_toFlushNow +#define ZSTD_updateRep zfs_ZSTD_updateRep +#define ZSTD_updateStats zfs_ZSTD_updateStats +#define ZSTD_updateTree zfs_ZSTD_updateTree +#define ZSTD_versionNumber zfs_ZSTD_versionNumber +#define ZSTD_versionString zfs_ZSTD_versionString +#define ZSTD_writeFrameHeader zfs_ZSTD_writeFrameHeader +#define ZSTD_writeLastEmptyBlock zfs_ZSTD_writeLastEmptyBlock +#define algoTime zfs_algoTime +#define attachDictSizeCutoffs zfs_attachDictSizeCutoffs +#define g_ctx zfs_g_ctx +#define g_debuglevel zfs_g_debuglevel +#define kInverseProbabilityLog256 zfs_kInverseProbabilityLog256 +#define repStartValue zfs_repStartValue +#define FSE_isError zfs_FSE_isError +#define HUF_isError zfs_HUF_isError diff --git a/module/zstd/lib/zstd.c b/module/zstd/lib/zstd.c new file mode 100644 index 0000000000..acdd4d9dac --- /dev/null +++ b/module/zstd/lib/zstd.c @@ -0,0 +1,27826 @@ +/* + * BSD 3-Clause Clear License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved. + * Copyright (c) 2019-2020, Michael Niewöhner. All rights reserved. + */ + +#define MEM_MODULE +#define XXH_NAMESPACE ZSTD_ +#define XXH_PRIVATE_API +#define XXH_INLINE_ALL +#define ZSTD_LEGACY_SUPPORT 0 +#define ZSTD_LIB_DICTBUILDER 0 +#define ZSTD_LIB_DEPRECATED 0 +#define ZSTD_NOBENCH + +/**** start inlining common/debug.c ****/ +/* ****************************************************************** + * debug + * Part of FSE library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + + +/* + * This module only hosts one global variable + * which can be used to dynamically influence the verbosity of traces, + * such as DEBUGLOG and RAWLOG + */ + +/**** start inlining debug.h ****/ +/* ****************************************************************** + * debug + * Part of FSE library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + + +/* + * The purpose of this header is to enable debug functions. + * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time, + * and DEBUG_STATIC_ASSERT() for compile-time. + * + * By default, DEBUGLEVEL==0, which means run-time debug is disabled. + * + * Level 1 enables assert() only. + * Starting level 2, traces can be generated and pushed to stderr. + * The higher the level, the more verbose the traces. + * + * It's possible to dynamically adjust level using variable g_debuglevel, + * which is only declared if DEBUGLEVEL>=2, + * and is a global variable, not multi-thread protected (use with care) + */ + +#ifndef DEBUG_H_12987983217 +#define DEBUG_H_12987983217 + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1]) + + +/* DEBUGLEVEL is expected to be defined externally, + * typically through compiler command line. + * Value must be a number. */ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + + +/* DEBUGFILE can be defined externally, + * typically through compiler command line. + * note : currently useless.
+ * Value must be stderr or stdout */ +#ifndef DEBUGFILE +# define DEBUGFILE stderr +#endif + + +/* recommended values for DEBUGLEVEL : + * 0 : release mode, no debug, all run-time checks disabled + * 1 : enables assert() only, no display + * 2 : reserved, for currently active debug path + * 3 : events once per object lifetime (CCtx, CDict, etc.) + * 4 : events once per frame + * 5 : events once per block + * 6 : events once per sequence (verbose) + * 7+: events at every position (*very* verbose) + * + * It's generally inconvenient to output traces > 5. + * In which case, it's possible to selectively trigger high verbosity levels + * by modifying g_debuglevel. + */ + +#if (DEBUGLEVEL>=1) +# include +#else +# ifndef assert /* assert may be already defined, due to prior #include <assert.h> */ +# define assert(condition) ((void)0) /* disable assert (default) */ +# endif +#endif + +#if (DEBUGLEVEL>=2) +# include +extern int g_debuglevel; /* the variable is only declared, + it actually lives in debug.c, + and is shared by the whole process. + It's not thread-safe. + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +# define RAWLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + fprintf(stderr, __VA_ARGS__); \ + } } +# define DEBUGLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + fprintf(stderr, __FILE__ ": " __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define RAWLOG(l, ...) {} /* disabled */ +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + + +#if defined (__cplusplus) +} +#endif + +#endif /* DEBUG_H_12987983217 */ +/**** ended inlining debug.h ****/ + +int g_debuglevel = DEBUGLEVEL; +/**** ended inlining common/debug.c ****/ +/**** start inlining common/entropy_common.c ****/ +/* ****************************************************************** + * Common functions of New Generation Entropy library + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************* +* Dependencies +***************************************/ +/**** start inlining mem.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef MEM_H_MODULE +#define MEM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-**************************************** +* Dependencies +******************************************/ +#include /* size_t, ptrdiff_t */ +#include /* memcpy */ + + +/*-**************************************** +* Compiler specifics +******************************************/ +#if defined(_MSC_VER) /* Visual Studio */ +# include /* _byteswap_ulong */ +# include /* _byteswap_* */ +#endif +#if defined(__GNUC__) +# define MEM_STATIC static __inline __attribute__((unused)) +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define MEM_STATIC static inline +#elif defined(_MSC_VER) +# define MEM_STATIC static __inline +#else +# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif + +#ifndef __has_builtin +# define __has_builtin(x) 0 /* compat. with non-clang compilers */ +#endif + +/* code only tested on 32 and 64 bits systems */ +#define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; } +MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } + +/* detects whether we are being compiled under msan */ +#if defined (__has_feature) +# if __has_feature(memory_sanitizer) +# define MEMORY_SANITIZER 1 +# endif +#endif + +#if defined (MEMORY_SANITIZER) +/* Not all platforms that support msan provide sanitizers/msan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ + +#include /* intptr_t */ + +/* Make memory region fully initialized (without changing its contents). */ +void __msan_unpoison(const volatile void *a, size_t size); + +/* Make memory region fully uninitialized (without changing its contents). + This is a legacy interface that does not update origin information. 
Use + __msan_allocated_memory() instead. */ +void __msan_poison(const volatile void *a, size_t size); + +/* Returns the offset of the first (at least partially) poisoned byte in the + memory range, or -1 if the whole range is good. */ +intptr_t __msan_test_shadow(const volatile void *x, size_t size); +#endif + +/* detects whether we are being compiled under asan */ +#if defined (__has_feature) +# if __has_feature(address_sanitizer) +# define ADDRESS_SANITIZER 1 +# endif +#elif defined(__SANITIZE_ADDRESS__) +# define ADDRESS_SANITIZER 1 +#endif + +#if defined (ADDRESS_SANITIZER) +/* Not all platforms that support asan provide sanitizers/asan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ + +/** + * Marks a memory region ([addr, addr+size)) as unaddressable. + * + * This memory must be previously allocated by your program. Instrumented + * code is forbidden from accessing addresses in this region until it is + * unpoisoned. This function is not guaranteed to poison the entire region - + * it could poison only a subregion of [addr, addr+size) due to ASan + * alignment restrictions. + * + * \note This function is not thread-safe because no two threads can poison or + * unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_poison_memory_region(void const volatile *addr, size_t size); + +/** + * Marks a memory region ([addr, addr+size)) as addressable. + * + * This memory must be previously allocated by your program. Accessing + * addresses in this region is allowed until this region is poisoned again. + * This function could unpoison a super-region of [addr, addr+size) due + * to ASan alignment restrictions. + * + * \note This function is not thread-safe because no two threads can + * poison or unpoison memory in the same memory region simultaneously. 
+ * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +#endif + + +/*-************************************************************** +* Basic Types +*****************************************************************/ +#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef int16_t S16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef int64_t S64; +#else +# include <limits.h> +#if CHAR_BIT != 8 +# error "this implementation requires char to be exactly 8-bit type" +#endif + typedef unsigned char BYTE; +#if USHRT_MAX != 65535 +# error "this implementation requires short to be exactly 16-bit type" +#endif + typedef unsigned short U16; + typedef signed short S16; +#if UINT_MAX != 4294967295 +# error "this implementation requires int to be exactly 32-bit type" +#endif + typedef unsigned int U32; + typedef signed int S32; +/* note : there are no limits defined for long long type in C90. + * limits exist in C99, however, in such case, <stdint.h> is preferred */ + typedef unsigned long long U64; + typedef signed long long S64; +#endif + + +/*-************************************************************** +* Memory I/O +*****************************************************************/ +/* MEM_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The switch below allows selecting a different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable). 
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violates the C standard. + * It can generate buggy code on targets depending on alignment. + * In some circumstances, it's the only known way to get the most performance (e.g. GCC + ARMv6) + * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define MEM_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__) +# define MEM_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; } +MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } + +MEM_STATIC unsigned MEM_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : detrimental to performance */ + return one.c[0]; +} + +#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) + +/* violates C standard, by lying on structure alignment. 
+Only use if no other choice to achieve best performance on target platform */ +MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } +MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } +MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } +MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; } + +#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) + __pragma( pack(push, 1) ) + typedef struct { U16 v; } unalign16; + typedef struct { U32 v; } unalign32; + typedef struct { U64 v; } unalign64; + typedef struct { size_t v; } unalignArch; + __pragma( pack(pop) ) +#else + typedef struct { U16 v; } __attribute__((packed)) unalign16; + typedef struct { U32 v; } __attribute__((packed)) unalign32; + typedef struct { U64 v; } __attribute__((packed)) unalign64; + typedef struct { size_t v; } __attribute__((packed)) unalignArch; +#endif + +MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; } +MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; } +MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; } +MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; } +MEM_STATIC void MEM_write64(void* 
memPtr, U64 value) { ((unalign64*)memPtr)->v = value; } + +#else + +/* default method, safe and standard. + can sometimes prove slower */ + +MEM_STATIC U16 MEM_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U32 MEM_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U64 MEM_read64(const void* memPtr) +{ + U64 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC size_t MEM_readST(const void* memPtr) +{ + size_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write64(void* memPtr, U64 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* MEM_FORCE_MEMORY_ACCESS */ + +MEM_STATIC U32 MEM_swap32(U32 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_ulong(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap32)) + return __builtin_bswap32(in); +#else + return ((in << 24) & 0xff000000 ) | + ((in << 8) & 0x00ff0000 ) | + ((in >> 8) & 0x0000ff00 ) | + ((in >> 24) & 0x000000ff ); +#endif +} + +MEM_STATIC U64 MEM_swap64(U64 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_uint64(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap64)) + return __builtin_bswap64(in); +#else + return ((in << 56) & 0xff00000000000000ULL) | + ((in << 40) & 0x00ff000000000000ULL) | + ((in << 24) & 0x0000ff0000000000ULL) | + ((in << 8) & 0x000000ff00000000ULL) | + ((in >> 8) & 0x00000000ff000000ULL) | + ((in >> 24) & 0x0000000000ff0000ULL) | + ((in >> 40) & 0x000000000000ff00ULL) | + ((in >> 56) & 0x00000000000000ffULL); 
+#endif +} + +MEM_STATIC size_t MEM_swapST(size_t in) +{ + if (MEM_32bits()) + return (size_t)MEM_swap32((U32)in); + else + return (size_t)MEM_swap64((U64)in); +} + +/*=== Little endian r/w ===*/ + +MEM_STATIC U16 MEM_readLE16(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read16(memPtr); + else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)(p[0] + (p[1]<<8)); + } +} + +MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) +{ + if (MEM_isLittleEndian()) { + MEM_write16(memPtr, val); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)val; + p[1] = (BYTE)(val>>8); + } +} + +MEM_STATIC U32 MEM_readLE24(const void* memPtr) +{ + return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16); +} + +MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val) +{ + MEM_writeLE16(memPtr, (U16)val); + ((BYTE*)memPtr)[2] = (BYTE)(val>>16); +} + +MEM_STATIC U32 MEM_readLE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read32(memPtr); + else + return MEM_swap32(MEM_read32(memPtr)); +} + +MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, val32); + else + MEM_write32(memPtr, MEM_swap32(val32)); +} + +MEM_STATIC U64 MEM_readLE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read64(memPtr); + else + return MEM_swap64(MEM_read64(memPtr)); +} + +MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + MEM_write64(memPtr, val64); + else + MEM_write64(memPtr, MEM_swap64(val64)); +} + +MEM_STATIC size_t MEM_readLEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readLE32(memPtr); + else + return (size_t)MEM_readLE64(memPtr); +} + +MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeLE32(memPtr, (U32)val); + else + MEM_writeLE64(memPtr, (U64)val); +} + +/*=== Big endian r/w ===*/ + +MEM_STATIC U32 MEM_readBE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return 
MEM_swap32(MEM_read32(memPtr)); + else + return MEM_read32(memPtr); +} + +MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, MEM_swap32(val32)); + else + MEM_write32(memPtr, val32); +} + +MEM_STATIC U64 MEM_readBE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_swap64(MEM_read64(memPtr)); + else + return MEM_read64(memPtr); +} + +MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + MEM_write64(memPtr, MEM_swap64(val64)); + else + MEM_write64(memPtr, val64); +} + +MEM_STATIC size_t MEM_readBEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readBE32(memPtr); + else + return (size_t)MEM_readBE64(memPtr); +} + +MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeBE32(memPtr, (U32)val); + else + MEM_writeBE64(memPtr, (U64)val); +} + + +#if defined (__cplusplus) +} +#endif + +#endif /* MEM_H_MODULE */ +/**** ended inlining mem.h ****/ +/**** start inlining error_private.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* Note : this module is expected to remain private, do not expose it */ + +#ifndef ERROR_H_MODULE +#define ERROR_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* **************************************** +* Dependencies +******************************************/ +#include <stddef.h> /* size_t */ +/**** start inlining zstd_errors.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_ERRORS_H_398273423 +#define ZSTD_ERRORS_H_398273423 + +#if defined (__cplusplus) +extern "C" { +#endif + +/*===== dependency =====*/ +#include <stddef.h> /* size_t */ + + +/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDERRORLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDERRORLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +#endif + +/*-********************************************* + * Error codes list + *-********************************************* + * Error codes _values_ are pinned down since v1.3.1 only. + * Therefore, don't rely on values if you may link to any version < v1.3.1. + * + * Only values < 100 are considered stable. + * + * note 1 : this API shall be used with static linking only. + * dynamic linking is not yet officially supported. + * note 2 : Prefer relying on the enum than on its value whenever possible + * This is the only supported way to use the error list < v1.3.1 + * note 3 : ZSTD_isError() is always correct, whatever the library version. 
+ **********************************************/ +typedef enum { + ZSTD_error_no_error = 0, + ZSTD_error_GENERIC = 1, + ZSTD_error_prefix_unknown = 10, + ZSTD_error_version_unsupported = 12, + ZSTD_error_frameParameter_unsupported = 14, + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, + ZSTD_error_workSpace_tooSmall= 66, + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ +} ZSTD_ErrorCode; + +/*! 
ZSTD_getErrorCode() : + convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, + which can be used to compare with enum list published above */ +ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); +ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_ERRORS_H_398273423 */ +/**** ended inlining zstd_errors.h ****/ + + +/* **************************************** +* Compiler-specific +******************************************/ +#if defined(__GNUC__) +# define ERR_STATIC static __attribute__((unused)) +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define ERR_STATIC static inline +#elif defined(_MSC_VER) +# define ERR_STATIC static __inline +#else +# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif + + +/*-**************************************** +* Customization (error_public.h) +******************************************/ +typedef ZSTD_ErrorCode ERR_enum; +#define PREFIX(name) ZSTD_error_##name + + +/*-**************************************** +* Error codes handling +******************************************/ +#undef ERROR /* already defined on Visual Studio */ +#define ERROR(name) ZSTD_ERROR(name) +#define ZSTD_ERROR(name) ((size_t)-PREFIX(name)) + +ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + +ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + +/* check and forward error code */ +#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } + + +/*-**************************************** +* Error Strings +******************************************/ + +const 
char* ERR_getErrorString(ERR_enum code); /* error_private.c */ + +ERR_STATIC const char* ERR_getErrorName(size_t code) +{ + return ERR_getErrorString(ERR_getErrorCode(code)); +} + +#if defined (__cplusplus) +} +#endif + +#endif /* ERROR_H_MODULE */ +/**** ended inlining error_private.h ****/ +#define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ +/**** start inlining fse.h ****/ +/* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef FSE_H +#define FSE_H + + +/*-***************************************** +* Dependencies +******************************************/ +#include <stddef.h> /* size_t, ptrdiff_t */ + + +/*-***************************************** +* FSE_PUBLIC_API : control library symbols visibility +******************************************/ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define FSE_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define FSE_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define FSE_PUBLIC_API +#endif + 
+/*------ Version ------*/ +#define FSE_VERSION_MAJOR 0 +#define FSE_VERSION_MINOR 9 +#define FSE_VERSION_RELEASE 0 + +#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE +#define FSE_QUOTE(str) #str +#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str) +#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION) + +#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE) +FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ + + +/*-**************************************** +* FSE simple functions +******************************************/ +/*! FSE_compress() : + Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. + 'dst' buffer must be already allocated. Compression runs faster if dstCapacity >= FSE_compressBound(srcSize). + @return : size of compressed data (<= dstCapacity). + Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. + if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +*/ +FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +/*! FSE_decompress(): + Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', + into already allocated destination buffer 'dst', of size 'dstCapacity'. + @return : size of regenerated data (<= dstCapacity), + or an error code, which can be tested using FSE_isError() . + + ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! + Why ? : making this distinction requires a header. + Header management is intentionally delegated to the user layer, which can better manage special cases. 
+*/ +FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, + const void* cSrc, size_t cSrcSize); + + +/*-***************************************** +* Tool functions +******************************************/ +FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */ + +/* Error Management */ +FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */ +FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +/*-***************************************** +* FSE advanced functions +******************************************/ +/*! FSE_compress2() : + Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' + Both parameters can be defined as '0' to mean : use default value + @return : size of compressed data + Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. + if FSE_isError(return), it's an error code. +*/ +FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); + + +/*-***************************************** +* FSE detailed API +******************************************/ +/*! +FSE_compress() does the following: +1. count symbol occurrence from source[] into table count[] (see hist.h) +2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog) +3. save normalized counters to memory buffer using writeNCount() +4. build encoding table 'CTable' from normalized counters +5. encode the data stream using encoding table 'CTable' + +FSE_decompress() does the following: +1. read normalized counters with readNCount() +2. build decoding table 'DTable' from normalized counters +3. 
decode the data stream using decoding table 'DTable' + +The following API allows targeting specific sub-functions for advanced tasks. +For example, it's possible to compress several blocks using the same 'CTable', +or to save and provide normalized distribution using external method. +*/ + +/* *** COMPRESSION *** */ + +/*! FSE_optimalTableLog(): + dynamically downsize 'tableLog' when conditions are met. + It saves CPU time, by using smaller tables, while preserving or even improving compression ratio. + @return : recommended tableLog (necessarily <= 'maxTableLog') */ +FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_normalizeCount(): + normalize counts so that sum(count[]) == Power_of_2 (2^tableLog) + 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). + @return : tableLog, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, + const unsigned* count, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_NCountWriteBound(): + Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. + Typically useful for allocation purpose. */ +FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_writeNCount(): + Compactly save 'normalizedCounter' into 'buffer'. + @return : size of the compressed table, + or an errorCode, which can be tested using FSE_isError(). */ +FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, + unsigned maxSymbolValue, unsigned tableLog); + +/*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. 
It's only meant to be more restrictive than void* */ +FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + +/*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). + @return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_compress_usingCTable(): + Compress `src` using `ct` into `dst` which must be already allocated. + @return : size of compressed data (<= `dstCapacity`), + or 0 if compressed data could not fit into `dst`, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct); + +/*! +Tutorial : +---------- +The first step is to count all symbols. FSE_count() does this job very fast. +Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells. +'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0] +maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value) +FSE_count() will return the number of occurrence of the most frequent symbol. +This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). + +The next step is to normalize the frequencies. +FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'. +It also guarantees a minimum of 1 to any Symbol with frequency >= 1. +You can use 'tableLog'==0 to mean "use default tableLog value". 
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(), +which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default"). + +The result of FSE_normalizeCount() will be saved into a table, +called 'normalizedCounter', which is a table of signed short. +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells. +The return value is tableLog if everything proceeded as expected. +It is 0 if there is a single symbol within distribution. +If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). + +'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount(). +'buffer' must be already allocated. +For guaranteed success, buffer size must be at least FSE_NCountWriteBound(). +The result of the function is the number of bytes written into 'buffer'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small). + +'normalizedCounter' can then be used to create the compression table 'CTable'. +The space required by 'CTable' must be already allocated, using FSE_createCTable(). +You can then use FSE_buildCTable() to fill 'CTable'. +If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()). + +'CTable' can then be used to compress 'src', with FSE_compress_usingCTable(). +Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize' +The function returns the size of compressed data (without header), necessarily <= `dstCapacity`. +If it returns '0', compressed data could not fit into 'dst'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). +*/ + + +/* *** DECOMPRESSION *** */ + +/*! 
FSE_readNCount(): + Read compactly saved 'normalizedCounter' from 'rBuffer'. + @return : size read from 'rBuffer', + or an errorCode, which can be tested using FSE_isError(). + maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ +FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize); + +/*! Constructor and Destructor of FSE_DTable. + Note that its size depends on 'tableLog' */ +typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); + +/*! FSE_buildDTable(): + Builds 'dt', which must be already allocated, using FSE_createDTable(). + return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_decompress_usingDTable(): + Decompress compressed source `cSrc` of size `cSrcSize` using `dt` + into `dst` which must be already allocated. + @return : size of regenerated data (necessarily <= `dstCapacity`), + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + +/*! +Tutorial : +---------- +(Note : these functions only decompress FSE-compressed blocks. + If block is uncompressed, use memcpy() instead + If block is a single repeated byte, use memset() instead ) + +The first step is to obtain the normalized frequencies of symbols. +This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount(). +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short. 
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand, +or size the table to handle worst case situations (typically 256). +FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'. +The result of FSE_readNCount() is the number of bytes read from 'rBuffer'. +Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that. +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'. +This is performed by the function FSE_buildDTable(). +The space required by 'FSE_DTable' must be already allocated using FSE_createDTable(). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable(). +`cSrcSize` must be strictly correct, otherwise decompression will fail. +FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small) +*/ + +#endif /* FSE_H */ + +#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY) +#define FSE_H_FSE_STATIC_LINKING_ONLY + +/* *** Dependency *** */ +/**** start inlining bitstream.h ****/ +/* ****************************************************************** + * bitstream + * Part of FSE library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ +#ifndef BITSTREAM_H_MODULE +#define BITSTREAM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + +/* +* This API consists of small unitary functions, which must be inlined for best performance. +* Since link-time-optimization is not available for all compilers, +* these functions are defined into a .h to be included. +*/ + +/*-**************************************** +* Dependencies +******************************************/ +/**** skipping file: mem.h ****/ +/**** start inlining compiler.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPILER_H +#define ZSTD_COMPILER_H + +/*-******************************************************* +* Compiler specifics +*********************************************************/ +/* force inlining */ + +#if !defined(ZSTD_NO_INLINE) +#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# define INLINE_KEYWORD inline +#else +# define INLINE_KEYWORD +#endif + +#if defined(__GNUC__) || defined(__ICCARM__) +# define FORCE_INLINE_ATTR __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define FORCE_INLINE_ATTR __forceinline +#else +# define FORCE_INLINE_ATTR +#endif + +#else + +#define INLINE_KEYWORD +#define FORCE_INLINE_ATTR + +#endif + +/** + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. 
+ */ +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR +/** + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers + * performance. + * + * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the + * always_inline attribute. + * + * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline + * attribute. + */ +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 +# define HINT_INLINE static INLINE_KEYWORD +#else +# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR +#endif + +/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +#if defined(__GNUC__) +# define UNUSED_ATTR __attribute__((unused)) +#else +# define UNUSED_ATTR +#endif + +/* force no inlining */ +#ifdef _MSC_VER +# define FORCE_NOINLINE static __declspec(noinline) +#else +# if defined(__GNUC__) || defined(__ICCARM__) +# define FORCE_NOINLINE static __attribute__((__noinline__)) +# else +# define FORCE_NOINLINE static +# endif +#endif + +/* target attribute */ +#ifndef __has_attribute + #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */ +#endif +#if defined(__GNUC__) || defined(__ICCARM__) +# define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) +#else +# define TARGET_ATTRIBUTE(target) +#endif + +/* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. 
+ */ +#ifndef DYNAMIC_BMI2 + #if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ + && (defined(__x86_64__) || defined(_M_X86)) \ + && !defined(__BMI2__) + # define DYNAMIC_BMI2 1 + #else + # define DYNAMIC_BMI2 0 + #endif +#endif + +/* prefetch + * can be disabled, by declaring NO_PREFETCH build macro */ +#if defined(NO_PREFETCH) +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) +# elif defined(__aarch64__) +# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) +# else +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* NO_PREFETCH */ + +#define CACHELINE_SIZE 64 + +#define PREFETCH_AREA(p, s) { \ + const char* const _ptr = (const char*)(p); \ + size_t const _size = (size_t)(s); \ + size_t _pos; \ + for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ + PREFETCH_L2(_ptr + _pos); \ + } \ +} + +/* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */ +#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) +# if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || 
(__GNUC__ >= 5) +# define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) +# else +# define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")") +# endif +#else +# define DONT_VECTORIZE +#endif + +/* Tell the compiler that a branch is likely or unlikely. + * Only use these macros if it causes the compiler to generate better code. + * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc + * and clang, please do. + */ +#if defined(__GNUC__) +#define LIKELY(x) (__builtin_expect((x), 1)) +#define UNLIKELY(x) (__builtin_expect((x), 0)) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +/* disable warnings */ +#ifdef _MSC_VER /* Visual Studio */ +# include /* For Visual 2005 */ +# pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */ +# pragma warning(disable : 4324) /* disable: C4324: padded structure */ +#endif + +#endif /* ZSTD_COMPILER_H */ +/**** ended inlining compiler.h ****/ +/**** skipping file: debug.h ****/ +/**** skipping file: error_private.h ****/ + + +/*========================================= +* Target specific +=========================================*/ +#if defined(__BMI__) && defined(__GNUC__) +# include /* support for bextr (experimental) */ +#elif defined(__ICCARM__) +# include +#endif + +#define STREAM_ACCUMULATOR_MIN_32 25 +#define STREAM_ACCUMULATOR_MIN_64 57 +#define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64)) + + +/*-****************************************** +* bitStream encoding API (write forward) +********************************************/ +/* bitStream can mix input from multiple sources. 
+ * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ +typedef struct { + size_t bitContainer; + unsigned bitPos; + char* startPtr; + char* ptr; + char* endPtr; +} BIT_CStream_t; + +MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); +MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); +MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); +MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + +/* Start with initCStream, providing the size of buffer to write into. +* bitStream will never write outside of this buffer. +* `dstCapacity` must be > sizeof(bitC->bitContainer), otherwise @return will be an error code. +* +* bits are first added to a local register. +* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. +* Writing data into memory is an explicit operation, performed by the flushBits function. +* Hence keep track of how many bits are potentially stored into local register to avoid register overflow. +* After a flushBits, a maximum of 7 bits might still be stored into local register. +* +* Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers. +* +* Last operation is to close the bitStream. +* The function returns the final size of CStream in bytes. 
+* If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable) +*/ + + +/*-******************************************** +* bitStream decoding API (read backward) +**********************************************/ +typedef struct { + size_t bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; +} BIT_DStream_t; + +typedef enum { BIT_DStream_unfinished = 0, + BIT_DStream_endOfBuffer = 1, + BIT_DStream_completed = 2, + BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ + /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ + +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); +MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + +/* Start by invoking BIT_initDStream(). +* A chunk of the bitStream is then stored into a local register. +* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +* You can then retrieve bitFields stored into the local register, **in reverse order**. +* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. +* A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +* Otherwise, it can be less than that, so proceed accordingly. +* Checking if DStream has reached its end can be performed with BIT_endOfDStream(). 
+*/ + + +/*-**************************************** +* unsafe API +******************************************/ +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); +/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + +MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +/* unsafe version; does not check buffer overflow */ + +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); +/* faster, but works only if nbBits >= 1 */ + + + +/*-************************************************************** +* Internal functions +****************************************************************/ +MEM_STATIC unsigned BIT_highbit32 (U32 val) +{ + assert(val != 0); + { +# if defined(_MSC_VER) /* Visual */ + unsigned long r=0; + return _BitScanReverse ( &r, val ) ? (unsigned)r : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ + return __builtin_clz (val) ^ 31; +# elif defined(__ICCARM__) /* IAR Intrinsic */ + return 31 - __CLZ(val); +# else /* Software version */ + static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31 }; + U32 v = val; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +# endif + } +} + +/*===== Local Constants =====*/ +static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, + 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, + 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, + 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, + 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF, + 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */ +#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0])) + +/*-************************************************************** +* bitStream encoding +****************************************************************/ 
+/*! BIT_initCStream() : + * `dstCapacity` must be > sizeof(size_t) + * @return : 0 if success, + * otherwise an error code (can be tested using ERR_isError()) */ +MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + bitC->bitContainer = 0; + bitC->bitPos = 0; + bitC->startPtr = (char*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer); + if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) +{ + MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); + bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_addBitsFast() : + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) +{ + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); + bitC->bitContainer |= value << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_flushBitsFast() : + * assumption : bitContainer has not overflowed + * unsafe version; does not check buffer overflow */ +MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; +} + +/*! BIT_flushBits() : + * assumption : bitContainer has not overflowed + * safe version; check for buffer overflow, and prevents it. 
+ * note : does not signal buffer overflow. + * overflow will be revealed later on using BIT_closeCStream() */ +MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; +} + +/*! BIT_closeCStream() : + * @return : size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) +{ + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); +} + + +/*-******************************************************** +* bitStream decoding +**********************************************************/ +/*! BIT_initDStream() : + * Initialize a BIT_DStream_t. + * `bitD` : a pointer to an already allocated BIT_DStream_t structure. + * `srcSize` must be the *exact* size of the bitStream, in bytes. + * @return : size of stream (== srcSize), or an errorCode if a problem is detected + */ +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) +{ + if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } + + bitD->start = (const char*)srcBuffer; + bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer); + + if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */ + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; + bitD->bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { + case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + /* fall-through */ + + case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + /* fall-through */ + + case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + /* fall-through */ + + case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; + /* fall-through */ + + case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; + /* fall-through */ + + case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; + /* fall-through */ + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; + bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; + } + + return srcSize; +} + +MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) +{ + return bitContainer >> start; +} + +MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) +{ + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ + assert(nbBits < BIT_MASK_SIZE); + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; +} + +MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +{ + assert(nbBits < BIT_MASK_SIZE); + return bitContainer & BIT_mask[nbBits]; +} + +/*! BIT_lookBits() : + * Provides next n bits from local register. 
+ * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. + * @return : value extracted */ +MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) +{ + /* arbitrate between double-shift and shift+mask */ +#if 1 + /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8, + * bitstream is likely corrupted, and result is undefined */ + return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits); +#else + /* this code path is slower on my os-x laptop */ + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask); +#endif +} + +/*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) +{ + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); +} + +MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) +{ + bitD->bitsConsumed += nbBits; +} + +/*! BIT_readBits() : + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) +{ + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! BIT_readBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) +{ + size_t const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! 
+ * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this + * point you must use BIT_reloadDStream() to reload. + */ +MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) +{ + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; + assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); + bitD->ptr -= bitD->bitsConsumed >> 3; + bitD->bitsConsumed &= 7; + bitD->bitContainer = MEM_readLEST(bitD->ptr); + return BIT_DStream_unfinished; +} + +/*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . + * This function is safe, it guarantees it will not read beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +{ + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + return BIT_DStream_overflow; + + if (bitD->ptr >= bitD->limitPtr) { + return BIT_reloadDStreamFast(bitD); + } + if (bitD->ptr == bitD->start) { + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } + /* start < ptr < limitPtr */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { + nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */ + result = BIT_DStream_endOfBuffer; + } + bitD->ptr -= nbBytes; + bitD->bitsConsumed -= nbBytes*8; + bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */ + return result; + } +} + +/*! BIT_endOfDStream() : + * @return : 1 if DStream has _exactly_ reached its end (all bits consumed). 
+ */ +MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) +{ + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); +} + +#if defined (__cplusplus) +} +#endif + +#endif /* BITSTREAM_H_MODULE */ +/**** ended inlining bitstream.h ****/ + + +/* ***************************************** +* Static allocation +*******************************************/ +/* FSE buffer bounds */ +#define FSE_NCOUNTBOUND 512 +#define FSE_BLOCKBOUND(size) (size + (size>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */) +#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */ +#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2)) +#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1< 12) ? (1 << (maxTableLog - 2)) : 1024) ) +size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + +size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ + +size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); +/**< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). + * `wkspSize` must be >= `(1<= BIT_DStream_completed + +When it's done, verify decompression is fully completed, by checking both DStream and the relevant states. +Checking if DStream has reached its end is performed by : + BIT_endOfDStream(&DStream); +Check also the states. 
There might be some symbols left there, if some high probability ones (>50%) are possible. + FSE_endOfDState(&DState); +*/ + + +/* ***************************************** +* FSE unsafe API +*******************************************/ +static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD); +/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */ + + +/* ***************************************** +* Implementation of inlined functions +*******************************************/ +typedef struct { + int deltaFindState; + U32 deltaNbBits; +} FSE_symbolCompressionTransform; /* total 8 bytes */ + +MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct) +{ + const void* ptr = ct; + const U16* u16ptr = (const U16*) ptr; + const U32 tableLog = MEM_read16(ptr); + statePtr->value = (ptrdiff_t)1<stateTable = u16ptr+2; + statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1); + statePtr->stateLog = tableLog; +} + + +/*! 
FSE_initCState2() : +* Same as FSE_initCState(), but the first symbol to include (which will be the last to be read) +* uses the smallest state value possible, saving the cost of this symbol */ +MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol) +{ + FSE_initCState(statePtr, ct); + { const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* stateTable = (const U16*)(statePtr->stateTable); + U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16); + statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits; + statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } +} + +MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol) +{ + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); + BIT_addBits(bitC, statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; +} + +MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) +{ + BIT_addBits(bitC, statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); +} + + +/* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. 
+ * Fractional values get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16; +} + +/* FSE_bitCost() : + * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16; + U32 const threshold = (minNbBits+1) << 16; + assert(tableLog < 16); + assert(accuracyLog < 31-tableLog); /* ensure enough room for renormalization double shift */ + { U32 const tableSize = 1 << tableLog; + U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize); + U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */ + U32 const bitMultiplier = 1 << accuracyLog; + assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold); + assert(normalizedDeltaFromThreshold <= bitMultiplier); + return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold; + } +} + + +/* ====== Decompression ====== */ + +typedef struct { + U16 tableLog; + U16 fastMode; +} FSE_DTableHeader; /* sizeof U32 */ + +typedef struct +{ + unsigned short newState; + unsigned char symbol; + unsigned char nbBits; +} FSE_decode_t; /* size == U32 
*/ + +MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt) +{ + const void* ptr = dt; + const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + return DInfo.symbol; +} + +MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.newState + lowBits; +} + +MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBits(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +/*! 
FSE_decodeSymbolFast() : + unsafe, only works if no symbol has a probability > 50% */ +MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBitsFast(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) +{ + return DStatePtr->state == 0; +} + + + +#ifndef FSE_COMMONDEFS_ONLY + +/* ************************************************************** +* Tuning parameters +****************************************************************/ +/*!MEMORY_USAGE : +* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +* Increasing memory usage improves compression ratio +* Reduced memory usage can improve speed, due to cache effect +* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ +#ifndef FSE_MAX_MEMORY_USAGE +# define FSE_MAX_MEMORY_USAGE 14 +#endif +#ifndef FSE_DEFAULT_MEMORY_USAGE +# define FSE_DEFAULT_MEMORY_USAGE 13 +#endif + +/*!FSE_MAX_SYMBOL_VALUE : +* Maximum symbol value authorized. 
+* Required for proper stack allocation */ +#ifndef FSE_MAX_SYMBOL_VALUE +# define FSE_MAX_SYMBOL_VALUE 255 +#endif + +/* ************************************************************** +* template functions type & suffix +****************************************************************/ +#define FSE_FUNCTION_TYPE BYTE +#define FSE_FUNCTION_EXTENSION +#define FSE_DECODE_TYPE FSE_decode_t + + +#endif /* !FSE_COMMONDEFS_ONLY */ + + +/* *************************************************************** +* Constants +*****************************************************************/ +#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE-2) +#define FSE_MAX_TABLESIZE (1U< FSE_TABLELOG_ABSOLUTE_MAX +# error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported" +#endif + +#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3) + + +#endif /* FSE_STATIC_LINKING_ONLY */ + + +#if defined (__cplusplus) +} +#endif +/**** ended inlining fse.h ****/ +#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ +/**** start inlining huf.h ****/ +/* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef HUF_H_298734234 +#define HUF_H_298734234 + +/* *** Dependencies *** */ +#include /* size_t */ + + +/* *** library symbols visibility *** */ +/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, + * HUF symbols remain "private" (internal symbols for library only). + * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define HUF_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +#else +# define HUF_PUBLIC_API +#endif + + +/* ========================== */ +/* *** simple functions *** */ +/* ========================== */ + +/** HUF_compress() : + * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. + * 'dst' buffer must be already allocated. + * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). + * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. + * @return : size of compressed data (<= `dstCapacity`). + * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) + */ +HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +/** HUF_decompress() : + * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', + * into already allocated buffer 'dst', of minimum size 'dstSize'. 
+ * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. + * Note : in contrast with FSE, HUF_decompress can regenerate + * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, + * because it knows size to regenerate (originalSize). + * @return : size of regenerated data (== originalSize), + * or an error code, which can be tested using HUF_isError() + */ +HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, + const void* cSrc, size_t cSrcSize); + + +/* *** Tool functions *** */ +#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */ +HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */ + +/* Error Management */ +HUF_PUBLIC_API unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */ +HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */ + + +/* *** Advanced function *** */ + +/** HUF_compress2() : + * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. + * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . + * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog); + +/** HUF_compress4X_wksp() : + * Same as HUF_compress2(), but uses externally allocated `workSpace`. 
+ * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */ +#define HUF_WORKSPACE_SIZE ((6 << 10) + 256) +#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) +HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize); + +#endif /* HUF_H_298734234 */ + +/* ****************************************************************** + * WARNING !! + * The following section contains advanced and experimental definitions + * which shall never be used in the context of a dynamic library, + * because they are not guaranteed to remain stable in the future. + * Only consider them in association with static linking. + * *****************************************************************/ +#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +#define HUF_H_HUF_STATIC_LINKING_ONLY + +/* *** Dependencies *** */ +/**** skipping file: mem.h ****/ + + +/* *** Constants *** */ +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */ +#define HUF_SYMBOLVALUE_MAX 255 + +#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ +#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) +# error "HUF_TABLELOG_MAX is too large !" 
+#endif + + +/* **************************************** +* Static allocation +******************************************/ +/* HUF buffer bounds */ +#define HUF_CTABLEBOUND 129 +#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true when incompressible is pre-filtered with fast heuristic */ +#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* static allocation of HUF's Compression Table */ +#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32)) +#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ + U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \ + void* name##hv = &(name##hb); \ + HUF_CElt* name = (HUF_CElt*)(name##hv) /* no final ; */ + +/* static allocation of HUF's DTable */ +typedef U32 HUF_DTable; +#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1<<(maxTableLog))) +#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \ + HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) } +#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \ + HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) } + + +/* **************************************** +* Advanced decompression functions +******************************************/ +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +#endif + +size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */ +size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t 
cSrcSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ +#endif + + +/* **************************************** + * HUF detailed API + * ****************************************/ + +/*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") + * 2. (optional) refine tableLog using HUF_optimalTableLog() + * 3. build Huffman table from count using HUF_buildCTable() + * 4. save Huffman table to memory buffer using HUF_writeCTable() + * 5. encode the data stream using HUF_compress4X_usingCTable() + * + * The following API allows targeting specific sub-functions for advanced tasks. + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. 
+ */ +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */ +size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); +int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +typedef enum { + HUF_repeat_none, /**< Cannot use the previous table */ + HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; +/** HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. */ +size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. 
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. + */ +#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) +size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, + void* workSpace, size_t wkspSize); + +/*! HUF_readStats() : + * Read compact Huffman tree, saved by HUF_writeCTable(). + * `huffWeight` is destination buffer. + * @return : size read from `src` , or an error Code . + * Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */ +size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize); + +/** HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights); + +/** HUF_getNbBits() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX + * Note 1 : is not inlined, as HUF_CElt definition is private + * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */ +U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); + +/* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics + * 2. build Huffman table from save, using HUF_readDTableX?() + * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable() + */ + +/** HUF_selectDecoder() : + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . 
+ * Assumption : 0 < dstSize <= 128 KB */ +U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + +/** + * The minimum workspace size for the `workSpace` used in + * HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp(). + * + * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when + * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15. + * Buffer overflow errors may potentially occur if code modifications result in + * a required workspace size greater than that specified in the following + * macro. + */ +#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) +#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +#endif + +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif + + +/* ====================== */ +/* single stream variants */ +/* ====================== */ + +size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, 
void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +/** HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. */ +size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); + +size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +#endif + +size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< 
double-symbols decoder */ +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ +#endif + +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */ +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif + +/* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. + */ +size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); +#endif +size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); + +#endif /* HUF_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif +/**** ended inlining huf.h ****/ + + +/*=== Version ===*/ +unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; } + + +/*=== Error Management ===*/ +unsigned FSE_isError(size_t code) { return ERR_isError(code); } +const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); } + +unsigned HUF_isError(size_t code) { return ERR_isError(code); } +const char* 
HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + + +/*-************************************************************** +* FSE NCount encoding-decoding +****************************************************************/ +size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +{ + const BYTE* const istart = (const BYTE*) headerBuffer; + const BYTE* const iend = istart + hbSize; + const BYTE* ip = istart; + int nbBits; + int remaining; + int threshold; + U32 bitStream; + int bitCount; + unsigned charnum = 0; + int previous0 = 0; + + if (hbSize < 4) { + /* This function only works when hbSize >= 4 */ + char buffer[4]; + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, headerBuffer, hbSize); + { size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr, + buffer, sizeof(buffer)); + if (FSE_isError(countSize)) return countSize; + if (countSize > hbSize) return ERROR(corruption_detected); + return countSize; + } } + assert(hbSize >= 4); + + /* init */ + memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */ + bitStream = MEM_readLE32(ip); + nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */ + if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge); + bitStream >>= 4; + bitCount = 4; + *tableLogPtr = nbBits; + remaining = (1<1) & (charnum<=*maxSVPtr)) { + if (previous0) { + unsigned n0 = charnum; + while ((bitStream & 0xFFFF) == 0xFFFF) { + n0 += 24; + if (ip < iend-5) { + ip += 2; + bitStream = MEM_readLE32(ip) >> bitCount; + } else { + bitStream >>= 16; + bitCount += 16; + } } + while ((bitStream & 3) == 3) { + n0 += 3; + bitStream >>= 2; + bitCount += 2; + } + n0 += bitStream & 3; + bitCount += 2; + if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall); + while (charnum < n0) normalizedCounter[charnum++] = 0; + if ((ip <= iend-7) 
|| (ip + (bitCount>>3) <= iend-4)) { + assert((bitCount >> 3) <= 3); /* For first condition to work */ + ip += bitCount>>3; + bitCount &= 7; + bitStream = MEM_readLE32(ip) >> bitCount; + } else { + bitStream >>= 2; + } } + { int const max = (2*threshold-1) - remaining; + int count; + + if ((bitStream & (threshold-1)) < (U32)max) { + count = bitStream & (threshold-1); + bitCount += nbBits-1; + } else { + count = bitStream & (2*threshold-1); + if (count >= threshold) count -= max; + bitCount += nbBits; + } + + count--; /* extra accuracy */ + remaining -= count < 0 ? -count : count; /* -1 means +1 */ + normalizedCounter[charnum++] = (short)count; + previous0 = !count; + while (remaining < threshold) { + nbBits--; + threshold >>= 1; + } + + if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { + ip += bitCount>>3; + bitCount &= 7; + } else { + bitCount -= (int)(8 * (iend - 4 - ip)); + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> (bitCount & 31); + } } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */ + if (remaining != 1) return ERROR(corruption_detected); + if (bitCount > 32) return ERROR(corruption_detected); + *maxSVPtr = charnum-1; + + ip += (bitCount+7)>>3; + return ip-istart; +} + + +/*! HUF_readStats() : + Read compact Huffman tree, saved by HUF_writeCTable(). + `huffWeight` is destination buffer. + `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32. + @return : size read from `src` , or an error Code . + Note : Needed by HUF_readCTable() and HUF_readDTableX?() . +*/ +size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize) +{ + U32 weightTotal; + const BYTE* ip = (const BYTE*) src; + size_t iSize; + size_t oSize; + + if (!srcSize) return ERROR(srcSize_wrong); + iSize = ip[0]; + /* memset(huffWeight, 0, hwSize); *//* is not necessary, even though some analyzer complain ... 
*/ + + if (iSize >= 128) { /* special header */ + oSize = iSize - 127; + iSize = ((oSize+1)/2); + if (iSize+1 > srcSize) return ERROR(srcSize_wrong); + if (oSize >= hwSize) return ERROR(corruption_detected); + ip += 1; + { U32 n; + for (n=0; n> 4; + huffWeight[n+1] = ip[n/2] & 15; + } } } + else { /* header compressed with FSE (normal case) */ + FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */ + if (iSize+1 > srcSize) return ERROR(srcSize_wrong); + oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */ + if (FSE_isError(oSize)) return oSize; + } + + /* collect weight stats */ + memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); + weightTotal = 0; + { U32 n; for (n=0; n= HUF_TABLELOG_MAX) return ERROR(corruption_detected); + rankStats[huffWeight[n]]++; + weightTotal += (1 << huffWeight[n]) >> 1; + } } + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ + { U32 const tableLog = BIT_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; + U32 const verif = 1 << BIT_highbit32(rest); + U32 const lastWeight = BIT_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; + } } + + /* check tree construction validity */ + if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */ + + /* results */ + *nbSymbolsPtr = (U32)(oSize+1); + return iSize+1; +} +/**** ended inlining common/entropy_common.c ****/ +/**** start inlining 
common/error_private.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* The purpose of this file is to have a single list of error strings embedded in binary */ + +/**** skipping file: error_private.h ****/ + +const char* ERR_getErrorString(ERR_enum code) +{ +#ifdef ZSTD_STRIP_ERROR_STRINGS + (void)code; + return "Error strings stripped"; +#else + static const char* const notErrorCode = "Unspecified error code"; + switch( code ) + { + case PREFIX(no_error): return "No error detected"; + case PREFIX(GENERIC): return "Error (generic)"; + case PREFIX(prefix_unknown): return "Unknown frame descriptor"; + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; + case PREFIX(corruption_detected): return "Corrupted block detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; + case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough"; + case PREFIX(stage_wrong): return "Operation not authorized at current processing stage"; + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : 
too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(maxCode): + default: return notErrorCode; + } +#endif +} +/**** ended inlining common/error_private.c ****/ +/**** start inlining common/fse_decompress.c ****/ +/* ****************************************************************** + * FSE : Finite State Entropy decoder + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */ + + +/* ************************************************************** +* Includes +****************************************************************/ +#include /* malloc, free, qsort */ +#include /* memcpy, memset */ +/**** skipping file: bitstream.h ****/ +/**** skipping file: compiler.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: fse.h ****/ +/**** skipping file: error_private.h ****/ + + +/* ************************************************************** +* Error Management +****************************************************************/ +#define FSE_isError ERR_isError +#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ + + +/* ************************************************************** +* Templates +****************************************************************/ +/* + designed to be included + for type-specific functions (template emulation in C) + Objective is to write these functions only once, for improved maintenance +*/ + +/* safety checks */ +#ifndef FSE_FUNCTION_EXTENSION +# error "FSE_FUNCTION_EXTENSION must be defined" +#endif +#ifndef FSE_FUNCTION_TYPE +# error "FSE_FUNCTION_TYPE must be defined" +#endif + +/* Function names */ +#define FSE_CAT(X,Y) X##Y +#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) +#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + + +/* Function templates */ +FSE_DTable* FSE_createDTable (unsigned tableLog) +{ + if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; + return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +} + +void FSE_freeDTable (FSE_DTable* dt) +{ + free(dt); +} + +size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ + FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr); + U16 
symbolNext[FSE_MAX_SYMBOL_VALUE+1]; + + U32 const maxSV1 = maxSymbolValue + 1; + U32 const tableSize = 1 << tableLog; + U32 highThreshold = tableSize-1; + + /* Sanity Checks */ + if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge); + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); + + /* Init, lay down lowprob symbols */ + { FSE_DTableHeader DTableH; + DTableH.tableLog = (U16)tableLog; + DTableH.fastMode = 1; + { S16 const largeLimit= (S16)(1 << (tableLog-1)); + U32 s; + for (s=0; s= largeLimit) DTableH.fastMode=0; + symbolNext[s] = normalizedCounter[s]; + } } } + memcpy(dt, &DTableH, sizeof(DTableH)); + } + + /* Spread symbols */ + { U32 const tableMask = tableSize-1; + U32 const step = FSE_TABLESTEP(tableSize); + U32 s, position = 0; + for (s=0; s highThreshold) position = (position + step) & tableMask; /* lowprob area */ + } } + if (position!=0) return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } + + /* Build Decoding table */ + { U32 u; + for (u=0; utableLog = 0; + DTableH->fastMode = 0; + + cell->newState = 0; + cell->symbol = symbolValue; + cell->nbBits = 0; + + return 0; +} + + +size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +{ + void* ptr = dt; + FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; + void* dPtr = dt + 1; + FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; + const unsigned tableSize = 1 << nbBits; + const unsigned tableMask = tableSize - 1; + const unsigned maxSV1 = tableMask+1; + unsigned s; + + /* Sanity checks */ + if (nbBits < 1) return ERROR(GENERIC); /* min size */ + + /* Build Decoding Table */ + DTableH->tableLog = (U16)nbBits; + DTableH->fastMode = 1; + for (s=0; s sizeof(bitD.bitContainer)*8) /* This test must be static */ + BIT_reloadDStream(&bitD); + + op[1] = FSE_GETSYMBOL(&state2); + + if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ + { if (BIT_reloadDStream(&bitD) > 
BIT_DStream_unfinished) { op+=2; break; } } + + op[2] = FSE_GETSYMBOL(&state1); + + if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ + BIT_reloadDStream(&bitD); + + op[3] = FSE_GETSYMBOL(&state2); + } + + /* tail */ + /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */ + while (1) { + if (op>(omax-2)) return ERROR(dstSize_tooSmall); + *op++ = FSE_GETSYMBOL(&state1); + if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) { + *op++ = FSE_GETSYMBOL(&state2); + break; + } + + if (op>(omax-2)) return ERROR(dstSize_tooSmall); + *op++ = FSE_GETSYMBOL(&state2); + if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) { + *op++ = FSE_GETSYMBOL(&state1); + break; + } } + + return op-ostart; +} + + +size_t FSE_decompress_usingDTable(void* dst, size_t originalSize, + const void* cSrc, size_t cSrcSize, + const FSE_DTable* dt) +{ + const void* ptr = dt; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ + if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); + return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +} + + +size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog) +{ + const BYTE* const istart = (const BYTE*)cSrc; + const BYTE* ip = istart; + short counting[FSE_MAX_SYMBOL_VALUE+1]; + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + + /* normal FSE decoding mode */ + size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize); + if (FSE_isError(NCountLength)) return NCountLength; + /* if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); */ /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */ + if 
(tableLog > maxLog) return ERROR(tableLog_tooLarge); + ip += NCountLength; + cSrcSize -= NCountLength; + + CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) ); + + return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */ +} + + +typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; + +size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize) +{ + DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */ + return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG); +} + + + +#endif /* FSE_COMMONDEFS_ONLY */ +/**** ended inlining common/fse_decompress.c ****/ +/**** start inlining common/pool.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* ====== Dependencies ======= */ +#include /* size_t */ +/**** skipping file: debug.h ****/ +/**** start inlining zstd_internal.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_CCOMMON_H_MODULE +#define ZSTD_CCOMMON_H_MODULE + +/* this module contains definitions which must be identical + * across compression, decompression and dictBuilder. 
+ * It also contains a few functions useful to at least 2 of them + * and which benefit from being inlined */ + +/*-************************************* +* Dependencies +***************************************/ +#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) +#include +#endif +/**** skipping file: compiler.h ****/ +/**** skipping file: mem.h ****/ +/**** skipping file: debug.h ****/ +/**** skipping file: error_private.h ****/ +#define ZSTD_STATIC_LINKING_ONLY +/**** start inlining ../zstd.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 + +/* ====== Dependency ======*/ +#include /* INT_MAX */ +#include /* size_t */ + + +/* ===== ZSTDLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDLIB_API ZSTDLIB_VISIBILITY +#endif + + +/******************************************************************************* + Introduction + + zstd, short for Zstandard, is a fast lossless compression algorithm, targeting + real-time compression scenarios at zlib-level 
and better compression ratios. + The zstd compression library provides in-memory compression and decompression + functions. + + The library supports regular compression levels from 1 up to ZSTD_maxCLevel(), + which is currently 22. Levels >= 20, labeled `--ultra`, should be used with + caution, as they require more memory. The library also offers negative + compression levels, which extend the range of speed vs. ratio preferences. + The lower the level, the faster the speed (at the cost of compression). + + Compression can be done in: + - a single step (described as Simple API) + - a single step, reusing a context (described as Explicit context) + - unbounded multiple steps (described as Streaming compression) + + The compression ratio achievable on small data can be highly improved using + a dictionary. Dictionary compression can be performed in: + - a single step (described as Simple dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing + dictionary API) + + Advanced experimental functions can be accessed using + `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h. + + Advanced experimental APIs should never be used with a dynamically-linked + library. They are not "stable"; their definitions or signatures may change in + the future. Only static linking is allowed. 
+*******************************************************************************/ + +/*------ Version ------*/ +#define ZSTD_VERSION_MAJOR 1 +#define ZSTD_VERSION_MINOR 4 +#define ZSTD_VERSION_RELEASE 5 + +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) +ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */ + +#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE +#define ZSTD_QUOTE(str) #str +#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) +#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) +ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */ + +/* ************************************* + * Default constant + ***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + +/* ************************************* + * Constants + ***************************************/ + +/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */ +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */ +#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0 + +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1<= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*! ZSTD_decompress() : + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. 
+ * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + +/*! ZSTD_getFrameContentSize() : requires v1.3.0+ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). + * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. 
+ * Each application can set its own limits. + * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/*! ZSTD_getDecompressedSize() : + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findFrameCompressedSize() : + * `src` should point to the start of a ZSTD frame or skippable frame. + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +/*====== Helper functions ======*/ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ + + +/*************************************** +* Explicit context +***************************************/ +/*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. + * Note 2 : In multi-threaded environments, + * use one different context per thread for parallel execution. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); + +/*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. + * Important : in order to behave similarly to `ZSTD_compress()`, + * this function compresses at requested compression level, + * __ignoring any other parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. 
+ */ +ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); + +/*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. + * Compatible with sticky parameters. + */ +ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced compression API +***************************************/ + +/* API design : + * Parameters are pushed one by one into an existing context, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! + * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supercedes all other "advanced" API entry points in the experimental section. + * In the future, we expect to remove from experimental API entry points which are redundant with this API. + */ + + +/* Compression strategies, listed from fastest to strongest */ +typedef enum { ZSTD_fast=1, + ZSTD_dfast=2, + ZSTD_greedy=3, + ZSTD_lazy=4, + ZSTD_lazy2=5, + ZSTD_btlazy2=6, + ZSTD_btopt=7, + ZSTD_btultra=8, + ZSTD_btultra2=9 + /* note : new strategies _might_ be added in the future. 
+ Only the order (from fast to strong) is guaranteed */ +} ZSTD_strategy; + + +typedef enum { + + /* compression parameters + * Note: When compressing with a ZSTD_CDict these parameters are superseded + * by the parameters used to construct the ZSTD_CDict. + * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */ + ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table. + * Note that exact compression parameters are dynamically determined, + * depending on both compression level and srcSize (when known). + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. + * Note 1 : it's possible to pass a negative compression level. + * Note 2 : setting a level does not automatically set all other compression parameters + * to default. Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ + /* Advanced compression parameters : + * It's possible to pin down compression parameters to some specific values. + * In which case, these values are no longer dynamically selected by the compressor */ + ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2. + * This will set a memory budget for streaming decompression, + * with larger values requiring more memory + * and typically compressing more. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT + * requires explicitly allowing such size at streaming decompression stage. */ + ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2. + * Resulting memory usage is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. 
+ * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2. + * Resulting memory usage is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. + * Larger tables result in better and slower compression. + * This parameter is useless for "fast" strategy. + * It's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. + * Special: value 0 means "use default chainLog". */ + ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. + * This parameter is useless for "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_c_minMatch=105, /* Minimum size of searched matches. + * Note that Zstandard can still find matches of smaller size, + * it just tweaks its search algorithm to look for this size and larger. + * Larger values increase compression and decompression speed, but decrease ratio. + * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. + * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_c_targetLength=106, /* Impact of this field depends on strategy. + * For strategies btopt, btultra & btultra2: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition. 
+ * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. + * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB + * except when expressly set to a different value. */ + ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. + * Special: value 0 means "automatically determine hashlog". */ + ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX. + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. + * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashRateLog". 
*/ + + /* frame parameters */ + ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression. + * This is automatically the case when using ZSTD_compress2(), + * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * They return an error otherwise. */ + ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. + * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() : + * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, + * while compression work is performed in parallel, within worker threads. + * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : + * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). + * More workers improve speed, but also increase memory usage. + * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */ + ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. + * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. 
+ * The minimum size is automatically and transparently enforced. */ + ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. + * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. + * It helps preserve compression ratio, while each job is compressed in parallel. + * This value is enforced only when nbWorkers >= 1. + * Larger values increase compression ratio, but decrease speed. + * Possible values range from 0 to 9 : + * - 0 means "default" : value will be determined by the library, depending on strategy + * - 1 means "no overlap" + * - 9 means "full overlap", using a full window size. + * Each intermediate rank increases/decreases load size by a factor 2 : + * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default + * default value varies between 6 and 9, depending on strategy */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_c_rsyncable + * ZSTD_c_format + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode + * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. + */ + ZSTD_c_experimentalParam1=500, + ZSTD_c_experimentalParam2=10, + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, + ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004 +} ZSTD_cParameter; + +typedef struct { + size_t error; + int lowerBound; + int upperBound; +} ZSTD_bounds; + +/*! 
ZSTD_cParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - lower and upper bounds, both inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam); + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is generally only possible during frame initialization (before starting compression). + * Exception : when using multi-threading mode (nbWorkers >= 1), + * the following parameters can be updated _during_ compression (within same frame): + * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. + * new parameters will be active for next job only (after a flush()). + * @return : an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtx_setPledgedSrcSize() : + * Total input data size to be compressed as a single frame. + * Value will be written in frame header, unless if explicitly forbidden using ZSTD_c_contentSizeFlag. + * This value will also be controlled at end of frame, and trigger an error if not respected. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame. + * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN. + * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame. + * Note 2 : pledgedSrcSize is only valid once, for the next frame. 
+ * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN. + * Note 3 : Whenever all input data is provided and consumed in a single round, + * for example with ZSTD_compress2(), + * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end), + * this value is automatically overridden by srcSize instead. + */ +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize); + +typedef enum { + ZSTD_reset_session_only = 1, + ZSTD_reset_parameters = 2, + ZSTD_reset_session_and_parameters = 3 +} ZSTD_ResetDirective; + +/*! ZSTD_CCtx_reset() : + * There are 2 different things that can be reset, independently or jointly : + * - The session : will stop compressing current frame, and make CCtx ready to start a new one. + * Useful after an error, or to interrupt any ongoing compression. + * Any internal data not yet flushed is cancelled. + * Compression parameters and dictionary remain unchanged. + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". + * This removes any reference to any dictionary too. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. + */ +ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + +/*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. 
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced decompression API +***************************************/ + +/* The advanced API pushes parameters one by one into an existing DCtx context. + * Parameters are sticky, and remain valid for all following frames + * using the same DCtx context. + * It's possible to reset parameters to default values using ZSTD_DCtx_reset(). + * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream(). + * Therefore, no new decompression function is necessary. + */ + +typedef enum { + + ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which + * the streaming API will refuse to allocate memory buffer + * in order to protect the host from unreasonable memory requirements. + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT). + * Special: value 0 means "use default maximum windowLog". */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_d_format + * ZSTD_d_stableOutBuffer + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001 + +} ZSTD_dParameter; + +/*! 
ZSTD_dParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - both lower and upper bounds, inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam); + +/*! ZSTD_DCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_dParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is only possible during frame initialization (before starting decompression). + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value); + +/*! ZSTD_DCtx_reset() : + * Return a DCtx to clean state. + * Session and parameters can be reset jointly or separately. + * Parameters can only be reset when no active frame is being decompressed. + * @return : 0, or an error code, which can be tested with ZSTD_isError() + */ +ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset); + + +/**************************** +* Streaming +****************************/ + +typedef struct ZSTD_inBuffer_s { + const void* src; /**< start of input buffer */ + size_t size; /**< size of input buffer */ + size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_inBuffer; + +typedef struct ZSTD_outBuffer_s { + void* dst; /**< start of output buffer */ + size_t size; /**< size of output buffer */ + size_t pos; /**< position where writing stopped. Will be updated. 
Necessarily 0 <= pos <= size */ +} ZSTD_outBuffer; + + + +/*-*********************************************************************** +* Streaming compression - HowTo +* +* A ZSTD_CStream object is required to track streaming operation. +* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. +* ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* +* For parallel execution, use one separate ZSTD_CStream per thread. +* +* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. +* +* Parameters are sticky : when starting a new compression on the same context, +* it will re-use the same sticky parameters as previous compression session. +* When in doubt, it's recommended to fully initialize the context before usage. +* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), +* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +* set more specific parameters, the pledged source size, or load a dictionary. +* +* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to +* consume input stream. The function will automatically update both `pos` +* fields within `input` and `output`. +* Note that the function may not consume the entire input, for example, because +* the output buffer is already full, in which case `input.pos < input.size`. +* The caller must check if input has been entirely consumed. +* If not, the caller must make some room to receive more compressed data, +* and then present again remaining input data. +* note: ZSTD_e_continue is guaranteed to make some forward progress when called, +* but doesn't guarantee maximal forward progress. This is especially relevant +* when compressing with multiple threads. 
The call won't block if it can +* consume some input, but if it can't it will wait for some, but not all, +* output to be flushed. +* @return : provides a minimum amount of data remaining to be flushed from internal buffers +* or an error code, which can be tested using ZSTD_isError(). +* +* At any moment, it's possible to flush whatever data might remain stuck within internal buffer, +* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated. +* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0). +* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the +* operation. +* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if internal buffers are entirely flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* The epilogue is required for decoders to consider a frame completed. +* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to +* start a new frame. +* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. 
+* @return : 0 if frame fully completed and fully flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* *******************************************************************/ + +typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */ + /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */ +/*===== ZSTD_CStream management functions =====*/ +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void); +ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); + +/*===== Streaming compression functions =====*/ +typedef enum { + ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */ + ZSTD_e_flush=1, /* flush any data provided so far, + * it creates (at least) one new block, that can be decoded immediately on reception; + * frame will continue: any future data can still reference previously compressed data, improving compression. + * note : multithreaded compression will block to flush as much output as possible. */ + ZSTD_e_end=2 /* flush any remaining data _and_ close current frame. + * note that frame is only closed after compressed data is fully flushed (return value == 0). + * After that point, any additional data starts a new frame. + * note : each frame is independent (does not reference any content from previous frame). + * note : multithreaded compression will block to flush as much output as possible. */ +} ZSTD_EndDirective; + +/*! ZSTD_compressStream2() : + * Behaves about the same as ZSTD_compressStream, with additional control on end directive. 
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode) + * - output->pos must be <= dstCapacity, input->pos must be <= srcSize + * - output->pos and input->pos will be updated. They are guaranteed to never exceed their respective limit. + * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller. + * - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, distributes jobs to internal worker threads, flushes whatever is available, + * and then immediately returns, just indicating that there is some data remaining to be flushed. + * The function nonetheless guarantees forward progress : it will return only after it reads or writes at least 1 byte. + * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking. + * - @return provides a minimum amount of data remaining to be flushed from internal buffers + * or an error code, which can be tested using ZSTD_isError(). + * if @return != 0, flush is not fully completed, there is still some data left within internal buffers. + * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers. + * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed. + * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0), + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. 
+ */ +ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/* These buffer sizes are softly recommended. + * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output. + * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(), + * reducing the amount of memory shuffling and buffering, resulting in minor performance savings. + * + * However, note that these recommendations are from the perspective of a C caller program. + * If the streaming interface is invoked from some other language, + * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo, + * a major performance rule is to reduce crossing such interface to an absolute minimum. + * It's not rare that more time ends up being spent in the interface than in compression itself. + * In such cases, prefer using large buffers, as large as practical, + * for both input and output, to reduce the nb of roundtrips. + */ +ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guaranteed to successfully flush at least one complete compressed block. */ + + +/* ***************************************************************************** + * The following is a legacy streaming API. + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. + * Advanced parameters and dictionary compression can only be used through the + * new API. + ******************************************************************************/ + +/*! 
+ * Equivalent to: + * + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + */ +ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); +/*! + * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue). + * NOTE: The return value is different. ZSTD_compressStream() returns a hint for + * the next read size (if non-zero and not an error). ZSTD_compressStream2() + * returns the minimum nb of bytes left to flush (if non-zero and not an error). + */ +ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input); +/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */ +ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); +/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */ +ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + + +/*-*************************************************************************** +* Streaming decompression - HowTo +* +* A ZSTD_DStream object is required to track streaming operations. +* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +* ZSTD_DStream objects can be re-used multiple times. +* +* Use ZSTD_initDStream() to start a new decompression operation. +* @return : recommended first input size +* Alternatively, use advanced API to set specific properties. +* +* Use ZSTD_decompressStream() repetitively to consume your input. +* The function will update both `pos` fields. +* If `input.pos < input.size`, some input has not been consumed. +* It's up to the caller to present again remaining data. +* The function tries to flush all data decoded immediately, respecting output buffer size. +* If `output.pos < output.size`, decoder has flushed everything it could. 
+* But if `output.pos == output.size`, there might be some data left within internal buffers. +* In that case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. +* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. +* @return : 0 when a frame is completely decoded and fully flushed, +* or an error code, which can be tested using ZSTD_isError(), +* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : +* the return value is a suggested next input size (just a hint for better latency) +* that will never request more than the remaining frame size. +* *******************************************************************************/ + +typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */ + /* For compatibility with versions <= v1.2.0, prefer differentiating them. */ +/*===== ZSTD_DStream management functions =====*/ +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void); +ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); + +/*===== Streaming decompression functions =====*/ + +/* This function is redundant with the advanced API and equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ +ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + +ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + +ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guaranteed to successfully flush at least one complete block in all circumstances. */ + + +/************************** +* Simple dictionary API +***************************/ +/*! ZSTD_compress_usingDict() : + * Compression at an explicit compression level using a Dictionary. 
+ * A dictionary can be any arbitrary data segment (also called a prefix), + * or a buffer with specified information (see dictBuilder/zdict.h). + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + int compressionLevel); + +/*! ZSTD_decompress_usingDict() : + * Decompression using a known Dictionary. + * Dictionary must be identical to the one used during compression. + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + + +/*********************************** + * Bulk processing dictionary API + **********************************/ +typedef struct ZSTD_CDict_s ZSTD_CDict; + +/*! ZSTD_createCDict() : + * When compressing multiple messages or blocks using the same dictionary, + * it's recommended to digest the dictionary only once, since it's a costly operation. + * ZSTD_createCDict() will create a state from digesting a dictionary. + * The resulting state can be used for future compression operations with very limited startup cost. + * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict. + * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content. 
+ * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer, + * in which case the only thing that it transports is the @compressionLevel. + * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively, + * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, + int compressionLevel); + +/*! ZSTD_freeCDict() : + * Function frees memory allocated by ZSTD_createCDict(). */ +ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. + * Note : compression level is _decided at dictionary creation time_, + * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict); + + +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/*! ZSTD_createDDict() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_freeDDict() : + * Function frees memory allocated with ZSTD_createDDict() */ +ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict); + +/*! ZSTD_decompress_usingDDict() : + * Decompression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_DDict* ddict); + + +/******************************** + * Dictionary helper functions + *******************************/ + +/*! 
ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * If @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); + +/*! ZSTD_getDictID_fromDDict() : + * Provides the dictID of the dictionary loaded into `ddict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + +/*! ZSTD_getDictID_fromFrame() : + * Provides the dictID required to decompress the frame stored within `src`. + * If @return == 0, the dictID could not be decoded. + * This could be for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). + * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. + * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + + +/******************************************************************************* + * Advanced dictionary and prefix API + * + * This API allows dictionaries to be used with ZSTD_compress2(), + * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and + * only reset when the context is reset with ZSTD_reset_parameters or + * ZSTD_reset_session_and_parameters. Prefixes are single-use. 
+ ******************************************************************************/ + + +/*! ZSTD_CCtx_loadDictionary() : + * Create an internal CDict from `dict` buffer. + * Decompression will have to use same dictionary. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. + * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, + * compression parameters can no longer be changed after loading a dictionary. + * Note 3 :`dict` content will be copied internally. + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() + * to precisely select how dictionary content must be interpreted. */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_refCDict() : + * Reference a prepared dictionary, to be used for all next compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. + * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode. + * The dictionary will remain valid for future compressed frames using same CCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Referencing a NULL CDict means "return to no-dictionary mode". 
+ * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */ +ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + +/*! ZSTD_CCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) for next compressed frame. + * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. + * Its content must remain unmodified during compression. + * Note 2 : If the intention is to diff some large src data blob with some prior version of itself, + * ensure that the window size is large enough to contain the entire source. + * See ZSTD_c_windowLog. + * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters. + * It's a CPU consuming operation, with non-negligible impact on latency. + * If there is a need to use the same prefix multiple times, consider loadDictionary instead. + * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent). + * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + +/*! ZSTD_DCtx_loadDictionary() : + * Create an internal DDict from dict buffer, + * to be used to decompress next frames. 
+ * The dictionary remains valid for all future frames, until explicitly invalidated. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Loading a dictionary involves building tables, + * which has a non-negligible impact on CPU usage and latency. + * It's recommended to "load once, use many times", to amortize the cost + * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading. + * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead. + * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of + * how dictionary content is loaded and interpreted. + */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); + +/*! ZSTD_DCtx_refDDict() : + * Reference a prepared dictionary, to be used to decompress next frames. + * The dictionary remains active for decompression of future frames using same DCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +/*! ZSTD_DCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) to decompress next frame. + * This is the reverse operation of ZSTD_CCtx_refPrefix(), + * and must use the same prefix as the one used during compression. + * Prefix is **only used once**. Reference is discarded at end of frame. + * End of frame is reached when ZSTD_decompressStream() returns 0. 
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary + * Note 2 : Prefix buffer is referenced. It **must** outlive decompression. + * Prefix buffer must remain unmodified up to the end of frame, + * reached when ZSTD_decompressStream() returns 0. + * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent). + * Use ZSTD_DCtx_refPrefix_advanced() to alter dictMode (Experimental section) + * Note 4 : Referencing a raw content prefix has almost no CPU or memory cost. + * A full dictionary is more costly, as it requires building tables. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, + const void* prefix, size_t prefixSize); + +/* === Memory management === */ + +/*! ZSTD_sizeof_*() : + * These functions give the _current_ memory usage of selected object. + * Note that object memory usage can evolve (increase or decrease) over time. */ +ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); +ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + +#endif /* ZSTD_H_235446 */ + + +/* ************************************************************************************** + * ADVANCED AND EXPERIMENTAL FUNCTIONS + **************************************************************************************** + * The definitions in the following section are considered experimental. + * They are provided for advanced scenarios. + * They should never be used with a dynamic library, as prototypes may change in the future. + * Use them only in association with static linking. 
+ * ***************************************************************************************/ + +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + +/**************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** + * The following symbols and constants + * are not planned to join "stable API" status in the near future. + * They can still change in future versions. + * Some of them are planned to remain in the static_only section indefinitely. + * Some of them might be removed in the future (especially when redundant with existing stable functions) + * ***************************************************************************************/ + +#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */ +#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2) +#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */ +#define ZSTD_SKIPPABLEHEADERSIZE 8 + +/* compression parameter bounds */ +#define ZSTD_WINDOWLOG_MAX_32 30 +#define ZSTD_WINDOWLOG_MAX_64 31 +#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30) +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX_32 29 +#define ZSTD_CHAINLOG_MAX_64 30 +#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? 
ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64)) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */ +#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX +#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ +#define ZSTD_STRATEGY_MIN ZSTD_fast +#define ZSTD_STRATEGY_MAX ZSTD_btultra2 + + +#define ZSTD_OVERLAPLOG_MIN 0 +#define ZSTD_OVERLAPLOG_MAX 9 + +#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame + * requiring larger than (1< 3, then this is seqDef.offset - 3 + * If seqDef.offset < 3, then this is the corresponding repeat offset + * But if seqDef.offset < 3 and litLength == 0, this is the + * repeat offset before the corresponding repeat offset + * And if seqDef.offset == 3 and litLength == 0, this is the + * most recent repeat offset - 1 + */ + unsigned int offset; + unsigned int litLength; /* Literal length */ + unsigned int matchLength; /* Match length */ + /* 0 when seq not rep and seqDef.offset otherwise + * when litLength == 0 this will be <= 4, otherwise <= 3 like normal + */ + unsigned int rep; +} ZSTD_Sequence; + +typedef struct { + unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */ + unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */ + unsigned hashLog; /**< dispatch table : larger == faster, more memory */ + unsigned searchLog; /**< nb of searches : larger == more compression, slower */ + unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */ + unsigned targetLength; /**< acceptable match size for optimal 
parser (only) : larger == more compression, slower */ + ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */ +} ZSTD_compressionParameters; + +typedef struct { + int contentSizeFlag; /**< 1: content size will be in frame header (when known) */ + int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */ + int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */ +} ZSTD_frameParameters; + +typedef struct { + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; +} ZSTD_parameters; + +typedef enum { + ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */ + ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */ + ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */ +} ZSTD_dictContentType_e; + +typedef enum { + ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */ + ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. */ +} ZSTD_dictLoadMethod_e; + +typedef enum { + ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */ + ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number. + * Useful to save 4 bytes per generated frame. + * Decoder cannot recognise automatically this format, requiring this instruction. */ +} ZSTD_format_e; + +typedef enum { + /* Note: this enum and the behavior it controls are effectively internal + * implementation details of the compressor. They are expected to continue + * to evolve and should be considered only in the context of extremely + * advanced performance tuning. 
+ * + * Zstd currently supports the use of a CDict in three ways: + * + * - The contents of the CDict can be copied into the working context. This + * means that the compression can search both the dictionary and input + * while operating on a single set of internal tables. This makes + * the compression faster per-byte of input. However, the initial copy of + * the CDict's tables incurs a fixed cost at the beginning of the + * compression. For small compressions (< 8 KB), that copy can dominate + * the cost of the compression. + * + * - The CDict's tables can be used in-place. In this model, compression is + * slower per input byte, because the compressor has to search two sets of + * tables. However, this model incurs no start-up cost (as long as the + * working context's tables can be reused). For small inputs, this can be + * faster than copying the CDict's tables. + * + * - The CDict's tables are not used at all, and instead we use the working + * context alone to reload the dictionary and use params based on the source + * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict(). + * This method is effective when the dictionary sizes are very small relative + * to the input size, and the input size is fairly large to begin with. + * + * Zstd has a simple internal heuristic that selects which strategy to use + * at the beginning of a compression. However, if experimentation shows that + * Zstd is making poor choices, it is possible to override that choice with + * this enum. + */ + ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */ + ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */ + ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */ + ZSTD_dictForceLoad = 3 /* Always reload the dictionary */ +} ZSTD_dictAttachPref_e; + +typedef enum { + ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level. 
+ * Negative compression levels will be uncompressed, and positive compression + * levels will be compressed. */ + ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be + * emitted if Huffman compression is not profitable. */ + ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */ +} ZSTD_literalCompressionMode_e; + + +/*************************************** +* Frame size functions +***************************************/ + +/*! ZSTD_findDecompressedSize() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - decompressed size of all data in all successive frames + * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * note 2 : decompressed size is always present when compression is done with ZSTD_compress() + * note 3 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure result fits within application's authorized limits. + * Each application can set its own limits. + * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to + * read each contained frame header. This is fast as most of the data is skipped, + * however it does mean that all frame data must be present and valid. 
*/ +ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_decompressBound() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - upper-bound for the decompressed size of all data in all successive frames + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. + * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`. + * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value. + * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by: + * upper-bound = # blocks * min(128 KB, Window_Size) + */ +ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + +/*! ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ +ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + +/*! ZSTD_getSequences() : + * Extract sequences from the sequence store + * zc can be used to insert custom compression params. + * This function invokes ZSTD_compress2 + * @return : number of sequences extracted + */ +ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize); + + +/*************************************** +* Memory management +***************************************/ + +/*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation.
+ * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough + * for any compression level up to selected one. + * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate + * does not include space for a window buffer. + * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * + * When srcSize can be bound by a known and rather "small" value, + * this fact can be used to provide a tighter estimation + * because the CCtx compression context will need less memory. + * This tighter estimation can be provided by more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * + * Note 2 : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); + +/*! ZSTD_estimateCStreamSize() : + * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. + * It will also consider src size to be arbitrarily "large", which is worst case. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. 
+ * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. + * ZSTD_DStream memory budget depends on window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, whose additional size is not estimated here. + * In this case, get total size by adding ZSTD_estimate?DictSize */ +ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + +/*! ZSTD_estimate?DictSize() : + * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). + * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). + * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. + */ +ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); + +/*! ZSTD_initStatic*() : + * Initialize an object using a pre-allocated fixed-size buffer.
+ * workspace: The memory area to emplace the object into. + * Provided pointer *must be 8-bytes aligned*. + * Buffer must outlive object. + * workspaceSize: Use ZSTD_estimate*Size() to determine + * how large workspace must be to support target scenario. + * @return : pointer to object (same address as workspace, just different type), + * or NULL if error (size too small, incorrect alignment, etc.) + * Note : zstd will never resize nor malloc() when using a static buffer. + * If the object requires more memory than available, + * zstd will just error out (typically ZSTD_error_memory_allocation). + * Note 2 : there is no corresponding "free" function. + * Since workspace is allocated externally, it must be freed externally too. + * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level + * into its associated cParams. + * Limitation 1 : currently not compatible with internal dictionary creation, triggered by + * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict(). + * Limitation 2 : static cctx currently not compatible with multi-threading. + * Limitation 3 : static dctx is incompatible with legacy support. 
+ */ +ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */ + +ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */ + +ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams); + +ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType); + + +/*! Custom memory allocation : + * These prototypes make it possible to pass your own allocation/free functions. + * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below. + * All allocation/free operations will be completed using these custom variants instead of regular ones. 
+ */ +typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size); +typedef void (*ZSTD_freeFunction) (void* opaque, void* address); +typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; +static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ + +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + + + +/*************************************** +* Advanced compression functions +***************************************/ + +/*! ZSTD_createCDict_byReference() : + * Create a digested dictionary for compression + * Dictionary content is just referenced, not duplicated. + * As a consequence, `dictBuffer` **must** outlive CDict, + * and its content must remain unmodified throughout the lifetime of CDict. + * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. 
+ * `estimatedSrcSize` value is optional, select 0 if not known */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_getParams() : + * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. + * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ +ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_checkCParams() : + * Ensure param values remain within authorized range. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + +/*! ZSTD_adjustCParams() : + * optimize params for a given `srcSize` and `dictSize`. + * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN. + * `dictSize` must be `0` when there is no dictionary. + * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. + * This function never fails (wide contract) */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + +/*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ +ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); + +/*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now REDUNDANT. 
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning in some future version */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams); + + +/*! ZSTD_CCtx_loadDictionary_byReference() : + * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. + * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_loadDictionary_advanced() : + * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_CCtx_refPrefix_advanced() : + * Same as ZSTD_CCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/* === experimental parameters === */ +/* these parameters can be used with ZSTD_CCtx_setParameter() + * they are not guaranteed to remain supported in the future */ + + /* Enables rsyncable mode, + * which makes compressed files more rsync friendly + * by adding periodic synchronization points to the compressed data. + * The target average block size is ZSTD_c_jobSize / 2.
+ * It's possible to modify the job size to increase or decrease + * the granularity of the synchronization point. + * Once the jobSize is smaller than the window size, + * it will result in compression ratio degradation. + * NOTE 1: rsyncable mode only works when multithreading is enabled. + * NOTE 2: rsyncable performs poorly in combination with long range mode, + * since it will decrease the effectiveness of synchronization points, + * though mileage may vary. + * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s. + * If the selected compression level is already running significantly slower, + * the overall speed won't be significantly impacted. + */ + #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1 + +/* Select a compression format. + * The value must be of type ZSTD_format_e. + * See ZSTD_format_e enum definition for details */ +#define ZSTD_c_format ZSTD_c_experimentalParam2 + +/* Force back-reference distances to remain < windowSize, + * even when referencing into Dictionary content (default:0) */ +#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3 + +/* Controls whether the contents of a CDict + * are used in place, or copied into the working context. + * Accepts values from the ZSTD_dictAttachPref_e enum. + * See the comments on that enum for an explanation of the feature. */ +#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +/* Controls how the literals are compressed (default is auto). + * The value must be of type ZSTD_literalCompressionMode_e. + * See ZSTD_literalCompressionMode_e enum definition for details. + */ +#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +/* Tries to fit compressed block size to be around targetCBlockSize. + * No target when targetCBlockSize == 0. + * There is no guarantee on compressed block size (default:0) */ +#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 + +/* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size, + * but compression ratio may regress significantly if guess considerably underestimates */ +#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 + +/*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. + * @return : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); + + +/*! ZSTD_CCtx_params : + * Quick howto : + * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure + * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into + * an existing ZSTD_CCtx_params structure. + * This is similar to + * ZSTD_CCtx_setParameter(). + * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to + * an existing CCtx. + * These parameters will be applied to + * all subsequent frames. + * - ZSTD_compressStream2() : Do compression using the CCtx. + * - ZSTD_freeCCtxParams() : Free the memory. + * + * This can be used with ZSTD_estimateCCtxSize_usingCCtxParams() + * for static allocation of CCtx for single-threaded compression. + */ +ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_reset() : + * Reset params to default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_init() : + * Initializes the compression parameters of cctxParams according to + * compression level. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); + +/*! ZSTD_CCtxParams_init_advanced() : + * Initializes the compression and frame parameters of cctxParams according to + * params. All other parameters are reset to their default values.
+ */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); + +/*! ZSTD_CCtxParams_setParameter() : + * Similar to ZSTD_CCtx_setParameter. + * Set one compression parameter, selected by enum ZSTD_cParameter. + * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams(). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtxParams_getParameter() : + * Similar to ZSTD_CCtx_getParameter. + * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); + +/*! ZSTD_CCtx_setParametersUsingCCtxParams() : + * Apply a set of ZSTD_CCtx_params to the compression context. + * This can be done even after compression is started, + * if nbWorkers==0, this will have no impact until a new compression is started. + * if nbWorkers>=1, new parameters will be picked up at next job, + * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); + +/*! ZSTD_compressStream2_simpleArgs() : + * Same as ZSTD_compressStream2(), + * but using only integral types as arguments. + * This variant might be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. 
+ */ +ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp); + + +/*************************************** +* Advanced decompression functions +***************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); + +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * Dictionary content is referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives DDict, + * it must remain read accessible throughout the lifetime of DDict */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_DCtx_loadDictionary_byReference() : + * Same as ZSTD_DCtx_loadDictionary(), + * but references `dict` content instead of copying it into `dctx`. + * This saves memory if `dict` remains around. + * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); + +/*! ZSTD_DCtx_loadDictionary_advanced() : + * Same as ZSTD_DCtx_loadDictionary(), + * but gives direct control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?).
*/ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_DCtx_refPrefix_advanced() : + * Same as ZSTD_DCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_DCtx_setMaxWindowSize() : + * Refuses allocating internal buffers for frames requiring a window size larger than provided limit. + * This protects a decoder context from reserving too much memory for itself (potential attack scenario). + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); + +/* ZSTD_d_format + * experimental parameter, + * allowing selection between ZSTD_format_e input compression formats + */ +#define ZSTD_d_format ZSTD_d_experimentalParam1 +/* ZSTD_d_stableOutBuffer + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * + * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same + * between calls, except for the modifications that zstd makes to pos (the + * caller must not modify pos). This is checked by the decompressor, and + * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer + * MUST be large enough to fit the entire decompressed frame. This will be + * checked when the frame content size is known. 
The data in the ZSTD_outBuffer + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * + * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. + * If you need to avoid the input buffer allocation use the buffer-less + * streaming API. + * + * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds + * memory. However, decompression WILL fail if you violate the preconditions. + * + * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST + * not be modified during decompression or you will get data corruption. This + * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate + * matches. Normally zstd maintains its own buffer for this purpose, but passing + * this flag tells zstd to use the user provided buffer. + */ +#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2 + +/*! ZSTD_DCtx_setFormat() : + * Instruct the decoder context about what kind of data to decode next. + * This instruction is mandatory to decode data without a fully-formed header, + * such as ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + +/*! ZSTD_decompressStream_simpleArgs() : + * Same as ZSTD_decompressStream(), + * but using only integral types as arguments. + * This can be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers.
+ */ +ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos); + + +/******************************************************************** +* Advanced streaming functions +* Warning : most of these functions are now redundant with the Advanced API. +* Once Advanced API reaches "stable" status, +* redundant functions will be deprecated, and then at some point removed. +********************************************************************/ + +/*===== Advanced Streaming compression functions =====*/ +/**! ZSTD_initCStream_srcSize() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * pledgedSrcSize must be correct. If it is not known at init time, use + * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, + * "0" also disables frame content size field. It may be enabled in the future. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); + +/**! ZSTD_initCStream_usingDict() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * Creates an internal CDict (incompatible with static CCtx), except if + * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if + * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + +/**! ZSTD_initCStream_advanced() : + * This function is deprecated, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd parameter and leave the rest as-is. + * for ((param, value) : params) { + * ZSTD_CCtx_setParameter(zcs, param, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. + * pledgedSrcSize must be correct. + * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, + unsigned long long pledgedSrcSize); + +/**! ZSTD_initCStream_usingCDict() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * note : cdict will just be referenced, and must outlive compression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + +/**! ZSTD_initCStream_usingCDict_advanced() : + * This function is DEPRECATED, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. 
+ * for ((fParam, value) : fParams) { + * ZSTD_CCtx_setParameter(zcs, fParam, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. + * pledgedSrcSize must be correct. If srcSize is not known at init time, use + * value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize); + +/*! ZSTD_resetCStream() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * start a new frame, using same parameters from previous frame. + * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. + * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, + * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. 
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +typedef struct { + unsigned long long ingested; /* nb input bytes read and buffered */ + unsigned long long consumed; /* nb input bytes actually compressed */ + unsigned long long produced; /* nb of compressed bytes generated and buffered */ + unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */ + unsigned currentJobID; /* MT only : latest started job nb */ + unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */ +} ZSTD_frameProgression; + +/* ZSTD_getFrameProgression() : + * tells how much data has been ingested (read from input) + * consumed (input actually compressed) and produced (output) for current frame. + * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. + * Aggregates progression inside active worker threads. + */ +ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); + +/*! ZSTD_toFlushNow() : + * Tell how many bytes are ready to be flushed immediately. + * Useful for multithreading scenarios (nbWorkers >= 1). + * Probe the oldest active job, defined as oldest job not yet entirely flushed, + * and check its output buffer. + * @return : amount of data stored in oldest job and ready to be flushed immediately. + * if @return == 0, it means either : + * + there is no active job (could be checked with ZSTD_getFrameProgression()), or + * + oldest job is still actively compressing data, + * but everything it has produced has also been flushed so far, + * therefore flush speed is limited by production speed of oldest job + * irrespective of the speed of concurrent (and newer) jobs. 
+ */ +ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + + +/*===== Advanced Streaming decompression functions =====*/ +/** + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + +/** + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + +/** + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * + * re-use decompression parameters from previous init; saves dictionary loading + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + +/********************************************************************* +* Buffer-less and synchronous inner streaming functions +* +* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +* But it's also a complex one, with several restrictions, documented below. +* Prefer normal streaming API for an easier experience. +********************************************************************* */ + +/** + Buffer-less streaming compression (synchronous mode) + + A ZSTD_CCtx object is required to track streaming operations. 
+ Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. + ZSTD_CCtx object can be re-used multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, + or ZSTD_compressBegin_advanced(), for finer parameter control. + It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : + - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only. + - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks. + - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario. + Worst case evaluation is provided by ZSTD_compressBound(). + ZSTD_compressContinue() doesn't guarantee recovery after a failed compression. + - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog). + It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks) + - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps. + In which case, it will "discard" the relevant memory section from its history. + + Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum. + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + + `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. 
+*/ + +/*===== Buffer-less streaming compression functions =====*/ +ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*- + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. + A ZSTD_DCtx object can be re-used multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. 
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. + @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. + For example, do not allocate memory blindly, check that `windowSize` is within expectation. + Each application can set its own limits, depending on local restrictions. + For extended interoperability, it is recommended to support `windowSize` of at least 8 MB. + + ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes. + ZSTD_decompressContinue() is very sensitive to contiguity, + if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, + or that previous contiguous segment is large enough to properly handle maximum back-reference distance. + There are multiple ways to guarantee this condition. + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), + which can @return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. 
+ At which point, decoding can resume from the beginning of the buffer. + Note that already decoded data stored in the buffer should be flushed before being overwritten. + + There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory. + + Finally, if you control the compression process, you can also ignore all buffer size rules, + as long as the encoder and decoder progress in "lock-step", + aka use exactly the same buffer sizes, break contiguity at the same place, etc. + + Once buffers are set up, start decompression, with ZSTD_decompressBegin(). + If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict(). + + Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + + @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + + A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + Context can then be reset to start a new decompression. + + Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType(). + This information is not required to properly decode a frame. + + == Special case : skippable frames == + + Skippable frames allow integration of user-defined data into a flow of concatenated frames. + Skippable frames will be ignored (skipped) by decompressor. 
+ The format of skippable frames is as follows : + a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F + b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits + c) Frame Content - any content (User Data) of length equal to Frame Size + For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame. + For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content. +*/ + +/*===== Buffer-less streaming decompression functions =====*/ +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTD_frameHeader; + +/*! ZSTD_getFrameHeader() : + * decode Frame Header, or requires larger `srcSize`. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +/*! 
ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; +ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + + +/* ============================ */ +/** Block level API */ +/* ============================ */ + +/*! + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. 
+ + A few rules to respect : + - Compressing and decompressing require a context structure + + Use ZSTD_createCCtx() and ZSTD_createDCtx() + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block. + - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) ! + ===> In which case, nothing is produced into `dst` ! + + User __must__ test for such outcome and deal directly with uncompressed data + + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0. + Doing so would mess up with statistics history, leading to potential data corruption. + + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !! + + In case of multiple successive blocks, should some of them be uncompressed, + decoder must be informed of their existence in order to follow proper history. + Use ZSTD_insertBlock() for such a case. +*/ + +/*===== Raw zstd block functions =====*/ +ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. 
Useful for multi-blocks decompression. */ + + +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif +/**** ended inlining ../zstd.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: huf.h ****/ +#ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ +#endif +/**** start inlining xxhash.h ****/ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MurmurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +A 64-bits version, named XXH64, is available since r35. +It offers much better speed, but for 64-bits applications only. 
+Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + + +/* **************************** +* Definitions +******************************/ +#include /* size_t */ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + +/* **************************** +* API modifier +******************************/ +/** XXH_PRIVATE_API +* This is useful if you want to include xxhash functions in `static` mode +* in order to inline them, and remove their symbol from the public list. +* Methodology : +* #define XXH_PRIVATE_API +* #include "xxhash.h" +* `xxhash.c` is automatically included. +* It's not useful to compile and link it as a separate module anymore. +*/ +#ifdef XXH_PRIVATE_API +# ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else +# define XXH_PUBLIC_API static /* this version may generate warnings for unused static functions; disable the relevant warning */ +# endif +#else +# define XXH_PUBLIC_API /* do nothing */ +#endif /* XXH_PRIVATE_API */ + +/*!XXH_NAMESPACE, aka Namespace Emulation : + +If you want to include _and expose_ xxHash functions from within your own library, +but also want to avoid symbol collisions with another library which also includes xxHash, + +you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library +with the value of XXH_NAMESPACE (so avoid to keep it NULL and avoid numeric values). 
+ +Note that no change is required within the calling program as long as it includes `xxhash.h` : +regular symbol name will be automatically translated by this header. +*/ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_RELEASE 2 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/* **************************** +* Simple Hash Functions +******************************/ 
+typedef unsigned int XXH32_hash_t; +typedef unsigned long long XXH64_hash_t; + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed); +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); + +/*! +XXH32() : + Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s +XXH64() : + Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark). +*/ + + +/* **************************** +* Streaming Hash Functions +******************************/ +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! 
State allocation, compatible with dynamic libraries */ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + + +/* hash streaming */ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/* +These functions generate the xxHash of an input provided in multiple segments. +Note that, for small input, they are slower than single-call functions, due to state management. +For small input, prefer `XXH32()` and `XXH64()` . + +XXH state must first be allocated, using XXH*_createState() . + +Start a new hash by initializing state with a seed, using XXH*_reset(). + +Then, feed the hash state by calling XXH*_update() as many times as necessary. +Obviously, input must be allocated and read accessible. +The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + +Finally, a hash value can be produced anytime, by using XXH*_digest(). +This function returns the nn-bits hash as an int or long long. + +It's still possible to continue inserting input into the hash state after a digest, +and generate some new hashes later on, by calling again XXH*_digest(). + +When done, free XXH state space if it was allocated dynamically. 
+*/ + + +/* ************************** +* Utils +****************************/ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* ! C99 */ +# define restrict /* disable restrict */ +#endif + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state); + + +/* ************************** +* Canonical representation +****************************/ +/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. +* The canonical representation uses human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. +*/ +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +#endif /* XXHASH_H_5627135585666179 */ + + + +/* ================================================================================================ + This section contains definitions which are not guaranteed to remain stable. + They may change in future versions, becoming incompatible with a different version of the library. + They shall only be used with static linking. + Never use these definitions in association with dynamic linking ! 
+=================================================================================================== */ +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345) +#define XXH_STATIC_H_3543687687345 + +/* These definitions are only meant to allow allocation of XXH state + statically, on stack, or in a struct for example. + Do not use members directly. */ + + struct XXH32_state_s { + unsigned total_len_32; + unsigned large_len; + unsigned v1; + unsigned v2; + unsigned v3; + unsigned v4; + unsigned mem32[4]; /* buffer defined as U32 for alignment */ + unsigned memsize; + unsigned reserved; /* never read nor write, will be removed in a future version */ + }; /* typedef'd to XXH32_state_t */ + + struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; /* buffer defined as U64 for alignment */ + unsigned memsize; + unsigned reserved[2]; /* never read nor write, will be removed in a future version */ + }; /* typedef'd to XXH64_state_t */ + + +# ifdef XXH_PRIVATE_API +/**** start inlining xxhash.c ****/ +/* + * xxHash - Fast Hash algorithm + * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash homepage: http://www.xxhash.com + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +*/ + + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. 
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The switch below allows selecting a different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method doesn't depend on the compiler but violates the C standard. + * It can generate buggy code on targets which do not support unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \ + defined(__ICCARM__) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : + * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. + * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. + * By default, this option is disabled. 
To enable it, uncomment below define : + */ +/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */ + +/*!XXH_FORCE_NATIVE_FORMAT : + * By default, xxHash library provides endian-independent Hash values, based on little-endian convention. + * Results are therefore identical for little-endian and big-endian CPU. + * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. + * Should endian-independence be of no importance for your application, you may set the #define below to 1, + * to improve speed for Big-endian CPU. + * This option has no impact on Little_Endian CPU. + */ +#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +#endif + +/*!XXH_FORCE_ALIGN_CHECK : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : check for aligned/unaligned input. + * The check costs one initial branch per hash; set to 0 when the input data + * is guaranteed to be aligned. 
+ */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/* Modify the local functions below should you wish to use some other memory routines */ +/* for malloc(), free() */ +#include <stdlib.h> +#include <stddef.h> /* size_t */ +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free (void* p) { free(p); } +/* for memcpy() */ +#include <string.h> +static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } + +#ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +#endif +/**** skipping file: xxhash.h ****/ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# define INLINE_KEYWORD inline +#else +# define INLINE_KEYWORD +#endif + +#if defined(__GNUC__) || defined(__ICCARM__) +# define FORCE_INLINE_ATTR __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define FORCE_INLINE_ATTR __forceinline +#else +# define FORCE_INLINE_ATTR +#endif + +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR + + +#ifdef _MSC_VER +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + + +/* ************************************* +* Basic Types +***************************************/ +#ifndef MEM_MODULE +# define MEM_MODULE +# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> /* fixed-width integer types used by the typedefs below */ + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + 
typedef uint64_t U64; +# else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */ +# endif +#endif + + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } +static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign; + +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+ * see : http://stackoverflow.com/a/32095106/646947 + */ + +static U32 XXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static U64 XXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_MEMORY_ACCESS */ + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +#if defined(__ICCARM__) +# include <intrinsics.h> /* IAR: provides __ROR() */ +# define XXH_rotl32(x,r) __ROR(x,(32 - r)) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif +# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +# define XXH_swap64 _byteswap_uint64 +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +# define XXH_swap64 __builtin_bswap64 +#else +static U32 XXH_swap32 (U32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +static U64 XXH_swap64 (U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef 
XXH_CPU_LITTLE_ENDIAN + static const int g_one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&g_one)) +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); + else + return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); +} + +FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +static U32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} + +FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); + else + return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ + + +/* ************************************* +* Constants +***************************************/ +static const U32 PRIME32_1 = 2654435761U; +static const U32 PRIME32_2 = 2246822519U; +static const U32 PRIME32_3 = 3266489917U; +static const U32 PRIME32_4 = 668265263U; +static const U32 PRIME32_5 = 374761393U; + +static const U64 PRIME64_1 = 11400714785074694791ULL; +static const U64 PRIME64_2 = 14029467366897019727ULL; +static const U64 PRIME64_3 = 1609587929392839161ULL; +static const U64 PRIME64_4 = 9650029242287828579ULL; +static const U64 PRIME64_5 = 2870177450012600261ULL; + +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ************************** +* Utils +****************************/ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + + +/* *************************** +* Simple Hash Functions +*****************************/ + +static U32 XXH32_round(U32 seed, U32 input) +{ + seed += input * PRIME32_2; + seed = XXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)16; + } +#endif + + if (len>=16) 
{ + const BYTE* const limit = bEnd - 16; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4; + v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4; + v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4; + v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4; + } while (p<=limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (U32) len; + + while (p+4<=bEnd) { + h32 += XXH_get32bits(p) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + p+=4; + } + + /* fold in the remaining 0-3 tail bytes one at a time */ + while (p<bEnd) { + h32 += (*p) * PRIME32_5; + h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; + p++; + } + + /* final avalanche */ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_CREATESTATE_STATIC(state); + XXH32_reset(state, seed); + XXH32_update(state, input, len); + return XXH32_digest(state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + +static U64 XXH64_round(U64 acc, U64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) +{ + val = 
XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; /* not const: reassigned below when XXH_ACCEPT_NULL_INPUT_POINTER is defined */ + U64 h64; +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)32; + } +#endif + + if (len>=32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8; + v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8; + v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8; + v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8; + } while (p<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64) len; + + while (p+8<=bEnd) { + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) { + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + /* fold in the remaining 0-3 tail bytes one at a time */ + while (p<bEnd) { + h64 ^= (*p) * PRIME64_5; + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + p++; + } + + /* final avalanche */ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_CREATESTATE_STATIC(state); + XXH64_reset(state, seed); + XXH64_update(state, input, len); + return XXH64_digest(state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if 
(XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + +/* ************************************************** +* Advanced Hash Functions +****************************************************/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + + +/*** Hash feed ***/ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)-4); /* do not write into reserved, for future removal */ + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)-8); /* do not write into reserved, for 
future removal */ + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len_32 += (unsigned)len; + state->large_len |= (len>=16) | (state->total_len_32>=16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* 
state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) +{ + const BYTE * p = (const BYTE*)state->mem32; + const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize; + U32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + while (p+4<=bEnd) { + h32 += XXH_readLE32(p, endian) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4; + p+=4; + } + + /* fold in the remaining 0-3 buffered tail bytes one at a time */ + while (p<bEnd) { + h32 += (*p) * PRIME32_5; + h32 = XXH_rotl32(h32, 11) * PRIME32_1; + p++; + } + + /* final avalanche */ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + + + +/* **** XXH64 **** */ + +FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + if (input != NULL) { + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + } + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer 
is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) +{ + const BYTE * p = (const BYTE*)state->mem64; + const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize; + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); 
+ h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 + PRIME64_5; + } + + h64 += (U64) state->total_len; + + while (p+8<=bEnd) { + U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian)); + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) { + h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + /* fold in the remaining 0-3 buffered tail bytes one at a time */ + while (p<bEnd) { + h64 ^= (*p) * PRIME64_5; + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + p++; + } + + /* final avalanche */ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + + +/* ************************** +* Canonical representation +****************************/ + +/*! Default XXH result types are basic unsigned 32 and 64 bits. +* The canonical representation follows human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs. 
+*/ + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} +/**** ended inlining xxhash.c ****/ +# endif + +#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */ + + +#if defined (__cplusplus) +} +#endif +/**** ended inlining xxhash.h ****/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ---- static assert (debug) --- */ +#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) +#define ZSTD_isError ERR_isError /* for inlining */ +#define FSE_isError ERR_isError +#define HUF_isError ERR_isError + + +/*-************************************* +* shared macros +***************************************/ +#undef MIN +#undef MAX +#define MIN(a,b) ((a)<(b) ? (a) : (b)) +#define MAX(a,b) ((a)>(b) ? (a) : (b)) + +/** + * Ignore: this is an internal helper. + * + * This is a helper function to help force C99-correctness during compilation. + * Under strict compilation modes, variadic macro arguments can't be empty. + * However, variadic function arguments can be. Using a function therefore lets + * us statically check that at least one (string) argument was passed, + * independent of the compilation flags. + */ +static INLINE_KEYWORD UNUSED_ATTR +void _force_has_format_string(const char *format, ...) { + (void)format; +} + +/** + * Ignore: this is an internal helper. 
+ * + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +#define _FORCE_HAS_FORMAT_STRING(...) \ + if (0) { \ + _force_has_format_string(__VA_ARGS__); \ + } + +/** + * Return the specified error if the condition evaluates to true. + * + * In debug modes, prints additional information. + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +#define RETURN_ERROR_IF(cond, err, ...) \ + if (cond) { \ + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ + __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } + +/** + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +#define RETURN_ERROR(err, ...) \ + do { \ + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ + __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } while(0); + +/** + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +#define FORWARD_IF_ERROR(err, ...) 
\ + do { \ + size_t const err_code = (err); \ + if (ERR_isError(err_code)) { \ + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ + __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return err_code; \ + } \ + } while(0); + + +/*-************************************* +* Common constants +***************************************/ +#define ZSTD_OPT_NUM (1<<12) + +#define ZSTD_REP_NUM 3 /* number of repcodes */ +#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 }; + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define BIT7 128 +#define BIT6 64 +#define BIT5 32 +#define BIT4 16 +#define BIT1 2 +#define BIT0 1 + +#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10 +static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 }; +static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 }; + +#define ZSTD_FRAMEIDSIZE 4 /* magic number size */ + +#define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */ +static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; +typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + +#define ZSTD_FRAMECHECKSUMSIZE 4 + +#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ + +#define HufLog 12 +typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + +#define LONGNBSEQ 0x7F00 + +#define MINMATCH 3 + +#define Litbits 8 +#define MaxLit ((1<= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN)); + + if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { + /* Handle short offset copies. 
*/ + do { + COPY8(op, ip) + } while (op < oend); + } else { + assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); + /* Separate out the first COPY16() call because the copy length is + * almost certain to be short, so the branches have different + * probabilities. Since it is almost certain to be short, only do + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +#ifndef __aarch64__ + do { + COPY16(op, ip); + } + while (op < oend); +#else + COPY16(op, ip); + if (op >= oend) return; + do { + COPY16(op, ip); + COPY16(op, ip); + } + while (op < oend); +#endif + } +} + +MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + size_t const length = MIN(dstCapacity, srcSize); + if (length > 0) { + memcpy(dst, src, length); + } + return length; +} + +/* define "workspace is too large" as this number of times larger than needed */ +#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 + +/* when workspace is continuously too large + * during at least this number of times, + * context's memory usage is considered wasteful, + * because it's sized to handle a worst case scenario which rarely happens. 
+ * In which case, resize it down to free some memory */ +#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128 + + +/*-******************************************* +* Private declarations +*********************************************/ +typedef struct seqDef_s { + U32 offset; + U16 litLength; + U16 matchLength; +} seqDef; + +typedef struct { + seqDef* sequencesStart; + seqDef* sequences; + BYTE* litStart; + BYTE* lit; + BYTE* llCode; + BYTE* mlCode; + BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */ + U32 longLengthPos; +} seqStore_t; + +typedef struct { + U32 litLength; + U32 matchLength; +} ZSTD_sequenceLength; + +/** + * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences + * indicated by longLengthPos and longLengthID, and adds MINMATCH back to matchLength. + */ +MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +{ + ZSTD_sequenceLength seqLen; + seqLen.litLength = seq->litLength; + seqLen.matchLength = seq->matchLength + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthID == 1) { + seqLen.litLength += 0xFFFF; + } + if (seqStore->longLengthID == 2) { + seqLen.matchLength += 0xFFFF; + } + } + return seqLen; +} + +/** + * Contains the compressed frame size and an upper-bound for the decompressed frame size. + * Note: before using `compressedSize`, check for errors using ZSTD_isError(). 
+ * similarly, before using `decompressedBound`, check for errors using: + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ +typedef struct { + size_t compressedSize; + unsigned long long decompressedBound; +} ZSTD_frameSizeInfo; /* decompress & legacy */ + +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + +/* custom memory allocation functions */ +void* ZSTD_malloc(size_t size, ZSTD_customMem customMem); +void* ZSTD_calloc(size_t size, ZSTD_customMem customMem); +void ZSTD_free(void* ptr, ZSTD_customMem customMem); + + +MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +{ + assert(val != 0); + { +# if defined(_MSC_VER) /* Visual */ + unsigned long r=0; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */ + return __builtin_clz (val) ^ 31; +# elif defined(__ICCARM__) /* IAR Intrinsic */ + return 31 - __CLZ(val); +# else /* Software version */ + static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; + U32 v = val; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +# endif + } +} + + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx); /* zstdmt, adaptive_compression (shouldn't get this definition from here) */ + + +typedef struct { + blockType_e blockType; + U32 lastBlock; + U32 origSize; +} blockProperties_t; /* declared here for decompress and fullbench */ + +/*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +/* Used by: decompress, fullbench (does not get its definition from here) */ +size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + +/*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +/* Used by: decompress, fullbench (does not get its definition from here) */ +size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_CCOMMON_H_MODULE */ +/**** ended inlining zstd_internal.h ****/ +/**** start inlining pool.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef POOL_H +#define POOL_H + +#if defined (__cplusplus) +extern "C" { +#endif + + +#include /* size_t */ +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_customMem */ +/**** skipping file: ../zstd.h ****/ + +typedef struct POOL_ctx_s POOL_ctx; + +/*! POOL_create() : + * Create a thread pool with at most `numThreads` threads. + * `numThreads` must be at least 1. + * The maximum number of queued jobs before blocking is `queueSize`. + * @return : POOL_ctx pointer on success, else NULL. +*/ +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize); + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, + ZSTD_customMem customMem); + +/*! POOL_free() : + * Free a thread pool returned by POOL_create(). + */ +void POOL_free(POOL_ctx* ctx); + +/*! POOL_resize() : + * Expands or shrinks pool's number of threads. 
+ * This is more efficient than releasing + creating a new context, + * since it tries to preserve and re-use existing threads. + * `numThreads` must be at least 1. + * @return : 0 when resize was successful, + * !0 (typically 1) if there is an error. + * note : only numThreads can be resized, queueSize remains unchanged. + */ +int POOL_resize(POOL_ctx* ctx, size_t numThreads); + +/*! POOL_sizeof() : + * @return threadpool memory usage + * note : compatible with NULL (returns 0 in this case) + */ +size_t POOL_sizeof(POOL_ctx* ctx); + +/*! POOL_function : + * The function type that can be added to a thread pool. + */ +typedef void (*POOL_function)(void*); + +/*! POOL_add() : + * Add the job `function(opaque)` to the thread pool. `ctx` must be valid. + * Possibly blocks until there is room in the queue. + * Note : The function may be executed asynchronously, + * therefore, `opaque` must live until function has been completed. + */ +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque); + + +/*! POOL_tryAdd() : + * Add the job `function(opaque)` to thread pool _if_ a worker is available. + * Returns immediately even if not (does not block). + * @return : 1 if successful, 0 if not. + */ +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque); + + +#if defined (__cplusplus) +} +#endif + +#endif +/**** ended inlining pool.h ****/ + +/* ====== Compiler specifics ====== */ +#if defined(_MSC_VER) +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +#endif + + +#ifdef ZSTD_MULTITHREAD + +/**** start inlining threading.h ****/ +/** + * Copyright (c) 2016 Tino Reichardt + * All rights reserved. 
+ * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef THREADING_H_938743 +#define THREADING_H_938743 + +/**** skipping file: debug.h ****/ + +#if defined (__cplusplus) +extern "C" { +#endif + +#if defined(ZSTD_MULTITHREAD) && defined(_WIN32) + +/** + * Windows minimalist Pthread Wrapper, based on : + * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html + */ +#ifdef WINVER +# undef WINVER +#endif +#define WINVER 0x0600 + +#ifdef _WIN32_WINNT +# undef _WIN32_WINNT +#endif +#define _WIN32_WINNT 0x0600 + +#ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif + +#undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */ +#include +#undef ERROR +#define ERROR(name) ZSTD_ERROR(name) + + +/* mutex */ +#define ZSTD_pthread_mutex_t CRITICAL_SECTION +#define ZSTD_pthread_mutex_init(a, b) ((void)(b), InitializeCriticalSection((a)), 0) +#define ZSTD_pthread_mutex_destroy(a) DeleteCriticalSection((a)) +#define ZSTD_pthread_mutex_lock(a) EnterCriticalSection((a)) +#define ZSTD_pthread_mutex_unlock(a) LeaveCriticalSection((a)) + +/* condition variable */ +#define ZSTD_pthread_cond_t CONDITION_VARIABLE +#define ZSTD_pthread_cond_init(a, b) ((void)(b), InitializeConditionVariable((a)), 0) +#define ZSTD_pthread_cond_destroy(a) ((void)(a)) +#define ZSTD_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) +#define ZSTD_pthread_cond_signal(a) WakeConditionVariable((a)) +#define ZSTD_pthread_cond_broadcast(a) WakeAllConditionVariable((a)) + +/* ZSTD_pthread_create() and ZSTD_pthread_join() */ +typedef struct { + HANDLE handle; + void* (*start_routine)(void*); + void* arg; +} 
ZSTD_pthread_t; + +int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, + void* (*start_routine) (void*), void* arg); + +int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr); + +/** + * add here more wrappers as required + */ + + +#elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection method */ +/* === POSIX Systems === */ +# include + +#if DEBUGLEVEL < 1 + +#define ZSTD_pthread_mutex_t pthread_mutex_t +#define ZSTD_pthread_mutex_init(a, b) pthread_mutex_init((a), (b)) +#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a)) +#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a)) +#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a)) + +#define ZSTD_pthread_cond_t pthread_cond_t +#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b)) +#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a)) +#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b)) +#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a)) +#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a)) + +#define ZSTD_pthread_t pthread_t +#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) + +#else /* DEBUGLEVEL >= 1 */ + +/* Debug implementation of threading. + * In this implementation we use pointers for mutexes and condition variables. + * This way, if we forget to init/destroy them the program will crash or ASAN + * will report leaks. 
+ */ + +#define ZSTD_pthread_mutex_t pthread_mutex_t* +int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr); +int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex); +#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock(*(a)) +#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock(*(a)) + +#define ZSTD_pthread_cond_t pthread_cond_t* +int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr); +int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond); +#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait(*(a), *(b)) +#define ZSTD_pthread_cond_signal(a) pthread_cond_signal(*(a)) +#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast(*(a)) + +#define ZSTD_pthread_t pthread_t +#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) + +#endif + +#else /* ZSTD_MULTITHREAD not defined */ +/* No multithreading support */ + +typedef int ZSTD_pthread_mutex_t; +#define ZSTD_pthread_mutex_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_mutex_destroy(a) ((void)(a)) +#define ZSTD_pthread_mutex_lock(a) ((void)(a)) +#define ZSTD_pthread_mutex_unlock(a) ((void)(a)) + +typedef int ZSTD_pthread_cond_t; +#define ZSTD_pthread_cond_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_cond_destroy(a) ((void)(a)) +#define ZSTD_pthread_cond_wait(a, b) ((void)(a), (void)(b)) +#define ZSTD_pthread_cond_signal(a) ((void)(a)) +#define ZSTD_pthread_cond_broadcast(a) ((void)(a)) + +/* do not use ZSTD_pthread_t */ + +#endif /* ZSTD_MULTITHREAD */ + +#if defined (__cplusplus) +} +#endif + +#endif /* THREADING_H_938743 */ +/**** ended inlining threading.h ****/ + +/* A job is a function and an opaque argument */ +typedef struct POOL_job_s { + POOL_function function; + void *opaque; +} POOL_job; + +struct POOL_ctx_s { + ZSTD_customMem customMem; + /* Keep track of the threads */ + ZSTD_pthread_t* threads; + size_t 
threadCapacity; + size_t threadLimit; + + /* The queue is a circular buffer */ + POOL_job *queue; + size_t queueHead; + size_t queueTail; + size_t queueSize; + + /* The number of threads working on jobs */ + size_t numThreadsBusy; + /* Indicates if the queue is empty */ + int queueEmpty; + + /* The mutex protects the queue */ + ZSTD_pthread_mutex_t queueMutex; + /* Condition variable for pushers to wait on when the queue is full */ + ZSTD_pthread_cond_t queuePushCond; + /* Condition variables for poppers to wait on when the queue is empty */ + ZSTD_pthread_cond_t queuePopCond; + /* Indicates if the queue is shutting down */ + int shutdown; +}; + +/* POOL_thread() : + * Work thread for the thread pool. + * Waits for jobs and executes them. + * @returns : NULL on failure else non-null. + */ +static void* POOL_thread(void* opaque) { + POOL_ctx* const ctx = (POOL_ctx*)opaque; + if (!ctx) { return NULL; } + for (;;) { + /* Lock the mutex and wait for a non-empty queue or until shutdown */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + + while ( ctx->queueEmpty + || (ctx->numThreadsBusy >= ctx->threadLimit) ) { + if (ctx->shutdown) { + /* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit), + * a few threads will be shutdown while !queueEmpty, + * but enough threads will remain active to finish the queue */ + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return opaque; + } + ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); + } + /* Pop a job off the queue */ + { POOL_job const job = ctx->queue[ctx->queueHead]; + ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize; + ctx->numThreadsBusy++; + ctx->queueEmpty = ctx->queueHead == ctx->queueTail; + /* Unlock the mutex, signal a pusher, and run the job */ + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + + job.function(job.opaque); + + /* If the intended queue size was 0, signal after finishing job */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + 
ctx->numThreadsBusy--; + if (ctx->queueSize == 1) { + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + } + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + } + } /* for (;;) */ + assert(0); /* Unreachable */ +} + +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); +} + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, + ZSTD_customMem customMem) { + POOL_ctx* ctx; + /* Check parameters */ + if (!numThreads) { return NULL; } + /* Allocate the context and zero initialize */ + ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem); + if (!ctx) { return NULL; } + /* Initialize the job queue. + * It needs one extra space since one space is wasted to differentiate + * empty and full queues. + */ + ctx->queueSize = queueSize + 1; + ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem); + ctx->queueHead = 0; + ctx->queueTail = 0; + ctx->numThreadsBusy = 0; + ctx->queueEmpty = 1; + { + int error = 0; + error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL); + error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL); + error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL); + if (error) { POOL_free(ctx); return NULL; } + } + ctx->shutdown = 0; + /* Allocate space for the thread handles */ + ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem); + ctx->threadCapacity = 0; + ctx->customMem = customMem; + /* Check for errors */ + if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; } + /* Initialize the threads */ + { size_t i; + for (i = 0; i < numThreads; ++i) { + if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) { + ctx->threadCapacity = i; + POOL_free(ctx); + return NULL; + } } + ctx->threadCapacity = numThreads; + ctx->threadLimit = numThreads; + } + return ctx; +} + +/*! POOL_join() : + Shutdown the queue, wake any sleeping threads, and join all of the threads. 
+*/ +static void POOL_join(POOL_ctx* ctx) { + /* Shut down the queue */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + ctx->shutdown = 1; + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + /* Wake up sleeping threads */ + ZSTD_pthread_cond_broadcast(&ctx->queuePushCond); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); + /* Join all of the threads */ + { size_t i; + for (i = 0; i < ctx->threadCapacity; ++i) { + ZSTD_pthread_join(ctx->threads[i], NULL); /* note : could fail */ + } } +} + +void POOL_free(POOL_ctx *ctx) { + if (!ctx) { return; } + POOL_join(ctx); + ZSTD_pthread_mutex_destroy(&ctx->queueMutex); + ZSTD_pthread_cond_destroy(&ctx->queuePushCond); + ZSTD_pthread_cond_destroy(&ctx->queuePopCond); + ZSTD_free(ctx->queue, ctx->customMem); + ZSTD_free(ctx->threads, ctx->customMem); + ZSTD_free(ctx, ctx->customMem); +} + + + +size_t POOL_sizeof(POOL_ctx *ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + return sizeof(*ctx) + + ctx->queueSize * sizeof(POOL_job) + + ctx->threadCapacity * sizeof(ZSTD_pthread_t); +} + + +/* @return : 0 on success, 1 on error */ +static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads) +{ + if (numThreads <= ctx->threadCapacity) { + if (!numThreads) return 1; + ctx->threadLimit = numThreads; + return 0; + } + /* numThreads > threadCapacity */ + { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem); + if (!threadPool) return 1; + /* replace existing thread pool */ + memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool)); + ZSTD_free(ctx->threads, ctx->customMem); + ctx->threads = threadPool; + /* Initialize additional threads */ + { size_t threadId; + for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) { + if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) { + ctx->threadCapacity = threadId; + return 1; + } } + } } + /* successfully expanded */ + ctx->threadCapacity = numThreads; + 
ctx->threadLimit = numThreads; + return 0; +} + +/* @return : 0 on success, 1 on error */ +int POOL_resize(POOL_ctx* ctx, size_t numThreads) +{ + int result; + if (ctx==NULL) return 1; + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + result = POOL_resize_internal(ctx, numThreads); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return result; +} + +/** + * Returns 1 if the queue is full and 0 otherwise. + * + * When queueSize is 1 (pool was created with an intended queueSize of 0), + * then a queue is empty if there is a thread free _and_ no job is waiting. + */ +static int isQueueFull(POOL_ctx const* ctx) { + if (ctx->queueSize > 1) { + return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize); + } else { + return (ctx->numThreadsBusy == ctx->threadLimit) || + !ctx->queueEmpty; + } +} + + +static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque) +{ + POOL_job const job = {function, opaque}; + assert(ctx != NULL); + if (ctx->shutdown) return; + + ctx->queueEmpty = 0; + ctx->queue[ctx->queueTail] = job; + ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize; + ZSTD_pthread_cond_signal(&ctx->queuePopCond); +} + +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + /* Wait until there is space in the queue for the new job */ + while (isQueueFull(ctx) && (!ctx->shutdown)) { + ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); + } + POOL_add_internal(ctx, function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); +} + + +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + if (isQueueFull(ctx)) { + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 0; + } + POOL_add_internal(ctx, function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 1; +} + + +#else /* ZSTD_MULTITHREAD 
not defined */ + +/* ========================== */ +/* No multi-threading support */ +/* ========================== */ + + +/* We don't need any data, but if it is empty, malloc() might return NULL. */ +struct POOL_ctx_s { + int dummy; +}; +static POOL_ctx g_ctx; + +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); +} + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) { + (void)numThreads; + (void)queueSize; + (void)customMem; + return &g_ctx; +} + +void POOL_free(POOL_ctx* ctx) { + assert(!ctx || ctx == &g_ctx); + (void)ctx; +} + +int POOL_resize(POOL_ctx* ctx, size_t numThreads) { + (void)ctx; (void)numThreads; + return 0; +} + +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); +} + +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); + return 1; +} + +size_t POOL_sizeof(POOL_ctx* ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + assert(ctx == &g_ctx); + return sizeof(*ctx); +} + +#endif /* ZSTD_MULTITHREAD */ +/**** ended inlining common/pool.c ****/ +/**** start inlining common/zstd_common.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + + + +/*-************************************* +* Dependencies +***************************************/ +#include /* malloc, calloc, free */ +#include /* memset */ +/**** skipping file: error_private.h ****/ +/**** skipping file: zstd_internal.h ****/ + + +/*-**************************************** +* Version +******************************************/ +unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; } + +const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; } + + +/*-**************************************** +* ZSTD Error Management +******************************************/ +#undef ZSTD_isError /* defined within zstd_internal.h */ +/*! ZSTD_isError() : + * tells if a return value is an error code + * symbol is required for external callers */ +unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } + +/*! ZSTD_getErrorName() : + * provides error code string from function result (useful for debugging) */ +const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); } + +/*! ZSTD_getError() : + * convert a `size_t` function result into a proper ZSTD_errorCode enum */ +ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + +/*! 
ZSTD_getErrorString() : + * provides error code string from enum */ +const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } + + + +/*=************************************************************** +* Custom allocator +****************************************************************/ +void* ZSTD_malloc(size_t size, ZSTD_customMem customMem) +{ + if (customMem.customAlloc) + return customMem.customAlloc(customMem.opaque, size); + return malloc(size); +} + +void* ZSTD_calloc(size_t size, ZSTD_customMem customMem) +{ + if (customMem.customAlloc) { + /* calloc implemented as malloc+memset; + * not as efficient as calloc, but next best guess for custom malloc */ + void* const ptr = customMem.customAlloc(customMem.opaque, size); + memset(ptr, 0, size); + return ptr; + } + return calloc(1, size); +} + +void ZSTD_free(void* ptr, ZSTD_customMem customMem) +{ + if (ptr!=NULL) { + if (customMem.customFree) + customMem.customFree(customMem.opaque, ptr); + else + free(ptr); + } +} +/**** ended inlining common/zstd_common.c ****/ + +/**** start inlining compress/fse_compress.c ****/ +/* ****************************************************************** + * FSE : Finite State Entropy encoder + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */ + +/* ************************************************************** +* Includes +****************************************************************/ +#include /* malloc, free, qsort */ +#include /* memcpy, memset */ +/**** skipping file: ../common/compiler.h ****/ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: ../common/debug.h ****/ +/**** start inlining hist.h ****/ +/* ****************************************************************** + * hist : Histogram functions + * part of Finite State Entropy project + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* --- dependencies --- */ +#include /* size_t */ + + +/* --- simple histogram functions --- */ + +/*! HIST_count(): + * Provides the precise count of each byte within a table 'count'. + * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1). + * Updates *maxSymbolValuePtr with actual largest symbol value detected. + * @return : count of the most frequent symbol (which isn't identified). + * or an error code, which can be tested using HIST_isError(). + * note : if return == srcSize, there is only one symbol. 
+ */ +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); + +unsigned HIST_isError(size_t code); /**< tells if a return value is an error code */ + + +/* --- advanced histogram functions --- */ + +#define HIST_WKSP_SIZE_U32 1024 +#define HIST_WKSP_SIZE (HIST_WKSP_SIZE_U32 * sizeof(unsigned)) +/** HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. + * Benefit is this function will use very little stack space. + * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize, + void* workSpace, size_t workSpaceSize); + +/** HIST_countFast() : + * same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr. + * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` + */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); + +/** HIST_countFast_wksp() : + * Same as HIST_countFast(), but using an externally provided scratch buffer. + * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize, + void* workSpace, size_t workSpaceSize); + +/*! HIST_count_simple() : + * Same as HIST_countFast(), this function is unsafe, + * and will segfault if any value within `src` is `> *maxSymbolValuePtr`. + * It is also a bit slower for large inputs. + * However, it does not need any additional memory (not even on stack). + * @return : count of the most frequent symbol. + * Note this function doesn't produce any error (i.e. it must succeed). 
+ */ +unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); +/**** ended inlining hist.h ****/ +/**** skipping file: ../common/bitstream.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: ../common/fse.h ****/ +/**** skipping file: ../common/error_private.h ****/ + + +/* ************************************************************** +* Error Management +****************************************************************/ +#define FSE_isError ERR_isError + + +/* ************************************************************** +* Templates +****************************************************************/ +/* + designed to be included + for type-specific functions (template emulation in C) + Objective is to write these functions only once, for improved maintenance +*/ + +/* safety checks */ +#ifndef FSE_FUNCTION_EXTENSION +# error "FSE_FUNCTION_EXTENSION must be defined" +#endif +#ifndef FSE_FUNCTION_TYPE +# error "FSE_FUNCTION_TYPE must be defined" +#endif + +/* Function names */ +#define FSE_CAT(X,Y) X##Y +#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) +#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + + +/* Function templates */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). 
+ * wkspSize should be sized to handle worst case situation, which is `1<>1 : 1) ; + FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); + U32 const step = FSE_TABLESTEP(tableSize); + U32 cumul[FSE_MAX_SYMBOL_VALUE+2]; + + FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace; + U32 highThreshold = tableSize-1; + + /* CTable header */ + if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge); + tableU16[-2] = (U16) tableLog; + tableU16[-1] = (U16) maxSymbolValue; + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : + * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ + #endif + + /* symbol start positions */ + { U32 u; + cumul[0] = 0; + for (u=1; u <= maxSymbolValue+1; u++) { + if (normalizedCounter[u-1]==-1) { /* Low proba symbol */ + cumul[u] = cumul[u-1] + 1; + tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1); + } else { + cumul[u] = cumul[u-1] + normalizedCounter[u-1]; + } } + cumul[maxSymbolValue+1] = tableSize+1; + } + + /* Spread symbols */ + { U32 position = 0; + U32 symbol; + for (symbol=0; symbol<=maxSymbolValue; symbol++) { + int nbOccurrences; + int const freq = normalizedCounter[symbol]; + for (nbOccurrences=0; nbOccurrences highThreshold) + position = (position + step) & tableMask; /* Low proba area */ + } } + + assert(position==0); /* Must have initialized all positions */ + } + + /* Build table */ + { U32 u; for (u=0; u> 3) + 3; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? 
use default */ +} + +static size_t +FSE_writeNCount_generic (void* header, size_t headerBufferSize, + const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, + unsigned writeIsSafe) +{ + BYTE* const ostart = (BYTE*) header; + BYTE* out = ostart; + BYTE* const oend = ostart + headerBufferSize; + int nbBits; + const int tableSize = 1 << tableLog; + int remaining; + int threshold; + U32 bitStream = 0; + int bitCount = 0; + unsigned symbol = 0; + unsigned const alphabetSize = maxSymbolValue + 1; + int previousIs0 = 0; + + /* Table Size */ + bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount; + bitCount += 4; + + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; + nbBits = tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { + unsigned start = symbol; + while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++; + if (symbol == alphabetSize) break; /* incorrect distribution */ + while (symbol >= start+24) { + start+=24; + bitStream += 0xFFFFU << bitCount; + if ((!writeIsSafe) && (out > oend-2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE) bitStream; + out[1] = (BYTE)(bitStream>>8); + out+=2; + bitStream>>=16; + } + while (symbol >= start+3) { + start+=3; + bitStream += 3 << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; + bitCount += 2; + if (bitCount>16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } } + { int count = normalizedCounter[symbol++]; + int const max = (2*threshold-1) - remaining; + remaining -= count < 0 ? -count : count; + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */ + bitStream += count << bitCount; + bitCount += nbBits; + bitCount -= (count>=1; } + } + if (bitCount>16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } } + + if (remaining != 1) + return ERROR(GENERIC); /* incorrect normalized distribution */ + assert(symbol <= alphabetSize); + + /* flush remaining bitStream */ + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out+= (bitCount+7) /8; + + return (out-ostart); +} + + +size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported */ + if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported */ + + if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0); + + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */); +} + + +/*-************************************************************** +* FSE Compression Code +****************************************************************/ + +FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +{ + size_t size; + if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; + size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); + return (FSE_CTable*)malloc(size); +} + +void FSE_freeCTable (FSE_CTable* ct) { free(ct); } + +/* provides the minimum logSize to safely represent a distribution */ +static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) +{ + U32 
minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; + U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +} + +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) +{ + U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; + if (maxBitsSrc < tableLog) tableLog = maxBitsSrc; /* Accuracy can be reduced */ + if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */ + if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG; + if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG; + return tableLog; +} + +unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +{ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2); +} + + +/* Secondary normalization method. + To be used when primary method fails. 
*/ + +static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue) +{ + short const NOT_YET_ASSIGNED = -2; + U32 s; + U32 distributed = 0; + U32 ToDistribute; + + /* Init */ + U32 const lowThreshold = (U32)(total >> tableLog); + U32 lowOne = (U32)((total * 3) >> (tableLog + 1)); + + for (s=0; s<=maxSymbolValue; s++) { + if (count[s] == 0) { + norm[s]=0; + continue; + } + if (count[s] <= lowThreshold) { + norm[s] = -1; + distributed++; + total -= count[s]; + continue; + } + if (count[s] <= lowOne) { + norm[s] = 1; + distributed++; + total -= count[s]; + continue; + } + + norm[s]=NOT_YET_ASSIGNED; + } + ToDistribute = (1 << tableLog) - distributed; + + if (ToDistribute == 0) + return 0; + + if ((total / ToDistribute) > lowOne) { + /* risk of rounding to zero */ + lowOne = (U32)((total * 3) / (ToDistribute * 2)); + for (s=0; s<=maxSymbolValue; s++) { + if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) { + norm[s] = 1; + distributed++; + total -= count[s]; + continue; + } } + ToDistribute = (1 << tableLog) - distributed; + } + + if (distributed == maxSymbolValue+1) { + /* all values are pretty poor; + probably incompressible data (should have already been detected); + find max, then give all remaining points to max */ + U32 maxV = 0, maxC = 0; + for (s=0; s<=maxSymbolValue; s++) + if (count[s] > maxC) { maxV=s; maxC=count[s]; } + norm[maxV] += (short)ToDistribute; + return 0; + } + + if (total == 0) { + /* all of the symbols were low enough for the lowOne or lowThreshold */ + for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1)) + if (norm[s] > 0) { ToDistribute--; norm[s]++; } + return 0; + } + + { U64 const vStepLog = 62 - tableLog; + U64 const mid = (1ULL << (vStepLog-1)) - 1; + U64 const rStep = ((((U64)1<> vStepLog); + U32 const sEnd = (U32)(end >> vStepLog); + U32 const weight = sEnd - sStart; + if (weight < 1) + return ERROR(GENERIC); + norm[s] = (short)weight; + tmpTotal = end; + } } } + + return 
0; +} + + +size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + const unsigned* count, size_t total, + unsigned maxSymbolValue) +{ + /* Sanity checks */ + if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; + if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported size */ + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported size */ + if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */ + + { static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 }; + U64 const scale = 62 - tableLog; + U64 const step = ((U64)1<<62) / total; /* <== here, one division ! */ + U64 const vStep = 1ULL<<(scale-20); + int stillToDistribute = 1<> tableLog); + + for (s=0; s<=maxSymbolValue; s++) { + if (count[s] == total) return 0; /* rle special case */ + if (count[s] == 0) { normalizedCounter[s]=0; continue; } + if (count[s] <= lowThreshold) { + normalizedCounter[s] = -1; + stillToDistribute--; + } else { + short proba = (short)((count[s]*step) >> scale); + if (proba<8) { + U64 restToBeat = vStep * rtbTable[proba]; + proba += (count[s]*step) - ((U64)proba< restToBeat; + } + if (proba > largestP) { largestP=proba; largest=s; } + normalizedCounter[s] = proba; + stillToDistribute -= proba; + } } + if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) { + /* corner case, need another normalization method */ + size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue); + if (FSE_isError(errorCode)) return errorCode; + } + else normalizedCounter[largest] += (short)stillToDistribute; + } + +#if 0 + { /* Print Table (debug) */ + U32 s; + U32 nTotal = 0; + for (s=0; s<=maxSymbolValue; s++) + RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]); + for (s=0; s<=maxSymbolValue; s++) + nTotal += abs(normalizedCounter[s]); + if (nTotal != (1U<>1); /* assumption : tableLog >= 1 
*/ + FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); + unsigned s; + + /* Sanity checks */ + if (nbBits < 1) return ERROR(GENERIC); /* min size */ + + /* header */ + tableU16[-2] = (U16) nbBits; + tableU16[-1] = (U16) maxSymbolValue; + + /* Build table */ + for (s=0; s FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) { /* test bit 2 */ + FSE_encodeSymbol(&bitC, &CState2, *--ip); + FSE_encodeSymbol(&bitC, &CState1, *--ip); + FSE_FLUSHBITS(&bitC); + } + + /* 2 or 4 encoding per loop */ + while ( ip>istart ) { + + FSE_encodeSymbol(&bitC, &CState2, *--ip); + + if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */ + FSE_FLUSHBITS(&bitC); + + FSE_encodeSymbol(&bitC, &CState1, *--ip); + + if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) { /* this test must be static */ + FSE_encodeSymbol(&bitC, &CState2, *--ip); + FSE_encodeSymbol(&bitC, &CState1, *--ip); + } + + FSE_FLUSHBITS(&bitC); + } + + FSE_flushCState(&bitC, &CState2); + FSE_flushCState(&bitC, &CState1); + return BIT_closeCStream(&bitC); +} + +size_t FSE_compress_usingCTable (void* dst, size_t dstSize, + const void* src, size_t srcSize, + const FSE_CTable* ct) +{ + unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize)); + + if (fast) + return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1); + else + return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0); +} + + +size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); } + +/* FSE_compress_wksp() : + * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). 
+ * `wkspSize` size must be `(1< not compressible */ + if (maxCount < (srcSize >> 7)) return 0; /* Heuristic : not compressible enough */ + } + + tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue); + CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) ); + + /* Write table description header */ + { CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) ); + op += nc_err; + } + + /* Compress */ + CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) ); + { CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) ); + if (cSize == 0) return 0; /* not enough space for compressed data */ + op += cSize; + } + + /* check compressibility */ + if ( (size_t)(op-ostart) >= srcSize-1 ) return 0; + + return op-ostart; +} + +typedef struct { + FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)]; + BYTE scratchBuffer[1 << FSE_MAX_TABLELOG]; +} fseWkspMax_t; + +size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog) +{ + fseWkspMax_t scratchBuffer; + DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */ + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); + return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer)); +} + +size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG); +} + + +#endif /* FSE_COMMONDEFS_ONLY */ +/**** ended inlining compress/fse_compress.c ****/ +/**** start inlining compress/hist.c ****/ +/* ****************************************************************** + * hist : Histogram functions + * 
part of Finite State Entropy project + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* --- dependencies --- */ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: ../common/debug.h ****/ +/**** skipping file: ../common/error_private.h ****/ +/**** skipping file: hist.h ****/ + + +/* --- Error management --- */ +unsigned HIST_isError(size_t code) { return ERR_isError(code); } + +/*-************************************************************** + * Histogram functions + ****************************************************************/ +unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) +{ + const BYTE* ip = (const BYTE*)src; + const BYTE* const end = ip + srcSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned largestCount=0; + + memset(count, 0, (maxSymbolValue+1) * sizeof(*count)); + if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; } + + while (ip largestCount) largestCount = count[s]; + } + + return largestCount; +} + +typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e; + +/* HIST_count_parallel_wksp() : + * store histogram into 4 intermediate tables, recombined at the end. + * this design makes better use of OoO cpus, + * and is noticeably faster when some values are heavily repeated. + * But it needs some additional workspace for intermediate tables. 
+ * `workSpace` size must be a table of size >= HIST_WKSP_SIZE_U32. + * @return : largest histogram frequency, + * or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */ +static size_t HIST_count_parallel_wksp( + unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + HIST_checkInput_e check, + U32* const workSpace) +{ + const BYTE* ip = (const BYTE*)source; + const BYTE* const iend = ip+sourceSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned max=0; + U32* const Counting1 = workSpace; + U32* const Counting2 = Counting1 + 256; + U32* const Counting3 = Counting2 + 256; + U32* const Counting4 = Counting3 + 256; + + memset(workSpace, 0, 4*256*sizeof(unsigned)); + + /* safety checks */ + if (!sourceSize) { + memset(count, 0, maxSymbolValue + 1); + *maxSymbolValuePtr = 0; + return 0; + } + if (!maxSymbolValue) maxSymbolValue = 255; /* 0 == default */ + + /* by stripes of 16 bytes */ + { U32 cached = MEM_read32(ip); ip += 4; + while (ip < iend-15) { + U32 c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + } + ip-=4; + } + + /* finish last symbols */ + while (ipmaxSymbolValue; s--) { + Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s]; + if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall); + } } + + { U32 s; + if (maxSymbolValue > 255) maxSymbolValue = 255; + for (s=0; s<=maxSymbolValue; s++) { + count[s] = 
Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; + if (count[s] > max) max = count[s]; + } } + + while (!count[maxSymbolValue]) maxSymbolValue--; + *maxSymbolValuePtr = maxSymbolValue; + return (size_t)max; +} + +/* HIST_countFast_wksp() : + * Same as HIST_countFast(), but using an externally provided scratch buffer. + * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if (sourceSize < 1500) /* heuristic threshold */ + return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize); + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace); +} + +/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters)); +} + +/* HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. 
+ * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + if (*maxSymbolValuePtr < 255) + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace); + *maxSymbolValuePtr = 255; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize); +} + +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters)); +} +/**** ended inlining compress/hist.c ****/ +/**** start inlining compress/huf_compress.c ****/ +/* ****************************************************************** + * Huffman encoder, part of New Generation Entropy library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */ + +/* ************************************************************** +* Compiler specifics +****************************************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + + +/* ************************************************************** +* Includes +****************************************************************/ +#include /* memcpy, memset */ +#include /* printf (debug) */ +/**** skipping file: ../common/compiler.h ****/ +/**** skipping file: ../common/bitstream.h ****/ +/**** skipping file: hist.h ****/ +#define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */ +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: ../common/error_private.h ****/ + + +/* ************************************************************** +* Error Management +****************************************************************/ +#define HUF_isError ERR_isError +#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ + + +/* ************************************************************** +* Utils +****************************************************************/ +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +{ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); +} + + +/* ******************************************************* +* HUF : Huffman block compression +*********************************************************/ +/* HUF_compressWeights() : + * Same as FSE_compress(), but dedicated to huff0's weights compression. + * The use case needs much less stack memory. + * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX. 
+ */ +#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6 +static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize) +{ + BYTE* const ostart = (BYTE*) dst; + BYTE* op = ostart; + BYTE* const oend = ostart + dstSize; + + unsigned maxSymbolValue = HUF_TABLELOG_MAX; + U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; + + FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)]; + BYTE scratchBuffer[1< not compressible */ + } + + tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue); + CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) ); + + /* Write table description header */ + { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), norm, maxSymbolValue, tableLog) ); + op += hSize; + } + + /* Compress */ + CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) ); + { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, CTable) ); + if (cSize == 0) return 0; /* not enough space for compressed data */ + op += cSize; + } + + return (size_t)(op-ostart); +} + + +struct HUF_CElt_s { + U16 val; + BYTE nbBits; +}; /* typedef'd to HUF_CElt within "huf.h" */ + +/*! HUF_writeCTable() : + `CTable` : Huffman tree to save, using huf representation. 
+ @return : size of saved CTable */ +size_t HUF_writeCTable (void* dst, size_t maxDstSize, + const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +{ + BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */ + BYTE huffWeight[HUF_SYMBOLVALUE_MAX]; + BYTE* op = (BYTE*)dst; + U32 n; + + /* check conditions */ + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); + + /* convert to weight */ + bitsToWeight[0] = 0; + for (n=1; n1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */ + op[0] = (BYTE)hSize; + return hSize+1; + } } + + /* write raw values as 4-bits (max : 15) */ + if (maxSymbolValue > (256-128)) return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */ + if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */ + op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1)); + huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */ + for (n=0; n HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; + for (n=1; n<=tableLog; n++) { + U32 current = nextRankStart; + nextRankStart += (rankVal[n] << (n-1)); + rankVal[n] = current; + } } + + /* fill nbBits */ + *hasZeroWeights = 0; + { U32 n; for (n=0; nn=tableLog+1 */ + U16 valPerRank[HUF_TABLELOG_MAX+2] = {0}; + { U32 n; for (n=0; n0; n--) { /* start at n=tablelog <-> w=1 */ + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } } + /* assign value within rank, symbol order */ + { U32 n; for (n=0; n maxNbBits */ + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; + const U32 baseCost = 1 << (largestBits - maxNbBits); + int n = (int)lastNonNull; + + while (huffNode[n].nbBits > maxNbBits) { + 
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); + huffNode[n].nbBits = (BYTE)maxNbBits; + n --; + } /* n stops at huffNode[n].nbBits <= maxNbBits */ + while (huffNode[n].nbBits == maxNbBits) n--; /* n end at index of smallest symbol using < maxNbBits */ + + /* renorm totalCost */ + totalCost >>= (largestBits - maxNbBits); /* note : totalCost is necessarily a multiple of baseCost */ + + /* repay normalized cost */ + { U32 const noSymbol = 0xF0F0F0F0; + U32 rankLast[HUF_TABLELOG_MAX+2]; + + /* Get pos of last (smallest) symbol per rank */ + memset(rankLast, 0xF0, sizeof(rankLast)); + { U32 currentNbBits = maxNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; + currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ + rankLast[maxNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; + if (highPos == noSymbol) continue; + if (lowPos == noSymbol) break; + { U32 const highTotal = huffNode[highPos].count; + U32 const lowTotal = 2 * huffNode[lowPos].count; + if (highTotal <= lowTotal) break; + } } + /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) 
*/ + /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) + nBitsToDecrease ++; + totalCost -= 1 << (nBitsToDecrease-1); + if (rankLast[nBitsToDecrease-1] == noSymbol) + rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */ + huffNode[rankLast[nBitsToDecrease]].nbBits ++; + if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */ + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; + if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } } /* while (totalCost > 0) */ + + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ + if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */ + while (huffNode[n].nbBits == maxNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); + totalCost++; + continue; + } + huffNode[ rankLast[1] + 1 ].nbBits--; + rankLast[1]++; + totalCost ++; + } } } /* there are several too large elements (at least >= 2) */ + + return maxNbBits; +} + +typedef struct { + U32 base; + U32 current; +} rankPos; + +typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; + +#define RANK_POSITION_TABLE_SIZE 32 + +typedef struct { + huffNodeTable huffNodeTbl; + rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; +} HUF_buildCTable_wksp_tables; + +static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition) +{ + U32 n; + + memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); + for (n=0; n<=maxSymbolValue; n++) { + U32 r = BIT_highbit32(count[n] + 1); + rankPosition[r].base ++; + } + for (n=30; n>0; n--) rankPosition[n-1].base += rankPosition[n].base; + for (n=0; n<32; 
n++) rankPosition[n].current = rankPosition[n].base; + for (n=0; n<=maxSymbolValue; n++) { + U32 const c = count[n]; + U32 const r = BIT_highbit32(c+1) + 1; + U32 pos = rankPosition[r].current++; + while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) { + huffNode[pos] = huffNode[pos-1]; + pos--; + } + huffNode[pos].count = c; + huffNode[pos].byte = (BYTE)n; + } +} + + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). + */ +#define STARTNODE (HUF_SYMBOLVALUE_MAX+1) + +size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) +{ + HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace; + nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; + + /* safety checks */ + if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) + return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) + return ERROR(maxSymbolValue_tooLarge); + memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); + + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; + lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb; + huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count; + huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb; + nodeNb++; lowS-=2; + for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = 
(U32)(1U<<30); + huffNode0[0].count = (U32)(1U<<31); /* fake entry, strong barrier */ + + /* create parents */ + while (nodeNb <= nodeRoot) { + int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count; + huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb; + nodeNb++; + } + + /* distribute weights (unlimited tree height) */ + huffNode[nodeRoot].nbBits = 0; + for (n=nodeRoot-1; n>=STARTNODE; n--) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + + /* enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + + /* fill result into tree (val, nbBits) */ + { U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; + U16 valPerRank[HUF_TABLELOG_MAX+1] = {0}; + int const alphabetSize = (int)(maxSymbolValue + 1); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + for (n=0; n<=nonNullRank; n++) + nbPerRank[huffNode[n].nbBits]++; + /* determine stating value per rank */ + { U16 min = 0; + for (n=(int)maxNbBits; n>0; n--) { + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } } + for (n=0; n> 3; +} + +int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + int bad = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (CTable[s].nbBits == 0); + } + return !bad; +} + +size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } + +FORCE_INLINE_TEMPLATE void +HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +{ + BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); +} + +#define HUF_FLUSHBITS(s) BIT_flushBits(s) + +#define HUF_FLUSHBITS_1(stream) \ + 
if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) + +#define HUF_FLUSHBITS_2(stream) \ + if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) + +FORCE_INLINE_TEMPLATE size_t +HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + size_t n; + BIT_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ + { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + n = srcSize & ~3; /* join to mod 4 */ + switch (srcSize & 3) + { + case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); + HUF_FLUSHBITS_2(&bitC); + /* fall-through */ + case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); + HUF_FLUSHBITS_1(&bitC); + /* fall-through */ + case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); + HUF_FLUSHBITS(&bitC); + /* fall-through */ + case 0 : /* fall-through */ + default: break; + } + + for (; n>0; n-=4) { /* note : n&3==0 at this stage */ + HUF_encodeSymbol(&bitC, ip[n- 1], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 2], CTable); + HUF_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 3], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 4], CTable); + HUF_FLUSHBITS(&bitC); + } + + return BIT_closeCStream(&bitC); +} + +#if DYNAMIC_BMI2 + +static TARGET_ATTRIBUTE("bmi2") size_t +HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return 
HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + if (bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +} + +#else + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + (void)bmi2; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +#endif + +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + + +static size_t +HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, int bmi2) +{ + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ + if (srcSize < 12) return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + 
MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); + if (cSize==0) return 0; + op += cSize; + } + + return (size_t)(op-ostart); +} + +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + +typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + +static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, + HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) +{ + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : + HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; + /* check compressibility */ + assert(op >= ostart); + if ((size_t)(op-ostart) >= srcSize-1) { return 0; } + return (size_t)(op-ostart); +} + +typedef struct { + unsigned count[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; + HUF_buildCTable_wksp_tables buildCTable_wksp; +} HUF_compress_tables_t; + +/* HUF_compress_internal() : + * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ +static size_t +HUF_compress_internal (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, + HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, + const int bmi2) +{ + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ + if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (!srcSize) return 0; /* Uncompressed */ + if (!dstSize) return 0; /* cannot fit anything within dst budget */ + if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ + if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); + if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX; + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ + if 
(preferRepeat && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } + + /* Scan input and build symbol stats */ + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace, wkspSize) ); + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + + /* Check validity of previous table */ + if ( repeat + && *repeat == HUF_repeat_check + && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) { + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ + if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } + + /* Build Huffman Tree */ + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->buildCTable_wksp, sizeof(table->buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; + /* Zero unused symbols in CTable, so we can check it for validity */ + memset(table->CTable + (maxSymbolValue + 1), 0, + sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); + } + + /* Write table description header */ + { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) ); + /* Check if using previous huffman table is beneficial */ + if (repeat && *repeat != HUF_repeat_none) { + size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue); + size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue); + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, 
oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } } + + /* Use the new huffman table */ + if (hSize + 12ul >= srcSize) { return 0; } + op += hSize; + if (repeat) { *repeat = HUF_repeat_none; } + if (oldHufTable) + memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */ + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, table->CTable, bmi2); +} + + +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, + NULL, NULL, 0, 0 /*bmi2*/); +} + +size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, + repeat, preferRepeat, bmi2); +} + +size_t HUF_compress1X (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog) +{ + unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; + return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); +} + +/* HUF_compress4X_repeat(): + * compress input using 4 streams. + * provide workspace to generate compression tables */ +size_t HUF_compress4X_wksp (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, + NULL, NULL, 0, 0 /*bmi2*/); +} + +/* HUF_compress4X_repeat(): + * compress input using 4 streams. 
+ * re-use an existing huffman compression table */ +size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, + hufTable, repeat, preferRepeat, bmi2); +} + +size_t HUF_compress2 (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog) +{ + unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; + return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); +} + +size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize) +{ + return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT); +} +/**** ended inlining compress/huf_compress.c ****/ +/**** start inlining compress/zstd_compress_literals.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +/**** start inlining zstd_compress_literals.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_COMPRESS_LITERALS_H +#define ZSTD_COMPRESS_LITERALS_H + +/**** start inlining zstd_compress_internal.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* This header contains definitions + * that shall **only** be used by modules within lib/compress. + */ + +#ifndef ZSTD_COMPRESS_H +#define ZSTD_COMPRESS_H + +/*-************************************* +* Dependencies +***************************************/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** start inlining zstd_cwksp.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_CWKSP_H +#define ZSTD_CWKSP_H + +/*-************************************* +* Dependencies +***************************************/ +/**** skipping file: ../common/zstd_internal.h ****/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-************************************* +* Constants +***************************************/ + +/* Since the workspace is effectively its own little malloc implementation / + * arena, when we run under ASAN, we should similarly insert redzones between + * each internal element of the workspace, so ASAN will catch overruns that + * reach outside an object but that stay inside the workspace. + * + * This defines the size of that redzone. 
+ */ +#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE +#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 +#endif + +/*-************************************* +* Structures +***************************************/ +typedef enum { + ZSTD_cwksp_alloc_objects, + ZSTD_cwksp_alloc_buffers, + ZSTD_cwksp_alloc_aligned +} ZSTD_cwksp_alloc_phase_e; + +/** + * Zstd fits all its internal datastructures into a single continuous buffer, + * so that it only needs to perform a single OS allocation (or so that a buffer + * can be provided to it and it can perform no allocations at all). This buffer + * is called the workspace. + * + * Several optimizations complicate that process of allocating memory ranges + * from this workspace for each internal datastructure: + * + * - These different internal datastructures have different setup requirements: + * + * - The static objects need to be cleared once and can then be trivially + * reused for each compression. + * + * - Various buffers don't need to be initialized at all--they are always + * written into before they're read. + * + * - The matchstate tables have a unique requirement that they don't need + * their memory to be totally cleared, but they do need the memory to have + * some bound, i.e., a guarantee that all values in the memory they've been + * allocated is less than some maximum value (which is the starting value + * for the indices that they will then use for compression). When this + * guarantee is provided to them, they can use the memory without any setup + * work. When it can't, they have to clear the area. + * + * - These buffers also have different alignment requirements. + * + * - We would like to reuse the objects in the workspace for multiple + * compressions without having to perform any expensive reallocation or + * reinitialization work. 
+ * + * - We would like to be able to efficiently reuse the workspace across + * multiple compressions **even when the compression parameters change** and + * we need to resize some of the objects (where possible). + * + * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp + * abstraction was created. It works as follows: + * + * Workspace Layout: + * + * [ ... workspace ... ] + * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: + * + * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict, + * so that literally everything fits in a single buffer. Note: if present, + * this must be the first object in the workspace, since ZSTD_free{CCtx, + * CDict}() rely on a pointer comparison to see whether one or two frees are + * required. + * + * - Fixed size objects: these are fixed-size, fixed-count objects that are + * nonetheless "dynamically" allocated in the workspace so that we can + * control how they're initialized separately from the broader ZSTD_CCtx. + * Examples: + * - Entropy Workspace + * - 2 x ZSTD_compressedBlockState_t + * - CDict dictionary contents + * + * - Tables: these are any of several different datastructures (hash tables, + * chain tables, binary trees) that all respect a common format: they are + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. + * + * - Aligned: these buffers are used for various purposes that require 4 byte + * alignment, but don't require any initialization before they're used. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can + * be moved around at no cost for a new compression. 
+ * + * Allocating Memory: + * + * The various types of objects must be allocated in order, so they can be + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects + * 2. Buffers + * 3. Aligned + * 4. Tables + * + * Attempts to reserve objects of different types out of order will fail. + */ +typedef struct { + void* workspace; + void* workspaceEnd; + + void* objectEnd; + void* tableEnd; + void* tableValidEnd; + void* allocStart; + + int allocFailed; + int workspaceOversizedDuration; + ZSTD_cwksp_alloc_phase_e phase; +} ZSTD_cwksp; + +/*-************************************* +* Functions +***************************************/ + +MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); + +MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; + assert(ws->workspace <= ws->objectEnd); + assert(ws->objectEnd <= ws->tableEnd); + assert(ws->objectEnd <= ws->tableValidEnd); + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); +} + +/** + * Align must be a power of 2. + */ +MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { + size_t const mask = align - 1; + assert((align & mask) == 0); + return (size + mask) & ~mask; +} + +/** + * Use this to determine how much space in the workspace we will consume to + * allocate this object. (Normally it should be exactly the size of the object, + * but under special conditions, like ASAN, where we pad each object, it might + * be larger.) + * + * Since tables aren't currently redzoned, you don't need to call through this + * to figure out how much space you need for the matchState tables. Everything + * else is though. 
+ */ +MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#else + return size; +#endif +} + +MEM_STATIC void ZSTD_cwksp_internal_advance_phase( + ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { + assert(phase >= ws->phase); + if (phase > ws->phase) { + if (ws->phase < ZSTD_cwksp_alloc_buffers && + phase >= ZSTD_cwksp_alloc_buffers) { + ws->tableValidEnd = ws->objectEnd; + } + if (ws->phase < ZSTD_cwksp_alloc_aligned && + phase >= ZSTD_cwksp_alloc_aligned) { + /* If unaligned allocations down from a too-large top have left us + * unaligned, we need to realign our alloc ptr. Technically, this + * can consume space that is unaccounted for in the neededSpace + * calculation. However, I believe this can only happen when the + * workspace is too large, and specifically when it is too large + * by a larger margin than the space that will be consumed. */ + /* TODO: cleaner, compiler warning friendly way to do this??? */ + ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1)); + if (ws->allocStart < ws->tableValidEnd) { + ws->tableValidEnd = ws->allocStart; + } + } + ws->phase = phase; + } +} + +/** + * Returns whether this object/buffer/etc was allocated in this workspace. + */ +MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { + return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); +} + +/** + * Internal function. Do not use directly. 
+ */ +MEM_STATIC void* ZSTD_cwksp_reserve_internal( + ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { + void* alloc; + void* bottom = ws->tableEnd; + ZSTD_cwksp_internal_advance_phase(ws, phase); + alloc = (BYTE *)ws->allocStart - bytes; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* over-reserve space */ + alloc = (BYTE *)alloc - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#endif + + DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); + if (alloc < bottom) { + DEBUGLOG(4, "cwksp: alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + if (alloc < ws->tableValidEnd) { + ws->tableValidEnd = alloc; + } + ws->allocStart = alloc; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on + * either size. */ + alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; + __asan_unpoison_memory_region(alloc, bytes); +#endif + + return alloc; +} + +/** + * Reserves and returns unaligned memory. + */ +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); +} + +/** + * Reserves and returns memory sized on and aligned on sizeof(unsigned). + */ +MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { + assert((bytes & (sizeof(U32)-1)) == 0); + return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned); +} + +/** + * Aligned on sizeof(unsigned). These buffers have the special property that + * their values remain constrained, allowing us to re-use them without + * memset()-ing them. 
+ */ +MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { + const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; + void* alloc = ws->tableEnd; + void* end = (BYTE *)alloc + bytes; + void* top = ws->allocStart; + + DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + assert((bytes & (sizeof(U32)-1)) == 0); + ZSTD_cwksp_internal_advance_phase(ws, phase); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(end <= top); + if (end > top) { + DEBUGLOG(4, "cwksp: table alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + ws->tableEnd = end; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + __asan_unpoison_memory_region(alloc, bytes); +#endif + + return alloc; +} + +/** + * Aligned on sizeof(void*). + */ +MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { + size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); + void* alloc = ws->objectEnd; + void* end = (BYTE*)alloc + roundedBytes; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* over-reserve space */ + end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#endif + + DEBUGLOG(5, + "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining", + alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes); + assert(((size_t)alloc & (sizeof(void*)-1)) == 0); + assert((bytes & (sizeof(void*)-1)) == 0); + ZSTD_cwksp_assert_internal_consistency(ws); + /* we must be in the first phase, no advance is possible */ + if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) { + DEBUGLOG(4, "cwksp: object alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + ws->objectEnd = end; + ws->tableEnd = end; + ws->tableValidEnd = end; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE 
unused space on + * either size. */ + alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; + __asan_unpoison_memory_region(alloc, bytes); +#endif + + return alloc; +} + +MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); + +#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table re-use logic is sound, and that we don't + * access table space that we haven't cleaned, we re-"poison" the table + * space every time we mark it dirty. */ + { + size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; + assert(__msan_test_shadow(ws->objectEnd, size) == -1); + __msan_poison(ws->objectEnd, size); + } +#endif + + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + ws->tableValidEnd = ws->objectEnd; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean"); + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; + } + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * Zero the part of the allocated tables not already marked clean. + */ +MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables"); + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { + memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); + } + ZSTD_cwksp_mark_tables_clean(ws); +} + +/** + * Invalidates table allocations. + * All other allocations remain valid. 
+ */ +MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: clearing tables!"); + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + { + size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; + __asan_poison_memory_region(ws->objectEnd, size); + } +#endif + + ws->tableEnd = ws->objectEnd; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * Invalidates all buffer, aligned, and table allocations. + * Object allocations remain valid. + */ +MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: clearing!"); + +#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the context re-use logic is sound, and that we don't + * access stuff that this compression hasn't initialized, we re-"poison" + * the workspace (or at least the non-static, non-table parts of it) + * every time we start a new compression. */ + { + size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd; + __msan_poison(ws->tableValidEnd, size); + } +#endif + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + { + size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd; + __asan_poison_memory_region(ws->objectEnd, size); + } +#endif + + ws->tableEnd = ws->objectEnd; + ws->allocStart = ws->workspaceEnd; + ws->allocFailed = 0; + if (ws->phase > ZSTD_cwksp_alloc_buffers) { + ws->phase = ZSTD_cwksp_alloc_buffers; + } + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed + * buffer, if present, must be separately freed). 
+ */ +MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size) { + DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size); + assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */ + ws->workspace = start; + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; + ws->phase = ZSTD_cwksp_alloc_objects; + ZSTD_cwksp_clear(ws); + ws->workspaceOversizedDuration = 0; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) { + void* workspace = ZSTD_malloc(size, customMem); + DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size); + RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!"); + ZSTD_cwksp_init(ws, workspace, size); + return 0; +} + +MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) { + void *ptr = ws->workspace; + DEBUGLOG(4, "cwksp: freeing workspace"); + memset(ws, 0, sizeof(ZSTD_cwksp)); + ZSTD_free(ptr, customMem); +} + +/** + * Moves the management of a workspace from one cwksp to another. The src cwksp + * is left in an invalid state (src must be re-init()'ed before its used again). 
+ */ +MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + *dst = *src; + memset(src, 0, sizeof(ZSTD_cwksp)); +} + +MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { + return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +} + +MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; +} + +/*-************************************* +* Functions Checking Free Space +***************************************/ + +MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) { + return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd); +} + +MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) { + return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace; +} + +MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) { + return ZSTD_cwksp_check_available( + ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR); +} + +MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) { + return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace) + && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION; +} + +MEM_STATIC void ZSTD_cwksp_bump_oversized_duration( + ZSTD_cwksp* ws, size_t additionalNeededSpace) { + if (ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) { + ws->workspaceOversizedDuration++; + } else { + ws->workspaceOversizedDuration = 0; + } +} + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_CWKSP_H */ +/**** ended inlining zstd_cwksp.h ****/ +#ifdef ZSTD_MULTITHREAD +/**** start inlining zstdmt_compress.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. + */ + + #ifndef ZSTDMT_COMPRESS_H + #define ZSTDMT_COMPRESS_H + + #if defined (__cplusplus) + extern "C" { + #endif + + +/* Note : This is an internal API. + * These APIs used to be exposed with ZSTDLIB_API, + * because it used to be the only way to invoke MT compression. + * Now, it's recommended to use ZSTD_compress2 and ZSTD_compressStream2() + * instead. + * + * If you depend on these APIs and can't switch, then define + * ZSTD_LEGACY_MULTITHREADED_API when making the dynamic library. + * However, we may completely remove these functions in a future + * release, so please switch soon. + * + * This API requires ZSTD_MULTITHREAD to be defined during compilation, + * otherwise ZSTDMT_createCCtx*() will fail. + */ + +#ifdef ZSTD_LEGACY_MULTITHREADED_API +# define ZSTDMT_API ZSTDLIB_API +#else +# define ZSTDMT_API +#endif + +/* === Dependencies === */ +#include /* size_t */ +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */ +/**** skipping file: ../zstd.h ****/ + + +/* === Constants === */ +#ifndef ZSTDMT_NBWORKERS_MAX +# define ZSTDMT_NBWORKERS_MAX 200 +#endif +#ifndef ZSTDMT_JOBSIZE_MIN +# define ZSTDMT_JOBSIZE_MIN (1 MB) +#endif +#define ZSTDMT_JOBLOG_MAX (MEM_32bits() ? 29 : 30) +#define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? (512 MB) : (1024 MB)) + + +/* === Memory management === */ +typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx; +/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */ +ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers); +/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. 
*/ +ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, + ZSTD_customMem cMem); +ZSTDMT_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx); + +ZSTDMT_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx); + + +/* === Simple one-pass compression function === */ + +ZSTDMT_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + + + +/* === Streaming functions === */ + +ZSTDMT_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel); +ZSTDMT_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if pledgedSrcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */ + +ZSTDMT_API size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx); +ZSTDMT_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + +ZSTDMT_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ +ZSTDMT_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ + + +/* === Advanced functions and parameters === */ + +ZSTDMT_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_parameters params, + int overlapLog); + +ZSTDMT_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, /* dict can be released after init, a local copy is preserved within mtctx */ + ZSTD_parameters params, + unsigned long long pledgedSrcSize); /* pledgedSrcSize is optional and can be zero == unknown */ + +ZSTDMT_API 
size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fparams, + unsigned long long pledgedSrcSize); /* note : zero means empty */ + +/* ZSTDMT_parameter : + * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */ +typedef enum { + ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */ + ZSTDMT_p_overlapLog, /* Each job may reload a part of the previous job to enhance compression ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on the next compression job */ + ZSTDMT_p_rsyncable /* Enables rsyncable mode. */ +} ZSTDMT_parameter; + +/* ZSTDMT_setMTCtxParameter() : + * allows setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter. + * The function must be called typically after ZSTD_createCCtx() but __before ZSTDMT_init*() !__ + * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ +ZSTDMT_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int value); + +/* ZSTDMT_getMTCtxParameter() : + * Query the ZSTDMT_CCtx for a parameter value. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ +ZSTDMT_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int* value); + + +/*! ZSTDMT_compressStream_generic() : + * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream() + * depending on the flush directive (endOp). 
+ * @return : minimum amount of data still to be flushed + * 0 if fully flushed + * or an error code + * note : must be initialized using any ZSTD_initCStream*() variant */ +ZSTDMT_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/* ======================================================== + * === Private interface, for use by ZSTD_compress.c === + * === Not exposed in libzstd. Never invoke directly === + * ======================================================== */ + + /*! ZSTDMT_toFlushNow() + * Tell how many bytes are ready to be flushed immediately. + * Probe the oldest active job (not yet entirely flushed) and check its output buffer. + * If it returns 0, it means there is no active job, + * or that the oldest job is still active, but everything produced has been flushed so far, + * therefore flushing is limited by the speed of the oldest job. */ +size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx); + +/*! ZSTDMT_CCtxParam_setMTCtxParameter() + * like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_params */ +size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, int value); + +/*! ZSTDMT_CCtxParam_setNbWorkers() + * Set nbWorkers, and clamp it. + * Also reset jobSize and overlapLog */ +size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers); + +/*! ZSTDMT_updateCParams_whileCompressing() : + * Updates only a selected set of compression parameters, to remain compatible with the current frame. + * New parameters will be applied to the next compression job. */ +void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams); + +/*! ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for the current frame. + * able to count progression inside worker threads. + */ +ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx); + + +/*! 
ZSTDMT_initCStream_internal() : + * Private use only. Init streaming operation. + * expects params to be valid. + * must receive dict, or cdict, or none, but not both. + * @return : 0, or an error code */ +size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs, + const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTDMT_COMPRESS_H */ +/**** ended inlining zstdmt_compress.h ****/ +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + + +/*-************************************* +* Constants +***************************************/ +#define kSearchStrength 8 +#define HASH_READ_SIZE 8 +#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index ZSTD_DUBT_UNSORTED_MARK==1 means "unsorted". + It could be confused for a real successor at index "1", if sorted as larger than its predecessor. + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. + The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. 
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +/*-************************************* +* Context memory management +***************************************/ +typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e; +typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage; + +typedef struct ZSTD_prefixDict_s { + const void* dict; + size_t dictSize; + ZSTD_dictContentType_e dictContentType; +} ZSTD_prefixDict; + +typedef struct { + void* dictBuffer; + void const* dict; + size_t dictSize; + ZSTD_dictContentType_e dictContentType; + ZSTD_CDict* cdict; +} ZSTD_localDict; + +typedef struct { + U32 CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_repeat repeatMode; +} ZSTD_hufCTables_t; + +typedef struct { + FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; + FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; + FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; + FSE_repeat offcode_repeatMode; + FSE_repeat matchlength_repeatMode; + FSE_repeat litlength_repeatMode; +} ZSTD_fseCTables_t; + +typedef struct { + ZSTD_hufCTables_t huf; + ZSTD_fseCTables_t fse; +} ZSTD_entropyCTables_t; + +typedef struct { + U32 off; + U32 len; +} ZSTD_match_t; + +typedef struct { + int price; + U32 off; + U32 mlen; + U32 litlen; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_optimal_t; + +typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + +typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ + ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ + ZSTD_optimal_t* 
priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ + U32 matchLengthSum; /* nb of matchLength codes */ + U32 offCodeSum; /* nb of offset codes */ + U32 litSumBasePrice; /* to compare to log2(litfreq) */ + U32 litLengthSumBasePrice; /* to compare to log2(llfreq) */ + U32 matchLengthSumBasePrice;/* to compare to log2(mlfreq) */ + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ + ZSTD_literalCompressionMode_e literalCompressionMode; +} optState_t; + +typedef struct { + ZSTD_entropyCTables_t entropy; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_compressedBlockState_t; + +typedef struct { + BYTE const* nextSrc; /* next block here to continue on current prefix */ + BYTE const* base; /* All regular indexes relative to this position */ + BYTE const* dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ +} ZSTD_window_t; + +typedef struct ZSTD_matchState_t ZSTD_matchState_t; +struct ZSTD_matchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. + * This relies on a mechanism to set loadedDictEnd=0 when dictionary is no longer within distance. + * Such mechanism is provided within ZSTD_window_enforceMaxDist() and ZSTD_checkDictValidity(). + * When dict referential is copied into active context (i.e. not attached), + * loadedDictEnd == dictSize, since referential starts from zero. 
+ */ + U32 nextToUpdate; /* index from which to continue table update */ + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + U32* hashTable; + U32* hashTable3; + U32* chainTable; + optState_t opt; /* optimal parser state */ + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; +}; + +typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; + ZSTD_matchState_t matchState; +} ZSTD_blockState_t; + +typedef struct { + U32 offset; + U32 checksum; +} ldmEntry_t; + +typedef struct { + ZSTD_window_t window; /* State for the window round buffer management */ + ldmEntry_t* hashTable; + U32 loadedDictEnd; + BYTE* bucketOffsets; /* Next position in bucket to insert entry */ + U64 hashPower; /* Used to compute the rolling hash. + * Depends on ldmParams.minMatchLength */ +} ldmState_t; + +typedef struct { + U32 enableLdm; /* 1 if enable long distance matching */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ + U32 hashRateLog; /* Log number of entries to skip */ + U32 windowLog; /* Window log for the LDM */ +} ldmParams_t; + +typedef struct { + U32 offset; + U32 litLength; + U32 matchLength; +} rawSeq; + +typedef struct { + rawSeq* seq; /* The start of the sequences */ + size_t pos; /* The position where reading stopped. <= size. */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity starting from `seq` pointer */ +} rawSeqStore_t; + +typedef struct { + int collectSequences; + ZSTD_Sequence* seqStart; + size_t seqIndex; + size_t maxSequences; +} SeqCollector; + +struct ZSTD_CCtx_params_s { + ZSTD_format_e format; + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; + + int compressionLevel; + int forceWindow; /* force back-references to respect limit of + * 1< 63) ? 
ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; +} + +/* ZSTD_MLcode() : + * note : mlBase = matchLength - MINMATCH; + * because it's the format it's stored in seqStore->sequences */ +MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) +{ + static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, + 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 }; + static const U32 ML_deltaCode = 36; + return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase]; +} + +typedef struct repcodes_s { + U32 rep[3]; +} repcodes_t; + +MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) +{ + repcodes_t newReps; + if (offset >= ZSTD_REP_NUM) { /* full offset */ + newReps.rep[2] = rep[1]; + newReps.rep[1] = rep[0]; + newReps.rep[0] = offset - ZSTD_REP_MOVE; + } else { /* repcode */ + U32 const repCode = offset + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + newReps.rep[2] = (repCode >= 2) ? 
rep[1] : rep[2];
+            newReps.rep[1] = rep[0];
+            newReps.rep[0] = currentOffset;
+        } else {   /* repCode == 0 */
+            memcpy(&newReps, rep, sizeof(newReps));
+        }
+    }
+    return newReps;
+}
+
+/* ZSTD_cParam_withinBounds:
+ * Looks up the bounds of cParam with ZSTD_cParam_getBounds().
+ * @return 1 if the lookup succeeded and lowerBound <= value <= upperBound,
+ *         0 otherwise */
+MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
+{
+    ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);
+    if (ZSTD_isError(bounds.error)) return 0;
+    return (bounds.lowerBound <= value) && (value <= bounds.upperBound);
+}
+
+/* ZSTD_noCompressBlock() :
+ * Emits a raw (bt_raw) block : a 24-bit little-endian header followed by
+ * a verbatim copy of the srcSize source bytes.
+ * @return : ZSTD_blockHeaderSize + srcSize, or a dstSize_tooSmall error
+ *           when dst cannot hold that many bytes */
+MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+{
+    BYTE* const out = (BYTE*)dst;
+    U32 const rawTypeBits = ((U32)bt_raw) << 1;
+    U32 const cBlockHeader24 = lastBlock + rawTypeBits + (U32)(srcSize << 3);
+    RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
+                    dstSize_tooSmall, "dst buf too small for uncompressed block");
+    MEM_writeLE24(out, cBlockHeader24);
+    memcpy(out + ZSTD_blockHeaderSize, src, srcSize);
+    return ZSTD_blockHeaderSize + srcSize;
+}
+
+/* ZSTD_rleCompressBlock() :
+ * Emits an RLE (bt_rle) block : a 24-bit little-endian header carrying srcSize,
+ * followed by the single byte `src`.
+ * @return : 4 (bytes written), or a dstSize_tooSmall error when dstCapacity < 4 */
+MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+{
+    BYTE* const out = (BYTE*)dst;
+    U32 const rleTypeBits = ((U32)bt_rle) << 1;
+    U32 const cBlockHeader = lastBlock + rleTypeBits + (U32)(srcSize << 3);
+    RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "");
+    MEM_writeLE24(out, cBlockHeader);
+    out[3] = src;
+    return 4;
+}
+
+
+/* ZSTD_minGain() :
+ * minimum compression required
+ * to generate a compress block or a compressed literals section.
+ * note : use same formula for both situations */
+MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+{
+    U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + return (srcSize >> minlog) + 2; +} + +MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams) +{ + switch (cctxParams->literalCompressionMode) { + case ZSTD_lcm_huffman: + return 0; + case ZSTD_lcm_uncompressed: + return 1; + default: + assert(0 /* impossible: pre-validated */); + /* fall-through */ + case ZSTD_lcm_auto: + return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); + } +} + +/*! ZSTD_safecopyLiterals() : + * memcpy() function that won't read beyond more than WILDCOPY_OVERLENGTH bytes past ilimit_w. + * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single + * large copies. + */ +static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) { + assert(iend > ilimit_w); + if (ip <= ilimit_w) { + ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); + op += ilimit_w - ip; + ip = ilimit_w; + } + while (ip < iend) *op++ = *ip++; +} + +/*! ZSTD_storeSeq() : + * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. + * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes). + * `mlBase` : matchLength - MINMATCH + * Allowed to overread literals up to litLimit. 
+*/ +HINT_INLINE UNUSED_ATTR +void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase) +{ + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; + BYTE const* const litEnd = literals + litLength; +#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6) + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); + DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", + pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); + } +#endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); + /* copy Literals */ + assert(seqStorePtr->maxNbLit <= 128 KB); + assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit); + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. + * First copy 16 bytes, because literals are likely short. 
+ */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); + } + } else { + ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w); + } + seqStorePtr->lit += litLength; + + /* literal Length */ + if (litLength>0xFFFF) { + assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ + seqStorePtr->longLengthID = 1; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ + seqStorePtr->sequences[0].offset = offCode + 1; + + /* match Length */ + if (mlBase>0xFFFF) { + assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ + seqStorePtr->longLengthID = 2; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].matchLength = (U16)mlBase; + + seqStorePtr->sequences++; +} + + +/*-************************************* +* Match length counter +***************************************/ +static unsigned ZSTD_NbCommonBytes (size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + return _BitScanForward( &r, (U32)val ) ? 
(unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + return _BitScanReverse64( &r, val ) ? (unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + return _BitScanReverse( &r, (unsigned long)val ) ? 
(unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } } +} + + +MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) +{ + const BYTE* const pStart = pIn; + const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1); + + if (pIn < pInLoopLimit) { + { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (diff) return ZSTD_NbCommonBytes(diff); } + pIn+=sizeof(size_t); pMatch+=sizeof(size_t); + while (pIn < pInLoopLimit) { + size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; } + pIn += ZSTD_NbCommonBytes(diff); + return (size_t)(pIn - pStart); + } } + if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn> (32-h) ; } +MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ + +static const U32 prime4bytes = 2654435761U; +static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } + +static const U64 prime5bytes = 889523592379ULL; +static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } + +static const U64 prime6bytes = 227718039650203ULL; +static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } + +static const U64 prime7bytes = 58295818150454627ULL; +static 
size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } + +static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + +MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) +{ + switch(mls) + { + default: + case 4: return ZSTD_hash4Ptr(p, hBits); + case 5: return ZSTD_hash5Ptr(p, hBits); + case 6: return ZSTD_hash6Ptr(p, hBits); + case 7: return ZSTD_hash7Ptr(p, hBits); + case 8: return ZSTD_hash8Ptr(p, hBits); + } +} + +/** ZSTD_ipow() : + * Return base^exponent. + */ +static U64 ZSTD_ipow(U64 base, U64 exponent) +{ + U64 power = 1; + while (exponent) { + if (exponent & 1) power *= base; + exponent >>= 1; + base *= base; + } + return power; +} + +#define ZSTD_ROLL_HASH_CHAR_OFFSET 10 + +/** ZSTD_rollingHash_append() : + * Add the buffer to the hash value. + */ +static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size) +{ + BYTE const* istart = (BYTE const*)buf; + size_t pos; + for (pos = 0; pos < size; ++pos) { + hash *= prime8bytes; + hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET; + } + return hash; +} + +/** ZSTD_rollingHash_compute() : + * Compute the rolling hash value of the buffer. + */ +MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size) +{ + return ZSTD_rollingHash_append(0, buf, size); +} + +/** ZSTD_rollingHash_primePower() : + * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash + * over a window of length bytes. + */ +MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length) +{ + return ZSTD_ipow(prime8bytes, length - 1); +} + +/** ZSTD_rollingHash_rotate() : + * Rotate the rolling hash by one byte. 
+ */ +MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower) +{ + hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower; + hash *= prime8bytes; + hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET; + return hash; +} + +/*-************************************* +* Round buffer management +***************************************/ +#if (ZSTD_WINDOWLOG_MAX_64 > 31) +# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" +#endif +/* Max current allowed */ +#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) +/* Maximum chunk size before overflow correction needs to be called again */ +#define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ + - ZSTD_CURRENT_MAX) /* Maximum beginning lowLimit */ + +/** + * ZSTD_window_clear(): + * Clears the window containing the history by simply setting it to empty. + */ +MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window) +{ + size_t const endT = (size_t)(window->nextSrc - window->base); + U32 const end = (U32)endT; + + window->lowLimit = end; + window->dictLimit = end; +} + +/** + * ZSTD_window_hasExtDict(): + * Returns non-zero if the window has a non-empty extDict. + */ +MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) +{ + return window.lowLimit < window.dictLimit; +} + +/** + * ZSTD_matchState_dictMode(): + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) +{ + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : + ms->dictMatchState != NULL ? + ZSTD_dictMatchState : + ZSTD_noDict; +} + +/** + * ZSTD_window_needOverflowCorrection(): + * Returns non-zero if the indices are getting too large and need overflow + * protection. 
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+                                                  void const* srcEnd)
+{
+    /* index of srcEnd, relative to window.base */
+    U32 const current = (U32)((BYTE const*)srcEnd - window.base);
+    return current > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ * NOTE: (maxDist & cycleMask) must be zero.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                           U32 maxDist, void const* src)
+{
+    /* preemptive overflow correction:
+     * 1. correction is large enough:
+     *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+     *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+     *
+     *    current - newCurrent
+     *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+     *    > (3<<29) - (1<<chainLog)
+     *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
+     *    > 1<<29
+     *
+     * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+     *    After correction, current is less than (1<<chainLog + 1<<windowLog).
+     *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+     *    In 32-bit mode we are safe, because (chainLog <= 29), so
+     *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+     * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+     *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+     */
+    U32 const cycleMask = (1U << cycleLog) - 1;
+    U32 const current = (U32)((BYTE const*)src - window->base);
+    U32 const currentCycle0 = current & cycleMask;
+    /* Exclude zero so that newCurrent - maxDist >= 1. */
+    U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0;
+    U32 const newCurrent = currentCycle1 + maxDist;
+    U32 const correction = current - newCurrent;
+    assert((maxDist & cycleMask) == 0);
+    assert(current > newCurrent);
+    /* Loose bound, should be around 1<<29 (see above) */
+    assert(correction > 1<<28);
+
+    /* shift both referentials forward, so that an unchanged pointer maps to a smaller index */
+    window->base += correction;
+    window->dictBase += correction;
+    /* limits at or below the correction are clamped to 1 rather than wrapping */
+    if (window->lowLimit <= correction) window->lowLimit = 1;
+    else window->lowLimit -= correction;
+    if (window->dictLimit <= correction) window->dictLimit = 1;
+    else window->dictLimit -= correction;
+
+    /* Ensure we can still reference the full window. */
+    assert(newCurrent >= maxDist);
+    assert(newCurrent - maxDist >= 1);
+    /* Ensure that lowLimit and dictLimit didn't underflow. 
*/ + assert(window->lowLimit <= newCurrent); + assert(window->dictLimit <= newCurrent); + + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, + window->lowLimit); + return correction; +} + +/** + * ZSTD_window_enforceMaxDist(): + * Updates lowLimit so that: + * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd + * + * It ensures index is valid as long as index >= lowLimit. + * This must be called before a block compression call. + * + * loadedDictEnd is only defined if a dictionary is in use for current compression. + * As the name implies, loadedDictEnd represents the index at end of dictionary. + * The value lies within context's referential, it can be directly compared to blockEndIdx. + * + * If loadedDictEndPtr is NULL, no dictionary is in use, and we use loadedDictEnd == 0. + * If loadedDictEndPtr is not NULL, we set it to zero after updating lowLimit. + * This is because dictionaries are allowed to be referenced fully + * as long as the last byte of the dictionary is in the window. + * Once input has progressed beyond window size, dictionary cannot be referenced anymore. + * + * In normal dict mode, the dictionary lies between lowLimit and dictLimit. + * In dictMatchState mode, lowLimit and dictLimit are the same, + * and the dictionary is below them. + * forceWindow and dictMatchState are therefore incompatible. + */ +MEM_STATIC void +ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, + const ZSTD_matchState_t** dictMatchStatePtr) +{ + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0; + DEBUGLOG(5, "ZSTD_window_enforceMaxDist: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u", + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + + /* - When there is no dictionary : loadedDictEnd == 0. 
+ In which case, the test (blockEndIdx > maxDist) is merely to avoid + overflowing next operation `newLowLimit = blockEndIdx - maxDist`. + - When there is a standard dictionary : + Index referential is copied from the dictionary, + which means it starts from 0. + In which case, loadedDictEnd == dictSize, + and it makes sense to compare `blockEndIdx > maxDist + dictSize` + since `blockEndIdx` also starts from zero. + - When there is an attached dictionary : + loadedDictEnd is expressed within the referential of the context, + so it can be directly compared against blockEndIdx. + */ + if (blockEndIdx > maxDist + loadedDictEnd) { + U32 const newLowLimit = blockEndIdx - maxDist; + if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit; + if (window->dictLimit < window->lowLimit) { + DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u", + (unsigned)window->dictLimit, (unsigned)window->lowLimit); + window->dictLimit = window->lowLimit; + } + /* On reaching window size, dictionaries are invalidated */ + if (loadedDictEndPtr) *loadedDictEndPtr = 0; + if (dictMatchStatePtr) *dictMatchStatePtr = NULL; + } +} + +/* Similar to ZSTD_window_enforceMaxDist(), + * but only invalidates dictionary + * when input progresses beyond window size. 
+ * assumption : loadedDictEndPtr and dictMatchStatePtr are valid (non NULL) + * loadedDictEnd uses same referential as window->base + * maxDist is the window size */ +MEM_STATIC void +ZSTD_checkDictValidity(const ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, + const ZSTD_matchState_t** dictMatchStatePtr) +{ + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); + { U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = *loadedDictEndPtr; + DEBUGLOG(5, "ZSTD_checkDictValidity: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u", + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + + if (blockEndIdx > loadedDictEnd + maxDist) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; + *dictMatchStatePtr = NULL; + } else { + if (*loadedDictEndPtr != 0) { + DEBUGLOG(6, "dictionary considered valid for current block"); + } } } +} + +MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + memset(window, 0, sizeof(*window)); + window->base = (BYTE const*)""; + window->dictBase = (BYTE const*)""; + window->dictLimit = 1; /* start from 1, so that 1st position is valid */ + window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */ + window->nextSrc = window->base + 1; /* see issue #1241 */ +} + +/** + * ZSTD_window_update(): + * Updates the window by appending [src, src + srcSize) to the window. + * If it is not contiguous, the current prefix becomes the extDict, and we + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. 
+ */
+MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
+                                  void const* src, size_t srcSize)
+{
+    BYTE const* const ip = (BYTE const*)src;
+    U32 contiguous = 1;
+    DEBUGLOG(5, "ZSTD_window_update");
+    if (srcSize == 0)   /* nothing to append : window is left untouched */
+        return contiguous;
+    assert(window->base != NULL);
+    assert(window->dictBase != NULL);
+    /* Check if blocks follow each other */
+    if (src != window->nextSrc) {
+        /* not contiguous */
+        size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
+        DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
+        window->lowLimit = window->dictLimit;   /* previous prefix start becomes extDict start */
+        assert(distanceFromBase == (size_t)(U32)distanceFromBase);  /* should never overflow */
+        window->dictLimit = (U32)distanceFromBase;   /* previous prefix end becomes extDict end */
+        window->dictBase = window->base;
+        window->base = ip - distanceFromBase;   /* rebase so that index distanceFromBase maps to ip */
+        /* ms->nextToUpdate = window->dictLimit; */
+        if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit;   /* too small extDict */
+        contiguous = 0;
+    }
+    window->nextSrc = ip + srcSize;
+    /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+    if ( (ip+srcSize > window->dictBase + window->lowLimit)
+       & (ip < window->dictBase + window->dictLimit)) {   /* bitwise & : both comparisons are always evaluated */
+        ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
+        U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;   /* min(highInputIdx, dictLimit) */
+        window->lowLimit = lowLimitMax;
+        DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit);
+    }
+    return contiguous;
+}
+
+/**
+ * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix.
+ *
+ * When ms->loadedDictEnd != 0, ms->window.lowLimit is returned unchanged.
+ * Otherwise the result is lowLimit, raised to (current - (1 << windowLog))
+ * whenever lowLimit lies more than (1 << windowLog) behind current.
+ */
+MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog)
+{
+    U32    const maxDistance = 1U << windowLog;
+    U32    const lowestValid = ms->window.lowLimit;
+    U32    const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+    U32    const isDictionary = (ms->loadedDictEnd != 0);
+    U32    const matchLowest = isDictionary ? lowestValid : withinWindow;
+    return matchLowest;
+}
+
+/**
+ * Returns the lowest allowed match index in the prefix.
+ *
+ * Same computation as ZSTD_getLowestMatchIndex(), except that the lower bound
+ * is ms->window.dictLimit instead of ms->window.lowLimit.
+ */
+MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog)
+{
+    U32    const maxDistance = 1U << windowLog;
+    U32    const lowestValid = ms->window.dictLimit;
+    U32    const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+    U32    const isDictionary = (ms->loadedDictEnd != 0);
+    U32    const matchLowest = isDictionary ? lowestValid : withinWindow;
+    return matchLowest;
+}
+
+
+
+/* debug functions */
+#if (DEBUGLEVEL>=2)
+/* ZSTD_fWeight() :
+ * Fixed-point (8 fractional bits) weight of (rawStat + 1), returned as a double :
+ * hb + (rawStat + 1) / 2^hb, with hb = ZSTD_highbit32(rawStat + 1).
+ * NOTE(review): assuming ZSTD_highbit32() yields the index of the highest set
+ * bit, this is a piecewise-linear approximation of log2(rawStat + 1) + 1. */
+MEM_STATIC double ZSTD_fWeight(U32 rawStat)
+{
+    U32 const fp_accuracy = 8;
+    U32 const fp_multiplier = (1 << fp_accuracy);
+    U32 const newStat = rawStat + 1;   /* +1 so that a zero statistic is still accepted */
+    U32 const hb = ZSTD_highbit32(newStat);
+    U32 const BWeight = hb * fp_multiplier;   /* integer part */
+    U32 const FWeight = (newStat << fp_accuracy) >> hb;   /* fractional part, linear within [2^hb, 2^(hb+1)) */
+    U32 const weight = BWeight + FWeight;
+    assert(hb + fp_accuracy < 31);
+    return (double)weight / fp_multiplier;
+}
+
+/* display a table content,
+ * listing each element, its frequency, and its predicted bit cost.
+ * `max` is inclusive : entries table[0..max] are read.
+ * The cost shown is ZSTD_fWeight(sum of all entries) - ZSTD_fWeight(entry). */
+MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+{
+    unsigned u, sum;
+    for (u=0, sum=0; u<=max; u++) sum += table[u];
+    DEBUGLOG(2, "total nb elts: %u", sum);
+    for (u=0; u<=max; u++) {
+        DEBUGLOG(2, "%2u: %5u (%.2f)",
+                u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) );
+    }
+}
+
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/* ===============================================================
+ * Shared internal declarations
+ * These prototypes may be called from sources not in lib/compress
+ * =============================================================== */
+
+/* ZSTD_loadCEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * return : size of dictionary header (size of magic number + dict ID + entropy tables)
+ * assumptions : magic number supposed already checked
+ *               and dictSize >= 8
+ * NOTE(review): the result is a size_t; presumably failures are reported as
+ * zstd error codes (testable with ZSTD_isError()) - confirm in the definition. */
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+                         short* offcodeNCount, unsigned* offcodeMaxValue,
+                         const void* const dict, size_t dictSize);
+/* ZSTD_reset_compressedBlockState() :
+ * NOTE(review): undocumented here; presumably restores *bs to its initial
+ * state - confirm against the definition. */
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
+
+/* ==============================================================
+ * Private declarations
+ * These prototypes shall only be called from within lib/compress
+ * ============================================================== */
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints,
+ * LDM and manually set compression parameters.
+ * Note: srcSizeHint == 0 means 0!
+ *       (i.e. zero is taken literally as an empty source, not as "unknown")
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize);
+
+/*! ZSTD_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  @return : 0, or an error code */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                     const void* dict, size_t dictSize,
+                     const ZSTD_CDict* cdict,
+                     const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize);
+/* ZSTD_resetSeqStore() :
+ * NOTE(review): undocumented here; presumably empties the sequence store
+ * pointed to by ssPtr - confirm against the definition. */
+void ZSTD_resetSeqStore(seqStore_t* ssPtr);
+
+/*! ZSTD_getCParamsFromCDict() :
+ *  as the name implies
+ *  (takes a ZSTD_CDict and returns a ZSTD_compressionParameters by value) */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c.
*/ +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize); + +/* ZSTD_compress_advanced_internal() : + * Private use only. To be called from zstdmt_compress.c. */ +size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + const ZSTD_CCtx_params* params); + + +/* ZSTD_writeLastEmptyBlock() : + * output an empty Block with end-of-frame mark to complete a frame + * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) + * or an error code if `dstCapacity` is too small ( 1 */ +U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + +#endif /* ZSTD_COMPRESS_H */ +/**** ended inlining zstd_compress_internal.h ****/ + + +size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_strategy strategy, int disableLiteralCompression, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, + const int bmi2); + +#endif /* ZSTD_COMPRESS_LITERALS_H */ +/**** ended inlining zstd_compress_literals.h ****/ + +size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + BYTE* const ostart = (BYTE* const)dst; + U32 const flSize = 1 + (srcSize>31) + (srcSize>4095); + + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) + { + case 1: /* 2 - 1 - 5 */ + ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3)); + break; + case 2: /* 2 - 2 - 12 */ + 
MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4))); + break; + case 3: /* 2 - 2 - 20 */ + MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4))); + break; + default: /* not necessary : flSize is {1,2,3} */ + assert(0); + } + + memcpy(ostart + flSize, src, srcSize); + DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; +} + +size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + BYTE* const ostart = (BYTE* const)dst; + U32 const flSize = 1 + (srcSize>31) + (srcSize>4095); + + (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ + + switch(flSize) + { + case 1: /* 2 - 1 - 5 */ + ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3)); + break; + case 2: /* 2 - 2 - 12 */ + MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4))); + break; + case 3: /* 2 - 2 - 20 */ + MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4))); + break; + default: /* not necessary : flSize is {1,2,3} */ + assert(0); + } + + ostart[flSize] = *(const BYTE*)src; + DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); + return flSize+1; +} + +size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_strategy strategy, int disableLiteralCompression, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, + const int bmi2) +{ + size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + + DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", + disableLiteralCompression, (U32)srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + memcpy(nextHuf, 
prevHuf, sizeof(*prevHuf)); + + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + /* small ? don't even attempt compression (speed opt) */ +# define COMPRESS_LITERALS_SIZE_MIN 63 + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; + int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; + cLitSize = singleStream ? + HUF_compress1X_repeat( + ostart+lhSize, dstCapacity-lhSize, src, srcSize, + HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : + HUF_compress4X_repeat( + ostart+lhSize, dstCapacity-lhSize, src, srcSize, + HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ + DEBUGLOG(5, "Reusing previous huffman table"); + hType = set_repeat; + } + } + + if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + if (cLitSize==1) { + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } + + if (hType == set_compressed) { + /* using a newly constructed table */ + nextHuf->repeatMode = HUF_repeat_check; + } + + /* Build header */ + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ + { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + 
break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); + } + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize)); + return lhSize+cLitSize; +} +/**** ended inlining compress/zstd_compress_literals.c ****/ +/**** start inlining compress/zstd_compress_sequences.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +/**** start inlining zstd_compress_sequences.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */
+
+#ifndef ZSTD_COMPRESS_SEQUENCES_H
+#define ZSTD_COMPRESS_SEQUENCES_H
+
+/**** skipping file: ../common/fse.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+/* ZSTD_defaultPolicy_e :
+ * tells ZSTD_selectEncodingType() whether the predefined (default) table,
+ * i.e. the set_basic encoding, may be selected. */
+typedef enum {
+    ZSTD_defaultDisallowed = 0,
+    ZSTD_defaultAllowed = 1
+} ZSTD_defaultPolicy_e;
+/* ZSTD_selectEncodingType() :
+ * picks one of set_basic / set_rle / set_repeat / set_compressed for a symbol
+ * stream described by its histogram `count` (symbols 0..max), and updates
+ * *repeatMode accordingly. */
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+        size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+        FSE_CTable const* prevCTable,
+        short const* defaultNorm, U32 defaultNormLog,
+        ZSTD_defaultPolicy_e const isDefaultAllowed,
+        ZSTD_strategy const strategy);
+/* ZSTD_buildCTable() :
+ * builds nextCTable for the encoding `type` and writes whatever table
+ * description that encoding requires into dst.
+ * @return : nb of bytes written into dst (possibly 0), or an error code. */
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+                FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+                unsigned* count, U32 max,
+                const BYTE* codeTable, size_t nbSeq,
+                const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                const FSE_CTable* prevCTable, size_t prevCTableSize,
+                void* entropyWorkspace, size_t entropyWorkspaceSize);
+/* ZSTD_encodeSequences() :
+ * writes the bitstream for nbSeq sequences into dst, using the three
+ * provided FSE tables.
+ * @return : size of the bitstream, or an error code. */
+size_t ZSTD_encodeSequences(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2);
+/* ZSTD_fseBitCost() :
+ * cost, in bits, of encoding the histogram `count` (symbols 0..max) with
+ * `ctable`; an error code if ctable cannot represent one of the symbols. */
+size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max);
+/* ZSTD_crossEntropyCost() :
+ * cost, in bits, of encoding the histogram `count` (symbols 0..max) with the
+ * normalized distribution `norm` of accuracy `accuracyLog`. */
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                             unsigned const* count, unsigned const max);
+#endif /* ZSTD_COMPRESS_SEQUENCES_H */
+/**** ended inlining zstd_compress_sequences.h ****/
+
+/**
+ * -log2(x / 256) lookup table for x in [0, 256).
+ * If x == 0: Return 0 + * Else: Return floor(-log2(x / 256) * 256) + */ +static unsigned const kInverseProbabilityLog256[256] = { + 0, 2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162, + 1130, 1100, 1073, 1047, 1024, 1001, 980, 960, 941, 923, 906, 889, + 874, 859, 844, 830, 817, 804, 791, 779, 768, 756, 745, 734, + 724, 714, 704, 694, 685, 676, 667, 658, 650, 642, 633, 626, + 618, 610, 603, 595, 588, 581, 574, 567, 561, 554, 548, 542, + 535, 529, 523, 517, 512, 506, 500, 495, 489, 484, 478, 473, + 468, 463, 458, 453, 448, 443, 438, 434, 429, 424, 420, 415, + 411, 407, 402, 398, 394, 390, 386, 382, 377, 373, 370, 366, + 362, 358, 354, 350, 347, 343, 339, 336, 332, 329, 325, 322, + 318, 315, 311, 308, 305, 302, 298, 295, 292, 289, 286, 282, + 279, 276, 273, 270, 267, 264, 261, 258, 256, 253, 250, 247, + 244, 241, 239, 236, 233, 230, 228, 225, 222, 220, 217, 215, + 212, 209, 207, 204, 202, 199, 197, 194, 192, 190, 187, 185, + 182, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157, + 155, 153, 151, 149, 146, 144, 142, 140, 138, 136, 134, 132, + 130, 128, 126, 123, 121, 119, 117, 115, 114, 112, 110, 108, + 106, 104, 102, 100, 98, 96, 94, 93, 91, 89, 87, 85, + 83, 82, 80, 78, 76, 74, 73, 71, 69, 67, 66, 64, + 62, 61, 59, 57, 55, 54, 52, 50, 49, 47, 46, 44, + 42, 41, 39, 37, 36, 34, 33, 31, 30, 28, 26, 25, + 23, 22, 20, 19, 17, 16, 14, 13, 11, 10, 8, 7, + 5, 4, 2, 1, +}; + +static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) { + void const* ptr = ctable; + U16 const* u16ptr = (U16 const*)ptr; + U32 const maxSymbolValue = MEM_read16(u16ptr + 1); + return maxSymbolValue; +} + +/** + * Returns the cost in bytes of encoding the normalized count header. + * Returns an error if any of the helper functions return an error. 
+ */
+static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max,
+                              size_t const nbSeq, unsigned const FSELog)
+{
+    S16 normalized[MaxSeq + 1];
+    BYTE scratch[FSE_NCOUNTBOUND];   /* header is written here only to measure its size */
+    U32 const log = FSE_optimalTableLog(FSELog, nbSeq, max);
+
+    FORWARD_IF_ERROR(FSE_normalizeCount(normalized, log, count, nbSeq, max), "");
+    return FSE_writeNCount(scratch, sizeof(scratch), normalized, max, log);
+}
+
+/**
+ * Entropy-bound estimate, in bits, of the cost of encoding the histogram
+ * `count` (symbols 0..max), whose entries add up to `total`.
+ */
+static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total)
+{
+    unsigned bits256 = 0;   /* accumulated cost, scaled by 256 */
+    unsigned sym = 0;
+    while (sym <= max) {
+        unsigned const freq = count[sym];
+        /* probability of this symbol, scaled to [0, 256) */
+        unsigned prob = (unsigned)((256 * freq) / total);
+        /* a symbol that is present must not be rounded down to probability 0 */
+        if (prob == 0 && freq != 0)
+            prob = 1;
+        assert(freq < total);
+        bits256 += freq * kInverseProbabilityLog256[prob];
+        ++sym;
+    }
+    return bits256 >> 8;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using ctable.
+ * Returns an error if ctable cannot represent all the symbols in count.
+ */ +size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned const max) +{ + unsigned const kAccuracyLog = 8; + size_t cost = 0; + unsigned s; + FSE_CState_t cstate; + FSE_initCState(&cstate, ctable); + if (ZSTD_getFSEMaxSymbolValue(ctable) < max) { + DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u", + ZSTD_getFSEMaxSymbolValue(ctable), max); + return ERROR(GENERIC); + } + for (s = 0; s <= max; ++s) { + unsigned const tableLog = cstate.stateLog; + unsigned const badCost = (tableLog + 1) << kAccuracyLog; + unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog); + if (count[s] == 0) + continue; + if (bitCost >= badCost) { + DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s); + return ERROR(GENERIC); + } + cost += (size_t)count[s] * bitCost; + } + return cost >> kAccuracyLog; +} + +/** + * Returns the cost in bits of encoding the distribution in count using the + * table described by norm. The max symbol support by norm is assumed >= max. + * norm must be valid for every symbol with non-zero probability in count. + */ +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max) +{ + unsigned const shift = 8 - accuracyLog; + size_t cost = 0; + unsigned s; + assert(accuracyLog <= 8); + for (s = 0; s <= max; ++s) { + unsigned const normAcc = (norm[s] != -1) ? 
(unsigned)norm[s] : 1; + unsigned const norm256 = normAcc << shift; + assert(norm256 > 0); + assert(norm256 < 256); + cost += count[s] * kInverseProbabilityLog256[norm256]; + } + return cost >> 8; +} + +symbolEncodingType_e +ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, + ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) +{ + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { + /* Prefer set_basic over set_rle when there are 2 or less symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ + DEBUGLOG(5, "Selected set_basic"); + return set_basic; + } + DEBUGLOG(5, "Selected set_rle"); + return set_rle; + } + if (strategy < ZSTD_lazy) { + if (isDefaultAllowed) { + size_t const staticFse_nbSeq_max = 1000; + size_t const mult = 10 - strategy; + size_t const baseLog = 3; + size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */ + assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */ + assert(mult <= 9 && mult >= 7); + if ( (*repeatMode == FSE_repeat_valid) + && (nbSeq < staticFse_nbSeq_max) ) { + DEBUGLOG(5, "Selected set_repeat"); + return set_repeat; + } + if ( (nbSeq < dynamicFse_nbSeq_min) + || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) { + DEBUGLOG(5, "Selected set_basic"); + /* The format allows default tables to be repeated, but it isn't useful. + * When using simple heuristics to select encoding type, we don't want + * to confuse these tables with dictionaries. 
When running more careful + * analysis, we don't need to waste time checking both repeating tables + * and default tables. + */ + *repeatMode = FSE_repeat_none; + return set_basic; + } + } + } else { + size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC); + size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC); + size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog); + size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq); + + if (isDefaultAllowed) { + assert(!ZSTD_isError(basicCost)); + assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost))); + } + assert(!ZSTD_isError(NCountCost)); + assert(compressedCost < ERROR(maxCode)); + DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u", + (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost); + if (basicCost <= repeatCost && basicCost <= compressedCost) { + DEBUGLOG(5, "Selected set_basic"); + assert(isDefaultAllowed); + *repeatMode = FSE_repeat_none; + return set_basic; + } + if (repeatCost <= compressedCost) { + DEBUGLOG(5, "Selected set_repeat"); + assert(!ZSTD_isError(repeatCost)); + return set_repeat; + } + assert(compressedCost < basicCost && compressedCost < repeatCost); + } + DEBUGLOG(5, "Selected set_compressed"); + *repeatMode = FSE_repeat_check; + return set_compressed; +} + +size_t +ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, + const FSE_CTable* prevCTable, size_t prevCTableSize, + void* entropyWorkspace, size_t entropyWorkspaceSize) +{ + BYTE* op = (BYTE*)dst; + const BYTE* const oend = op + dstCapacity; + DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity); + + switch 
(type) { + case set_rle: + FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), ""); + RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space"); + *op = codeTable[0]; + return 1; + case set_repeat: + memcpy(nextCTable, prevCTable, prevCTableSize); + return 0; + case set_basic: + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */ + return 0; + case set_compressed: { + S16 norm[MaxSeq + 1]; + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + if (count[codeTable[nbSeq-1]] > 1) { + count[codeTable[nbSeq-1]]--; + nbSeq_1--; + } + assert(nbSeq_1 > 1); + FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max), ""); + { size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, entropyWorkspace, entropyWorkspaceSize), ""); + return NCountSize; + } + } + default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach"); + } +} + +FORCE_INLINE_TEMPLATE size_t +ZSTD_encodeSequences_body( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) +{ + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; + FSE_CState_t stateOffsetBits; + FSE_CState_t stateLitLength; + + RETURN_ERROR_IF( + ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)), + dstSize_tooSmall, "not enough space remaining"); + DEBUGLOG(6, "available space for bitstream : %i (dstCapacity=%u)", + (int)(blockStream.endPtr - blockStream.startPtr), + (unsigned)dstCapacity); + + /* first symbols */ + 
FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); + FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); + FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + if (longOffsets) { + U32 const ofBits = ofCodeTable[nbSeq-1]; + unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits); + BIT_flushBits(&blockStream); + } + BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, + ofBits - extraBits); + } else { + BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); + } + BIT_flushBits(&blockStream); + + { size_t n; + for (n=nbSeq-2 ; n= 64-7-(LLFSELog+MLFSELog+OffFSELog))) + BIT_flushBits(&blockStream); /* (7)*/ + BIT_addBits(&blockStream, sequences[n].litLength, llBits); + if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); + if (longOffsets) { + unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[n].offset, extraBits); + BIT_flushBits(&blockStream); /* (7)*/ + } + BIT_addBits(&blockStream, sequences[n].offset >> extraBits, + ofBits - extraBits); /* 31 */ + } else { + BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ + } + BIT_flushBits(&blockStream); /* (7)*/ + DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr)); + } } + + DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", 
stateMatchLength.stateLog);
+    FSE_flushCState(&blockStream, &stateMatchLength);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+    FSE_flushCState(&blockStream, &stateOffsetBits);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+    FSE_flushCState(&blockStream, &stateLitLength);
+
+    {   size_t const streamSize = BIT_closeCStream(&blockStream);
+        RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space");
+        return streamSize;
+    }
+}
+/* ZSTD_encodeSequences_default() :
+ * Plain instantiation of ZSTD_encodeSequences_body() : forwards every argument
+ * unchanged and returns the body's result as-is.
+ * Selected by ZSTD_encodeSequences() whenever the bmi2 variant is not taken. */
+static size_t
+ZSTD_encodeSequences_default(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+/* ZSTD_encodeSequences_bmi2() :
+ * Same forwarding wrapper as ZSTD_encodeSequences_default(), but declared with
+ * TARGET_ATTRIBUTE("bmi2") and only compiled when DYNAMIC_BMI2 is non-zero.
+ * NOTE(review): presumably the attribute lets the compiler use BMI2 instructions
+ * in the inlined body - confirm against the TARGET_ATTRIBUTE definition. */
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_encodeSequences_bmi2(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+#endif
+/* ZSTD_encodeSequences() :
+ * Entry point : logs dstCapacity, then dispatches to one of the two wrappers above.
+ *  - DYNAMIC_BMI2 non-zero and bmi2 != 0 : ZSTD_encodeSequences_bmi2()
+ *  - otherwise                           : ZSTD_encodeSequences_default()
+ * dst/dstCapacity, the three (CTable, codeTable) pairs, sequences, nbSeq and
+ * longOffsets are passed through untouched.
+ * @return : whatever the selected wrapper returns, i.e. the result of
+ *           ZSTD_encodeSequences_body() (its tail returns the size reported by
+ *           BIT_closeCStream(), or a dstSize_tooSmall error when that size is 0). */
+size_t ZSTD_encodeSequences(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2)
+{
+    DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity);
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        return ZSTD_encodeSequences_bmi2(dst, dstCapacity,
+                                         CTable_MatchLength, mlCodeTable,
+                                         CTable_OffsetBits, ofCodeTable,
+                                         CTable_LitLength, llCodeTable,
+                                         sequences, nbSeq, longOffsets);
+    }
+#endif
+    (void)bmi2;   /* keeps bmi2 referenced when the DYNAMIC_BMI2 branch is compiled out */
+    return ZSTD_encodeSequences_default(dst, dstCapacity,
+                                        CTable_MatchLength, mlCodeTable,
+                                        CTable_OffsetBits, ofCodeTable,
+                                        CTable_LitLength, llCodeTable,
+                                        sequences, nbSeq, longOffsets);
+}
+/**** ended inlining compress/zstd_compress_sequences.c ****/
+/**** start inlining compress/zstd_compress_superblock.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+/**** start inlining zstd_compress_superblock.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */ + +#ifndef ZSTD_COMPRESS_ADVANCED_H +#define ZSTD_COMPRESS_ADVANCED_H + +/*-************************************* +* Dependencies +***************************************/ + +/**** skipping file: ../zstd.h ****/ + +/*-************************************* +* Target Compressed Block Size +***************************************/ + +/* ZSTD_compressSuperBlock() : + * Used to compress a super block when targetCBlockSize is being used. + * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */ +size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + void const* src, size_t srcSize, + unsigned lastBlock); + +#endif /* ZSTD_COMPRESS_ADVANCED_H */ +/**** ended inlining zstd_compress_superblock.h ****/ + +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: hist.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_compress_sequences.h ****/ +/**** skipping file: zstd_compress_literals.h ****/ + +/*-************************************* +* Superblock entropy buffer structs +***************************************/ +/** ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ +typedef struct { + symbolEncodingType_e hType; + BYTE hufDesBuffer[500]; /* TODO give name to this value */ + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/** ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. 
+ * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */ +typedef struct { + symbolEncodingType_e llType; + symbolEncodingType_e ofType; + symbolEncodingType_e mlType; + BYTE fseTablesBuffer[500]; /* TODO give name to this value */ + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + + +/** ZSTD_buildSuperBlockEntropy_literal() : + * Builds entropy for the super-block literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * @return : size of huffman description table or error code */ +static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int disableLiteralsCompression, + void* workspace, size_t wkspSize) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; + const size_t nodeWkspSize = wkspEnd-nodeWksp; + unsigned maxSymbolValue = 255; + unsigned huffLog = HUF_TABLELOG_DEFAULT; + HUF_repeat repeat = prevHuf->repeatMode; + + DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (disableLiteralsCompression) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType = set_basic; + return 0; + } + + /* small ? 
don't even attempt compression (speed opt) */ +# define COMPRESS_LITERALS_SIZE_MIN 63 + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Scan input and build symbol stats */ + { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Validate the previous Huffman table */ + if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + memcpy(nextHuf, 
prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_repeat; + return 0; + } + } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; + } + } +} + +/** ZSTD_buildSuperBlockEntropy_sequences() : + * Builds entropy for the super-block sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * @return : size of fse tables or error code */ +static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned); + BYTE* const cTableWksp = countWkspStart + countWkspSize; + const size_t cTableWkspSize = wkspEnd-cTableWksp; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; + + assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE)); + DEBUGLOG(5, 
"ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq); + memset(workspace, 0, wkspSize); + + fseMetadata->lastCountSize = 0; + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + /* build CTable for Literal Lengths */ + { U32 LLtype; + unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, + countWksp, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, + countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), + cTableWksp, cTableWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); + if (LLtype == set_compressed) + fseMetadata->lastCountSize = countSize; + op += countSize; + fseMetadata->llType = (symbolEncodingType_e) LLtype; + } } + /* build CTable for Offsets */ + { U32 Offtype; + unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, + countWksp, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, + countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), + cTableWksp, cTableWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); + if (Offtype == set_compressed) + fseMetadata->lastCountSize = countSize; + op += countSize; + fseMetadata->ofType = (symbolEncodingType_e) Offtype; + } } + /* build CTable for MatchLengths */ + { U32 MLtype; + unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, + countWksp, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->matchlengthCTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, + countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), 
+ cTableWksp, cTableWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); + if (MLtype == set_compressed) + fseMetadata->lastCountSize = countSize; + op += countSize; + fseMetadata->mlType = (symbolEncodingType_e) MLtype; + } } + assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer)); + return op-ostart; +} + + +/** ZSTD_buildSuperBlockEntropy() : + * Builds entropy for the super-block. + * @return : 0 on success or error code */ +static size_t +ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; + DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy"); + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_disableLiteralsCompression(cctxParams), + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntropy->fse, + cctxParams, + &entropyMetadata->fseMetadata, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed"); + return 0; +} + +/** ZSTD_compressSubBlock_literal() : + * Compresses literals section for a sub-block. + * When we have to write the Huffman table we will sometimes choose a header + * size larger than necessary. This is because we have to pick the header size + * before we know the table size + compressed size, so we have a bound on the + * table size. If we guessed incorrectly, we fall back to uncompressed literals. 
+ * + * We write the header when writeEntropy=1 and set entropyWrriten=1 when we succeeded + * in writing the header, otherwise it is set to 0. + * + * hufMetadata->hType has literals block type info. + * If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block. + * If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block. + * If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block + * Or 0 if it unable to compress. + * Or error code */ +static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + const BYTE* literals, size_t litSize, + void* dst, size_t dstSize, + const int bmi2, int writeEntropy, int* entropyWritten) +{ + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + + (void)bmi2; /* TODO bmi2... 
*/ + + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; + if (litSize == 0 || hufMetadata->hType == set_basic) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } else if (hufMetadata->hType == set_rle) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal"); + return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize); + } + + assert(litSize > 0); + assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat); + + if (writeEntropy && hufMetadata->hType == set_compressed) { + memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize); + op += hufMetadata->hufDesSize; + cLitSize += hufMetadata->hufDesSize; + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + + /* TODO bmi2 */ + { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) + : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { + DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize)); + return 0; + } + /* If we expand and we aren't writing a header then emit uncompressed */ + if (!writeEntropy && cLitSize >= litSize) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + /* If we are writing headers then allow expansion that doesn't change our header size. 
*/ + if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) { + assert(cLitSize > litSize); + DEBUGLOG(5, "Literals expanded beyond allowed header size"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize); + } + + /* Build header */ + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ + { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ + { U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); + return op-ostart; +} + +static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { + const seqDef* const sstart = sequences; + const seqDef* const send = sequences + nbSeq; + const seqDef* sp = sstart; + size_t matchLengthSum = 0; + size_t litLengthSum = 0; + while (send-sp > 0) { + ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); + litLengthSum += seqLen.litLength; + matchLengthSum += seqLen.matchLength; + sp++; + } + assert(litLengthSum <= litSize); + if (!lastSequence) { + assert(litLengthSum == litSize); + } + return matchLengthSum + litSize; +} + +/** ZSTD_compressSubBlock_sequences() : + * Compresses sequences section for a sub-block. + * fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have + * symbol compression modes for the super-block. + * The first successfully compressed block will have these in its header. 
+ * We set entropyWritten=1 when we succeed in compressing the sequences.
+ * The following sub-blocks will always have repeat mode.
+ * When nbSeq==0 only the sequence count is written : the function returns
+ * that header size and leaves *entropyWritten at 0.
+ * @return : compressed size of sequences section of a sub-block
+ *           Or 0 if it is unable to compress
+ *           Or error code (dstSize_tooSmall when the header or the FSE table
+ *           descriptions do not fit in dst, or whatever ZSTD_encodeSequences() reports). */
+static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                              const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                              const seqDef* sequences, size_t nbSeq,
+                                              const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                              const ZSTD_CCtx_params* cctxParams,
+                                              void* dst, size_t dstCapacity,
+                                              const int bmi2, int writeEntropy, int* entropyWritten)
+{
+    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    BYTE* seqHead;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
+
+    *entropyWritten = 0;
+    /* Sequences Header */
+    RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+                    dstSize_tooSmall, "");
+    if (nbSeq < 0x7F)
+        *op++ = (BYTE)nbSeq;
+    else if (nbSeq < LONGNBSEQ)
+        op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+    else
+        op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+    if (nbSeq==0) {
+        return op - ostart;
+    }
+
+    /* seqHead : flags for FSE encoding type */
+    seqHead = op++;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart));
+
+    if (writeEntropy) {
+        const U32 LLtype = fseMetadata->llType;
+        const U32 Offtype = fseMetadata->ofType;
+        const U32 MLtype = fseMetadata->mlType;
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize);
+        /* The header check above only guarantees room for nbSeq + seqHead :
+         * make sure the table descriptions fit before copying them. */
+        RETURN_ERROR_IF((size_t)(oend-op) < fseMetadata->fseTablesSize,
+                        dstSize_tooSmall, "not enough space for FSE tables");
+        *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+        memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize);
+        op += fseMetadata->fseTablesSize;
+    } else {
+        /* tables are not re-emitted : flag all three symbol types as repeat */
+        const U32 repeat = set_repeat;
+        *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2));
+    }
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, oend - op,
+                                        fseTables->matchlengthCTable, mlCode,
+                                        fseTables->offcodeCTable, ofCode,
+                                        fseTables->litlengthCTable, llCode,
+                                        sequences, nbSeq,
+                                        longOffsets, bmi2);
+        FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+        op += bitstreamSize;
+        /* zstd versions <= 1.3.4 mistakenly report corruption when
+         * FSE_readNCount() receives a buffer < 4 bytes.
+         * Fixed by https://github.com/facebook/zstd/pull/1146.
+         * This can happen when the last set_compressed table present is 2
+         * bytes and the bitstream is only one byte.
+         * In this exceedingly rare case, we will simply emit an uncompressed
+         * block, since it isn't worth optimizing.
+         */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
+            /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+            assert(fseMetadata->lastCountSize + bitstreamSize == 3);
+            DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+                        "emitting an uncompressed block.");
+            return 0;
+        }
+#endif
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
+    }
+
+    /* zstd versions <= 1.4.0 mistakenly report error when
+     * sequences section body size is less than 3 bytes.
+     * Fixed by https://github.com/facebook/zstd/pull/1664.
+     * This can happen when the previous sequences section block is compressed
+     * with rle mode and the current block's sequences section is compressed
+     * with repeat mode where sequences section body size can be 1 byte.
+     */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (op-seqHead < 4) {
+        DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
+                    "an uncompressed block when sequences are < 4 bytes");
+        return 0;
+    }
+#endif
+
+    *entropyWritten = 1;
+    return op - ostart;
+}
+
+/** ZSTD_compressSubBlock() :
+ * Compresses a single sub-block.
+ * @return : compressed size of the sub-block
+ * Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+                            const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                            const seqDef* sequences, size_t nbSeq,
+                            const BYTE* literals, size_t litSize,
+                            const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                            const ZSTD_CCtx_params* cctxParams,
+                            void* dst, size_t dstCapacity,
+                            const int bmi2,
+                            int writeLitEntropy, int writeSeqEntropy,
+                            int* litEntropyWritten, int* seqEntropyWritten,
+                            U32 lastBlock)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart + ZSTD_blockHeaderSize;
+    DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
+                litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
+    {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
+                                                        &entropyMetadata->hufMetadata, literals, litSize,
+                                                        op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
+        FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
+        if (cLitSize == 0) return 0;
+        op += cLitSize;
+    }
+    {   size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse,
+                                                  &entropyMetadata->fseMetadata,
+                                                  sequences, nbSeq,
+                                                  llCode, mlCode, ofCode,
+                                                  cctxParams,
+                                                  op, oend-op,
+                                                  bmi2, writeSeqEntropy, seqEntropyWritten);
+        FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+        if (cSeqSize == 0) return 0;
+        op += cSeqSize;
+    }
+    /* Write block header */
+    {   size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
+        U32 const cBlockHeader24
= lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } + return op-ostart; +} + +static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = 255; + size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U32* additionalBits, + short const* defaultNorm, U32 defaultNormLog, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type 
== set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10; + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits / 8; +} + +static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff, + nbSeq, fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL, + nbSeq, fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML, + nbSeq, fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) { + size_t cSizeEstimate = 
0; + cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return cSizeEstimate + ZSTD_blockHeaderSize; +} + +static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +{ + if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle) + return 1; + if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle) + return 1; + if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle) + return 1; + return 0; +} + +/** ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. + * Entropy will be written to the first block. + * The following blocks will use repeat mode to compress. + * All sub-blocks are compressed blocks (no raw or rle blocks). + * @return : compressed size of the super block (which is multiple ZSTD blocks) + * Or 0 if it failed to compress. 
*/ +static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) +{ + const seqDef* const sstart = seqStorePtr->sequencesStart; + const seqDef* const send = seqStorePtr->sequences; + const seqDef* sp = sstart; + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; + size_t targetCBlockSize = cctxParams->targetCBlockSize; + size_t litSize, seqCount; + int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; + int writeSeqEntropy = 1; + int lastSequence = 0; + + DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", + (unsigned)(lend-lp), (unsigned)(send-sstart)); + + /* Grow a candidate sub-block by one sequence per iteration; it is compressed once its size estimate exceeds targetCBlockSize or the last sequence is reached. */ litSize = 0; + seqCount = 0; + do { + size_t cBlockSizeEstimate = 0; + if (sstart == send) { + lastSequence = 1; /* empty sequence store: a single literals-only sub-block */ + } else { + const seqDef* const sequence = sp + seqCount; + lastSequence = sequence == send - 1; + litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; + seqCount++; + } + /* The last sub-block takes every remaining literal (lend - lp). */ if (lastSequence) { + assert(lp <= lend); + assert(litSize <= (size_t)(lend - lp)); + litSize = (size_t)(lend - lp); + } + /* I think there is an optimization opportunity here. + * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful + * since it recalculates estimate from scratch. 
+ * For example, it would recount literal distribution and symbol codes every time. + */ + cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, + &nextCBlock->entropy, entropyMetadata, + workspace, wkspSize, writeLitEntropy, writeSeqEntropy); + if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { + int litEntropyWritten = 0; + int seqEntropyWritten = 0; + const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); + const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, + sp, seqCount, + lp, litSize, + llCodePtr, mlCodePtr, ofCodePtr, + cctxParams, + op, oend-op, + bmi2, writeLitEntropy, writeSeqEntropy, + &litEntropyWritten, &seqEntropyWritten, + lastBlock && lastSequence); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); + /* Commit only when something was emitted (cSize > 0) and it is smaller than the data it represents; otherwise no cursor advances and the candidate keeps growing (or, on the last sequence, is left for the raw-block fallback after the loop). */ if (cSize > 0 && cSize < decompressedSize) { + DEBUGLOG(5, "Committed the sub-block"); + assert(ip + decompressedSize <= iend); + ip += decompressedSize; + sp += seqCount; + lp += litSize; + op += cSize; + llCodePtr += seqCount; + mlCodePtr += seqCount; + ofCodePtr += seqCount; + litSize = 0; + seqCount = 0; + /* Entropy only needs to be written once */ + if (litEntropyWritten) { + writeLitEntropy = 0; + } + if (seqEntropyWritten) { + writeSeqEntropy = 0; + } + } + } + } while (!lastSequence); + if (writeLitEntropy) { + DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); + memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ + DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); + return 0; + } + /* Input not covered by a committed sub-block is emitted as one uncompressed block. */ if (ip < iend) { + size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); + DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { + seqDef const* seq; + repcodes_t rep; + memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { + rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } + DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); + return op-ostart; +} + +/* ZSTD_compressSuperBlock() : builds the entropy tables and their metadata with ZSTD_buildSuperBlockEntropy() (errors are forwarded), then hands the sequence store to ZSTD_compressSubBlock_multi() and returns its result. */ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + void const* src, size_t srcSize, + unsigned lastBlock) { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, + zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, + zc->blockState.nextCBlock, + &entropyMetadata, + &zc->appliedParams, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, + zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */); +} +/**** ended inlining compress/zstd_compress_superblock.c ****/ +/**** start inlining compress/zstd_compress.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/*-************************************* +* Dependencies +***************************************/ +#include <limits.h> /* INT_MAX */ +#include <string.h> /* memset */ +/**** start inlining ../common/cpu.h ****/ +/* + * Copyright (c) 2018-2020, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMMON_CPU_H +#define ZSTD_COMMON_CPU_H + +/** + * Implementation taken from folly/CpuId.h + * https://github.com/facebook/folly/blob/master/folly/CpuId.h + */ + +#include <string.h> + +/**** skipping file: mem.h ****/ + +#ifdef _MSC_VER +#include <intrin.h> /* __cpuid, __cpuidex */ +#endif + +/* Raw CPUID feature words: f1c/f1d hold ECX/EDX of leaf 1, f7b/f7c hold EBX/ECX of leaf 7. */ +typedef struct { + U32 f1c; + U32 f1d; + U32 f7b; + U32 f7c; +} ZSTD_cpuid_t; + +/* ZSTD_cpuid() : + * Reads CPUID leaf 1 and leaf 7 (sub-leaf 0) when leaf 0 reports them as available. + * On targets matching none of the x86 branches below, every field is left at 0. */ +MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { + U32 f1c = 0; + U32 f1d = 0; + U32 f7b = 0; + U32 f7c = 0; +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + int reg[4]; + __cpuid((int*)reg, 0); + { + int const n = reg[0]; + if (n >= 1) { + __cpuid((int*)reg, 1); + f1c = (U32)reg[2]; + f1d = (U32)reg[3]; + } + if (n >= 7) { + __cpuidex((int*)reg, 7, 0); + f7b = (U32)reg[1]; + f7c = (U32)reg[2]; + } + } +#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) + /* The following block is like the normal cpuid branch below, but gcc + * reserves ebx for use of its pic register so we must specially + * handle the save and restore to avoid clobbering the register + */ + U32 n; + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "popl 
%%ebx\n\t" + : "=a"(n) + : "a"(0) + : "ecx", "edx"); + if (n >= 1) { + U32 f1a; + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(f1a), "=c"(f1c), "=d"(f1d) + : "a"(1)); + } + if (n >= 7) { + /* EBX is restored by popl, so its CPUID value is copied to EAX and returned from there. */ + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "movl %%ebx, %%eax\n\t" + "popl %%ebx" + : "=a"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "edx"); + } +#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__) + U32 n; + __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx"); + if (n >= 1) { + U32 f1a; + __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx"); + } + if (n >= 7) { + U32 f7a; + __asm__("cpuid" + : "=a"(f7a), "=b"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "edx"); + } +#endif + { + ZSTD_cpuid_t cpuid; + cpuid.f1c = f1c; + cpuid.f1d = f1d; + cpuid.f7b = f7b; + cpuid.f7c = f7c; + return cpuid; + } +} + +/* X(name, r, bit) defines ZSTD_cpuid_<name>(), which returns non-zero when bit `bit` of field `r` is set. */ +#define X(name, r, bit) \ + MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \ + return ((cpuid.r) & (1U << bit)) != 0; \ + } + +/* cpuid(1): Processor Info and Feature Bits. */ +#define C(name, bit) X(name, f1c, bit) + C(sse3, 0) + C(pclmuldq, 1) + C(dtes64, 2) + C(monitor, 3) + C(dscpl, 4) + C(vmx, 5) + C(smx, 6) + C(eist, 7) + C(tm2, 8) + C(ssse3, 9) + C(cnxtid, 10) + C(fma, 12) + C(cx16, 13) + C(xtpr, 14) + C(pdcm, 15) + C(pcid, 17) + C(dca, 18) + C(sse41, 19) + C(sse42, 20) + C(x2apic, 21) + C(movbe, 22) + C(popcnt, 23) + C(tscdeadline, 24) + C(aes, 25) + C(xsave, 26) + C(osxsave, 27) + C(avx, 28) + C(f16c, 29) + C(rdrand, 30) +#undef C +#define D(name, bit) X(name, f1d, bit) + D(fpu, 0) + D(vme, 1) + D(de, 2) + D(pse, 3) + D(tsc, 4) + D(msr, 5) + D(pae, 6) + D(mce, 7) + D(cx8, 8) + D(apic, 9) + D(sep, 11) + D(mtrr, 12) + D(pge, 13) + D(mca, 14) + D(cmov, 15) + D(pat, 16) + D(pse36, 17) + D(psn, 18) + D(clfsh, 19) + D(ds, 21) + D(acpi, 22) + D(mmx, 23) + D(fxsr, 24) + D(sse, 25) + D(sse2, 26) + D(ss, 27) + D(htt, 28) + D(tm, 29) + D(pbe, 31) +#undef D + +/* cpuid(7): Extended Features. 
B() entries test bits of the f7b field, C() entries test bits of the f7c field. */ +#define B(name, bit) X(name, f7b, bit) + B(bmi1, 3) + B(hle, 4) + B(avx2, 5) + B(smep, 7) + B(bmi2, 8) + B(erms, 9) + B(invpcid, 10) + B(rtm, 11) + B(mpx, 14) + B(avx512f, 16) + B(avx512dq, 17) + B(rdseed, 18) + B(adx, 19) + B(smap, 20) + B(avx512ifma, 21) + B(pcommit, 22) + B(clflushopt, 23) + B(clwb, 24) + B(avx512pf, 26) + B(avx512er, 27) + B(avx512cd, 28) + B(sha, 29) + B(avx512bw, 30) + B(avx512vl, 31) +#undef B +#define C(name, bit) X(name, f7c, bit) + C(prefetchwt1, 0) + C(avx512vbmi, 1) +#undef C + +#undef X + +#endif /* ZSTD_COMMON_CPU_H */ +/**** ended inlining ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: hist.h ****/ +#define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_compress_sequences.h ****/ +/**** skipping file: zstd_compress_literals.h ****/ +/**** start inlining zstd_fast.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_FAST_H +#define ZSTD_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ + +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm); +size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_FAST_H */ +/**** ended inlining zstd_fast.h ****/ +/**** start inlining zstd_double_fast.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_DOUBLE_FAST_H +#define ZSTD_DOUBLE_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ + +void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm); +size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_DOUBLE_FAST_H */ +/**** ended inlining zstd_double_fast.h ****/ +/**** start inlining zstd_lazy.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LAZY_H +#define ZSTD_LAZY_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: zstd_compress_internal.h ****/ + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + +void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ + +size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_LAZY_H */ +/**** ended inlining zstd_lazy.h ****/ +/**** start inlining zstd_opt.h ****/ +/* + * 
Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_OPT_H +#define ZSTD_OPT_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: zstd_compress_internal.h ****/ + +/* used in ZSTD_loadDictionaryContent() */ +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); + +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_OPT_H */ +/**** ended inlining zstd_opt.h ****/ 
+/**** start inlining zstd_ldm.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LDM_H +#define ZSTD_LDM_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: ../zstd.h ****/ + +/*-************************************* +* Long distance matching +***************************************/ + +#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT + +void ZSTD_ldm_fillHashTable( + ldmState_t* state, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params); + +/** + * ZSTD_ldm_generateSequences(): + * + * Generates the sequences using the long distance match finder. + * Generates long range matching sequences in `sequences`, which parse a prefix + * of the source. `sequences` must be large enough to store every sequence, + * which can be checked with `ZSTD_ldm_getMaxNbSeq()`. + * @returns 0 or an error code. + * + * NOTE: The user must have called ZSTD_window_update() for all of the input + * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks. + * NOTE: This function returns an error if it runs out of space to store + * sequences. + */ +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldms, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + +/** + * ZSTD_ldm_blockCompress(): + * + * Compresses a block using the predefined sequences, along with a secondary + * block compressor. The literals section of every sequence is passed to the + * secondary block compressor, and those sequences are interspersed with the + * predefined sequences. 
Returns the length of the last literals. + * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed. + * `rawSeqStore.seq` may also be updated to split the last sequence between two + * blocks. + * @return The length of the last literals. + * + * NOTE: The source must be at most the maximum block size, but the predefined + * sequences can be any size, and may be longer than the block. In the case that + * they are longer than the block, the last sequences may need to be split into + * two. We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +/** + * ZSTD_ldm_skipSequences(): + * + * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`. + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data which is not passed to ZSTD_ldm_blockCompress(). + */ +void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + +/** ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is + * disabled. + */ +size_t ZSTD_ldm_getTableSize(ldmParams_t params); + +/** ZSTD_ldm_getMaxNbSeq() : + * Return an upper bound on the number of sequences that can be produced by + * the long distance matcher, or 0 if LDM is disabled. + */ +size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + +/** ZSTD_ldm_adjustParameters() : + * If the params->hashRateLog is not set, set it to its default value based on + * windowLog and params->hashLog. + * + * Ensures that params->bucketSizeLog is <= params->hashLog (setting it to + * params->hashLog if it is not). + * + * Ensures that the minMatchLength >= targetLength during optimal parsing. 
+ */ +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_LDM_H */ +/**** ended inlining zstd_ldm.h ****/ +/**** skipping file: zstd_compress_superblock.h ****/ + + +/*-************************************* +* Helper functions +***************************************/ +/* ZSTD_compressBound() + * Note that the result from this function is only compatible with the "normal" + * full-block strategy. + * When there are a lot of small blocks due to frequent flush in streaming mode + * the overhead of headers can make the compressed data larger than the + * return value of ZSTD_compressBound(). + */ +size_t ZSTD_compressBound(size_t srcSize) { + return ZSTD_COMPRESSBOUND(srcSize); +} + + +/*-************************************* +* Context memory management +***************************************/ +struct ZSTD_CDict_s { + const void* dictContent; + size_t dictContentSize; + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; + ZSTD_matchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +}; /* typedef'd to ZSTD_CDict within "zstd.h" */ + +/* ZSTD_createCCtx() : forwards to ZSTD_createCCtx_advanced() with ZSTD_defaultCMem. */ ZSTD_CCtx* ZSTD_createCCtx(void) +{ + return ZSTD_createCCtx_advanced(ZSTD_defaultCMem); +} + +/* ZSTD_initCCtx() : zeroes the context, records the allocator, caches the BMI2 capability reported by ZSTD_cpuid(), then resets parameters through ZSTD_CCtx_reset(). */ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) +{ + assert(cctx != NULL); + memset(cctx, 0, sizeof(*cctx)); + cctx->customMem = memManager; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); + assert(!ZSTD_isError(err)); + (void)err; /* err is only read by the assert; keeps builds without asserts warning-free */ + } +} + +ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) +{ + ZSTD_STATIC_ASSERT(zcss_init==0); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); + if (!customMem.customAlloc ^ !customMem.customFree) 
return NULL; + { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem); + if (!cctx) return NULL; + ZSTD_initCCtx(cctx, customMem); + return cctx; + } +} + +/* ZSTD_initStaticCCtx() : carves a CCtx, the two compressed-block states and the entropy workspace out of the caller-provided buffer. @return : the context, or NULL when the buffer is not larger than a ZSTD_CCtx, is not 8-byte aligned, or cannot hold the fixed-size objects. */ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) +{ + ZSTD_cwksp ws; + ZSTD_CCtx* cctx; + if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */ + if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */ + ZSTD_cwksp_init(&ws, workspace, workspaceSize); + + cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx)); + if (cctx == NULL) return NULL; + + memset(cctx, 0, sizeof(ZSTD_CCtx)); + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; /* non-zero staticSize is what ZSTD_freeCCtx() and ZSTD_CCtx_setParameter() test to detect a static context */ + + /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */ + if (!ZSTD_cwksp_check_available(&cctx->workspace, HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, HUF_WORKSPACE_SIZE); + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; +} + +/** + * Clears and frees all of the dictionaries in the CCtx. + * The local dictionary buffer and local CDict are freed; the prefix dictionary + * and the cdict pointer are only reset. + */ +static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx) +{ + ZSTD_free(cctx->localDict.dictBuffer, cctx->customMem); + ZSTD_freeCDict(cctx->localDict.cdict); + memset(&cctx->localDict, 0, sizeof(cctx->localDict)); + memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); + cctx->cdict = NULL; +} + +static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict) +{ + size_t const bufferSize = dict.dictBuffer != NULL ? 
dict.dictSize : 0; + size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict); + return bufferSize + cdictSize; +} + +/* ZSTD_freeCCtxContent() : releases the dictionaries, the multithreading context (when built with ZSTD_MULTITHREAD) and the workspace of a non-static context; the ZSTD_CCtx object itself is not freed here. */ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) +{ + assert(cctx != NULL); + assert(cctx->staticSize == 0); + ZSTD_clearAllDicts(cctx); +#ifdef ZSTD_MULTITHREAD + ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL; +#endif + ZSTD_cwksp_free(&cctx->workspace, cctx->customMem); +} + +/* ZSTD_freeCCtx() : @return : 0 on success (including a NULL argument), or a memory_allocation error when called on a static context. */ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); + { + /* Sample ownership before the content (workspace included) is released. */ int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); + if (!cctxInWorkspace) { + ZSTD_free(cctx, cctx->customMem); + } + } + return 0; +} + + +/* ZSTD_sizeof_mtctx() : size of the multithreading context, or 0 when built without ZSTD_MULTITHREAD. */ static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + return ZSTDMT_sizeof_CCtx(cctx->mtctx); +#else + (void)cctx; + return 0; +#endif +} + + +size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support sizeof on NULL */ + /* cctx may be in the workspace */ + return (cctx->workspace.workspace == cctx ? 
0 : sizeof(*cctx)) + + ZSTD_cwksp_sizeof(&cctx->workspace) + + ZSTD_sizeof_localDict(cctx->localDict) + + ZSTD_sizeof_mtctx(cctx); +} + +size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) +{ + return ZSTD_sizeof_CCtx(zcs); /* same object */ +} + +/* private API call, for dictBuilder only */ +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + +/* ZSTD_makeCCtxParamsFromCParams() : returns zero-initialized params carrying `cParams`, the default compression level and contentSizeFlag = 1. */ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params cctxParams; + memset(&cctxParams, 0, sizeof(cctxParams)); + cctxParams.cParams = cParams; + cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + assert(!ZSTD_checkCParams(cParams)); + cctxParams.fParams.contentSizeFlag = 1; + return cctxParams; +} + +/* ZSTD_createCCtxParams_advanced() : allocates params with ZSTD_calloc(). @return : NULL on allocation failure or inconsistent customMem. */ static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced( + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params* params; + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; /* customAlloc and customFree must be both set or both NULL */ + params = (ZSTD_CCtx_params*)ZSTD_calloc( + sizeof(ZSTD_CCtx_params), customMem); + if (!params) { return NULL; } + params->customMem = customMem; + params->compressionLevel = ZSTD_CLEVEL_DEFAULT; + params->fParams.contentSizeFlag = 1; + return params; +} + +ZSTD_CCtx_params* ZSTD_createCCtxParams(void) +{ + return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem); +} + +size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params) +{ + if (params == NULL) { return 0; } /* support free on NULL */ + ZSTD_free(params, params->customMem); + return 0; +} + +size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params) +{ + return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); +} + +/* ZSTD_CCtxParams_init() : zeroes *cctxParams (the memset also clears its customMem field), then sets the compression level and contentSizeFlag = 1. @return : 0, or a GENERIC error on NULL input. */ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) { + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->compressionLevel = compressionLevel; + cctxParams->fParams.contentSizeFlag = 1; + return 0; +} + +size_t 
ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) +{ + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + memset(cctxParams, 0, sizeof(*cctxParams)); + assert(!ZSTD_checkCParams(params.cParams)); + cctxParams->cParams = params.cParams; + cctxParams->fParams = params.fParams; + cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + return 0; +} + +/* ZSTD_assignParamsToCCtxParams() : + * params is presumed valid at this stage */ +static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams( + const ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +{ + ZSTD_CCtx_params ret = *cctxParams; + assert(!ZSTD_checkCParams(params->cParams)); + ret.cParams = params->cParams; + ret.fParams = params->fParams; + ret.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + return ret; +} + +ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + + switch(param) + { + case ZSTD_c_compressionLevel: + bounds.lowerBound = ZSTD_minCLevel(); + bounds.upperBound = ZSTD_maxCLevel(); + return bounds; + + case ZSTD_c_windowLog: + bounds.lowerBound = ZSTD_WINDOWLOG_MIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + + case ZSTD_c_hashLog: + bounds.lowerBound = ZSTD_HASHLOG_MIN; + bounds.upperBound = ZSTD_HASHLOG_MAX; + return bounds; + + case ZSTD_c_chainLog: + bounds.lowerBound = ZSTD_CHAINLOG_MIN; + bounds.upperBound = ZSTD_CHAINLOG_MAX; + return bounds; + + case ZSTD_c_searchLog: + bounds.lowerBound = ZSTD_SEARCHLOG_MIN; + bounds.upperBound = ZSTD_SEARCHLOG_MAX; + return bounds; + + case ZSTD_c_minMatch: + bounds.lowerBound = ZSTD_MINMATCH_MIN; + bounds.upperBound = ZSTD_MINMATCH_MAX; + return bounds; + + case ZSTD_c_targetLength: + bounds.lowerBound = ZSTD_TARGETLENGTH_MIN; + bounds.upperBound = 
ZSTD_TARGETLENGTH_MAX; + return bounds; + + case ZSTD_c_strategy: + bounds.lowerBound = ZSTD_STRATEGY_MIN; + bounds.upperBound = ZSTD_STRATEGY_MAX; + return bounds; + + case ZSTD_c_contentSizeFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_checksumFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_dictIDFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_nbWorkers: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_NBWORKERS_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_jobSize: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_JOBSIZE_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_overlapLog: +#ifdef ZSTD_MULTITHREAD + bounds.lowerBound = ZSTD_OVERLAPLOG_MIN; + bounds.upperBound = ZSTD_OVERLAPLOG_MAX; +#else + bounds.lowerBound = 0; + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_enableLongDistanceMatching: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_ldmHashLog: + bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHLOG_MAX; + return bounds; + + case ZSTD_c_ldmMinMatch: + bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN; + bounds.upperBound = ZSTD_LDM_MINMATCH_MAX; + return bounds; + + case ZSTD_c_ldmBucketSizeLog: + bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN; + bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX; + return bounds; + + case ZSTD_c_ldmHashRateLog: + bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX; + return bounds; + + /* experimental parameters */ + case ZSTD_c_rsyncable: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_forceMaxWindow : + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_format: + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < 
ZSTD_f_zstd1_magicless); + bounds.lowerBound = ZSTD_f_zstd1; + bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? */ + return bounds; + + case ZSTD_c_forceAttachDict: + ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceCopy); + bounds.lowerBound = ZSTD_dictDefaultAttach; + bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */ + return bounds; + + case ZSTD_c_literalCompressionMode: + ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed); + bounds.lowerBound = ZSTD_lcm_auto; + bounds.upperBound = ZSTD_lcm_uncompressed; + return bounds; + + case ZSTD_c_targetCBlockSize: + bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN; + bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX; + return bounds; + + case ZSTD_c_srcSizeHint: + bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN; + bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; + return bounds; + + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; + } +} + +/* ZSTD_cParam_clampBounds: + * Clamps the value into the bounded range. 
+ */ +static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) +{ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); + if (ZSTD_isError(bounds.error)) return bounds.error; + if (*value < bounds.lowerBound) *value = bounds.lowerBound; + if (*value > bounds.upperBound) *value = bounds.upperBound; + return 0; +} + +#define BOUNDCHECK(cParam, val) { \ + RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ + parameter_outOfBound, "Param out of bounds"); \ +} + + +static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +{ + switch(param) + { + case ZSTD_c_compressionLevel: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + return 1; + + case ZSTD_c_format: + case ZSTD_c_windowLog: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow : + case ZSTD_c_nbWorkers: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case ZSTD_c_rsyncable: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + default: + return 0; + } +} + +size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value); + if (cctx->streamStage != zcss_init) { + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { + RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); + } } + + switch(param) + { + case ZSTD_c_nbWorkers: + RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported, + "MT not compatible with static alloc"); + break; + + case ZSTD_c_compressionLevel: + case ZSTD_c_windowLog: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + 
case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_format: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case ZSTD_c_rsyncable: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value); + switch(param) + { + case ZSTD_c_format : + BOUNDCHECK(ZSTD_c_format, value); + CCtxParams->format = (ZSTD_format_e)value; + return (size_t)CCtxParams->format; + + case ZSTD_c_compressionLevel : { + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + if (value) { /* 0 : does not change current level */ + CCtxParams->compressionLevel = value; + } + if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel; + return 0; /* return type (size_t) cannot represent negative values */ + } + + case ZSTD_c_windowLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_windowLog, value); + CCtxParams->cParams.windowLog = (U32)value; + return CCtxParams->cParams.windowLog; + + case ZSTD_c_hashLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_hashLog, value); + CCtxParams->cParams.hashLog = (U32)value; + return CCtxParams->cParams.hashLog; + + case ZSTD_c_chainLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_chainLog, value); + CCtxParams->cParams.chainLog = (U32)value; + return 
CCtxParams->cParams.chainLog; + + case ZSTD_c_searchLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_searchLog, value); + CCtxParams->cParams.searchLog = (U32)value; + return (size_t)value; + + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); + CCtxParams->cParams.minMatch = value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); + CCtxParams->cParams.targetLength = value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_strategy, value); + CCtxParams->cParams.strategy = (ZSTD_strategy)value; + return (size_t)CCtxParams->cParams.strategy; + + case ZSTD_c_contentSizeFlag : + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; + return CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; + return CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); + CCtxParams->fParams.noDictIDFlag = !value; + return !CCtxParams->fParams.noDictIDFlag; + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); + return CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; + BOUNDCHECK(ZSTD_c_forceAttachDict, pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value; + BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); + 
CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } + + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + CCtxParams->nbWorkers = value; + return CCtxParams->nbWorkers; +#endif + + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + /* Adjust to the minimum non-default value. */ + if (value != 0 && value < ZSTDMT_JOBSIZE_MIN) + value = ZSTDMT_JOBSIZE_MIN; + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + assert(value >= 0); + CCtxParams->jobSize = value; + return CCtxParams->jobSize; +#endif + + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->overlapLog = value; + return CCtxParams->overlapLog; +#endif + + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->rsyncable = value; + return CCtxParams->rsyncable; +#endif + + case ZSTD_c_enableLongDistanceMatching : + CCtxParams->ldmParams.enableLdm = (value!=0); + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); + CCtxParams->ldmParams.hashLog = value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); + CCtxParams->ldmParams.minMatchLength = value; + return CCtxParams->ldmParams.minMatchLength; + + case 
ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); + CCtxParams->ldmParams.bucketSizeLog = value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + RETURN_ERROR_IF(value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN, + parameter_outOfBound, "Param out of bounds!"); + CCtxParams->ldmParams.hashRateLog = value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); + CCtxParams->targetCBlockSize = value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; + return CCtxParams->srcSizeHint; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +} + +size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value) +{ + return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_getParameter( + ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, int* value) +{ + switch(param) + { + case ZSTD_c_format : + *value = CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; + break; + case ZSTD_c_windowLog : + *value = (int)CCtxParams->cParams.windowLog; + break; + case ZSTD_c_hashLog : + *value = (int)CCtxParams->cParams.hashLog; + break; + case ZSTD_c_chainLog : + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : + *value = CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : + *value = CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : + *value = CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : + *value = (unsigned)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; + break; + case 
ZSTD_c_checksumFlag : + *value = CCtxParams->fParams.checksumFlag; + break; + case ZSTD_c_dictIDFlag : + *value = !CCtxParams->fParams.noDictIDFlag; + break; + case ZSTD_c_forceMaxWindow : + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : + *value = CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : + *value = CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + assert(CCtxParams->nbWorkers == 0); +#endif + *value = CCtxParams->nbWorkers; + break; + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + assert(CCtxParams->jobSize <= INT_MAX); + *value = (int)CCtxParams->jobSize; + break; +#endif + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->overlapLog; + break; +#endif + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->rsyncable; + break; +#endif + case ZSTD_c_enableLongDistanceMatching : + *value = CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : + *value = CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : + *value = CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : + *value = CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : + *value = CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; + break; + case ZSTD_c_srcSizeHint : + *value = (int)CCtxParams->srcSizeHint; + break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +} + +/** ZSTD_CCtx_setParametersUsingCCtxParams() : + * just applies `params` into `cctx` + * no action is performed, parameters are merely stored. 
+ * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx. + * This is possible even if a compression is ongoing. + * In which case, new parameters will be applied on the fly, starting with next compression job. + */ +size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams"); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "The context is in the wrong stage!"); + RETURN_ERROR_IF(cctx->cdict, stage_wrong, + "Can't override parameters with cdict attached (some must " + "be inherited from the cdict)."); + + cctx->requestedParams = *params; + return 0; +} + +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; + return 0; +} + +/** + * Initializes the local dict using the requested parameters. + * NOTE: This does not use the pledged src size, because it may be used for more + * than one compression. + */ +static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) +{ + ZSTD_localDict* const dl = &cctx->localDict; + ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams( + &cctx->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN, dl->dictSize); + if (dl->dict == NULL) { + /* No local dictionary. */ + assert(dl->dictBuffer == NULL); + assert(dl->cdict == NULL); + assert(dl->dictSize == 0); + return 0; + } + if (dl->cdict != NULL) { + assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/ + return 0; + } + assert(dl->dictSize > 0); + assert(cctx->cdict == NULL); + assert(cctx->prefixDict.dict == NULL); + + dl->cdict = ZSTD_createCDict_advanced( + dl->dict, + dl->dictSize, + ZSTD_dlm_byRef, + dl->dictContentType, + cParams, + cctx->customMem); + RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed"); + cctx->cdict = dl->cdict; + return 0; +} + +size_t ZSTD_CCtx_loadDictionary_advanced( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't load a dictionary when ctx is not in init stage."); + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "no malloc for static CCtx"); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); + ZSTD_clearAllDicts(cctx); /* in case one already exists */ + if (dict == NULL || dictSize == 0) /* no dictionary mode */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { + void* dictBuffer = ZSTD_malloc(dictSize, cctx->customMem); + RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); + memcpy(dictBuffer, dict, dictSize); + cctx->localDict.dictBuffer = dictBuffer; + cctx->localDict.dict = dictBuffer; + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; + return 0; +} + +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + + +size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, 
stage_wrong, + "Can't ref a dict when ctx not in init stage."); + /* Free the existing local cdict (if any) to save memory. */ + ZSTD_clearAllDicts(cctx); + cctx->cdict = cdict; + return 0; +} + +size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + +size_t ZSTD_CCtx_refPrefix_advanced( + ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a prefix when ctx not in init stage."); + ZSTD_clearAllDicts(cctx); + if (prefix != NULL && prefixSize > 0) { + cctx->prefixDict.dict = prefix; + cctx->prefixDict.dictSize = prefixSize; + cctx->prefixDict.dictContentType = dictContentType; + } + return 0; +} + +/*! ZSTD_CCtx_reset() : + * Also dumps dictionary */ +size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) +{ + if ( (reset == ZSTD_reset_session_only) + || (reset == ZSTD_reset_session_and_parameters) ) { + cctx->streamStage = zcss_init; + cctx->pledgedSrcSizePlusOne = 0; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't reset parameters only when not in init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } + return 0; +} + + +/** ZSTD_checkCParams() : + control CParam values remain within authorized range. 
+ @return : 0, or an error code if one value is beyond authorized range */ +size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) +{ + BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog); + BOUNDCHECK(ZSTD_c_chainLog, (int)cParams.chainLog); + BOUNDCHECK(ZSTD_c_hashLog, (int)cParams.hashLog); + BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); + BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); + BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); + BOUNDCHECK(ZSTD_c_strategy, cParams.strategy); + return 0; +} + +/** ZSTD_clampCParams() : + * make CParam values within valid range. + * @return : valid CParams */ +static ZSTD_compressionParameters +ZSTD_clampCParams(ZSTD_compressionParameters cParams) +{ +# define CLAMP_TYPE(cParam, val, type) { \ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ + if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ + else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ + } +# define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); + CLAMP(ZSTD_c_hashLog, cParams.hashLog); + CLAMP(ZSTD_c_searchLog, cParams.searchLog); + CLAMP(ZSTD_c_minMatch, cParams.minMatch); + CLAMP(ZSTD_c_targetLength,cParams.targetLength); + CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy); + return cParams; +} + +/** ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) +{ + U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2); + return hashLog - btScale; +} + +/** ZSTD_adjustCParams_internal() : + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). 
*/ +static ZSTD_compressionParameters +ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) +{ + static const U64 minSrcSize = 513; /* (1<<9) + 1 */ + static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + + if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) + srcSize = minSrcSize; + + /* resize windowLog if input is small enough, to use less memory */ + if ( (srcSize < maxWindowResize) + && (dictSize < maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : + ZSTD_highbit32(tSize-1) + 1; + if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; + } + if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1; + { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cycleLog > cPar.windowLog) + cPar.chainLog -= (cycleLog - cPar.windowLog); + } + + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + + return cPar; +} + +ZSTD_compressionParameters +ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) +{ + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize); +} + +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize); +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize); + +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) +{ + ZSTD_compressionParameters cParams; + if 
(srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { + srcSizeHint = CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize); + if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; + if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; + if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog; + if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog; + if (CCtxParams->cParams.minMatch) cParams.minMatch = CCtxParams->cParams.minMatch; + if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength; + if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy; + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize); +} + +static size_t +ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const U32 forCCtx) +{ + size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; + /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't + * surrounded by redzones in ASAN. */ + size_t const tableSpace = chainSize * sizeof(U32) + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = + ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((1<<Litbits) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? 
optPotentialSpace + : 0; + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", + (U32)chainSize, (U32)hSize, (U32)h3Size); + return tableSpace + optSpace; +} + +size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); + U32 const divider = (cParams.minMatch==3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1); + + size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams); + size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq)); + + /* estimateCCtxSize is for one-shot compression. So no buffers should + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. 
*/ + size_t const bufferSpace = ZSTD_cwksp_alloc_size(0) + + ZSTD_cwksp_alloc_size(0); + + size_t const cctxSpace = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)); + + size_t const neededSpace = + cctxSpace + + entropySpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; + } +} + +size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); + return ZSTD_estimateCCtxSize_usingCCtxParams(&params); +} + +static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); + return ZSTD_estimateCCtxSize_usingCParams(cParams); +} + +size_t ZSTD_estimateCCtxSize(int compressionLevel) +{ + int level; + size_t memBudget = 0; + for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { + size_t const newMB = ZSTD_estimateCCtxSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; +} + +size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0); + size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); + size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize; + size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1; + size_t const streamingSize = ZSTD_cwksp_alloc_size(inBuffSize) + + ZSTD_cwksp_alloc_size(outBuffSize); + + return CCtxSize + streamingSize; + } +} + +size_t 
ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); + return ZSTD_estimateCStreamSize_usingCCtxParams(&params); +} + +static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); + return ZSTD_estimateCStreamSize_usingCParams(cParams); +} + +size_t ZSTD_estimateCStreamSize(int compressionLevel) +{ + int level; + size_t memBudget = 0; + for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { + size_t const newMB = ZSTD_estimateCStreamSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; +} + +/* ZSTD_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads (non-blocking mode). + */ +ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_getFrameProgression(cctx->mtctx); + } +#endif + { ZSTD_frameProgression fp; + size_t const buffered = (cctx->inBuff == NULL) ? 0 : + cctx->inBuffPos - cctx->inToCompress; + if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress); + assert(buffered <= ZSTD_BLOCKSIZE_MAX); + fp.ingested = cctx->consumedSrcSize + buffered; + fp.consumed = cctx->consumedSrcSize; + fp.produced = cctx->producedCSize; + fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */ + fp.currentJobID = 0; + fp.nbActiveWorkers = 0; + return fp; +} } + +/*! ZSTD_toFlushNow() + * Only useful for multithreading scenarios currently (nbWorkers >= 1). 
+ */ +size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_toFlushNow(cctx->mtctx); + } +#endif + (void)cctx; + return 0; /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */ +} + +static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1, + ZSTD_compressionParameters cParams2) +{ + (void)cParams1; + (void)cParams2; + assert(cParams1.windowLog == cParams2.windowLog); + assert(cParams1.chainLog == cParams2.chainLog); + assert(cParams1.hashLog == cParams2.hashLog); + assert(cParams1.searchLog == cParams2.searchLog); + assert(cParams1.minMatch == cParams2.minMatch); + assert(cParams1.targetLength == cParams2.targetLength); + assert(cParams1.strategy == cParams2.strategy); +} + +void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) +{ + int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + bs->rep[i] = repStartValue[i]; + bs->entropy.huf.repeatMode = HUF_repeat_none; + bs->entropy.fse.offcode_repeatMode = FSE_repeat_none; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_none; +} + +/*! ZSTD_invalidateMatchState() + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL). + */ +static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) +{ + ZSTD_window_clear(&ms->window); + + ms->nextToUpdate = ms->window.dictLimit; + ms->loadedDictEnd = 0; + ms->opt.litLengthSum = 0; /* force reset of btopt stats */ + ms->dictMatchState = NULL; +} + +/** + * Indicates whether this compression proceeds directly from user-provided + * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or + * whether the context needs to buffer the input/output (ZSTDb_buffered). 
+ */ +typedef enum { + ZSTDb_not_buffered, + ZSTDb_buffered +} ZSTD_buffered_policy_e; + +/** + * Controls, for this matchState reset, whether the tables need to be cleared / + * prepared for the coming compression (ZSTDcrp_makeClean), or whether the + * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a + * subsequent operation will overwrite the table space anyways (e.g., copying + * the matchState contents in from a CDict). + */ +typedef enum { + ZSTDcrp_makeClean, + ZSTDcrp_leaveDirty +} ZSTD_compResetPolicy_e; + +/** + * Controls, for this matchState reset, whether indexing can continue where it + * left off (ZSTDirp_continue), or whether it needs to be restarted from zero + * (ZSTDirp_reset). + */ +typedef enum { + ZSTDirp_continue, + ZSTDirp_reset +} ZSTD_indexResetPolicy_e; + +typedef enum { + ZSTD_resetTarget_CDict, + ZSTD_resetTarget_CCtx +} ZSTD_resetTarget_e; + +static size_t +ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + const ZSTD_compressionParameters* cParams, + const ZSTD_compResetPolicy_e crp, + const ZSTD_indexResetPolicy_e forceResetIndex, + const ZSTD_resetTarget_e forWho) +{ + size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = hashLog3 ? 
((size_t)1) << hashLog3 : 0; + + DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); + if (forceResetIndex == ZSTDirp_reset) { + ZSTD_window_init(&ms->window); + ZSTD_cwksp_mark_tables_dirty(ws); + } + + ms->hashLog3 = hashLog3; + + ZSTD_invalidateMatchState(ms); + + assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */ + + ZSTD_cwksp_clear_tables(ws); + + DEBUGLOG(5, "reserving table space"); + /* table Space */ + ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32)); + ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32)); + ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32)); + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, + "failed a workspace allocation in ZSTD_reset_matchState"); + + DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty); + if (crp!=ZSTDcrp_leaveDirty) { + /* reset tables only */ + ZSTD_cwksp_clean_tables(ws); + } + + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); + ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); + ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); + ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); + ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); + ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + } + + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, + "failed a workspace allocation in ZSTD_reset_matchState"); + + return 0; +} + +/* ZSTD_indexTooCloseToMax() : + * minor optimization : prefer memset() rather than 
reduceIndex() + * which is measurably slow in some circumstances (reported for Visual Studio). + * Works when re-using a context for a lot of smallish inputs : + * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN, + * memset() will be triggered before reduceIndex(). + */ +#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB) +static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) +{ + return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); +} + +/*! ZSTD_resetCCtx_internal() : + note : `params` are assumed fully validated at this stage */ +static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + ZSTD_CCtx_params params, + U64 const pledgedSrcSize, + ZSTD_compResetPolicy_e const crp, + ZSTD_buffered_policy_e const zbuff) +{ + ZSTD_cwksp* const ws = &zc->workspace; + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u", + (U32)pledgedSrcSize, params.cParams.windowLog); + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + + zc->isFirstBlock = 1; + + if (params.ldmParams.enableLdm) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); + assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); + assert(params.ldmParams.hashRateLog < 32); + zc->ldmState.hashPower = ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength); + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); + U32 const divider = (params.cParams.minMatch==3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0; + size_t const buffInSize = (zbuff==ZSTDb_buffered) ? 
windowSize + blockSize : 0; + size_t const matchStateSize = ZSTD_sizeof_matchState(¶ms.cParams, /* forCCtx */ 1); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); + + ZSTD_indexResetPolicy_e needsIndexReset = zc->initialized ? ZSTDirp_continue : ZSTDirp_reset; + + if (ZSTD_indexTooCloseToMax(zc->blockState.matchState.window)) { + needsIndexReset = ZSTDirp_reset; + } + + if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); + + /* Check if workspace is large enough, alloc a new one if needed */ + { size_t const cctxSpace = zc->staticSize ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + ZSTD_cwksp_alloc_size(buffOutSize); + size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams); + size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)); + + size_t const neededSpace = + cctxSpace + + entropySpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace; + + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); + + DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers", + neededSpace>>10, matchStateSize>>10, bufferSpace>>10); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + + if (workspaceTooSmall || workspaceWasteful) { + DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", + ZSTD_cwksp_sizeof(ws) >> 10, + neededSpace >> 10); + + RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize"); + + needsIndexReset = ZSTDirp_reset; + + ZSTD_cwksp_free(ws, zc->customMem); + FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), ""); + + DEBUGLOG(5, 
"reserving object space"); + /* Statically sized space. + * entropyWorkspace never moves, + * though prev/next block swap places */ + assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); + zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); + zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); + zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, HUF_WORKSPACE_SIZE); + RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); + } } + + ZSTD_cwksp_clear(ws); + + /* init params */ + zc->appliedParams = params; + zc->blockState.matchState.cParams = params.cParams; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; + if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN) + zc->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); + zc->blockSize = blockSize; + + XXH64_reset(&zc->xxhState, 0); + zc->stage = ZSTDcs_init; + zc->dictID = 0; + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. 
+ */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + + /* buffers */ + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); + zc->outBuffSize = buffOutSize; + zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); + + /* ldm bucketOffsets table */ + if (params.ldmParams.enableLdm) { + /* TODO: avoid memset? */ + size_t const ldmBucketSize = + ((size_t)1) << (params.ldmParams.hashLog - + params.ldmParams.bucketSizeLog); + zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, ldmBucketSize); + memset(zc->ldmState.bucketOffsets, 0, ldmBucketSize); + } + + /* sequences storage */ + ZSTD_referenceExternalSequences(zc, NULL, 0); + zc->seqStore.maxNbSeq = maxNbSeq; + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); + + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &zc->blockState.matchState, + ws, + ¶ms.cParams, + crp, + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + + /* ldm hash table */ + if (params.ldmParams.enableLdm) { + /* TODO: avoid memset? 
*/ + size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog; + zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); + memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); + zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); + zc->maxNbLdmSequences = maxNbLdmSeq; + + ZSTD_window_init(&zc->ldmState.window); + ZSTD_window_clear(&zc->ldmState.window); + zc->ldmState.loadedDictEnd = 0; + } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + zc->initialized = 1; + + return 0; + } +} + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) { + int i; + for (i=0; iblockState.prevCBlock->rep[i] = 0; + assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); +} + +/* These are the approximate sizes for each strategy past which copying the + * dictionary tables into the working context is faster than using them + * in-place. 
+ */ +static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = { + 8 KB, /* unused */ + 8 KB, /* ZSTD_fast */ + 16 KB, /* ZSTD_dfast */ + 32 KB, /* ZSTD_greedy */ + 32 KB, /* ZSTD_lazy */ + 32 KB, /* ZSTD_lazy2 */ + 32 KB, /* ZSTD_btlazy2 */ + 32 KB, /* ZSTD_btopt */ + 8 KB, /* ZSTD_btultra */ + 8 KB /* ZSTD_btultra2 */ +}; + +static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize) +{ + size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy]; + return ( pledgedSrcSize <= cutoff + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || params->attachDictPref == ZSTD_dictForceAttach ) + && params->attachDictPref != ZSTD_dictForceCopy + && !params->forceWindow; /* dictMatchState isn't correctly + * handled in _enforceMaxDist */ +} + +static size_t +ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + { const ZSTD_compressionParameters* const cdict_cParams = &cdict->matchState.cParams; + unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); + /* Resize working context table params for input only, since the dict + * has its own tables. */ + /* pledgeSrcSize == 0 means 0! 
*/ + params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0); + params.cParams.windowLog = windowLog; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + ZSTDcrp_makeClean, zbuff), ""); + assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + } + + { const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc + - cdict->matchState.window.base); + const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit; + if (cdictLen == 0) { + /* don't even attach dictionaries with no contents */ + DEBUGLOG(4, "skipping attaching empty dictionary"); + } else { + DEBUGLOG(4, "attaching dictionary into context"); + cctx->blockState.matchState.dictMatchState = &cdict->matchState; + + /* prep working match state so dict matches never have negative indices + * when they are translated to the working context's index space. */ + if (cctx->blockState.matchState.window.dictLimit < cdictEnd) { + cctx->blockState.matchState.window.nextSrc = + cctx->blockState.matchState.window.base + cdictEnd; + ZSTD_window_clear(&cctx->blockState.matchState.window); + } + /* loadedDictEnd is expressed within the referential of the active context */ + cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit; + } } + + cctx->dictID = cdict->dictID; + + /* copy block state */ + memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + + return 0; +} + +static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; + + DEBUGLOG(4, "copying dictionary into context"); + + { unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); + /* Copy only compression parameters related to tables. 
*/ + params.cParams = *cdict_cParams; + params.cParams.windowLog = windowLog; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + ZSTDcrp_leaveDirty, zbuff), ""); + assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); + assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog); + } + + ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + + /* copy tables */ + { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog); + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + + memcpy(cctx->blockState.matchState.hashTable, + cdict->matchState.hashTable, + hSize * sizeof(U32)); + memcpy(cctx->blockState.matchState.chainTable, + cdict->matchState.chainTable, + chainSize * sizeof(U32)); + } + + /* Zero the hashTable3, since the cdict never fills it */ + { int const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); + } + + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ + { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; + ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; + } + + cctx->dictID = cdict->dictID; + + /* copy block state */ + memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + + return 0; +} + +/* We have a choice between copying the dictionary context into the working + * context, or referencing the dictionary context from the working context + * in-place. We decide here which strategy to use. 
*/ +static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + + DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", + (unsigned)pledgedSrcSize); + + if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) { + return ZSTD_resetCCtx_byAttachingCDict( + cctx, cdict, *params, pledgedSrcSize, zbuff); + } else { + return ZSTD_resetCCtx_byCopyingCDict( + cctx, cdict, *params, pledgedSrcSize, zbuff); + } +} + +/*! ZSTD_copyCCtx_internal() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * The "context", in this case, refers to the hash and chain tables, + * entropy tables, and dictionary references. + * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx. + * @return : 0, or an error code */ +static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + const ZSTD_CCtx* srcCCtx, + ZSTD_frameParameters fParams, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); + RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, + "Can't copy a ctx that's not in init stage."); + + memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + { ZSTD_CCtx_params params = dstCCtx->requestedParams; + /* Copy only compression parameters related to tables. 
*/ + params.cParams = srcCCtx->appliedParams.cParams; + params.fParams = fParams; + ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + ZSTDcrp_leaveDirty, zbuff); + assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); + assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); + assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog); + assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog); + assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3); + } + + ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); + + /* copy tables */ + { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog); + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; + int const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? 
((size_t)1 << h3log) : 0; + + memcpy(dstCCtx->blockState.matchState.hashTable, + srcCCtx->blockState.matchState.hashTable, + hSize * sizeof(U32)); + memcpy(dstCCtx->blockState.matchState.chainTable, + srcCCtx->blockState.matchState.chainTable, + chainSize * sizeof(U32)); + memcpy(dstCCtx->blockState.matchState.hashTable3, + srcCCtx->blockState.matchState.hashTable3, + h3Size * sizeof(U32)); + } + + ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace); + + /* copy dictionary offsets */ + { + const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; + ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; + } + dstCCtx->dictID = srcCCtx->dictID; + + /* copy block state */ + memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); + + return 0; +} + +/*! ZSTD_copyCCtx() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * pledgedSrcSize==0 means "unknown". +* @return : 0, or an error code */ +size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) +{ + ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0); + ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1); + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; + fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN); + + return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, + fParams, pledgedSrcSize, + zbuff); +} + + +#define ZSTD_ROWSIZE 16 +/*! ZSTD_reduceTable() : + * reduce table indexes by `reducerValue`, or squash to zero. 
+ * PreserveMark preserves "unsorted mark" for btlazy2 strategy. + * It must be set to a clear 0/1 value, to remove branch during inlining. + * Presume table size is a multiple of ZSTD_ROWSIZE + * to help auto-vectorization */ +FORCE_INLINE_TEMPLATE void +ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark) +{ + int const nbRows = (int)size / ZSTD_ROWSIZE; + int cellNb = 0; + int rowNb; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ + assert(size < (1U<<31)); /* can be casted to int */ + +#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table re-use logic is sound, and that we don't + * access table space that we haven't cleaned, we re-"poison" the table + * space every time we mark it dirty. + * + * This function however is intended to operate on those dirty tables and + * re-clean them. So when this function is used correctly, we can unpoison + * the memory it operated on. This introduces a blind spot though, since + * if we now try to operate on __actually__ poisoned memory, we will not + * detect that. 
*/ + __msan_unpoison(table, size * sizeof(U32)); +#endif + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { + int column; + for (column=0; columncParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); + } + + if (params->cParams.strategy != ZSTD_fast) { + U32 const chainSize = (U32)1 << params->cParams.chainLog; + if (params->cParams.strategy == ZSTD_btlazy2) + ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); + else + ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue); + } + + if (ms->hashLog3) { + U32 const h3Size = (U32)1 << ms->hashLog3; + ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue); + } +} + + +/*-******************************************************* +* Block entropic compression +*********************************************************/ + +/* See doc/zstd_compression_format.md for detailed format description */ + +void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) +{ + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; + BYTE* const ofCodeTable = seqStorePtr->ofCode; + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; ulongLengthID==1) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthID==2) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; +} + +/* ZSTD_useTargetCBlockSize(): + * Returns if target compressed block size param is being used. + * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize. + * Returns 1 if true, 0 otherwise. 
*/ +static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize); + return (cctxParams->targetCBlockSize != 0); +} + +/* ZSTD_compressSequences_internal(): + * actually compresses both literals and sequences */ +MEM_STATIC size_t +ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + void* entropyWorkspace, size_t entropyWkspSize, + const int bmi2) +{ + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned count[MaxSeq+1]; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ + const seqDef* const sequences = seqStorePtr->sequencesStart; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* seqHead; + BYTE* lastNCount = NULL; + + DEBUGLOG(5, "ZSTD_compressSequences_internal (nbSeq=%zu)", nbSeq); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<litStart; + size_t const litSize = (size_t)(seqStorePtr->lit - literals); + size_t const cSize = ZSTD_compressLiterals( + &prevEntropy->huf, &nextEntropy->huf, + cctxParams->cParams.strategy, + ZSTD_disableLiteralsCompression(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, + bmi2); + FORWARD_IF_ERROR(cSize, 
"ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; + } + + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, "Can't fit seq hdr in output buf!"); + if (nbSeq < 128) { + *op++ = (BYTE)nbSeq; + } else if (nbSeq < LONGNBSEQ) { + op[0] = (BYTE)((nbSeq>>8) + 0x80); + op[1] = (BYTE)nbSeq; + op+=2; + } else { + op[0]=0xFF; + MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); + op+=3; + } + assert(op <= oend); + if (nbSeq==0) { + /* Copy the old tables over as if we repeated them */ + memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ + seqHead = op++; + assert(op <= oend); + + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + /* build CTable for Literal Lengths */ + { unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode; + LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, + count, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->fse.litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, + count, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->fse.litlengthCTable, + sizeof(prevEntropy->fse.litlengthCTable), + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); + if (LLtype == 
set_compressed) + lastNCount = op; + op += countSize; + assert(op <= oend); + } } + /* build CTable for Offsets */ + { unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp( + count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode; + Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, + count, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->fse.offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, + count, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->fse.offcodeCTable, + sizeof(prevEntropy->fse.offcodeCTable), + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); + if (Offtype == set_compressed) + lastNCount = op; + op += countSize; + assert(op <= oend); + } } + /* build CTable for MatchLengths */ + { unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp( + count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode; + MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, + count, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->fse.matchlengthCTable, + 
ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, + count, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->fse.matchlengthCTable, + sizeof(prevEntropy->fse.matchlengthCTable), + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); + if (MLtype == set_compressed) + lastNCount = op; + op += countSize; + assert(op <= oend); + } } + + *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + + { size_t const bitstreamSize = ZSTD_encodeSequences( + op, (size_t)(oend - op), + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, + longOffsets, bmi2); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); + op += bitstreamSize; + assert(op <= oend); + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing. 
+ */ + if (lastNCount && (op - lastNCount) < 4) { + /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(op - lastNCount == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } + } + + DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart)); + return (size_t)(op - ostart); +} + +MEM_STATIC size_t +ZSTD_compressSequences(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + size_t srcSize, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) +{ + size_t const cSize = ZSTD_compressSequences_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ + if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) + return 0; /* block not compressed */ + FORWARD_IF_ERROR(cSize, "ZSTD_compressSequences_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } + + return cSize; +} + +/* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) +{ + static const ZSTD_blockCompressor blockCompressor[3][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, + ZSTD_compressBlock_doubleFast, + ZSTD_compressBlock_greedy, + ZSTD_compressBlock_lazy, + ZSTD_compressBlock_lazy2, + ZSTD_compressBlock_btlazy2, + ZSTD_compressBlock_btopt, + ZSTD_compressBlock_btultra, + ZSTD_compressBlock_btultra2 }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, + ZSTD_compressBlock_doubleFast_extDict, + ZSTD_compressBlock_greedy_extDict, + ZSTD_compressBlock_lazy_extDict, + ZSTD_compressBlock_lazy2_extDict, + ZSTD_compressBlock_btlazy2_extDict, + ZSTD_compressBlock_btopt_extDict, + ZSTD_compressBlock_btultra_extDict, + ZSTD_compressBlock_btultra_extDict }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, + ZSTD_compressBlock_doubleFast_dictMatchState, + ZSTD_compressBlock_greedy_dictMatchState, + ZSTD_compressBlock_lazy_dictMatchState, + ZSTD_compressBlock_lazy2_dictMatchState, + ZSTD_compressBlock_btlazy2_dictMatchState, + ZSTD_compressBlock_btopt_dictMatchState, + ZSTD_compressBlock_btultra_dictMatchState, + ZSTD_compressBlock_btultra_dictMatchState } + }; + ZSTD_blockCompressor selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + + 
assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + assert(selectedCompressor != NULL); + return selectedCompressor; +} + +static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) +{ + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; +} + +void ZSTD_resetSeqStore(seqStore_t* ssPtr) +{ + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthID = 0; +} + +typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + +static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +{ + ZSTD_matchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { + ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch); + return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */ + } + ZSTD_resetSeqStore(&(zc->seqStore)); + /* required for optimal parser to read stats from dictionary */ + ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy; + /* tell the optimal parser how we expect to compress literals */ + ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode; + /* a gap between an attached dict and the current window is not safe, + * they must remain adjacent, + * and when that stops being the case, the dict must be unset */ + assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit); + + /* limited update after a very long match */ + { const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 current = 
(U32)(istart-base); + if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */ + if (current > ms->nextToUpdate + 384) + ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384)); + } + + /* select and store sequences */ + { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms); + size_t lastLLSize; + { int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(!zc->appliedParams.ldmParams.enableLdm); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm) { + rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0}; + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ + FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore, + &zc->appliedParams.ldmParams, + src, srcSize), ""); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&ldmSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); + } else { /* not long range mode */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } + return ZSTDbss_compress; +} + +static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +{ + const seqStore_t* seqStore = ZSTD_getSeqStore(zc); + const seqDef* seqs = seqStore->sequencesStart; + size_t 
seqsSize = seqStore->sequences - seqs; + + ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; + size_t i; size_t position; int repIdx; + + assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); + for (i = 0, position = 0; i < seqsSize; ++i) { + outSeqs[i].offset = seqs[i].offset; + outSeqs[i].litLength = seqs[i].litLength; + outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH; + + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthID == 1) { + outSeqs[i].litLength += 0x10000; + } else if (seqStore->longLengthID == 2) { + outSeqs[i].matchLength += 0x10000; + } + } + + if (outSeqs[i].offset <= ZSTD_REP_NUM) { + outSeqs[i].rep = outSeqs[i].offset; + repIdx = (unsigned int)i - outSeqs[i].offset; + + if (outSeqs[i].litLength == 0) { + if (outSeqs[i].offset < 3) { + --repIdx; + } else { + repIdx = (unsigned int)i - 1; + } + ++outSeqs[i].rep; + } + assert(repIdx >= -3); + outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1]; + if (outSeqs[i].rep == 4) { + --outSeqs[i].offset; + } + } else { + outSeqs[i].offset -= ZSTD_REP_NUM; + } + + position += outSeqs[i].litLength; + outSeqs[i].matchPos = (unsigned int)position; + position += outSeqs[i].matchLength; + } + zc->seqCollector.seqIndex += seqsSize; +} + +size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) +{ + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_malloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; + seqCollector.seqStart = outSeqs; + seqCollector.seqIndex = 0; + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + + ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); + ZSTD_free(dst, ZSTD_defaultCMem); + return zc->seqCollector.seqIndex; +} + +/* Returns true if the given block is a 
RLE block */ +static int ZSTD_isRLE(const BYTE *ip, size_t length) { + size_t i; + if (length < 2) return 1; + for (i = 1; i < length; ++i) { + if (ip[0] != ip[i]) return 0; + } + return 1; +} + +/* Returns true if the given block may be RLE. + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +static int ZSTD_maybeRLE(seqStore_t const* seqStore) +{ + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); + + return nbSeqs < 4 && nbLits < 10; +} + +static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +{ + ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; + zc->blockState.prevCBlock = zc->blockState.nextCBlock; + zc->blockState.nextCBlock = tmp; +} + +static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) +{ + /* This the upper bound for the length of an rle block. + * This isn't the actual upper bound. Finding the real threshold + * needs further investigation. 
+ */ + const U32 rleMaxLength = 25; + size_t cSize; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + return 0; + } + + /* encode sequences and literals */ + cSize = ZSTD_compressSequences(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + dst, dstCapacity, + srcSize, + zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + cSize < rleMaxLength && + ZSTD_isRLE(ip, srcSize)) + { + cSize = 1; + op[0] = ip[0]; + } + +out: + if (!ZSTD_isError(cSize) && cSize > 1) { + ZSTD_confirmRepcodesAndEntropyTables(zc); + } + /* We check that dictionaries have offset codes available for the first + * block. After the first block, the offcode table might not have large + * enough codes to represent the offsets in the data. 
+ */ + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const size_t bss, U32 lastBlock) +{ + DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()"); + if (bss == ZSTDbss_compress) { + if (/* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + ZSTD_maybeRLE(&zc->seqStore) && + ZSTD_isRLE((BYTE const*)src, srcSize)) + { + return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock); + } + /* Attempt superblock compression. + * + * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the + * standard ZSTD_compressBound(). This is a problem, because even if we have + * space now, taking an extra byte now could cause us to run out of space later + * and violate ZSTD_compressBound(). + * + * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize. + * + * In order to respect ZSTD_compressBound() we must attempt to emit a raw + * uncompressed block in these cases: + * * cSize == 0: Return code for an uncompressed block. + * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize). + * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of + * output space. + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. 
+ */ + { + size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { + size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_confirmRepcodesAndEntropyTables(zc); + return cSize; + } + } + } + } + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. + * The decoder will be able to stream this block since it is uncompressed. + */ + return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); +} + +static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock) +{ + size_t cSize = 0; + const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + + cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed"); + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, + void const* iend) +{ + if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { + U32 const maxDist = (U32)1 << params->cParams.windowLog; + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, 
params->cParams.strategy); + U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); + ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + ZSTD_cwksp_mark_tables_dirty(ws); + ZSTD_reduceIndex(ms, params, correction); + ZSTD_cwksp_mark_tables_clean(ws); + if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; + else ms->nextToUpdate -= correction; + /* invalidate dictionaries on overflow correction */ + ms->loadedDictEnd = 0; + ms->dictMatchState = NULL; + } +} + +/*! ZSTD_compress_frameChunk() : +* Compress a chunk of data into one or multiple blocks. +* All blocks will be terminated, all input will be consumed. +* Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. +* Frame is supposed already started (header already produced) +* @return : compressed size, or an error code +*/ +static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) +{ + size_t blockSize = cctx->blockSize; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + + DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + XXH64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, 
&cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); + ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + + /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ + if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; + + { size_t cSize; + if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); + assert(cSize > 0); + assert(cSize <= blockSize + ZSTD_blockHeaderSize); + } else { + cSize = ZSTD_compressBlock_internal(cctx, + op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, + ip, blockSize, 1 /* frame */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed"); + + if (cSize == 0) { /* block is not compressible */ + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + } else { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } + } + + + ip += blockSize; + assert(remaining >= blockSize); + remaining -= blockSize; + op += cSize; + assert(dstCapacity >= cSize); + dstCapacity -= cSize; + cctx->isFirstBlock = 0; + DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u", + (unsigned)cSize); + } } + + if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending; + return (size_t)(op-ostart); +} + + +static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, + const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +{ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 
0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; + U32 const windowSize = (U32)1 << params->cParams.windowLog; + U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize); + BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3); + U32 const fcsCode = params->fParams.contentSizeFlag ? + (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */ + BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) ); + size_t pos=0; + + assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)); + RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall, + "dst buf is too small to fit worst-case frame header size."); + DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", + !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode); + + if (params->format == ZSTD_f_zstd1) { + MEM_writeLE32(dst, ZSTD_MAGICNUMBER); + pos = 4; + } + op[pos++] = frameHeaderDescriptionByte; + if (!singleSegment) op[pos++] = windowLogByte; + switch(dictIDSizeCode) + { + default: assert(0); /* impossible */ + case 0 : break; + case 1 : op[pos] = (BYTE)(dictID); pos++; break; + case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; + case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break; + } + switch(fcsCode) + { + default: assert(0); /* impossible */ + case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; + case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; + case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; + case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break; + } + return pos; +} + +/* ZSTD_writeLastEmptyBlock() : + * output an empty Block with end-of-frame mark to complete a frame + * @return : size 
of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) + * or an error code if `dstCapacity` is too small (stage != ZSTDcs_init, stage_wrong, + "wrong cctx stage"); + RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, + parameter_unsupported, + "incompatible with ldm"); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + return 0; +} + + +static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) +{ + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", + cctx->stage, (unsigned)srcSize); + RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong, + "missing init (ZSTD_compressBegin)"); + + if (frame && (cctx->stage==ZSTDcs_init)) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, + cctx->pledgedSrcSizePlusOne-1, cctx->dictID); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + assert(fhSize <= dstCapacity); + dstCapacity -= fhSize; + dst = (char*)dst + fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (!srcSize) return fhSize; /* do not generate an empty block if no input */ + + if (!ZSTD_window_update(&ms->window, src, srcSize)) { + ms->nextToUpdate = ms->window.dictLimit; + } + if (cctx->appliedParams.ldmParams.enableLdm) { + ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + } + + if (!frame) { + /* overflow check and correction for block mode */ + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, + src, (BYTE const*)src + srcSize); + } + + DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); + { size_t const cSize = frame ? 
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); + FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed"); + cctx->consumedSrcSize += srcSize; + cctx->producedCSize += (cSize + fhSize); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + RETURN_ERROR_IF( + cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne, + srcSize_wrong, + "error : pledgedSrcSize = %u, while realSrcSize >= %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + return cSize + fhSize; + } +} + +size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); +} + + +size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +{ + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); + return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); +} + +size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); + { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); +} + +/*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + ZSTD_window_update(&ms->window, src, srcSize); + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + + if (params->ldmParams.enableLdm && ls != NULL) { + ZSTD_window_update(&ls->window, src, srcSize); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); + } + + /* Assert that we the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + if (srcSize <= HASH_READ_SIZE) return 0; + + while (iend - ip > HASH_READ_SIZE) { + size_t const remaining = (size_t)(iend - ip); + size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX); + const BYTE* const ichunk = ip + chunk; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); + + if (params->ldmParams.enableLdm && ls != NULL) + ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, ¶ms->ldmParams); + + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, ichunk, dtlm); + break; + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + if (chunk >= HASH_READ_SIZE) + ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + if (chunk >= HASH_READ_SIZE) + ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); + break; + + default: + assert(0); /* not possible : not a valid strategy id */ + } + + ip = ichunk; + } + + ms->nextToUpdate = (U32)(iend - ms->window.base); + return 0; +} + + +/* Dictionaries that 
assign zero probability to symbols that show up causes problems + when FSE encoding. Refuse dictionaries that assign zero probability to symbols + that we may encounter during compression. + NOTE: This behavior is not standard and could be improved in the future. */ +static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) { + U32 s; + RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols"); + for (s = 0; s <= maxSymbolValue; ++s) { + RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols"); + } + return 0; +} + +size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + short* offcodeNCount, unsigned* offcodeMaxValue, + const void* const dict, size_t dictSize) +{ + const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */ + const BYTE* const dictEnd = dictPtr + dictSize; + dictPtr += 8; + bs->entropy.huf.repeatMode = HUF_repeat_check; + + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, + dictEnd-dictPtr, &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
Otherwise, we set it to check */ + if (!hasZeroWeights) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ + /* fill all offset symbols to avoid garbage at end of table */ + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.offcodeCTable, + offcodeNCount, MaxOff, offcodeLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + /* Every match length code must have non-zero probability */ + FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.matchlengthCTable, + matchlengthNCount, matchlengthMaxValue, matchlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); + 
RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + /* Every literal length code must have non-zero probability */ + FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.litlengthCTable, + litlengthNCount, litlengthMaxValue, litlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + dictPtr += litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + bs->rep[0] = MEM_readLE32(dictPtr+0); + bs->rep[1] = MEM_readLE32(dictPtr+4); + bs->rep[2] = MEM_readLE32(dictPtr+8); + dictPtr += 12; + + return dictPtr - (const BYTE*)dict; +} + +/* Dictionary format : + * See : + * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format + */ +/*! ZSTD_loadZstdDictionary() : + * @return : dictID, or an error code + * assumptions : magic number supposed already checked + * dictSize supposed >= 8 + */ +static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, + void* workspace) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff; + size_t dictID; + size_t eSize; + + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= 8); + assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); + + dictID = params->fParams.noDictIDFlag ? 
0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ ); + eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize); + FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed"); + dictPtr += eSize; + + { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ + } + /* All offset values <= dictContentSize + 128 KB must be representable */ + FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), ""); + /* All repCodes must be <= dictContentSize and != 0*/ + { U32 u; + for (u=0; u<3; u++) { + RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } + + bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid; + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( + ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); + return dictID; + } +} + +/** ZSTD_compress_insertDictionary() : +* @return : dictID, or an error code */ +static size_t +ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_matchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + void* workspace) +{ + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); + if ((dict==NULL) || (dictSize<8)) { + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + return 0; + } + + ZSTD_reset_compressedBlockState(bs); + + /* dict restricted modes */ + if 
(dictContentType == ZSTD_dct_rawContent) + return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( + ms, ls, ws, params, dict, dictSize, dtlm); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ + } + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( + bs, ms, ws, params, dict, dictSize, dtlm, workspace); +} + +#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) +#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6) + +/*! ZSTD_compressBegin_internal() : + * @return : 0, or an error code */ +static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if ( (cdict) + && (cdict->dictContentSize > 0) + && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0) + && (params->attachDictPref != ZSTD_dictForceLoad) ) { + return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); + } + + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize, + ZSTDcrp_makeClean, zbuff) , ""); + { size_t const dictID = cdict ? 
+ ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, dictContentType, dtlm, + cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, + dictContentType, dtlm, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; + } + return 0; +} + +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog); + /* compression parameters verification and optimization */ + FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , ""); + return ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, dtlm, + cdict, + params, pledgedSrcSize, + ZSTDb_not_buffered); +} + +/*! 
ZSTD_compressBegin_advanced() : +* @return : 0, or an error code */ +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params); + return ZSTD_compressBegin_advanced_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, + NULL /*cdict*/, + &cctxParams, pledgedSrcSize); +} + +size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params); + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); +} + +size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) +{ + return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); +} + + +/*! ZSTD_writeEpilogue() : +* Ends a frame. 
+* @return : nb of bytes written into dst (or an error code) */ +static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); + MEM_writeLE32(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } + + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); + DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum); + MEM_writeLE32(op, checksum); + op += 4; + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ + return op-ostart; +} + +size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, + dst, dstCapacity, src, srcSize, + 1 /* frame mode */, 1 /* last chunk */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed"); + endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); + FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed"); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if 
(cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + DEBUGLOG(4, "end of frame : controlling src size"); + RETURN_ERROR_IF( + cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1, + srcSize_wrong, + "error : pledgedSrcSize = %u, while realSrcSize = %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + return cSize + endResult; +} + + +static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + const ZSTD_parameters* params) +{ + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params); + DEBUGLOG(4, "ZSTD_compress_internal"); + return ZSTD_compress_advanced_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + &cctxParams); +} + +size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced"); + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + return ZSTD_compress_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + &params); +} + +/* Internal */ +size_t ZSTD_compress_advanced_internal( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int 
compressionLevel) +{ + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0); + ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params); + DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); + assert(params.fParams.contentSizeFlag == 1); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams); +} + +size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize); + assert(cctx != NULL); + return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); +} + +size_t ZSTD_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + size_t result; + ZSTD_CCtx ctxBody; + ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem); + result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel); + ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */ + return result; +} + + +/* ===== Dictionary API ===== */ + +/*! ZSTD_estimateCDictSize_advanced() : + * Estimate amount of memory that will be needed to create a dictionary with following arguments */ +size_t ZSTD_estimateCDictSize_advanced( + size_t dictSize, ZSTD_compressionParameters cParams, + ZSTD_dictLoadMethod_e dictLoadMethod) +{ + DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); + return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); +} + +size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); +} + +size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support sizeof on NULL */ + DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict)); + /* cdict may be in the workspace */ + return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict)) + + ZSTD_cwksp_sizeof(&cdict->workspace); +} + +static size_t ZSTD_initCDict_internal( + ZSTD_CDict* cdict, + const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) +{ + DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType); + assert(!ZSTD_checkCParams(cParams)); + cdict->matchState.cParams = cParams; + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { + cdict->dictContent = dictBuffer; + } else { + void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*))); + RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!"); + cdict->dictContent = internalBuffer; + memcpy(internalBuffer, dictBuffer, dictSize); + } + cdict->dictContentSize = dictSize; + + cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE); + + + /* Reset the state to no dictionary */ + ZSTD_reset_compressedBlockState(&cdict->cBlockState); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &cdict->matchState, + &cdict->workspace, + &cParams, + ZSTDcrp_makeClean, + ZSTDirp_reset, + ZSTD_resetTarget_CDict), ""); + /* (Maybe) load the dictionary + * Skips loading the dictionary if it is < 8 bytes. 
+ */ + { ZSTD_CCtx_params params; + memset(&params, 0, sizeof(params)); + params.compressionLevel = ZSTD_CLEVEL_DEFAULT; + params.fParams.contentSizeFlag = 1; + params.cParams = cParams; + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + &params, cdict->dictContent, cdict->dictContentSize, + dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; + } + } + + return 0; +} + +ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, ZSTD_customMem customMem) +{ + DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + void* const workspace = ZSTD_malloc(workspaceSize, customMem); + ZSTD_cwksp ws; + ZSTD_CDict* cdict; + + if (!workspace) { + ZSTD_free(workspace, customMem); + return NULL; + } + + ZSTD_cwksp_init(&ws, workspace, workspaceSize); + + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + assert(cdict != NULL); + ZSTD_cwksp_move(&cdict->workspace, &ws); + cdict->customMem = customMem; + cdict->compressionLevel = 0; /* signals advanced API usage */ + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dictBuffer, dictSize, + dictLoadMethod, dictContentType, + cParams) )) { + ZSTD_freeCDict(cdict); + return NULL; + } + + return cdict; + } +} + +ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); + if (cdict) + cdict->compressionLevel = compressionLevel == 0 ? ZSTD_CLEVEL_DEFAULT : compressionLevel; + return cdict; +} + +ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + return ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byRef, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); +} + +size_t ZSTD_freeCDict(ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support free on NULL */ + { ZSTD_customMem const cMem = cdict->customMem; + int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict); + ZSTD_cwksp_free(&cdict->workspace, cMem); + if (!cdictInWorkspace) { + ZSTD_free(cdict, cMem); + } + return 0; + } +} + +/*! ZSTD_initStaticCDict_advanced() : + * Generate a digested dictionary in provided memory area. 
+ * workspace: The memory area to emplace the dictionary into. + * Provided pointer must 8-bytes aligned. + * It must outlive dictionary usage. + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level + * into its relevants cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. + */ +const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) +{ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + matchStateSize; + ZSTD_CDict* cdict; + + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { + ZSTD_cwksp ws; + ZSTD_cwksp_init(&ws, workspace, workspaceSize); + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + if (cdict == NULL) return NULL; + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + + DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", + (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cParams) )) + return NULL; + + return cdict; +} + +ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) +{ + assert(cdict != NULL); + return cdict->matchState.cParams; +} + +/* ZSTD_compressBegin_usingCDict_advanced() : + * cdict must 
be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); + RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); + { ZSTD_CCtx_params params = cctx->requestedParams; + params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0 ) + && (params.attachDictPref != ZSTD_dictForceLoad) ? + ZSTD_getCParamsFromCDict(cdict) + : ZSTD_getCParams(cdict->compressionLevel, + pledgedSrcSize, + cdict->dictContentSize); + /* Increase window log to fit the entire dictionary and source if the + * source size is known. Limit the increase to 19, which is the + * window log for compression level 1 with the largest source size. + */ + if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); + U32 const limitedSrcLog = limitedSrcSize > 1 ? 
ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; + params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog); + } + params.fParams = fParams; + return ZSTD_compressBegin_internal(cctx, + NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, + cdict, + &params, pledgedSrcSize, + ZSTDb_not_buffered); + } +} + +/* ZSTD_compressBegin_usingCDict() : + * pledgedSrcSize=0 means "unknown" + * if pledgedSrcSize>0, it will enable contentSizeFlag */ +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); + return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); +} + +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. 
+ * Note that compression parameters are decided at CDict creation time + * while frame parameters are hardcoded */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + + + +/* ****************************************************************** +* Streaming +********************************************************************/ + +ZSTD_CStream* ZSTD_createCStream(void) +{ + DEBUGLOG(3, "ZSTD_createCStream"); + return ZSTD_createCStream_advanced(ZSTD_defaultCMem); +} + +ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticCCtx(workspace, workspaceSize); +} + +ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem) +{ /* CStream and CCtx are now same object */ + return ZSTD_createCCtx_advanced(customMem); +} + +size_t ZSTD_freeCStream(ZSTD_CStream* zcs) +{ + return ZSTD_freeCCtx(zcs); /* same object */ +} + + + +/*====== Initialization ======*/ + +size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_CStreamOutSize(void) +{ + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; +} + +static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, + const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, + const ZSTD_CDict* const cdict, + ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_resetCStream_internal"); + /* Finalize the compression parameters */ + params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + assert(!((dict) && (cdict))); /* 
either dict or cdict, not both */ + + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, ZSTD_dtlm_fast, + cdict, + &params, pledgedSrcSize, + ZSTDb_buffered) , ""); + + cctx->inToCompress = 0; + cctx->inBuffPos = 0; + cctx->inBuffTarget = cctx->blockSize + + (cctx->blockSize == pledgedSrcSize); /* for small input: avoid automatic flush on reaching end of block, since it would require to add a 3-bytes null block to end frame */ + cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; + cctx->streamStage = zcss_load; + cctx->frameEnded = 0; + return 0; /* ready to go */ +} + +/* ZSTD_resetCStream(): + * pledgedSrcSize == 0 means "unknown" */ +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period. + * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. + * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +/*! ZSTD_initCStream_internal() : + * Note : for lib/compress only. Used by zstdmt_compress.c. 
+ * Assumption 1 : params are valid + * Assumption 2 : either dict, or cdict, is defined, not both */ +size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_internal"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + zcs->requestedParams = *params; + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if (dict) { + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + } else { + /* Dictionary is cleared if !cdict */ + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + } + return 0; +} + +/* ZSTD_initCStream_usingCDict_advanced() : + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */ +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + zcs->requestedParams.fParams = fParams; + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + +/* note : cdict must outlive compression session */ +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + + +/* ZSTD_initCStream_advanced() : + * pledgedSrcSize must be exact. + * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. 
+ * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */ +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pss) +{ + /* for compatibility with older programs relying on this behavior. + * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. + * This line will be removed in the future. + */ + U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, &params); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period. + * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. + * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? 
ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_srcSize"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + return 0; +} + +/*====== Compression ======*/ + +static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) +{ + size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; + if (hintInSize==0) hintInSize = cctx->blockSize; + return hintInSize; +} + +/** ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants + * non-static, because can be called from zstdmt_compress.c + * @return : hint size for next input */ +static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) +{ + const char* const istart = (const char*)input->src; + const char* const iend = input->size != 0 ? istart + input->size : istart; + const char* ip = input->pos != 0 ? istart + input->pos : istart; + char* const ostart = (char*)output->dst; + char* const oend = output->size != 0 ? ostart + output->size : ostart; + char* op = output->pos != 0 ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ + DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + assert(output->pos <= output->size); + assert(input->pos <= input->size); + + while (someMoreWork) { + switch(zcs->streamStage) + { + case zcss_init: + RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!"); + + case zcss_load: + if ( (flushMode == ZSTD_e_end) + && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip)) /* enough dstCapacity */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ + size_t const cSize = ZSTD_compressEnd(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; + op += cSize; + zcs->frameEnded = 1; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + someMoreWork = 0; break; + } + /* complete loading into inBuffer */ + { size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; + if (loaded != 0) + ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (zcs->inBuffPos == zcs->inToCompress) ) { + /* empty */ + someMoreWork = 0; break; + } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { void* cDst; + size_t cSize; + size_t const iSize = zcs->inBuffPos - zcs->inToCompress; + size_t oSize = oend-op; + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + if (oSize >= ZSTD_compressBound(iSize)) 
+ cDst = op; /* compress into output buffer, to skip flush stage */ + else + cDst = zcs->outBuff, oSize = zcs->outBuffSize; + cSize = lastBlock ? + ZSTD_compressEnd(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : + ZSTD_compressContinue(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ + zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; + if (zcs->inBuffTarget > zcs->inBuffSize) + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; + if (cDst == op) { /* no need to flush */ + op += cSize; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed directly in outBuffer"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + } + break; + } + zcs->outBuffContentSize = cSize; + zcs->outBuffFlushedSize = 0; + zcs->streamStage = zcss_flush; /* pass-through to flush stage */ + } + /* fall-through */ + case zcss_flush: + DEBUGLOG(5, "flush stage"); + { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op), + zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u", + (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed); + if (flushed) + op += flushed; + zcs->outBuffFlushedSize += flushed; + if (toFlush!=flushed) { + /* flush not fully completed, presumably because dst is too small */ + assert(op==oend); + someMoreWork = 0; + break; + } + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed on flush"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + 
break; + } + zcs->streamStage = zcss_load; + break; + } + + default: /* impossible */ + assert(0); + } + } + + input->pos = ip - istart; + output->pos = op - ostart; + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); +} + +/* ZSTD_nextInputSizeHint_MTorST() : next-input-size hint for either mode; delegates to ZSTDMT_nextInputSizeHint() when ZSTD_MULTITHREAD is defined and the applied nbWorkers is >= 1, and to ZSTD_nextInputSizeHint() otherwise. */ static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers >= 1) { + assert(cctx->mtctx != NULL); + return ZSTDMT_nextInputSizeHint(cctx->mtctx); + } +#endif + return ZSTD_nextInputSizeHint(cctx); + +} + +/* ZSTD_compressStream() : runs ZSTD_compressStream2() with ZSTD_e_continue, then returns the next-input-size hint (errors are handled through FORWARD_IF_ERROR). */ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , ""); + return ZSTD_nextInputSizeHint_MTorST(zcs); +} + + +/* ZSTD_compressStream2() : checks that both buffer positions are within their sizes, performs the one-time initialization while streamStage == zcss_init, then compresses through ZSTDMT_compressStream_generic() (applied nbWorkers > 0, ZSTD_MULTITHREAD builds only) or ZSTD_compressStream_generic(). The single-thread path returns outBuffContentSize - outBuffFlushedSize; the multi-thread path returns the value reported by ZSTDMT_compressStream_generic(). */ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp); + /* check conditions */ + RETURN_ERROR_IF(output->pos > output->size, GENERIC, "invalid buffer"); + RETURN_ERROR_IF(input->pos > input->size, GENERIC, "invalid buffer"); + assert(cctx!=NULL); + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; /* local copy : cctx->prefixDict is zeroed just below */ + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present.
*/ + memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ + assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */ + params.cParams = ZSTD_getCParamsFromCCtxParams( + &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/); + + +#ifdef ZSTD_MULTITHREAD + if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { + params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ + } + if (params.nbWorkers > 0) { + /* mt context creation */ + if (cctx->mtctx == NULL) { + DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u", + params.nbWorkers); + cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem); + RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!"); + } + /* mt compression */ + DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers); + FORWARD_IF_ERROR( ZSTDMT_initCStream_internal( + cctx->mtctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , ""); + cctx->streamStage = zcss_load; + cctx->appliedParams.nbWorkers = params.nbWorkers; + } else +#endif + { FORWARD_IF_ERROR( ZSTD_resetCStream_internal(cctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, + params, cctx->pledgedSrcSizePlusOne-1) , ""); + assert(cctx->streamStage == zcss_load); + assert(cctx->appliedParams.nbWorkers == 0); + } } + /* end of transparent initialization stage */ + + /* compression stage */ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + int const forceMaxProgress = (endOp == ZSTD_e_flush || endOp == ZSTD_e_end); + size_t flushMin; + assert(forceMaxProgress || endOp == ZSTD_e_continue /* Protection for a new flush type
*/); + if (cctx->cParamsChanged) { + ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams); + cctx->cParamsChanged = 0; + } + do { + flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); + if ( ZSTD_isError(flushMin) + || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); /* the session is reset before any error is forwarded */ + } + FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed"); + } while (forceMaxProgress && flushMin != 0 && output->pos < output->size); + DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic"); + /* Either we don't require maximum forward progress, we've finished the + * flush, or we are out of output space. + */ + assert(!forceMaxProgress || flushMin == 0 || output->pos == output->size); + return flushMin; + } +#endif + FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , ""); + DEBUGLOG(5, "completed ZSTD_compressStream2"); + return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */ +} + +/* ZSTD_compressStream2_simpleArgs() : same operation as ZSTD_compressStream2(), with each buffer passed as a (pointer, size, position) triple; *dstPos and *srcPos are read on entry and overwritten with the resulting positions. Returns the result of ZSTD_compressStream2(). */ size_t ZSTD_compressStream2_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) +{ + ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; + ZSTD_inBuffer input = { src, srcSize, *srcPos }; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ + size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; +} + +/* ZSTD_compress2() : resets the session (ZSTD_reset_session_only), then compresses the whole of src into dst with a single ZSTD_e_end call. Returns the number of bytes written, or the dstSize_tooSmall error when that call reports a non-zero remainder. */ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize); + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); + { size_t oPos = 0; + size_t iPos = 0; + size_t const result = ZSTD_compressStream2_simpleArgs(cctx, + dst, dstCapacity, &oPos, + src, srcSize, &iPos, + ZSTD_e_end); +
FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); + RETURN_ERROR(dstSize_tooSmall, ""); + } + assert(iPos == srcSize); /* all input is expected consumed */ + return oPos; /* number of bytes written into dst */ + } +} + +/*====== Finalize ======*/ + +/*! ZSTD_flushStream() : + * Calls ZSTD_compressStream2() with an empty input buffer and ZSTD_e_flush. + * @return : amount of data remaining to flush */ +size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) +{ + ZSTD_inBuffer input = { NULL, 0, 0 }; + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); +} + + +/* ZSTD_endStream() : calls ZSTD_compressStream2() with an empty input buffer and ZSTD_e_end. With nbWorkers > 0 the value reported by that call is returned unchanged. In single-thread mode, while the frame is not yet ended, the estimate additionally counts ZSTD_BLOCKHEADERSIZE plus 4 bytes when checksumFlag is set. */ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) +{ + ZSTD_inBuffer input = { NULL, 0, 0 }; + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); + FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; + size_t const checksumSize = (size_t)(zcs->frameEnded ?
0 : zcs->appliedParams.fParams.checksumFlag * 4); + size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize; + DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush); + return toFlush; + } +} + + +/*-===== Pre-defined compression levels =====-*/ + +#define ZSTD_MAX_CLEVEL 22 /* highest row index of each table below */ +int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } +int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } /* negative levels are stored as targetLength by ZSTD_getCParams_internal() */ + +/* ZSTD_defaultCParameters[tableID][row] : tableID (0..3) is selected from the estimated size by ZSTD_getCParams_internal(), row is the compression level, and row 0 is the base for negative levels. Column legend : presumably windowLog, chainLog, hashLog, searchLog, minMatch, targetLength, strategy -- TODO confirm against the ZSTD_compressionParameters declaration. */ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { +{ /* "default" - for any srcSize > 256 KB */ + /* W, C, H, S, L, TL, strat */ + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ + { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ + { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ + { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ + { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */ + { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */ + { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ + { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ + { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ + { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ + { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ + { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ + { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ + { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ + { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ + { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ +}, +{ /* for srcSize <= 256 KB */ + /* W, C, H, S, L, TL, strat */ + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for
negative levels */ + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ + { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ + { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ + { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ + { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 128 KB */ + /* W, C, H, S, L, TL, strat */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ + { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ + { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ + { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ + { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /*
level 12 */ + { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ + { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ + { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ + { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 16 KB */ + /* W, C, H, S, L, TL, strat */ + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ + { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ + { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ + { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ + { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ + { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ + { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +}; + +/*!
ZSTD_getCParams_internal() : + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. */ +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) +{ + int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN; + size_t const addedSize = unknown && dictSize > 0 ? 500 : 0; + U64 const rSize = unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize; + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); + int row = compressionLevel; + DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel); + if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ + if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */ + if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL; + { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; + if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel); /* acceleration factor */ + /* refine parameters based on srcSize & dictSize */ + return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize); + } +} + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Size values are optional, provide 0 if not known or unused */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) +{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize); +} + +/*! ZSTD_getParams() : + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). 
+ * Fields of `ZSTD_frameParameters` are set to default values */ +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); + memset(¶ms, 0, sizeof(params)); + params.cParams = cParams; + params.fParams.contentSizeFlag = 1; + return params; +} + +/*! ZSTD_getParams() : + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize); +} +/**** ended inlining compress/zstd_compress.c ****/ +/**** start inlining compress/zstd_double_fast.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_double_fast.h ****/ + + +/* ZSTD_fillDoubleHashTable() : inserts the positions from ms->nextToUpdate up to (end - HASH_READ_SIZE) into the two tables used by the double-fast matcher : ms->chainTable serves as the small hash (mls bytes) and ms->hashTable as the large hash (8 bytes). */ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. + */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const current = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) + hashSmall[smHash] = current + i; + if (i == 0 || hashLarge[lgHash] == 0) /* position 0 always overwrites; the others only fill empty slots */ + hashLarge[lgHash] = current + i; + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + + +/* ZSTD_compressBlock_doubleFast_generic() : block matcher that looks up a long (8-byte) and a short (mls-byte) hash per position. mls and dictMode are passed as constants by the dispatchers (template parameters); dictMode must be ZSTD_noDict or ZSTD_dictMatchState (asserted). Updates rep[] for the next block and returns the size of the last literals run. */ FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */, ZSTD_dictMode_e const dictMode) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart -
base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; /* keeps a repcode that gets zeroed during init, so it can be restored when reps are saved */ + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = + dictMode == ZSTD_dictMatchState ? + &dms->cParams : NULL; + const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ? + dms->hashTable : NULL; + const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ? + dms->chainTable : NULL; + const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ? + dms->window.dictLimit : 0; + const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? + dms->window.base : NULL; + const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ? + dictBase + dictStartIndex : NULL; + const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? + dms->window.nextSrc : NULL; + const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? + prefixLowestIndex - (U32)(dictEnd - dictBase) : + 0; + const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ? + dictCParams->hashLog : hBitsL; + const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ?
+ dictCParams->chainLog : hBitsS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); + + assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); + + /* if a dictionary is attached, it must be within window range */ + if (dictMode == ZSTD_dictMatchState) { + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + } + + /* init */ + ip += (dictAndPrefixLength == 0); /* skip the first position when neither prefix nor dictionary content precedes it */ + if (dictMode == ZSTD_noDict) { + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + } + if (dictMode == ZSTD_dictMatchState) { + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + } + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); + size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); + size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); + U32 const current = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; + const BYTE* matchLong = base + matchIndexL; + const BYTE* match = base + matchIndexS; + const U32 repIndex = current + 1 - offset_1; + const BYTE* repMatch = (dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex) ?
+ dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + hashLong[h2] = hashSmall[h] = current; /* update hash tables */ + + /* check dictMatchState repcode */ + if (dictMode == ZSTD_dictMatchState + && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + goto _match_stored; + } + + /* check noDict repcode */ + if ( dictMode == ZSTD_noDict + && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + goto _match_stored; + } + + if (matchIndexL > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchLong) == MEM_read64(ip)) { + mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; + offset = (U32)(ip-matchLong); + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dictMatchState long match */ + U32 const dictMatchIndexL = dictHashLong[dictHL]; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + + if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) { + mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8; + offset = (U32)(current - dictMatchIndexL - dictIndexDelta); + while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */ + goto _match_found; + } } + + if (matchIndexS > prefixLowestIndex) { + /* check prefix short match */ + if
(MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dictMatchState short match */ + U32 const dictMatchIndexS = dictHashSmall[dictHS]; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + + if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } } + + ip += ((ip-anchor) >> kSearchStrength) + 1; /* no match : the step grows with the distance from the last anchor */ +#if defined(__aarch64__) + PREFETCH_L1(ip+256); +#endif + continue; + +_search_next_long: + + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = current + 1; + + /* check prefix long +1 match */ + if (matchIndexL3 > prefixLowestIndex) { + if (MEM_read64(matchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; + ip++; + offset = (U32)(ip-matchL3); + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dict long +1 match */ + U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8; + ip++; + offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta); + while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */ + goto _match_found; + } } } + + /* if no long +1 match, explore the short match we found */ + if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) { + mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4; +
offset = (U32)(current - matchIndexS); + while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } else { + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + offset = (U32)(ip - match); + while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + + /* fall-through */ + +_match_found: + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = current+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + if (dictMode == ZSTD_dictMatchState) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState + && repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ?
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } + + if (dictMode == ZSTD_noDict) { + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + } /* while (ip < ilimit) */ + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ?
offset_2 : offsetSaved; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +/* ZSTD_compressBlock_doubleFast() : dispatches on ms->cParams.minMatch to the ZSTD_noDict instance of the generic matcher; any value other than 5, 6 or 7 uses the mls==4 instance. */ size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const U32 mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict); + case 5 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict); + case 6 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict); + case 7 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict); + } +} + + +/* ZSTD_compressBlock_doubleFast_dictMatchState() : same dispatch on minMatch, using the ZSTD_dictMatchState instance of the generic matcher. */ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const U32 mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState); + case 5 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState); + case 6 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState); + case 7 : + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState); + } +} + + +/* ZSTD_compressBlock_doubleFast_extDict_generic() : double-fast matcher for the case where earlier data is addressed through ms->window.dictBase (indices below prefixStartIndex) rather than base. Falls back to the ZSTD_noDict variant when prefixStartIndex == dictStartIndex. Updates rep[] and returns the size of the last literals run. */ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip
= istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); + const U32 dictStartIndex = lowLimit; + const U32 dictLimit = ms->window.dictLimit; + const U32 prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dictBase + prefixStartIndex; + U32 offset_1=rep[0], offset_2=rep[1]; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize); + + /* if extDict is invalidated due to maxDistance, switch to "regular" variant */ + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict); + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 matchIndex = hashSmall[hSmall]; + const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; + const BYTE* match = matchBase + matchIndex; + + const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8); + const U32 matchLongIndex = hashLong[hLong]; + const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base; + const BYTE* matchLong = matchLongBase + matchLongIndex; + + const U32 current = (U32)(ip-base); + const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ + const BYTE* const repBase = repIndex < prefixStartIndex ? 
dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + size_t mLength; + hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ + + if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ + & (repIndex > dictStartIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart; + U32 offset; + mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8; + offset = current - matchLongIndex; + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + U32 const matchIndex3 = hashLong[h3]; + const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base; + const BYTE* match3 = match3Base + matchIndex3; + U32 offset; + hashLong[h3] = current + 1; + if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { + const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? 
dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8; + ip++; + offset = current+1 - matchIndex3; + while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ + } else { + const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; + offset = current - matchIndex; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; + continue; + } } + + /* move to next sequence start */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = current+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ + & (repIndex2 > dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +/* ZSTD_compressBlock_doubleFast_extDict() : reads minMatch from ms->cParams and forwards all arguments to ZSTD_compressBlock_doubleFast_extDict_generic() with that value passed as a literal constant (4, 5, 6 or 7). Any other minMatch value, including 3, takes the mls==4 path. The return value is the one produced by the generic function, i.e. the size of the last literals run (iend - anchor). */ +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + } +} +/**** ended inlining compress/zstd_double_fast.c ****/ +/**** start inlining compress/zstd_fast.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_fast.h ****/ + +/* ZSTD_fillHashTable() : loads positions into ms->hashTable, starting at index ms->nextToUpdate and walking in steps of fastHashFillStep (3) while ip + 3 < (end - HASH_READ_SIZE) + 2. The position at each step always overwrites its hash slot. Unless dtlm == ZSTD_dtlm_fast, the positions in between (ip+1, ip+2) are also hashed, but stored only when their slot still holds 0. Each hash covers mls = cParams->minMatch bytes and is hBits = cParams->hashLog bits wide. ms->nextToUpdate is read but not modified here. */ +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hBits = cParams->hashLog; + U32 const mls = cParams->minMatch; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ + for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { + U32 const current = (U32)(ip - base); + size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls); + hashTable[hash0] = current; + if (dtlm == ZSTD_dtlm_fast) continue; + /* Only load extra positions for ZSTD_dtlm_full */ + { U32 p; + for (p = 1; p < fastHashFillStep; ++p) { + size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls); + if (hashTable[hash] == 0) { /* not yet filled */ + hashTable[hash] = current + p; + } } } } +} + +/* ZSTD_compressBlock_fast_generic() : single-hash-table match finder for one block. Each loop iteration hashes two adjacent positions (ip0 and ip1 = ip0 + 1) and also tests a repcode candidate at ip0 + 2 using offset_1. Hash-table candidates are accepted only when their index is above prefixStartIndex. Found sequences are emitted through ZSTD_storeSeq(), the two repcode offsets are written back into rep[] on exit, and the return value is the number of trailing literals (iend - anchor). mls is the hash length chosen by the dispatching caller. */ +FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_fast_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 prefixStartIndex = 
ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; + + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); + ip1 = ip0 + 1; + { U32 const current = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + } + + /* Main Search Loop */ +#ifdef __INTEL_COMPILER + /* From intel 'The vector pragma indicates that the loop should be + * vectorized if it is legal to do so'. Can be used together with + * #pragma ivdep (but have opted to exclude that because intel + * warns against using it).*/ + #pragma vector always +#endif + while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */ + size_t mLength; + BYTE const* ip2 = ip0 + 2; + size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls); + U32 const val0 = MEM_read32(ip0); + size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls); + U32 const val1 = MEM_read32(ip1); + U32 const current0 = (U32)(ip0-base); + U32 const current1 = (U32)(ip1-base); + U32 const matchIndex0 = hashTable[h0]; + U32 const matchIndex1 = hashTable[h1]; + BYTE const* repMatch = ip2 - offset_1; + const BYTE* match0 = base + matchIndex0; + const BYTE* match1 = base + matchIndex1; + U32 offcode; + +#if defined(__aarch64__) + PREFETCH_L1(ip0+256); +#endif + + hashTable[h0] = current0; /* update hash table */ + hashTable[h1] = current1; /* update hash table */ + + assert(ip0 + 1 == ip1); + + if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) { + mLength = (ip2[-1] == repMatch[-1]) ? 
1 : 0; + ip0 = ip2 - mLength; + match0 = repMatch - mLength; + mLength += 4; + offcode = 0; + goto _match; + } + if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) { + /* found a regular match */ + goto _offset; + } + if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) { + /* found a regular match after one literal */ + ip0 = ip1; + match0 = match1; + goto _offset; + } + { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize; + assert(step >= 2); + ip0 += step; + ip1 += step; + continue; + } +_offset: /* Requires: ip0, match0 */ + /* Compute the offset code */ + offset_2 = offset_1; + offset_1 = (U32)(ip0-match0); + offcode = offset_1 + ZSTD_REP_MOVE; + mLength = 4; + /* Count the backwards match length */ + while (((ip0>anchor) & (match0>prefixStart)) + && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */ + +_match: /* Requires: ip0, match0, offcode */ + /* Count the forward length */ + mLength += ZSTD_count(ip0+mLength, match0+mLength, iend); + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH); + /* match found */ + ip0 += mLength; + anchor = ip0; + + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */ + while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4; + { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH); + anchor 
= ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } + ip1 = ip0 + 1; + } + + /* save reps for next block : a repcode whose value is 0 at this point is replaced by offsetSaved */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? offset_2 : offsetSaved; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +/* ZSTD_compressBlock_fast() : entry point for the variant without an attached dictionary match state (asserts ms->dictMatchState == NULL). Reads minMatch from ms->cParams and forwards to ZSTD_compressBlock_fast_generic() with that value passed as a literal constant (4, 5, 6 or 7); any other value, including 3, takes the mls==4 path. Returns what the generic function returns. */ +size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7); + } +} +/* ZSTD_compressBlock_fast_dictMatchState_generic() : variant used when ms->dictMatchState (dms) is attached. Repcode candidates whose index is below prefixStartIndex are read from the dictionary buffer (dms->window.base). When the local hash-table entry is not above prefixStartIndex, the dictionary hash table (dms->hashTable) is probed instead. Dictionary indices and local indices differ by dictIndexDelta. Sequences are emitted through ZSTD_storeSeq(), rep[] is updated on exit, and the return value is the number of trailing literals (iend - anchor). */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; + const U32* const dictHashTable = dms->hashTable; 
+ const U32 dictStartIndex = dms->window.dictLimit; + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); + const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); + const U32 dictHLog = dictCParams->hashLog; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. */ + const U32 maxDistance = 1U << cParams->windowLog; + const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + + /* ensure there will be no underflow + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + + /* init : advance ip by one when dictAndPrefixLength == 0, i.e. when there are no bytes before ip in either the dictionary or the prefix */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); + ip += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + size_t const h = ZSTD_hashPtr(ip, hlog, mls); + U32 const current = (U32)(ip-base); + U32 const matchIndex = hashTable[h]; + const BYTE* match = base + matchIndex; + const U32 repIndex = current + 1 - offset_1; + const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
+ dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + hashTable[h] = current; /* update hash table */ + + if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + } else if ( (matchIndex <= prefixStartIndex) ) { + size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); + U32 const dictMatchIndex = dictHashTable[dictHash]; + const BYTE* dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex <= dictStartIndex || + MEM_read32(dictMatch) != MEM_read32(ip)) { + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } else { + /* found a dict match */ + U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta); + mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; + while (((ip>anchor) & (dictMatch>dictStart)) + && (ip[-1] == dictMatch[-1])) { + ip--; dictMatch--; mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } + } else if (MEM_read32(match) != MEM_read32(ip)) { + /* it's not a match, and we're not going to check the dictionary */ + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } else { + /* found a regular match */ + U32 const offset = (U32)(ip-match); + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + while (((ip>anchor) & (match>prefixStart)) + && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + 
ZSTD_REP_MOVE, mLength-MINMATCH); + } + + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + assert(base+current+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + } + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? 
offset_2 : offsetSaved; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} +/* ZSTD_compressBlock_fast_dictMatchState() : entry point for the variant that requires an attached dictionary match state (asserts ms->dictMatchState != NULL). Reads minMatch from ms->cParams and forwards to ZSTD_compressBlock_fast_dictMatchState_generic() with that value passed as a literal constant (4, 5, 6 or 7); any other value, including 3, takes the mls==4 path. */ +size_t ZSTD_compressBlock_fast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + assert(ms->dictMatchState != NULL); + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); + } +} + +/* ZSTD_compressBlock_fast_extDict_generic() : variant for a window made of two segments : indices below prefixStartIndex are addressed through ms->window.dictBase, the others through ms->window.base. dictStartIndex is the lowest usable index as given by ZSTD_getLowestMatchIndex(). When prefixStartIndex == dictStartIndex the function hands the whole block to ZSTD_compressBlock_fast_generic(). Otherwise it emits sequences through ZSTD_storeSeq(), stores the two repcode offsets back into rep[], and returns the number of trailing literals (iend - anchor). */ +static size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); + const U32 dictStartIndex = lowLimit; + const BYTE* const dictStart = dictBase + dictStartIndex; + const U32 dictLimit = ms->window.dictLimit; + const U32 prefixStartIndex = dictLimit < lowLimit ? 
lowLimit : dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const dictEnd = dictBase + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; + + DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1); + + /* switch to "regular" variant if extDict is invalidated due to maxDistance */ + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls); + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t h = ZSTD_hashPtr(ip, hlog, mls); + const U32 matchIndex = hashTable[h]; + const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; + const BYTE* match = matchBase + matchIndex; + const U32 current = (U32)(ip-base); + const U32 repIndex = current + 1 - offset_1; + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + hashTable[h] = current; /* update hash table */ + DEBUGLOG(7, "offset_1 = %u , current = %u", offset_1, current); + assert(offset_1 <= current +1); /* check repIndex */ + + if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + } else { + if ( (matchIndex < dictStartIndex) || + (MEM_read32(match) != MEM_read32(ip)) ) { + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } + { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? 
dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; + U32 const offset = current - matchIndex; + size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + offset_2 = offset_1; offset_1 = offset; /* update offset history */ + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ip += mLength; + anchor = ip; + } } + + if (ip <= ilimit) { + /* Fill Table */ + hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; + hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH); + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +/* ZSTD_compressBlock_fast_extDict() : reads minMatch from ms->cParams and forwards all arguments to ZSTD_compressBlock_fast_extDict_generic() with that value passed as a literal constant (4, 5, 6 or 7). Any other minMatch value, including 3, takes the mls==4 path. The return value is the one produced by the generic function, i.e. the size of the last literals run. */ +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + } +} +/**** ended inlining compress/zstd_fast.c ****/ +/**** start inlining compress/zstd_lazy.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_lazy.h ****/ + + +/*-************************************* +* Binary Tree search +***************************************/ +/* ZSTD_updateDUBT() : registers every index from ms->nextToUpdate up to, but not including, the index of ip. For each index idx : hashTable[h] receives idx, the previous content of that hash slot is stored in the first cell of the pair bt[2*(idx&btMask)], and the second cell of the pair is set to ZSTD_DUBT_UNSORTED_MARK. On exit ms->nextToUpdate equals the index of ip. iend is only used by an assert (ip + 8 <= iend). */ +static void +ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + if (idx != target) + DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)", + idx, target, ms->window.dictLimit); + assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */ + (void)iend; + + assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */ + for ( ; idx < target ; idx++) { + size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */ + U32 const matchIndex = hashTable[h]; + + U32* const nextCandidatePtr = bt + 2*(idx&btMask); + U32* const sortMarkPtr = nextCandidatePtr + 1; + + DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx); + hashTable[h] = idx; /* Update Hash Table */ + *nextCandidatePtr = matchIndex; /* update BT like a chain */ + *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK; + } + ms->nextToUpdate = target; +} + + +/** ZSTD_insertDUBT1() : + * sort one already inserted but unsorted position + * assumption : current >= btlow == (current - btmask) + * doesn't fail */ +static void +ZSTD_insertDUBT1(ZSTD_matchState_t* ms, + U32 current, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + size_t 
commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current; + const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = smallerPtr + 1; + U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */ + U32 dummy32; /* to be nullified at the end */ + U32 const windowValid = ms->window.lowLimit; + U32 const maxDistance = 1U << cParams->windowLog; + U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid; + + + DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)", + current, dictLimit, windowLow); + assert(current >= btLow); + assert(ip < iend); /* condition for ZSTD_count */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < current); + /* note : all candidates are now supposed sorted, + * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK + * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */ + + if ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ + || (current < dictLimit) /* both in extDict */) { + const BYTE* const mBase = ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit)) ? 
+ base : dictBase; + assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ + || (current < dictLimit) ); + match = mBase + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* preparation for next read of match[matchLength] */ + } + + DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ", + current, matchIndex, (U32)matchLength); + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u", + matchIndex, btLow, nextPtr[1]); + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u", + matchIndex, btLow, nextPtr[0]); + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } +/* write 0 through both pending link pointers ; either of them may have been redirected to the local dummy32 when the walk stopped at btLow */ + *smallerPtr = *largerPtr = 0; +} + +/* ZSTD_DUBT_findBetterDictMatch() : looks in the attached dictionary match state (ms->dictMatchState) for a match preferable to the one already described by bestLength and *offsetPtr. The walk starts from the dictionary hash table entry for ip and follows the dictionary chainTable for at most nbCompares steps while the index stays above the dictionary lowLimit. Candidate indices are shifted by dictIndexDelta before the offset is computed. A longer candidate replaces the current best only if 4 * (length gain) exceeds the difference of ZSTD_highbit32() between the new and the current offset value. In this function the dictionary tables are only read. Returns the resulting bestLength ; dictMode is only checked by an assert. */ +static size_t +ZSTD_DUBT_findBetterDictMatch ( + ZSTD_matchState_t* 
ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, + U32 nbCompares, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_matchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 dictMatchIndex = dictHashTable[h]; + + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + U32 const current = (U32)(ip-base); + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictEnd = dms->window.nextSrc; + U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base); + U32 const dictLowLimit = dms->window.lowLimit; + U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit; + + U32* const dictBt = dms->chainTable; + U32 const btLog = dmsCParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? 
dictLowLimit : dictHighLimit - btMask; + + size_t commonLengthSmaller=0, commonLengthLarger=0; + + (void)dictMode; + assert(dictMode == ZSTD_dictMatchState); + + while (nbCompares-- && (dictMatchIndex > dictLowLimit)) { + U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dictBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (dictMatchIndex+matchLength >= dictHighLimit) + match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", + current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthLarger = 
matchLength; + dictMatchIndex = nextPtr[0]; + } + } + + if (bestLength >= MINMATCH) { + U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + current, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + +} + +/* ZSTD_DUBT_findBestMatch() : works in three phases. (1) Starting from the hash table entry for ip, follows candidates still tagged ZSTD_DUBT_UNSORTED_MARK and turns their mark cells into a reversed chain. (2) Walks that chain back and sorts each stacked candidate with ZSTD_insertDUBT1(). (3) Searches the tree for the longest match while inserting the current position, for at most 1 << cParams->searchLog comparisons. When dictMode == ZSTD_dictMatchState and comparisons remain, the search continues with ZSTD_DUBT_findBetterDictMatch(). The offset code of the selected match is written to *offsetPtr, ms->nextToUpdate is set to matchEndIdx - 8, and the best length found is returned. */ +static size_t +ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + + const BYTE* const base = ms->window.base; + U32 const current = (U32)(ip-base); + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog); + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= current) ? 
0 : current - btMask; + U32 const unsortLimit = MAX(btLow, windowLow); + + U32* nextCandidate = bt + 2*(matchIndex&btMask); + U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1; + U32 nbCompares = 1U << cParams->searchLog; + U32 nbCandidates = nbCompares; + U32 previousCandidate = 0; + + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current); + assert(ip <= iend-8); /* required for h calculation */ + + /* reach end of unsorted candidates list */ + while ( (matchIndex > unsortLimit) + && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK) + && (nbCandidates > 1) ) { + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted", + matchIndex); + *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */ + previousCandidate = matchIndex; + matchIndex = *nextCandidate; + nextCandidate = bt + 2*(matchIndex&btMask); + unsortedMark = bt + 2*(matchIndex&btMask) + 1; + nbCandidates --; + } + + /* nullify last candidate if it's still unsorted + * simplification, detrimental to compression ratio, beneficial for speed */ + if ( (matchIndex > unsortLimit) + && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) { + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u", + matchIndex); + *nextCandidate = *unsortedMark = 0; + } + + /* batch sort stacked candidates */ + matchIndex = previousCandidate; + while (matchIndex) { /* will end on matchIndex == 0 */ + U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1; + U32 const nextCandidateIdx = *nextCandidateIdxPtr; + ZSTD_insertDUBT1(ms, matchIndex, iend, + nbCandidates, unsortLimit, dictMode); + matchIndex = nextCandidateIdx; + nbCandidates++; + } + + /* find longest match */ + { size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32* smallerPtr = bt 
+ 2*(current&btMask); + U32* largerPtr = bt + 2*(current&btMask) + 1; + U32 matchEndIdx = current + 8 + 1; + U32 dummy32; /* to be nullified at the end */ + size_t bestLength = 0; + + matchIndex = hashTable[h]; + hashTable[h] = current; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match; + + if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) { + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) + bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any + * further in this loop, make sure we + * skip checking in the dictionary. 
*/ + } + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, + offsetPtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { + U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + current, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + } +} + + +/** ZSTD_BtFindBestMatch() : Tree updater, providing best match. Returns 0 without searching when ip lies below ms->window.base + ms->nextToUpdate ; otherwise calls ZSTD_updateDUBT() and returns the result of ZSTD_DUBT_findBestMatch(). mls and dictMode are expected to be compile-time constants supplied by the *_selectMLS() dispatchers. */ +FORCE_INLINE_TEMPLATE size_t +ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) +{ + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); + return ZSTD_DUBT_findBestMatch(ms, ip, 
iLimit, offsetPtr, mls, dictMode); +} + + +/* Dispatches on ms->cParams.minMatch so that ZSTD_BtFindBestMatch() receives mls as a literal constant (4, 5 or 6) ; dictMode is fixed to ZSTD_noDict. */ static size_t +ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 : every value other than 5, 6 and 7 uses mls=4 */ + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); + case 7 : /* 7 shares the mls=6 variant */ + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + } +} + + +/* Same dispatch as ZSTD_BtFindBestMatch_selectMLS(), with dictMode fixed to ZSTD_dictMatchState. */ static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); + case 7 : /* 7 shares the mls=6 variant */ + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); + } +} + + +/* Same dispatch as ZSTD_BtFindBestMatch_selectMLS(), with dictMode fixed to ZSTD_extDict. */ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); + case 7 : /* 7 shares the mls=6 variant */ + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); + } +} + + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +static U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; + U32* const chainTable = ms->chainTable; + const U32 chainMask = (1 << cParams->chainLog) - 1; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + while(idx < target) { /* catch up : insert every position from nextToUpdate up to target (excluded) */ + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; /* chain idx to the previous head of its hash slot */ + hashTable[h] = idx; /* idx becomes the new head */ + idx++; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; /* current head for ip's hash ; ip itself has not been inserted */ +} + +/* Non-static wrapper : takes both the table logs and minMatch from ms->cParams. */ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); +} + + +/* inlining is important to hardwire a hot branch (template emulation). Walks the hash chain for ip, making at most 1 << searchLog attempts (shared with the dictMatchState pass), writes the offset code of the best candidate to *offsetPtr and returns its length ; the return value stays at 3 when no match of 4 or more bytes is found, in which case *offsetPtr is left untouched. */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_HcFindBestMatch_generic ( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const chainTable = ms->chainTable; + const U32 chainSize = (1 << cParams->chainLog); + const U32 chainMask = chainSize-1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 current = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? 
current - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; /* the maxDistance clamp is only applied when no dictionary is loaded */ + const U32 minChain = current > chainSize ? current - chainSize : 0; + U32 nbAttempts = 1U << cParams->searchLog; /* search budget, shared with the dictMatchState pass below */ + size_t ml=4-1; /* best length so far ; starting at 3 means only candidates of 4+ bytes are accepted */ + + /* HC4 match finder */ + U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); + + for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + if (match[ml] == ip[ml]) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = current - matchIndex + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= minChain) break; /* positions at or below current-chainSize no longer own a slot in the masked chain table */ + matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); + } + + if (dictMode == ZSTD_dictMatchState) { + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const U32* const dmsChainTable = dms->chainTable; + const U32 dmsChainSize = (1 << dms->cParams.chainLog); + const U32 dmsChainMask = dmsChainSize - 1; + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + const U32 dmsMinChain = dmsSize > dmsChainSize ? 
dmsSize - dmsChainSize : 0; + + matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)]; + + for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; /* the dms index is shifted by dictLimit - dmsSize before the distance is taken */ + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= dmsMinChain) break; + matchIndex = dmsChainTable[matchIndex & dmsChainMask]; + } + } + + return ml; /* still 3 (the initial value) when nothing of length >= 4 was found */ +} + + +/* Dispatches on ms->cParams.minMatch so that ZSTD_HcFindBestMatch_generic() receives mls as a literal constant (4, 5 or 6) ; dictMode is fixed to ZSTD_noDict. */ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); + case 7 : /* 7 shares the mls=6 variant */ + case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + } +} + + +/* Same dispatch as ZSTD_HcFindBestMatch_selectMLS(), with dictMode fixed to ZSTD_dictMatchState. */ static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); + case 7 : /* 7 shares the mls=6 variant */ + case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); + } +} + + +FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( + 
ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); + case 7 : /* 7 shares the mls=6 variant */ + case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); + } +} + + +/* ******************************* +* Common parser - lazy strategy +*********************************/ +typedef enum { search_hashChain, search_binaryTree } searchMethod_e; + +/* Block compressor shared by the greedy / lazy / lazy2 / btlazy2 entry points in ZSTD_noDict and ZSTD_dictMatchState modes. searchMethod selects the hash-chain or binary-tree match finder ; depth (0, 1 or 2) is the number of extra look-ahead steps tried before a match is committed. Sequences are appended to seqStore, rep[] is updated for the next block, and the size of the trailing literals (iend - anchor) is returned. */ FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth, + ZSTD_dictMode_e const dictMode) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const U32 prefixLowestIndex = ms->window.dictLimit; + const BYTE* const prefixLowest = base + prefixLowestIndex; + + typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); + searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ? + (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS + : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) : + (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS + : ZSTD_HcFindBestMatch_selectMLS); + U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ? + dms->window.dictLimit : 0; + const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? 
+ dms->window.base : NULL; + const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ? + dictBase + dictLowestIndex : NULL; + const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? + dms->window.nextSrc : NULL; + const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? + prefixLowestIndex - (U32)(dictEnd - dictBase) : + 0; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode); + + /* init */ + ip += (dictAndPrefixLength == 0); /* advance by one when neither prefix nor dictionary content precedes ip */ + if (dictMode == ZSTD_noDict) { + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, ms->cParams.windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; /* a repcode reaching below windowLow is disabled (0) and remembered in savedOffset */ + if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; + } + if (dictMode == ZSTD_dictMatchState) { + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + } + + /* Match Loop */ +#if defined(__GNUC__) && defined(__x86_64__) + /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the + * code alignment is perturbed. To fix the instability align the loop on 32-bytes. + */ + __asm__(".p2align 5"); +#endif + while (ip < ilimit) { + size_t matchLength=0; + size_t offset=0; + const BYTE* start=ip+1; + + /* check repCode */ + if (dictMode == ZSTD_dictMatchState) { + const U32 repIndex = (U32)(ip - base) + 1 - offset_1; + const BYTE* repMatch = (dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + if (depth==0) goto _storeSequence; + } + } + if ( dictMode == ZSTD_noDict + && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { + matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + if (depth==0) goto _storeSequence; + } + + /* first search (depth 0) */ + { size_t offsetFound = 999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offset=offsetFound; + } + + if (matchLength < 4) { + ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + continue; + } + + /* let's try to find a better solution */ + if (depth>=1) + while (ip < ilimit) { + ip ++; + if ( (dictMode == ZSTD_noDict) + && (offset) && ((offset_1 > 0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + if (dictMode == ZSTD_dictMatchState) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + } + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); /* +4 : bias in favor of the match already held */ + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; /* search a better one */ + } } + + /* let's find an even better one */ + if ((depth==2) && (ip < ilimit)) { + ip ++; + if ( (dictMode == ZSTD_noDict) + && (offset) && ((offset_1 > 0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + if (dictMode == ZSTD_dictMatchState) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + } + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); /* +7 : a larger bias than at depth 1 */ + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* NOTE: + * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. + * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which + * overflows the pointer, which is undefined behavior. + */ + /* catch up */ + if (offset) { + if (dictMode == ZSTD_noDict) { + while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) + && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (dictMode == ZSTD_dictMatchState) { + U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? 
dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } + offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + /* store sequence */ +_storeSequence: + { size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + if (dictMode == ZSTD_dictMatchState) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex = current2 - offset_2; + const BYTE* repMatch = dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex ? + dictBase - dictIndexDelta + repIndex : + base + repIndex; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; + } + break; + } + } + + if (dictMode == ZSTD_noDict) { + while ( ((ip <= ilimit) & (offset_2>0)) + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + + /* Save reps for next block */ + rep[0] = offset_1 ? offset_1 : savedOffset; /* a repcode still at 0 is replaced by savedOffset */ + rep[1] = offset_2 ? 
offset_2 : savedOffset; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +/* Entry points below only bind (searchMethod, depth, dictMode) for ZSTD_compressBlock_lazy_generic() : btlazy2 = binary tree / depth 2, lazy2 = hash chain / depth 2, lazy = hash chain / depth 1, greedy = hash chain / depth 0. */ size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); +} + +size_t 
ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); +} + + +/* extDict counterpart of ZSTD_compressBlock_lazy_generic() : candidates with an index below dictLimit are read through dictBase, the others through base. searchMethod selects the hash-chain or binary-tree match finder ; depth (0, 1 or 2) is the number of extra look-ahead steps. Sequences are appended to seqStore, rep[] is updated, and the size of the trailing literals (iend - anchor) is returned. */ FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const dictStart = dictBase + ms->window.lowLimit; + const U32 windowLog = ms->cParams.windowLog; + + typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); + searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; + + U32 offset_1 = rep[0], offset_2 = rep[1]; + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic"); + + /* init */ + ip += (ip == prefixStart); /* advance by one when the block starts exactly at prefixStart */ + + /* Match Loop */ +#if defined(__GNUC__) && defined(__x86_64__) + /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the + * code alignment is perturbed. To fix the instability align the loop on 32-bytes. 
+ */ + __asm__(".p2align 5"); +#endif + while (ip < ilimit) { + size_t matchLength=0; + size_t offset=0; + const BYTE* start=ip+1; + U32 current = (U32)(ip-base); + + /* check repCode */ + { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current+1, windowLog); + const U32 repIndex = (U32)(current+1 - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4; + if (depth==0) goto _storeSequence; + } } + + /* first search (depth 0) */ + { size_t offsetFound = 999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offset=offsetFound; + } + + if (matchLength < 4) { + ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + continue; + } + + /* let's try to find a better solution */ + if (depth>=1) + while (ip < ilimit) { + ip ++; + current++; /* keep current in step with ip */ + /* check repCode */ + if (offset) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog); + const U32 repIndex = (U32)(current - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + if ((repLength >= 4) && (gain2 > gain1)) + matchLength = repLength, offset = 0, start = ip; + } } + + /* search match, depth 1 */ + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); /* +4 : bias in favor of the match already held */ + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; /* search a better one */ + } } + + /* let's find an even better one */ + if ((depth==2) && (ip < ilimit)) { + ip ++; + current++; /* keep current in step with ip */ + /* check repCode */ + if (offset) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog); + const U32 repIndex = (U32)(current - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((repLength >= 4) && (gain2 > gain1)) + matchLength = repLength, offset = 0, start = ip; + } } + + /* search match, depth 2 */ + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); /* +7 : a larger bias than at depth 1 */ + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ + if (offset) { + U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + const BYTE* match = (matchIndex < dictLimit) ? 
dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + + /* store sequence */ +_storeSequence: + { size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + while (ip <= ilimit) { + const U32 repCurrent = (U32)(ip-base); + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog); + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ + } + break; + } } + + /* Save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +/* Entry points below only bind (searchMethod, depth) for ZSTD_compressBlock_lazy_extDict_generic() : greedy = hash chain / depth 0, lazy = hash chain / depth 1, lazy2 = hash chain / depth 2, btlazy2 = binary tree / depth 2. */ size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); +} + +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); +} +/**** ended inlining compress/zstd_lazy.c ****/ +/**** start inlining compress/zstd_ldm.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_ldm.h ****/ + +/**** skipping file: ../common/debug.h ****/ +/**** skipping file: zstd_fast.h ****/ +/**** skipping file: zstd_double_fast.h ****/ + +#define LDM_BUCKET_SIZE_LOG 3 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_RLOG 7 +#define LDM_HASH_CHAR_OFFSET 10 + +/* Completes *params in place : windowLog is always copied from cParams, every other field still at 0 receives a default, and bucketSizeLog is finally capped at hashLog. */ void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams) +{ + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); + if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; + if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; + if (cParams->strategy >= ZSTD_btopt) { + /* Get out of the way of the optimal parser */ + U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength); + assert(minMatch >= ZSTD_LDM_MINMATCH_MIN); + assert(minMatch <= ZSTD_LDM_MINMATCH_MAX); + params->minMatchLength = minMatch; + } + if (params->hashLog == 0) { + params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); /* default : windowLog - 7, but never below ZSTD_HASHLOG_MIN */ + assert(params->hashLog <= ZSTD_HASHLOG_MAX); + } + if (params->hashRateLog == 0) { + params->hashRateLog = params->windowLog < params->hashLog + ? 0 + : params->windowLog - params->hashLog; + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); +} + +/* Returns the sum of ZSTD_cwksp_alloc_size() for the two ldm tables : 1 << (hashLog - bucketSizeLog) bytes, i.e. one per bucket, and 1 << hashLog entries of type ldmEntry_t. Returns 0 when params.enableLdm is not set. */ size_t ZSTD_ldm_getTableSize(ldmParams_t params) +{ + size_t const ldmHSize = ((size_t)1) << params.hashLog; + size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); + size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); + size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + return params.enableLdm ? totalSize : 0; +} + +/* Returns maxChunkSize / params.minMatchLength, or 0 when params.enableLdm is not set. */ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) +{ + return params.enableLdm ? 
(maxChunkSize / params.minMatchLength) : 0; +} + +/** ZSTD_ldm_getSmallHash() : + * numBits should be <= 32 + * If numBits==0, returns 0. + * @return : the most significant numBits of value. */ +static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits) +{ + assert(numBits <= 32); + return numBits == 0 ? 0 : (U32)(value >> (64 - numBits)); /* 0 is special-cased : the shift count would otherwise be 64 */ +} + +/** ZSTD_ldm_getChecksum() : + * numBitsToDiscard should be <= 32 + * @return : the next most significant 32 bits after numBitsToDiscard */ +static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard) +{ + assert(numBitsToDiscard <= 32); + return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF; +} + +/** ZSTD_ldm_getTag() : + * Given the hash, returns the most significant numTagBits bits + * after (32 + hbits) bits. + * + * If there are not enough bits remaining, return the last + * numTagBits bits. */ +static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits) +{ + assert(numTagBits < 32 && hbits <= 32); + if (32 - hbits < numTagBits) { + return hash & (((U32)1 << numTagBits) - 1); /* fewer than numTagBits bits remain below the checksum : take the lowest numTagBits bits */ + } else { + return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1); + } +} + +/** ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
*/ +static ldmEntry_t* ZSTD_ldm_getBucket( + ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) +{ + return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); +} + +/** ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ +static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, + ldmParams_t const ldmParams) +{ + BYTE* const bucketOffsets = ldmState->bucketOffsets; + *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry; + bucketOffsets[hash]++; + bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1; +} + +/** ZSTD_ldm_makeEntryAndInsertByTag() : + * + * Gets the small hash, checksum, and tag from the rollingHash. + * + * If the tag matches (1 << ldmParams.hashRateLog)-1, then + * creates an ldmEntry from the offset, and inserts it into the hash table. + * + * hBits is the length of the small hash, which is the most significant hBits + * of rollingHash. The checksum is the next 32 most significant bits, followed + * by ldmParams.hashRateLog bits that make up the tag. */ +static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState, + U64 const rollingHash, + U32 const hBits, + U32 const offset, + ldmParams_t const ldmParams) +{ + U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashRateLog); + U32 const tagMask = ((U32)1 << ldmParams.hashRateLog) - 1; + if (tag == tagMask) { + U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits); + U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); + ldmEntry_t entry; + entry.offset = offset; + entry.checksum = checksum; + ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams); + } +} + +/** ZSTD_ldm_countBackwardsMatch() : + * Returns the number of bytes that match backwards before pIn and pMatch. + * + * We count only bytes where pMatch >= pBase and pIn >= pAnchor. 
*/ +static size_t ZSTD_ldm_countBackwardsMatch( + const BYTE* pIn, const BYTE* pAnchor, + const BYTE* pMatch, const BYTE* pBase) +{ + size_t matchLength = 0; + while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { + pIn--; + pMatch--; + matchLength++; + } + return matchLength; +} + +/** ZSTD_ldm_fillFastTables() : + * + * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies. + * This is similar to ZSTD_loadDictionaryContent. + * + * The tables for the other strategies are filled within their + * block compressors. */ +static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + void const* end) +{ + const BYTE* const iend = (const BYTE*)end; + + switch(ms->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); + break; + + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + default: + assert(0); /* not possible : not a valid strategy id */ + } + + return 0; +} + +/** ZSTD_ldm_fillLdmHashTable() : + * + * Fills hashTable from (lastHashed + 1) to iend (non-inclusive). + * lastHash is the rolling hash that corresponds to lastHashed. + * + * Returns the rolling hash corresponding to position iend-1. 
*/ +static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state, + U64 lastHash, const BYTE* lastHashed, + const BYTE* iend, const BYTE* base, + U32 hBits, ldmParams_t const ldmParams) +{ + U64 rollingHash = lastHash; + const BYTE* cur = lastHashed + 1; + + while (cur < iend) { + rollingHash = ZSTD_rollingHash_rotate(rollingHash, cur[-1], + cur[ldmParams.minMatchLength-1], + state->hashPower); + ZSTD_ldm_makeEntryAndInsertByTag(state, + rollingHash, hBits, + (U32)(cur - base), ldmParams); + ++cur; + } + return rollingHash; +} + +void ZSTD_ldm_fillHashTable( + ldmState_t* state, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params) +{ + DEBUGLOG(5, "ZSTD_ldm_fillHashTable"); + if ((size_t)(iend - ip) >= params->minMatchLength) { + U64 startingHash = ZSTD_rollingHash_compute(ip, params->minMatchLength); + ZSTD_ldm_fillLdmHashTable( + state, startingHash, ip, iend - params->minMatchLength, state->window.base, + params->hashLog - params->bucketSizeLog, + *params); + } +} + + +/** ZSTD_ldm_limitTableUpdate() : + * + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). 
*/ +static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) +{ + U32 const current = (U32)(anchor - ms->window.base); + if (current > ms->nextToUpdate + 1024) { + ms->nextToUpdate = + current - MIN(512, current - ms->nextToUpdate - 1024); + } +} + +static size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + /* LDM parameters */ + int const extDict = ZSTD_window_hasExtDict(ldmState->window); + U32 const minMatchLength = params->minMatchLength; + U64 const hashPower = ldmState->hashPower; + U32 const hBits = params->hashLog - params->bucketSizeLog; + U32 const ldmBucketSize = 1U << params->bucketSizeLog; + U32 const hashRateLog = params->hashRateLog; + U32 const ldmTagMask = (1U << params->hashRateLog) - 1; + /* Prefix and extDict parameters */ + U32 const dictLimit = ldmState->window.dictLimit; + U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit; + BYTE const* const base = ldmState->window.base; + BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL; + BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL; + BYTE const* const dictEnd = extDict ? 
dictBase + dictLimit : NULL; + BYTE const* const lowPrefixPtr = base + dictLimit; + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE); + /* Input positions */ + BYTE const* anchor = istart; + BYTE const* ip = istart; + /* Rolling hash */ + BYTE const* lastHashed = NULL; + U64 rollingHash = 0; + + while (ip <= ilimit) { + size_t mLength; + U32 const current = (U32)(ip - base); + size_t forwardMatchLength = 0, backwardMatchLength = 0; + ldmEntry_t* bestEntry = NULL; + if (ip != istart) { + rollingHash = ZSTD_rollingHash_rotate(rollingHash, lastHashed[0], + lastHashed[minMatchLength], + hashPower); + } else { + rollingHash = ZSTD_rollingHash_compute(ip, minMatchLength); + } + lastHashed = ip; + + /* Do not insert and do not look for a match */ + if (ZSTD_ldm_getTag(rollingHash, hBits, hashRateLog) != ldmTagMask) { + ip++; + continue; + } + + /* Get the best entry and compute the match lengths */ + { + ldmEntry_t* const bucket = + ZSTD_ldm_getBucket(ldmState, + ZSTD_ldm_getSmallHash(rollingHash, hBits), + *params); + ldmEntry_t* cur; + size_t bestMatchLength = 0; + U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); + + for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) { + size_t curForwardMatchLength, curBackwardMatchLength, + curTotalMatchLength; + if (cur->checksum != checksum || cur->offset <= lowestIndex) { + continue; + } + if (extDict) { + BYTE const* const curMatchBase = + cur->offset < dictLimit ? dictBase : base; + BYTE const* const pMatch = curMatchBase + cur->offset; + BYTE const* const matchEnd = + cur->offset < dictLimit ? dictEnd : iend; + BYTE const* const lowMatchPtr = + cur->offset < dictLimit ? 
dictStart : lowPrefixPtr; + + curForwardMatchLength = ZSTD_count_2segments( + ip, pMatch, iend, + matchEnd, lowPrefixPtr); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, + lowMatchPtr); + curTotalMatchLength = curForwardMatchLength + + curBackwardMatchLength; + } else { /* !extDict */ + BYTE const* const pMatch = base + cur->offset; + curForwardMatchLength = ZSTD_count(ip, pMatch, iend); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, + lowPrefixPtr); + curTotalMatchLength = curForwardMatchLength + + curBackwardMatchLength; + } + + if (curTotalMatchLength > bestMatchLength) { + bestMatchLength = curTotalMatchLength; + forwardMatchLength = curForwardMatchLength; + backwardMatchLength = curBackwardMatchLength; + bestEntry = cur; + } + } + } + + /* No match found -- continue searching */ + if (bestEntry == NULL) { + ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, + hBits, current, + *params); + ip++; + continue; + } + + /* Match found */ + mLength = forwardMatchLength + backwardMatchLength; + ip -= backwardMatchLength; + + { + /* Store the sequence: + * ip = current - backwardMatchLength + * The match is at (bestEntry->offset - backwardMatchLength) + */ + U32 const matchIndex = bestEntry->offset; + U32 const offset = current - matchIndex; + rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; + + /* Out of sequence storage */ + if (rawSeqStore->size == rawSeqStore->capacity) + return ERROR(dstSize_tooSmall); + seq->litLength = (U32)(ip - anchor); + seq->matchLength = (U32)mLength; + seq->offset = offset; + rawSeqStore->size++; + } + + /* Insert the current entry into the hash table */ + ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits, + (U32)(lastHashed - base), + *params); + + assert(ip + backwardMatchLength == lastHashed); + + /* Fill the hash table from 
lastHashed+1 to ip+mLength*/ + /* Heuristic: don't need to fill the entire table at end of block */ + if (ip + mLength <= ilimit) { + rollingHash = ZSTD_ldm_fillLdmHashTable( + ldmState, rollingHash, lastHashed, + ip + mLength, base, hBits, *params); + lastHashed = ip + mLength - 1; + } + ip += mLength; + anchor = ip; + } + return iend - anchor; +} + +/*! ZSTD_ldm_reduceTable() : + * reduce table indexes by `reducerValue` */ +static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + U32 const reducerValue) +{ + U32 u; + for (u = 0; u < size; u++) { + if (table[u].offset < reducerValue) table[u].offset = 0; + else table[u].offset -= reducerValue; + } +} + +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldmState, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + U32 const maxDist = 1U << params->windowLog; + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + size_t const kMaxChunkSize = 1 << 20; + size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0); + size_t chunk; + size_t leftoverSize = 0; + + assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize); + /* Check that ZSTD_window_update() has been called for this chunk prior + * to passing it to this function. + */ + assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize); + /* The input could be very large (in zstdmt), so it must be broken up into + * chunks to enforce the maximum distance and handle overflow correction. + */ + assert(sequences->pos <= sequences->size); + assert(sequences->size <= sequences->capacity); + for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) { + BYTE const* const chunkStart = istart + chunk * kMaxChunkSize; + size_t const remaining = (size_t)(iend - chunkStart); + BYTE const *const chunkEnd = + (remaining < kMaxChunkSize) ? 
iend : chunkStart + kMaxChunkSize; + size_t const chunkSize = chunkEnd - chunkStart; + size_t newLeftoverSize; + size_t const prevSize = sequences->size; + + assert(chunkStart < iend); + /* 1. Perform overflow correction if necessary. */ + if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { + U32 const ldmHSize = 1U << params->hashLog; + U32 const correction = ZSTD_window_correctOverflow( + &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); + ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction); + /* invalidate dictionaries on overflow correction */ + ldmState->loadedDictEnd = 0; + } + /* 2. We enforce the maximum offset allowed. + * + * kMaxChunkSize should be small enough that we don't lose too much of + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the + * the offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may + * be split into two sequences. This condition holds when using + * ZSTD_window_enforceMaxDist(), but if we move to checking offsets + * against maxDist directly, we'll have to carefully handle that case. + */ + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL); + /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ + newLeftoverSize = ZSTD_ldm_generateSequences_internal( + ldmState, sequences, params, chunkStart, chunkSize); + if (ZSTD_isError(newLeftoverSize)) + return newLeftoverSize; + /* 4. We add the leftover literals from previous iterations to the first + * newly generated sequence, or add the `newLeftoverSize` if none are + * generated. 
+ */ + /* Prepend the leftover literals from the last call */ + if (prevSize < sequences->size) { + sequences->seq[prevSize].litLength += (U32)leftoverSize; + leftoverSize = newLeftoverSize; + } else { + assert(newLeftoverSize == chunkSize); + leftoverSize += chunkSize; + } + } + return 0; +} + +void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; + if (srcSize <= seq->litLength) { + /* Skip past srcSize literals */ + seq->litLength -= (U32)srcSize; + return; + } + srcSize -= seq->litLength; + seq->litLength = 0; + if (srcSize < seq->matchLength) { + /* Skip past the first srcSize of the match */ + seq->matchLength -= (U32)srcSize; + if (seq->matchLength < minMatch) { + /* The match is too short, omit it */ + if (rawSeqStore->pos + 1 < rawSeqStore->size) { + seq[1].litLength += seq[0].matchLength; + } + rawSeqStore->pos++; + } + return; + } + srcSize -= seq->matchLength; + seq->matchLength = 0; + rawSeqStore->pos++; + } +} + +/** + * If the sequence length is longer than remaining then the sequence is split + * between this block and the next. + * + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. + */ +static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) +{ + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; + assert(sequence.offset > 0); + /* Likely: No partial sequence */ + if (remaining >= sequence.litLength + sequence.matchLength) { + rawSeqStore->pos++; + return sequence; + } + /* Cut the sequence short (offset == 0 ==> rest is literals). 
*/ + if (remaining <= sequence.litLength) { + sequence.offset = 0; + } else if (remaining < sequence.litLength + sequence.matchLength) { + sequence.matchLength = remaining - sequence.litLength; + if (sequence.matchLength < minMatch) { + sequence.offset = 0; + } + } + /* Skip past `remaining` bytes for the future sequences. */ + ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch); + return sequence; +} + +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; + ZSTD_blockCompressor const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + /* Input positions */ + BYTE const* ip = istart; + + DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize); + assert(rawSeqStore->pos <= rawSeqStore->size); + assert(rawSeqStore->size <= rawSeqStore->capacity); + /* Loop through each sequence and apply the block compressor to the lits */ + while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); + int i; + /* End signal */ + if (sequence.offset == 0) + break; + + assert(ip + sequence.litLength + sequence.matchLength <= iend); + + /* Fill tables for block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; + /* Update the repcodes */ + for (i = 
ZSTD_REP_NUM - 1; i > 0; i--) + rep[i] = rep[i-1]; + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, + sequence.offset + ZSTD_REP_MOVE, + sequence.matchLength - MINMATCH); + ip += sequence.matchLength; + } + } + /* Fill the tables for the block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Compress the last literals */ + return blockCompressor(ms, seqStore, rep, ip, iend - ip); +} +/**** ended inlining compress/zstd_ldm.c ****/ +/**** start inlining compress/zstd_opt.c ****/ +/* + * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: hist.h ****/ +/**** skipping file: zstd_opt.h ****/ + + +#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ +#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */ +#define ZSTD_MAX_PRICE (1<<30) + +#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + +/*-************************************* +* Price functions for optimal parser +***************************************/ + +#if 0 /* approximation at bit level */ +# define BITCOST_ACCURACY 0 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) +#else /* opt==approx, ultra==accurate */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) (opt ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) +#endif + +MEM_STATIC U32 ZSTD_bitWeight(U32 stat) +{ + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); +} + +MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) +{ + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); + return weight; +} + +#if (DEBUGLEVEL>=2) +/* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +MEM_STATIC double ZSTD_fCost(U32 price) +{ + return (double)price / (BITCOST_MULTIPLIER*8); +} +#endif + +static int ZSTD_compressedLiterals(optState_t const* const optPtr) +{ + return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed; +} + +static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) +{ + if (ZSTD_compressedLiterals(optPtr)) + optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel); + optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel); + optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel); + optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel); +} + + +/* ZSTD_downscaleStat() : + * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus) + * return the resulting sum of elements */ +static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) +{ + U32 s, sum=0; + DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1); + assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31); + for (s=0; s> (ZSTD_FREQ_DIV+malus)); + sum += table[s]; + } + return sum; +} + +/* ZSTD_rescaleFreqs() : + * if first block (detected by optPtr->litLengthSum == 0) : init statistics + * take hints from dictionary if there is one + * or init from zero, using src for literals stats, or flat 1 for match symbols + * otherwise downscale existing stats, to be used as seed 
for next block.
 */
static void
ZSTD_rescaleFreqs(optState_t* const optPtr,
            const BYTE* const src, size_t const srcSize,
                  int const optLevel)
{
    int const compressedLiterals = ZSTD_compressedLiterals(optPtr);
    DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
    optPtr->priceType = zop_dynamic;

    if (optPtr->litLengthSum != 0) {
        /* new block : re-use previous statistics, scaled down */
        if (compressedLiterals)
            optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
        optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0);
        optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0);
        optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0);

        ZSTD_setBasePrices(optPtr, optLevel);
        return;
    }

    /* first block : init */
    if (srcSize <= ZSTD_PREDEF_THRESHOLD) {  /* heuristic */
        DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
        optPtr->priceType = zop_predef;
    }

    assert(optPtr->symbolCosts != NULL);
    if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
        /* huffman table presumed generated by dictionary :
         * derive initial frequencies from the bit costs of the existing tables */
        optPtr->priceType = zop_dynamic;

        if (compressedLiterals) {
            unsigned lit;
            assert(optPtr->litFreq != NULL);
            optPtr->litSum = 0;
            for (lit=0; lit<=MaxLit; lit++) {
                U32 const scaleLog = 11;   /* scale to 2K */
                U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit);
                assert(bitCost <= scaleLog);
                optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
                optPtr->litSum += optPtr->litFreq[lit];
            }
        }

        {   unsigned ll;
            FSE_CState_t llstate;
            FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable);
            optPtr->litLengthSum = 0;
            for (ll=0; ll<=MaxLL; ll++) {
                U32 const scaleLog = 10;   /* scale to 1K */
                U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
                assert(bitCost < scaleLog);
                optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
                optPtr->litLengthSum += optPtr->litLengthFreq[ll];
            }
        }

        {   unsigned ml;
            FSE_CState_t mlstate;
            FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable);
            optPtr->matchLengthSum = 0;
            for (ml=0; ml<=MaxML; ml++) {
                U32 const scaleLog = 10;
                U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
                assert(bitCost < scaleLog);
                optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
                optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
            }
        }

        {   unsigned of;
            FSE_CState_t ofstate;
            FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable);
            optPtr->offCodeSum = 0;
            for (of=0; of<=MaxOff; of++) {
                U32 const scaleLog = 10;
                U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
                assert(bitCost < scaleLog);
                optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
                optPtr->offCodeSum += optPtr->offCodeFreq[of];
            }
        }

    } else {  /* not a dictionary */

        assert(optPtr->litFreq != NULL);
        if (compressedLiterals) {
            unsigned lit = MaxLit;
            HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
            optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
        }

        /* flat distribution for all match-related symbols */
        {   unsigned ll;
            for (ll=0; ll<=MaxLL; ll++)
                optPtr->litLengthFreq[ll] = 1;
        }
        optPtr->litLengthSum = MaxLL+1;

        {   unsigned ml;
            for (ml=0; ml<=MaxML; ml++)
                optPtr->matchLengthFreq[ml] = 1;
        }
        optPtr->matchLengthSum = MaxML+1;

        {   unsigned of;
            for (of=0; of<=MaxOff; of++)
                optPtr->offCodeFreq[of] = 1;
        }
        optPtr->offCodeSum = MaxOff+1;

    }

    ZSTD_setBasePrices(optPtr, optLevel);
}

/* ZSTD_rawLiteralsCost() :
 * price of literals (only) in specified segment (which length can be 0).
+ * does not include price of literalLength symbol */ +static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) +{ + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) + return (litLength << 3) * BITCOST_MULTIPLIER; /* Uncompressed - 8 bytes per literal. */ + + if (optPtr->priceType == zop_predef) + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ + { U32 price = litLength * optPtr->litSumBasePrice; + U32 u; + for (u=0; u < litLength; u++) { + assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ + price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); + } + return price; + } +} + +/* ZSTD_litLengthPrice() : + * cost of literalLength symbol */ +static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) +{ + if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); + + /* dynamic statistics */ + { U32 const llCode = ZSTD_LLcode(litLength); + return (LL_bits[llCode] * BITCOST_MULTIPLIER) + + optPtr->litLengthSumBasePrice + - WEIGHT(optPtr->litLengthFreq[llCode], optLevel); + } +} + +/* ZSTD_getMatchPrice() : + * Provides the cost of the match part (offset + matchLength) of a sequence + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+ * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_getMatchPrice(U32 const offset, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) +{ + U32 price; + U32 const offCode = ZSTD_highbit32(offset+1); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + + if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ + return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); + if ((optLevel<2) /*static*/ && offCode >= 20) + price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */ + + /* match Length */ + { U32 const mlCode = ZSTD_MLcode(mlBase); + price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel)); + } + + price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor less sequences -> faster decompression speed */ + + DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price); + return price; +} + +/* ZSTD_updateStats() : + * assumption : literals + litLengtn <= iend */ +static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, + U32 offsetCode, U32 matchLength) +{ + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { + U32 u; + for (u=0; u < litLength; u++) + optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; + optPtr->litSum += litLength*ZSTD_LITFREQ_ADD; + } + + /* literal Length */ + { U32 const llCode = ZSTD_LLcode(litLength); + optPtr->litLengthFreq[llCode]++; + optPtr->litLengthSum++; + } + + /* match offset code (0-2=>repCode; 3+=>offset+2) */ + { U32 const offCode = ZSTD_highbit32(offsetCode+1); + assert(offCode <= MaxOff); + 
optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; + } + + /* match Length */ + { U32 const mlBase = matchLength - MINMATCH; + U32 const mlCode = ZSTD_MLcode(mlBase); + optPtr->matchLengthFreq[mlCode]++; + optPtr->matchLengthSum++; + } +} + + +/* ZSTD_readMINMATCH() : + * function safe only for comparisons + * assumption : memPtr must be at least 4 bytes before end of buffer */ +MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) +{ + switch (length) + { + default : + case 4 : return MEM_read32(memPtr); + case 3 : if (MEM_isLittleEndian()) + return MEM_read32(memPtr)<<8; + else + return MEM_read32(memPtr)>>8; + } +} + + +/* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip) +{ + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; + const BYTE* const base = ms->window.base; + U32 idx = *nextToUpdate3; + U32 const target = (U32)(ip - base); + size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3); + assert(hashLog3 > 0); + + while(idx < target) { + hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx; + idx++; + } + + *nextToUpdate3 = target; + return hashTable3[hash3]; +} + + +/*-************************************* +* Binary Tree search +***************************************/ +/** ZSTD_insertBt1() : add one or multiple positions to tree. + * ip : assumed <= iend-8 . 
+ * @return : nb of positions added */ +static U32 ZSTD_insertBt1( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const mls, const int extDict) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 matchIndex = hashTable[h]; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + const U32 current = (U32)(ip-base); + const U32 btLow = btMask >= current ? 0 : current - btMask; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = smallerPtr + 1; + U32 dummy32; /* to be nullified at the end */ + U32 const windowLow = ms->window.lowLimit; + U32 matchEndIdx = current+8+1; + size_t bestLength = 8; + U32 nbCompares = 1U << cParams->searchLog; +#ifdef ZSTD_C_PREDICT + U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0); + U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1); + predictedSmall += (predictedSmall>0); + predictedLarge += (predictedLarge>0); +#endif /* ZSTD_C_PREDICT */ + + DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current); + + assert(ip <= iend-8); /* required for h calculation */ + hashTable[h] = current; /* Update Hash Table */ + + assert(windowLow > 0); + while (nbCompares-- && (matchIndex >= windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < current); + +#ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ + const U32* 
predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ + if (matchIndex == predictedSmall) { + /* no need to check length, result known */ + *smallerPtr = matchIndex; + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + predictedSmall = predictPtr[1] + (predictPtr[1]>0); + continue; + } + if (matchIndex == predictedLarge) { + *largerPtr = matchIndex; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + predictedLarge = predictPtr[0] + (predictPtr[0]>0); + continue; + } +#endif + + if (!extDict || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* might be wrong if actually extDict */ + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + bestLength = matchLength; + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + } + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= 
btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + { U32 positions = 0; + if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */ + assert(matchEndIdx > current + 8); + return MAX(positions, matchEndIdx - (current + 8)); + } +} + +FORCE_INLINE_TEMPLATE +void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); + assert(idx < (U32)(idx + forward)); + idx += forward; + } + assert((size_t)(ip - base) <= (size_t)(U32)(-1)); + assert((size_t)(iend - base) <= (size_t)(U32)(-1)); + ms->nextToUpdate = target; +} + +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_insertBtAndGetAllMatches ( + ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, /* tells 
if associated literal length is 0 or not. This value must be 0 or 1 */ + const U32 lengthToBeat, + U32 const mls /* template */) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + const BYTE* const base = ms->window.base; + U32 const current = (U32)(ip-base); + U32 const hashLog = cParams->hashLog; + U32 const minMatch = (mls==3) ? 3 : 4; + U32* const hashTable = ms->hashTable; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask= (1U << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const dictBase = ms->window.dictBase; + U32 const dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32 const btLow = (btMask >= current) ? 0 : current - btMask; + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog); + U32 const matchLow = windowLow ? windowLow : 1; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = bt + 2*(current&btMask) + 1; + U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */ + U32 dummy32; /* to be nullified at the end */ + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + + const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL; + const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL; + U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0; + U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? 
dms->window.lowLimit : 0; + U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0; + U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog; + U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog; + U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0; + U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit; + + size_t bestLength = lengthToBeat-1; + DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current); + + /* check repCode */ + assert(ll0 <= 1); /* necessarily 1 or 0 */ + { U32 const lastR = ZSTD_REP_NUM + ll0; + U32 repCode; + for (repCode = ll0; repCode < lastR; repCode++) { + U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + U32 const repIndex = current - repOffset; + U32 repLen = 0; + assert(current >= dictLimit); + if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) { /* equivalent to `current > repIndex >= dictLimit` */ + /* We must validate the repcode offset because when we're using a dictionary the + * valid offset range shrinks when the dictionary goes out of bounds. + */ + if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) { + repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch; + } + } else { /* repIndex < dictLimit || repIndex >= current */ + const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ? 
+ dmsBase + repIndex - dmsIndexDelta : + dictBase + repIndex; + assert(current >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow) /* equivalent to `current > repIndex >= windowLow` */ + & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `current > repIndex >= dmsLowLimit` */ + & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } + /* save longer solution */ + if (repLen > bestLength) { + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; + matches[mnum].off = repCode - ll0; + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) + | (ip+repLen == iLimit) ) { /* best possible */ + return mnum; + } } } } + + /* HC3 match finder */ + if ((mls == 3) /*static*/ && (bestLength < mls)) { + U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip); + if ((matchIndex3 >= matchLow) + & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { + size_t mlen; + if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) { + const BYTE* const match = base + matchIndex3; + mlen = ZSTD_count(ip, match, iLimit); + } else { + const 
BYTE* const match = dictBase + matchIndex3; + mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart); + } + + /* save best solution */ + if (mlen >= mls /* == 3 > bestLength */) { + DEBUGLOG(8, "found small match with hlog3, of length %u", + (U32)mlen); + bestLength = mlen; + assert(current > matchIndex3); + assert(mnum==0); /* no prior solution */ + matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE; + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | + (ip+mlen == iLimit) ) { /* best possible length */ + ms->nextToUpdate = current+1; /* skip insertion */ + return 1; + } } } + /* no dictMatchState lookup: dicts don't have a populated HC3 table */ + } + + hashTable[h] = current; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex >= matchLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + const BYTE* match; + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(current > matchIndex); + + if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */ + match = base + matchIndex; + if (matchIndex >= dictLimit) assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit); + } else { + match = dictBase + matchIndex; + assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* prepare for match[matchLength] read */ + } + + if (matchLength > bestLength) { + DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", + (U32)matchLength, current - matchIndex, 
current - matchIndex + ZSTD_REP_MOVE); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ + break; /* drop, to preserve bt consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + /* match smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous, closer to current */ + } else { + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + if (dictMode == ZSTD_dictMatchState && nbCompares) { + size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls); + U32 dictMatchIndex = dms->hashTable[dmsH]; + const U32* const dmsBt = dms->chainTable; + commonLengthSmaller = commonLengthLarger = 0; + while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) { + const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dmsBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, 
match+matchLength, iLimit, dmsEnd, prefixStart); + if (dictMatchIndex+matchLength >= dmsHighLimit) + match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; + DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", + (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ + if (match[matchLength] < ip[matchLength]) { + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } + } + } + + assert(matchEndIdx > current+8); + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + return mnum; +} + + +FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( + ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */ + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const matchLengthSearch = cParams->minMatch; + DEBUGLOG(8, "ZSTD_BtGetAllMatches"); + if (ip < ms->window.base + 
ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode); + switch(matchLengthSearch) + { + case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); + default : + case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); + case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); + case 7 : + case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); + } +} + + +/*-******************************* +* Optimal parser +*********************************/ + + +static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +{ + return sol.litlen + sol.mlen; +} + +#if 0 /* debug */ + +static void +listStats(const U32* table, int lastEltID) +{ + int const nbElts = lastEltID + 1; + int enb; + for (enb=0; enb < nbElts; enb++) { + (void)table; + /* RAWLOG(2, "%3i:%3i, ", enb, table[enb]); */ + RAWLOG(2, "%4i,", table[enb]); + } + RAWLOG(2, " \n"); +} + +#endif + +FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, + const ZSTD_dictMode_e dictMode) +{ + optState_t* const optStatePtr = &ms->opt; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + U32 const minMatch = (cParams->minMatch == 3) ? 
3 : 4; + U32 nextToUpdate3 = ms->nextToUpdate; + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; + ZSTD_optimal_t lastSequence; + + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u", + (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate); + assert(optLevel <= 2); + ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel); + ip += (ip==prefixStart); + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, last_pos = 0; + + /* find first match */ + { U32 const litlen = (U32)(ip - anchor); + U32 const ll0 = !litlen; + U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch); + if (!nbMatches) { ip++; continue; } + + /* initialize opt[0] */ + { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; + U32 const maxOffset = matches[nbMatches-1].off; + DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", + nbMatches, maxML, maxOffset, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + lastSequence.litlen = litlen; + lastSequence.mlen = maxML; + lastSequence.off = maxOffset; + DEBUGLOG(6, "large match (%u>%u), immediate encoding", + maxML, sufficient_len); + cur = 0; + last_pos = ZSTD_totalLen(lastSequence); + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { + opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { + U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); + U32 
const sequencePrice = literalsPrice + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; + opt[pos].off = offset; + opt[pos].litlen = litlen; + opt[pos].price = sequencePrice; + } } + last_pos = pos-1; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; + assert(cur < ZSTD_OPT_NUM); + DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) + + /* Fix current position with one literal if cheaper */ + { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; + int const price = opt[cur-1].price + + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) + + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) + - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); + opt[cur].mlen = 0; + opt[cur].off = 0; + opt[cur].litlen = litlen; + opt[cur].price = price; + } else { + DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), + opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); + } + } + + /* Set the repcodes of the current position. We must do it here + * because we rely on the repcodes of the 2nd to last sequence being + * correct to set the next chunks repcodes during the backward + * traversal. 
+ */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); + if (opt[cur].mlen != 0) { + U32 const prev = cur - opt[cur].mlen; + repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); + memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); + } else { + memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ + if (inr > ilimit) continue; + + if (cur == last_pos) break; + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { + DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + { U32 const ll0 = (opt[cur].mlen != 0); + U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; + U32 const previousPrice = opt[cur].price; + U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch); + U32 matchNb; + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + + { U32 const maxML = matches[nbMatches-1].len; + DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", + inr-istart, cur, nbMatches, maxML); + + if ( (maxML > sufficient_len) + || (cur + maxML >= ZSTD_OPT_NUM) ) { + lastSequence.mlen = maxML; + lastSequence.off = matches[nbMatches-1].off; + lastSequence.litlen = litlen; + cur -= (opt[cur].mlen==0) ? 
opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ + last_pos = cur + ZSTD_totalLen(lastSequence); + if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ + goto _shortestPath; + } } + + /* set prices using matches found at position == cur */ + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + U32 const lastML = matches[matchNb].len; + U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + + DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; + int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ + opt[pos].mlen = mlen; + opt[pos].off = offset; + opt[pos].litlen = litlen; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } + } /* for (cur = 1; cur <= last_pos; cur++) */ + + lastSequence = opt[last_pos]; + cur = last_pos > ZSTD_totalLen(lastSequence) ? 
last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ + assert(cur < ZSTD_OPT_NUM); /* control overflow*/ + +_shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); + + /* Set the next chunk's repcodes based on the repcodes of the beginning + * of the last match, and the last sequence. This avoids us having to + * update them while traversing the sequences. + */ + if (lastSequence.mlen != 0) { + repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); + memcpy(rep, &reps, sizeof(reps)); + } else { + memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); + } + + { U32 const storeEnd = cur + 1; + U32 storeStart = storeEnd; + U32 seqPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; + assert(storeEnd < ZSTD_OPT_NUM); + DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); + opt[storeEnd] = lastSequence; + while (seqPos > 0) { + U32 const backDist = ZSTD_totalLen(opt[seqPos]); + storeStart--; + DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); + opt[storeStart] = opt[seqPos]; + seqPos = (seqPos > backDist) ? 
seqPos - backDist : 0; + } + + /* save sequences */ + DEBUGLOG(6, "sending selected sequences into seqStore") + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; + U32 const offCode = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ + ip = anchor + llen; /* last "sequence" is a bunch of literals => don't progress anchor */ + continue; /* will finish */ + } + + assert(anchor + llen <= iend); + ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH); + anchor += advance; + ip = anchor; + } } + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict); +} + + +/* used in 2-pass strategy */ +static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus) +{ + U32 s, sum=0; + assert(ZSTD_FREQ_DIV+bonus >= 0); + for (s=0; slitSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); + optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0); + optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0); + optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0); +} + +/* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate 
starting values. + * only works on first block, with no dictionary and no ldm. + * this function cannot error, hence its contract must be respected. + */ +static void +ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + memcpy(tmpRep, rep, sizeof(tmpRep)); + + DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize); + assert(ms->opt.litLengthSum == 0); /* first block */ + assert(seqStore->sequences == seqStore->sequencesStart); /* no ldm */ + assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */ + assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */ + + ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ + + /* invalidate first scan from history */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; + ms->window.lowLimit = ms->window.dictLimit; + ms->nextToUpdate = ms->window.dictLimit; + + /* re-inforce weight of collected statistics */ + ZSTD_upscaleStats(&ms->opt); +} + +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btultra2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 const current = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + + /* 2-pass strategy: + * this strategy makes a first pass over first block to collect statistics + * and seed next round's statistics with it. 
+ * After 1st pass, function forgets everything, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), + * the cost is 2x cpu time on first block. */ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ + && (current == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ + && (srcSize > ZSTD_PREDEF_THRESHOLD) + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict); +} + +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict); +} + +/* note : no btultra2 variant for extDict nor 
dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ +/**** ended inlining compress/zstd_opt.c ****/ + +/**** start inlining decompress/huf_decompress.c ****/ +/* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************************************** +* Dependencies +****************************************************************/ +#include /* memcpy, memset */ +/**** skipping file: ../common/compiler.h ****/ +/**** skipping file: ../common/bitstream.h ****/ +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: ../common/error_private.h ****/ + +/* ************************************************************** +* Macros +****************************************************************/ + +/* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. + */ +#if defined(HUF_FORCE_DECOMPRESS_X1) && \ + defined(HUF_FORCE_DECOMPRESS_X2) +#error "Cannot force the use of the X1 and X2 decoders at the same time!" 
+#endif + + +/* ************************************************************** +* Error Management +****************************************************************/ +#define HUF_isError ERR_isError + + +/* ************************************************************** +* Byte alignment for workSpace management +****************************************************************/ +#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1) +#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) + + +/* ************************************************************** +* BMI2 Variant Wrappers +****************************************************************/ +#if DYNAMIC_BMI2 + +#define HUF_DGEN(fn) \ + \ + static size_t fn##_default( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + if (bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#else + +#define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + (void)bmi2; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#endif + + +/*-***************************/ +/* generic DTableDesc */ +/*-***************************/ +typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc; + +static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) +{ + DTableDesc dtd; + memcpy(&dtd, table, sizeof(dtd)); + return 
dtd; +} + + +#ifndef HUF_FORCE_DECOMPRESS_X2 + +/*-***************************/ +/* single-symbol decoding */ +/*-***************************/ +typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ + +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +{ + U32 tableLog = 0; + U32 nbSymbols = 0; + size_t iSize; + void* const dtPtr = DTable + 1; + HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; + + U32* rankVal; + BYTE* huffWeight; + size_t spaceUsed32 = 0; + + rankVal = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; + huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + + iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); + if (HUF_isError(iSize)) return iSize; + + /* Table header */ + { DTableDesc dtd = HUF_getDTableDesc(DTable); + if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ + dtd.tableType = 0; + dtd.tableLog = (BYTE)tableLog; + memcpy(DTable, &dtd, sizeof(dtd)); + } + + /* Calculate starting value for each rank */ + { U32 n, nextRankStart = 0; + for (n=1; n> 1; + size_t const uStart = rankVal[w]; + size_t const uEnd = uStart + length; + size_t u; + HUF_DEltX1 D; + D.byte = (BYTE)n; + D.nbBits = (BYTE)(tableLog + 1 - w); + rankVal[w] = (U32)uEnd; + if (length < 4) { + /* Use length in the loop bound so the compiler knows it is short. */ + for (u = 0; u < length; ++u) + dt[uStart + u] = D; + } else { + /* Unroll the loop 4 times, we know it is a power of 2. 
*/ + for (u = uStart; u < uEnd; u += 4) { + dt[u + 0] = D; + dt[u + 1] = D; + dt[u + 2] = D; + dt[u + 3] = D; + } } } } + return iSize; +} + +size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX1_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); +} + +FORCE_INLINE_TEMPLATE BYTE +HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ + BYTE const c = dt[val].byte; + BIT_skipBits(Dstream, dt[val].nbBits); + return c; +} + +#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ + *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) + +#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) + +HINT_INLINE size_t +HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 4 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + + /* [0-3] symbols remaining */ + if (MEM_32bits()) + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd)) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + + /* no more data to retrieve from bitstream, no need to reload */ + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + + return pEnd-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X1_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + 
dstSize; + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog); + + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + return dstSize; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - 3; + const void* const dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + U32 endSignal = 1; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, 
istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + } + + /* check corruption */ + /* note : should not be necessary : op# advance in lock step, and we control op4. 
+ * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + + +typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); + +HUF_DGEN(HUF_decompress1X1_usingDTable_internal) +HUF_DGEN(HUF_decompress4X1_usingDTable_internal) + + + +size_t HUF_decompress1X1_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t 
HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); +} + +size_t HUF_decompress4X1_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); +} + + +size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t 
cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} + +#endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +#ifndef HUF_FORCE_DECOMPRESS_X1 + +/* *************************/ +/* double-symbols decoding */ +/* *************************/ + +typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ +typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; +typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; + + +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, + const U32* rankValOrigin, const int minWeight, + const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, + U32 nbBitsBaseline, U16 baseSeq) +{ + HUF_DEltX2 DElt; + U32 rankVal[HUF_TABLELOG_MAX + 1]; + + /* get pre-calculated rankVal */ + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill skipped values */ + if (minWeight>1) { + U32 i, skipSize = rankVal[minWeight]; + MEM_writeLE16(&(DElt.sequence), baseSeq); + DElt.nbBits = (BYTE)(consumed); + DElt.length = 1; + for (i = 0; i < skipSize; i++) + DTable[i] = DElt; + } + + /* fill DTable */ + { U32 s; for (s=0; s= 1 */ + + rankVal[weight] += length; + } } +} + + +static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, const U32 sortedListSize, + const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) +{ + U32 rankVal[HUF_TABLELOG_MAX + 1]; + const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ + const U32 minBits = nbBitsBaseline - maxWeight; + U32 s; + + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill DTable */ + for (s=0; s= minBits) { /* enough room for a second symbol */ + U32 
sortedRank; + int minWeight = nbBits + scaleLog; + if (minWeight < 1) minWeight = 1; + sortedRank = rankStart[minWeight]; + HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, + rankValOrigin[nbBits], minWeight, + sortedList+sortedRank, sortedListSize-sortedRank, + nbBitsBaseline, symbol); + } else { + HUF_DEltX2 DElt; + MEM_writeLE16(&(DElt.sequence), symbol); + DElt.nbBits = (BYTE)(nbBits); + DElt.length = 1; + { U32 const end = start + length; + U32 u; + for (u = start; u < end; u++) DTable[u] = DElt; + } } + rankVal[weight] += length; + } +} + +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) +{ + U32 tableLog, maxW, sizeOfSort, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog; + size_t iSize; + void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ + HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; + U32 *rankStart; + + rankValCol_t* rankVal; + U32* rankStats; + U32* rankStart0; + sortedSymbol_t* sortedSymbol; + BYTE* weightList; + size_t spaceUsed32 = 0; + + rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2; + rankStats = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 1; + rankStart0 = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 2; + sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t); + spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; + weightList = (BYTE *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + + rankStart = rankStart0 + 1; + memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler 
fails here, assertion is wrong */ + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ + + iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); + if (HUF_isError(iSize)) return iSize; + + /* check result */ + if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + + /* find maxWeight */ + for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ + + /* Get start index of each weight */ + { U32 w, nextRankStart = 0; + for (w=1; w> consumed; + } } } } + + HUF_fillDTableX2(dt, maxTableLog, + sortedSymbol, sizeOfSort, + rankStart0, rankVal, maxW, + tableLog+1); + + dtd.tableLog = (BYTE)maxTableLog; + dtd.tableType = 1; + memcpy(DTable, &dtd, sizeof(dtd)); + return iSize; +} + +size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX2_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); +} + + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 1); + if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); + else { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) + /* ugly hack; works only because it's the last 
symbol. Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); + } } + return 1; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +HINT_INLINE size_t +HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, + const HUF_DEltX2* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + + if (p < pEnd) + p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); + + return p-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX2(ostart, &bitD, oend, dt, 
dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - (sizeof(size_t)-1); + const void* const dtPtr = DTable+1; + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal = 1; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + for ( ; 
(endSignal) & (op4 < olimit); ) { +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +#else + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = (U32)LIKELY( + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); +#endif + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > 
opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + +HUF_DGEN(HUF_decompress1X2_usingDTable_internal) +HUF_DGEN(HUF_decompress4X2_usingDTable_internal) + +size_t HUF_decompress1X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) return ERROR(GENERIC); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ 
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} + +size_t HUF_decompress4X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) return ERROR(GENERIC); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + + +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} + +#endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +/* ***********************************/ +/* Universal decompression selectors */ +/* ***********************************/ + +size_t HUF_decompress1X_usingDTable(void* dst, 
size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#else + return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#endif +} + +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#else + return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#endif +} + + +#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) +typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = +{ + /* single, double, quad */ + {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */ + {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */ + {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ + {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ + {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ + {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ + {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ + {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ + {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ + {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ + {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ + {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ + {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ + {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */ + {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */ + {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */ +}; +#endif + +/** HUF_selectDecoder() : + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . 
+ * Assumption : 0 < dstSize <= 128 KB */ +U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) +{ + assert(dstSize > 0); + assert(dstSize <= 128*1024); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dstSize; + (void)cSrcSize; + return 0; +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dstSize; + (void)cSrcSize; + return 1; +#else + /* decoder timing evaluation */ + { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + return DTime1 < DTime0; + } +#endif +} + + +typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); + +size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ +#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 }; +#endif + + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize); +#else + return decompress[algoNb](dst, dstSize, cSrc, cSrcSize); +#endif + } +} + +size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, 
size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); +#else + return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : + HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; +#endif + } +} + +size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + + +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, + size_t dstSize, const void* cSrc, + size_t cSrcSize, void* workSpace, + size_t wkspSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#else + return algoNb ? 
HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize): + HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#endif + } +} + +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#endif + } +} + +size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + + +size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#else + return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#endif +} + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} +#endif + +size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#else + return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#endif +} + +size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#else + return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : + HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#endif + } +} +/**** ended inlining decompress/huf_decompress.c ****/ +/**** start inlining decompress/zstd_ddict.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/* zstd_ddict.c : + * concentrates all logic that needs to know the internals of ZSTD_DDict object */ + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include /* memcpy, memmove, memset */ +/**** skipping file: ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** start inlining zstd_decompress_internal.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* zstd_decompress_internal: + * objects and definitions shared within lib/decompress modules */ + + #ifndef ZSTD_DECOMPRESS_INTERNAL_H + #define ZSTD_DECOMPRESS_INTERNAL_H + + +/*-******************************************************* + * Dependencies + *********************************************************/ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ + + + +/*-******************************************************* + * Constants + *********************************************************/ +static const U32 LL_base[MaxLL+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 18, 20, 22, 24, 28, 32, 40, + 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, + 0x2000, 0x4000, 0x8000, 0x10000 }; + +static const U32 OF_base[MaxOff+1] = { + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 
0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; + +static const U32 OF_bits[MaxOff+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 }; + +static const U32 ML_base[MaxML+1] = { + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 37, 39, 41, 43, 47, 51, 59, + 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, + 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; + + +/*-******************************************************* + * Decompression types + *********************************************************/ + typedef struct { + U32 fastMode; + U32 tableLog; + } ZSTD_seqSymbol_header; + + typedef struct { + U16 nextState; + BYTE nbAdditionalBits; + BYTE nbBits; + U32 baseValue; + } ZSTD_seqSymbol; + + #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log))) + +typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ + HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; +} ZSTD_entropyDTables_t; + +typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader, + ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock, + ZSTDds_decompressLastBlock, ZSTDds_checkChecksum, + ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage; + +typedef enum { zdss_init=0, zdss_loadHeader, + zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage; + +typedef enum { + ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */ + ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */ + ZSTD_use_once = 1 
/* Use the dictionary once and set to ZSTD_dont_use */ +} ZSTD_dictUses_e; + +typedef enum { + ZSTD_obm_buffered = 0, /* Buffer the output */ + ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */ +} ZSTD_outBufferMode_e; + +struct ZSTD_DCtx_s +{ + const ZSTD_seqSymbol* LLTptr; + const ZSTD_seqSymbol* MLTptr; + const ZSTD_seqSymbol* OFTptr; + const HUF_DTable* HUFptr; + ZSTD_entropyDTables_t entropy; + U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */ + const void* previousDstEnd; /* detect continuity */ + const void* prefixStart; /* start of current segment */ + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; + ZSTD_frameHeader fParams; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ + ZSTD_dStage stage; + U32 litEntropy; + U32 fseEntropy; + XXH64_state_t xxhState; + size_t headerSize; + ZSTD_format_e format; + const BYTE* litPtr; + ZSTD_customMem customMem; + size_t litSize; + size_t rleSize; + size_t staticSize; + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. 
*/ + + /* dictionary */ + ZSTD_DDict* ddictLocal; + const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */ + U32 dictID; + int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */ + ZSTD_dictUses_e dictUses; + + /* streaming */ + ZSTD_dStreamStage streamStage; + char* inBuff; + size_t inBuffSize; + size_t inPos; + size_t maxWindowSize; + char* outBuff; + size_t outBuffSize; + size_t outStart; + size_t outEnd; + size_t lhSize; + void* legacyContext; + U32 previousLegacyVersion; + U32 legacyVersion; + U32 hostageByte; + int noForwardProgress; + ZSTD_outBufferMode_e outBufferMode; + ZSTD_outBuffer expectedOutBuffer; + + /* workspace */ + BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; + BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; + + size_t oversizedDuration; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + void const* dictContentBeginForFuzzing; + void const* dictContentEndForFuzzing; +#endif +}; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + + +/*-******************************************************* + * Shared internal functions + *********************************************************/ + +/*! ZSTD_loadDEntropy() : + * dict : must point at beginning of a valid zstd dictionary. + * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */ +size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize); + +/*! ZSTD_checkContinuity() : + * check if next `dst` follows previous position, where decompression ended. + * If yes, do nothing (continue on current segment). + * If not, classify previous segment as "external dictionary", and start a new segment. + * This function cannot fail. 
*/ +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst); + + +#endif /* ZSTD_DECOMPRESS_INTERNAL_H */ +/**** ended inlining zstd_decompress_internal.h ****/ +/**** start inlining zstd_ddict.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +#ifndef ZSTD_DDICT_H +#define ZSTD_DDICT_H + +/*-******************************************************* + * Dependencies + *********************************************************/ +#include /* size_t */ +/**** skipping file: ../zstd.h ****/ + + +/*-******************************************************* + * Interface + *********************************************************/ + +/* note: several prototypes are already published in `zstd.h` : + * ZSTD_createDDict() + * ZSTD_createDDict_byReference() + * ZSTD_createDDict_advanced() + * ZSTD_freeDDict() + * ZSTD_initStaticDDict() + * ZSTD_sizeof_DDict() + * ZSTD_estimateDDictSize() + * ZSTD_getDictID_fromDict() + */ + +const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict); +size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict); + +void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + + + +#endif /* ZSTD_DDICT_H */ +/**** ended inlining zstd_ddict.h ****/ + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) +/**** start inlining ../legacy/zstd_legacy.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LEGACY_H +#define ZSTD_LEGACY_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: ../common/error_private.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ + +#if !defined (ZSTD_LEGACY_SUPPORT) || (ZSTD_LEGACY_SUPPORT == 0) +# undef ZSTD_LEGACY_SUPPORT +# define ZSTD_LEGACY_SUPPORT 8 +#endif + +#if (ZSTD_LEGACY_SUPPORT <= 1) +/**** start inlining zstd_v01.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V01_H_28739879432 +#define ZSTD_V01_H_28739879432 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv01_decompress() : decompress ZSTD frames compliant with v0.1.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. 
+ return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv01_isError()) +*/ +size_t ZSTDv01_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv01_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.1.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv01_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ +void ZSTDv01_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/** +ZSTDv01_isError() : tells if the result of ZSTDv01_decompress() is an error +*/ +unsigned ZSTDv01_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv01_Dctx_s ZSTDv01_Dctx; +ZSTDv01_Dctx* ZSTDv01_createDCtx(void); +size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx); + +size_t ZSTDv01_decompressDCtx(void* ctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + +/* ************************************* +* Streaming functions +***************************************/ +size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx); + +size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx); +size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternately. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv01_magicNumber 0xFD2FB51E /* Big Endian version */ +#define ZSTDv01_magicNumberLE 0x1EB52FFD /* Little Endian version */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V01_H_28739879432 */ +/**** ended inlining zstd_v01.h ****/ +#endif +#if (ZSTD_LEGACY_SUPPORT <= 2) +/**** start inlining zstd_v02.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V02_H_4174539423 +#define ZSTD_V02_H_4174539423 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv02_decompress() : decompress ZSTD frames compliant with v0.2.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. 
+ return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv02_isError()) +*/ +size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv02_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.2.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv02_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ +void ZSTDv02_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/** +ZSTDv02_isError() : tells if the result of ZSTDv02_decompress() is an error +*/ +unsigned ZSTDv02_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv02_Dctx_s ZSTDv02_Dctx; +ZSTDv02_Dctx* ZSTDv02_createDCtx(void); +size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx); + +size_t ZSTDv02_decompressDCtx(void* ctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + +/* ************************************* +* Streaming functions +***************************************/ +size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx); + +size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx); +size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternately. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv02_magicNumber 0xFD2FB522 /* v0.2 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V02_H_4174539423 */ +/**** ended inlining zstd_v02.h ****/ +#endif +#if (ZSTD_LEGACY_SUPPORT <= 3) +/**** start inlining zstd_v03.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V03_H_298734209782 +#define ZSTD_V03_H_298734209782 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv03_decompress() : decompress ZSTD frames compliant with v0.3.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. 
+ return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv03_isError()) +*/ +size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv03_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.3.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv03_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ + void ZSTDv03_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + + /** +ZSTDv03_isError() : tells if the result of ZSTDv03_decompress() is an error +*/ +unsigned ZSTDv03_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv03_Dctx_s ZSTDv03_Dctx; +ZSTDv03_Dctx* ZSTDv03_createDCtx(void); +size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx); + +size_t ZSTDv03_decompressDCtx(void* ctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + +/* ************************************* +* Streaming functions +***************************************/ +size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx); + +size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx); +size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternately. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv03_magicNumber 0xFD2FB523 /* v0.3 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V03_H_298734209782 */ +/**** ended inlining zstd_v03.h ****/ +#endif +#if (ZSTD_LEGACY_SUPPORT <= 4) +/**** start inlining zstd_v04.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V04_H_91868324769238 +#define ZSTD_V04_H_91868324769238 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv04_decompress() : decompress ZSTD frames compliant with v0.4.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. 
+ return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv04_isError()) +*/ +size_t ZSTDv04_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv04_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.4.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv04_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ + void ZSTDv04_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/** +ZSTDv04_isError() : tells if the result of ZSTDv04_decompress() is an error +*/ +unsigned ZSTDv04_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv04_Dctx_s ZSTDv04_Dctx; +ZSTDv04_Dctx* ZSTDv04_createDCtx(void); +size_t ZSTDv04_freeDCtx(ZSTDv04_Dctx* dctx); + +size_t ZSTDv04_decompressDCtx(ZSTDv04_Dctx* dctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + +/* ************************************* +* Direct Streaming +***************************************/ +size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx); + +size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx); +size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternately. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + + +/* ************************************* +* Buffered Streaming +***************************************/ +typedef struct ZBUFFv04_DCtx_s ZBUFFv04_DCtx; +ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void); +size_t ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx); + +size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx); +size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* dict, size_t dictSize); + +size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr); + +/** ************************************************ +* Streaming decompression +* +* A ZBUFF_DCtx object is required to track streaming operation. +* Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources. +* Use ZBUFF_decompressInit() to start a new decompression operation. +* ZBUFF_DCtx objects can be reused multiple times. +* +* Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary() +* It must be the same content as the one set during compression phase. +* Dictionary content must remain accessible during the decompression process. +* +* Use ZBUFF_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *maxDstSizePtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst. 
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency) +* or 0 when a frame is completely decoded +* or an error code, which can be tested using ZBUFF_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize / ZBUFF_recommendedDOutSize +* output : ZBUFF_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded. +* input : ZBUFF_recommendedDInSize==128Kb+3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . +* **************************************************/ +unsigned ZBUFFv04_isError(size_t errorCode); +const char* ZBUFFv04_getErrorName(size_t errorCode); + + +/** The below functions provide recommended buffer sizes for Compression or Decompression operations. +* These sizes are not compulsory, they just tend to offer better latency */ +size_t ZBUFFv04_recommendedDInSize(void); +size_t ZBUFFv04_recommendedDOutSize(void); + + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv04_magicNumber 0xFD2FB524 /* v0.4 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V04_H_91868324769238 */ +/**** ended inlining zstd_v04.h ****/ +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) +/**** start inlining zstd_v05.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */
+
+#ifndef ZSTDv05_H
+#define ZSTDv05_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stddef.h>   /* size_t */
+/**** skipping file: ../common/mem.h ****/
+
+
+/* *************************************
+*  Simple functions
+***************************************/
+/*! ZSTDv05_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv05_isError()) */
+size_t ZSTDv05_decompress( void* dst, size_t dstCapacity,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv05_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.5.x format
+    srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    cSize (output parameter) : the number of bytes that would be read to decompress this frame
+                               or an error code if it fails (which can be tested using ZSTDv05_isError())
+    dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+                                or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+    note : assumes `cSize` and `dBound` are _not_ NULL.
+ */ +void ZSTDv05_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/* ************************************* +* Helper functions +***************************************/ +/* Error Management */ +unsigned ZSTDv05_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +const char* ZSTDv05_getErrorName(size_t code); /*!< provides readable string for an error code */ + + +/* ************************************* +* Explicit memory management +***************************************/ +/** Decompression context */ +typedef struct ZSTDv05_DCtx_s ZSTDv05_DCtx; +ZSTDv05_DCtx* ZSTDv05_createDCtx(void); +size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx); /*!< @return : errorCode */ + +/** ZSTDv05_decompressDCtx() : +* Same as ZSTDv05_decompress(), but requires an already allocated ZSTDv05_DCtx (see ZSTDv05_createDCtx()) */ +size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*-*********************** +* Simple Dictionary API +*************************/ +/*! ZSTDv05_decompress_usingDict() : +* Decompression using a pre-defined Dictionary content (see dictBuilder). +* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted. 
+* Note : dict can be NULL, in which case, it's equivalent to ZSTDv05_decompressDCtx() */ +size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + +/*-************************ +* Advanced Streaming API +***************************/ +typedef enum { ZSTDv05_fast, ZSTDv05_greedy, ZSTDv05_lazy, ZSTDv05_lazy2, ZSTDv05_btlazy2, ZSTDv05_opt, ZSTDv05_btopt } ZSTDv05_strategy; +typedef struct { + U64 srcSize; + U32 windowLog; /* the only useful information to retrieve */ + U32 contentLog; U32 hashLog; U32 searchLog; U32 searchLength; U32 targetLength; ZSTDv05_strategy strategy; +} ZSTDv05_parameters; +size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize); + +size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize); +void ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx); +size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx); +size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*-*********************** +* ZBUFF API +*************************/ +typedef struct ZBUFFv05_DCtx_s ZBUFFv05_DCtx; +ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void); +size_t ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* dctx); + +size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* dctx); +size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* dctx, const void* dict, size_t dictSize); + +size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* dctx, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr); + +/*-*************************************************************************** +* Streaming decompression +* +* A ZBUFFv05_DCtx object is required to track streaming operations. +* Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources. 
+* Use ZBUFFv05_decompressInit() to start a new decompression operation, +* or ZBUFFv05_decompressInitDictionary() if decompression requires a dictionary. +* Note that ZBUFFv05_DCtx objects can be reused multiple times. +* +* Use ZBUFFv05_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change @dst. +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency) +* or 0 when a frame is completely decoded +* or an error code, which can be tested using ZBUFFv05_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFFv05_recommendedDInSize() / ZBUFFv05_recommendedDOutSize() +* output : ZBUFFv05_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. +* input : ZBUFFv05_recommendedDInSize==128Kb+3; just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . +* *******************************************************************************/ + + +/* ************************************* +* Tool functions +***************************************/ +unsigned ZBUFFv05_isError(size_t errorCode); +const char* ZBUFFv05_getErrorName(size_t errorCode); + +/** Functions below provide recommended buffer sizes for Compression or Decompression operations. 
+*   These sizes are just hints, and tend to offer better latency */
+size_t ZBUFFv05_recommendedDInSize(void);
+size_t ZBUFFv05_recommendedDOutSize(void);
+
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv05_MAGICNUMBER 0xFD2FB525   /* v0.5 */
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv05_H */
+/**** ended inlining zstd_v05.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+/**** start inlining zstd_v06.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv06_H
+#define ZSTDv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*======  Dependency  ======*/
+#include <stddef.h>   /* size_t */
+
+
+/*======  Export for Windows  ======*/
+/*!
+*  ZSTDv06_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv06_DLL_EXPORT) && (ZSTDv06_DLL_EXPORT==1)
+#  define ZSTDLIBv06_API __declspec(dllexport)
+#else
+#  define ZSTDLIBv06_API
+#endif
+
+
+/* *************************************
+*  Simple functions
+***************************************/
+/*! ZSTDv06_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv06_isError()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress( void* dst, size_t dstCapacity,
+                                    const void* src, size_t compressedSize);
+
+/**
+ZSTDv06_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.6.x format
+    srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    cSize (output parameter) : the number of bytes that would be read to decompress this frame
+                               or an error code if it fails (which can be tested using ZSTDv06_isError())
+    dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+                                or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+    note : assumes `cSize` and `dBound` are _not_ NULL.
+*/
+void ZSTDv06_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+                                     size_t* cSize, unsigned long long* dBound);
+
+/* *************************************
+*  Helper functions
+***************************************/
+ZSTDLIBv06_API size_t ZSTDv06_compressBound(size_t srcSize); /*!< maximum compressed size (worst case scenario) */
+
+/* Error Management */
+ZSTDLIBv06_API unsigned ZSTDv06_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv06_API const char* ZSTDv06_getErrorName(size_t code); /*!< provides readable string for an error code */
+
+
+/* *************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv06_DCtx_s ZSTDv06_DCtx;
+ZSTDLIBv06_API ZSTDv06_DCtx* ZSTDv06_createDCtx(void);
+ZSTDLIBv06_API size_t ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv06_decompressDCtx() :
+*   Same as ZSTDv06_decompress(), but requires an already allocated ZSTDv06_DCtx (see ZSTDv06_createDCtx()) */
+ZSTDLIBv06_API size_t
ZSTDv06_decompressDCtx(ZSTDv06_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*-*********************** +* Dictionary API +*************************/ +/*! ZSTDv06_decompress_usingDict() : +* Decompression using a pre-defined Dictionary content (see dictBuilder). +* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted. +* Note : dict can be NULL, in which case, it's equivalent to ZSTDv06_decompressDCtx() */ +ZSTDLIBv06_API size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + + +/*-************************ +* Advanced Streaming API +***************************/ +struct ZSTDv06_frameParams_s { unsigned long long frameContentSize; unsigned windowLog; }; +typedef struct ZSTDv06_frameParams_s ZSTDv06_frameParams; + +ZSTDLIBv06_API size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +ZSTDLIBv06_API size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIBv06_API void ZSTDv06_copyDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx); + +ZSTDLIBv06_API size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx); +ZSTDLIBv06_API size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + + +/* ************************************* +* ZBUFF API +***************************************/ + +typedef struct ZBUFFv06_DCtx_s ZBUFFv06_DCtx; +ZSTDLIBv06_API ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void); +ZSTDLIBv06_API size_t ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* dctx); + +ZSTDLIBv06_API size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* dctx); +ZSTDLIBv06_API size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* dctx, const void* dict, size_t dictSize); + +ZSTDLIBv06_API size_t 
ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* dctx, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr); + +/*-*************************************************************************** +* Streaming decompression howto +* +* A ZBUFFv06_DCtx object is required to track streaming operations. +* Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources. +* Use ZBUFFv06_decompressInit() to start a new decompression operation, +* or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary. +* Note that ZBUFFv06_DCtx objects can be re-init multiple times. +* +* Use ZBUFFv06_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`. +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency), +* or 0 when a frame is completely decoded, +* or an error code, which can be tested using ZBUFFv06_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize() +* output : ZBUFFv06_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. +* input : ZBUFFv06_recommendedDInSize == 128KB + 3; +* just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . 
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZSTDLIBv06_API unsigned ZBUFFv06_isError(size_t errorCode);
+ZSTDLIBv06_API const char* ZBUFFv06_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDInSize(void);
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void);
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv06_MAGICNUMBER 0xFD2FB526   /* v0.6 */
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv06_H */
+/**** ended inlining zstd_v06.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+/**** start inlining zstd_v07.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv07_H_235446
+#define ZSTDv07_H_235446
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*======  Dependency  ======*/
+#include <stddef.h>   /* size_t */
+
+
+/*======  Export for Windows  ======*/
+/*!
+*  ZSTDv07_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv07_DLL_EXPORT) && (ZSTDv07_DLL_EXPORT==1)
+#  define ZSTDLIBv07_API __declspec(dllexport)
+#else
+#  define ZSTDLIBv07_API
+#endif
+
+
+/* *************************************
+*  Simple API
+***************************************/
+/*! ZSTDv07_getDecompressedSize() :
+*   @return : decompressed size if known, 0 otherwise.
+       note 1 : if `0`, follow up with ZSTDv07_getFrameParams() to know precise failure cause.
+       note 2 : decompressed size could be wrong or intentionally modified !
+                always ensure results fit within application's authorized limits */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTDv07_decompress() :
+    `compressedSize` : must be _exact_ size of compressed input, otherwise decompression will fail.
+    `dstCapacity` must be equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv07_isError()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompress( void* dst, size_t dstCapacity,
+                                    const void* src, size_t compressedSize);
+
+/**
+ZSTDv07_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.7.x format
+    srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    cSize (output parameter) : the number of bytes that would be read to decompress this frame
+                               or an error code if it fails (which can be tested using ZSTDv07_isError())
+    dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+                                or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+    note : assumes `cSize` and `dBound` are _not_ NULL.
+*/
+void ZSTDv07_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+                                     size_t* cSize, unsigned long long* dBound);
+
+/*======  Helper functions  ======*/
+ZSTDLIBv07_API unsigned ZSTDv07_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv07_API const char* ZSTDv07_getErrorName(size_t code); /*!< provides readable string from an error code */
+
+
+/*-*************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv07_DCtx_s ZSTDv07_DCtx;
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx(void);
+ZSTDLIBv07_API size_t ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv07_decompressDCtx() :
+*   Same as ZSTDv07_decompress(), requires an allocated ZSTDv07_DCtx (see ZSTDv07_createDCtx()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-************************
+*  Simple dictionary API
+***************************/
+/*! ZSTDv07_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression.
+*   Note : This function loads the dictionary, resulting in a significant startup time */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+                                                   void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict,size_t dictSize);
+
+
+/*-**************************
+*  Advanced Dictionary API
+****************************/
+/*! ZSTDv07_createDDict() :
+*   Create a digested dictionary, ready to start decompression operation without startup delay.
+*   `dict` can be released after creation */
+typedef struct ZSTDv07_DDict_s ZSTDv07_DDict;
+ZSTDLIBv07_API ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize);
+ZSTDLIBv07_API size_t ZSTDv07_freeDDict(ZSTDv07_DDict* ddict);
+
+/*!
ZSTDv07_decompress_usingDDict() : +* Decompression using a pre-digested Dictionary +* Faster startup than ZSTDv07_decompress_usingDict(), recommended when same dictionary is used multiple times. */ +ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTDv07_DDict* ddict); + +typedef struct { + unsigned long long frameContentSize; + unsigned windowSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTDv07_frameParams; + +ZSTDLIBv07_API size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */ + + + + +/* ************************************* +* Streaming functions +***************************************/ +typedef struct ZBUFFv07_DCtx_s ZBUFFv07_DCtx; +ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void); +ZSTDLIBv07_API size_t ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* dctx); + +ZSTDLIBv07_API size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* dctx); +ZSTDLIBv07_API size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* dctx, const void* dict, size_t dictSize); + +ZSTDLIBv07_API size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* dctx, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr); + +/*-*************************************************************************** +* Streaming decompression howto +* +* A ZBUFFv07_DCtx object is required to track streaming operations. +* Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources. +* Use ZBUFFv07_decompressInit() to start a new decompression operation, +* or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary. +* Note that ZBUFFv07_DCtx objects can be re-init multiple times. +* +* Use ZBUFFv07_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. 
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`. +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency), +* or 0 when a frame is completely decoded, +* or an error code, which can be tested using ZBUFFv07_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize() +* output : ZBUFFv07_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. +* input : ZBUFFv07_recommendedDInSize == 128KB + 3; +* just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . +* *******************************************************************************/ + + +/* ************************************* +* Tool functions +***************************************/ +ZSTDLIBv07_API unsigned ZBUFFv07_isError(size_t errorCode); +ZSTDLIBv07_API const char* ZBUFFv07_getErrorName(size_t errorCode); + +/** Functions below provide recommended buffer sizes for Compression or Decompression operations. 
+*   These sizes are just hints, they tend to offer better latency */
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDInSize(void);
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDOutSize(void);
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv07_MAGICNUMBER 0xFD2FB527   /* v0.7 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv07_H_235446 */
+/**** ended inlining zstd_v07.h ****/
+#endif
+
+/** ZSTD_isLegacy() :
+ *  Reads the 4-byte little-endian magic number at `src` and maps it to a legacy format version.
+ *  @return : the version number (1 to 7) when the matching legacy decoder is compiled in
+ *            (see ZSTD_LEGACY_SUPPORT), 0 otherwise, including when `srcSize` < 4.
+ */
+MEM_STATIC unsigned ZSTD_isLegacy(const void* src, size_t srcSize)
+{
+    if (srcSize >= 4) {
+        U32 const magic = MEM_readLE32(src);
+        switch (magic)
+        {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+            case ZSTDv01_magicNumberLE : return 1;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+            case ZSTDv02_magicNumber : return 2;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+            case ZSTDv03_magicNumber : return 3;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+            case ZSTDv04_magicNumber : return 4;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+            case ZSTDv05_MAGICNUMBER : return 5;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+            case ZSTDv06_MAGICNUMBER : return 6;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+            case ZSTDv07_MAGICNUMBER : return 7;
+#endif
+            default : break;
+        }
+    }
+    return 0;   /* too short to hold a magic number, or not a supported legacy frame */
+}
+
+
+/** ZSTD_getDecompressedSize_legacy() :
+ *  @return : the content size recorded in the frame header of a v0.5, v0.6 or v0.7 frame,
+ *            or 0 when the header cannot be parsed, when the format is older than v0.5
+ *            (no decompressed size in the frame header), or when `src` is not a legacy frame.
+ */
+MEM_STATIC unsigned long long ZSTD_getDecompressedSize_legacy(const void* src, size_t srcSize)
+{
+    switch (ZSTD_isLegacy(src, srcSize))
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+        {   ZSTDv05_parameters params;
+            if (ZSTDv05_getFrameParams(&params, src, srcSize) != 0) return 0;
+            return params.srcSize;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+        {   ZSTDv06_frameParams params;
+            if (ZSTDv06_getFrameParams(&params, src, srcSize) != 0) return 0;
+            return params.frameContentSize;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+        {   ZSTDv07_frameParams params;
+            if (ZSTDv07_getFrameParams(&params, src, srcSize) != 0) return 0;
+            return params.frameContentSize;
+        }
+#endif
+        default :   /* no decompressed size in frame header, or not a legacy format */
+            return 0;
+    }
+}
+
+
+/** ZSTD_decompressLegacy() :
+ *  Decompresses one legacy frame by dispatching to the decoder matching its magic number.
+ *  `dict` / `dictSize` are forwarded only to the v0.5, v0.6 and v0.7 decoders;
+ *  the v0.1 to v0.4 entry points used here take no dictionary.
+ *  @return : the result of the selected decoder,
+ *            ERROR(memory_allocation) if a v0.5+ decompression context cannot be created,
+ *            ERROR(prefix_unknown) if the frame is not a supported legacy format.
+ */
+MEM_STATIC size_t ZSTD_decompressLegacy(
+                     void* dst, size_t dstCapacity,
+               const void* src, size_t compressedSize,
+               const void* dict,size_t dictSize)
+{
+    U32 const legacyVersion = ZSTD_isLegacy(src, compressedSize);
+    /* unused when ZSTD_LEGACY_SUPPORT >= 8 */
+    (void)dst; (void)dstCapacity; (void)dict; (void)dictSize;
+    switch (legacyVersion)
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case 1 :
+            return ZSTDv01_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case 2 :
+            return ZSTDv02_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case 3 :
+            return ZSTDv03_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            return ZSTDv04_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+        {   ZSTDv05_DCtx* const dctx = ZSTDv05_createDCtx();
+            size_t ret;
+            if (dctx == NULL) return ERROR(memory_allocation);
+            ret = ZSTDv05_decompress_usingDict(dctx, dst, dstCapacity, src, compressedSize, dict, dictSize);
+            ZSTDv05_freeDCtx(dctx);
+            return ret;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+        {   ZSTDv06_DCtx* const dctx = ZSTDv06_createDCtx();
+            size_t ret;
+            if (dctx == NULL) return ERROR(memory_allocation);
+            ret = ZSTDv06_decompress_usingDict(dctx, dst, dstCapacity, src, compressedSize, dict, dictSize);
+            ZSTDv06_freeDCtx(dctx);
+            return ret;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+        {   ZSTDv07_DCtx* const dctx = ZSTDv07_createDCtx();
+            size_t ret;
+            if (dctx == NULL) return ERROR(memory_allocation);
+            ret = ZSTDv07_decompress_usingDict(dctx, dst, dstCapacity, src, compressedSize, dict, dictSize);
+            ZSTDv07_freeDCtx(dctx);
+            return ret;
+        }
+#endif
+        default :
+            return ERROR(prefix_unknown);
+    }
+}
+
+/** ZSTD_findFrameSizeInfoLegacy() :
+ *  Asks the decoder matching the frame's magic number for the frame's compressed size
+ *  and an upper bound of its decompressed size.
+ *  @return : a ZSTD_frameSizeInfo; on failure `compressedSize` holds an error code and
+ *            `decompressedBound` is ZSTD_CONTENTSIZE_ERROR. This is also the result when the
+ *            frame is not a supported legacy format (prefix_unknown) or when the reported
+ *            compressed size exceeds `srcSize` (srcSize_wrong).
+ */
+MEM_STATIC ZSTD_frameSizeInfo ZSTD_findFrameSizeInfoLegacy(const void *src, size_t srcSize)
+{
+    ZSTD_frameSizeInfo info;
+    switch (ZSTD_isLegacy(src, srcSize))
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case 1 :
+            ZSTDv01_findFrameSizeInfoLegacy(src, srcSize, &info.compressedSize, &info.decompressedBound);
+            break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case 2 :
+            ZSTDv02_findFrameSizeInfoLegacy(src, srcSize, &info.compressedSize, &info.decompressedBound);
+            break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case 3 :
+            ZSTDv03_findFrameSizeInfoLegacy(src, srcSize, &info.compressedSize, &info.decompressedBound);
+            break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            ZSTDv04_findFrameSizeInfoLegacy(src, srcSize, &info.compressedSize, &info.decompressedBound);
+            break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+            ZSTDv05_findFrameSizeInfoLegacy(src, srcSize, &info.compressedSize, &info.decompressedBound);
+            break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+            ZSTDv06_findFrameSizeInfoLegacy(src, srcSize, &info.compressedSize, &info.decompressedBound);
+            break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+            ZSTDv07_findFrameSizeInfoLegacy(src, srcSize, &info.compressedSize, &info.decompressedBound);
+            break;
+#endif
+        default :
+            info.compressedSize = ERROR(prefix_unknown);
+            info.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+            break;
+    }
+    /* a successfully sized frame must still fit inside the buffer that was provided */
+    if (!ZSTD_isError(info.compressedSize)) {
+        if (info.compressedSize > srcSize) {
+            info.compressedSize = ERROR(srcSize_wrong);
+            info.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+        }
+    }
+    return info;
+}
+
+MEM_STATIC size_t ZSTD_findFrameCompressedSizeLegacy(const void *src, size_t srcSize) +{ + ZSTD_frameSizeInfo frameSizeInfo = ZSTD_findFrameSizeInfoLegacy(src, srcSize); + return frameSizeInfo.compressedSize; +} + +MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version) +{ + switch(version) + { + default : + case 1 : + case 2 : + case 3 : + (void)legacyContext; + return ERROR(version_unsupported); +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : return ZBUFFv04_freeDCtx((ZBUFFv04_DCtx*)legacyContext); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : return ZBUFFv05_freeDCtx((ZBUFFv05_DCtx*)legacyContext); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : return ZBUFFv06_freeDCtx((ZBUFFv06_DCtx*)legacyContext); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : return ZBUFFv07_freeDCtx((ZBUFFv07_DCtx*)legacyContext); +#endif + } +} + + +MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion, + const void* dict, size_t dictSize) +{ + DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion); + if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion); + switch(newVersion) + { + default : + case 1 : + case 2 : + case 3 : + (void)dict; (void)dictSize; + return 0; +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : + { + ZBUFFv04_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv04_createDCtx() : (ZBUFFv04_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv04_decompressInit(dctx); + ZBUFFv04_decompressWithDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : + { + ZBUFFv05_DCtx* dctx = (prevVersion != newVersion) ? 
ZBUFFv05_createDCtx() : (ZBUFFv05_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv05_decompressInitDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : + { + ZBUFFv06_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv06_createDCtx() : (ZBUFFv06_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv06_decompressInitDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : + { + ZBUFFv07_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv07_createDCtx() : (ZBUFFv07_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv07_decompressInitDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif + } +} + + + +MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version, + ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version); + switch(version) + { + default : + case 1 : + case 2 : + case 3 : + (void)legacyContext; (void)output; (void)input; + return ERROR(version_unsupported); +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : + { + ZBUFFv04_DCtx* dctx = (ZBUFFv04_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; + size_t decodedSize = output->size - output->pos; + size_t const hintSize = ZBUFFv04_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : + { + ZBUFFv05_DCtx* dctx = (ZBUFFv05_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; + size_t decodedSize = output->size - 
output->pos; + size_t const hintSize = ZBUFFv05_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : + { + ZBUFFv06_DCtx* dctx = (ZBUFFv06_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; + size_t decodedSize = output->size - output->pos; + size_t const hintSize = ZBUFFv06_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : + { + ZBUFFv07_DCtx* dctx = (ZBUFFv07_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; + size_t decodedSize = output->size - output->pos; + size_t const hintSize = ZBUFFv07_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif + } +} + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_LEGACY_H */ +/**** ended inlining ../legacy/zstd_legacy.h ****/ +#endif + + + +/*-******************************************************* +* Types +*********************************************************/ +struct ZSTD_DDict_s { + void* dictBuffer; + const void* dictContent; + size_t dictSize; + ZSTD_entropyDTables_t entropy; + U32 dictID; + U32 entropyPresent; + ZSTD_customMem cMem; +}; /* typedef'd to ZSTD_DDict within "zstd.h" */ + +const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict) +{ + assert(ddict != NULL); + return ddict->dictContent; +} + +size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict) +{ + assert(ddict != NULL); + return ddict->dictSize; +} + +void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + 
DEBUGLOG(4, "ZSTD_copyDDictParameters"); + assert(dctx != NULL); + assert(ddict != NULL); + dctx->dictID = ddict->dictID; + dctx->prefixStart = ddict->dictContent; + dctx->virtualStart = ddict->dictContent; + dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; + dctx->previousDstEnd = dctx->dictEnd; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif + if (ddict->entropyPresent) { + dctx->litEntropy = 1; + dctx->fseEntropy = 1; + dctx->LLTptr = ddict->entropy.LLTable; + dctx->MLTptr = ddict->entropy.MLTable; + dctx->OFTptr = ddict->entropy.OFTable; + dctx->HUFptr = ddict->entropy.hufTable; + dctx->entropy.rep[0] = ddict->entropy.rep[0]; + dctx->entropy.rep[1] = ddict->entropy.rep[1]; + dctx->entropy.rep[2] = ddict->entropy.rep[2]; + } else { + dctx->litEntropy = 0; + dctx->fseEntropy = 0; + } +} + + +static size_t +ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict, + ZSTD_dictContentType_e dictContentType) +{ + ddict->dictID = 0; + ddict->entropyPresent = 0; + if (dictContentType == ZSTD_dct_rawContent) return 0; + + if (ddict->dictSize < 8) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + { U32 const magic = MEM_readLE32(ddict->dictContent); + if (magic != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + } + ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE); + + /* load entropy tables */ + RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy( + &ddict->entropy, ddict->dictContent, ddict->dictSize)), + dictionary_corrupted, ""); + ddict->entropyPresent = 1; + return 0; +} + + +static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + const void* dict, size_t 
dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) { + ddict->dictBuffer = NULL; + ddict->dictContent = dict; + if (!dict) dictSize = 0; + } else { + void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem); + ddict->dictBuffer = internalBuffer; + ddict->dictContent = internalBuffer; + if (!internalBuffer) return ERROR(memory_allocation); + memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; + ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); + + return 0; +} + +ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem) +{ + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem); + if (ddict == NULL) return NULL; + ddict->cMem = customMem; + { size_t const initResult = ZSTD_initDDict_internal(ddict, + dict, dictSize, + dictLoadMethod, dictContentType); + if (ZSTD_isError(initResult)) { + ZSTD_freeDDict(ddict); + return NULL; + } } + return ddict; + } +} + +/*! ZSTD_createDDict() : +* Create a digested dictionary, to start decompression without startup delay. +* `dict` content is copied inside DDict. +* Consequently, `dict` can be released after `ZSTD_DDict` creation */ +ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator); +} + +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, to start decompression without startup delay. 
+ * Dictionary content is simply referenced, it will be accessed during decompression. + * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */ +ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator); +} + +/*! ZSTD_initStaticDDict() : + * Build a DDict in place at the start of the caller-provided buffer `sBuffer`. + * With ZSTD_dlm_byCopy, the dictionary content is first copied into `sBuffer`, right after the DDict structure; + * the DDict is then always initialized by reference (ZSTD_dlm_byRef) onto the selected content. + * @return : `sBuffer` seen as a DDict, + * or NULL if `sBuffer` is not 8-byte aligned, if `sBufferSize` is too small, or if initialization fails. */ +const ZSTD_DDict* ZSTD_initStaticDDict( + void* sBuffer, size_t sBufferSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + size_t const neededSpace = sizeof(ZSTD_DDict) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); + ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer; + assert(sBuffer != NULL); + assert(dict != NULL); + if ((size_t)sBuffer & 7) return NULL; /* sBuffer must be 8-byte aligned */ + if (sBufferSize < neededSpace) return NULL; + if (dictLoadMethod == ZSTD_dlm_byCopy) { + memcpy(ddict+1, dict, dictSize); /* local copy, stored right after the DDict structure */ + dict = ddict+1; + } + if (ZSTD_isError( ZSTD_initDDict_internal(ddict, + dict, dictSize, + ZSTD_dlm_byRef, dictContentType) )) + return NULL; + return ddict; +} + +/*! ZSTD_freeDDict() : + * Release `ddict->dictBuffer` and `ddict` itself through the allocator stored in `ddict->cMem`. + * @return : always 0 (a NULL `ddict` is accepted and ignored) */ +size_t ZSTD_freeDDict(ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; /* support free on NULL */ + { ZSTD_customMem const cMem = ddict->cMem; + ZSTD_free(ddict->dictBuffer, cMem); + ZSTD_free(ddict, cMem); + return 0; + } +} + +/*! ZSTD_estimateDDictSize() : + * Estimate amount of memory that will be needed to create a dictionary for decompression. + * Note : dictionaries created by reference using ZSTD_dlm_byRef are smaller */ +size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod) +{ + return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); +} +/*! ZSTD_sizeof_DDict() : + * @return : size of the DDict structure, plus `dictSize` when `dictBuffer` is set; 0 if `ddict` is NULL */ +size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; /* support sizeof on NULL */ + return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ; +} + +/*! 
ZSTD_getDictID_fromDDict() : + * Provides the dictID of the dictionary loaded into `ddict`. + * If @return == 0, `ddict` is NULL, or the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; + return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); +} +/**** ended inlining decompress/zstd_ddict.c ****/ +/**** start inlining decompress/zstd_decompress.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* *************************************************************** +* Tuning parameters +*****************************************************************/ +/*! + * HEAPMODE : + * Select how default decompression function ZSTD_decompress() allocates its context, + * on stack (0), or into heap (1, default; requires malloc()). + * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected. + */ +#ifndef ZSTD_HEAPMODE +# define ZSTD_HEAPMODE 1 +#endif + +/*! +* LEGACY_SUPPORT : +* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+) +* default here is 0 when the build does not define it +*/ +#ifndef ZSTD_LEGACY_SUPPORT +# define ZSTD_LEGACY_SUPPORT 0 +#endif + +/*! + * MAXWINDOWSIZE_DEFAULT : + * maximum window size accepted by DStream __by default__. + * Frames requiring more memory will be rejected. + * It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize(). + */ +#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT +# define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1) +#endif + +/*! 
+ * NO_FORWARD_PROGRESS_MAX : + * maximum allowed nb of calls to ZSTD_decompressStream() + * without any forward progress + * (defined as: no byte read from input, and no byte flushed to output) + * before triggering an error. + */ +#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX +# define ZSTD_NO_FORWARD_PROGRESS_MAX 16 +#endif + + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include /* memcpy, memmove, memset */ +/**** skipping file: ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: zstd_decompress_internal.h ****/ +/**** skipping file: zstd_ddict.h ****/ +/**** start inlining zstd_decompress_block.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + + +#ifndef ZSTD_DEC_BLOCK_H +#define ZSTD_DEC_BLOCK_H + +/*-******************************************************* + * Dependencies + *********************************************************/ +#include /* size_t */ +/**** skipping file: ../zstd.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: zstd_decompress_internal.h ****/ + + +/* === Prototypes === */ + +/* note: prototypes already published within `zstd.h` : + * ZSTD_decompressBlock() + */ + +/* note: prototypes already published within `zstd_internal.h` : + * ZSTD_getcBlockSize() + * ZSTD_decodeSeqHeaders() + */ + + +/* ZSTD_decompressBlock_internal() : + * decompress block, starting at `src`, + * into destination buffer `dst`. + * @return : decompressed block size, + * or an error code (which can be tested using ZSTD_isError()) + */ +size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, const int frame); + +/* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) + * this function must be called with valid parameters only + * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.) + * in which case it cannot fail. + * Internal use only. 
+ */ +void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog); + + +#endif /* ZSTD_DEC_BLOCK_H */ +/**** ended inlining zstd_decompress_block.h ****/ + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) +/**** skipping file: ../legacy/zstd_legacy.h ****/ +#endif + + +/*-************************************************************* +* Context management +***************************************************************/ +size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support sizeof NULL */ + return sizeof(*dctx) + + ZSTD_sizeof_DDict(dctx->ddictLocal) + + dctx->inBuffSize + dctx->outBuffSize; /* structure + local dictionary + both stream buffers */ +} + +size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); } + + +static size_t ZSTD_startingInputLength(ZSTD_format_e format) +{ + size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format); + /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */ + assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) ); + return startingInputLength; +} + +static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +{ + dctx->format = ZSTD_f_zstd1; /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */ + dctx->staticSize = 0; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + dctx->ddict = NULL; + dctx->ddictLocal = NULL; + dctx->dictEnd = NULL; + dctx->ddictIsCold = 0; + dctx->dictUses = ZSTD_dont_use; + dctx->inBuff = NULL; + dctx->inBuffSize = 0; + dctx->outBuffSize = 0; + dctx->streamStage = zdss_init; + dctx->legacyContext = NULL; + dctx->previousLegacyVersion = 0; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; + dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + dctx->outBufferMode = ZSTD_obm_buffered; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentEndForFuzzing = NULL; +#endif +} + +ZSTD_DCtx* 
ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace; + + if ((size_t)workspace & 7) return NULL; /* workspace must be 8-byte aligned */ + if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL; /* minimum size */ + + ZSTD_initDCtx_internal(dctx); + dctx->staticSize = workspaceSize; /* non-zero staticSize makes ZSTD_freeDCtx() refuse this context */ + dctx->inBuff = (char*)(dctx+1); /* input buffer starts right after the DCtx structure */ + return dctx; +} + +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; /* customAlloc and customFree must be both set or both NULL */ + + { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem); + if (!dctx) return NULL; + dctx->customMem = customMem; + ZSTD_initDCtx_internal(dctx); + return dctx; + } +} + +ZSTD_DCtx* ZSTD_createDCtx(void) +{ + DEBUGLOG(3, "ZSTD_createDCtx"); + return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); +} + +static void ZSTD_clearDict(ZSTD_DCtx* dctx) +{ + ZSTD_freeDDict(dctx->ddictLocal); /* only the locally owned dictionary is freed */ + dctx->ddictLocal = NULL; + dctx->ddict = NULL; + dctx->dictUses = ZSTD_dont_use; +} + +size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx"); + { ZSTD_customMem const cMem = dctx->customMem; + ZSTD_clearDict(dctx); + ZSTD_free(dctx->inBuff, cMem); + dctx->inBuff = NULL; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (dctx->legacyContext) + ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); +#endif + ZSTD_free(dctx, cMem); + return 0; + } +} + +/* no longer useful */ +void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) +{ + size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx); /* every field located before `inBuff` */ + memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ +} + + +/*! 
ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +unsigned ZSTD_isFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if (magic == ZSTD_MAGICNUMBER) return 1; + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(buffer, size)) return 1; +#endif + return 0; +} + +/** ZSTD_frameHeaderSize_internal() : + * srcSize must be large enough to reach header size fields. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. + * @return : size of the Frame Header + * or an error code, which can be tested with ZSTD_isError() */ +static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) +{ + size_t const minInputSize = ZSTD_startingInputLength(format); + RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, ""); + + { BYTE const fhd = ((const BYTE*)src)[minInputSize-1]; /* frame header descriptor : last byte of the starting input */ + U32 const dictID= fhd & 3; /* bits 0-1 : dictionary ID field size code */ + U32 const singleSegment = (fhd >> 5) & 1; /* bit 5 : single-segment flag */ + U32 const fcsId = fhd >> 6; /* bits 6-7 : frame content size field code */ + return minInputSize + !singleSegment /* one extra byte when the frame is not single-segment */ + + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + + (singleSegment && !fcsId); /* single-segment frames with fcsId 0 still carry one content size byte */ + } +} + +/** ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_startingInputLength(ZSTD_f_zstd1). + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ +size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +{ + return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameHeader_advanced() : + * decode Frame Header, or require larger `srcSize`. 
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) +{ + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + + memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers do not understand that zfhPtr is going to be read only if return value is zero, since they are 2 different signals */ + if (srcSize < minInputSize) return minInputSize; /* need more input : report the wanted size */ + RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); + + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + /* skippable frame */ + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + memset(zfhPtr, 0, sizeof(*zfhPtr)); + zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); /* for skippable frames, this field holds the 32-bit frame length */ + zfhPtr->frameType = ZSTD_skippableFrame; + return 0; + } + RETURN_ERROR(prefix_unknown, ""); + } + + /* ensure there is enough `srcSize` to fully read/decode frame header */ + { size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format); + if (srcSize < fhsize) return fhsize; + zfhPtr->headerSize = (U32)fhsize; + } + + { BYTE const fhdByte = ip[minInputSize-1]; /* frame header descriptor byte */ + size_t pos = minInputSize; + U32 const dictIDSizeCode = fhdByte&3; /* bits 0-1 */ + U32 const checksumFlag = (fhdByte>>2)&1; /* bit 2 */ + U32 const singleSegment = (fhdByte>>5)&1; /* bit 5 */ + U32 const fcsID = fhdByte>>6; /* bits 6-7 */ + U64 windowSize = 0; + U32 dictID = 0; + U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN; + RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported, + "reserved bits, must be zero"); + + if 
(!singleSegment) { + BYTE const wlByte = ip[pos++]; /* window descriptor byte */ + U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN; /* 5 high bits : exponent */ + RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, ""); + windowSize = (1ULL << windowLog); + windowSize += (windowSize >> 3) * (wlByte&7); /* 3 low bits : add that many eighths of the base size */ + } + switch(dictIDSizeCode) + { + default: assert(0); /* impossible */ + case 0 : break; + case 1 : dictID = ip[pos]; pos++; break; + case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; + case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break; + } + switch(fcsID) + { + default: assert(0); /* impossible */ + case 0 : if (singleSegment) frameContentSize = ip[pos]; break; /* otherwise content size stays unknown */ + case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; /* 2-byte field is stored with an offset of 256 */ + case 2 : frameContentSize = MEM_readLE32(ip+pos); break; + case 3 : frameContentSize = MEM_readLE64(ip+pos); break; + } + if (singleSegment) windowSize = frameContentSize; /* no window descriptor : window equals the content size */ + + zfhPtr->frameType = ZSTD_frame; + zfhPtr->frameContentSize = frameContentSize; + zfhPtr->windowSize = windowSize; + zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + zfhPtr->dictID = dictID; + zfhPtr->checksumFlag = checksumFlag; + } + return 0; +} + +/** ZSTD_getFrameHeader() : + * decode Frame Header, or require larger `srcSize`. + * note : this function does not consume input, it only reads it. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) +{ + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameContentSize() : + * compatible with legacy mode + * @return : decompressed size of the single frame pointed to by `src` if known, otherwise + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) +{ +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize); + return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret; /* legacy helper reports 0 when it cannot tell */ + } +#endif + { ZSTD_frameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; /* covers both error codes and "need more input" */ + if (zfh.frameType == ZSTD_skippableFrame) { + return 0; + } else { + return zfh.frameContentSize; + } } +} +/** readSkippableFrameSize() : + * @return : total size of the skippable frame starting at `src` (header + 32-bit length read from the header), + * or an error code if `srcSize` is too small or the total does not fit in 32 bits */ +static size_t readSkippableFrameSize(void const* src, size_t srcSize) +{ + size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE; + U32 sizeU32; + + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); /* reject lengths whose total would wrap around 32 bits */ + { + size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } +} + +/** ZSTD_findDecompressedSize() : + * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames + * @return : decompressed size of the frames contained */ +unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + if (ZSTD_isError(skippableSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + + { unsigned long long const ret = 
ZSTD_getFrameContentSize(src, srcSize); + if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; /* unknown size or error : returned as-is */ + + /* check for overflow */ + if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; + totalDstSize += ret; + } + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); + if (ZSTD_isError(frameSrcSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; + } + } /* while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) */ + + if (srcSize) return ZSTD_CONTENTSIZE_ERROR; /* leftover bytes, too few to start another frame */ + + return totalDstSize; +} + +/** ZSTD_getDecompressedSize() : + * compatible with legacy mode + * @return : decompressed size if known, 0 otherwise + note : 0 can mean any of the following : + - frame content is empty + - decompressed size field is not present in frame header + - frame header unknown / not supported + - frame header not complete (`srcSize` too small) */ +unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN); + return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret; /* both sentinels collapse to 0 */ +} + + +/** ZSTD_decodeFrameHeader() : + * `headerSize` must be the size provided by ZSTD_frameHeaderSize(). + * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ +static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) +{ + size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); + if (ZSTD_isError(result)) return result; /* invalid header */ + RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small"); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* Skip the dictID check in fuzzing mode, because it makes the search + * harder. 
+ */ + RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID), + dictionary_wrong, ""); +#endif + if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0); + return 0; +} +/* ZSTD_errorFrameSizeInfo() : + * wrap error code `ret` into a frameSizeInfo whose decompressedBound is ZSTD_CONTENTSIZE_ERROR */ +static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) +{ + ZSTD_frameSizeInfo frameSizeInfo; + frameSizeInfo.compressedSize = ret; + frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR; + return frameSizeInfo; +} +/* ZSTD_findFrameSizeInfo() : + * walk the frame starting at `src` (legacy, skippable, or regular) without decompressing it. + * @return : its compressed size and an upper bound of its decompressed size; + * on failure, compressedSize holds an error code */ +static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) +{ + ZSTD_frameSizeInfo frameSizeInfo; + memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) + return ZSTD_findFrameSizeInfoLegacy(src, srcSize); +#endif + + if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || + frameSizeInfo.compressedSize <= srcSize); + return frameSizeInfo; /* decompressedBound stays 0, as set by memset above */ + } else { + const BYTE* ip = (const BYTE*)src; + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ + { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + } + + ip += zfh.headerSize; + remainingSize -= zfh.headerSize; + + /* Iterate over each block */ + while (1) { + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return ZSTD_errorFrameSizeInfo(cBlockSize); + + if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + + ip += ZSTD_blockHeaderSize + cBlockSize; + remainingSize -= 
ZSTD_blockHeaderSize + cBlockSize; + nbBlocks++; + + if (blockProperties.lastBlock) break; + } + + /* Final frame content checksum */ + if (zfh.checksumFlag) { + if (remainingSize < 4) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + ip += 4; + } + + frameSizeInfo.compressedSize = ip - ipstart; + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? zfh.frameContentSize + : nbBlocks * zfh.blockSizeMax; /* size not recorded : bound it by block count times max block size */ + return frameSizeInfo; + } +} + +/** ZSTD_findFrameCompressedSize() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the compressed size of the frame starting at `src`, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) +{ + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + return frameSizeInfo.compressedSize; +} + +/** ZSTD_decompressBound() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source, + * or ZSTD_CONTENTSIZE_ERROR if any frame cannot be parsed + */ +unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) +{ + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) + return ZSTD_CONTENTSIZE_ERROR; + assert(srcSize >= compressedSize); + src = (const BYTE*)src + compressedSize; + srcSize -= compressedSize; + bound += decompressedBound; + } + return bound; +} + + +/*-************************************************************* + * Frame decoding + 
***************************************************************/ + +/** ZSTD_insertBlock() : + * insert `src` block into `dctx` history. Useful to track uncompressed blocks. */ +size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize) +{ + DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize); + ZSTD_checkContinuity(dctx, blockStart); + dctx->previousDstEnd = (const char*)blockStart + blockSize; + return blockSize; +} + + +static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_copyRawBlock"); + if (dst == NULL) { + if (srcSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } + RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, ""); + memcpy(dst, src, srcSize); + return srcSize; +} + +static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + BYTE b, + size_t regenSize) +{ + if (dst == NULL) { + if (regenSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } + RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, ""); + memset(dst, b, regenSize); + return regenSize; +} + + +/*! ZSTD_decompressFrame() : + * @dctx must be properly initialized + * will update *srcPtr and *srcSizePtr, + * to make *srcPtr progress by one frame. */ +static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void** srcPtr, size_t *srcSizePtr) +{ + const BYTE* ip = (const BYTE*)(*srcPtr); + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = dstCapacity != 0 ? 
ostart + dstCapacity : ostart; + BYTE* op = ostart; + size_t remainingSrcSize = *srcSizePtr; + + DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr); + + /* check */ + RETURN_ERROR_IF( + remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + + /* Frame Header */ + { size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal( + ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format); + if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize; + RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , ""); + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + + /* Loop on each block */ + while (1) { + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + + ip += ZSTD_blockHeaderSize; + remainingSrcSize -= ZSTD_blockHeaderSize; + RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + + switch(blockProperties.blockType) + { + case bt_compressed: + decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1); + break; + case bt_raw : + decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize); + break; + case bt_rle : + decodedSize = ZSTD_setRleBlock(op, oend-op, *ip, blockProperties.origSize); + break; + case bt_reserved : + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + + if (ZSTD_isError(decodedSize)) return decodedSize; + if (dctx->fParams.checksumFlag) + XXH64_update(&dctx->xxhState, op, decodedSize); + if (decodedSize != 0) + op += decodedSize; + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; + if (blockProperties.lastBlock) break; + } + + if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) { + 
RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize, + corruption_detected, ""); + } + if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ + U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState); + U32 checkRead; + RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, ""); + checkRead = MEM_readLE32(ip); + RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, ""); + ip += 4; + remainingSrcSize -= 4; + } + + /* Allow caller to get size read */ + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return op-ostart; +} + +static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + const ZSTD_DDict* ddict) +{ + void* const dststart = dst; + int moreThan1Frame = 0; + + DEBUGLOG(5, "ZSTD_decompressMultiFrame"); + assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */ + + if (ddict) { + dict = ZSTD_DDict_dictContent(ddict); + dictSize = ZSTD_DDict_dictSize(ddict); + } + + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + size_t decodedSize; + size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize); + if (ZSTD_isError(frameSize)) return frameSize; + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, + "legacy support is not compatible with static dctx"); + + decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize); + if (ZSTD_isError(decodedSize)) return decodedSize; + + assert(decodedSize <=- dstCapacity); + dst = (BYTE*)dst + decodedSize; + dstCapacity -= decodedSize; + + src = (const BYTE*)src + frameSize; + srcSize -= frameSize; + + continue; + } +#endif + + { U32 const magicNumber = MEM_readLE32(src); + DEBUGLOG(4, "reading magic number %08X (expecting %08X)", + (unsigned)magicNumber, ZSTD_MAGICNUMBER); + if ((magicNumber & 
ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } } + + if (ddict) { + /* we were called from ZSTD_decompress_usingDDict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), ""); + } else { + /* this will initialize correctly with no dict if dict == NULL, so + * use this in all cases but ddict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), ""); + } + ZSTD_checkContinuity(dctx, dst); + + { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, + &src, &srcSize); + RETURN_ERROR_IF( + (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown) + && (moreThan1Frame==1), + srcSize_wrong, + "at least one frame successfully completed, but following " + "bytes are garbage: it's more likely to be a srcSize error, " + "specifying more bytes than compressed size of frame(s). This " + "error message replaces ERROR(prefix_unknown), which would be " + "confusing, as the first header is actually correct. Note that " + "one could be unlucky, it might be a corruption error instead, " + "happening right at the place where we expect zstd magic " + "bytes. 
But this is _much_ less likely than a srcSize field " + "error."); + if (ZSTD_isError(res)) return res; + assert(res <= dstCapacity); + if (res != 0) + dst = (BYTE*)dst + res; + dstCapacity -= res; + } + moreThan1Frame = 1; + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed"); + + return (BYTE*)dst - (BYTE*)dststart; +} + +size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize) +{ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL); +} + + +static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) +{ + switch (dctx->dictUses) { + default: + assert(0 /* Impossible */); + /* fall-through */ + case ZSTD_dont_use: + ZSTD_clearDict(dctx); + return NULL; + case ZSTD_use_indefinitely: + return dctx->ddict; + case ZSTD_use_once: + dctx->dictUses = ZSTD_dont_use; + return dctx->ddict; + } +} + +size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ZSTD_getDDict(dctx)); +} + + +size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ +#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) + size_t regenSize; + ZSTD_DCtx* const dctx = ZSTD_createDCtx(); + RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); + regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); + ZSTD_freeDCtx(dctx); + return regenSize; +#else /* stack mode */ + ZSTD_DCtx dctx; + ZSTD_initDCtx_internal(&dctx); + return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize); +#endif +} + + +/*-************************************** +* Advanced Streaming Decompression API +* Bufferless and synchronous +****************************************/ +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return 
dctx->expected; } + +/** + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, + * we allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce + * output, and avoid copying the input. + * + * @param inputSize - The total amount of input that the caller currently has. + */ +static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) { + if (!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock)) + return dctx->expected; + if (dctx->bType != bt_raw) + return dctx->expected; + return MIN(MAX(inputSize, 1), dctx->expected); /* at least 1 byte, at most what the block still expects */ +} + +ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { /* maps the current decoding stage to the kind of input expected next */ + switch(dctx->stage) + { + default: /* should not happen */ + assert(0); /* no break : continues into the frame header cases below */ + case ZSTDds_getFrameHeaderSize: + case ZSTDds_decodeFrameHeader: + return ZSTDnit_frameHeader; + case ZSTDds_decodeBlockHeader: + return ZSTDnit_blockHeader; + case ZSTDds_decompressBlock: + return ZSTDnit_block; + case ZSTDds_decompressLastBlock: + return ZSTDnit_lastBlock; + case ZSTDds_checkChecksum: + return ZSTDnit_checksum; + case ZSTDds_decodeSkippableHeader: + case ZSTDds_skipFrame: + return ZSTDnit_skippableFrame; + } +} + +static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; } + +/** ZSTD_decompressContinue() : + * srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress()) + * @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`) + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize); + /* Sanity check */ + RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), 
srcSize_wrong, "not allowed"); + if (dstCapacity) ZSTD_checkContinuity(dctx, dst); + + switch (dctx->stage) + { + case ZSTDds_getFrameHeaderSize : + assert(src != NULL); + if (dctx->format == ZSTD_f_zstd1) { /* allows header */ + assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */ + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */ + dctx->stage = ZSTDds_decodeSkippableHeader; + return 0; + } } + dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format); + if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize; + memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = dctx->headerSize - srcSize; + dctx->stage = ZSTDds_decodeFrameHeader; + return 0; + + case ZSTDds_decodeFrameHeader: + assert(src != NULL); + memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize); + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), ""); + dctx->expected = ZSTD_blockHeaderSize; + dctx->stage = ZSTDds_decodeBlockHeader; + return 0; + + case ZSTDds_decodeBlockHeader: + { blockProperties_t bp; + size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum"); + dctx->expected = cBlockSize; + dctx->bType = bp.blockType; + dctx->rleSize = bp.origSize; + if (cBlockSize) { + dctx->stage = bp.lastBlock ? 
ZSTDds_decompressLastBlock : ZSTDds_decompressBlock; + return 0; + } + /* empty block */ + if (bp.lastBlock) { + if (dctx->fParams.checksumFlag) { + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* end of frame */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */ + dctx->stage = ZSTDds_decodeBlockHeader; + } + return 0; + } + + case ZSTDds_decompressLastBlock: + case ZSTDds_decompressBlock: + DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock"); + { size_t rSize; + switch(dctx->bType) + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : + assert(srcSize <= dctx->expected); + rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); + FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed"); + assert(rSize == srcSize); + dctx->expected -= rSize; + break; + case bt_rle : + rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_reserved : /* should never happen */ + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + FORWARD_IF_ERROR(rSize, ""); + RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum"); + DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize); + dctx->decodedSize += rSize; + if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize); + dctx->previousDstEnd = (char*)dst + rSize; + + /* Stay on the same stage until we are finished streaming the block. 
*/ + if (dctx->expected > 0) { + return rSize; + } + + if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ + DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize); + RETURN_ERROR_IF( + dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && dctx->decodedSize != dctx->fParams.frameContentSize, + corruption_detected, ""); + if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* ends here */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->stage = ZSTDds_decodeBlockHeader; + dctx->expected = ZSTD_blockHeaderSize; + } + return rSize; + } + + case ZSTDds_checkChecksum: + assert(srcSize == 4); /* guaranteed by dctx->expected */ + { U32 const h32 = (U32)XXH64_digest(&dctx->xxhState); + U32 const check32 = MEM_readLE32(src); + DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); + RETURN_ERROR_IF(check32 != h32, checksum_wrong, ""); + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); + memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; + return 0; + + case ZSTDds_skipFrame: + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + + default: + assert(0); /* impossible */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ + } +} + + +static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + dctx->dictEnd = dctx->previousDstEnd; + 
dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dict; + dctx->previousDstEnd = (const char*)dict + dictSize; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif + return 0; +} + +/*! ZSTD_loadDEntropy() : + * dict : must point at beginning of a valid zstd dictionary. + * @return : size of entropy tables read */ +size_t +ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + + RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small"); + assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */ + dictPtr += 8; /* skip header = magic + dictID */ + + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable)); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable)); + ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE); + { void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */ + size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable); +#ifdef HUF_FORCE_DECOMPRESS_X1 + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, + workspace, workspaceSize); +#else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, + workspace, workspaceSize); +#endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + 
dictPtr += hSize; + } + + { short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff, offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->OFTable, + offcodeNCount, offcodeMaxValue, + OF_base, OF_bits, + offcodeLog); + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->MLTable, + matchlengthNCount, matchlengthMaxValue, + ML_base, ML_bits, + matchlengthLog); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->LLTable, + litlengthNCount, litlengthMaxValue, + LL_base, LL_bits, + litlengthLog); + dictPtr += litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + { int i; + size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12)); + for (i=0; i<3; i++) { + U32 const rep = 
MEM_readLE32(dictPtr); dictPtr += 4; + RETURN_ERROR_IF(rep==0 || rep > dictContentSize, + dictionary_corrupted, ""); + entropy->rep[i] = rep; + } } + + return dictPtr - (const BYTE*)dict; +} + +static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize); + { U32 const magic = MEM_readLE32(dict); + if (magic != ZSTD_MAGIC_DICTIONARY) { + return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ + } } + dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); + + /* load entropy tables */ + { size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize); + RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, ""); + dict = (const char*)dict + eSize; + dictSize -= eSize; + } + dctx->litEntropy = dctx->fseEntropy = 1; + + /* reference dictionary content */ + return ZSTD_refDictContent(dctx, dict, dictSize); +} + +size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) +{ + assert(dctx != NULL); + dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ + dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->decodedSize = 0; + dctx->previousDstEnd = NULL; + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; + dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; + dctx->MLTptr = dctx->entropy.MLTable; + dctx->OFTptr = dctx->entropy.OFTable; + dctx->HUFptr = dctx->entropy.hufTable; + return 0; +} + +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + FORWARD_IF_ERROR( 
ZSTD_decompressBegin(dctx) , ""); + if (dict && dictSize) + RETURN_ERROR_IF( + ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)), + dictionary_corrupted, ""); + return 0; +} + + +/* ====== ZSTD_DDict ====== */ + +size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict"); + assert(dctx != NULL); + if (ddict) { + const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict); + size_t const dictSize = ZSTD_DDict_dictSize(ddict); + const void* const dictEnd = dictStart + dictSize; + dctx->ddictIsCold = (dctx->dictEnd != dictEnd); /* compared before ZSTD_decompressBegin() below resets dctx->dictEnd */ + DEBUGLOG(4, "DDict is %s", + dctx->ddictIsCold ? "~cold~" : "hot!"); + } + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); + if (ddict) { /* NULL ddict is equivalent to no dictionary */ + ZSTD_copyDDictParameters(dctx, ddict); + } + return 0; +} + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) +{ + if (dictSize < 8) return 0; + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0; + return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); +} + +/*! ZSTD_getDictID_fromFrame() : + * Provides the dictID required to decompress frame stored within `src`. + * If @return == 0, the dictID could not be decoded. + * This could be for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. + * Needed dictionary is a hidden information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. + * - This is not a Zstandard frame. 
+ * When identifying the exact failure cause, it's possible to use + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ +unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) +{ + ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +} + + +/*! ZSTD_decompress_usingDDict() : +* Decompression using a pre-digested Dictionary +* Use dictionary without significant overhead. */ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_DDict* ddict) +{ + /* pass content and size in case legacy frames are encountered */ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, + NULL, 0, + ddict); +} + + +/*===================================== +* Streaming decompression +*====================================*/ + +ZSTD_DStream* ZSTD_createDStream(void) +{ + DEBUGLOG(3, "ZSTD_createDStream"); + return ZSTD_createDStream_advanced(ZSTD_defaultCMem); +} + +ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticDCtx(workspace, workspaceSize); +} + +ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_advanced(customMem); +} + +size_t ZSTD_freeDStream(ZSTD_DStream* zds) +{ + return ZSTD_freeDCtx(zds); +} + + +/* *** Initialization *** */ + +size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; } +size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + if (dict && dictSize != 0) { + dctx->ddictLocal = 
ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem); + RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!"); + dctx->ddict = dctx->ddictLocal; + dctx->dictUses = ZSTD_use_indefinitely; + } + return 0; +} + +size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), ""); + dctx->dictUses = ZSTD_use_once; + return 0; +} + +size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + + +/* ZSTD_initDStream_usingDict() : + * return : expected size, aka ZSTD_startingInputLength(). 
+ * or an error code (testable with ZSTD_isError()) if the dictionary could not be loaded, e.g. on allocation failure */ +size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize) +{ + DEBUGLOG(4, "ZSTD_initDStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , ""); + return ZSTD_startingInputLength(zds->format); +} + +/* note : this variant can't fail */ +size_t ZSTD_initDStream(ZSTD_DStream* zds) +{ + DEBUGLOG(4, "ZSTD_initDStream"); + return ZSTD_initDStream_usingDDict(zds, NULL); +} + +/* ZSTD_initDStream_usingDDict() : + * ddict will just be referenced, and must outlive decompression session + * this function cannot fail */ +size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) +{ + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +} + +/* ZSTD_resetDStream() : + * return : expected size, aka ZSTD_startingInputLength(). 
+ * this function cannot fail */ +size_t ZSTD_resetDStream(ZSTD_DStream* dctx) +{ + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); +} + + +size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + if (ddict) { + dctx->ddict = ddict; + dctx->dictUses = ZSTD_use_indefinitely; + } + return 0; +} + +/* ZSTD_DCtx_setMaxWindowSize() : + * note : no direct equivalence in ZSTD_DCtx_setParameter, + * since this version sets windowSize, and the other sets windowLog */ +size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax); + size_t const min = (size_t)1 << bounds.lowerBound; + size_t const max = (size_t)1 << bounds.upperBound; + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, ""); + RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, ""); + dctx->maxWindowSize = maxWindowSize; + return 0; +} + +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format) +{ + return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, format); +} + +ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + switch(dParam) { + case ZSTD_d_windowLogMax: + bounds.lowerBound = ZSTD_WINDOWLOG_ABSOLUTEMIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + case ZSTD_d_format: + bounds.lowerBound = (int)ZSTD_f_zstd1; + bounds.upperBound = (int)ZSTD_f_zstd1_magicless; + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); + return bounds; + case ZSTD_d_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_obm_buffered; + bounds.upperBound = (int)ZSTD_obm_stable; + return bounds; + default:; + } + bounds.error = ERROR(parameter_unsupported); + return bounds; +} + +/* ZSTD_dParam_withinBounds: + * 
@return 1 if value is within dParam bounds, + * 0 otherwise */ +static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(dParam); + if (ZSTD_isError(bounds.error)) return 0; + if (value < bounds.lowerBound) return 0; + if (value > bounds.upperBound) return 0; + return 1; +} + +#define CHECK_DBOUNDS(p,v) { \ + RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \ +} + +size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + switch(dParam) { + case ZSTD_d_windowLogMax: + if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT; + CHECK_DBOUNDS(ZSTD_d_windowLogMax, value); + dctx->maxWindowSize = ((size_t)1) << value; + return 0; + case ZSTD_d_format: + CHECK_DBOUNDS(ZSTD_d_format, value); + dctx->format = (ZSTD_format_e)value; + return 0; + case ZSTD_d_stableOutBuffer: + CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value); + dctx->outBufferMode = (ZSTD_outBufferMode_e)value; + return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +} + +size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) +{ + if ( (reset == ZSTD_reset_session_only) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + dctx->format = ZSTD_f_zstd1; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + } + return 0; +} + + +size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) +{ + return ZSTD_sizeof_DCtx(dctx); +} + +size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) +{ + size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + unsigned long long const neededRBSize = windowSize + 
blockSize + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, + frameParameter_windowTooLarge, ""); + return minRBSize; +} + +size_t ZSTD_estimateDStreamSize(size_t windowSize) +{ + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + size_t const inBuffSize = blockSize; /* no block can be larger */ + size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN); + return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize; +} + +size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) +{ + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ + ZSTD_frameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); + RETURN_ERROR_IF(zfh.windowSize > windowSizeMax, + frameParameter_windowTooLarge, ""); + return ZSTD_estimateDStreamSize((size_t)zfh.windowSize); +} + + +/* ***** Decompression ***** */ + +static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) +{ + return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR; +} + +static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) +{ + if (ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize)) + zds->oversizedDuration++; + else + zds->oversizedDuration = 0; +} + +static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds) +{ + return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION; +} + +/* Checks that the output buffer hasn't changed if ZSTD_obm_stable is used. 
*/ +static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output) +{ + ZSTD_outBuffer const expect = zds->expectedOutBuffer; + /* No requirement when ZSTD_obm_stable is not enabled. */ + if (zds->outBufferMode != ZSTD_obm_stable) + return 0; + /* Any buffer is allowed in zdss_init, this must be the same for every other call until + * the context is reset. + */ + if (zds->streamStage == zdss_init) + return 0; + /* The buffer must match our expectation exactly. */ + if (expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size) + return 0; + RETURN_ERROR(dstBuffer_wrong, "ZSTD_obm_stable enabled but output differs!"); +} + +/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream() + * and updates the stage and the output buffer state. This call is extracted so it can be + * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode. + * NOTE: You must break after calling this function since the streamStage is modified. + */ +static size_t ZSTD_decompressContinueStream( + ZSTD_DStream* zds, char** op, char* oend, + void const* src, size_t srcSize) { + int const isSkipFrame = ZSTD_isSkipFrame(zds); + if (zds->outBufferMode == ZSTD_obm_buffered) { + size_t const dstSize = isSkipFrame ? 0 : zds->outBuffSize - zds->outStart; + size_t const decodedSize = ZSTD_decompressContinue(zds, + zds->outBuff + zds->outStart, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + if (!decodedSize && !isSkipFrame) { + zds->streamStage = zdss_read; + } else { + zds->outEnd = zds->outStart + decodedSize; + zds->streamStage = zdss_flush; + } + } else { + /* Write directly into the output buffer */ + size_t const dstSize = isSkipFrame ? 0 : oend - *op; + size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + *op += decodedSize; + /* Flushing is not needed. 
*/ + zds->streamStage = zdss_read; + assert(*op <= oend); + assert(zds->outBufferMode == ZSTD_obm_stable); + } + return 0; +} + +size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + const char* const src = (const char*)input->src; + const char* const istart = input->pos != 0 ? src + input->pos : src; + const char* const iend = input->size != 0 ? src + input->size : src; + const char* ip = istart; + char* const dst = (char*)output->dst; + char* const ostart = output->pos != 0 ? dst + output->pos : dst; + char* const oend = output->size != 0 ? dst + output->size : dst; + char* op = ostart; + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, + "forbidden. in: pos: %u vs size: %u", + (U32)input->pos, (U32)input->size); + RETURN_ERROR_IF( + output->pos > output->size, + dstSize_tooSmall, + "forbidden. out: pos: %u vs size: %u", + (U32)output->pos, (U32)output->size); + DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos)); + FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), ""); + + while (someMoreWork) { + switch(zds->streamStage) + { + case zdss_init : + DEBUGLOG(5, "stage zdss_init => transparent reset "); + zds->streamStage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; + zds->legacyVersion = 0; + zds->hostageByte = 0; + zds->expectedOutBuffer = *output; + /* fall-through */ + + case zdss_loadHeader : + DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + if (zds->legacyVersion) { + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; + return hint; + } } +#endif + { size_t const hSize = 
ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); + if (legacyVersion) { + ZSTD_DDict const* const ddict = ZSTD_getDDict(zds); + const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL; + size_t const dictSize = ddict ? ZSTD_DDict_dictSize(ddict) : 0; + DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion); + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext, + zds->previousLegacyVersion, legacyVersion, + dict, dictSize), ""); + zds->legacyVersion = zds->previousLegacyVersion = legacyVersion; + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */ + return hint; + } } +#endif + return hSize; /* error */ + } + if (hSize != 0) { /* need more input */ + size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */ + size_t const remainingInput = (size_t)(iend-ip); + assert(iend >= ip); + if (toLoad > remainingInput) { /* not enough input to load full header */ + if (remainingInput > 0) { + memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput); + zds->lhSize += remainingInput; + } + input->pos = input->size; + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); + memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad; + break; + } } + + /* check for single-pass mode opportunity */ + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && 
zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { + size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") + ip = istart + cSize; + op += decompressedSize; + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } } + + /* Check output buffer is large enough for ZSTD_odm_stable. */ + if (zds->outBufferMode == ZSTD_obm_stable + && zds->fParams.frameType != ZSTD_skippableFrame + && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) { + RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small"); + } + + /* Consume header (see ZSTDds_decodeFrameHeader) */ + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + + if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), ""); + zds->expected = ZSTD_blockHeaderSize; + zds->stage = ZSTDds_decodeBlockHeader; + } + + /* control buffer memory usage */ + DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)", + (U32)(zds->fParams.windowSize >>10), + (U32)(zds->maxWindowSize >> 10) ); + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); + + /* Adapt 
buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_obm_buffered + ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); + + { int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize); + int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds); + + if (tooSmall || tooLarge) { + size_t const bufferSize = neededInBuffSize + neededOutBuffSize; + DEBUGLOG(4, "inBuff : from %u to %u", + (U32)zds->inBuffSize, (U32)neededInBuffSize); + DEBUGLOG(4, "outBuff : from %u to %u", + (U32)zds->outBuffSize, (U32)neededOutBuffSize); + if (zds->staticSize) { /* static DCtx */ + DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize); + assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */ + RETURN_ERROR_IF( + bufferSize > zds->staticSize - sizeof(ZSTD_DCtx), + memory_allocation, ""); + } else { + ZSTD_free(zds->inBuff, zds->customMem); + zds->inBuffSize = 0; + zds->outBuffSize = 0; + zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem); + RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, ""); + } + zds->inBuffSize = neededInBuffSize; + zds->outBuff = zds->inBuff + zds->inBuffSize; + zds->outBuffSize = neededOutBuffSize; + } } } + zds->streamStage = zdss_read; + /* fall-through */ + + case zdss_read: + DEBUGLOG(5, "stage zdss_read"); + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip); + DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize); + if (neededInSize==0) { /* end of frame */ + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); + ip += 
neededInSize; + /* Function modifies the stage so we must break */ + break; + } } + if (ip==iend) { someMoreWork = 0; break; } /* no more input */ + zds->streamStage = zdss_load; + /* fall-through */ + + case zdss_load: + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); + size_t const toLoad = neededInSize - zds->inPos; + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ + assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { + RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos, + corruption_detected, + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip); + } + ip += loadedSize; + zds->inPos += loadedSize; + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ + zds->inPos = 0; /* input is consumed */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), ""); + /* Function modifies the stage so we must break */ + break; + } + case zdss_flush: + { size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize); + op += flushedSize; + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) + && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); + zds->outStart = zds->outEnd = 0; + } + break; + } } + /* cannot complete flush */ + someMoreWork = 0; + break; + + default: + assert(0); /* impossible */ + 
RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ + } } + + /* result */ + input->pos = (size_t)(ip - (const char*)(input->src)); + output->pos = (size_t)(op - (char*)(output->dst)); + + /* Update the expected output buffer for ZSTD_obm_stable. */ + zds->expectedOutBuffer = *output; + + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { + RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); + RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); + assert(0); + } + } else { + zds->noForwardProgress = 0; + } + { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds); + if (!nextSrcSizeHint) { /* frame fully decoded */ + if (zds->outEnd == zds->outStart) { /* output fully flushed */ + if (zds->hostageByte) { + if (input->pos >= input->size) { + /* can't release hostage (not present) */ + zds->streamStage = zdss_read; + return 1; + } + input->pos++; /* release hostage */ + } /* zds->hostageByte */ + return 0; + } /* zds->outEnd == zds->outStart */ + if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */ + input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */ + zds->hostageByte=1; + } + return 1; + } /* nextSrcSizeHint==0 */ + nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */ + assert(zds->inPos <= nextSrcSizeHint); + nextSrcSizeHint -= zds->inPos; /* part already loaded*/ + return nextSrcSizeHint; + } +} + +size_t ZSTD_decompressStream_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) +{ + ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; + ZSTD_inBuffer input = { src, srcSize, *srcPos }; + /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ + size_t 
const cErr = ZSTD_decompressStream(dctx, &output, &input); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; +} +/**** ended inlining decompress/zstd_decompress.c ****/ +/**** start inlining decompress/zstd_decompress_block.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* zstd_decompress_block : + * this module takes care of decompressing _compressed_ block */ + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include /* memcpy, memmove, memset */ +/**** skipping file: ../common/compiler.h ****/ +/**** skipping file: ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: zstd_decompress_internal.h ****/ +/**** skipping file: zstd_ddict.h ****/ +/**** skipping file: zstd_decompress_block.h ****/ + +/*_******************************************************* +* Macros +**********************************************************/ + +/* These two optional macros force the use one way or another of the two + * ZSTD_decompressSequences implementations. You can't force in both directions + * at the same time. + */ +#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!" 
+#endif + + +/*_******************************************************* +* Memory operations +**********************************************************/ +static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); } + + +/*-************************************************************* + * Block decoding + ***************************************************************/ + +/*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr) +{ + RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, ""); + + { U32 const cBlockHeader = MEM_readLE24(src); + U32 const cSize = cBlockHeader >> 3; + bpPtr->lastBlock = cBlockHeader & 1; + bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); + bpPtr->origSize = cSize; /* only useful for RLE */ + if (bpPtr->blockType == bt_rle) return 1; + RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, ""); + return cSize; + } +} + + +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize); +/*! 
ZSTD_decodeLiteralsBlock() : + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ +{ + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); + + switch(litEncType) + { + case set_repeat: + DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); + RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); + /* fall-through */ + + case set_compressed: + RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ + /* 2 - 2 - 10 - 10 */ + singleStream = !lhlCode; + lhSize = 3; + litSize = (lhc >> 4) & 0x3FF; + litCSize = (lhc >> 14) & 0x3FF; + break; + case 2: + /* 2 - 2 - 14 - 14 */ + lhSize = 4; + litSize = (lhc >> 4) & 0x3FFF; + litCSize = lhc >> 18; + break; + case 3: + /* 2 - 2 - 18 - 18 */ + lhSize = 5; + litSize = (lhc >> 4) & 0x3FFFF; + litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); + break; + } + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + + /* prefetch huffman table if cold */ + if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { + PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); + } + + if (litEncType==set_repeat) { + if (singleStream) { + hufSuccess = HUF_decompress1X_usingDTable_bmi2( + 
dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->HUFptr, dctx->bmi2); + } else { + hufSuccess = HUF_decompress4X_usingDTable_bmi2( + dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->HUFptr, dctx->bmi2); + } + } else { + if (singleStream) { +#if defined(HUF_FORCE_DECOMPRESS_X2) + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace)); +#else + hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), dctx->bmi2); +#endif + } else { + hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), dctx->bmi2); + } + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); + + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + dctx->litEntropy = 1; + if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; + memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + return litCSize + lhSize; + } + + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + litSize = MEM_readLE24(istart) >> 4; + break; + } + + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ + RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); + memcpy(dctx->litBuffer, istart+lhSize, litSize); + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + return lhSize+litSize; + } + /* 
direct reference into compressed stream */ + dctx->litPtr = istart+lhSize; + dctx->litSize = litSize; + return lhSize+litSize; + } + + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + litSize = MEM_readLE24(istart) >> 4; + RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + return lhSize+1; + } + default: + RETURN_ERROR(corruption_detected, "impossible"); + } + } +} + +/* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions + * They were generated programmatically with following method : + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables + * - pretify output, report below, test with fuzzer to ensure it's correct */ + +/* Default FSE distribution table for Literal Lengths */ +static const ZSTD_seqSymbol LL_defaultDTable[(1<tableLog = 0; + DTableH->fastMode = 0; + + cell->nbBits = 0; + cell->nextState = 0; + assert(nbAddBits < 255); + cell->nbAdditionalBits = (BYTE)nbAddBits; + cell->baseValue = baseValue; +} + + +/* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) + * cannot fail if input is valid => + * all inputs are presumed validated at this stage 
*/ +void +ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog) +{ + ZSTD_seqSymbol* const tableDecode = dt+1; + U16 symbolNext[MaxSeq+1]; + + U32 const maxSV1 = maxSymbolValue + 1; + U32 const tableSize = 1 << tableLog; + U32 highThreshold = tableSize-1; + + /* Sanity Checks */ + assert(maxSymbolValue <= MaxSeq); + assert(tableLog <= MaxFSELog); + + /* Init, lay down lowprob symbols */ + { ZSTD_seqSymbol_header DTableH; + DTableH.tableLog = tableLog; + DTableH.fastMode = 1; + { S16 const largeLimit= (S16)(1 << (tableLog-1)); + U32 s; + for (s=0; s= largeLimit) DTableH.fastMode=0; + assert(normalizedCounter[s]>=0); + symbolNext[s] = (U16)normalizedCounter[s]; + } } } + memcpy(dt, &DTableH, sizeof(DTableH)); + } + + /* Spread symbols */ + { U32 const tableMask = tableSize-1; + U32 const step = FSE_TABLESTEP(tableSize); + U32 s, position = 0; + for (s=0; s highThreshold) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } + + /* Build Decoding table */ + { U32 u; + for (u=0; u max, corruption_detected, ""); + { U32 const symbol = *(const BYTE*)src; + U32 const baseline = baseValue[symbol]; + U32 const nbBits = nbAdditionalBits[symbol]; + ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); + } + *DTablePtr = DTableSpace; + return 1; + case set_basic : + *DTablePtr = defaultTable; + return 0; + case set_repeat: + RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, ""); + /* prefetch FSE table if used */ + if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { + const void* const pStart = *DTablePtr; + size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog)); + PREFETCH_AREA(pStart, pSize); + } + return 0; + case set_compressed : + { unsigned tableLog; + S16 norm[MaxSeq+1]; + size_t const headerSize = 
FSE_readNCount(norm, &max, &tableLog, src, srcSize); + RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); + RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); + ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog); + *DTablePtr = DTableSpace; + return headerSize; + } + default : + assert(0); + RETURN_ERROR(GENERIC, "impossible"); + } +} + +size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize) +{ + const BYTE* const istart = (const BYTE* const)src; + const BYTE* const iend = istart + srcSize; + const BYTE* ip = istart; + int nbSeq; + DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); + + /* check */ + RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, ""); + + /* SeqHead */ + nbSeq = *ip++; + if (!nbSeq) { + *nbSeqPtr=0; + RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, ""); + return 1; + } + if (nbSeq > 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); + nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2; + } else { + RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); + nbSeq = ((nbSeq-0x80)<<8) + *ip++; + } + } + *nbSeqPtr = nbSeq; + + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ + { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, + LLtype, MaxLL, LLFSELog, + ip, iend-ip, + LL_base, LL_bits, + LL_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); + RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += llhSize; + } + + { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, + OFtype, MaxOff, 
OffFSELog, + ip, iend-ip, + OF_base, OF_bits, + OF_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); + RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += ofhSize; + } + + { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, + MLtype, MaxML, MLFSELog, + ip, iend-ip, + ML_base, ML_bits, + ML_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); + RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += mlhSize; + } + } + + return ip-istart; +} + + +typedef struct { + size_t litLength; + size_t matchLength; + size_t offset; + const BYTE* match; +} seq_t; + +typedef struct { + size_t state; + const ZSTD_seqSymbol* table; +} ZSTD_fseState; + +typedef struct { + BIT_DStream_t DStream; + ZSTD_fseState stateLL; + ZSTD_fseState stateOffb; + ZSTD_fseState stateML; + size_t prevOffset[ZSTD_REP_NUM]; + const BYTE* prefixStart; + const BYTE* dictEnd; + size_t pos; +} seqState_t; + +/*! ZSTD_overlapCopy8() : + * Copies 8 bytes from ip to op and updates op and ip where ip <= op. + * If the offset is < 8 then the offset is spread to at least 8 bytes. + * + * Precondition: *ip <= *op + * Postcondition: *op - *op >= 8 + */ +HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { + assert(*ip <= *op); + if (offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ + int const sub2 = dec64table[offset]; + (*op)[0] = (*ip)[0]; + (*op)[1] = (*ip)[1]; + (*op)[2] = (*ip)[2]; + (*op)[3] = (*ip)[3]; + *ip += dec32table[offset]; + ZSTD_copy4(*op+4, *ip); + *ip -= sub2; + } else { + ZSTD_copy8(*op, *ip); + } + *ip += 8; + *op += 8; + assert(*op - *ip >= 8); +} + +/*! 
ZSTD_safecopy() : + * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer + * and write up to 16 bytes past oend_w (op >= oend_w is allowed). + * This function is only called in the uncommon case where the sequence is near the end of the block. It + * should be fast for a single long sequence, but can be slow for several short sequences. + * + * @param ovtype controls the overlap detection + * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. + * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. + * The src buffer must be before the dst buffer. + */ +static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) || + (ovtype == ZSTD_overlap_src_before_dst && diff >= 0)); + + if (length < 8) { + /* Handle short lengths. */ + while (op < oend) *op++ = *ip++; + return; + } + if (ovtype == ZSTD_overlap_src_before_dst) { + /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ + assert(length >= 8); + ZSTD_overlapCopy8(&op, &ip, diff); + assert(op - ip >= 8); + assert(op <= oend); + } + + if (oend <= oend_w) { + /* No risk of overwrite. */ + ZSTD_wildcopy(op, ip, length, ovtype); + return; + } + if (op <= oend_w) { + /* Wildcopy until we get close to the end. */ + assert(oend > oend_w); + ZSTD_wildcopy(op, ip, oend_w - op, ovtype); + ip += oend_w - op; + op = oend_w; + } + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + +/* ZSTD_execSequenceEnd(): + * This version handles cases that are near the end of the output buffer. It requires + * more careful checks to make sure there is no overflow. By separating out these hard + * and unlikely cases, we can speed up the common cases. 
+ * + * NOTE: This function needs to be fast for a single long sequence, but doesn't need + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart-match); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequence(BYTE* op, + BYTE* 
const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLENGTH of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy.
+ */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. 
*/ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + +static void +ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) +{ + const void* ptr = dt; + const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", + (U32)DStatePtr->state, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +FORCE_INLINE_TEMPLATE void +ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) +{ + ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.nextState + lowBits; +} + +FORCE_INLINE_TEMPLATE void +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) +{ + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.nextState + lowBits; +} + +/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum + * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) + * bits before reloading. This value is the maximum number of bits we read + * after reloading when we are decoding long offsets. + */ +#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ + (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ + ?
ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ + : 0) + +typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; +typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e; + +FORCE_INLINE_TEMPLATE seq_t +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch) +{ + seq_t seq; + ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; + ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; + ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; + U32 const llBase = llDInfo.baseValue; + U32 const mlBase = mlDInfo.baseValue; + U32 const ofBase = ofDInfo.baseValue; + BYTE const llBits = llDInfo.nbAdditionalBits; + BYTE const mlBits = mlDInfo.nbAdditionalBits; + BYTE const ofBits = ofDInfo.nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + /* sequence */ + { size_t offset; + if (ofBits > 1) { + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } else { + U32 const ll0 = (llBase == 0); + if (LIKELY((ofBits == 0))) { + if 
(LIKELY(!ll0)) + offset = seqState->prevOffset[0]; + else { + offset = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; + } + + seq.matchLength = mlBase; + if (mlBits > 0) + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + seq.litLength = llBase; + if (llBits > 0) + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + if (prefetch == ZSTD_p_prefetch) { + size_t const pos = seqState->pos + seq.litLength; + const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart; + seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 
+ * No consequence though : no memory access will occur, offset is only used for prefetching */ + seqState->pos = pos + seq.matchLength; + } + + /* ANS state update + * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). + * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). + * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the + * better option, so it is the default for other compilers. But, if you + * measure that it is worse, please put up a pull request. + */ + { +#if defined(__GNUC__) && !defined(__clang__) + const int kUseUpdateFseState = 1; +#else + const int kUseUpdateFseState = 0; +#endif + if (kUseUpdateFseState) { + ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + } else { + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */ + } + } + + return seq; +} + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) +{ + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ + if (dctx->dictContentEndForFuzzing == NULL) return 0; + /* Dictionary is our prefix. */ + if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; + /* Dictionary is not our ext-dict. */ + if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; + /* Dictionary is not within our window size. 
*/ + if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; + /* Dictionary is active. */ + return 1; +} + +MEM_STATIC void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) +{ + size_t const windowSize = dctx->fParams.windowSize; + size_t const sequenceSize = seq.litLength + seq.matchLength; + BYTE const* const oLitEnd = op + seq.litLength; + DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + assert(op <= oend); + assert((size_t)(oend - op) >= sequenceSize); + assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); + if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { + size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); + /* Offset must be within the dictionary. */ + assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); + assert(seq.offset <= windowSize + dictSize); + } else { + /* Offset must be within our window. 
*/ + assert(seq.offset <= windowSize); + } +} +#endif + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = ostart + maxDstSize; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body"); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + size_t error = 0; + dctx->fseEntropy = 1; + { U32 i; for (i=0; ientropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + + ZSTD_STATIC_ASSERT( + BIT_DStream_unfinished < BIT_DStream_completed && + BIT_DStream_endOfBuffer < BIT_DStream_completed && + BIT_DStream_completed < BIT_DStream_overflow); + +#if defined(__GNUC__) && defined(__x86_64__) + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression + * speed swings based on the alignment of the decompression loop. This + * performance swing is caused by parts of the decompression loop falling + * out of the DSB. 
The entire decompression loop should fit in the DSB, + * when it can't we get much worse performance. You can measure if you've + * hit the good case or the bad case with this perf command for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst + * + * If you see most cycles served out of the MITE you've hit the bad case. + * If you see most cycles served out of the DSB you've hit the good case. + * If it is pretty even then you may be in an okay case. + * + * I've been able to reproduce this issue on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 + * Use Instruments->Counters to get DSB/MITE cycles. + * I never got performance swings, but I was able to + * go from the good case of mostly DSB to half of the + * cycles served from MITE. + * - Coffeelake: Intel i9-9900k + * + * I haven't been able to reproduce the instability or DSB misses on any + * of the following CPUs: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz + * - Skylake + * + * If you are seeing performance instability this script can help test. + * It tests on 4 commits in zstd where I saw performance change.
+ * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 + */ + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +#endif + for ( ; ; ) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + BIT_reloadDStream(&(seqState.DStream)); + /* gcc and clang both don't like early returns in this loop. + * gcc doesn't like early breaks either. + * Instead save an error and report it at the end. + * When there is an error, don't increment op, so we don't + * overwrite. + */ + if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize; + else op += oneSeqSize; + if (UNLIKELY(!--nbSeq)) break; + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); + if (ZSTD_isError(error)) return error; + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = litEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + +static size_t +ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return 
ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +FORCE_INLINE_TEMPLATE size_t +ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = ostart + maxDstSize; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { +#define STORED_SEQS 4 +#define STORED_SEQS_MASK (STORED_SEQS-1) +#define ADVANCED_SEQS 4 + seq_t sequences[STORED_SEQS]; + int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); + seqState_t seqState; + int seqNb; + dctx->fseEntropy = 1; + { int i; for (i=0; ientropy.rep[i]; } + seqState.prefixStart = prefixStart; + seqState.pos = (size_t)(op-prefixStart); + seqState.dictEnd = dictEnd; + assert(dst != NULL); + assert(iend >= ip); + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ + for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNbentropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = litEnd - litPtr; + 
RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + +static size_t +ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + + +#if DYNAMIC_BMI2 + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static TARGET_ATTRIBUTE("bmi2") size_t +DONT_VECTORIZE +ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +static TARGET_ATTRIBUTE("bmi2") size_t +ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +#endif /* DYNAMIC_BMI2 */ + +typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame); + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static size_t +ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + 
DEBUGLOG(5, "ZSTD_decompressSequences"); +#if DYNAMIC_BMI2 + if (dctx->bmi2) { + return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +/* ZSTD_decompressSequencesLong() : + * decompression function triggered when a minimum share of offsets is considered "long", + * aka out of cache. + * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance". + * This function will try to mitigate main memory latency through the use of prefetching */ +static size_t +ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); +#if DYNAMIC_BMI2 + if (dctx->bmi2) { + return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +/* ZSTD_getLongOffsetsShare() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) + * compared to maximum possible of (1< 22) total += 1; + } + + assert(tableLog <= OffFSELog); + total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + + return total; +} +#endif + +size_t +ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, const int 
frame) +{ /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; + /* isLongOffset must be true if there are long offsets. + * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. + * We don't expect that to be the case in 64-bit mode. + * In block mode, window size is not known, so we have to be conservative. + * (note: but it could be evaluated from current-lowLimit) + */ + ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + + RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; + } + + /* Build Decoding Tables */ + { + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. + */ +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; +#endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + srcSize -= seqHSize; + + RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + if ( !usePrefetchDecoder + && (!frame || (dctx->fParams.windowSize > (1<<24))) + && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ + U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); + U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ + usePrefetchDecoder = (shareLongOffsets >= minShare); + } +#endif + + dctx->ddictIsCold = 0; + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + if (usePrefetchDecoder) +#endif +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +#endif + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +#endif + } +} + + +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) +{ + if (dst != dctx->previousDstEnd) { /* not contiguous */ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dst; + dctx->previousDstEnd = dst; + } +} + + +size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t dSize; + ZSTD_checkContinuity(dctx, dst); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; +} +/**** ended inlining decompress/zstd_decompress_block.c ****/ diff --git a/module/zstd/lib/zstd.h b/module/zstd/lib/zstd.h new file mode 100644 index 0000000000..b6772f8818 --- /dev/null +++ b/module/zstd/lib/zstd.h @@ -0,0 +1,2115 @@ +/* + * BSD 3-Clause Clear License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved. 
+ */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 + +/* ====== Dependency ======*/ +#include <limits.h> /* INT_MAX */ +#include <stddef.h> /* size_t */ + + +/* ===== ZSTDLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDLIB_API ZSTDLIB_VISIBILITY +#endif + + +/******************************************************************************* + Introduction + + zstd, short for Zstandard, is a fast lossless compression algorithm, targeting + real-time compression scenarios at zlib-level and better compression ratios. + The zstd compression library provides in-memory compression and decompression + functions. + + The library supports regular compression levels from 1 up to ZSTD_maxCLevel(), + which is currently 22. Levels >= 20, labeled `--ultra`, should be used with + caution, as they require more memory. The library also offers negative + compression levels, which extend the range of speed vs. ratio preferences. + The lower the level, the faster the speed (at the cost of compression). + + Compression can be done in: + - a single step (described as Simple API) + - a single step, reusing a context (described as Explicit context) + - unbounded multiple steps (described as Streaming compression) + + The compression ratio achievable on small data can be highly improved using + a dictionary. 
Dictionary compression can be performed in: + - a single step (described as Simple dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing + dictionary API) + + Advanced experimental functions can be accessed using + `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h. + + Advanced experimental APIs should never be used with a dynamically-linked + library. They are not "stable"; their definitions or signatures may change in + the future. Only static linking is allowed. +*******************************************************************************/ + +/*------ Version ------*/ +#define ZSTD_VERSION_MAJOR 1 +#define ZSTD_VERSION_MINOR 4 +#define ZSTD_VERSION_RELEASE 5 + +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) +ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */ + +#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE +#define ZSTD_QUOTE(str) #str +#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) +#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) +ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */ + +/* ************************************* + * Default constant + ***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + +/* ************************************* + * Constants + ***************************************/ + +/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */ +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */ +#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0 + +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1<<ZSTD_BLOCKSIZELOG_MAX) + + + +/*************************************** +* Simple API +***************************************/ +/*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. + * Hint : compression runs faster if `dstCapacity` >= 
`ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity`), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*! ZSTD_decompress() : + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + +/*! ZSTD_getFrameContentSize() : requires v1.3.0+ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). 
+ * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. + * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/*! ZSTD_getDecompressedSize() : + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findFrameCompressedSize() : + * `src` should point to the start of a ZSTD frame or skippable frame. + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +/*====== Helper functions ======*/ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ + + +/*************************************** +* Explicit context +***************************************/ +/*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. + * Note 2 : In multi-threaded environments, + * use one different context per thread for parallel execution. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); + +/*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. + * Important : in order to behave similarly to `ZSTD_compress()`, + * this function compresses at requested compression level, + * __ignoring any other parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. 
+ */ +ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, + * and re-use it for each successive decompression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); + +/*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. + * Compatible with sticky parameters. + */ +ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced compression API +***************************************/ + +/* API design : + * Parameters are pushed one by one into an existing context, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! + * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. + * In the future, we expect to remove from experimental API entry points which are redundant with this API. + */ + + +/* Compression strategies, listed from fastest to strongest */ +typedef enum { ZSTD_fast=1, + ZSTD_dfast=2, + ZSTD_greedy=3, + ZSTD_lazy=4, + ZSTD_lazy2=5, + ZSTD_btlazy2=6, + ZSTD_btopt=7, + ZSTD_btultra=8, + ZSTD_btultra2=9 + /* note : new strategies _might_ be added in the future. 
+ Only the order (from fast to strong) is guaranteed */ +} ZSTD_strategy; + + +typedef enum { + + /* compression parameters + * Note: When compressing with a ZSTD_CDict these parameters are superseded + * by the parameters used to construct the ZSTD_CDict. + * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */ + ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table. + * Note that exact compression parameters are dynamically determined, + * depending on both compression level and srcSize (when known). + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. + * Note 1 : it's possible to pass a negative compression level. + * Note 2 : setting a level does not automatically set all other compression parameters + * to default. Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ + /* Advanced compression parameters : + * It's possible to pin down compression parameters to some specific values. + * In which case, these values are no longer dynamically selected by the compressor */ + ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2. + * This will set a memory budget for streaming decompression, + * with larger values requiring more memory + * and typically compressing more. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT + * requires explicitly allowing such size at streaming decompression stage. */ + ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2. + * Resulting memory usage is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. 
+ * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2. + * Resulting memory usage is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. + * Larger tables result in better and slower compression. + * This parameter is useless for "fast" strategy. + * It's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. + * Special: value 0 means "use default chainLog". */ + ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. + * This parameter is useless for "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_c_minMatch=105, /* Minimum size of searched matches. + * Note that Zstandard can still find matches of smaller size, + * it just tweaks its search algorithm to look for this size and larger. + * Larger values increase compression and decompression speed, but decrease ratio. + * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. + * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_c_targetLength=106, /* Impact of this field depends on strategy. + * For strategies btopt, btultra & btultra2: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition. 
+ * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. + * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB + * except when expressly set to a different value. */ + ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. + * Special: value 0 means "automatically determine hashlog". */ + ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX. + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. + * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashRateLog". 
*/ + + /* frame parameters */ + ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression. + * This is automatically the case when using ZSTD_compress2(), + * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * They return an error otherwise. */ + ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. + * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() : + * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, + * while compression work is performed in parallel, within worker threads. + * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : + * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). + * More workers improve speed, but also increase memory usage. + * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */ + ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. + * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. 
+ * The minimum size is automatically and transparently enforced. */ + ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. + * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. + * It helps preserve compression ratio, while each job is compressed in parallel. + * This value is enforced only when nbWorkers >= 1. + * Larger values increase compression ratio, but decrease speed. + * Possible values range from 0 to 9 : + * - 0 means "default" : value will be determined by the library, depending on strategy + * - 1 means "no overlap" + * - 9 means "full overlap", using a full window size. + * Each intermediate rank increases/decreases load size by a factor 2 : + * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default + * default value varies between 6 and 9, depending on strategy */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_c_rsyncable + * ZSTD_c_format + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode + * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. + */ + ZSTD_c_experimentalParam1=500, + ZSTD_c_experimentalParam2=10, + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, + ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004 +} ZSTD_cParameter; + +typedef struct { + size_t error; + int lowerBound; + int upperBound; +} ZSTD_bounds; + +/*! 
ZSTD_cParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - lower and upper bounds, both inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam); + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is generally only possible during frame initialization (before starting compression). + * Exception : when using multi-threading mode (nbWorkers >= 1), + * the following parameters can be updated _during_ compression (within same frame): + * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. + * new parameters will be active for next job only (after a flush()). + * @return : an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtx_setPledgedSrcSize() : + * Total input data size to be compressed as a single frame. + * Value will be written in frame header, unless if explicitly forbidden using ZSTD_c_contentSizeFlag. + * This value will also be controlled at end of frame, and trigger an error if not respected. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame. + * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN. + * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame. + * Note 2 : pledgedSrcSize is only valid once, for the next frame. 
+ * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN. + * Note 3 : Whenever all input data is provided and consumed in a single round, + * for example with ZSTD_compress2(), + * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end), + * this value is automatically overridden by srcSize instead. + */ +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize); + +typedef enum { + ZSTD_reset_session_only = 1, + ZSTD_reset_parameters = 2, + ZSTD_reset_session_and_parameters = 3 +} ZSTD_ResetDirective; + +/*! ZSTD_CCtx_reset() : + * There are 2 different things that can be reset, independently or jointly : + * - The session : will stop compressing current frame, and make CCtx ready to start a new one. + * Useful after an error, or to interrupt any ongoing compression. + * Any internal data not yet flushed is cancelled. + * Compression parameters and dictionary remain unchanged. + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". + * This removes any reference to any dictionary too. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. + */ +ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + +/*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. 
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity`), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced decompression API +***************************************/ + +/* The advanced API pushes parameters one by one into an existing DCtx context. + * Parameters are sticky, and remain valid for all following frames + * using the same DCtx context. + * It's possible to reset parameters to default values using ZSTD_DCtx_reset(). + * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream(). + * Therefore, no new decompression function is necessary. + */ + +typedef enum { + + ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which + * the streaming API will refuse to allocate memory buffer + * in order to protect the host from unreasonable memory requirements. + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT). + * Special: value 0 means "use default maximum windowLog". */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_d_format + * ZSTD_d_stableOutBuffer + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001 + +} ZSTD_dParameter; + +/*! 
ZSTD_dParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - both lower and upper bounds, inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam); + +/*! ZSTD_DCtx_setParameter() : + * Set one decompression parameter, selected by enum ZSTD_dParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is only possible during frame initialization (before starting decompression). + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value); + +/*! ZSTD_DCtx_reset() : + * Return a DCtx to clean state. + * Session and parameters can be reset jointly or separately. + * Parameters can only be reset when no active frame is being decompressed. + * @return : 0, or an error code, which can be tested with ZSTD_isError() + */ +ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset); + + +/**************************** +* Streaming +****************************/ + +typedef struct ZSTD_inBuffer_s { + const void* src; /**< start of input buffer */ + size_t size; /**< size of input buffer */ + size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_inBuffer; + +typedef struct ZSTD_outBuffer_s { + void* dst; /**< start of output buffer */ + size_t size; /**< size of output buffer */ + size_t pos; /**< position where writing stopped. Will be updated. 
Necessarily 0 <= pos <= size */ +} ZSTD_outBuffer; + + + +/*-*********************************************************************** +* Streaming compression - HowTo +* +* A ZSTD_CStream object is required to track streaming operation. +* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. +* ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* +* For parallel execution, use one separate ZSTD_CStream per thread. +* +* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. +* +* Parameters are sticky : when starting a new compression on the same context, +* it will re-use the same sticky parameters as previous compression session. +* When in doubt, it's recommended to fully initialize the context before usage. +* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), +* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +* set more specific parameters, the pledged source size, or load a dictionary. +* +* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to +* consume input stream. The function will automatically update both `pos` +* fields within `input` and `output`. +* Note that the function may not consume the entire input, for example, because +* the output buffer is already full, in which case `input.pos < input.size`. +* The caller must check if input has been entirely consumed. +* If not, the caller must make some room to receive more compressed data, +* and then present again remaining input data. +* note: ZSTD_e_continue is guaranteed to make some forward progress when called, +* but doesn't guarantee maximal forward progress. This is especially relevant +* when compressing with multiple threads. 
The call won't block if it can +* consume some input, but if it can't it will wait for some, but not all, +* output to be flushed. +* @return : provides a minimum amount of data remaining to be flushed from internal buffers +* or an error code, which can be tested using ZSTD_isError(). +* +* At any moment, it's possible to flush whatever data might remain stuck within internal buffer, +* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated. +* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0). +* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the +* operation. +* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if internal buffers are entirely flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* The epilogue is required for decoders to consider a frame completed. +* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to +* start a new frame. +* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. 
+* @return : 0 if frame fully completed and fully flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* *******************************************************************/ + +typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */ + /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */ +/*===== ZSTD_CStream management functions =====*/ +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void); +ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); + +/*===== Streaming compression functions =====*/ +typedef enum { + ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */ + ZSTD_e_flush=1, /* flush any data provided so far, + * it creates (at least) one new block, that can be decoded immediately on reception; + * frame will continue: any future data can still reference previously compressed data, improving compression. + * note : multithreaded compression will block to flush as much output as possible. */ + ZSTD_e_end=2 /* flush any remaining data _and_ close current frame. + * note that frame is only closed after compressed data is fully flushed (return value == 0). + * After that point, any additional data starts a new frame. + * note : each frame is independent (does not reference any content from previous frame). + * note : multithreaded compression will block to flush as much output as possible. */ +} ZSTD_EndDirective; + +/*! ZSTD_compressStream2() : + * Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode) + * - output->pos must be <= dstCapacity, input->pos must be <= srcSize + * - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit. + * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller. + * - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, and distributes jobs to internal worker threads, flush whatever is available, + * and then immediately returns, just indicating that there is some data remaining to be flushed. + * The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte. + * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking. + * - @return provides a minimum amount of data remaining to be flushed from internal buffers + * or an error code, which can be tested using ZSTD_isError(). + * if @return != 0, flush is not fully completed, there is still some data left within internal buffers. + * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers. + * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed. + * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0), + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. 
+ */ +ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/* These buffer sizes are softly recommended. + * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output. + * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(), + * reducing the amount of memory shuffling and buffering, resulting in minor performance savings. + * + * However, note that these recommendations are from the perspective of a C caller program. + * If the streaming interface is invoked from some other language, + * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo, + * a major performance rule is to reduce crossing such interface to an absolute minimum. + * It's not rare that performance ends being spent more into the interface, rather than compression itself. + * In which cases, prefer using large buffers, as large as practical, + * for both input and output, to reduce the nb of roundtrips. + */ +ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */ + + +/* ***************************************************************************** + * This following is a legacy streaming API. + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. + * Advanced parameters and dictionary compression can only be used through the + * new API. + ******************************************************************************/ + +/*! 
+ * Equivalent to: + * + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + */ +ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); +/*! + * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue). + * NOTE: The return value is different. ZSTD_compressStream() returns a hint for + * the next read size (if non-zero and not an error). ZSTD_compressStream2() + * returns the minimum nb of bytes left to flush (if non-zero and not an error). + */ +ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input); +/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */ +ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); +/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */ +ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + + +/*-*************************************************************************** +* Streaming decompression - HowTo +* +* A ZSTD_DStream object is required to track streaming operations. +* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +* ZSTD_DStream objects can be re-used multiple times. +* +* Use ZSTD_initDStream() to start a new decompression operation. +* @return : recommended first input size +* Alternatively, use advanced API to set specific properties. +* +* Use ZSTD_decompressStream() repetitively to consume your input. +* The function will update both `pos` fields. +* If `input.pos < input.size`, some input has not been consumed. +* It's up to the caller to present again remaining data. +* The function tries to flush all data decoded immediately, respecting output buffer size. +* If `output.pos < output.size`, decoder has flushed everything it could. 
+* But if `output.pos == output.size`, there might be some data left within internal buffers. +* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. +* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. +* @return : 0 when a frame is completely decoded and fully flushed, +* or an error code, which can be tested using ZSTD_isError(), +* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : +* the return value is a suggested next input size (just a hint for better latency) +* that will never request more than the remaining frame size. +* *******************************************************************************/ + +typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */ + /* For compatibility with versions <= v1.2.0, prefer differentiating them. */ +/*===== ZSTD_DStream management functions =====*/ +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void); +ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); + +/*===== Streaming decompression functions =====*/ + +/* This function is redundant with the advanced API and equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ +ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + +ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + +ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */ + + +/************************** +* Simple dictionary API +***************************/ +/*! ZSTD_compress_usingDict() : + * Compression at an explicit compression level using a Dictionary.
+ * A dictionary can be any arbitrary data segment (also called a prefix), + * or a buffer with specified information (see dictBuilder/zdict.h). + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + int compressionLevel); + +/*! ZSTD_decompress_usingDict() : + * Decompression using a known Dictionary. + * Dictionary must be identical to the one used during compression. + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + + +/*********************************** + * Bulk processing dictionary API + **********************************/ +typedef struct ZSTD_CDict_s ZSTD_CDict; + +/*! ZSTD_createCDict() : + * When compressing multiple messages or blocks using the same dictionary, + * it's recommended to digest the dictionary only once, since it's a costly operation. + * ZSTD_createCDict() will create a state from digesting a dictionary. + * The resulting state can be used for future compression operations with very limited startup cost. + * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict. + * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content. 
+ * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer, + * in which case the only thing that it transports is the @compressionLevel. + * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively, + * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, + int compressionLevel); + +/*! ZSTD_freeCDict() : + * Function frees memory allocated by ZSTD_createCDict(). */ +ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. + * Note : compression level is _decided at dictionary creation time_, + * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict); + + +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/*! ZSTD_createDDict() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_freeDDict() : + * Function frees memory allocated with ZSTD_createDDict() */ +ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict); + +/*! ZSTD_decompress_usingDDict() : + * Decompression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_DDict* ddict); + + +/******************************** + * Dictionary helper functions + *******************************/ + +/*! 
ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); + +/*! ZSTD_getDictID_fromDDict() : + * Provides the dictID of the dictionary loaded into `ddict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + +/*! ZSTD_getDictID_fromFrame() : + * Provides the dictID required to decompress the frame stored within `src`. + * If @return == 0, the dictID could not be decoded. + * This could be for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). + * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. + * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + + +/******************************************************************************* + * Advanced dictionary and prefix API + * + * This API allows dictionaries to be used with ZSTD_compress2(), + * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and + * only reset when the context is reset with ZSTD_reset_parameters or + * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/ + + +/*! ZSTD_CCtx_loadDictionary() : + * Create an internal CDict from `dict` buffer. + * Decompression will have to use same dictionary. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. + * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, + * compression parameters can no longer be changed after loading a dictionary. + * Note 3 :`dict` content will be copied internally. + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() + * to precisely select how dictionary content must be interpreted. */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_refCDict() : + * Reference a prepared dictionary, to be used for all next compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. + * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode. + * The dictionary will remain valid for future compressed frames using same CCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */ +ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + +/*! ZSTD_CCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) for next compressed frame. + * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. + * Its content must remain unmodified during compression. + * Note 2 : If the intention is to diff some large src data blob with some prior version of itself, + * ensure that the window size is large enough to contain the entire source. + * See ZSTD_c_windowLog. + * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters. + * It's a CPU consuming operation, with non-negligible impact on latency. + * If there is a need to use the same prefix multiple times, consider loadDictionary instead. + * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent). + * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + +/*! ZSTD_DCtx_loadDictionary() : + * Create an internal DDict from dict buffer, + * to be used to decompress next frames. 
+ * The dictionary remains valid for all future frames, until explicitly invalidated. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Loading a dictionary involves building tables, + * which has a non-negligible impact on CPU usage and latency. + * It's recommended to "load once, use many times", to amortize the cost + * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading. + * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead. + * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of + * how dictionary content is loaded and interpreted. + */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); + +/*! ZSTD_DCtx_refDDict() : + * Reference a prepared dictionary, to be used to decompress next frames. + * The dictionary remains active for decompression of future frames using same DCtx. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +/*! ZSTD_DCtx_refPrefix() : + * Reference a prefix (single-usage dictionary) to decompress next frame. + * This is the reverse operation of ZSTD_CCtx_refPrefix(), + * and must use the same prefix as the one used during compression. + * Prefix is **only used once**. Reference is discarded at end of frame. + * End of frame is reached when ZSTD_decompressStream() returns 0. 
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary + * Note 2 : Prefix buffer is referenced. It **must** outlive decompression. + * Prefix buffer must remain unmodified up to the end of frame, + * reached when ZSTD_decompressStream() returns 0. + * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent). + * Use ZSTD_DCtx_refPrefix_advanced() to alter dictMode (Experimental section) + * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost. + * A full dictionary is more costly, as it requires building tables. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, + const void* prefix, size_t prefixSize); + +/* === Memory management === */ + +/*! ZSTD_sizeof_*() : + * These functions give the _current_ memory usage of selected object. + * Note that object memory usage can evolve (increase or decrease) over time. */ +ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); +ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + +#endif /* ZSTD_H_235446 */ + + +/* ************************************************************************************** + * ADVANCED AND EXPERIMENTAL FUNCTIONS + **************************************************************************************** + * The definitions in the following section are considered experimental. + * They are provided for advanced scenarios. + * They should never be used with a dynamic library, as prototypes may change in the future. + * Use them only in association with static linking.
+ * ***************************************************************************************/ + +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + +/**************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** + * The following symbols and constants + * are not planned to join "stable API" status in the near future. + * They can still change in future versions. + * Some of them are planned to remain in the static_only section indefinitely. + * Some of them might be removed in the future (especially when redundant with existing stable functions) + * ***************************************************************************************/ + +#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */ +#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2) +#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */ +#define ZSTD_SKIPPABLEHEADERSIZE 8 + +/* compression parameter bounds */ +#define ZSTD_WINDOWLOG_MAX_32 30 +#define ZSTD_WINDOWLOG_MAX_64 31 +#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30) +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX_32 29 +#define ZSTD_CHAINLOG_MAX_64 30 +#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? 
ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64)) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */ +#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX +#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ +#define ZSTD_STRATEGY_MIN ZSTD_fast +#define ZSTD_STRATEGY_MAX ZSTD_btultra2 + + +#define ZSTD_OVERLAPLOG_MIN 0 +#define ZSTD_OVERLAPLOG_MAX 9 + +#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame + * requiring larger than (1< 3, then this is seqDef.offset - 3 + * If seqDef.offset < 3, then this is the corresponding repeat offset + * But if seqDef.offset < 3 and litLength == 0, this is the + * repeat offset before the corresponding repeat offset + * And if seqDef.offset == 3 and litLength == 0, this is the + * most recent repeat offset - 1 + */ + unsigned int offset; + unsigned int litLength; /* Literal length */ + unsigned int matchLength; /* Match length */ + /* 0 when seq not rep and seqDef.offset otherwise + * when litLength == 0 this will be <= 4, otherwise <= 3 like normal + */ + unsigned int rep; +} ZSTD_Sequence; + +typedef struct { + unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */ + unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */ + unsigned hashLog; /**< dispatch table : larger == faster, more memory */ + unsigned searchLog; /**< nb of searches : larger == more compression, slower */ + unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */ + unsigned targetLength; /**< acceptable match size for optimal 
parser (only) : larger == more compression, slower */ + ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */ +} ZSTD_compressionParameters; + +typedef struct { + int contentSizeFlag; /**< 1: content size will be in frame header (when known) */ + int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */ + int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */ +} ZSTD_frameParameters; + +typedef struct { + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; +} ZSTD_parameters; + +typedef enum { + ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */ + ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */ + ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */ +} ZSTD_dictContentType_e; + +typedef enum { + ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */ + ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. */ +} ZSTD_dictLoadMethod_e; + +typedef enum { + ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */ + ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number. + * Useful to save 4 bytes per generated frame. + * Decoder cannot recognise automatically this format, requiring this instruction. */ +} ZSTD_format_e; + +typedef enum { + /* Note: this enum and the behavior it controls are effectively internal + * implementation details of the compressor. They are expected to continue + * to evolve and should be considered only in the context of extremely + * advanced performance tuning. 
+ * + * Zstd currently supports the use of a CDict in three ways: + * + * - The contents of the CDict can be copied into the working context. This + * means that the compression can search both the dictionary and input + * while operating on a single set of internal tables. This makes + * the compression faster per-byte of input. However, the initial copy of + * the CDict's tables incurs a fixed cost at the beginning of the + * compression. For small compressions (< 8 KB), that copy can dominate + * the cost of the compression. + * + * - The CDict's tables can be used in-place. In this model, compression is + * slower per input byte, because the compressor has to search two sets of + * tables. However, this model incurs no start-up cost (as long as the + * working context's tables can be reused). For small inputs, this can be + * faster than copying the CDict's tables. + * + * - The CDict's tables are not used at all, and instead we use the working + * context alone to reload the dictionary and use params based on the source + * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict(). + * This method is effective when the dictionary sizes are very small relative + * to the input size, and the input size is fairly large to begin with. + * + * Zstd has a simple internal heuristic that selects which strategy to use + * at the beginning of a compression. However, if experimentation shows that + * Zstd is making poor choices, it is possible to override that choice with + * this enum. + */ + ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */ + ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */ + ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */ + ZSTD_dictForceLoad = 3 /* Always reload the dictionary */ +} ZSTD_dictAttachPref_e; + +typedef enum { + ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level. 
+ * Negative compression levels will be uncompressed, and positive compression + * levels will be compressed. */ + ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be + * emitted if Huffman compression is not profitable. */ + ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */ +} ZSTD_literalCompressionMode_e; + + +/*************************************** +* Frame size functions +***************************************/ + +/*! ZSTD_findDecompressedSize() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - decompressed size of all data in all successive frames + * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * note 2 : decompressed size is always present when compression is done with ZSTD_compress() + * note 3 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure result fits within application's authorized limits. + * Each application can set its own limits. + * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to + * read each contained frame header. This is fast as most of the data is skipped, + * however it does mean that all frame data must be present and valid. 
*/ +ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_decompressBound() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - upper-bound for the decompressed size of all data in all successive frames + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. + * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`. + * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value. + * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by: + * upper-bound = # blocks * min(128 KB, Window_Size) + */ +ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + +/*! ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ +ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + +/*! ZSTD_getSequences() : + * Extract sequences from the sequence store + * zc can be used to insert custom compression params. + * This function invokes ZSTD_compress2 + * @return : number of sequences extracted + */ +ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize); + + +/*************************************** +* Memory management +***************************************/ + +/*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation.
+ * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough + * for any compression level up to selected one. + * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate + * does not include space for a window buffer. + * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * + * When srcSize can be bound by a known and rather "small" value, + * this fact can be used to provide a tighter estimation + * because the CCtx compression context will need less memory. + * This tighter estimation can be provided by more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * + * Note 2 : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); + +/*! ZSTD_estimateCStreamSize() : + * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. + * It will also consider src size to be arbitrarily "large", which is worst case. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. 
+ * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. + * ZSTD_DStream memory budget depends on window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. + * In this case, get total size by adding ZSTD_estimate?DictSize */ +ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + +/*! ZSTD_estimate?DictSize() : + * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). + * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). + * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. + */ +ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); + +/*! ZSTD_initStatic*() : + * Initialize an object using a pre-allocated fixed-size buffer.
+ * workspace: The memory area to emplace the object into. + * Provided pointer *must be 8-bytes aligned*. + * Buffer must outlive object. + * workspaceSize: Use ZSTD_estimate*Size() to determine + * how large workspace must be to support target scenario. + * @return : pointer to object (same address as workspace, just different type), + * or NULL if error (size too small, incorrect alignment, etc.) + * Note : zstd will never resize nor malloc() when using a static buffer. + * If the object requires more memory than available, + * zstd will just error out (typically ZSTD_error_memory_allocation). + * Note 2 : there is no corresponding "free" function. + * Since workspace is allocated externally, it must be freed externally too. + * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level + * into its associated cParams. + * Limitation 1 : currently not compatible with internal dictionary creation, triggered by + * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict(). + * Limitation 2 : static cctx currently not compatible with multi-threading. + * Limitation 3 : static dctx is incompatible with legacy support. 
+ */ +ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */ + +ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */ + +ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams); + +ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType); + + +/*! Custom memory allocation : + * These prototypes make it possible to pass your own allocation/free functions. + * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below. + * All allocation/free operations will be completed using these custom variants instead of regular ones. 
+ */ +typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size); +typedef void (*ZSTD_freeFunction) (void* opaque, void* address); +typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; +static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ + +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + + + +/*************************************** +* Advanced compression functions +***************************************/ + +/*! ZSTD_createCDict_byReference() : + * Create a digested dictionary for compression + * Dictionary content is just referenced, not duplicated. + * As a consequence, `dictBuffer` **must** outlive CDict, + * and its content must remain unmodified throughout the lifetime of CDict. + * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. 
+ * `estimatedSrcSize` value is optional, select 0 if not known */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_getParams() : + * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. + * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ +ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_checkCParams() : + * Ensure param values remain within authorized range. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + +/*! ZSTD_adjustCParams() : + * optimize params for a given `srcSize` and `dictSize`. + * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN. + * `dictSize` must be `0` when there is no dictionary. + * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. + * This function never fails (wide contract) */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + +/*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ +ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); + +/*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now REDUNDANT. 
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning in some future version */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams); + + +/*! ZSTD_CCtx_loadDictionary_byReference() : + * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. + * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_loadDictionary_advanced() : + * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_CCtx_refPrefix_advanced() : + * Same as ZSTD_CCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/* === experimental parameters === */ +/* these parameters can be used with ZSTD_CCtx_setParameter() + * they are not guaranteed to remain supported in the future */ + + /* Enables rsyncable mode, + * which makes compressed files more rsync friendly + * by adding periodic synchronization points to the compressed data. + * The target average block size is ZSTD_c_jobSize / 2.
+ * It's possible to modify the job size to increase or decrease + * the granularity of the synchronization point. + * Once the jobSize is smaller than the window size, + * it will result in compression ratio degradation. + * NOTE 1: rsyncable mode only works when multithreading is enabled. + * NOTE 2: rsyncable performs poorly in combination with long range mode, + * since it will decrease the effectiveness of synchronization points, + * though mileage may vary. + * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s. + * If the selected compression level is already running significantly slower, + * the overall speed won't be significantly impacted. + */ + #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1 + +/* Select a compression format. + * The value must be of type ZSTD_format_e. + * See ZSTD_format_e enum definition for details */ +#define ZSTD_c_format ZSTD_c_experimentalParam2 + +/* Force back-reference distances to remain < windowSize, + * even when referencing into Dictionary content (default:0) */ +#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3 + +/* Controls whether the contents of a CDict + * are used in place, or copied into the working context. + * Accepts values from the ZSTD_dictAttachPref_e enum. + * See the comments on that enum for an explanation of the feature. */ +#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +/* Controls how the literals are compressed (default is auto). + * The value must be of type ZSTD_literalCompressionMode_e. + * See ZSTD_literalCompressionMode_e enum definition for details. + */ +#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +/* Tries to fit compressed block size to be around targetCBlockSize. + * No target when targetCBlockSize == 0. + * There is no guarantee on compressed block size (default:0) */ +#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 + +/* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size, + * but compression ratio may regress significantly if guess considerably underestimates */ +#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 + +/*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. + * @return : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); + + +/*! ZSTD_CCtx_params : + * Quick howto : + * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure + * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into + * an existing ZSTD_CCtx_params structure. + * This is similar to + * ZSTD_CCtx_setParameter(). + * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to + * an existing CCtx. + * These parameters will be applied to + * all subsequent frames. + * - ZSTD_compressStream2() : Do compression using the CCtx. + * - ZSTD_freeCCtxParams() : Free the memory. + * + * This can be used with ZSTD_estimateCCtxSize_usingCCtxParams() + * for static allocation of CCtx for single-threaded compression. + */ +ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_reset() : + * Reset params to default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); + +/*! ZSTD_CCtxParams_init() : + * Initializes the compression parameters of cctxParams according to + * compression level. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); + +/*! ZSTD_CCtxParams_init_advanced() : + * Initializes the compression and frame parameters of cctxParams according to + * params. All other parameters are reset to their default values.
+ */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); + +/*! ZSTD_CCtxParams_setParameter() : + * Similar to ZSTD_CCtx_setParameter. + * Set one compression parameter, selected by enum ZSTD_cParameter. + * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams(). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtxParams_getParameter() : + * Similar to ZSTD_CCtx_getParameter. + * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); + +/*! ZSTD_CCtx_setParametersUsingCCtxParams() : + * Apply a set of ZSTD_CCtx_params to the compression context. + * This can be done even after compression is started, + * if nbWorkers==0, this will have no impact until a new compression is started. + * if nbWorkers>=1, new parameters will be picked up at next job, + * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); + +/*! ZSTD_compressStream2_simpleArgs() : + * Same as ZSTD_compressStream2(), + * but using only integral types as arguments. + * This variant might be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers. 
+ */ +ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp); + + +/*************************************** +* Advanced decompression functions +***************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); + +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * Dictionary content is referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives DDict, + * it must remain read accessible throughout the lifetime of DDict */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); + +/*! ZSTD_DCtx_loadDictionary_byReference() : + * Same as ZSTD_DCtx_loadDictionary(), + * but references `dict` content instead of copying it into `dctx`. + * This saves memory if `dict` remains around. + * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); + +/*! ZSTD_DCtx_loadDictionary_advanced() : + * Same as ZSTD_DCtx_loadDictionary(), + * but gives direct control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?).
*/ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_DCtx_refPrefix_advanced() : + * Same as ZSTD_DCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_DCtx_setMaxWindowSize() : + * Refuses allocating internal buffers for frames requiring a window size larger than provided limit. + * This protects a decoder context from reserving too much memory for itself (potential attack scenario). + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); + +/* ZSTD_d_format + * experimental parameter, + * allowing selection between ZSTD_format_e input compression formats + */ +#define ZSTD_d_format ZSTD_d_experimentalParam1 +/* ZSTD_d_stableOutBuffer + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * + * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same + * between calls, except for the modifications that zstd makes to pos (the + * caller must not modify pos). This is checked by the decompressor, and + * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer + * MUST be large enough to fit the entire decompressed frame. This will be + * checked when the frame content size is known. 
The data in the ZSTD_outBuffer + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * + * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. + * If you need to avoid the input buffer allocation use the buffer-less + * streaming API. + * + * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds + * memory. However, decompression WILL fail if you violate the preconditions. + * + * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST + * not be modified during decompression or you will get data corruption. This + * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate + * matches. Normally zstd maintains its own buffer for this purpose, but passing + * this flag tells zstd to use the user provided buffer. + */ +#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2 + +/*! ZSTD_DCtx_setFormat() : + * Instruct the decoder context about what kind of data to decode next. + * This instruction is mandatory to decode data without a fully-formed header, + * such as ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + +/*! ZSTD_decompressStream_simpleArgs() : + * Same as ZSTD_decompressStream(), + * but using only integral types as arguments. + * This can be helpful for binders from dynamic languages + * which have troubles handling structures containing memory pointers.
+ */ +ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos); + + +/******************************************************************** +* Advanced streaming functions +* Warning : most of these functions are now redundant with the Advanced API. +* Once Advanced API reaches "stable" status, +* redundant functions will be deprecated, and then at some point removed. +********************************************************************/ + +/*===== Advanced Streaming compression functions =====*/ +/**! ZSTD_initCStream_srcSize() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * pledgedSrcSize must be correct. If it is not known at init time, use + * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, + * "0" also disables frame content size field. It may be enabled in the future. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); + +/**! ZSTD_initCStream_usingDict() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * Creates an internal CDict (incompatible with static CCtx), except if + * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if + * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + +/**! ZSTD_initCStream_advanced() : + * This function is deprecated, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd parameter and leave the rest as-is. + * for ((param, value) : params) { + * ZSTD_CCtx_setParameter(zcs, param, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * + * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. + * pledgedSrcSize must be correct. + * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, + unsigned long long pledgedSrcSize); + +/**! ZSTD_initCStream_usingCDict() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * note : cdict will just be referenced, and must outlive compression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + +/**! ZSTD_initCStream_usingCDict_advanced() : + * This function is DEPRECATED, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. 
+ * for ((fParam, value) : fParams) { + * ZSTD_CCtx_setParameter(zcs, fParam, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. + * pledgedSrcSize must be correct. If srcSize is not known at init time, use + * value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize); + +/*! ZSTD_resetCStream() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * start a new frame, using same parameters from previous frame. + * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. + * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, + * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. 
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +typedef struct { + unsigned long long ingested; /* nb input bytes read and buffered */ + unsigned long long consumed; /* nb input bytes actually compressed */ + unsigned long long produced; /* nb of compressed bytes generated and buffered */ + unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */ + unsigned currentJobID; /* MT only : latest started job nb */ + unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */ +} ZSTD_frameProgression; + +/* ZSTD_getFrameProgression() : + * tells how much data has been ingested (read from input) + * consumed (input actually compressed) and produced (output) for current frame. + * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. + * Aggregates progression inside active worker threads. + */ +ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); + +/*! ZSTD_toFlushNow() : + * Tell how many bytes are ready to be flushed immediately. + * Useful for multithreading scenarios (nbWorkers >= 1). + * Probe the oldest active job, defined as oldest job not yet entirely flushed, + * and check its output buffer. + * @return : amount of data stored in oldest job and ready to be flushed immediately. + * if @return == 0, it means either : + * + there is no active job (could be checked with ZSTD_getFrameProgression()), or + * + oldest job is still actively compressing data, + * but everything it has produced has also been flushed so far, + * therefore flush speed is limited by production speed of oldest job + * irrespective of the speed of concurrent (and newer) jobs.
+ */ +ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + + +/*===== Advanced Streaming decompression functions =====*/ +/** + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + +/** + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + +/** + * This function is deprecated, and is equivalent to: + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * + * re-use decompression parameters from previous init; saves dictionary loading + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + +/********************************************************************* +* Buffer-less and synchronous inner streaming functions +* +* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +* But it's also a complex one, with several restrictions, documented below. +* Prefer normal streaming API for an easier experience. +********************************************************************* */ + +/** + Buffer-less streaming compression (synchronous mode) + + A ZSTD_CCtx object is required to track streaming operations. 
+ Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. + ZSTD_CCtx object can be re-used multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, + or ZSTD_compressBegin_advanced(), for finer parameter control. + It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : + - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only. + - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks. + - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario. + Worst case evaluation is provided by ZSTD_compressBound(). + ZSTD_compressContinue() doesn't guarantee recover after a failed compression. + - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog). + It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consists of multiple contiguous blocks) + - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps. + In which case, it will "discard" the relevant memory section from its history. + + Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum. + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + + `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. 
+*/ + +/*===== Buffer-less streaming compression functions =====*/ +ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*- + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. + A ZSTD_DCtx object can be re-used multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. 
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. + @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. + For example, do not allocate memory blindly, check that `windowSize` is within expectation. + Each application can set its own limits, depending on local restrictions. + For extended interoperability, it is recommended to support `windowSize` of at least 8 MB. + + ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes. + ZSTD_decompressContinue() is very sensitive to contiguity, + if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, + or that previous contiguous segment is large enough to properly handle maximum back-reference distance. + There are multiple ways to guarantee this condition. + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), + which can @return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. 
+ At which point, decoding can resume from the beginning of the buffer. + Note that already decoded data stored in the buffer should be flushed before being overwritten. + + There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory. + + Finally, if you control the compression process, you can also ignore all buffer size rules, + as long as the encoder and decoder progress in "lock-step", + aka use exactly the same buffer sizes, break contiguity at the same place, etc. + + Once buffers are setup, start decompression, with ZSTD_decompressBegin(). + If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict(). + + Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + + @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + + A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + Context can then be reset to start a new decompression. + + Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType(). + This information is not required to properly decode a frame. + + == Special case : skippable frames == + + Skippable frames allow integration of user-defined data into a flow of concatenated frames. + Skippable frames will be ignored (skipped) by decompressor. 
/*===== Buffer-less streaming decompression functions =====*/

/* Kind of frame described by a ZSTD_frameHeader. */
typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;

/*
 * Frame parameters filled in by ZSTD_getFrameHeader().
 * The values come from the input stream and may be corrupted or spoofed,
 * so callers should range-check them before acting on them.
 */
typedef struct {
    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
    unsigned long long windowSize;       /* maximum back-reference distance; can be very large, up to <= frameContentSize */
    unsigned blockSizeMax;               /* maximum size of a single block of this frame */
    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
    unsigned headerSize;                 /* presumably the frame header size in bytes - TODO confirm */
    unsigned dictID;                     /* dictionary ID recorded in the frame */
    unsigned checksumFlag;               /* presumably non-zero when the frame carries a content checksum - TODO confirm */
} ZSTD_frameHeader;
ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; +ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + + +/* ============================ */ +/** Block level API */ +/* ============================ */ + +/*! + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. 
+ + A few rules to respect : + - Compressing and decompressing require a context structure + + Use ZSTD_createCCtx() and ZSTD_createDCtx() + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block. + - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) ! + ===> In which case, nothing is produced into `dst` ! + + User __must__ test for such outcome and deal directly with uncompressed data + + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0. + Doing so would mess up with statistics history, leading to potential data corruption. + + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !! + + In case of multiple successive blocks, should some of them be uncompressed, + decoder must be informed of their existence in order to follow proper history. + Use ZSTD_insertBlock() for such a case. +*/ + +/*===== Raw zstd block functions =====*/ +ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. 
Useful for multi-blocks decompression. */ + + +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif diff --git a/module/zstd/lib/zstd_errors.h b/module/zstd/lib/zstd_errors.h new file mode 100644 index 0000000000..998398e7e5 --- /dev/null +++ b/module/zstd/lib/zstd_errors.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_ERRORS_H_398273423 +#define ZSTD_ERRORS_H_398273423 + +#if defined (__cplusplus) +extern "C" { +#endif + +/*===== dependency =====*/ +#include /* size_t */ + + +/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDERRORLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDERRORLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +#endif + +/*-********************************************* + * Error codes list + *-********************************************* + * Error codes _values_ are pinned down since v1.3.1 only. + * Therefore, don't rely on values if you may link to any version < v1.3.1. + * + * Only values < 100 are considered stable. + * + * note 1 : this API shall be used with static linking only. 
+ * dynamic linking is not yet officially supported. + * note 2 : Prefer relying on the enum than on its value whenever possible + * This is the only supported way to use the error list < v1.3.1 + * note 3 : ZSTD_isError() is always correct, whatever the library version. + **********************************************/ +typedef enum { + ZSTD_error_no_error = 0, + ZSTD_error_GENERIC = 1, + ZSTD_error_prefix_unknown = 10, + ZSTD_error_version_unsupported = 12, + ZSTD_error_frameParameter_unsupported = 14, + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, + ZSTD_error_workSpace_tooSmall= 66, + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ +} ZSTD_ErrorCode; + +/*! 
ZSTD_getErrorCode() : + convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, + which can be used to compare with enum list published above */ +ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); +ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_ERRORS_H_398273423 */ diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c new file mode 100644 index 0000000000..2c698716c9 --- /dev/null +++ b/module/zstd/zfs_zstd.c @@ -0,0 +1,794 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/*
 * kstat handle, initially NULL.
 * NOTE(review): presumably the kstat that exports zstd_stats; it is set up
 * outside the code visible here - confirm against the module init code.
 */
kstat_t *zstd_ksp = NULL;

/*
 * Named counters describing the health of the ZSTD glue code.
 * The field order must stay in sync with the positional initializer of
 * zstd_stats below.
 */
typedef struct zstd_stats {
	/* NOTE(review): bump site not visible here - confirm meaning */
	kstat_named_t	zstd_stat_alloc_fail;
	/* NOTE(review): bump site not visible here - confirm meaning */
	kstat_named_t	zstd_stat_alloc_fallback;
	/* Compression skipped: no compression context could be created */
	kstat_named_t	zstd_stat_com_alloc_fail;
	/* Decompression failed: no decompression context could be created */
	kstat_named_t	zstd_stat_dec_alloc_fail;
	/* Compression skipped: requested level enum was invalid */
	kstat_named_t	zstd_stat_com_inval;
	/* Decompression failed: level stored in the block header was invalid */
	kstat_named_t	zstd_stat_dec_inval;
	/* Decompression failed: stored compressed length exceeds the buffer */
	kstat_named_t	zstd_stat_dec_header_inval;
	/* Compression errors other than "destination too small" */
	kstat_named_t	zstd_stat_com_fail;
	/* NOTE(review): bump site not visible here - confirm meaning */
	kstat_named_t	zstd_stat_dec_fail;
	/* Number of buffers currently cached in the memory pools */
	kstat_named_t	zstd_stat_buffers;
	/* Total size in bytes of the buffers counted by zstd_stat_buffers */
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

/* Counter names as exported through kstat; every counter is a uint64. */
static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};

/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Stand-alone allocation that is not owned by a pool slot */
	ZSTD_KMEM_DEFAULT,
	/* Allocation owned by a slot of one of the memory pools */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	/* Number of allocator types; keep last */
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects (one slot of a memory pool) */
struct zstd_pool {
	/* Cached buffer, or NULL while the slot is empty */
	void *mem;
	/* Size in bytes of the cached buffer, 0 while the slot is empty */
	size_t size;
	/* Held for as long as the slot's buffer is handed out to a user */
	kmutex_t barrier;
	/* Time in seconds after which an unused buffer may be released */
	hrtime_t timeout;
};

/*
 * Global structure for handling memory allocations.
 * It is stored at the start of every buffer obtained through
 * zstd_mempool_alloc() and records how the buffer has to be released.
 */
struct zstd_kmem {
	/* Which allocator produced this buffer */
	enum zstd_kmem_type kmem_type;
	/* Size in bytes that was requested from the allocator */
	size_t kmem_size;
	/* Owning pool slot, or NULL for buffers that are not pooled */
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	/* Size in bytes of the reserved region */
	size_t mem_size;
	/* Start of the reserved region */
	void *mem;
	/* NOTE(review): presumably serializes users of the region - confirm */
	kmutex_t barrier;
};

/* One entry of the table translating between ZSTD and ZFS level values */
struct zstd_levelmap {
	/* Level as understood by the zstd library (negative for "fast") */
	int16_t zstd_level;
	/* Matching ZFS internal level enum */
	enum zio_zstd_levels level;
};
+ */ +static void *zstd_alloc(void *opaque, size_t size); +static void *zstd_dctx_alloc(void *opaque, size_t size); +static void zstd_free(void *opaque, void *ptr); + +/* Compression memory handler */ +static const ZSTD_customMem zstd_malloc = { + zstd_alloc, + zstd_free, + NULL, +}; + +/* Decompression memory handler */ +static const ZSTD_customMem zstd_dctx_malloc = { + zstd_dctx_alloc, + zstd_free, + NULL, +}; + +/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */ +static struct zstd_levelmap zstd_levels[] = { + {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1}, + {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2}, + {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3}, + {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4}, + {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5}, + {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6}, + {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7}, + {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8}, + {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9}, + {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10}, + {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11}, + {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12}, + {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13}, + {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14}, + {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15}, + {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16}, + {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17}, + {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18}, + {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19}, + {-1, ZIO_ZSTD_LEVEL_FAST_1}, + {-2, ZIO_ZSTD_LEVEL_FAST_2}, + {-3, ZIO_ZSTD_LEVEL_FAST_3}, + {-4, ZIO_ZSTD_LEVEL_FAST_4}, + {-5, ZIO_ZSTD_LEVEL_FAST_5}, + {-6, ZIO_ZSTD_LEVEL_FAST_6}, + {-7, ZIO_ZSTD_LEVEL_FAST_7}, + {-8, ZIO_ZSTD_LEVEL_FAST_8}, + {-9, ZIO_ZSTD_LEVEL_FAST_9}, + {-10, ZIO_ZSTD_LEVEL_FAST_10}, + {-20, ZIO_ZSTD_LEVEL_FAST_20}, + {-30, ZIO_ZSTD_LEVEL_FAST_30}, + {-40, ZIO_ZSTD_LEVEL_FAST_40}, + {-50, ZIO_ZSTD_LEVEL_FAST_50}, + {-60, ZIO_ZSTD_LEVEL_FAST_60}, + {-70, ZIO_ZSTD_LEVEL_FAST_70}, + {-80, ZIO_ZSTD_LEVEL_FAST_80}, + {-90, ZIO_ZSTD_LEVEL_FAST_90}, + {-100, ZIO_ZSTD_LEVEL_FAST_100}, + {-500, ZIO_ZSTD_LEVEL_FAST_500}, + {-1000, 
ZIO_ZSTD_LEVEL_FAST_1000}, +}; + +/* + * This variable represents the maximum count of the pool based on the number + * of CPUs plus some buffer. We default to cpu count * 4, see init_zstd. + */ +static int pool_count = 16; + +#define ZSTD_POOL_MAX pool_count +#define ZSTD_POOL_TIMEOUT 60 * 2 + +static struct zstd_fallback_mem zstd_dctx_fallback; +static struct zstd_pool *zstd_mempool_cctx; +static struct zstd_pool *zstd_mempool_dctx; + +/* + * The library zstd code expects these if ADDRESS_SANITIZER gets defined, + * and while ASAN does this, KASAN defines that and does not. So to avoid + * changing the external code, we do this. + */ +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define ADDRESS_SANITIZER 1 +#endif +#elif defined(__SANITIZE_ADDRESS__) +#define ADDRESS_SANITIZER 1 +#endif +#if defined(_KERNEL) && defined(ADDRESS_SANITIZER) +void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +void __asan_poison_memory_region(void const volatile *addr, size_t size); +void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {}; +void __asan_poison_memory_region(void const volatile *addr, size_t size) {}; +#endif + + +static void +zstd_mempool_reap(struct zstd_pool *zstd_mempool) +{ + struct zstd_pool *pool; + + if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) { + return; + } + + /* free obsolete slots */ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + pool = &zstd_mempool[i]; + if (pool->mem && mutex_tryenter(&pool->barrier)) { + /* Free memory if unused object older than 2 minutes */ + if (pool->mem && gethrestime_sec() > pool->timeout) { + vmem_free(pool->mem, pool->size); + ZSTDSTAT_SUB(zstd_stat_buffers, 1); + ZSTDSTAT_SUB(zstd_stat_size, pool->size); + pool->mem = NULL; + pool->size = 0; + pool->timeout = 0; + } + mutex_exit(&pool->barrier); + } + } +} + +/* + * Try to get a cached allocated buffer from memory pool or allocate a new one + * if necessary. 
If a object is older than 2 minutes and does not fit the + * requested size, it will be released and a new cached entry will be allocated. + * If other pooled objects are detected without being used for 2 minutes, they + * will be released, too. + * + * The concept is that high frequency memory allocations of bigger objects are + * expensive. So if a lot of work is going on, allocations will be kept for a + * while and can be reused in that time frame. + * + * The scheduled release will be updated every time a object is reused. + */ + +static void * +zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) +{ + struct zstd_pool *pool; + struct zstd_kmem *mem = NULL; + + if (!zstd_mempool) { + return (NULL); + } + + /* Seek for preallocated memory slot and free obsolete slots */ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + pool = &zstd_mempool[i]; + /* + * This lock is simply a marker for a pool object being in use. + * If it's already hold, it will be skipped. + * + * We need to create it before checking it to avoid race + * conditions caused by running in a threaded context. + * + * The lock is later released by zstd_mempool_free. + */ + if (mutex_tryenter(&pool->barrier)) { + /* + * Check if objects fits the size, if so we take it and + * update the timestamp. + */ + if (pool->mem && size <= pool->size) { + pool->timeout = gethrestime_sec() + + ZSTD_POOL_TIMEOUT; + mem = pool->mem; + return (mem); + } + mutex_exit(&pool->barrier); + } + } + + /* + * If no preallocated slot was found, try to fill in a new one. + * + * We run a similar algorithm twice here to avoid pool fragmentation. + * The first one may generate holes in the list if objects get released. + * We always make sure that these holes get filled instead of adding new + * allocations constantly at the end. 
+ */ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + pool = &zstd_mempool[i]; + if (mutex_tryenter(&pool->barrier)) { + /* Object is free, try to allocate new one */ + if (!pool->mem) { + mem = vmem_alloc(size, KM_SLEEP); + if (mem) { + ZSTDSTAT_ADD(zstd_stat_buffers, 1); + ZSTDSTAT_ADD(zstd_stat_size, size); + pool->mem = mem; + pool->size = size; + /* Keep track for later release */ + mem->pool = pool; + mem->kmem_type = ZSTD_KMEM_POOL; + mem->kmem_size = size; + } + } + + if (size <= pool->size) { + /* Update timestamp */ + pool->timeout = gethrestime_sec() + + ZSTD_POOL_TIMEOUT; + + return (pool->mem); + } + + mutex_exit(&pool->barrier); + } + } + + /* + * If the pool is full or the allocation failed, try lazy allocation + * instead. + */ + if (!mem) { + mem = vmem_alloc(size, KM_NOSLEEP); + if (mem) { + mem->pool = NULL; + mem->kmem_type = ZSTD_KMEM_DEFAULT; + mem->kmem_size = size; + } + } + + return (mem); +} + +/* Mark object as released by releasing the barrier mutex */ +static void +zstd_mempool_free(struct zstd_kmem *z) +{ + mutex_exit(&z->pool->barrier); +} + +/* Convert ZFS internal enum to ZSTD level */ +static int +zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) +{ + if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) { + *zstd_level = zstd_levels[level - 1].zstd_level; + return (0); + } + if (level >= ZIO_ZSTD_LEVEL_FAST_1 && + level <= ZIO_ZSTD_LEVEL_FAST_1000) { + *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1 + + ZIO_ZSTD_LEVEL_19].zstd_level; + return (0); + } + + /* Invalid/unknown zfs compression enum - this should never happen. 
*/ + return (1); +} + + +/* Compress block using zstd */ +size_t +zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level) +{ + size_t c_len; + int16_t zstd_level; + zfs_zstdhdr_t *hdr; + ZSTD_CCtx *cctx; + + hdr = (zfs_zstdhdr_t *)d_start; + + /* Skip compression if the specified level is invalid */ + if (zstd_enum_to_level(level, &zstd_level)) { + ZSTDSTAT_BUMP(zstd_stat_com_inval); + return (s_len); + } + + ASSERT3U(d_len, >=, sizeof (*hdr)); + ASSERT3U(d_len, <=, s_len); + ASSERT3U(zstd_level, !=, 0); + + cctx = ZSTD_createCCtx_advanced(zstd_malloc); + + /* + * Out of kernel memory, gently fall through - this will disable + * compression in zio_compress_data + */ + if (!cctx) { + ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail); + return (s_len); + } + + /* Set the compression level */ + ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level); + + /* Use the "magicless" zstd header which saves us 4 header bytes */ + ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless); + + /* + * Disable redundant checksum calculation and content size storage since + * this is already done by ZFS itself. + */ + ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0); + ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0); + + c_len = ZSTD_compress2(cctx, + hdr->data, + d_len - sizeof (*hdr), + s_start, s_len); + + ZSTD_freeCCtx(cctx); + + /* Error in the compression routine, disable compression. */ + if (ZSTD_isError(c_len)) { + /* + * If we are aborting the compression because the saves are + * too small, that is not a failure. Everything else is a + * failure, so increment the compression failure counter. + */ + if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) { + ZSTDSTAT_BUMP(zstd_stat_com_fail); + } + return (s_len); + } + + /* + * Encode the compressed buffer size at the start. 
We'll need this in + * decompression to counter the effects of padding which might be added + * to the compressed buffer and which, if unhandled, would confuse the + * hell out of our decompression function. + */ + hdr->c_len = BE_32(c_len); + + /* + * Check version for overflow. + * The limit of 24 bits must not be exceeded. This allows a maximum + * version 1677.72.15 which we don't expect to be ever reached. + */ + ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF); + + /* + * Encode the compression level as well. We may need to know the + * original compression level if compressed_arc is disabled, to match + * the compression settings to write this block to the L2ARC. + * + * Encode the actual level, so if the enum changes in the future, we + * will be compatible. + * + * The upper 24 bits store the ZSTD version to be able to provide + * future compatibility, since new versions might enhance the + * compression algorithm in a way, where the compressed data will + * change. + * + * As soon as such incompatibility occurs, handling code needs to be + * added, differentiating between the versions. + */ + zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER); + zfs_set_hdrlevel(hdr, level); + hdr->raw_version_level = BE_32(hdr->raw_version_level); + + return (c_len + sizeof (*hdr)); +} + +/* Decompress block using zstd and return its stored level */ +int +zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, + size_t d_len, uint8_t *level) +{ + ZSTD_DCtx *dctx; + size_t result; + int16_t zstd_level; + uint32_t c_len; + const zfs_zstdhdr_t *hdr; + zfs_zstdhdr_t hdr_copy; + + hdr = (const zfs_zstdhdr_t *)s_start; + c_len = BE_32(hdr->c_len); + + /* + * Make a copy instead of directly converting the header, since we must + * not modify the original data that may be used again later. + */ + hdr_copy.raw_version_level = BE_32(hdr->raw_version_level); + uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy); + + /* + * NOTE: We ignore the ZSTD version for now. 
As soon as any + * incompatibility occurs, it has to be handled accordingly. + * The version can be accessed via `hdr_copy.version`. + */ + + /* + * Convert and check the level + * An invalid level is a strong indicator for data corruption! In such + * case return an error so the upper layers can try to fix it. + */ + if (zstd_enum_to_level(curlevel, &zstd_level)) { + ZSTDSTAT_BUMP(zstd_stat_dec_inval); + return (1); + } + + ASSERT3U(d_len, >=, s_len); + ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT); + + /* Invalid compressed buffer size encoded at start */ + if (c_len + sizeof (*hdr) > s_len) { + ZSTDSTAT_BUMP(zstd_stat_dec_header_inval); + return (1); + } + + dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc); + if (!dctx) { + ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail); + return (1); + } + + /* Set header type to "magicless" */ + ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless); + + /* Decompress the data and release the context */ + result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len); + ZSTD_freeDCtx(dctx); + + /* + * Returns 0 on success (decompression function returned non-negative) + * and non-zero on failure (decompression function returned negative. 
+ */ + if (ZSTD_isError(result)) { + ZSTDSTAT_BUMP(zstd_stat_dec_fail); + return (1); + } + + if (level) { + *level = curlevel; + } + + return (0); +} + +/* Decompress datablock using zstd */ +int +zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level __maybe_unused) +{ + + return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len, + NULL)); +} + +/* Allocator for zstd compression context using mempool_allocator */ +static void * +zstd_alloc(void *opaque __maybe_unused, size_t size) +{ + size_t nbytes = sizeof (struct zstd_kmem) + size; + struct zstd_kmem *z = NULL; + + z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes); + + if (!z) { + ZSTDSTAT_BUMP(zstd_stat_alloc_fail); + return (NULL); + } + + return ((void*)z + (sizeof (struct zstd_kmem))); +} + +/* + * Allocator for zstd decompression context using mempool_allocator with + * fallback to reserved memory if allocation fails + */ +static void * +zstd_dctx_alloc(void *opaque __maybe_unused, size_t size) +{ + size_t nbytes = sizeof (struct zstd_kmem) + size; + struct zstd_kmem *z = NULL; + enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT; + + z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes); + if (!z) { + /* Try harder, decompression shall not fail */ + z = vmem_alloc(nbytes, KM_SLEEP); + if (z) { + z->pool = NULL; + } + ZSTDSTAT_BUMP(zstd_stat_alloc_fail); + } else { + return ((void*)z + (sizeof (struct zstd_kmem))); + } + + /* Fallback if everything fails */ + if (!z) { + /* + * Barrier since we only can handle it in a single thread. All + * other following threads need to wait here until decompression + * is completed. zstd_free will release this barrier later. 
+ */ + mutex_enter(&zstd_dctx_fallback.barrier); + + z = zstd_dctx_fallback.mem; + type = ZSTD_KMEM_DCTX; + ZSTDSTAT_BUMP(zstd_stat_alloc_fallback); + } + + /* Allocation should always be successful */ + if (!z) { + return (NULL); + } + + z->kmem_type = type; + z->kmem_size = nbytes; + + return ((void*)z + (sizeof (struct zstd_kmem))); +} + +/* Free allocated memory by its specific type */ +static void +zstd_free(void *opaque __maybe_unused, void *ptr) +{ + struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem)); + enum zstd_kmem_type type; + + ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT); + ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN); + + type = z->kmem_type; + switch (type) { + case ZSTD_KMEM_DEFAULT: + vmem_free(z, z->kmem_size); + break; + case ZSTD_KMEM_POOL: + zstd_mempool_free(z); + break; + case ZSTD_KMEM_DCTX: + mutex_exit(&zstd_dctx_fallback.barrier); + break; + default: + break; + } +} + +/* Allocate fallback memory to ensure safe decompression */ +static void __init +create_fallback_mem(struct zstd_fallback_mem *mem, size_t size) +{ + mem->mem_size = size; + mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP); + mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL); +} + +/* Initialize memory pool barrier mutexes */ +static void __init +zstd_mempool_init(void) +{ + zstd_mempool_cctx = (struct zstd_pool *) + kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + zstd_mempool_dctx = (struct zstd_pool *) + kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + mutex_init(&zstd_mempool_cctx[i].barrier, NULL, + MUTEX_DEFAULT, NULL); + mutex_init(&zstd_mempool_dctx[i].barrier, NULL, + MUTEX_DEFAULT, NULL); + } +} + +/* Initialize zstd-related memory handling */ +static int __init +zstd_meminit(void) +{ + zstd_mempool_init(); + + /* + * Estimate the size of the fallback decompression context. + * The expected size on x64 with current ZSTD should be about 160 KB. 
+ */ + create_fallback_mem(&zstd_dctx_fallback, + P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem), + PAGESIZE)); + + return (0); +} + +/* Release object from pool and free memory */ +static void __exit +release_pool(struct zstd_pool *pool) +{ + mutex_destroy(&pool->barrier); + vmem_free(pool->mem, pool->size); + pool->mem = NULL; + pool->size = 0; +} + +/* Release memory pool objects */ +static void __exit +zstd_mempool_deinit(void) +{ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + release_pool(&zstd_mempool_cctx[i]); + release_pool(&zstd_mempool_dctx[i]); + } + + kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + zstd_mempool_dctx = NULL; + zstd_mempool_cctx = NULL; +} + +/* release unused memory from pool */ + +void +zfs_zstd_cache_reap_now(void) +{ + /* + * calling alloc with zero size seeks + * and releases old unused objects + */ + zstd_mempool_reap(zstd_mempool_cctx); + zstd_mempool_reap(zstd_mempool_dctx); +} + +extern int __init +zstd_init(void) +{ + /* Set pool size by using maximum sane thread count * 4 */ + pool_count = (boot_ncpus * 4); + zstd_meminit(); + + /* Initialize kstat */ + zstd_ksp = kstat_create("zfs", 0, "zstd", "misc", + KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (zstd_ksp != NULL) { + zstd_ksp->ks_data = &zstd_stats; + kstat_install(zstd_ksp); + } + + return (0); +} + +extern void __exit +zstd_fini(void) +{ + /* Deinitialize kstat */ + if (zstd_ksp != NULL) { + kstat_delete(zstd_ksp); + zstd_ksp = NULL; + } + + /* Release fallback memory */ + vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size); + mutex_destroy(&zstd_dctx_fallback.barrier); + + /* Deinit memory pool */ + zstd_mempool_deinit(); +} + +#if defined(_KERNEL) +module_init(zstd_init); +module_exit(zstd_fini); + +ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS"); +ZFS_MODULE_LICENSE("Dual BSD/GPL"); 
+ZFS_MODULE_VERSION(ZSTD_VERSION_STRING "a"); + +EXPORT_SYMBOL(zfs_zstd_compress); +EXPORT_SYMBOL(zfs_zstd_decompress_level); +EXPORT_SYMBOL(zfs_zstd_decompress); +EXPORT_SYMBOL(zfs_zstd_cache_reap_now); +#endif diff --git a/module/zstd/zstd-in.c b/module/zstd/zstd-in.c new file mode 100644 index 0000000000..121f375e55 --- /dev/null +++ b/module/zstd/zstd-in.c @@ -0,0 +1,68 @@ +/* + * BSD 3-Clause Clear License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) 2019-2020, Michael Niewöhner + */ + +#define MEM_MODULE +#define XXH_NAMESPACE ZSTD_ +#define XXH_PRIVATE_API +#define XXH_INLINE_ALL +#define ZSTD_LEGACY_SUPPORT 0 +#define ZSTD_LIB_DICTBUILDER 0 +#define ZSTD_LIB_DEPRECATED 0 +#define ZSTD_NOBENCH + +#include "common/debug.c" +#include "common/entropy_common.c" +#include "common/error_private.c" +#include "common/fse_decompress.c" +#include "common/pool.c" +#include "common/zstd_common.c" + +#include "compress/fse_compress.c" +#include "compress/hist.c" +#include "compress/huf_compress.c" +#include "compress/zstd_compress_literals.c" +#include "compress/zstd_compress_sequences.c" +#include "compress/zstd_compress_superblock.c" +#include "compress/zstd_compress.c" +#include "compress/zstd_double_fast.c" +#include "compress/zstd_fast.c" +#include "compress/zstd_lazy.c" +#include "compress/zstd_ldm.c" +#include "compress/zstd_opt.c" + +#include "decompress/huf_decompress.c" +#include "decompress/zstd_ddict.c" +#include "decompress/zstd_decompress.c" +#include "decompress/zstd_decompress_block.c" diff --git a/module/zstd/zstd_sparc.c b/module/zstd/zstd_sparc.c new file mode 100644 index 0000000000..463df99bd7 --- /dev/null +++ b/module/zstd/zstd_sparc.c @@ -0,0 +1,11 @@ +#ifdef __sparc__ +#include +#include +#include "include/sparc_compat.h" +uint64_t __bswapdi2(uint64_t in) { + return (BSWAP_64(in)); +} +uint32_t __bswapsi2(uint32_t in) { + return (BSWAP_32(in)); +} +#endif diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 568bef988c..e0c410c680 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -4,8 +4,8 @@ %define not_rpm 1 %endif -# See comment in zfs.spec.in. 
-%global __brp_mangle_shebangs_exclude_from arc_summary.py|arcstat.py|dbufstat.py|test-runner.py|zts-report.py +# Exclude input files from mangling +%global __brp_mangle_shebangs_exclude_from ^/usr/src/.*$ %define module @PACKAGE@ %define mkconf scripts/dkms.mkconf @@ -18,7 +18,7 @@ Summary: Kernel module(s) (dkms) Group: System Environment/Kernel License: @ZFS_META_LICENSE@ -URL: http://zfsonlinux.org/ +URL: https://github.com/openzfs/zfs Source0: %{module}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) BuildArch: noarch @@ -26,12 +26,19 @@ BuildArch: noarch Requires: dkms >= 2.2.0.3 Requires: gcc, make, perl, diffutils %if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version} -Requires: kernel-devel +Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Obsoletes: spl-dkms %endif Provides: %{module}-kmod = %{version} AutoReqProv: no +%if 0%{?rhel}%{?fedora}%{?suse_version} +# We don't directly use it, but if this isn't installed, rpmbuild as root can +# crash+corrupt rpmdb +# See issue #12071 +BuildRequires: ncompress +%endif + %description This package contains the dkms ZFS kernel modules. @@ -73,7 +80,7 @@ exit 1 %preun # Are we doing an upgrade? -if [ $1 -ne 0 ] ; then +if [ "$1" = "1" -o "$1" = "upgrade" ] ; then # Yes we are. Are we upgrading to a new ZFS version? 
NEWEST_VER=$(dkms status zfs | sed 's/,//g' | sort -r -V | awk '/installed/{print $2; exit}') if [ "$NEWEST_VER" != "%{version}" ] ; then @@ -93,7 +100,7 @@ fi CONFIG_H="/var/lib/dkms/%{module}/%{version}/*/*/%{module}_config.h" SPEC_META_ALIAS="@PACKAGE@-@VERSION@-@RELEASE@" DKMS_META_ALIAS=`cat $CONFIG_H 2>/dev/null | - awk -F'"' '/META_ALIAS/ { print $2; exit 0 }'` + awk -F'"' '/META_ALIAS\s+"/ { print $2; exit 0 }'` if [ "$SPEC_META_ALIAS" = "$DKMS_META_ALIAS" ]; then echo -e echo -e "Uninstall of %{module} module ($SPEC_META_ALIAS) beginning:" diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 4a8f662316..1692be1a72 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -1,8 +1,5 @@ %define module @PACKAGE@ -# See comment in zfs.spec.in. -%global __brp_mangle_shebangs_exclude_from arc_summary.py|arcstat.py|dbufstat.py|test-runner.py|zts-report.py - %if !%{defined ksrc} %if 0%{?rhel}%{?fedora} %define ksrc ${kernel_version##*___} @@ -51,7 +48,7 @@ Summary: Kernel module(s) Group: System Environment/Kernel License: @ZFS_META_LICENSE@ -URL: http://zfsonlinux.org/ +URL: https://github.com/openzfs/zfs Source0: %{module}-%{version}.tar.gz Source10: kmodtool BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id} -u -n) @@ -60,6 +57,13 @@ BuildRequires: gcc, make BuildRequires: elfutils-libelf-devel %endif +%if 0%{?rhel}%{?fedora}%{?suse_version} +# We don't directly use it, but if this isn't installed, rpmbuild as root can +# crash+corrupt rpmdb +# See issue #12071 +BuildRequires: ncompress +%endif + # The developments headers will conflict with the dkms packages. Conflicts: %{module}-dkms @@ -91,10 +95,6 @@ BuildRequires: %{_bindir}/kmodtool %global __global_ldflags %{nil} %endif -%if 0%{?fedora} >= 17 -%define prefix /usr -%endif - # Kmodtool does its magic here. A patched version of kmodtool is shipped # with the source rpm until kmod development packages are supported upstream. 
# https://bugzilla.rpmfusion.org/show_bug.cgi?id=2714 diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 9faa3ba771..4a37ae8ce1 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -48,14 +48,15 @@ %global _systemdgeneratordir %{_prefix}/lib/systemd/system-generators %endif +%if %{undefined _pkgconfigdir} +%global _pkgconfigdir %{_prefix}/%{_lib}/pkgconfig +%endif + %bcond_with debug %bcond_with debuginfo %bcond_with asan %bcond_with systemd - -# Exclude test-runner.py from the rpmbuild shebang check to allow it to run -# under Python 2 and 3. -%global __brp_mangle_shebangs_exclude_from test-runner.py +%bcond_with pam # Generic enable switch for systemd %if %{with systemd} @@ -99,6 +100,7 @@ %define __python_cffi_pkg python%{__python_pkg_version}-cffi %define __python_setuptools_pkg python%{__python_pkg_version}-setuptools %endif +%define __python_sitelib %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())") # By default python-pyzfs is enabled, with the exception of # RHEL 6 which by default uses Python 2.6 which is too old. @@ -115,19 +117,19 @@ Summary: Commands to control the kernel modules and libraries Group: System Environment/Kernel License: @ZFS_META_LICENSE@ -URL: http://zfsonlinux.org/ +URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -Requires: libzpool2 = %{version} -Requires: libnvpair1 = %{version} -Requires: libuutil1 = %{version} -Requires: libzfs2 = %{version} +Requires: libzpool5 = %{version} +Requires: libnvpair3 = %{version} +Requires: libuutil3 = %{version} +Requires: libzfs5 = %{version} Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version} Obsoletes: spl -# zfs-fuse provides the same commands and man pages that ZoL does. Renaming -# those on either side would conflict with all available documentation. 
+# zfs-fuse provides the same commands and man pages that OpenZFS does. +# Renaming those on either side would conflict with all available documentation. Conflicts: zfs-fuse %if 0%{?rhel}%{?fedora}%{?suse_version} @@ -138,9 +140,14 @@ BuildRequires: libblkid-devel BuildRequires: libudev-devel BuildRequires: libattr-devel BuildRequires: openssl-devel -%if 0%{?fedora} >= 28 +# We don't directly use it, but if this isn't installed, rpmbuild as root can +# crash+corrupt rpmdb +# See issue #12071 +BuildRequires: ncompress +%if 0%{?fedora} >= 28 || 0%{?rhel} >= 8 || 0%{?centos} >= 8 BuildRequires: libtirpc-devel %endif + Requires: openssl %if 0%{?_systemd} BuildRequires: systemd @@ -160,36 +167,48 @@ Requires: sysstat %description This package contains the core ZFS command line utilities. -%package -n libzpool2 +%package -n libzpool5 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel +Obsoletes: libzpool2 +Obsoletes: libzpool4 -%description -n libzpool2 +%description -n libzpool5 This package contains the zpool library, which provides support for managing zpools -%post -n libzpool2 -p /sbin/ldconfig -%postun -n libzpool2 -p /sbin/ldconfig +%if %{defined ldconfig_scriptlets} +%ldconfig_scriptlets -n libzpool5 +%else +%post -n libzpool5 -p /sbin/ldconfig +%postun -n libzpool5 -p /sbin/ldconfig +%endif -%package -n libnvpair1 +%package -n libnvpair3 Summary: Solaris name-value library for Linux Group: System Environment/Kernel +Obsoletes: libnvpair1 -%description -n libnvpair1 +%description -n libnvpair3 This package contains routines for packing and unpacking name-value pairs. This functionality is used to portably transport data across process boundaries, between kernel and user space, and can be used to write self describing data structures on disk. 
-%post -n libnvpair1 -p /sbin/ldconfig -%postun -n libnvpair1 -p /sbin/ldconfig +%if %{defined ldconfig_scriptlets} +%ldconfig_scriptlets -n libnvpair3 +%else +%post -n libnvpair3 -p /sbin/ldconfig +%postun -n libnvpair3 -p /sbin/ldconfig +%endif -%package -n libuutil1 +%package -n libuutil3 Summary: Solaris userland utility library for Linux Group: System Environment/Kernel +Obsoletes: libuutil1 -%description -n libuutil1 -This library provides a variety of compatibility functions for ZFS on Linux: +%description -n libuutil3 +This library provides a variety of compatibility functions for OpenZFS: * libspl: The Solaris Porting Layer userland library, which provides APIs that make it possible to run Solaris user code in a Linux environment with relatively minimal modification. @@ -199,32 +218,47 @@ This library provides a variety of compatibility functions for ZFS on Linux: partitioning. * libshare: NFS, SMB, and iSCSI service integration for ZFS. -%post -n libuutil1 -p /sbin/ldconfig -%postun -n libuutil1 -p /sbin/ldconfig +%if %{defined ldconfig_scriptlets} +%ldconfig_scriptlets -n libuutil3 +%else +%post -n libuutil3 -p /sbin/ldconfig +%postun -n libuutil3 -p /sbin/ldconfig +%endif -%package -n libzfs2 +# The library version is encoded in the package name. When updating the +# version information it is important to add an obsoletes line below for +# the previous version of the package. 
+%package -n libzfs5 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel +Obsoletes: libzfs2 +Obsoletes: libzfs4 -%description -n libzfs2 +%description -n libzfs5 This package provides support for managing ZFS filesystems -%post -n libzfs2 -p /sbin/ldconfig -%postun -n libzfs2 -p /sbin/ldconfig +%if %{defined ldconfig_scriptlets} +%ldconfig_scriptlets -n libzfs5 +%else +%post -n libzfs5 -p /sbin/ldconfig +%postun -n libzfs5 -p /sbin/ldconfig +%endif -%package -n libzfs2-devel +%package -n libzfs5-devel Summary: Development headers Group: System Environment/Kernel -Requires: libzfs2 = %{version} -Requires: libzpool2 = %{version} -Requires: libnvpair1 = %{version} -Requires: libuutil1 = %{version} -Provides: libzpool2-devel -Provides: libnvpair1-devel -Provides: libuutil1-devel +Requires: libzfs5 = %{version} +Requires: libzpool5 = %{version} +Requires: libnvpair3 = %{version} +Requires: libuutil3 = %{version} +Provides: libzpool5-devel +Provides: libnvpair3-devel +Provides: libuutil3-devel Obsoletes: zfs-devel +Obsoletes: libzfs2-devel +Obsoletes: libzfs4-devel -%description -n libzfs2-devel +%description -n libzfs5-devel This package contains the header files needed for building additional applications against the ZFS libraries. @@ -255,7 +289,8 @@ validating the file system. 
%package dracut Summary: Dracut module Group: System Environment/Kernel -Requires: %{name}%{?_isa} = %{version}-%{release} +BuildArch: noarch +Requires: %{name} >= %{version} Requires: dracut Requires: /usr/bin/awk Requires: grep @@ -270,12 +305,17 @@ Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python License: Apache-2.0 BuildArch: noarch -Requires: libzfs2 = %{version} -Requires: libnvpair1 = %{version} +Requires: libzfs5 = %{version} +Requires: libnvpair3 = %{version} Requires: libffi Requires: python%{__python_pkg_version} Requires: %{__python_cffi_pkg} %if 0%{?rhel}%{?fedora}%{?suse_version} +%if 0%{?rhel} >= 8 || 0%{?centos} >= 8 || 0%{?fedora} >= 28 +BuildRequires: python3-packaging +%else +BuildRequires: python-packaging +%endif BuildRequires: python%{__python_pkg_version}-devel BuildRequires: %{__python_cffi_pkg} BuildRequires: %{__python_setuptools_pkg} @@ -320,7 +360,7 @@ image which is ZFS aware. %if 0%{?_systemd} %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit - %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target + %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target %else %define systemd --enable-sysvinit --disable-systemd %endif @@ -331,6 +371,12 @@ image which is ZFS aware. %define pyzfs --disable-pyzfs %endif +%if %{with pam} + %define pam --enable-pam +%else + %define pam --disable-pam +%endif + %setup -q %build @@ -339,12 +385,16 @@ image which is ZFS aware. 
--with-udevdir=%{_udevdir} \ --with-udevruledir=%{_udevruledir} \ --with-dracutdir=%{_dracutdir} \ + --with-pamconfigsdir=%{_datadir}/pam-configs \ + --with-pammoduledir=%{_libdir}/security \ --with-python=%{__python} \ + --with-pkgconfigdir=%{_pkgconfigdir} \ --disable-static \ %{debug} \ %{debuginfo} \ %{asan} \ - %{systemd}\ + %{systemd} \ + %{pam} \ %{pyzfs} make %{?_smp_mflags} @@ -352,6 +402,14 @@ make %{?_smp_mflags} %{__rm} -rf $RPM_BUILD_ROOT make install DESTDIR=%{?buildroot} find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; +%if 0%{!?__brp_mangle_shebangs:1} +find %{?buildroot}%{_bindir} \ + \( -name arc_summary -or -name arcstat -or -name dbufstat \) \ + -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; +find %{?buildroot}%{_datadir} \ + \( -name test-runner.py -or -name zts-report.py \) \ + -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; +%endif %post %if 0%{?_systemd} @@ -416,20 +474,32 @@ systemctl --system daemon-reload >/dev/null || true # Core utilities %{_sbindir}/* %{_bindir}/raidz_test -%{_bindir}/zgenhostid +%{_sbindir}/zgenhostid +%{_bindir}/zvol_wait # Optional Python 2/3 scripts %{_bindir}/arc_summary %{_bindir}/arcstat %{_bindir}/dbufstat # Man pages %{_mandir}/man1/* +%{_mandir}/man4/* %{_mandir}/man5/* +%{_mandir}/man7/* %{_mandir}/man8/* # Configuration files and scripts %{_libexecdir}/%{name} %{_udevdir}/vdev_id %{_udevdir}/zvol_id %{_udevdir}/rules.d/* +%{_datadir}/%{name}/compatibility.d +%if ! 
0%{?_systemd} || 0%{?_initramfs} +# Files needed for sysvinit and initramfs-tools +%{_sysconfdir}/%{name}/zfs-functions +%config(noreplace) %{_initconfdir}/zfs +%else +%exclude %{_sysconfdir}/%{name}/zfs-functions +%exclude %{_initconfdir}/zfs +%endif %if 0%{?_systemd} %{_unitdir}/* %{_presetdir}/* @@ -437,32 +507,41 @@ systemctl --system daemon-reload >/dev/null || true %{_systemdgeneratordir}/* %else %config(noreplace) %{_sysconfdir}/init.d/* -%config(noreplace) %{_initconfdir}/zfs %endif -%config(noreplace) %{_sysconfdir}/%{name} +%config(noreplace) %{_sysconfdir}/%{name}/zed.d/* +%config(noreplace) %{_sysconfdir}/%{name}/zpool.d/* +%config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example %attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/* +%if %{with pam} +%{_libdir}/security/* +%{_datadir}/pam-configs/* +%endif -%files -n libzpool2 +%files -n libzpool5 %{_libdir}/libzpool.so.* -%files -n libnvpair1 +%files -n libnvpair3 %{_libdir}/libnvpair.so.* -%files -n libuutil1 +%files -n libuutil3 %{_libdir}/libuutil.so.* -%files -n libzfs2 +%files -n libzfs5 %{_libdir}/libzfs*.so.* -%files -n libzfs2-devel -%{_datadir}/pkgconfig/libzfs.pc -%{_datadir}/pkgconfig/libzfs_core.pc +%files -n libzfs5-devel +%{_pkgconfigdir}/libzfs.pc +%{_pkgconfigdir}/libzfsbootenv.pc +%{_pkgconfigdir}/libzfs_core.pc %{_libdir}/*.so %{_includedir}/* %doc AUTHORS COPYRIGHT LICENSE NOTICE README.md %files test -%{_datadir}/%{name} +%{_datadir}/%{name}/zfs-tests +%{_datadir}/%{name}/test-runner +%{_datadir}/%{name}/runfiles +%{_datadir}/%{name}/*.sh %files dracut %doc contrib/dracut/README.dracut.markdown @@ -473,8 +552,8 @@ systemctl --system daemon-reload >/dev/null || true %doc contrib/pyzfs/README %doc contrib/pyzfs/LICENSE %defattr(-,root,root,-) -%{python_sitelib}/libzfs_core/* -%{python_sitelib}/pyzfs* +%{__python_sitelib}/libzfs_core/* +%{__python_sitelib}/pyzfs* %endif %if 0%{?_initramfs} diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in 
index 473f2d0325..eb93aeeb2e 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -1,9 +1,6 @@ %bcond_with debug %bcond_with debuginfo -# See comment in zfs.spec.in. -%global __brp_mangle_shebangs_exclude_from arc_summary.py|arcstat.py|dbufstat.py|test-runner.py|zts-report.py - Name: @PACKAGE@-kmod Version: @VERSION@ Release: @RELEASE@%{?dist} @@ -11,7 +8,7 @@ Release: @RELEASE@%{?dist} Summary: Kernel module(s) Group: System Environment/Kernel License: @ZFS_META_LICENSE@ -URL: http://zfsonlinux.org/ +URL: https://github.com/openzfs/zfs BuildRequires: %kernel_module_package_buildreqs Source0: @PACKAGE@-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) @@ -20,8 +17,9 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) # by generating a preamble text file which kmodtool can append to the spec file. %(/bin/echo -e "\ Requires: @PACKAGE@ = %{version}\n\ -Conflicts: @PACKAGE@-dkms\n\n" > %{_sourcedir}/kmod-preamble\n\ -Obsoletes: spl-kmod) +Conflicts: @PACKAGE@-dkms\n\ +Obsoletes: kmod-spl\n\ +Obsoletes: spl-kmod\n\n" > %{_sourcedir}/kmod-preamble) # LDFLAGS are not sanitized by arch/*/Makefile for these architectures. %ifarch ppc ppc64 ppc64le aarch64 @@ -41,6 +39,7 @@ This package contains the ZFS kernel modules. %package -n kmod-%{kmod_name}-devel Summary: ZFS kernel module(s) devel common Group: System Environment/Kernel +Provides: kmod-spl-devel = %{version} %description -n kmod-%{kmod_name}-devel This package provides the header files and objects to build kernel modules. 
diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 11e963c527..6c59fd7d4f 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -1,31 +1,40 @@ +include $(top_srcdir)/config/Shellcheck.am + pkgdatadir = $(datadir)/@PACKAGE@ dist_pkgdata_SCRIPTS = \ - $(top_srcdir)/scripts/zimport.sh \ - $(top_srcdir)/scripts/zfs.sh \ - $(top_srcdir)/scripts/zfs-tests.sh \ - $(top_srcdir)/scripts/zloop.sh \ - $(top_srcdir)/scripts/zfs-helpers.sh + zimport.sh \ + zfs.sh \ + zfs-tests.sh \ + zloop.sh \ + zfs-helpers.sh -EXTRA_DIST = \ +EXTRA_SCRIPTS = \ commitcheck.sh \ common.sh.in \ - cstyle.pl \ dkms.mkconf \ dkms.postbuild \ - enum-extract.pl \ kmodtool \ make_gitrev.sh \ man-dates.sh \ paxcheck.sh \ + mancheck.sh + +EXTRA_DIST = \ + cstyle.pl \ + enum-extract.pl \ zfs2zol-patch.sed \ - zol2zfs-patch.sed + zol2zfs-patch.sed \ + $(EXTRA_SCRIPTS) + +SHELLCHECK_IGNORE = ,SC1117 +SHELLCHECKSCRIPTS = $(EXTRA_SCRIPTS) define EXTRA_ENVIRONMENT # Only required for in-tree use export INTREE="yes" -export GDB="/usr/bin/libtool --mode=execute gdb" +export GDB="libtool --mode=execute gdb" export LDMOD=/sbin/insmod export CMD_DIR=@abs_top_builddir@/cmd @@ -34,8 +43,10 @@ export ZEDLET_ETC_DIR=$$CMD_DIR/zed/zed.d export ZEDLET_LIBEXEC_DIR=$$CMD_DIR/zed/zed.d export ZPOOL_SCRIPT_DIR=$$CMD_DIR/zpool/zpool.d export ZPOOL_SCRIPTS_PATH=$$CMD_DIR/zpool/zpool.d +export ZPOOL_COMPAT_DIR=$$CMD_DIR/zpool/compatibility.d export CONTRIB_DIR=@abs_top_builddir@/contrib export LIB_DIR=@abs_top_builddir@/lib +export SYSCONF_DIR=@abs_top_builddir@/etc export INSTALL_UDEV_DIR=@udevdir@ export INSTALL_UDEV_RULE_DIR=@udevruledir@ @@ -51,6 +62,8 @@ export KMOD_ZCOMMON=@abs_top_builddir@/module/zcommon/zcommon.ko export KMOD_ZLUA=@abs_top_builddir@/module/lua/zlua.ko export KMOD_ICP=@abs_top_builddir@/module/icp/icp.ko export KMOD_ZFS=@abs_top_builddir@/module/zfs/zfs.ko +export KMOD_FREEBSD=@abs_top_builddir@/module/openzfs.ko +export KMOD_ZZSTD=@abs_top_builddir@/module/zstd/zzstd.ko endef export 
EXTRA_ENVIRONMENT @@ -58,9 +71,10 @@ export EXTRA_ENVIRONMENT all-local: -$(SED) -e '\|^export BIN_DIR=|s|$$|@abs_top_builddir@/bin|' \ -e '\|^export SBIN_DIR=|s|$$|@abs_top_builddir@/bin|' \ + -e '\|^export LIBEXEC_DIR=|s|$$|@abs_top_builddir@/bin|' \ -e '\|^export ZTS_DIR=|s|$$|@abs_top_srcdir@/tests|' \ -e '\|^export SCRIPT_DIR=|s|$$|@abs_top_srcdir@/scripts|' \ - common.sh.in >common.sh + $(abs_top_srcdir)/scripts/common.sh.in >common.sh -echo "$$EXTRA_ENVIRONMENT" >>common.sh clean-local: @@ -69,6 +83,8 @@ clean-local: install-data-hook: -$(SED) -e '\|^export BIN_DIR=|s|$$|@bindir@|' \ -e '\|^export SBIN_DIR=|s|$$|@sbindir@|' \ + -e '\|^export LIBEXEC_DIR=|s|$$|@zfsexecdir@|' \ -e '\|^export ZTS_DIR=|s|$$|@datadir@/@PACKAGE@|' \ -e '\|^export SCRIPT_DIR=|s|$$|@datadir@/@PACKAGE@|' \ - common.sh.in >$(DESTDIR)$(datadir)/@PACKAGE@/common.sh + $(abs_top_srcdir)/scripts/common.sh.in \ + >$(DESTDIR)$(datadir)/@PACKAGE@/common.sh diff --git a/scripts/commitcheck.sh b/scripts/commitcheck.sh index 2954b0fd72..cb9fd66c6f 100755 --- a/scripts/commitcheck.sh +++ b/scripts/commitcheck.sh @@ -1,26 +1,13 @@ -#!/bin/bash +#!/bin/sh REF="HEAD" -# test a url -function test_url() -{ - url="$1" - if ! curl --output /dev/null --max-time 60 \ - --silent --head --fail "$url" ; then - echo "\"$url\" is unreachable" - return 1 - fi - - return 0 -} - # test commit body for length # lines containing urls are exempt for the length limit. 
-function test_commit_bodylength() +test_commit_bodylength() { length="72" - body=$(git log -n 1 --pretty=%b "$REF" | grep -Ev "http(s)*://" | grep -E -m 1 ".{$((length + 1))}") + body=$(git log --no-show-signature -n 1 --pretty=%b "$REF" | grep -Ev "http(s)*://" | grep -E -m 1 ".{$((length + 1))}") if [ -n "$body" ]; then echo "error: commit message body contains line over ${length} characters" return 1 @@ -30,10 +17,10 @@ function test_commit_bodylength() } # check for a tagged line -function check_tagged_line() +check_tagged_line() { - regex='^\s*'"$1"':\s[[:print:]]+\s<[[:graph:]]+>$' - foundline=$(git log -n 1 "$REF" | grep -E -m 1 "$regex") + regex='^[[:space:]]*'"$1"':[[:space:]][[:print:]]+[[:space:]]<[[:graph:]]+>$' + foundline=$(git log --no-show-signature -n 1 "$REF" | grep -E -m 1 "$regex") if [ -z "$foundline" ]; then echo "error: missing \"$1\"" return 1 @@ -42,35 +29,13 @@ function check_tagged_line() return 0 } -# check for a tagged line and check that the link is valid -function check_tagged_line_with_url() -{ - regex='^\s*'"$1"':\s\K([[:graph:]]+)$' - foundline=$(git log -n 1 "$REF" | grep -Po "$regex") - if [ -z "$foundline" ]; then - echo "error: missing \"$1\"" - return 1 - fi - - OLDIFS=$IFS - IFS=$'\n' - for url in $(echo -e "$foundline"); do - if ! 
test_url "$url"; then - return 1 - fi - done - IFS=$OLDIFS - - return 0 -} - # check commit message for a normal commit -function new_change_commit() +new_change_commit() { error=0 # subject is not longer than 72 characters - long_subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '.{73}') + long_subject=$(git log --no-show-signature -n 1 --pretty=%s "$REF" | grep -E -m 1 '.{73}') if [ -n "$long_subject" ]; then echo "error: commit subject over 72 characters" error=1 @@ -89,60 +54,10 @@ function new_change_commit() return $error } -function is_openzfs_port() -{ - # subject starts with OpenZFS means it's an openzfs port - subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^OpenZFS') - if [ -n "$subject" ]; then - return 0 - fi - - return 1 -} - -function openzfs_port_commit() -{ - error=0 - - # subject starts with OpenZFS dddd - subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^OpenZFS [[:digit:]]+(, [[:digit:]]+)* - ') - if [ -z "$subject" ]; then - echo "error: OpenZFS patch ports must have a subject line that starts with \"OpenZFS dddd - \"" - error=1 - fi - - # need an authored by line - if ! check_tagged_line "Authored by" ; then - error=1 - fi - - # need a reviewed by line - if ! check_tagged_line "Reviewed by" ; then - error=1 - fi - - # need ported by line - if ! check_tagged_line "Ported-by" ; then - error=1 - fi - - # need a url to openzfs commit and it should be valid - if ! check_tagged_line_with_url "OpenZFS-commit" ; then - error=1 - fi - - # need a url to illumos issue and it should be valid - if ! 
check_tagged_line_with_url "OpenZFS-issue" ; then - error=1 - fi - - return $error -} - -function is_coverity_fix() +is_coverity_fix() { # subject starts with Fix coverity defects means it's a coverity fix - subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^Fix coverity defects') + subject=$(git log --no-show-signature -n 1 --pretty=%s "$REF" | grep -E -m 1 '^Fix coverity defects') if [ -n "$subject" ]; then return 0 fi @@ -150,12 +65,12 @@ function is_coverity_fix() return 1 } -function coverity_fix_commit() +coverity_fix_commit() { error=0 # subject starts with Fix coverity defects: CID dddd, dddd... - subject=$(git log -n 1 --pretty=%s "$REF" | + subject=$(git log --no-show-signature -n 1 --pretty=%s "$REF" | grep -E -m 1 'Fix coverity defects: CID [[:digit:]]+(, [[:digit:]]+)*') if [ -z "$subject" ]; then echo "error: Coverity defect fixes must have a subject line that starts with \"Fix coverity defects: CID dddd\"" @@ -169,11 +84,10 @@ function coverity_fix_commit() # test each summary line for the proper format OLDIFS=$IFS - IFS=$'\n' - for line in $(git log -n 1 --pretty=%b "$REF" | grep -E '^CID'); do - echo "$line" | grep -E '^CID [[:digit:]]+: ([[:graph:]]+|[[:space:]])+ \(([[:upper:]]|\_)+\)' > /dev/null - # shellcheck disable=SC2181 - if [[ $? -ne 0 ]]; then + IFS=' +' + for line in $(git log --no-show-signature -n 1 --pretty=%b "$REF" | grep -E '^CID'); do + if ! echo "$line" | grep -qE '^CID [[:digit:]]+: ([[:graph:]]+|[[:space:]])+ \(([[:upper:]]|\_)+\)'; then echo "error: commit message has an improperly formatted CID defect line" error=1 fi @@ -192,15 +106,6 @@ if [ -n "$1" ]; then REF="$1" fi -# if openzfs port, test against that -if is_openzfs_port; then - if ! openzfs_port_commit ; then - exit 1 - else - exit 0 - fi -fi - # if coverity fix, test against that if is_coverity_fix; then if ! 
coverity_fix_commit; then diff --git a/scripts/common.sh.in b/scripts/common.sh.in index 2d9d9c7866..8268315b33 100644 --- a/scripts/common.sh.in +++ b/scripts/common.sh.in @@ -3,6 +3,7 @@ # Directories export BIN_DIR= export SBIN_DIR= +export LIBEXEC_DIR= export ZTS_DIR= export SCRIPT_DIR= diff --git a/scripts/cstyle.pl b/scripts/cstyle.pl index 00b33dddfb..d19718ecf4 100755 --- a/scripts/cstyle.pl +++ b/scripts/cstyle.pl @@ -58,8 +58,9 @@ use Getopt::Std; use strict; my $usage = -"usage: cstyle [-chpvCP] [-o constructs] file ... +"usage: cstyle [-cghpvCP] [-o constructs] file ... -c check continuation indentation inside functions + -g print github actions' workflow commands -h perform heuristic checks that are sometimes wrong -p perform some of the more picky checks -v verbose @@ -73,12 +74,13 @@ my $usage = my %opts; -if (!getopts("cho:pvCP", \%opts)) { +if (!getopts("cgho:pvCP", \%opts)) { print $usage; exit 2; } my $check_continuation = $opts{'c'}; +my $github_workflow = $opts{'g'} || $ENV{'CI'}; my $heuristic = $opts{'h'}; my $picky = $opts{'p'}; my $verbose = $opts{'v'}; @@ -197,7 +199,10 @@ sub err($) { printf $fmt, $filename, $., $error, $line; } else { printf $fmt, $filename, $., $error; - } + } + if ($github_workflow) { + printf "::error file=%s,line=%s::%s\n", $filename, $., $error; + } $err_stat = 1; } } @@ -415,7 +420,7 @@ line: while (<$filehandle>) { $prev = $line; next line; } elsif ($picky && ! 
(/^\t/ && $function_header_full_indent != 0)) { - + err("continuation line should be indented by 4 spaces"); } } diff --git a/scripts/dkms.mkconf b/scripts/dkms.mkconf index e1a49dca14..9d12a8c3b3 100755 --- a/scripts/dkms.mkconf +++ b/scripts/dkms.mkconf @@ -6,22 +6,25 @@ pkgcfg=/etc/sysconfig/zfs while getopts "n:v:c:f:" opt; do case $opt in - n) pkgname=$OPTARG ;; - v) pkgver=$OPTARG ;; - c) pkgcfg=$OPTARG ;; + n) pkgname=$OPTARG ;; + v) pkgver=$OPTARG ;; + c) pkgcfg=$OPTARG ;; f) filename=$OPTARG ;; + *) err=1 ;; esac done -if [ -z "${pkgname}" -o -z "${pkgver}" -o -z "${filename}" ]; then +if [ -z "${pkgname}" ] || [ -z "${pkgver}" ] || [ -z "${filename}" ] || + [ -n "${err}" ]; then echo "Usage: $PROG -n -v -c -f " exit 1 fi -cat >${filename} <"${filename}" < -k -n " \ - "-t -v " + "-t -v " exit 1 fi -cp "${tree}/${pkgname}/${pkgver}/build/zfs_config.h" \ +exec cp "${tree}/${pkgname}/${pkgver}/build/zfs_config.h" \ "${tree}/${pkgname}/${pkgver}/build/module/Module.symvers" \ "${tree}/${pkgname}/${pkgver}/${kver}/${arch}/" diff --git a/scripts/kmodtool b/scripts/kmodtool index 27a14cdac2..26bacf5991 100755 --- a/scripts/kmodtool +++ b/scripts/kmodtool @@ -1,4 +1,5 @@ -#!/bin/bash +#!/usr/bin/env bash +# shellcheck disable=SC2086 # kmodtool - Helper script for building kernel module RPMs # Copyright (c) 2003-2012 Ville Skyttä , @@ -38,15 +39,16 @@ prefix= filterfile= target= buildroot= +dashvariant= error_out() { local errorlevel=${1} shift - echo "Error: $@" >&2 + echo "Error: $*" >&2 # the next line is not multi-line safe -- not needed *yet* - echo "%global kmodtool_check echo \"kmodtool error: $@\"; exit ${errorlevel};" - exit ${errorlevel} + echo "%global kmodtool_check echo \"kmodtool error: $*\"; exit ${errorlevel};" + exit "${errorlevel}" } print_rpmtemplate_header() @@ -144,7 +146,13 @@ print_rpmtemplate_per_kmodpkg () local kernel_uname_r=${1} local kernel_variant="${2:+-${2}}" - # first part + # Detect depmod install location + local 
depmod_path=/sbin/depmod + if [ ! -f ${depmod_path} ]; then + depmod_path=/usr/sbin/depmod + fi + + # first part cat <= %{?epoch:%{epoch}:}%{version} -Requires(post): ${prefix}/sbin/depmod -Requires(postun): ${prefix}/sbin/depmod + +%if 0%{?rhel} == 6 || 0%{?centos} == 6 +Requires(post): module-init-tools +Requires(postun): module-init-tools +%else +Requires(post): kmod +Requires(postun): kmod +%endif EOF if [[ ${obsolete_name} ]]; then @@ -170,17 +184,29 @@ BuildRequires: kernel-devel-uname-r = ${kernel_uname_r} %{?KmodsRequires:Requires: %{KmodsRequires}-uname-r = ${kernel_uname_r}} %{?KmodsRequires:BuildRequires: %{KmodsRequires}-uname-r = ${kernel_uname_r}} %post -n kmod-${kmodname}-${kernel_uname_r} -${prefix}/sbin/depmod -aeF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} > /dev/null || : +if [[ -f "/boot/System.map-${kernel_uname_r}" ]]; then + ${prefix}${depmod_path} -aeF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} > /dev/null || : +elif [[ -f "/lib/modules/${kernel_uname_r}/System.map" ]]; then + ${prefix}${depmod_path} -aeF /lib/modules/${kernel_uname_r}/System.map ${kernel_uname_r} > /dev/null || : +else + ${prefix}${depmod_path} -ae ${kernel_uname_r} &> /dev/null || : +fi %postun -n kmod-${kmodname}-${kernel_uname_r} -${prefix}/sbin/depmod -aF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} &> /dev/null || : +if [[ -f "/boot/System.map-${kernel_uname_r}" ]]; then + ${prefix}${depmod_path} -aF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} &> /dev/null || : +elif [[ -f "/lib/modules/${kernel_uname_r}/System.map" ]]; then + ${prefix}${depmod_path} -aF /lib/modules/${kernel_uname_r}/System.map ${kernel_uname_r} &> /dev/null || : +else + ${prefix}${depmod_path} -a ${kernel_uname_r} &> /dev/null || : +fi EOF else cat < /dev/null || : +[[ "\$(uname -r)" == "${kernel_uname_r}" ]] && ${prefix}${depmod_path} -a > /dev/null || : %postun -n kmod-${kmodname}-${kernel_uname_r} -[[ "$(uname -r)" == "${kernel_uname_r}" ]] && 
${prefix}/sbin/depmod -a > /dev/null || : +[[ "\$(uname -r)" == "${kernel_uname_r}" ]] && ${prefix}${depmod_path} -a > /dev/null || : EOF fi @@ -397,7 +423,7 @@ print_rpmtemplate () # and print it and some other required stuff as macro print_rpmtemplate_header - # now print the packages itselfs + # now print the packages for kernel in ${kernel_versions_to_build_for} ; do local kernel_verrelarch=${kernel%%${kernels_known_variants}} @@ -489,7 +515,7 @@ while [ "${1}" ] ; do --obsolete-name) shift if [[ ! "${1}" ]] ; then - error_out 2 "Please provide the name of the kmod to obsolte together with --obsolete-name" >&2 + error_out 2 "Please provide the name of the kmod to obsolete together with --obsolete-name" >&2 fi obsolete_name="${1}" shift @@ -497,7 +523,7 @@ while [ "${1}" ] ; do --obsolete-version) shift if [[ ! "${1}" ]] ; then - error_out 2 "Please provide the version of the kmod to obsolte together with --obsolete-version" >&2 + error_out 2 "Please provide the version of the kmod to obsolete together with --obsolete-version" >&2 fi obsolete_version="${1}" shift @@ -555,7 +581,7 @@ elif [[ ! "${kmodname}" ]]; then error_out 2 "please pass kmodname with --kmodname" elif [[ ! "${kernels_known_variants}" ]] ; then error_out 2 "could not determine known variants" -elif ( [[ "${obsolete_name}" ]] && [[ ! "${obsolete_version}" ]] ) || ( [[ ! "${obsolete_name}" ]] && [[ "${obsolete_version}" ]] ) ; then +elif { [[ "${obsolete_name}" ]] && [[ ! "${obsolete_version}" ]]; } || { [[ ! "${obsolete_name}" ]] && [[ "${obsolete_version}" ]]; } ; then error_out 2 "you need to provide both --obsolete-name and --obsolete-version" fi @@ -573,7 +599,7 @@ else # we need more sanity checks in this case if [[ ! "${repo}" ]]; then error_out 2 "please provide repo name with --repo" - elif ! $(which buildsys-build-${repo}-kerneldevpkgs &> /dev/null) ; then + elif ! 
command -v "buildsys-build-${repo}-kerneldevpkgs" &> /dev/null ; then error_out 2 "buildsys-build-${repo}-kerneldevpkgs not found" fi @@ -587,7 +613,7 @@ else kernel_versions_to_build_for="$(buildsys-build-${repo}-kerneldevpkgs --${build_kernels} ${cmdoptions})" returncode=$? - if (( ${returncode} != 0 )); then + if (( returncode != 0 )); then error_out 2 "buildsys-build-${repo}-kerneldevpkgs failed: $(buildsys-build-${repo}-kerneldevpkgs --${build_kernels} ${cmdoptions})" fi diff --git a/scripts/make_gitrev.sh b/scripts/make_gitrev.sh index bab9be88d7..e7f4ce8844 100755 --- a/scripts/make_gitrev.sh +++ b/scripts/make_gitrev.sh @@ -27,15 +27,52 @@ set -e -u -cleanup() { - ZFS_GIT_REV=${ZFS_GIT_REV:-"unknown"} - cat << EOF > "$(dirname "$0")"/../include/zfs_gitrev.h -#define ZFS_META_GITREV "${ZFS_GIT_REV}" -EOF -} -trap cleanup EXIT +dist=no +distdir=. +while getopts D: flag +do + case $flag in + \?) echo "Usage: $0 [-D distdir] [file]" >&2; exit 1;; + D) dist=yes; distdir=${OPTARG};; + esac +done +shift $((OPTIND - 1)) -# Check if git is installed and we are in a git repo. -git rev-parse --git-dir > /dev/null 2>&1 -# Get the git current git revision -ZFS_GIT_REV=$(git describe --always --long --dirty 2>/dev/null) +top_srcdir="$(dirname "$0")/.." 
+GITREV="${1:-include/zfs_gitrev.h}" + +# GITREV should be a relative path (relative to top_builddir or distdir) +case "${GITREV}" in + /*) echo "Error: ${GITREV} should be a relative path" >&2 + exit 1;; +esac + +ZFS_GITREV=$({ cd "${top_srcdir}" && + git describe --always --long --dirty 2>/dev/null; } || :) + +if [ -z "${ZFS_GITREV}" ] +then + # If the source directory is not a git repository, check if the file + # already exists (in the source) + if [ -f "${top_srcdir}/${GITREV}" ] + then + ZFS_GITREV=$(sed -n \ + '1s/^#define[[:blank:]]ZFS_META_GITREV "\([^"]*\)"$/\1/p' \ + "${top_srcdir}/${GITREV}") + fi +elif [ ${dist} = yes ] +then + # Append -dist when creating distributed sources from a git repository + ZFS_GITREV="${ZFS_GITREV}-dist" +fi +ZFS_GITREV=${ZFS_GITREV:-unknown} + +GITREVTMP="${GITREV}~" +printf '#define\tZFS_META_GITREV "%s"\n' "${ZFS_GITREV}" >"${GITREVTMP}" +GITREV="${distdir}/${GITREV}" +if cmp -s "${GITREV}" "${GITREVTMP}" +then + rm -f "${GITREVTMP}" +else + mv -f "${GITREVTMP}" "${GITREV}" +fi diff --git a/scripts/man-dates.sh b/scripts/man-dates.sh index 186d94639a..39f1b5fb13 100755 --- a/scripts/man-dates.sh +++ b/scripts/man-dates.sh @@ -7,6 +7,6 @@ set -eu find man -type f | while read -r i ; do git_date=$(git log -1 --date=short --format="%ad" -- "$i") - [ "x$git_date" = "x" ] && continue + [ -z "$git_date" ] && continue sed -i "s|^\.Dd.*|.Dd $(date -d "$git_date" "+%B %-d, %Y")|" "$i" done diff --git a/scripts/mancheck.sh b/scripts/mancheck.sh new file mode 100755 index 0000000000..0793cc48fa --- /dev/null +++ b/scripts/mancheck.sh @@ -0,0 +1,53 @@ +#!/bin/sh +# +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +# shellcheck disable=SC2086,SC2250 + +trap 'rm -f "$stdout_file" "$stderr_file" "$result_file"' EXIT + +if [ "$#" -eq 0 ]; then + echo "Usage: $0 manpage-directory..." + exit 1 +fi + +if ! command -v mandoc > /dev/null; then + echo "skipping mancheck because mandoc is not installed" + exit 0 +fi + +IFS=" +" +files="$(find "$@" -type f -name '*[1-9]*')" || exit 1 + +add_excl="$(awk ' + /^.\\" lint-ok:/ { + print "-e" + $1 = "mandoc:" + $2 = FILENAME ":[[:digit:]]+:[[:digit:]]+:" + print + }' $files)" + +# Redirect to file instead of 2>&1ing because mandoc flushes inconsistently(?) which tears lines +# https://github.com/openzfs/zfs/pull/12129/checks?check_run_id=2701608671#step:5:3 +stdout_file="$(mktemp)" +stderr_file="$(mktemp)" +mandoc -Tlint $files 1>"$stdout_file" 2>"$stderr_file" +result_file="$(mktemp)" +grep -vhE -e 'mandoc: outdated mandoc.db' -e 'STYLE: referenced manual not found' $add_excl "$stdout_file" "$stderr_file" > "$result_file" + +if [ -s "$result_file" ]; then + cat "$result_file" + exit 1 +else + echo "no errors found" +fi diff --git a/scripts/paxcheck.sh b/scripts/paxcheck.sh index 87e817500b..27acc95364 100755 --- a/scripts/paxcheck.sh +++ b/scripts/paxcheck.sh @@ -1,7 +1,6 @@ #!/bin/sh -# shellcheck disable=SC2039 -if ! type scanelf > /dev/null 2>&1; then +if ! command -v scanelf > /dev/null; then echo "scanelf (from pax-utils) is required for these checks." 
>&2 exit 3 fi diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index 7c5286ba70..ac28788582 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # CDDL HEADER START # @@ -31,25 +31,35 @@ fi PROG=zfs-tests.sh VERBOSE="no" -QUIET= +QUIET="" CLEANUP="yes" CLEANUPALL="no" LOOPBACK="yes" STACK_TRACER="no" FILESIZE="4G" -RUNFILE=${RUNFILE:-"linux.run"} +DEFAULT_RUNFILES="common.run,$(uname | tr '[:upper:]' '[:lower:]').run" +RUNFILES=${RUNFILES:-$DEFAULT_RUNFILES} FILEDIR=${FILEDIR:-/var/tmp} DISKS=${DISKS:-""} -SINGLETEST=() +SINGLETEST="" SINGLETESTUSER="root" TAGS="" ITERATIONS=1 ZFS_DBGMSG="$STF_SUITE/callbacks/zfs_dbgmsg.ksh" ZFS_DMESG="$STF_SUITE/callbacks/zfs_dmesg.ksh" -ZFS_MMP="$STF_SUITE/callbacks/zfs_mmp.ksh" -TESTFAIL_CALLBACKS=${TESTFAIL_CALLBACKS:-"$ZFS_DBGMSG:$ZFS_DMESG:$ZFS_MMP"} -LOSETUP=${LOSETUP:-/sbin/losetup} -DMSETUP=${DMSETUP:-/sbin/dmsetup} +UNAME=$(uname -s) + +# Override some defaults if on FreeBSD +if [ "$UNAME" = "FreeBSD" ] ; then + TESTFAIL_CALLBACKS=${TESTFAIL_CALLBACKS:-"$ZFS_DMESG"} + LOSETUP=/sbin/mdconfig + DMSETUP=/sbin/gpart +else + ZFS_MMP="$STF_SUITE/callbacks/zfs_mmp.ksh" + TESTFAIL_CALLBACKS=${TESTFAIL_CALLBACKS:-"$ZFS_DBGMSG:$ZFS_DMESG:$ZFS_MMP"} + LOSETUP=${LOSETUP:-/sbin/losetup} + DMSETUP=${DMSETUP:-/sbin/dmsetup} +fi # # Log an informational message when additional verbosity is enabled. @@ -64,11 +74,38 @@ msg() { # Log a failure message, cleanup, and return an error. 
# fail() { - echo -e "$PROG: $1" >&2 + echo "$PROG: $1" >&2 cleanup exit 1 } +cleanup_freebsd_loopback() { + for TEST_LOOPBACK in ${LOOPBACKS}; do + if [ -c "/dev/${TEST_LOOPBACK}" ]; then + sudo "${LOSETUP}" -d -u "${TEST_LOOPBACK}" || + echo "Failed to destroy: ${TEST_LOOPBACK}" + fi + done +} + +cleanup_linux_loopback() { + for TEST_LOOPBACK in ${LOOPBACKS}; do + LOOP_DEV=$(basename "$TEST_LOOPBACK") + DM_DEV=$(sudo "${DMSETUP}" ls 2>/dev/null | \ + grep "${LOOP_DEV}" | cut -f1) + + if [ -n "$DM_DEV" ]; then + sudo "${DMSETUP}" remove "${DM_DEV}" || + echo "Failed to remove: ${DM_DEV}" + fi + + if [ -n "${TEST_LOOPBACK}" ]; then + sudo "${LOSETUP}" -d "${TEST_LOOPBACK}" || + echo "Failed to remove: ${TEST_LOOPBACK}" + fi + done +} + # # Attempt to remove loopback devices and files which where created earlier # by this script to run the test framework. The '-k' option may be passed @@ -79,26 +116,17 @@ cleanup() { return 0 fi + if [ "$LOOPBACK" = "yes" ]; then - for TEST_LOOPBACK in ${LOOPBACKS}; do - LOOP_DEV=$(basename "$TEST_LOOPBACK") - DM_DEV=$(sudo "${DMSETUP}" ls 2>/dev/null | \ - grep "${LOOP_DEV}" | cut -f1) - - if [ -n "$DM_DEV" ]; then - sudo "${DMSETUP}" remove "${DM_DEV}" || - echo "Failed to remove: ${DM_DEV}" - fi - - if [ -n "${TEST_LOOPBACK}" ]; then - sudo "${LOSETUP}" -d "${TEST_LOOPBACK}" || - echo "Failed to remove: ${TEST_LOOPBACK}" - fi - done + if [ "$UNAME" = "FreeBSD" ] ; then + cleanup_freebsd_loopback + else + cleanup_linux_loopback + fi fi for TEST_FILE in ${FILES}; do - rm -f "${TEST_FILE}" &>/dev/null + rm -f "${TEST_FILE}" >/dev/null 2>&1 done if [ "$STF_PATH_REMOVE" = "yes" ] && [ -d "$STF_PATH" ]; then @@ -114,11 +142,12 @@ trap cleanup EXIT # be dangerous and should only be used in a dedicated test environment. 
# cleanup_all() { - local TEST_POOLS TEST_POOLS=$(sudo "$ZPOOL" list -H -o name | grep testpool) - local TEST_LOOPBACKS - TEST_LOOPBACKS=$(sudo "${LOSETUP}" -a|grep file-vdev|cut -f1 -d:) - local TEST_FILES + if [ "$UNAME" = "FreeBSD" ] ; then + TEST_LOOPBACKS=$(sudo "${LOSETUP}" -l) + else + TEST_LOOPBACKS=$(sudo "${LOSETUP}" -a|grep file-vdev|cut -f1 -d:) + fi TEST_FILES=$(ls /var/tmp/file-vdev* 2>/dev/null) msg @@ -128,13 +157,19 @@ cleanup_all() { sudo "$ZPOOL" destroy "${TEST_POOL}" done - msg "Removing dm(s): $(sudo "${DMSETUP}" ls | - grep loop | tr '\n' ' ')" - sudo "${DMSETUP}" remove_all + if [ "$UNAME" != "FreeBSD" ] ; then + msg "Removing dm(s): $(sudo "${DMSETUP}" ls | + grep loop | tr '\n' ' ')" + sudo "${DMSETUP}" remove_all + fi msg "Removing loopback(s): $(echo "${TEST_LOOPBACKS}" | tr '\n' ' ')" for TEST_LOOPBACK in $TEST_LOOPBACKS; do - sudo "${LOSETUP}" -d "${TEST_LOOPBACK}" + if [ "$UNAME" = "FreeBSD" ] ; then + sudo "${LOSETUP}" -d -u "${TEST_LOOPBACK}" + else + sudo "${LOSETUP}" -d "${TEST_LOOPBACK}" + fi done msg "Removing files(s): $(echo "${TEST_FILES}" | tr '\n' ' ')" @@ -153,8 +188,8 @@ cleanup_all() { # .run # find_runfile() { - local NAME=$1 - local RESULT="" + NAME=$1 + RESULT="" if [ -f "$RUNFILE_DIR/$NAME" ]; then RESULT="$RUNFILE_DIR/$NAME" @@ -173,8 +208,8 @@ find_runfile() { # Symlink file if it appears under any of the given paths. # create_links() { - local dir_list="$1" - local file_list="$2" + dir_list="$1" + file_list="$2" [ -n "$STF_PATH" ] || fail "STF_PATH wasn't correctly set" @@ -183,14 +218,16 @@ create_links() { [ ! -e "$STF_PATH/$i" ] || continue if [ ! -d "$j/$i" ] && [ -e "$j/$i" ]; then - ln -s "$j/$i" "$STF_PATH/$i" || \ + ln -sf "$j/$i" "$STF_PATH/$i" || \ fail "Couldn't link $i" break fi done - [ ! -e "$STF_PATH/$i" ] && STF_MISSING_BIN="$STF_MISSING_BIN$i " + [ ! 
-e "$STF_PATH/$i" ] && \ + STF_MISSING_BIN="$STF_MISSING_BIN $i" done + STF_MISSING_BIN=${STF_MISSING_BIN# } } # @@ -201,6 +238,12 @@ create_links() { constrain_path() { . "$STF_SUITE/include/commands.cfg" + # On FreeBSD, base system zfs utils are in /sbin and OpenZFS utils + # install to /usr/local/sbin. To avoid testing the wrong utils we + # need /usr/local to come before / in the path search order. + SYSTEM_DIRS="/usr/local/bin /usr/local/sbin" + SYSTEM_DIRS="$SYSTEM_DIRS /usr/bin /usr/sbin /bin /sbin $LIBEXEC_DIR" + if [ "$INTREE" = "yes" ]; then # Constrained path set to ./zfs/bin/ STF_PATH="$BIN_DIR" @@ -222,34 +265,40 @@ constrain_path() { create_links "$DIRS" "$ZFSTEST_FILES" else # Constrained path set to /var/tmp/constrained_path.* - SYSTEMDIR=${SYSTEMDIR:-/var/tmp/constrained_path.XXXX} - STF_PATH=$(/bin/mktemp -d "$SYSTEMDIR") + SYSTEMDIR=${SYSTEMDIR:-/var/tmp/constrained_path.XXXXXX} + STF_PATH=$(mktemp -d "$SYSTEMDIR") STF_PATH_REMOVE="yes" STF_MISSING_BIN="" chmod 755 "$STF_PATH" || fail "Couldn't chmod $STF_PATH" # Special case links for standard zfs utilities - create_links "/bin /usr/bin /sbin /usr/sbin" "$ZFS_FILES" + create_links "$SYSTEM_DIRS" "$ZFS_FILES" # Special case links for zfs test suite utilities create_links "$STF_SUITE/bin" "$ZFSTEST_FILES" fi # Standard system utilities - create_links "/bin /usr/bin /sbin /usr/sbin" "$SYSTEM_FILES" + SYSTEM_FILES="$SYSTEM_FILES_COMMON" + if [ "$UNAME" = "FreeBSD" ] ; then + SYSTEM_FILES="$SYSTEM_FILES $SYSTEM_FILES_FREEBSD" + else + SYSTEM_FILES="$SYSTEM_FILES $SYSTEM_FILES_LINUX" + fi + create_links "$SYSTEM_DIRS" "$SYSTEM_FILES" # Exceptions ln -fs "$STF_PATH/awk" "$STF_PATH/nawk" - ln -fs /sbin/fsck.ext4 "$STF_PATH/fsck" - ln -fs /sbin/mkfs.ext4 "$STF_PATH/newfs" - ln -fs "$STF_PATH/gzip" "$STF_PATH/compress" - ln -fs "$STF_PATH/gunzip" "$STF_PATH/uncompress" - ln -fs "$STF_PATH/exportfs" "$STF_PATH/share" - ln -fs "$STF_PATH/exportfs" "$STF_PATH/unshare" - - if [ -L "$STF_PATH/arc_summary3" ]; 
then - ln -fs "$STF_PATH/arc_summary3" "$STF_PATH/arc_summary" + if [ "$UNAME" = "Linux" ] ; then + ln -fs /sbin/fsck.ext4 "$STF_PATH/fsck" + ln -fs /sbin/mkfs.ext4 "$STF_PATH/newfs" + ln -fs "$STF_PATH/gzip" "$STF_PATH/compress" + ln -fs "$STF_PATH/gunzip" "$STF_PATH/uncompress" + ln -fs "$STF_PATH/exportfs" "$STF_PATH/share" + ln -fs "$STF_PATH/exportfs" "$STF_PATH/unshare" + elif [ "$UNAME" = "FreeBSD" ] ; then + ln -fs /usr/local/bin/ksh93 "$STF_PATH/ksh" fi } @@ -259,7 +308,7 @@ constrain_path() { usage() { cat << EOF USAGE: -$0 [hvqxkfS] [-s SIZE] [-r RUNFILE] [-t PATH] [-u USER] +$0 [-hvqxkfS] [-s SIZE] [-r RUNFILES] [-t PATH] [-u USER] DESCRIPTION: ZFS Test Suite launch script @@ -277,7 +326,7 @@ OPTIONS: -I NUM Number of iterations -d DIR Use DIR for files and loopback devices -s SIZE Use vdevs of SIZE (default: 4G) - -r RUNFILE Run tests in RUNFILE (default: linux.run) + -r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES}) -t PATH Run single test at PATH relative to test suite -T TAGS Comma separated list of tags (default: 'functional') -u USER Run single test as USER (default: root) @@ -289,6 +338,9 @@ $0 -v # Run a smaller suite of tests designed to run more quickly. $0 -r linux-fast +# Run a single test +$0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh + # Cleanup a previous run of the test suite prior to testing, run the # default (linux) suite of tests and perform no cleanup on exit. $0 -x @@ -303,11 +355,10 @@ while getopts 'hvqxkfScn:d:s:r:?t:T:u:I:' OPTION; do exit 1 ;; v) - # shellcheck disable=SC2034 VERBOSE="yes" ;; q) - QUIET="-q" + QUIET="yes" ;; x) CLEANUPALL="yes" @@ -327,7 +378,7 @@ while getopts 'hvqxkfScn:d:s:r:?t:T:u:I:' OPTION; do ;; n) nfsfile=$OPTARG - [[ -f $nfsfile ]] || fail "Cannot read file: $nfsfile" + [ -f "$nfsfile" ] || fail "Cannot read file: $nfsfile" export NFS=1 . 
"$nfsfile" ;; @@ -344,13 +395,13 @@ while getopts 'hvqxkfScn:d:s:r:?t:T:u:I:' OPTION; do FILESIZE="$OPTARG" ;; r) - RUNFILE="$OPTARG" + RUNFILES="$OPTARG" ;; t) - if [ ${#SINGLETEST[@]} -ne 0 ]; then + if [ -n "$SINGLETEST" ]; then fail "-t can only be provided once." fi - SINGLETEST+=("$OPTARG") + SINGLETEST="$OPTARG" ;; T) TAGS="$OPTARG" @@ -370,19 +421,19 @@ shift $((OPTIND-1)) FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} LOOPBACKS=${LOOPBACKS:-""} -if [ ${#SINGLETEST[@]} -ne 0 ]; then +if [ -n "$SINGLETEST" ]; then if [ -n "$TAGS" ]; then fail "-t and -T are mutually exclusive." fi RUNFILE_DIR="/var/tmp" - RUNFILE="zfs-tests.$$.run" + RUNFILES="zfs-tests.$$.run" SINGLEQUIET="False" if [ -n "$QUIET" ]; then SINGLEQUIET="True" fi - cat >$RUNFILE_DIR/$RUNFILE << EOF + cat >$RUNFILE_DIR/$RUNFILES << EOF [DEFAULT] pre = quiet = $SINGLEQUIET @@ -393,22 +444,20 @@ post_user = root post = outputdir = /var/tmp/test_results EOF - for t in "${SINGLETEST[@]}" - do - SINGLETESTDIR=$(dirname "$t") - SINGLETESTFILE=$(basename "$t") - SETUPSCRIPT= - CLEANUPSCRIPT= + SINGLETESTDIR=$(dirname "$SINGLETEST") + SINGLETESTFILE=$(basename "$SINGLETEST") + SETUPSCRIPT= + CLEANUPSCRIPT= - if [ -f "$STF_SUITE/$SINGLETESTDIR/setup.ksh" ]; then - SETUPSCRIPT="setup" - fi + if [ -f "$STF_SUITE/$SINGLETESTDIR/setup.ksh" ]; then + SETUPSCRIPT="setup" + fi - if [ -f "$STF_SUITE/$SINGLETESTDIR/cleanup.ksh" ]; then - CLEANUPSCRIPT="cleanup" - fi + if [ -f "$STF_SUITE/$SINGLETESTDIR/cleanup.ksh" ]; then + CLEANUPSCRIPT="cleanup" + fi - cat >>$RUNFILE_DIR/$RUNFILE << EOF + cat >>$RUNFILE_DIR/$RUNFILES << EOF [$SINGLETESTDIR] tests = ['$SINGLETESTFILE'] @@ -416,7 +465,6 @@ pre = $SETUPSCRIPT post = $CLEANUPSCRIPT tags = ['functional'] EOF - done fi # @@ -425,17 +473,24 @@ fi TAGS=${TAGS:='functional'} # -# Attempt to locate the runfile describing the test workload. +# Attempt to locate the runfiles describing the test workload. 
# -if [ -n "$RUNFILE" ]; then - SAVED_RUNFILE="$RUNFILE" - RUNFILE=$(find_runfile "$RUNFILE") - [ -z "$RUNFILE" ] && fail "Cannot find runfile: $SAVED_RUNFILE" -fi +R="" +IFS=, +for RUNFILE in $RUNFILES; do + if [ -n "$RUNFILE" ]; then + SAVED_RUNFILE="$RUNFILE" + RUNFILE=$(find_runfile "$RUNFILE") + [ -z "$RUNFILE" ] && fail "Cannot find runfile: $SAVED_RUNFILE" + R="$R,$RUNFILE" + fi -if [ ! -r "$RUNFILE" ]; then - fail "Cannot read runfile: $RUNFILE" -fi + if [ ! -r "$RUNFILE" ]; then + fail "Cannot read runfile: $RUNFILE" + fi +done +unset IFS +RUNFILES=${R#,} # # This script should not be run as root. Instead the test user, which may @@ -458,6 +513,9 @@ constrain_path # # Check if ksh exists # +if [ "$UNAME" = "FreeBSD" ]; then + sudo ln -fs /usr/local/bin/ksh93 /bin/ksh +fi [ -e "$STF_PATH/ksh" ] || fail "This test suite requires ksh." [ -e "$STF_SUITE/include/default.cfg" ] || fail \ "Missing $STF_SUITE/include/default.cfg file." @@ -466,9 +524,9 @@ constrain_path # Verify the ZFS module stack is loaded. # if [ "$STACK_TRACER" = "yes" ]; then - sudo "${ZFS_SH}" -S &>/dev/null + sudo "${ZFS_SH}" -S >/dev/null 2>&1 else - sudo "${ZFS_SH}" &>/dev/null + sudo "${ZFS_SH}" >/dev/null 2>&1 fi # @@ -489,7 +547,7 @@ if [ -z "${KEEP}" ]; then KEEP="rpool" fi else - KEEP="$(echo -e "${KEEP//[[:blank:]]/\n}")" + KEEP="$(echo "$KEEP" | tr '[:blank:]' '\n')" fi # @@ -501,22 +559,25 @@ fi # # See libzfs/libzfs_config.c for more information. # -__ZFS_POOL_EXCLUDE="$(echo "$KEEP" | sed ':a;N;s/\n/ /g;ba')" +if [ "$UNAME" = "FreeBSD" ] ; then + __ZFS_POOL_EXCLUDE="$(echo "$KEEP" | tr -s '\n' ' ')" +else + __ZFS_POOL_EXCLUDE="$(echo "$KEEP" | sed ':a;N;s/\n/ /g;ba')" +fi . "$STF_SUITE/include/default.cfg" -msg -msg "--- Configuration ---" -msg "Runfile: $RUNFILE" -msg "STF_TOOLS: $STF_TOOLS" -msg "STF_SUITE: $STF_SUITE" -msg "STF_PATH: $STF_PATH" - # # No DISKS have been provided so a basic file or loopback based devices # must be created for the test suite to use. 
# if [ -z "${DISKS}" ]; then + # + # If this is a performance run, prevent accidental use of + # loopback devices. + # + [ "$TAGS" = "perf" ] && fail "Running perf tests without disks." + # # Create sparse files for the test suite. These may be used # directory or have loopback devices layered on them. @@ -525,44 +586,52 @@ if [ -z "${DISKS}" ]; then [ -f "$TEST_FILE" ] && fail "Failed file exists: ${TEST_FILE}" truncate -s "${FILESIZE}" "${TEST_FILE}" || fail "Failed creating: ${TEST_FILE} ($?)" - if [[ "$DISKS" ]]; then - DISKS="$DISKS $TEST_FILE" - else - DISKS="$TEST_FILE" - fi done # # If requested setup loopback devices backed by the sparse files. # if [ "$LOOPBACK" = "yes" ]; then - DISKS="" - test -x "$LOSETUP" || fail "$LOSETUP utility must be installed" for TEST_FILE in ${FILES}; do - TEST_LOOPBACK=$(sudo "${LOSETUP}" -f) - sudo "${LOSETUP}" "${TEST_LOOPBACK}" "${TEST_FILE}" || - fail "Failed: ${TEST_FILE} -> ${TEST_LOOPBACK}" - LOOPBACKS="${LOOPBACKS}${TEST_LOOPBACK} " - BASELOOPBACKS=$(basename "$TEST_LOOPBACK") - if [[ "$DISKS" ]]; then - DISKS="$DISKS $BASELOOPBACKS" + if [ "$UNAME" = "FreeBSD" ] ; then + MDDEVICE=$(sudo "${LOSETUP}" -a -t vnode -f "${TEST_FILE}") + if [ -z "$MDDEVICE" ] ; then + fail "Failed: ${TEST_FILE} -> loopback" + fi + DISKS="$DISKS $MDDEVICE" + LOOPBACKS="$LOOPBACKS $MDDEVICE" else - DISKS="$BASELOOPBACKS" + TEST_LOOPBACK=$(sudo "${LOSETUP}" -f) + sudo "${LOSETUP}" "${TEST_LOOPBACK}" "${TEST_FILE}" || + fail "Failed: ${TEST_FILE} -> ${TEST_LOOPBACK}" + BASELOOPBACK=$(basename "$TEST_LOOPBACK") + DISKS="$DISKS $BASELOOPBACK" + LOOPBACKS="$LOOPBACKS $TEST_LOOPBACK" fi done + DISKS=${DISKS# } + LOOPBACKS=${LOOPBACKS# } + else + DISKS="$FILES" fi fi +# +# It may be desirable to test with fewer disks than the default when running +# the performance tests, but the functional tests require at least three. 
+# NUM_DISKS=$(echo "${DISKS}" | awk '{print NF}') -[ "$NUM_DISKS" -lt 3 ] && fail "Not enough disks ($NUM_DISKS/3 minimum)" +if [ "$TAGS" != "perf" ]; then + [ "$NUM_DISKS" -lt 3 ] && fail "Not enough disks ($NUM_DISKS/3 minimum)" +fi # # Disable SELinux until the ZFS Test Suite has been updated accordingly. # if [ -x "$STF_PATH/setenforce" ]; then - sudo setenforce permissive &>/dev/null + sudo setenforce permissive >/dev/null 2>&1 fi # @@ -573,6 +642,12 @@ if [ -e /sys/module/zfs/parameters/zfs_dbgmsg_enable ]; then sudo /bin/sh -c "echo 0 >/proc/spl/kstat/zfs/dbgmsg" fi +msg +msg "--- Configuration ---" +msg "Runfiles: $RUNFILES" +msg "STF_TOOLS: $STF_TOOLS" +msg "STF_SUITE: $STF_SUITE" +msg "STF_PATH: $STF_PATH" msg "FILEDIR: $FILEDIR" msg "FILES: $FILES" msg "LOOPBACKS: $LOOPBACKS" @@ -596,24 +671,36 @@ export __ZFS_POOL_EXCLUDE export TESTFAIL_CALLBACKS export PATH=$STF_PATH -RESULTS_FILE=$(mktemp -u -t zts-results.XXXX -p "$FILEDIR") -REPORT_FILE=$(mktemp -u -t zts-report.XXXX -p "$FILEDIR") +if [ "$UNAME" = "FreeBSD" ] ; then + mkdir -p "$FILEDIR" || true + RESULTS_FILE=$(mktemp -u "${FILEDIR}/zts-results.XXXXXX") + REPORT_FILE=$(mktemp -u "${FILEDIR}/zts-report.XXXXXX") +else + RESULTS_FILE=$(mktemp -u -t zts-results.XXXXXX -p "$FILEDIR") + REPORT_FILE=$(mktemp -u -t zts-report.XXXXXX -p "$FILEDIR") +fi # # Run all the tests as specified. # -msg "${TEST_RUNNER} ${QUIET} -c ${RUNFILE} -T ${TAGS} -i ${STF_SUITE}" \ - "-I ${ITERATIONS}" -${TEST_RUNNER} ${QUIET} -c "${RUNFILE}" -T "${TAGS}" -i "${STF_SUITE}" \ - -I "${ITERATIONS}" 2>&1 | tee "$RESULTS_FILE" +msg "${TEST_RUNNER} ${QUIET:+-q}" \ + "-c \"${RUNFILES}\"" \ + "-T \"${TAGS}\"" \ + "-i \"${STF_SUITE}\"" \ + "-I \"${ITERATIONS}\"" +${TEST_RUNNER} ${QUIET:+-q} \ + -c "${RUNFILES}" \ + -T "${TAGS}" \ + -i "${STF_SUITE}" \ + -I "${ITERATIONS}" \ + 2>&1 | tee "$RESULTS_FILE" # # Analyze the results. 
# -set -o pipefail -${ZTS_REPORT} "$RESULTS_FILE" | tee "$REPORT_FILE" +${ZTS_REPORT} "$RESULTS_FILE" >"$REPORT_FILE" RESULT=$? -set +o pipefail +cat "$REPORT_FILE" RESULTS_DIR=$(awk '/^Log directory/ { print $3 }' "$RESULTS_FILE") if [ -d "$RESULTS_DIR" ]; then @@ -622,8 +709,8 @@ fi rm -f "$RESULTS_FILE" "$REPORT_FILE" -if [ ${#SINGLETEST[@]} -ne 0 ]; then - rm -f "$RUNFILE" &>/dev/null +if [ -n "$SINGLETEST" ]; then + rm -f "$RUNFILES" >/dev/null 2>&1 fi exit ${RESULT} diff --git a/scripts/zfs.sh b/scripts/zfs.sh index 015b3ba9de..7870b8930c 100755 --- a/scripts/zfs.sh +++ b/scripts/zfs.sh @@ -14,6 +14,7 @@ fi PROG=zfs.sh VERBOSE="no" UNLOAD="no" +LOAD="yes" STACK_TRACER="no" ZED_PIDFILE=${ZED_PIDFILE:-/var/run/zed.pid} @@ -29,6 +30,8 @@ KMOD_ZCOMMON=${KMOD_ZCOMMON:-zcommon} KMOD_ZLUA=${KMOD_ZLUA:-zlua} KMOD_ICP=${KMOD_ICP:-icp} KMOD_ZFS=${KMOD_ZFS:-zfs} +KMOD_FREEBSD=${KMOD_FREEBSD:-openzfs} +KMOD_ZZSTD=${KMOD_ZZSTD:-zzstd} usage() { @@ -42,12 +45,13 @@ DESCRIPTION: OPTIONS: -h Show this message -v Verbose + -r Reload modules -u Unload modules -S Enable kernel stack tracer EOF } -while getopts 'hvuS' OPTION; do +while getopts 'hvruS' OPTION; do case $OPTION in h) usage @@ -56,8 +60,13 @@ while getopts 'hvuS' OPTION; do v) VERBOSE="yes" ;; + r) + UNLOAD="yes" + LOAD="yes" + ;; u) UNLOAD="yes" + LOAD="no" ;; S) STACK_TRACER="yes" @@ -76,12 +85,12 @@ kill_zed() { fi } -check_modules() { +check_modules_linux() { LOADED_MODULES="" MISSING_MODULES="" - for KMOD in $KMOD_SPL $KMOD_ZAVL $KMOD_ZNVPAIR \ - $KMOD_ZUNICODE $KMOD_ZCOMMON $KMOD_ZLUA $KMOD_ICP $KMOD_ZFS; do + for KMOD in $KMOD_SPL $KMOD_ZAVL $KMOD_ZNVPAIR $KMOD_ZUNICODE $KMOD_ZCOMMON \ + $KMOD_ZLUA $KMOD_ZZSTD $KMOD_ICP $KMOD_ZFS; do NAME=$(basename "$KMOD" .ko) if lsmod | grep -E -q "^${NAME}"; then @@ -108,7 +117,7 @@ check_modules() { return 0 } -load_module() { +load_module_linux() { KMOD=$1 FILE=$(modinfo "$KMOD" | awk '/^filename:/ {print $2}') @@ -118,9 +127,7 @@ load_module() { echo "Loading: $FILE 
($VERSION)" fi - $LDMOD "$KMOD" >/dev/null 2>&1 - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then + if ! $LDMOD "$KMOD" >/dev/null 2>&1; then echo "Failed to load $KMOD" return 1 fi @@ -128,7 +135,17 @@ load_module() { return 0 } -load_modules() { +load_modules_freebsd() { + kldload "$KMOD_FREEBSD" || return 1 + + if [ "$VERBOSE" = "yes" ]; then + echo "Successfully loaded ZFS module stack" + fi + + return 0 +} + +load_modules_linux() { mkdir -p /etc/zfs if modinfo "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1; then @@ -140,8 +157,9 @@ load_modules() { fi for KMOD in $KMOD_SPL $KMOD_ZAVL $KMOD_ZNVPAIR \ - $KMOD_ZUNICODE $KMOD_ZCOMMON $KMOD_ZLUA $KMOD_ICP $KMOD_ZFS; do - load_module "$KMOD" || return 1 + $KMOD_ZUNICODE $KMOD_ZCOMMON $KMOD_ZLUA $KMOD_ZZSTD \ + $KMOD_ICP $KMOD_ZFS; do + load_module_linux "$KMOD" || return 1 done if [ "$VERBOSE" = "yes" ]; then @@ -151,7 +169,7 @@ load_modules() { return 0 } -unload_module() { +unload_module_linux() { KMOD=$1 NAME=$(basename "$KMOD" .ko) @@ -167,14 +185,27 @@ unload_module() { return 0 } -unload_modules() { - for KMOD in $KMOD_ZFS $KMOD_ICP $KMOD_ZLUA $KMOD_ZCOMMON $KMOD_ZUNICODE \ - $KMOD_ZNVPAIR $KMOD_ZAVL $KMOD_SPL; do +unload_modules_freebsd() { + kldunload "$KMOD_FREEBSD" || echo "Failed to unload $KMOD_FREEBSD" + + if [ "$VERBOSE" = "yes" ]; then + echo "Successfully unloaded ZFS module stack" + fi + + return 0 +} + +unload_modules_linux() { + for KMOD in $KMOD_ZFS $KMOD_ICP $KMOD_ZZSTD $KMOD_ZLUA $KMOD_ZCOMMON \ + $KMOD_ZUNICODE $KMOD_ZNVPAIR $KMOD_ZAVL $KMOD_SPL; do NAME=$(basename "$KMOD" .ko) USE_COUNT=$(lsmod | grep -E "^${NAME} " | awk '{print $3}') if [ "$USE_COUNT" = "0" ] ; then - unload_module "$KMOD" || return 1 + unload_module_linux "$KMOD" || return 1 + elif [ "$USE_COUNT" != "" ] ; then + echo "Module ${NAME} is still in use!" 
+ return 1 fi done @@ -193,7 +224,7 @@ unload_modules() { return 0 } -stack_clear() { +stack_clear_linux() { STACK_MAX_SIZE=/sys/kernel/debug/tracing/stack_max_size STACK_TRACER_ENABLED=/proc/sys/kernel/stack_tracer_enabled @@ -203,7 +234,7 @@ stack_clear() { fi } -stack_check() { +stack_check_linux() { STACK_MAX_SIZE=/sys/kernel/debug/tracing/stack_max_size STACK_TRACE=/sys/kernel/debug/tracing/stack_trace STACK_LIMIT=15362 @@ -224,17 +255,34 @@ if [ "$(id -u)" != 0 ]; then exit 1 fi +UNAME=$(uname -s) + if [ "$UNLOAD" = "yes" ]; then kill_zed umount -t zfs -a - stack_check - unload_modules -else - stack_clear - check_modules - load_modules "$@" - udevadm trigger - udevadm settle + case $UNAME in + FreeBSD) + unload_modules_freebsd + ;; + Linux) + stack_check_linux + unload_modules_linux + ;; + esac +fi +if [ "$LOAD" = "yes" ]; then + case $UNAME in + FreeBSD) + load_modules_freebsd + ;; + Linux) + stack_clear_linux + check_modules_linux + load_modules_linux "$@" + udevadm trigger + udevadm settle + ;; + esac fi exit 0 diff --git a/scripts/zimport.sh b/scripts/zimport.sh index d7e82fe9f0..0e9c01182b 100755 --- a/scripts/zimport.sh +++ b/scripts/zimport.sh @@ -1,10 +1,10 @@ -#!/bin/bash +#!/usr/bin/env bash # # Verify that an assortment of known good reference pools can be imported -# using different versions of the ZoL code. +# using different versions of OpenZFS code. # # By default references pools for the major ZFS implementation will be -# checked against the most recent ZoL tags and the master development branch. +# checked against the most recent OpenZFS tags and the master development branch. # Alternate tags or branches may be verified with the '-s option. # Passing the keyword "installed" will instruct the script to test whatever # version is installed. 
@@ -39,7 +39,7 @@ # -s "zfs-0.6.2 master installed" \ # -p "zevo-1.1.1 zol-0.6.2 zol-0.6.2-173 master installed" # -# --------------------- ZFS on Linux Source Versions -------------- +# ------------------------ OpenZFS Source Versions ---------------- # zfs-0.6.2 master 0.6.2-175_g36eb554 # ----------------------------------------------------------------- # Clone ZFS Local Local Skip @@ -68,9 +68,9 @@ TEST_DIR=$(mktemp -u -d -p /var/tmp zimport.XXXXXXXX) KEEP="no" VERBOSE="no" COLOR="yes" -REPO="https://github.com/zfsonlinux" +REPO="https://github.com/openzfs" IMAGES_DIR="$SCRIPTDIR/zfs-images/" -IMAGES_TAR="https://github.com/zfsonlinux/zfs-images/tarball/master" +IMAGES_TAR="https://github.com/openzfs/zfs-images/tarball/master" ERROR=0 CONFIG_LOG="configure.log" @@ -98,7 +98,7 @@ OPTIONS: -c No color -k Keep temporary directory -r Source repository ($REPO) - -s ... Verify ZoL versions with the listed tags + -s ... Verify OpenZFS versions with the listed tags -i Pool image directory -p ... Verify pools created with the listed tags -f Temporary directory to use @@ -164,15 +164,13 @@ populate() { local MAX_DIR_SIZE=$2 local MAX_FILE_SIZE=$3 - # shellcheck disable=SC2086 - mkdir -p $ROOT/{a,b,c,d,e,f,g}/{h,i} + mkdir -p "$ROOT"/{a,b,c,d,e,f,g}/{h,i} DIRS=$(find "$ROOT") for DIR in $DIRS; do COUNT=$((RANDOM % MAX_DIR_SIZE)) - # shellcheck disable=SC2034 - for i in $(seq $COUNT); do + for _ in $(seq $COUNT); do FILE=$(mktemp -p "$DIR") SIZE=$((RANDOM % MAX_FILE_SIZE)) dd if=/dev/urandom of="$FILE" bs=1k \ @@ -334,9 +332,8 @@ fi for TAG in $POOL_TAGS; do if [ "$TAG" = "all" ]; then - # shellcheck disable=SC2010 - ALL_TAGS=$(ls "$IMAGES_DIR" | grep "tar.bz2" | \ - sed 's/.tar.bz2//' | tr '\n' ' ') + ALL_TAGS=$(echo "$IMAGES_DIR"/*.tar.bz2 | \ + sed "s|$IMAGES_DIR/||g;s|.tar.bz2||g") NEW_TAGS="$NEW_TAGS $ALL_TAGS" else NEW_TAGS="$NEW_TAGS $TAG" @@ -365,7 +362,7 @@ if [ ! -d "$SRC_DIR" ]; then fi # Print a header for all tags which are being tested. 
-echo "--------------------- ZFS on Linux Source Versions --------------" +echo "------------------------ OpenZFS Source Versions ----------------" printf "%-16s" " " for TAG in $SRC_TAGS; do src_set_vars "$TAG" @@ -491,10 +488,8 @@ for TAG in $POOL_TAGS; do POOL_NAME=$($ZPOOL_CMD import -d "$POOL_DIR_COPY" | \ awk '/pool:/ { print $2; exit 0 }') - $ZPOOL_CMD import -N -d "$POOL_DIR_COPY" \ - "$POOL_NAME" &>/dev/null - # shellcheck disable=SC2181 - if [ $? -ne 0 ]; then + if ! $ZPOOL_CMD import -N -d "$POOL_DIR_COPY" \ + "$POOL_NAME" &>/dev/null; then fail_nonewline ERROR=1 else diff --git a/scripts/zloop.sh b/scripts/zloop.sh index 1f36f865b5..4a572ebab1 100755 --- a/scripts/zloop.sh +++ b/scripts/zloop.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # CDDL HEADER START @@ -18,6 +18,7 @@ # # Copyright (c) 2015 by Delphix. All rights reserved. # Copyright (C) 2016 Lawrence Livermore National Security, LLC. +# Copyright (c) 2017, Intel Corporation. # BASE_DIR=$(dirname "$0") @@ -37,54 +38,66 @@ DEFAULTCOREDIR=/var/tmp/zloop function usage { - echo -e "\n$0 [-t ] [ -s ] [-c ]" \ - "[ -- [extra ztest parameters]]\n" \ - "\n" \ - " This script runs ztest repeatedly with randomized arguments.\n" \ - " If a crash is encountered, the ztest logs, any associated\n" \ - " vdev files, and core file (if one exists) are moved to the\n" \ - " output directory ($DEFAULTCOREDIR by default). Any options\n" \ - " after the -- end-of-options marker will be passed to ztest.\n" \ - "\n" \ - " Options:\n" \ - " -t Total time to loop for, in seconds.
If not provided,\n" \ - " zloop runs forever.\n" \ - " -s Size of vdev devices.\n" \ - " -f Specify working directory for ztest vdev files.\n" \ - " -c Specify a core dump directory to use.\n" \ - " -m Max number of core dumps to allow before exiting.\n" \ - " -l Create 'ztest.core.N' symlink to core directory.\n" \ - " -h Print this help message.\n" \ - "" >&2 + cat >&2 <] [-f ] + [-m ] [-s ] [-t ] + [-I ] [-- [extra ztest parameters]] + + This script runs ztest repeatedly with randomized arguments. + If a crash is encountered, the ztest logs, any associated + vdev files, and core file (if one exists) are moved to the + output directory ($DEFAULTCOREDIR by default). Any options + after the -- end-of-options marker will be passed to ztest. + + Options: + -c Specify a core dump directory to use. + -f Specify working directory for ztest vdev files. + -h Print this help message. + -l Create 'ztest.core.N' symlink to core directory. + -m Max number of core dumps to allow before exiting. + -s Size of vdev devices. + -t Total time to loop for, in seconds. If not provided, + zloop runs forever. + -I Max number of iterations to loop before exiting. + +EOF } function or_die { # shellcheck disable=SC2068 - $@ - # shellcheck disable=SC2181 - if [[ $? -ne 0 ]]; then - # shellcheck disable=SC2145 - echo "Command failed: $@" + if ! $@; then + echo "Command failed: $*" exit 1 fi } -# core file helpers -origcorepattern="$(cat /proc/sys/kernel/core_pattern)" -coreglob="$(grep -E -o '^([^|%[:space:]]*)' /proc/sys/kernel/core_pattern)*" +case $(uname) in +FreeBSD) + coreglob="z*.core" + ;; +Linux) + # core file helpers + origcorepattern="$(cat /proc/sys/kernel/core_pattern)" + coreglob="$(grep -E -o '^([^|%[:space:]]*)' /proc/sys/kernel/core_pattern)*" -if [[ $coreglob = "*" ]]; then - echo "Setting core file pattern..." 
- echo "core" > /proc/sys/kernel/core_pattern - coreglob="$(grep -E -o '^([^|%[:space:]]*)' \ - /proc/sys/kernel/core_pattern)*" -fi + if [[ $coreglob = "*" ]]; then + echo "Setting core file pattern..." + echo "core" > /proc/sys/kernel/core_pattern + coreglob="$(grep -E -o '^([^|%[:space:]]*)' \ + /proc/sys/kernel/core_pattern)*" + fi + ;; +*) + exit 1 + ;; +esac function core_file { - # shellcheck disable=SC2012 disable=2086 - printf "%s" "$(ls -tr1 $coreglob 2> /dev/null | head -1)" + # shellcheck disable=SC2012,SC2086 + ls -tr1 $coreglob 2>/dev/null | head -1 } function core_prog @@ -92,8 +105,7 @@ function core_prog prog=$ZTEST core_id=$($GDB --batch -c "$1" | grep "Core was generated by" | \ tr \' ' ') - # shellcheck disable=SC2076 - if [[ "$core_id" =~ "zdb " ]]; then + if [[ "$core_id" == *"zdb "* ]]; then prog=$ZDB fi printf "%s" "$prog" @@ -178,10 +190,12 @@ timeout=0 size="512m" coremax=0 symlink=0 -while getopts ":ht:m:s:c:f:l" opt; do +iterations=0 +while getopts ":ht:m:I:s:c:f:l" opt; do case $opt in t ) [[ $OPTARG -gt 0 ]] && timeout=$OPTARG ;; m ) [[ $OPTARG -gt 0 ]] && coremax=$OPTARG ;; + I ) [[ $OPTARG ]] && iterations=$OPTARG ;; s ) [[ $OPTARG ]] && size=$OPTARG ;; c ) [[ $OPTARG ]] && coredir=$OPTARG ;; f ) [[ $OPTARG ]] && basedir=$(readlink -f "$OPTARG") ;; @@ -226,9 +240,14 @@ ztrc=0 # ztest return value foundcrashes=0 # number of crashes found so far starttime=$(date +%s) curtime=$starttime +iteration=0 # if no timeout was specified, loop forever. 
-while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do +while (( timeout == 0 )) || (( curtime <= (starttime + timeout) )); do + if (( iterations > 0 )) && (( iteration++ == iterations )); then + break + fi + zopt="-G -VVVVV" # start each run with an empty directory @@ -236,34 +255,60 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do or_die rm -rf "$workdir" or_die mkdir "$workdir" - # switch between common arrangements & fully randomized - if [[ $((RANDOM % 2)) -eq 0 ]]; then - mirrors=2 - raidz=0 - parity=1 - vdevs=2 - else - mirrors=$(((RANDOM % 3) * 1)) - parity=$(((RANDOM % 3) + 1)) - raidz=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2))) - vdevs=$(((RANDOM % 3) + 3)) - fi - align=$(((RANDOM % 2) * 3 + 9)) - runtime=$((RANDOM % 100)) - passtime=$((RANDOM % (runtime / 3 + 1) + 10)) + # switch between three types of configs + # 1/3 basic, 1/3 raidz mix, and 1/3 draid mix + choice=$((RANDOM % 3)) + # ashift is randomly either 9 or 12 + align=$(((RANDOM % 2) * 3 + 9)) + + # randomly use special classes + class="special=random" + + if [[ $choice -eq 0 ]]; then + # basic mirror only + parity=1 + mirrors=2 + draid_data=0 + draid_spares=0 + raid_children=0 + vdevs=2 + raid_type="raidz" + elif [[ $choice -eq 1 ]]; then + # fully randomized mirror/raidz (sans dRAID) + parity=$(((RANDOM % 3) + 1)) + mirrors=$(((RANDOM % 3) * 1)) + draid_data=0 + draid_spares=0 + raid_children=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2))) + vdevs=$(((RANDOM % 3) + 3)) + raid_type="raidz" + else + # fully randomized dRAID (sans mirror/raidz) + parity=$(((RANDOM % 3) + 1)) + mirrors=0 + draid_data=$(((RANDOM % 8) + 3)) + draid_spares=$(((RANDOM % 2) + parity)) + stripe=$((draid_data + parity)) + extra=$((draid_spares + (RANDOM % 4))) + raid_children=$(((((RANDOM % 4) + 1) * stripe) + extra)) + vdevs=$((RANDOM % 3)) + raid_type="draid" + fi + + zopt="$zopt -K $raid_type" zopt="$zopt -m $mirrors" - zopt="$zopt -r $raidz" + zopt="$zopt -r
$raid_children" + zopt="$zopt -D $draid_data" + zopt="$zopt -S $draid_spares" zopt="$zopt -R $parity" zopt="$zopt -v $vdevs" zopt="$zopt -a $align" - zopt="$zopt -T $runtime" - zopt="$zopt -P $passtime" + zopt="$zopt -C $class" zopt="$zopt -s $size" zopt="$zopt -f $workdir" - # shellcheck disable=SC2124 - cmd="$ZTEST $zopt $@" + cmd="$ZTEST $zopt $*" desc="$(date '+%m/%d %T') $cmd" echo "$desc" | tee -a ztest.history echo "$desc" >>ztest.out @@ -278,8 +323,14 @@ done echo "zloop finished, $foundcrashes crashes found" -#restore core pattern -echo "$origcorepattern" > /proc/sys/kernel/core_pattern +# restore core pattern. +case $(uname) in +Linux) + echo "$origcorepattern" > /proc/sys/kernel/core_pattern + ;; +*) + ;; +esac uptime >>ztest.out diff --git a/scripts/zol2zfs-patch.sed b/scripts/zol2zfs-patch.sed index bb6d9faac4..0ca4b6cd6b 100755 --- a/scripts/zol2zfs-patch.sed +++ b/scripts/zol2zfs-patch.sed @@ -12,7 +12,7 @@ s:lib/libzfs:usr/src/lib/libzfs/common:g s:lib/libzfs_core:usr/src/lib/libzfs_core/common:g s:lib/libzpool:lib/libzpool/common:g s:lib/libzpool:usr/src/lib/libzpool:g -s:man/man5/zpool-features.5:usr/src/man/man5/zpool-features.5:g +s:man/man7/zpool-features.7:usr/src/man/man5/zpool-features.5:g s:man/man8/zfs.8:usr/src/man/man1m/zfs.1m:g s:module/nvpair:usr/src/common/nvpair:g s:module/zcommon:usr/src/common/zfs/:g diff --git a/tests/Makefile.am b/tests/Makefile.am index 28d6e95c39..1dfc2cc5f5 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1 +1,8 @@ +include $(top_srcdir)/config/Shellcheck.am + SUBDIRS = runfiles test-runner zfs-tests + +EXTRA_DIST = README.md + +SHELLCHECKSCRIPTS = $$(find . 
-name '*.sh') +.PHONY: $(SHELLCHECKSCRIPTS) diff --git a/tests/README.md b/tests/README.md index 7b3768c291..72b994fa9f 100644 --- a/tests/README.md +++ b/tests/README.md @@ -29,7 +29,7 @@ The pre-requisites for running the ZFS Test Suite are: * Three scratch disks * Specify the disks you wish to use in the $DISKS variable, as a space delimited list like this: DISKS='vdb vdc vdd'. By default - the zfs-tests.sh sciprt will construct three loopback devices to + the zfs-tests.sh script will construct three loopback devices to be used for testing: DISKS='loop0 loop1 loop2'. * A non-root user with a full set of basic privileges and the ability to sudo(8) to root without a password to run the test. @@ -78,7 +78,7 @@ The following zfs-tests.sh options are supported: when test-runner exists. This is useful when the results of a specific test need to be preserved for further analysis. - -f Use sparse files directly instread of loopback devices for + -f Use sparse files directly instead of loopback devices for the testing. When running in this mode certain tests will be skipped which depend on real block devices. 
@@ -91,7 +91,7 @@ The following zfs-tests.sh options are supported: -s SIZE Use vdevs of SIZE (default: 4G) - -r RUNFILE Run tests in RUNFILE (default: linux.run) + -r RUNFILES Run tests in RUNFILES (default: common.run,linux.run) -t PATH Run single test at PATH relative to test suite @@ -128,7 +128,7 @@ with the `zfs-tests.sh` wrapper script will look something like this: STF_PATH: /var/tmp/constrained_path.G0Sf FILEDIR: /tmp/test FILES: /tmp/test/file-vdev0 /tmp/test/file-vdev1 /tmp/test/file-vdev2 - LOOPBACKS: /dev/loop0 /dev/loop1 /dev/loop2 + LOOPBACKS: /dev/loop0 /dev/loop1 /dev/loop2 DISKS: loop0 loop1 loop2 NUM_DISKS: 3 FILESIZE: 4G @@ -149,4 +149,4 @@ with the `zfs-tests.sh` wrapper script will look something like this: Running Time: 02:35:33 Percent passed: 95.6% - Log directory: /var/tmp/test_results/20180515T054509 \ No newline at end of file + Log directory: /var/tmp/test_results/20180515T054509 diff --git a/tests/runfiles/Makefile.am b/tests/runfiles/Makefile.am index 138d905a57..278e94934f 100644 --- a/tests/runfiles/Makefile.am +++ b/tests/runfiles/Makefile.am @@ -1,2 +1,9 @@ pkgdatadir = $(datadir)/@PACKAGE@/runfiles -dist_pkgdata_DATA = *.run +dist_pkgdata_DATA = \ + common.run \ + freebsd.run \ + linux.run \ + longevity.run \ + perf-regression.run \ + sanity.run \ + sunos.run diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run new file mode 100644 index 0000000000..9f181b53e1 --- /dev/null +++ b/tests/runfiles/common.run @@ -0,0 +1,952 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# This run file contains all of the common functional tests. 
When +# adding a new test consider also adding it to the sanity.run file +# if the new test runs to completion in only a few seconds. +# +# Approximate run time: 4-5 hours +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 600 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/acl/off] +tests = ['posixmode'] +tags = ['functional', 'acl'] + +[tests/functional/alloc_class] +tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', + 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', + 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', + 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', + 'alloc_class_013_pos'] +tags = ['functional', 'alloc_class'] + +[tests/functional/arc] +tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos', + 'arcstats_runtime_tuning'] +tags = ['functional', 'arc'] + +[tests/functional/atime] +tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] +tags = ['functional', 'atime'] + +[tests/functional/bootfs] +tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', + 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', + 'bootfs_008_pos'] +tags = ['functional', 'bootfs'] + +[tests/functional/btree] +tests = ['btree_positive', 'btree_negative'] +tags = ['functional', 'btree'] +pre = +post = + +[tests/functional/cache] +tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', + 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', + 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] +tags = ['functional', 'cache'] + +[tests/functional/cachefile] +tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', + 'cachefile_004_pos'] +tags = ['functional', 'cachefile'] + +[tests/functional/casenorm] +tests 
= ['case_all_values', 'norm_all_values', 'mixed_create_failure', + 'sensitive_none_lookup', 'sensitive_none_delete', + 'sensitive_formd_lookup', 'sensitive_formd_delete', + 'insensitive_none_lookup', 'insensitive_none_delete', + 'insensitive_formd_lookup', 'insensitive_formd_delete', + 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', + 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] +tags = ['functional', 'casenorm'] + +[tests/functional/channel_program/lua_core] +tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', + 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', + 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', + 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', + 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', + 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', + 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] +tags = ['functional', 'channel_program', 'lua_core'] + +[tests/functional/channel_program/synctask_core] +tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', + 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', + 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', + 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', + 'tst.list_children', 'tst.list_clones', 'tst.list_holds', + 'tst.list_snapshots', 'tst.list_system_props', + 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', + 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', + 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', + 'tst.snapshot_recursive', 'tst.snapshot_simple', + 'tst.bookmark.create', 'tst.bookmark.copy', + 'tst.terminate_by_signal' + ] +tags = ['functional', 'channel_program', 'synctask_core'] + +[tests/functional/checksum] +tests = ['run_sha2_test', 'run_skein_test', 'filetest_001_pos', + 
'filetest_002_pos'] +tags = ['functional', 'checksum'] + +[tests/functional/clean_mirror] +tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', + 'clean_mirror_003_pos', 'clean_mirror_004_pos'] +tags = ['functional', 'clean_mirror'] + +[tests/functional/cli_root/zdb] +tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', + 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', + 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', + 'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos', + 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2'] +pre = +post = +tags = ['functional', 'cli_root', 'zdb'] + +[tests/functional/cli_root/zfs] +tests = ['zfs_001_neg', 'zfs_002_pos'] +tags = ['functional', 'cli_root', 'zfs'] + +[tests/functional/cli_root/zfs_bookmark] +tests = ['zfs_bookmark_cliargs'] +tags = ['functional', 'cli_root', 'zfs_bookmark'] + +[tests/functional/cli_root/zfs_change-key] +tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', + 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', + 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] +tags = ['functional', 'cli_root', 'zfs_change-key'] + +[tests/functional/cli_root/zfs_clone] +tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', + 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', + 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', + 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] +tags = ['functional', 'cli_root', 'zfs_clone'] + +[tests/functional/cli_root/zfs_copies] +tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', + 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] +tags = ['functional', 'cli_root', 'zfs_copies'] + +[tests/functional/cli_root/zfs_create] +tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', + 'zfs_create_004_pos', 'zfs_create_005_pos', 
'zfs_create_006_pos', + 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', + 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', + 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', + 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount', + 'zfs_create_verbose'] +tags = ['functional', 'cli_root', 'zfs_create'] + +[tests/functional/cli_root/zfs_destroy] +tests = ['zfs_clone_livelist_condense_and_disable', + 'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup', + 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', + 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', + 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', + 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', + 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', + 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', + 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] +tags = ['functional', 'cli_root', 'zfs_destroy'] + +[tests/functional/cli_root/zfs_diff] +tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', + 'zfs_diff_types', 'zfs_diff_encrypted'] +tags = ['functional', 'cli_root', 'zfs_diff'] + +[tests/functional/cli_root/zfs_get] +tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', + 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', + 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] +tags = ['functional', 'cli_root', 'zfs_get'] + +[tests/functional/cli_root/zfs_ids_to_path] +tests = ['zfs_ids_to_path_001_pos'] +tags = ['functional', 'cli_root', 'zfs_ids_to_path'] + +[tests/functional/cli_root/zfs_inherit] +tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', + 'zfs_inherit_mountpoint'] +tags = ['functional', 'cli_root', 'zfs_inherit'] + +[tests/functional/cli_root/zfs_load-key] +tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 
+ 'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop', + 'zfs_load-key_recursive'] +tags = ['functional', 'cli_root', 'zfs_load-key'] + +[tests/functional/cli_root/zfs_mount] +tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', + 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', + 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', + 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', + 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_test_race'] +tags = ['functional', 'cli_root', 'zfs_mount'] + +[tests/functional/cli_root/zfs_program] +tests = ['zfs_program_json'] +tags = ['functional', 'cli_root', 'zfs_program'] + +[tests/functional/cli_root/zfs_promote] +tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', + 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', + 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] +tags = ['functional', 'cli_root', 'zfs_promote'] + +[tests/functional/cli_root/zfs_property] +tests = ['zfs_written_property_001_pos'] +tags = ['functional', 'cli_root', 'zfs_property'] + +[tests/functional/cli_root/zfs_receive] +tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', + 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', + 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', + 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', + 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', + 'zfs_receive_016_pos', 'receive-o-x_props_override', + 'receive-o-x_props_aliases', + 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', + 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', + 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] +tags = ['functional', 'cli_root', 'zfs_receive'] + +[tests/functional/cli_root/zfs_rename] +tests = 
['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', + 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', + 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', + 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', + 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', + 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] +tags = ['functional', 'cli_root', 'zfs_rename'] + +[tests/functional/cli_root/zfs_reservation] +tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] +tags = ['functional', 'cli_root', 'zfs_reservation'] + +[tests/functional/cli_root/zfs_rollback] +tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', + 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] +tags = ['functional', 'cli_root', 'zfs_rollback'] + +[tests/functional/cli_root/zfs_send] +tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', + 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', + 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', + 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing'] +tags = ['functional', 'cli_root', 'zfs_send'] + +[tests/functional/cli_root/zfs_set] +tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', + 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', + 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', + 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', + 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', + 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', + 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', + 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', + 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', + 'zfs_set_feature_activation'] +tags = ['functional', 'cli_root', 'zfs_set'] + +[tests/functional/cli_root/zfs_share] +tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 
'zfs_share_003_pos', + 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', + 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] +tags = ['functional', 'cli_root', 'zfs_share'] + +[tests/functional/cli_root/zfs_snapshot] +tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', + 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', + 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', + 'zfs_snapshot_009_pos'] +tags = ['functional', 'cli_root', 'zfs_snapshot'] + +[tests/functional/cli_root/zfs_unload-key] +tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] +tags = ['functional', 'cli_root', 'zfs_unload-key'] + +[tests/functional/cli_root/zfs_unmount] +tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', + 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', + 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', + 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] +tags = ['functional', 'cli_root', 'zfs_unmount'] + +[tests/functional/cli_root/zfs_unshare] +tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', + 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', + 'zfs_unshare_007_pos'] +tags = ['functional', 'cli_root', 'zfs_unshare'] + +[tests/functional/cli_root/zfs_upgrade] +tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', + 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', + 'zfs_upgrade_007_neg'] +tags = ['functional', 'cli_root', 'zfs_upgrade'] + +[tests/functional/cli_root/zfs_wait] +tests = ['zfs_wait_deleteq'] +tags = ['functional', 'cli_root', 'zfs_wait'] + +[tests/functional/cli_root/zpool] +tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] +tags = ['functional', 'cli_root', 'zpool'] + +[tests/functional/cli_root/zpool_add] +tests = ['zpool_add_001_pos', 
'zpool_add_002_pos', 'zpool_add_003_pos', + 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', + 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] +tags = ['functional', 'cli_root', 'zpool_add'] + +[tests/functional/cli_root/zpool_attach] +tests = ['zpool_attach_001_neg', 'attach-o_ashift'] +tags = ['functional', 'cli_root', 'zpool_attach'] + +[tests/functional/cli_root/zpool_clear] +tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', + 'zpool_clear_readonly'] +tags = ['functional', 'cli_root', 'zpool_clear'] + +[tests/functional/cli_root/zpool_create] +tests = ['zpool_create_001_pos', 'zpool_create_002_pos', + 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', + 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', + 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', + 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', + 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', + 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', + 'zpool_create_023_neg', 'zpool_create_024_pos', + 'zpool_create_encrypted', 'zpool_create_crypt_combos', + 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', + 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', + 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', + 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', + 'zpool_create_features_005_pos', 'zpool_create_features_006_pos', + 'zpool_create_features_007_pos', 'zpool_create_features_008_pos', + 'zpool_create_features_009_pos', 'create-o_ashift', + 'zpool_create_tempname', 'zpool_create_dryrun_output'] +tags = ['functional', 'cli_root', 'zpool_create'] + +[tests/functional/cli_root/zpool_destroy] +tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', + 'zpool_destroy_003_neg'] +pre = +post = +tags = 
['functional', 'cli_root', 'zpool_destroy'] + +[tests/functional/cli_root/zpool_detach] +tests = ['zpool_detach_001_neg'] +tags = ['functional', 'cli_root', 'zpool_detach'] + +[tests/functional/cli_root/zpool_events] +tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', + 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates', + 'zpool_events_clear_retained'] +tags = ['functional', 'cli_root', 'zpool_events'] + +[tests/functional/cli_root/zpool_export] +tests = ['zpool_export_001_pos', 'zpool_export_002_pos', + 'zpool_export_003_neg', 'zpool_export_004_pos'] +tags = ['functional', 'cli_root', 'zpool_export'] + +[tests/functional/cli_root/zpool_get] +tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', + 'zpool_get_004_neg', 'zpool_get_005_pos'] +tags = ['functional', 'cli_root', 'zpool_get'] + +[tests/functional/cli_root/zpool_history] +tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] +tags = ['functional', 'cli_root', 'zpool_history'] + +[tests/functional/cli_root/zpool_import] +tests = ['zpool_import_001_pos', 'zpool_import_002_pos', + 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', + 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', + 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', + 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', + 'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos', + 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', + 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', + 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', + 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', + 'zpool_import_encrypted', 'zpool_import_encrypted_load', + 'zpool_import_errata3', 'zpool_import_errata4', + 'import_cachefile_device_added', + 'import_cachefile_device_removed', + 'import_cachefile_device_replaced', + 
'import_cachefile_mirror_attached', + 'import_cachefile_mirror_detached', + 'import_cachefile_paths_changed', + 'import_cachefile_shared_device', + 'import_devices_missing', + 'import_paths_changed', + 'import_rewind_config_changed', + 'import_rewind_device_replaced'] +tags = ['functional', 'cli_root', 'zpool_import'] +timeout = 1200 + +[tests/functional/cli_root/zpool_labelclear] +tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', + 'zpool_labelclear_removed', 'zpool_labelclear_valid'] +pre = +post = +tags = ['functional', 'cli_root', 'zpool_labelclear'] + +[tests/functional/cli_root/zpool_initialize] +tests = ['zpool_initialize_attach_detach_add_remove', + 'zpool_initialize_fault_export_import_online', + 'zpool_initialize_import_export', + 'zpool_initialize_offline_export_import_online', + 'zpool_initialize_online_offline', + 'zpool_initialize_split', + 'zpool_initialize_start_and_cancel_neg', + 'zpool_initialize_start_and_cancel_pos', + 'zpool_initialize_suspend_resume', + 'zpool_initialize_unsupported_vdevs', + 'zpool_initialize_verify_checksums', + 'zpool_initialize_verify_initialized'] +pre = +tags = ['functional', 'cli_root', 'zpool_initialize'] + +[tests/functional/cli_root/zpool_offline] +tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', + 'zpool_offline_003_pos'] +tags = ['functional', 'cli_root', 'zpool_offline'] + +[tests/functional/cli_root/zpool_online] +tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] +tags = ['functional', 'cli_root', 'zpool_online'] + +[tests/functional/cli_root/zpool_remove] +tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', + 'zpool_remove_003_pos'] +tags = ['functional', 'cli_root', 'zpool_remove'] + +[tests/functional/cli_root/zpool_replace] +tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] +tags = ['functional', 'cli_root', 'zpool_replace'] + +[tests/functional/cli_root/zpool_resilver] +tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] +tags = 
['functional', 'cli_root', 'zpool_resilver'] + +[tests/functional/cli_root/zpool_scrub] +tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', + 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', + 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', + 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] +tags = ['functional', 'cli_root', 'zpool_scrub'] + +[tests/functional/cli_root/zpool_set] +tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', + 'zpool_set_ashift', 'zpool_set_features'] +tags = ['functional', 'cli_root', 'zpool_set'] + +[tests/functional/cli_root/zpool_split] +tests = ['zpool_split_cliargs', 'zpool_split_devices', + 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', + 'zpool_split_resilver', 'zpool_split_indirect', + 'zpool_split_dryrun_output'] +tags = ['functional', 'cli_root', 'zpool_split'] + +[tests/functional/cli_root/zpool_status] +tests = ['zpool_status_001_pos', 'zpool_status_002_pos', + 'zpool_status_features_001_pos'] +tags = ['functional', 'cli_root', 'zpool_status'] + +[tests/functional/cli_root/zpool_sync] +tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] +tags = ['functional', 'cli_root', 'zpool_sync'] + +[tests/functional/cli_root/zpool_trim] +tests = ['zpool_trim_attach_detach_add_remove', + 'zpool_trim_fault_export_import_online', + 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', + 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', + 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', + 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', + 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', + 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', + 'zpool_trim_verify_trimmed'] +tags = ['functional', 'zpool_trim'] + +[tests/functional/cli_root/zpool_upgrade] +tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', + 'zpool_upgrade_003_pos', 
'zpool_upgrade_004_pos', + 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', + 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', + 'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos'] +tags = ['functional', 'cli_root', 'zpool_upgrade'] + +[tests/functional/cli_root/zpool_wait] +tests = ['zpool_wait_discard', 'zpool_wait_freeing', + 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', + 'zpool_wait_initialize_flag', 'zpool_wait_multiple', + 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', + 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', + 'zpool_wait_usage'] +tags = ['functional', 'cli_root', 'zpool_wait'] + +[tests/functional/cli_root/zpool_wait/scan] +tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', + 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', + 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] +tags = ['functional', 'cli_root', 'zpool_wait'] + +[tests/functional/cli_user/misc] +tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', + 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', + 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', + 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', + 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', + 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', + 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', + 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', + 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', + 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', + 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', + 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', + 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', + 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', + 'arc_summary_001_pos', 
'arc_summary_002_neg', 'zpool_wait_privilege'] +user = +tags = ['functional', 'cli_user', 'misc'] + +[tests/functional/cli_user/zfs_list] +tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', + 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] +user = +tags = ['functional', 'cli_user', 'zfs_list'] + +[tests/functional/cli_user/zpool_iostat] +tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', + 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', + 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] +user = +tags = ['functional', 'cli_user', 'zpool_iostat'] + +[tests/functional/cli_user/zpool_list] +tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] +user = +tags = ['functional', 'cli_user', 'zpool_list'] + +[tests/functional/cli_user/zpool_status] +tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', + 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] +user = +tags = ['functional', 'cli_user', 'zpool_status'] + +[tests/functional/compression] +tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', + 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', + 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] +tags = ['functional', 'compression'] + +[tests/functional/cp_files] +tests = ['cp_files_001_pos'] +tags = ['functional', 'cp_files'] + +[tests/functional/crtime] +tests = ['crtime_001_pos' ] +tags = ['functional', 'crtime'] + +[tests/functional/ctime] +tests = ['ctime_001_pos' ] +tags = ['functional', 'ctime'] + +[tests/functional/deadman] +tests = ['deadman_ratelimit', 'deadman_sync', 'deadman_zio'] +pre = +post = +tags = ['functional', 'deadman'] + +[tests/functional/delegate] +tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', + 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', + 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', + 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', + 
'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', + 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', + 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] +tags = ['functional', 'delegate'] + +[tests/functional/exec] +tests = ['exec_001_pos', 'exec_002_neg'] +tags = ['functional', 'exec'] + +[tests/functional/fallocate] +tests = ['fallocate_punch-hole'] +tags = ['functional', 'fallocate'] + +[tests/functional/features/async_destroy] +tests = ['async_destroy_001_pos'] +tags = ['functional', 'features', 'async_destroy'] + +[tests/functional/features/large_dnode] +tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', + 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] +tags = ['functional', 'features', 'large_dnode'] + +[tests/functional/grow] +pre = +post = +tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] +tags = ['functional', 'grow'] + +[tests/functional/history] +tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', + 'history_004_pos', 'history_005_neg', 'history_006_neg', + 'history_007_pos', 'history_008_pos', 'history_009_pos', + 'history_010_pos'] +tags = ['functional', 'history'] + +[tests/functional/hkdf] +tests = ['run_hkdf_test'] +tags = ['functional', 'hkdf'] + +[tests/functional/inheritance] +tests = ['inherit_001_pos'] +pre = +tags = ['functional', 'inheritance'] + +[tests/functional/io] +tests = ['sync', 'psync', 'posixaio', 'mmap'] +tags = ['functional', 'io'] + +[tests/functional/inuse] +tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] +post = +tags = ['functional', 'inuse'] + +[tests/functional/large_files] +tests = ['large_files_001_pos', 'large_files_002_pos'] +tags = ['functional', 'large_files'] + +[tests/functional/largest_pool] +tests = ['largest_pool_001_pos'] +pre = +post = +tags = ['functional', 'largest_pool'] + +[tests/functional/limits] +tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', + 
'snapshot_limit'] +tags = ['functional', 'limits'] + +[tests/functional/link_count] +tests = ['link_count_001', 'link_count_root_inode'] +tags = ['functional', 'link_count'] + +[tests/functional/migration] +tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', + 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', + 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', + 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] +tags = ['functional', 'migration'] + +[tests/functional/mmap] +tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos'] +tags = ['functional', 'mmap'] + +[tests/functional/mount] +tests = ['umount_001', 'umountall_001'] +tags = ['functional', 'mount'] + +[tests/functional/mv_files] +tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] +tags = ['functional', 'mv_files'] + +[tests/functional/nestedfs] +tests = ['nestedfs_001_pos'] +tags = ['functional', 'nestedfs'] + +[tests/functional/no_space] +tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', + 'enospc_df'] +tags = ['functional', 'no_space'] + +[tests/functional/nopwrite] +tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', + 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', + 'nopwrite_varying_compression', 'nopwrite_volume'] +tags = ['functional', 'nopwrite'] + +[tests/functional/online_offline] +tests = ['online_offline_001_pos', 'online_offline_002_neg', + 'online_offline_003_neg'] +tags = ['functional', 'online_offline'] + +[tests/functional/pool_checkpoint] +tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', + 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', + 'checkpoint_discard_busy', 'checkpoint_discard_many', + 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', + 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', + 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', + 'checkpoint_vdev_add', 
'checkpoint_zdb', 'checkpoint_zhack_feat'] +tags = ['functional', 'pool_checkpoint'] +timeout = 1800 + +[tests/functional/pool_names] +tests = ['pool_names_001_pos', 'pool_names_002_neg'] +pre = +post = +tags = ['functional', 'pool_names'] + +[tests/functional/poolversion] +tests = ['poolversion_001_pos', 'poolversion_002_pos'] +tags = ['functional', 'poolversion'] + +[tests/functional/pyzfs] +tests = ['pyzfs_unittest'] +pre = +post = +tags = ['functional', 'pyzfs'] + +[tests/functional/quota] +tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', + 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] +tags = ['functional', 'quota'] + +[tests/functional/redacted_send] +tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', + 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', + 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', + 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', + 'redacted_origin', 'redacted_panic', 'redacted_props', 'redacted_resume', + 'redacted_size', 'redacted_volume'] +tags = ['functional', 'redacted_send'] + +[tests/functional/raidz] +tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] +tags = ['functional', 'raidz'] + +[tests/functional/redundancy] +tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', + 'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1', + 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', + 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', + 'redundancy_raidz3', 'redundancy_stripe'] +tags = ['functional', 'redundancy'] +timeout = 1200 + +[tests/functional/refquota] +tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', + 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', + 'refquota_007_neg', 'refquota_008_neg'] +tags = ['functional', 'refquota'] + +[tests/functional/refreserv] +tests = ['refreserv_001_pos', 
'refreserv_002_pos', 'refreserv_003_pos', + 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', + 'refreserv_raidz'] +tags = ['functional', 'refreserv'] + +[tests/functional/removal] +pre = +tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', + 'removal_condense_export', 'removal_multiple_indirection', + 'removal_nopwrite', 'removal_remap_deadlists', + 'removal_resume_export', 'removal_sanity', 'removal_with_add', + 'removal_with_create_fs', 'removal_with_dedup', + 'removal_with_errors', 'removal_with_export', + 'removal_with_ganging', 'removal_with_faulted', + 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', + 'removal_with_send_recv', 'removal_with_snapshot', + 'removal_with_write', 'removal_with_zdb', 'remove_expanded', + 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', + 'remove_indirect', 'remove_attach_mirror'] +tags = ['functional', 'removal'] + +[tests/functional/rename_dirs] +tests = ['rename_dirs_001_pos'] +tags = ['functional', 'rename_dirs'] + +[tests/functional/replacement] +tests = ['attach_import', 'attach_multiple', 'attach_rebuild', + 'attach_resilver', 'detach', 'rebuild_disabled_feature', + 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', + 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', + 'scrub_cancel'] +tags = ['functional', 'replacement'] + +[tests/functional/reservation] +tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', + 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', + 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', + 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', + 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', + 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', + 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', + 'reservation_022_pos'] +tags = ['functional', 'reservation'] + 
+[tests/functional/rootpool] +tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] +tags = ['functional', 'rootpool'] + +[tests/functional/rsend] +tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', + 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', + 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', + 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', + 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', + 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', + 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', + 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', + 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', + 'send-c_mixed_compression', 'send-c_stream_size_estimate', + 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', + 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', + 'send_encrypted_props', 'send_encrypted_truncated_files', + 'send_freeobjects', 'send_realloc_files', + 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', + 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', + 'send_partial_dataset', 'send_invalid', 'send_doall'] +tags = ['functional', 'rsend'] + +[tests/functional/scrub_mirror] +tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', + 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] +tags = ['functional', 'scrub_mirror'] + +[tests/functional/slog] +tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', + 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', + 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', + 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', + 'slog_replay_fs_002', 'slog_replay_volume'] +tags = ['functional', 'slog'] + +[tests/functional/snapshot] +tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', + 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', + 
'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', + 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', + 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', + 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', + 'snapshot_017_pos'] +tags = ['functional', 'snapshot'] + +[tests/functional/snapused] +tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', + 'snapused_004_pos', 'snapused_005_pos'] +tags = ['functional', 'snapused'] + +[tests/functional/sparse] +tests = ['sparse_001_pos'] +tags = ['functional', 'sparse'] + +[tests/functional/suid] +tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', + 'suid_write_to_none'] +tags = ['functional', 'suid'] + +[tests/functional/threadsappend] +tests = ['threadsappend_001_pos'] +tags = ['functional', 'threadsappend'] + +[tests/functional/trim] +tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', + 'trim_integrity', 'trim_config', 'trim_l2arc'] +tags = ['functional', 'trim'] + +[tests/functional/truncate] +tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] +tags = ['functional', 'truncate'] + +[tests/functional/upgrade] +tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] +tags = ['functional', 'upgrade'] + +[tests/functional/userquota] +tests = [ + 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', + 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', + 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', + 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', + 'userspace_001_pos', 'userspace_002_pos', 'userspace_encrypted'] +tags = ['functional', 'userquota'] + +[tests/functional/vdev_zaps] +tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', + 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', + 'vdev_zaps_007_pos'] +tags = ['functional', 'vdev_zaps'] + +[tests/functional/write_dirs] +tests = ['write_dirs_001_pos', 
'write_dirs_002_pos'] +tags = ['functional', 'write_dirs'] + +[tests/functional/xattr] +tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', + 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', + 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos'] +tags = ['functional', 'xattr'] + +[tests/functional/zvol/zvol_ENOSPC] +tests = ['zvol_ENOSPC_001_pos'] +tags = ['functional', 'zvol', 'zvol_ENOSPC'] + +[tests/functional/zvol/zvol_cli] +tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] +tags = ['functional', 'zvol', 'zvol_cli'] + +[tests/functional/zvol/zvol_misc] +tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', + 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] +tags = ['functional', 'zvol', 'zvol_misc'] + +[tests/functional/zvol/zvol_swap] +tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] +tags = ['functional', 'zvol', 'zvol_swap'] + +[tests/functional/libzfs] +tests = ['many_fds', 'libzfs_input'] +tags = ['functional', 'libzfs'] + +[tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] + +[tests/functional/l2arc] +tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos', + 'persist_l2arc_001_pos', 'persist_l2arc_002_pos', + 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos'] +tags = ['functional', 'l2arc'] + +[tests/functional/zpool_influxdb] +tests = ['zpool_influxdb'] +tags = ['functional', 'zpool_influxdb'] diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run new file mode 100644 index 0000000000..153b204b49 --- /dev/null +++ b/tests/runfiles/freebsd.run @@ -0,0 +1,31 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 600 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/acl/off:FreeBSD] +tests = ['dosmode'] +tags = ['functional', 'acl'] + +[tests/functional/cli_root/zfs_jail:FreeBSD] +tests = ['zfs_jail_001_pos'] +tags = ['functional', 'cli_root', 'zfs_jail'] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 8219cf42b1..01e1f79e58 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -17,906 +17,158 @@ user = root timeout = 600 post_user = root post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] -[tests/functional/acl/posix] -tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos'] +[tests/functional/acl/posix:Linux] +tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix'] -[tests/functional/alloc_class] -tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', - 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', - 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', - 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', - 'alloc_class_013_pos'] -tags = ['functional', 'alloc_class'] +[tests/functional/acl/posix-sa:Linux] +tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] +tags = ['functional', 'acl', 'posix-sa'] -[tests/functional/arc] -tests = ['dbufstats_001_pos', 'dbufstats_002_pos'] -tags = ['functional', 'arc'] - -[tests/functional/atime] -tests = ['atime_001_pos', 'atime_002_neg', 'atime_003_pos', 'root_atime_off', - 
'root_atime_on', 'root_relatime_on'] +[tests/functional/atime:Linux] +tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] -[tests/functional/bootfs] -tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', - 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', - 'bootfs_008_pos'] -tags = ['functional', 'bootfs'] - -[tests/functional/cache] -tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', - 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', - 'cache_009_pos', 'cache_010_neg', 'cache_011_pos'] -tags = ['functional', 'cache'] - -[tests/functional/cachefile] -tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', - 'cachefile_004_pos'] -tags = ['functional', 'cachefile'] - -[tests/functional/casenorm] -tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', - 'sensitive_none_lookup', 'sensitive_none_delete', - 'sensitive_formd_lookup', 'sensitive_formd_delete', - 'insensitive_none_lookup', 'insensitive_none_delete', - 'insensitive_formd_lookup', 'insensitive_formd_delete', - 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', - 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] -tags = ['functional', 'casenorm'] - -[tests/functional/channel_program/lua_core] -tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', - 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', - 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', - 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', - 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', - 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', - 'tst.return_recursive_table', 'tst.timeout'] -tags = ['functional', 'channel_program', 'lua_core'] - -[tests/functional/channel_program/synctask_core] -tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', - 'tst.get_index_props', 
'tst.get_mountpoint', 'tst.get_neg', - 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', - 'tst.get_userquota', 'tst.get_written', 'tst.list_children', - 'tst.list_clones', 'tst.list_snapshots', 'tst.list_system_props', - 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', - 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', - 'tst.rollback_one', 'tst.snapshot_destroy', 'tst.snapshot_neg', - 'tst.snapshot_recursive', 'tst.snapshot_simple'] -tags = ['functional', 'channel_program', 'synctask_core'] - -[tests/functional/chattr] +[tests/functional/chattr:Linux] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] -[tests/functional/checksum] -tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', - 'filetest_001_pos'] +[tests/functional/checksum:Linux] +tests = ['run_edonr_test'] tags = ['functional', 'checksum'] -[tests/functional/clean_mirror] -tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', - 'clean_mirror_003_pos', 'clean_mirror_004_pos'] -tags = ['functional', 'clean_mirror'] - -[tests/functional/cli_root/zdb] -tests = ['zdb_001_neg', 'zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', - 'zdb_005_pos', 'zdb_006_pos'] -pre = -post = -tags = ['functional', 'cli_root', 'zdb'] - -[tests/functional/cli_root/zfs] -tests = ['zfs_001_neg', 'zfs_002_pos', 'zfs_003_neg'] +[tests/functional/cli_root/zfs:Linux] +tests = ['zfs_003_neg'] tags = ['functional', 'cli_root', 'zfs'] -[tests/functional/cli_root/zfs_bookmark] -tests = ['zfs_bookmark_cliargs'] -tags = ['functional', 'cli_root', 'zfs_bookmark'] - -[tests/functional/cli_root/zfs_change-key] -tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', - 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', - 'zfs_change-key_pbkdf2iters'] -tags = ['functional', 'cli_root', 'zfs_change-key'] - -[tests/functional/cli_root/zfs_clone] -tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', - 
'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', - 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', - 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] -tags = ['functional', 'cli_root', 'zfs_clone'] - -[tests/functional/cli_root/zfs_copies] -tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', - 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] -tags = ['functional', 'cli_root', 'zfs_copies'] - -[tests/functional/cli_root/zfs_create] -tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', - 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', - 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', - 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', - 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', - 'zfs_create_crypt_combos'] -tags = ['functional', 'cli_root', 'zfs_create'] - -[tests/functional/cli_root/zfs_destroy] -tests = ['zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', - 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', - 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', - 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', - 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', - 'zfs_destroy_016_pos'] -tags = ['functional', 'cli_root', 'zfs_destroy'] - -[tests/functional/cli_root/zfs_diff] -tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', - 'zfs_diff_types', 'zfs_diff_encrypted'] -tags = ['functional', 'cli_root', 'zfs_diff'] - -[tests/functional/cli_root/zfs_get] -tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', - 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', - 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] -tags = ['functional', 'cli_root', 'zfs_get'] - -[tests/functional/cli_root/zfs_inherit] -tests = 
['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', - 'zfs_inherit_mountpoint'] -tags = ['functional', 'cli_root', 'zfs_inherit'] - -[tests/functional/cli_root/zfs_load-key] -tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', - 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] -tags = ['functional', 'cli_root', 'zfs_load-key'] - -[tests/functional/cli_root/zfs_mount] -tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', - 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_006_pos', - 'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg', - 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg', - 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', - 'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints'] +[tests/functional/cli_root/zfs_mount:Linux] +tests = ['zfs_mount_006_pos', 'zfs_mount_008_pos', 'zfs_mount_013_pos', + 'zfs_mount_014_neg', 'zfs_multi_mount'] tags = ['functional', 'cli_root', 'zfs_mount'] -[tests/functional/cli_root/zfs_program] -tests = ['zfs_program_json'] -tags = ['functional', 'cli_root', 'zfs_program'] - -[tests/functional/cli_root/zfs_promote] -tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', - 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', - 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] -tags = ['functional', 'cli_root', 'zfs_promote'] - -[tests/functional/cli_root/zfs_property] -tests = ['zfs_written_property_001_pos'] -tags = ['functional', 'cli_root', 'zfs_property'] - -[tests/functional/cli_root/zfs_receive] -tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', - 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', - 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', - 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', - 'zfs_receive_013_pos', 
'zfs_receive_014_pos', 'zfs_receive_015_pos', - 'receive-o-x_props_override', 'zfs_receive_from_encrypted', - 'zfs_receive_to_encrypted', 'zfs_receive_raw', - 'zfs_receive_raw_incremental', 'zfs_receive_-e'] -tags = ['functional', 'cli_root', 'zfs_receive'] - -[tests/functional/cli_root/zfs_remap] -tests = ['zfs_remap_cliargs', 'zfs_remap_obsolete_counts'] -tags = ['functional', 'cli_root', 'zfs_remap'] - -[tests/functional/cli_root/zfs_rename] -tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', - 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', - 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', - 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', - 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', - 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint'] -tags = ['functional', 'cli_root', 'zfs_rename'] - -[tests/functional/cli_root/zfs_reservation] -tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] -tags = ['functional', 'cli_root', 'zfs_reservation'] - -[tests/functional/cli_root/zfs_rollback] -tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', - 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] -tags = ['functional', 'cli_root', 'zfs_rollback'] - -[tests/functional/cli_root/zfs_send] -tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', - 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', - 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', - 'zfs_send_sparse', 'zfs_send-b'] -tags = ['functional', 'cli_root', 'zfs_send'] - -[tests/functional/cli_root/zfs_set] -tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', - 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', - 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', - 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', - 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', - 'user_property_001_pos', 
'user_property_003_neg', 'readonly_001_pos', - 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', - 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', - 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation'] -tags = ['functional', 'cli_root', 'zfs_set'] - -[tests/functional/cli_root/zfs_share] -tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', - 'zfs_share_004_pos', 'zfs_share_005_pos', 'zfs_share_006_pos', - 'zfs_share_007_neg', 'zfs_share_008_neg', 'zfs_share_009_neg', - 'zfs_share_010_neg', 'zfs_share_011_pos'] +[tests/functional/cli_root/zfs_share:Linux] +tests = ['zfs_share_005_pos', 'zfs_share_007_neg', 'zfs_share_009_neg', + 'zfs_share_012_pos', 'zfs_share_013_pos'] tags = ['functional', 'cli_root', 'zfs_share'] -[tests/functional/cli_root/zfs_snapshot] -tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', - 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', - 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', - 'zfs_snapshot_009_pos'] -tags = ['functional', 'cli_root', 'zfs_snapshot'] - -[tests/functional/cli_root/zfs_sysfs] -tests = ['zfeature_set_unsupported.ksh', 'zfs_get_unsupported', - 'zfs_set_unsupported', 'zfs_sysfs_live.ksh', 'zpool_get_unsupported', +[tests/functional/cli_root/zfs_sysfs:Linux] +tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', + 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', 'zpool_set_unsupported'] tags = ['functional', 'cli_root', 'zfs_sysfs'] -[tests/functional/cli_root/zfs_unload-key] -tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] -tags = ['functional', 'cli_root', 'zfs_unload-key'] - -[tests/functional/cli_root/zfs_unmount] -tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', - 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', - 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', - 
'zfs_unmount_all_001_pos', 'zfs_unmount_nested'] -tags = ['functional', 'cli_root', 'zfs_unmount'] - -[tests/functional/cli_root/zfs_unshare] -tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', - 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', - 'zfs_unshare_007_pos'] -tags = ['functional', 'cli_root', 'zfs_unshare'] - -[tests/functional/cli_root/zfs_upgrade] -tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', - 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', - 'zfs_upgrade_007_neg'] -tags = ['functional', 'cli_root', 'zfs_upgrade'] - -[tests/functional/cli_root/zpool] -tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos'] -tags = ['functional', 'cli_root', 'zpool'] - -[tests/functional/cli_root/zpool_add] -tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', - 'zpool_add_004_pos', 'zpool_add_005_pos', 'zpool_add_006_pos', - 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', - 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'add_nested_replacing_spare'] +[tests/functional/cli_root/zpool_add:Linux] +tests = ['add_nested_replacing_spare'] tags = ['functional', 'cli_root', 'zpool_add'] -[tests/functional/cli_root/zpool_attach] -tests = ['zpool_attach_001_neg', 'attach-o_ashift'] -tags = ['functional', 'cli_root', 'zpool_attach'] - -[tests/functional/cli_root/zpool_clear] -tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', - 'zpool_clear_readonly'] -tags = ['functional', 'cli_root', 'zpool_clear'] - -[tests/functional/cli_root/zpool_create] -tests = ['zpool_create_001_pos', 'zpool_create_002_pos', - 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', - 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', - 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', - 'zpool_create_012_neg', 'zpool_create_014_neg', - 'zpool_create_015_neg', 
'zpool_create_016_pos', 'zpool_create_017_neg', - 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', - 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', - 'zpool_create_024_pos', - 'zpool_create_encrypted', 'zpool_create_crypt_combos', - 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', - 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', - 'zpool_create_features_005_pos', - 'create-o_ashift', 'zpool_create_tempname'] -tags = ['functional', 'cli_root', 'zpool_create'] - -[tests/functional/cli_root/zpool_destroy] -tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', - 'zpool_destroy_003_neg'] -pre = -post = -tags = ['functional', 'cli_root', 'zpool_destroy'] - -[tests/functional/cli_root/zpool_detach] -tests = ['zpool_detach_001_neg'] -tags = ['functional', 'cli_root', 'zpool_detach'] - -[tests/functional/cli_root/zpool_events] -tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', - 'zpool_events_poolname', 'zpool_events_errors'] -tags = ['functional', 'cli_root', 'zpool_events'] - -[tests/functional/cli_root/zpool_expand] +[tests/functional/cli_root/zpool_expand:Linux] tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] -[tests/functional/cli_root/zpool_export] -tests = ['zpool_export_001_pos', 'zpool_export_002_pos', - 'zpool_export_003_neg', 'zpool_export_004_pos'] -tags = ['functional', 'cli_root', 'zpool_export'] - -[tests/functional/cli_root/zpool_get] -tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', - 'zpool_get_004_neg'] -tags = ['functional', 'cli_root', 'zpool_get'] - -[tests/functional/cli_root/zpool_history] -tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] -tags = ['functional', 'cli_root', 'zpool_history'] - -[tests/functional/cli_root/zpool_import] -tests = ['zpool_import_001_pos', 
'zpool_import_002_pos', - 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', - 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', - 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', - 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', - 'zpool_import_015_pos', - 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', - 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', - 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', - 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', - 'zpool_import_encrypted', 'zpool_import_encrypted_load', - 'zpool_import_errata3', 'zpool_import_errata4', - 'import_cachefile_device_added', - 'import_cachefile_device_removed', - 'import_cachefile_device_replaced', - 'import_cachefile_mirror_attached', - 'import_cachefile_mirror_detached', - 'import_cachefile_shared_device', - 'import_devices_missing', - 'import_paths_changed', - 'import_rewind_config_changed', - 'import_rewind_device_replaced'] -tags = ['functional', 'cli_root', 'zpool_import'] - -[tests/functional/cli_root/zpool_labelclear] -tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', - 'zpool_labelclear_removed', 'zpool_labelclear_valid'] -pre = -post = -tags = ['functional', 'cli_root', 'zpool_labelclear'] - -[tests/functional/cli_root/zpool_initialize] -tests = ['zpool_initialize_attach_detach_add_remove', - 'zpool_initialize_import_export', - 'zpool_initialize_offline_export_import_online', - 'zpool_initialize_online_offline', - 'zpool_initialize_split', - 'zpool_initialize_start_and_cancel_neg', - 'zpool_initialize_start_and_cancel_pos', - 'zpool_initialize_suspend_resume', - 'zpool_initialize_unsupported_vdevs', - 'zpool_initialize_verify_checksums', - 'zpool_initialize_verify_initialized'] -pre = -tags = ['functional', 'cli_root', 'zpool_initialize'] - -[tests/functional/cli_root/zpool_offline] -tests = ['zpool_offline_001_pos', 
'zpool_offline_002_neg', - 'zpool_offline_003_pos'] -tags = ['functional', 'cli_root', 'zpool_offline'] - -[tests/functional/cli_root/zpool_online] -tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] -tags = ['functional', 'cli_root', 'zpool_online'] - -[tests/functional/cli_root/zpool_remove] -tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', - 'zpool_remove_003_pos'] -tags = ['functional', 'cli_root', 'zpool_remove'] - -[tests/functional/cli_root/zpool_reopen] +[tests/functional/cli_root/zpool_reopen:Linux] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] tags = ['functional', 'cli_root', 'zpool_reopen'] -[tests/functional/cli_root/zpool_replace] -tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] -tags = ['functional', 'cli_root', 'zpool_replace'] - -[tests/functional/cli_root/zpool_resilver] -tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] -tags = ['functional', 'cli_root', 'zpool_resilver'] - -[tests/functional/cli_root/zpool_scrub] -tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', - 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', - 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', - 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] -tags = ['functional', 'cli_root', 'zpool_scrub'] - -[tests/functional/cli_root/zpool_set] -tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', - 'zpool_set_ashift', 'zpool_set_features'] -tags = ['functional', 'cli_root', 'zpool_set'] - -[tests/functional/cli_root/zpool_split] -tests = ['zpool_split_cliargs', 'zpool_split_devices', - 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', - 'zpool_split_resilver', 'zpool_split_wholedisk'] +[tests/functional/cli_root/zpool_split:Linux] +tests = ['zpool_split_wholedisk'] tags = ['functional', 'cli_root', 'zpool_split'] 
-[tests/functional/cli_root/zpool_status] -tests = ['zpool_status_001_pos', 'zpool_status_002_pos','zpool_status_003_pos', - 'zpool_status_-c_disable', 'zpool_status_-c_homedir', - 'zpool_status_-c_searchpath'] -user = -tags = ['functional', 'cli_root', 'zpool_status'] - -[tests/functional/cli_root/zpool_sync] -tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] -tags = ['functional', 'cli_root', 'zpool_sync'] - -[tests/functional/cli_root/zpool_trim] -tests = ['zpool_trim_attach_detach_add_remove', - 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', - 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', - 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', - 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', - 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', - 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', - 'zpool_trim_verify_trimmed'] -tags = ['functional', 'zpool_trim'] - -[tests/functional/cli_root/zpool_upgrade] -tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', - 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', - 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', - 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', - 'zpool_upgrade_009_neg'] -tags = ['functional', 'cli_root', 'zpool_upgrade'] - -[tests/functional/cli_user/misc] -tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', - 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', - 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', - 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', - 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', - 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', - 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', - 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', - 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', - 
'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', - 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', - 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', - 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', - 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', - 'arc_summary_001_pos', 'arc_summary_002_neg', 'dbufstat_001_pos'] -user = -tags = ['functional', 'cli_user', 'misc'] - -[tests/functional/cli_user/zfs_list] -tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', - 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] -user = -tags = ['functional', 'cli_user', 'zfs_list'] - -[tests/functional/cli_user/zpool_iostat] -tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', - 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', - 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] -user = -tags = ['functional', 'cli_user', 'zpool_iostat'] - -[tests/functional/cli_user/zpool_list] -tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] -user = -tags = ['functional', 'cli_user', 'zpool_list'] - -[tests/functional/compression] -tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', - 'compress_004_pos'] +[tests/functional/compression:Linux] +tests = ['compress_004_pos'] tags = ['functional', 'compression'] -[tests/functional/cp_files] -tests = ['cp_files_001_pos'] -tags = ['functional', 'cp_files'] - -[tests/functional/ctime] -tests = ['ctime_001_pos' ] -tags = ['functional', 'ctime'] - -[tests/functional/deadman] -tests = ['deadman_sync', 'deadman_zio'] -pre = -post = -tags = ['functional', 'deadman'] - -[tests/functional/delegate] -tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', - 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', - 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', - 'zfs_allow_010_pos', 'zfs_allow_011_neg', 
'zfs_allow_012_neg', - 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', - 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', - 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] -tags = ['functional', 'delegate'] - -[tests/functional/devices] +[tests/functional/devices:Linux] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 'devices'] -[tests/functional/events] -tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter'] +[tests/functional/events:Linux] +tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill'] tags = ['functional', 'events'] -[tests/functional/exec] -tests = ['exec_001_pos', 'exec_002_neg'] -tags = ['functional', 'exec'] +[tests/functional/fallocate:Linux] +tests = ['fallocate_prealloc'] +tags = ['functional', 'fallocate'] -[tests/functional/fault] -tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos', - 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_ashift', - 'auto_spare_multiple', 'auto_spare_shared', 'scrub_after_resilver', - 'decrypt_fault', 'decompress_fault', 'zpool_status_-s'] +[tests/functional/fault:Linux] +tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', + 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', + 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', + 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', + 'zpool_status_-s'] tags = ['functional', 'fault'] -[tests/functional/features/async_destroy] -tests = ['async_destroy_001_pos'] -tags = ['functional', 'features', 'async_destroy'] - -[tests/functional/features/large_dnode] -tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos', - 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos', - 'large_dnode_007_neg', 'large_dnode_008_pos', 'large_dnode_009_pos'] +[tests/functional/features/large_dnode:Linux] +tests = ['large_dnode_002_pos', 
'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] -[tests/functional/grow] -pre = -post = -tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] -tags = ['functional', 'grow'] - -[tests/functional/history] -tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', - 'history_004_pos', 'history_005_neg', 'history_006_neg', - 'history_007_pos', 'history_008_pos', 'history_009_pos', - 'history_010_pos'] -tags = ['functional', 'history'] - -[tests/functional/hkdf] -tests = ['run_hkdf_test'] -tags = ['functional', 'hkdf'] - -[tests/functional/inheritance] -tests = ['inherit_001_pos'] -pre = -tags = ['functional', 'inheritance'] - -[tests/functional/io] -tests = ['sync', 'psync', 'libaio', 'posixaio', 'mmap'] +[tests/functional/io:Linux] +tests = ['libaio', 'io_uring'] tags = ['functional', 'io'] -[tests/functional/inuse] -tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_004_pos', - 'inuse_005_pos', 'inuse_006_pos', 'inuse_007_pos', 'inuse_008_pos', - 'inuse_009_pos'] -post = -tags = ['functional', 'inuse'] - -[tests/functional/large_files] -tests = ['large_files_001_pos', 'large_files_002_pos'] -tags = ['functional', 'large_files'] - -[tests/functional/largest_pool] -tests = ['largest_pool_001_pos'] -pre = -post = -tags = ['functional', 'largest_pool'] - -[tests/functional/limits] -tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', - 'snapshot_limit'] -tags = ['functional', 'limits'] - -[tests/functional/link_count] -tests = ['link_count_001'] -tags = ['functional', 'link_count'] - -[tests/functional/migration] -tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', - 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', - 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', - 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] -tags = ['functional', 'migration'] - -[tests/functional/mmap] -tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 
'mmap_libaio_001_pos'] +[tests/functional/mmap:Linux] +tests = ['mmap_libaio_001_pos'] tags = ['functional', 'mmap'] -[tests/functional/mmp] +[tests/functional/mmp:Linux] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] tags = ['functional', 'mmp'] -[tests/functional/mount] -tests = ['umount_001', 'umount_unlinked_drain', 'umountall_001'] +[tests/functional/mount:Linux] +tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] -[tests/functional/mv_files] -tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] -tags = ['functional', 'mv_files'] +[tests/functional/pam:Linux] +tests = ['pam_basic', 'pam_nounmount'] +tags = ['functional', 'pam'] -[tests/functional/nestedfs] -tests = ['nestedfs_001_pos'] -tags = ['functional', 'nestedfs'] - -[tests/functional/no_space] -tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', - 'enospc_df'] -tags = ['functional', 'no_space'] - -[tests/functional/nopwrite] -tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', - 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', - 'nopwrite_varying_compression', 'nopwrite_volume'] -tags = ['functional', 'nopwrite'] - -[tests/functional/online_offline] -tests = ['online_offline_001_pos', 'online_offline_002_neg', - 'online_offline_003_neg'] -tags = ['functional', 'online_offline'] - -[tests/functional/pool_checkpoint] -tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', - 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', - 'checkpoint_discard_busy', 'checkpoint_discard_many', - 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', - 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', - 'checkpoint_ro_rewind', 
'checkpoint_sm_scale', 'checkpoint_twice', - 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] -tags = ['functional', 'pool_checkpoint'] -timeout = 1800 - -[tests/functional/pool_names] -tests = ['pool_names_001_pos', 'pool_names_002_neg'] -pre = -post = -tags = ['functional', 'pool_names'] - -[tests/functional/poolversion] -tests = ['poolversion_001_pos', 'poolversion_002_pos'] -tags = ['functional', 'poolversion'] - -[tests/functional/privilege] -tests = ['privilege_001_pos', 'privilege_002_pos'] -tags = ['functional', 'privilege'] - -[tests/functional/procfs] +[tests/functional/procfs:Linux] tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', 'procfs_list_stale_read', 'pool_state'] tags = ['functional', 'procfs'] -[tests/functional/projectquota] +[tests/functional/projectquota:Linux] tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', 'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos', 'projectquota_004_neg', 'projectquota_005_pos', 'projectquota_006_pos', 'projectquota_007_pos', 'projectquota_008_pos', 'projectquota_009_pos', 'projectspace_001_pos', 'projectspace_002_pos', 'projectspace_003_pos', 'projectspace_004_pos', - 'projecttree_001_pos', 'projecttree_002_pos', 'projecttree_003_neg' ] + 'projecttree_001_pos', 'projecttree_002_pos', 'projecttree_003_neg'] tags = ['functional', 'projectquota'] -[tests/functional/pyzfs] -tests = ['pyzfs_unittest'] -pre = -post = -tags = ['functional', 'pyzfs'] - -[tests/functional/quota] -tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', - 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] -tags = ['functional', 'quota'] - -[tests/functional/raidz] -tests = ['raidz_001_neg', 'raidz_002_pos'] -tags = ['functional', 'raidz'] - -[tests/functional/redundancy] -tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', - 'redundancy_004_neg'] -tags = ['functional', 'redundancy'] - -[tests/functional/refquota] -tests = 
['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', - 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg'] -tags = ['functional', 'refquota'] - -[tests/functional/refreserv] -tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', - 'refreserv_004_pos', 'refreserv_005_pos'] -tags = ['functional', 'refreserv'] - -[tests/functional/removal] -pre = -tests = ['removal_all_vdev', 'removal_check_space', - 'removal_condense_export', 'removal_multiple_indirection', - 'removal_remap', 'removal_remap_deadlists', - 'removal_resume_export', 'removal_sanity', 'removal_with_add', - 'removal_with_create_fs', 'removal_with_dedup', - 'removal_with_errors', 'removal_with_export', - 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remap', - 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', - 'removal_with_send_recv', 'removal_with_snapshot', - 'removal_with_write', 'removal_with_zdb', 'remove_expanded', - 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz'] -tags = ['functional', 'removal'] - -[tests/functional/rename_dirs] -tests = ['rename_dirs_001_pos'] -tags = ['functional', 'rename_dirs'] - -[tests/functional/replacement] -tests = ['replacement_001_pos', 'replacement_002_pos', 'replacement_003_pos'] -tags = ['functional', 'replacement'] - -[tests/functional/reservation] -tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', - 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', - 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', - 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', - 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', - 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', - 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', - 'reservation_022_pos'] -tags = ['functional', 'reservation'] - -[tests/functional/rootpool] -tests = ['rootpool_002_neg', 'rootpool_003_neg', 
'rootpool_007_pos'] -tags = ['functional', 'rootpool'] - -[tests/functional/rsend] -tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', - 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', - 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', - 'rsend_013_pos', 'rsend_014_pos', - 'rsend_019_pos', 'rsend_020_pos', - 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', - 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', - 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', - 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', - 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD', - 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', - 'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_hierarchy', - 'send_encrypted_props', 'send_encrypted_truncated_files', - 'send_freeobjects', 'send_realloc_dnode_size', 'send_realloc_files', - 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', - 'send_hole_birth', 'send_mixed_raw', 'send-wDR_encrypted_zvol'] +[tests/functional/rsend:Linux] +tests = ['send_realloc_dnode_size', 'send_encrypted_files'] tags = ['functional', 'rsend'] -[tests/functional/scrub_mirror] -tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', - 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] -tags = ['functional', 'scrub_mirror'] - -[tests/functional/slog] -tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', - 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', - 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', - 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs', - 'slog_replay_volume'] -tags = ['functional', 'slog'] - -[tests/functional/snapshot] -tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', - 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', - 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', - 'snapshot_006_pos', 
'snapshot_007_pos', 'snapshot_008_pos', - 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', - 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', - 'snapshot_015_pos', 'snapshot_016_pos', 'snapshot_017_pos'] +[tests/functional/snapshot:Linux] +tests = ['snapshot_015_pos', 'snapshot_016_pos'] tags = ['functional', 'snapshot'] -[tests/functional/snapused] -tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', - 'snapused_004_pos', 'snapused_005_pos'] -tags = ['functional', 'snapused'] - -[tests/functional/sparse] -tests = ['sparse_001_pos'] -tags = ['functional', 'sparse'] - -[tests/functional/threadsappend] -tests = ['threadsappend_001_pos'] -tags = ['functional', 'threadsappend'] - -[tests/functional/tmpfile] -tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos'] +[tests/functional/tmpfile:Linux] +tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos', + 'tmpfile_stat_mode'] tags = ['functional', 'tmpfile'] -[tests/functional/trim] -tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', - 'trim_integrity', 'trim_config'] -tags = ['functional', 'trim'] - -[tests/functional/truncate] -tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] -tags = ['functional', 'truncate'] - -[tests/functional/upgrade] -tests = ['upgrade_userobj_001_pos', 'upgrade_projectquota_001_pos', - 'upgrade_readonly_pool'] +[tests/functional/upgrade:Linux] +tests = ['upgrade_projectquota_001_pos'] tags = ['functional', 'upgrade'] -[tests/functional/user_namespace] +[tests/functional/user_namespace:Linux] tests = ['user_namespace_001'] tags = ['functional', 'user_namespace'] -[tests/functional/userquota] -tests = [ - 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', - 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', - 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', - 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', - 'userquota_013_pos', - 
'userspace_001_pos', 'userspace_002_pos', 'userspace_003_pos', - 'groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos' ] +[tests/functional/userquota:Linux] +tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', + 'userquota_013_pos', 'userspace_003_pos'] tags = ['functional', 'userquota'] - -[tests/functional/vdev_zaps] -tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', - 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', - 'vdev_zaps_007_pos'] -tags = ['functional', 'vdev_zaps'] - -[tests/functional/write_dirs] -tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] -tags = ['functional', 'write_dirs'] - -[tests/functional/xattr] -tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', - 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_008_pos', - 'xattr_009_neg', 'xattr_010_neg', 'xattr_011_pos', 'xattr_012_pos', - 'xattr_013_pos'] -tags = ['functional', 'xattr'] - -[tests/functional/zvol/zvol_ENOSPC] -tests = ['zvol_ENOSPC_001_pos'] -tags = ['functional', 'zvol', 'zvol_ENOSPC'] - -[tests/functional/zvol/zvol_cli] -tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] -tags = ['functional', 'zvol', 'zvol_cli'] - -[tests/functional/zvol/zvol_misc] -tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg', - 'zvol_misc_004_pos', 'zvol_misc_005_neg', 'zvol_misc_006_pos', - 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', - 'zvol_misc_volmode', 'zvol_misc_zil'] -tags = ['functional', 'zvol', 'zvol_misc'] - -[tests/functional/zvol/zvol_swap] -tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos', - 'zvol_swap_004_pos', 'zvol_swap_005_pos', 'zvol_swap_006_pos'] -tags = ['functional', 'zvol', 'zvol_swap'] - -[tests/functional/libzfs] -tests = ['many_fds', 'libzfs_input'] -tags = ['functional', 'libzfs'] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run new file mode 100644 index 
0000000000..2a9b4adf5f --- /dev/null +++ b/tests/runfiles/sanity.run @@ -0,0 +1,623 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# This run file contains a subset of functional tests which exercise +# as much functionality as possible while still executing relatively +# quickly. The included tests should take no more than a few seconds +# each to run. This provides a convenient way to sanity test a +# change before committing to a full test run which takes several hours. +# +# Approximate run time: 15 minutes +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 180 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/acl/off] +tests = ['posixmode'] +tags = ['functional', 'acl'] + +[tests/functional/alloc_class] +tests = ['alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', + 'alloc_class_006_pos', 'alloc_class_008_pos', 'alloc_class_010_pos', + 'alloc_class_011_neg'] +tags = ['functional', 'alloc_class'] + +[tests/functional/arc] +tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'arcstats_runtime_tuning'] +tags = ['functional', 'arc'] + +[tests/functional/bootfs] +tests = ['bootfs_004_neg', 'bootfs_007_pos'] +tags = ['functional', 'bootfs'] + +[tests/functional/cache] +tests = ['cache_004_neg', 'cache_005_neg', 'cache_007_neg', 'cache_010_pos'] +tags = ['functional', 'cache'] + +[tests/functional/cachefile] +tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', + 'cachefile_004_pos'] +tags = ['functional',
'cachefile'] + +[tests/functional/casenorm] +tests = ['case_all_values', 'norm_all_values', 'sensitive_none_lookup', + 'sensitive_none_delete', 'insensitive_none_lookup', + 'insensitive_none_delete', 'mixed_none_lookup', 'mixed_none_delete'] +tags = ['functional', 'casenorm'] + +[tests/functional/channel_program/lua_core] +tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', + 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', + 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', + 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', + 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', + 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', + 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] +tags = ['functional', 'channel_program', 'lua_core'] + +[tests/functional/channel_program/synctask_core] +tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', + 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', + 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', + 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', + 'tst.list_children', 'tst.list_clones', 'tst.list_holds', + 'tst.list_snapshots', 'tst.list_system_props', + 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', + 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', + 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', + 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple', + 'tst.bookmark.create', 'tst.bookmark.copy'] +tags = ['functional', 'channel_program', 'synctask_core'] + +[tests/functional/cli_root/zdb] +tests = ['zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos'] +pre = +post = +tags = ['functional', 'cli_root', 'zdb'] + +[tests/functional/cli_root/zfs] +tests = ['zfs_001_neg', 'zfs_002_pos'] +tags = ['functional', 'cli_root', 'zfs'] + +[tests/functional/cli_root/zfs_bookmark] +tests = 
['zfs_bookmark_cliargs'] +tags = ['functional', 'cli_root', 'zfs_bookmark'] + +[tests/functional/cli_root/zfs_change-key] +tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', + 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', + 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] +tags = ['functional', 'cli_root', 'zfs_change-key'] + +[tests/functional/cli_root/zfs_clone] +tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', + 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', + 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', + 'zfs_clone_encrypted'] +tags = ['functional', 'cli_root', 'zfs_clone'] + +[tests/functional/cli_root/zfs_create] +tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', + 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', + 'zfs_create_007_pos', 'zfs_create_011_pos', 'zfs_create_012_pos', + 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', + 'zfs_create_dryrun', 'zfs_create_verbose'] +tags = ['functional', 'cli_root', 'zfs_create'] + +[tests/functional/cli_root/zfs_destroy] +tests = ['zfs_destroy_002_pos', 'zfs_destroy_003_pos', + 'zfs_destroy_004_pos', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', + 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', + 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', + 'zfs_destroy_014_pos', 'zfs_destroy_dev_removal', + 'zfs_destroy_dev_removal_condense'] +tags = ['functional', 'cli_root', 'zfs_destroy'] + +[tests/functional/cli_root/zfs_diff] +tests = ['zfs_diff_cliargs', 'zfs_diff_encrypted'] +tags = ['functional', 'cli_root', 'zfs_diff'] + +[tests/functional/cli_root/zfs_get] +tests = ['zfs_get_003_pos', 'zfs_get_006_neg', 'zfs_get_007_neg', + 'zfs_get_010_neg'] +tags = ['functional', 'cli_root', 'zfs_get'] + +[tests/functional/cli_root/zfs_inherit] +tests = ['zfs_inherit_001_neg', 'zfs_inherit_003_pos', 
'zfs_inherit_mountpoint'] +tags = ['functional', 'cli_root', 'zfs_inherit'] + +[tests/functional/cli_root/zfs_load-key] +tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', + 'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop', + 'zfs_load-key_recursive'] +tags = ['functional', 'cli_root', 'zfs_load-key'] + +[tests/functional/cli_root/zfs_mount] +tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', + 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', + 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', + 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', + 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] +tags = ['functional', 'cli_root', 'zfs_mount'] + +[tests/functional/cli_root/zfs_program] +tests = ['zfs_program_json'] +tags = ['functional', 'cli_root', 'zfs_program'] + +[tests/functional/cli_root/zfs_promote] +tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', + 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', + 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] +tags = ['functional', 'cli_root', 'zfs_promote'] + +[tests/functional/cli_root/zfs_receive] +tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', + 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', + 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', + 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', + 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', + 'zfs_receive_016_pos', 'zfs_receive_from_encrypted', + 'zfs_receive_to_encrypted', 'zfs_receive_raw', + 'zfs_receive_raw_incremental', 'zfs_receive_-e', + 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] +tags = ['functional', 'cli_root', 'zfs_receive'] + +[tests/functional/cli_root/zfs_rename] +tests = ['zfs_rename_003_pos', 'zfs_rename_004_neg', + 
'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', + 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', + 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', + 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', + 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] +tags = ['functional', 'cli_root', 'zfs_rename'] + +[tests/functional/cli_root/zfs_reservation] +tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] +tags = ['functional', 'cli_root', 'zfs_reservation'] + +[tests/functional/cli_root/zfs_rollback] +tests = ['zfs_rollback_003_neg', 'zfs_rollback_004_neg'] +tags = ['functional', 'cli_root', 'zfs_rollback'] + +[tests/functional/cli_root/zfs_send] +tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', + 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_encrypted', + 'zfs_send_raw'] +tags = ['functional', 'cli_root', 'zfs_send'] + +[tests/functional/cli_root/zfs_set] +tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', + 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', + 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', + 'mountpoint_002_pos', 'user_property_002_pos', + 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', + 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', + 'user_property_004_pos', 'version_001_neg', + 'zfs_set_003_neg', 'property_alias_001_pos', + 'zfs_set_keylocation', 'zfs_set_feature_activation'] +tags = ['functional', 'cli_root', 'zfs_set'] + +[tests/functional/cli_root/zfs_snapshot] +tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', + 'zfs_snapshot_003_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg'] +tags = ['functional', 'cli_root', 'zfs_snapshot'] + +[tests/functional/cli_root/zfs_unload-key] +tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] +tags = ['functional', 'cli_root', 'zfs_unload-key'] + +[tests/functional/cli_root/zfs_unmount] +tests = 
['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', + 'zfs_unmount_004_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', + 'zfs_unmount_009_pos', 'zfs_unmount_unload_keys'] +tags = ['functional', 'cli_root', 'zfs_unmount'] + +[tests/functional/cli_root/zfs_upgrade] +tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_006_neg', + 'zfs_upgrade_007_neg'] +tags = ['functional', 'cli_root', 'zfs_upgrade'] + +[tests/functional/cli_root/zfs_wait] +tests = ['zfs_wait_deleteq'] +tags = ['functional', 'cli_root', 'zfs_wait'] + +[tests/functional/cli_root/zpool] +tests = ['zpool_001_neg', 'zpool_003_pos', 'zpool_colors'] +tags = ['functional', 'cli_root', 'zpool'] + +[tests/functional/cli_root/zpool_add] +tests = ['zpool_add_002_pos', 'zpool_add_003_pos', + 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', + 'zpool_add_008_neg', 'zpool_add_009_neg'] +tags = ['functional', 'cli_root', 'zpool_add'] + +[tests/functional/cli_root/zpool_attach] +tests = ['zpool_attach_001_neg'] +tags = ['functional', 'cli_root', 'zpool_attach'] + +[tests/functional/cli_root/zpool_clear] +tests = ['zpool_clear_002_neg'] +tags = ['functional', 'cli_root', 'zpool_clear'] + +[tests/functional/cli_root/zpool_create] +tests = ['zpool_create_001_pos', 'zpool_create_002_pos', + 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_007_neg', + 'zpool_create_008_pos', 'zpool_create_010_neg', 'zpool_create_011_neg', + 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', + 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', + 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', + 'zpool_create_encrypted', + 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', + 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', + 'zpool_create_features_005_pos'] +tags = ['functional', 'cli_root', 'zpool_create'] + +[tests/functional/cli_root/zpool_destroy] +tests = 
['zpool_destroy_001_pos', 'zpool_destroy_002_pos', + 'zpool_destroy_003_neg'] +pre = +post = +tags = ['functional', 'cli_root', 'zpool_destroy'] + +[tests/functional/cli_root/zpool_detach] +tests = ['zpool_detach_001_neg'] +tags = ['functional', 'cli_root', 'zpool_detach'] + +[tests/functional/cli_root/zpool_events] +tests = ['zpool_events_clear', 'zpool_events_follow', 'zpool_events_poolname'] +tags = ['functional', 'cli_root', 'zpool_events'] + +[tests/functional/cli_root/zpool_export] +tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg'] +tags = ['functional', 'cli_root', 'zpool_export'] + +[tests/functional/cli_root/zpool_get] +tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', + 'zpool_get_004_neg', 'zpool_get_005_pos'] +tags = ['functional', 'cli_root', 'zpool_get'] + +[tests/functional/cli_root/zpool_history] +tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] +tags = ['functional', 'cli_root', 'zpool_history'] + +[tests/functional/cli_root/zpool_import] +tests = ['zpool_import_003_pos', 'zpool_import_010_pos', 'zpool_import_011_neg', + 'zpool_import_014_pos', 'zpool_import_features_001_pos', + 'zpool_import_all_001_pos', 'zpool_import_encrypted'] +tags = ['functional', 'cli_root', 'zpool_import'] + +[tests/functional/cli_root/zpool_labelclear] +tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', + 'zpool_labelclear_removed', 'zpool_labelclear_valid'] +pre = +post = +tags = ['functional', 'cli_root', 'zpool_labelclear'] + +[tests/functional/cli_root/zpool_initialize] +tests = ['zpool_initialize_online_offline'] +pre = +tags = ['functional', 'cli_root', 'zpool_initialize'] + +[tests/functional/cli_root/zpool_offline] +tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg'] +tags = ['functional', 'cli_root', 'zpool_offline'] + +[tests/functional/cli_root/zpool_online] +tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] +tags = ['functional', 'cli_root', 'zpool_online'] + 
+[tests/functional/cli_root/zpool_remove] +tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', + 'zpool_remove_003_pos'] +tags = ['functional', 'cli_root', 'zpool_remove'] + +[tests/functional/cli_root/zpool_replace] +tests = ['zpool_replace_001_neg'] +tags = ['functional', 'cli_root', 'zpool_replace'] + +[tests/functional/cli_root/zpool_resilver] +tests = ['zpool_resilver_bad_args'] +tags = ['functional', 'cli_root', 'zpool_resilver'] + +[tests/functional/cli_root/zpool_scrub] +tests = ['zpool_scrub_001_neg', 'zpool_scrub_003_pos', + 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', + 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] +tags = ['functional', 'cli_root', 'zpool_scrub'] + +[tests/functional/cli_root/zpool_set] +tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', + 'zpool_set_ashift', 'zpool_set_features'] +tags = ['functional', 'cli_root', 'zpool_set'] + +[tests/functional/cli_root/zpool_split] +tests = ['zpool_split_cliargs', 'zpool_split_devices', + 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_indirect'] +tags = ['functional', 'cli_root', 'zpool_split'] + +[tests/functional/cli_root/zpool_status] +tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] +tags = ['functional', 'cli_root', 'zpool_status'] + +[tests/functional/cli_root/zpool_sync] +tests = ['zpool_sync_002_neg'] +tags = ['functional', 'cli_root', 'zpool_sync'] + +[tests/functional/cli_root/zpool_trim] +tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_neg', + 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', + 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', + 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos'] +tags = ['functional', 'zpool_trim'] + +[tests/functional/cli_root/zpool_upgrade] +tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_003_pos', + 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', + 'zpool_upgrade_009_neg'] +tags = ['functional', 'cli_root', 
'zpool_upgrade'] + +[tests/functional/cli_root/zpool_wait] +tests = ['zpool_wait_no_activity', 'zpool_wait_usage'] +tags = ['functional', 'cli_root', 'zpool_wait'] + +[tests/functional/cli_root/zpool_wait/scan] +tests = ['zpool_wait_scrub_flag'] +tags = ['functional', 'cli_root', 'zpool_wait'] + +[tests/functional/cli_user/misc] +tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', + 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', + 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', + 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', + 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', + 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', + 'zfs_unmount_001_neg', 'zfs_upgrade_001_neg', + 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', + 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', + 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', + 'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', + 'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', + 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', + 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] +user = +tags = ['functional', 'cli_user', 'misc'] + +[tests/functional/cli_user/zpool_iostat] +tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', + 'zpool_iostat_-c_disable', + 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] +user = +tags = ['functional', 'cli_user', 'zpool_iostat'] + +[tests/functional/cli_user/zpool_list] +tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] +user = +tags = ['functional', 'cli_user', 'zpool_list'] + +[tests/functional/compression] +tests = ['compress_003_pos','compress_zstd_bswap'] +tags = ['functional', 'compression'] + +[tests/functional/exec] +tests = ['exec_001_pos', 'exec_002_neg'] +tags = ['functional', 'exec'] + 
+[tests/functional/features/large_dnode] +tests = ['large_dnode_003_pos', 'large_dnode_004_neg', + 'large_dnode_005_pos', 'large_dnode_007_neg'] +tags = ['functional', 'features', 'large_dnode'] + +[tests/functional/grow] +pre = +post = +tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] +tags = ['functional', 'grow'] + +[tests/functional/history] +tests = ['history_004_pos', 'history_005_neg', 'history_006_neg', + 'history_007_pos', 'history_008_pos', 'history_009_pos'] +tags = ['functional', 'history'] + +[tests/functional/hkdf] +tests = ['run_hkdf_test'] +tags = ['functional', 'hkdf'] + +[tests/functional/inuse] +tests = ['inuse_004_pos', 'inuse_005_pos'] +post = +tags = ['functional', 'inuse'] + +[tests/functional/large_files] +tests = ['large_files_001_pos', 'large_files_002_pos'] +tags = ['functional', 'large_files'] + +[tests/functional/libzfs] +tests = ['many_fds', 'libzfs_input'] +tags = ['functional', 'libzfs'] + +[tests/functional/limits] +tests = ['filesystem_count', 'snapshot_count'] +tags = ['functional', 'limits'] + +[tests/functional/link_count] +tests = ['link_count_root_inode'] +tags = ['functional', 'link_count'] + +[tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] + +[tests/functional/migration] +tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', + 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', + 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', + 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] +tags = ['functional', 'migration'] + +[tests/functional/mmap] +tests = ['mmap_read_001_pos'] +tags = ['functional', 'mmap'] + +[tests/functional/nestedfs] +tests = ['nestedfs_001_pos'] +tags = ['functional', 'nestedfs'] + +[tests/functional/nopwrite] +tests = ['nopwrite_sync', 'nopwrite_volume'] +tags = ['functional', 'nopwrite'] + +[tests/functional/pool_checkpoint] +tests = ['checkpoint_conf_change', 
'checkpoint_discard_many', + 'checkpoint_removal', 'checkpoint_sm_scale', 'checkpoint_twice'] +tags = ['functional', 'pool_checkpoint'] +timeout = 1800 + +[tests/functional/poolversion] +tests = ['poolversion_001_pos', 'poolversion_002_pos'] +tags = ['functional', 'poolversion'] + +[tests/functional/redacted_send] +tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', + 'redacted_disabled_feature', 'redacted_incrementals', + 'redacted_largeblocks', 'redacted_mixed_recsize', 'redacted_negative', + 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size'] +tags = ['functional', 'redacted_send'] + +[tests/functional/raidz] +tests = ['raidz_001_neg'] +tags = ['functional', 'raidz'] + +[tests/functional/refquota] +tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', + 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', + 'refquota_007_neg'] +tags = ['functional', 'refquota'] + +[tests/functional/refreserv] +tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', + 'refreserv_005_pos', 'refreserv_multi_raidz'] +tags = ['functional', 'refreserv'] + +[tests/functional/removal] +pre = +tests = ['removal_all_vdev', 'removal_sanity', 'removal_with_dedup', + 'removal_with_ganging', 'removal_with_faulted'] +tags = ['functional', 'removal'] + +[tests/functional/replacement] +tests = ['rebuild_raidz'] +tags = ['functional', 'replacement'] + +[tests/functional/reservation] +tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', + 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', + 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', + 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', + 'reservation_014_pos', 'reservation_015_pos', + 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', + 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', + 'reservation_022_pos'] +tags = ['functional', 'reservation'] 
+ +[tests/functional/rsend] +tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', + 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', + 'rsend_006_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', + 'rsend_014_pos', 'rsend_016_neg', 'send-c_verify_contents', + 'send-c_volume', 'send-c_zstreamdump', 'send-c_recv_dedup', + 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', + 'send_encrypted_truncated_files', 'send_freeobjects', 'send_holds', + 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', + 'send_invalid'] +tags = ['functional', 'rsend'] + +[tests/functional/scrub_mirror] +tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos'] +tags = ['functional', 'scrub_mirror'] + +[tests/functional/slog] +tests = ['slog_008_neg', 'slog_009_neg', 'slog_010_neg'] +tags = ['functional', 'slog'] + +[tests/functional/snapshot] +tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', + 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', + 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', + 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', + 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', + 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', + 'snapshot_017_pos'] +tags = ['functional', 'snapshot'] + +[tests/functional/snapused] +tests = ['snapused_002_pos', 'snapused_004_pos', 'snapused_005_pos'] +tags = ['functional', 'snapused'] + +[tests/functional/sparse] +tests = ['sparse_001_pos'] +tags = ['functional', 'sparse'] + +[tests/functional/suid] +tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', + 'suid_write_to_none'] +tags = ['functional', 'suid'] + +[tests/functional/threadsappend] +tests = ['threadsappend_001_pos'] +tags = ['functional', 'threadsappend'] + +[tests/functional/truncate] +tests = ['truncate_001_pos', 'truncate_002_pos'] +tags = ['functional', 'truncate'] + +[tests/functional/upgrade] +tests = 
['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] +tags = ['functional', 'upgrade'] + +[tests/functional/vdev_zaps] +tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', + 'vdev_zaps_005_pos', 'vdev_zaps_006_pos'] +tags = ['functional', 'vdev_zaps'] + +[tests/functional/xattr] +tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', + 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', + 'xattr_011_pos', 'xattr_013_pos'] +tags = ['functional', 'xattr'] + +[tests/functional/zvol/zvol_ENOSPC] +tests = ['zvol_ENOSPC_001_pos'] +tags = ['functional', 'zvol', 'zvol_ENOSPC'] + +[tests/functional/zvol/zvol_cli] +tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] +tags = ['functional', 'zvol', 'zvol_cli'] + +[tests/functional/zvol/zvol_swap] +tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos'] +tags = ['functional', 'zvol', 'zvol_swap'] + +[tests/functional/zpool_influxdb] +tests = ['zpool_influxdb'] +tags = ['functional', 'zpool_influxdb'] diff --git a/tests/runfiles/sunos.run b/tests/runfiles/sunos.run new file mode 100644 index 0000000000..9ba00f452e --- /dev/null +++ b/tests/runfiles/sunos.run @@ -0,0 +1,53 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 600 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/inuse:illumos] +tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_006_pos', 'inuse_007_pos'] +post = +tags = ['functional', 'inuse'] + +[tests/functional/cli_root/zpool_add:illumos] +tests = ['zpool_add_005_pos'] +tags = ['functional', 'cli_root', 'zpool_add'] + +[tests/functional/cli_root/zpool_create:illumos] +tests = ['zpool_create_016_pos'] +tags = ['functional', 'cli_root', 'zpool_create'] + +[tests/functional/privilege] +tests = ['privilege_001_pos', 'privilege_002_pos'] +tags = ['functional', 'privilege'] + +[tests/functional/xattr:illumos] +tests = ['xattr_008_pos', 'xattr_009_neg', 'xattr_010_neg'] +tags = ['functional', 'xattr'] + +[tests/functional/zvol/zvol_misc:illumos] +tests = ['zvol_misc_001_neg', 'zvol_misc_003_neg', 'zvol_misc_004_pos', + 'zvol_misc_005_neg', 'zvol_misc_006_pos'] +tags = ['functional', 'zvol', 'zvol_misc'] + +[tests/functional/zvol/zvol_swap:illumos] +tests = ['zvol_swap_003_pos', 'zvol_swap_005_pos', 'zvol_swap_006_pos'] +tags = ['functional', 'zvol', 'zvol_swap'] diff --git a/tests/test-runner/bin/.gitignore b/tests/test-runner/bin/.gitignore new file mode 100644 index 0000000000..ff7e2f8fcc --- /dev/null +++ b/tests/test-runner/bin/.gitignore @@ -0,0 +1,2 @@ +test-runner.py +zts-report.py diff --git a/tests/test-runner/bin/Makefile.am b/tests/test-runner/bin/Makefile.am index 30c564e555..e11e55fffd 100644 --- a/tests/test-runner/bin/Makefile.am +++ b/tests/test-runner/bin/Makefile.am @@ -1,15 +1,8 @@ +include $(top_srcdir)/config/Substfiles.am + pkgdatadir = $(datadir)/@PACKAGE@/test-runner/bin -dist_pkgdata_SCRIPTS = \ +pkgdata_SCRIPTS = \ test-runner.py \ zts-report.py -# -# These scripts are compatibile with both Python 2.6 and 3.4. 
As such the -# python 3 shebang can be replaced at install time when targeting a python -# 2 system. This allows us to maintain a single version of the source. -# -if USING_PYTHON_2 -install-data-hook: - sed --in-place 's|^#!/usr/bin/python3|#!/usr/bin/python2|' \ - $(DESTDIR)$(pkgdatadir)/test-runner.py \ - $(DESTDIR)$(pkgdatadir)/zts-report.py -endif + +SUBSTFILES += $(pkgdata_SCRIPTS) diff --git a/tests/test-runner/bin/test-runner.py b/tests/test-runner/bin/test-runner.py.in similarity index 79% rename from tests/test-runner/bin/test-runner.py rename to tests/test-runner/bin/test-runner.py.in index 4d4fd96ad7..bbabf247c1 100755 --- a/tests/test-runner/bin/test-runner.py +++ b/tests/test-runner/bin/test-runner.py.in @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env @PYTHON_SHEBANG@ # # This file and its contents are supplied under the terms of the @@ -155,9 +155,10 @@ class Output(object): class Cmd(object): verified_users = [] - def __init__(self, pathname, outputdir=None, timeout=None, user=None, - tags=None): + def __init__(self, pathname, identifier=None, outputdir=None, + timeout=None, user=None, tags=None): self.pathname = pathname + self.identifier = identifier self.outputdir = outputdir or 'BASEDIR' """ The timeout for tests is measured in wall-clock time @@ -172,8 +173,13 @@ class Cmd(object): self.timeout = 60 def __str__(self): - return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nUser: %s\n" % \ - (self.pathname, self.outputdir, self.timeout, self.user) + return '''\ +Pathname: %s +Identifier: %s +Outputdir: %s +Timeout: %d +User: %s +''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user) def kill_cmd(self, proc, keyboard_interrupt=False): """ @@ -302,12 +308,12 @@ class Cmd(object): self.result.runtime = '%02d:%02d' % (m, s) self.result.result = 'SKIP' - def log(self, options): + def log(self, options, suppress_console=False): """ This function is responsible for writing all output. 
This includes the console output, the logfile of all results (with timestamped merged stdout and stderr), and for each test, the unmodified - stdout/stderr/merged in it's own file. + stdout/stderr/merged in its own file. """ logname = getpwuid(os.getuid()).pw_name @@ -315,19 +321,24 @@ class Cmd(object): if self.reran is True: rer = ' (RERAN)' user = ' (run as %s)' % (self.user if len(self.user) else logname) - msga = 'Test: %s%s ' % (self.pathname, user) + if self.identifier: + msga = 'Test (%s): %s%s ' % (self.identifier, self.pathname, user) + else: + msga = 'Test: %s%s ' % (self.pathname, user) msgb = '[%s] [%s]%s\n' % (self.result.runtime, self.result.result, rer) pad = ' ' * (80 - (len(msga) + len(msgb))) result_line = msga + pad + msgb # The result line is always written to the log file. If -q was # specified only failures are written to the console, otherwise - # the result line is written to the console. + # the result line is written to the console. The console output + # may be suppressed by calling log() with suppress_console=True. 
write_log(bytearray(result_line, encoding='utf-8'), LOG_FILE) - if not options.quiet: - write_log(result_line, LOG_OUT) - elif options.quiet and self.result.result != 'PASS': - write_log(result_line, LOG_OUT) + if not suppress_console: + if not options.quiet: + write_log(result_line, LOG_OUT) + elif options.quiet and self.result.result != 'PASS': + write_log(result_line, LOG_OUT) lines = sorted(self.result.stdout + self.result.stderr, key=lambda x: x[0]) @@ -355,36 +366,49 @@ class Cmd(object): class Test(Cmd): props = ['outputdir', 'timeout', 'user', 'pre', 'pre_user', 'post', - 'post_user', 'tags'] + 'post_user', 'failsafe', 'failsafe_user', 'tags'] - def __init__(self, pathname, outputdir=None, timeout=None, user=None, + def __init__(self, pathname, pre=None, pre_user=None, post=None, post_user=None, - tags=None): - super(Test, self).__init__(pathname, outputdir, timeout, user) + failsafe=None, failsafe_user=None, tags=None, **kwargs): + super(Test, self).__init__(pathname, **kwargs) self.pre = pre or '' self.pre_user = pre_user or '' self.post = post or '' self.post_user = post_user or '' + self.failsafe = failsafe or '' + self.failsafe_user = failsafe_user or '' self.tags = tags or [] def __str__(self): - post_user = pre_user = '' + post_user = pre_user = failsafe_user = '' if len(self.pre_user): pre_user = ' (as %s)' % (self.pre_user) if len(self.post_user): post_user = ' (as %s)' % (self.post_user) - return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nPre: %s%s\nPost: " \ - "%s%s\nUser: %s\nTags: %s\n" % \ - (self.pathname, self.outputdir, self.timeout, self.pre, - pre_user, self.post, post_user, self.user, self.tags) + if len(self.failsafe_user): + failsafe_user = ' (as %s)' % (self.failsafe_user) + return '''\ +Pathname: %s +Identifier: %s +Outputdir: %s +Timeout: %d +User: %s +Pre: %s%s +Post: %s%s +Failsafe: %s%s +Tags: %s +''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user, + self.pre, pre_user, self.post, post_user, 
self.failsafe, + failsafe_user, self.tags) def verify(self): """ - Check the pre/post scripts, user and Test. Omit the Test from this - run if there are any problems. + Check the pre/post/failsafe scripts, user and Test. Omit the Test from + this run if there are any problems. """ - files = [self.pre, self.pathname, self.post] - users = [self.pre_user, self.user, self.post_user] + files = [self.pre, self.pathname, self.post, self.failsafe] + users = [self.pre_user, self.user, self.post_user, self.failsafe_user] for f in [f for f in files if len(f)]: if not verify_file(f): @@ -402,17 +426,23 @@ class Test(Cmd): def run(self, options): """ - Create Cmd instances for the pre/post scripts. If the pre script - doesn't pass, skip this Test. Run the post script regardless. + Create Cmd instances for the pre/post/failsafe scripts. If the pre + script doesn't pass, skip this Test. Run the post script regardless. + If the Test is killed, also run the failsafe script. """ odir = os.path.join(self.outputdir, os.path.basename(self.pre)) - pretest = Cmd(self.pre, outputdir=odir, timeout=self.timeout, - user=self.pre_user) - test = Cmd(self.pathname, outputdir=self.outputdir, - timeout=self.timeout, user=self.user) + pretest = Cmd(self.pre, identifier=self.identifier, outputdir=odir, + timeout=self.timeout, user=self.pre_user) + test = Cmd(self.pathname, identifier=self.identifier, + outputdir=self.outputdir, timeout=self.timeout, + user=self.user) + odir = os.path.join(self.outputdir, os.path.basename(self.failsafe)) + failsafe = Cmd(self.failsafe, identifier=self.identifier, + outputdir=odir, timeout=self.timeout, + user=self.failsafe_user) odir = os.path.join(self.outputdir, os.path.basename(self.post)) - posttest = Cmd(self.post, outputdir=odir, timeout=self.timeout, - user=self.post_user) + posttest = Cmd(self.post, identifier=self.identifier, outputdir=odir, + timeout=self.timeout, user=self.post_user) cont = True if len(pretest.pathname): @@ -422,6 +452,9 @@ class 
Test(Cmd): if cont: test.run(options.dryrun) + if test.result.result == 'KILLED' and len(failsafe.pathname): + failsafe.run(options.dryrun) + failsafe.log(options, suppress_console=True) else: test.skip() @@ -435,42 +468,53 @@ class Test(Cmd): class TestGroup(Test): props = Test.props + ['tests'] - def __init__(self, pathname, outputdir=None, timeout=None, user=None, - pre=None, pre_user=None, post=None, post_user=None, - tests=None, tags=None): - super(TestGroup, self).__init__(pathname, outputdir, timeout, user, - pre, pre_user, post, post_user, tags) + def __init__(self, pathname, tests=None, **kwargs): + super(TestGroup, self).__init__(pathname, **kwargs) self.tests = tests or [] def __str__(self): - post_user = pre_user = '' + post_user = pre_user = failsafe_user = '' if len(self.pre_user): pre_user = ' (as %s)' % (self.pre_user) if len(self.post_user): post_user = ' (as %s)' % (self.post_user) - return "Pathname: %s\nOutputdir: %s\nTests: %s\nTimeout: %s\n" \ - "Pre: %s%s\nPost: %s%s\nUser: %s\nTags: %s\n" % \ - (self.pathname, self.outputdir, self.tests, self.timeout, - self.pre, pre_user, self.post, post_user, self.user, self.tags) + if len(self.failsafe_user): + failsafe_user = ' (as %s)' % (self.failsafe_user) + return '''\ +Pathname: %s +Identifier: %s +Outputdir: %s +Tests: %s +Timeout: %s +User: %s +Pre: %s%s +Post: %s%s +Failsafe: %s%s +Tags: %s +''' % (self.pathname, self.identifier, self.outputdir, self.tests, + self.timeout, self.user, self.pre, pre_user, self.post, post_user, + self.failsafe, failsafe_user, self.tags) def verify(self): """ - Check the pre/post scripts, user and tests in this TestGroup. Omit - the TestGroup entirely, or simply delete the relevant tests in the + Check the pre/post/failsafe scripts, user and tests in this TestGroup. + Omit the TestGroup entirely, or simply delete the relevant tests in the group, if that's all that's required. 
""" - # If the pre or post scripts are relative pathnames, convert to + # If the pre/post/failsafe scripts are relative pathnames, convert to # absolute, so they stand a chance of passing verification. if len(self.pre) and not os.path.isabs(self.pre): self.pre = os.path.join(self.pathname, self.pre) if len(self.post) and not os.path.isabs(self.post): self.post = os.path.join(self.pathname, self.post) + if len(self.failsafe) and not os.path.isabs(self.failsafe): + self.post = os.path.join(self.pathname, self.post) - auxfiles = [self.pre, self.post] - users = [self.pre_user, self.user, self.post_user] + auxfiles = [self.pre, self.post, self.failsafe] + users = [self.pre_user, self.user, self.post_user, self.failsafe_user] for f in [f for f in auxfiles if len(f)]: - if self.pathname != os.path.dirname(f): + if f != self.failsafe and self.pathname != os.path.dirname(f): write_log("Warning: TestGroup '%s' not added to this run. " "Auxiliary script '%s' exists in a different " "directory.\n" % (self.pathname, f), LOG_ERR) @@ -500,9 +544,9 @@ class TestGroup(Test): def run(self, options): """ - Create Cmd instances for the pre/post scripts. If the pre script - doesn't pass, skip all the tests in this TestGroup. Run the post - script regardless. + Create Cmd instances for the pre/post/failsafe scripts. If the pre + script doesn't pass, skip all the tests in this TestGroup. Run the + post script regardless. Run the failsafe script when a test is killed. 
""" # tags assigned to this test group also include the test names if options.tags and not set(self.tags).intersection(set(options.tags)): @@ -510,10 +554,10 @@ class TestGroup(Test): odir = os.path.join(self.outputdir, os.path.basename(self.pre)) pretest = Cmd(self.pre, outputdir=odir, timeout=self.timeout, - user=self.pre_user) + user=self.pre_user, identifier=self.identifier) odir = os.path.join(self.outputdir, os.path.basename(self.post)) posttest = Cmd(self.post, outputdir=odir, timeout=self.timeout, - user=self.post_user) + user=self.post_user, identifier=self.identifier) cont = True if len(pretest.pathname): @@ -522,11 +566,18 @@ class TestGroup(Test): pretest.log(options) for fname in self.tests: - test = Cmd(os.path.join(self.pathname, fname), - outputdir=os.path.join(self.outputdir, fname), - timeout=self.timeout, user=self.user) + odir = os.path.join(self.outputdir, fname) + test = Cmd(os.path.join(self.pathname, fname), outputdir=odir, + timeout=self.timeout, user=self.user, + identifier=self.identifier) + odir = os.path.join(odir, os.path.basename(self.failsafe)) + failsafe = Cmd(self.failsafe, outputdir=odir, timeout=self.timeout, + user=self.failsafe_user, identifier=self.identifier) if cont: test.run(options.dryrun) + if test.result.result == 'KILLED' and len(failsafe.pathname): + failsafe.run(options.dryrun) + failsafe.log(options, suppress_console=True) else: test.skip() @@ -556,6 +607,8 @@ class TestRun(object): ('pre_user', ''), ('post', ''), ('post_user', ''), + ('failsafe', ''), + ('failsafe_user', ''), ('tags', []) ] @@ -593,8 +646,8 @@ class TestRun(object): for prop in Test.props: setattr(testgroup, prop, getattr(options, prop)) - # Prevent pre/post scripts from running as regular tests - for f in [testgroup.pre, testgroup.post]: + # Prevent pre/post/failsafe scripts from running as regular tests + for f in [testgroup.pre, testgroup.post, testgroup.failsafe]: if f in filenames: del filenames[filenames.index(f)] @@ -605,7 +658,7 @@ class 
TestRun(object): def read(self, options): """ - Read in the specified runfile, and apply the TestRun properties + Read in the specified runfiles, and apply the TestRun properties listed in the 'DEFAULT' section to our TestRun. Then read each section, and apply the appropriate properties to the Test or TestGroup. Properties from individual sections override those set @@ -613,30 +666,43 @@ class TestRun(object): verification, add it to the TestRun. """ config = configparser.RawConfigParser() - if not len(config.read(options.runfile)): - fail("Coulnd't read config file %s" % options.runfile) + parsed = config.read(options.runfiles) + failed = options.runfiles - set(parsed) + if len(failed): + files = ' '.join(sorted(failed)) + fail("Couldn't read config files: %s" % files) for opt in TestRun.props: if config.has_option('DEFAULT', opt): setattr(self, opt, config.get('DEFAULT', opt)) self.outputdir = os.path.join(self.outputdir, self.timestamp) + testdir = options.testdir + for section in config.sections(): if 'tests' in config.options(section): - if os.path.isdir(section): - pathname = section - elif os.path.isdir(os.path.join(options.testdir, section)): - pathname = os.path.join(options.testdir, section) + parts = section.split(':', 1) + sectiondir = parts[0] + identifier = parts[1] if len(parts) == 2 else None + if os.path.isdir(sectiondir): + pathname = sectiondir + elif os.path.isdir(os.path.join(testdir, sectiondir)): + pathname = os.path.join(testdir, sectiondir) else: - pathname = section + pathname = sectiondir - testgroup = TestGroup(os.path.abspath(pathname)) + testgroup = TestGroup(os.path.abspath(pathname), + identifier=identifier) for prop in TestGroup.props: for sect in ['DEFAULT', section]: if config.has_option(sect, prop): - if prop == "tags": + if prop == 'tags': setattr(testgroup, prop, eval(config.get(sect, prop))) + elif prop == 'failsafe': + failsafe = config.get(sect, prop) + setattr(testgroup, prop, + os.path.join(testdir, failsafe)) else: 
setattr(testgroup, prop, config.get(sect, prop)) @@ -651,7 +717,12 @@ class TestRun(object): for prop in Test.props: for sect in ['DEFAULT', section]: if config.has_option(sect, prop): - setattr(test, prop, config.get(sect, prop)) + if prop == 'failsafe': + failsafe = config.get(sect, prop) + setattr(test, prop, + os.path.join(testdir, failsafe)) + else: + setattr(test, prop, config.get(sect, prop)) if test.verify(): self.tests[section] = test @@ -693,7 +764,8 @@ class TestRun(object): outputdir, and are guaranteed uniqueness because a group can only contain files in one directory. Pre and post tests will create a directory rooted at the outputdir of the Test or TestGroup in - question for their output. + question for their output. Failsafe scripts will create a directory + rooted at the outputdir of each Test for their output. """ done = False components = 0 @@ -716,7 +788,7 @@ class TestRun(object): def setup_logging(self, options): """ - This funtion creates the output directory and gets a file object + This function creates the output directory and gets a file object for the logfile. This function must be called before write_log() can be used. """ @@ -873,32 +945,34 @@ def fail(retstr, ret=1): def options_cb(option, opt_str, value, parser): - path_options = ['runfile', 'outputdir', 'template', 'testdir'] + path_options = ['outputdir', 'template', 'testdir'] - if option.dest == 'runfile' and '-w' in parser.rargs or \ + if option.dest == 'runfiles' and '-w' in parser.rargs or \ option.dest == 'template' and '-c' in parser.rargs: fail('-c and -w are mutually exclusive.') if opt_str in parser.rargs: fail('%s may only be specified once.' 
% opt_str) - if option.dest == 'runfile': + if option.dest == 'runfiles': parser.values.cmd = 'rdconfig' + value = set(os.path.abspath(p) for p in value.split(',')) if option.dest == 'template': parser.values.cmd = 'wrconfig' if option.dest == 'tags': value = [x.strip() for x in value.split(',')] - setattr(parser.values, option.dest, value) if option.dest in path_options: setattr(parser.values, option.dest, os.path.abspath(value)) + else: + setattr(parser.values, option.dest, value) def parse_args(): parser = OptionParser() parser.add_option('-c', action='callback', callback=options_cb, - type='string', dest='runfile', metavar='runfile', - help='Specify tests to run via config file.') + type='string', dest='runfiles', metavar='runfiles', + help='Specify tests to run via config files.') parser.add_option('-d', action='store_true', default=False, dest='dryrun', help='Dry run. Print tests, but take no other action.') parser.add_option('-g', action='store_true', default=False, @@ -917,6 +991,13 @@ def parse_args(): type='string', help='Specify a post script.') parser.add_option('-q', action='store_true', default=False, dest='quiet', help='Silence on the console during a test run.') + parser.add_option('-s', action='callback', callback=options_cb, + default='', dest='failsafe', metavar='script', + type='string', help='Specify a failsafe script.') + parser.add_option('-S', action='callback', callback=options_cb, + default='', dest='failsafe_user', + metavar='failsafe_user', type='string', + help='Specify a user to execute the failsafe script.') parser.add_option('-t', action='callback', callback=options_cb, default=60, dest='timeout', metavar='seconds', type='int', help='Timeout (in seconds) for an individual test.') @@ -940,10 +1021,10 @@ def parse_args(): help='Number of times to run the test run.') (options, pathnames) = parser.parse_args() - if not options.runfile and not options.template: + if not options.runfiles and not options.template: options.cmd = 'runtests' - 
if options.runfile and len(pathnames): + if options.runfiles and len(pathnames): fail('Extraneous arguments.') options.pathnames = [os.path.abspath(path) for path in pathnames] diff --git a/tests/test-runner/bin/zts-report.py b/tests/test-runner/bin/zts-report.py.in similarity index 63% rename from tests/test-runner/bin/zts-report.py rename to tests/test-runner/bin/zts-report.py.in index d046c13a55..ce43d204fc 100755 --- a/tests/test-runner/bin/zts-report.py +++ b/tests/test-runner/bin/zts-report.py.in @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env @PYTHON_SHEBANG@ # # This file and its contents are supplied under the terms of the @@ -60,14 +60,6 @@ known_reason = 'Known issue' # exec_reason = 'Test user execute permissions required for utilities' -# -# Some tests require that the DISKS provided can be partitioned. This is -# normally not an issue because loop back devices are used for DISKS and they -# can be partition. There is one notable exception, the CentOS 6.x kernel is -# old enough that it does not support partitioning loop back devices. -# -disk_reason = 'Partitionable DISKS required' - # # Some tests require a minimum python version of 3.5 and will be skipped when # the default system version is too old. There may also be tests which require @@ -84,11 +76,10 @@ python_deps_reason = 'Python modules missing: python-cffi' tmpfile_reason = 'Kernel O_TMPFILE support required' # -# Some tests may depend on udev change events being generated when block -# devices change capacity. This functionality wasn't available until the -# 2.6.38 kernel. +# Some tests require the statx(2) system call on Linux which was first +# introduced in the 4.11 kernel. # -udev_reason = 'Kernel block device udev change events required' +statx_reason = 'Kernel statx(2) system call required on Linux' # # Some tests require that the NFS client and server utilities be installed. 
@@ -136,11 +127,23 @@ fio_reason = 'Fio v2.3 or newer required' trim_reason = 'DISKS must support discard (TRIM/UNMAP)' # -# Some tests are not applicable to Linux or need to be updated to operate -# in the manor required by Linux. Any tests which are skipped for this +# Some tests on FreeBSD require the fspacectl(2) system call and the +# truncate(1) utility supporting the -d option. The system call was first +# introduced in FreeBSD version 1400032. +# +fspacectl_reason = 'fspacectl(2) and truncate -d support required' + +# +# Some tests are not applicable to a platform or need to be updated to operate +# in the manner required by the platform. Any tests which are skipped for this # reason will be suppressed in the final analysis output. # -na_reason = "N/A on Linux" +na_reason = "Not applicable" + +# +# Some test cases don't have all requirements to run on Github actions CI. +# +ci_reason = 'CI runner doesn\'t have all requirements' summary = { 'total': float(0), @@ -160,51 +163,36 @@ summary = { # reasons listed above can be used. 
# known = { - 'casenorm/sensitive_none_lookup': ['FAIL', '7633'], - 'casenorm/sensitive_none_delete': ['FAIL', '7633'], - 'casenorm/sensitive_formd_lookup': ['FAIL', '7633'], - 'casenorm/sensitive_formd_delete': ['FAIL', '7633'], - 'casenorm/insensitive_none_lookup': ['FAIL', '7633'], - 'casenorm/insensitive_none_delete': ['FAIL', '7633'], - 'casenorm/insensitive_formd_lookup': ['FAIL', '7633'], - 'casenorm/insensitive_formd_delete': ['FAIL', '7633'], - 'casenorm/mixed_none_lookup': ['FAIL', '7633'], 'casenorm/mixed_none_lookup_ci': ['FAIL', '7633'], - 'casenorm/mixed_none_delete': ['FAIL', '7633'], - 'casenorm/mixed_formd_lookup': ['FAIL', '7633'], 'casenorm/mixed_formd_lookup_ci': ['FAIL', '7633'], - 'casenorm/mixed_formd_delete': ['FAIL', '7633'], - 'cli_root/zfs_receive/zfs_receive_004_neg': ['FAIL', known_reason], 'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason], 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], - 'cli_root/zpool_create/zpool_create_016_pos': ['SKIP', na_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], - 'inuse/inuse_001_pos': ['SKIP', na_reason], - 'inuse/inuse_003_pos': ['SKIP', na_reason], - 'inuse/inuse_006_pos': ['SKIP', na_reason], - 'inuse/inuse_007_pos': ['SKIP', na_reason], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], - 'removal/removal_condense_export': ['SKIP', known_reason], - 'removal/removal_with_zdb': ['SKIP', known_reason], 'rootpool/setup': ['SKIP', na_reason], 'rsend/rsend_008_pos': ['SKIP', '6066'], - 'snapshot/rollback_003_pos': ['SKIP', '6143'], 'vdev_zaps/vdev_zaps_007_pos': ['FAIL', known_reason], - 'xattr/xattr_008_pos': ['SKIP', na_reason], - 'xattr/xattr_009_neg': ['SKIP', na_reason], - 'xattr/xattr_010_neg': ['SKIP', na_reason], - 'zvol/zvol_misc/zvol_misc_001_neg': ['SKIP', na_reason], - 'zvol/zvol_misc/zvol_misc_003_neg': ['SKIP', na_reason], - 
'zvol/zvol_misc/zvol_misc_004_pos': ['SKIP', na_reason], - 'zvol/zvol_misc/zvol_misc_005_neg': ['SKIP', na_reason], - 'zvol/zvol_misc/zvol_misc_006_pos': ['SKIP', na_reason], - 'zvol/zvol_swap/zvol_swap_003_pos': ['SKIP', na_reason], - 'zvol/zvol_swap/zvol_swap_005_pos': ['SKIP', na_reason], - 'zvol/zvol_swap/zvol_swap_006_pos': ['SKIP', na_reason], } +if sys.platform.startswith('freebsd'): + known.update({ + 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], + 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], + 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], + 'link_count/link_count_001': ['SKIP', na_reason], + }) +elif sys.platform.startswith('linux'): + known.update({ + 'casenorm/mixed_formd_lookup': ['FAIL', '7633'], + 'casenorm/mixed_formd_delete': ['FAIL', '7633'], + 'casenorm/sensitive_formd_lookup': ['FAIL', '7633'], + 'casenorm/sensitive_formd_delete': ['FAIL', '7633'], + 'removal/removal_with_zdb': ['SKIP', known_reason], + }) + + # # These tests may occasionally fail or be skipped. We want there failures # to be reported but only unexpected failures should bubble up to cause @@ -217,51 +205,48 @@ known = { # reasons listed above can be used. 
# maybe = { - 'cache/setup': ['SKIP', disk_reason], - 'cache/cache_010_neg': ['FAIL', known_reason], 'chattr/setup': ['SKIP', exec_reason], + 'crtime/crtime_001_pos': ['SKIP', statx_reason], 'cli_root/zdb/zdb_006_pos': ['FAIL', known_reason], + 'cli_root/zfs_destroy/zfs_destroy_dev_removal_condense': + ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_004_pos': ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_009_pos': ['SKIP', '5479'], - 'cli_root/zfs_rollback/zfs_rollback_001_pos': ['FAIL', '6415'], - 'cli_root/zfs_rollback/zfs_rollback_002_pos': ['FAIL', '6416'], + 'cli_root/zfs_rollback/zfs_rollback_001_pos': ['FAIL', known_reason], + 'cli_root/zfs_rollback/zfs_rollback_002_pos': ['FAIL', known_reason], 'cli_root/zfs_share/setup': ['SKIP', share_reason], 'cli_root/zfs_snapshot/zfs_snapshot_002_neg': ['FAIL', known_reason], 'cli_root/zfs_unshare/setup': ['SKIP', share_reason], - 'cli_root/zpool_add/setup': ['SKIP', disk_reason], 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], - 'cli_root/zpool_create/setup': ['SKIP', disk_reason], - 'cli_root/zpool_create/zpool_create_008_pos': ['FAIL', known_reason], 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', '6145'], - 'cli_root/zpool_expand/setup': ['SKIP', udev_reason], - 'cli_root/zpool_export/setup': ['SKIP', disk_reason], - 'cli_root/zpool_import/setup': ['SKIP', disk_reason], 'cli_root/zpool_import/import_rewind_device_replaced': ['FAIL', rewind_reason], 'cli_root/zpool_import/import_rewind_config_changed': ['FAIL', rewind_reason], 'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', '6839'], - 'cli_root/zpool_remove/setup': ['SKIP', disk_reason], + 'cli_root/zpool_initialize/zpool_initialize_import_export': + ['FAIL', '11948'], + 'cli_root/zpool_labelclear/zpool_labelclear_removed': + ['FAIL', known_reason], 'cli_root/zpool_trim/setup': ['SKIP', trim_reason], 'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', '6141'], - 'cli_user/misc/arc_summary3_001_pos': ['SKIP', 
python_reason], 'delegate/setup': ['SKIP', exec_reason], - 'fault/auto_online_001_pos': ['SKIP', disk_reason], - 'fault/auto_replace_001_pos': ['SKIP', disk_reason], + 'fallocate/fallocate_punch-hole': ['SKIP', fspacectl_reason], 'history/history_004_pos': ['FAIL', '7026'], 'history/history_005_neg': ['FAIL', '6680'], 'history/history_006_neg': ['FAIL', '5657'], 'history/history_008_pos': ['FAIL', known_reason], 'history/history_010_pos': ['SKIP', exec_reason], - 'inuse/inuse_005_pos': ['SKIP', disk_reason], - 'inuse/inuse_008_pos': ['SKIP', disk_reason], - 'inuse/inuse_009_pos': ['SKIP', disk_reason], 'io/mmap': ['SKIP', fio_reason], 'largest_pool/largest_pool_001_pos': ['FAIL', known_reason], + 'mmp/mmp_on_uberblocks': ['FAIL', known_reason], 'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason], 'no_space/enospc_002_pos': ['FAIL', enospc_reason], + 'pool_checkpoint/checkpoint_discard_busy': ['FAIL', '11946'], 'projectquota/setup': ['SKIP', exec_reason], 'redundancy/redundancy_004_neg': ['FAIL', '7290'], + 'redundancy/redundancy_draid_spare3': ['SKIP', known_reason], + 'removal/removal_condense_export': ['FAIL', known_reason], 'reservation/reservation_008_pos': ['FAIL', '7741'], 'reservation/reservation_018_pos': ['FAIL', '5642'], 'rsend/rsend_019_pos': ['FAIL', '6086'], @@ -269,6 +254,7 @@ maybe = { 'rsend/rsend_021_pos': ['FAIL', '6446'], 'rsend/rsend_024_pos': ['FAIL', '5665'], 'rsend/send-c_volume': ['FAIL', '6087'], + 'rsend/send_partial_dataset': ['FAIL', known_reason], 'snapshot/clone_001_pos': ['FAIL', known_reason], 'snapshot/snapshot_009_pos': ['FAIL', '7961'], 'snapshot/snapshot_010_pos': ['FAIL', '7961'], @@ -280,10 +266,83 @@ maybe = { 'user_namespace/setup': ['SKIP', user_ns_reason], 'userquota/setup': ['SKIP', exec_reason], 'vdev_zaps/vdev_zaps_004_pos': ['FAIL', '6935'], - 'write_dirs/setup': ['SKIP', disk_reason], 'zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos': ['FAIL', '5848'], + 'pam/setup': ['SKIP', "pamtester might be not available"], } +if 
sys.platform.startswith('freebsd'): + maybe.update({ + 'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason], + 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], + 'cli_root/zfs_receive/receive-o-x_props_override': + ['FAIL', known_reason], + 'cli_root/zfs_share/zfs_share_011_pos': ['FAIL', known_reason], + 'cli_root/zfs_share/zfs_share_concurrent_shares': + ['FAIL', known_reason], + 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], + 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], + 'inheritance/inherit_001_pos': ['FAIL', '11829'], + 'resilver/resilver_restart_001': ['FAIL', known_reason], + 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', '12622'], + 'pool_checkpoint/checkpoint_indirect': ['FAIL', '12623'], + }) +elif sys.platform.startswith('linux'): + maybe.update({ + 'alloc_class/alloc_class_009_pos': ['FAIL', known_reason], + 'alloc_class/alloc_class_010_pos': ['FAIL', known_reason], + 'alloc_class/alloc_class_011_neg': ['FAIL', known_reason], + 'alloc_class/alloc_class_012_pos': ['FAIL', known_reason], + 'alloc_class/alloc_class_013_pos': ['FAIL', '11888'], + 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], + 'cli_root/zpool_expand/zpool_expand_001_pos': ['FAIL', known_reason], + 'cli_root/zpool_expand/zpool_expand_005_pos': ['FAIL', known_reason], + 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], + 'fault/auto_spare_shared': ['FAIL', '11889'], + 'io/io_uring': ['SKIP', 'io_uring support required'], + 'limits/filesystem_limit': ['SKIP', known_reason], + 'limits/snapshot_limit': ['SKIP', known_reason], + 'mmp/mmp_active_import': ['FAIL', known_reason], + 'mmp/mmp_exported_import': ['FAIL', known_reason], + 'mmp/mmp_inactive_import': ['FAIL', known_reason], + 'refreserv/refreserv_raidz': ['FAIL', known_reason], + 'rsend/rsend_007_pos': ['FAIL', known_reason], + 'rsend/rsend_010_pos': ['FAIL', known_reason], + 'rsend/rsend_011_pos': ['FAIL', known_reason], + 
'snapshot/rollback_003_pos': ['FAIL', known_reason], + 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', '12621'], + 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], + }) + + +# Not all Github actions runners have scsi_debug module, so we may skip +# some tests which use it. +if os.environ.get('CI') == 'true': + known.update({ + 'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', ci_reason], + 'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', ci_reason], + 'cli_root/zpool_expand/zpool_expand_005_pos': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/setup': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/zpool_reopen_001_pos': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/zpool_reopen_002_pos': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/zpool_reopen_004_pos': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/zpool_reopen_005_pos': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/zpool_reopen_006_neg': ['SKIP', ci_reason], + 'cli_root/zpool_reopen/zpool_reopen_007_pos': ['SKIP', ci_reason], + 'cli_root/zpool_split/zpool_split_wholedisk': ['SKIP', ci_reason], + 'fault/auto_offline_001_pos': ['SKIP', ci_reason], + 'fault/auto_online_001_pos': ['SKIP', ci_reason], + 'fault/auto_online_002_pos': ['SKIP', ci_reason], + 'fault/auto_replace_001_pos': ['SKIP', ci_reason], + 'fault/auto_spare_ashift': ['SKIP', ci_reason], + 'fault/auto_spare_shared': ['SKIP', ci_reason], + 'procfs/pool_state': ['SKIP', ci_reason], + }) + + maybe.update({ + 'events/events_002_pos': ['FAIL', '11546'], + }) + def usage(s): print(s) @@ -299,13 +358,14 @@ def process_results(pathname): prefix = '/zfs-tests/tests/functional/' pattern = \ - r'^Test:\s*\S*%s(\S+)\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' \ + r'^Test(?:\s+\(\S+\))?:' + \ + r'\s*\S*%s(\S+)\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' \ % prefix pattern_log = r'^\s*Log directory:\s*(\S*)' d = {} - for l in f.readlines(): - m = re.match(pattern, l) + for line in 
f.readlines(): + m = re.match(pattern, line) if m and len(m.groups()) == 4: summary['total'] += 1 if m.group(4) == "PASS": @@ -313,7 +373,7 @@ def process_results(pathname): d[m.group(1)] = m.group(4) continue - m = re.match(pattern_log, l) + m = re.match(pattern_log, line) if m: summary['logfile'] = m.group(1) @@ -352,10 +412,10 @@ if __name__ == "__main__": print("\nTests with results other than PASS that are expected:") for test in sorted(expected): - issue_url = 'https://github.com/zfsonlinux/zfs/issues/' + issue_url = 'https://github.com/openzfs/zfs/issues/' # Include the reason why the result is expected, given the following: - # 1. Suppress test results which set the "N/A on Linux" reason. + # 1. Suppress test results which set the "Not applicable" reason. # 2. Numerical reasons are assumed to be GitHub issue numbers. # 3. When an entire test group is skipped only report the setup reason. if test in known: diff --git a/tests/test-runner/include/logapi.shlib b/tests/test-runner/include/logapi.shlib index 32fc006161..c9c01ab752 100644 --- a/tests/test-runner/include/logapi.shlib +++ b/tests/test-runner/include/logapi.shlib @@ -23,7 +23,7 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2020 by Delphix. All rights reserved. # . ${STF_TOOLS}/include/stf.shlib @@ -68,6 +68,16 @@ function log_must (( $? != 0 )) && log_fail } +# Execute a positive test (expecting no stderr) and exit $STF_FAIL +# if test fails +# $@ - command to execute + +function log_must_nostderr +{ + log_pos_nostderr "$@" + (( $? != 0 )) && log_fail +} + # Execute a positive test but retry the command on failure if the output # matches an expected pattern. Otherwise behave like log_must and exit # $STF_FAIL is test fails. 
@@ -105,7 +115,7 @@ function log_must_retry " assertion failure exited $status" status=1 else - [[ -n $LOGAPI_DEBUG ]] && print $($out) + [[ -n $LOGAPI_DEBUG ]] && cat $logfile _printsuccess "$@" fi break @@ -165,6 +175,23 @@ function log_mustnot_expect (( $? != 0 )) && log_fail } +# Signal numbers are platform-dependent +case $(uname) in +Darwin|FreeBSD) + SIGBUS=10 + SIGSEGV=11 + ;; +illumos|Linux|*) + SIGBUS=7 + SIGSEGV=11 + ;; +esac +EXIT_SUCCESS=0 +EXIT_NOTFOUND=127 +EXIT_SIGNAL=256 +EXIT_SIGBUS=$((EXIT_SIGNAL + SIGBUS)) +EXIT_SIGSEGV=$((EXIT_SIGNAL + SIGSEGV)) + # Execute and print command with status where success equals non-zero result # or output includes expected keyword # @@ -191,19 +218,19 @@ function log_neg_expect out="cat $logfile" # unexpected status - if (( $status == 0 )); then + if (( $status == EXIT_SUCCESS )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status" # missing binary - elif (( $status == 127 )); then + elif (( $status == EXIT_NOTFOUND )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (File not found)" # bus error - core dump - elif (( $status == 138 )); then + elif (( $status == EXIT_SIGBUS )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (Bus Error)" # segmentation violation - core dump - elif (( $status == 139 )); then + elif (( $status == EXIT_SIGSEGV )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (SEGV)" else @@ -227,7 +254,7 @@ function log_neg_expect fi if (( $ret == 0 )); then - [[ -n $LOGAPI_DEBUG ]] && print $($out) + [[ -n $LOGAPI_DEBUG ]] && cat $logfile _printsuccess "$@" "exited $status" fi fi @@ -267,7 +294,47 @@ function log_pos " exited $status" status=1 else - [[ -n $LOGAPI_DEBUG ]] && print $($out) + [[ -n $LOGAPI_DEBUG ]] && cat $logfile + _printsuccess "$@" + fi + fi + _recursive_output $logfile "false" + return $status +} + +# Execute and print command with status where success equals zero result +# and no stderr output +# 
+# $@ command to execute +# +# return 0 if command succeeds and no stderr output +# return 1 otherwise + +function log_pos_nostderr +{ + typeset out="" + typeset logfile="/tmp/log.$$" + + while [[ -e $logfile ]]; do + logfile="$logfile.$$" + done + + "$@" 2>$logfile + typeset status=$? + out="cat $logfile" + typeset out_msg=$($out) + + if (( $status != 0 )) ; then + print -u2 $out_msg + _printerror "$@" "exited $status" + else + if [[ ! -z "$out_msg" ]]; then + print -u2 $out_msg + _printerror "$@" "message in stderr" \ + " exited $status" + status=1 + else + [[ -n $LOGAPI_DEBUG ]] && cat $logfile _printsuccess "$@" fi fi @@ -281,7 +348,23 @@ function log_pos function log_onexit { - _CLEANUP="$@" + _CLEANUP=("$*") +} + +# Push an exit handler on the cleanup stack +# +# $@ - function(s) to perform on exit + +function log_onexit_push +{ + _CLEANUP+=("$*") +} + +# Pop an exit handler off the cleanup stack + +function log_onexit_pop +{ + _CLEANUP=("${_CLEANUP[@]:0:${#_CLEANUP[@]}-1}") } # @@ -387,6 +470,11 @@ function log_other _endlog $STF_OTHER "$@" } +function set_main_pid +{ + _MAINPID=$1 +} + # # Internal functions # @@ -421,16 +509,27 @@ function _endlog shift (( ${#@} > 0 )) && _printline "$@" + # + # If we're running in a subshell then just exit and let + # the parent handle the failures + # + if [[ -n "$_MAINPID" && $$ != "$_MAINPID" ]]; then + log_note "subshell exited: "$_MAINPID + exit $exitcode + fi + if [[ $exitcode == $STF_FAIL ]] ; then _execute_testfail_callbacks fi - if [[ -n $_CLEANUP ]] ; then - typeset cleanup=$_CLEANUP - log_onexit "" + typeset stack=("${_CLEANUP[@]}") + log_onexit "" + typeset i=${#stack[@]} + while (( i-- )); do + typeset cleanup="${stack[i]}" log_note "Performing local cleanup via log_onexit ($cleanup)" $cleanup - fi + done exit $exitcode } diff --git a/tests/test-runner/man/test-runner.1 b/tests/test-runner/man/test-runner.1 index 31cd412452..f7cbcbc5b9 100644 --- a/tests/test-runner/man/test-runner.1 +++
b/tests/test-runner/man/test-runner.1 @@ -8,300 +8,255 @@ .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" -.\" .\" Copyright (c) 2012 by Delphix. All rights reserved. .\" -.TH run 1 "23 Sep 2012" -.SH NAME -run \- find, execute, and log the results of tests -.SH SYNOPSIS -.LP -.nf -\fBrun\fR [\fB-dgq] [\fB-o\fR \fIoutputdir\fR] [\fB-pP\fR \fIscript\fR] [\fB-t\fR \fIseconds\fR] [\fB-uxX\fR \fIusername\fR] - \fIpathname\fR ... -.fi - -.LP -.nf -\fBrun\fR \fB-w\fR \fIrunfile\fR [\fB-gq\fR] [\fB-o\fR \fIoutputdir\fR] [\fB-pP\fR \fIscript\fR] [\fB-t\fR \fIseconds\fR] - [\fB-uxX\fR \fIusername\fR] \fIpathname\fR ... -.fi - -.LP -.nf -\fBrun\fR \fB-c\fR \fIrunfile\fR [\fB-dq\fR] -.fi - -.LP -.nf -\fBrun\fR [\fB-h\fR] -.fi - -.SH DESCRIPTION -.sp -.LP -The \fBrun\fR command has three basic modes of operation. With neither the -\fB-c\fR nor the \fB-w\fR option, \fBrun\fR processes the arguments provided on -the command line, adding them to the list for this run. If a specified -\fIpathname\fR is an executable file, it is added as a test. If a specified -\fIpathname\fR is a directory, the behavior depends upon the \fB-g\fR option. -If \fB-g\fR is specified, the directory is treated as a test group. See the -section on "Test Groups" below. Without the \fB-g\fR option, \fBrun\fR simply -descends into the directory looking for executable files. The tests are then -executed, and the results are logged. - -With the \fB-w\fR option, \fBrun\fR finds tests in the manner described above. +.Dd May 26, 2021 +.Dt RUN 1 +.Os +. 
+.Sh NAME +.Nm run +.Nd find, execute, and log the results of tests +.Sh SYNOPSIS +.Nm +.Op Fl dgq +.Op Fl o Ar outputdir +.Op Fl pP Ar script +.Op Fl t Ar seconds +.Op Fl uxX Ar username +.Ar pathname Ns No … +.Pp +.Nm +.Fl w Ar runfile +.Op Fl gq +.Op Fl o Ar outputdir +.Op Fl pP Ar script +.Op Fl t Ar seconds +.Op Fl uxX Ar username +.Ar pathname Ns No … +.Pp +.Nm +.Fl c Ar runfile +.Op Fl dq +.Pp +.Nm +.Op Fl h +. +.Sh DESCRIPTION +.Nm +command has three basic modes of operation. +With neither +.Fl c +nor +.Fl w , +.Nm +processes the arguments provided on +the command line, adding them to the list for this run. +If a specified +.Ar pathname +is an executable file, it is added as a test. +If a specified +.Ar pathname +is a directory, the behavior depends upon the presence of +.Fl g . +If +.Fl g +is specified, the directory is treated as a test group. +See the section on +.Sy Test Groups +below. +Without +.Fl g , +.Nm +simply descends into the directory looking for executable files. +The tests are then executed, and the results are logged. +.Pp +With +.Fl w , +.Nm +finds tests in the manner described above. Rather than executing the tests and logging the results, the test configuration -is stored in a \fIrunfile\fR which can be used in future invocations, or edited -to modify which tests are executed and which options are applied. Options -included on the command line with \fB-w\fR become defaults in the -\fIrunfile\fR. - -With the \fB-c\fR option, \fBrun\fR parses a \fIrunfile\fR, which can specify a -series of tests and test groups to be executed. The tests are then executed, -and the results are logged. -.sp -.SS "Test Groups" -.sp -.LP +is stored in a +.Ar runfile , +which can be used in future invocations, or edited +to modify which tests are executed and which options are applied. +Options included on the command line with +.Fl w +become defaults in the +.Ar runfile .
+.Pp +With +.Fl c , +.Nm +parses a +.Ar runfile , +which can specify a series of tests and test groups to be executed. +The tests are then executed, and the results are logged. +. +.Ss Test Groups A test group is comprised of a set of executable files, all of which exist in -one directory. The options specified on the command line or in a \fIrunfile\fR -apply to individual tests in the group. The exception is options pertaining to -pre and post scripts, which act on all tests as a group. Rather than running -before and after each test, these scripts are run only once each at the start -and end of the test group. -.SS "Test Execution" -.sp -.LP +one directory. +The options specified on the command line or in a +.Ar runfile +apply to individual tests in the group. +The exception is options pertaining to pre and post scripts, which act on all tests as a group. +Rather than running before and after each test, +these scripts are run only once each at the start and end of the test group. +.Ss Test Execution The specified tests run serially, and are typically assigned results according -to exit values. Tests that exit zero and non-zero are marked "PASS" and "FAIL" -respectively. When a pre script fails for a test group, only the post script is -executed, and the remaining tests are marked "SKIPPED." Any test that exceeds -its \fItimeout\fR is terminated, and marked "KILLED." - -By default, tests are executed with the credentials of the \fBrun\fR script. -Executing tests with other credentials is done via \fBsudo\fR(1m), which must -be configured to allow execution without prompting for a password. Environment -variables from the calling shell are available to individual tests. During test -execution, the working directory is changed to \fIoutputdir\fR. -.SS "Output Logging" -.sp -.LP -By default, \fBrun\fR will print one line on standard output at the conclusion -of each test indicating the test name, result and elapsed time. 
Additionally, -for each invocation of \fBrun\fR, a directory is created using the ISO 8601 -date format. Within this directory is a file named \fIlog\fR containing all the -test output with timestamps, and a directory for each test. Within the test -directories, there is one file each for standard output, standard error and -merged output. The default location for the \fIoutputdir\fR is -\fI/var/tmp/test_results\fR. -.SS "Runfiles" -.sp -.LP -The \fIrunfile\fR is an ini style configuration file that describes a test run. -The file has one section named "DEFAULT," which contains configuration option -names and their values in "name = value" format. The values in this section -apply to all the subsequent sections, unless they are also specified there, in -which case the default is overridden. The remaining section names are the -absolute pathnames of files and direcotries, describing tests and test groups -respectively. The legal option names are: -.sp -.ne 2 -.na -\fBoutputdir\fR = \fIpathname\fR -.ad -.sp .6 -.RS 4n +to exit values. +Tests that exit zero and non-zero are marked +.Sy PASS +and +.Sy FAIL , +respectively. +When a pre script fails for a test group, only the post script is executed, +and the remaining tests are marked +.Sy SKIPPED . +Any test that exceeds +its +.Ar timeout +is terminated, and marked +.Sy KILLED . +.Pp +By default, tests are executed with the credentials of the +.Nm +script. +Executing tests with other credentials is done via +.Xr sudo 1m , +which must +be configured to allow execution without prompting for a password. +Environment variables from the calling shell are available to individual tests. +During test execution, the working directory is changed to +.Ar outputdir . +. +.Ss Output Logging +By default, +.Nm +will print one line on standard output at the conclusion +of each test indicating the test name, result and elapsed time. +Additionally, for each invocation of +.Nm , +a directory is created using the ISO 8601 date format. 
+Within this directory is a file named +.Sy log +containing all the +test output with timestamps, and a directory for each test. +Within the test directories, there is one file each for standard output, +standard error and merged output. +The default location for the +.Ar outputdir +is +.Pa /var/tmp/test_results . +.Ss "Runfiles" +The +.Ar runfile +is an INI-style configuration file that describes a test run. +The file has one section named +.Sy DEFAULT , +which contains configuration option +names and their values in +.Sy name No = Ar value +format. +The values in this section apply to all the subsequent sections, +unless they are also specified there, in which case the default is overridden. +The remaining section names are the absolute pathnames of files and directories, +describing tests and test groups respectively. +The legal option names are: +.Bl -tag -width "tests = ['filename', …]" +.It Sy outputdir No = Ar pathname The name of the directory that holds test logs. -.RE -.sp -.ne 2 -.na -\fBpre\fR = \fIscript\fR -.ad -.sp .6 -.RS 4n -Run \fIscript\fR prior to the test or test group. -.RE -.sp -.ne 2 -.na -\fBpre_user\fR = \fIusername\fR -.ad -.sp .6 -.RS 4n -Execute the pre script as \fIusername\fR. -.RE -.sp -.ne 2 -.na -\fBpost\fR = \fIscript\fR -.ad -.sp .6 -.RS 4n -Run \fIscript\fR after the test or test group. -.RE -.sp -.ne 2 -.na -\fBpost_user\fR = \fIusername\fR -.ad -.sp .6 -.RS 4n -Execute the post script as \fIusername\fR. -.RE -.sp -.ne 2 -.na -\fBquiet\fR = [\fITrue\fR|\fIFalse\fR] -.ad -.sp .6 -.RS 4n -If set to True, only the results summary is printed to standard out. -.RE -.sp -.ne 2 -.na -\fBtests\fR = [\fI'filename'\fR [,...]] -.ad -.sp .6 -.RS 4n -Specify a list of \fIfilenames\fR for this test group. Only the basename of the -absolute path is required. This option is only valid for test groups, and each -\fIfilename\fR must be single quoted. 
-.RE -.sp -.ne 2 -.na -\fBtimeout\fR = \fIn\fR -.ad -.sp .6 -.RS 4n -A timeout value of \fIn\fR seconds. -.RE -.sp -.ne 2 -.na -\fBuser\fR = \fIusername\fR -.ad -.sp .6 -.RS 4n -Execute the test or test group as \fIusername\fR. -.RE - -.SH OPTIONS -.sp -.LP -The following options are available for the \fBrun\fR command. -.sp -.ne 2 -.na -\fB-c\fR \fIrunfile\fR -.ad -.RS 6n -Specify a \fIrunfile\fR to be consumed by the run command. -.RE - -.ne 2 -.na -\fB-d\fR -.ad -.RS 6n -Dry run mode. Execute no tests, but print a description of each test that would -have been run. -.RE - -.ne 2 -.na -\fB-g\fR -.ad -.RS 6n +.It Sy pre No = Ar script +Run +.Ar script +prior to the test or test group. +.It Sy pre_user No = Ar username +Execute the pre script as +.Ar username . +.It Sy post No = Ar script +Run +.Ar script +after the test or test group. +.It Sy post_user No = Ar username +Execute the post script as +.Ar username . +.It Sy quiet No = Sy True Ns | Ns Sy False +If +.Sy True , +only the results summary is printed to standard out. +.It Sy tests No = [ Ns Ar 'filename' , No … ] +Specify a list of +.Ar filenames +for this test group. +Only the basename of the absolute path is required. +This option is only valid for test groups, and each +.Ar filename +must be single quoted. +.It Sy timeout No = Ar n +A timeout value of +.Ar n +seconds. +.It Sy user No = Ar username +Execute the test or test group as +.Ar username . +.El +. +.Sh OPTIONS +.Bl -tag -width "-o outputdir" +.It Fl c Ar runfile +Specify a +.Ar runfile +to be consumed by the run command. +.It Fl d +Dry run mode. +Execute no tests, but print a description of each test that would have been run. +.It Fl g Create test groups from any directories found while searching for tests. -.RE - -.ne 2 -.na -\fB-o\fR \fIoutputdir\fR -.ad -.RS 6n +.It Fl o Ar outputdir Specify the directory in which to write test results. -.RE - -.ne 2 -.na -\fB-p\fR \fIscript\fR -.ad -.RS 6n -Run \fIscript\fR prior to any test or test group. 
-.RE - -.ne 2 -.na -\fB-P\fR \fIscript\fR -.ad -.RS 6n -Run \fIscript\fR after any test or test group. -.RE - -.ne 2 -.na -\fB-q\fR -.ad -.RS 6n -Print only the results sumary to the standard output. -.RE - -.ne 2 -.na -\fB-t\fR \fIn\fR -.ad -.RS 6n -Specify a timeout value of \fIn\fR seconds per test. -.RE - -.ne 2 -.na -\fB-u\fR \fIusername\fR -.ad -.RS 6n -Execute tests or test groups as \fIusername\fR. -.RE - -.ne 2 -.na -\fB-w\fR \fIrunfile\fR -.ad -.RS 6n -Specify the name of the \fIrunfile\fR to create. -.RE - -.ne 2 -.na -\fB-x\fR \fIusername\fR -.ad -.RS 6n -Execute the pre script as \fIusername\fR. -.RE - -.ne 2 -.na -\fB-X\fR \fIusername\fR -.ad -.RS 6n -Execute the post script as \fIusername\fR. -.RE - -.SH EXAMPLES -.LP -\fBExample 1\fR Running ad-hoc tests. -.sp -.LP -This example demonstrates the simplest invocation of \fBrun\fR. - -.sp -.in +2 -.nf -% \fBrun my-tests\fR +.It Fl p Ar script +Run +.Ar script +prior to any test or test group. +.It Fl P Ar script +Run +.Ar script +after any test or test group. +.It Fl q +Print only the results summary to the standard output. +.It Fl s Ar script +Run +.Ar script +as a failsafe after any test is killed. +.It Fl S Ar username +Execute the failsafe script as +.Ar username . +.It Fl t Ar n +Specify a timeout value of +.Ar n +seconds per test. +.It Fl u Ar username +Execute tests or test groups as +.Ar username . +.It Fl w Ar runfile +Specify the name of the +.Ar runfile +to create. +.It Fl x Ar username +Execute the pre script as +.Ar username . +.It Fl X Ar username +Execute the post script as +.Ar username . +.El +. +.Sh EXAMPLES +.Bl -tag -width "-h" +.It Sy Example 1 : No Running ad-hoc tests. +This example demonstrates the simplest invocation of +.Nm . 
+.Bd -literal +.No % Nm run Ar my-tests Test: /home/jkennedy/my-tests/test-01 [00:02] [PASS] Test: /home/jkennedy/my-tests/test-02 [00:04] [PASS] Test: /home/jkennedy/my-tests/test-03 [00:01] [PASS] @@ -312,20 +267,14 @@ PASS 3 Running Time: 00:00:07 Percent passed: 100.0% Log directory: /var/tmp/test_results/20120923T180654 -.fi -.in -2 - -.LP -\fBExample 2\fR Creating a \fIrunfile\fR for future use. -.sp -.LP -This example demonstrates creating a \fIrunfile\fR with non default options. - -.sp -.in +2 -.nf -% \fBrun -p setup -x root -g -w new-tests.run new-tests\fR -% \fBcat new-tests.run\fR +.Ed +.It Sy Example 2 : No Creating a Ar runfile No for future use. +This example demonstrates creating a +.Ar runfile +with non-default options. +.Bd -literal +.No % Nm run Fl p Ar setup Fl x Ar root Fl g Fl w Ar new-tests.run Ar new-tests +.No % Nm cat Pa new-tests.run [DEFAULT] pre = setup post_user = @@ -338,33 +287,8 @@ outputdir = /var/tmp/test_results [/home/jkennedy/new-tests] tests = ['test-01', 'test-02', 'test-03'] -.fi -.in -2 - -.SH EXIT STATUS -.sp -.LP -The following exit values are returned: -.sp -.ne 2 -.na -\fB\fB0\fR\fR -.ad -.sp .6 -.RS 4n -Successful completion. -.RE -.sp -.ne 2 -.na -\fB\fB1\fR\fR -.ad -.sp .6 -.RS 4n -An error occurred. -.RE - -.SH SEE ALSO -.sp -.LP -\fBsudo\fR(1m) +.Ed +.El +. 
+.Sh SEE ALSO +.Xr sudo 1m diff --git a/tests/zfs-tests/callbacks/Makefile.am b/tests/zfs-tests/callbacks/Makefile.am index 30e8472411..512a737bb5 100644 --- a/tests/zfs-tests/callbacks/Makefile.am +++ b/tests/zfs-tests/callbacks/Makefile.am @@ -1,5 +1,6 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/callbacks dist_pkgdata_SCRIPTS = \ + zfs_failsafe.ksh \ zfs_dbgmsg.ksh \ zfs_dmesg.ksh \ zfs_mmp.ksh diff --git a/tests/zfs-tests/callbacks/zfs_failsafe.ksh b/tests/zfs-tests/callbacks/zfs_failsafe.ksh new file mode 100755 index 0000000000..0d14df7012 --- /dev/null +++ b/tests/zfs-tests/callbacks/zfs_failsafe.ksh @@ -0,0 +1,8 @@ +#!/bin/ksh + +# Commands to perform failsafe-critical cleanup after a test is killed. +# +# This should only be used to ensure the system is restored to a functional +# state in the event of tests being killed (preventing normal cleanup). + +zinject -c all diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 39a538d2d2..d1c29fcd1c 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -1,13 +1,16 @@ EXTRA_DIST = file_common.h SUBDIRS = \ + badsend \ + btree_test \ chg_usr_exec \ - user_ns_exec \ devname2devid \ dir_rd_update \ + draid \ file_check \ file_trunc \ file_write \ + get_diff \ largest_file \ libzfs_input_check \ mkbusy \ @@ -16,12 +19,20 @@ SUBDIRS = \ mktree \ mmap_exec \ mmap_libaio \ + mmap_seek \ mmapwrite \ nvlist_to_lua \ - randfree_file \ randwritecomp \ readmmap \ rename_dir \ rm_lnkcnt_zero_file \ - threadsappend \ + send_doall \ + stride_dd \ + threadsappend + +if BUILD_LINUX +SUBDIRS += \ + randfree_file \ + user_ns_exec \ xattrtest +endif diff --git a/tests/zfs-tests/cmd/badsend/.gitignore b/tests/zfs-tests/cmd/badsend/.gitignore new file mode 100644 index 0000000000..d2efa627aa --- /dev/null +++ b/tests/zfs-tests/cmd/badsend/.gitignore @@ -0,0 +1 @@ +/badsend diff --git a/tests/zfs-tests/cmd/badsend/Makefile.am b/tests/zfs-tests/cmd/badsend/Makefile.am new 
file mode 100644 index 0000000000..5a8946f0d4 --- /dev/null +++ b/tests/zfs-tests/cmd/badsend/Makefile.am @@ -0,0 +1,11 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = badsend + +badsend_SOURCES = badsend.c +badsend_LDADD = \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/tests/zfs-tests/cmd/badsend/badsend.c b/tests/zfs-tests/cmd/badsend/badsend.c new file mode 100644 index 0000000000..af17bc7255 --- /dev/null +++ b/tests/zfs-tests/cmd/badsend/badsend.c @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Portions Copyright 2020 iXsystems, Inc. + */ + +/* + * Test some invalid send operations with libzfs/libzfs_core. + * + * Specifying the to and from snaps in the wrong order should return EXDEV. + * We are checking that the early return doesn't accidentally leave any + * references held, so this test is designed to trigger a panic when asserts + * are verified with the bug present. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +static void +usage(const char *name) +{ + fprintf(stderr, "usage: %s snap0 snap1\n", name); + exit(EX_USAGE); +} + +int +main(int argc, char const * const argv[]) +{ + sendflags_t flags = { 0 }; + libzfs_handle_t *zhdl; + zfs_handle_t *zhp; + const char *fromfull, *tofull, *fsname, *fromsnap, *tosnap, *p; + uint64_t size; + int fd, error; + + if (argc != 3) + usage(argv[0]); + + fromfull = argv[1]; + tofull = argv[2]; + + p = strchr(fromfull, '@'); + if (p == NULL) + usage(argv[0]); + fromsnap = p + 1; + + p = strchr(tofull, '@'); + if (p == NULL) + usage(argv[0]); + tosnap = p + 1; + + fsname = strndup(tofull, p - tofull); + if (strncmp(fsname, fromfull, p - tofull) != 0) + usage(argv[0]); + + fd = open("/dev/null", O_WRONLY); + if (fd == -1) + err(EX_OSERR, "open(\"/dev/null\", O_WRONLY)"); + + zhdl = libzfs_init(); + if (zhdl == NULL) + errx(EX_OSERR, "libzfs_init(): %s", libzfs_error_init(errno)); + + zhp = zfs_open(zhdl, fsname, ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + err(EX_OSERR, "zfs_open(\"%s\")", fsname); + + /* + * Exercise EXDEV in dmu_send_obj. The error gets translated to + * EZFS_CROSSTARGET in libzfs. + */ + error = zfs_send(zhp, tosnap, fromsnap, &flags, fd, NULL, NULL, NULL); + if (error == 0 || libzfs_errno(zhdl) != EZFS_CROSSTARGET) + errx(EX_OSERR, "zfs_send(\"%s\", \"%s\") should have failed " + "with EZFS_CROSSTARGET, not %d", + tofull, fromfull, libzfs_errno(zhdl)); + printf("zfs_send(\"%s\", \"%s\"): %s\n", + tofull, fromfull, libzfs_error_description(zhdl)); + + zfs_close(zhp); + + /* + * Exercise EXDEV in dmu_send. 
+ */ + error = lzc_send_resume_redacted(fromfull, tofull, fd, 0, 0, 0, NULL); + if (error != EXDEV) + errx(EX_OSERR, "lzc_send_resume_redacted(\"%s\", \"%s\")" + " should have failed with EXDEV, not %d", + fromfull, tofull, error); + printf("lzc_send_resume_redacted(\"%s\", \"%s\"): %s\n", + fromfull, tofull, strerror(error)); + + /* + * Exercise EXDEV in dmu_send_estimate_fast. + */ + error = lzc_send_space_resume_redacted(fromfull, tofull, 0, 0, 0, 0, + NULL, fd, &size); + if (error != EXDEV) + errx(EX_OSERR, "lzc_send_space_resume_redacted(\"%s\", \"%s\")" + " should have failed with EXDEV, not %d", + fromfull, tofull, error); + printf("lzc_send_space_resume_redacted(\"%s\", \"%s\"): %s\n", + fromfull, tofull, strerror(error)); + + close(fd); + libzfs_fini(zhdl); + free((void *)fsname); + + return (EXIT_SUCCESS); +} diff --git a/tests/zfs-tests/cmd/btree_test/.gitignore b/tests/zfs-tests/cmd/btree_test/.gitignore new file mode 100644 index 0000000000..73777c4c1f --- /dev/null +++ b/tests/zfs-tests/cmd/btree_test/.gitignore @@ -0,0 +1 @@ +/btree_test diff --git a/tests/zfs-tests/cmd/btree_test/Makefile.am b/tests/zfs-tests/cmd/btree_test/Makefile.am new file mode 100644 index 0000000000..4c9a1a4cc2 --- /dev/null +++ b/tests/zfs-tests/cmd/btree_test/Makefile.am @@ -0,0 +1,32 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. 
+# + +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib/libspl/include + +# Unconditionally enable ASSERTs +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +pkgexec_PROGRAMS = btree_test +btree_test_SOURCES = btree_test.c + +btree_test_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la diff --git a/tests/zfs-tests/cmd/btree_test/btree_test.c b/tests/zfs-tests/cmd/btree_test/btree_test.c new file mode 100644 index 0000000000..8de14ff2a2 --- /dev/null +++ b/tests/zfs-tests/cmd/btree_test/btree_test.c @@ -0,0 +1,554 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#define BUFSIZE 256 + +int seed = 0; +int stress_timeout = 180; +int contents_frequency = 100; +int tree_limit = 64 * 1024; +boolean_t stress_only = B_FALSE; + +static void +usage(int exit_value) +{ + (void) fprintf(stderr, "Usage:\tbtree_test -n \n"); + (void) fprintf(stderr, "\tbtree_test -s [-r ] [-l ] " + "[-t timeout>] [-c check_contents]\n"); + (void) fprintf(stderr, "\tbtree_test [-r ] [-l ] " + "[-t timeout>] [-c check_contents]\n"); + (void) fprintf(stderr, "\n With the -n option, run the named " + "negative test. With the -s option,\n"); + (void) fprintf(stderr, " run the stress test according to the " + "other options passed. 
With\n"); + (void) fprintf(stderr, " neither, run all the positive tests, " + "including the stress test with\n"); + (void) fprintf(stderr, " the default options.\n"); + (void) fprintf(stderr, "\n Options that control the stress test\n"); + (void) fprintf(stderr, "\t-c stress iterations after which to compare " + "tree contents [default: 100]\n"); + (void) fprintf(stderr, "\t-l the largest value to allow in the tree " + "[default: 1M]\n"); + (void) fprintf(stderr, "\t-r random seed [default: from " + "gettimeofday()]\n"); + (void) fprintf(stderr, "\t-t seconds to let the stress test run " + "[default: 180]\n"); + exit(exit_value); +} + +typedef struct int_node { + avl_node_t node; + uint64_t data; +} int_node_t; + +/* + * Utility functions + */ + +static int +avl_compare(const void *v1, const void *v2) +{ + const int_node_t *n1 = v1; + const int_node_t *n2 = v2; + uint64_t a = n1->data; + uint64_t b = n2->data; + + return (TREE_CMP(a, b)); +} + +static int +zfs_btree_compare(const void *v1, const void *v2) +{ + const uint64_t *a = v1; + const uint64_t *b = v2; + + return (TREE_CMP(*a, *b)); +} + +static void +verify_contents(avl_tree_t *avl, zfs_btree_t *bt) +{ + static int count = 0; + zfs_btree_index_t bt_idx = {0}; + int_node_t *node; + uint64_t *data; + + boolean_t forward = count % 2 == 0 ? 
B_TRUE : B_FALSE; + count++; + + ASSERT3U(avl_numnodes(avl), ==, zfs_btree_numnodes(bt)); + if (forward == B_TRUE) { + node = avl_first(avl); + data = zfs_btree_first(bt, &bt_idx); + } else { + node = avl_last(avl); + data = zfs_btree_last(bt, &bt_idx); + } + + while (node != NULL) { + ASSERT3U(*data, ==, node->data); + if (forward == B_TRUE) { + data = zfs_btree_next(bt, &bt_idx, &bt_idx); + node = AVL_NEXT(avl, node); + } else { + data = zfs_btree_prev(bt, &bt_idx, &bt_idx); + node = AVL_PREV(avl, node); + } + } +} + +static void +verify_node(avl_tree_t *avl, zfs_btree_t *bt, int_node_t *node) +{ + zfs_btree_index_t bt_idx = {0}; + zfs_btree_index_t bt_idx2 = {0}; + int_node_t *inp; + uint64_t data = node->data; + uint64_t *rv = NULL; + + ASSERT3U(avl_numnodes(avl), ==, zfs_btree_numnodes(bt)); + ASSERT3P((rv = (uint64_t *)zfs_btree_find(bt, &data, &bt_idx)), !=, + NULL); + ASSERT3S(*rv, ==, data); + ASSERT3P(zfs_btree_get(bt, &bt_idx), !=, NULL); + ASSERT3S(data, ==, *(uint64_t *)zfs_btree_get(bt, &bt_idx)); + + if ((inp = AVL_NEXT(avl, node)) != NULL) { + ASSERT3P((rv = zfs_btree_next(bt, &bt_idx, &bt_idx2)), !=, + NULL); + ASSERT3P(rv, ==, zfs_btree_get(bt, &bt_idx2)); + ASSERT3S(inp->data, ==, *rv); + } else { + ASSERT3U(data, ==, *(uint64_t *)zfs_btree_last(bt, &bt_idx)); + } + + if ((inp = AVL_PREV(avl, node)) != NULL) { + ASSERT3P((rv = zfs_btree_prev(bt, &bt_idx, &bt_idx2)), !=, + NULL); + ASSERT3P(rv, ==, zfs_btree_get(bt, &bt_idx2)); + ASSERT3S(inp->data, ==, *rv); + } else { + ASSERT3U(data, ==, *(uint64_t *)zfs_btree_first(bt, &bt_idx)); + } +} + +/* + * Tests + */ + +/* Verify that zfs_btree_find works correctly with a NULL index. */ +static int +find_without_index(zfs_btree_t *bt, char *why) +{ + u_longlong_t *p, i = 12345; + + zfs_btree_add(bt, &i); + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, NULL)) == NULL || + *p != i) { + snprintf(why, BUFSIZE, "Unexpectedly found %llu\n", + p == NULL ? 
0 : *p); + return (1); + } + + i++; + + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, NULL)) != NULL) { + snprintf(why, BUFSIZE, "Found bad value: %llu\n", *p); + return (1); + } + + return (0); +} + +/* Verify simple insertion and removal from the tree. */ +static int +insert_find_remove(zfs_btree_t *bt, char *why) +{ + u_longlong_t *p, i = 12345; + zfs_btree_index_t bt_idx = {0}; + + /* Insert 'i' into the tree, and attempt to find it again. */ + zfs_btree_add(bt, &i); + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, &bt_idx)) == NULL) { + snprintf(why, BUFSIZE, "Didn't find value in tree\n"); + return (1); + } else if (*p != i) { + snprintf(why, BUFSIZE, "Found (%llu) in tree\n", *p); + return (1); + } + ASSERT3S(zfs_btree_numnodes(bt), ==, 1); + zfs_btree_verify(bt); + + /* Remove 'i' from the tree, and verify it is not found. */ + zfs_btree_remove(bt, &i); + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + snprintf(why, BUFSIZE, "Found removed value (%llu)\n", *p); + return (1); + } + ASSERT3S(zfs_btree_numnodes(bt), ==, 0); + zfs_btree_verify(bt); + + return (0); +} + +/* + * Add a number of random entries into a btree and avl tree. Then walk them + * backwards and forwards while emptying the tree, verifying the trees look + * the same. 
+ */ +static int +drain_tree(zfs_btree_t *bt, char *why) +{ + uint64_t *p; + avl_tree_t avl; + int i = 0; + int_node_t *node; + avl_index_t avl_idx = {0}; + zfs_btree_index_t bt_idx = {0}; + + avl_create(&avl, avl_compare, sizeof (int_node_t), + offsetof(int_node_t, node)); + + /* Fill both trees with the same data */ + for (i = 0; i < 64 * 1024; i++) { + void *ret; + + u_longlong_t randval = random(); + node = malloc(sizeof (int_node_t)); + if ((p = (uint64_t *)zfs_btree_find(bt, &randval, &bt_idx)) != + NULL) { + continue; + } + zfs_btree_add_idx(bt, &randval, &bt_idx); + + node->data = randval; + if ((ret = avl_find(&avl, node, &avl_idx)) != NULL) { + snprintf(why, BUFSIZE, "Found in avl: %llu\n", randval); + return (1); + } + avl_insert(&avl, node, avl_idx); + } + + /* Remove data from either side of the trees, comparing the data */ + while (avl_numnodes(&avl) != 0) { + uint64_t *data; + + ASSERT3U(avl_numnodes(&avl), ==, zfs_btree_numnodes(bt)); + if (avl_numnodes(&avl) % 2 == 0) { + node = avl_first(&avl); + data = zfs_btree_first(bt, &bt_idx); + } else { + node = avl_last(&avl); + data = zfs_btree_last(bt, &bt_idx); + } + ASSERT3U(node->data, ==, *data); + zfs_btree_remove_idx(bt, &bt_idx); + avl_remove(&avl, node); + + if (avl_numnodes(&avl) == 0) { + break; + } + + node = avl_first(&avl); + ASSERT3U(node->data, ==, + *(uint64_t *)zfs_btree_first(bt, NULL)); + node = avl_last(&avl); + ASSERT3U(node->data, ==, *(uint64_t *)zfs_btree_last(bt, NULL)); + } + ASSERT3S(zfs_btree_numnodes(bt), ==, 0); + + void *avl_cookie = NULL; + while ((node = avl_destroy_nodes(&avl, &avl_cookie)) != NULL) + free(node); + avl_destroy(&avl); + + return (0); +} + +/* + * This test uses an avl and btree, and continually processes new random + * values. Each value is either removed or inserted, depending on whether + * or not it is found in the tree. The test periodically checks that both + * trees have the same data and does consistency checks. 
This stress + * option can also be run on its own from the command line. + */ +static int +stress_tree(zfs_btree_t *bt, char *why) +{ + avl_tree_t avl; + int_node_t *node; + struct timeval tp; + time_t t0; + int insertions = 0, removals = 0, iterations = 0; + u_longlong_t max = 0, min = UINT64_MAX; + + (void) gettimeofday(&tp, NULL); + t0 = tp.tv_sec; + + avl_create(&avl, avl_compare, sizeof (int_node_t), + offsetof(int_node_t, node)); + + while (1) { + zfs_btree_index_t bt_idx = {0}; + avl_index_t avl_idx = {0}; + + uint64_t randval = random() % tree_limit; + node = malloc(sizeof (*node)); + node->data = randval; + + max = randval > max ? randval : max; + min = randval < min ? randval : min; + + void *ret = avl_find(&avl, node, &avl_idx); + if (ret == NULL) { + insertions++; + avl_insert(&avl, node, avl_idx); + ASSERT3P(zfs_btree_find(bt, &randval, &bt_idx), ==, + NULL); + zfs_btree_add_idx(bt, &randval, &bt_idx); + verify_node(&avl, bt, node); + } else { + removals++; + verify_node(&avl, bt, ret); + zfs_btree_remove(bt, &randval); + avl_remove(&avl, ret); + free(ret); + free(node); + } + + zfs_btree_verify(bt); + + iterations++; + if (iterations % contents_frequency == 0) { + verify_contents(&avl, bt); + } + + zfs_btree_verify(bt); + + (void) gettimeofday(&tp, NULL); + if (tp.tv_sec > t0 + stress_timeout) { + fprintf(stderr, "insertions/removals: %u/%u\nmax/min: " + "%llu/%llu\n", insertions, removals, max, min); + break; + } + } + + void *avl_cookie = NULL; + while ((node = avl_destroy_nodes(&avl, &avl_cookie)) != NULL) + free(node); + avl_destroy(&avl); + + if (stress_only) { + zfs_btree_index_t *idx = NULL; + uint64_t *rv; + + while ((rv = zfs_btree_destroy_nodes(bt, &idx)) != NULL) + ; + zfs_btree_verify(bt); + } + + return (0); +} + +/* + * Verify inserting a duplicate value will cause a crash. + * Note: negative test; return of 0 is a failure. 
+ */ +static int +insert_duplicate(zfs_btree_t *bt) +{ + uint64_t *p, i = 23456; + zfs_btree_index_t bt_idx = {0}; + + if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + fprintf(stderr, "Found value in empty tree.\n"); + return (0); + } + zfs_btree_add_idx(bt, &i, &bt_idx); + if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) == NULL) { + fprintf(stderr, "Did not find expected value.\n"); + return (0); + } + + /* Crash on inserting a duplicate */ + zfs_btree_add_idx(bt, &i, NULL); + + return (0); +} + +/* + * Verify removing a non-existent value will cause a crash. + * Note: negative test; return of 0 is a failure. + */ +static int +remove_missing(zfs_btree_t *bt) +{ + uint64_t *p, i = 23456; + zfs_btree_index_t bt_idx = {0}; + + if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + fprintf(stderr, "Found value in empty tree.\n"); + return (0); + } + + /* Crash removing a nonexistent entry */ + zfs_btree_remove(bt, &i); + + return (0); +} + +static int +do_negative_test(zfs_btree_t *bt, char *test_name) +{ + int rval = 0; + struct rlimit rlim = {0}; + setrlimit(RLIMIT_CORE, &rlim); + + if (strcmp(test_name, "insert_duplicate") == 0) { + rval = insert_duplicate(bt); + } else if (strcmp(test_name, "remove_missing") == 0) { + rval = remove_missing(bt); + } + + /* + * Return 0, since callers will expect non-zero return values for + * these tests, and we should have crashed before getting here anyway. 
+ */ + (void) fprintf(stderr, "Test: %s returned %d.\n", test_name, rval); + return (0); +} + +typedef struct btree_test { + const char *name; + int (*func)(zfs_btree_t *, char *); +} btree_test_t; + +static btree_test_t test_table[] = { + { "insert_find_remove", insert_find_remove }, + { "find_without_index", find_without_index }, + { "drain_tree", drain_tree }, + { "stress_tree", stress_tree }, + { NULL, NULL } +}; + +int +main(int argc, char *argv[]) +{ + char *negative_test = NULL; + int failed_tests = 0; + struct timeval tp; + zfs_btree_t bt; + int c; + + while ((c = getopt(argc, argv, "c:l:n:r:st:")) != -1) { + switch (c) { + case 'c': + contents_frequency = atoi(optarg); + break; + case 'l': + tree_limit = atoi(optarg); + break; + case 'n': + negative_test = optarg; + break; + case 'r': + seed = atoi(optarg); + break; + case 's': + stress_only = B_TRUE; + break; + case 't': + stress_timeout = atoi(optarg); + break; + case 'h': + default: + usage(1); + break; + } + } + argc -= optind; + argv += optind; + optind = 1; + + + if (seed == 0) { + (void) gettimeofday(&tp, NULL); + seed = tp.tv_sec; + } + srandom(seed); + + zfs_btree_init(); + zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t)); + + /* + * This runs the named negative test. None of them should + * return, as they both cause crashes. + */ + if (negative_test) { + return (do_negative_test(&bt, negative_test)); + } + + fprintf(stderr, "Seed: %u\n", seed); + + /* + * This is a stress test that does operations on a btree over the + * requested timeout period, verifying them against identical + * operations in an avl tree. 
+ */ + if (stress_only != 0) { + return (stress_tree(&bt, NULL)); + } + + /* Do the positive tests */ + btree_test_t *test = &test_table[0]; + while (test->name) { + int retval; + uint64_t *rv; + char why[BUFSIZE] = {0}; + zfs_btree_index_t *idx = NULL; + + (void) fprintf(stdout, "%-20s", test->name); + retval = test->func(&bt, why); + + if (retval == 0) { + (void) fprintf(stdout, "ok\n"); + } else { + (void) fprintf(stdout, "failed with %d\n", retval); + if (strlen(why) != 0) + (void) fprintf(stdout, "\t%s\n", why); + why[0] = '\0'; + failed_tests++; + } + + /* Remove all the elements and re-verify the tree */ + while ((rv = zfs_btree_destroy_nodes(&bt, &idx)) != NULL) + ; + zfs_btree_verify(&bt); + + test++; + } + + zfs_btree_verify(&bt); + zfs_btree_fini(); + + return (failed_tests); +} diff --git a/tests/zfs-tests/cmd/devname2devid/Makefile.am b/tests/zfs-tests/cmd/devname2devid/Makefile.am index a8991bb781..b8b630dc2d 100644 --- a/tests/zfs-tests/cmd/devname2devid/Makefile.am +++ b/tests/zfs-tests/cmd/devname2devid/Makefile.am @@ -5,5 +5,6 @@ pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin if WANT_DEVNAME2DEVID pkgexec_PROGRAMS = devname2devid devname2devid_SOURCES = devname2devid.c -devname2devid_LDADD = -ludev +devname2devid_CFLAGS = $(AM_CFLAGS) $(LIBUDEV_CFLAGS) +devname2devid_LDADD = $(LIBUDEV_LIBS) endif diff --git a/tests/zfs-tests/cmd/draid/.gitignore b/tests/zfs-tests/cmd/draid/.gitignore new file mode 100644 index 0000000000..911b9f0778 --- /dev/null +++ b/tests/zfs-tests/cmd/draid/.gitignore @@ -0,0 +1 @@ +/draid diff --git a/tests/zfs-tests/cmd/draid/Makefile.am b/tests/zfs-tests/cmd/draid/Makefile.am new file mode 100644 index 0000000000..69fed7a6be --- /dev/null +++ b/tests/zfs-tests/cmd/draid/Makefile.am @@ -0,0 +1,15 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +AM_CFLAGS += $(ZLIB_CFLAGS) + +pkgexec_PROGRAMS = draid + +draid_SOURCES = draid.c + +draid_LDADD = \ + 
$(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +draid_LDADD += $(ZLIB_LIBS) diff --git a/tests/zfs-tests/cmd/draid/draid.c b/tests/zfs-tests/cmd/draid/draid.c new file mode 100644 index 0000000000..57261348b3 --- /dev/null +++ b/tests/zfs-tests/cmd/draid/draid.c @@ -0,0 +1,1414 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include + +/* + * The number of rows to generate for new permutation maps. + */ +#define MAP_ROWS_DEFAULT 256 + +/* + * Key values for dRAID maps when stored as nvlists. 
+ */ +#define MAP_SEED "seed" +#define MAP_CHECKSUM "checksum" +#define MAP_WORST_RATIO "worst_ratio" +#define MAP_AVG_RATIO "avg_ratio" +#define MAP_CHILDREN "children" +#define MAP_NPERMS "nperms" +#define MAP_PERMS "perms" + +static void +draid_usage(void) +{ + (void) fprintf(stderr, + "usage: draid command args ...\n" + "Available commands are:\n" + "\n" + "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" + "\tdraid verify [-rv] FILE\n" + "\tdraid dump [-v] [-m min] [-n max] FILE\n" + "\tdraid table FILE\n" + "\tdraid merge FILE SRC SRC...\n"); + exit(1); +} + +static int +read_map(const char *filename, nvlist_t **allcfgs) +{ + int block_size = 131072; + int buf_size = 131072; + int tmp_size, error; + char *tmp_buf; + + struct stat64 stat; + if (lstat64(filename, &stat) != 0) + return (errno); + + if (stat.st_size == 0 || + !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { + return (EINVAL); + } + + gzFile fp = gzopen(filename, "rb"); + if (fp == Z_NULL) + return (errno); + + char *buf = malloc(buf_size); + if (buf == NULL) { + (void) gzclose(fp); + return (ENOMEM); + } + + ssize_t rc, bytes = 0; + while (!gzeof(fp)) { + rc = gzread(fp, buf + bytes, block_size); + if ((rc < 0) || (rc == 0 && !gzeof(fp))) { + free(buf); + (void) gzclose(fp); + (void) gzerror(fp, &error); + return (error); + } else { + bytes += rc; + + if (bytes + block_size >= buf_size) { + tmp_size = 2 * buf_size; + tmp_buf = malloc(tmp_size); + if (tmp_buf == NULL) { + free(buf); + (void) gzclose(fp); + return (ENOMEM); + } + + memcpy(tmp_buf, buf, bytes); + free(buf); + buf = tmp_buf; + buf_size = tmp_size; + } + } + } + + (void) gzclose(fp); + + error = nvlist_unpack(buf, bytes, allcfgs, 0); + free(buf); + + return (error); +} + +/* + * Read a map from the specified filename. A file contains multiple maps + * which are indexed by the number of children. The caller is responsible + * for freeing the configuration returned. 
+ */ +static int +read_map_key(const char *filename, char *key, nvlist_t **cfg) +{ + nvlist_t *allcfgs, *foundcfg = NULL; + int error; + + error = read_map(filename, &allcfgs); + if (error != 0) + return (error); + + nvlist_lookup_nvlist(allcfgs, key, &foundcfg); + if (foundcfg != NULL) { + nvlist_dup(foundcfg, cfg, KM_SLEEP); + error = 0; + } else { + error = ENOENT; + } + + nvlist_free(allcfgs); + + return (error); +} + +/* + * Write all mappings to the map file. + */ +static int +write_map(const char *filename, nvlist_t *allcfgs) +{ + size_t buflen = 0; + int error; + + error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); + if (error) + return (error); + + char *buf = malloc(buflen); + if (buf == NULL) + return (ENOMEM); + + error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + if (error) { + free(buf); + return (error); + } + + /* + * Atomically update the file using a temporary file and the + * traditional unlink then rename steps. This code provides + * no locking, it only guarantees the packed nvlist on disk + * is updated atomically and is internally consistent. 
+ */ + char *tmpname = calloc(MAXPATHLEN, 1); + if (tmpname == NULL) { + free(buf); + return (ENOMEM); + } + + snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); + + int fd = mkstemp(tmpname); + if (fd < 0) { + error = errno; + free(buf); + free(tmpname); + return (error); + } + (void) close(fd); + + gzFile fp = gzopen(tmpname, "w9b"); + if (fp == Z_NULL) { + error = errno; + free(buf); + free(tmpname); + return (errno); + } + + ssize_t rc, bytes = 0; + while (bytes < buflen) { + size_t size = MIN(buflen - bytes, 131072); + rc = gzwrite(fp, buf + bytes, size); + if (rc < 0) { + free(buf); + (void) gzerror(fp, &error); + (void) gzclose(fp); + (void) unlink(tmpname); + free(tmpname); + return (error); + } else if (rc == 0) { + break; + } else { + bytes += rc; + } + } + + free(buf); + (void) gzclose(fp); + + if (bytes != buflen) { + (void) unlink(tmpname); + free(tmpname); + return (EIO); + } + + /* + * Unlink the previous config file and replace it with the updated + * version. If we're able to unlink the file then directory is + * writable by us and the subsequent rename should never fail. + */ + error = unlink(filename); + if (error != 0 && errno != ENOENT) { + error = errno; + (void) unlink(tmpname); + free(tmpname); + return (error); + } + + error = rename(tmpname, filename); + if (error != 0) { + error = errno; + (void) unlink(tmpname); + free(tmpname); + return (error); + } + + free(tmpname); + + return (0); +} + +/* + * Add the dRAID map to the file and write it out. + */ +static int +write_map_key(const char *filename, char *key, draid_map_t *map, + double worst_ratio, double avg_ratio) +{ + nvlist_t *nv_cfg, *allcfgs; + int error; + + /* + * Add the configuration to an existing or new file. The new + * configuration will replace an existing configuration with the + * same key if it has a lower ratio and is therefore better. 
+ */ + error = read_map(filename, &allcfgs); + if (error == ENOENT) { + allcfgs = fnvlist_alloc(); + } else if (error != 0) { + return (error); + } + + error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); + if (error == 0) { + uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, + MAP_WORST_RATIO); + double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; + + if (worst_ratio < nv_worst_ratio) { + /* Replace old map with the more balanced new map. */ + fnvlist_remove(allcfgs, key); + } else { + /* The old map is preferable, keep it. */ + nvlist_free(allcfgs); + return (EEXIST); + } + } + + nvlist_t *cfg = fnvlist_alloc(); + fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); + fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); + fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); + fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); + fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, + map->dm_children * map->dm_nperms * sizeof (uint8_t)); + + fnvlist_add_uint64(cfg, MAP_WORST_RATIO, + (uint64_t)(worst_ratio * 1000.0)); + fnvlist_add_uint64(cfg, MAP_AVG_RATIO, + (uint64_t)(avg_ratio * 1000.0)); + + error = nvlist_add_nvlist(allcfgs, key, cfg); + if (error == 0) + error = write_map(filename, allcfgs); + + nvlist_free(cfg); + nvlist_free(allcfgs); + return (error); +} + +static void +dump_map(draid_map_t *map, char *key, double worst_ratio, double avg_ratio, + int verbose) +{ + if (verbose == 0) { + return; + } else if (verbose == 1) { + printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " + "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, + worst_ratio, avg_ratio); + return; + } else { + printf(" \"%s\":\n" + " seed: 0x%016llx\n" + " checksum: 0x%016llx\n" + " worst_ratio: %2.03f\n" + " avg_ratio: %2.03f\n" + " children: %llu\n" + " nperms: %llu\n", + key, (u_longlong_t)map->dm_seed, + (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, + (u_longlong_t)map->dm_children, + (u_longlong_t)map->dm_nperms); + + if (verbose > 2) { + 
printf(" perms = {\n"); + for (int i = 0; i < map->dm_nperms; i++) { + printf(" { "); + for (int j = 0; j < map->dm_children; j++) { + printf("%3d%s ", map->dm_perms[ + i * map->dm_children + j], + j < map->dm_children - 1 ? + "," : ""); + } + printf(" },\n"); + } + printf(" }\n"); + } else if (verbose == 2) { + printf(" draid_perms = \n"); + } + } +} + +static void +dump_map_nv(char *key, nvlist_t *cfg, int verbose) +{ + draid_map_t map; + uint_t c; + + uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); + uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); + + map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); + map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); + map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); + map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); + nvlist_lookup_uint8_array(cfg, MAP_PERMS, &map.dm_perms, &c); + + dump_map(&map, key, (double)worst_ratio / 1000.0, + avg_ratio / 1000.0, verbose); +} + +/* + * Print a summary of the mapping. + */ +static int +dump_map_key(const char *filename, char *key, int verbose) +{ + nvlist_t *cfg; + int error; + + error = read_map_key(filename, key, &cfg); + if (error != 0) + return (error); + + dump_map_nv(key, cfg, verbose); + + return (0); +} + +/* + * Allocate a new permutation map for evaluation. + */ +static int +alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, + draid_map_t **mapp) +{ + draid_map_t *map; + int error; + + map = malloc(sizeof (draid_map_t)); + if (map == NULL) + return (ENOMEM); + + map->dm_children = children; + map->dm_nperms = nperms; + map->dm_seed = seed; + map->dm_checksum = 0; + + error = vdev_draid_generate_perms(map, &map->dm_perms); + if (error) { + free(map); + return (error); + } + + *mapp = map; + + return (0); +} + +/* + * Allocate the fixed permutation map for N children. 
+ */ +static int +alloc_fixed_map(uint64_t children, draid_map_t **mapp) +{ + const draid_map_t *fixed_map; + draid_map_t *map; + int error; + + error = vdev_draid_lookup_map(children, &fixed_map); + if (error) + return (error); + + map = malloc(sizeof (draid_map_t)); + if (map == NULL) + return (ENOMEM); + + memcpy(map, fixed_map, sizeof (draid_map_t)); + VERIFY3U(map->dm_checksum, !=, 0); + + error = vdev_draid_generate_perms(map, &map->dm_perms); + if (error) { + free(map); + return (error); + } + + *mapp = map; + + return (0); +} + +/* + * Free a permutation map. + */ +static void +free_map(draid_map_t *map) +{ + free(map->dm_perms); + free(map); +} + +/* + * Check if dev is in the provided list of faulted devices. + */ +static inline boolean_t +is_faulted(int *faulted_devs, int nfaulted, int dev) +{ + for (int i = 0; i < nfaulted; i++) + if (faulted_devs[i] == dev) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Evaluate how resilvering I/O will be distributed given a list of faulted + * vdevs. As a simplification we assume one IO is sufficient to repair each + * damaged device in a group. + */ +static double +eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, + int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) +{ + uint64_t children = map->dm_children; + uint64_t ngroups = 1; + uint64_t ndisks = children - nspares; + + /* + * Calculate the minimum number of groups required to fill a slice. 
+ */ + while (ngroups * (groupwidth) % (children - nspares) != 0) + ngroups++; + + int *ios = calloc(map->dm_children, sizeof (uint64_t)); + + /* Resilver all rows */ + for (int i = 0; i < map->dm_nperms; i++) { + uint8_t *row = &map->dm_perms[i * map->dm_children]; + + /* Resilver all groups with faulted drives */ + for (int j = 0; j < ngroups; j++) { + uint64_t spareidx = map->dm_children - nspares; + boolean_t repair_needed = B_FALSE; + + /* See if any devices in this group are faulted */ + uint64_t groupstart = (j * groupwidth) % ndisks; + + for (int k = 0; k < groupwidth; k++) { + uint64_t groupidx = (groupstart + k) % ndisks; + + repair_needed = is_faulted(faulted_devs, + nfaulted, row[groupidx]); + if (repair_needed) + break; + } + + if (repair_needed == B_FALSE) + continue; + + /* + * This group is degraded. Calculate the number of + * reads the non-faulted drives require and the number + * of writes to the distributed hot spare for this row. + */ + for (int k = 0; k < groupwidth; k++) { + uint64_t groupidx = (groupstart + k) % ndisks; + + if (!is_faulted(faulted_devs, nfaulted, + row[groupidx])) { + ios[row[groupidx]]++; + } else if (nspares > 0) { + while (is_faulted(faulted_devs, + nfaulted, row[spareidx])) { + spareidx++; + } + + ASSERT3U(spareidx, <, map->dm_children); + ios[row[spareidx]]++; + spareidx++; + } + } + } + } + + *min_child_ios = INT_MAX; + *max_child_ios = 0; + + /* + * Find the drives with fewest and most required I/O. These values + * are used to calculate the imbalance ratio. To avoid returning an + * infinite value for permutations which have children that perform + * no IO a floor of 1 IO per child is set. This ensures a meaningful + * ratio is returned for comparison and it is not an uncommon when + * there are a large number of children. 
+ */ + for (int i = 0; i < map->dm_children; i++) { + + if (is_faulted(faulted_devs, nfaulted, i)) { + ASSERT0(ios[i]); + continue; + } + + if (ios[i] == 0) + ios[i] = 1; + + if (ios[i] < *min_child_ios) + *min_child_ios = ios[i]; + + if (ios[i] > *max_child_ios) + *max_child_ios = ios[i]; + } + + ASSERT3S(*min_child_ios, !=, INT_MAX); + ASSERT3S(*max_child_ios, !=, 0); + + double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); + + free(ios); + + return (ratio); +} + +/* + * Evaluate the quality of the permutation mapping by considering possible + * device failures. Returns the imbalance ratio for the worst mapping which + * is defined to be the largest number of child IOs over the fewest number + * child IOs. A value of 1.0 indicates the mapping is perfectly balance and + * all children perform an equal amount of work during reconstruction. + */ +static void +eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) +{ + uint64_t children = map->dm_children; + double worst_ratio = 1.0; + double sum = 0; + int worst_min_ios = 0, worst_max_ios = 0; + int n = 0; + + /* + * When there are only 2 children there can be no distributed + * spare and no resilver to evaluate. Default to a ratio of 1.0 + * for this degenerate case. + */ + if (children == VDEV_DRAID_MIN_CHILDREN) { + *worst_ratiop = 1.0; + *avg_ratiop = 1.0; + return; + } + + /* + * Score the mapping as if it had either 1 or 2 distributed spares. + */ + for (int nspares = 1; nspares <= 2; nspares++) { + uint64_t faults = nspares; + + /* + * Score groupwidths up to 19. This value was chosen as the + * largest reasonable width (16d+3p). dRAID pools may be still + * be created with wider stripes but they are not considered in + * this analysis in order to optimize for the most common cases. + */ + for (uint64_t groupwidth = 2; + groupwidth <= MIN(children - nspares, 19); + groupwidth++) { + int faulted_devs[2]; + int min_ios, max_ios; + + /* + * Score possible devices faults. 
This is limited + * to exactly one fault per distributed spare for + * the purposes of this similation. + */ + for (int f1 = 0; f1 < children; f1++) { + faulted_devs[0] = f1; + double ratio; + + if (faults == 1) { + ratio = eval_resilver(map, groupwidth, + nspares, faulted_devs, faults, + &min_ios, &max_ios); + + if (ratio > worst_ratio) { + worst_ratio = ratio; + worst_min_ios = min_ios; + worst_max_ios = max_ios; + } + + sum += ratio; + n++; + } else if (faults == 2) { + for (int f2 = f1 + 1; f2 < children; + f2++) { + faulted_devs[1] = f2; + + ratio = eval_resilver(map, + groupwidth, nspares, + faulted_devs, faults, + &min_ios, &max_ios); + + if (ratio > worst_ratio) { + worst_ratio = ratio; + worst_min_ios = min_ios; + worst_max_ios = max_ios; + } + + sum += ratio; + n++; + } + } + } + } + } + + *worst_ratiop = worst_ratio; + *avg_ratiop = sum / n; + + /* + * Log the min/max io values for particularly unbalanced maps. + * Since the maps are generated entirely randomly these are possible + * be exceedingly unlikely. We log it for possible investigation. + */ + if (worst_ratio > 100.0) { + dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); + printf("worst_min_ios=%d worst_max_ios=%d\n", + worst_min_ios, worst_max_ios); + } +} + +static int +eval_maps(uint64_t children, int passes, uint64_t *map_seed, + draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) +{ + draid_map_t *best_map = NULL; + double best_worst_ratio = 1000.0; + double best_avg_ratio = 1000.0; + + /* + * Perform the requested number of passes evaluating randomly + * generated permutation maps. Only the best version is kept. + */ + for (int i = 0; i < passes; i++) { + double worst_ratio, avg_ratio; + draid_map_t *map; + int error; + + /* + * Calculate the next seed and generate a new candidate map. 
+ */ + error = alloc_new_map(children, MAP_ROWS_DEFAULT, + vdev_draid_rand(map_seed), &map); + if (error) + return (error); + + /* + * Consider maps with a lower worst_ratio to be of higher + * quality. Some maps may have a lower avg_ratio but they + * are discarded since they might include some particularly + * imbalanced permutations. The average is tracked to in + * order to get a sense of the average permutation quality. + */ + eval_decluster(map, &worst_ratio, &avg_ratio); + + if (best_map == NULL || worst_ratio < best_worst_ratio) { + + if (best_map != NULL) + free_map(best_map); + + best_map = map; + best_worst_ratio = worst_ratio; + best_avg_ratio = avg_ratio; + } else { + free_map(map); + } + } + + /* + * After determining the best map generate a checksum over the full + * permutation array. This checksum is verified when opening a dRAID + * pool to ensure the generated in memory permutations are correct. + */ + zio_cksum_t cksum; + fletcher_4_native_varsize(best_map->dm_perms, + sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, + &cksum); + best_map->dm_checksum = cksum.zc_word[0]; + + *best_mapp = best_map; + *best_ratiop = best_worst_ratio; + *avg_ratiop = best_avg_ratio; + + return (0); +} + +static int +draid_generate(int argc, char *argv[]) +{ + char filename[MAXPATHLEN]; + uint64_t map_seed; + int c, fd, error, verbose = 0, passes = 1, continuous = 0; + int min_children = VDEV_DRAID_MIN_CHILDREN; + int max_children = VDEV_DRAID_MAX_CHILDREN; + int restarts = 0; + + while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { + switch (c) { + case 'c': + continuous++; + break; + case 'm': + min_children = (int)strtol(optarg, NULL, 0); + if (min_children < VDEV_DRAID_MIN_CHILDREN) { + (void) fprintf(stderr, "A minimum of 2 " + "children are required.\n"); + return (1); + } + + break; + case 'n': + max_children = (int)strtol(optarg, NULL, 0); + if (max_children > VDEV_DRAID_MAX_CHILDREN) { + (void) fprintf(stderr, "A maximum of %d " + "children 
are allowed.\n", + VDEV_DRAID_MAX_CHILDREN); + return (1); + } + break; + case 'p': + passes = (int)strtol(optarg, NULL, 0); + break; + case 'v': + /* + * 0 - Only log when a better map is added to the file. + * 1 - Log the current best map for each child count. + * Minimal output on a single summary line. + * 2 - Log the current best map for each child count. + * More verbose includes most map fields. + * 3 - Log the current best map for each child count. + * Very verbose all fields including the full map. + */ + verbose++; + break; + case ':': + (void) fprintf(stderr, + "missing argument for '%c' option\n", optopt); + draid_usage(); + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + draid_usage(); + break; + } + } + + if (argc > optind) { + bzero(filename, MAXPATHLEN); + strncpy(filename, argv[optind], MAXPATHLEN - 1); + } else { + (void) fprintf(stderr, "A FILE must be specified.\n"); + return (1); + } + +restart: + /* + * Start with a fresh seed from /dev/urandom. + */ + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); + return (1); + } else { + ssize_t bytes = sizeof (map_seed); + ssize_t bytes_read = 0; + + while (bytes_read < bytes) { + ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read, + bytes - bytes_read); + if (rc < 0) { + printf("Unable to read /dev/urandom: %s\n:", + strerror(errno)); + return (1); + } + bytes_read += rc; + } + + (void) close(fd); + } + + if (restarts == 0) + printf("Writing generated mappings to '%s':\n", filename); + + /* + * Generate maps for all requested child counts. The best map for + * each child count is written out to the specified file. If the file + * already contains a better mapping this map will not be added. 
+ */ + for (uint64_t children = min_children; + children <= max_children; children++) { + char key[8] = { 0 }; + draid_map_t *map; + double worst_ratio = 1000.0; + double avg_ratio = 1000.0; + + error = eval_maps(children, passes, &map_seed, &map, + &worst_ratio, &avg_ratio); + if (error) { + printf("Error eval_maps(): %s\n", strerror(error)); + return (1); + } + + if (worst_ratio < 1.0 || avg_ratio < 1.0) { + printf("Error ratio < 1.0: worst_ratio = %2.03f " + "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); + return (1); + } + + snprintf(key, 7, "%llu", (u_longlong_t)children); + error = write_map_key(filename, key, map, worst_ratio, + avg_ratio); + if (error == 0) { + /* The new map was added to the file. */ + dump_map(map, key, worst_ratio, avg_ratio, + MAX(verbose, 1)); + } else if (error == EEXIST) { + /* The existing map was preferable and kept. */ + if (verbose > 0) + dump_map_key(filename, key, verbose); + } else { + printf("Error write_map_key(): %s\n", strerror(error)); + return (1); + } + + free_map(map); + } + + /* + * When the continuous option is set restart at the minimum number of + * children instead of exiting. This option is useful as a mechanism + * to continuous try and refine the discovered permutations. + */ + if (continuous) { + restarts++; + printf("Restarting by request (-c): %d\n", restarts); + goto restart; + } + + return (0); +} + +/* + * Verify each map in the file by generating its in-memory permutation array + * and comfirming its checksum is correct. 
+ */
+static int
+draid_verify(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int n = 0, c, error, verbose = 1;
+	int check_ratios = 0;
+
+	/*
+	 * Options: -r additionally recomputes the worst/average ratios and
+	 * compares them with the stored values, -v raises verbosity.
+	 * Returns 0 when every map verified, non-zero otherwise.
+	 */
+	while ((c = getopt(argc, argv, ":rv")) != -1) {
+		switch (c) {
+		case 'r':
+			check_ratios++;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		char *abspath = malloc(MAXPATHLEN);
+		if (abspath == NULL)
+			return (ENOMEM);
+
+		/* Prefer the resolved path; fall back to the name as given. */
+		bzero(filename, MAXPATHLEN);
+		if (realpath(argv[optind], abspath) != NULL)
+			strncpy(filename, abspath, MAXPATHLEN - 1);
+		else
+			strncpy(filename, argv[optind], MAXPATHLEN - 1);
+
+		free(abspath);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	printf("Verifying permutation maps: '%s'\n", filename);
+
+	/*
+	 * Lookup hardcoded permutation map for each valid number of children
+	 * and verify a generated map has the correct checksum. Then compare
+	 * the generated map values with the nvlist map values read from the
+	 * reference file to cross-check the permutation.
+	 */
+	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
+	    children <= VDEV_DRAID_MAX_CHILDREN;
+	    children++) {
+		draid_map_t *map;
+		char key[8];
+
+		bzero(key, 8);
+		snprintf(key, 8, "%llu", (u_longlong_t)children);
+
+		error = alloc_fixed_map(children, &map);
+		if (error) {
+			printf("Error alloc_fixed_map() failed: %s\n",
+			    error == ECKSUM ? "Invalid checksum" :
+			    strerror(error));
+			return (1);
+		}
+
+		uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
+		uint8_t *nv_perms = NULL;
+		uint_t nv_perms_len = 0;
+		nvlist_t *cfg;
+
+		error = read_map_key(filename, key, &cfg);
+		if (error != 0) {
+			printf("Error read_map_key() failed: %s\n",
+			    strerror(error));
+			free_map(map);
+			return (1);
+		}
+
+		nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
+		nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
+		nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
+		nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
+
+		/*
+		 * Unlike the fnvlist_*() lookups above this call reports
+		 * failure through its return value, so it must be checked
+		 * before nv_perms is dereferenced.
+		 */
+		if (nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms,
+		    &nv_perms_len) != 0) {
+			printf("Error missing perms array\n");
+			error = EINVAL;
+		}
+
+		/*
+		 * Compare draid_map_t and nvlist reference values.
+		 */
+		if (map->dm_seed != nv_seed) {
+			printf("Error different seeds: 0x%016llx != "
+			    "0x%016llx\n", (u_longlong_t)map->dm_seed,
+			    (u_longlong_t)nv_seed);
+			error = EINVAL;
+		}
+
+		if (map->dm_checksum != nv_checksum) {
+			printf("Error different checksums: 0x%016llx "
+			    "!= 0x%016llx\n",
+			    (u_longlong_t)map->dm_checksum,
+			    (u_longlong_t)nv_checksum);
+			error = EINVAL;
+		}
+
+		if (map->dm_children != nv_children) {
+			printf("Error different children: %llu "
+			    "!= %llu\n", (u_longlong_t)map->dm_children,
+			    (u_longlong_t)nv_children);
+			error = EINVAL;
+		}
+
+		if (map->dm_nperms != nv_nperms) {
+			printf("Error different nperms: %llu "
+			    "!= %llu\n", (u_longlong_t)map->dm_nperms,
+			    (u_longlong_t)nv_nperms);
+			error = EINVAL;
+		}
+
+		/*
+		 * Only walk the permutation arrays once the dimensions are
+		 * known to agree and the stored array really holds that many
+		 * entries; otherwise the file-supplied bound could index
+		 * past the end of either array.
+		 * NOTE(review): assumes dm_perms holds dm_children *
+		 * dm_nperms entries, as the original loop did - confirm.
+		 */
+		if (error == 0 &&
+		    (uint64_t)nv_perms_len != nv_children * nv_nperms) {
+			printf("Error different perms length: %llu "
+			    "!= %llu\n", (u_longlong_t)nv_perms_len,
+			    (u_longlong_t)(nv_children * nv_nperms));
+			error = EINVAL;
+		}
+
+		if (error == 0) {
+			for (uint64_t i = 0; i < nv_children * nv_nperms;
+			    i++) {
+				if (map->dm_perms[i] != nv_perms[i]) {
+					printf("Error different perms[%llu]: "
+					    "%d != %d\n", (u_longlong_t)i,
+					    (int)map->dm_perms[i],
+					    (int)nv_perms[i]);
+					error = EINVAL;
+					break;
+				}
+			}
+		}
+
+		/*
+		 * For good measure recalculate the worst and average
+		 * ratios and confirm they match the nvlist values.
+		 */
+		if (check_ratios) {
+			uint64_t nv_worst_ratio, nv_avg_ratio;
+			double worst_ratio, avg_ratio;
+
+			eval_decluster(map, &worst_ratio, &avg_ratio);
+
+			nv_worst_ratio = fnvlist_lookup_uint64(cfg,
+			    MAP_WORST_RATIO);
+			nv_avg_ratio = fnvlist_lookup_uint64(cfg,
+			    MAP_AVG_RATIO);
+
+			if (worst_ratio < 1.0 || avg_ratio < 1.0) {
+				printf("Error ratio out of range %2.03f, "
+				    "%2.03f\n", worst_ratio, avg_ratio);
+				error = EINVAL;
+			}
+
+			/* Stored ratios are integers scaled by 1000. */
+			if ((uint64_t)(worst_ratio * 1000.0) !=
+			    nv_worst_ratio) {
+				printf("Error different worst_ratio %2.03f "
+				    "!= %2.03f\n", (double)nv_worst_ratio /
+				    1000.0, worst_ratio);
+				error = EINVAL;
+			}
+
+			if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
+				printf("Error different average_ratio %2.03f "
+				    "!= %2.03f\n", (double)nv_avg_ratio /
+				    1000.0, avg_ratio);
+				error = EINVAL;
+			}
+		}
+
+		if (error) {
+			free_map(map);
+			nvlist_free(cfg);
+			return (1);
+		}
+
+		if (verbose > 0) {
+			printf("- %llu children: good\n",
+			    (u_longlong_t)children);
+		}
+		n++;
+
+		free_map(map);
+		nvlist_free(cfg);
+	}
+
+	if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
+		printf("Error permutation maps missing: %d / %d checked\n",
+		    n, VDEV_DRAID_MAX_CHILDREN - 1);
+		return (1);
+	}
+
+	printf("Successfully verified %d / %d permutation maps\n",
+	    n, VDEV_DRAID_MAX_CHILDREN - 1);
+
+	return (0);
+}
+
+/*
+ * Dump the contents of the specified mapping(s) for inspection.
+ */
+static int
+draid_dump(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int c, error, verbose = 1;
+	int min_children = VDEV_DRAID_MIN_CHILDREN;
+	int max_children = VDEV_DRAID_MAX_CHILDREN;
+
+	/*
+	 * Options: -m sets the first child count to dump, -n the last one,
+	 * -v raises verbosity. Both bounds must lie within
+	 * [2, VDEV_DRAID_MAX_CHILDREN]; checking both ends of both options
+	 * keeps a negative or oversized value out of the unsigned loop
+	 * below. Returns 0 on success and 1 on any error.
+	 */
+	while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
+		switch (c) {
+		case 'm':
+			min_children = (int)strtol(optarg, NULL, 0);
+			if (min_children < 2) {
+				(void) fprintf(stderr, "A minimum of 2 "
+				    "children are required.\n");
+				return (1);
+			}
+			if (min_children > VDEV_DRAID_MAX_CHILDREN) {
+				(void) fprintf(stderr, "A maximum of %d "
+				    "children are allowed.\n",
+				    VDEV_DRAID_MAX_CHILDREN);
+				return (1);
+			}
+			break;
+		case 'n':
+			max_children = (int)strtol(optarg, NULL, 0);
+			if (max_children < 2) {
+				(void) fprintf(stderr, "A minimum of 2 "
+				    "children are required.\n");
+				return (1);
+			}
+			if (max_children > VDEV_DRAID_MAX_CHILDREN) {
+				(void) fprintf(stderr, "A maximum of %d "
+				    "children are allowed.\n",
+				    VDEV_DRAID_MAX_CHILDREN);
+				return (1);
+			}
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	/*
+	 * Dump maps for the requested child counts. Both bounds were
+	 * validated as positive above, so the casts cannot wrap. An
+	 * inverted range (min > max) dumps nothing and still returns 0.
+	 */
+	for (uint64_t children = (uint64_t)min_children;
+	    children <= (uint64_t)max_children; children++) {
+		char key[8] = { 0 };
+
+		snprintf(key, sizeof (key), "%llu", (u_longlong_t)children);
+		error = dump_map_key(filename, key, verbose);
+		if (error) {
+			printf("Error dump_map_key(): %s\n", strerror(error));
+			return (1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Print all of the mappings as a C formatted draid_map_t array. This table
+ * is found in the module/zcommon/zfs_draid.c file and is the definitive
+ * source for all mapping used by dRAID. It cannot be updated without
+ * changing the dRAID on disk format.
+ */
+static int
+draid_table(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int error;
+
+	/* No getopt() pass here; FILE is taken from argv[optind]. */
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	printf("static const draid_map_t "
+	    "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
+
+	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
+	    children <= VDEV_DRAID_MAX_CHILDREN;
+	    children++) {
+		uint64_t seed, checksum, nv_children, nperms, avg_ratio;
+		nvlist_t *cfg;
+		char key[8];
+
+		bzero(key, 8);
+		snprintf(key, 8, "%llu", (u_longlong_t)children);
+
+		error = read_map_key(filename, key, &cfg);
+		if (error != 0) {
+			printf("Error read_map_key() failed: %s\n",
+			    strerror(error));
+			return (1);
+		}
+
+		seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
+		checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
+		/*
+		 * Keep the stored child count in its own variable.
+		 * Assigning it to the loop counter would let the file
+		 * contents steer the iteration (repeat, skip, or never
+		 * reach the end).
+		 */
+		nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
+		nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
+		avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
+
+		/* The stored average ratio is an integer scaled by 1000. */
+		printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
+		    "/* %2.03f */\n", (u_longlong_t)nv_children,
+		    (u_longlong_t)nperms, (u_longlong_t)seed,
+		    (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
+
+		nvlist_free(cfg);
+	}
+
+	printf("};\n");
+
+	return (0);
+}
+
+/*
+ * Merge every map stored in srcfilename into allcfgs. A source map is
+ * added when allcfgs has no entry for its key, and replaces the existing
+ * entry when its worst ratio is strictly lower. Pairs that are not
+ * nvlists are ignored. On success 0 is returned and the number of maps
+ * merged is stored in *mergedp; on failure the error code is returned
+ * and *mergedp is left untouched.
+ */
+static int
+draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
+{
+	nvlist_t *srccfgs;
+	nvpair_t *elem = NULL;
+	int error, merged = 0;
+
+	error = read_map(srcfilename, &srccfgs);
+	if (error != 0)
+		return (error);
+
+	while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
+		uint64_t nv_worst_ratio;
+		uint64_t allcfg_worst_ratio;
+		nvlist_t *cfg, *allcfg;
+		char *key;
+
+		if (nvpair_type(elem) != DATA_TYPE_NVLIST)
+			continue;
+
+		(void) nvpair_value_nvlist(elem, &cfg);
+		key = nvpair_name(elem);
+
+		nv_worst_ratio = fnvlist_lookup_uint64(cfg,
+		    MAP_WORST_RATIO);
+
+		error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
+		if (error == 0) {
+			allcfg_worst_ratio = fnvlist_lookup_uint64(
+			    allcfg, MAP_WORST_RATIO);
+
+			/* Only a strictly better map replaces one. */
+			if (nv_worst_ratio >= allcfg_worst_ratio)
+				continue;
+
+			fnvlist_remove(allcfgs, key);
+		} else if (error != ENOENT) {
+			break;
+		}
+
+		/* Count the map only once it was actually stored. */
+		error = nvlist_add_nvlist(allcfgs, key, cfg);
+		if (error != 0)
+			break;
+
+		merged++;
+	}
+
+	/* Release the source list on every path, including errors. */
+	nvlist_free(srccfgs);
+
+	if (error != 0)
+		return (error);
+
+	*mergedp = merged;
+
+	return (0);
+}
+
+/*
+ * Merge the best map for each child count found in the listed files into
+ * a new file. This allows 'draid generate' to be run in parallel and for
+ * the results maps to be combined.
+ */
+static int
+draid_merge(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int c, error, total_merged = 0, verbose = 0;
+	nvlist_t *allcfgs;
+
+	while ((c = getopt(argc, argv, ":v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	/*
+	 * The historical argc check counts options as well, so also make
+	 * sure FILE and at least one SRC remain after option parsing;
+	 * otherwise argv[optind] below could be the terminating NULL.
+	 */
+	if (argc < 4 || argc - optind < 2) {
+		(void) fprintf(stderr,
+		    "A FILE and multiple SRCs must be specified.\n");
+		return (1);
+	}
+
+	bzero(filename, MAXPATHLEN);
+	strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	optind++;
+
+	/* A missing destination file simply starts from an empty set. */
+	error = read_map(filename, &allcfgs);
+	if (error == ENOENT) {
+		allcfgs = fnvlist_alloc();
+	} else if (error != 0) {
+		printf("Error read_map(): %s\n", strerror(error));
+		return (error);
+	}
+
+	while (optind < argc) {
+		char srcfilename[MAXPATHLEN];
+		int merged = 0;
+
+		bzero(srcfilename, MAXPATHLEN);
+		strncpy(srcfilename, argv[optind], MAXPATHLEN - 1);
+
+		error = draid_merge_impl(allcfgs, srcfilename, &merged);
+		if (error) {
+			printf("Error draid_merge_impl(): %s\n",
+			    strerror(error));
+			nvlist_free(allcfgs);
+			return (1);
+		}
+
+		total_merged += merged;
+		printf("Merged %d key(s) from '%s' into '%s'\n", merged,
+		    srcfilename, filename);
+
+		optind++;
+	}
+
+	/*
+	 * Only rewrite the file when something changed.
+	 * NOTE(review): the result of write_map() is not checked here;
+	 * if it reports failures they are currently silent - confirm.
+	 */
+	if (total_merged > 0)
+		write_map(filename, allcfgs);
+
+	printf("Merged a total of %d key(s) into '%s'\n", total_merged,
+	    filename);
+
+	nvlist_free(allcfgs);
+
+	return (0);
+}
+
+/*
+ * Dispatch to the handler for the subcommand named by argv[1]. Each
+ * handler receives the argument vector shifted by one so that the
+ * subcommand name takes the place of the program name.
+ */
+int
+main(int argc, char *argv[])
+{
+	/* The explicit returns keep this safe should draid_usage() return. */
+	if (argc < 2) {
+		draid_usage();
+		return (1);
+	}
+
+	char *subcommand = argv[1];
+
+	if (strcmp(subcommand, "generate") == 0) {
+		return (draid_generate(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "verify") == 0) {
+		return (draid_verify(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "dump") == 0) {
+		return (draid_dump(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "table") == 0) {
+		return (draid_table(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "merge") == 0) {
+		return (draid_merge(argc - 1, argv + 1));
+	} else {
+		draid_usage();
+	}
+
+	return (1);
+}
diff --git a/tests/zfs-tests/cmd/file_check/file_check.c b/tests/zfs-tests/cmd/file_check/file_check.c index 5df0ea735b..3d3db753f3 100644 --- a/tests/zfs-tests/cmd/file_check/file_check.c +++ b/tests/zfs-tests/cmd/file_check/file_check.c @@ -40,7 +40,6 @@ main(int argc, char **argv) long i, n; unsigned char fillchar = DATA; int bigbuffersize = BIGBUFFERSIZE; - int64_t read_count = 0; /* * Validate arguments @@ -78,8 +77,6 @@ main(int argc, char **argv) exit(1); } } - - read_count += n; } while (n == bigbuffersize); return (0); diff --git a/tests/zfs-tests/cmd/file_write/file_write.c b/tests/zfs-tests/cmd/file_write/file_write.c index 81fc5de397..60893c34fb 100644 --- a/tests/zfs-tests/cmd/file_write/file_write.c +++ b/tests/zfs-tests/cmd/file_write/file_write.c @@ -34,10 +34,6 @@ #include #include -typedef unsigned char uchar_t; -typedef long long longlong_t; -typedef longlong_t offset_t; - static unsigned char bigbuffer[BIGBUFFERSIZE]; /* @@ -48,9 +44,9 @@ static unsigned char bigbuffer[BIGBUFFERSIZE]; static void usage(char *); /* - * psudo-randomize the buffer + * pseudo-randomize the buffer */ -void randomize_buffer(int block_size) { +static void randomize_buffer(int
block_size) { int i; char rnd = rand() & 0xff; for (i = 0; i < block_size; i++)
diff --git a/tests/zfs-tests/cmd/get_diff/.gitignore b/tests/zfs-tests/cmd/get_diff/.gitignore
new file mode 100644
index 0000000000..f5fc360a68
--- /dev/null
+++ b/tests/zfs-tests/cmd/get_diff/.gitignore
@@ -0,0 +1 @@
+/get_diff
diff --git a/tests/zfs-tests/cmd/get_diff/Makefile.am b/tests/zfs-tests/cmd/get_diff/Makefile.am
new file mode 100644
index 0000000000..06c39ddd81
--- /dev/null
+++ b/tests/zfs-tests/cmd/get_diff/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = get_diff
+get_diff_SOURCES = get_diff.c
diff --git a/tests/zfs-tests/cmd/get_diff/get_diff.c b/tests/zfs-tests/cmd/get_diff/get_diff.c
new file mode 100644
index 0000000000..2799f46b07
--- /dev/null
+++ b/tests/zfs-tests/cmd/get_diff/get_diff.c
@@ -0,0 +1,137 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Print the usage synopsis and the given message to stderr, then exit
+ * with exit_value.
+ */
+static void
+usage(char *msg, int exit_value)
+{
+	(void) fprintf(stderr, "get_diff file redacted_file\n");
+	(void) fprintf(stderr, "%s\n", msg);
+	exit(exit_value);
+}
+
+/*
+ * This utility compares two files, an original and its redacted counterpart
+ * (in that order). It compares the files 512 bytes at a time, printing out
+ * any ranges (as offset and length) where the redacted file does not match
+ * the original. This output is used to verify that the expected ranges of
+ * a redacted file do not contain the original data.
+ *
+ * A block is also reported as different when reading it from the redacted
+ * file fails with EIO or returns fewer bytes than the original provided.
+ * Exits with status 1 if either file cannot be opened or read.
+ */
+int
+main(int argc, char *argv[])
+{
+	off_t diff_off = 0, diff_len = 0, off = 0;
+	int fd1, fd2;
+	char *fname1, *fname2;
+	char buf1[DEV_BSIZE], buf2[DEV_BSIZE];
+	ssize_t bytes;
+
+	if (argc != 3)
+		usage("Incorrect number of arguments.", 1);
+
+	if ((fname1 = argv[1]) == NULL)
+		usage("Filename missing.", 1);
+	if ((fd1 = open(fname1, O_LARGEFILE | O_RDONLY)) < 0) {
+		perror("open1 failed");
+		exit(1);
+	}
+
+	if ((fname2 = argv[2]) == NULL)
+		usage("Redacted filename missing.", 1);
+	if ((fd2 = open(fname2, O_LARGEFILE | O_RDONLY)) < 0) {
+		perror("open2 failed");
+		exit(1);
+	}
+
+	while ((bytes = pread(fd1, buf1, DEV_BSIZE, off)) > 0) {
+		ssize_t bytes2 = pread(fd2, buf2, DEV_BSIZE, off);
+		int differs;
+
+		if (bytes2 < 0) {
+			/*
+			 * A read in a redacted section of a file will
+			 * fail with EIO. If we get EIO, continue on
+			 * and treat the block as redacted; any other
+			 * error is fatal.
+			 */
+			if (errno != EIO) {
+				perror("pread failed");
+				exit(1);
+			}
+			differs = 1;
+		} else if (bytes2 < bytes) {
+			/*
+			 * The redacted file ended early, so the tail of
+			 * buf2 is stale. Do not compare it; the block
+			 * cannot match the original.
+			 */
+			differs = 1;
+		} else {
+			differs = (memcmp(buf1, buf2, bytes) != 0);
+		}
+
+		if (!differs) {
+			/* A matching block ends any open range. */
+			if (diff_len != 0) {
+				(void) fprintf(stdout, "%lld,%lld\n",
+				    (long long)diff_off, (long long)diff_len);
+				assert(off == diff_off + diff_len);
+				diff_len = 0;
+			}
+			diff_off = 0;
+		} else {
+			/* Start a new range or extend the current one. */
+			if (diff_len == 0)
+				diff_off = off;
+			assert(off == diff_off + diff_len);
+			diff_len += bytes;
+		}
+		off += bytes;
+	}
+
+	if (bytes < 0) {
+		perror("pread failed");
+		exit(1);
+	}
+
+	/* Flush a range that runs to the end of the original file. */
+	if (diff_len != 0) {
+		(void) fprintf(stdout, "%lld,%lld\n", (long long)diff_off,
+		    (long long)diff_len);
+	}
+
+	(void) close(fd1);
+	(void) close(fd2);
+
+	return (0);
+}
diff --git a/tests/zfs-tests/cmd/largest_file/largest_file.c b/tests/zfs-tests/cmd/largest_file/largest_file.c index d1eceaf568..00e1019cc8 100644 --- a/tests/zfs-tests/cmd/largest_file/largest_file.c +++ b/tests/zfs-tests/cmd/largest_file/largest_file.c @@ -33,12 +33,9 @@ #include #include #include -#include +#include #include -typedef long long offset_t; -#define MAXOFFSET_T LLONG_MAX - /* * -------------------------------------------------------------- * diff --git a/tests/zfs-tests/cmd/libzfs_input_check/Makefile.am b/tests/zfs-tests/cmd/libzfs_input_check/Makefile.am index b62a6bb0f5..cd46220895 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check/Makefile.am +++ b/tests/zfs-tests/cmd/libzfs_input_check/Makefile.am @@ -2,14 +2,16 @@ include $(top_srcdir)/config/Rules.am pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - pkgexec_PROGRAMS = libzfs_input_check +if BUILD_FREEBSD +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs +endif +if BUILD_LINUX +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/linux/zfs +endif + libzfs_input_check_SOURCES = libzfs_input_check.c libzfs_input_check_LDADD = \ - $(top_builddir)/lib/libspl/libspl.la \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs_core/libzfs_core.la +
$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index 977b9e2f3d..0e552c2680 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -22,9 +22,12 @@ #include #include #include +#include #include +#include #include +#include /* * Test the nvpair inputs for the non-legacy zfs ioctl commands. @@ -99,10 +102,12 @@ static unsigned ioc_skip[] = { ZFS_IOC_SPACE_WRITTEN, ZFS_IOC_POOL_REGUID, ZFS_IOC_SEND_PROGRESS, - ZFS_IOC_EVENTS_NEXT, ZFS_IOC_EVENTS_CLEAR, ZFS_IOC_EVENTS_SEEK, + ZFS_IOC_NEXTBOOT, + ZFS_IOC_JAIL, + ZFS_IOC_UNJAIL, }; @@ -154,7 +159,7 @@ lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected) zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(zc.zc_nvlist_dst_size); - if (ioctl(zfs_fd, ioc, &zc) != 0) + if (lzc_ioctl_fd(zfs_fd, ioc, &zc) != 0) error = errno; if (error != expected) { @@ -272,13 +277,13 @@ test_pool_sync(const char *pool) static void test_pool_reopen(const char *pool) { - nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); - fnvlist_add_boolean_value(required, "scrub_restart", B_FALSE); + fnvlist_add_boolean_value(optional, "scrub_restart", B_FALSE); - IOC_INPUT_TEST(ZFS_IOC_POOL_REOPEN, pool, required, NULL, 0); + IOC_INPUT_TEST(ZFS_IOC_POOL_REOPEN, pool, NULL, optional, 0); - nvlist_free(required); + nvlist_free(optional); } static void @@ -505,6 +510,7 @@ test_send_new(const char *snapshot, int fd) fnvlist_add_string(optional, "fromsnap", from); fnvlist_add_uint64(optional, "resume_object", resumeobj); fnvlist_add_uint64(optional, "resume_offset", offset); + fnvlist_add_boolean(optional, "savedok"); #endif IOC_INPUT_TEST(ZFS_IOC_SEND_NEW, snapshot, required, optional, 0); @@ 
-552,7 +558,8 @@ test_recv_new(const char *dataset, int fd) fnvlist_add_boolean(optional, "resumable"); fnvlist_add_uint64(optional, "action_handle", *action_handle); #endif - IOC_INPUT_TEST(ZFS_IOC_RECV_NEW, dataset, required, optional, EBADE); + IOC_INPUT_TEST(ZFS_IOC_RECV_NEW, dataset, required, optional, + ZFS_ERR_STREAM_TRUNCATED); nvlist_free(props); nvlist_free(optional); @@ -685,11 +692,86 @@ zfs_destroy(const char *dataset) (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); zc.zc_name[sizeof (zc.zc_name) - 1] = '\0'; - err = ioctl(zfs_fd, ZFS_IOC_DESTROY, &zc); + err = lzc_ioctl_fd(zfs_fd, ZFS_IOC_DESTROY, &zc); return (err == 0 ? 0 : errno); } +static void +test_redact(const char *snapshot1, const char *snapshot2) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *snapnv = fnvlist_alloc(); + char bookmark[MAXNAMELEN + 32]; + + fnvlist_add_string(required, "bookname", "testbookmark"); + fnvlist_add_boolean(snapnv, snapshot2); + fnvlist_add_nvlist(required, "snapnv", snapnv); + + IOC_INPUT_TEST(ZFS_IOC_REDACT, snapshot1, required, NULL, 0); + + nvlist_free(snapnv); + nvlist_free(required); + + strlcpy(bookmark, snapshot1, sizeof (bookmark)); + *strchr(bookmark, '@') = '\0'; + strlcat(bookmark, "#testbookmark", sizeof (bookmark) - + strlen(bookmark)); + zfs_destroy(bookmark); +} + +static void +test_get_bookmark_props(const char *bookmark) +{ + IOC_INPUT_TEST(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, NULL, NULL, 0); +} + +static void +test_wait(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + + fnvlist_add_int32(required, "wait_activity", 2); + fnvlist_add_uint64(optional, "wait_tag", 0xdeadbeefdeadbeef); + + IOC_INPUT_TEST(ZFS_IOC_WAIT, pool, required, optional, EINVAL); + + nvlist_free(required); + nvlist_free(optional); +} + +static void +test_wait_fs(const char *dataset) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_int32(required, "wait_activity", 2); + + 
IOC_INPUT_TEST(ZFS_IOC_WAIT_FS, dataset, required, NULL, EINVAL); + + nvlist_free(required); +} + +static void +test_get_bootenv(const char *pool) +{ + IOC_INPUT_TEST(ZFS_IOC_GET_BOOTENV, pool, NULL, NULL, 0); +} + +static void +test_set_bootenv(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_uint64(required, "version", VB_RAW); + fnvlist_add_string(required, GRUB_ENVMAP, "test"); + + IOC_INPUT_TEST_WILD(ZFS_IOC_SET_BOOTENV, pool, required, NULL, 0); + + nvlist_free(required); +} + static void zfs_ioc_input_tests(const char *pool) { @@ -700,6 +782,7 @@ zfs_ioc_input_tests(const char *pool) char bookmark[ZFS_MAX_DATASET_NAME_LEN + 32]; char backup[ZFS_MAX_DATASET_NAME_LEN]; char clone[ZFS_MAX_DATASET_NAME_LEN]; + char clonesnap[ZFS_MAX_DATASET_NAME_LEN + 32]; int tmpfd, err; /* @@ -710,9 +793,10 @@ zfs_ioc_input_tests(const char *pool) (void) snprintf(snapshot, sizeof (snapshot), "%s@snapshot", dataset); (void) snprintf(bookmark, sizeof (bookmark), "%s#bookmark", dataset); (void) snprintf(clone, sizeof (clone), "%s/test-fs-clone", pool); + (void) snprintf(clonesnap, sizeof (clonesnap), "%s@snap", clone); (void) snprintf(backup, sizeof (backup), "%s/backup", pool); - err = lzc_create(dataset, DMU_OST_ZFS, NULL, NULL, 0); + err = lzc_create(dataset, LZC_DATSET_TYPE_ZFS, NULL, NULL, -1); if (err) { (void) fprintf(stderr, "could not create '%s': %s\n", dataset, strerror(errno)); @@ -747,6 +831,7 @@ zfs_ioc_input_tests(const char *pool) test_bookmark(pool, snapshot, bookmark); test_get_bookmarks(dataset); + test_get_bookmark_props(bookmark); test_destroy_bookmarks(pool, bookmark); test_hold(pool, snapshot); @@ -754,6 +839,9 @@ zfs_ioc_input_tests(const char *pool) test_release(pool, snapshot); test_clone(snapshot, clone); + test_snapshot(pool, clonesnap); + test_redact(snapshot, clonesnap); + zfs_destroy(clonesnap); zfs_destroy(clone); test_rollback(dataset, snapshot); @@ -770,6 +858,12 @@ zfs_ioc_input_tests(const char *pool) 
test_vdev_initialize(pool); test_vdev_trim(pool); + test_wait(pool); + test_wait_fs(dataset); + + test_set_bootenv(pool); + test_get_bootenv(pool); + /* * cleanup */ @@ -806,7 +900,7 @@ zfs_ioc_input_tests(const char *pool) if (ioc_tested[cmd]) continue; - if (ioctl(zfs_fd, ioc, &zc) != 0 && + if (lzc_ioctl_fd(zfs_fd, ioc, &zc) != 0 && errno != ZFS_ERR_IOC_CMD_UNAVAIL) { (void) fprintf(stderr, "cmd %d is missing a test case " "(%d)\n", cmd, errno); @@ -815,103 +909,127 @@ zfs_ioc_input_tests(const char *pool) } enum zfs_ioc_ref { +#ifdef __FreeBSD__ + ZFS_IOC_BASE = 0, +#else ZFS_IOC_BASE = ('Z' << 8), - LINUX_IOC_BASE = ('Z' << 8) + 0x80, - FREEBSD_IOC_BASE = ('Z' << 8) + 0xC0, +#endif + ZFS_IOC_PLATFORM_BASE = ZFS_IOC_BASE + 0x80, }; /* * Canonical reference check of /dev/zfs ioctl numbers. * These cannot change and new ioctl numbers must be appended. */ -boolean_t +static boolean_t validate_ioc_values(void) { - return ( - ZFS_IOC_BASE + 0 == ZFS_IOC_POOL_CREATE && - ZFS_IOC_BASE + 1 == ZFS_IOC_POOL_DESTROY && - ZFS_IOC_BASE + 2 == ZFS_IOC_POOL_IMPORT && - ZFS_IOC_BASE + 3 == ZFS_IOC_POOL_EXPORT && - ZFS_IOC_BASE + 4 == ZFS_IOC_POOL_CONFIGS && - ZFS_IOC_BASE + 5 == ZFS_IOC_POOL_STATS && - ZFS_IOC_BASE + 6 == ZFS_IOC_POOL_TRYIMPORT && - ZFS_IOC_BASE + 7 == ZFS_IOC_POOL_SCAN && - ZFS_IOC_BASE + 8 == ZFS_IOC_POOL_FREEZE && - ZFS_IOC_BASE + 9 == ZFS_IOC_POOL_UPGRADE && - ZFS_IOC_BASE + 10 == ZFS_IOC_POOL_GET_HISTORY && - ZFS_IOC_BASE + 11 == ZFS_IOC_VDEV_ADD && - ZFS_IOC_BASE + 12 == ZFS_IOC_VDEV_REMOVE && - ZFS_IOC_BASE + 13 == ZFS_IOC_VDEV_SET_STATE && - ZFS_IOC_BASE + 14 == ZFS_IOC_VDEV_ATTACH && - ZFS_IOC_BASE + 15 == ZFS_IOC_VDEV_DETACH && - ZFS_IOC_BASE + 16 == ZFS_IOC_VDEV_SETPATH && - ZFS_IOC_BASE + 17 == ZFS_IOC_VDEV_SETFRU && - ZFS_IOC_BASE + 18 == ZFS_IOC_OBJSET_STATS && - ZFS_IOC_BASE + 19 == ZFS_IOC_OBJSET_ZPLPROPS && - ZFS_IOC_BASE + 20 == ZFS_IOC_DATASET_LIST_NEXT && - ZFS_IOC_BASE + 21 == ZFS_IOC_SNAPSHOT_LIST_NEXT && - ZFS_IOC_BASE + 22 == 
ZFS_IOC_SET_PROP && - ZFS_IOC_BASE + 23 == ZFS_IOC_CREATE && - ZFS_IOC_BASE + 24 == ZFS_IOC_DESTROY && - ZFS_IOC_BASE + 25 == ZFS_IOC_ROLLBACK && - ZFS_IOC_BASE + 26 == ZFS_IOC_RENAME && - ZFS_IOC_BASE + 27 == ZFS_IOC_RECV && - ZFS_IOC_BASE + 28 == ZFS_IOC_SEND && - ZFS_IOC_BASE + 29 == ZFS_IOC_INJECT_FAULT && - ZFS_IOC_BASE + 30 == ZFS_IOC_CLEAR_FAULT && - ZFS_IOC_BASE + 31 == ZFS_IOC_INJECT_LIST_NEXT && - ZFS_IOC_BASE + 32 == ZFS_IOC_ERROR_LOG && - ZFS_IOC_BASE + 33 == ZFS_IOC_CLEAR && - ZFS_IOC_BASE + 34 == ZFS_IOC_PROMOTE && - ZFS_IOC_BASE + 35 == ZFS_IOC_SNAPSHOT && - ZFS_IOC_BASE + 36 == ZFS_IOC_DSOBJ_TO_DSNAME && - ZFS_IOC_BASE + 37 == ZFS_IOC_OBJ_TO_PATH && - ZFS_IOC_BASE + 38 == ZFS_IOC_POOL_SET_PROPS && - ZFS_IOC_BASE + 39 == ZFS_IOC_POOL_GET_PROPS && - ZFS_IOC_BASE + 40 == ZFS_IOC_SET_FSACL && - ZFS_IOC_BASE + 41 == ZFS_IOC_GET_FSACL && - ZFS_IOC_BASE + 42 == ZFS_IOC_SHARE && - ZFS_IOC_BASE + 43 == ZFS_IOC_INHERIT_PROP && - ZFS_IOC_BASE + 44 == ZFS_IOC_SMB_ACL && - ZFS_IOC_BASE + 45 == ZFS_IOC_USERSPACE_ONE && - ZFS_IOC_BASE + 46 == ZFS_IOC_USERSPACE_MANY && - ZFS_IOC_BASE + 47 == ZFS_IOC_USERSPACE_UPGRADE && - ZFS_IOC_BASE + 48 == ZFS_IOC_HOLD && - ZFS_IOC_BASE + 49 == ZFS_IOC_RELEASE && - ZFS_IOC_BASE + 50 == ZFS_IOC_GET_HOLDS && - ZFS_IOC_BASE + 51 == ZFS_IOC_OBJSET_RECVD_PROPS && - ZFS_IOC_BASE + 52 == ZFS_IOC_VDEV_SPLIT && - ZFS_IOC_BASE + 53 == ZFS_IOC_NEXT_OBJ && - ZFS_IOC_BASE + 54 == ZFS_IOC_DIFF && - ZFS_IOC_BASE + 55 == ZFS_IOC_TMP_SNAPSHOT && - ZFS_IOC_BASE + 56 == ZFS_IOC_OBJ_TO_STATS && - ZFS_IOC_BASE + 57 == ZFS_IOC_SPACE_WRITTEN && - ZFS_IOC_BASE + 58 == ZFS_IOC_SPACE_SNAPS && - ZFS_IOC_BASE + 59 == ZFS_IOC_DESTROY_SNAPS && - ZFS_IOC_BASE + 60 == ZFS_IOC_POOL_REGUID && - ZFS_IOC_BASE + 61 == ZFS_IOC_POOL_REOPEN && - ZFS_IOC_BASE + 62 == ZFS_IOC_SEND_PROGRESS && - ZFS_IOC_BASE + 63 == ZFS_IOC_LOG_HISTORY && - ZFS_IOC_BASE + 64 == ZFS_IOC_SEND_NEW && - ZFS_IOC_BASE + 65 == ZFS_IOC_SEND_SPACE && - ZFS_IOC_BASE + 66 == ZFS_IOC_CLONE && - 
ZFS_IOC_BASE + 67 == ZFS_IOC_BOOKMARK && - ZFS_IOC_BASE + 68 == ZFS_IOC_GET_BOOKMARKS && - ZFS_IOC_BASE + 69 == ZFS_IOC_DESTROY_BOOKMARKS && - ZFS_IOC_BASE + 70 == ZFS_IOC_RECV_NEW && - ZFS_IOC_BASE + 71 == ZFS_IOC_POOL_SYNC && - ZFS_IOC_BASE + 72 == ZFS_IOC_CHANNEL_PROGRAM && - ZFS_IOC_BASE + 73 == ZFS_IOC_LOAD_KEY && - ZFS_IOC_BASE + 74 == ZFS_IOC_UNLOAD_KEY && - ZFS_IOC_BASE + 75 == ZFS_IOC_CHANGE_KEY && - ZFS_IOC_BASE + 76 == ZFS_IOC_REMAP && - ZFS_IOC_BASE + 77 == ZFS_IOC_POOL_CHECKPOINT && - ZFS_IOC_BASE + 78 == ZFS_IOC_POOL_DISCARD_CHECKPOINT && - ZFS_IOC_BASE + 79 == ZFS_IOC_POOL_INITIALIZE && - ZFS_IOC_BASE + 80 == ZFS_IOC_POOL_TRIM && - LINUX_IOC_BASE + 1 == ZFS_IOC_EVENTS_NEXT && - LINUX_IOC_BASE + 2 == ZFS_IOC_EVENTS_CLEAR && - LINUX_IOC_BASE + 3 == ZFS_IOC_EVENTS_SEEK); + boolean_t result = B_TRUE; + +#define CHECK(expr) do { \ + if (!(expr)) { \ + result = B_FALSE; \ + fprintf(stderr, "(%s) === FALSE\n", #expr); \ + } \ +} while (0) + + CHECK(ZFS_IOC_BASE + 0 == ZFS_IOC_POOL_CREATE); + CHECK(ZFS_IOC_BASE + 1 == ZFS_IOC_POOL_DESTROY); + CHECK(ZFS_IOC_BASE + 2 == ZFS_IOC_POOL_IMPORT); + CHECK(ZFS_IOC_BASE + 3 == ZFS_IOC_POOL_EXPORT); + CHECK(ZFS_IOC_BASE + 4 == ZFS_IOC_POOL_CONFIGS); + CHECK(ZFS_IOC_BASE + 5 == ZFS_IOC_POOL_STATS); + CHECK(ZFS_IOC_BASE + 6 == ZFS_IOC_POOL_TRYIMPORT); + CHECK(ZFS_IOC_BASE + 7 == ZFS_IOC_POOL_SCAN); + CHECK(ZFS_IOC_BASE + 8 == ZFS_IOC_POOL_FREEZE); + CHECK(ZFS_IOC_BASE + 9 == ZFS_IOC_POOL_UPGRADE); + CHECK(ZFS_IOC_BASE + 10 == ZFS_IOC_POOL_GET_HISTORY); + CHECK(ZFS_IOC_BASE + 11 == ZFS_IOC_VDEV_ADD); + CHECK(ZFS_IOC_BASE + 12 == ZFS_IOC_VDEV_REMOVE); + CHECK(ZFS_IOC_BASE + 13 == ZFS_IOC_VDEV_SET_STATE); + CHECK(ZFS_IOC_BASE + 14 == ZFS_IOC_VDEV_ATTACH); + CHECK(ZFS_IOC_BASE + 15 == ZFS_IOC_VDEV_DETACH); + CHECK(ZFS_IOC_BASE + 16 == ZFS_IOC_VDEV_SETPATH); + CHECK(ZFS_IOC_BASE + 17 == ZFS_IOC_VDEV_SETFRU); + CHECK(ZFS_IOC_BASE + 18 == ZFS_IOC_OBJSET_STATS); + CHECK(ZFS_IOC_BASE + 19 == ZFS_IOC_OBJSET_ZPLPROPS); + 
CHECK(ZFS_IOC_BASE + 20 == ZFS_IOC_DATASET_LIST_NEXT); + CHECK(ZFS_IOC_BASE + 21 == ZFS_IOC_SNAPSHOT_LIST_NEXT); + CHECK(ZFS_IOC_BASE + 22 == ZFS_IOC_SET_PROP); + CHECK(ZFS_IOC_BASE + 23 == ZFS_IOC_CREATE); + CHECK(ZFS_IOC_BASE + 24 == ZFS_IOC_DESTROY); + CHECK(ZFS_IOC_BASE + 25 == ZFS_IOC_ROLLBACK); + CHECK(ZFS_IOC_BASE + 26 == ZFS_IOC_RENAME); + CHECK(ZFS_IOC_BASE + 27 == ZFS_IOC_RECV); + CHECK(ZFS_IOC_BASE + 28 == ZFS_IOC_SEND); + CHECK(ZFS_IOC_BASE + 29 == ZFS_IOC_INJECT_FAULT); + CHECK(ZFS_IOC_BASE + 30 == ZFS_IOC_CLEAR_FAULT); + CHECK(ZFS_IOC_BASE + 31 == ZFS_IOC_INJECT_LIST_NEXT); + CHECK(ZFS_IOC_BASE + 32 == ZFS_IOC_ERROR_LOG); + CHECK(ZFS_IOC_BASE + 33 == ZFS_IOC_CLEAR); + CHECK(ZFS_IOC_BASE + 34 == ZFS_IOC_PROMOTE); + CHECK(ZFS_IOC_BASE + 35 == ZFS_IOC_SNAPSHOT); + CHECK(ZFS_IOC_BASE + 36 == ZFS_IOC_DSOBJ_TO_DSNAME); + CHECK(ZFS_IOC_BASE + 37 == ZFS_IOC_OBJ_TO_PATH); + CHECK(ZFS_IOC_BASE + 38 == ZFS_IOC_POOL_SET_PROPS); + CHECK(ZFS_IOC_BASE + 39 == ZFS_IOC_POOL_GET_PROPS); + CHECK(ZFS_IOC_BASE + 40 == ZFS_IOC_SET_FSACL); + CHECK(ZFS_IOC_BASE + 41 == ZFS_IOC_GET_FSACL); + CHECK(ZFS_IOC_BASE + 42 == ZFS_IOC_SHARE); + CHECK(ZFS_IOC_BASE + 43 == ZFS_IOC_INHERIT_PROP); + CHECK(ZFS_IOC_BASE + 44 == ZFS_IOC_SMB_ACL); + CHECK(ZFS_IOC_BASE + 45 == ZFS_IOC_USERSPACE_ONE); + CHECK(ZFS_IOC_BASE + 46 == ZFS_IOC_USERSPACE_MANY); + CHECK(ZFS_IOC_BASE + 47 == ZFS_IOC_USERSPACE_UPGRADE); + CHECK(ZFS_IOC_BASE + 48 == ZFS_IOC_HOLD); + CHECK(ZFS_IOC_BASE + 49 == ZFS_IOC_RELEASE); + CHECK(ZFS_IOC_BASE + 50 == ZFS_IOC_GET_HOLDS); + CHECK(ZFS_IOC_BASE + 51 == ZFS_IOC_OBJSET_RECVD_PROPS); + CHECK(ZFS_IOC_BASE + 52 == ZFS_IOC_VDEV_SPLIT); + CHECK(ZFS_IOC_BASE + 53 == ZFS_IOC_NEXT_OBJ); + CHECK(ZFS_IOC_BASE + 54 == ZFS_IOC_DIFF); + CHECK(ZFS_IOC_BASE + 55 == ZFS_IOC_TMP_SNAPSHOT); + CHECK(ZFS_IOC_BASE + 56 == ZFS_IOC_OBJ_TO_STATS); + CHECK(ZFS_IOC_BASE + 57 == ZFS_IOC_SPACE_WRITTEN); + CHECK(ZFS_IOC_BASE + 58 == ZFS_IOC_SPACE_SNAPS); + CHECK(ZFS_IOC_BASE + 59 == 
ZFS_IOC_DESTROY_SNAPS); + CHECK(ZFS_IOC_BASE + 60 == ZFS_IOC_POOL_REGUID); + CHECK(ZFS_IOC_BASE + 61 == ZFS_IOC_POOL_REOPEN); + CHECK(ZFS_IOC_BASE + 62 == ZFS_IOC_SEND_PROGRESS); + CHECK(ZFS_IOC_BASE + 63 == ZFS_IOC_LOG_HISTORY); + CHECK(ZFS_IOC_BASE + 64 == ZFS_IOC_SEND_NEW); + CHECK(ZFS_IOC_BASE + 65 == ZFS_IOC_SEND_SPACE); + CHECK(ZFS_IOC_BASE + 66 == ZFS_IOC_CLONE); + CHECK(ZFS_IOC_BASE + 67 == ZFS_IOC_BOOKMARK); + CHECK(ZFS_IOC_BASE + 68 == ZFS_IOC_GET_BOOKMARKS); + CHECK(ZFS_IOC_BASE + 69 == ZFS_IOC_DESTROY_BOOKMARKS); + CHECK(ZFS_IOC_BASE + 70 == ZFS_IOC_RECV_NEW); + CHECK(ZFS_IOC_BASE + 71 == ZFS_IOC_POOL_SYNC); + CHECK(ZFS_IOC_BASE + 72 == ZFS_IOC_CHANNEL_PROGRAM); + CHECK(ZFS_IOC_BASE + 73 == ZFS_IOC_LOAD_KEY); + CHECK(ZFS_IOC_BASE + 74 == ZFS_IOC_UNLOAD_KEY); + CHECK(ZFS_IOC_BASE + 75 == ZFS_IOC_CHANGE_KEY); + CHECK(ZFS_IOC_BASE + 76 == ZFS_IOC_REMAP); + CHECK(ZFS_IOC_BASE + 77 == ZFS_IOC_POOL_CHECKPOINT); + CHECK(ZFS_IOC_BASE + 78 == ZFS_IOC_POOL_DISCARD_CHECKPOINT); + CHECK(ZFS_IOC_BASE + 79 == ZFS_IOC_POOL_INITIALIZE); + CHECK(ZFS_IOC_BASE + 80 == ZFS_IOC_POOL_TRIM); + CHECK(ZFS_IOC_BASE + 81 == ZFS_IOC_REDACT); + CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS); + CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT); + CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS); + CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT); + CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR); + CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK); + CHECK(ZFS_IOC_PLATFORM_BASE + 4 == ZFS_IOC_NEXTBOOT); + CHECK(ZFS_IOC_PLATFORM_BASE + 5 == ZFS_IOC_JAIL); + CHECK(ZFS_IOC_PLATFORM_BASE + 6 == ZFS_IOC_UNJAIL); + CHECK(ZFS_IOC_PLATFORM_BASE + 7 == ZFS_IOC_SET_BOOTENV); + CHECK(ZFS_IOC_PLATFORM_BASE + 8 == ZFS_IOC_GET_BOOTENV); + +#undef CHECK + + return (result); } int diff --git a/tests/zfs-tests/cmd/mkbusy/mkbusy.c b/tests/zfs-tests/cmd/mkbusy/mkbusy.c index 9634904f0d..e1cbd95cd1 100644 --- a/tests/zfs-tests/cmd/mkbusy/mkbusy.c +++ 
b/tests/zfs-tests/cmd/mkbusy/mkbusy.c @@ -30,20 +30,19 @@ #include #include -typedef enum boolean { B_FALSE, B_TRUE } boolean_t; -static void +static __attribute__((noreturn)) void usage(char *progname) { (void) fprintf(stderr, "Usage: %s \n", progname); exit(1); } -static void -fail(char *err, int rval) +static __attribute__((noreturn)) void +fail(char *err) { perror(err); - exit(rval); + exit(1); } static void @@ -52,7 +51,7 @@ daemonize(void) pid_t pid; if ((pid = fork()) < 0) { - fail("fork", 1); + fail("fork"); } else if (pid != 0) { (void) fprintf(stdout, "%ld\n", (long)pid); exit(0); @@ -64,27 +63,32 @@ daemonize(void) (void) close(2); } + +static const char * +get_basename(const char *path) +{ + const char *bn = strrchr(path, '/'); + return (bn ? bn + 1 : path); +} + +static ssize_t +get_dirnamelen(const char *path) +{ + const char *end = strrchr(path, '/'); + return (end ? end - path : -1); +} + int main(int argc, char *argv[]) { - int ret, c; + int c; boolean_t isdir = B_FALSE; - boolean_t fflag = B_FALSE; - boolean_t rflag = B_FALSE; struct stat sbuf; char *fpath = NULL; char *prog = argv[0]; - while ((c = getopt(argc, argv, "fr")) != -1) { + while ((c = getopt(argc, argv, "")) != -1) { switch (c) { - /* Open the file or directory read only */ - case 'r': - rflag = B_TRUE; - break; - /* Run in the foreground */ - case 'f': - fflag = B_TRUE; - break; default: usage(prog); } @@ -96,84 +100,68 @@ main(int argc, char *argv[]) if (argc != 1) usage(prog); - if ((ret = stat(argv[0], &sbuf)) != 0) { - char *arg, *dname, *fname; - int arglen; - char *slash; - int rc; + if (stat(argv[0], &sbuf) != 0) { + char *arg; + const char *dname, *fname; + size_t arglen; + ssize_t dnamelen; /* * The argument supplied doesn't exist. Copy the path, and - * remove the trailing slash if presnt. + * remove the trailing slash if present. 
*/ if ((arg = strdup(argv[0])) == NULL) - fail("strdup", 1); + fail("strdup"); arglen = strlen(arg); if (arg[arglen - 1] == '/') arg[arglen - 1] = '\0'; - /* - * Get the directory and file names, using the current directory - * if the provided path doesn't specify a directory at all. - */ - if ((slash = strrchr(arg, '/')) == NULL) { - dname = strdup("."); - fname = strdup(arg); - } else { - *slash = '\0'; - dname = strdup(arg); - fname = strdup(slash + 1); - } - free(arg); - if (dname == NULL || fname == NULL) - fail("strdup", 1); + /* Get the directory and file names. */ + fname = get_basename(arg); + dname = arg; + if ((dnamelen = get_dirnamelen(arg)) != -1) + arg[dnamelen] = '\0'; + else + dname = "."; /* The directory portion of the path must exist */ - if ((ret = stat(dname, &sbuf)) != 0 || !(sbuf.st_mode & - S_IFDIR)) + if (stat(dname, &sbuf) != 0 || !(sbuf.st_mode & S_IFDIR)) usage(prog); - rc = asprintf(&fpath, "%s/%s", dname, fname); - free(dname); - free(fname); - if (rc == -1 || fpath == NULL) - fail("asprintf", 1); + if (asprintf(&fpath, "%s/%s", dname, fname) == -1) + fail("asprintf"); - } else if ((sbuf.st_mode & S_IFMT) == S_IFREG || - (sbuf.st_mode & S_IFMT) == S_IFLNK || - (sbuf.st_mode & S_IFMT) == S_IFCHR || - (sbuf.st_mode & S_IFMT) == S_IFBLK) { - fpath = strdup(argv[0]); - } else if ((sbuf.st_mode & S_IFMT) == S_IFDIR) { - fpath = strdup(argv[0]); - isdir = B_TRUE; - } else { - usage(prog); - } + free(arg); + } else + switch (sbuf.st_mode & S_IFMT) { + case S_IFDIR: + isdir = B_TRUE; + fallthrough; + case S_IFLNK: + case S_IFCHR: + case S_IFBLK: + if ((fpath = strdup(argv[0])) == NULL) + fail("strdup"); + break; + default: + usage(prog); + } - if (fpath == NULL) - fail("strdup", 1); + if (!isdir) { + int fd; - if (isdir == B_FALSE) { - int fd, flags; - mode_t mode = S_IRUSR | S_IWUSR; - - flags = rflag == B_FALSE ? 
O_CREAT | O_RDWR : O_RDONLY; - - if ((fd = open(fpath, flags, mode)) < 0) - fail("open", 1); + if ((fd = open(fpath, O_CREAT | O_RDWR, 0600)) < 0) + fail("open"); } else { DIR *dp; if ((dp = opendir(fpath)) == NULL) - fail("opendir", 1); + fail("opendir"); } free(fpath); - if (fflag == B_FALSE) - daemonize(); + daemonize(); (void) pause(); - /* NOTREACHED */ return (0); } diff --git a/tests/zfs-tests/cmd/mkfile/Makefile.am b/tests/zfs-tests/cmd/mkfile/Makefile.am index 016c671281..5f0e2e03ef 100644 --- a/tests/zfs-tests/cmd/mkfile/Makefile.am +++ b/tests/zfs-tests/cmd/mkfile/Makefile.am @@ -4,3 +4,5 @@ pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin pkgexec_PROGRAMS = mkfile mkfile_SOURCES = mkfile.c + +mkfile_LDADD = $(LTLIBINTL) diff --git a/tests/zfs-tests/cmd/mkfile/mkfile.c b/tests/zfs-tests/cmd/mkfile/mkfile.c index 7ebf7bbcf8..673cbf9e00 100644 --- a/tests/zfs-tests/cmd/mkfile/mkfile.c +++ b/tests/zfs-tests/cmd/mkfile/mkfile.c @@ -34,19 +34,17 @@ #include #include #include +#include +#include -#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) - -#define BLOCK_SIZE 512 /* bytes */ +#define BLOCKSIZE 512 /* bytes */ #define KILOBYTE 1024 #define MEGABYTE (KILOBYTE * KILOBYTE) #define GIGABYTE (KILOBYTE * MEGABYTE) #define FILE_MODE (S_ISVTX + S_IRUSR + S_IWUSR) -typedef long long offset_t; - -static void usage(void); +static void usage(void) __attribute__((noreturn)); int main(int argc, char **argv) @@ -95,7 +93,7 @@ main(int argc, char **argv) break; case 'b': case 'B': - mult = BLOCK_SIZE; + mult = BLOCKSIZE; break; case 'm': case 'M': @@ -141,8 +139,17 @@ main(int argc, char **argv) argv++; argc--; continue; - } - if (lseek(fd, (off_t)size-1, SEEK_SET) < 0) { + } else if (fchown(fd, getuid(), getgid()) < 0) { + saverr = errno; + (void) fprintf(stderr, gettext( + "Could not set owner/group of %s: %s\n"), + argv[1], strerror(saverr)); + (void) close(fd); + errors++; + argv++; + argc--; + continue; + } else if (lseek(fd, (off_t)size-1, SEEK_SET) < 0) { saverr = errno; (void) fprintf(stderr, gettext( "Could not seek to offset %ld in %s: %s\n"), @@ -271,5 +278,4 @@ static void usage() (void) fprintf(stderr, gettext( "Usage: mkfile [-nv] [g|k|b|m] [] ...\n")); exit(1); - /* NOTREACHED */ } diff --git a/tests/zfs-tests/cmd/mkfiles/mkfiles.c b/tests/zfs-tests/cmd/mkfiles/mkfiles.c index 62dee16279..32abfd0c3d 100644 --- a/tests/zfs-tests/cmd/mkfiles/mkfiles.c +++ b/tests/zfs-tests/cmd/mkfiles/mkfiles.c @@ -55,6 +55,10 @@ main(int argc, char **argv) (void) fprintf(stderr, "Failed to create %s %s\n", buf, strerror(errno)); return (-4); + } else if (fchown(fd, getuid(), getgid()) < 0) { + (void) fprintf(stderr, "Failed to chown %s %s\n", buf, + strerror(errno)); + return (-5); } (void) close(fd); } diff --git a/tests/zfs-tests/cmd/mktree/mktree.c b/tests/zfs-tests/cmd/mktree/mktree.c index 02d4974d78..25b26c9e15 100644 --- a/tests/zfs-tests/cmd/mktree/mktree.c +++ b/tests/zfs-tests/cmd/mktree/mktree.c @@ -30,7 +30,9 @@ #include #include #include +#ifdef __linux__ #include +#endif #include #include 
#include @@ -176,11 +178,13 @@ crtfile(char *pname) exit(errno); } +#ifdef __linux__ if (fsetxattr(fd, "user.xattr", pbuf, 1024, 0) < 0) { (void) fprintf(stderr, "fsetxattr(fd, \"xattr\", pbuf, " "1024, 0) failed.\n[%d]: %s.\n", errno, strerror(errno)); exit(errno); } +#endif (void) close(fd); free(pbuf); diff --git a/tests/zfs-tests/cmd/mmap_libaio/Makefile.am b/tests/zfs-tests/cmd/mmap_libaio/Makefile.am index 67d0f0eced..25f9dda2b6 100644 --- a/tests/zfs-tests/cmd/mmap_libaio/Makefile.am +++ b/tests/zfs-tests/cmd/mmap_libaio/Makefile.am @@ -5,5 +5,6 @@ pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin if WANT_MMAP_LIBAIO pkgexec_PROGRAMS = mmap_libaio mmap_libaio_SOURCES = mmap_libaio.c -mmap_libaio_LDADD = $(LIBAIO) +mmap_libaio_CFLAGS = $(AM_CFLAGS) $(LIBAIO_CFLAGS) +mmap_libaio_LDADD = $(LIBAIO_LIBS) endif diff --git a/tests/zfs-tests/cmd/mmap_seek/.gitignore b/tests/zfs-tests/cmd/mmap_seek/.gitignore new file mode 100644 index 0000000000..6b05a79175 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_seek/.gitignore @@ -0,0 +1 @@ +/mmap_seek diff --git a/tests/zfs-tests/cmd/mmap_seek/Makefile.am b/tests/zfs-tests/cmd/mmap_seek/Makefile.am new file mode 100644 index 0000000000..b938931125 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_seek/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = mmap_seek +mmap_seek_SOURCES = mmap_seek.c diff --git a/tests/zfs-tests/cmd/mmap_seek/mmap_seek.c b/tests/zfs-tests/cmd/mmap_seek/mmap_seek.c new file mode 100644 index 0000000000..f476e1dba9 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_seek/mmap_seek.c @@ -0,0 +1,147 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include + +static void +seek_data(int fd, off_t offset, off_t expected) +{ + off_t data_offset = lseek(fd, offset, SEEK_DATA); + if (data_offset != expected) { + fprintf(stderr, "lseek(fd, %d, SEEK_DATA) = %d (expected %d)\n", + (int)offset, (int)data_offset, (int)expected); + exit(2); + } +} + +static void +seek_hole(int fd, off_t offset, off_t expected) +{ + off_t hole_offset = lseek(fd, offset, SEEK_HOLE); + if (hole_offset != expected) { + fprintf(stderr, "lseek(fd, %d, SEEK_HOLE) = %d (expected %d)\n", + (int)offset, (int)hole_offset, (int)expected); + exit(2); + } +} + +int +main(int argc, char **argv) +{ + char *execname = argv[0]; + char *file_path = argv[1]; + char *buf = NULL; + int err; + + if (argc != 4) { + (void) printf("usage: %s " + "\n", argv[0]); + exit(1); + } + + int fd = open(file_path, O_RDWR | O_CREAT, 0666); + if (fd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, file_path); + perror("open"); + exit(2); + } + + off_t file_size = atoi(argv[2]); + off_t block_size = atoi(argv[3]); + + if (block_size * 2 > file_size) { + (void) fprintf(stderr, "file size must be at least " + "double the block size\n"); + exit(2); + } + + err = ftruncate(fd, file_size); + if (err == -1) { + 
perror("ftruncate"); + exit(2); + } + + if ((buf = mmap(NULL, file_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0)) == MAP_FAILED) { + perror("mmap"); + exit(2); + } + + /* Verify the file is sparse and reports no data. */ + seek_data(fd, 0, -1); + + /* Verify the file is reported as a hole. */ + seek_hole(fd, 0, 0); + + /* Verify search beyond end of file is an error. */ + seek_data(fd, 2 * file_size, -1); + seek_hole(fd, 2 * file_size, -1); + + /* Dirty the first byte. */ + memset(buf, 'a', 1); + seek_data(fd, 0, 0); + seek_data(fd, block_size, -1); + seek_hole(fd, 0, block_size); + seek_hole(fd, block_size, block_size); + + /* Dirty the first half of the file. */ + memset(buf, 'b', file_size / 2); + seek_data(fd, 0, 0); + seek_data(fd, block_size, block_size); + seek_hole(fd, 0, P2ROUNDUP(file_size / 2, block_size)); + seek_hole(fd, block_size, P2ROUNDUP(file_size / 2, block_size)); + + /* Dirty the whole file. */ + memset(buf, 'c', file_size); + seek_data(fd, 0, 0); + seek_data(fd, file_size * 3 / 4, + P2ROUNDUP(file_size * 3 / 4, block_size)); + seek_hole(fd, 0, file_size); + seek_hole(fd, file_size / 2, file_size); + + /* Punch a hole (required compression be enabled). */ + memset(buf + block_size, 0, block_size); + seek_data(fd, 0, 0); + seek_data(fd, block_size, 2 * block_size); + seek_hole(fd, 0, block_size); + seek_hole(fd, block_size, block_size); + seek_hole(fd, 2 * block_size, file_size); + + err = munmap(buf, file_size); + if (err == -1) { + perror("munmap"); + exit(2); + } + + close(fd); + + return (0); +} diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c index b9915d5d31..1f344534d5 100644 --- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c +++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c @@ -42,8 +42,8 @@ * 2. In the same process, context #2, mmap page fault (which means the mm_sem * is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous * txg "n" completed. - * 3. 
context #1 call uiomove to write, however page fault is occurred in - * uiomove, which means it need mm_sem, but mm_sem is hold by + * 3. context #1 call zfs_uiomove to write, however page fault is occurred in + * zfs_uiomove, which means it needs mm_sem, but mm_sem is hold by * context #2, so it stuck and can't complete, then txg "n" will not * complete. * @@ -66,19 +66,15 @@ normal_writer(void *filename) err(1, "failed to open %s", file_path); } - char *buf = malloc(1); + char buf; while (1) { - write_num = write(fd, buf, 1); + write_num = write(fd, &buf, 1); if (write_num == 0) { err(1, "write failed!"); break; } lseek(fd, page_size, SEEK_CUR); } - - if (buf) { - free(buf); - } } static void * @@ -140,7 +136,7 @@ main(int argc, char **argv) int i = 0; if (argc != 3) { - (void) printf("usage: %s " + (void) printf("usage: %s " "\n", argv[0]); exit(1); } @@ -156,7 +152,6 @@ main(int argc, char **argv) err(1, "pthread_create map_writer failed."); } - /* NOTREACHED */ pthread_join(map_write_tid, NULL); return (0); } diff --git a/tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am b/tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am index f509a97e38..511b6c6913 100644 --- a/tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am +++ b/tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am @@ -2,13 +2,9 @@ include $(top_srcdir)/config/Rules.am pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - pkgexec_PROGRAMS = nvlist_to_lua nvlist_to_lua_SOURCES = nvlist_to_lua.c nvlist_to_lua_LDADD = \ - $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs_core/libzfs_core.la + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c index 7986851efa..e262ecefea 100644 --- 
a/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c +++ b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c @@ -47,7 +47,6 @@ #include #include -static const int TRUE = 1; static char *filebase; static int @@ -65,7 +64,7 @@ mover(void *a) len = strlen(filebase) + 5; - while (TRUE) { + for (;;) { idx = pickidx(); (void) snprintf(buf, len, "%s.%03d", filebase, idx); ret = rename(filebase, buf); @@ -85,7 +84,7 @@ cleaner(void *a) len = strlen(filebase) + 5; - while (TRUE) { + for (;;) { idx = pickidx(); (void) snprintf(buf, len, "%s.%03d", filebase, idx); ret = remove(buf); @@ -102,7 +101,7 @@ writer(void *a) int *fd = (int *)a; int ret; - while (TRUE) { + for (;;) { if (*fd != -1) (void) close (*fd); @@ -143,7 +142,7 @@ main(int argc, char **argv) (void) pthread_create(&tid, NULL, cleaner, NULL); (void) pthread_create(&tid, NULL, writer, (void *) &fd); - while (TRUE) { + for (;;) { int ret; struct stat st; diff --git a/tests/zfs-tests/cmd/send_doall/.gitignore b/tests/zfs-tests/cmd/send_doall/.gitignore new file mode 100644 index 0000000000..6ba2e603f7 --- /dev/null +++ b/tests/zfs-tests/cmd/send_doall/.gitignore @@ -0,0 +1 @@ +/send_doall diff --git a/tests/zfs-tests/cmd/send_doall/Makefile.am b/tests/zfs-tests/cmd/send_doall/Makefile.am new file mode 100644 index 0000000000..33a6b83122 --- /dev/null +++ b/tests/zfs-tests/cmd/send_doall/Makefile.am @@ -0,0 +1,11 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = send_doall + +send_doall_SOURCES = send_doall.c +send_doall_LDADD = \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/tests/zfs-tests/cmd/send_doall/send_doall.c b/tests/zfs-tests/cmd/send_doall/send_doall.c new file mode 100644 index 0000000000..6f47df0474 --- /dev/null +++ b/tests/zfs-tests/cmd/send_doall/send_doall.c @@ -0,0 +1,87 @@ +/* + * CDDL HEADER 
START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Portions Copyright 2020 iXsystems, Inc. + */ + +/* + * Test a corner case : a "doall" send without children datasets. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +static void +usage(const char *name) +{ + fprintf(stderr, "usage: %s snap\n", name); + exit(EX_USAGE); +} + +int +main(int argc, char const * const argv[]) +{ + sendflags_t flags = { 0 }; + libzfs_handle_t *zhdl; + zfs_handle_t *zhp; + const char *tofull, *fsname, *tosnap, *p; + int error; + + if (argc != 2) + usage(argv[0]); + + tofull = argv[1]; + + p = strchr(tofull, '@'); + if (p == NULL) + usage(argv[0]); + tosnap = p + 1; + + fsname = strndup(tofull, p - tofull); + + zhdl = libzfs_init(); + if (zhdl == NULL) + errx(EX_OSERR, "libzfs_init(): %s", libzfs_error_init(errno)); + + zhp = zfs_open(zhdl, fsname, ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + err(EX_OSERR, "zfs_open(\"%s\")", fsname); + + flags.doall = B_TRUE; + + error = zfs_send(zhp, NULL, tosnap, &flags, + STDOUT_FILENO, NULL, NULL, NULL); + + zfs_close(zhp); + + libzfs_fini(zhdl); + free((void *)fsname); + + return (error); +} diff --git a/tests/zfs-tests/cmd/stride_dd/.gitignore 
b/tests/zfs-tests/cmd/stride_dd/.gitignore new file mode 100644 index 0000000000..7c072ee0de --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/.gitignore @@ -0,0 +1 @@ +/stride_dd diff --git a/tests/zfs-tests/cmd/stride_dd/Makefile.am b/tests/zfs-tests/cmd/stride_dd/Makefile.am new file mode 100644 index 0000000000..d6f1adbac2 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/Makefile.am @@ -0,0 +1,7 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = stride_dd +stride_dd_SOURCES = stride_dd.c +stride_dd_LDADD = -lrt diff --git a/tests/zfs-tests/cmd/stride_dd/stride_dd.c b/tests/zfs-tests/cmd/stride_dd/stride_dd.c new file mode 100644 index 0000000000..88bd532923 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/stride_dd.c @@ -0,0 +1,214 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int bsize = 0; +static int count = 0; +static char *ifile = NULL; +static char *ofile = NULL; +static int stride = 0; +static int seek = 0; +static char *execname = "stride_dd"; + +static void usage(void); +static void parse_options(int argc, char *argv[]); + +static void +usage(void) +{ + (void) fprintf(stderr, + "usage: %s -i inputfile -o outputfile -b blocksize -c count \n" + " -s stride [ -k seekblocks]\n" + "\n" + "Simplified version of dd that supports the stride option.\n" + "A stride of n means that for each block written, n - 1 blocks\n" + "are skipped in both the input and output file. 
A stride of 1\n" + "means that blocks are read and written consecutively.\n" + "All numeric parameters must be integers.\n" + "\n" + " inputfile: File to read from\n" + " outputfile: File to write to\n" + " blocksize: Size of each block to read/write\n" + " count: Number of blocks to read/write\n" + " stride: Read/write a block then skip (stride - 1) blocks\n" + " seekblocks: Number of blocks to skip at start of output\n", + execname); + (void) exit(1); +} + +static void +parse_options(int argc, char *argv[]) +{ + int c; + int errflag = 0; + + execname = argv[0]; + + extern char *optarg; + extern int optind, optopt; + + while ((c = getopt(argc, argv, ":b:c:i:o:s:k:")) != -1) { + switch (c) { + case 'b': + bsize = atoi(optarg); + break; + + case 'c': + count = atoi(optarg); + break; + + case 'i': + ifile = optarg; + break; + + case 'o': + ofile = optarg; + break; + + case 's': + stride = atoi(optarg); + break; + + case 'k': + seek = atoi(optarg); + break; + + case ':': + (void) fprintf(stderr, + "Option -%c requires an operand\n", optopt); + errflag++; + break; + + case '?': + default: + (void) fprintf(stderr, + "Unrecognized option: -%c\n", optopt); + errflag++; + break; + } + + if (errflag) { + (void) usage(); + } + } + + if (bsize <= 0 || count <= 0 || stride <= 0 || ifile == NULL || + ofile == NULL || seek < 0) { + (void) fprintf(stderr, + "Required parameter(s) missing or invalid.\n"); + (void) usage(); + } +} + +int +main(int argc, char *argv[]) +{ + int i; + int ifd; + int ofd; + void *buf; + int c; + + parse_options(argc, argv); + + ifd = open(ifile, O_RDONLY); + if (ifd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ifile); + perror("open"); + exit(2); + } + + ofd = open(ofile, O_WRONLY | O_CREAT, 0666); + if (ofd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ofile); + perror("open"); + exit(2); + } + + /* + * We use valloc because some character block devices expect a + * page-aligned buffer. 
+ */ + int err = posix_memalign(&buf, 4096, bsize); + if (err != 0) { + (void) fprintf(stderr, + "%s: %s\n", execname, strerror(err)); + exit(2); + } + + if (seek > 0) { + if (lseek(ofd, seek * bsize, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } + } + + for (i = 0; i < count; i++) { + c = read(ifd, buf, bsize); + if (c != bsize) { + + perror("read"); + exit(2); + } + if (c != bsize) { + if (c < 0) { + perror("read"); + } else { + (void) fprintf(stderr, + "%s: unexpected short read, read %d " + "bytes, expected %d\n", execname, + c, bsize); + } + exit(2); + } + + c = write(ofd, buf, bsize); + if (c != bsize) { + if (c < 0) { + perror("write"); + } else { + (void) fprintf(stderr, + "%s: unexpected short write, wrote %d " + "bytes, expected %d\n", execname, + c, bsize); + } + exit(2); + } + + if (stride > 1) { + if (lseek(ifd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("input lseek"); + exit(2); + } + if (lseek(ofd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } + } + } + free(buf); + + (void) close(ofd); + (void) close(ifd); + + return (0); +} diff --git a/tests/zfs-tests/cmd/xattrtest/xattrtest.c b/tests/zfs-tests/cmd/xattrtest/xattrtest.c index 42c510ed08..0b68126c03 100644 --- a/tests/zfs-tests/cmd/xattrtest/xattrtest.c +++ b/tests/zfs-tests/cmd/xattrtest/xattrtest.c @@ -44,11 +44,9 @@ #include #include -extern char *program_invocation_short_name; - #define ERROR(fmt, ...) 
\ - fprintf(stderr, "%s: %s:%d: %s: " fmt "\n", \ - program_invocation_short_name, __FILE__, __LINE__, \ + fprintf(stderr, "xattrtest: %s:%d: %s: " fmt "\n", \ + __FILE__, __LINE__, \ __func__, ## __VA_ARGS__); static const char shortopts[] = "hvycdn:f:x:s:p:t:e:rRko:"; @@ -264,7 +262,7 @@ run_process(const char *path, char *argv[]) pid_t pid; int rc, devnull_fd; - pid = vfork(); + pid = fork(); if (pid == 0) { devnull_fd = open("/dev/null", O_WRONLY); diff --git a/tests/zfs-tests/include/Makefile.am b/tests/zfs-tests/include/Makefile.am index 41e105287b..16cdf2c814 100644 --- a/tests/zfs-tests/include/Makefile.am +++ b/tests/zfs-tests/include/Makefile.am @@ -1,3 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/include dist_pkgdata_DATA = \ blkdev.shlib \ @@ -5,16 +7,8 @@ dist_pkgdata_DATA = \ libtest.shlib \ math.shlib \ properties.shlib \ + tunables.cfg \ zpool_script.shlib -EXTRA_DIST = default.cfg.in - nodist_pkgdata_DATA = default.cfg - -$(nodist_pkgdata_DATA): %: %.in - -$(SED) -e 's,@zfsexecdir\@,$(zfsexecdir),g' \ - -e 's,@sysconfdir\@,$(sysconfdir),g' \ - $< >'$@' - -distclean-local:: - -$(RM) default.cfg +SUBSTFILES += $(nodist_pkgdata_DATA) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 9cac7184f9..bf70952904 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -12,12 +12,13 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2019 by Delphix. All rights reserved. # Copyright 2016 Nexenta Systems, Inc. # Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. # Copyright (c) 2017 Lawrence Livermore National Security, LLC. # Copyright (c) 2017 Datto Inc. # Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
+# Copyright 2019 Richard Elling # # @@ -55,13 +56,53 @@ function scan_scsi_hosts # # Wait for newly created block devices to have their minors created. +# Additional arguments can be passed to udevadm trigger, with the expected +# arguments to typically be a block device pathname. This is useful when +# checking waiting on a specific device to settle rather than triggering +# all devices and waiting for them all to settle. +# +# The udevadm settle timeout can be 120 or 180 seconds by default for +# some distros. If a long delay is experienced, it could be due to some +# strangeness in a malfunctioning device that isn't related to the devices +# under test. To help debug this condition, a notice is given if settle takes +# too long. +# +# Note: there is no meaningful return code if udevadm fails. Consumers +# should not expect a return code (do not call as argument to log_must) # function block_device_wait { if is_linux; then - udevadm trigger + udevadm trigger $* 2>/dev/null + typeset start=$SECONDS udevadm settle + typeset elapsed=$((SECONDS - start)) + [[ $elapsed > 60 ]] && \ + log_note udevadm settle time too long: $elapsed + elif is_freebsd; then + if [[ ${#@} -eq 0 ]]; then + # Do something that has to go through the geom event + # queue to complete. + sysctl kern.geom.conftxt >/dev/null + return + fi fi + # Poll for the given paths to appear, but give up eventually. + typeset -i i + for (( i = 0; i < 5; ++i )); do + typeset missing=false + typeset dev + for dev in "${@}"; do + if ! [[ -e $dev ]]; then + missing=true + break + fi + done + if ! 
$missing; then + break + fi + sleep ${#@} + done } # @@ -69,13 +110,23 @@ function block_device_wait # function is_physical_device #device { - typeset device=${1#$DEV_DSKDIR} - device=${device#$DEV_RDSKDIR} + typeset device=${1#$DEV_DSKDIR/} + device=${device#$DEV_RDSKDIR/} if is_linux; then - [[ -b "$DEV_DSKDIR/$device" ]] && \ + is_disk_device "$DEV_DSKDIR/$device" && \ [[ -f /sys/module/loop/parameters/max_part ]] return $? + elif is_freebsd; then + is_disk_device "$DEV_DSKDIR/$device" && \ + echo $device | egrep -q \ + -e '^a?da[0-9]+$' \ + -e '^md[0-9]+$' \ + -e '^mfid[0-9]+$' \ + -e '^nda[0-9]+$' \ + -e '^nvd[0-9]+$' \ + -e '^vtbd[0-9]+$' + return $? else echo $device | egrep "^c[0-F]+([td][0-F]+)+$" > /dev/null 2>&1 return $? @@ -113,10 +164,17 @@ function is_loop_device #disk } # -# Check if the given device is a multipath device and if there is a sybolic +# Linux: +# Check if the given device is a multipath device and if there is a symbolic # link to a device mapper and to a disk # Currently no support for dm devices alone without multipath # +# FreeBSD: +# Check if the given device is a gmultipath device. +# +# Others: +# No multipath detection. +# function is_mpath_device #disk { typeset disk=$1 @@ -131,6 +189,25 @@ function is_mpath_device #disk else return $? fi + elif is_freebsd; then + is_disk_device $DEV_MPATHDIR/$disk + else + false + fi +} + +# +# Check if the given path is the appropriate sort of device special node. +# +function is_disk_device #path +{ + typeset path=$1 + + if is_freebsd; then + # FreeBSD doesn't have block devices, only character devices. + test -c $path + else + test -b $path fi } @@ -200,11 +277,11 @@ function get_device_dir #device { typeset device=$1 - if ! $(is_physical_device $device) ; then + if ! is_freebsd && ! 
is_physical_device $device; then if [[ $device != "/" ]]; then device=${device%/*} fi - if [[ -b "$DEV_DSKDIR/$device" ]]; then + if is_disk_device "$DEV_DSKDIR/$device"; then device="$DEV_DSKDIR" fi echo $device @@ -261,25 +338,25 @@ function on_off_disk # disk state{online,offline} host if [[ $state == "offline" ]] && ( is_mpath_device $disk ); then dm_name="$(readlink $DEV_DSKDIR/$disk \ | nawk -F / '{print $2}')" - slave="$(ls /sys/block/${dm_name}/slaves \ + dep="$(ls /sys/block/${dm_name}/slaves \ | nawk '{print $1}')" - while [[ -n $slave ]]; do + while [[ -n $dep ]]; do #check if disk is online - lsscsi | egrep $slave > /dev/null + lsscsi | egrep $dep > /dev/null if (($? == 0)); then - slave_dir="/sys/block/${dm_name}" - slave_dir+="/slaves/${slave}/device" - ss="${slave_dir}/state" - sd="${slave_dir}/delete" + dep_dir="/sys/block/${dm_name}" + dep_dir+="/slaves/${dep}/device" + ss="${dep_dir}/state" + sd="${dep_dir}/delete" log_must eval "echo 'offline' > ${ss}" log_must eval "echo '1' > ${sd}" - lsscsi | egrep $slave > /dev/null + lsscsi | egrep $dep > /dev/null if (($? == 0)); then log_fail "Offlining" \ "$disk failed" fi fi - slave="$(ls /sys/block/$dm_name/slaves \ + dep="$(ls /sys/block/$dm_name/slaves \ 2>/dev/null | nawk '{print $1}')" done elif [[ $state == "offline" ]] && ( is_real_device $disk ); then @@ -305,9 +382,9 @@ function on_off_disk # disk state{online,offline} host if is_mpath_device $disk; then dm_name="$(readlink $DEV_DSKDIR/$disk \ | nawk -F / '{print $2}')" - slave="$(ls /sys/block/$dm_name/slaves \ + dep="$(ls /sys/block/$dm_name/slaves \ | nawk '{print $1}')" - lsscsi | egrep $slave > /dev/null + lsscsi | egrep $dep > /dev/null if (($? 
!= 0)); then log_fail "Onlining $disk failed" fi @@ -441,9 +518,137 @@ function get_pool_devices #testpool #devdir typeset devdir=$2 typeset out="" - if is_linux; then + if is_linux || is_freebsd; then out=$(zpool status -P $testpool |grep ${devdir} | awk '{print $1}') out=$(echo $out | sed -e "s|${devdir}/||g" | tr '\n' ' ') fi echo $out } + +# +# Write to standard out giving the level, device name, offset and length +# of all blocks in an input file. The offset and length are in units of +# 512 byte blocks. In the case of mirrored vdevs, only the first +# device is listed, as the levels, blocks and offsets will be the same +# on other devices. Note that this function only works with mirrored +# or non-redundant pools, not raidz. +# +# The output of this function can be used to introduce corruption at +# varying levels of indirection. +# +function list_file_blocks # input_file +{ + typeset input_file=$1 + + [[ -f $input_file ]] || log_fail "Couldn't find $input_file" + + typeset ds="$(zfs list -H -o name $input_file)" + typeset pool="${ds%%/*}" + typeset objnum="$(get_objnum $input_file)" + + # + # Establish a mapping between vdev ids as shown in a DVA and the + # pathnames they correspond to in ${VDEV_MAP[][]}. + # + # The vdev bits in a DVA refer to the top level vdev id. + # ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev. + # + eval $(zdb -C $pool | awk ' + BEGIN { printf "typeset -a VDEV_MAP;" } + function subscript(s) { + # "[#]" is more convenient than the bare "#" + match(s, /\[[0-9]*\]/) + return substr(s, RSTART, RLENGTH) + } + id && !/^ / { + # left a top level vdev + id = 0 + } + id && $1 ~ /^path:$/ { + # found a vdev path; save it in the map + printf "VDEV_MAP%s%s=%s;", id, child, $2 + } + /^ children/ { + # entering a top level vdev + id = subscript($0) + child = "[0]" # default in case there is no nested vdev + printf "typeset -a VDEV_MAP%s;", id + } + /^ children/ { + # entering a nested vdev (e.g. 
child of a top level mirror) + child = subscript($0) + } + ') + + # + # The awk below parses the output of zdb, printing out the level + # of each block along with vdev id, offset and length. The last + # two are converted to decimal in the while loop. 4M is added to + # the offset to compensate for the first two labels and boot + # block. Lastly, the offset and length are printed in units of + # 512B blocks for ease of use with dd. + # + typeset level vdev path offset length + if awk -n '' 2>/dev/null; then + # gawk needs -n to decode hex + AWK='awk -n' + else + AWK='awk' + fi + log_must zpool sync -f + zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' + /^$/ { looking = 0 } + looking { + level = $2 + field = 3 + while (split($field, dva, ":") == 3) { + # top level vdev id + vdev = int(dva[1]) + # offset + 4M label/boot pad in 512B blocks + offset = (int("0x"dva[2]) + pad) / bs + # length in 512B blocks + len = int("0x"dva[3]) / bs + + print level, vdev, offset, len + + ++field + } + } + /^Indirect blocks:/ { looking = 1 } + ' | \ + while read level vdev offset length; do + for path in ${VDEV_MAP[$vdev][@]}; do + echo "$level $path $offset $length" + done + done 2>/dev/null +} + +function corrupt_blocks_at_level # input_file corrupt_level +{ + typeset input_file=$1 + typeset corrupt_level="L${2:-0}" + typeset level path offset length + + [[ -f $input_file ]] || log_fail "Couldn't find $input_file" + + if is_freebsd; then + # Temporarily allow corrupting an inuse device. + debugflags=$(sysctl -n kern.geom.debugflags) + sysctl kern.geom.debugflags=16 + fi + + list_file_blocks $input_file | \ + while read level path offset length; do + if [[ $level = $corrupt_level ]]; then + log_must dd if=/dev/urandom of=$path bs=512 \ + count=$length seek=$offset conv=notrunc + fi + done + + if is_freebsd; then + sysctl kern.geom.debugflags=$debugflags + fi + + # This is necessary for pools made of loop devices. 
+ sync +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 127a1477d4..4497a6248b 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -1,4 +1,5 @@ # +# Copyright (c) 2016, 2019 by Delphix. All rights reserved. # These variables are used by zfs-tests.sh to constrain which utilities # may be used by the suite. The suite will create a directory which is # the only element of $PATH and create symlinks from that dir to the @@ -7,18 +8,14 @@ # Please keep the contents of each variable sorted for ease of reading # and maintenance. # -export SYSTEM_FILES='arp +export SYSTEM_FILES_COMMON='arp awk - attr base64 basename bc - blkid - blockdev bunzip2 bzcat cat - chattr chgrp chmod chown @@ -36,27 +33,19 @@ export SYSTEM_FILES='arp du echo egrep - exportfs + env expr - fallocate false - fdisk file find fio - free getconf getent getfacl - getfattr grep - groupadd - groupdel - groupmod gunzip gzip head - hostid hostname id iostat @@ -64,29 +53,17 @@ export SYSTEM_FILES='arp ksh ln logname - losetup ls - lsattr - lsblk - lscpu - lsmod - lsscsi - md5sum mkdir mknod - mkswap mktemp - modprobe mount - mpstat mv net - nproc od openssl - parted + pamtester pax - perf pgrep ping pkill @@ -95,20 +72,18 @@ export SYSTEM_FILES='arp ps pwd python + python2 python3 quotaon readlink rm rmdir scp + script sed seq - setenforce setfacl - setfattr sh - sha256sum - shuf sleep sort ssh @@ -128,13 +103,10 @@ export SYSTEM_FILES='arp tr true truncate - udevadm umask umount uname - useradd - userdel - usermod + uniq uuidgen vmstat wait @@ -142,6 +114,66 @@ export SYSTEM_FILES='arp which xargs' +export SYSTEM_FILES_FREEBSD='chflags + compress + diskinfo + dumpon + fsck + getextattr + gpart + jail + jexec + jls + lsextattr + md5 + mdconfig + mkfifo + newfs + pw + rmextattr + setextattr + sha256 + showmount + swapctl + sysctl + uncompress' + +export SYSTEM_FILES_LINUX='attr + bash + blkid + blockdev + chattr + 
dmidecode + exportfs + fallocate + fdisk + free + getfattr + groupadd + groupdel + groupmod + hostid + losetup + lsattr + lsblk + lscpu + lsmod + lsscsi + md5sum + mkswap + modprobe + mpstat + nproc + parted + perf + setenforce + setfattr + sha256sum + udevadm + useradd + userdel + usermod' + export ZFS_FILES='zdb zfs zhack @@ -150,19 +182,25 @@ export ZFS_FILES='zdb ztest raidz_test arc_summary - arc_summary3 arcstat dbufstat + mount.zfs zed zgenhostid - zstreamdump' + zstream + zfs_ids_to_path + zpool_influxdb' -export ZFSTEST_FILES='chg_usr_exec +export ZFSTEST_FILES='badsend + btree_test + chg_usr_exec devname2devid dir_rd_update + draid file_check file_trunc file_write + get_diff largest_file libzfs_input_check mkbusy @@ -171,6 +209,7 @@ export ZFSTEST_FILES='chg_usr_exec mktree mmap_exec mmap_libaio + mmap_seek mmapwrite nvlist_to_lua randfree_file @@ -178,6 +217,8 @@ export ZFSTEST_FILES='chg_usr_exec readmmap rename_dir rm_lnkcnt_zero_file + send_doall threadsappend user_ns_exec - xattrtest' + xattrtest + stride_dd' diff --git a/tests/zfs-tests/include/default.cfg.in b/tests/zfs-tests/include/default.cfg.in index e1e2a7e91f..1a9cc5a2bb 100644 --- a/tests/zfs-tests/include/default.cfg.in +++ b/tests/zfs-tests/include/default.cfg.in @@ -1,3 +1,5 @@ +#!/bin/sh + # # CDDL HEADER START # @@ -30,12 +32,12 @@ # . $STF_SUITE/include/commands.cfg -. $STF_SUITE/include/libtest.shlib # ZFS Directories export ZEDLET_ETC_DIR=${ZEDLET_ETC_DIR:-@sysconfdir@/zfs/zed.d} export ZEDLET_LIBEXEC_DIR=${ZEDLET_LIBEXEC_DIR:-@zfsexecdir@/zed.d} export ZPOOL_SCRIPT_DIR=${ZPOOL_SCRIPT_DIR:-@sysconfdir@/zfs/zpool.d} +export ZPOOL_COMPAT_DIR=${ZPOOL_COMPAT_DIR:-@datadir@/zfs/compatibility.d} # Define run length constants export RT_LONG="3" @@ -143,17 +145,6 @@ export SPA_MINDEVSIZE=$((64 * 1024 * 1024)) # For iscsi target support export ISCSITGTFILE=/tmp/iscsitgt_file export ISCSITGT_FMRI=svc:/system/iscsitgt:default -if ! 
is_linux; then -export AUTO_SNAP=$(svcs -a | grep auto-snapshot | grep online | awk \ - '{print $3}') -fi - -# -# finally, if we're running in a local zone -# we take some additional actions -if ! is_global_zone; then - reexport_pool -fi export ZFS_VERSION=5 export ZFS_ALL_VERSIONS="1 2 3 4 5" @@ -164,7 +155,8 @@ done export MAX_PARTITIONS=8 -if is_linux; then +case $(uname -o) in +GNU/Linux) unpack_opts="--sparse -xf" pack_opts="--sparse -cf" verbose=" -v" @@ -173,6 +165,7 @@ if is_linux; then ZVOL_DEVDIR="/dev/zvol" ZVOL_RDEVDIR="/dev/zvol" + DEV_DSKDIR="/dev" DEV_RDSKDIR="/dev" DEV_MPATHDIR="/dev/mapper" @@ -182,9 +175,34 @@ if is_linux; then VDEVID_CONF="$ZEDLET_DIR/vdev_id.conf" VDEVID_CONF_ETC="/etc/zfs/vdev_id.conf" - NEWFS_DEFAULT_FS="ext2" -else + SLICE_PREFIX="" + ;; +FreeBSD) + unpack_opts="xv" + pack_opts="cf" + verbose="v" + unpack_preserve="xpf" + pack_preserve="cpf" + + ZVOL_DEVDIR="/dev/zvol" + ZVOL_RDEVDIR="/dev/zvol" + DEV_DSKDIR="/dev" + DEV_RDSKDIR="/dev" + DEV_MPATHDIR="/dev/multipath" + + NEWFS_DEFAULT_FS="ufs" + SLICE_PREFIX="p" + ;; +illumos) + export AUTO_SNAP=$(svcs -a | \ + awk '/auto-snapshot/ && /online/ { print $3 }') + # finally, if we're running in a local zone + # we take some additional actions + if [ "$(zonename 2>/dev/null)" != "global" ]; then + reexport_pool + fi + unpack_opts="xv" pack_opts="cf" verbose="v" @@ -197,7 +215,10 @@ else DEV_RDSKDIR="/dev/rdsk" NEWFS_DEFAULT_FS="ufs" -fi + SLICE_PREFIX="s" + ;; +esac export unpack_opts pack_opts verbose unpack_preserve pack_preserve \ - ZVOL_DEVDIR ZVOL_RDEVDIR NEWFS_DEFAULT_FS DEV_RDSKDIR DEV_MPATHDIR \ - ZEDLET_DIR ZED_LOG ZED_DEBUG_LOG VDEVID_CONF VDEVID_CONF_ETC + ZVOL_DEVDIR ZVOL_RDEVDIR DEV_DSKDIR DEV_RDSKDIR DEV_MPATHDIR \ + ZEDLET_DIR ZED_LOG ZED_DEBUG_LOG VDEVID_CONF VDEVID_CONF_ETC \ + NEWFS_DEFAULT_FS SLICE_PREFIX diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 57d0880cc9..ab0cd5270c 100644 --- 
a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -20,14 +20,14 @@ # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# Copyright (c) 2012, 2017 by Delphix. All rights reserved. -# Copyright (c) 2017 by Tim Chase. All rights reserved. -# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. -# Copyright (c) 2017 Lawrence Livermore National Security, LLC. -# Copyright (c) 2017 Datto Inc. -# Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +# Copyright (c) 2009, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2012, 2020, Delphix. All rights reserved. +# Copyright (c) 2017, Tim Chase. All rights reserved. +# Copyright (c) 2017, Nexenta Systems Inc. All rights reserved. +# Copyright (c) 2017, Lawrence Livermore National Security LLC. +# Copyright (c) 2017, Datto Inc. All rights reserved. +# Copyright (c) 2017, Open-E Inc. All rights reserved. +# Copyright (c) 2021, The FreeBSD Foundation. # Use is subject to license terms. # @@ -35,12 +35,14 @@ . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib +. ${STF_SUITE}/include/tunables.cfg + # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. 
# if [ -n "$STF_PATH" ]; then - PATH="$STF_PATH" + export PATH="$STF_PATH" fi # @@ -93,6 +95,46 @@ function is_linux fi } +# Determine if this is an illumos test system +# +# Return 0 if platform illumos, 1 if otherwise +function is_illumos +{ + if [[ $(uname -o) == "illumos" ]]; then + return 0 + else + return 1 + fi +} + +# Determine if this is a FreeBSD test system +# +# Return 0 if platform FreeBSD, 1 if otherwise + +function is_freebsd +{ + if [[ $(uname -o) == "FreeBSD" ]]; then + return 0 + else + return 1 + fi +} + +# Determine if this is a DilOS test system +# +# Return 0 if platform DilOS, 1 if otherwise + +function is_dilos +{ + typeset ID="" + [[ -f /etc/os-release ]] && . /etc/os-release + if [[ $ID == "dilos" ]]; then + return 0 + else + return 1 + fi +} + # Determine if this is a 32-bit system # # Return 0 if platform is 32-bit, 1 if otherwise @@ -145,17 +187,23 @@ function ismounted fi ;; ufs|nfs) - out=$(df -F $fstype $1 2>/dev/null) - ret=$? - (($ret != 0)) && return $ret + if is_freebsd; then + mount -pt $fstype | while read dev dir _t _flags; do + [[ "$1" == "$dev" || "$1" == "$dir" ]] && return 0 + done + else + out=$(df -F $fstype $1 2>/dev/null) + ret=$? 
+ (($ret != 0)) && return $ret - dir=${out%%\(*} - dir=${dir%% *} - name=${out##*\(} - name=${name%%\)*} - name=${name%% *} + dir=${out%%\(*} + dir=${dir%% *} + name=${out##*\(} + name=${name%%\)*} + name=${name%% *} - [[ "$1" == "$dir" || "$1" == "$name" ]] && return 0 + [[ "$1" == "$dir" || "$1" == "$name" ]] && return 0 + fi ;; ext*) out=$(df -t $fstype $1 2>/dev/null) @@ -405,7 +453,8 @@ function create_recv_clone log_must eval "zfs send $snap | zfs recv -u $recvfs" log_must mkfile 1m "$mountpoint/data" log_must zfs snapshot $incr - log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 > $sendfile" + log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 \ + iflag=fullblock > $sendfile" log_mustnot eval "zfs recv -su $recvfs < $sendfile" destroy_dataset "$sendfs" "-r" log_must rm -f "$sendfile" @@ -564,8 +613,8 @@ function default_cleanup_noexit then destroy_pool $pool fi - ALL_POOLS=$(get_all_pools) done + ALL_POOLS=$(get_all_pools) done zfs mount -a @@ -740,6 +789,18 @@ function bkmarkexists return $? } +# +# Return 0 if a hold exists; $? otherwise +# +# $1 - hold tag +# $2 - snapshot name +# +function holdexists +{ + zfs holds "$2" | awk '{ print $2 }' | grep "$1" > /dev/null 2>&1 + return $? +} + # # Set a property to a certain value on a dataset. # Sets a property of the dataset to the value as passed in. @@ -834,7 +895,9 @@ function zero_partitions # typeset diskname=$1 typeset i - if is_linux; then + if is_freebsd; then + gpart destroy -F $diskname + elif is_linux; then DSK=$DEV_DSKDIR/$diskname DSK=$(echo $DSK | sed -e "s|//|/|g") log_must parted $DSK -s -- mklabel gpt @@ -856,22 +919,26 @@ function zero_partitions # # Size should be specified with units as per # the `format` command requirements eg. 100mb 3gb # -# NOTE: This entire interface is problematic for the Linux parted utilty +# NOTE: This entire interface is problematic for the Linux parted utility # which requires the end of the partition to be specified. 
It would be # best to retire this interface and replace it with something more flexible. # At the moment a best effort is made. # -function set_partition # +# arguments: +function set_partition { typeset -i slicenum=$1 typeset start=$2 typeset size=$3 - typeset disk=$4 + typeset disk=${4#$DEV_DSKDIR/} + disk=${disk#$DEV_RDSKDIR/} - if is_linux; then + case "$(uname)" in + Linux) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi + disk=$DEV_DSKDIR/$disk typeset size_mb=${size%%[mMgG]} size_mb=${size_mb%%[mMgG][bB]} @@ -881,10 +948,10 @@ function set_partition # /dev/null + parted $disk -s -- print 1 >/dev/null typeset ret_val=$? if [[ $slicenum -eq 0 || $ret_val -ne 0 ]]; then - parted $DEV_DSKDIR/$disk -s -- mklabel gpt + parted $disk -s -- mklabel gpt if [[ $? -ne 0 ]]; then log_note "Failed to create GPT partition table on $disk" return 1 @@ -899,21 +966,51 @@ function set_partition # /dev/null - block_device_wait - else + blockdev --rereadpt $disk 2>/dev/null + block_device_wait $disk + ;; + FreeBSD) + if [[ -z $size || -z $disk ]]; then + log_fail "The size or disk name is unspecified." + fi + disk=$DEV_DSKDIR/$disk + + if [[ $slicenum -eq 0 ]] || ! gpart show $disk >/dev/null 2>&1; then + gpart destroy -F $disk >/dev/null 2>&1 + gpart create -s GPT $disk + if [[ $? -ne 0 ]]; then + log_note "Failed to create GPT partition table on $disk" + return 1 + fi + fi + + typeset index=$((slicenum + 1)) + + if [[ -n $start ]]; then + start="-b $start" + fi + gpart add -t freebsd-zfs $start -s $size -i $index $disk + if [[ $ret_val -ne 0 ]]; then + log_note "Failed to create partition $slicenum on $disk" + return 1 + fi + + block_device_wait $disk + ;; + *) if [[ -z $slicenum || -z $size || -z $disk ]]; then log_fail "The slice, size or disk name is unspecified." fi @@ -932,10 +1029,11 @@ function set_partition # > $format_file format -e -s -d $disk -f $format_file - fi + typeset ret_val=$? 
+ rm -f $format_file + ;; + esac - typeset ret_val=$? - rm -f $format_file if [[ $ret_val -ne 0 ]]; then log_note "Unable to format $disk slice $slicenum to $size" return 1 @@ -950,61 +1048,34 @@ function set_partition # /dev/null 2>&1 - if (( $? == 1 )); then - lsblk | egrep ${DISK}${SLICE_PREFIX}${j} > /dev/null - if (( $? == 1 )); then - log_note "Partitions for $DISK should be deleted" - else - log_fail "Partition for ${DISK}${SLICE_PREFIX}${j} not deleted" - fi - return 0 + typeset -i part + for disk in $DISKSARRAY; do + for (( part = 1; part < MAX_PARTITIONS; part++ )); do + typeset partition=${disk}${SLICE_PREFIX}${part} + parted $DEV_DSKDIR/$disk -s rm $part > /dev/null 2>&1 + if lsblk | grep -qF ${partition}; then + log_fail "Partition ${partition} not deleted" else - lsblk | egrep ${DISK}${SLICE_PREFIX}${j} > /dev/null - if (( $? == 0 )); then - log_fail "Partition for ${DISK}${SLICE_PREFIX}${j} not deleted" - fi + log_note "Partition ${partition} deleted" fi - ((j = j+1)) done - else - for disk in `echo $DISKSARRAY`; do - while ((j < MAX_PARTITIONS)); do - parted $DEV_DSKDIR/$disk -s rm $j > /dev/null 2>&1 - if (( $? == 1 )); then - lsblk | egrep ${disk}${SLICE_PREFIX}${j} > /dev/null - if (( $? == 1 )); then - log_note "Partitions for $disk should be deleted" - else - log_fail "Partition for ${disk}${SLICE_PREFIX}${j} not deleted" - fi - j=7 - else - lsblk | egrep ${disk}${SLICE_PREFIX}${j} > /dev/null - if (( $? == 0 )); then - log_fail "Partition for ${disk}${SLICE_PREFIX}${j} not deleted" - fi - fi - ((j = j+1)) - done - j=1 - done - fi + done + elif is_freebsd; then + for disk in $DISKSARRAY; do + if gpart destroy -F $disk; then + log_note "Partitions for ${disk} deleted" + else + log_fail "Partitions for ${disk} not deleted" + fi + done fi - return 0 } # @@ -1018,13 +1089,22 @@ function get_endslice # log_fail "The disk name or slice number is unspecified." 
fi - if is_linux; then + case "$(uname)" in + Linux) endcyl=$(parted -s $DEV_DSKDIR/$disk -- unit cyl print | \ grep "part${slice}" | \ awk '{print $3}' | \ sed 's,cyl,,') ((endcyl = (endcyl + 1))) - else + ;; + FreeBSD) + disk=${disk#/dev/zvol/} + disk=${disk%p*} + slice=$((slice + 1)) + endcyl=$(gpart show $disk | \ + awk -v slice=$slice '$3 == slice { print $1 + $2 }') + ;; + *) disk=${disk#/dev/dsk/} disk=${disk#/dev/rdsk/} disk=${disk%s*} @@ -1042,7 +1122,8 @@ function get_endslice # nawk -v token="$slice" '{if ($1==token) print $6}') ((endcyl = (endcyl + 1) / ratio)) - fi + ;; + esac echo $endcyl } @@ -1091,11 +1172,11 @@ function partition_disk # # dirnum: the maximum number of subdirectories to use, -1 no limit # filenum: the maximum number of files per subdirectory # bytes: number of bytes to write -# num_writes: numer of types to write out bytes +# num_writes: number of types to write out bytes # data: the data that will be written # # E.g. -# file_fs /testdir 20 25 1024 256 0 +# fill_fs /testdir 20 25 1024 256 0 # # Note: bytes * num_writes equals the size of the testfile # @@ -1108,33 +1189,12 @@ function fill_fs # destdir dirnum filenum bytes num_writes data typeset -i num_writes=${5:-10240} typeset data=${6:-0} - typeset -i odirnum=1 - typeset -i idirnum=0 - typeset -i fn=0 - typeset -i retval=0 - - mkdir -p $destdir/$idirnum - while (($odirnum > 0)); do - if ((dirnum >= 0 && idirnum >= dirnum)); then - odirnum=0 - break - fi - file_write -o create -f $destdir/$idirnum/$TESTFILE.$fn \ - -b $bytes -c $num_writes -d $data - retval=$? - if (($retval != 0)); then - odirnum=0 - break - fi - if (($fn >= $filenum)); then - fn=0 - ((idirnum = idirnum + 1)) - mkdir -p $destdir/$idirnum - else - ((fn = fn + 1)) - fi + mkdir -p $destdir/{1..$dirnum} + for f in $destdir/{1..$dirnum}/$TESTFILE{1..$filenum}; do + file_write -o create -f $f -b $bytes -c $num_writes -d $data \ + || return $? 
done - return $retval + return 0 } # @@ -1244,20 +1304,18 @@ function datasetnonexists return 0 } -function is_shared_impl +function is_shared_freebsd +{ + typeset fs=$1 + + pgrep -q mountd && showmount -E | grep -qx $fs +} + +function is_shared_illumos { typeset fs=$1 typeset mtpt - if is_linux; then - for mtpt in `share | awk '{print $1}'` ; do - if [[ $mtpt == $fs ]] ; then - return 0 - fi - done - return 1 - fi - for mtpt in `share | awk '{print $2}'` ; do if [[ $mtpt == $fs ]] ; then return 0 @@ -1272,6 +1330,19 @@ function is_shared_impl return 1 } +function is_shared_linux +{ + typeset fs=$1 + typeset mtpt + + for mtpt in `share | awk '{print $1}'` ; do + if [[ $mtpt == $fs ]] ; then + return 0 + fi + done + return 1 +} + # # Given a mountpoint, or a dataset name, determine if it is shared via NFS. # @@ -1296,7 +1367,85 @@ function is_shared fi fi - is_shared_impl "$fs" + case $(uname) in + FreeBSD) is_shared_freebsd "$fs" ;; + Linux) is_shared_linux "$fs" ;; + *) is_shared_illumos "$fs" ;; + esac +} + +function is_exported_illumos +{ + typeset fs=$1 + typeset mtpt + + for mtpt in `awk '{print $1}' /etc/dfs/sharetab` ; do + if [[ $mtpt == $fs ]] ; then + return 0 + fi + done + + return 1 +} + +function is_exported_freebsd +{ + typeset fs=$1 + typeset mtpt + + for mtpt in `awk '{print $1}' /etc/zfs/exports` ; do + if [[ $mtpt == $fs ]] ; then + return 0 + fi + done + + return 1 +} + +function is_exported_linux +{ + typeset fs=$1 + typeset mtpt + + for mtpt in `awk '{print $1}' /etc/exports.d/zfs.exports` ; do + if [[ $mtpt == $fs ]] ; then + return 0 + fi + done + + return 1 +} + +# +# Given a mountpoint, or a dataset name, determine if it is exported via +# the os-specific NFS exports file. +# +# Returns 0 if exported, 1 otherwise. 
+# +function is_exported +{ + typeset fs=$1 + typeset mtpt + + if [[ $fs != "/"* ]] ; then + if datasetnonexists "$fs" ; then + return 1 + else + mtpt=$(get_prop mountpoint "$fs") + case $mtpt in + none|legacy|-) return 1 + ;; + *) fs=$mtpt + ;; + esac + fi + fi + + case $(uname) in + FreeBSD) is_exported_freebsd "$fs" ;; + Linux) is_exported_linux "$fs" ;; + *) is_exported_illumos "$fs" ;; + esac } # @@ -1323,7 +1472,7 @@ function is_shared_smb done return 1 else - log_unsupported "Currently unsupported by the test framework" + log_note "Currently unsupported by the test framework" return 1 fi } @@ -1371,7 +1520,7 @@ function unshare_fs #fs is_shared $fs || is_shared_smb $fs if (($? == 0)); then - log_must zfs unshare $fs + zfs unshare $fs || log_fail "zfs unshare $fs failed" fi return 0 @@ -1449,6 +1598,21 @@ function showshares_smb return 0 } +function check_nfs +{ + if is_linux; then + share -s + elif is_freebsd; then + showmount -e + else + log_unsupported "Unknown platform" + fi + + if [[ $? -ne 0 ]]; then + log_unsupported "The NFS utilities are not installed" + fi +} + # # Check NFS server status and trigger it online. # @@ -1468,6 +1632,11 @@ function setup_nfs_server # log_must share -r + log_note "NFS server must be started prior to running ZTS." + return + elif is_freebsd; then + kill -s HUP $(cat /var/run/mountd.pid) + log_note "NFS server must be started prior to running ZTS." 
return fi @@ -1517,7 +1686,7 @@ function setup_nfs_server # function is_global_zone { - if is_linux; then + if is_linux || is_freebsd; then return 0 else typeset cur_zone=$(zonename 2>/dev/null) @@ -1729,13 +1898,11 @@ function zfs_zones_setup #zone_name zone_root zone_ip block_device_wait # - # If current system support slog, add slog device for pool + # Add slog device for pool # - if verify_slog_support ; then - typeset sdevs="$TEST_BASE_DIR/sdev1 $TEST_BASE_DIR/sdev2" - log_must mkfile $MINVDEVSIZE $sdevs - log_must zpool add $pool_name log mirror $sdevs - fi + typeset sdevs="$TEST_BASE_DIR/sdev1 $TEST_BASE_DIR/sdev2" + log_must mkfile $MINVDEVSIZE $sdevs + log_must zpool add $pool_name log mirror $sdevs # this isn't supported just yet. # Create a filesystem. In order to add this to @@ -1955,7 +2122,12 @@ function verify_pool log_must zpool scrub $pool log_must wait_scrubbed $pool - cksum=$(zpool status $pool | awk 'L{print $NF;L=0} /CKSUM$/{L=1}') + typeset -i cksum=$(zpool status $pool | awk ' + !NF { isvdev = 0 } + isvdev { errors += $NF } + /CKSUM$/ { isvdev = 1 } + END { print errors } + ') if [[ $cksum != 0 ]]; then log_must zpool status -v log_fail "Unexpected CKSUM errors found on $pool ($cksum)" @@ -2143,25 +2315,27 @@ function check_pool_status # pool token keyword if [[ $verbose == true ]]; then log_note $scan fi - echo $scan | grep -i "$keyword" > /dev/null 2>&1 + echo $scan | egrep -i "$keyword" > /dev/null 2>&1 return $? 
} # -# These 6 following functions are instance of check_pool_status() -# is_pool_resilvering - to check if the pool is resilver in progress -# is_pool_resilvered - to check if the pool is resilver completed -# is_pool_scrubbing - to check if the pool is scrub in progress -# is_pool_scrubbed - to check if the pool is scrub completed -# is_pool_scrub_stopped - to check if the pool is scrub stopped -# is_pool_scrub_paused - to check if the pool has scrub paused -# is_pool_removing - to check if the pool is removing a vdev -# is_pool_removed - to check if the pool is remove completed +# The following functions are instance of check_pool_status() +# is_pool_resilvering - to check if the pool resilver is in progress +# is_pool_resilvered - to check if the pool resilver is completed +# is_pool_scrubbing - to check if the pool scrub is in progress +# is_pool_scrubbed - to check if the pool scrub is completed +# is_pool_scrub_stopped - to check if the pool scrub is stopped +# is_pool_scrub_paused - to check if the pool scrub has paused +# is_pool_removing - to check if the pool removing is a vdev +# is_pool_removed - to check if the pool remove is completed +# is_pool_discarding - to check if the pool checkpoint is being discarded # function is_pool_resilvering #pool { - check_pool_status "$1" "scan" "resilver in progress since " $2 + check_pool_status "$1" "scan" \ + "resilver[ ()0-9A-Za-z:_-]* in progress since" $2 return $? } @@ -2207,6 +2381,12 @@ function is_pool_removed #pool return $? } +function is_pool_discarding #pool +{ + check_pool_status "$1" "checkpoint" "discarding" + return $? 
+} + function wait_for_degraded { typeset pool=$1 @@ -2234,10 +2414,11 @@ function cleanup_devices #vdevs { typeset pool="foopool$$" - if poolexists $pool ; then - destroy_pool $pool - fi + for vdev in $@; do + zero_partitions $vdev + done + poolexists $pool && destroy_pool $pool create_pool $pool $@ destroy_pool $pool @@ -2257,7 +2438,7 @@ function cleanup_devices #vdevs function find_disks { # Trust provided list, no attempt is made to locate unused devices. - if is_linux; then + if is_linux || is_freebsd; then echo "$@" return fi @@ -2325,19 +2506,236 @@ EOF # each case. limit the number to max_finddisksnum count=0 for disk in $unused_candidates; do - if [ -b $DEV_DSKDIR/${disk}s0 ]; then - if [ $count -lt $max_finddisksnum ]; then + if is_disk_device $DEV_DSKDIR/${disk}s0 && \ + [ $count -lt $max_finddisksnum ]; then unused="$unused $disk" # do not impose limit if $@ is provided [[ -z $@ ]] && ((count = count + 1)) fi - fi done # finally, return our disk list echo $unused } +function add_user_freebsd # +{ + typeset group=$1 + typeset user=$2 + typeset basedir=$3 + + # Check to see if the user exists. + if id $user > /dev/null 2>&1; then + return 0 + fi + + # Assign 1000 as the base uid + typeset -i uid=1000 + while true; do + typeset -i ret + pw useradd -u $uid -g $group -d $basedir/$user -m -n $user + ret=$? + case $ret in + 0) break ;; + # The uid is not unique + 65) ((uid += 1)) ;; + *) return 1 ;; + esac + if [[ $uid == 65000 ]]; then + log_fail "No user id available under 65000 for $user" + fi + done + + # Silence MOTD + touch $basedir/$user/.hushlogin + + return 0 +} + +# +# Delete the specified user. +# +# $1 login name +# +function del_user_freebsd # +{ + typeset user=$1 + + if id $user > /dev/null 2>&1; then + log_must pw userdel $user + fi + + return 0 +} + +# +# Select valid gid and create specified group. +# +# $1 group name +# +function add_group_freebsd # +{ + typeset group=$1 + + # See if the group already exists. 
+ if pw groupshow $group >/dev/null 2>&1; then + return 0 + fi + + # Assign 1000 as the base gid + typeset -i gid=1000 + while true; do + pw groupadd -g $gid -n $group > /dev/null 2>&1 + typeset -i ret=$? + case $ret in + 0) return 0 ;; + # The gid is not unique + 65) ((gid += 1)) ;; + *) return 1 ;; + esac + if [[ $gid == 65000 ]]; then + log_fail "No user id available under 65000 for $group" + fi + done +} + +# +# Delete the specified group. +# +# $1 group name +# +function del_group_freebsd # +{ + typeset group=$1 + + pw groupdel -n $group > /dev/null 2>&1 + typeset -i ret=$? + case $ret in + # Group does not exist, or was deleted successfully. + 0|6|65) return 0 ;; + # Name already exists as a group name + 9) log_must pw groupdel $group ;; + *) return 1 ;; + esac + + return 0 +} + +function add_user_illumos # +{ + typeset group=$1 + typeset user=$2 + typeset basedir=$3 + + log_must useradd -g $group -d $basedir/$user -m $user + + return 0 +} + +function del_user_illumos # +{ + typeset user=$1 + + if id $user > /dev/null 2>&1; then + log_must_retry "currently used" 6 userdel $user + fi + + return 0 +} + +function add_group_illumos # +{ + typeset group=$1 + + typeset -i gid=100 + while true; do + groupadd -g $gid $group > /dev/null 2>&1 + typeset -i ret=$? + case $ret in + 0) return 0 ;; + # The gid is not unique + 4) ((gid += 1)) ;; + *) return 1 ;; + esac + done +} + +function del_group_illumos # +{ + typeset group=$1 + + groupmod -n $grp $grp > /dev/null 2>&1 + typeset -i ret=$? + case $ret in + # Group does not exist. + 6) return 0 ;; + # Name already exists as a group name + 9) log_must groupdel $grp ;; + *) return 1 ;; + esac +} + +function add_user_linux # +{ + typeset group=$1 + typeset user=$2 + typeset basedir=$3 + + log_must useradd -g $group -d $basedir/$user -m $user + + # Add new users to the same group and the command line utils. 
+ # This allows them to be run out of the original users home + # directory as long as it permissioned to be group readable. + cmd_group=$(stat --format="%G" $(which zfs)) + log_must usermod -a -G $cmd_group $user + + return 0 +} + +function del_user_linux # +{ + typeset user=$1 + + if id $user > /dev/null 2>&1; then + log_must_retry "currently used" 6 userdel $user + fi + + return 0 +} + +function add_group_linux # +{ + typeset group=$1 + + # Assign 100 as the base gid, a larger value is selected for + # Linux because for many distributions 1000 and under are reserved. + while true; do + groupadd $group > /dev/null 2>&1 + typeset -i ret=$? + case $ret in + 0) return 0 ;; + *) return 1 ;; + esac + done +} + +function del_group_linux # +{ + typeset group=$1 + + getent group $group > /dev/null 2>&1 + typeset -i ret=$? + case $ret in + # Group does not exist. + 2) return 0 ;; + # Name already exists as a group name + 0) log_must groupdel $group ;; + *) return 1 ;; + esac + + return 0 +} + # # Add specified user to specified group # @@ -2347,26 +2745,25 @@ EOF # function add_user # { - typeset gname=$1 - typeset uname=$2 + typeset group=$1 + typeset user=$2 typeset basedir=${3:-"/var/tmp"} - if ((${#gname} == 0 || ${#uname} == 0)); then + if ((${#group} == 0 || ${#user} == 0)); then log_fail "group name or user name are not defined." fi - log_must useradd -g $gname -d $basedir/$uname -m $uname - echo "export PATH=\"$STF_PATH\"" >>$basedir/$uname/.profile - echo "export PATH=\"$STF_PATH\"" >>$basedir/$uname/.bash_profile - echo "export PATH=\"$STF_PATH\"" >>$basedir/$uname/.login - - # Add new users to the same group and the command line utils. - # This allows them to be run out of the original users home - # directory as long as it permissioned to be group readable. 
- if is_linux; then - cmd_group=$(stat --format="%G" $(which zfs)) - log_must usermod -a -G $cmd_group $uname - fi + case $(uname) in + FreeBSD) + add_user_freebsd "$group" "$user" "$basedir" + ;; + Linux) + add_user_linux "$group" "$user" "$basedir" + ;; + *) + add_user_illumos "$group" "$user" "$basedir" + ;; + esac return 0 } @@ -2386,9 +2783,17 @@ function del_user # log_fail "login name is necessary." fi - if id $user > /dev/null 2>&1; then - log_must_retry "currently used" 5 userdel $user - fi + case $(uname) in + FreeBSD) + del_user_freebsd "$user" + ;; + Linux) + del_user_linux "$user" + ;; + *) + del_user_illumos "$user" + ;; + esac [[ -d $basedir/$user ]] && rm -fr $basedir/$user @@ -2408,30 +2813,19 @@ function add_group # log_fail "group name is necessary." fi - # Assign 100 as the base gid, a larger value is selected for - # Linux because for many distributions 1000 and under are reserved. - if is_linux; then - while true; do - groupadd $group > /dev/null 2>&1 - typeset -i ret=$? - case $ret in - 0) return 0 ;; - *) return 1 ;; - esac - done - else - typeset -i gid=100 - while true; do - groupadd -g $gid $group > /dev/null 2>&1 - typeset -i ret=$? - case $ret in - 0) return 0 ;; - # The gid is not unique - 4) ((gid += 1)) ;; - *) return 1 ;; - esac - done - fi + case $(uname) in + FreeBSD) + add_group_freebsd "$group" + ;; + Linux) + add_group_linux "$group" + ;; + *) + add_group_illumos "$group" + ;; + esac + + return 0 } # @@ -2441,32 +2835,23 @@ function add_group # # function del_group # { - typeset grp=$1 - if ((${#grp} == 0)); then + typeset group=$1 + + if ((${#group} == 0)); then log_fail "group name is necessary." fi - if is_linux; then - getent group $grp > /dev/null 2>&1 - typeset -i ret=$? - case $ret in - # Group does not exist. - 2) return 0 ;; - # Name already exists as a group name - 0) log_must groupdel $grp ;; - *) return 1 ;; - esac - else - groupmod -n $grp $grp > /dev/null 2>&1 - typeset -i ret=$? 
- case $ret in - # Group does not exist. - 6) return 0 ;; - # Name already exists as a group name - 9) log_must groupdel $grp ;; - *) return 1 ;; - esac - fi + case $(uname) in + FreeBSD) + del_group_freebsd "$group" + ;; + Linux) + del_group_linux "$group" + ;; + *) + del_group_illumos "$group" + ;; + esac return 0 } @@ -2542,29 +2927,6 @@ function safe_to_destroy_pool { # $1 the pool name fi } -# -# Get the available ZFS compression options -# $1 option type zfs_set|zfs_compress -# -function get_compress_opts -{ - typeset COMPRESS_OPTS - typeset GZIP_OPTS="gzip gzip-1 gzip-2 gzip-3 gzip-4 gzip-5 \ - gzip-6 gzip-7 gzip-8 gzip-9" - - if [[ $1 == "zfs_compress" ]] ; then - COMPRESS_OPTS="on lzjb" - elif [[ $1 == "zfs_set" ]] ; then - COMPRESS_OPTS="on off lzjb" - fi - typeset valid_opts="$COMPRESS_OPTS" - zfs get 2>&1 | grep gzip >/dev/null 2>&1 - if [[ $? -eq 0 ]]; then - valid_opts="$valid_opts $GZIP_OPTS" - fi - echo "$valid_opts" -} - # # Verify zfs operation with -p option work as expected # $1 operation, value could be create, clone or rename @@ -2704,28 +3066,6 @@ function random_get _random_get "$#" "$@" } -# -# Detect if the current system support slog -# -function verify_slog_support -{ - typeset dir=$TEST_BASE_DIR/disk.$$ - typeset pool=foo.$$ - typeset vdev=$dir/a - typeset sdev=$dir/b - - mkdir -p $dir - mkfile $MINVDEVSIZE $vdev $sdev - - typeset -i ret=0 - if ! 
zpool create -n $pool $vdev log $sdev > /dev/null 2>&1; then - ret=1 - fi - rm -r $dir - - return $ret -} - # # The function will generate a dataset name with specific length # $1, the length of the name @@ -2836,7 +3176,7 @@ function labelvtoc typeset label_file=/var/tmp/labelvtoc.$$ typeset arch=$(uname -p) - if is_linux; then + if is_linux || is_freebsd; then log_note "Currently unsupported by the test framework" return 1 fi @@ -2878,7 +3218,7 @@ function labelvtoc # # check if the system was installed as zfsroot or not -# return: 0 ture, otherwise false +# return: 0 if zfsroot, non-zero if not # function is_zfsroot { @@ -2894,7 +3234,9 @@ function get_rootfs { typeset rootfs="" - if ! is_linux; then + if is_freebsd; then + rootfs=$(mount -p | awk '$2 == "/" && $3 == "zfs" {print $1}') + elif ! is_linux; then rootfs=$(awk '{if ($2 == "/" && $3 == "zfs") print $1}' \ /etc/mnttab) fi @@ -2919,7 +3261,9 @@ function get_rootpool typeset rootfs="" typeset rootpool="" - if ! is_linux; then + if is_freebsd; then + rootfs=$(mount -p | awk '$2 == "/" && $3 == "zfs" {print $1}') + elif ! is_linux; then rootfs=$(awk '{if ($2 == "/" && $3 =="zfs") print $1}' \ /etc/mnttab) fi @@ -2928,23 +3272,12 @@ function get_rootpool fi zfs list $rootfs > /dev/null 2>&1 if (($? == 0)); then - rootpool=`echo $rootfs | awk -F\/ '{print $1}'` - echo $rootpool + echo ${rootfs%%/*} else log_fail "This is not a zfsroot system." 
fi } -# -# Get the package name -# -function get_package_name -{ - typeset dirpath=${1:-$STC_NAME} - - echo "SUNWstc-${dirpath}" | /usr/bin/sed -e "s/\//-/g" -} - # # Get the word numbers from a string separated by white space # @@ -3007,6 +3340,8 @@ function is_mp { if is_linux; then (($(nproc) > 1)) + elif is_freebsd; then + sysctl -n kern.smp.cpus else (($(psrinfo | wc -l) > 1)) fi @@ -3018,6 +3353,8 @@ function get_cpu_freq { if is_linux; then lscpu | awk '/CPU MHz/ { print $3 }' + elif is_freebsd; then + sysctl -n hw.clockrate else psrinfo -v 0 | awk '/processor operates at/ {print $6}' fi @@ -3029,9 +3366,17 @@ function user_run typeset user=$1 shift - log_note "user:$user $@" - eval su - \$user -c \"$@\" > $TEST_BASE_DIR/out 2>$TEST_BASE_DIR/err - return $? + log_note "user: $user" + log_note "cmd: $*" + + typeset out=$TEST_BASE_DIR/out + typeset err=$TEST_BASE_DIR/err + + sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err + typeset res=$? + log_note "out: $(<$out)" + log_note "err: $(<$err)" + return $res } # @@ -3076,7 +3421,7 @@ function get_max shift for i in "$@"; do - max=$(echo $((max > i ? max : i))) + max=$((max > i ? max : i)) done echo $max @@ -3088,21 +3433,12 @@ function get_min shift for i in "$@"; do - min=$(echo $((min < i ? min : i))) + min=$((min < i ? min : i)) done echo $min } -# -# Generate a random number between 1 and the argument. 
-# -function random -{ - typeset max=$1 - echo $(( ($RANDOM % $max) + 1 )) -} - # Write data that can be compressed into a directory function write_compressible { @@ -3157,7 +3493,11 @@ function get_objnum typeset objnum [[ -e $pathname ]] || log_fail "No such file or directory: $pathname" - objnum=$(stat -c %i $pathname) + if is_freebsd; then + objnum=$(stat -f "%i" $pathname) + else + objnum=$(stat -c %i $pathname) + fi echo $objnum } @@ -3214,17 +3554,12 @@ function wait_replacing #pool # Wait for a pool to be scrubbed # # $1 pool name -# $2 number of seconds to wait (optional) -# -# Returns true when pool has been scrubbed, or false if there's a timeout or if -# no scrub was done. # function wait_scrubbed { typeset pool=${1:-$TESTPOOL} - while true ; do - is_pool_scrubbed $pool && break - log_must sleep 1 + while ! is_pool_scrubbed $pool ; do + sleep 1 done } @@ -3250,7 +3585,7 @@ function zed_rc_restore function zed_setup { if ! is_linux; then - return + log_unsupported "No zed on $(uname)" fi if [[ ! -d $ZEDLET_DIR ]]; then @@ -3336,16 +3671,16 @@ function zed_start # Verify the ZED is not already running. pgrep -x zed > /dev/null if (($? == 0)); then - log_fail "ZED already running" + log_note "ZED already running" + else + log_note "Starting ZED" + # run ZED in the background and redirect foreground logging + # output to $ZED_LOG. + log_must truncate -s 0 $ZED_DEBUG_LOG + log_must eval "zed -vF -d $ZEDLET_DIR -P $PATH" \ + "-s $ZEDLET_DIR/state -j 1 2>$ZED_LOG &" fi - log_note "Starting ZED" - # run ZED in the background and redirect foreground logging - # output to $ZED_LOG. 
- log_must truncate -s 0 $ZED_DEBUG_LOG - log_must eval "zed -vF -d $ZEDLET_DIR -p $ZEDLET_DIR/zed.pid -P $PATH" \ - "-s $ZEDLET_DIR/state 2>$ZED_LOG &" - return 0 } @@ -3359,14 +3694,13 @@ function zed_stop fi log_note "Stopping ZED" - if [[ -f ${ZEDLET_DIR}/zed.pid ]]; then - zedpid=$(<${ZEDLET_DIR}/zed.pid) - kill $zedpid - while ps -p $zedpid > /dev/null; do - sleep 1 - done - rm -f ${ZEDLET_DIR}/zed.pid - fi + while true; do + zedpids="$(pgrep -x zed)" + [ "$?" -ne 0 ] && break + + log_must kill $zedpids + sleep 1 + done return 0 } @@ -3412,6 +3746,8 @@ function is_swap_inuse if is_linux; then swapon -s | grep -w $(readlink -f $device) > /dev/null 2>&1 + elif is_freebsd; then + swapctl -l | grep -w $device else swap -l | grep -w $device > /dev/null 2>&1 fi @@ -3429,6 +3765,8 @@ function swap_setup if is_linux; then log_must eval "mkswap $swapdev > /dev/null 2>&1" log_must swapon $swapdev + elif is_freebsd; then + log_must swapctl -a $swapdev else log_must swap -a $swapdev fi @@ -3446,6 +3784,8 @@ function swap_cleanup if is_swap_inuse $swapdev; then if is_linux; then log_must swapoff $swapdev + elif is_freebsd; then + log_must swapoff $swapdev else log_must swap -d $swapdev fi @@ -3457,7 +3797,7 @@ function swap_cleanup # # Set a global system tunable (64-bit value) # -# $1 tunable name +# $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable64 @@ -3468,7 +3808,7 @@ function set_tunable64 # # Set a global system tunable (32-bit value) # -# $1 tunable name +# $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable32 @@ -3478,12 +3818,23 @@ function set_tunable32 function set_tunable_impl { - typeset tunable="$1" + typeset name="$1" typeset value="$2" typeset mdb_cmd="$3" typeset module="${4:-zfs}" - [[ -z "$tunable" ]] && return 1 + eval "typeset tunable=\$$name" + case "$tunable" in + UNSUPPORTED) + log_unsupported "Tunable '$name' is unsupported on $(uname)" + ;; + "") + 
log_fail "Tunable '$name' must be added to tunables.cfg" + ;; + *) + ;; + esac + [[ -z "$value" ]] && return 1 [[ -z "$mdb_cmd" ]] && return 1 @@ -3491,13 +3842,17 @@ function set_tunable_impl Linux) typeset zfs_tunables="/sys/module/$module/parameters" [[ -w "$zfs_tunables/$tunable" ]] || return 1 - echo -n "$value" > "$zfs_tunables/$tunable" + cat >"$zfs_tunables/$tunable" <<<"$value" + return $? + ;; + FreeBSD) + sysctl vfs.zfs.$tunable=$value return "$?" ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 echo "${tunable}/${mdb_cmd}0t${value}" | mdb -kw - return "$?" + return $? ;; esac } @@ -3505,7 +3860,7 @@ function set_tunable_impl # # Get a global system tunable # -# $1 tunable name +# $1 tunable name (use a NAME defined in tunables.cfg) # function get_tunable { @@ -3514,17 +3869,30 @@ function get_tunable function get_tunable_impl { - typeset tunable="$1" + typeset name="$1" typeset module="${2:-zfs}" - [[ -z "$tunable" ]] && return 1 + eval "typeset tunable=\$$name" + case "$tunable" in + UNSUPPORTED) + log_unsupported "Tunable '$name' is unsupported on $(uname)" + ;; + "") + log_fail "Tunable '$name' must be added to tunables.cfg" + ;; + *) + ;; + esac case "$(uname)" in Linux) typeset zfs_tunables="/sys/module/$module/parameters" [[ -f "$zfs_tunables/$tunable" ]] || return 1 cat $zfs_tunables/$tunable - return "$?" + return $? + ;; + FreeBSD) + sysctl -n vfs.zfs.$tunable ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 @@ -3594,3 +3962,313 @@ function mdb_ctf_set_int return 0 } + +# +# Compute MD5 digest for given file or stdin if no file given. +# Note: file path must not contain spaces +# +function md5digest +{ + typeset file=$1 + + case $(uname) in + FreeBSD) + md5 -q $file + ;; + *) + md5sum -b $file | awk '{ print $1 }' + ;; + esac +} + +# +# Compute SHA256 digest for given file or stdin if no file given. 
+# Note: file path must not contain spaces +# +function sha256digest +{ + typeset file=$1 + + case $(uname) in + FreeBSD) + sha256 -q $file + ;; + *) + sha256sum -b $file | awk '{ print $1 }' + ;; + esac +} + +function new_fs # +{ + case $(uname) in + FreeBSD) + newfs "$@" + ;; + *) + echo y | newfs -v "$@" + ;; + esac +} + +function stat_size # +{ + typeset path=$1 + + case $(uname) in + FreeBSD) + stat -f %z "$path" + ;; + *) + stat -c %s "$path" + ;; + esac +} + +function stat_ctime # +{ + typeset path=$1 + + case $(uname) in + FreeBSD) + stat -f %c "$path" + ;; + *) + stat -c %Z "$path" + ;; + esac +} + +function stat_crtime # +{ + typeset path=$1 + + case $(uname) in + FreeBSD) + stat -f %B "$path" + ;; + *) + stat -c %W "$path" + ;; + esac +} + +# Run a command as if it was being run in a TTY. +# +# Usage: +# +# faketty command +# +function faketty +{ + if is_freebsd; then + script -q /dev/null env "$@" + else + script --return --quiet -c "$*" /dev/null + fi +} + +# +# Produce a random permutation of the integers in a given range (inclusive). 
+# +function range_shuffle # begin end +{ + typeset -i begin=$1 + typeset -i end=$2 + + seq ${begin} ${end} | sort -R +} + +# +# Cross-platform xattr helpers +# + +function get_xattr # name path +{ + typeset name=$1 + typeset path=$2 + + case $(uname) in + FreeBSD) + getextattr -qq user "${name}" "${path}" + ;; + *) + attr -qg "${name}" "${path}" + ;; + esac +} + +function set_xattr # name value path +{ + typeset name=$1 + typeset value=$2 + typeset path=$3 + + case $(uname) in + FreeBSD) + setextattr user "${name}" "${value}" "${path}" + ;; + *) + attr -qs "${name}" -V "${value}" "${path}" + ;; + esac +} + +function set_xattr_stdin # name value +{ + typeset name=$1 + typeset path=$2 + + case $(uname) in + FreeBSD) + setextattr -i user "${name}" "${path}" + ;; + *) + attr -qs "${name}" "${path}" + ;; + esac +} + +function rm_xattr # name path +{ + typeset name=$1 + typeset path=$2 + + case $(uname) in + FreeBSD) + rmextattr -q user "${name}" "${path}" + ;; + *) + attr -qr "${name}" "${path}" + ;; + esac +} + +function ls_xattr # path +{ + typeset path=$1 + + case $(uname) in + FreeBSD) + lsextattr -qq user "${path}" + ;; + *) + attr -ql "${path}" + ;; + esac +} + +function kstat # stat flags? 
+{ + typeset stat=$1 + typeset flags=${2-"-n"} + + case $(uname) in + FreeBSD) + sysctl $flags kstat.zfs.misc.$stat + ;; + Linux) + typeset zfs_kstat="/proc/spl/kstat/zfs/$stat" + [[ -f "$zfs_kstat" ]] || return 1 + cat $zfs_kstat + ;; + *) + false + ;; + esac +} + +function get_arcstat # stat +{ + typeset stat=$1 + + case $(uname) in + FreeBSD) + kstat arcstats.$stat + ;; + Linux) + kstat arcstats | awk "/$stat/ { print \$3 }" + ;; + *) + false + ;; + esac +} + +function punch_hole # offset length file +{ + typeset offset=$1 + typeset length=$2 + typeset file=$3 + + case $(uname) in + FreeBSD) + truncate -d -o $offset -l $length "$file" + ;; + Linux) + fallocate --punch-hole --offset $offset --length $length "$file" + ;; + *) + false + ;; + esac +} + +# +# Wait for the specified arcstat to reach non-zero quiescence. +# If echo is 1 echo the value after reaching quiescence, otherwise +# if echo is 0 print the arcstat we are waiting on. +# +function arcstat_quiescence # stat echo +{ + typeset stat=$1 + typeset echo=$2 + typeset do_once=true + + if [[ $echo -eq 0 ]]; then + echo "Waiting for arcstat $1 quiescence." + fi + + while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do + typeset stat1=$(get_arcstat $stat) + sleep 2 + typeset stat2=$(get_arcstat $stat) + do_once=false + done + + if [[ $echo -eq 1 ]]; then + echo $stat2 + fi +} + +function arcstat_quiescence_noecho # stat +{ + typeset stat=$1 + arcstat_quiescence $stat 0 +} + +function arcstat_quiescence_echo # stat +{ + typeset stat=$1 + arcstat_quiescence $stat 1 +} + +# +# Given an array of pids, wait until all processes +# have completed and check their return status. +# +function wait_for_children #children +{ + rv=0 + children=("$@") + for child in "${children[@]}" + do + child_exit=0 + wait ${child} || child_exit=$? 
+ if [ $child_exit -ne 0 ]; then + echo "child ${child} failed with ${child_exit}" + rv=1 + fi + done + return $rv +} diff --git a/tests/zfs-tests/include/math.shlib b/tests/zfs-tests/include/math.shlib index 0c3508ec2f..7ac59f2796 100644 --- a/tests/zfs-tests/include/math.shlib +++ b/tests/zfs-tests/include/math.shlib @@ -30,14 +30,15 @@ function within_percent typeset percent=$3 # Set $a or $b to $2 such that a >= b - [[ '1' = $(echo "if ($2 > $a) 1" | bc) ]] && a=$2 || b=$2 + [[ '1' = $(echo "if ($2 > $a) 1 else 0" | bc) ]] && a=$2 || b=$2 # Prevent division by 0 [[ $a =~ [1-9] ]] || return 1 typeset p=$(echo "scale=2; $b * 100 / $a" | bc) log_note "Comparing $a and $b given $percent% (calculated: $p%)" - [[ '1' = $(echo "scale=2; if ($p >= $percent) 1" | bc) ]] && return 0 + [[ '1' = $(echo "scale=2; if ($p >= $percent) 1 else 0" | bc) ]] && \ + return 0 return 1 } @@ -119,3 +120,25 @@ function verify_ne # log_fail "Compared $type should be not equal: $a == $b" fi } + +# A simple function to get a random number between two bounds (inclusive) +# +# Probably not the most efficient for large ranges, but it's okay. +# +# Note since we're using $RANDOM, 32767 is the largest number we +# can accept as the upper bound. +# +# $1 lower bound +# $2 upper bound +function random_int_between +{ + typeset -i min=$1 + typeset -i max=$2 + typeset -i rand=0 + + while [[ $rand -lt $min ]] ; do + rand=$(( $RANDOM % $max + 1)) + done + + echo $rand +} diff --git a/tests/zfs-tests/include/properties.shlib b/tests/zfs-tests/include/properties.shlib index 25a9846ddd..6d467b6005 100644 --- a/tests/zfs-tests/include/properties.shlib +++ b/tests/zfs-tests/include/properties.shlib @@ -10,13 +10,17 @@ # # -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2016, Delphix. All rights reserved. # -typeset -a compress_prop_vals=('on' 'off' 'lzjb' 'gzip' 'gzip-1' 'gzip-2' - 'gzip-3' 'gzip-4' 'gzip-5' 'gzip-6' 'gzip-7' 'gzip-8' 'gzip-9' 'zle' 'lz4') +. 
$STF_SUITE/include/libtest.shlib + +typeset -a compress_prop_vals=('off' 'lzjb' 'lz4' 'gzip' 'zle' 'zstd') typeset -a checksum_prop_vals=('on' 'off' 'fletcher2' 'fletcher4' 'sha256' - 'noparity' 'sha512' 'skein' 'edonr') + 'noparity' 'sha512' 'skein') +if ! is_freebsd; then + checksum_prop_vals+=('edonr') +fi typeset -a recsize_prop_vals=('512' '1024' '2048' '4096' '8192' '16384' '32768' '65536' '131072' '262144' '524288' '1048576') typeset -a canmount_prop_vals=('on' 'off' 'noauto') @@ -35,64 +39,66 @@ typeset -a vol_props=('compress' 'checksum' 'copies' 'logbias' 'primarycache' 'secondarycache' 'redundant_metadata' 'sync') # -# Given the property array passed in, return 'num_props' elements to the -# user, excluding any elements below 'start.' This allows us to exclude -# 'off' and 'on' which can be either unwanted, or a duplicate of another -# property respectively. +# Given the 'prop' passed in, return 'num_vals' elements of the corresponding +# values array to the user, excluding any elements below 'first.' This allows +# us to exclude 'off' and 'on' which can be either unwanted, or a duplicate of +# another property respectively. 
# -function get_rand_prop +function get_rand_prop_vals { - typeset prop_array=($(eval echo \${$1[@]})) - typeset -i num_props=$2 - typeset -i start=$3 + typeset prop=$1 + typeset -i num_vals=$2 + typeset -i first=$3 + + [[ -z $prop || -z $num_vals || -z $first ]] && \ + log_fail "get_rand_prop_vals: bad arguments" + typeset retstr="" - [[ -z $prop_array || -z $num_props || -z $start ]] && \ - log_fail "get_rand_prop: bad arguments" + typeset prop_vals_var=${prop}_prop_vals + typeset -a prop_vals=($(eval echo \${${prop_vals_var}[@]})) - typeset prop_max=$((${#prop_array[@]} - 1)) + [[ -z $prop_vals ]] && \ + log_fail "get_rand_prop_vals: bad prop $prop" + + typeset -i last=$((${#prop_vals[@]} - 1)) typeset -i i - for i in $(shuf -i $start-$prop_max -n $num_props); do - retstr="${prop_array[$i]} $retstr" + for i in $(range_shuffle $first $last | head -n $num_vals); do + retstr="${prop_vals[$i]} $retstr" done echo $retstr } -function get_rand_compress -{ - get_rand_prop compress_prop_vals $1 2 -} - -function get_rand_compress_any -{ - get_rand_prop compress_prop_vals $1 0 -} - function get_rand_checksum { - get_rand_prop checksum_prop_vals $1 2 + get_rand_prop_vals checksum $1 2 } function get_rand_checksum_any { - get_rand_prop checksum_prop_vals $1 0 + get_rand_prop_vals checksum $1 0 } function get_rand_recsize { - get_rand_prop recsize_prop_vals $1 0 + get_rand_prop_vals recsize $1 0 } function get_rand_large_recsize { - get_rand_prop recsize_prop_vals $1 9 + get_rand_prop_vals recsize $1 9 } # # Functions to toggle on/off properties # -typeset -a binary_props=('atime' 'devices' 'exec' 'readonly' 'setuid' 'xattr' - 'zoned') +typeset -a binary_props=('atime' 'devices' 'exec' 'readonly' 'setuid' 'xattr') + +if is_freebsd; then + binary_props+=('jailed') +else + binary_props+=('zoned') +fi if is_linux; then # Only older kernels support non-blocking mandatory locks @@ -143,7 +149,7 @@ function randomize_ds_props fi for prop in $proplist; do - typeset 
val=$(get_rand_prop "${prop}_prop_vals" 1 0) + typeset val=$(get_rand_prop_vals $prop 1 0) log_must zfs set $prop=$val $ds done } diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg new file mode 100644 index 0000000000..fff43e4691 --- /dev/null +++ b/tests/zfs-tests/include/tunables.cfg @@ -0,0 +1,96 @@ +# This file exports variables for each tunable used in the test suite. +# +# Different platforms use different names for most tunables. To avoid littering +# the tests with conditional logic for deciding how to set each tunable, the +# logic is instead consolidated to this one file. +# +# Any use of tunables in tests must use a name defined here. New entries +# should be added to the table as needed. Please keep the table sorted +# alphabetically for ease of maintenance. +# +# Platform-specific tunables should still use a NAME from this table for +# consistency. Enter UNSUPPORTED in the column for platforms on which the +# tunable is not implemented. 
+ +UNAME=$(uname) + +# NAME FreeBSD tunable Linux tunable +cat <<%%%% | +ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot +ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount +ARC_MAX arc.max zfs_arc_max +ARC_MIN arc.min zfs_arc_min +ASYNC_BLOCK_MAX_BLOCKS async_block_max_blocks zfs_async_block_max_blocks +CHECKSUM_EVENTS_PER_SECOND checksum_events_per_second zfs_checksum_events_per_second +COMMIT_TIMEOUT_PCT commit_timeout_pct zfs_commit_timeout_pct +COMPRESSED_ARC_ENABLED compressed_arc_enabled zfs_compressed_arc_enabled +CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms zfs_condense_indirect_commit_entry_delay_ms +CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indirect_obsolete_pct +CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes +DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift +DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms +DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode +DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms +DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms +DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check +DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync +INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size +INITIALIZE_VALUE initialize_value zfs_initialize_value +KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export +LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit +L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly +L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch +L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size +L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled +L2ARC_TRIM_AHEAD l2arc.trim_ahead l2arc_trim_ahead +L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost +L2ARC_WRITE_MAX l2arc.write_max 
l2arc_write_max +LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc +LIVELIST_CONDENSE_SYNC_CANCEL livelist.condense.sync_cancel zfs_livelist_condense_sync_cancel +LIVELIST_CONDENSE_SYNC_PAUSE livelist.condense.sync_pause zfs_livelist_condense_sync_pause +LIVELIST_CONDENSE_ZTHR_CANCEL livelist.condense.zthr_cancel zfs_livelist_condense_zthr_cancel +LIVELIST_CONDENSE_ZTHR_PAUSE livelist.condense.zthr_pause zfs_livelist_condense_zthr_pause +LIVELIST_MAX_ENTRIES livelist.max_entries zfs_livelist_max_entries +LIVELIST_MIN_PERCENT_SHARED livelist.min_percent_shared zfs_livelist_min_percent_shared +MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting +MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds +METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load +METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging +MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals +MULTIHOST_HISTORY multihost.history zfs_multihost_history +MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals +MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval +OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize +PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable +REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled +REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress +REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment +RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms +SCAN_LEGACY scan_legacy zfs_scan_legacy +SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress +SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit +SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time +SLOW_IO_EVENTS_PER_SECOND slow_io_events_per_second zfs_slow_io_events_per_second +SPA_ASIZE_INFLATION spa.asize_inflation 
spa_asize_inflation +SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit +SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data +SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata +TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min +TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip +TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch +TXG_HISTORY txg.history zfs_txg_history +TXG_TIMEOUT txg.timeout zfs_txg_timeout +UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress +VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift +VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count +VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip +VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev +VOL_MODE vol.mode zvol_volmode +VOL_RECURSIVE vol.recursive UNSUPPORTED +ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max +ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max +ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms +%%%% +while read name FreeBSD Linux; do + eval "export ${name}=\$${UNAME}" +done diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index da27673ec9..137cddd5f7 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -4,6 +4,7 @@ SUBDIRS = \ arc \ atime \ bootfs \ + btree \ cache \ cachefile \ casenorm \ @@ -15,12 +16,14 @@ SUBDIRS = \ cli_user \ compression \ cp_files \ + crtime \ ctime \ deadman \ delegate \ devices \ events \ exec \ + fallocate \ fault \ features \ grow \ @@ -29,12 +32,13 @@ SUBDIRS = \ inheritance \ inuse \ io \ + l2arc \ large_files \ largest_pool \ libzfs \ limits \ - pyzfs \ link_count \ + log_spacemap \ migration \ mmap \ mmp \ @@ -44,14 +48,17 @@ SUBDIRS = \ no_space \ nopwrite \ online_offline \ + pam \ pool_checkpoint \ pool_names \ poolversion \ privilege \ procfs \ projectquota \ + pyzfs \ quota \ raidz \ + 
redacted_send \ redundancy \ refquota \ refreserv \ @@ -66,8 +73,8 @@ SUBDIRS = \ snapshot \ snapused \ sparse \ + suid \ threadsappend \ - tmpfile \ trim \ truncate \ upgrade \ @@ -76,4 +83,10 @@ SUBDIRS = \ vdev_zaps \ write_dirs \ xattr \ + zpool_influxdb \ zvol + +if BUILD_LINUX +SUBDIRS += \ + tmpfile +endif diff --git a/tests/zfs-tests/tests/functional/acl/Makefile.am b/tests/zfs-tests/tests/functional/acl/Makefile.am index 6086930e36..d752f63744 100644 --- a/tests/zfs-tests/tests/functional/acl/Makefile.am +++ b/tests/zfs-tests/tests/functional/acl/Makefile.am @@ -3,4 +3,4 @@ dist_pkgdata_DATA = \ acl.cfg \ acl_common.kshlib -SUBDIRS = posix +SUBDIRS = off posix posix-sa diff --git a/tests/zfs-tests/tests/functional/acl/acl_common.kshlib b/tests/zfs-tests/tests/functional/acl/acl_common.kshlib index a81cd76ba6..ba08bcb48b 100644 --- a/tests/zfs-tests/tests/functional/acl/acl_common.kshlib +++ b/tests/zfs-tests/tests/functional/acl/acl_common.kshlib @@ -34,7 +34,7 @@ # # Get the given file/directory access mode # -# $1 object -- file or directroy +# $1 object -- file or directory # function get_mode # { @@ -49,7 +49,7 @@ function get_mode # # # Get the given file/directory ACL # -# $1 object -- file or directroy +# $1 object -- file or directory # function get_acl # { @@ -64,7 +64,7 @@ function get_acl # # # Get the given file/directory ACL # -# $1 object -- file or directroy +# $1 object -- file or directory # function get_compact_acl # { @@ -243,12 +243,12 @@ function usr_exec # [...] # # Count how many ACEs for the specified file or directory. # -# $1 file or directroy name +# $1 file or directory name # function count_ACE # { if [[ ! -e $1 ]]; then - log_note "Need input file or directroy name." + log_note "Need input file or directory name." 
return 1 fi @@ -399,7 +399,7 @@ function rwx_node #user node acl_spec|access # # Get the given file/directory xattr # -# $1 object -- file or directroy +# $1 object -- file or directory # function get_xattr # { diff --git a/tests/zfs-tests/tests/functional/acl/off/.gitignore b/tests/zfs-tests/tests/functional/acl/off/.gitignore new file mode 100644 index 0000000000..f3c93191ce --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/off/.gitignore @@ -0,0 +1 @@ +/dosmode_readonly_write diff --git a/tests/zfs-tests/tests/functional/acl/off/Makefile.am b/tests/zfs-tests/tests/functional/acl/off/Makefile.am new file mode 100644 index 0000000000..36aa13dd03 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/off/Makefile.am @@ -0,0 +1,16 @@ +include $(top_srcdir)/config/Rules.am + +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/acl/off + +dist_pkgdata_SCRIPTS = \ + dosmode.ksh \ + posixmode.ksh \ + cleanup.ksh \ + setup.ksh + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/acl/off + +if BUILD_FREEBSD +pkgexec_PROGRAMS = dosmode_readonly_write +dosmode_readonly_write_SOURCES = dosmode_readonly_write.c +endif diff --git a/tests/zfs-tests/tests/functional/acl/off/cleanup.ksh b/tests/zfs-tests/tests/functional/acl/off/cleanup.ksh new file mode 100755 index 0000000000..bb58a8cf2e --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/off/cleanup.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/acl/acl_common.kshlib + +cleanup_user_group + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/acl/off/dosmode.ksh b/tests/zfs-tests/tests/functional/acl/off/dosmode.ksh new file mode 100755 index 0000000000..e232dfd525 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/off/dosmode.ksh @@ -0,0 +1,199 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Portions Copyright 2021 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/acl/acl_common.kshlib + +# +# DESCRIPTION: +# Verify that DOS mode flags function correctly. 
+# +# These flags are not currently exposed on Linux, so the test is +# only useful on FreeBSD. +# +# STRATEGY: +# 1. ARCHIVE +# 2. HIDDEN +# 3. OFFLINE +# 4. READONLY +# 5. REPARSE +# 6. SPARSE +# 7. SYSTEM +# + +verify_runnable "both" + +function cleanup +{ + rm -f $testfile +} + +function hasflag +{ + typeset flag=$1 + typeset path=$2 + + ls -lo $path | awk '{ gsub(",", "\n", $5); print $5 }' | grep -qxF $flag +} + +log_assert "Verify DOS mode flags function correctly" +log_onexit cleanup + +tests_base=$STF_SUITE/tests/functional/acl/off +testfile=$TESTDIR/testfile +owner=$ZFS_ACL_STAFF1 +other=$ZFS_ACL_STAFF2 + +# +# ARCHIVE +# +# This flag is set by ZFS when a file has been updated to indicate that +# the file needs to be archived. +# +log_must touch $testfile +log_must hasflag uarch $testfile +log_must chflags nouarch $testfile +log_must hasflag - $testfile +log_must touch $testfile +log_must hasflag uarch $testfile +log_must rm $testfile +log_must user_run $owner touch $testfile +log_must hasflag uarch $testfile +log_must user_run $owner chflags nouarch $testfile +log_mustnot user_run $other chflags uarch $testfile +log_must hasflag - $testfile +log_must user_run $owner touch $testfile +log_mustnot user_run $other chflags nouarch $testfile +log_must hasflag uarch $testfile +log_must user_run $owner rm $testfile + +# +# HIDDEN +# +log_must touch $testfile +log_must chflags hidden $testfile +log_must hasflag hidden $testfile +log_must chflags 0 $testfile +log_must hasflag - $testfile +log_must rm $testfile +log_must user_run $owner touch $testfile +log_must user_run $owner chflags hidden $testfile +log_mustnot user_run $other chflags nohidden $testfile +log_must hasflag hidden $testfile +log_must user_run $owner chflags 0 $testfile +log_mustnot user_run $other chflags hidden $testfile +log_must hasflag - $testfile +log_must user_run $owner rm $testfile + + +# +# OFFLINE +# +log_must touch $testfile +log_must chflags offline $testfile +log_must hasflag offline 
$testfile +log_must chflags 0 $testfile +log_must hasflag - $testfile +log_must rm $testfile +log_must user_run $owner touch $testfile +log_must user_run $owner chflags offline $testfile +log_mustnot user_run $other chflags nooffline $testfile +log_must hasflag offline $testfile +log_must user_run $owner chflags 0 $testfile +log_mustnot user_run $other chflags offline $testfile +log_must hasflag - $testfile +log_must user_run $owner rm $testfile + +# +# READONLY +# +# This flag prevents users from writing or appending to the file, +# but root is always allowed the operation. +# +log_must touch $testfile +log_must chflags rdonly $testfile +log_must hasflag rdonly $testfile +log_must eval "echo 'root write allowed' >> $testfile" +log_must cat $testfile +log_must chflags 0 $testfile +log_must hasflag - $tesfile +log_must rm $testfile +# It is required to still be able to write to an fd that was opened RW before +# READONLY is set. We have a special test program for that. +log_must user_run $owner touch $testfile +log_mustnot user_run $other chflags rdonly $testfile +log_must user_run $owner $tests_base/dosmode_readonly_write $testfile +log_mustnot user_run $other chflags nordonly $testfile +log_must hasflag rdonly $testfile +log_mustnot user_run $owner "echo 'user write forbidden' >> $testfile" +log_must eval "echo 'root write allowed' >> $testfile" +# We are still allowed to read and remove the file when READONLY is set. 
+log_must user_run $owner cat $testfile +log_must user_run $owner rm $testfile + +# +# REPARSE +# +# FIXME: does not work, not sure if broken or testing wrong +# + +# +# SPARSE +# +log_must truncate -s 1m $testfile +log_must chflags sparse $testfile +log_must hasflag sparse $testfile +log_must chflags 0 $testfile +log_must hasflag - $testfile +log_must rm $testfile +log_must user_run $owner truncate -s 1m $testfile +log_must user_run $owner chflags sparse $testfile +log_mustnot user_run $other chflags nosparse $testfile +log_must hasflag sparse $testfile +log_must user_run $owner chflags 0 $testfile +log_mustnot user_run $other chflags sparse $testfile +log_must hasflag - $testfile +log_must user_run $owner rm $testfile + +# +# SYSTEM +# +log_must touch $testfile +log_must chflags system $testfile +log_must hasflag system $testfile +log_must chflags 0 $testfile +log_must hasflag - $testfile +log_must rm $testfile +log_must user_run $owner touch $testfile +log_must user_run $owner chflags system $testfile +log_mustnot user_run $other chflags nosystem $testfile +log_must hasflag system $testfile +log_must user_run $owner chflags 0 $testfile +log_mustnot user_run $other chflags system $testfile +log_must hasflag - $testfile +log_must user_run $owner rm $testfile + +log_pass "DOS mode flags function correctly" diff --git a/tests/zfs-tests/tests/functional/acl/off/dosmode_readonly_write.c b/tests/zfs-tests/tests/functional/acl/off/dosmode_readonly_write.c new file mode 100644 index 0000000000..372c3f7f64 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/off/dosmode_readonly_write.c @@ -0,0 +1,61 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 iXsystems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Test for correct behavior of DOS mode READONLY flag on a file. + * We should be able to open a file RW, set READONLY, and still write to the fd. 
+ */ + +#include <sys/stat.h> +#include <err.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +int +main(int argc, const char *argv[]) +{ + const char *buf = "We should be allowed to write this to the fd.\n"; + const char *path; + int fd; + + if (argc != 2) { + fprintf(stderr, "usage: %s PATH\n", argv[0]); + return (EXIT_FAILURE); + } + path = argv[1]; + fd = open(path, O_CREAT|O_RDWR, 0777); + if (fd == -1) + err(EXIT_FAILURE, "%s: open failed", path); + if (chflags(path, UF_READONLY) == -1) + err(EXIT_FAILURE, "%s: chflags failed", path); + if (write(fd, buf, strlen(buf)) == -1) + err(EXIT_FAILURE, "%s: write failed", path); + if (close(fd) == -1) + err(EXIT_FAILURE, "%s: close failed", path); + return (EXIT_SUCCESS); +} diff --git a/tests/zfs-tests/tests/functional/acl/off/posixmode.ksh b/tests/zfs-tests/tests/functional/acl/off/posixmode.ksh new file mode 100755 index 0000000000..63870caa32 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/off/posixmode.ksh @@ -0,0 +1,145 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Portions Copyright 2021 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/acl/acl_common.kshlib + +# +# DESCRIPTION: +# Verify that POSIX mode bits function correctly. +# +# These tests are incomplete and will be added to over time. +# +# NOTE: Creating directory entries behaves differently between platforms. +# The parent directory's group is used on FreeBSD, while the effective +# group is used on Linux. We chown to the effective group when creating +# directories and files in these tests to achieve consistency across all +# platforms. +# +# STRATEGY: +# 1. Sanity check the POSIX mode test on tmpfs +# 2. Test POSIX mode bits on ZFS +# + +verify_runnable "both" + +function cleanup +{ + umount -f $tmpdir + rm -rf $tmpdir $TESTDIR/dir +} + +log_assert "Verify POSIX mode bits function correctly" +log_onexit cleanup + +owner=$ZFS_ACL_STAFF1 +other=$ZFS_ACL_STAFF2 +group=$ZFS_ACL_STAFF_GROUP +if is_linux; then + wheel=root +else + wheel=wheel +fi + +function test_posix_mode # base +{ + typeset base=$1 + typeset dir=$base/dir + typeset file=$dir/file + + # dir owned by root + log_must mkdir $dir + log_must chown :$wheel $dir + log_must chmod 007 $dir + + # file owned by root + log_must touch $file + log_must chown :$wheel $file + log_must ls -la $dir + log_must rm $file + + log_must touch $file + log_must chown :$wheel $file + log_must user_run $other rm $file + + # file owned by user + log_must user_run $owner touch $file + log_must chown :$group $file + log_must ls -la $dir + log_must user_run $owner rm $file + + log_must user_run $owner touch $file + log_must chown :$group $file + log_must user_run $other rm $file + + log_must user_run $owner touch $file + log_must chown :$group $file + log_must rm $file + + log_must rm -rf $dir + + # dir owned by user + log_must user_run $owner mkdir $dir + log_must chown :$group $dir + log_must user_run $owner chmod 007 $dir + + # file owned by root + log_must touch $file + log_must chown :$wheel $file + log_must ls -la $dir + log_must rm $file + + log_must touch $file + 
log_must chown :$wheel $file + log_mustnot user_run $other rm $file + log_must rm $file + + # file owned by user + log_mustnot user_run $owner touch $file + log_must touch $file + log_must chown $owner:$group $file + log_must ls -la $dir + log_mustnot user_run $owner rm $file + log_mustnot user_run $other rm $file + log_must rm $file + + log_must rm -rf $dir +} + +# Sanity check on tmpfs first +tmpdir=$(TMPDIR=$TEST_BASE_DIR mktemp -d) +log_must mount -t tmpfs tmp $tmpdir +log_must chmod 777 $tmpdir + +test_posix_mode $tmpdir + +log_must umount $tmpdir +log_must rmdir $tmpdir + +# Verify ZFS +test_posix_mode $TESTDIR + +log_pass "POSIX mode bits function correctly" diff --git a/tests/zfs-tests/tests/functional/acl/off/setup.ksh b/tests/zfs-tests/tests/functional/acl/off/setup.ksh new file mode 100755 index 0000000000..9a5b598a59 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/off/setup.ksh @@ -0,0 +1,44 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Portions Copyright (c) 2021 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/acl/acl_common.kshlib + +DISK=${DISKS%% *} + +cleanup_user_group + +# Create staff group and add users to it +log_must add_group $ZFS_ACL_STAFF_GROUP +log_must add_user $ZFS_ACL_STAFF_GROUP $ZFS_ACL_STAFF1 +log_must add_user $ZFS_ACL_STAFF_GROUP $ZFS_ACL_STAFF2 + +default_setup_noexit $DISK + +log_must zfs set acltype=off $TESTPOOL/$TESTFS +log_must chmod 0777 $TESTDIR + +log_pass diff --git a/tests/zfs-tests/tests/functional/acl/posix-sa/Makefile.am b/tests/zfs-tests/tests/functional/acl/posix-sa/Makefile.am new file mode 100644 index 0000000000..31d1237ce2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix-sa/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/acl/posix-sa +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + setup.ksh \ + posix_001_pos.ksh \ + posix_002_pos.ksh \ + posix_003_pos.ksh \ + posix_004_pos.ksh diff --git a/tests/zfs-tests/tests/functional/acl/posix-sa/cleanup.ksh b/tests/zfs-tests/tests/functional/acl/posix-sa/cleanup.ksh new file mode 100755 index 0000000000..bb58a8cf2e --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix-sa/cleanup.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/acl/acl_common.kshlib + +cleanup_user_group + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/acl/posix-sa/posix_001_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_001_pos.ksh new file mode 120000 index 0000000000..e6467b3470 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_001_pos.ksh @@ -0,0 +1 @@ +../posix/posix_001_pos.ksh \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/acl/posix-sa/posix_002_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_002_pos.ksh new file mode 120000 index 0000000000..10140d0e87 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_002_pos.ksh @@ -0,0 +1 @@ +../posix/posix_002_pos.ksh \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/acl/posix-sa/posix_003_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_003_pos.ksh new file mode 120000 index 0000000000..3f3db2807d --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_003_pos.ksh @@ -0,0 +1 @@ +../posix/posix_003_pos.ksh \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/acl/posix-sa/posix_004_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_004_pos.ksh new file mode 120000 index 0000000000..2c2bab4477 --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix-sa/posix_004_pos.ksh @@ -0,0 +1 @@ +../posix/posix_004_pos.ksh \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/acl/posix-sa/setup.ksh b/tests/zfs-tests/tests/functional/acl/posix-sa/setup.ksh new file mode 
100755 index 0000000000..d8bf8a638e --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix-sa/setup.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/acl/acl_common.kshlib + +log_must getfacl --version +log_must setfacl --version + +cleanup_user_group + +# Create staff group and add user to it +log_must add_group $ZFS_ACL_STAFF_GROUP +log_must add_user $ZFS_ACL_STAFF_GROUP $ZFS_ACL_STAFF1 + +DISK=${DISKS%% *} +default_setup_noexit $DISK +log_must chmod 777 $TESTDIR + +# Use POSIX ACLs on filesystem +log_must zfs set acltype=posix $TESTPOOL/$TESTFS +log_must zfs set xattr=sa $TESTPOOL/$TESTFS + +log_pass diff --git a/tests/zfs-tests/tests/functional/acl/posix/Makefile.am b/tests/zfs-tests/tests/functional/acl/posix/Makefile.am index dcf2788580..e63f63185a 100644 --- a/tests/zfs-tests/tests/functional/acl/posix/Makefile.am +++ b/tests/zfs-tests/tests/functional/acl/posix/Makefile.am @@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ posix_001_pos.ksh \ posix_002_pos.ksh \ - posix_003_pos.ksh + posix_003_pos.ksh \ + posix_004_pos.ksh diff --git a/tests/zfs-tests/tests/functional/acl/posix/posix_001_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix/posix_001_pos.ksh index 66124fe9cc..d62bf9c346 100755 --- a/tests/zfs-tests/tests/functional/acl/posix/posix_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/acl/posix/posix_001_pos.ksh @@ -34,7 +34,7 @@ # # DESCRIPTION: -# Verify that user can access file/directory if acltype=posixacl. +# Verify that user can access file/directory if acltype=posix. # # STRATEGY: # 1. 
Test access to file (mode=rw-) @@ -50,7 +50,7 @@ function cleanup rmdir $TESTDIR/dir.0 } -log_assert "Verify acltype=posixacl works on file" +log_assert "Verify acltype=posix works on file" log_onexit cleanup # Test access to FILE diff --git a/tests/zfs-tests/tests/functional/acl/posix/posix_002_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix/posix_002_pos.ksh index 1aceffd156..d9b5036458 100755 --- a/tests/zfs-tests/tests/functional/acl/posix/posix_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/acl/posix/posix_002_pos.ksh @@ -34,7 +34,7 @@ # # DESCRIPTION: -# Verify that user can access file/directory if acltype=posixacl. +# Verify that user can access file/directory if acltype=posix. # # STRATEGY: # 1. Test access to directory (mode=-wx) @@ -43,7 +43,7 @@ # verify_runnable "both" -log_assert "Verify acltype=posixacl works on directory" +log_assert "Verify acltype=posix works on directory" # Test access to DIRECTORY log_note "Testing access to DIRECTORY" diff --git a/tests/zfs-tests/tests/functional/acl/posix/posix_003_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix/posix_003_pos.ksh index dc6ef0d247..1b04a024f2 100755 --- a/tests/zfs-tests/tests/functional/acl/posix/posix_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/acl/posix/posix_003_pos.ksh @@ -25,7 +25,6 @@ # # DESCRIPTION: # Verify that ACLs survive remount. -# Regression test for https://github.com/zfsonlinux/zfs/issues/4520 # # STRATEGY: # 1. Test presence of default and regular ACLs after remount diff --git a/tests/zfs-tests/tests/functional/acl/posix/posix_004_pos.ksh b/tests/zfs-tests/tests/functional/acl/posix/posix_004_pos.ksh new file mode 100755 index 0000000000..6c6b592fbb --- /dev/null +++ b/tests/zfs-tests/tests/functional/acl/posix/posix_004_pos.ksh @@ -0,0 +1,49 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Portions Copyright 2020 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/acl/acl_common.kshlib + +# +# DESCRIPTION: +# Verify chown works with POSIX ACLs. +# Regression test for https://github.com/openzfs/zfs/issues/10043 +# +# STRATEGY: +# 1. Prepare an appropriate ACL on the test directory +# 2. 
Change the owner of the directory +# + +verify_runnable "both" +log_assert "Verify chown works with POSIX ACLs" + +log_must setfacl -d -m u:$ZFS_ACL_STAFF1:rwx $TESTDIR +log_must setfacl -b $TESTDIR + +log_must chown $ZFS_ACL_STAFF1 $TESTDIR +log_must chown 0 $TESTDIR + +log_pass "chown works with POSIX ACLs" diff --git a/tests/zfs-tests/tests/functional/acl/posix/setup.ksh b/tests/zfs-tests/tests/functional/acl/posix/setup.ksh index 5d6d158641..526c78e17f 100755 --- a/tests/zfs-tests/tests/functional/acl/posix/setup.ksh +++ b/tests/zfs-tests/tests/functional/acl/posix/setup.ksh @@ -46,7 +46,6 @@ default_setup_noexit $DISK log_must chmod 777 $TESTDIR # Use POSIX ACLs on filesystem -log_must zfs set acltype=posixacl $TESTPOOL/$TESTFS -log_must zfs set xattr=sa $TESTPOOL/$TESTFS +log_must zfs set acltype=posix $TESTPOOL/$TESTFS log_pass diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh index 441df82967..3237d7cb78 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh @@ -20,7 +20,8 @@ # # DESCRIPTION: -# Creating a pool with a special device succeeds. +# Creating a pool with a special device succeeds, but only if +# "feature@allocation_classes" is enabled. 
# verify_runnable "global" @@ -31,6 +32,9 @@ log_assert $claim log_onexit cleanup log_must disk_setup +for type in special dedup; do + log_mustnot zpool create -d $TESTPOOL $CLASS_DISK0 $type $CLASS_DISK1 +done log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ $CLASS_DISK0 $CLASS_DISK1 log_must display_status "$TESTPOOL" diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh index dcc6f7607c..79ac9364c2 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh @@ -52,7 +52,7 @@ do log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \ special $stype $sdisks - ac_value="$(zpool get all -H -o property,value | \ + ac_value="$(zpool get -H -o property,value all | \ egrep allocation_classes | nawk '{print $2}')" if [ "$ac_value" = "active" ]; then log_note "feature@allocation_classes is active" diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh index 417c68aa73..337114cdb5 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh @@ -41,7 +41,7 @@ do else log_must zpool create $TESTPOOL $type $ZPOOL_DISKS fi - ac_value="$(zpool get all -H -o property,value | \ + ac_value="$(zpool get -H -o property,value all | \ egrep allocation_classes | awk '{print $2}')" if [ "$ac_value" = "enabled" ]; then log_note "feature@allocation_classes is enabled" @@ -56,7 +56,7 @@ do log_must zpool add $TESTPOOL special mirror \ $CLASS_DISK0 $CLASS_DISK1 fi - ac_value="$(zpool get all -H -o property,value | \ + ac_value="$(zpool get -H -o property,value all | \ egrep allocation_classes | awk '{print $2}')" if [ "$ac_value" = "active" ]; then log_note "feature@allocation_classes is 
active" diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh index 7c1d6e15c0..e8061fdabc 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh @@ -13,7 +13,7 @@ # # Copyright (c) 2017, Intel Corporation. -# Copyright (c) 2018 by Delphix. All rights reserved. +# Copyright (c) 2018, 2020 by Delphix. All rights reserved. # . $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib @@ -33,22 +33,33 @@ log_must disk_setup typeset stype="" typeset sdisks="" +typeset props="" for type in "" "mirror" "raidz" do if [ "$type" = "mirror" ]; then stype="mirror" sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" + props="-o ashift=12" elif [ "$type" = "raidz" ]; then stype="mirror" sdisks="${CLASS_DISK0} ${CLASS_DISK1}" else stype="" - special_args="${CLASS_DISK0}" + sdisks="${CLASS_DISK0}" + fi + + # + # 1/3 of the time add the special vdev after creating the pool + # + if [ $((RANDOM % 3)) -eq 0 ]; then + log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS + log_must zpool add ${props} $TESTPOOL special $stype $sdisks + else + log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS \ + special $stype $sdisks fi - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \ - special $stype $sdisks log_must zpool export $TESTPOOL log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL log_must display_status $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh index fe1ae366a6..d804e5371e 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh @@ -21,6 +21,7 @@ # # DESCRIPTION: # Setting the special_small_blocks property to invalid values fails. 
+# Powers of two from 512 to 1M are allowed. # verify_runnable "global" @@ -34,7 +35,7 @@ log_must disk_setup log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ $CLASS_DISK0 $CLASS_DISK1 -for value in 256 1025 262144 +for value in 256 1025 2097152 do log_mustnot zfs set special_small_blocks=$value $TESTPOOL done diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh index bd6c6631fa..b49a8919ed 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh @@ -33,8 +33,9 @@ function file_in_special_vdev # { typeset dataset="$1" typeset inum="$2" + typeset num_normal=$(echo $ZPOOL_DISKS | wc -w | xargs) - zdb -dddddd $dataset $inum | awk '{ + zdb -dddddd $dataset $inum | awk -v d=$num_normal '{ # find DVAs from string "offset level dva" only for L0 (data) blocks if (match($0,"L0 [0-9]+")) { dvas[0]=$3 @@ -49,7 +50,7 @@ if (match($0,"L0 [0-9]+")) { exit 1; } # verify vdev is "special" - if (arr[1] < 3) { + if (arr[1] < d) { exit 1; } } @@ -57,57 +58,66 @@ if (match($0,"L0 [0-9]+")) { }}' } +# +# Check that device removal works for special class vdevs +# +function check_removal +{ + # + # Create a non-raidz pool so we can remove top-level vdevs + # + log_must disk_setup + log_must zpool create $TESTPOOL $ZPOOL_DISKS \ + special $CLASS_DISK0 special $CLASS_DISK1 + log_must display_status "$TESTPOOL" + + # + # Generate some metadata and small blocks in the special class vdev + # before removal + # + typeset -l i=1 + typeset -l blocks=25 + + log_must zfs create -o special_small_blocks=32K -o recordsize=32K \ + $TESTPOOL/$TESTFS + for i in 1 2 3 4; do + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/testfile.$i \ + bs=1M count=$blocks + ((blocks = blocks + 25)) + done + log_must sync_pool $TESTPOOL + log_must zpool list -v $TESTPOOL + + # Verify the files were 
written in the special class vdevs + for i in 1 2 3 4; do + dataset="$TESTPOOL/$TESTFS" + inum="$(get_objnum /$TESTPOOL/$TESTFS/testfile.$i)" + log_must file_in_special_vdev $dataset $inum + done + + log_must zpool remove $TESTPOOL $CLASS_DISK0 + + sleep 5 + log_must sync_pool $TESTPOOL + sleep 1 + + log_must zdb -bbcc $TESTPOOL + log_must zpool list -v $TESTPOOL + log_must zpool destroy -f "$TESTPOOL" + log_must disk_cleanup +} + claim="Removing a special device from a pool succeeds." log_assert $claim log_onexit cleanup -# -# Create a non-raidz pool so we can remove top-level vdevs -# -log_must disk_setup -log_must zpool create $TESTPOOL $ZPOOL_DISK0 $ZPOOL_DISK1 $ZPOOL_DISK2 \ - special $CLASS_DISK0 special $CLASS_DISK1 -log_must display_status "$TESTPOOL" - -# -# Generate some metadata and small blocks in the special class before removal -# -typeset -l i=1 -typeset -l blocks=25 - -log_must zfs create -o special_small_blocks=32K -o recordsize=32K \ - $TESTPOOL/$TESTFS -for i in 1 2 3 4; do - log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/testfile.$i bs=1M \ - count=$blocks - ((blocks = blocks + 25)) +typeset CLASS_DEVSIZE=$CLASS_DEVSIZE +for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do + typeset ZPOOL_DISKS=$ZPOOL_DISKS + for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do + check_removal + done done -log_must sync_pool $TESTPOOL -log_must zpool list -v $TESTPOOL - -# Verify the files were written in the special class vdevs -for i in 1 2 3 4; do - dataset="$TESTPOOL/$TESTFS" - inum="$(stat -c '%i' /$TESTPOOL/$TESTFS/testfile.$i)" - log_must file_in_special_vdev $dataset $inum -done - -# -# remove a special allocation vdev and force a remapping -# N.B. The 'zfs remap' command has been disabled and may be removed. 
-# -export ZFS_REMAP_ENABLED=YES - -log_must zpool remove $TESTPOOL $CLASS_DISK0 -log_must zfs remap $TESTPOOL/$TESTFS - -sleep 5 -log_must sync_pool $TESTPOOL -sleep 1 - -log_must zdb -bbcc $TESTPOOL -log_must zpool list -v $TESTPOOL -log_must zpool destroy -f "$TESTPOOL" log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh index 22a8f1a97d..2ce22a6242 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh @@ -42,7 +42,7 @@ log_must display_status "$TESTPOOL" log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL -log_must echo y | newfs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null 2>&1 +log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null 2>&1" sync_pool log_must zpool list -v $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/alloc_class/cleanup.ksh b/tests/zfs-tests/tests/functional/alloc_class/cleanup.ksh index c12d5973b4..13775da919 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/cleanup.ksh @@ -21,7 +21,7 @@ verify_runnable "global" -default_cleanup +default_cleanup_noexit disk_cleanup log_pass diff --git a/tests/zfs-tests/tests/functional/arc/Makefile.am b/tests/zfs-tests/tests/functional/arc/Makefile.am index dc57ebc862..809d0346f8 100644 --- a/tests/zfs-tests/tests/functional/arc/Makefile.am +++ b/tests/zfs-tests/tests/functional/arc/Makefile.am @@ -2,5 +2,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/arc dist_pkgdata_SCRIPTS = \ cleanup.ksh \ setup.ksh \ + arcstats_runtime_tuning.ksh \ dbufstats_001_pos.ksh \ - dbufstats_002_pos.ksh + dbufstats_002_pos.ksh \ + dbufstats_003_pos.ksh diff --git a/tests/zfs-tests/tests/functional/arc/arcstats_runtime_tuning.ksh b/tests/zfs-tests/tests/functional/arc/arcstats_runtime_tuning.ksh new file 
mode 100755 index 0000000000..6650b2e1a4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/arc/arcstats_runtime_tuning.ksh @@ -0,0 +1,46 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019, loli10K . All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + # Set tunables to their recorded actual size and then to their original + # value: this works for previously unconfigured tunables. + log_must set_tunable64 ARC_MIN "$MINSIZE" + log_must set_tunable64 ARC_MIN "$ZFS_ARC_MIN" + log_must set_tunable64 ARC_MAX "$MAXSIZE" + log_must set_tunable64 ARC_MAX "$ZFS_ARC_MAX" +} + +log_onexit cleanup + +ZFS_ARC_MAX="$(get_tunable ARC_MAX)" +ZFS_ARC_MIN="$(get_tunable ARC_MIN)" +MINSIZE="$(get_min_arc_size)" +MAXSIZE="$(get_max_arc_size)" + +log_assert "ARC tunables should be updated dynamically" + +for size in $((MAXSIZE/4)) $((MAXSIZE/3)) $((MAXSIZE/2)) $MAXSIZE; do + log_must set_tunable64 ARC_MAX "$size" + log_must test "$(get_max_arc_size)" == "$size" + log_must set_tunable64 ARC_MIN "$size" + log_must test "$(get_min_arc_size)" == "$size" +done + +log_pass "ARC tunables can be updated dynamically" diff --git a/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh b/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh index 7ec9eaf4c5..0577a6b80c 100755 --- a/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh @@ -55,10 +55,16 @@ function testdbufstat # stat_name dbufstat_filter [[ -n "$2" ]] && filter="-F $2" - 
from_dbufstat=$(grep -w "$name" "$DBUFSTATS_FILE" | awk '{ print $3 }') + if is_linux; then + from_dbufstat=$(grep -w "$name" "$DBUFSTATS_FILE" | + awk '{ print $3 }') + else + from_dbufstat=$(awk "/dbufstats\.$name:/ { print \$2 }" \ + "$DBUFSTATS_FILE") + fi from_dbufs=$(dbufstat -bxn -i "$DBUFS_FILE" "$filter" | wc -l) - within_tolerance $from_dbufstat $from_dbufs 9 \ + within_tolerance $from_dbufstat $from_dbufs 15 \ || log_fail "Stat $name exceeded tolerance" } @@ -71,8 +77,8 @@ log_onexit cleanup log_must file_write -o create -f "$TESTDIR/file" -b 1048576 -c 20 -d R log_must zpool sync -log_must eval "cat /proc/spl/kstat/zfs/dbufs > $DBUFS_FILE" -log_must eval "cat /proc/spl/kstat/zfs/dbufstats > $DBUFSTATS_FILE" +log_must eval "kstat dbufs > $DBUFS_FILE" +log_must eval "kstat dbufstats '' > $DBUFSTATS_FILE" for level in {0..11}; do testdbufstat "cache_level_$level" "dbc=1,level=$level" diff --git a/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh b/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh index dc30b66065..58d401539e 100755 --- a/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh @@ -58,10 +58,10 @@ log_onexit cleanup log_must file_write -o create -f "$TESTDIR/file" -b 1048576 -c 1 -d R log_must zpool sync -objid=$(stat --format="%i" "$TESTDIR/file") +objid=$(get_objnum "$TESTDIR/file") log_note "Object ID for $TESTDIR/file is $objid" -log_must eval "cat /proc/spl/kstat/zfs/dbufs > $DBUFS_FILE" +log_must eval "kstat dbufs > $DBUFS_FILE" dbuf=$(dbufstat -bxn -i "$DBUFS_FILE" -F "object=$objid" | wc -l) mru=$(dbufstat -bxn -i "$DBUFS_FILE" -F "object=$objid,list=1" | wc -l) mfu=$(dbufstat -bxn -i "$DBUFS_FILE" -F "object=$objid,list=3" | wc -l) @@ -70,7 +70,7 @@ verify_ne "0" "$mru" "mru count" verify_eq "0" "$mfu" "mfu count" log_must eval "cat $TESTDIR/file > /dev/null" -log_must eval "cat /proc/spl/kstat/zfs/dbufs > $DBUFS_FILE" +log_must eval "kstat dbufs > 
$DBUFS_FILE" dbuf=$(dbufstat -bxn -i "$DBUFS_FILE" -F "object=$objid" | wc -l) mru=$(dbufstat -bxn -i "$DBUFS_FILE" -F "object=$objid,list=1" | wc -l) mfu=$(dbufstat -bxn -i "$DBUFS_FILE" -F "object=$objid,list=3" | wc -l) diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/dbufstat_001_pos.ksh b/tests/zfs-tests/tests/functional/arc/dbufstats_003_pos.ksh similarity index 90% rename from tests/zfs-tests/tests/functional/cli_user/misc/dbufstat_001_pos.ksh rename to tests/zfs-tests/tests/functional/arc/dbufstats_003_pos.ksh index 0e187015f8..91cec74881 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/dbufstat_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/arc/dbufstats_003_pos.ksh @@ -33,11 +33,11 @@ log_assert "dbufstat generates output and doesn't return an error code" typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do - log_must eval "sudo dbufstat ${args[i]} > /dev/null" + log_must eval "dbufstat ${args[i]} >/dev/null" ((i = i + 1)) done # A simple test of dbufstat filter functionality -log_must eval "sudo dbufstat -F object=10,dbc=1,pool=$TESTPOOL > /dev/null" +log_must eval "dbufstat -F object=10,dbc=1,pool=$TESTPOOL >/dev/null" log_pass "dbufstat generates output and doesn't return an error code" diff --git a/tests/zfs-tests/tests/functional/atime/atime_common.kshlib b/tests/zfs-tests/tests/functional/atime/atime_common.kshlib index bd6c6dc39d..fce85c3798 100644 --- a/tests/zfs-tests/tests/functional/atime/atime_common.kshlib +++ b/tests/zfs-tests/tests/functional/atime/atime_common.kshlib @@ -47,6 +47,9 @@ function check_atime_updated if is_linux; then typeset before=$(stat -c %X $filename) sleep 2 + elif is_freebsd; then + typeset before=$(stat -f %a $filename) + sleep 2 else typeset before=$(ls -Eu $filename | awk '{print $7}') fi @@ -55,6 +58,8 @@ function check_atime_updated if is_linux; then typeset after=$(stat -c %X $filename) + elif is_freebsd; then + typeset after=$(stat -f %a $filename) else typeset after=$(ls -Eu $filename | 
awk '{print $7}') fi diff --git a/tests/zfs-tests/tests/functional/atime/root_atime_off.ksh b/tests/zfs-tests/tests/functional/atime/root_atime_off.ksh index 2fbf06b137..7eb2ed9372 100755 --- a/tests/zfs-tests/tests/functional/atime/root_atime_off.ksh +++ b/tests/zfs-tests/tests/functional/atime/root_atime_off.ksh @@ -53,7 +53,7 @@ log_onexit cleanup # # Create $TESTFILE, snapshot and clone. -# Same as 002 except that atime applies to root dataset (ZoL#8675). +# Same as 002 except that atime applies to root dataset (OpenZFS#8675). # setup_snap_clone reset_atime diff --git a/tests/zfs-tests/tests/functional/atime/root_atime_on.ksh b/tests/zfs-tests/tests/functional/atime/root_atime_on.ksh index 3976523b0b..44d471a212 100755 --- a/tests/zfs-tests/tests/functional/atime/root_atime_on.ksh +++ b/tests/zfs-tests/tests/functional/atime/root_atime_on.ksh @@ -52,7 +52,7 @@ log_onexit cleanup # # Create $TESTFILE, snapshot and clone. -# Same as 001 except that atime/relatime applies to root dataset (ZoL#8675). +# Same as 001 except that atime/relatime applies to root dataset (OpenZFS#8675). # setup_snap_clone reset_atime diff --git a/tests/zfs-tests/tests/functional/atime/root_relatime_on.ksh b/tests/zfs-tests/tests/functional/atime/root_relatime_on.ksh index c919e9f298..120129425a 100755 --- a/tests/zfs-tests/tests/functional/atime/root_relatime_on.ksh +++ b/tests/zfs-tests/tests/functional/atime/root_relatime_on.ksh @@ -53,7 +53,7 @@ log_onexit cleanup # # Create $TESTFILE, snapshot and clone. -# Same as 003 except that atime/relatime applies to root dataset (ZoL#8675). +# Same as 003 except that atime/relatime applies to root dataset (OpenZFS#8675). 
# setup_snap_clone reset_atime diff --git a/tests/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh b/tests/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh index 6a72bfcdc4..a5bc7753e9 100755 --- a/tests/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh @@ -51,14 +51,9 @@ verify_runnable "global" function cleanup { - if datasetexists $TESTPOOL/vol - then - log_must zfs destroy $TESTPOOL/vol - fi - if poolexists $TESTPOOL - then - log_must zpool destroy $TESTPOOL - fi + datasetexists $TESTPOOL/vol && destroy_dataset $TESTPOOL/vol + poolexists $TESTPOOL && log_must zpool destroy $TESTPOOL + if [[ -f $VDEV ]]; then log_must rm -f $VDEV fi diff --git a/tests/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh b/tests/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh index e17c06bb5d..d29fe7e89c 100755 --- a/tests/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh @@ -117,7 +117,7 @@ verify_bootfs $TESTPOOL log_must zpool create $TESTPOOL mirror $VDEV1 $VDEV2 spare $VDEV3 verify_bootfs $TESTPOOL -if is_linux; then +if is_linux || is_freebsd; then # stripe log_must zpool create $TESTPOOL $VDEV1 $VDEV2 verify_bootfs $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/btree/Makefile.am b/tests/zfs-tests/tests/functional/btree/Makefile.am new file mode 100644 index 0000000000..333209d98f --- /dev/null +++ b/tests/zfs-tests/tests/functional/btree/Makefile.am @@ -0,0 +1,20 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Delphix. 
All rights reserved. +# + +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/btree + +dist_pkgdata_SCRIPTS = \ + btree_positive.ksh \ + btree_negative.ksh diff --git a/tests/zfs-tests/tests/functional/btree/btree_negative.ksh b/tests/zfs-tests/tests/functional/btree/btree_negative.ksh new file mode 100755 index 0000000000..cefcbc51e7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/btree/btree_negative.ksh @@ -0,0 +1,38 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# Verify that the btree functions don't allow bad inputs +# +# insert_duplicate - Callers may not add values that are already in the tree +# remove_missing - Callers may not remove values that are not in the tree +# +# Note: These invocations cause btree_test to crash, but the program disables +# core dumps first. As such, we can't use log_mustnot because it explicitly +# looks for return values that correspond to a core dump and cause a test +# failure. + +btree_test -n insert_duplicate +[[ $? -eq 0 ]] && log_fail "Failure from insert_duplicate" + +btree_test -n remove_missing +[[ $? 
-eq 0 ]] && log_fail "Failure from remove_missing" + +log_pass "Btree negative tests passed" diff --git a/tests/zfs-tests/tests/functional/btree/btree_positive.ksh b/tests/zfs-tests/tests/functional/btree/btree_positive.ksh new file mode 100755 index 0000000000..badbac2fab --- /dev/null +++ b/tests/zfs-tests/tests/functional/btree/btree_positive.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# The `btree_test` binary runs a series of positive tests when called +# without arguments. 
+# +# insert_find_remove - Basic functionality test +# find_without_index - Using the find function with a NULL argument +# drain_tree - Fill the tree then empty it using the first and last +# functions +# stress_tree - Allow the tree to have items added and removed for a +# given amount of time +# + +log_must btree_test + +log_pass "Btree positive tests passed" diff --git a/tests/zfs-tests/tests/functional/cache/Makefile.am b/tests/zfs-tests/tests/functional/cache/Makefile.am index 18dd9c1985..f28130ee9e 100644 --- a/tests/zfs-tests/tests/functional/cache/Makefile.am +++ b/tests/zfs-tests/tests/functional/cache/Makefile.am @@ -11,8 +11,9 @@ dist_pkgdata_SCRIPTS = \ cache_007_neg.ksh \ cache_008_neg.ksh \ cache_009_pos.ksh \ - cache_010_neg.ksh \ - cache_011_pos.ksh + cache_010_pos.ksh \ + cache_011_pos.ksh \ + cache_012_pos.ksh dist_pkgdata_DATA = \ cache.cfg \ diff --git a/tests/zfs-tests/tests/functional/cache/cache_010_neg.ksh b/tests/zfs-tests/tests/functional/cache/cache_010_pos.ksh similarity index 64% rename from tests/zfs-tests/tests/functional/cache/cache_010_neg.ksh rename to tests/zfs-tests/tests/functional/cache/cache_010_pos.ksh index 1d0683b858..1d9fc5a892 100755 --- a/tests/zfs-tests/tests/functional/cache/cache_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/cache/cache_010_pos.ksh @@ -34,12 +34,12 @@ # # DESCRIPTION: -# Verify cache device must be a block device. +# Verify that cache devices can be block devices, files or character devices # # STRATEGY: # 1. Create a pool # 2. Add different object as cache -# 3. Verify character devices and files fail +# 3. Verify character devices and files pass # verify_runnable "global" @@ -50,51 +50,55 @@ function cleanup_testenv if [[ -n $lofidev ]]; then if is_linux; then losetup -d $lofidev + elif is_freebsd; then + mdconfig -du ${lofidev#md} else lofiadm -d $lofidev fi fi } -log_assert "Cache device can only be block devices." 
+log_assert "Verify cache devices can be disk, file, lofi device or any " \ + "device that presents a block interface" + +verify_disk_count "$DISKS" 2 log_onexit cleanup_testenv TESTVOL=testvol1$$ dsk1=${DISKS%% *} log_must zpool create $TESTPOOL ${DISKS#$dsk1} -# Add nomal ${DEV_RDSKDIR} device +# Add normal ${DEV_RDSKDIR} device log_must zpool add $TESTPOOL cache \ - ${DEV_RDSKDIR}/${dsk1}${SLICE_PREFIX}${SLICE0} + ${DEV_RDSKDIR}/${dsk1} +log_must zpool remove $TESTPOOL ${DEV_RDSKDIR}/${dsk1} + + +# Add provided disk +log_must zpool add $TESTPOOL cache $dsk1 log_must verify_cache_device $TESTPOOL $dsk1 'ONLINE' +log_must zpool remove $TESTPOOL $dsk1 # Add normal file -log_mustnot zpool add $TESTPOOL cache $VDEV2 +log_must zpool add $TESTPOOL cache $VDEV +ldev=$(random_get $VDEV) +log_must verify_cache_device $TESTPOOL $ldev 'ONLINE' -# Add /dev/rlofi device (allowed under Linux) +# Add loop back device if is_linux; then lofidev=$(losetup -f) - lofidev=${lofidev##*/} log_must losetup $lofidev ${VDEV2%% *} - log_must zpool add $TESTPOOL cache $lofidev - log_must zpool remove $TESTPOOL $lofidev - log_must losetup -d $lofidev - lofidev="" + lofidev=${lofidev##*/} +elif is_freebsd; then + lofidev=$(mdconfig -a ${VDEV2%% *}) else lofidev=${VDEV2%% *} log_must lofiadm -a $lofidev lofidev=$(lofiadm $lofidev) - log_mustnot zpool add $TESTPOOL cache "/dev/rlofi/${lofidev#/dev/lofi/}" - log_must lofiadm -d $lofidev - lofidev="" fi -# Add /dev/zvol/rdsk device (allowed under Linux) -if ! is_linux; then - log_must zpool create $TESTPOOL2 $VDEV2 - log_must zfs create -V $SIZE $TESTPOOL2/$TESTVOL - log_mustnot zpool add $TESTPOOL cache \ - ${ZVOL_RDEVDIR}/$TESTPOOL2/$TESTVOL -fi +log_must zpool add $TESTPOOL cache $lofidev +log_must verify_cache_device $TESTPOOL $lofidev 'ONLINE' -log_pass "Cache device can only be block devices." 
+log_pass "Verify cache devices can be disk, file, lofi device or any " \ + "device that presents a block interface" diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh new file mode 100755 index 0000000000..edefe9c1bf --- /dev/null +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -0,0 +1,110 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/tests/functional/cache/cache.cfg +. $STF_SUITE/tests/functional/cache/cache.kshlib + +# +# DESCRIPTION: +# Looping around a cache device with l2arc_write_size exceeding +# the device size succeeds. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Set l2arc_write_max to a value larger than the cache device. +# 3. Create a file larger than the cache device and random read +# for 10 sec. +# 4. Verify that l2arc_write_max is set back to the default. +# 5. Set l2arc_write_max to a value less than the cache device size but +# larger than the default (64MB). +# 6. Record the l2_size. +# 7. Random read for 1 sec. +# 8. Record the l2_size again. +# 9. If (6) <= (8) then we have not looped around yet. +# 10. If (6) > (8) then we looped around. Break out of the loop and test. +# 11. Destroy pool. +# + +verify_runnable "global" + +log_assert "Looping around a cache device succeeds." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_WRITE_MAX $write_max + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch +} +log_onexit cleanup + +typeset write_max=$(get_tunable L2ARC_WRITE_MAX) +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +log_must set_tunable32 L2ARC_NOPREFETCH 0 + +typeset VDEV="$VDIR/vdev.disk" +typeset VDEV_SZ=$(( 4 * 1024 * 1024 * 1024 )) +typeset VCACHE="$VDIR/vdev.cache" +typeset VCACHE_SZ=$(( $VDEV_SZ / 2 )) + +typeset fill_mb=$(( floor($VDEV_SZ * 3 / 4 ) )) +export DIRECTORY=/$TESTPOOL +export NUMJOBS=4 +export RUNTIME=10 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) )) + +log_must set_tunable32 L2ARC_WRITE_MAX $(( $VCACHE_SZ * 2 )) + +log_must truncate -s $VCACHE_SZ $VCACHE +log_must truncate -s $VDEV_SZ $VDEV + +log_must zpool create -f $TESTPOOL $VDEV cache $VCACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +typeset write_max2=$(get_tunable L2ARC_WRITE_MAX) + +log_must test $write_max2 -eq $write_max + +log_must set_tunable32 L2ARC_WRITE_MAX $(( 64 * 1024 * 1024 )) +export RUNTIME=1 + +typeset do_once=true +while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do + typeset l2_size1=$(get_arcstat l2_size) + log_must fio $FIO_SCRIPTS/random_reads.fio + typeset l2_size2=$(get_arcstat l2_size) + do_once=false +done + +log_must test $l2_size1 -gt $l2_size2 + +log_must zpool destroy $TESTPOOL + +log_pass "Looping around a cache device succeeds." diff --git a/tests/zfs-tests/tests/functional/cache/setup.ksh b/tests/zfs-tests/tests/functional/cache/setup.ksh index d5da5d9bb0..0493637fcc 100755 --- a/tests/zfs-tests/tests/functional/cache/setup.ksh +++ b/tests/zfs-tests/tests/functional/cache/setup.ksh @@ -34,10 +34,6 @@ verify_runnable "global" -if ! 
is_physical_device $LDEV; then - log_unsupported "Only physical disk could be cache device" -fi - log_must rm -rf $VDIR $VDIR2 log_must mkdir -p $VDIR $VDIR2 log_must mkfile $SIZE $VDEV $VDEV2 diff --git a/tests/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh b/tests/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh index e0b81e1662..841b141e16 100755 --- a/tests/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh @@ -38,9 +38,9 @@ # Verify set, export and destroy when cachefile is set on pool. # # STRATEGY: -# 1. Create two pools with one same cahcefile1. +# 1. Create two pools with one same cachefile1. # 2. Set cachefile of the two pools to another same cachefile2. -# 3. Verify cachefile1 not exist. +# 3. Verify cachefile1 does not exist. # 4. Export the two pools. # 5. Verify cachefile2 not exist. # 6. Import the two pools and set cachefile to cachefile2. diff --git a/tests/zfs-tests/tests/functional/casenorm/casenorm.cfg b/tests/zfs-tests/tests/functional/casenorm/casenorm.cfg index 9e8e456863..5d2efbf000 100644 --- a/tests/zfs-tests/tests/functional/casenorm/casenorm.cfg +++ b/tests/zfs-tests/tests/functional/casenorm/casenorm.cfg @@ -17,12 +17,16 @@ # Copyright (c) 2016 by Delphix. All rights reserved. 
# -NAME_C_ORIG=$(echo 'F\0303\0257L\0303\0253N\0303\0204m\0303\0253') -NAME_C_UPPER=$(echo 'F\0303\0217L\0303\0213N\0303\0204M\0303\0213') -NAME_C_LOWER=$(echo 'f\0303\0257l\0303\0253n\0303\0244m\0303\0253') -NAME_D_ORIG=$(echo 'Fi\0314\0210Le\0314\0210NA\0314\0210me\0314\0210') -NAME_D_UPPER=$(echo 'FI\0314\0210LE\0314\0210NA\0314\0210ME\0314\0210') -NAME_D_LOWER=$(echo 'fi\0314\0210le\0314\0210na\0314\0210me\0314\0210') +# Ksh on linux may have locale env variables undefined +export LANG="C.UTF-8" +export LC_ALL="C.UTF-8" + +NAME_C_ORIG=$(printf '\u0046\u00ef\u004c\u00eb\u004e\u00c4\u006d\u00eb') +NAME_C_UPPER=$(printf '\u0046\u00cf\u004c\u00cb\u004e\u00c4\u004d\u00cb') +NAME_C_LOWER=$(printf '\u0066\u00ef\u006c\u00eb\u006e\u00e4\u006d\u00eb') +NAME_D_ORIG=$(printf '\u0046\u0069\u0308\u004c\u0065\u0308\u004e\u0041\u0308\u006d\u0065\u0308') +NAME_D_UPPER=$(printf '\u0046\u0049\u0308\u004c\u0045\u0308\u004e\u0041\u0308\u004d\u0045\u0308') +NAME_D_LOWER=$(printf '\u0066\u0069\u0308\u006c\u0065\u0308\u006e\u0061\u0308\u006d\u0065\u0308') NAMES_ORIG="$NAME_C_ORIG $NAME_D_ORIG" NAMES_UPPER="$NAME_C_UPPER $NAME_D_UPPER" NAMES_LOWER="$NAME_C_LOWER $NAME_D_LOWER" diff --git a/tests/zfs-tests/tests/functional/casenorm/casenorm.kshlib b/tests/zfs-tests/tests/functional/casenorm/casenorm.kshlib index 273522406b..f0fe1bbaa8 100644 --- a/tests/zfs-tests/tests/functional/casenorm/casenorm.kshlib +++ b/tests/zfs-tests/tests/functional/casenorm/casenorm.kshlib @@ -34,7 +34,7 @@ function create_testfs function destroy_testfs { if datasetexists $TESTPOOL/$TESTFS ; then - log_must zfs destroy -f $TESTPOOL/$TESTFS + destroy_dataset $TESTPOOL/$TESTFS -f rm -rf $TESTDIR || log_unresolved Could not remove $TESTDIR fi } @@ -65,14 +65,22 @@ function lookup_file { typeset name=$1 - zlook -l $TESTDIR $name >/dev/null 2>&1 + if is_illumos; then + zlook -l $TESTDIR $name >/dev/null 2>&1 + else + test -f "${TESTDIR}/${name}" >/dev/null 2>&1 + fi } function lookup_file_ci { typeset name=$1 - 
zlook -il $TESTDIR $name >/dev/null 2>&1 + if is_illumos; then + zlook -il $TESTDIR $name >/dev/null 2>&1 + else + test -f "${TESTDIR}/${name}" >/dev/null 2>&1 + fi } function lookup_any diff --git a/tests/zfs-tests/tests/functional/casenorm/insensitive_formd_lookup.ksh b/tests/zfs-tests/tests/functional/casenorm/insensitive_formd_lookup.ksh index d28431300a..1ef9d2756f 100755 --- a/tests/zfs-tests/tests/functional/casenorm/insensitive_formd_lookup.ksh +++ b/tests/zfs-tests/tests/functional/casenorm/insensitive_formd_lookup.ksh @@ -19,7 +19,7 @@ # DESCRIPTION: # For the filesystem with casesensitivity=insensitive, normalization=formD, -# check that lookup succeds using any name form. +# check that lookup succeeds using any name form. # # STRATEGY: # For each c/n name form: diff --git a/tests/zfs-tests/tests/functional/casenorm/norm_all_values.ksh b/tests/zfs-tests/tests/functional/casenorm/norm_all_values.ksh index 87779a710d..cae15ebc40 100755 --- a/tests/zfs-tests/tests/functional/casenorm/norm_all_values.ksh +++ b/tests/zfs-tests/tests/functional/casenorm/norm_all_values.ksh @@ -58,4 +58,15 @@ for form in formC formD formKC formKD; do destroy_testfs done +for form in formC formD formKC formKD; do + create_testfs "-o normalization=$form" + log_must zfs create -o utf8only=off $TESTPOOL/$TESTFS/$TESTSUBFS + normalization=$(zfs get -H -o value normalization $TESTPOOL/$TESTFS/$TESTSUBFS) + if [[ $normalization != "none" ]]; then + log_fail "Turning off utf8only didn't set normalization to none" + fi + log_must zfs destroy $TESTPOOL/$TESTFS/$TESTSUBFS + destroy_testfs +done + log_pass "Can create FS with all supported normalization forms" diff --git a/tests/zfs-tests/tests/functional/channel_program/channel_common.kshlib b/tests/zfs-tests/tests/functional/channel_program/channel_common.kshlib index 722a477556..a828ba2906 100644 --- a/tests/zfs-tests/tests/functional/channel_program/channel_common.kshlib +++ 
b/tests/zfs-tests/tests/functional/channel_program/channel_common.kshlib @@ -141,24 +141,16 @@ function log_program_construct_args pool=$1 shift - # - # Catch HERE document if it exists and save it within our - # temp file. The reason we do this is that since the - # log_must_program wrapper calls zfs-program twice (once - # for open context and once for syncing) the HERE doc - # is consumed in the first invocation and the second one - # does not have a program to run. - # - test -s /dev/stdin && cat > $tmpin + infile=$1 + shift # - # If $tmpin has contents it means that we consumed a HERE - # doc and $1 currently holds "-" (a dash). If there is no - # HERE doc and $tmpin is empty, then we copy the contents - # of the original channel program to $tmpin. + # Copy the contents of the original channel program to $tmpin. # - [[ -s $tmpin ]] || cp $1 $tmpin - shift + # If $infile currently holds "-" (a dash) it means that we consume a + # HERE doc from stdin, otherwise $infile is a file path. 
+ # + cat $infile > $tmpin lua_args=$@ diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile.am b/tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile.am index e06b145dcc..fb35208119 100644 --- a/tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile.am +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile.am @@ -21,6 +21,7 @@ dist_pkgdata_SCRIPTS = \ tst.return_nvlist_neg.ksh \ tst.return_nvlist_pos.ksh \ tst.return_recursive_table.ksh \ + tst.stack_gsub.ksh \ tst.timeout.ksh dist_pkgdata_DATA = \ @@ -40,4 +41,6 @@ dist_pkgdata_DATA = \ tst.recursive.zcp \ tst.return_large.zcp \ tst.return_recursive_table.zcp \ + tst.stack_gsub.err \ + tst.stack_gsub.zcp \ tst.timeout.zcp diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.exists.ksh b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.exists.ksh index d486c25f44..eba01b17c8 100755 --- a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.exists.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.exists.ksh @@ -30,7 +30,7 @@ create_clone function cleanup { datasetexists $TESTPOOL/$TESTFS@$TESTSNAP && \ - log_must zfs destroy -R $TESTPOOL/$TESTFS@$TESTSNAP + destroy_dataset $TESTPOOL/$TESTFS@$TESTSNAP -R } log_must_program $TESTPOOL $ZCP_ROOT/lua_core/tst.exists.zcp \ diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.memory_limit.ksh b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.memory_limit.ksh index 2885775686..0533b8fa30 100755 --- a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.memory_limit.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.memory_limit.ksh @@ -61,6 +61,9 @@ log_mustnot_checkerror_program "Memory limit exhausted" -m 1 $TESTPOOL - <<-EOF return s EOF +# Set the memlimit, in case it is a non-default value +log_must set_tunable32 LUA_MAX_MEMLIMIT 100000000 + 
log_mustnot_checkerror_program "Invalid instruction or memory limit" \ -m 200000000 $TESTPOOL - <<-EOF return 1; diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_large.ksh b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_large.ksh index ba9c407394..bbaeb54f59 100755 --- a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_large.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_large.ksh @@ -27,7 +27,7 @@ fs=$TESTPOOL/$TESTFS/testchild function cleanup { - datasetexists $fs && log_must zfs destroy -R $fs + datasetexists $fs && destroy_dataset $fs -R } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.err b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.err new file mode 100644 index 0000000000..45f2d9ef0e --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.err @@ -0,0 +1,18 @@ +Channel program execution failed: +C stack overflow +stack traceback: + [C]: in function 'gsub' + [string "channel program"]:17: in function <[string "channel program"]:16> + [C]: in function 'gsub' + [string "channel program"]:17: in function <[string "channel program"]:16> + [C]: in function 'gsub' + [string "channel program"]:17: in function <[string "channel program"]:16> + [C]: in function 'gsub' + [string "channel program"]:17: in function <[string "channel program"]:16> + [C]: in function 'gsub' + [string "channel program"]:17: in function <[string "channel program"]:16> + [C]: in function 'gsub' + [string "channel program"]:17: in function <[string "channel program"]:16> + [C]: in function 'gsub' + [string "channel program"]:17: in function <[string "channel program"]:16> + (...tail calls...) 
\ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.ksh b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.ksh new file mode 100755 index 0000000000..ecabf3a3fe --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: +# Overflowing the C stack using recursive gsub() should be handled +# gracefully. gsub() uses more stack space than typical, so it relies +# on LUAI_MINCSTACK to ensure that we don't overflow the Linux kernel's +# stack. +# + +verify_runnable "global" + +log_assert "recursive gsub() should be handled gracefully" + +log_mustnot_program $TESTPOOL $ZCP_ROOT/lua_core/tst.stack_gsub.zcp + +log_pass "recursive gsub() should be handled gracefully" diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.zcp b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.zcp new file mode 100644 index 0000000000..a493363ca6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.zcp @@ -0,0 +1,20 @@ +-- +-- This file and its contents are supplied under the terms of the +-- Common Development and Distribution License ("CDDL"), version 1.0. +-- You may only use this file in accordance with the terms of version +-- 1.0 of the CDDL. 
+-- +-- A full copy of the text of the CDDL should have accompanied this +-- source. A copy of the CDDL is also available via the Internet at +-- http://www.illumos.org/license/CDDL. +-- + +-- +-- Copyright (c) 2020 by Delphix. All rights reserved. +-- + +function f(s) + return string.gsub(s, ".", f) +end + +return f("foo") diff --git a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.timeout.ksh b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.timeout.ksh index 9256e86771..22ea375481 100755 --- a/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.timeout.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/lua_core/tst.timeout.ksh @@ -37,7 +37,7 @@ function test_instr_limit error=$(zfs program -t $lim $TESTPOOL $ZCP_ROOT/lua_core/tst.timeout.zcp 2>&1) [[ $? -ne 0 ]] || log_fail "Channel program with limit $lim exited 0: $error" - instrs_run=$(echo $error | sed -n 's/.\+ \([0-9]*\) Lua instructions/\1/p') + instrs_run=$(echo $error | awk -F "chunk" '{print $2}' | awk '{print $1}') if [[ $instrs_run -lt $(( $lim - 100 )) ]]; then log_fail "Runtime (${instrs_run} instr) < limit (${lim} - 100 instr)" elif [[ $instrs_run -gt $(( $lim + 100 )) ]]; then diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile.am b/tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile.am index 7bdaf53de2..4d9aa9cebb 100644 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile.am +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile.am @@ -13,8 +13,11 @@ dist_pkgdata_SCRIPTS = \ tst.get_type.ksh \ tst.get_userquota.ksh \ tst.get_written.ksh \ + tst.inherit.ksh \ + tst.list_bookmarks.ksh \ tst.list_children.ksh \ tst.list_clones.ksh \ + tst.list_holds.ksh \ tst.list_snapshots.ksh \ tst.list_system_props.ksh \ tst.list_user_props.ksh \ @@ -24,10 +27,14 @@ dist_pkgdata_SCRIPTS = \ tst.promote_simple.ksh \ tst.rollback_mult.ksh \ tst.rollback_one.ksh \ + 
tst.set_props.ksh \ tst.snapshot_destroy.ksh \ tst.snapshot_neg.ksh \ tst.snapshot_recursive.ksh \ - tst.snapshot_simple.ksh + tst.bookmark.create.ksh \ + tst.bookmark.copy.ksh \ + tst.snapshot_simple.ksh \ + tst.terminate_by_signal.ksh dist_pkgdata_DATA = \ tst.get_index_props.out \ @@ -37,7 +44,10 @@ dist_pkgdata_DATA = \ tst.get_string_props.out \ tst.get_string_props.zcp \ tst.promote_conflict.zcp \ + tst.set_props.zcp \ tst.snapshot_destroy.zcp \ tst.snapshot_neg.zcp \ tst.snapshot_recursive.zcp \ - tst.snapshot_simple.zcp + tst.snapshot_simple.zcp \ + tst.bookmark.create.zcp \ + tst.bookmark.copy.zcp diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/cleanup.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/cleanup.ksh index 281f639a42..3ddcb4d275 100755 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/cleanup.ksh @@ -16,4 +16,7 @@ . $STF_SUITE/include/libtest.shlib -default_cleanup +default_cleanup_noexit +destroy_pool testpool2 + +log_pass diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/setup.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/setup.ksh index 2516b6b8ad..5837bf1a14 100755 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/setup.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/setup.ksh @@ -18,4 +18,8 @@ DISK=${DISKS%% *} -default_setup ${DISK} +TESTPOOLDISK=${DISKS%% *} +TESTPOOL2DISK=${DISKS##* } + +default_setup ${TESTPOOLDISK} +create_pool testpool2 ${TESTPOOL2DISK} diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.ksh new file mode 100755 index 0000000000..81f570d9e1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.ksh @@ -0,0 
+1,45 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. +# + +. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: Make sure bookmark copying works in channel programs +# + +verify_runnable "global" + +fs=$TESTPOOL/$TESTFS/testchild +snapname=testsnap +bookname=testbookmark +bookcopyname=testbookmark_copy + +function cleanup +{ + destroy_dataset $fs "-R" +} + +log_onexit cleanup + +log_must zfs create $fs + +log_must zfs snapshot $fs@$snapname +log_must zfs bookmark $fs@$snapname "$fs#$bookname" + +log_must_program_sync $TESTPOOL \ + $ZCP_ROOT/synctask_core/tst.bookmark.copy.zcp $fs $bookname $bookcopyname + +log_pass "Simple bookmark copying works" diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.zcp b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.zcp new file mode 100644 index 0000000000..9473035f02 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.zcp @@ -0,0 +1,32 @@ +-- +-- This file and its contents are supplied under the terms of the +-- Common Development and Distribution License ("CDDL"), version 1.0. +-- You may only use this file in accordance with the terms of version +-- 1.0 of the CDDL. +-- +-- A full copy of the text of the CDDL should have accompanied this +-- source. A copy of the CDDL is also available via the Internet at +-- http://www.illumos.org/license/CDDL. +-- + +-- +-- Copyright (c) 2019, 2020 by Christian Schwarz. 
All rights reserved. +-- + +-- This program should be invoked as "zfs program " + +args = ... +argv = args["argv"] +fs = argv[1] +source = fs .. "#" .. argv[2] +new = fs .. "#" .. argv[3] +assert(zfs.sync.bookmark(source, new) == 0) +books = {} +count = 0 +for s in zfs.list.bookmarks(fs) do + count = count + 1 + books[s] = 1 +end +assert(count == 2) +assert(books[source] == 1) +assert(books[new] == 1) diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.ksh new file mode 100755 index 0000000000..05ec9cc676 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: Make sure basic bookmark functionality works in channel programs +# + +verify_runnable "global" + +fs=$TESTPOOL/$TESTFS/testchild +snapname=testsnap +bookname=testbookmark + +function cleanup +{ + destroy_dataset $fs "-R" +} + +log_onexit cleanup + +log_must zfs create $fs + +log_must zfs snapshot $fs@$snapname + +log_must_program_sync $TESTPOOL \ + $ZCP_ROOT/synctask_core/tst.bookmark.create.zcp $fs $snapname $bookname + +log_pass "Simple bookmark creation works" diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.zcp b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.zcp new file mode 100644 index 0000000000..eb53fd16ce --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.zcp @@ -0,0 +1,26 @@ +-- +-- This file and its contents are supplied under the terms of the +-- Common Development and Distribution License ("CDDL"), version 1.0. +-- You may only use this file in accordance with the terms of version +-- 1.0 of the CDDL. +-- +-- A full copy of the text of the CDDL should have accompanied this +-- source. A copy of the CDDL is also available via the Internet at +-- http://www.illumos.org/license/CDDL. +-- + +-- +-- Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. +-- + +-- This program should be invoked as "zfs program " + +args = ... +argv = args["argv"] +assert(zfs.sync.bookmark(argv[1] .. "@" .. argv[2], argv[1] .. "#" .. argv[3]) == 0) +books = {} +for s in zfs.list.bookmarks(argv[1]) do + table.insert(books, s) +end +assert(#books == 1) +assert(books[1] == (argv[1] .. "#" .. 
argv[3])) diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.ksh index 6478fa654c..eed3e0bce5 100755 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.ksh @@ -35,6 +35,7 @@ log_onexit cleanup log_must zfs create -o version=5 $fs create_snapshot $fs $TESTSNAP -log_must_program $TESTPOOL $ZCP_ROOT/synctask_core/tst.get_index_props.zcp $fs $snap +os=$(uname) +log_must_program $TESTPOOL $ZCP_ROOT/synctask_core/tst.get_index_props.zcp $fs $snap $os log_pass "Getting index props should work correctly." diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp index e898cf86fa..10ef8e7f83 100644 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp @@ -16,6 +16,7 @@ arg = ... 
fs = arg["argv"][1] snap = arg["argv"][2] +os = arg["argv"][3] props = {} @@ -26,7 +27,11 @@ props['checksum'] = {{'on', 'default'}, {nil, nil}} props['dedup'] = {{'off', 'default'}, {nil, nil}} props['compression'] = {{'off', 'default'}, {nil, nil}} props['snapdir'] = {{'hidden', 'default'}, {nil, nil}} -props['acltype'] = {{'off', 'default'}, {'off', 'default'}} +if os == "Linux" then + props['acltype'] = {{'off', 'default'}, {'off', 'default'}} +elseif os == "FreeBSD" then + props['aclmode'] = {{'discard', 'default'}, {'discard', 'default'}} +end props['aclinherit'] = {{'restricted','default'}, {nil, nil}} props['copies'] = {{'1', 'default'}, {nil, nil}} props['primarycache'] = {{'all', 'default'}, {'all', 'default'}} @@ -37,7 +42,11 @@ props['devices'] = {{'on', 'default'}, {'on', 'default'}} props['exec'] = {{'on', 'default'}, {'on', 'default'}} props['setuid'] = {{'on', 'default'}, {'on', 'default'}} props['readonly'] = {{'off', 'default'}, {nil, nil}} -props['zoned'] = {{'off', 'default'}, {nil, nil}} +if os == "FreeBSD" then + props['jailed'] = {{'off', 'default'}, {nil, nil}} +else + props['zoned'] = {{'off', 'default'}, {nil, nil}} +end props['vscan'] = {{'off', 'default'}, {nil, nil}} props['nbmand'] = {{'off', 'default'}, {'off', 'default'}} props['version'] = {{'5', nil}, {'5', nil}} diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_number_props.zcp b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_number_props.zcp index 79969509be..744230db05 100644 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_number_props.zcp +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_number_props.zcp @@ -41,7 +41,7 @@ props['logicalused'] = {{true, nil}, {nil, nil}, {true, ni props['logicalreferenced'] = {{true, nil}, {true, nil}, {true, nil}} props['quota'] = {{true, 'default'}, {nil, nil}, {nil, nil}} props['reservation'] = {{true, 'default'}, {nil, nil}, 
{true, 'default'}} --- Note that zfsonlinux allows volsize for snapshot which differs from openzfs +-- Note that OpenZFS allows volsize for snapshot -- props['volsize'] = {{nil, nil}, {nil, nil}, {true, vol}} props['refquota'] = {{true, 'default'}, {nil, nil}, {nil, nil}} props['refreservation'] = {{true, 'default'}, {nil, nil}, {true, vol}} diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_string_props.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_string_props.ksh index b7d784489a..31ae4a5717 100755 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_string_props.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_string_props.ksh @@ -30,8 +30,8 @@ clone=$TESTPOOL/$TESTCLONE function cleanup { - datasetexists $clone && log_must zfs destroy $clone - datasetexists $fs && log_must zfs destroy -R $fs + datasetexists $clone && destroy_dataset $clone + datasetexists $fs && destroy_dataset $fs -R } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.inherit.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.inherit.ksh new file mode 100755 index 0000000000..e199b4c8b0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.inherit.ksh @@ -0,0 +1,39 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 Joyent, Inc. +# + +. 
$STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +verify_runnable "global" + +fs=$TESTPOOL/$TESTFS +testprop="com.joyent:testprop" +testval="testval" + +log_must dataset_setprop $fs $testprop $testval +log_must_program_sync $TESTPOOL - $fs $testprop <<-EOF + arg = ... + fs = arg["argv"][1] + prop = arg["argv"][2] + err = zfs.sync.inherit(fs, prop) + msg = "resetting " .. prop .. " on " .. fs .. " err=" .. err + return msg +EOF + + +prop=$(get_prop $testprop $fs) +[[ "$prop" == "-" ]] || log_fail "Property still set after inheriting" + +log_pass "Inherit/clear property with channel program works." diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_bookmarks.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_bookmarks.ksh new file mode 100755 index 0000000000..7456177f72 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_bookmarks.ksh @@ -0,0 +1,120 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: +# Listing zfs bookmarks should work correctly. 
+# + +verify_runnable "global" + +TESTBOOK=$TESTPOOL/$TESTFS#testbook +TESTBOOK1=$TESTBOOK-1 +TESTBOOK2=$TESTBOOK-2 +TESTBOOK3=$TESTBOOK-3 + +function cleanup +{ + bkmarkexists $TESTBOOK && log_must zfs destroy $TESTBOOK + bkmarkexists $TESTBOOK1 && log_must zfs destroy $TESTBOOK1 + bkmarkexists $TESTBOOK2 && log_must zfs destroy $TESTBOOK2 + bkmarkexists $TESTBOOK3 && log_must zfs destroy $TESTBOOK3 + destroy_snapshot +} + +log_onexit cleanup + +create_snapshot + +# 0 bookmarks handled correctly +log_must_program $TESTPOOL - <<-EOF + n = 0 + for s in zfs.list.bookmarks("$TESTPOOL/$TESTFS") do + n = n + 1 + end + assert(n == 0) + return 0 +EOF + +# Create a bookmark +log_must zfs bookmark $TESTPOOL/$TESTFS@$TESTSNAP $TESTBOOK + +log_must_program $TESTPOOL - <<-EOF + n = 0 + for s in zfs.list.bookmarks("$TESTPOOL/$TESTFS") do + assert(s == "$TESTBOOK") + n = n + 1 + end + assert(n == 1) + return 0 +EOF + +log_must zfs bookmark $TESTPOOL/$TESTFS@$TESTSNAP $TESTBOOK1 +log_must zfs bookmark $TESTPOOL/$TESTFS@$TESTSNAP $TESTBOOK2 +log_must zfs bookmark $TESTPOOL/$TESTFS@$TESTSNAP $TESTBOOK3 + +# All bookmarks appear exactly once +log_must_program $TESTPOOL - <<-EOF + a = {} + a["$TESTBOOK"] = false + a["$TESTBOOK1"] = false + a["$TESTBOOK2"] = false + a["$TESTBOOK3"] = false + n = 0 + for s in zfs.list.bookmarks("$TESTPOOL/$TESTFS") do + assert(not a[s]) + a[s] = true + n = n + 1 + end + assert(n == 4) + assert(a["$TESTBOOK"] and + a["$TESTBOOK1"] and + a["$TESTBOOK2"] and + a["$TESTBOOK3"]) + return 0 +EOF + +# Nonexistent input +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.bookmarks("$TESTPOOL/nonexistent-fs") + return 0 +EOF +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.bookmarks("nonexistent-pool/$TESTFS") + return 0 +EOF + +# Can't look in a different pool than the one specified on command line +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.bookmarks("testpool2") + return 0 +EOF + +# Can't have bookmarks on snapshots, only on filesystems 
+log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.bookmarks("$TESTPOOL/$TESTFS@$TESTSNAP") + return 0 +EOF + +# Can't have bookmarks on bookmarks, only on filesystems +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.bookmarks("$TESTBOOK") + return 0 +EOF + +log_pass "Listing zfs bookmarks should work correctly." diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_holds.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_holds.ksh new file mode 100755 index 0000000000..2a471bdecb --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_holds.ksh @@ -0,0 +1,121 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: +# Listing zfs holds should work correctly. 
+# + +verify_runnable "global" + +TESTHOLD=testhold-tag +TESTHOLD1=$TESTHOLD-1 +TESTHOLD2=$TESTHOLD-2 +TESTHOLD3=$TESTHOLD-3 +SNAP=$TESTPOOL/$TESTFS@$TESTSNAP + +function cleanup +{ + holdexists $TESTHOLD $SNAP && log_must zfs release $TESTHOLD $SNAP + holdexists $TESTHOLD1 $SNAP && log_must zfs release $TESTHOLD1 $SNAP + holdexists $TESTHOLD2 $SNAP && log_must zfs release $TESTHOLD2 $SNAP + holdexists $TESTHOLD3 $SNAP && log_must zfs release $TESTHOLD3 $SNAP + destroy_snapshot +} + +log_onexit cleanup + +create_snapshot + +# 0 holds handled correctly +log_must_program $TESTPOOL - <<-EOF + n = 0 + for s in zfs.list.holds("$SNAP") do + n = n + 1 + end + assert(n == 0) + return 0 +EOF + +# Create a hold +log_must zfs hold $TESTHOLD $SNAP + +log_must_program $TESTPOOL - <<-EOF + n = 0 + for s in zfs.list.holds("$SNAP") do + assert(s == "$TESTHOLD") + n = n + 1 + end + assert(n == 1) + return 0 +EOF + +log_must zfs hold $TESTHOLD1 $SNAP +log_must zfs hold $TESTHOLD2 $SNAP +log_must zfs hold $TESTHOLD3 $SNAP + +# All holds appear exactly once +log_must_program $TESTPOOL - <<-EOF + a = {} + a["$TESTHOLD"] = false + a["$TESTHOLD1"] = false + a["$TESTHOLD2"] = false + a["$TESTHOLD3"] = false + n = 0 + for s in zfs.list.holds("$SNAP") do + assert(not a[s]) + a[s] = true + n = n + 1 + end + assert(n == 4) + assert(a["$TESTHOLD"] and + a["$TESTHOLD1"] and + a["$TESTHOLD2"] and + a["$TESTHOLD3"]) + return 0 +EOF + +# Nonexistent input +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.holds("$TESTPOOL/nonexistent-fs@nonexistent-snap") + return 0 +EOF +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.holds("nonexistent-pool/$TESTFS") + return 0 +EOF + +# Can't look in a different pool than the one specified on command line +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.holds("testpool2") + return 0 +EOF + +# Can't have holds on filesystems +log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.holds("$TESTPOOL/$TESTFS") + return 0 +EOF + +# Can't have holds on bookmarks 
+log_mustnot_program $TESTPOOL - <<-EOF + zfs.list.holds("$TESTPOOL/$TESTFS#bookmark") + return 0 +EOF + +log_pass "Listing zfs holds should work correctly." diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_user_props.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_user_props.ksh index 910dddc03f..a454a27533 100755 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_user_props.ksh +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_user_props.ksh @@ -20,6 +20,9 @@ # DESCRIPTION: # Listing zfs user properties should work correctly. # +# Note, that this file tests both zfs.list.user_properties +# and it's alias zfs.list.properties. +# verify_runnable "global" @@ -37,6 +40,14 @@ TESTVAL4="TOZwOfACvQtmDyiq68elB3a3g9YYyxBjSnLtN3ZyQYNOAKykzIE2khKKOBncJiDx" # 0 properties handled correctly +log_must_program $TESTPOOL - <<-EOF + n = 0 + for p in zfs.list.user_properties("$TESTPOOL/$TESTFS") do + n = n + 1 + end + assert(n == 0) + return 0 +EOF log_must_program $TESTPOOL - <<-EOF n = 0 for p in zfs.list.properties("$TESTPOOL/$TESTFS") do @@ -49,6 +60,16 @@ EOF # Add a single user property log_must zfs set $TESTPROP="$TESTVAL" $TESTPOOL/$TESTFS +log_must_program $TESTPOOL - <<-EOF + n = 0 + for p,v in zfs.list.user_properties("$TESTPOOL/$TESTFS") do + assert(p == "$TESTPROP") + assert(v == "$TESTVAL") + n = n + 1 + end + assert(n == 1) + return 0 +EOF log_must_program $TESTPOOL - <<-EOF n = 0 for p,v in zfs.list.properties("$TESTPOOL/$TESTFS") do @@ -66,6 +87,34 @@ log_must zfs set $TESTPROP3="$TESTVAL3" $TESTPOOL/$TESTFS log_must zfs set $TESTPROP4="$TESTVAL4" $TESTPOOL/$TESTFS # All user properties have correct value and appear exactly once +log_must_program $TESTPOOL - <<-EOF + a = {} + a["$TESTPROP"] = false + a["$TESTPROP1"] = false + a["$TESTPROP2"] = false + a["$TESTPROP3"] = false + a["$TESTPROP4"] = false + m = {} + m["$TESTPROP"] = "$TESTVAL" + 
m["$TESTPROP1"] = "$TESTVAL1" + m["$TESTPROP2"] = "$TESTVAL2" + m["$TESTPROP3"] = "$TESTVAL3" + m["$TESTPROP4"] = "$TESTVAL4" + n = 0 + for p,v in zfs.list.user_properties("$TESTPOOL/$TESTFS") do + assert(not a[p]) + a[p] = true + assert(v == m[p]) + n = n + 1 + end + assert(n == 5) + assert(a["$TESTPROP"] and + a["$TESTPROP1"] and + a["$TESTPROP2"] and + a["$TESTPROP3"] and + a["$TESTPROP4"]) + return 0 +EOF log_must_program $TESTPOOL - <<-EOF a = {} a["$TESTPROP"] = false @@ -95,4 +144,4 @@ log_must_program $TESTPOOL - <<-EOF return 0 EOF -log_pass "Listing zfs user properies should work correctly." +log_pass "Listing zfs user properties should work correctly." diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.ksh new file mode 100755 index 0000000000..6ac1c2b205 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.ksh @@ -0,0 +1,39 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: +# Setting user props should work correctly on datasets. +# + +verify_runnable "global" + +fs=$TESTPOOL/$TESTFS/testchild + +function cleanup +{ + destroy_dataset $fs "-R" +} + +log_onexit cleanup + +log_must zfs create $fs + +log_must_program_sync $TESTPOOL $ZCP_ROOT/synctask_core/tst.set_props.zcp $fs + +log_pass "Setting props from channel program works correctly." 
diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.zcp b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.zcp new file mode 100644 index 0000000000..756263a9d0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.zcp @@ -0,0 +1,109 @@ +-- +-- This file and its contents are supplied under the terms of the +-- Common Development and Distribution License ("CDDL"), version 1.0. +-- You may only use this file in accordance with the terms of version +-- 1.0 of the CDDL. +-- +-- A full copy of the text of the CDDL should have accompanied this +-- source. A copy of the CDDL is also available via the Internet at +-- http://www.illumos.org/license/CDDL. +-- + +-- +-- Copyright (c) 2017 by Delphix. All rights reserved. +-- Copyright 2020 Joyent, Inc. +-- + +arg = ... +fs = arg["argv"][1] + +-- values from zfs.h +maxname = 256 -- ZAP_MAXNAMELEN +maxvalue = 8192 -- ZAP_MAXVALUELEN + +pos_props = {} +neg_props = {} + +-- In lua, strings are immutable, so to avoid a bunch of copies, we +-- build the value in a table and use concat (which appears to be the +-- recommend method for such things). +largeprop = {} +for i = 0,maxvalue,8 +do + table.insert(largeprop, "aaaaaaaa") +end +-- add an extra character so we spill over the limit +table.insert(largeprop, "b") + +largepropv = table.concat(largeprop) + +largepropname = { "b:" } +for i = 0,maxname,8 +do + table.insert(largepropname, "aaaaaaaa") +end +largepropnamev = table.concat(largepropname) + +pos_props["a:prop"] = {"hello"} + +-- For neg_props, an optional expected error value can be added after the +-- property value as seen below. 
+neg_props["notaproperty"] = {"hello", EINVAL} +neg_props["a:very.long.property.value"] = { largepropv, E2BIG } +neg_props[largepropnamev] = {"greetings", ENAMETOOLONG } + +-- non-user properties aren't currently supported +-- Even if they were, the argument must be a string due to requirements of +-- the ZCP api. +neg_props["mountpoint"] = {"/foo/bar"} +neg_props["copies"] = { "2" } + +-- read-only properties should never succeed +neg_props["guid"] = { "12345" } + +set_fail = {} +val_fail = {} + +-- Test properties that should work +for prop, values in pairs(pos_props) do + for i, val in ipairs(values) do + old_val, src = zfs.get_prop(fs, prop) + + -- Attempt to set the property to the specified value + err = zfs.sync.set_prop(fs, prop, val) + + if (err ~= 0) then + set_fail[prop] = err -- tuple of prop, val that resulted in error + else + -- use get_prop to check that the set took affect + new_val, src = zfs.get_prop(fs, prop) + if (tostring(new_val) ~= tostring(val)) then + val_fail[prop] = new_val + end + + -- We modified the prop, restore old value (if one existed) + if (old_val ~= nil) then + err = zfs.sync.set_prop(fs, prop, old_val) + if (err ~= 0) then return err end + else + -- Didn't have an old value, delete (inherit) instead + err = zfs.sync.inherit(fs, prop) + if (err ~= 0) then return err end + end + end + end +end + +-- Test properties that should fail +for prop, expected in pairs(neg_props) do + exp_val = expected[1] + exp_err = expected[2] + + -- Attempt to set the property to the specified value + err = zfs.sync.set_prop(fs, prop, exp_val) + if (err == 0 or (exp_err ~= nil and err ~= exp_err)) then + set_fail[prop] = err -- tuple of prop, val that resulted in error + end +end + +return {set_fail, val_fail} diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.terminate_by_signal.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.terminate_by_signal.ksh new file mode 100755 index 
0000000000..2c9014a084 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.terminate_by_signal.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# +. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: Execute a long-running zfs channel program and attempt to +# cancel it by sending a signal. +# + +verify_runnable "global" + +rootfs=$TESTPOOL/$TESTFS +snapname=snap +limit=50000000 + +function cleanup +{ + datasetexists $rootfs && destroy_dataset $rootfs -R +} + +log_onexit cleanup + +# +# Create a working set of 100 file systems +# +for i in {1..100}; do + log_must zfs create "$rootfs/child$i" +done + +# +# Attempt to create 100 snapshots with zfs.sync.snapshot() along with some +# time consuming efforts. We use loops of zfs.check.* (dry run operations) +# to consume instructions before the next zfs.sync.snapshot() occurs. +# +# Without a signal interruption this ZCP would take several minutes and +# generate over 30 million Lua instructions. +# +function chan_prog +{ +zfs program -t $limit $TESTPOOL - $rootfs $snapname <<-EOF + arg = ... + fs = arg["argv"][1] + snap = arg["argv"][2] + for child in zfs.list.children(fs) do + local snapname = child .. "@" .. 
snap + zfs.check.snapshot(snapname) + zfs.sync.snapshot(snapname) + for i=1,20000,1 do + zfs.check.snapshot(snapname) + zfs.check.destroy(snapname) + zfs.check.destroy(fs) + end + end + return "should not have reached here" +EOF +} + +log_note "Executing a long-running zfs program in the background" +chan_prog & +CHILD=$! + +# +# After waiting, send a kill signal to the channel program process. +# This should stop the ZCP near a million instructions but still have +# created some of the snapshots. Note that since the above zfs program +# command might get wrapped, we also issue a kill to the group. +# +sleep 10 +log_pos pkill -P $CHILD +log_pos kill $CHILD + +# +# Make sure the channel program did not fully complete by enforcing +# that not all of the snapshots were created. +# +snap_count=$(zfs list -t snapshot | grep $TESTPOOL | wc -l) +log_note "$snap_count snapshots created by ZCP" + +if [ "$snap_count" -eq 0 ]; then + log_fail "Channel program failed to run." +elif [ "$snap_count" -gt 90 ]; then + log_fail "Too many snapshots after a cancel ($snap_count)." +else + log_pass "Canceling a long-running channel program works." 
+fi diff --git a/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh b/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh index 4d66146d70..cb8c2ead59 100755 --- a/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh @@ -48,22 +48,37 @@ set -A files writable immutable append function cleanup { for i in ${files[*]}; do - log_must chattr -ia $TESTDIR/$i - log_must rm -f $TESTDIR/$i + if is_freebsd ; then + log_must chflags noschg $TESTDIR/$i + log_must rm -f $TESTDIR/$i + else + log_must chattr -ia $TESTDIR/$i + log_must rm -f $TESTDIR/$i + fi done } log_onexit cleanup -log_assert "Check whether chattr works as expected" +if is_freebsd ; then + log_assert "Check whether chflags works as expected" +else + log_assert "Check whether chattr works as expected" +fi log_must touch $TESTDIR/writable log_must touch $TESTDIR/immutable log_must touch $TESTDIR/append -log_must chattr -i $TESTDIR/writable -log_must chattr +i $TESTDIR/immutable -log_must chattr +a $TESTDIR/append +if is_freebsd ; then + log_must chflags noschg $TESTDIR/writable + log_must chflags schg $TESTDIR/immutable + log_must chflags sappnd $TESTDIR/append +else + log_must chattr -i $TESTDIR/writable + log_must chattr +i $TESTDIR/immutable + log_must chattr +a $TESTDIR/append +fi log_must eval "echo test > $TESTDIR/writable" log_must eval "echo test >> $TESTDIR/writable" @@ -72,4 +87,8 @@ log_mustnot eval "echo test >> $TESTDIR/immutable" log_mustnot eval "echo test > $TESTDIR/append" log_must eval "echo test >> $TESTDIR/append" -log_pass "chattr works as expected" +if is_freebsd ; then + log_pass "chflags works as expected" +else + log_pass "chattr works as expected" +fi diff --git a/tests/zfs-tests/tests/functional/checksum/Makefile.am b/tests/zfs-tests/tests/functional/checksum/Makefile.am index f72546b225..ddabc03020 100644 --- a/tests/zfs-tests/tests/functional/checksum/Makefile.am +++ 
b/tests/zfs-tests/tests/functional/checksum/Makefile.am @@ -1,9 +1,8 @@ include $(top_srcdir)/config/Rules.am -AM_CPPFLAGS += -I$(top_srcdir)/include -LDADD = $(top_srcdir)/lib/libicp/libicp.la - -AUTOMAKE_OPTIONS = subdir-objects +LDADD = \ + $(abs_top_builddir)/lib/libicp/libicp.la \ + $(abs_top_builddir)/lib/libspl/libspl_assert.la pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/checksum @@ -13,7 +12,8 @@ dist_pkgdata_SCRIPTS = \ run_edonr_test.ksh \ run_sha2_test.ksh \ run_skein_test.ksh \ - filetest_001_pos.ksh + filetest_001_pos.ksh \ + filetest_002_pos.ksh dist_pkgdata_DATA = \ default.cfg @@ -21,10 +21,13 @@ dist_pkgdata_DATA = \ pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/checksum pkgexec_PROGRAMS = \ - edonr_test \ skein_test \ sha2_test -edonr_test_SOURCES = edonr_test.c skein_test_SOURCES = skein_test.c sha2_test_SOURCES = sha2_test.c + +if BUILD_LINUX +pkgexec_PROGRAMS += edonr_test +edonr_test_SOURCES = edonr_test.c +endif diff --git a/tests/zfs-tests/tests/functional/checksum/default.cfg b/tests/zfs-tests/tests/functional/checksum/default.cfg index 138c42b998..bc2f6e261b 100644 --- a/tests/zfs-tests/tests/functional/checksum/default.cfg +++ b/tests/zfs-tests/tests/functional/checksum/default.cfg @@ -28,4 +28,9 @@ # Copyright (c) 2013 by Delphix. All rights reserved. # -set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" +. $STF_SUITE/include/libtest.shlib + +set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "sha256" "sha512" "skein" +if ! 
is_freebsd; then + CHECKSUM_TYPES+=("edonr") +fi diff --git a/tests/zfs-tests/tests/functional/checksum/edonr_test.c b/tests/zfs-tests/tests/functional/checksum/edonr_test.c index a2a924e5d8..d8585ea4cf 100644 --- a/tests/zfs-tests/tests/functional/checksum/edonr_test.c +++ b/tests/zfs-tests/tests/functional/checksum/edonr_test.c @@ -36,11 +36,8 @@ #include #include #include -#include #include - -typedef enum boolean { B_FALSE, B_TRUE } boolean_t; -typedef unsigned long long u_longlong_t; +#include /* * Test messages from: @@ -169,7 +166,6 @@ main(int argc, char *argv[]) (void) printf("FAILED!\n"); \ failed = B_TRUE; \ } \ - NOTE(CONSTCOND) \ } while (0) #define EDONR_PERF_TEST(mode) \ @@ -196,7 +192,6 @@ main(int argc, char *argv[]) } \ (void) printf("Edon-R-%-6s%llu us (%.02f CPB)\n", #mode,\ (u_longlong_t)delta, cpb); \ - NOTE(CONSTCOND) \ } while (0) (void) printf("Running algorithm correctness tests:\n"); diff --git a/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh b/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh index ccc60a661d..615b41f312 100755 --- a/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh @@ -21,7 +21,7 @@ # # -# Copyright (c) 2018 by Delphix. All rights reserved. +# Copyright (c) 2018, 2019 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -32,8 +32,8 @@ # Sanity test to make sure checksum algorithms work. # For each checksum, create a file in the pool using that checksum. Verify # that there are no checksum errors. Next, for each checksum, create a single -# file in the pool using that checksum, scramble the underlying vdev, and -# verify that we correctly catch the checksum errors. +# file in the pool using that checksum, corrupt the file, and verify that we +# correctly catch the checksum errors. # # STRATEGY: # Test 1 @@ -46,19 +46,15 @@ # Test 2 # 6. For each checksum: # 7. 
Create a file using the checksum -# 8. Export the pool -# 9. Scramble the data on one of the underlying VDEVs -# 10. Import the pool -# 11. Scrub the pool -# 12. Verify that there are checksum errors +# 8. Corrupt all level 0 blocks in the file +# 9. Scrub the pool +# 10. Verify that there are checksum errors verify_runnable "both" function cleanup { - echo cleanup - [[ -e $TESTDIR ]] && \ - log_must rm -rf $TESTDIR/* > /dev/null 2>&1 + rm -fr $TESTDIR/* } log_assert "Create and read back files with using different checksum algorithms" @@ -66,8 +62,7 @@ log_assert "Create and read back files with using different checksum algorithms" log_onexit cleanup WRITESZ=1048576 -SKIPCNT=$(((4194304 / $WRITESZ) * 2)) -WRITECNT=$((($MINVDEVSIZE / $WRITESZ) - $SKIPCNT)) +NWRITES=5 # Get a list of vdevs in our pool set -A array $(get_disklist_fullpath) @@ -81,7 +76,7 @@ while [[ $i -lt ${#CHECKSUM_TYPES[*]} ]]; do type=${CHECKSUM_TYPES[i]} log_must zfs set checksum=$type $TESTPOOL log_must file_write -o overwrite -f $TESTDIR/test_$type \ - -b $WRITESZ -c 5 -d R + -b $WRITESZ -c $NWRITES -d R (( i = i + 1 )) done @@ -96,22 +91,17 @@ log_must [ $cksum -eq 0 ] rm -fr $TESTDIR/* -log_assert "Test scrambling the disk and seeing checksum errors" +log_assert "Test corrupting the files and seeing checksum errors" typeset -i j=1 while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do type=${CHECKSUM_TYPES[$j]} log_must zfs set checksum=$type $TESTPOOL log_must file_write -o overwrite -f $TESTDIR/test_$type \ - -b $WRITESZ -c 5 -d R + -b $WRITESZ -c $NWRITES -d R - log_must zpool export $TESTPOOL + # Corrupt the level 0 blocks of this file + corrupt_blocks_at_level $TESTDIR/test_$type - # Scramble the data on the first vdev in our pool. Skip the first - # and last 16MB of data, then scramble the rest after that. 
- log_must dd if=/dev/zero of=$firstvdev bs=$WRITESZ skip=$SKIPCNT \ - count=$WRITECNT - - log_must zpool import $TESTPOOL log_must zpool scrub $TESTPOOL log_must wait_scrubbed $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh b/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh new file mode 100755 index 0000000000..921a4b392a --- /dev/null +++ b/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh @@ -0,0 +1,91 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018, 2019 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/checksum/default.cfg + +# DESCRIPTION: +# Sanity test to make sure checksum algorithms work. +# For each checksum, create a file in the pool using that checksum. Verify +# that there are no checksum errors. Next, for each checksum, create a single +# file in the pool using that checksum, corrupt the file, and verify that we +# correctly catch the checksum errors. +# +# STRATEGY: +# Test 1 +# 1. For each checksum: +# 2. Create a file using the checksum +# 3. Corrupt all level 1 blocks in the file +# 4. 
Export and import the pool +# 5. Verify that there are checksum errors + +verify_runnable "both" + +function cleanup +{ + rm -fr $TESTDIR/* +} + +log_assert "Test corrupting files at L1 and seeing checksum errors" + +log_onexit cleanup + +WRITESZ=1048576 +NWRITES=5 + +# Get a list of vdevs in our pool +set -A array $(get_disklist_fullpath) + +# Get the first vdev, since we will corrupt it later +firstvdev=${array[0]} + +typeset -i j=1 +while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do + type=${CHECKSUM_TYPES[$j]} + log_must zfs set checksum=$type $TESTPOOL + log_must file_write -o overwrite -f $TESTDIR/test_$type \ + -b $WRITESZ -c $NWRITES -d R + + # Corrupt the level 1 blocks of this file + corrupt_blocks_at_level $TESTDIR/test_$type 1 + + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + + log_mustnot eval "cat $TESTDIR/test_$type >/dev/null" + + cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \ + awk '{print $5}') + + log_assert "Checksum '$type' caught $cksum checksum errors" + log_must [ $cksum -ne 0 ] + + rm -f $TESTDIR/test_$type + log_must zpool clear $TESTPOOL + + (( j = j + 1 )) +done diff --git a/tests/zfs-tests/tests/functional/checksum/sha2_test.c b/tests/zfs-tests/tests/functional/checksum/sha2_test.c index afd6f82438..c7561b54f2 100644 --- a/tests/zfs-tests/tests/functional/checksum/sha2_test.c +++ b/tests/zfs-tests/tests/functional/checksum/sha2_test.c @@ -39,9 +39,7 @@ #include #define _SHA2_IMPL #include -#define NOTE(x) -typedef enum boolean { B_FALSE, B_TRUE } boolean_t; -typedef unsigned long long u_longlong_t; +#include /* @@ -173,20 +171,6 @@ const uint8_t sha512_256_test_digests[][32] = { } }; -/* - * Local reimplementation of cmn_err, since it's used in sha2.c. - */ -/*ARGSUSED*/ -void -cmn_err(int level, char *format, ...) 
-{ - va_list ap; - va_start(ap, format); - /* LINTED: E_SEC_PRINTF_VAR_FMT */ - (void) vfprintf(stderr, format, ap); - va_end(ap); -} - int main(int argc, char *argv[]) { @@ -211,7 +195,6 @@ main(int argc, char *argv[]) (void) printf("FAILED!\n"); \ failed = B_TRUE; \ } \ - NOTE(CONSTCOND) \ } while (0) #define SHA2_PERF_TEST(mode, diglen) \ @@ -238,7 +221,6 @@ main(int argc, char *argv[]) } \ (void) printf("SHA%-9s%llu us (%.02f CPB)\n", #mode, \ (u_longlong_t)delta, cpb); \ - NOTE(CONSTCOND) \ } while (0) (void) printf("Running algorithm correctness tests:\n"); diff --git a/tests/zfs-tests/tests/functional/checksum/skein_test.c b/tests/zfs-tests/tests/functional/checksum/skein_test.c index 37548f03b3..484fad844b 100644 --- a/tests/zfs-tests/tests/functional/checksum/skein_test.c +++ b/tests/zfs-tests/tests/functional/checksum/skein_test.c @@ -37,10 +37,7 @@ #include #include #include -#define NOTE(x) - -typedef enum boolean { B_FALSE, B_TRUE } boolean_t; -typedef unsigned long long u_longlong_t; +#include /* * Skein test suite using values from the Skein V1.3 specification found at: @@ -287,7 +284,6 @@ main(int argc, char *argv[]) (void) printf("FAILED!\n"); \ failed = B_TRUE; \ } \ - NOTE(CONSTCOND) \ } while (0) #define SKEIN_PERF_TEST(mode, diglen) \ @@ -316,7 +312,6 @@ main(int argc, char *argv[]) } \ (void) printf("Skein" #mode "/" #diglen "\t%llu us " \ "(%.02f CPB)\n", (u_longlong_t)delta, cpb); \ - NOTE(CONSTCOND) \ } while (0) (void) printf("Running algorithm correctness tests:\n"); diff --git a/tests/zfs-tests/tests/functional/cli_root/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/Makefile.am index 99f1257837..c01ecee896 100644 --- a/tests/zfs-tests/tests/functional/cli_root/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/Makefile.am @@ -13,14 +13,15 @@ SUBDIRS = \ zfs_destroy \ zfs_diff \ zfs_get \ + zfs_ids_to_path \ zfs_inherit \ + zfs_jail \ zfs_load-key \ zfs_mount \ zfs_program \ zfs_promote \ zfs_property \ zfs_receive \ - 
zfs_remap \ zfs_rename \ zfs_reservation \ zfs_rollback \ @@ -33,6 +34,7 @@ SUBDIRS = \ zfs_unmount \ zfs_unshare \ zfs_upgrade \ + zfs_wait \ zpool \ zpool_add \ zpool_attach \ @@ -60,4 +62,5 @@ SUBDIRS = \ zpool_status \ zpool_sync \ zpool_trim \ - zpool_upgrade + zpool_upgrade \ + zpool_wait diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am index d37bcf607f..d84a3dfc72 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am @@ -1,8 +1,19 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zdb dist_pkgdata_SCRIPTS = \ - zdb_001_neg.ksh \ zdb_002_pos.ksh \ zdb_003_pos.ksh \ zdb_004_pos.ksh \ zdb_005_pos.ksh \ - zdb_006_pos.ksh + zdb_006_pos.ksh \ + zdb_args_neg.ksh \ + zdb_args_pos.ksh \ + zdb_block_size_histogram.ksh \ + zdb_checksum.ksh \ + zdb_decompress.ksh \ + zdb_decompress_zstd.ksh \ + zdb_object_range_neg.ksh \ + zdb_object_range_pos.ksh \ + zdb_display_block.ksh \ + zdb_objset_id.ksh \ + zdb_recover.ksh \ + zdb_recover_2.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_003_pos.ksh index 3c444ae983..36f1929dd1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_003_pos.ksh @@ -34,8 +34,17 @@ log_onexit cleanup function cleanup { datasetexists $TESTPOOL && destroy_pool $TESTPOOL + if is_freebsd ; then + log_must sysctl kern.geom.debugflags=$saved_debugflags + fi } +if is_freebsd ; then + # FreeBSD won't allow writing to an in-use device without this set + saved_debugflags=$(sysctl -n kern.geom.debugflags) + log_must sysctl kern.geom.debugflags=16 +fi + verify_runnable "global" verify_disk_count "$DISKS" 2 diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_004_pos.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_004_pos.ksh index 91a5c97997..2c6e6e9be0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_004_pos.ksh @@ -13,6 +13,7 @@ # # Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2020 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -40,8 +41,17 @@ function cleanup for DISK in $DISKS; do zpool labelclear -f $DEV_RDSKDIR/$DISK done + if is_freebsd; then + log_must sysctl kern.geom.debugflags=$saved_debugflags + fi } +if is_freebsd; then + # FreeBSD won't allow writing to an in-use device without this set + saved_debugflags=$(sysctl -n kern.geom.debugflags) + log_must sysctl kern.geom.debugflags=16 +fi + verify_runnable "global" verify_disk_count "$DISKS" 2 set -A DISK $DISKS @@ -51,7 +61,7 @@ default_mirror_setup_noexit $DISKS DEVS=$(get_pool_devices ${TESTPOOL} ${DEV_RDSKDIR}) [[ -n $DEVS ]] && set -A DISK $DEVS -log_must zpool offline $TESTPOOL ${WHOLE_DISK} +log_must zpool offline $TESTPOOL $WHOLE_DISK log_must dd if=/dev/urandom of=$TESTDIR/testfile bs=1K count=2 log_must zpool export $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_005_pos.ksh index 49e237c705..74975dbb0d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_005_pos.ksh @@ -37,8 +37,17 @@ function cleanup { datasetexists $TESTPOOL && destroy_pool $TESTPOOL rm -f $TEMPFILE + if is_freebsd ; then + log_must sysctl kern.geom.debugflags=$saved_debugflags + fi } +if is_freebsd ; then + # FreeBSD won't allow writing to an in-use device without this set + saved_debugflags=$(sysctl -n kern.geom.debugflags) + log_must sysctl kern.geom.debugflags=16 +fi + verify_runnable "global" verify_disk_count "$DISKS" 2 diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh similarity index 85% rename from tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh rename to tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh index a5f827b564..ae948bb9b7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh @@ -56,18 +56,28 @@ set -A args "create" "add" "destroy" "import fakepool" \ "add mirror fakepool" "add raidz fakepool" \ "add raidz1 fakepool" "add raidz2 fakepool" \ "setvprop" "blah blah" "-%" "--?" "-*" "-=" \ - "-a" "-f" "-g" "-h" "-j" "-m" "-n" "-o" "-p" \ - "-p /tmp" "-r" "-t" "-w" "-x" "-y" "-z" \ - "-D" "-E" "-G" "-H" "-I" "-J" "-K" "-M" \ - "-N" "-Q" "-R" "-S" "-T" "-W" "-Y" "-Z" + "-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" \ + "-t" "-w" "-z" "-E" "-H" "-I" "-J" "-K" \ + "-N" "-Q" "-R" "-T" "-W" log_assert "Execute zdb using invalid parameters." -typeset -i i=0 -while [[ $i -lt ${#args[*]} ]]; do - log_mustnot zdb ${args[i]} +log_onexit cleanup - ((i = i + 1)) -done +function cleanup +{ + default_cleanup_noexit +} + +function test_imported_pool +{ + for i in ${args[@]}; do + log_mustnot zdb $i $TESTPOOL + done +} + +default_mirror_setup_noexit $DISKS + +test_imported_pool log_pass "Badly formed zdb parameters fail as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh new file mode 100755 index 0000000000..4c2fc15ec0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2012, 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# ZDB allows a large number of possible inputs +# and combinations of those inputs. Test for non-zero +# exit values. These input options are based on the zdb +# man page +# +# STRATEGY: +# 1. Create an array containing valid zdb parameters. +# 2. For each element, execute the sub-command. +# 3. Verify it does not return an error. +# + +verify_runnable "global" + +log_assert "Execute zdb using valid parameters." 
+ +log_onexit cleanup + +function cleanup +{ + default_cleanup_noexit +} + +function test_imported_pool +{ + typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-G" "-h" "-i" "-L" \ + "-M" "-P" "-s" "-v" "-Y" "-y") + for i in ${args[@]}; do + log_must eval "zdb $i $TESTPOOL >/dev/null" + done +} + +function test_exported_pool +{ + log_must zpool export $TESTPOOL + typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-F" "-G" "-h" "-i" "-L" "-M" \ + "-P" "-s" "-v" "-X" "-Y" "-y") + for i in ${args[@]}; do + log_must eval "zdb -e $i $TESTPOOL >/dev/null" + done + log_must zpool import $TESTPOOL +} + +function test_vdev +{ + typeset -a args=("-A" "-q" "-u" "-Aqu") + VDEVS=$(get_pool_devices ${TESTPOOL} ${DEV_RDSKDIR}) + log_note $VDEVS + set -A VDEV_ARRAY $VDEVS + for i in ${args[@]}; do + log_must eval "zdb -l $i ${VDEV_ARRAY[0]} >/dev/null" + done +} + +function test_metaslab +{ + typeset -a args=("-A" "-L" "-P" "-Y") + for i in ${args[@]}; do + log_must eval "zdb -m $i $TESTPOOL >/dev/null" + done +} + +default_mirror_setup_noexit $DISKS + +test_imported_pool +test_exported_pool +test_vdev +test_metaslab + +log_pass "Valid zdb parameters pass as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh new file mode 100755 index 0000000000..8d677affb9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh @@ -0,0 +1,272 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. 
+# Copyright (c) 2020 by Lawrence Livermore National Security LLC. + +. $STF_SUITE/include/libtest.shlib + + +# +# DESCRIPTION: +# Create a pool and populate it with files of various +# recordsizes +# +# STRATEGY: +# 1. Create pool +# 2. Populate it +# 3. Run zdb -Pbbb on pool +# 4. Verify variance on blocksizes +# +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +SPA_MAXBLOCKSHIFT=24 + +function histo_populate_test_pool +{ + if [ $# -ne 1 ]; then + log_note "histo_populate_test_pool: insufficient parameters" + log_fail "hptp: 1 requested $# received" + fi + typeset pool=$1 + + set -A recordsizes + typeset -i min_rsbits=9 #512 + typeset -i max_rsbits=SPA_MAXBLOCKSHIFT #16 MiB + typeset -i sum_filesizes=0 + re_number='^[0-9]+$' + + let histo_pool_size=$(get_pool_prop size ${pool}) + if [[ ! ${histo_pool_size} =~ ${re_number} ]]; then + log_fail "histo_pool_size is not numeric ${histo_pool_size}" + fi + let max_pool_record_size=$(get_prop recordsize ${pool}) + if [[ ! ${max_pool_record_size} =~ ${re_number} ]]; then + log_fail "hptp: max_pool_record_size is not numeric ${max_pool_record_size}" + fi + + sum_filesizes=$(echo "2^21"|bc) + ((min_pool_size=12*sum_filesizes)) + if [ ${histo_pool_size} -lt ${min_pool_size} ]; then + log_note "hptp: Your pool size ${histo_pool_size}" + log_fail "hptp: is less than minimum ${min_pool_size}" + fi + this_ri=min_rsbits + filenum=0 + total_count=0 + ################### + # generate 10% + 20% + 30% + 31% = 91% of the filespace + # attempting to use 100% will lead to no space left on device + # Heuristic testing showed that 91% was the practical upper + # bound on the default 4G zpool (mirrored) that is used in + # testing. + # + # In order to expedite testing, we will only fill 2G (of 4G) + # of the test pool. You may want to modify this for + # standalone testing. + # + # In filling only 50% of the pool, we create one object on + # each "pass" below to achieve multiple objects per record + # size. 
Creating one file per object would lead to + # excessive file creation time. + ################### + # for pass in 10 20 30 31 # 91% + for pass in 20 20 10 # 50% + do + ((thiscount=(((histo_pool_size*pass)/100)/sum_filesizes))) + + ((total_count+=thiscount)) + for rb in $(seq ${min_rsbits} ${max_rsbits}) + do + this_rs=$(echo "2^${rb}" | bc) + if [ ${this_rs} -gt ${max_pool_record_size} ]; then + continue + fi + + if [ ! -d /${pool}/B_${this_rs} ]; then + zfs create ${pool}/B_${this_rs} + zfs set recordsize=${this_rs} \ + ${pool}/B_${this_rs} + fi + #################### + # Create the files in the devices and datasets + # of the right size. The files are filled + # with random data to defeat the compression + # + # Note that the dd output is suppressed unless + # there are errors + #################### + + dd if=/dev/urandom \ + of=/${pool}/B_${this_rs}/file_${filenum} \ + bs=${this_rs} count=${thiscount} \ + iflag=fullblock 2>&1 | \ + egrep -v -e "records in" -e "records out" \ + -e "bytes.*copied" + ((filenum+=1)) + done + done + + #################### + # Testing showed that on some devices, unless the pool is + # synchronized, that the block counts will be below the + # anticipated sizes since not all of the blocks will be flushed + # to the device. This 'sync' command prevents that from + # happening. + #################### + log_must zpool sync ${pool} +} +function histo_check_test_pool +{ + if [ $# -ne 1 ]; then + log_note "histo_check_test_pool: insufficient parameters" + log_fail "hctp: 1 requested $# received" + fi + typeset pool=$1 + + set -A recordsizes + set -A recordcounts + typeset -i rb + typeset -i min_rsbits=9 #512 + typeset -i max_rsbits=SPA_MAXBLOCKSHIFT+1 + typeset -i this_rs + typeset -i this_ri + typeset -i sum_filesizes=0 + typeset dumped + typeset stripped + + let histo_check_pool_size=$(get_pool_prop size ${pool}) + if [[ ! 
${histo_check_pool_size} =~ ${re_number} ]]; then + log_fail "histo_check_pool_size is not numeric ${histo_check_pool_size}" + fi + let max_pool_record_size=$(get_prop recordsize ${pool}) + if [[ ! ${max_pool_record_size} =~ ${re_number} ]]; then + log_fail "hctp: max_pool_record_size is not numeric ${max_pool_record_size}" + fi + + dumped="${TEST_BASE_DIR}/${pool}_dump.txt" + stripped="${TEST_BASE_DIR}/${pool}_stripped.txt" + + zdb -Pbbb ${pool} | \ + tee ${dumped} | \ + sed -e '1,/^block[ ][ ]*psize[ ][ ]*lsize.*$/d' \ + -e '/^size[ ]*Count/d' -e '/^$/,$d' \ + > ${stripped} + + sum_filesizes=$(echo "2^21"|bc) + + ################### + # generate 10% + 20% + 30% + 31% = 91% of the filespace + # attempting to use 100% will lead to no space left on device + # attempting to use 100% will lead to no space left on device + # Heuristic testing showed that 91% was the practical upper + # bound on the default 4G zpool (mirrored) that is used in + # testing. + # + # In order to expedite testing, we will only fill 2G (of 4G) + # of the test pool. You may want to modify this for + # standalone testing. + # + # In filling only 50% of the pool, we create one object on + # each "pass" below to achieve multiple objects per record + # size. Creating one file per object would lead to + # excessive file creation time. + ################### + # for pass in 10 20 30 31 # 91% + for pass in 20 20 10 # 50% + do + ((thiscount=(((histo_check_pool_size*pass)/100)/sum_filesizes))) + + for rb in $(seq ${min_rsbits} ${max_rsbits}) + do + blksize=$(echo "2^$rb"|bc) + if [ $blksize -le $max_pool_record_size ]; then + ((recordcounts[$blksize]+=thiscount)) + fi + done + done + + ################### + # compare the above computed counts for blocks against + # lsize count. Since some devices have a minimum hardware + # blocksize > 512, we cannot compare against the asize count. 
+ # E.G., if the HWBlocksize = 4096, then the asize counts for + # 512, 1024 and 2048 will be zero and rolled up into the + # 4096 blocksize count for asize. For verification we stick + # to just lsize counts. + # + # The max_variance is hard-coded here at 12% to leave us some + # margin. Testing has shown this normally to be in the range + # of 2%-8%, but it may be as large as 11%. + ################### + let max_variance=12 + let fail_value=0 + let error_count=0 + log_note "Comparisons for ${pool}" + log_note "Bsize is the blocksize, Count is predicted value" + log_note "Bsize\tCount\tpsize\tlsize\tasize" + while read -r blksize pc pl pm lc ll lm ac al am + do + if [ $blksize -gt $max_pool_record_size ]; then + continue + fi + log_note \ + "$blksize\t${recordcounts[${blksize}]}\t$pc\t$lc\t$ac" + + ################### + # get the computed record count and compute the + # difference percentage in integer arithmetic + ################### + rc=${recordcounts[${blksize}]} + ((rclc=(rc-lc)<0?lc-rc:rc-lc)) # absolute value + ((dp=(rclc*100)/rc)) + + ################### + # Check against the allowed variance + ################### + if [ $dp -gt ${max_variance} ]; then + log_note \ + "Expected variance < ${max_variance}% observed ${dp}%" + ((error_count++)) + if [ ${dp} -gt ${fail_value} ]; then + fail_value=${dp} + fi + fi + done < ${stripped} + if [ ${fail_value} -gt 0 ]; then + if [ ${error_count} -eq 1 ]; then + log_note "hctp: There was ${error_count} error" + else + log_note "hctp:There were a total of ${error_count} errors" + fi + log_fail \ + "hctp: Max variance of ${max_variance}% exceeded, saw ${fail_value}%" + fi +} + +log_assert "Verify zdb -Pbbb (block histogram) works as expected" +log_onexit cleanup +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS + +histo_populate_test_pool $TESTPOOL + +histo_check_test_pool $TESTPOOL + +log_pass "Histogram for zdb" diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_checksum.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_checksum.ksh new file mode 100755 index 0000000000..4f661262a7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_checksum.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -c will display the same checksum as -ddddddbbbbbb +# +# Strategy: +# 1. Create a pool +# 2. Write some data to a file +# 3. Run zdb -ddddddbbbbbb against the file +# 4. Record the checksum and DVA of L0 block 0 +# 5. Run zdb -R with :c flag and match the checksum + + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Verify zdb -R generates the correct checksum." 
+log_onexit cleanup +init_data=$TESTDIR/file1 +write_count=8 +blksize=131072 +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS +file_write -o create -w -f $init_data -b $blksize -c $write_count + +# get object number of file +listing=$(ls -i $init_data) +set -A array $listing +obj=${array[0]} +log_note "file $init_data has object number $obj" +sync_pool $TESTPOOL + +output=$(zdb -ddddddbbbbbb $TESTPOOL/$TESTFS $obj 2> /dev/null \ + |grep -m 1 "L0 DVA" |head -n1) +dva=$(sed -Ene 's/^.+DVA\[0\]=<([^>]+)>.*$/\1/p' <<< "$output") +log_note "block 0 of $init_data has a DVA of $dva" +cksum_expected=$(sed -Ene 's/^.+ cksum=([a-z0-9:]+)$/\1/p' <<< "$output") +log_note "expecting cksum $cksum_expected" +output=$(zdb -R $TESTPOOL $dva:c 2> /dev/null) +result=$(grep $cksum_expected <<< "$output") +(( $? != 0 )) && log_fail "zdb -R failed to print the correct checksum" + +log_pass "zdb -R generates the correct checksum" diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress.ksh new file mode 100755 index 0000000000..1ebcbfb449 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress.ksh @@ -0,0 +1,119 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -R pool :d will display the correct data and length +# +# Strategy: +# 1. Create a pool, set compression to lzjb +# 2. Write some identifiable data to a file +# 3. 
Run zdb -ddddddbbbbbb against the file +# 4. Record the DVA, lsize, and psize of L0 block 0 +# 5. Run zdb -R with :d flag and match the data +# 6. Run zdb -R with :dr flags and match the lsize/psize +# 7. Run zdb -R with :dr flags and match the lsize +# 8. Run zdb -R with :dr flags and match the psize +# + + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Verify zdb -R :d flag (decompress) works as expected" +log_onexit cleanup +init_data=$TESTDIR/file1 +write_count=256 +blksize=4096 +pattern="_match__pattern_" +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS +log_must zfs set recordsize=$blksize $TESTPOOL/$TESTFS +log_must zfs set compression=lzjb $TESTPOOL/$TESTFS + +# 16 chars 256 times = 4k = block size +typeset four_k="" +for i in {1..$write_count} +do + four_k=$four_k$pattern +done + +# write the 4k block 256 times +for i in {1..$write_count} +do + echo $four_k >> $init_data +done + +sync_pool $TESTPOOL true + +# get object number of file +listing=$(ls -i $init_data) +set -A array $listing +obj=${array[0]} +log_note "file $init_data has object number $obj" + +output=$(zdb -ddddddbbbbbb $TESTPOOL/$TESTFS $obj 2> /dev/null \ + |grep -m 1 "L0 DVA" |head -n1) +dva=$(sed -Ene 's/^.+DVA\[0\]=<([^>]+)>.*$/\1/p' <<< "$output") +log_note "block 0 of $init_data has a DVA of $dva" + +# use the length reported by zdb -ddddddbbbbbb +size_str=$(sed -Ene 's/^.+ size=([^ ]+) .*$/\1/p' <<< "$output") +log_note "block size $size_str" + +vdev=$(echo "$dva" |awk '{split($0,array,":")} END{print array[1]}') +offset=$(echo "$dva" |awk '{split($0,array,":")} END{print array[2]}') +output=$(zdb -R $TESTPOOL $vdev:$offset:$size_str:d 2> /dev/null) +echo $output |grep $pattern > /dev/null +(( $? != 0 )) && log_fail "zdb -R :d failed to decompress the data properly" + +output=$(zdb -R $TESTPOOL $vdev:$offset:$size_str:dr 2> /dev/null) +echo $output |grep $four_k > /dev/null +(( $? 
!= 0 )) && log_fail "zdb -R :dr failed to decompress the data properly" + +output=$(zdb -R $TESTPOOL $vdev:$offset:$size_str:dr 2> /dev/null) +result=${#output} +(( $result != $blksize)) && log_fail \ +"zdb -R failed to decompress the data to the length (${#output} != $size_str)" + +# decompress using lsize +lsize=$(echo $size_str |awk '{split($0,array,"/")} END{print array[1]}') +psize=$(echo $size_str |awk '{split($0,array,"/")} END{print array[2]}') +output=$(zdb -R $TESTPOOL $vdev:$offset:$lsize:dr 2> /dev/null) +result=${#output} +(( $result != $blksize)) && log_fail \ +"zdb -R failed to decompress the data (length ${#output} != $blksize)" + +# Specifying psize will decompress successfully , but not always to full +# lsize since zdb has to guess lsize incrementally. +output=$(zdb -R $TESTPOOL $vdev:$offset:$psize:dr 2> /dev/null) +result=${#output} +# convert psize to decimal +psize_orig=$psize +psize=${psize%?} +psize=$((16#$psize)) +(( $result < $psize)) && log_fail \ +"zdb -R failed to decompress the data with psize $psize_orig\ + (length ${#output} < $psize)" + +log_pass "zdb -R :d flag (decompress) works as expected" diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress_zstd.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress_zstd.ksh new file mode 100755 index 0000000000..238d495604 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress_zstd.ksh @@ -0,0 +1,114 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2020 The FreeBSD Foundation [1] +# +# [1] Portions of this software were developed by Allan Jude +# under sponsorship from the FreeBSD Foundation. + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -Z pool will display the ZSTD compression header +# This will contain the actual length of the compressed data, as well as +# the version of ZSTD used to compress the block, and the compression level +# +# Strategy: +# 1. Create a pool, set compression to zstd- +# 2. Write some identifiable data to a file +# 3. Run zdb -Zddddddbbbbbb against the file +# 4. Record the DVA, lsize, and psize, and ZSTD header of L0 block 0 +# 5. Check that the ZSTD length is less than psize +# 6. Check that the ZSTD level matches the level we requested +# 7. Run zdb -R with :dr flags and confirm the size and content match +# + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Verify zdb -Z (read ZSTD header) works as expected" +log_onexit cleanup +src_data="$STF_SUITE/tests/functional/cli_root/zfs_receive/zstd_test_data.txt" +init_data=$TESTDIR/file1 +write_count=128 +blksize=131072 +verify_runnable "global" +verify_disk_count "$DISKS" 2 +random_level=$((RANDOM%19 + 1)) + +default_mirror_setup_noexit $DISKS +log_must zfs set recordsize=$blksize $TESTPOOL/$TESTFS +log_must zfs set compression=zstd-$random_level $TESTPOOL/$TESTFS + +# write the 1k of text 128 times +for i in {1..$write_count} +do + cat $src_data >> $init_data +done + +sync_pool $TESTPOOL true + +# get object number of file +listing=$(ls -i $init_data) +set -A array $listing +obj=${array[0]} +log_note "file $init_data has object number $obj" + +output=$(zdb -Zddddddbbbbbb $TESTPOOL/$TESTFS $obj 2> /dev/null \ + |grep -m 1 "L0 DVA" |head -n1) +dva=$(sed -Ene 's/^.+DVA\[0\]=<([^>]+)>.*$/\1/p' <<< "$output") +log_note "block 0 of $init_data has a DVA of $dva" + +# use the length reported by zdb -ddddddbbbbbb +size_str=$(sed -Ene 's/^.+ size=([^ ]+) 
.*$/\1/p' <<< "$output") +# convert sizes to decimal +lsize=$(echo $size_str |awk '{split($0,array,"/")} END{print array[1]}') +lsize_orig=$lsize +lsize=${lsize%?} +lsize_bytes=$((16#$lsize)) +psize=$(echo $size_str |awk '{split($0,array,"/")} END{print array[2]}') +psize_orig=$psize +psize=${psize%?} +psize_bytes=$((16#$psize)) +log_note "block size $size_str" + +# Get the ZSTD header reported by zdb -Z +zstd_str=$(sed -Ene 's/^.+ ZSTD:size=([^:]+):version=([^:]+):level=([^:]+):.*$/\1:\2:\3/p' <<< "$output") +zstd_size=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[1]}') +log_note "ZSTD compressed size $zstd_size" +(( $psize_bytes < $zstd_size )) && log_fail \ +"zdb -Z failed: physical block size was less than header content length ($psize_bytes < $zstd_size)" + +zstd_version=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[2]}') +log_note "ZSTD version $zstd_version" + +zstd_level=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[3]}') +log_note "ZSTD level $zstd_level" +(( $zstd_level != $random_level )) && log_fail \ +"zdb -Z failed: compression level did not match header level ($zstd_level != $random_level)" + +vdev=$(echo "$dva" |awk '{split($0,array,":")} END{print array[1]}') +offset=$(echo "$dva" |awk '{split($0,array,":")} END{print array[2]}') +# Check the first 1024 bytes +output=$(ZDB_NO_ZLE="true" zdb -R $TESTPOOL $vdev:$offset:$size_str:dr 2> /dev/null) +outsize=$(wc -c <<< "$output") +(( $outsize != $blksize )) && log_fail \ +"zdb -Z failed to decompress the data to the expected length ($outsize != $blksize)" +cmp $init_data - <<< "$output" +(( $? 
!= 0 )) && log_fail "zdb -R :dr failed to decompress the data properly" + +log_pass "zdb -Z flag (ZSTD compression header) works as expected" diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_display_block.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_display_block.ksh new file mode 100755 index 0000000000..5cc4575851 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_display_block.ksh @@ -0,0 +1,128 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -R pool :b will display the block +# +# Strategy: +# 1. Create a pool, set compression to lzjb +# 2. Write some identifiable data to a file +# 3. Run zdb -ddddddbbbbbb against the file +# 4. Record the DVA of the first L1 block; +# record the first L0 block display; and +# record the 2nd L0 block display. +# 5. Run zdb -R with :bd displays first L0 +# 6. Run zdb -R with :b80d displays 2nd L0 +# 7. Run zdb -R with :db80 displays 2nd L0 +# 8. Run zdb -R with :id flag displays indirect block +# (similar to zdb -ddddddbbbbbb output) +# 9. 
Run zdb -R with :id flag and .0 vdev +# + + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Verify zdb -R :b flag (block display) works as expected" +log_onexit cleanup +init_data=$TESTDIR/file1 +write_count=256 +blksize=4096 + +# only read 256 128 byte block pointers in L1 (:i flag) +# 256 x 128 = 32k / 0x8000 +l1_read_size="8000" + +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS +log_must zfs set recordsize=$blksize $TESTPOOL/$TESTFS +log_must zfs set compression=lzjb $TESTPOOL/$TESTFS + +file_write -d R -o create -w -f $init_data -b $blksize -c $write_count +sync_pool $TESTPOOL true + +# get object number of file +listing=$(ls -i $init_data) +set -A array $listing +obj=${array[0]} +log_note "file $init_data has object number $obj" + +output=$(zdb -ddddddbbbbbb $TESTPOOL/$TESTFS $obj 2> /dev/null \ + |grep -m 1 "L1 DVA" |head -n1) +dva=$(sed -Ene 's/^.+DVA\[0\]=<([^>]+)>.*/\1/p' <<< "$output") +log_note "first L1 block $init_data has a DVA of $dva" +output=$(zdb -ddddddbbbbbb $TESTPOOL/$TESTFS $obj 2> /dev/null \ + |grep -m 1 "L0 DVA" |head -n1) +blk_out0=${output##*>} +blk_out0=${blk_out0##+([[:space:]])} + +output=$(zdb -ddddddbbbbbb $TESTPOOL/$TESTFS $obj 2> /dev/null \ + |grep -m 1 "1000 L0 DVA" |head -n1) +blk_out1=${output##*>} +blk_out1=${blk_out1##+([[:space:]])} + +output=$(export ZDB_NO_ZLE=\"true\"; zdb -R $TESTPOOL $dva:bd\ + 2> /dev/null) +output=${output##*>} +output=${output##+([[:space:]])} +if [ "$output" != "$blk_out0" ]; then + log_fail "zdb -R :bd (block 0 display/decompress) failed" +fi + +output=$(export ZDB_NO_ZLE=\"true\"; zdb -R $TESTPOOL $dva:db80\ + 2> /dev/null) +output=${output##*>} +output=${output##+([[:space:]])} +if [ "$output" != "$blk_out1" ]; then + log_fail "zdb -R :db80 (block 1 display/decompress) failed" +fi + +output=$(export ZDB_NO_ZLE=\"true\"; zdb -R $TESTPOOL $dva:b80d\ + 2> /dev/null) +output=${output##*>} 
+output=${output##+([[:space:]])} +if [ "$output" != "$blk_out1" ]; then + log_fail "zdb -R :b80d (block 1 display/decompress) failed" +fi + +vdev=$(echo "$dva" |awk '{split($0,array,":")} END{print array[1]}') +offset=$(echo "$dva" |awk '{split($0,array,":")} END{print array[2]}') +output=$(export ZDB_NO_ZLE=\"true\";\ + zdb -R $TESTPOOL $vdev:$offset:$l1_read_size:id 2> /dev/null) +block_cnt=$(echo "$output" | grep 'L0' | wc -l) +if [ $block_cnt -ne $write_count ]; then + log_fail "zdb -R :id (indirect block display) failed" +fi + +# read from specific half of mirror +vdev="$vdev.0" +log_note "Reading from DVA $vdev:$offset:$l1_read_size" +output=$(export ZDB_NO_ZLE=\"true\";\ + zdb -R $TESTPOOL $vdev:$offset:$l1_read_size:id 2> /dev/null) +block_cnt=$(echo "$output" | grep 'L0' | wc -l) +if [ $block_cnt -ne $write_count ]; then + log_fail "zdb -R 0.0:offset:length:id (indirect block display) failed" +fi + +log_pass "zdb -R :b flag (block display) works as expected" diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_neg.ksh new file mode 100755 index 0000000000..4301807880 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_neg.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# A badly formed object range parameter passed to zdb -dd should +# return an error. 
+# +# Strategy: +# 1. Create a pool +# 2. Run zdb -dd with assorted invalid object range arguments and +# confirm it fails as expected +# 3. Run zdb -dd with an invalid object identifier and +# confirm it fails as expected + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Execute zdb using invalid object range parameters." +log_onexit cleanup +verify_runnable "both" +verify_disk_count "$DISKS" 2 +default_mirror_setup_noexit $DISKS + +log_must zpool sync + +set -A bad_flags a b c e g h i j k l n o p q r s t u v w x y \ + B C D E F G H I J K L M N O P Q R S T U V W X Y Z \ + 0 1 2 3 4 5 6 7 8 9 _ - + % . , : + +typeset -i i=0 +while [[ $i -lt ${#bad_flags[*]} ]]; do + log_mustnot zdb -dd $TESTPOOL 0:1:${bad_flags[i]} + log_mustnot zdb -dd $TESTPOOL 0:1:A-${bad_flags[i]} + ((i = i + 1)) +done + +set -A bad_ranges ":" "::" ":::" ":0" "0:" "0:1:" "0:1::" "0::f" "0a:1" \ + "a0:1" "a:1" "0:a" "0:1a" "0:a1" "a:b0" "a:0b" "0:1:A-" "1:0" \ + "0:1:f:f" "0:1:f:" + +i=0 +while [[ $i -lt ${#bad_ranges[*]} ]]; do + log_mustnot zdb -dd $TESTPOOL ${bad_ranges[i]} + ((i = i + 1)) +done + +# Specifying a non-existent object identifier returns an error +obj_id_highest=$(zdb -P -dd $TESTPOOL/$TESTFS 2>/dev/null | + egrep "^ +-?([0-9]+ +){7}" | sort -n | tail -n 1 | awk '{print $1}') +obj_id_invalid=$(( $obj_id_highest + 1 )) +log_mustnot zdb -dd $TESTPOOL/$TESTFS $obj_id_invalid + +log_pass "Badly formed zdb object range parameters fail as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_pos.ksh new file mode 100755 index 0000000000..b7f47d11ad --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_pos.ksh @@ -0,0 +1,171 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2020 Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+# Object range parameters passed to zdb -dd work correctly.
+#
+# Strategy:
+# 1. Create a pool
+# 2. Create some files
+# 3. Run zdb -dd with assorted object range arguments and verify output
+
+function cleanup
+{
+	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
+}
+
+#
+# Print objects in @dataset with identifiers greater than or equal to
+# @begin and less than or equal to @end, without using object range
+# parameters.
+#
+function get_object_list_range
+{
+	typeset dataset=$1 begin=$2 end=$3
+	typeset line obj
+	# Input is sorted by object id, so stop at the first id past @end.
+	get_object_list $dataset |
+	while read line; do
+		obj=${line%%[[:space:]]*}
+		if [[ $obj -ge $begin && $obj -le $end ]] ; then
+			echo "$line"
+		elif [[ $obj -gt $end ]] ; then
+			break
+		fi
+	done
+}
+
+#
+# Print just the list of objects from 'zdb -dd' with leading whitespace
+# trimmed, discarding other zdb output, sorted by object identifier.
+# Caller must pass in the dataset argument at minimum.
+#
+function get_object_list
+{
+	zdb -P -dd "$@" 2>/dev/null |
+	    grep -E "^ +-?([0-9]+ +){7}" |
+	    sed 's/^[[:space:]]*//' |
+	    sort -n
+}
+
+log_assert "Verify zdb -dd object range arguments work correctly."
+log_onexit cleanup +verify_runnable "both" +verify_disk_count "$DISKS" 2 +default_mirror_setup_noexit $DISKS + +for x in $(seq 0 7); do + touch $TESTDIR/file$x + mkdir $TESTDIR/dir$x +done + +log_must zpool sync + +# Get list of all objects, but filter out user/group objects which don't +# appear when using object or object range arguments +all_objects=$(get_object_list $TESTPOOL/$TESTFS | grep -v 'used$') + +# Range 0:-1 gets all objects +expected=$all_objects +actual=$(get_object_list $TESTPOOL/$TESTFS 0:-1) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:A gets all objects +expected=$all_objects +actual=$(get_object_list $TESTPOOL/$TESTFS 0:-1:A) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:f must output all file objects +expected=$(grep "ZFS plain file" <<< $all_objects) +actual=$(get_object_list $TESTPOOL/$TESTFS 0:-1:f) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:d must output all directory objects +expected=$(grep "ZFS directory" <<< $all_objects) +actual=$(get_object_list $TESTPOOL/$TESTFS 0:-1:d) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:df must output all directory and file objects +expected=$(grep -e "ZFS directory" -e "ZFS plain file" <<< $all_objects) +actual=$(get_object_list $TESTPOOL/$TESTFS 0:-1:df) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:A-f-d must output all non-files and non-directories +expected=$(grep -v -e "ZFS plain file" -e "ZFS directory" <<< $all_objects) +actual=$(get_object_list $TESTPOOL/$TESTFS 0:-1:A-f-d) +log_must test "\n$actual\n" == "\n$expected\n" + +# Specifying multiple ranges works +set -A obj_ids $(ls -i $TESTDIR | awk '{print $1}' | sort -n) +start1=${obj_ids[0]} +end1=${obj_ids[5]} +start2=${obj_ids[8]} +end2=${obj_ids[13]} +expected=$(get_object_list_range $TESTPOOL/$TESTFS $start1 $end1; + get_object_list_range $TESTPOOL/$TESTFS $start2 $end2) +actual=$(get_object_list $TESTPOOL/$TESTFS $start1:$end1 $start2:$end2) 
+log_must test "\n$actual\n" == "\n$expected\n" + +# Combining ranges with individual object IDs works +expected=$(get_object_list_range $TESTPOOL/$TESTFS $start1 $end1; + get_object_list $TESTPOOL/$TESTFS $start2 $end2) +actual=$(get_object_list $TESTPOOL/$TESTFS $start1:$end1 $start2 $end2) +log_must test "\n$actual\n" == "\n$expected\n" + +# Hex conversion must work for ranges and individual object identifiers +# (this test uses expected result from previous test). +start1_hex=$(printf "0x%x" $start1) +end1_hex=$(printf "0x%x" $end1) +start2_hex=$(printf "0x%x" $start2) +end2_hex=$(printf "0x%x" $end2) +actual=$(get_object_list $TESTPOOL/$TESTFS $start1_hex:$end1_hex \ + $start2_hex $end2_hex) +log_must test "\n$actual\n" == "\n$expected\n" + +# Specifying individual object IDs works +objects="$start1 $end1 $start2 $end2" +expected="$objects" +actual=$(get_object_list $TESTPOOL/$TESTFS $objects | awk '{print $1}' | xargs) +log_must test "$actual" == "$expected" + +# Get all objects in the meta-objset to test m (spacemap) and z (zap) flags +all_mos_objects=$(get_object_list $TESTPOOL 0:-1) + +# Range 0:-1:m must output all space map objects +expected=$(grep "SPA space map" <<< $all_mos_objects) +actual=$(get_object_list $TESTPOOL 0:-1:m) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:z must output all zap objects +expected=$(grep "zap" <<< $all_mos_objects) +actual=$(get_object_list $TESTPOOL 0:-1:z) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:A-m-z must output all non-space maps and non-zaps +expected=$(grep -v -e "zap" -e "SPA space map" <<< $all_mos_objects) +actual=$(get_object_list $TESTPOOL 0:-1:A-m-z) +log_must test "\n$actual\n" == "\n$expected\n" + +# Range 0:-1:mz must output all space maps and zaps +expected=$(grep -e "SPA space map" -e "zap" <<< $all_mos_objects) +actual=$(get_object_list $TESTPOOL 0:-1:mz) +log_must test "\n$actual\n" == "\n$expected\n" + +log_pass "zdb -dd object range arguments work correctly" 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh new file mode 100755 index 0000000000..d23cc43c90 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2020 by Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -d pool/<objset id> will display the dataset +# +# Strategy: +# 1. Create a pool +# 2. Write some data to a file +# 3. Get the inode number (object number) of the file +# 4. Run zdb -d to get the objset ID of the dataset +# 5. Run zdb -dddddd pool/objsetID objectID (decimal) +# 6. Confirm names +# 7. Run zdb -dddddd pool/objsetID objectID (hex) +# 8. Confirm names +# 9. Obtain objsetID from /proc/spl/kstat/zfs/testpool/objset-0x<objset id> +# (linux only) +# 10. Run zdb -dddddd pool/objsetID (hex) +# 11. Match name from zdb against proc entry +# + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Verify zdb -d / generates the correct names." 
+log_onexit cleanup +init_data=$TESTDIR/file1 +write_count=8 +blksize=131072 +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS +file_write -o create -w -f $init_data -b $blksize -c $write_count + +# get object number of file +listing=$(ls -i $init_data) +set -A array $listing +obj=${array[0]} +log_note "file $init_data has object number $obj" +sync_pool $TESTPOOL + +output=$(zdb -d $TESTPOOL/$TESTFS) +objset_id=$(echo $output | awk '{split($0,array,",")} END{print array[2]}' | + awk '{split($0,array," ")} END{print array[2]}') +objset_hex=$(printf "0x%X" $objset_id) +log_note "objset $TESTPOOL/$TESTFS has objset ID $objset_id ($objset_hex)" + +for id in "$objset_id" "$objset_hex" +do + log_note "zdb -dddddd $TESTPOOL/$id $obj" + output=$(zdb -dddddd $TESTPOOL/$id $obj) + reason="($TESTPOOL/$TESTFS not in zdb output)" + echo $output |grep "$TESTPOOL/$TESTFS" > /dev/null + (( $? != 0 )) && log_fail \ + "zdb -dddddd $TESTPOOL/$id $obj failed $reason" + reason="(file1 not in zdb output)" + echo $output |grep "file1" > /dev/null + (( $? != 0 )) && log_fail \ + "zdb -dddddd $TESTPOOL/$id $obj failed $reason" + obj=$(printf "0x%X" $obj) +done + +if is_linux; then + output=$(ls -1 /proc/spl/kstat/zfs/$TESTPOOL |grep objset- |tail -1) + objset_hex=${output#*-} + name_from_proc=$(cat /proc/spl/kstat/zfs/$TESTPOOL/$output | + grep dataset_name | awk '{split($0,array," ")} END{print array[3]}') + log_note "checking zdb output for $name_from_proc" + reason="(name $name_from_proc from proc not in zdb output)" + log_note "zdb -dddddd $TESTPOOL/$objset_hex" + output=$(zdb -dddddd $TESTPOOL/$objset_hex) + echo $output |grep "$name_from_proc" > /dev/null + (( $? != 0 )) && log_fail \ + "zdb -dddddd $TESTPOOL/$objset_hex failed $reason" +fi + +log_pass "zdb -d / generates the correct names." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_recover.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_recover.ksh
new file mode 100755
index 0000000000..d51edf3763
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_recover.ksh
@@ -0,0 +1,57 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2021 by Allan Jude.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+#	zdb -r <dataset> <path> <destination>
+#	Will extract <path> (relative to <dataset>) to the file <destination>
+#	Similar to -R, except it does the work for you to find each record
+#
+# Strategy:
+#	1. Create a pool
+#	2. Write some data to a file
+#	3. Extract the file
+#	4. Compare the file to the original
+#
+
+function cleanup
+{
+	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
+	# -f: the extracted file may not exist if the test failed early
+	rm -f $tmpfile
+}
+
+log_assert "Verify zdb -r extract the correct data."
+log_onexit cleanup
+init_data=$TESTDIR/file1
+tmpfile="$TEST_BASE_DIR/zdb-recover"
+write_count=8
+blksize=131072
+verify_runnable "global"
+verify_disk_count "$DISKS" 2
+
+default_mirror_setup_noexit $DISKS
+file_write -o create -w -f $init_data -b $blksize -c $write_count
+log_must zpool sync $TESTPOOL
+
+# Fail right here if zdb itself errors out, rather than at the cmp below
+log_must zdb -r $TESTPOOL/$TESTFS file1 $tmpfile
+log_must cmp $init_data $tmpfile
+
+log_pass "zdb -r extracts the correct data."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_recover_2.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_recover_2.ksh
new file mode 100755
index 0000000000..91f04c7956
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_recover_2.ksh
@@ -0,0 +1,61 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2021 by Allan Jude.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+#	zdb -r <dataset> <path> <destination>
+#	Will extract <path> (relative to <dataset>) to the file <destination>
+#	Similar to -R, except it does the work for you to find each record
+#
+# Strategy:
+#	1. Create a pool
+#	2. Write some data to a file
+#	3. Append a few bytes so the size isn't a multiple of the block size
+#	4. Extract the file
+#	5. Compare the file to the original
+#
+
+function cleanup
+{
+	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
+	# -f: the extracted file may not exist if the test failed early
+	rm -f $tmpfile
+}
+
+log_assert "Verify zdb -r extract the correct data."
+log_onexit cleanup
+init_data=$TESTDIR/file1
+tmpfile="$TEST_BASE_DIR/zdb-recover"
+write_count=8
+blksize=131072
+verify_runnable "global"
+verify_disk_count "$DISKS" 2
+
+default_mirror_setup_noexit $DISKS
+file_write -o create -w -f $init_data -b $blksize -c $write_count
+# eval keeps the redirection inside the logged command; with a bare
+# "log_must echo ... >> file" the redirect applies to log_must itself.
+log_must eval "echo zfs >> $init_data"
+log_must zpool sync $TESTPOOL
+
+# Fail right here if zdb itself errors out, rather than at the cmp below
+log_must zdb -r $TESTPOOL/$TESTFS file1 $tmpfile
+log_must cmp $init_data $tmpfile
+
+log_pass "zdb -r extracts the correct data."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_001_neg.ksh index c8fafc339d..c6e45c80dd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_001_neg.ksh @@ -57,7 +57,7 @@ set -A args "" "create" "create -s" "create -V" "create -s -V" \ "set compressratio=" "set mounted=" "set origin=" "set quota=" \ "set reservation=" "set volsize=" " set volblocksize=" "set recordsize=" \ "set mountpoint=" "set devices=" "set exec=" "set setuid=" "set readonly=" \ - "set zoned=" "set snapdir=" "set aclmode=" "set aclinherit=" \ + "set snapdir=" "set aclmode=" "set aclinherit=" \ "set quota=blah" "set reservation=blah" "set atime=blah" "set checksum=blah" \ "set compression=blah" \ "upgrade blah" "mount blah" "mount -o" \ @@ -65,6 +65,11 @@ set -A args "" "create" "create -s" "create -V" "create -s -V" \ "share" "unshare" "send" "send -i" "receive" "receive -d" "receive -vnF" \ "recv" "recv -d" "recv -vnF" "allow" "unallow" \ "blah blah" "-%" "--" "--?" "-*" "-=" +if is_freebsd; then + args+=("set jailed=") +else + args+=("set zoned=") +fi log_assert "Badly-formed zfs sub-command should return an error." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_002_pos.ksh index b21b6c657d..1290d888a9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs/zfs_002_pos.ksh @@ -48,13 +48,15 @@ function cleanup { unset ZFS_ABORT + if is_freebsd && [[ -n $savedcorefile ]]; then + sysctl kern.corefile=$savedcorefile + fi + if [[ -d $corepath ]]; then rm -rf $corepath fi for ds in $fs1 $fs $ctr; do - if datasetexists $ds; then - log_must zfs destroy -rRf $ds - fi + datasetexists $ds && destroy_dataset $ds -rRf done } @@ -62,8 +64,10 @@ log_assert "With ZFS_ABORT set, all zfs commands can abort and generate a " \ "core file." log_onexit cleanup -#preparation work for testing +# Preparation work for testing +savedcorefile="" corepath=$TESTDIR/core +corefile=$corepath/core.zfs if [[ -d $corepath ]]; then rm -rf $corepath fi @@ -91,9 +95,13 @@ typeset badparams=("" "create" "destroy" "snapshot" "rollback" "clone" \ if is_linux; then ulimit -c unlimited - echo "$corepath/core.zfs" >/proc/sys/kernel/core_pattern + echo "$corefile" >/proc/sys/kernel/core_pattern echo 0 >/proc/sys/kernel/core_uses_pid export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" +elif is_freebsd; then + ulimit -c unlimited + savedcorefile=$(sysctl -n kern.corefile) + log_must sysctl kern.corefile=$corepath/core.%N else log_must coreadm -p ${corepath}/core.%f fi @@ -102,7 +110,6 @@ log_must export ZFS_ABORT=yes for subcmd in "${cmds[@]}" "${badparams[@]}"; do zfs $subcmd >/dev/null 2>&1 && log_fail "$subcmd passed incorrectly." - corefile=${corepath}/core.zfs if [[ ! -e $corefile ]]; then log_fail "zfs $subcmd cannot generate core file with " \ "ZFS_ABORT set." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/cleanup.ksh index 6a4e7cfc66..f84ac43e67 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/cleanup.ksh @@ -26,4 +26,6 @@ . $STF_SUITE/include/libtest.shlib +log_must zfs destroy "$TESTPOOL/$TESTFS/child" +log_must zfs destroy "$TESTPOOL/${TESTFS}_with_suffix" default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/setup.ksh index 2a9de0535d..40953415c6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/setup.ksh @@ -28,4 +28,8 @@ DISK=${DISKS%% *} -default_volume_setup $DISK +default_setup_noexit $DISK +log_must zfs create "$TESTPOOL/$TESTFS/child" +log_must zfs create "$TESTPOOL/${TESTFS}_with_suffix" +log_must zfs create "$TESTPOOL/$TESTFS/recv" +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh index 4a11837292..3a1cddb5c6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh @@ -22,6 +22,7 @@ # # Copyright 2017, loli10K . All rights reserved. +# Copyright 2019, 2020 by Christian Schwarz. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -32,35 +33,55 @@ # # STRATEGY: # 1. Create initial snapshot +# # 2. Verify we can create a bookmark specifying snapshot and bookmark full paths -# 3. Verify we can create a bookmark specifying the snapshot name -# 4. Verify we can create a bookmark specifying the bookmark name +# 3. Verify we can create a bookmark specifying the short snapshot name +# 4. 
Verify we can create a bookmark specifying the short bookmark name # 5. Verify at least a full dataset path is required and both snapshot and # bookmark name must be valid # +# 6. Verify we can copy a bookmark by specifying the source bookmark and new +# bookmark full paths. +# 7. Verify we can copy a bookmark specifying the short source name +# 8. Verify we can copy a bookmark specifying the short new name +# 9. Verify two short paths are not allowed, and test empty paths +# 10. Verify we cannot copy a bookmark if the new bookmark already exists +# 11. Verify that copying a bookmark only works if new and source name +# have the same dataset +# verify_runnable "both" function cleanup { - if snapexists "$DATASET@$TESTSNAP"; then - log_must zfs destroy "$DATASET@$TESTSNAP" - fi - if bkmarkexists "$DATASET#$TESTBM"; then - log_must zfs destroy "$DATASET#$TESTBM" - fi + snapexists "$DATASET@$TESTSNAP" && \ + destroy_dataset "$DATASET@$TESTSNAP" + + bkmarkexists "$DATASET#$TESTBM" && \ + destroy_dataset "$DATASET#$TESTBM" + + bkmarkexists "$DATASET#$TESTBMCOPY" && \ + destroy_dataset "$DATASET#$TESTBMCOPY" } log_assert "'zfs bookmark' should work only when passed valid arguments." log_onexit cleanup DATASET="$TESTPOOL/$TESTFS" +DATASET_TWO="$TESTPOOL/${TESTFS}_two" TESTSNAP='snapshot' +TESTSNAP2='snapshot2' TESTBM='bookmark' +TESTBMCOPY='bookmark_copy' + # Create initial snapshot log_must zfs snapshot "$DATASET@$TESTSNAP" +# +# Bookmark creation tests +# + # Verify we can create a bookmark specifying snapshot and bookmark full paths log_must zfs bookmark "$DATASET@$TESTSNAP" "$DATASET#$TESTBM" log_must eval "bkmarkexists $DATASET#$TESTBM" @@ -97,4 +118,120 @@ log_mustnot zfs bookmark "$TESTSNAP" "$DATASET#" log_mustnot zfs bookmark "$TESTSNAP" "$DATASET" log_mustnot eval "bkmarkexists $DATASET#$TESTBM" -log_pass "'zfs bookmark' works as expected only when passed valid arguments." 
+# Verify that we can create bookmarks on another origin filesystem +log_must zfs clone "$DATASET@$TESTSNAP" "$DATASET_TWO" +log_must zfs bookmark "$DATASET@$TESTSNAP" "$DATASET_TWO#$TESTBM" +log_must eval "destroy_dataset $DATASET_TWO" + +# Verify that we cannot create bookmarks on a non-origin filesystem +log_must zfs create "$DATASET_TWO" +log_mustnot_expect "source is not an ancestor of the new bookmark's dataset" zfs bookmark "$DATASET@$TESTSNAP" "$DATASET_TWO#$TESTBM" +log_must zfs destroy "$DATASET_TWO" + +# Verify that we can create bookmarks of snapshots on the pool dataset +log_must zfs snapshot "$TESTPOOL@$TESTSNAP" +log_must zfs bookmark "$TESTPOOL@$TESTSNAP" "$TESTPOOL#$TESTBM" +log_must zfs destroy "$TESTPOOL#$TESTBM" +log_must zfs destroy "$TESTPOOL@$TESTSNAP" + +# +# Bookmark copying tests +# + +# create the source bookmark +log_must zfs bookmark "$DATASET@$TESTSNAP" "$DATASET#$TESTBM" + +# Verify we can copy a bookmark by specifying the source bookmark +# and new bookmark full paths. 
+log_must eval "bkmarkexists $DATASET#$TESTBM" +log_must zfs bookmark "$DATASET#$TESTBM" "$DATASET#$TESTBMCOPY" +log_must eval "bkmarkexists $DATASET#$TESTBMCOPY" +## validate destroy once (should be truly independent bookmarks) +log_must zfs destroy "$DATASET#$TESTBM" +log_mustnot eval "bkmarkexists $DATASET#$TESTBM" +log_must eval "bkmarkexists $DATASET#$TESTBMCOPY" +log_must zfs destroy "$DATASET#$TESTBMCOPY" +log_mustnot eval "bkmarkexists $DATASET#$TESTBMCOPY" +log_mustnot eval "bkmarkexists $DATASET#$TESTBM" +## recreate the source bookmark +log_must zfs bookmark "$DATASET@$TESTSNAP" "$DATASET#$TESTBM" + +# Verify we can copy a bookmark specifying the short source name +log_must zfs bookmark "#$TESTBM" "$DATASET#$TESTBMCOPY" +log_must eval "bkmarkexists $DATASET#$TESTBMCOPY" +log_must zfs destroy "$DATASET#$TESTBMCOPY" + +# Verify we can copy a bookmark specifying the short bookmark name +log_must zfs bookmark "$DATASET#$TESTBM" "#$TESTBMCOPY" +log_must eval "bkmarkexists $DATASET#$TESTBMCOPY" +log_must zfs destroy "$DATASET#$TESTBMCOPY" + +# Verify two short paths are not allowed, and test empty paths +log_mustnot zfs bookmark "#$TESTBM" "#$TESTBMCOPY" +log_mustnot zfs bookmark "#$TESTBM" "#" +log_mustnot zfs bookmark "#" "#$TESTBMCOPY" +log_mustnot zfs bookmark "#" "#" +log_mustnot zfs bookmark "#" "" +log_mustnot zfs bookmark "" "#" +log_mustnot zfs bookmark "" "" + +# Verify that we can copy bookmarks on another origin filesystem +log_must zfs clone "$DATASET@$TESTSNAP" "$DATASET_TWO" +log_must zfs bookmark "$DATASET#$TESTBM" "$DATASET_TWO#$TESTBMCOPY" +log_must zfs destroy "$DATASET_TWO" + +# Verify that we cannot copy bookmarks onto another non-origin filesystem +log_must zfs create "$DATASET_TWO" +log_mustnot_expect "source is not an ancestor of the new bookmark's dataset" zfs bookmark "$DATASET#$TESTBM" "$DATASET_TWO#$TESTBMCOPY" +log_must zfs destroy "$DATASET_TWO" + +# Verify that we can copy bookmarks on the pool dataset +log_must zfs snapshot 
"$TESTPOOL@$TESTSNAP" +log_must zfs bookmark "$TESTPOOL@$TESTSNAP" "$TESTPOOL#$TESTBM" +log_must zfs bookmark "$TESTPOOL#$TESTBM" "$TESTPOOL#$TESTBMCOPY" +log_must zfs destroy "$TESTPOOL#$TESTBM" +log_must zfs destroy "$TESTPOOL#$TESTBMCOPY" +log_must zfs destroy "$TESTPOOL@$TESTSNAP" + +# Verify that copied 'normal' bookmarks are independent of the source bookmark +log_must zfs bookmark "$DATASET#$TESTBM" "$DATASET#$TESTBMCOPY" +log_must zfs destroy "$DATASET#$TESTBM" +log_must eval "zfs send $DATASET@$TESTSNAP > $TEST_BASE_DIR/zfstest_datastream.$$" +log_must eval "destroy_dataset $TESTPOOL/$TESTFS/recv" +log_must eval "zfs recv -o mountpoint=none $TESTPOOL/$TESTFS/recv < $TEST_BASE_DIR/zfstest_datastream.$$" +log_must zfs snapshot "$DATASET@$TESTSNAP2" +log_must eval "zfs send -i \#$TESTBMCOPY $DATASET@$TESTSNAP2 > $TEST_BASE_DIR/zfstest_datastream.$$" +log_must eval "zfs recv $TESTPOOL/$TESTFS/recv < $TEST_BASE_DIR/zfstest_datastream.$$" +# cleanup +log_must eval "destroy_dataset $DATASET@$TESTSNAP2" +log_must zfs destroy "$DATASET#$TESTBMCOPY" +log_must zfs bookmark "$DATASET@$TESTSNAP" "$DATASET#$TESTBM" + +# Verify that copied redaction bookmarks are independent of the source bookmark +## create redaction bookmark +log_must zfs destroy "$DATASET#$TESTBM" +log_must zfs destroy "$DATASET@$TESTSNAP" +log_must eval "echo secret > $TESTDIR/secret" +log_must zfs snapshot "$DATASET@$TESTSNAP" +log_must eval "echo redacted > $TESTDIR/secret" +log_must zfs snapshot "$DATASET@$TESTSNAP2" # TESTSNAP2 is the redaction snapshot +log_must zfs list -t all -o name,createtxg,guid,mountpoint,written +log_must zfs redact "$DATASET@$TESTSNAP" "$TESTBM" "$DATASET@$TESTSNAP2" +# ensure our primitive for testing whether a bookmark is a redaction bookmark works +log_must eval "zfs get all $DATASET#$TESTBM | grep redact_snaps" +## copy the redaction bookmark +log_must zfs bookmark "$DATASET#$TESTBM" "#$TESTBMCOPY" +log_mustnot eval "zfs get all $DATASET#$TESTBMCOPY | grep 
redact_snaps" +log_must eval "zfs send --redact "$TESTBMCOPY" -i $DATASET@$TESTSNAP $DATASET@$TESTSNAP2 2>&1 | head -n 100 | grep 'not a redaction bookmark'" +# try the above again after destroying the source bookmark, preventive measure for future work +log_must zfs destroy "$DATASET#$TESTBM" +log_mustnot eval "zfs get all $DATASET#$TESTBMCOPY | grep redact_snaps" +log_must eval "zfs send --redact "$TESTBMCOPY" -i $DATASET@$TESTSNAP $DATASET@$TESTSNAP2 2>&1 | head -n 100 | grep 'not a redaction bookmark'" +## cleanup +log_must eval "destroy_dataset $DATASET@$TESTSNAP2" +log_must zfs destroy "$DATASET#$TESTBMCOPY" +log_must eval "destroy_dataset $DATASET@$TESTSNAP" +log_must zfs snapshot "$DATASET@$TESTSNAP" +log_must zfs bookmark "$DATASET@$TESTSNAP" "$DATASET#$TESTBM" + +log_pass "'zfs bookmark' works as expected" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile.am index 7c67e7239b..72d6e4700e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile.am @@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zfs_change-key.ksh \ zfs_change-key_child.ksh \ + zfs_change-key_clones.ksh \ zfs_change-key_inherit.ksh \ zfs_change-key_format.ksh \ zfs_change-key_load.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key.ksh index 781caae5b5..821abdeb32 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key.ksh @@ -40,7 +40,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_child.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_child.ksh index dda7c1df43..592f1eccca 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_child.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_child.ksh @@ -28,13 +28,15 @@ # STRATEGY: # 1. Create an encrypted dataset # 2. Create an encrypted child dataset -# 3. Attempt to change the key without any flags -# 4. Attempt to change the key specifying keylocation -# 5. Attempt to change the key specifying keyformat -# 6. Verify the new encryption root can unload and load its key -# 7. Recreate the child dataset -# 8. Attempt to change the key specifying both the keylocation and keyformat -# 9. Verify the new encryption root can unload and load its key +# 3. Create an unencrypted child dataset +# 4. Attempt to change the key without any flags +# 5. Attempt to change the key specifying keylocation +# 6. Attempt to change the key specifying keyformat +# 7. Verify the new encryption root can unload and load its key +# 8. Recreate the child dataset +# 9. Attempt to change the key specifying both the keylocation and keyformat +# 10. Verify the new encryption root can unload and load its key +# 11. 
Verify the unencrypted child is still accessible normally # verify_runnable "both" @@ -42,7 +44,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup @@ -53,6 +55,7 @@ log_assert "'zfs change-key' should promote an encrypted child to an" \ log_must eval "echo $PASSPHRASE1 | zfs create -o encryption=on" \ "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS1" log_must zfs create $TESTPOOL/$TESTFS1/child +log_must zfs create -o encryption=off $TESTPOOL/$TESTFS1/child2 log_mustnot eval "echo $PASSPHRASE2 | zfs change-key" \ "$TESTPOOL/$TESTFS1/child" @@ -82,5 +85,7 @@ log_must key_unavailable $TESTPOOL/$TESTFS1/child log_must eval "echo $PASSPHRASE2 | zfs load-key $TESTPOOL/$TESTFS1/child" log_must key_available $TESTPOOL/$TESTFS1/child +log_must zfs unmount $TESTPOOL/$TESTFS1/child2 +log_must zfs mount $TESTPOOL/$TESTFS1/child2 log_pass "'zfs change-key' promotes an encrypted child to an encryption root" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh new file mode 100755 index 0000000000..70a9df618e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# 'zfs change-key' should correctly update encryption roots with clones. +# +# STRATEGY: +# 1. Create an encrypted dataset +# 2. Create an encryption root child of the first dataset +# 3. Clone the child encryption root twice +# 4. Add inheriting children to the encryption root and each of the clones +# 5. Verify the encryption roots +# 6. Have the child encryption root inherit from its parent +# 7. Verify the encryption root for all datasets is now the parent dataset +# + +verify_runnable "both" + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -Rf +} + +log_onexit cleanup + +log_assert "'zfs change-key' should correctly update encryption " \ + "roots with clones" + +log_must eval "echo $PASSPHRASE1 | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS1" +log_must eval "echo $PASSPHRASE2 | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS1/child" +log_must zfs snapshot $TESTPOOL/$TESTFS1/child@1 +log_must zfs clone $TESTPOOL/$TESTFS1/child@1 $TESTPOOL/$TESTFS1/clone1 +log_must zfs clone $TESTPOOL/$TESTFS1/child@1 $TESTPOOL/$TESTFS1/clone2 +log_must zfs create $TESTPOOL/$TESTFS1/child/A +log_must zfs create $TESTPOOL/$TESTFS1/clone1/B +log_must zfs create $TESTPOOL/$TESTFS1/clone2/C + +log_must verify_encryption_root $TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/child $TESTPOOL/$TESTFS1/child +log_must verify_encryption_root $TESTPOOL/$TESTFS1/clone1 $TESTPOOL/$TESTFS1/child +log_must verify_encryption_root $TESTPOOL/$TESTFS1/clone2 $TESTPOOL/$TESTFS1/child +log_must verify_encryption_root $TESTPOOL/$TESTFS1/child/A $TESTPOOL/$TESTFS1/child +log_must verify_encryption_root $TESTPOOL/$TESTFS1/clone1/B $TESTPOOL/$TESTFS1/child +log_must verify_encryption_root 
$TESTPOOL/$TESTFS1/clone2/C $TESTPOOL/$TESTFS1/child + +log_must zfs change-key -i $TESTPOOL/$TESTFS1/child + +log_must verify_encryption_root $TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/child $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/clone1 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/clone2 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/child/A $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/clone1/B $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/clone2/C $TESTPOOL/$TESTFS1 + +log_pass "'zfs change-key' correctly updates encryption roots with clones" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_format.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_format.ksh index 6344b8d05a..22212d72d1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_format.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_format.ksh @@ -43,7 +43,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh index 94820c37ec..e9b010e912 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh @@ -42,7 +42,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_load.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_load.ksh index 4ed4aadfe0..a5a9976196 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_load.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_load.ksh @@ -38,7 +38,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_location.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_location.ksh index 5cbe34b269..607e2208ce 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_location.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_location.ksh @@ -40,7 +40,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh index b1672248be..224fabf226 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh @@ -52,7 +52,7 @@ function verify_pbkdf2iters function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile.am index 0a6c2eb850..06099c0c2b 100644 --- 
a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile.am @@ -13,4 +13,5 @@ dist_pkgdata_SCRIPTS = \ zfs_clone_009_neg.ksh \ zfs_clone_010_pos.ksh \ zfs_clone_encrypted.ksh \ - zfs_clone_deeply_nested.ksh + zfs_clone_deeply_nested.ksh \ + zfs_clone_rm_nested.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh index b83ccdf48c..e6ffa26c02 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh @@ -100,11 +100,11 @@ function setup_all function cleanup_all { for fs in $targets; do - datasetexists $fs && log_must zfs destroy -f $fs + datasetexists $fs && destroy_dataset $fs -f done for snap in $SNAPFS $SNAPFS1 ; do - snapexists $snap && log_must zfs destroy -Rf $snap + snapexists $snap && destroy_dataset $snap -Rf done return 0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh index 8e69a7adcc..96eb3ea48d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh @@ -60,14 +60,10 @@ function setup_all function cleanup_all { - if datasetexists $TESTPOOL/notexist ; then - log_must zfs destroy -rRf $TESTPOOL/notexist - fi + datasetexists $TESTPOOL/notexist && destroy_dataset $TESTPOOL/notexist -rRf for snap in $SNAPFS $SNAPFS1 ; do - if snapexists $snap ; then - log_must zfs destroy -Rf $snap - fi + snapexists $snap && destroy_dataset $snap -Rf done return 0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh index 5222757598..6484de9c91 100755 --- 
a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh @@ -48,9 +48,7 @@ verify_runnable "both" function cleanup { - if snapexists $SNAPFS ; then - log_must zfs destroy -Rf $SNAPFS - fi + snapexists $SNAPFS && destroy_dataset $SNAPFS -Rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh index 8d86f55018..1c4c579f26 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh @@ -48,9 +48,7 @@ verify_runnable "both" function cleanup { - if snapexists $SNAPFS ; then - log_must zfs destroy -Rf $SNAPFS - fi + snapexists $SNAPFS && destroy_dataset $SNAPFS -Rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh index afa8b46a6f..6f17b17673 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh @@ -48,9 +48,7 @@ verify_runnable "global" function cleanup { - if snapexists $SNAPFS1 ; then - log_must zfs destroy -Rf $SNAPFS1 - fi + snapexists $SNAPFS1 && destroy_dataset $SNAPFS1 -Rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh index 2127eb117b..f2f7a5bcd0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh @@ -49,9 +49,7 @@ verify_runnable "global" function cleanup { - if snapexists $SNAPFS1 ; then - log_must_busy zfs destroy -Rf $SNAPFS1 - fi + 
snapexists $SNAPFS1 && destroy_dataset $SNAPFS1 -Rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh index 6fba72b580..4bfb3d5f78 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh @@ -47,9 +47,7 @@ verify_runnable "both" function cleanup { - if snapexists $SNAPFS ; then - log_must zfs destroy -Rf $SNAPFS - fi + snapexists $SNAPFS && destroy_dataset $SNAPFS -Rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh index 8e306fd445..2f2b0ca18d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh @@ -48,9 +48,7 @@ verify_runnable "both" function cleanup { - if snapexists $SNAPFS ; then - log_must zfs destroy -Rf $SNAPFS - fi + snapexists $SNAPFS && destroy_dataset $SNAPFS -Rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh index 030c6af7ae..6cdf5717fa 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh @@ -48,9 +48,7 @@ verify_runnable "global" function cleanup { - if snapexists $SNAPFS1 ; then - log_must zfs destroy -Rf $SNAPFS1 - fi + snapexists $SNAPFS1 && destroy_dataset $SNAPFS1 -Rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh index 40cabf649d..13f5418d4b 100755 --- 
a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh @@ -39,8 +39,8 @@ function local_cleanup typeset -i i=1 for ds in $datasets; do datasetexists $ds/$TESTCLONE.$i && \ - log_must zfs destroy -rf $ds/$TESTCLONE.$i - datasetexists $ds && log_must zfs destroy -Rf $ds + destroy_dataset $ds/$TESTCLONE.$i -rf + datasetexists $ds && destroy_dataset $ds -Rf ((i=i+1)) done } @@ -143,33 +143,29 @@ datasets="$TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS1/$TESTFS2 typeset -a d_clones typeset -a deferred_snaps typeset -i i -i=1 log_must setup_ds log_note "Verify zfs clone property for multiple clones" names=$(zfs list -rt all -o name $TESTPOOL) log_must verify_clones 3 0 -log_note "verfify clone property for clone deletion" +log_note "verify clone property for clone deletion" i=1 for ds in $datasets; do log_must zfs destroy $ds/$TESTCLONE.$i ((i=i+1)) done names=$(zfs list -rt all -o name $TESTPOOL) -i=1 log_must verify_clones 2 1 log_must local_cleanup log_must setup_ds log_note "verify zfs deferred destroy on clones property" -i=1 names=$(zfs list -rt all -o name $TESTPOOL) for ds in $datasets; do log_must zfs destroy -d $ds@snap deferred_snaps=( "${deferred_snaps[@]}" "$ds@snap" ) - ((i=i+1)) done log_must verify_clones 3 0 @@ -206,17 +202,14 @@ for ds in $datasets; do done names=$(zfs list -rt all -o name,clones $TESTPOOL) log_must verify_clones 3 1 $TESTCLONE -i=1 for ds in $datasets; do log_must zfs promote $ds - ((i=i+1)) done log_must local_cleanup log_note "verify clone list truncated correctly" -typeset -i j=200 -i=1 fs=$TESTPOOL/$TESTFS1 +xs=""; for i in {1..200}; do xs+="x"; done if is_linux; then ZFS_MAXPROPLEN=4096 else @@ -224,10 +217,8 @@ else fi log_must zfs create $fs log_must zfs snapshot $fs@snap -while((i <= $(( ZFS_MAXPROPLEN/200+1 )))); do - log_must zfs clone $fs@snap $fs/$TESTCLONE$(python -c 'print "x" * 200').$i - ((i=i+1)) - ((j=j+200)) +for (( i = 1; i <= 
(ZFS_MAXPROPLEN / 200 + 1); i++ )); do + log_must zfs clone ${fs}@snap ${fs}/${TESTCLONE}${xs}.${i} done clone_list=$(zfs list -o clones $fs@snap) char_count=$(echo "$clone_list" | tail -1 | wc | awk '{print $3}') diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh index 86f335bde2..1f07b9eb03 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh @@ -44,9 +44,9 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -f datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh new file mode 100755 index 0000000000..447fbb36b4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# When a snapshot is destroyed, we used to recurse all clones +# that are downstream of the destroyed snapshot (e.g. to remove +# its key and merge its deadlist entries to the previous one). 
+# This recursion would break the stack on deeply nested clone +# hierarchies. To avoid this problem today, we keep heap-allocated +# records of all the clones as we traverse their hierarchy. +# +# This test ensures and showcases that our new method works with +# deeply nested clone hierarchies. +# +# STRATEGY: +# 1. Create an fs and take a snapshot of it (snapshot foo) +# 2. Take a second snapshot of the same fs (snapshot bar) on +# top of snapshot foo +# 3. Create a clone of snapshot bar and then take a snapshot +# of it. +# 4. Create a clone of the newly-created snapshot and then +# take a snapshot of it. +# 5. Repeat step [4] many times to create a deeply nested hierarchy. +# 6. Destroy snapshot foo. +# + +verify_runnable "both" + +typeset FS0=$TESTPOOL/0 +typeset FOO=foo +typeset BAR=BAR + +typeset FS0SNAPFOO=$FS0@$FOO +typeset FS0SNAPBAR=$FS0@$BAR + +typeset -i numds=300 + +log_must zfs create $FS0 + +function test_cleanup +{ + log_must zfs destroy -Rf $FS0 + + return 0 +} + +log_must zfs snapshot $FS0SNAPFOO +log_must zfs snapshot $FS0SNAPBAR + +log_onexit test_cleanup + +for (( i=1; i/dev/null 2>&1 - log_must mount -F ufs -o rw $vol_b_path $mntp - elif [[ $type == "ext2" ]]; then - log_must echo y | newfs $vol_r_path >/dev/null 2>&1 + case "$type" in + "ext2") + if is_freebsd; then + log_unsupported "ext2 test not implemented for freebsd" + fi + log_must eval "new_fs $vol_b_path >/dev/null 2>&1" log_must mount -o rw $vol_b_path $mntp - else - log_must zpool create $TESTPOOL1 $vol_b_path + ;; + "ufs") + if is_linux; then + log_unsupported "ufs test not implemented for linux" + fi + log_must eval "new_fs $vol_b_path >/dev/null 2>&1" + log_must mount $vol_b_path $mntp + ;; + "zfs") + if is_freebsd; then + # Pool creation on zvols is forbidden by default. + # Save and restore the current setting. + typeset _saved=$(get_tunable VOL_RECURSIVE) + log_must set_tunable64 VOL_RECURSIVE 1 # Allow + zpool create $TESTPOOL1 $vol_b_path + typeset _zpool_create_result=$? 
+ log_must set_tunable64 VOL_RECURSIVE $_saved # Restore + log_must test $_zpool_create_result = 0 + else + log_must zpool create $TESTPOOL1 $vol_b_path + fi log_must zfs create $TESTPOOL1/$TESTFS1 - fi + ;; + *) + log_unsupported "$type test not implemented" + ;; + esac - ((nfilesize = copy * ${FILESIZE%m})) - pre_used=$(get_used_prop $vol) + ((nfilesize = copies * ${FILESIZE%m})) + pre_used=$(get_prop used $vol) ((target_size = pre_used + nfilesize)) - if [[ $type == "ufs" ]]; then - log_must mkfile $FILESIZE $mntp/$FILE - elif [[ $type == "ext2" ]]; then - log_must mkfile $FILESIZE $mntp/$FILE - else + if [[ $type == "zfs" ]]; then log_must mkfile $FILESIZE /$TESTPOOL1/$TESTFS1/$FILE + else + log_must mkfile $FILESIZE $mntp/$FILE fi - post_used=$(get_used_prop $vol) - while ((post_used < target_size)) ; do + post_used=$(get_prop used $vol) + ((retries = 0)) + while ((post_used < target_size && retries++ < 42)); do sleep 1 - post_used=$(get_used_prop $vol) + post_used=$(get_prop used $vol) done ((used = post_used - pre_used)) if ((used < nfilesize)); then log_fail "The space is not charged correctly while setting" \ - "copies as $copy" + "copies as $copies ($used < $nfilesize)" \ + "pre=${pre_used} post=${post_used}" fi - if [[ $type == "ufs" ]]; then - umount $mntp - elif [[ $type == "ext2" ]]; then - umount $mntp - else + if [[ $type == "zfs" ]]; then log_must zpool destroy $TESTPOOL1 + else + log_must umount $mntp fi log_must zfs destroy $vol diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh index 2ed881a367..672692b59e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh @@ -49,9 +49,7 @@ function cleanup typeset ds for ds in $fs1 $fs2 $vol1 $vol2; do - if datasetexists $ds; then - log_must zfs destroy $ds - fi + datasetexists 
$ds && destroy_dataset $ds done } @@ -94,13 +92,13 @@ for val in 1 2 3; do fi for ds in $fs2 $vol2; do cmp_prop $ds $val2 - log_must zfs destroy $ds + destroy_dataset $ds block_device_wait done done for ds in $fs1 $vol1; do - log_must zfs destroy $ds + destroy_dataset $ds block_device_wait done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh index a5a9729dc1..b644fcae3c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh @@ -50,9 +50,8 @@ function cleanup typeset val for val in 1 2 3; do - if datasetexists $TESTPOOL/fs_$val; then - log_must zfs destroy $TESTPOOL/fs_$val - fi + datasetexists $TESTPOOL/fs_$val && \ + destroy_dataset $TESTPOOL/fs_$val done } @@ -76,12 +75,12 @@ sync log_note "Verify 'zfs list' can correctly list the space charged." fsize=${FILESIZE%[m|M]} for val in 1 2 3; do - used=$(get_used_prop $TESTPOOL/fs_$val) + used=$(get_prop used $TESTPOOL/fs_$val) check_used $used $val done log_note "Verify 'ls -s' can correctly list the space charged." -if is_linux; then +if is_linux || is_freebsd; then blksize=1024 else blksize=512 @@ -92,18 +91,27 @@ for val in 1 2 3; do check_used $used $val done -log_note "Verify df(1M) can corectly display the space charged." +log_note "Verify df(1) can correctly display the space charged." 
for val in 1 2 3; do - used=`df -F zfs -k /$TESTPOOL/fs_$val/$FILE | grep $TESTPOOL/fs_$val \ - | awk '{print $3}'` - (( used = used * 1024 )) # kb -> bytes + if is_freebsd; then + used=`df -m /$TESTPOOL/fs_$val | grep $TESTPOOL/fs_$val \ + | awk -v fs=fs_$val '$4 ~ fs {print $3}'` + else + used=`df -F zfs -k /$TESTPOOL/fs_$val/$FILE | grep $TESTPOOL/fs_$val \ + | awk '{print $3}'` + (( used = used * 1024 )) # kb -> bytes + fi check_used $used $val done log_note "Verify du(1) can correctly display the space charged." for val in 1 2 3; do - used=`du -k /$TESTPOOL/fs_$val/$FILE | awk '{print $1}'` - (( used = used * 1024 )) # kb -> bytes + if is_freebsd; then + used=`du -h /$TESTPOOL/fs_$val/$FILE | awk '{print $1}'` + else + used=`du -k /$TESTPOOL/fs_$val/$FILE | awk '{print $1}'` + (( used = used * 1024 )) # kb -> bytes + fi check_used $used $val done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh index 98420cb7f2..94e72bce4e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh @@ -51,9 +51,7 @@ function cleanup destroy_pool $TESTPOOL1 fi - if datasetexists $vol; then - log_must zfs destroy $vol - fi + datasetexists $vol && destroy_dataset $vol } log_assert "Verify that ZFS volume space used by multiple copies is charged correctly." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh index 5946bf5967..6dc9306b33 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh @@ -51,9 +51,7 @@ function cleanup log_must umount $mntp fi - if datasetexists $vol; then - log_must zfs destroy $vol - fi + datasetexists $vol && destroy_dataset $vol if [[ -d $mntp ]]; then rm -rf $mntp @@ -70,8 +68,8 @@ if [[ ! -d $mntp ]]; then mkdir -p $mntp fi -for val in 1 2 3; do - do_vol_test $NEWFS_DEFAULT_FS $val $mntp +for copies in 1 2 3; do + do_vol_test $NEWFS_DEFAULT_FS $copies $mntp done log_pass "The volume space used by multiple copies is charged correctly as expected. " diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am index a36d021614..7515753c1b 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am @@ -17,7 +17,10 @@ dist_pkgdata_SCRIPTS = \ zfs_create_013_pos.ksh \ zfs_create_014_pos.ksh \ zfs_create_encrypted.ksh \ - zfs_create_crypt_combos.ksh + zfs_create_crypt_combos.ksh \ + zfs_create_dryrun.ksh \ + zfs_create_nomount.ksh \ + zfs_create_verbose.ksh dist_pkgdata_DATA = \ properties.kshlib \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/properties.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_create/properties.kshlib index 00b5ad8bd7..4130ba4463 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/properties.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/properties.kshlib @@ -61,9 +61,13 @@ set -A FS_ONLY_PROP "quota=536870912" \ "devices=off" \ "exec=off" \ "setuid=off" \ - "zoned=on" \ "snapdir=visible" \ "canmount=off" \ 
"version=1" +if is_freebsd; then + FS_ONLY_PROP+=("jailed=on") +else + FS_ONLY_PROP+=("zoned=on") +fi set -A VOL_ONLY_PROP "volblocksize=16384" "volsize=536870912" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create.cfg b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create.cfg index b96908ce12..785d5a0016 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create.cfg @@ -54,6 +54,12 @@ export VOL_LIMIT_KEYWORD1="1TB on 32-bit" export VOL_LIMIT_KEYWORD2="value is too large" export VOL_LIMIT_KEYWORD3="volume size exceeds limit" -set -A size "8k" "8K" "1m" "1M" "1mb" "1mB" "1Mb" "1MB" "1g" "1G" \ +set -A size "8k" "8K" "35K" "1m" "1M" "1mb" "1mB" "1Mb" "1MB" "1g" "1G" \ "1p" "1P" "1z" "1Z" "1gb" "1gB" "1Gb" "1GB" "1pb" "1pB" "1Pb" \ "1PB" "1zb" "1zB" "1Zb" "1ZB" + +# If a datasize has a volume size that is not a multiple of the blocksize, +# explicitly check that its size has been rounded up to the nearest multiple +# The volume with the exact size must exist in the "size" array above +set -A explicit_size_check "35K" +set -A expected_rounded_size "49152" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh index 0e580a8474..f74b2c9816 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh @@ -48,9 +48,11 @@ function cleanup typeset -i i=0 while (( $i < ${#datasets[*]} )); do datasetexists ${datasets[$i]} && \ - log_must zfs destroy -f ${datasets[$i]} + destroy_dataset ${datasets[$i]} -f ((i = i + 1)) done + + zfs destroy -f "$TESTPOOL/with a space" } log_onexit cleanup @@ -68,4 +70,8 @@ while (( $i < ${#datasets[*]} )); do ((i = i + 1)) done +log_must zfs create "$TESTPOOL/with a space" +log_must zfs unmount 
"$TESTPOOL/with a space" +log_must zfs mount "$TESTPOOL/with a space" + log_pass "'zfs create ' works as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_002_pos.ksh index 6f36b40bfd..0218e2e16b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_002_pos.ksh @@ -31,6 +31,7 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create.cfg +. $STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib # # DESCRIPTION: @@ -39,6 +40,8 @@ # STRATEGY: # 1. Create a volume in the storage pool. # 2. Verify the volume is created correctly. +# 3. Verify that the volume created has its volsize rounded to the nearest +# multiple of the blocksize (in this case, the default blocksize) # verify_runnable "global" @@ -76,6 +79,15 @@ while (( $j < ${#size[*]} )); do fi ((j = j + 1)) - done + +typeset -i j=0 +while (( $j < ${#explicit_size_check[*]} )); do + propertycheck ${TESTPOOL}/${TESTVOL}${explicit_size_check[j]} \ + volsize=${expected_rounded_size[j]} || \ + log_fail "volsize ${explicit_size_check[j]} was not rounded up" + + ((j = j + 1)) +done + log_pass "'zfs create -s -V ' works as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_003_pos.ksh index 2906e32dab..120de10281 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_003_pos.ksh @@ -46,8 +46,7 @@ verify_runnable "both" function cleanup { - datasetexists $vol && \ - log_must zfs destroy -f $vol + datasetexists $vol && destroy_dataset $vol -f } log_assert "Verify creating volume with specified blocksize works." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_004_pos.ksh index 200b4a0915..9e69366c87 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_004_pos.ksh @@ -48,8 +48,7 @@ verify_runnable "both" function cleanup { - datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + datasetexists $TESTPOOL/$TESTFS1 && destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup @@ -65,7 +64,7 @@ while (( $i < ${#RW_FS_PROP[*]} )); do log_fail "zfs create $TESTPOOL/$TESTFS1 fail." propertycheck $TESTPOOL/$TESTFS1 ${RW_FS_PROP[i]} || \ log_fail "${RW_FS_PROP[i]} is failed to set." - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + log_must_busy zfs destroy -f $TESTPOOL/$TESTFS1 (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_005_pos.ksh index e953c65ded..98cf70938e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_005_pos.ksh @@ -49,7 +49,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_006_pos.ksh index 2a664a4246..551ae78cd2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_006_pos.ksh @@ -50,7 +50,7 @@ verify_runnable "global" function cleanup { datasetexists $TESTPOOL/$TESTVOL1 && \ - log_must zfs destroy -f 
$TESTPOOL/$TESTVOL1 + destroy_dataset $TESTPOOL/$TESTVOL1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh index 08ede0592f..a905e50dfa 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh @@ -46,9 +46,8 @@ verify_runnable "both" function cleanup { - if datasetexists $TESTPOOL/$TESTFS1 ; then - log_must zfs destroy -f $TESTPOOL/$TESTFS1 - fi + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup @@ -74,7 +73,6 @@ set -A args "ab" "-?" "-cV" "-Vc" "-c -V" "c" "V" "--c" "-e" "-s" \ "-o readonly=ON" "-o reADOnly=off" "-o rdonly=OFF" "-o rdonly=aaa" \ "-o readonly=ON -V $VOLSIZE" "-o reADOnly=off -V $VOLSIZE" \ "-o rdonly=OFF -V $VOLSIZE" "-o rdonly=aaa -V $VOLSIZE" \ - "-o zoned=ON" "-o ZoNed=off" "-o zoned=aaa" \ "-o snapdIR=hidden" "-o snapdir=VISible" "-o snapdir=aaa" \ "-o aclmode=DIScard" "-o aclmODE=groupmask" "-o aclmode=aaa" \ "-o aclinherit=deny" "-o aclinHerit=secure" "-o aclinherit=aaa" \ @@ -88,13 +86,26 @@ set -A args "ab" "-?" "-cV" "-Vc" "-c -V" "c" "V" "--c" "-e" "-s" \ "-o compressratio=1.00x" "-o compressratio=1.00x -V $VOLSIZE" \ "-o version=0" "-o version=1.234" "-o version=10K" "-o version=-1" \ "-o version=aaa" "-o version=999" +if is_freebsd; then + args+=("-o jailed=ON" "-o JaiLed=off" "-o jailed=aaa") +else + args+=("-o zoned=ON" "-o ZoNed=off" "-o zoned=aaa") +fi log_assert "'zfs create' should return an error with badly-formed parameters." typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do - log_mustnot zfs create ${args[i]} $TESTPOOL/$TESTFS1 - log_mustnot zfs create -p ${args[i]} $TESTPOOL/$TESTFS1 + typeset arg=${args[i]} + if is_freebsd; then + # FreeBSD does not strictly validate share options (yet). 
+ if [[ "$arg" == "-o sharenfs="* ]]; then + ((i = i + 1)) + continue + fi + fi + log_mustnot zfs create $arg $TESTPOOL/$TESTFS1 + log_mustnot zfs create -p $arg $TESTPOOL/$TESTFS1 ((i = i + 1)) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh index b8190626c7..63f5e595ea 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh @@ -90,7 +90,9 @@ set -A args "$TESTPOOL/" "$TESTPOOL//blah" "$TESTPOOL/@blah" \ "$TESTPOOL/blah*blah" "$TESTPOOL/blah blah" \ "-s $TESTPOOL/$TESTFS1" "-b 1092 $TESTPOOL/$TESTFS1" \ "-b 64k $TESTPOOL/$TESTFS1" "-s -b 32k $TESTPOOL/$TESTFS1" \ - "$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT" + "$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT" \ + "$TESTPOOL/." "$TESTPOOL/.." "$TESTPOOL/../blah" "$TESTPOOL/./blah" \ + "$TESTPOOL/blah/./blah" "$TESTPOOL/blah/../blah" log_assert "Verify 'zfs create ' fails with bad argument." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_011_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_011_pos.ksh index 0144b050d7..c5012d4f34 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_011_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_011_pos.ksh @@ -33,7 +33,7 @@ # # DESCRIPTION: -# 'zfs create -p' should work as expecteed +# 'zfs create -p' should work as expected # # STRATEGY: # 1. 
To create $newdataset with -p option, first make sure the upper level @@ -48,9 +48,8 @@ verify_runnable "both" function cleanup { - if datasetexists $TESTPOOL/$TESTFS1 ; then - log_must zfs destroy -rf $TESTPOOL/$TESTFS1 - fi + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_012_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_012_pos.ksh index d8aa064077..a0b8d52f0c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_012_pos.ksh @@ -48,9 +48,8 @@ verify_runnable "both" function cleanup { - if datasetexists $TESTPOOL/$TESTFS1 ; then - log_must zfs destroy -rf $TESTPOOL/$TESTFS1 - fi + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -rf } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh index d1a8153d60..2482a68dc0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh @@ -43,8 +43,7 @@ TESTFS2=$(for i in $(seq $((255 - ${#TESTPOOL}))); do echo z ; done | tr -d '\n' function cleanup { - datasetexists $TESTPOOL/$TESTFS1 && - log_must zfs destroy $TESTPOOL/$TESTFS1 + datasetexists $TESTPOOL/$TESTFS1 && destroy_dataset $TESTPOOL/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh index a46cb55f36..758b800c2f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh 
@@ -38,7 +38,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup @@ -53,7 +53,7 @@ set -A ENCRYPTION_ALGS \ "encryption=aes-256-gcm" set -A ENCRYPTION_PROPS \ - "encryption=aes-256-ccm" \ + "encryption=aes-256-gcm" \ "encryption=aes-128-ccm" \ "encryption=aes-192-ccm" \ "encryption=aes-256-ccm" \ @@ -89,7 +89,7 @@ while (( i < ${#ENCRYPTION_ALGS[*]} )); do propertycheck $TESTPOOL/$TESTFS1 ${KEYFORMATS[j]} || \ log_fail "failed to set ${KEYFORMATS[j]}" - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + log_must_busy zfs destroy -f $TESTPOOL/$TESTFS1 (( j = j + 1 )) done (( i = i + 1 )) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_dryrun.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_dryrun.ksh new file mode 100755 index 0000000000..703ae8043d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_dryrun.ksh @@ -0,0 +1,168 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib + +# +# DESCRIPTION: +# zfs create -n should perform basic sanity checking but should never create a +# dataset. If -v and/or -P are used, it should be verbose about what would be +# created if sanity checks pass. +# +# STRATEGY: +# 1. Attempt to create a file system and a volume using various combinations of +# -n with -v and -P.
+# + +verify_runnable "both" + +# +# Verifies that valid commands with -n and without -[vP]: +# - succeed +# - do not create a dataset +# - do not generate output +# +function dry_create_no_output +{ + typeset -a cmd=(zfs create -n "$@") + + log_note "$0: ${cmd[@]}" + log_must "${cmd[@]}" + datasetexists "$TESTPOOL/$TESTFS1" && + log_fail "$TESTPOOL/$TESTFS1 unexpectedly created by '${cmd[@]}'" + typeset out=$("${cmd[@]}" 2>&1) + [[ -z "$out" ]] || + log_fail "unexpected output '$out' from '${cmd[@]}'" +} + +# +# Verifies that commands with invalid properties or invalid property values +# - fail +# - do not create a dataset +# - generate a message on stderr +# +function dry_create_error +{ + typeset -a cmd=(zfs create -n "$@") + + log_note "$0: ${cmd[@]}" + log_mustnot "${cmd[@]}" + datasetexists "$TESTPOOL/$TESTFS1" && + log_fail "$TESTPOOL/$TESTFS1 unexpectedly created by '${cmd[@]}'" + typeset out=$("${cmd[@]}" 2>&1 >/dev/null) + [[ -z "$out" ]] && + log_fail "expected an error message but got none from '${cmd[@]}'" +} + +# +# Verifies that dry-run commands with parseable output +# - succeed +# - do not create datasets +# - generate parseable output on stdout +# - output matches expectations +# +function dry_create_parseable +{ + typeset -n exp=$1 + shift + typeset -a cmd=(zfs create -Pn "$@") + typeset ds=${cmd[${#cmd[@]} - 1]} + typeset out + typeset -a toks + typeset -a props + typeset found_create=false + + log_note "$0: ${cmd[@]}" + out=$("${cmd[@]}") + (( $? 
== 0 )) || + log_fail "unexpected failure getting stdout from '${cmd[@]}'" + datasetexists "$TESTPOOL/$TESTFS1" && + log_fail "$TESTPOOL/$TESTFS1 unexpectedly created by '${cmd[@]}'" + echo "$out" | while IFS=$'\t' read -A toks; do + log_note "verifying ${toks[@]}" + case ${toks[0]} in + create) + log_must test "${#toks[@]}" -eq 2 + log_must test "${toks[1]}" == "$ds" + found_create="yes, I found create" + ;; + property) + log_must test "${#toks[@]}" -eq 3 + typeset prop=${toks[1]} + typeset val=${toks[2]} + if [[ -z "${exp[$prop]}" ]]; then + log_fail "unexpectedly got property '$prop'" + fi + # We may not know the exact value a property will take + # on. This is the case for at least refreservation. + if [[ ${exp[$prop]} != "*" ]]; then + log_must test "${exp[$prop]}" == "$val" + fi + unset exp[$prop] + ;; + *) + log_fail "Unexpected line ${toks[@]}" + ;; + esac + done + + log_must test "$found_create" == "yes, I found create" + log_must test "extra props: ${!exp[@]}" == "extra props: " +} + +function cleanup +{ + datasetexists "$TESTPOOL/$TESTFS1" && \ + destroy_dataset "$TESTPOOL/$TESTFS1" -r +} +log_onexit cleanup + +log_assert "zfs create -n creates nothing but can describe what would be" \ + "created" + +# Typical creations should succeed +dry_create_no_output "$TESTPOOL/$TESTFS1" +dry_create_no_output -V 10m "$TESTPOOL/$TESTFS1" +# It shouldn't do a space check right now +dry_create_no_output -V 100t "$TESTPOOL/$TESTFS1" +# It shouldn't create parent datasets either +dry_create_no_output -p "$TESTPOOL/$TESTFS1/$TESTFS2" +dry_create_no_output -pV 10m "$TESTPOOL/$TESTFS1/$TESTFS2" + +# Various invalid properties should be recognized and result in an error +dry_create_error -o nosuchprop=42 "$TESTPOOL/$TESTFS1" +dry_create_error -b 1234 -V 10m "$TESTPOOL/$TESTFS1" + +# Parseable output should be parseable. 
+typeset -A expect +expect=([compression]=on) +dry_create_parseable expect -o compression=on "$TESTPOOL/$TESTFS1" + +# Sparse volumes should not get a gratuitous refreservation +expect=([volblocksize]=4096 [volsize]=$((1024 * 1024 * 10))) +dry_create_parseable expect -b 4k -V 10m -s "$TESTPOOL/$TESTFS1" + +# Non-sparse volumes should have refreservation +expect=( + [volblocksize]=4096 + [volsize]=$((1024 * 1024 * 10)) + [refreservation]="*" +) +dry_create_parseable expect -b 4k -V 10m "$TESTPOOL/$TESTFS1" + +log_pass "zfs create -n creates nothing but can describe what would be" \ + "created" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh index 9d5ecab0df..e32545c689 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh @@ -51,10 +51,10 @@ # yes unspec 0 1 no no keyformat specified # yes unspec 1 0 yes new encryption root, crypt inherited # yes unspec 1 1 yes new encryption root, crypt inherited -# yes off 0 0 no unencrypted child of encrypted parent -# yes off 0 1 no unencrypted child of encrypted parent -# yes off 1 0 no unencrypted child of encrypted parent -# yes off 1 1 no unencrypted child of encrypted parent +# yes off 0 0 yes unencrypted child of encrypted parent +# yes off 0 1 no keylocation given, but crypt off +# yes off 1 0 no keyformat given, but crypt off +# yes off 1 1 no keyformat given, but crypt off # yes on 0 0 yes inherited encryption, local crypt # yes on 0 1 no no keyformat specified for new key # yes on 1 0 yes new encryption root @@ -70,9 +70,9 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + 
destroy_dataset $TESTPOOL/$TESTFS2 -r } log_onexit cleanup @@ -113,7 +113,9 @@ log_must eval "echo $PASSPHRASE | zfs create -o keyformat=passphrase" \ log_must eval "echo $PASSPHRASE | zfs create -o keyformat=passphrase" \ "-o keylocation=prompt $TESTPOOL/$TESTFS2/c4" -log_mustnot zfs create -o encryption=off $TESTPOOL/$TESTFS2/c5 +log_must zfs create -o encryption=off $TESTPOOL/$TESTFS2/c5 +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS2/c5)" == "off" + log_mustnot zfs create -o encryption=off -o keylocation=prompt \ $TESTPOOL/$TESTFS2/c5 log_mustnot zfs create -o encryption=off -o keyformat=passphrase \ @@ -122,13 +124,13 @@ log_mustnot zfs create -o encryption=off -o keyformat=passphrase \ -o keylocation=prompt $TESTPOOL/$TESTFS2/c5 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ - "$TESTPOOL/$TESTFS2/c5" + "$TESTPOOL/$TESTFS2/c6" log_mustnot zfs create -o encryption=on -o keylocation=prompt \ - $TESTPOOL/$TESTFS2/c6 + $TESTPOOL/$TESTFS2/c7 log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ - "-o keyformat=passphrase $TESTPOOL/$TESTFS2/c6" + "-o keyformat=passphrase $TESTPOOL/$TESTFS2/c7" log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ - "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2/c7" + "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2/c8" log_pass "ZFS creates datasets only if they have a valid combination of" \ "encryption properties set." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_nomount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_nomount.ksh new file mode 100755 index 0000000000..e1fbbe63ad --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_nomount.ksh @@ -0,0 +1,51 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# zfs create -u should leave the new file system unmounted. +# It should not work for a volume. +# +# STRATEGY: +# 1. Create a file system using -u and make sure the file system is not mounted. +# 2. Do it for a volume to verify it fails. +# + +verify_runnable "both" + +function cleanup +{ + local ds + + for ds in "$fs" "$vol"; do + datasetexists "$ds" && destroy_dataset "$ds" + done +} +log_onexit cleanup + +log_assert "zfs create -u leaves the new file system unmounted" + +typeset fs="$TESTPOOL/$TESTFS1" +typeset vol="$TESTPOOL/$TESTVOL1" + +log_must create_dataset "$fs" "-u" +log_mustnot ismounted "$fs" + +log_mustnot zfs create -V $VOLSIZE -u "$vol" + +log_pass "zfs create -u leaves the new file system unmounted" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_verbose.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_verbose.ksh new file mode 100755 index 0000000000..acab500062 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_verbose.ksh @@ -0,0 +1,164 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib +.
$STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib + +# +# DESCRIPTION: +# zfs create -P without -n should be verbose about dataset creation. +# +# STRATEGY: +# 1. Attempt to create a file system and a volume using various properties +# and -P +# 2. Exercise the combination of -p and -P. +# + +verify_runnable "both" + +# +# Verifies that non dry-run commands with parseable output +# - succeed +# - create datasets +# - generate parseable output on stdout +# - output matches expectations +# +function dry_create_parseable +{ + typeset -n exp=$1 + shift + typeset -a cmd=(zfs create -P "$@") + typeset ds=${cmd[${#cmd[@]} - 1]} + typeset out + typeset -a toks + typeset -a props + typeset found_create=false + typeset create_ancestors= + typeset opt + + # Parse the arguments to see if -p was used. + while getopts :PV:b:ospv opt; do + case $opt in + p) create_ancestors=needed ;; + *) continue ;; + esac + done + + log_note "$0: ${cmd[@]}" + out=$("${cmd[@]}") + (( $? == 0 )) || + log_fail "unexpected failure getting stdout from '${cmd[@]}'" + datasetexists "$TESTPOOL/$TESTFS1" || + log_fail "$TESTPOOL/$TESTFS1 unexpectedly created by '${cmd[@]}'" + echo "$out" | while IFS=$'\t' read -A toks; do + log_note "verifying ${toks[@]}" + case ${toks[0]} in + create_ancestors) + case "$create_ancestors" in + needed) + log_must test "${toks[1]}" == "$ds" + create_ancestors="found ${toks[1]}" + ;; + found*) + log_fail "multiple ancestor creation" \ + "$create_ancestors and ${toks[1]}" + ;; + "") + log_fail "unexpected create_ancestors" + ;; + *) + log_fail "impossible error: fix the test" + ;; + esac + ;; + create) + log_must test "${#toks[@]}" -eq 2 + log_must test "${toks[1]}" == "$ds" + found_create="yes, I found create" + ;; + property) + log_must test "${#toks[@]}" -eq 3 + typeset prop=${toks[1]} + typeset val=${toks[2]} + if [[ -z "${exp[$prop]}" ]]; then + log_fail "unexpectedly got property '$prop'" + fi + # We may not know the exact value a property will 
take + # on. This is the case for at least refreservation. + if [[ ${exp[$prop]} != "*" ]]; then + log_must test "${exp[$prop]}" == "$val" + fi + unset exp[$prop] + ;; + *) + log_fail "Unexpected line ${toks[@]}" + ;; + esac + done + + log_must test "$found_create" == "yes, I found create" + log_must test "extra props: ${!exp[@]}" == "extra props: " + + case "$create_ancestors" in + "") + log_must_busy zfs destroy "$ds" + ;; + "found $ds") + log_must_busy zfs destroy -r "$(echo "$ds" | cut -d/ -f1-2)" + ;; + needed) + log_fail "Expected but did not find create_ancestors" + ;; + *) + log_fail "Unexpected value for create_ancestors:" \ + "$create_ancestors" + ;; + esac +} + +function cleanup +{ + datasetexists "$TESTPOOL/$TESTFS1" && \ + destroy_dataset "$TESTPOOL/$TESTFS1" -r +} +log_onexit cleanup + +log_assert "zfs create -v creates datasets verbosely" + +# Parseable output should be parseable. +typeset -A expect +expect=([compression]=on) +dry_create_parseable expect -o compression=on "$TESTPOOL/$TESTFS1" + +# Ancestor creation with -p should emit relevant line +expect=([compression]=on) +dry_create_parseable expect -p -o compression=on "$TESTPOOL/$TESTFS1" +expect=([compression]=on) +dry_create_parseable expect -p -o compression=on "$TESTPOOL/$TESTFS1/$TESTVOL" + +# Sparse volumes should not get a gratuitous refreservation +expect=([volblocksize]=4096 [volsize]=$((1024 * 1024 * 10))) +dry_create_parseable expect -b 4k -V 10m -s "$TESTPOOL/$TESTFS1" + +# Non-sparse volumes should have refreservation +expect=( + [volblocksize]=4096 + [volsize]=$((1024 * 1024 * 10)) + [refreservation]="*" +) +dry_create_parseable expect -b 4k -V 10m "$TESTPOOL/$TESTFS1" + +log_pass "zfs create -v creates datasets verbosely" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am index 183578df5d..664f3d81ae 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am +++ 
b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am @@ -2,6 +2,9 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_destro dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ + zfs_clone_livelist_condense_and_disable.ksh \ + zfs_clone_livelist_condense_races.ksh \ + zfs_clone_livelist_dedup.ksh \ zfs_destroy_001_pos.ksh \ zfs_destroy_002_pos.ksh \ zfs_destroy_003_pos.ksh \ @@ -17,7 +20,10 @@ dist_pkgdata_SCRIPTS = \ zfs_destroy_013_neg.ksh \ zfs_destroy_014_pos.ksh \ zfs_destroy_015_pos.ksh \ - zfs_destroy_016_pos.ksh + zfs_destroy_016_pos.ksh \ + zfs_destroy_clone_livelist.ksh \ + zfs_destroy_dev_removal.ksh \ + zfs_destroy_dev_removal_condense.ksh dist_pkgdata_DATA = \ zfs_destroy_common.kshlib \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh new file mode 100755 index 0000000000..ab506debe9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh @@ -0,0 +1,125 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify zfs destroy test for clones with the livelist feature +# enabled. + +# STRATEGY +# 1. Clone where livelist is condensed +# - create clone, write several files, delete those files +# - check that the number of livelist entries decreases +# after the delete +# 2. Clone where livelist is deactivated +# - create clone, write files. 
Delete those files and the +# file in the filesystem when the snapshot was created +# so the clone and snapshot no longer share data +# - check that the livelist is destroyed + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib + +function cleanup +{ + log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 + # reset the livelist sublist size to the original value + set_tunable64 LIVELIST_MAX_ENTRIES $ORIGINAL_MAX + # reset the minimum percent shared to the original value + set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN +} + +function check_ll_len +{ + string="$(zdb -vvvvv $TESTPOOL | grep "Livelist")" + substring="$1" + msg=$2 + if test "${string#*$substring}" != "$string"; then + return 0 # $substring is in $string + else + log_note $string + log_fail "$msg" # $substring is not in $string + fi +} + +function test_condense +{ + # set the max livelist entries to a small value to more easily + # trigger a condense + set_tunable64 LIVELIST_MAX_ENTRIES 20 + # set a small percent shared threshold so the livelist is not disabled + set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10 + clone_dataset $TESTFS1 snap $TESTCLONE + + # sync between each write to make sure a new entry is created + for i in {0..4}; do + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i + log_must zpool sync $TESTPOOL + done + + check_ll_len "5 entries" "Unexpected livelist size" + + # sync between each write to allow for a condense of the previous entry + for i in {0..4}; do + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i + log_must zpool sync $TESTPOOL + done + + check_ll_len "6 entries" "Condense did not occur" + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +function test_deactivated +{ + # Threshold set to 50 percent + set_tunable32 LIVELIST_MIN_PERCENT_SHARED 50 + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0 + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1 + log_must
zpool sync $TESTPOOL + # snapshot and clone share 'atestfile', 33 percent + check_livelist_gone + log_must zfs destroy -R $TESTPOOL/$TESTCLONE + + # Threshold set to 20 percent + set_tunable32 LIVELIST_MIN_PERCENT_SHARED 20 + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0 + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1 + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE2 + log_must zpool sync $TESTPOOL + # snapshot and clone share 'atestfile', 25 percent + check_livelist_exists $TESTCLONE + log_must rm /$TESTPOOL/$TESTCLONE/atestfile + # snapshot and clone share no files + check_livelist_gone + log_must zfs destroy -R $TESTPOOL/$TESTCLONE +} + +ORIGINAL_MAX=$(get_tunable LIVELIST_MAX_ENTRIES) +ORIGINAL_MIN=$(get_tunable LIVELIST_MIN_PERCENT_SHARED) + +log_onexit cleanup +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap +test_condense +test_deactivated + +log_pass "Clone's livelist condenses and disables as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh new file mode 100755 index 0000000000..453b502416 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh @@ -0,0 +1,117 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. 
+# + +# DESCRIPTION +# Test race conditions for livelist condensing + +# STRATEGY +# These tests exercise code paths that deal with a livelist being +# simultaneously condensed and deactivated (deleted, exported or disabled). +# If a variable is set, the zthr will pause until it is cancelled or waited +# and then a counter variable keeps track of whether or not the code path is +# reached. + +# 1. Deletion race: repeatedly overwrite the same file to trigger condense +# and then delete the clone. +# 2. Disable race: Overwrite enough files to trigger condenses and disabling of +# the livelist. +# 3. Export race: repeatedly overwrite the same file to trigger condense and +# then export the pool. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib + +function cleanup +{ + log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 + # reset the livelist sublist size to the original value + set_tunable64 LIVELIST_MAX_ENTRIES $ORIGINAL_MAX + # reset the condense tests to 0 + set_tunable32 LIVELIST_CONDENSE_ZTHR_PAUSE 0 + set_tunable32 LIVELIST_CONDENSE_SYNC_PAUSE 0 +} + +function delete_race +{ + set_tunable32 "$1" 0 + log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE + for i in {1..5}; do + log_must zpool sync $TESTPOOL + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out + done + log_must zfs destroy $TESTPOOL/$TESTCLONE + log_must zpool sync $TESTPOOL + [[ "1" == "$(get_tunable "$1")" ]] || \ + log_fail "delete/condense race test failed" +} + +function export_race +{ + set_tunable32 "$1" 0 + log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE + for i in {1..5}; do + log_must zpool sync $TESTPOOL + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out + done + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + [[ "1" == "$(get_tunable "$1")" ]] || \ + log_fail "export/condense race test failed" + log_must zfs destroy $TESTPOOL/$TESTCLONE +} + +function disable_race +{ + set_tunable32 "$1" 0 + 
log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE + for i in {1..5}; do + log_must zpool sync $TESTPOOL + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out + done + # overwrite the file shared with the origin to trigger disable + log_must mkfile 100m /$TESTPOOL/$TESTCLONE/atestfile + log_must zpool sync $TESTPOOL + [[ "1" == "$(get_tunable "$1")" ]] || \ + log_fail "disable/condense race test failed" + log_must zfs destroy $TESTPOOL/$TESTCLONE +} + +ORIGINAL_MAX=$(get_tunable LIVELIST_MAX_ENTRIES) + +log_onexit cleanup + +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 100m /$TESTPOOL/$TESTFS1/atestfile +log_must zpool sync $TESTPOOL +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap + +# Reduce livelist size to trigger condense more easily +set_tunable64 LIVELIST_MAX_ENTRIES 20 + +# Test cancellation path in the zthr +set_tunable32 LIVELIST_CONDENSE_ZTHR_PAUSE 1 +set_tunable32 LIVELIST_CONDENSE_SYNC_PAUSE 0 +disable_race LIVELIST_CONDENSE_ZTHR_CANCEL +delete_race LIVELIST_CONDENSE_ZTHR_CANCEL +export_race LIVELIST_CONDENSE_ZTHR_CANCEL + +# Test cancellation path in the synctask +set_tunable32 LIVELIST_CONDENSE_ZTHR_PAUSE 0 +set_tunable32 LIVELIST_CONDENSE_SYNC_PAUSE 1 +disable_race LIVELIST_CONDENSE_SYNC_CANCEL +delete_race LIVELIST_CONDENSE_SYNC_CANCEL + +log_pass "Clone livelist condense race conditions passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh new file mode 100755 index 0000000000..5f356967a4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify zfs destroy test for clones with livelists that contain +# dedup blocks. This test is a baseline regression test created +# to ensure that past bugs that we've encountered between dedup +# and the livelist logic don't resurface. + +# STRATEGY +# 1. Create a clone from a test filesystem and enable dedup. +# 2. Write some data and create a livelist. +# 3. Copy the data within the clone to create dedup blocks. +# 4. Remove some of the dedup data to create multiple free +# entries for the same block pointers. +# 5. Process all the livelist entries by destroying the clone. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib + +function cleanup +{ + log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 + # Reset the minimum percent shared to 75 + set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED +} + +function test_dedup +{ + # Set a small percent shared threshold so the livelist is not disabled + set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10 + clone_dataset $TESTFS1 snap $TESTCLONE + + # Enable dedup + log_must zfs set dedup=on $TESTPOOL/$TESTCLONE + + # Create some data to be deduped + log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k + + # Create dedup blocks + # Note: We sync before and after so all dedup blocks belong to the + # same TXG, otherwise they won't look identical to the livelist + # iterator due to their logical birth TXG being different. 
+ log_must zpool sync $TESTPOOL + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0 + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1 + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2 + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3 + log_must zpool sync $TESTPOOL + check_livelist_exists $TESTCLONE + + # Introduce "double frees" + # We want to introduce consecutive FREEs of the same block as this + # was what triggered past panics. + # Note: Similarly to the previous step we sync before and after + # our deletions so all the entries end up in the same TXG. + log_must zpool sync $TESTPOOL + log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2 + log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3 + log_must zpool sync $TESTPOOL + check_livelist_exists $TESTCLONE + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED) + +log_onexit cleanup +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap +test_dedup + +log_pass "Clone's livelist processes dedup blocks as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh index 534c33f0a0..11157e93c7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh @@ -53,7 +53,7 @@ verify_runnable "both" # run 'zfs destroy $opt '. 3rd, check the system status. # # $1 option of 'zfs destroy' -# $2 dataset will be destroied. +# $2 dataset will be destroyed. # function test_n_check { @@ -77,9 +77,7 @@ function test_n_check fi # Clean the test environment and make it clear. 
- if datasetexists $CTR; then - log_must zfs destroy -Rf $CTR - fi + datasetexists $CTR && destroy_dataset $CTR -Rf # According to option create test compatible environment. case $opt in diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh index 04e9713124..8b7e59b412 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh @@ -50,8 +50,7 @@ verify_runnable "both" function cleanup { for obj in $ctr2 $ctr1 $ctr; do - datasetexists $obj && \ - log_must zfs destroy -Rf $obj + datasetexists $obj && destroy_dataset $obj -Rf done for mntp in $TESTDIR1 $TESTDIR2; do @@ -142,14 +141,14 @@ done log_note "Verify that 'zfs destroy -R' succeeds to destroy dataset " \ "with dependent clone outside it." -log_must zfs destroy -R $ctr1 +log_must_busy zfs destroy -R $ctr1 datasetexists $ctr1 && \ log_fail "'zfs destroy -R' fails to destroy dataset with clone outside it." log_note "Verify that 'zfs destroy -r' succeeds to destroy dataset " \ "without dependent clone outside it." -log_must zfs destroy -r $ctr +log_must_busy zfs destroy -r $ctr datasetexists $ctr && \ log_fail "'zfs destroy -r' fails to destroy dataset with clone outside it." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh index 3db1331ff5..9a2ff6bea3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh @@ -49,15 +49,11 @@ function cleanup { cd $olddir - datasetexists $clone && \ - log_must zfs destroy -f $clone - - snapexists $snap && \ - log_must zfs destroy -f $snap + datasetexists $clone && destroy_dataset $clone -f + snapexists $snap && destroy_dataset $snap -f for fs in $fs1 $fs2; do - datasetexists $fs && \ - log_must zfs destroy -f $fs + datasetexists $fs && destroy_dataset $fs -f done for dir in $TESTDIR1 $TESTDIR2; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh index 2e4a0c3b2b..1c5b2cf1c7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh @@ -145,8 +145,8 @@ if is_global_zone; then check_dataset datasetexists $CTR $VOL check_dataset datasetnonexists $VOLSNAP $VOLCLONE - # Due to recusive destroy being a best-effort operation, - # all of the non-busy datasets bellow should be gone now. + # Due to recursive destroy being a best-effort operation, + # all of the non-busy datasets below should be gone now. 
check_dataset datasetnonexists $FS $FSSNAP $FSCLONE fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh index 70ad45af04..57eb736fd8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh @@ -51,8 +51,8 @@ function cleanup if datasetexists $clonesnap; then log_must zfs promote $fs fi - datasetexists $clone && log_must zfs destroy $clone - datasetexists $fssnap && log_must zfs destroy $fssnap + datasetexists $clone && destroy_dataset $clone + datasetexists $fssnap && destroy_dataset $fssnap } log_assert "Destroy dataset which is namespace-parent of origin should failed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh index df7cfcf527..e150cddfa1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh @@ -24,7 +24,7 @@ # # DESCRIPTION: # 'zfs destroy -R ' can destroy all the child -# snapshots and preserves all the nested datasetss. +# snapshots and preserves all the nested datasets. # # STRATEGY: # 1. Create nested datasets in the storage pool. @@ -45,7 +45,7 @@ datasets="$TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS1/$TESTFS2 function cleanup { for ds in $datasets; do - datasetexists $ds && zfs destroy -rf $ds + datasetexists $ds && destroy_dataset $ds -rf done } @@ -57,7 +57,7 @@ for ds in $datasets; do datasetexists $ds || log_fail "Create $ds dataset fail." done -# create recursive nestedd snapshot +# create recursive nested snapshot log_must zfs snapshot -r $TESTPOOL/$TESTFS1@snap for ds in $datasets; do datasetexists $ds@snap || log_fail "Create $ds@snap snapshot fail." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh index f399ad2706..f1868f522c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh @@ -19,7 +19,7 @@ # snapshots from the same datasets # # STRATEGY -# 1. Create multiple snapshots for the same datset +# 1. Create multiple snapshots for the same dataset # 2. Run zfs destroy for these snapshots for a mix of valid and # invalid snapshot names # 3. Run zfs destroy for snapshots from different datasets and @@ -30,8 +30,8 @@ function cleanup { - datasetexists $TESTPOOL/$TESTFS1 && zfs destroy -R $TESTPOOL/$TESTFS1 - datasetexists $TESTPOOL/$TESTFS2 && zfs destroy -R $TESTPOOL/$TESTFS2 + datasetexists $TESTPOOL/$TESTFS1 && destroy_dataset $TESTPOOL/$TESTFS1 -R + datasetexists $TESTPOOL/$TESTFS2 && destroy_dataset $TESTPOOL/$TESTFS2 -R poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2 rm -rf $VIRTUAL_DISK } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh index 83cd0a27c3..93c8c63fd2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh @@ -30,18 +30,18 @@ function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -R $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -R datasetexists $TESTPOOL/$TESTVOL && \ - log_must zfs destroy -Rf $TESTPOOL/$TESTVOL + destroy_dataset $TESTPOOL/$TESTVOL -Rf } function setup_snapshots { for i in $snaps; do datasetexists $TESTPOOL/$TESTFS1@snap$i && \ - log_must zfs destroy $TESTPOOL/$TESTFS1@snap$i + destroy_dataset $TESTPOOL/$TESTFS1@snap$i datasetexists $TESTPOOL/$TESTVOL@snap$i && \ - 
log_must zfs destroy $TESTPOOL/$TESTVOL@snap$i + destroy_dataset $TESTPOOL/$TESTVOL@snap$i log_must zfs snapshot $TESTPOOL/$TESTFS1@snap$i log_must zfs snapshot $TESTPOOL/$TESTVOL@snap$i done @@ -157,7 +157,7 @@ verify_snapshots 1 snaps="1 2 3 4 5" setup_snapshots -log_note "Snapshot destory with hold" +log_note "Snapshot destroy with hold" range="1 2 3 4 5" for i in 1 2 3 4 5; do log_must zfs hold keep $TESTPOOL/$TESTFS1@snap$i diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh new file mode 100755 index 0000000000..e7663ef797 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh @@ -0,0 +1,164 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018, 2020 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify zfs destroy test for clones with the livelist feature +# enabled. + +# STRATEGY +# 1. One clone with an empty livelist +# - create the clone, check that livelist exists +# - delete the clone, check that livelist is eventually +# destroyed +# 2. One clone with populated livelist +# - create the clone, check that livelist exists +# - write multiple files to the clone +# - delete the clone, check that livelist is eventually +# destroyed +# 3. Multiple clones with empty livelists +# - same as 1. but with multiple clones +# 4. Multiple clones with populated livelists +# - same as 2. but with multiple clones +# 5. Clone of clone with populated livelists with promote + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && destroy_dataset $TESTPOOL/$TESTFS1 -R + # reset the livelist sublist size to its original value + set_tunable64 LIVELIST_MAX_ENTRIES $ORIGINAL_MAX +} + +function clone_write_file +{ + log_must mkfile 1m /$TESTPOOL/$1/$2 + log_must zpool sync $TESTPOOL +} + +function test_one_empty +{ + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +function test_one +{ + clone_dataset $TESTFS1 snap $TESTCLONE + + clone_write_file $TESTCLONE $TESTFILE0 + clone_write_file $TESTCLONE $TESTFILE1 + clone_write_file $TESTCLONE $TESTFILE2 + log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE0 + log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE2 + check_livelist_exists $TESTCLONE + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +function test_multiple_empty +{ + clone_dataset $TESTFS1 snap $TESTCLONE + clone_dataset $TESTFS1 snap $TESTCLONE1 + clone_dataset $TESTFS1 snap $TESTCLONE2 + + log_must zfs destroy $TESTPOOL/$TESTCLONE + log_must zfs destroy $TESTPOOL/$TESTCLONE1 + log_must zfs destroy $TESTPOOL/$TESTCLONE2 + check_livelist_gone +} + +function test_multiple +{ + clone_dataset $TESTFS1 snap $TESTCLONE + clone_dataset $TESTFS1 snap $TESTCLONE1 + clone_dataset $TESTFS1 snap $TESTCLONE2 + + clone_write_file $TESTCLONE $TESTFILE0 + + clone_write_file $TESTCLONE1 $TESTFILE0 + clone_write_file $TESTCLONE1 $TESTFILE1 + clone_write_file $TESTCLONE1 $TESTFILE2 + + clone_write_file $TESTCLONE2 $TESTFILE0 + log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE0 + clone_write_file $TESTCLONE2 $TESTFILE1 + log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE1 + + check_livelist_exists $TESTCLONE + check_livelist_exists $TESTCLONE1 + check_livelist_exists $TESTCLONE2 + + log_must zfs destroy $TESTPOOL/$TESTCLONE + log_must zfs destroy $TESTPOOL/$TESTCLONE1 + log_must 
zfs destroy $TESTPOOL/$TESTCLONE2 + check_livelist_gone +} + +function test_promote +{ + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must zfs promote $TESTPOOL/$TESTCLONE + check_livelist_gone + log_must zfs destroy -R $TESTPOOL/$TESTCLONE +} + +function test_clone_clone_promote +{ + log_must zfs create $TESTPOOL/fs + log_must dd if=/dev/zero of=/$TESTPOOL/fs/file bs=128k count=100 + log_must zfs snapshot $TESTPOOL/fs@snap + log_must zfs clone $TESTPOOL/fs@snap $TESTPOOL/clone + log_must dd if=/dev/zero of=/$TESTPOOL/clone/clonefile bs=128k count=10 + log_must zfs snapshot $TESTPOOL/clone@csnap + log_must zfs clone $TESTPOOL/clone@csnap $TESTPOOL/cloneclone + + check_livelist_exists clone + check_livelist_exists cloneclone + + # Promote should remove both clones' livelists + log_must zfs promote $TESTPOOL/cloneclone + check_livelist_gone + + # This destroy should not use a livelist + log_must zfs destroy $TESTPOOL/clone + log_must zdb -bcc $TESTPOOL +} + +ORIGINAL_MAX=$(get_tunable LIVELIST_MAX_ENTRIES) + +log_onexit cleanup +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 20m /$TESTPOOL/$TESTFS1/atestfile +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap + +# set a small livelist entry size to more easily test multiple entry livelists +set_tunable64 LIVELIST_MAX_ENTRIES 20 + +test_one_empty +test_one +test_multiple_empty +test_multiple +test_promote +test_clone_clone_promote + +log_pass "Clone with the livelist feature enabled could be destroyed," \ + "also could be promoted and destroyed as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib index 0a6f5ed9d1..1a20b7a331 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib @@ -25,7 +25,7 @@ # # -# Copyright (c) 2012, 2016 by Delphix. 
All rights reserved. +# Copyright (c) 2012, 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -56,17 +56,12 @@ function setup_testenv #[dtst] if ! datasetexists $FS; then log_must zfs create $FS fi - # Volume test is only availible on globle zone + # Volume test is only available on global zone if ! datasetexists $VOL && is_global_zone; then log_must zfs create -V $VOLSIZE $VOL block_device_wait - echo "y" | newfs $ZVOL_DEVDIR/$VOL > /dev/null 2>&1 - if (( $? == 0 )); then - log_note "SUCCESS: newfs $ZVOL_DEVDIR/$VOL>/dev/null" - else - log_fail "newfs $ZVOL_DEVDIR/$VOL > /dev/null" - fi + log_must new_fs $ZVOL_DEVDIR/$VOL if [[ ! -d $TESTDIR1 ]]; then log_must mkdir $TESTDIR1 @@ -107,9 +102,7 @@ function cleanup_testenv pkill mkbusy - if datasetexists $CTR; then - log_must zfs destroy -Rf $CTR - fi + datasetexists $CTR && destroy_dataset $CTR -Rf } # @@ -127,7 +120,7 @@ function check_dataset shift for dtst in "$@"; do - # Volume and related stuff are unvailable in local zone + # Volume and related stuff are unavailable in local zone if ! is_global_zone; then if [[ $dtst == $VOL || $dtst == $VOLSNAP || \ $dtst == $VOLCLONE ]] @@ -140,9 +133,40 @@ function check_dataset if (( ${#newlist} != 0 )); then # Run each item in $newlist individually so on failure, the - # probelmatic dataset is listed in the logs. + # problematic dataset is listed in the logs. for i in $newlist; do log_must $funname $i done fi } + +# Use zdb to see if a livelist exists for a given clone +# $1 clone name +function check_livelist_exists +{ + zdb -vvvvv $TESTPOOL/$1 | grep "Livelist" || \ + log_fail "zdb could not find Livelist" +} + +# Check that a livelist has been removed, waiting for deferred destroy entries +# to be cleared from zdb. +function check_livelist_gone +{ + log_must zpool wait -t free $TESTPOOL + zpool sync + zdb -vvvvv $TESTPOOL | grep "Livelist" && \ + log_fail "zdb found Livelist after the clone is deleted." 
+} + +# Create a clone in the testpool based on $TESTFS@snap. Verify that the clone +# was created and that it includes a livelist +# $1 fs name +# $2 snap name +# $3 clone name +function clone_dataset +{ + log_must zfs clone $TESTPOOL/$1@$2 $TESTPOOL/$3 + datasetexists $TESTPOOL/$3 || \ + log_fail "zfs clone $TESTPOOL/$3 fail." + check_livelist_exists $3 +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh new file mode 100755 index 0000000000..107c133196 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify that livelists tracking remapped blocks can be +# properly destroyed. + +# STRATEGY +# 1. Create a pool with disk1 and create a filesystem, snapshot +# and clone. Write several files to the clone. +# 2. Add disk2 to the pool and then remove disk1, triggering a +# remap of the blkptrs tracked in the livelist. +# 3. Delete the clone + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/removal/removal.kshlib + +function cleanup +{ + poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2 + [[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1 + [[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2 +} + +log_onexit cleanup + +VIRTUAL_DISK1=$TEST_BASE_DIR/disk1 +VIRTUAL_DISK2=$TEST_BASE_DIR/disk2 +log_must truncate -s $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1 +log_must truncate -s $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2 + +log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1 +log_must poolexists $TESTPOOL2 + +log_must zfs create $TESTPOOL2/$TESTFS +log_must mkfile 25m /$TESTPOOL2/$TESTFS/atestfile +log_must zfs snapshot $TESTPOOL2/$TESTFS@snap + +log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE + +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE0 +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE1 +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE2 + +log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2 +log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1 +wait_for_removal $TESTPOOL2 + +log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE0 +log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE1 + +log_must zfs destroy $TESTPOOL2/$TESTCLONE + +log_pass "Clone with the livelist feature and remapped blocks," \ + "can be destroyed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh new file mode 100755 index 0000000000..ab646daece --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify that livelists tracking remapped blocks can be +# properly condensed. + +# STRATEGY +# 1. Create a pool with disk1 and create a filesystem, snapshot +# and clone. Create two files for the first livelist entry and +# pause condensing. +# 2. Add disk2 to the pool and then remove disk1, triggering a +# remap of the blkptrs tracked in the livelist. +# 3. Overwrite the first file several times to trigger a condense, +# overwrite the second file once and resume condensing, now with +# extra blkptrs added during the remap +# 4. Check that the test added new ALLOC blkptrs mid-condense using +# a variable set in that code path + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy.cfg + +function cleanup +{ + poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2 + # reset livelist max size + set_tunable64 LIVELIST_MAX_ENTRIES $ORIGINAL_MAX + [[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1 + [[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2 +} + +log_onexit cleanup + +ORIGINAL_MAX=$(get_tunable LIVELIST_MAX_ENTRIES) +set_tunable64 LIVELIST_MAX_ENTRIES 20 + +VIRTUAL_DISK1=$TEST_BASE_DIR/disk1 +VIRTUAL_DISK2=$TEST_BASE_DIR/disk2 +log_must truncate -s $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1 +log_must truncate -s $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2 + +log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1 +log_must poolexists $TESTPOOL2 + +log_must zfs create $TESTPOOL2/$TESTFS +log_must mkfile 100m /$TESTPOOL2/$TESTFS/atestfile +log_must zfs snapshot $TESTPOOL2/$TESTFS@snap + +log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE + +# Create initial files and pause condense zthr on next execution +log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A +log_must mkfile 1m 
/$TESTPOOL2/$TESTCLONE/B +log_must zpool sync $TESTPOOL2 +set_tunable32 LIVELIST_CONDENSE_SYNC_PAUSE 1 + +# Add a new dev and remove the old one +log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2 +log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1 +wait_for_removal $TESTPOOL2 + +set_tunable32 LIVELIST_CONDENSE_NEW_ALLOC 0 +# Trigger a condense +log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A +log_must zpool sync $TESTPOOL2 +log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A +log_must zpool sync $TESTPOOL2 +# Write remapped blkptrs which will modify the livelist mid-condense +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B + +# Resume condense thr +set_tunable32 LIVELIST_CONDENSE_SYNC_PAUSE 0 +log_must zpool sync $TESTPOOL2 +# Check that we've added new ALLOC blkptrs during the condense +[[ "0" < "$(get_tunable LIVELIST_CONDENSE_NEW_ALLOC)" ]] || \ + log_fail "removal/condense test failed" + +log_must zfs destroy $TESTPOOL2/$TESTCLONE +log_pass "Clone with the livelist feature and remapped blocks," \ + "can be condensed." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/socket.c b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/socket.c index 2fe9de77ce..a8c814e7b5 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/socket.c +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/socket.c @@ -22,6 +22,7 @@ #include #include #include +#include /* ARGSUSED */ int diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh index c4b42afee4..7063bbe9ce 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh @@ -32,9 +32,7 @@ verify_runnable "both" function cleanup { for snap in $TESTSNAP1 $TESTSNAP2; do - if snapexists "$snap"; then - log_must zfs destroy "$snap" - fi + snapexists "$snap" && destroy_dataset "$snap" done } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh index 471e9ca68e..96e6d9b5ae 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh @@ -24,14 +24,15 @@ # 1. Create an encrypted dataset # 2. Create two snapshots of the dataset # 3. Perform 'zfs diff -Ft' and verify no errors occur +# 4. Perform the same test on a dataset with large dnodes # verify_runnable "both" function cleanup { - datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset "$TESTPOOL/$TESTFS1" "-r" + destroy_dataset "$TESTPOOL/$TESTFS2" "-r" } log_assert "'zfs diff' should work with encrypted datasets" @@ -50,4 +51,13 @@ log_must zfs snapshot $TESTPOOL/$TESTFS1@snap2 # 3. Perform 'zfs diff' and verify no errors occur log_must zfs diff -Ft $TESTPOOL/$TESTFS1@snap1 $TESTPOOL/$TESTFS1@snap2 +# 4. 
Perform the same test on a dataset with large dnodes +log_must eval "echo 'password' | zfs create -o dnodesize=4k \ + -o encryption=on -o keyformat=passphrase $TESTPOOL/$TESTFS2" +MNTPOINT="$(get_prop mountpoint $TESTPOOL/$TESTFS2)" +log_must zfs snapshot $TESTPOOL/$TESTFS2@snap1 +log_must touch "$MNTPOINT/file" +log_must zfs snapshot $TESTPOOL/$TESTFS2@snap2 +log_must zfs diff -Ft $TESTPOOL/$TESTFS2@snap1 $TESTPOOL/$TESTFS2@snap2 + log_pass "'zfs diff' works with encrypted datasets" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh index 55dd8b66f6..0d08cf6295 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh @@ -31,9 +31,7 @@ verify_runnable "both" function cleanup { for snap in $TESTSNAP1 $TESTSNAP2; do - if snapexists "$snap"; then - log_must zfs destroy "$snap" - fi + snapexists "$snap" && destroy_dataset "$snap" done find "$MNTPOINT" -type f -delete rm -f "$FILEDIFF" @@ -50,7 +48,7 @@ function create_random # while (( i < count )); do log_must touch "$fspath/file$i" - sleep $(random 3) + sleep $(random_int_between 1 3) (( i = i + 1 )) done } @@ -84,7 +82,7 @@ do continue; fi - filetime="$(stat -c '%Z' $file)" + filetime=$(stat_ctime $file) if [[ "$filetime" != "$ctime" ]]; then log_fail "Unexpected ctime for file $file ($filetime != $ctime)" else diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_types.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_types.ksh index 9c81084d13..8e521b9f5a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_types.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_types.ksh @@ -70,8 +70,13 @@ DATASET="$TESTPOOL/$TESTFS/fs" TESTSNAP1="$DATASET@snap1" TESTSNAP2="$DATASET@snap2" FILEDIFF="$TESTDIR/zfs-diff.txt" -MAJOR=$(stat -c 
%t /dev/null) -MINOR=$(stat -c %T /dev/null) +if is_freebsd; then + MAJOR=$(stat -f %Hr /dev/null) + MINOR=$(stat -f %Lr /dev/null) +else + MAJOR=$(stat -c %t /dev/null) + MINOR=$(stat -c %T /dev/null) +fi # 1. Prepare a dataset log_must zfs create $DATASET @@ -106,7 +111,11 @@ verify_object_class "$MNTPOINT/cdev" "C" # 2. | (Named pipe) log_must zfs snapshot "$TESTSNAP1" -log_must mknod "$MNTPOINT/fifo" p +if is_freebsd; then + log_must mkfifo "$MNTPOINT/fifo" +else + log_must mknod "$MNTPOINT/fifo" p +fi log_must zfs snapshot "$TESTSNAP2" verify_object_class "$MNTPOINT/fifo" "|" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh index 92d51944f2..deb501698a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh @@ -27,6 +27,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2021 Matt Fiddaman # . 
$STF_SUITE/tests/functional/cli_root/zfs_get/zfs_get_common.kshlib @@ -56,18 +57,27 @@ do ((i+=1)) done +typeset -r uint64_max="18446744073709551615" + typeset zfs_props=("type" used available creation volsize referenced \ compressratio mounted origin recordsize quota reservation mountpoint \ - sharenfs checksum compression atime devices exec readonly setuid zoned \ - snapdir acltype aclinherit canmount primarycache secondarycache \ + sharenfs checksum compression atime devices exec readonly setuid \ + snapdir aclinherit canmount primarycache secondarycache version \ usedbychildren usedbydataset usedbyrefreservation usedbysnapshots \ - version) - + filesystem_limit snapshot_limit filesystem_count snapshot_count) +if is_freebsd; then + typeset zfs_props_os=(jailed aclmode) +else + typeset zfs_props_os=(zoned acltype) +fi typeset userquota_props=(userquota@root groupquota@root userused@root \ groupused@root) -typeset all_props=("${zfs_props[@]}" "${userquota_props[@]}") +typeset all_props=("${zfs_props[@]}" \ + "${zfs_props_os[@]}" \ + "${userquota_props[@]}") typeset dataset=($TESTPOOL/$TESTCTR $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL \ - $TESTPOOL/$TESTFS@$TESTSNAP $TESTPOOL/$TESTVOL@$TESTSNAP) + $TESTPOOL/$TESTFS@$TESTSNAP $TESTPOOL/$TESTVOL@$TESTSNAP + $TESTPOOL/$TESTFS@$TESTSNAP1 $TESTPOOL/$TESTCLONE) typeset bookmark_props=(creation) typeset bookmark=($TESTPOOL/$TESTFS#$TESTBKMARK $TESTPOOL/$TESTVOL#$TESTBKMARK) @@ -93,10 +103,21 @@ function check_return_value while read line; do typeset item - item=$(echo $line | awk '{print $2}' 2>&1) + typeset value + item=$(echo $line | awk '{print $2}' 2>&1) if [[ $item == $p ]]; then ((found += 1)) + cols=$(echo $line | awk '{print NF}') + fi + + value=$(echo $line | awk '{print $3}' 2>&1) + if [[ $value == $uint64_max ]]; then + log_fail "'zfs get $opt $props $dst' return " \ + "UINT64_MAX constant." 
+ fi + + if ((found > 0)); then break fi done < $TESTDIR/$TESTFILE0 @@ -104,6 +125,9 @@ function check_return_value if ((found == 0)); then log_fail "'zfs get $opt $props $dst' return " \ "error message.'$p' haven't been found." + elif [[ "$opt" == "-p" ]] && ((cols != 4)); then + log_fail "'zfs get $opt $props $dst' returned " \ + "$cols columns instead of 4." fi done @@ -118,6 +142,10 @@ log_onexit cleanup create_snapshot $TESTPOOL/$TESTFS $TESTSNAP create_snapshot $TESTPOOL/$TESTVOL $TESTSNAP +# Create second snapshot and clone it +create_snapshot $TESTPOOL/$TESTFS $TESTSNAP1 +create_clone $TESTPOOL/$TESTFS@$TESTSNAP1 $TESTPOOL/$TESTCLONE + # Create filesystem and volume's bookmark create_bookmark $TESTPOOL/$TESTFS $TESTSNAP $TESTBKMARK create_bookmark $TESTPOOL/$TESTVOL $TESTSNAP $TESTBKMARK diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh index f49f58e8ce..c3746514ea 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh @@ -49,13 +49,19 @@ typeset options=(" " p r H) typeset zfs_props=("type" used available creation volsize referenced \ compressratio mounted origin recordsize quota reservation mountpoint \ - sharenfs checksum compression atime devices exec readonly setuid zoned \ - snapdir acltype aclinherit canmount primarycache secondarycache \ - usedbychildren usedbydataset usedbyrefreservation usedbysnapshots version) - + sharenfs checksum compression atime devices exec readonly setuid \ + snapdir aclinherit canmount primarycache secondarycache version \ + usedbychildren usedbydataset usedbyrefreservation usedbysnapshots) +if is_freebsd; then + typeset zfs_props_os=(jailed aclmode) +else + typeset zfs_props_os=(zoned acltype) +fi typeset userquota_props=(userquota@root groupquota@root userused@root \ groupused@root) -typeset props=("${zfs_props[@]}" 
"${userquota_props[@]}") +typeset props=("${zfs_props[@]}" \ + "${zfs_props_os[@]}" \ + "${userquota_props[@]}") typeset dataset=($TESTPOOL/$TESTCTR $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL \ $TESTPOOL/$TESTFS@$TESTSNAP $TESTPOOL/$TESTVOL@$TESTSNAP) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh index b038e7484a..3bc4c6240e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh @@ -47,11 +47,9 @@ function cleanup { [[ -e $propfile ]] && rm -f $propfile - datasetexists $clone && \ - log_must zfs destroy $clone + datasetexists $clone && destroy_dataset $clone for snap in $fssnap $volsnap ; do - snapexists $snap && \ - log_must zfs destroy $snap + snapexists $snap && destroy_dataset $snap done if [[ -n $globalzone ]] ; then @@ -64,8 +62,7 @@ function cleanup done else for fs in $TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3; do - datasetexists $fs && \ - log_must zfs destroy -rf $fs + datasetexists $fs && destroy_dataset $fs -rf done fi } @@ -114,7 +111,7 @@ availspace=$(get_prop available $TESTPOOL) typeset -i i=0 # make sure 'availspace' is larger then twice of FILESIZE to create a new pool. 
-# If any, we only totally create 3 pools for multple datasets testing to limit +# If any, we only totally create 3 pools for multiple datasets testing to limit # testing time while (( availspace > DFILESIZE )) && (( i < 3 )) ; do (( i += 1 )) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh index 2de640f871..510c54506d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh @@ -47,13 +47,19 @@ verify_runnable "both" typeset val_opts=(p r H) typeset v_props=(type used available creation volsize referenced compressratio \ mounted origin recordsize quota reservation mountpoint sharenfs checksum \ - compression atime devices exec readonly setuid zoned snapdir acltype \ + compression atime devices exec readonly setuid snapdir version \ aclinherit canmount primarycache secondarycache \ - usedbychildren usedbydataset usedbyrefreservation usedbysnapshots version) - + usedbychildren usedbydataset usedbyrefreservation usedbysnapshots) +if is_freebsd; then + typeset v_props_os=(jailed aclmode) +else + typeset v_props_os=(zoned acltype) +fi typeset userquota_props=(userquota@root groupquota@root userused@root \ groupused@root) -typeset val_props=("${v_props[@]}" "${userquota_props[@]}") +typeset val_props=("${v_props[@]}" \ + "${v_props_os[@]}" \ + "${userquota_props[@]}") set -f # Force shell does not parse '?' and '*' as the wildcard typeset inval_opts=(P R h ? *) typeset inval_props=(Type 0 ? 
* -on --on readonl time USED RATIO MOUNTED) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh index 29bd10d483..296fe99968 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh @@ -52,14 +52,19 @@ set -A options " " "-r" "-H" "-p" "-rHp" "-o name" \ set -A props type used available creation volsize referenced compressratio \ mounted origin recordsize quota reservation mountpoint sharenfs \ - checksum compression atime devices exec readonly setuid zoned snapdir \ - acltype aclinherit canmount primarycache secondarycache \ + checksum compression atime devices exec readonly setuid snapdir \ + aclinherit canmount primarycache secondarycache \ usedbychildren usedbydataset usedbyrefreservation usedbysnapshots \ userquota@root groupquota@root userused@root groupused@root +if is_freebsd; then + set -A props ${props[*]} jailed aclmode +else + set -A props ${props[*]} zoned acltype +fi zfs upgrade -v > /dev/null 2>&1 if [[ $? 
-eq 0 ]]; then - set -A all_props ${all_props[*]} version + set -A props ${props[*]} version fi set -A dataset $TESTPOOL/$TESTCTR $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh index 2d97c5918a..7fd6918b43 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh @@ -55,9 +55,14 @@ log_onexit depth_fs_cleanup set -A all_props type used available creation volsize referenced \ compressratio mounted origin recordsize quota reservation mountpoint \ sharenfs checksum compression atime devices exec readonly setuid \ - zoned snapdir acltype aclinherit canmount primarycache secondarycache \ + snapdir aclinherit canmount primarycache secondarycache \ usedbychildren usedbydataset usedbyrefreservation usedbysnapshots \ userquota@root groupquota@root userused@root groupused@root +if is_freebsd; then + set -A all_props ${all_props[*]} jailed aclmode +else + set -A all_props ${all_props[*]} zoned acltype +fi zfs upgrade -v > /dev/null 2>&1 if [[ $? 
-eq 0 ]]; then @@ -92,5 +97,16 @@ log_must eval "zfs get -H -t snapshot -o name creation $DEPTH_FS > $DEPTH_OUTPUT log_must eval "zfs get -H -t snapshot -d 1 -o name creation $DEPTH_FS > $EXPECT_OUTPUT" log_must diff $DEPTH_OUTPUT $EXPECT_OUTPUT +# Ensure 'zfs get -t snap' works as a shorthand for 'zfs get -t snapshot' +log_must eval "zfs get -H -t snap -d 1 -o name creation $DEPTH_FS > $DEPTH_OUTPUT" +log_must eval "zfs get -H -t snapshot -d 1 -o name creation $DEPTH_FS > $EXPECT_OUTPUT" +log_must diff $DEPTH_OUTPUT $EXPECT_OUTPUT + +# Ensure 'zfs get -t bookmark ' works as though -d 1 was specified +log_must eval "zfs get -H -t bookmark -o name creation $DEPTH_FS > $DEPTH_OUTPUT" +log_must eval "zfs get -H -t bookmark -d 1 -o name creation $DEPTH_FS > $EXPECT_OUTPUT" +log_must diff $DEPTH_OUTPUT $EXPECT_OUTPUT + + log_pass "'zfs get -d ' should get expected output." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_common.kshlib index d8cb9af028..9b4eecf371 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_common.kshlib @@ -26,6 +26,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2021 Matt Fiddaman # . $STF_SUITE/include/libtest.shlib @@ -87,8 +88,8 @@ function gen_option_str # $elements $prefix $separator $counter } # -# Cleanup the volume snapshot, filesystem snapshot, volume bookmark, and -# filesystem bookmark that were created for this test case. +# Cleanup the volume snapshot, filesystem snapshots, clone, volume bookmark, +# and filesystem bookmark that were created for this test case. 
# function cleanup { @@ -97,6 +98,11 @@ function cleanup datasetexists $TESTPOOL/$TESTFS@$TESTSNAP && \ destroy_snapshot $TESTPOOL/$TESTFS@$TESTSNAP + datasetexists $TESTPOOL/$TESTCLONE && \ + destroy_clone $TESTPOOL/$TESTCLONE + datasetexists $TESTPOOL/$TESTFS@$TESTSNAP1 && \ + destroy_snapshot $TESTPOOL/$TESTFS@$TESTSNAP1 + bkmarkexists $TESTPOOL/$TESTVOL#$TESTBKMARK && \ destroy_bookmark $TESTPOOL/$TESTVOL#$TESTBKMARK bkmarkexists $TESTPOOL/$TESTFS#$TESTBKMARK && \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_list_d.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_list_d.kshlib index 8ef8d9aa16..d5388e6ef2 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_list_d.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_list_d.kshlib @@ -37,7 +37,7 @@ set -A depth_options "d 0" "d 1" "d 2" "d 4" "d 32" set -A depth_array 0 1 2 4 32 # -# Setup multiple depths datasets, including fs, volume and snapshot. +# Setup multiple depths datasets, including fs, volumes, snapshots and bookmarks. 
# function depth_fs_setup { @@ -65,6 +65,7 @@ function depth_fs_setup log_must zfs create -V 8M $fs/vol_"$j"_depth"$i" fi log_must zfs snapshot $fs@snap_"$j"_depth"$i" + log_must zfs bookmark $fs@snap_"$j"_depth"$i" '#bookmark'_"$j"_depth"$i" (( j=j+1 )) done done @@ -77,7 +78,5 @@ function depth_fs_setup # function depth_fs_cleanup { - log_must zfs destroy -rR $DEPTH_FS + datasetexists $DEPTH_FS && destroy_dataset $DEPTH_FS -rR } - - diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/Makefile.am similarity index 61% rename from tests/zfs-tests/tests/functional/cli_root/zfs_remap/Makefile.am rename to tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/Makefile.am index 91abff68c7..5f5e385878 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/Makefile.am @@ -1,7 +1,5 @@ -pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_remap - +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_ids_to_path dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - zfs_remap_cliargs.ksh \ - zfs_remap_obsolete_counts.ksh + zfs_ids_to_path_001_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/cleanup.ksh new file mode 100755 index 0000000000..b5ff022172 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/cleanup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/setup.ksh new file mode 100755 index 0000000000..fd6f8f8bb0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/setup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_setup $DISKS diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh new file mode 100755 index 0000000000..563b3e00dd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh @@ -0,0 +1,96 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: Identify the objset id and the object id of a file in a +# filesystem, and verify that zfs_ids_to_path behaves correctly with them. +# +# STRATEGY: +# 1. Create a dataset +# 2. Makes files in the dataset +# 3. 
Verify that zfs_ids_to_path outputs the correct format for each one +# + +verify_runnable "both" + +function cleanup +{ + destroy_dataset $TESTPOOL/$TESTFS + zfs create -o mountpoint=$TESTDIR $TESTPOOL/$TESTFS +} + +function test_one +{ + typeset ds_id="$1" + typeset ds_path="$2" + typeset file_path="$3" + + typeset mntpnt=$(get_prop mountpoint $ds_path) + typeset file_id=$(ls -i /$mntpnt/$file_path | sed 's/ .*//') + typeset output=$(zfs_ids_to_path $TESTPOOL $ds_id $file_id) + [[ "$output" == "$mntpnt/$file_path" ]] || \ + log_fail "Incorrect output for non-verbose while mounted: $output" + output=$(zfs_ids_to_path -v $TESTPOOL $ds_id $file_id) + [[ "$output" == "$ds_path:/$file_path" ]] || \ + log_fail "Incorrect output for verbose while mounted: $output" + log_must zfs unmount $ds_path + output=$(zfs_ids_to_path $TESTPOOL $ds_id $file_id) + [[ "$output" == "$ds_path:/$file_path" ]] || \ + log_fail "Incorrect output for non-verbose while unmounted: $output" + output=$(zfs_ids_to_path -v $TESTPOOL $ds_id $file_id) + [[ "$output" == "$ds_path:/$file_path" ]] || \ + log_fail "Incorrect output for verbose while unmounted: $output" + log_must zfs mount $ds_path +} + +log_onexit cleanup + +typeset BASE=$TESTPOOL/$TESTFS +typeset TESTFILE1=f1 +typeset TESTDIR1=d1 +typeset TESTFILE2=d1/f2 +typeset TESTDIR2=d1/d2 +typeset TESTFILE3=d1/d2/f3 +typeset TESTFILE4=d1/d2/f4 + +typeset mntpnt=$(get_prop mountpoint $BASE) + +log_must touch /$mntpnt/$TESTFILE1 +log_must mkdir /$mntpnt/$TESTDIR1 +log_must touch /$mntpnt/$TESTFILE2 +log_must mkdir /$mntpnt/$TESTDIR2 +log_must touch /$mntpnt/$TESTFILE3 +log_must touch /$mntpnt/$TESTFILE4 + +typeset ds_id=$(zdb $BASE | grep "^Dataset" | sed 's/.* ID \([0-9]*\).*/\1/') +test_one $ds_id $BASE $TESTFILE1 +test_one $ds_id $BASE $TESTFILE2 +test_one $ds_id $BASE $TESTFILE3 +test_one $ds_id $BASE $TESTFILE4 + +log_pass "zfs_ids_to_path displayed correctly" diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh index 584039f543..8e37e8dbca 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh @@ -36,8 +36,8 @@ # 'zfs inherit' should return an error with bad parameters in one command. # # STRATEGY: -# 1. Set an array of bad options and invlid properties to 'zfs inherit' -# 2. Execute 'zfs inherit' with bad options and passing invlid properties +# 1. Set an array of bad options and invalid properties to 'zfs inherit' +# 2. Execute 'zfs inherit' with bad options and passing invalid properties # 3. Verify an error is returned. # @@ -45,9 +45,8 @@ verify_runnable "both" function cleanup { - if snapexists $TESTPOOL/$TESTFS@$TESTSNAP; then - log_must zfs destroy $TESTPOOL/$TESTFS@$TESTSNAP - fi + snapexists $TESTPOOL/$TESTFS@$TESTSNAP && \ + destroy_dataset $TESTPOOL/$TESTFS@$TESTSNAP } log_assert "'zfs inherit' should return an error with bad parameters in" \ @@ -56,8 +55,13 @@ log_onexit cleanup set -A badopts "r" "R" "-R" "-rR" "-a" "-" "-?" 
"-1" "-2" "-v" "-n" set -A props "recordsize" "mountpoint" "sharenfs" "checksum" "compression" \ - "atime" "devices" "exec" "setuid" "readonly" "zoned" "snapdir" "aclmode" \ + "atime" "devices" "exec" "setuid" "readonly" "snapdir" "aclmode" \ "aclinherit" "xattr" "copies" +if is_freebsd; then + props+=("jailed") +else + props+=("zoned") +fi set -A illprops "recordsiz" "mountpont" "sharen" "compres" "atme" "blah" log_must zfs snapshot $TESTPOOL/$TESTFS@$TESTSNAP diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh index bc0d8c59c0..3f7e4ff972 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh @@ -37,8 +37,8 @@ # 'zfs inherit' should return an error with bad parameters in one command. # # STRATEGY: -# 1. Set an array of bad options and invlid properties to 'zfs inherit' -# 2. Execute 'zfs inherit' with bad options and passing invlid properties +# 1. Set an array of bad options and invalid properties to 'zfs inherit' +# 2. Execute 'zfs inherit' with bad options and passing invalid properties # 3. Verify an error is returned. 
# @@ -47,9 +47,7 @@ verify_runnable "both" function cleanup { for ds in $TESTPOOL $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL ; do - if snapexists $ds@$TESTSNAP; then - log_must zfs destroy $ds@$TESTSNAP - fi + snapexists $ds@$TESTSNAP && destroy_dataset $ds@$TESTSNAP done cleanup_user_prop $TESTPOOL } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_jail/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/Makefile.am new file mode 100644 index 0000000000..b6dd7721e6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/Makefile.am @@ -0,0 +1,6 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_jail +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + jail.conf \ + zfs_jail_001_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_jail/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/cleanup.ksh new file mode 100755 index 0000000000..79cd6e9f90 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. 
$STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_jail/jail.conf b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/jail.conf new file mode 100644 index 0000000000..23a9dabeb4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/jail.conf @@ -0,0 +1,9 @@ +testjail { + allow.mount.zfs; + allow.mount; + devfs_ruleset = 4; + enforce_statfs = 0; + mount.devfs; + path = "/"; + persist; +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_jail/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/setup.ksh new file mode 100755 index 0000000000..6a9af3bc28 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. 
$STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh new file mode 100755 index 0000000000..2c08081102 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test basic functionality of `zfs jail` and `zfs unjail`. +# +# STRATEGY: +# 1. Create a jail. +# 2. Perform some basic ZFS operations on a dataset both in the host and +# in the jail to confirm the dataset is functional in the host +# and hidden in the jail. +# 3. Run `zfs jail` to expose the dataset in the jail. +# 4. Perform some basic ZFS operations on the dataset both in the host and +# in the jail to confirm the dataset is functional in the jail and host. +# 5. Run `zfs unjail` to return the dataset to the host. +# 6.
Perform some basic ZFS operations on the dataset both in the host and +# in the jail to confirm the dataset is functional in the host +# and hidden in the jail. +# + +verify_runnable "global" + +JAIL="testjail" +JAIL_CONF="$STF_SUITE/tests/functional/cli_root/zfs_jail/jail.conf" + +function cleanup +{ + if jls -j $JAIL name >/dev/null 2>&1; then + jail -r -f $JAIL_CONF $JAIL + fi +} + +log_onexit cleanup + +log_assert "Verify that a dataset can be jailed and unjailed." + +# 1. Create a jail. +log_must jail -c -f $JAIL_CONF $JAIL + +# 2. Try some basic ZFS operations. +log_must zfs list $TESTPOOL +log_mustnot jexec $JAIL zfs list $TESTPOOL + +# 3. Jail the dataset. +log_must zfs jail $JAIL $TESTPOOL + +# 4. Try some basic ZFS operations. +log_must zfs list $TESTPOOL +log_must jexec $JAIL zfs list $TESTPOOL + +# 5. Unjail the dataset. +log_must zfs unjail $JAIL $TESTPOOL + +# 6. Try some basic ZFS operations. +log_must zfs list $TESTPOOL +log_mustnot jexec $JAIL zfs list $TESTPOOL + +log_pass "Datasets can be jailed and unjailed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/HEXKEY b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/HEXKEY new file mode 100644 index 0000000000..95ed1c051a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/HEXKEY @@ -0,0 +1 @@ +000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile.am index 06b4239a6d..7dfec435ce 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile.am @@ -5,10 +5,14 @@ dist_pkgdata_SCRIPTS = \ zfs_load-key.ksh \ zfs_load-key_all.ksh \ zfs_load-key_file.ksh \ + zfs_load-key_https.ksh \ zfs_load-key_location.ksh \ zfs_load-key_noop.ksh \ zfs_load-key_recursive.ksh dist_pkgdata_DATA = \ zfs_load-key.cfg \ - zfs_load-key_common.kshlib + zfs_load-key_common.kshlib \ + PASSPHRASE \ + HEXKEY \ + RAWKEY diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/PASSPHRASE b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/PASSPHRASE new file mode 100644 index 0000000000..f3097ab130 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/PASSPHRASE @@ -0,0 +1 @@ +password diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/RAWKEY b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/RAWKEY new file mode 100644 index 0000000000..f2d4cbf581 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/RAWKEY @@ -0,0 +1 @@ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/cleanup.ksh index 79cd6e9f90..d397bcf4e9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/cleanup.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/cleanup.ksh @@ -26,5 +26,7 @@ # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib +cleanup_https default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/setup.ksh index 6a9af3bc28..6cc5528ce5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/setup.ksh @@ -26,7 +26,10 @@ # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib DISK=${DISKS%% *} -default_setup $DISK +default_setup_noexit $DISK +setup_https +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.cfg b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.cfg index 90d9f63f1d..cc1e3b3305 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.cfg @@ -17,6 +17,9 @@ # Copyright (c) 2017 Datto, Inc. All rights reserved. 
# +# $PASSPHRASE, $HEXKEY, and $RAWKEY must be kept in sync +# with the corresponding files in this directory + export PASSPHRASE="password" export PASSPHRASE1="password1" export PASSPHRASE2="password2" @@ -24,3 +27,31 @@ export HEXKEY="000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F" export HEXKEY1="201F1E1D1C1B1A191817161514131211100F0E0D0C0B0A090807060504030201" export RAWKEY="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" export RAWKEY1="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + +export SSL_CA_CERT_FILE="/$TESTPOOL/snakeoil.crt" +export HTTPS_PORT_FILE="/$TESTPOOL/snakeoil.port" +export HTTPS_HOSTNAME="localhost" +export HTTPS_PORT= +export HTTPS_BASE_URL= + +function get_https_port +{ + if [ -z "$HTTPS_PORT" ]; then + read -r HTTPS_PORT < "$HTTPS_PORT_FILE" || return + fi + + echo "$HTTPS_PORT" +} + +function get_https_base_url +{ + if [ -z "$HTTPS_BASE_URL" ]; then + HTTPS_BASE_URL="https://$HTTPS_HOSTNAME:$(get_https_port)" || { + typeset ret=$? + HTTPS_BASE_URL= + return $ret + } + fi + + echo "$HTTPS_BASE_URL" +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.ksh index 847a6aabd3..8af9f80cfb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.ksh @@ -46,7 +46,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 poolexists $TESTPOOL1 && log_must destroy_pool $TESTPOOL1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_all.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_all.ksh index 5e331fd120..3c18e4538d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_all.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_all.ksh @@ -37,9 +37,9 @@ verify_runnable "both" function cleanup { - datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy $TESTPOOL/$TESTFS1 - datasetexists $TESTPOOL/zvol && log_must zfs destroy $TESTPOOL/zvol + datasetexists $TESTPOOL/$TESTFS1 && destroy_dataset $TESTPOOL/$TESTFS1 + datasetexists $TESTPOOL/$TESTFS2 && destroy_dataset $TESTPOOL/$TESTFS2 + datasetexists $TESTPOOL/zvol && destroy_dataset $TESTPOOL/zvol poolexists $TESTPOOL1 && log_must destroy_pool $TESTPOOL1 } log_onexit cleanup @@ -50,6 +50,9 @@ log_must eval "echo $PASSPHRASE1 > /$TESTPOOL/pkey" log_must zfs create -o encryption=on -o keyformat=passphrase \ -o keylocation=file:///$TESTPOOL/pkey $TESTPOOL/$TESTFS1 +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=$(get_https_base_url)/PASSPHRASE $TESTPOOL/$TESTFS2 + log_must zfs create -V 64M -o encryption=on -o keyformat=passphrase \ -o keylocation=file:///$TESTPOOL/pkey $TESTPOOL/zvol @@ -58,20 +61,25 @@ log_must zpool create -O encryption=on -O keyformat=passphrase \ -O keylocation=file:///$TESTPOOL/pkey $TESTPOOL1 $DISK2 log_must zfs unmount $TESTPOOL/$TESTFS1 -log_must zfs unload-key $TESTPOOL/$TESTFS1 +log_must_busy zfs unload-key $TESTPOOL/$TESTFS1 -log_must zfs unload-key $TESTPOOL/zvol +log_must zfs unmount $TESTPOOL/$TESTFS2 +log_must_busy zfs unload-key $TESTPOOL/$TESTFS2 + +log_must_busy zfs unload-key $TESTPOOL/zvol log_must zfs unmount $TESTPOOL1 -log_must zfs unload-key $TESTPOOL1 +log_must_busy zfs unload-key $TESTPOOL1 log_must zfs load-key -a log_must key_available $TESTPOOL1 log_must key_available $TESTPOOL/zvol log_must key_available $TESTPOOL/$TESTFS1 +log_must key_available $TESTPOOL/$TESTFS2 log_must zfs mount $TESTPOOL1 log_must zfs mount $TESTPOOL/$TESTFS1 +log_must zfs mount $TESTPOOL/$TESTFS2 log_pass "'zfs load-key -a' loads keys for all datasets" diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib index d9066f9cbf..f7461437c6 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib @@ -99,3 +99,66 @@ function verify_origin return 0 } + +function setup_https +{ + log_must openssl req -x509 -newkey rsa:4096 -sha256 -days 1 -nodes -keyout "/$TESTPOOL/snakeoil.key" -out "$SSL_CA_CERT_FILE" -subj "/CN=$HTTPS_HOSTNAME" + + python3 -uc " +import http.server, ssl, sys, os, time, random + +sys.stdin.close() + +httpd, err, port = None, None, None +for i in range(1, 100): + port = random.randint(0xC000, 0xFFFF) # ephemeral range + try: + httpd = http.server.HTTPServer(('$HTTPS_HOSTNAME', port), http.server.SimpleHTTPRequestHandler) + break + except: + err = sys.exc_info()[1] + time.sleep(i / 100) +if not httpd: + raise err + +with open('$HTTPS_PORT_FILE', 'w') as portf: + print(port, file=portf) + +httpd.socket = ssl.wrap_socket(httpd.socket, server_side=True, keyfile='/$TESTPOOL/snakeoil.key', certfile='$SSL_CA_CERT_FILE', ssl_version=ssl.PROTOCOL_TLS) + +os.chdir('$STF_SUITE/tests/functional/cli_root/zfs_load-key') + +with open('/$TESTPOOL/snakeoil.pid', 'w') as pidf: + if os.fork() != 0: + os._exit(0) + print(os.getpid(), file=pidf) + +sys.stdout.close() +sys.stderr.close() +try: + sys.stdout = sys.stderr = open('/tmp/ZTS-snakeoil.log', 'w', buffering=1) # line +except: + sys.stdout = sys.stderr = open('/dev/null', 'w') + +print('{} start on {}'.format(os.getpid(), port)) +httpd.serve_forever() +" || log_fail + + typeset https_pid= + for d in $(seq 0 0.1 5); do + read -r https_pid 2>/dev/null < "/$TESTPOOL/snakeoil.pid" && [ -n "$https_pid" ] && break + sleep "$d" + done + [ -z "$https_pid" ] && log_fail "Couldn't start HTTPS server" + log_note "Started HTTPS server as 
$https_pid on port $(get_https_port)" +} + +function cleanup_https +{ + typeset https_pid= + read -r https_pid 2>/dev/null < "/$TESTPOOL/snakeoil.pid" || return 0 + + log_must kill "$https_pid" + cat /tmp/ZTS-snakeoil.log + rm -f "/$TESTPOOL/snakeoil.pid" "/tmp/ZTS-snakeoil.log" +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_file.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_file.ksh index 7cbda43ff2..73c461fd6b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_file.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_file.ksh @@ -38,7 +38,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_https.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_https.ksh new file mode 100755 index 0000000000..c0c91e59dd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_https.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# 'zfs load-key' should load a dataset's key from an https:// URL, +# but fail to do so if the domain doesn't exist or the file 404s. +# +# STRATEGY: +# 1. 
Try to create a dataset pointing to an RFC6761-guaranteed unresolvable domain, +# one to the sshd port (which will be either unoccupied (ECONNREFUSED) +# or have sshd on it ("wrong version number")), +# and one pointing to a URL that will always 404. +# 2. Create encrypted datasets with keylocation=https://address +# 3. Unmount the datasets and unload their keys +# 4. Attempt to load the keys +# 5. Verify the keys are loaded +# 6. Attempt to mount the datasets +# + +verify_runnable "both" + +function cleanup +{ + for fs in "$TESTFS1" "$TESTFS2" "$TESTFS3"; do + datasetexists $TESTPOOL/$fs && \ + destroy_dataset $TESTPOOL/$fs + done +} +log_onexit cleanup + +log_assert "'zfs load-key' should load a key from a file" + +log_mustnot zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=https://invalid./where-ever $TESTPOOL/$TESTFS1 + +log_mustnot zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=https://$HTTPS_HOSTNAME:22 $TESTPOOL/$TESTFS1 + +log_mustnot zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=$(get_https_base_url)/ENOENT $TESTPOOL/$TESTFS1 + +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=$(get_https_base_url)/PASSPHRASE $TESTPOOL/$TESTFS1 + +log_must zfs create -o encryption=on -o keyformat=hex \ + -o keylocation=$(get_https_base_url)/HEXKEY $TESTPOOL/$TESTFS2 + +log_must zfs create -o encryption=on -o keyformat=raw \ + -o keylocation=$(get_https_base_url)/RAWKEY $TESTPOOL/$TESTFS3 + +for fs in "$TESTFS1" "$TESTFS2" "$TESTFS3"; do + log_must zfs unmount $TESTPOOL/$fs + log_must zfs unload-key $TESTPOOL/$fs +done +for fs in "$TESTFS1" "$TESTFS2" "$TESTFS3"; do + log_must zfs load-key $TESTPOOL/$fs + log_must key_available $TESTPOOL/$fs + log_must zfs mount $TESTPOOL/$fs +done + +log_pass "'zfs load-key' loads a key from a file" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_location.ksh
b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_location.ksh index d0b1cdb20e..11f16e45ad 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_location.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_location.ksh @@ -44,7 +44,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 } log_onexit cleanup @@ -70,4 +70,9 @@ log_must eval "echo $PASSPHRASE | zfs load-key -L prompt $TESTPOOL/$TESTFS1" log_must key_available $TESTPOOL/$TESTFS1 log_must verify_keylocation $TESTPOOL/$TESTFS1 "file://$key_location" +log_must zfs unload-key $TESTPOOL/$TESTFS1 +log_must zfs load-key -L $(get_https_base_url)/PASSPHRASE $TESTPOOL/$TESTFS1 +log_must key_available $TESTPOOL/$TESTFS1 +log_must verify_keylocation $TESTPOOL/$TESTFS1 "file://$key_location" + log_pass "'zfs load-key -L' overrides keylocation with provided value" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh index bfce786448..2ee1783469 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh @@ -37,7 +37,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh index 7385b69cf5..c0b5553e39 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh @@ -39,7 +39,7 @@ 
verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup @@ -52,15 +52,21 @@ log_must zfs create -o encryption=on -o keyformat=passphrase \ log_must zfs create -o keyformat=passphrase \ -o keylocation=file:///$TESTPOOL/pkey $TESTPOOL/$TESTFS1/child +log_must zfs create -o keyformat=passphrase \ + -o keylocation=$(get_https_base_url)/PASSPHRASE $TESTPOOL/$TESTFS1/child/child + log_must zfs unmount $TESTPOOL/$TESTFS1 +log_must zfs unload-key $TESTPOOL/$TESTFS1/child/child log_must zfs unload-key $TESTPOOL/$TESTFS1/child log_must zfs unload-key $TESTPOOL/$TESTFS1 log_must zfs load-key -r $TESTPOOL log_must key_available $TESTPOOL/$TESTFS1 log_must key_available $TESTPOOL/$TESTFS1/child +log_must key_available $TESTPOOL/$TESTFS1/child/child log_must zfs mount $TESTPOOL/$TESTFS1 log_must zfs mount $TESTPOOL/$TESTFS1/child +log_must zfs mount $TESTPOOL/$TESTFS1/child/child log_pass "'zfs load-key -r' recursively loads keys" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am index b2de98934b..8c90b2e75e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am @@ -13,12 +13,15 @@ dist_pkgdata_SCRIPTS = \ zfs_mount_009_neg.ksh \ zfs_mount_010_neg.ksh \ zfs_mount_011_neg.ksh \ - zfs_mount_012_neg.ksh \ + zfs_mount_012_pos.ksh \ + zfs_mount_013_pos.ksh \ + zfs_mount_014_neg.ksh \ zfs_mount_all_001_pos.ksh \ zfs_mount_all_fail.ksh \ zfs_mount_all_mountpoints.ksh \ zfs_mount_encrypted.ksh \ zfs_mount_remount.ksh \ + zfs_mount_test_race.ksh \ zfs_multi_mount.ksh dist_pkgdata_DATA = \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib index 2afb9a547b..85566e5653 100644 --- 
a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib @@ -66,7 +66,8 @@ function setup_filesystem #disklist #pool #fs #mntpoint #type #vdev if [[ $vdev != "" && \ $vdev != "mirror" && \ - $vdev != "raidz" ]] ; then + $vdev != "raidz" && \ + $vdev != "draid" ]] ; then log_note "Wrong vdev: (\"$vdev\")" return 1 @@ -110,7 +111,7 @@ function cleanup_filesystem #pool #fs if datasetexists "$pool/$fs" ; then mtpt=$(get_prop mountpoint "$pool/$fs") - log_must zfs destroy -r $pool/$fs + destroy_dataset "$pool/$fs" "-r" [[ -d $mtpt ]] && \ log_must rm -rf $mtpt diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh index fc97520f57..c0cb693f6c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh @@ -44,13 +44,15 @@ # 2. Apply 'zfs set mountpoint=path '. # 3. Change directory to that given mountpoint. # 3. Invoke 'zfs mount '. -# 4. Verify that mount succeeds on Linux and fails for other platforms. +# 4. Verify that mount succeeds on Linux and FreeBSD and fails for other +# platforms. # verify_runnable "both" function cleanup { + [[ "$PWD" = "$TESTDIR" ]] && cd - log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS log_must force_unmount $TESTPOOL/$TESTFS return 0 @@ -74,7 +76,7 @@ cd $TESTDIR || \ zfs $mountcmd $TESTPOOL/$TESTFS ret=$? -if is_linux; then +if is_linux || is_freebsd; then (( ret == 0 )) || \ log_fail "'zfs $mountcmd $TESTPOOL/$TESTFS' " \ "unexpected return code of $ret." 
@@ -85,7 +87,7 @@ else fi log_note "Make sure the filesystem $TESTPOOL/$TESTFS is unmounted" -if is_linux; then +if is_linux || is_freebsd; then mounted $TESTPOOL/$TESTFS || \ log_fail Filesystem $TESTPOOL/$TESTFS is unmounted else diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh index 5c954354d1..5edce35c72 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh @@ -35,22 +35,20 @@ # # DESCRIPTION: -# Invoke "zfs mount " with a filesystem -# mountpoint that is identical to an existing one. -# It will fail with a return code of 1. For Linux, -# place a file in the directory to ensure the failure. -# Also for Linux, test overlay=off (default) in which case -# the mount will fail, and overlay=on, where the mount -# will succeed. +# Invoke "zfs mount " with a filesystem mountpoint that is +# identical to an existing one. It will fail with a return code of 1 +# when overlay=off. Place a file in the directory to ensure the failure. +# Also test overlay=on (default) in which case the mount will not fail. # # STRATEGY: # 1. Prepare an existing mounted filesystem. -# 2. Setup a new filesystem and make sure that it is unmounted. -# 3. For Linux, place a file in the mount point folder. -# 4. Mount the new filesystem using the various combinations -# - zfs set mountpoint= -# - zfs set mountpoint= -# 5. Verify that mount failed with return code of 1. +# 2. Setup a new filesystem with overlay=off and make sure that it is +# unmounted. +# 3. Place a file in the mount point folder. +# 4. Mount the new filesystem using the various combinations +# - zfs set mountpoint= +# - zfs set mountpoint= +# 5. Verify that mount failed with return code of 1. # 6. For Linux, also set overlay=on and verify the mount is # allowed. 
# @@ -76,7 +74,7 @@ typeset -i ret=0 log_assert "Verify that 'zfs $mountcmd '" \ "where the mountpoint is identical or on top of an existing one" \ - "will fail with return code 1." + "will fail with return code 1 when overlay=off." log_onexit cleanup @@ -98,8 +96,8 @@ done log_must zfs set mountpoint=$mtpt $TESTPOOL/$TESTFS log_must zfs $mountcmd $TESTPOOL/$TESTFS -if is_linux; then - log_must zfs set overlay=off $TESTPOOL/$TESTFS +log_must zfs set overlay=off $TESTPOOL/$TESTFS +if ! is_illumos; then touch $mtpt/file.1 log_must ls -l $mtpt | grep file fi @@ -107,7 +105,7 @@ fi mounted $TESTPOOL/$TESTFS || \ log_unresolved "Filesystem $TESTPOOL/$TESTFS is unmounted" -log_must zfs create $TESTPOOL/$TESTFS1 +log_must zfs create -o overlay=off $TESTPOOL/$TESTFS1 unmounted $TESTPOOL/$TESTFS1 || \ log_must force_unmount $TESTPOOL/$TESTFS1 @@ -123,9 +121,9 @@ while [[ $depth -gt 0 ]] ; do log_mustnot zfs $mountcmd $TESTPOOL/$TESTFS1 - # For Linux, test the overlay=on feature which allows - # mounting of non-empty directory. - if is_linux; then + if ! is_illumos; then + # Test the overlay=on feature which allows + # mounting of non-empty directory. log_must zfs set overlay=on $TESTPOOL/$TESTFS1 log_must zfs $mountcmd $TESTPOOL/$TESTFS1 log_must force_unmount $TESTPOOL/$TESTFS1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh index e2ef0bf00d..409dd06d7f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh @@ -45,7 +45,7 @@ # setuid setuid/nosetuid # # STRATEGY: -# 1. Create filesystem and get origianl property value. +# 1. Create filesystem and get original property value. # 2. Using 'zfs mount -o' to set filesystem property. # 3. Verify the property was set temporarily. # 4. Verify it will not affect the property that is stored on disk. 
@@ -62,7 +62,10 @@ log_assert "Verify '-o' will set filesystem property temporarily, " \ "without affecting the property that is stored on disk." log_onexit cleanup -set -A properties "atime" "devices" "exec" "readonly" "setuid" +set -A properties "atime" "exec" "readonly" "setuid" +if ! is_freebsd; then + properties+=("devices") +fi # # Get the specified filesystem property reverse mount option. @@ -78,16 +81,21 @@ function get_reverse_option # Define property value: "reverse if value=on" "reverse if value=off" if is_linux; then set -A values "noatime" "atime" \ - "nodev" "dev" \ "noexec" "exec" \ "rw" "ro" \ - "nosuid" "suid" - else + "nosuid" "suid" \ + "nodev" "dev" + elif is_freebsd; then set -A values "noatime" "atime" \ - "nodevices" "devices" \ "noexec" "exec" \ "rw" "ro" \ "nosetuid" "setuid" + else + set -A values "noatime" "atime" \ + "noexec" "exec" \ + "rw" "ro" \ + "nosetuid" "setuid" \ + "nodevices" "devices" fi typeset -i i=0 @@ -123,7 +131,8 @@ for property in ${properties[@]}; do # Set filesystem property temporarily reverse_opt=$(get_reverse_option $fs $property) - log_must zfs mount -o remount,$reverse_opt $fs + log_must zfs unmount $fs + log_must zfs mount -o $reverse_opt $fs cur_val=$(get_prop $property $fs) (($? != 0)) && log_fail "get_prop $property $fs" @@ -135,7 +144,7 @@ for property in ${properties[@]}; do "be enabled in LZ" fi elif [[ $orig_val == $cur_val ]]; then - log_fail "zfs mount -o remount,$reverse_opt " \ + log_fail "zfs mount -o $reverse_opt " \ "doesn't change property." fi @@ -146,7 +155,7 @@ for property in ${properties[@]}; do cur_val=$(get_prop $property $fs) (($? 
!= 0)) && log_fail "get_prop $property $fs" if [[ $orig_val != $cur_val ]]; then - log_fail "zfs mount -o remount,$reverse_opt " \ + log_fail "zfs mount -o $reverse_opt " \ "change the property that is stored on disks" fi done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh index 5f88b61100..6a251330f6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh @@ -47,9 +47,7 @@ function cleanup { ! ismounted $fs && log_must zfs mount $fs - if datasetexists $fs1; then - log_must zfs destroy $fs1 - fi + datasetexists $fs1 && destroy_dataset $fs1 if [[ -f $testfile ]]; then log_must rm -f $testfile @@ -73,7 +71,8 @@ log_must mkfile 1M $testfile $testfile1 log_must zfs unmount $fs1 log_must zfs set mountpoint=$mntpnt $fs1 -log_mustnot zfs mount $fs1 +log_must zfs mount $fs1 +log_must zfs unmount $fs1 log_must zfs mount -O $fs1 # Create new file in override mountpoint @@ -83,7 +82,7 @@ log_must mkfile 1M $mntpnt/$TESTFILE2 log_mustnot ls $testfile log_must ls $mntpnt/$TESTFILE1 $mntpnt/$TESTFILE2 -# Verify $TESTFILE2 was created in $fs1, rather then $fs +# Verify $TESTFILE2 was created in $fs1, rather than $fs log_must zfs unmount $fs1 log_must zfs set mountpoint=$mntpnt1 $fs1 log_must zfs mount $fs1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh index 963ad626c2..53ebf1f262 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh @@ -65,11 +65,11 @@ mpt=$(get_prop mountpoint $fs) log_must zfs umount $fs curpath=`dirname $0` cd $mpt -if is_linux; then +if is_linux || is_freebsd; then log_must zfs mount $fs else log_mustnot zfs 
mount $fs fi cd $curpath -log_pass "zfs mount fails with mounted filesystem or busy moutpoint as expected." +log_pass "zfs mount fails with mounted filesystem or busy mountpoint as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh index a116b4647c..95e2bc3972 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh @@ -45,12 +45,11 @@ verify_runnable "both" function cleanup { - if snapexists $TESTPOOL/$TESTFS@$TESTSNAP; then - log_must_busy zfs destroy $TESTPOOL/$TESTFS@$TESTSNAP - fi + snapexists $TESTPOOL/$TESTFS@$TESTSNAP && \ + destroy_dataset $TESTPOOL/$TESTFS@$TESTSNAP if is_global_zone && datasetexists $TESTPOOL/$TESTVOL; then - log_must_busy zfs destroy $TESTPOOL/$TESTVOL + destroy_dataset $TESTPOOL/$TESTVOL fi } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh similarity index 79% rename from tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_neg.ksh rename to tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh index 19fb3b2596..5ff094d2c4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh @@ -18,7 +18,9 @@ # # DESCRIPTION: -# Verify that zfs mount should fail with a non-empty directory +# Verify that zfs mount succeeds with a non-empty directory +# + # # STRATEGY: # 1. Unmount the dataset @@ -29,12 +31,12 @@ # 6. Unmount the dataset # 7. Create a file in the directory created in step 2 # 8. Attempt to mount the dataset -# 9. Verify the mount fails +# 9. 
Verify the mount succeeds # verify_runnable "both" -log_assert "zfs mount fails with non-empty directory" +log_assert "zfs mount succeeds with non-empty directory" fs=$TESTPOOL/$TESTFS @@ -44,7 +46,8 @@ log_must zfs set mountpoint=$TESTDIR $fs log_must zfs mount $fs log_must zfs umount $fs log_must touch $TESTDIR/testfile.$$ -log_mustnot zfs mount $fs +log_must zfs mount $fs +log_must zfs umount $fs log_must rm -rf $TESTDIR -log_pass "zfs mount fails non-empty directory as expected." +log_pass "zfs mount succeeds with non-empty directory as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh new file mode 100755 index 0000000000..e6a4be1577 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# DESCRIPTION: +# Verify zfs mount helper functions for both devices and pools. 
+# + +verify_runnable "both" + +set -A vdevs $(get_disklist_fullpath $TESTPOOL) +typeset -r mntpoint=$(get_prop mountpoint $TESTPOOL) +typeset -r helper="mount.zfs -o zfsutil" +typeset -r fs=$TESTPOOL/$TESTFS + +function cleanup +{ + cd $STF_SUITE + if [[ -d $TESTDIR/$$ ]]; then + log_must rm -rf $TESTDIR/$$ + fi + mounted && zfs $mountcmd $TESTPOOL + return 0 +} +log_onexit cleanup + +log_note "Verify zfs mount helper functions for both devices and pools" + +# Ensure that the ZFS filesystem is unmounted +force_unmount $TESTPOOL + +log_note "Verify ' '" +log_must $helper $fs $mntpoint +log_must ismounted $fs +force_unmount $fs + +log_note "Verify mount(8) does not canonicalize before calling helper" +# Canonicalization is confused by files in PWD matching [device|mountpoint] +log_must mkdir -p $TESTDIR/$$/$TESTPOOL +log_must cd $TESTDIR/$$ +# The env flag directs zfs to exec /bin/mount, which then calls helper +log_must eval ZFS_MOUNT_HELPER=1 zfs $mountcmd -v $TESTPOOL +# mount (2.35.2) still suffers from a cosmetic PWD prefix bug +log_must mounted $TESTPOOL +force_unmount $TESTPOOL + +log_note "Verify CWD prefix filter " +log_must cd / +log_must zfs set mountpoint=legacy $TESTPOOL +log_must mkdir -p $mntpoint +log_must mount -t zfs $TESTPOOL $mntpoint +log_must ismounted $TESTPOOL +log_must umount $mntpoint +log_must zfs set mountpoint=$mntpoint $TESTPOOL +log_must cd - +force_unmount $TESTPOOL + +log_note "Verify '-f ' fakemount" +log_must $helper -f $fs $mntpoint +log_mustnot ismounted $fs + +log_note "Verify '-o ro -v ' verbose RO" +log_must ${helper},ro -v $fs $mntpoint +log_must ismounted $fs +force_unmount $fs + +log_note "Verify '-o abc -s ' sloppy option" +log_must ${helper},abc -s ${vdevs[0]} $mntpoint +log_must mounted $mntpoint +force_unmount $TESTPOOL + +log_note "Verify ' '" +log_must $helper ${vdevs[0]} $mntpoint +log_must mounted $mntpoint + +log_pass "zfs mount helper correctly handles both device and pool strings" diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh new file mode 100755 index 0000000000..5cf0bc7b3a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# DESCRIPTION: +# Verify zfs mount helper failure on known bad parameters +# + +verify_runnable "both" + +set -A vdevs $(get_disklist_fullpath $TESTPOOL) +vdev=${vdevs[0]} + +mntpoint="$(get_prop mountpoint $TESTPOOL)" +helper="mount.zfs -o zfsutil" +fs=$TESTPOOL/$TESTFS + +function cleanup +{ + log_must force_unmount $vdev + return 0 +} +log_onexit cleanup + +log_note "Verify zfs mount helper failure on known bad parameters" + +# Ensure that the ZFS filesystem is unmounted. 
+force_unmount $fs + +log_note "Verify failure without '-o zfsutil'" +log_mustnot mount.zfs $fs $mntpoint + +log_note "Verify '-o abc ' bad option fails" +log_mustnot ${helper},abc $vdev $mntpoint + +log_note "Verify '\$NONEXISTFSNAME ' fails" +log_mustnot $helper $NONEXISTFSNAME $mntpoint + +log_note "Verify ' (\$NONEXISTFSNAME|/dev/null)' fails" +log_mustnot $helper $fs $NONEXISTFSNAME +log_mustnot $helper $fs /dev/null + +log_note "Verify '/dev/null ' fails" +log_mustnot $helper /dev/null $mntpoint + +log_note "Verify '[device|pool]' fails" +log_mustnot mount.zfs +log_mustnot $helper +log_mustnot $helper $vdev +log_mustnot $helper $TESTPOOL + +log_pass "zfs mount helper fails when expected" \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh index d7fcd20afa..d1103bddcc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh @@ -30,7 +30,8 @@ # 1. Create zfs filesystems # 2. Unmount a leaf filesystem # 3. Create a file in the above filesystem's mountpoint -# 4. Verify that 'zfs mount -a' fails to mount the above +# 4. Verify that 'zfs mount -a' succeeds if overlay=on and +# fails to mount the above if overlay=off # 5. 
Verify that all other filesystems were mounted # @@ -82,15 +83,23 @@ done # Create a stray file in one filesystem's mountpoint touch $path/0/strayfile -# Verify that zfs mount -a fails export __ZFS_POOL_RESTRICT="$TESTPOOL" + +# Verify that zfs mount -a succeeds with overlay=on (default) +log_must zfs $mountall +log_must mounted "$TESTPOOL/0" +log_must zfs $unmountall + +# Verify that zfs mount -a fails with overlay=off +log_must zfs set overlay=off "$TESTPOOL/0" log_mustnot zfs $mountall +log_mustnot mounted "$TESTPOOL/0" + unset __ZFS_POOL_RESTRICT -# All filesystems except for "0" should be mounted -log_mustnot mounted "$TESTPOOL/0" +# All other filesystems should be mounted for ((i=1; i<$fscount; i++)); do log_must mounted "$TESTPOOL/$i" done -log_pass "'zfs $mountall' failed as expected." +log_pass "'zfs $mountall' behaves as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh index 3e6a24bbcd..faeae4227a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh @@ -109,6 +109,8 @@ function cleanup_all export __ZFS_POOL_RESTRICT="$TESTPOOL" log_must zfs $unmountall unset __ZFS_POOL_RESTRICT + # make sure we leave $TESTPOOL mounted + log_must zfs mount $TESTPOOL for fs in ${filesystems[@]}; do cleanup_filesystem "$TESTPOOL" "$fs" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh index 9749a9b3aa..a95e7507b4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh @@ -42,7 +42,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs
destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh index f7a0978352..ac6103ebc7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh @@ -29,7 +29,7 @@ # # DESCRIPTION: -# Verify remount functionality, expecially on readonly objects. +# Verify remount functionality, especially on readonly objects. # # STRATEGY: # 1. Prepare a filesystem and a snapshot @@ -48,11 +48,19 @@ function cleanup { log_must_busy zpool export $TESTPOOL log_must zpool import $TESTPOOL - snapexists $TESTSNAP && log_must zfs destroy $TESTSNAP + snapexists $TESTSNAP && destroy_dataset $TESTSNAP [[ -d $MNTPSNAP ]] && log_must rmdir $MNTPSNAP return 0 } +if is_freebsd; then + typeset RO="-t zfs -ur" + typeset RW="-t zfs -uw" +else + typeset RO="-o remount,ro" + typeset RW="-o remount,rw" +fi + # # Verify the $filesystem is mounted readonly # This is preferred over "log_mustnot touch $fs" because we actually want to @@ -76,8 +84,13 @@ function checkmount # dataset option { typeset dataset="$1" typeset option="$2" + typeset options="" - options="$(awk -v ds="$dataset" '$1 == ds { print $4 }' /proc/mounts)" + if is_freebsd; then + options=$(mount -p | awk -v ds="$dataset" '$1 == ds { print $4 }') + else + options=$(awk -v ds="$dataset" '$1 == ds { print $4 }' /proc/mounts) + fi if [[ "$options" == '' ]]; then log_fail "Dataset $dataset is not mounted" elif [[ ! -z "${options##*$option*}" ]]; then @@ -105,21 +118,23 @@ log_must mkdir -p $MNTPSNAP # 2. 
Verify we can (re)mount the dataset readonly/read-write log_must touch $MNTPFS/file.dat checkmount $TESTFS 'rw' -log_must mount -o remount,ro $TESTFS $MNTPFS +log_must mount $RO $TESTFS $MNTPFS readonlyfs $MNTPFS checkmount $TESTFS 'ro' -log_must mount -o remount,rw $TESTFS $MNTPFS +log_must mount $RW $TESTFS $MNTPFS log_must touch $MNTPFS/file.dat checkmount $TESTFS 'rw' -# 3. Verify we can (re)mount the snapshot readonly -log_must mount -t zfs $TESTSNAP $MNTPSNAP -readonlyfs $MNTPSNAP -checkmount $TESTSNAP 'ro' -log_must mount -o remount,ro $TESTSNAP $MNTPSNAP -readonlyfs $MNTPSNAP -checkmount $TESTSNAP 'ro' -log_must umount $MNTPSNAP +if is_linux; then + # 3. Verify we can (re)mount the snapshot readonly + log_must mount -t zfs $TESTSNAP $MNTPSNAP + readonlyfs $MNTPSNAP + checkmount $TESTSNAP 'ro' + log_must mount $RO $TESTSNAP $MNTPSNAP + readonlyfs $MNTPSNAP + checkmount $TESTSNAP 'ro' + log_must umount $MNTPSNAP +fi # 4. Verify we can't remount a snapshot read-write # The "mount -o rw" command will succeed but the snapshot is mounted readonly. @@ -127,7 +142,7 @@ log_must umount $MNTPSNAP log_must mount -t zfs -o rw $TESTSNAP $MNTPSNAP readonlyfs $MNTPSNAP checkmount $TESTSNAP 'ro' -log_mustnot mount -o remount,rw $TESTSNAP $MNTPSNAP +log_mustnot mount $RW $TESTSNAP $MNTPSNAP readonlyfs $MNTPSNAP checkmount $TESTSNAP 'ro' log_must umount $MNTPSNAP @@ -138,7 +153,7 @@ log_must eval "echo 'password' | zfs create -o sync=disabled \ -o encryption=on -o keyformat=passphrase $TESTFS/crypt" CRYPT_MNTPFS="$(get_prop mountpoint $TESTFS/crypt)" log_must touch $CRYPT_MNTPFS/file.dat -log_must mount -o remount,ro $TESTFS/crypt $CRYPT_MNTPFS +log_must mount $RO $TESTFS/crypt $CRYPT_MNTPFS log_must umount -f $CRYPT_MNTPFS zpool sync $TESTPOOL @@ -149,7 +164,7 @@ log_must zpool import -o readonly=on $TESTPOOL # 7. 
Verify we can't remount its filesystem read-write readonlyfs $MNTPFS checkmount $TESTFS 'ro' -log_mustnot mount -o remount,rw $MNTPFS +log_mustnot mount $RW $MNTPFS readonlyfs $MNTPFS checkmount $TESTFS 'ro' diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.ksh new file mode 100755 index 0000000000..3a5793d070 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.ksh @@ -0,0 +1,117 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.cfg + +# +# DESCRIPTION: +# Verify parallel mount ordering is consistent. +# +# There was a bug in initial thread dispatching algorithm which put threads +# under race condition which resulted in undefined mount order. The purpose +# of this test is to verify `zfs unmount -a` succeeds (not `zfs mount -a` +# succeeds, it always does) after `zfs mount -a`, which could fail if threads +# race. See github.com/openzfs/zfs/issues/{8450,8833,8878} for details. +# +# STRATEGY: +# 1. Create pools and filesystems. +# 2. Set same mount point for >1 datasets. +# 3. Unmount all datasets. +# 4. Mount all datasets. +# 5. Unmount all datasets (verify this succeeds). 
+# + +verify_runnable "both" + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} +MNTPT=$TMPDIR/zfs_mount_test_race_mntpt +DISK1="$TMPDIR/zfs_mount_test_race_disk1" +DISK2="$TMPDIR/zfs_mount_test_race_disk2" + +TESTPOOL1=zfs_mount_test_race_tp1 +TESTPOOL2=zfs_mount_test_race_tp2 + +export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2" +log_must zfs $unmountall +unset __ZFS_POOL_RESTRICT + +function cleanup +{ + zpool destroy $TESTPOOL1 + zpool destroy $TESTPOOL2 + rm -rf $MNTPT + rm -rf /$TESTPOOL1 + rm -rf /$TESTPOOL2 + rm -f $DISK1 + rm -f $DISK2 + export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2" + log_must zfs $mountall + unset __ZFS_POOL_RESTRICT +} +log_onexit cleanup + +log_note "Verify parallel mount ordering is consistent" + +log_must truncate -s $MINVDEVSIZE $DISK1 +log_must truncate -s $MINVDEVSIZE $DISK2 + +log_must zpool create -f $TESTPOOL1 $DISK1 +log_must zpool create -f $TESTPOOL2 $DISK2 + +log_must zfs create $TESTPOOL1/$TESTFS1 +log_must zfs create $TESTPOOL2/$TESTFS2 + +log_must zfs set mountpoint=none $TESTPOOL1 +log_must zfs set mountpoint=$MNTPT $TESTPOOL1/$TESTFS1 + +# Note that unmount can fail (due to race condition on `zfs mount -a`) with or +# without `canmount=off`. The race has nothing to do with canmount property, +# but turn it off for convenience of mount layout used in this test case. +log_must zfs set canmount=off $TESTPOOL2 +log_must zfs set mountpoint=$MNTPT $TESTPOOL2 + +# At this point, layout of datasets in two pools will look like below. +# Previously, on next `zfs mount -a`, pthreads assigned to TESTFS1 and TESTFS2 +# could race, and TESTFS2 usually (actually always) won in OpenZFS. +# Note that the problem is how two or more threads could initially be assigned +# to the same top level directory, not this specific layout. +# This layout is just an example that can reproduce race, +# and is also the layout reported in #8833. 
+# +# NAME MOUNTED MOUNTPOINT +# ---------------------------------------------- +# /$TESTPOOL1 no none +# /$TESTPOOL1/$TESTFS1 yes $MNTPT +# /$TESTPOOL2 no $MNTPT +# /$TESTPOOL2/$TESTFS2 yes $MNTPT/$TESTFS2 + +# Apparently two datasets must be mounted. +log_must ismounted $TESTPOOL1/$TESTFS1 +log_must ismounted $TESTPOOL2/$TESTFS2 +# This unmount always succeeds, because potential race hasn't happened yet. +log_must zfs unmount -a +# This mount always succeeds, whether threads are under race condition or not. +log_must zfs mount -a + +# Verify datasets are mounted (TESTFS2 fails if the race broke mount order). +log_must ismounted $TESTPOOL1/$TESTFS1 +log_must ismounted $TESTPOOL2/$TESTFS2 +# Verify unmount succeeds (fails if the race broke mount order). +log_must zfs unmount -a + +log_pass "Verify parallel mount ordering is consistent passed" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_multi_mount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_multi_mount.ksh index e015d0affa..bd86eaa16b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_multi_mount.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_multi_mount.ksh @@ -59,7 +59,12 @@ log_must mkfile 128k $FILENAME log_must exec 9<> $FILENAME # open file # 3. Lazy umount -log_must umount -l $MNTPFS +if is_freebsd; then + # FreeBSD does not support lazy unmount + log_must umount $MNTPFS +else + log_must umount -l $MNTPFS +fi if [ -f $FILENAME ]; then log_fail "Lazy unmount failed" fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh index 1d769096b4..3788543b0b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh @@ -91,14 +91,28 @@ typeset -a pos_cmds_out=( } } }") + +# +# N.B. 
json.tool is needed to guarantee consistent ordering of fields, +# sed is needed to trim trailing space in CentOS 6's json.tool output +# +# As of Python 3.5 the behavior of json.tool changed to keep the order +# the same as the input and the --sort-keys option was added. Detect when +# --sort-keys is supported and apply the option to ensure the expected order. +# +if python -m json.tool --sort-keys <<< "{}"; then + JSON_TOOL_CMD="python -m json.tool --sort-keys" +else + JSON_TOOL_CMD="python -m json.tool" +fi + typeset -i cnt=0 typeset cmd for cmd in ${pos_cmds[@]}; do log_must zfs program $TESTPOOL $TESTZCP $TESTDS $cmd 2>&1 - log_must zfs program $TESTPOOL -j $TESTZCP $TESTDS $cmd 2>&1 - # json.tool is needed to guarantee consistent ordering of fields - # sed is needed to trim trailing space in CentOS 6's json.tool output - OUTPUT=$(zfs program $TESTPOOL -j $TESTZCP $TESTDS $cmd 2>&1 | python -m json.tool | sed 's/[[:space:]]*$//') + log_must zfs program -j $TESTPOOL $TESTZCP $TESTDS $cmd 2>&1 + OUTPUT=$(zfs program -j $TESTPOOL $TESTZCP $TESTDS $cmd 2>&1 | + $JSON_TOOL_CMD | sed 's/[[:space:]]*$//') if [ "$OUTPUT" != "${pos_cmds_out[$cnt]}" ]; then log_note "Got :$OUTPUT" log_note "Expected:${pos_cmds_out[$cnt]}" @@ -120,9 +134,9 @@ For the property list, run: zfs set|get For the delegated permission list, run: zfs allow|unallow") cnt=0 for cmd in ${neg_cmds[@]}; do - log_mustnot zfs program $TESTPOOL $TESTZCP $TESTDS $cmd 2>&1 - log_mustnot zfs program $TESTPOOL -j $TESTZCP $TESTDS $cmd 2>&1 - OUTPUT=$(zfs program $TESTPOOL -j $TESTZCP $TESTDS $cmd 2>&1) + log_mustnot zfs program $cmd $TESTPOOL $TESTZCP $TESTDS 2>&1 + log_mustnot zfs program -j $cmd $TESTPOOL $TESTZCP $TESTDS 2>&1 + OUTPUT=$(zfs program -j $cmd $TESTPOOL $TESTZCP $TESTDS 2>&1) if [ "$OUTPUT" != "${neg_cmds_out[$cnt]}" ]; then log_note "Got :$OUTPUT" log_note "Expected:${neg_cmds_out[$cnt]}" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh index 0bf7c5b6a1..dc3ffd65ed 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh @@ -50,8 +50,7 @@ function cleanup if snapexists $csnap; then log_must zfs promote $fs fi - snapexists $snap && \ - log_must zfs destroy -rR $snap + snapexists $snap && destroy_dataset $snap -rR typeset data for data in $file0 $file1; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh index e0d0e8457a..7dedaf91be 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh @@ -54,8 +54,7 @@ function cleanup typeset ds typeset data for ds in ${snap[*]}; do - snapexists $ds && \ - log_must zfs destroy -rR $ds + snapexists $ds && destroy_dataset $ds -rR done for data in ${file[*]}; do [[ -e $data ]] && rm -f $data diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh index 23b5991084..b8a5ab9c17 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh @@ -53,8 +53,7 @@ function cleanup typeset ds typeset data for ds in ${snap[*]}; do - snapexists $ds && \ - log_must zfs destroy -rR $ds + snapexists $ds && destroy_dataset $ds -rR done for data in ${file[*]}; do [[ -e $data ]] && rm -f $data diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh index c669a44eb0..289ddc6713 100755 --- 
a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh @@ -46,8 +46,8 @@ verify_runnable "both" function cleanup { if datasetexists $fssnap ; then - datasetexists $clone && log_must zfs destroy $clone - log_must zfs destroy $fssnap + datasetexists $clone && destroy_dataset $clone + destroy_dataset $fssnap fi if datasetexists $clone ; then log_must zfs promote $fs diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh index 286c14ac12..7f08f28a93 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh @@ -62,13 +62,9 @@ set -A args "" \ function cleanup { - if datasetexists $clone; then - log_must zfs destroy $clone - fi + datasetexists $clone && destroy_dataset $clone - if datasetexists $recvfs; then - log_must zfs destroy -r $recvfs - fi + datasetexists $recvfs && destroy_dataset $recvfs -r if snapexists $snap; then destroy_snapshot $snap diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh index 3f8ee1941d..95db7d9e6b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh @@ -47,8 +47,7 @@ verify_runnable "both" function cleanup { - snapexists $snap && \ - log_must zfs destroy -rR $snap + snapexists $snap && destroy_dataset $snap -rR typeset data for data in $TESTDIR/$TESTFILE0 $TESTDIR/$TESTFILE1; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh index 
336c7b2538..fd6ed7e58e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh @@ -29,11 +29,12 @@ # 1. Create an encrypted dataset # 2. Clone the encryption root # 3. Clone the clone -# 4. Verify the encryption root of all three datasets is the origin +# 4. Add children to each of these three datasets +# 4. Verify the encryption root of all datasets is the origin # 5. Promote the clone of the clone -# 6. Verify the encryption root of all three datasets is still the origin -# 7. Promote the clone of the original encryption root -# 8. Verify the encryption root of all three datasets is the promoted dataset +# 6. Verify the encryption root of all datasets is still the origin +# 7. Promote the dataset again, so it is now the encryption root +# 8. Verify the encryption root of all datasets is the promoted dataset # verify_runnable "both" @@ -41,11 +42,11 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -Rf datasetexists $TESTPOOL/clone1 && \ - log_must zfs destroy -Rf $TESTPOOL/clone1 + destroy_dataset $TESTPOOL/clone1 -Rf datasetexists $TESTPOOL/clone2 && \ - log_must zfs destroy -Rf $TESTPOOL/clone2 + destroy_dataset $TESTPOOL/clone2 -Rf } log_onexit cleanup @@ -62,19 +63,31 @@ log_must zfs snap $snaproot log_must zfs clone $snaproot $TESTPOOL/clone1 log_must zfs snap $snapclone log_must zfs clone $snapclone $TESTPOOL/clone2 +log_must zfs create $TESTPOOL/$TESTFS1/child0 +log_must zfs create $TESTPOOL/clone1/child1 +log_must zfs create $TESTPOOL/clone2/child2 log_must verify_encryption_root $TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS1 log_must verify_encryption_root $TESTPOOL/clone1 $TESTPOOL/$TESTFS1 log_must verify_encryption_root $TESTPOOL/clone2 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/child0 
$TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/clone1/child1 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/clone2/child2 $TESTPOOL/$TESTFS1 log_must zfs promote $TESTPOOL/clone2 log_must verify_encryption_root $TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS1 log_must verify_encryption_root $TESTPOOL/clone1 $TESTPOOL/$TESTFS1 log_must verify_encryption_root $TESTPOOL/clone2 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/child0 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/clone1/child1 $TESTPOOL/$TESTFS1 +log_must verify_encryption_root $TESTPOOL/clone2/child2 $TESTPOOL/$TESTFS1 log_must zfs promote $TESTPOOL/clone2 log_must verify_encryption_root $TESTPOOL/$TESTFS1 $TESTPOOL/clone2 log_must verify_encryption_root $TESTPOOL/clone1 $TESTPOOL/clone2 log_must verify_encryption_root $TESTPOOL/clone2 $TESTPOOL/clone2 +log_must verify_encryption_root $TESTPOOL/$TESTFS1/child0 $TESTPOOL/clone2 +log_must verify_encryption_root $TESTPOOL/clone1/child1 $TESTPOOL/clone2 +log_must verify_encryption_root $TESTPOOL/clone2/child2 $TESTPOOL/clone2 log_pass "ZFS promotes clones of an encryption root" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh index bf94274ddb..f31ff48099 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh @@ -11,13 +11,13 @@ # # -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
# # # DESCRIPTION # Verify that "zfs list" gives correct values for written and written@ -# proerties for the dataset when different operations are on done on it +# properties for the dataset when different operations are on done on it # # # STRATEGY @@ -36,7 +36,7 @@ function cleanup { for ds in $datasets; do - datasetexists $ds && log_must zfs destroy -R $TESTPOOL/$TESTFS1 + datasetexists $ds && destroy_dataset $TESTPOOL/$TESTFS1 -R done } function get_prop_mb @@ -86,7 +86,7 @@ blocks=0 for i in 1 2 3; do written=$(get_prop written $TESTPOOL/$TESTFS1@snap$i) if [[ $blocks -eq 0 ]]; then - # Written value for the frist non-clone snapshot is + # Written value for the first non-clone snapshot is # expected to be equal to the referenced value. expected_written=$( \ get_prop referenced $TESTPOOL/$TESTFS1@snap$i) @@ -120,7 +120,7 @@ sync_pool written=$(get_prop written $TESTPOOL/$TESTFS1) writtenat3=$(get_prop written@snap3 $TESTPOOL/$TESTFS1) [[ $written -eq $writtenat3 ]] || \ - log_fail "Written and written@ dont match $written $writtenat3" + log_fail "Written and written@ don't match $written $writtenat3" within_percent $written $before_written 0.1 && \ log_fail "Unexpected written value after delete $written $before_written" writtenat=$(get_prop written@snap1 $TESTPOOL/$TESTFS1) @@ -216,15 +216,15 @@ for ds in $datasets; do count=$blocks sync_pool done -recursive_output=$(zfs get -r written@current $TESTPOOL | \ +recursive_output=$(zfs get -p -r written@current $TESTPOOL | \ grep -v $TESTFS1@ | grep -v $TESTFS2@ | grep -v $TESTFS3@ | \ grep -v "VALUE" | grep -v "-") -expected="20.0M" +expected="$((20 * mb_block))" for ds in $datasets; do writtenat=$(echo "$recursive_output" | grep -v $ds/) writtenat=$(echo "$writtenat" | grep $ds | awk '{print $3}') - [[ $writtenat == $expected ]] || \ - log_fail "recursive written property output mismatch" + within_percent $writtenat $expected 99.5 || \ + log_fail "Unexpected written@ value on $ds" done log_pass "zfs written and 
written@ property fields print correct values" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am index bf112a77e6..aa23c71bdb 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am @@ -17,9 +17,17 @@ dist_pkgdata_SCRIPTS = \ zfs_receive_013_pos.ksh \ zfs_receive_014_pos.ksh \ zfs_receive_015_pos.ksh \ + zfs_receive_016_pos.ksh \ receive-o-x_props_override.ksh \ + receive-o-x_props_aliases.ksh \ zfs_receive_from_encrypted.ksh \ + zfs_receive_from_zstd.ksh \ + zfs_receive_new_props.ksh \ zfs_receive_to_encrypted.ksh \ zfs_receive_raw.ksh \ zfs_receive_raw_incremental.ksh \ + zfs_receive_raw_-d.ksh \ zfs_receive_-e.ksh + +dist_pkgdata_DATA = \ + zstd_test_data.txt diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh new file mode 100755 index 0000000000..d4b0aa2341 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh @@ -0,0 +1,213 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2017, loli10K . All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib + +# +# DESCRIPTION: +# Verify ZFS property override (-o) and exclude (-x) options work when +# receiving a send stream, using property name aliases +# +# STRATEGY: +# 1. Create a filesystem with children. +# 2. Snapshot the filesystems. +# 3. Create various send streams (full, incremental, replication) and verify +# we can both override and exclude aliased properties. +# + +verify_runnable "both" + +function cleanup +{ + log_must rm -f $streamfile_full + log_must rm -f $streamfile_incr + log_must rm -f $streamfile_repl + log_must rm -f $streamfile_trun + destroy_dataset "$orig" "-rf" + destroy_dataset "$dest" "-rf" +} + +log_assert "ZFS receive property alias override and exclude options work as expected." +log_onexit cleanup + +orig=$TESTPOOL/$TESTFS1 +origsub=$orig/sub +dest=$TESTPOOL/$TESTFS2 +destsub=$dest/sub +typeset streamfile_full=$TESTDIR/streamfile_full.$$ +typeset streamfile_incr=$TESTDIR/streamfile_incr.$$ +typeset streamfile_repl=$TESTDIR/streamfile_repl.$$ +typeset streamfile_trun=$TESTDIR/streamfile_trun.$$ + +# +# 3.1 Verify we can't specify the same property in multiple -o or -x options +# or an invalid value was specified. 
+# +# Create a full send stream +log_must zfs create $orig +log_must zfs snapshot $orig@snap1 +log_must eval "zfs send $orig@snap1 > $streamfile_full" +# Verify we reject invalid options +log_mustnot eval "zfs recv $dest -o compress < $streamfile_full" +log_mustnot eval "zfs recv $dest -x compress=off < $streamfile_full" +log_mustnot eval "zfs recv $dest -o compress=off -x compress < $streamfile_full" +log_mustnot eval "zfs recv $dest -o compress=off -o compress=on < $streamfile_full" +log_mustnot eval "zfs recv $dest -x compress -x compress < $streamfile_full" +log_mustnot eval "zfs recv $dest -o version=1 < $streamfile_full" +log_mustnot eval "zfs recv $dest -x version < $streamfile_full" +log_mustnot eval "zfs recv $dest -x normalization < $streamfile_full" +# Verify we also reject invalid ZVOL options +log_must zfs create -V 32K -s $orig/zvol +log_must eval "zfs send $orig@snap1 > $streamfile_full" +log_mustnot eval "zfs recv $dest -x volblock < $streamfile_full" +log_mustnot eval "zfs recv $dest -o volblock=32K < $streamfile_full" +# Cleanup +block_device_wait +log_must_busy zfs destroy -r -f $orig + +# +# 3.2 Verify -o property=value works on streams without properties. +# +# Create a full send stream +log_must zfs create $orig +log_must zfs snapshot $orig@snap1 +log_must eval "zfs send $orig@snap1 > $streamfile_full" +# Receive the full stream, override some properties +log_must eval "zfs recv -o compress=on -o '$userprop:dest'='$userval' "\ + "$dest < $streamfile_full" +log_must eval "check_prop_source $dest compression on local" +log_must eval "check_prop_source $dest '$userprop:dest' '$userval' local" +# Cleanup +log_must zfs destroy -r -f $orig +log_must zfs destroy -r -f $dest + +# +# 3.3 Verify -o property=value and -x work +# for an incremental replication send stream. 
+# +# Create a dataset tree and receive it +log_must zfs create $orig +log_must zfs create $origsub +log_must zfs snapshot -r $orig@snap1 +log_must eval "zfs send -R $orig@snap1 > $streamfile_repl" +log_must eval "zfs recv $dest < $streamfile_repl" +# Fill the datasets with properties and create an incremental replication stream +log_must zfs snapshot -r $orig@snap2 +log_must zfs snapshot -r $orig@snap3 +log_must eval "zfs set copies=2 $orig" +log_must eval "zfs set dnsize=4k $orig" +log_must eval "zfs set compression=gzip $origsub" +log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr" +# Sets various combination of override and exclude options +log_must eval "zfs recv -F -o atime=off -o quota=123456789 -o checksum=sha512" \ + " -o dnsize=2k -x compress $dest < $streamfile_incr" +# Verify we can correctly override and exclude properties +log_must eval "check_prop_source $dest copies 2 received" +log_must eval "check_prop_source $dest atime off local" +log_must eval "check_prop_source $dest quota 123456789 local" +log_must eval "check_prop_source $dest checksum sha512 local" +log_must eval "check_prop_source $dest dnodesize 2k local" +log_must eval "check_prop_inherit $destsub copies $dest" +log_must eval "check_prop_inherit $destsub atime $dest" +log_must eval "check_prop_inherit $destsub checksum $dest" +log_must eval "check_prop_source $destsub quota 0 default" +log_must eval "check_prop_source $destsub compression off default" +# Cleanup +log_must zfs destroy -r -f $orig +log_must zfs destroy -r -f $dest + +# +# 3.4 Verify '-x property' does not remove existing local properties and a +# modified sent property is received and updated to the new value but can +# still be excluded. 
+# +# Create a dataset tree +log_must zfs create $orig +log_must zfs create $origsub +log_must zfs snapshot -r $orig@snap1 +log_must eval "zfs set copies=2 $orig" +log_must eval "zfs send -R $orig@snap1 > $streamfile_repl" +log_must eval "zfs receive $dest < $streamfile_repl" +log_must eval "check_prop_source $dest copies 2 received" +log_must eval "check_prop_inherit $destsub copies $dest" +# Set new custom properties on both source and destination +log_must eval "zfs set copies=3 $orig" +log_must eval "zfs set compression=on $orig" +log_must eval "zfs set compression=lzjb $origsub" +log_must eval "zfs set compression=gzip $dest" +# Receive the new stream, verify we preserve locally set properties +log_must zfs snapshot -r $orig@snap2 +log_must zfs snapshot -r $orig@snap3 +log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr" +log_must eval "zfs recv -F -x copies -x compress $dest < $streamfile_incr" +log_must eval "check_prop_source $dest copies 1 default" +log_must eval "check_prop_received $dest copies 3" +log_must eval "check_prop_source $destsub copies 1 default" +log_must eval "check_prop_received $destsub copies '-'" +log_must eval "check_prop_source $dest compression gzip local" +log_must eval "check_prop_inherit $destsub compression $dest" +# Cleanup +log_must zfs destroy -r -f $orig +log_must zfs destroy -r -f $dest + +# +# 3.6 Verify we correctly restore existing properties on a failed receive +# +# Receive a "clean" dataset tree +log_must zfs create $orig +log_must zfs create $origsub +log_must zfs snapshot -r $orig@snap1 +log_must eval "zfs send -R $orig@snap1 > $streamfile_repl" +log_must eval "zfs receive $dest < $streamfile_repl" +# Set custom properties on the destination +log_must eval "zfs set compress=on $dest" +log_must eval "zfs set compress=lzjb $destsub" +# Create a truncated incremental replication stream +mntpnt=$(get_prop mountpoint $orig) +log_must eval "dd if=/dev/urandom of=$mntpnt/file bs=1024k count=10" +log_must 
zfs snapshot -r $orig@snap2 +log_must zfs snapshot -r $orig@snap3 +log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr" +log_must eval "dd if=$streamfile_incr of=$streamfile_trun bs=1024k count=9" +# Receive the truncated stream, verify original properties are kept +log_mustnot eval "zfs recv -F -o copies=3 -o compress=gzip "\ + "$dest < $streamfile_trun" +log_must eval "check_prop_source $dest copies 1 default" +log_must eval "check_prop_source $destsub copies 1 default" +log_must eval "check_prop_source $dest compression on local" +log_must eval "check_prop_source $destsub compression lzjb local" +# Cleanup +log_must zfs destroy -r -f $orig +log_must zfs destroy -r -f $dest + +# +# 3.7 Verify that we can't get around checking a property is readonly +# by using the alias or receiving a parent replication stream. +log_must zfs create $orig +log_must zfs create -V 128K -s $origsub +log_must zfs snapshot -r $orig@snap1 +log_must eval "zfs send -R $orig@snap1 > $streamfile_repl" +log_mustnot eval "zfs receive -o volblock=64k $dest < $streamfile_repl" +# Cleanup +block_device_wait +log_must_busy zfs destroy -r -f $orig + +log_pass "ZFS receive property alias override and exclude options passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh index 6f897a96f3..2d3c15c62f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh @@ -259,16 +259,21 @@ log_must zfs destroy -r -f $orig log_must zfs destroy -r -f $dest # -# 3.7 Verify we can't receive a send stream overriding or excluding properties -# invalid for the dataset type unless the stream it's recursive, in which -# case only the appropriate properties are set on the destination. 
-# +# 3.7 Verify we can receive a send stream excluding but not overriding +# properties invalid for the dataset type, in which case only the +# appropriate properties are set on the destination. log_must zfs create -V 128K -s $orig log_must zfs snapshot $orig@snap1 log_must eval "zfs send $orig@snap1 > $streamfile_full" -log_mustnot eval "zfs receive -x atime $dest < $streamfile_full" log_mustnot eval "zfs receive -o atime=off $dest < $streamfile_full" +log_mustnot eval "zfs receive -o atime=off -x canmount $dest < $streamfile_full" +log_must eval "zfs receive -x atime -x canmount $dest < $streamfile_full" +log_must eval "check_prop_source $dest type volume -" +log_must eval "check_prop_source $dest atime - -" +log_must eval "check_prop_source $dest canmount - -" log_must_busy zfs destroy -r -f $orig +log_must_busy zfs destroy -r -f $dest +# Recursive sends also accept (and ignore) such overrides log_must zfs create $orig log_must zfs create -V 128K -s $origsub log_must zfs snapshot -r $orig@snap1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh index 5ce0e02fa6..8a6cd8c409 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh @@ -48,11 +48,9 @@ function cleanup { typeset -i i=0 - datasetexists $rst_root && \ - log_must zfs destroy -Rf $rst_root + datasetexists $rst_root && destroy_dataset $rst_root -Rf while (( i < 2 )); do - snapexists ${orig_snap[$i]} && \ - log_must zfs destroy -f ${orig_snap[$i]} + snapexists ${orig_snap[$i]} && destroy_dataset ${orig_snap[$i]} -f log_must rm -f ${bkup[$i]} (( i = i + 1 )) @@ -63,8 +61,7 @@ function cleanup function recreate_root { - datasetexists $rst_root && \ - log_must zfs destroy -Rf $rst_root + datasetexists $rst_root && destroy_dataset $rst_root -Rf if [[ -d $TESTDIR1 ]] ; then 
log_must rm -rf $TESTDIR1 fi @@ -155,7 +152,7 @@ for orig_fs in $datasets ; do log_must zfs destroy -Rf $rst_fs - log_note "Verfiying 'zfs receive -d ' works." + log_note "Verifying 'zfs receive -d ' works." i=0 while (( i < ${#bkup[*]} )); do diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh index 36af37a757..ba3fc49bd8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh @@ -50,10 +50,8 @@ function cleanup typeset ds while (( i < ${#orig_snap[*]} )); do - snapexists ${rst_snap[$i]} && \ - log_must zfs destroy -f ${rst_snap[$i]} - snapexists ${orig_snap[$i]} && \ - log_must zfs destroy -f ${orig_snap[$i]} + snapexists ${rst_snap[$i]} && destroy_dataset ${rst_snap[$i]} -f + snapexists ${orig_snap[$i]} && destroy_dataset ${orig_snap[$i]} -f [[ -e ${bkup[$i]} ]] && \ log_must rm -rf ${bkup[$i]} @@ -61,8 +59,7 @@ function cleanup done for ds in $rst_vol $rst_root; do - datasetexists $ds && \ - log_must zfs destroy -Rf $ds + datasetexists $ds && destroy_dataset $ds -Rf done } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh index d5f6e0984d..cce3876153 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh @@ -49,7 +49,7 @@ verify_runnable "both" function cleanup { for snap in $snap2 $snap1; do - datasetexists $snap && log_must zfs destroy -rf $snap + datasetexists $snap && destroy_dataset $snap -rf done for file in $ibackup $mntpnt/file1 $mntpnt/file2; do [[ -f $file ]] && log_must rm -f $file diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh index fcbdc5e159..7c115ee33b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh @@ -36,7 +36,7 @@ # Verify 'zfs receive' fails with malformed parameters. # # STRATEGY: -# 1. Denfine malformed parameters array +# 1. Define malformed parameters array # 2. Feed the malformed parameters to 'zfs receive' # 3. Verify the command should be failed # @@ -49,8 +49,7 @@ function cleanup typeset bkup for snap in $init_snap $inc_snap $init_topsnap $inc_topsnap ; do - snapexists $snap && \ - log_must zfs destroy -Rf $snap + snapexists $snap && destroy_dataset $snap -Rf done for bkup in $full_bkup $inc_bkup $full_topbkup $inc_topbkup; do @@ -92,16 +91,11 @@ sync set -A badargs \ "" "nonexistent-snap" "blah@blah" "-d" "-d nonexistent-dataset" \ - "$TESTPOOL/$TESTFS" "$TESTPOOL1" "$TESTPOOL/fs@" "$TESTPOOL/fs@@mysnap" \ + "$TESTPOOL1" "$TESTPOOL/fs@" "$TESTPOOL/fs@@mysnap" \ "$TESTPOOL/fs@@" "$TESTPOOL/fs/@mysnap" "$TESTPOOL/fs@/mysnap" \ "$TESTPOOL/nonexistent-fs/nonexistent-fs" "-d $TESTPOOL/nonexistent-fs" \ "-d $TESTPOOL/$TESTFS/nonexistent-fs" -if is_global_zone ; then - typeset -i n=${#badargs[@]} - badargs[$n]="-d $TESTPOOL" -fi - typeset -i i=0 while (( i < ${#badargs[*]} )) do diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh index 4cbc7e3390..d8c71f2c28 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh @@ -53,12 +53,10 @@ function cleanup typeset bkup for snap in $init_snap $inc_snap; do - snapexists $snap && \ - log_must zfs destroy -f $snap + snapexists $snap && destroy_dataset $snap -f done - datasetexists $rst_root && \ - 
log_must zfs destroy -Rf $rst_root + datasetexists $rst_root && destroy_dataset $rst_root -Rf for bkup in $full_bkup $inc_bkup; do [[ -e $bkup ]] && \ @@ -82,8 +80,8 @@ log_must zfs snapshot $init_snap log_must eval "zfs send $init_snap > $full_bkup" log_note "'zfs receive' fails with invalid send streams." -log_mustnot eval "zfs receive $rst_init_snap < /dev/zero" -log_mustnot eval "zfs receive -d $rst_root $fbackup" for opt in "-v" "-vn"; do - if datasetexists $rst_fs; then - log_must zfs destroy -fr $rst_fs - fi + datasetexists $rst_fs && destroy_dataset $rst_fs -fr log_note "Check ZFS receive $opt []" log_must eval "zfs receive $opt $rst_fs < $fbackup > $tmp_out 2>&1" if [[ $opt == "-v" ]]; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh index d028acafad..37fe515e23 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh @@ -48,13 +48,10 @@ function cleanup { typeset ds - if snapexists $snap; then - log_must zfs destroy $snap - fi + snapexists $snap && destroy_dataset $snap + for ds in $ctr1 $ctr2 $fs1; do - if datasetexists $ds; then - log_must zfs destroy -rf $ds - fi + datasetexists $ds && destroy_dataset $ds -rf done if [[ -d $TESTDIR2 ]]; then rm -rf $TESTDIR2 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh index 5d7a7043b1..e1e93e9d2a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh @@ -39,7 +39,7 @@ tpoolfile=$TEST_BASE_DIR/temptank.$$ function cleanup { for fs in $src_fs $dst_fs; do - datasetexists $fs && log_must zfs destroy -rf $fs + datasetexists $fs && log_must 
destroy_dataset $fs -rf done zpool destroy $temppool [[ -f $streamfile ]] && log_must rm -f $streamfile @@ -67,6 +67,8 @@ zfs snapshot $src_fs@snap3 log_must eval "zfs send -D -R $src_fs@snap3 > $streamfile" log_must eval "zfs receive -v $dst_fs < $streamfile" +log_must zfs destroy -r $dst_fs +log_must eval "zstream redup $streamfile | zfs receive -v $dst_fs" cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh index be04aed2b2..989d31b906 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh @@ -55,31 +55,6 @@ function cleanup log_must zfs destroy -rf $dest } -# -# Verify property $2 is set from source $4 on dataset $1 and has value $3. -# -# $1 checked dataset -# $2 user property -# $3 property value -# $4 source -# -function check_prop_source -{ - typeset dataset=$1 - typeset prop=$2 - typeset value=$3 - typeset source=$4 - typeset chk_value=$(get_prop "$prop" "$dataset") - typeset chk_source=$(get_source "$prop" "$dataset") - if [[ "$chk_value" != "$value" || \ - "$chk_source" != "$4" ]] - then - return 1 - else - return 0 - fi -} - log_assert "ZFS successfully receive and restore properties." log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh new file mode 100755 index 0000000000..04d20ebd39 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2020 by Mariusz Zaborski . + +# +# DESCRIPTION: +# Verify 'zfs recv' can forcibly unmount filesystem while receiving +# stream. +# +# STRATEGY: +# 1. Create snapshot of file system +# 2. Make a zfs filesystem mountpoint busy +# 3. Receive filesystem with force flag. +# 4. Verify that stream was received or failed on Linux. +# + +. $STF_SUITE/tests/functional/cli_root/cli_common.kshlib + +verify_runnable "both" + +function cleanup +{ + cd $curpath + + for snap in $init_snap $rst_snap; do + snapexists $snap && \ + destroy_snapshot $snap + done + + datasetexists $rst_root && \ + destroy_dataset $rst_root + + for file in $full_bkup + do + [[ -e $file ]] && \ + log_must rm -f $file + done + + [[ -d $TESTDIR1 ]] && \ + log_must rm -rf $TESTDIR1 +} + +log_assert "Verify 'zfs recv' can forcibly unmount busy filesystem." +log_onexit cleanup + +curpath=`dirname $0` +init_snap=$TESTPOOL/$TESTFS@init_snap +full_bkup=$TEST_BASE_DIR/fullbkup.$$ +rst_root=$TESTPOOL/rst_ctr +rst_snap=$rst_root@init_snap + +log_note "Verify 'zfs recv' can forcible unmount busy filesystem." + +# Preparation +log_must zfs create $rst_root +[[ ! -d $TESTDIR1 ]] && \ + log_must mkdir -p $TESTDIR1 +log_must zfs set mountpoint=$TESTDIR1 $rst_root + +log_must zfs snapshot $init_snap +log_must eval "zfs send $init_snap > $full_bkup" + +# Test +log_must cd $TESTDIR1 +if is_linux; then + # Linux does not support it. + log_mustnot zfs receive -MF $rst_snap < $full_bkup +else + log_must zfs receive -MF $rst_snap < $full_bkup +fi + +log_pass "The busy filesystem was unmounted or busy as expected." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh index 5eee9eecf4..8914326852 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh @@ -31,9 +31,9 @@ # 4. Snapshot the encrypted dataset # 5. Attempt to receive the snapshot into an unencrypted child # 6. Verify encryption is not enabled -# 7. Verify the cheksum of the file is the same as the original +# 7. Verify the checksum of the file is the same as the original # 8. Attempt to receive the snapshot into an encrypted child -# 9. Verify the cheksum of the file is the same as the original +# 9. Verify the checksum of the file is the same as the original # verify_runnable "both" @@ -41,10 +41,10 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r } log_onexit cleanup @@ -59,7 +59,7 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \ "-o keyformat=passphrase $TESTPOOL/$TESTFS2" log_must mkfile 1M /$TESTPOOL/$TESTFS2/$TESTFILE0 -typeset checksum=$(md5sum /$TESTPOOL/$TESTFS2/$TESTFILE0 | awk '{ print $1 }') +typeset checksum=$(md5digest /$TESTPOOL/$TESTFS2/$TESTFILE0) log_must zfs snapshot $snap @@ -69,14 +69,14 @@ log_must eval "zfs send $snap | zfs receive $TESTPOOL/$TESTFS1/c1" crypt=$(get_prop encryption $TESTPOOL/$TESTFS1/c1) [[ "$crypt" == "off" ]] || log_fail "Received unencrypted stream as encrypted" -typeset cksum1=$(md5sum /$TESTPOOL/$TESTFS1/c1/$TESTFILE0 | awk '{ print $1 }') +typeset cksum1=$(md5digest /$TESTPOOL/$TESTFS1/c1/$TESTFILE0) [[ "$cksum1" == "$checksum" ]] || \ log_fail "Checksums 
differ ($cksum1 != $checksum)" log_note "Verify ZFS can receive into an encrypted child" log_must eval "zfs send $snap | zfs receive $TESTPOOL/$TESTFS2/c1" -typeset cksum2=$(md5sum /$TESTPOOL/$TESTFS2/c1/$TESTFILE0 | awk '{ print $1 }') +typeset cksum2=$(md5digest /$TESTPOOL/$TESTFS2/c1/$TESTFILE0) [[ "$cksum2" == "$checksum" ]] || \ log_fail "Checksums differ ($cksum2 != $checksum)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh new file mode 100755 index 0000000000..72eebb4f93 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 The FreeBSD Foundation [1] +# +# [1] Portions of this software were developed by Allan Jude +# under sponsorship from the FreeBSD Foundation. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# ZFS should receive a ZSTD compressed block and be able to determine the level +# +# STRATEGY: +# 1. Create a ZSTD compressed dataset (random level) +# 2. Create and checksum a file on the compressed dataset +# 3. Snapshot the compressed dataset +# 4. Attempt to receive the snapshot into a new dataset +# 5. Verify the checksum of the file is the same as the original +# 6. 
Verify the compression level is correctly stored +# + +verify_runnable "both" + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -r + + datasetexists $TESTPOOL/$TESTFS2 && \ + destroy_dataset $TESTPOOL/$TESTFS2 -r +} + +log_onexit cleanup + +log_assert "ZFS should track compression level when receiving a ZSTD stream" + +typeset src_data="$STF_SUITE/tests/functional/cli_root/zfs_receive/zstd_test_data.txt" +typeset snap="$TESTPOOL/$TESTFS1@snap" + +random_level=$((RANDOM%19 + 1)) +log_note "Randomly selected ZSTD level: $random_level" + +log_must zfs create -o compress=zstd-$random_level $TESTPOOL/$TESTFS1 +# Make a 5kb compressible file +log_must cat $src_data $src_data $src_data $src_data $src_data \ + > /$TESTPOOL/$TESTFS1/$TESTFILE0 +typeset checksum=$(md5digest /$TESTPOOL/$TESTFS1/$TESTFILE0) + +log_must zfs snapshot $snap + +# get object number of file +listing=$(ls -i /$TESTPOOL/$TESTFS1/$TESTFILE0) +set -A array $listing +obj=${array[0]} +log_note "file /$TESTPOOL/$TESTFS1/$TESTFILE0 has object number $obj" + +output=$(zdb -Zddddddbbbbbb $TESTPOOL/$TESTFS1 $obj 2> /dev/null \ + |grep -m 1 "L0 DVA" |head -n1) +dva=$(sed -Ene 's/^.+DVA\[0\]=<([^>]+)>.*$/\1/p' <<< "$output") +log_note "block 0 of /$TESTPOOL/$TESTFS1/$TESTFILE0 has a DVA of $dva" + +zstd_str=$(sed -Ene 's/^.+ ZSTD:size=([^:]+):version=([^:]+):level=([^:]+):.*$/\1:\2:\3/p' <<< "$output") +zstd_size1=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[1]}') +zstd_version1=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[2]}') +zstd_level1=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[3]}') +log_note "ZSTD src: size=$zstd_size1 version=$zstd_version1 level=$zstd_level1" + +log_note "Verify ZFS can receive the ZSTD compressed stream" +log_must eval "zfs send -ec $snap | zfs receive $TESTPOOL/$TESTFS2" + +typeset cksum1=$(md5digest /$TESTPOOL/$TESTFS2/$TESTFILE0) +[[ "$cksum1" == "$checksum" ]] || \ + log_fail 
"Checksums differ ($cksum1 != $checksum)" + +# get object number of file +listing=$(ls -i /$TESTPOOL/$TESTFS2/$TESTFILE0) +set -A array $listing +obj=${array[0]} +log_note "file /$TESTPOOL/$TESTFS2/$TESTFILE0 has object number $obj" + +output=$(zdb -Zddddddbbbbbb $TESTPOOL/$TESTFS2 $obj 2> /dev/null \ + |grep -m 1 "L0 DVA" |head -n1) +dva=$(sed -Ene 's/^.+DVA\[0\]=<([^>]+)>.*$/\1/p' <<< "$output") +log_note "block 0 of /$TESTPOOL/$TESTFS2/$TESTFILE0 has a DVA of $dva" + +zstd_str=$(sed -Ene 's/^.+ ZSTD:size=([^:]+):version=([^:]+):level=([^:]+):.*$/\1:\2:\3/p' <<< "$output") +zstd_size2=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[1]}') +(( $zstd_size2 != $zstd_size1 )) && log_fail \ +"ZFS recv failed: compressed size differs ($zstd_size2 != $zstd_size1)" +zstd_version2=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[2]}') +zstd_level2=$(echo "$zstd_str" |awk '{split($0,array,":")} END{print array[3]}') +log_note "ZSTD dest: size=$zstd_size2 version=$zstd_version2 level=$zstd_level2" +(( $zstd_level2 != $zstd_level1 )) && log_fail \ +"ZFS recv failed: compression level did not match header level ($zstd_level2 != $zstd_level1)" + +log_pass "ZFS can receive a ZSTD stream and determine the compression level" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_new_props.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_new_props.ksh new file mode 100755 index 0000000000..54f13355f5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_new_props.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# ZFS receive test to handle Issue #10698 +# +# STRATEGY: +# 1. Create a pool with filesystem_limits disabled +# 2. Create a filesystem on that pool +# 3. Enable filesystem limits on that pool +# 4. On a pool with filesystem limits enabled, create a filesystem and set a +# limit +# 5. Snapshot limited filesystem +# 6. send -R limited filesystem and receive over filesystem with limits disabled +# + +verify_runnable "both" + +function cleanup +{ + destroy_pool "$poolname" + destroy_pool "$rpoolname" + log_must rm -f "$vdevfile" + log_must rm -f "$rvdevfile" + log_must rm -f "$streamfile" +} + +log_onexit cleanup + +log_assert "ZFS should handle receiving streams with filesystem limits on \ + pools where the feature was recently enabled" + +poolname=sendpool +rpoolname=recvpool +vdevfile="$TEST_BASE_DIR/vdevfile.$$" +rvdevfile="$TEST_BASE_DIR/rvdevfile.$$" +sendfs="$poolname/fs" +recvfs="$rpoolname/rfs" +streamfile="$TEST_BASE_DIR/streamfile.$$" + +log_must truncate -s $MINVDEVSIZE "$rvdevfile" +log_must truncate -s $MINVDEVSIZE "$vdevfile" +log_must zpool create -O mountpoint=none -o feature@filesystem_limits=disabled \ + "$rpoolname" "$rvdevfile" +log_must zpool create -O mountpoint=none "$poolname" "$vdevfile" + +log_must zfs create "$recvfs" +log_must zpool set feature@filesystem_limits=enabled "$rpoolname" + +log_must zfs create -o filesystem_limit=100 "$sendfs" +log_must zfs snapshot "$sendfs@a" + +log_must zfs send -R "$sendfs@a" >"$streamfile" +log_must eval "zfs recv -svuF $recvfs <$streamfile" + +log_pass "ZFS can handle receiving streams with filesystem limits on \ + pools where the feature was recently enabled" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw.ksh index 2042b37a98..32b05e527a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw.ksh @@ -31,11 +31,12 @@ # 4. Attempt to receive a raw send stream as a child of an unencrypted dataset # 5. Verify the key is unavailable # 6. Attempt to load the key and mount the dataset -# 7. Verify the cheksum of the file is the same as the original +# 7. Verify the checksum of the file is the same as the original # 8. Attempt to receive a raw send stream as a child of an encrypted dataset # 9. Verify the key is unavailable # 10. Attempt to load the key and mount the dataset -# 11. Verify the cheksum of the file is the same as the original +# 11. Verify the checksum of the file is the same as the original +# 12. Verify 'zfs receive -n' works with the raw stream # verify_runnable "both" @@ -43,10 +44,10 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r } log_onexit cleanup @@ -60,8 +61,7 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \ "-o keyformat=passphrase $TESTPOOL/$TESTFS1" log_must mkfile 1M /$TESTPOOL/$TESTFS1/$TESTFILE0 -typeset checksum=$(md5sum /$TESTPOOL/$TESTFS1/$TESTFILE0 | \ - awk '{ print $1 }') +typeset checksum=$(md5digest /$TESTPOOL/$TESTFS1/$TESTFILE0) log_must zfs snapshot $snap @@ -74,7 +74,7 @@ keystatus=$(get_prop keystatus $TESTPOOL/$TESTFS2) log_must eval "echo $passphrase | zfs mount -l $TESTPOOL/$TESTFS2" -typeset cksum1=$(md5sum /$TESTPOOL/$TESTFS2/$TESTFILE0 | awk '{ print $1 }') +typeset cksum1=$(md5digest /$TESTPOOL/$TESTFS2/$TESTFILE0) [[ "$cksum1" == "$checksum" ]] || \ log_fail "Checksums differ ($cksum1 != 
$checksum)" @@ -85,9 +85,10 @@ keystatus=$(get_prop keystatus $TESTPOOL/$TESTFS1/c1) log_fail "Expected keystatus unavailable, got $keystatus" log_must eval "echo $passphrase | zfs mount -l $TESTPOOL/$TESTFS1/c1" -typeset cksum2=$(md5sum /$TESTPOOL/$TESTFS1/c1/$TESTFILE0 | \ - awk '{ print $1 }') +typeset cksum2=$(md5digest /$TESTPOOL/$TESTFS1/c1/$TESTFILE0) [[ "$cksum2" == "$checksum" ]] || \ log_fail "Checksums differ ($cksum2 != $checksum)" +log_must eval "zfs send -w $snap | zfs receive -n $TESTPOOL/$TESTFS3" + log_pass "ZFS can receive streams from raw sends" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh new file mode 100755 index 0000000000..662f9386eb --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# zfs receive -d should create the expected encryption hierarchy. +# +# STRATEGY: +# 1. Create an encrypted dataset and a inheriting child +# 2. Snapshot the child dataset +# 2. Create a recursive raw send file from the snapshot +# 3. Destroy the original child filesystem +# 4. Receive the snapshot as a child of the second dataset with '-d' +# 5. 
Verify the new child can be mounted +# + +verify_runnable "both" + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -r + rm -f $sendfile +} + +log_onexit cleanup + +log_assert "zfs receive -d should create the expected encryption hierarchy" + +typeset passphrase="password1" + +sendfile=$TEST_BASE_DIR/sendfile.$$ + +log_must eval "echo $passphrase | zfs create -o encryption=on" \ + "-o keyformat=passphrase $TESTPOOL/$TESTFS1" +log_must zfs create $TESTPOOL/$TESTFS1/child +log_must zfs snapshot $TESTPOOL/$TESTFS1/child@snap +log_must eval "zfs send -Rw $TESTPOOL/$TESTFS1/child@snap > $sendfile" +log_must zfs destroy -r $TESTPOOL/$TESTFS1/child +log_must zfs receive -Fd $TESTPOOL < $sendfile +log_must eval "echo $passphrase | zfs mount -l $TESTPOOL/$TESTFS1/child" + +log_pass "zfs receive -d creates the expected encryption hierarchy" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh index 1e91c6262c..7826ec9a48 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh @@ -43,10 +43,10 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r [[ -f $ibackup ]] && log_must rm -f $ibackup [[ -f $ibackup_trunc ]] && log_must rm -f $ibackup_trunc @@ -69,7 +69,7 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \ log_must zfs snapshot $snap1 log_must mkfile 1M /$TESTPOOL/$TESTFS1/$TESTFILE0 -typeset checksum=$(md5sum /$TESTPOOL/$TESTFS1/$TESTFILE0 | awk '{ print $1 }') +typeset checksum=$(md5digest 
/$TESTPOOL/$TESTFS1/$TESTFILE0) log_must zfs snapshot $snap2 @@ -77,7 +77,7 @@ log_must eval "zfs send -w $snap1 | zfs receive $TESTPOOL/$TESTFS2" log_must eval "echo $passphrase2 | zfs change-key $TESTPOOL/$TESTFS1" log_must eval "zfs send -w -i $snap1 $snap2 > $ibackup" -typeset trunc_size=$(stat -c %s $ibackup) +typeset trunc_size=$(stat_size $ibackup) trunc_size=$(expr $trunc_size - 64) log_must cp $ibackup $ibackup_trunc log_must truncate -s $trunc_size $ibackup_trunc @@ -89,7 +89,7 @@ log_must zfs unload-key $TESTPOOL/$TESTFS2 log_must eval "zfs receive $TESTPOOL/$TESTFS2 < $ibackup" log_must eval "echo $passphrase2 | zfs mount -l $TESTPOOL/$TESTFS2" -typeset cksum1=$(md5sum /$TESTPOOL/$TESTFS2/$TESTFILE0 | awk '{ print $1 }') +typeset cksum1=$(md5digest /$TESTPOOL/$TESTFS2/$TESTFILE0) [[ "$cksum1" == "$checksum" ]] || \ log_fail "Checksums differ ($cksum1 != $checksum)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh index 57896c6fd3..526497401f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh @@ -38,15 +38,15 @@ verify_runnable "both" function cleanup { - snapexists $snap && log_must_busy zfs destroy -f $snap + snapexists $snap && destroy_dataset $snap -f datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup -log_assert "ZFS should receive to an encrypted child dataset" +log_assert "ZFS should receive encrypted filesystems into child dataset" typeset passphrase="password" typeset snap="$TESTPOOL/$TESTFS@snap" @@ -60,11 +60,13 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \ log_note "Verifying ZFS will receive to an encrypted child" log_must eval "zfs send $snap | zfs receive 
$TESTPOOL/$TESTFS1/c1" -log_note "Verifying 'send -p' will not receive to an encrypted child" -log_mustnot eval "zfs send -p $snap | zfs receive $TESTPOOL/$TESTFS1/c2" +log_note "Verifying 'send -p' will receive to an encrypted child" +log_must eval "zfs send -p $snap | zfs receive $TESTPOOL/$TESTFS1/c2" +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS1/c2)" == "off" -log_note "Verifying 'send -R' will not receive to an encrypted child" -log_mustnot eval "zfs send -R $snap | zfs receive $TESTPOOL/$TESTFS1/c3" +log_note "Verifying 'send -R' will receive to an encrypted child" +log_must eval "zfs send -R $snap | zfs receive $TESTPOOL/$TESTFS1/c3" +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS1/c3)" == "off" log_note "Verifying ZFS will not receive to an encrypted child when the" \ "parent key is unloaded" @@ -72,4 +74,4 @@ log_must zfs unmount $TESTPOOL/$TESTFS1 log_must zfs unload-key $TESTPOOL/$TESTFS1 log_mustnot eval "zfs send $snap | zfs receive $TESTPOOL/$TESTFS1/c4" -log_pass "ZFS can receive to an encrypted child dataset" +log_pass "ZFS can receive encrypted filesystems into child dataset" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zstd_test_data.txt b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zstd_test_data.txt new file mode 100644 index 0000000000..da6a0c7d6e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zstd_test_data.txt @@ -0,0 +1 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim.. diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/zfs_remap_cliargs.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_remap/zfs_remap_cliargs.ksh deleted file mode 100755 index 80a5e6e0d9..0000000000 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/zfs_remap_cliargs.ksh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/ksh -p -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2018, loli10K . All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/removal/removal.kshlib - -# -# DESCRIPTION: -# 'zfs remap' should only work with supported parameters. -# -# STRATEGY: -# 1. Prepare a pool where a top-level VDEV has been removed -# 2. Verify every supported parameter to 'zfs remap' is accepted -# 3. Verify other unsupported parameters raise an error -# - -# The 'zfs remap' command has been disabled and may be removed. 
-export ZFS_REMAP_ENABLED=YES - -verify_runnable "both" - -function cleanup -{ - destroy_pool $TESTPOOL - rm -f $DISK1 $DISK2 -} - -log_assert "'zfs remap' should only work with supported parameters" -log_onexit cleanup - -f="$TESTPOOL/fs" -v="$TESTPOOL/vol" -s="$TESTPOOL/fs@snap" -b="$TESTPOOL/fs#bmark" -c="$TESTPOOL/clone" - -typeset goodparams=("$f" "$v" "$c") -typeset badparams=("-H" "-p" "-?" "$s" "$b" "$f $f" "$f $v" "$f $s") - -DISK1="$TEST_BASE_DIR/zfs_remap-1" -DISK2="$TEST_BASE_DIR/zfs_remap-2" - -# 1. Prepare a pool where a top-level VDEV has been removed -log_must truncate -s $(($MINVDEVSIZE * 2)) $DISK1 -log_must zpool create $TESTPOOL $DISK1 -log_must zfs create $f -log_must zfs create -V 1M -s $v -log_must zfs snap $s -log_must zfs bookmark $s $b -log_must zfs clone $s $c -log_must truncate -s $(($MINVDEVSIZE * 2)) $DISK2 -log_must zpool add $TESTPOOL $DISK2 -log_must zpool remove $TESTPOOL $DISK1 -log_must wait_for_removal $TESTPOOL - -# 2. Verify every supported parameter to 'zfs remap' is accepted -for param in "${goodparams[@]}" -do - log_must zfs remap $param -done - -# 3. Verify other unsupported parameters raise an error -for param in "${badparams[@]}" -do - log_mustnot zfs remap $param -done - -log_pass "'zfs remap' only works with supported parameters" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/zfs_remap_obsolete_counts.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_remap/zfs_remap_obsolete_counts.ksh deleted file mode 100755 index 1f0e0e85d8..0000000000 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/zfs_remap_obsolete_counts.ksh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/ksh -p -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. 
A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2018, loli10K . All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/removal/removal.kshlib - -# -# DESCRIPTION: -# 'zfs remap' depends on 'feature@obsolete_counts' being active -# -# STRATEGY: -# 1. Prepare a pool where a top-level VDEV has been removed and with -# feature@obsolete_counts disabled -# 2. Verify any 'zfs remap' command cannot be executed -# 3. Verify the same commands complete successfully when -# feature@obsolete_counts is enabled -# - -# N.B. The 'zfs remap' command has been disabled and may be removed. -export ZFS_REMAP_ENABLED=YES - -verify_runnable "both" - -function cleanup -{ - destroy_pool $TESTPOOL - rm -f $DISK1 $DISK2 -} - -log_assert "'zfs remap' depends on feature@obsolete_counts being active" -log_onexit cleanup - -f="$TESTPOOL/fs" -v="$TESTPOOL/vol" -s="$TESTPOOL/fs@snap" -c="$TESTPOOL/clone" - -DISK1="$TEST_BASE_DIR/zfs_remap-1" -DISK2="$TEST_BASE_DIR/zfs_remap-2" - -# 1. Prepare a pool where a top-level VDEV has been removed with -# feature@obsolete_counts disabled -log_must truncate -s $(($MINVDEVSIZE * 2)) $DISK1 -log_must zpool create -o feature@obsolete_counts=disabled $TESTPOOL $DISK1 -log_must zfs create $f -log_must zfs create -V 1M -s $v -log_must zfs snap $s -log_must zfs clone $s $c -log_must truncate -s $(($MINVDEVSIZE * 2)) $DISK2 -log_must zpool add $TESTPOOL $DISK2 -log_must zpool remove $TESTPOOL $DISK1 -log_must wait_for_removal $TESTPOOL - -# 2. Verify any 'zfs remap' command cannot be executed -log_mustnot zfs remap $f -log_mustnot zfs remap $v -log_mustnot zfs remap $c - -# 3. 
Verify the same commands complete successfully when -# feature@obsolete_counts is enabled -log_must zpool set feature@obsolete_counts=enabled $TESTPOOL -log_must zfs remap $f -log_must zfs remap $v -log_must zfs remap $c - -log_pass "'zfs remap' correctly depends on feature@obsolete_counts being active" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am index 406e278815..f8273d72c5 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am @@ -18,7 +18,8 @@ dist_pkgdata_SCRIPTS = \ zfs_rename_014_neg.ksh \ zfs_rename_encrypted_child.ksh \ zfs_rename_to_encrypted.ksh \ - zfs_rename_mountpoint.ksh + zfs_rename_mountpoint.ksh \ + zfs_rename_nounmount.ksh dist_pkgdata_DATA = \ zfs_rename.cfg \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename.kshlib index 9b8fb6b0ed..af1c2f7bed 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename.kshlib @@ -108,13 +108,11 @@ function cleanup ((i = i + 1)) done - if snapexists $TESTPOOL/$TESTFS@snapshot; then - log_must zfs destroy -fR $TESTPOOL/$TESTFS@snapshot - fi + snapexists $TESTPOOL/$TESTFS@snapshot && \ + destroy_dataset $TESTPOOL/$TESTFS@snapshot -fR - if datasetexists $TESTPOOL/$RECVFS; then - log_must zfs destroy -r $TESTPOOL/$RECVFS - fi + datasetexists $TESTPOOL/$RECVFS && \ + destroy_dataset $TESTPOOL/$RECVFS -r } function cmp_data #<$1 src data, $2 tgt data> diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh index 56c06cfe85..0bd4aca3a7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh @@ -44,7 +44,7 @@ verify_runnable "both" function cleanup { - datasetexists $snap && log_must zfs destroy $snap + datasetexists $snap && destroy_dataset $snap } log_assert "'zfs rename' can address the abbreviated snapshot name." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh index 3ad7d4e805..4d16051522 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh @@ -69,7 +69,7 @@ rename_dataset ${vol}-new $vol clone=$TESTPOOL/${snap}_clone create_clone $vol@$snap $clone -block_device_wait +block_device_wait $VOLDATA #verify data integrity for input in $VOL_R_PATH $ZVOL_RDEVDIR/$clone; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh index 3623d2bca1..2a3f8a8ccd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh @@ -46,9 +46,8 @@ verify_runnable "both" function cleanup { - if datasetexists $TESTPOOL/$TESTFS ; then - log_must zfs destroy -Rf $TESTPOOL/$TESTFS - fi + datasetexists $TESTPOOL/$TESTFS && \ + destroy_dataset $TESTPOOL/$TESTFS -Rf log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS @@ -117,26 +116,26 @@ log_must diff $SRC_FILE $obj if is_global_zone; then vol=$TESTPOOL/$TESTFS/vol.$$ ; volclone=$TESTPOOL/$TESTFS/volclone.$$ log_must zfs create -V 100M $vol - block_device_wait obj=$(target_obj $vol) + block_device_wait $obj log_must dd if=$SRC_FILE of=$obj bs=$BS count=$CNT snap=${vol}@snap.$$ log_must zfs snapshot $snap log_must zfs clone $snap $volclone - block_device_wait # 
Rename dataset & clone log_must zfs rename $vol ${vol}-new log_must zfs rename $volclone ${volclone}-new - block_device_wait # Compare source file and target file obj=$(target_obj ${vol}-new) + block_device_wait $obj log_must dd if=$obj of=$DST_FILE bs=$BS count=$CNT log_must diff $SRC_FILE $DST_FILE obj=$(target_obj ${volclone}-new) + block_device_wait $obj log_must dd if=$obj of=$DST_FILE bs=$BS count=$CNT log_must diff $SRC_FILE $DST_FILE @@ -144,10 +143,10 @@ if is_global_zone; then log_must zfs rename ${vol}-new $vol log_must zfs rename $snap ${snap}-new log_must zfs clone ${snap}-new $volclone - block_device_wait # Compare source file and target file obj=$(target_obj $volclone) + block_device_wait $obj log_must dd if=$obj of=$DST_FILE bs=$BS count=$CNT log_must diff $SRC_FILE $DST_FILE fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh index 3fc099d79f..2291638390 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh @@ -47,12 +47,11 @@ function cleanup { typeset -i i=0 while ((i < ${#datasets[@]})); do - if datasetexists ${datasets[$i]}@snap ; then - log_must zfs destroy ${datasets[$i]}@snap - fi - if datasetexists ${datasets[$i]}@snap-new ; then - log_must zfs destroy ${datasets[$i]}@snap-new - fi + datasetexists ${datasets[$i]}@snap && \ + destroy_dataset ${datasets[$i]}@snap + + datasetexists ${datasets[$i]}@snap-new && \ + destroy_dataset ${datasets[$i]}@snap-new ((i += 1)) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh index 2d1220e334..71d72619d6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh @@ -46,19 +46,18 @@ verify_runnable "both" function additional_cleanup { - if datasetexists $TESTPOOL/notexist ; then - log_must zfs destroy -Rf $TESTPOOL/notexist - fi + datasetexists $TESTPOOL/notexist && \ + destroy_dataset $TESTPOOL/notexist -Rf + + datasetexists $TESTPOOL/$TESTFS && \ + destroy_dataset $TESTPOOL/$TESTFS -Rf - if datasetexists $TESTPOOL/$TESTFS ; then - log_must zfs destroy -Rf $TESTPOOL/$TESTFS - fi log_must zfs create $TESTPOOL/$TESTFS if is_global_zone ; then - if datasetexists $TESTPOOL/$TESTVOL ; then - log_must zfs destroy -Rf $TESTPOOL/$TESTVOL - fi + datasetexists $TESTPOOL/$TESTVOL && \ + destroy_dataset $TESTPOOL/$TESTVOL -Rf + log_must zfs create -V $VOLSIZE $TESTPOOL/$TESTVOL fi } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh index b2e01006fd..73790f58cd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh @@ -46,21 +46,17 @@ verify_runnable "both" function cleanup { - if datasetexists $TESTPOOL/$TESTCTR@snap-new ; then - log_must zfs destroy -f $TESTPOOL/$TESTCTR@snap-new - fi + datasetexists $TESTPOOL/$TESTCTR@snap-new && \ + destroy_dataset $TESTPOOL/$TESTCTR@snap-new -f - if datasetexists $TESTPOOL/$TESTCTR@snap ; then - log_must zfs destroy -f $TESTPOOL/$TESTCTR@snap - fi + datasetexists $TESTPOOL/$TESTCTR@snap && \ + destroy_dataset $TESTPOOL/$TESTCTR@snap -f - if datasetexists $TESTPOOL@snap-new ; then - log_must zfs destroy -f $TESTPOOL@snap-new - fi + datasetexists $TESTPOOL@snap-new && \ + destroy_dataset $TESTPOOL@snap-new -f - if datasetexists $TESTPOOL@snap ; then - log_must zfs destroy -f $TESTPOOL@snap - fi + datasetexists $TESTPOOL@snap && \ + destroy_dataset $TESTPOOL@snap -f } log_assert "zfs rename -r 
can rename snapshot when child datasets" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh index 7d99e9f69f..1c962608d7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh @@ -81,7 +81,7 @@ function nesting_cleanup # before resetting it, it will be left at the modified # value for the remaining tests. That's the reason # we reset it again here just in case. - log_must set_tunable_impl zfs_max_dataset_nesting 50 Z zcommon + log_must set_tunable_impl MAX_DATASET_NESTING 50 Z zcommon } log_onexit nesting_cleanup @@ -93,13 +93,13 @@ log_must zfs create -p $TESTPOOL/$dsC16 log_mustnot zfs rename $TESTPOOL/$dsA02 $TESTPOOL/$dsB15A # extend limit -log_must set_tunable_impl zfs_max_dataset_nesting 64 Z zcommon +log_must set_tunable_impl MAX_DATASET_NESTING 64 Z zcommon log_mustnot zfs rename $TESTPOOL/$dsA02 $TESTPOOL/$dsB16A log_must zfs rename $TESTPOOL/$dsA02 $TESTPOOL/$dsB15A # bring back old limit -log_must set_tunable_impl zfs_max_dataset_nesting 50 Z zcommon +log_must set_tunable_impl MAX_DATASET_NESTING 50 Z zcommon log_mustnot zfs rename $TESTPOOL/$dsC01 $TESTPOOL/$dsB15A47C log_must zfs rename $TESTPOOL/$dsB15A47A $TESTPOOL/$dsB15A47B diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh index fa57658f18..2366cf6765 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh @@ -42,9 +42,9 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r datasetexists 
$TESTPOOL/$TESTFS3 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS3 + destroy_dataset $TESTPOOL/$TESTFS3 -r } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh index 4d2b94dc88..7ec6b2aa49 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh @@ -34,8 +34,8 @@ verify_runnable "both" function rename_cleanup { - log_note zfs destroy -fR $TESTPOOL/rename_test - log_note zfs destroy -fR $TESTPOOL/renamed + zfs destroy -fR $TESTPOOL/rename_test + zfs destroy -fR $TESTPOOL/renamed } log_onexit rename_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh new file mode 100755 index 0000000000..1c707762a7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet +# at http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# zfs rename -u should rename datasets without unmounting them +# +# STRATEGY: +# 1. Create a set of nested datasets. +# 2. Verify datasets are mounted. +# 3. Rename with -u and verify all datasets stayed mounted.
+# + +verify_runnable "both" + +function rename_cleanup +{ + cd $back + zfs destroy -fR $TESTPOOL/rename_test + zfs destroy -fR $TESTPOOL/renamed +} + +back=$(pwd) +log_onexit rename_cleanup + +log_must zfs create $TESTPOOL/rename_test +log_must zfs create $TESTPOOL/rename_test/child +log_must zfs create $TESTPOOL/rename_test/child/grandchild + +if ! ismounted $TESTPOOL/rename_test; then + log_fail "$TESTPOOL/rename_test is not mounted" +fi +if ! ismounted $TESTPOOL/rename_test/child; then + log_fail "$TESTPOOL/rename_test/child is not mounted" +fi +if ! ismounted $TESTPOOL/rename_test/child/grandchild; then + log_fail "$TESTPOOL/rename_test/child/grandchild is not mounted" +fi + +mntp_p=$(get_prop mountpoint $TESTPOOL/rename_test) +mntp_c=$(get_prop mountpoint $TESTPOOL/rename_test/child) +mntp_g=$(get_prop mountpoint $TESTPOOL/rename_test/child/grandchild) + +log_must cd $mntp_g +log_mustnot zfs rename $TESTPOOL/rename_test $TESTPOOL/renamed +log_must zfs rename -u $TESTPOOL/rename_test $TESTPOOL/renamed + +log_mustnot zfs list $TESTPOOL/rename_test +log_mustnot zfs list $TESTPOOL/rename_test/child +log_mustnot zfs list $TESTPOOL/rename_test/child/grandchild + +log_must zfs list $TESTPOOL/renamed +log_must zfs list $TESTPOOL/renamed/child +log_must zfs list $TESTPOOL/renamed/child/grandchild + +missing=$(zfs mount | awk -v pat=$TESTPOOL/renamed '$1 ~ pat' | awk \ + -v mntp_p=$mntp_p \ + -v mntp_c=$mntp_c \ + -v mntp_g=$mntp_g ' + BEGIN { p = c = g = 0 } + $2 == mntp_p { p = 1 } + $2 == mntp_c { c = 1 } + $2 == mntp_g { g = 1 } + END { + if (p != 1) + print mntp_p + if (c != 1) + print mntp_c + if (g != 1) + print mntp_g + }') +[[ -z "$missing" ]] || log_fail "Mountpoints no longer mounted: $missing" + +log_pass "Verified rename -u does not unmount datasets" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh index 400592aaca..ab8e1c89ae 
100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh @@ -23,12 +23,13 @@ # # DESCRIPTION: -# 'zfs rename' should not rename an unencrypted dataset to a child +# 'zfs rename' should be able to move an unencrypted dataset to a child # of an encrypted dataset # # STRATEGY: # 1. Create an encrypted dataset -# 2. Attempt to rename the default dataset to a child of the encrypted dataset +# 2. Rename the default dataset to a child of the encrypted dataset +# 3. Confirm the child dataset doesn't have any encryption properties # verify_runnable "both" @@ -36,16 +37,17 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r } log_onexit cleanup -log_assert "'zfs rename' should not rename an unencrypted dataset to a" \ +log_assert "'zfs rename' should allow renaming an unencrypted dataset to a" \ "child of an encrypted dataset" log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS2" -log_mustnot zfs rename $TESTPOOL/$TESTFS $TESTPOOL/$TESTFS2/$TESTFS +log_must zfs rename $TESTPOOL/$TESTFS $TESTPOOL/$TESTFS2/$TESTFS +log_must test "$(get_prop 'encryption' $TESTPOOL/$TESTFS2/$TESTFS)" == "off" -log_pass "'zfs rename' does not rename an unencrypted dataset to a child" \ +log_pass "'zfs rename' allows renaming an unencrypted dataset to a child" \ "of an encrypted dataset" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh index 5511f6ad6d..607bbf06ea 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh @@ -76,13 +76,14 @@ 
function test_n_check #opt num_snap_clone num_rollback pkill -x dd fi - datasetexists $FS && log_must zfs destroy -Rf $FS + datasetexists $FS && destroy_dataset $FS -Rf if datasetexists $VOL; then if ismounted $TESTDIR1 $NEWFS_DEFAULT_FS; then log_must umount -f $TESTDIR1 + sleep 0.1 fi - log_must zfs destroy -Rf $VOL + destroy_dataset $VOL -Rf fi # Create specified test environment diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh index 0ae13d3a9b..1e3109108b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh @@ -51,9 +51,7 @@ function cleanup { pkill ${DD##*/} for snap in $FSSNAP0 $FSSNAP1 $FSSNAP2; do - if snapexists $snap; then - log_must zfs destroy -Rf $snap - fi + snapexists $snap && destroy_dataset $snap -Rf done } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh index 0c1bb730e7..9537d5077b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh @@ -51,9 +51,8 @@ function cleanup typeset ds for ds in $TESTPOOL $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL; do - if snapexists ${ds}@$TESTSNAP; then - log_must zfs destroy ${ds}@$TESTSNAP - fi + snapexists ${ds}@$TESTSNAP && \ + destroy_dataset ${ds}@$TESTSNAP done } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib index 5b157d11c1..433f240675 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib +++ 
b/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib @@ -76,16 +76,14 @@ function setup_snap_env # mount it. Otherwise, only check if this ufs|ext file system # was mounted. # - log_must eval "echo "y" | \ - newfs -v $ZVOL_DEVDIR/$VOL > /dev/null 2>&1" + log_must new_fs $ZVOL_DEVDIR/$VOL [[ ! -d $TESTDIR1 ]] && log_must mkdir $TESTDIR1 # Make sure the ufs|ext filesystem hasn't been mounted, # then mount the new ufs|ext filesystem. if ! ismounted $TESTDIR1 $NEWFS_DEFAULT_FS; then - log_must mount \ - $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL $TESTDIR1 + log_must mount $ZVOL_DEVDIR/$VOL $TESTDIR1 fi fi @@ -117,7 +115,7 @@ function setup_snap_env if datasetnonexists $snap; then log_must cp /etc/passwd $fname - if is_linux; then + if is_linux || is_freebsd; then log_must sync else # @@ -128,7 +126,21 @@ function setup_snap_env log_must lockfs -f $TESTDIR1 fi fi + if is_freebsd && [[ $dtst == $VOL ]]; then + # Though sync does start a fs sync on + # FreeBSD, it does not wait for it to + # finish. We can force a blocking sync + # by updating the fs mount instead. + # Otherwise, the snapshot might occur + # with the fs in an unmountable state. + log_must mount -ur \ + $ZVOL_DEVDIR/$VOL $TESTDIR1 + fi log_must zfs snapshot $snap + if is_freebsd && [[ $dtst == $VOL ]]; then + log_must mount -uw \ + $ZVOL_DEVDIR/$VOL $TESTDIR1 + fi fi if [[ $createclone == "true" ]]; then if datasetnonexists $clone; then @@ -147,7 +159,7 @@ function setup_clone_env } # -# Clean up the test environmnet +# Clean up the test environment # # $1 number of snapshot Note: Currently only support three snapshots. 
# @@ -169,9 +181,7 @@ function cleanup_env for dtst in $FS $VOL; do for snap in $TESTSNAP $TESTSNAP1 $TESTSNAP2; do - if snapexists $dtst@$snap; then - log_must zfs destroy -Rf $dtst@$snap - fi + snapexists $dtst@$snap && destroy_dataset $dtst@$snap -Rf done done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_send/Makefile.am index 2a476f3dce..25c7065670 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/Makefile.am @@ -10,9 +10,11 @@ dist_pkgdata_SCRIPTS = \ zfs_send_006_pos.ksh \ zfs_send_007_pos.ksh \ zfs_send_encrypted.ksh \ + zfs_send_encrypted_unloaded.ksh \ zfs_send_raw.ksh \ zfs_send_sparse.ksh \ - zfs_send-b.ksh + zfs_send-b.ksh \ + zfs_send_skip_missing.ksh dist_pkgdata_DATA = \ zfs_send.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send-b.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send-b.ksh index cd879846ce..f019c2215e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send-b.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send-b.ksh @@ -35,7 +35,7 @@ verify_runnable "both" function cleanup { for ds in "$SENDFS" "$BACKUP" "$RESTORE"; do - datasetexists $ds && log_must zfs destroy -r $ds + datasetexists $ds && destroy_dataset $ds -r done } @@ -52,20 +52,19 @@ log_must zfs snapshot "$SENDFS@s1" log_must zfs bookmark "$SENDFS@s1" "$SENDFS#bm" log_must zfs snapshot "$SENDFS@s2" log_must zfs set "compression=gzip" $SENDFS -log_must zfs set "org.zfsonlinux:prop=val" $SENDFS -log_must zfs set "org.zfsonlinux:snapprop=val" "$SENDFS@s1" +log_must zfs set "org.openzfs:prop=val" $SENDFS +log_must zfs set "org.openzfs:snapprop=val" "$SENDFS@s1" # 2. 
Verify command line options interact with '-b' correctly typeset opts=("" "p" "Rp" "cew" "nv" "D" "DLPRcenpvw") for opt in ${opts[@]}; do - log_must eval "zfs send -b$opt $SENDFS@s1 > /dev/null" - log_must eval "zfs send -b$opt -i $SENDFS@s1 $SENDFS@s2 > /dev/null" - log_must eval "zfs send -b$opt -I $SENDFS@s1 $SENDFS@s2 > /dev/null" + log_must eval "zfs send -b$opt $SENDFS@s1 >$TEST_BASE_DIR/devnull" + log_must eval "zfs send -b$opt -i $SENDFS@s1 $SENDFS@s2 >$TEST_BASE_DIR/devnull" + log_must eval "zfs send -b$opt -I $SENDFS@s1 $SENDFS@s2 >$TEST_BASE_DIR/devnull" done for opt in ${opts[@]}; do - log_mustnot eval "zfs send -b$opt $SENDFS > /dev/null" - log_mustnot eval "zfs send -b$opt $SENDFS#bm > /dev/null" - log_mustnot eval "zfs send -b$opt -i $SENDFS#bm $SENDFS@s2 > /dev/null" + log_mustnot eval "zfs send -b$opt $SENDFS >$TEST_BASE_DIR/devnull" + log_mustnot eval "zfs send -b$opt $SENDFS#bm >$TEST_BASE_DIR/devnull" done # Do 3..6 in a loop to verify various combination of "zfs send" options @@ -79,21 +78,21 @@ for opt in ${opts[@]}; do # NOTE: override "received" values and set some new properties as well log_must zfs set "compression=lz4" $BACKUP log_must zfs set "exec=off" $BACKUP - log_must zfs set "org.zfsonlinux:prop=newval" $BACKUP - log_must zfs set "org.zfsonlinux:newprop=newval" $BACKUP - log_must zfs set "org.zfsonlinux:snapprop=newval" "$BACKUP@s1" - log_must zfs set "org.zfsonlinux:newsnapprop=newval" "$BACKUP@s1" + log_must zfs set "org.openzfs:prop=newval" $BACKUP + log_must zfs set "org.openzfs:newprop=newval" $BACKUP + log_must zfs set "org.openzfs:snapprop=newval" "$BACKUP@s1" + log_must zfs set "org.openzfs:newsnapprop=newval" "$BACKUP@s1" # 5. Restore the "backup" dataset to a new destination log_must eval "zfs send -b$opt $BACKUP@s1 | zfs recv $RESTORE" # 6. 
Verify only original (received) properties are sent from "backup" log_must eval "check_prop_source $RESTORE compression gzip received" - log_must eval "check_prop_source $RESTORE org.zfsonlinux:prop val received" - log_must eval "check_prop_source $RESTORE@s1 org.zfsonlinux:snapprop val received" + log_must eval "check_prop_source $RESTORE org.openzfs:prop val received" + log_must eval "check_prop_source $RESTORE@s1 org.openzfs:snapprop val received" log_must eval "check_prop_source $RESTORE exec on default" - log_must eval "check_prop_missing $RESTORE org.zfsonlinux:newprop" - log_must eval "check_prop_missing $RESTORE@s1 org.zfsonlinux:newsnapprop" + log_must eval "check_prop_missing $RESTORE org.openzfs:newprop" + log_must eval "check_prop_missing $RESTORE@s1 org.openzfs:newsnapprop" # cleanup log_must zfs destroy -r $BACKUP diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_001_pos.ksh index b0a319d419..b18433085e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_001_pos.ksh @@ -50,12 +50,10 @@ verify_runnable "both" function cleanup { for snap in $init_snap $inc_snap $rst_snap $rst_inc_snap; do - snapexists $snap && \ - log_must zfs destroy -f $snap + snapexists $snap && destroy_dataset $snap -f done - datasetexists $rst_root && \ - log_must zfs destroy -Rf $rst_root + datasetexists $rst_root && destroy_dataset $rst_root -Rf for file in $full_bkup $inc_bkup \ $init_data $inc_data @@ -75,7 +73,7 @@ log_onexit cleanup init_snap=$TESTPOOL/$TESTFS@init_snap inc_snap=$TESTPOOL/$TESTFS@inc_snap full_bkup=$TEST_BASE_DIR/fullbkup.$$ -inc_bkup=/var/tmp/incbkup.$$ +inc_bkup=$TEST_BASE_DIR/incbkup.$$ init_data=$TESTDIR/$TESTFILE1 inc_data=$TESTDIR/$TESTFILE2 orig_sum="" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_002_pos.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_002_pos.ksh index 6359bb4f7f..42bdddd2cc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_002_pos.ksh @@ -48,11 +48,8 @@ verify_runnable "both" function cleanup { - snapexists $snap && \ - log_must zfs destroy $snap - - datasetexists $ctr && \ - log_must zfs destroy -r $ctr + snapexists $snap && destroy_dataset $snap + datasetexists $ctr && destroy_dataset $ctr -r [[ -e $origfile ]] && \ log_must rm -f $origfile diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_003_pos.ksh index 825a10d0f8..caa84886fa 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_003_pos.ksh @@ -44,8 +44,8 @@ verify_runnable "both" function cleanup { - datasetexists $snap1 && log_must zfs destroy $snap1 - datasetexists $snap2 && log_must zfs destroy $snap2 + datasetexists $snap1 && destroy_dataset $snap1 + datasetexists $snap2 && destroy_dataset $snap2 } log_assert "'zfs send -i' can deal with abbreviated snapshot name." 
@@ -61,7 +61,7 @@ log_must zfs snapshot $snap2 typeset -i i=0 while (( i < ${#args[*]} )); do - log_must eval "zfs send -i ${args[i]} > /dev/null" + log_must eval "zfs send -i ${args[i]} >$TEST_BASE_DIR/devnull" (( i += 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_004_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_004_neg.ksh index da14fa2fa6..af10e3a11f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_004_neg.ksh @@ -48,8 +48,7 @@ function cleanup typeset snap f for snap in $snap1 $snap2 $snap3; do - snapexists $snap && \ - log_must zfs destroy -f $snap + snapexists $snap && destroy_dataset $snap -f done for f in $tmpfile1 $tmpfile2; do @@ -65,7 +64,7 @@ snap2=$fs@snap2 snap3=$fs@snap3 set -A badargs \ - "" "$TESTPOOL" "$TESTFS" "$fs" "$fs@nonexisten_snap" "?" \ + "" "$TESTPOOL" "$TESTFS" "$fs" "$fs@nonexistent_snap" "?" \ "$snap1/blah" "$snap1@blah" "-i" "-x" "-i $fs" \ "-x $snap1 $snap2" "-i $snap1" \ "-i $snap2 $snap1" "$snap1 $snap2" "-i $snap1 $snap2 $snap3" \ @@ -96,7 +95,7 @@ log_must zfs snapshot $snap3 typeset -i i=0 while (( i < ${#badargs[*]} )) do - log_mustnot eval "zfs send ${badargs[i]} >/dev/null" + log_mustnot eval "zfs send ${badargs[i]} >$TEST_BASE_DIR/devnull" (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_005_pos.ksh index 9f369e372d..c4ab7a6212 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_005_pos.ksh @@ -50,7 +50,7 @@ function cleanup log_must zpool import $TESTPOOL datasetexists $TESTPOOL@snap && \ - log_must zfs destroy -r $TESTPOOL@snap + destroy_dataset $TESTPOOL@snap -r } log_assert "'zfs send -R' can send from read-only pools" @@ -61,6 +61,6 @@ log_must 
zfs snapshot -r $TESTPOOL@snap log_must zpool export $TESTPOOL log_must zpool import -o readonly=on $TESTPOOL -log_must eval "zfs send -R $TESTPOOL@snap >/dev/null" +log_must eval "zfs send -R $TESTPOOL@snap >$TEST_BASE_DIR/devnull" log_pass "'zfs send -R' can send from read-only pools" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh index 7192551b6c..42628a0512 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh @@ -15,7 +15,7 @@ # # -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -36,6 +36,7 @@ verify_runnable "both" function cleanup { + log_must set_tunable32 OVERRIDE_ESTIMATE_RECORDSIZE 8192 for ds in $datasets; do destroy_dataset $ds "-rf" done @@ -54,7 +55,7 @@ function get_estimate_size typeset snapshot=$1 typeset option=$2 typeset base_snapshot=${3:-""} - if [[ -z $3 ]];then + if [[ -z $3 ]]; then typeset total_size=$(zfs send $option $snapshot 2>&1 | tail -1) else typeset total_size=$(zfs send $option $base_snapshot $snapshot \ @@ -90,6 +91,7 @@ function verify_size_estimates log_assert "Verify 'zfs send -nvP' generates valid stream estimates" log_onexit cleanup +log_must set_tunable32 OVERRIDE_ESTIMATE_RECORDSIZE 0 typeset -l block_count=0 typeset -l block_size typeset -i PERCENT=1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh index 5fdb125bca..da0aebe6b5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh @@ -89,7 +89,7 @@ test_pool () } test_pool $TESTPOOL -log_must truncate --size=1G $vdev +log_must 
truncate -s 1G $vdev log_must zpool create -o version=1 tmp_pool $vdev test_pool tmp_pool log_must zpool destroy tmp_pool diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted.ksh index 490e146ba6..a4c332d47d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted.ksh @@ -42,7 +42,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup @@ -62,15 +62,15 @@ log_must eval "echo $passphrase1 | zfs create -o encryption=on" \ log_must zfs snapshot -r $snap -log_must eval "zfs send $snap > /dev/null" -log_mustnot eval "zfs send -p $snap > /dev/null" -log_mustnot eval "zfs send -R $snap > /dev/null" +log_must eval "zfs send $snap >$TEST_BASE_DIR/devnull" +log_mustnot eval "zfs send -p $snap >$TEST_BASE_DIR/devnull" +log_mustnot eval "zfs send -R $snap >$TEST_BASE_DIR/devnull" log_must zfs unmount $TESTPOOL/$TESTFS1 log_must zfs unload-key $TESTPOOL/$TESTFS1 -log_mustnot eval "zfs send $snap > /dev/null" -log_must eval "zfs send $TESTPOOL/$TESTFS1/child@snap > /dev/null" +log_mustnot eval "zfs send $snap >$TEST_BASE_DIR/devnull" +log_must eval "zfs send $TESTPOOL/$TESTFS1/child@snap >$TEST_BASE_DIR/devnull" log_pass "ZFS performs unencrypted sends of encrypted datasets, unless the" \ "'-p' or '-R' options are specified" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh index 112ee1143d..f268f7b38d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh @@ -37,7 +37,7 @@ 
verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup @@ -53,7 +53,7 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \ log_must zfs snapshot $snap log_must zfs unmount $TESTPOOL/$TESTFS1 log_must zfs unload-key $TESTPOOL/$TESTFS1 -log_mustnot eval "zfs send $snap > /dev/null" +log_mustnot eval "zfs send $snap >$TEST_BASE_DIR/devnull" log_pass "ZFS does not perform unencrypted sends from encrypted datasets" \ "with unloaded keys." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_raw.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_raw.ksh index 85cc7407e1..03c2e78673 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_raw.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_raw.ksh @@ -38,11 +38,9 @@ verify_runnable "both" function cleanup { - snapexists $snap && \ - log_must zfs destroy $snap - + snapexists $snap && destroy_dataset $snap datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup @@ -59,21 +57,21 @@ log_must eval "echo $passphrase | zfs create -o encryption=on" \ log_must zfs snapshot $snap log_must zfs snapshot $snap1 -log_must eval "zfs send -w $snap > /dev/null" -log_must eval "zfs send -w $snap1 > /dev/null" +log_must eval "zfs send -w $snap >$TEST_BASE_DIR/devnull" +log_must eval "zfs send -w $snap1 >$TEST_BASE_DIR/devnull" log_note "Verify ZFS can perform raw sends with properties" -log_must eval "zfs send -wp $snap > /dev/null" -log_must eval "zfs send -wp $snap1 > /dev/null" +log_must eval "zfs send -wp $snap >$TEST_BASE_DIR/devnull" +log_must eval "zfs send -wp $snap1 >$TEST_BASE_DIR/devnull" log_note "Verify ZFS can perform raw replication sends" -log_must eval "zfs send -wR $snap > /dev/null" -log_must eval "zfs send -wR $snap1 > 
/dev/null" +log_must eval "zfs send -wR $snap >$TEST_BASE_DIR/devnull" +log_must eval "zfs send -wR $snap1 >$TEST_BASE_DIR/devnull" log_note "Verify ZFS can perform a raw send of an encrypted datasets with" \ "its key unloaded" log_must zfs unmount $TESTPOOL/$TESTFS1 log_must zfs unload-key $TESTPOOL/$TESTFS1 -log_must eval "zfs send -w $snap1 > /dev/null" +log_must eval "zfs send -w $snap1 >$TEST_BASE_DIR/devnull" log_pass "ZFS performs raw sends of datasets" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_skip_missing.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_skip_missing.ksh new file mode 100755 index 0000000000..2e12d25344 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_skip_missing.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016, loli10K. All rights reserved. +# Copyright (c) 2021, Pablo Correa Gómez. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/cli_common.kshlib +. 
$STF_SUITE/tests/functional/cli_root/zfs_send/zfs_send.cfg + +# +# DESCRIPTION: +# Verify 'zfs send' will avoid sending replication send +# streams when we're missing snapshots in the dataset +# hierarchy, unless -s|--skip-missing provided +# +# STRATEGY: +# 1. Create a parent and child fs and then only snapshot the parent +# 2. Verify sending with replication will fail +# 3. Verify sending with skip-missing will print a warning but succeed +# + +verify_runnable "both" + +function cleanup +{ + snapexists $SNAP && destroy_dataset $SNAP -f + + datasetexists $PARENT && destroy_dataset $PARENT -rf + + [[ -e $WARNF ]] && log_must rm -f $WARNF + rm -f $TEST_BASE_DIR/devnull +} + +log_assert "Verify 'zfs send -Rs' works as expected." +log_onexit cleanup + +PARENT=$TESTPOOL/parent +CHILD=$PARENT/child +SNAP=$PARENT@snap +WARNF=$TEST_BASE_DIR/warn.2 + +log_note "Verify 'zfs send -R' fails to generate replication stream"\ + " for datasets created before" + +log_must zfs create $PARENT +log_must zfs create $CHILD +log_must zfs snapshot $SNAP +log_mustnot eval "zfs send -R $SNAP >$TEST_BASE_DIR/devnull" + +log_note "Verify 'zfs send -Rs' warns about missing snapshots, "\ + "but still succeeds" + +log_must eval "zfs send -Rs $SNAP 2> $WARNF >$TEST_BASE_DIR/devnull" +log_must eval "[[ -s $WARNF ]]" + +log_pass "Verify 'zfs send -Rs' works as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_sparse.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_sparse.ksh index 7354305066..aeb49afd7f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_sparse.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_sparse.ksh @@ -24,15 +24,15 @@ # 1. Create sparse files of various size # 2. Snapshot and send these sparse files # 3. 
Verify these files are received correctly and we don't trigger any issue -# like the one described in https://github.com/zfsonlinux/zfs/pull/6760 +# like the one described in https://github.com/openzfs/zfs/pull/6760 # verify_runnable "both" function cleanup { - datasetexists $SENDFS && log_must zfs destroy -r $SENDFS - datasetexists $RECVFS && log_must zfs destroy -r $RECVFS + datasetexists $SENDFS && destroy_dataset $SENDFS -r + datasetexists $RECVFS && destroy_dataset $RECVFS -r } # @@ -57,14 +57,14 @@ function write_compare_files # # compare sparse files recvfile="$(get_prop mountpoint $recvfs)/data.bin" log_must cmp $sendfile $recvfile $offset $offset - sendsz=$(stat -c '%s' $sendfile) - recvsz=$(stat -c '%s' $recvfile) + sendsz=$(stat_size $sendfile) + recvsz=$(stat_size $recvfile) if [[ $sendsz -ne $recvsz ]]; then log_fail "$sendfile ($sendsz) and $recvfile ($recvsz) differ." fi # cleanup - log_must zfs destroy -r $sendfs - log_must zfs destroy -r $recvfs + destroy_dataset $sendfs -r + destroy_dataset $recvfs -r } log_assert "'zfs send' should be able to send (big) sparse files correctly." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_set/Makefile.am index 015464bf47..f7362ff255 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/Makefile.am @@ -28,7 +28,8 @@ dist_pkgdata_SCRIPTS = \ zfs_set_001_neg.ksh \ zfs_set_002_neg.ksh \ zfs_set_003_neg.ksh \ - zfs_set_keylocation.ksh + zfs_set_keylocation.ksh \ + zfs_set_feature_activation.ksh dist_pkgdata_DATA = \ zfs_set_common.kshlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/cache_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/cache_002_neg.ksh index 5fbc8bf716..caad211bcf 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/cache_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/cache_002_neg.ksh @@ -64,4 +64,4 @@ do done done -log_pass "Setting invalid {primary|secondary}cache on fs or volume fail as expeced." +log_pass "Setting invalid {primary|secondary}cache on fs or volume fail as expected." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_001_pos.ksh index dd3397f015..ac5fc8188f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_001_pos.ksh @@ -63,12 +63,11 @@ set -A values "on" "off" function cleanup { - if snapexists $TESTPOOL/$TESTFS@$TESTSNAP ; then - log_must zfs destroy -R $TESTPOOL/$TESTFS@$TESTSNAP - fi - if snapexists $TESTPOOL/$TESTVOL@$TESTSNAP ; then - log_must zfs destroy -R $TESTPOOL/$TESTVOL@$TESTSNAP - fi + snapexists $TESTPOOL/$TESTFS@$TESTSNAP && \ + destroy_dataset $TESTPOOL/$TESTFS@$TESTSNAP -R + + snapexists $TESTPOOL/$TESTVOL@$TESTSNAP && \ + destroy_dataset $TESTPOOL/$TESTVOL@$TESTSNAP -R [[ -n $old_ctr_canmount ]] && \ log_must zfs set canmount=$old_ctr_canmount $TESTPOOL/$TESTCTR diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_002_pos.ksh index 7cbcf7903e..55c71f6ca3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_002_pos.ksh @@ -40,9 +40,11 @@ # # STRATEGY: # 1. Setup a pool and create fs, volume, snapshot clone within it. -# 2. Set canmount=noauto for each dataset and check the retuen value -# and check if it still can be mounted by mount -a. +# 2. Set canmount=noauto for each dataset and check the return value +# and check if it still can be mounted by mount -a or shared by +# share -a # 3. mount each dataset(except volume) to see if it can be mounted. +# 4. verify that a mounted dataset can be shared by share -a. 
# verify_runnable "both" @@ -74,18 +76,17 @@ function cleanup ds=$TESTPOOL/$TESTCLONE if datasetexists $ds; then mntp=$(get_prop mountpoint $ds) - log_must zfs destroy $ds + destroy_dataset $ds if [[ -d $mntp ]]; then rm -fr $mntp fi fi - if snapexists $TESTPOOL/$TESTFS@$TESTSNAP ; then - log_must zfs destroy -R $TESTPOOL/$TESTFS@$TESTSNAP - fi - if snapexists $TESTPOOL/$TESTVOL@$TESTSNAP ; then - log_must zfs destroy -R $TESTPOOL/$TESTVOL@$TESTSNAP - fi + snapexists $TESTPOOL/$TESTFS@$TESTSNAP && \ + destroy_dataset $TESTPOOL/$TESTFS@$TESTSNAP -R + + snapexists $TESTPOOL/$TESTVOL@$TESTSNAP && \ + destroy_dataset $TESTPOOL/$TESTVOL@$TESTSNAP -R zfs unmount -a > /dev/null 2>&1 log_must zfs mount -a @@ -100,6 +101,7 @@ log_onexit cleanup set -A old_mnt set -A old_canmount +set -A old_sharenfs typeset tmpmnt=/tmpmount$$ typeset ds @@ -113,6 +115,7 @@ while (( i < ${#dataset_pos[*]} )); do ds=${dataset_pos[i]} old_mnt[i]=$(get_prop mountpoint $ds) old_canmount[i]=$(get_prop canmount $ds) + old_sharenfs[i]=$(get_prop sharenfs $ds) (( i = i + 1 )) done @@ -121,6 +124,7 @@ while (( i < ${#dataset_pos[*]} )) ; do dataset=${dataset_pos[i]} set_n_check_prop "noauto" "canmount" "$dataset" log_must zfs set mountpoint=$tmpmnt $dataset + log_must zfs set sharenfs=on $dataset if ismounted $dataset; then zfs unmount -a > /dev/null 2>&1 log_must mounted $dataset @@ -128,6 +132,8 @@ while (( i < ${#dataset_pos[*]} )) ; do log_must unmounted $dataset log_must zfs mount -a log_must unmounted $dataset + log_must zfs share -a + log_mustnot is_exported $tmpmnt else log_must zfs mount -a log_must unmounted $dataset @@ -137,6 +143,10 @@ while (( i < ${#dataset_pos[*]} )) ; do log_must zfs mount $dataset log_must mounted $dataset + log_must zfs share -a + log_must is_exported $tmpmnt + + log_must zfs set sharenfs="${old_sharenfs[i]}" $dataset log_must zfs set canmount="${old_canmount[i]}" $dataset log_must zfs set mountpoint="${old_mnt[i]}" $dataset (( i = i + 1 )) diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_003_pos.ksh index a11cfb4088..e4664d03b4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_003_pos.ksh @@ -63,15 +63,14 @@ function cleanup ds=$TESTPOOL/$TESTCLONE if datasetexists $ds; then mntp=$(get_prop mountpoint $ds) - log_must zfs destroy $ds + destroy_dataset $ds if [[ -d $mntp ]]; then log_must rm -fr $mntp fi fi - if snapexists $TESTPOOL/$TESTFS@$TESTSNAP ; then - log_must zfs destroy -R $TESTPOOL/$TESTFS@$TESTSNAP - fi + snapexists $TESTPOOL/$TESTFS@$TESTSNAP && \ + destroy_dataset $TESTPOOL/$TESTFS@$TESTSNAP -R zfs unmount -a > /dev/null 2>&1 log_must zfs mount -a diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_004_pos.ksh index 11be1af3cf..e75114efde 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_004_pos.ksh @@ -44,7 +44,7 @@ verify_runnable "global" # properties -set -A sharenfs_prop "off" "on" "rw" +set -A sharenfs_prop "off" "on" "ro" set -A sharesmb_prop "off" "on" function cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh index 27003b21b5..f30d005224 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh @@ -46,7 +46,10 @@ verify_runnable "both" set -A dataset "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL/$TESTVOL" -set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "noparity" +set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "noparity" +if is_linux; 
then + values+=("edonr") +fi log_assert "Setting a valid checksum on a file system, volume," \ "it should be successful." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/compression_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/compression_001_pos.ksh index f7d06eaf4f..06da5f2f3c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/compression_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/compression_001_pos.ksh @@ -26,6 +26,7 @@ # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib . $STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib # @@ -41,7 +42,7 @@ verify_runnable "both" set -A dataset "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL/$TESTVOL" -set -A values $(get_compress_opts zfs_set) +set -A values "${compress_prop_vals[@]}" log_assert "Setting a valid compression on file system and volume, " \ "It should be successful." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh index ad33e18fbb..48580cafdb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh @@ -34,7 +34,7 @@ # # DESCRIPTION: -# If ZFS is currently managing the file system but it is currently unmoutned, +# If ZFS is currently managing the file system but it is currently unmounted, # and the mountpoint property is changed, the file system remains unmounted. 
# # STRATEGY: diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh index 9bbb480ae7..4d86100c03 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh @@ -76,6 +76,14 @@ if is_linux; then if [[ $(linux_version) -lt $(linux_version "4.4") ]]; then args+=("mand" "nomand") fi +elif is_freebsd; then + # 'xattr' and 'devices' are not supported on FreeBSD + # Perhaps more options need to be added. + set -A args \ + "noexec" "exec" \ + "ro" "rw" \ + "nosuid" "suid" \ + "atime" "noatime" else set -A args \ "devices" "/devices/" "nodevices" "/nodevices/" \ @@ -96,11 +104,11 @@ log_must zfs set mountpoint=legacy $testfs typeset i=0 while ((i < ${#args[@]})); do - if is_linux; then + if is_linux || is_freebsd; then log_must mount -t zfs -o ${args[$i]} $testfs $tmpmnt msg=$(mount | grep "$tmpmnt ") - + echo $msg | grep "${args[((i))]}" > /dev/null 2>&1 if (($? 
!= 0)) ; then echo $msg | grep "${args[((i-1))]}" > /dev/null 2>&1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/onoffs_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/onoffs_001_pos.ksh index 498567fdd3..7ba6d7fb65 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/onoffs_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/onoffs_001_pos.ksh @@ -51,7 +51,12 @@ function cleanup log_onexit cleanup -set -A props "atime" "readonly" "setuid" "zoned" +set -A props "atime" "readonly" "setuid" +if is_freebsd; then + props+=("jailed") +else + props+=("zoned") +fi set -A values "on" "off" if is_global_zone ; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/property_alias_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/property_alias_001_pos.ksh index 19e636be05..f1befe60c3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/property_alias_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/property_alias_001_pos.ksh @@ -21,14 +21,12 @@ # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2009, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2016, 2017, Delphix. All rights reserved. # Use is subject to license terms. # -# -# Copyright (c) 2016, 2017 by Delphix. All rights reserved. -# - +. $STF_SUITE/include/properties.shlib . $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib @@ -106,7 +104,7 @@ for ds in $pool $fs $vol; do done ;; compression|compress ) - for val in $(get_compress_opts zfs_set); do + for val in "${compress_prop_vals[@]}"; do set_and_check $ds ${rw_prop[i]} $val ${chk_prop[i]} done ;; diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/readonly_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/readonly_001_pos.ksh index 9af8811e45..4adac420f7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/readonly_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/readonly_001_pos.ksh @@ -48,7 +48,7 @@ function cleanup { for dataset in $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL ; do snapexists ${dataset}@$TESTSNAP && \ - log_must zfs destroy -R ${dataset}@$TESTSNAP + destroy_dataset ${dataset}@$TESTSNAP -R done } @@ -113,7 +113,7 @@ function verify_readonly # $1 dataset, $2 on|off fi ;; volume) - $expect eval "echo 'y' | newfs \ + $expect eval "new_fs \ ${ZVOL_DEVDIR}/$dataset > /dev/null 2>&1" ;; *) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh index 67de2e822f..7177fac202 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh @@ -55,7 +55,12 @@ typeset ro_props="type used creation referenced refer compressratio \ mounted origin" typeset snap_ro_props="volsize recordsize recsize quota reservation reserv mountpoint \ sharenfs checksum compression compress atime devices exec readonly rdonly \ - setuid zoned" + setuid" +if is_freebsd; then + snap_ro_props+=" jailed" +else + snap_ro_props+=" zoned" +fi zfs upgrade -v > /dev/null 2>&1 if [[ $? 
-eq 0 ]]; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/snapdir_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/snapdir_001_pos.ksh index 079fc770a6..083a6b1f46 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/snapdir_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/snapdir_001_pos.ksh @@ -48,8 +48,7 @@ verify_runnable "both" function cleanup { for dataset in $all_datasets; do - snapexists ${dataset}@snap && \ - log_must zfs destroy ${dataset}@snap + snapexists ${dataset}@snap && destroy_dataset ${dataset}@snap done } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_004_pos.ksh index 1d197fa25f..bd11ea0883 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_004_pos.ksh @@ -46,9 +46,7 @@ function cleanup { for fs in $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL $TESTPOOL ; do typeset fssnap=$fs@snap - if datasetexists $fssnap ; then - log_must zfs destroy -f $fssnap - fi + datasetexists $fssnap && destroy_dataset $fssnap -f done cleanup_user_prop $TESTPOOL } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh index 4d8982c120..c9bc7565ab 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh @@ -45,7 +45,12 @@ verify_runnable "both" set -A props "" "mountpoint" "checksum" "compression" "atime" "readonly" \ - "setuid" "zoned" "canmount" + "setuid" "canmount" +if is_freebsd; then + props+=("jailed") +else + props+=("zoned") +fi set -A values "" "mountpoint" "checksum" "compression" "atime" "readonly" \ "setuid" "zoned" "0" "-?" "-on" "--on" "*" "?" 
"Legacy" "NONE" "oN" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_002_neg.ksh index b9044041cb..2178175cd5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_002_neg.ksh @@ -47,8 +47,13 @@ log_assert "'zfs set' fails with invalid arguments" set -A editable_props "quota" "reservation" "reserv" "volsize" "recordsize" "recsize" \ "mountpoint" "checksum" "compression" "compress" "atime" \ - "devices" "exec" "setuid" "readonly" "zoned" "snapdir" "aclmode" \ + "devices" "exec" "setuid" "readonly" "snapdir" "aclmode" \ "aclinherit" "canmount" "xattr" "copies" "version" +if is_freebsd; then + editable_props+=("jailed") +else + editable_props+=("zoned") +fi for ds in $TESTPOOL $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL \ $TESTPOOL/$TESTFS@$TESTSNAP; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh index 102e887429..fd5f7f285f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh @@ -48,9 +48,8 @@ function cleanup if [ -e $badpath ]; then rm -f $badpath fi - if datasetexists $TESTPOOL/foo; then - log_must zfs destroy $TESTPOOL/foo - fi + + datasetexists $TESTPOOL/foo && destroy_dataset $TESTPOOL/foo } log_assert "'zfs set mountpoint/sharenfs' fails with invalid scenarios" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib index 084a4a0a82..1208207632 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib @@ -156,7 +156,7 @@ function random_string } # -# Get vaild 
user defined property name +# Get valid user defined property name # # $1 user defined property name length # @@ -189,7 +189,7 @@ function valid_user_property } # -# Get invaild user defined property name +# Get invalid user defined property name # # $1 user defined property name length # @@ -287,12 +287,14 @@ function check_prop_source typeset chk_value=$(get_prop "$prop" "$dataset") typeset chk_source=$(get_source "$prop" "$dataset") - if [[ "$chk_value" != "$value" || "$chk_source" != "$4" ]] - then - return 1 - else - return 0 - fi + if [[ "$chk_value" != "$value" || "$chk_source" != "$source" ]] + then + log_note "expected (value '$value', source '$source'), got \ + (value '$chk_value', source '$chk_source')" + return 1 + else + return 0 + fi } # diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_feature_activation.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_feature_activation.ksh new file mode 100755 index 0000000000..c5e6fb9c11 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_feature_activation.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 The FreeBSD Foundation [1] +# +# [1] Portions of this software were developed by Allan Jude +# under sponsorship from the FreeBSD Foundation. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Setting the compression property to any of the zstd levels should activate +# the zstd feature flag. Destroying the last dataset using the zstd feature flag +# should revert the feature to the 'enabled' state. +# +# STRATEGY: +# 1. Create pool, then create a file system within it. +# 2. Check that the zstd feature flag is 'enabled'. +# 3. Setting the compression property to zstd. +# 4. Check that the zstd feature flag is now 'active'. +# 5. Destroy the dataset +# 6. Confirm that the feature flag reverts to the 'enabled' state. +# + +verify_runnable "both" + +log_assert "Setting compression=zstd should activate the"\ + "org.freebsd:zstd_compress feature flag, and destroying the last"\ + "dataset using that property, should revert the feature flag to"\ + "the enabled state." 
+ +export VDEV_ZSTD="$TEST_BASE_DIR/vdev-zstd" + +function cleanup +{ + if poolexists $TESTPOOL-zstd ; then + destroy_pool $TESTPOOL-zstd + fi + + rm -f $VDEV_ZSTD # -f: the vdev file may not exist if setup failed early +} +log_onexit cleanup + +log_must truncate -s $SPA_MINDEVSIZE $VDEV_ZSTD +log_must zpool create $TESTPOOL-zstd $VDEV_ZSTD + +featureval="$(get_pool_prop feature@zstd_compress $TESTPOOL-zstd)" + +[[ "$featureval" == "disabled" ]] && \ + log_unsupported "ZSTD feature flag unsupported" + +[[ "$featureval" == "active" ]] && \ + log_unsupported "ZSTD feature already active before test" + +random_level=$((RANDOM%19 + 1)) +log_note "Randomly selected ZSTD level: $random_level" + +log_must zfs create -o compress=zstd-$random_level $TESTPOOL-zstd/$TESTFS-zstd + +featureval="$(get_pool_prop feature@zstd_compress $TESTPOOL-zstd)" + +log_note "After zfs set, feature flag value is: $featureval" + +[[ "$featureval" == "active" ]] || + log_fail "ZSTD feature flag not activated" + +log_must zfs destroy $TESTPOOL-zstd/$TESTFS-zstd + +featureval="$(get_pool_prop feature@zstd_compress $TESTPOOL-zstd)" + +log_note "After zfs destroy, feature flag value is: $featureval" + +[[ "$featureval" == "enabled" ]] || + log_fail "ZSTD feature flag not deactivated" + +log_pass "Setting compression=zstd activated the feature flag, and"\ + "destroying the dataset deactivated it." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_keylocation.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_keylocation.ksh index 313fa4e4d1..9791339479 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_keylocation.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_keylocation.ksh @@ -45,12 +45,13 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r + cleanup_https } log_onexit cleanup -log_assert "Key location can only be 'prompt' or a file path for encryption" \ - "roots, and 'none' for unencrypted volumes" +log_assert "Key location can only be 'prompt', 'file://', or 'https://'" \ + "for encryption roots, and 'none' for unencrypted volumes" log_must eval "echo $PASSPHRASE > /$TESTPOOL/pkey" @@ -69,6 +70,10 @@ log_mustnot zfs set keylocation=/$TESTPOOL/pkey $TESTPOOL/$TESTFS1 log_must zfs set keylocation=file:///$TESTPOOL/pkey $TESTPOOL/$TESTFS1 log_must verify_keylocation $TESTPOOL/$TESTFS1 "file:///$TESTPOOL/pkey" +setup_https +log_must zfs set keylocation=$(get_https_base_url)/PASSPHRASE $TESTPOOL/$TESTFS1 +log_must verify_keylocation $TESTPOOL/$TESTFS1 "$(get_https_base_url)/PASSPHRASE" + log_must zfs set keylocation=prompt $TESTPOOL/$TESTFS1 log_must verify_keylocation $TESTPOOL/$TESTFS1 "prompt" @@ -89,5 +94,5 @@ log_mustnot zfs set keylocation=/$TESTPOOL/pkey $TESTPOOL/$TESTFS1/child log_must verify_keylocation $TESTPOOL/$TESTFS1/child "none" -log_pass "Key location can only be 'prompt' or a file path for encryption" \ - "roots, and 'none' for unencrypted volumes" +log_pass "Key location can only be 'prompt', 'file://', or 'https://'" \ + "for encryption roots, and 'none' for unencrypted volumes" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am index e200146569..35332f822e 
100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am @@ -12,7 +12,10 @@ dist_pkgdata_SCRIPTS = \ zfs_share_008_neg.ksh \ zfs_share_009_neg.ksh \ zfs_share_010_neg.ksh \ - zfs_share_011_pos.ksh + zfs_share_011_pos.ksh \ + zfs_share_012_pos.ksh \ + zfs_share_013_pos.ksh \ + zfs_share_concurrent_shares.ksh dist_pkgdata_DATA = \ zfs_share.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/setup.ksh index 29f38e802c..1601087f71 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/setup.ksh @@ -27,10 +27,7 @@ . $STF_SUITE/include/libtest.shlib -share -s -if [ $? -ne 0 ]; then - log_unsupported "The NFS utilities are not installed" -fi +check_nfs # Make sure NFS server is running before testing. setup_nfs_server diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_001_pos.ksh index a2c06e0b38..6d4396aa19 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_001_pos.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2016, 2020 by Delphix. All rights reserved. # . 
$STF_SUITE/include/libtest.shlib @@ -66,11 +66,12 @@ function cleanup fi datasetexists $TESTPOOL/$TESTFS-clone && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS-clone + destroy_dataset $TESTPOOL/$TESTFS-clone -f - if snapexists "$TESTPOOL/$TESTFS@snapshot"; then - log_must zfs destroy -f $TESTPOOL/$TESTFS@snapshot - fi + snapexists "$TESTPOOL/$TESTFS@snapshot" && \ + destroy_dataset $TESTPOOL/$TESTFS@snapshot -f + + log_must zfs share -a } @@ -138,11 +139,20 @@ done # log_must zfs share -a +# +# We need to unset __ZFS_POOL_EXCLUDE so that we include all file systems +# in the os-specific zfs exports file. This will be reset by the next test. +# +unset __ZFS_POOL_EXCLUDE + i=0 while (( i < ${#fs[*]} )); do is_shared ${fs[i]} || \ log_fail "File system ${fs[i]} is not shared (share -a)" + is_exported ${fs[i]} || \ + log_fail "File system ${fs[i]} is not exported (share -a)" + ((i = i + 2)) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_004_pos.ksh index baa5f4e416..6c48875f52 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_004_pos.ksh @@ -47,9 +47,8 @@ verify_runnable "global" function cleanup { - if snapexists $TESTPOOL/$TESTFS@snapshot; then - log_must zfs destroy $TESTPOOL/$TESTFS@snapshot - fi + snapexists $TESTPOOL/$TESTFS@snapshot && \ + destroy_dataset $TESTPOOL/$TESTFS@snapshot log_must zfs set sharenfs=off $TESTPOOL/$TESTFS log_must unshare_fs $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_006_pos.ksh index 6b06589b69..d5394017d7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_006_pos.ksh @@ -53,7 +53,7 @@ function cleanup 
fi datasetexists $TESTPOOL/$TESTCTR/$TESTFS2 && \ - log_must zfs destroy $TESTPOOL/$TESTCTR/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTCTR/$TESTFS2 typeset fs="" for fs in $mntp $TESTDIR1 $TESTDIR2 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh index 29ca9a143a..c64157cee6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh @@ -51,7 +51,7 @@ function cleanup { set -A badopts \ "r0" "r0=machine1" "r0=machine1:machine2" \ - "-g" "-b" "-c" "-d" "--invalid" \ + "-g" "-b" "-c" "-d" "--invalid" "rw=[::1]a:[::2]" "rw=[::1" \ "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL\$TESTCTR\$TESTFS1" log_assert "Verify that invalid share parameters and options are caught." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_011_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_011_pos.ksh index f75877ee89..131b039e1c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_011_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_011_pos.ksh @@ -51,13 +51,11 @@ function cleanup log_must zfs set sharenfs=off $TESTPOOL/$TESTFS unshare_fs $TESTPOOL/$TESTFS - if snapexists "$TESTPOOL/$TESTFS@snapshot"; then - log_must zfs destroy -f $TESTPOOL/$TESTFS@snapshot - fi + snapexists "$TESTPOOL/$TESTFS@snapshot" && \ + destroy_dataset $TESTPOOL/$TESTFS@snapshot -f - if datasetexists $TESTPOOL/$TESTFS/fs2 ; then - log_must zfs destroy -f $TESTPOOL/$TESTFS/fs2 - fi + datasetexists $TESTPOOL/$TESTFS/fs2 && \ + destroy_dataset $TESTPOOL/$TESTFS/fs2 -f } log_assert "Verify that umount and destroy fail, and do not unshare the shared" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_012_pos.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_012_pos.ksh new file mode 100755 index 0000000000..fe38d55595 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_012_pos.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: Unmounted canmount=noauto export is removed during zfs share -a +# +# STRATEGY: +# 1. Share a dataset that also has canmount set to noauto +# 2. Capture the zfs exports file when the dataset is mounted + shared +# 3. Simulate a reboot by unmounting the dataset and restoring the exports file +# 4. 
Verify that 'zfs share -a' removes the export since dataset is not mounted +# + +verify_runnable "both" + +dataset="$TESTPOOL/$TESTFS" +mountpt=$(get_prop mountpoint $dataset) + +function cleanup +{ + zfs set canmount=on $dataset + zfs set sharenfs=off $dataset + zfs mount -a + + # + # unset __ZFS_POOL_EXCLUDE so that we include all file systems when + # rebuilding the exports file + # + unset __ZFS_POOL_EXCLUDE + rm /etc/exports.d/zfs.exports + zfs share -a +} + +log_assert "Unmounted canmount=noauto export is removed during zfs share -a" +log_onexit cleanup + +log_must zfs set canmount=noauto $dataset +zfs mount $dataset > /dev/null 2>&1 +log_must mounted $dataset +log_must zfs set sharenfs=on $dataset +log_must is_exported $mountpt + +log_must cp /etc/exports.d/zfs.exports /etc/exports.d/zfs.exports.save +log_must zfs umount $dataset +log_must unmounted $dataset +log_mustnot is_exported $mountpt + +# simulate a reboot condition +log_must mv /etc/exports.d/zfs.exports.save /etc/exports.d/zfs.exports + +log_must is_exported $mountpt +log_must zfs share -a +log_mustnot is_exported $mountpt + +log_pass "Unmounted canmount=noauto export is removed during zfs share -a" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_013_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_013_pos.ksh new file mode 100755 index 0000000000..150eddac0e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_013_pos.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Felix Dörre +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that NFS share options including ipv6 literals are parsed and propagated correctly. +# + +verify_runnable "global" + +function cleanup +{ + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS + is_shared $TESTPOOL/$TESTFS && \ + log_must unshare_fs $TESTPOOL/$TESTFS +} + +log_onexit cleanup + +cleanup + +log_must zfs set sharenfs="rw=[::1]" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[2::3]" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "2::3(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]:[2::3]" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1(" <<< "$output" > /dev/null +log_must grep "2::3(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]/64" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1/64(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[2::3]/128" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "2::3/128(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]/32:[2::3]/128" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1/32(" <<< "$output" > /dev/null +log_must grep "2::3/128(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]:[2::3]/64:[2a01:1234:1234:1234:aa34:234:1234:1234]:1.2.3.4/24" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1(" <<< "$output" > /dev/null +log_must grep "2::3/64(" <<< "$output" > 
/dev/null +log_must grep "2a01:1234:1234:1234:aa34:234:1234:1234(" <<< "$output" > /dev/null +log_must grep "1\\.2\\.3\\.4/24(" <<< "$output" > /dev/null + +log_pass "NFS share ip address propagated correctly." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh new file mode 100755 index 0000000000..dbaaf39b65 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh @@ -0,0 +1,201 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that 'zfs set sharenfs=on', 'zfs share', and 'zfs unshare' can +# run concurrently. The test creates 50 filesystem and 50 threads. +# Each thread will run through the test strategy in parallel. +# +# STRATEGY: +# 1. Verify that the file system is not shared. +# 2. Enable the 'sharenfs' property +# 3. Invoke 'zfs unshare' and verify filesystem is no longer shared +# 4. Invoke 'zfs share'. +# 4. Verify that the file system is shared. +# 5. 
Verify that a shared filesystem cannot be shared again. +# 6. Verify that share -a succeeds. +# + +verify_runnable "global" + +function cleanup +{ + wait + for fs in $(seq 0 50) + do + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS1/$fs + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS2/$fs + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS3/$fs + unshare_fs $TESTPOOL/$TESTFS1/$fs + unshare_fs $TESTPOOL/$TESTFS2/$fs + unshare_fs $TESTPOOL/$TESTFS3/$fs + + if mounted $TESTPOOL/$TESTFS1/$fs; then + log_must zfs unmount $TESTPOOL/$TESTFS1/$fs + fi + if mounted $TESTPOOL/$TESTFS2/$fs; then + log_must zfs unmount $TESTPOOL/$TESTFS2/$fs + fi + if mounted $TESTPOOL/$TESTFS3/$fs; then + log_must zfs unmount $TESTPOOL/$TESTFS3/$fs + fi + + datasetexists $TESTPOOL/$TESTFS1/$fs && \ + destroy_dataset $TESTPOOL/$TESTFS1/$fs -f + datasetexists $TESTPOOL/$TESTFS2/$fs && \ + destroy_dataset $TESTPOOL/$TESTFS2/$fs -f + datasetexists $TESTPOOL/$TESTFS3/$fs && \ + destroy_dataset $TESTPOOL/$TESTFS3/$fs -f + done + + log_must zfs share -a +} + +function create_filesystems +{ + for fs in $(seq 0 50) + do + log_must zfs create -p $TESTPOOL/$TESTFS1/$fs + log_must zfs create -p $TESTPOOL/$TESTFS2/$fs + log_must zfs create -p $TESTPOOL/$TESTFS3/$fs + done +} + +# +# Main test routine. +# +# Given a file system this routine will attempt +# share the mountpoint and then verify it has been shared. +# +function test_share # filesystem +{ + typeset filesystem=$1 + typeset mntp=$(get_prop mountpoint $filesystem) + + not_shared $mntp || \ + log_fail "File system $filesystem is already shared." + + zfs set sharenfs=on $filesystem || \ + log_fail "zfs set sharenfs=on $filesystem failed." + is_shared $mntp || \ + log_fail "File system $filesystem is not shared (set sharenfs)." + + # + # Verify 'zfs share' works as well. + # + zfs unshare $filesystem || \ + log_fail "zfs unshare $filesystem failed." + is_shared $mntp && \ + log_fail "File system $filesystem is still shared." 
+ + zfs share $filesystem || \ + log_fail "zfs share $filesystem failed." + is_shared $mntp || \ + log_fail "file system $filesystem is not shared (zfs share)." + + #log_note "Sharing a shared file system fails." + zfs share $filesystem && \ + log_fail "zfs share $filesystem did not fail" + return 0 +} + +# +# Set the main process id so that we know to capture +# failures from child processes and allow the parent process +# to report the failure. +# +set_main_pid $$ +log_assert "Verify that 'zfs share' succeeds as root." +log_onexit cleanup + +create_filesystems + +child_pids=() +for fs in $(seq 0 50) +do + test_share $TESTPOOL/$TESTFS1/$fs & + child_pids+=($!) + log_note "$TESTPOOL/$TESTFS1/$fs ==> $!" + test_share $TESTPOOL/$TESTFS2/$fs & + child_pids+=($!) + log_note "$TESTPOOL/$TESTFS2/$fs ==> $!" + test_share $TESTPOOL/$TESTFS3/$fs & + child_pids+=($!) + log_note "$TESTPOOL/$TESTFS3/$fs ==> $!" +done +wait_for_children "${child_pids[@]}" || + log_fail "multithreaded share test failed" + +log_note "Verify 'zfs share -a' succeeds." + +# +# Unshare each of the file systems. +# +child_pids=() +for fs in $(seq 0 50) +do + unshare_fs $TESTPOOL/$TESTFS1/$fs & + child_pids+=($!) + unshare_fs $TESTPOOL/$TESTFS2/$fs & + child_pids+=($!) + unshare_fs $TESTPOOL/$TESTFS3/$fs & + child_pids+=($!) +done +wait_for_children "${child_pids[@]}" || + log_fail "multithreaded unshare failed" + +# +# Try a zfs share -a and verify all file systems are shared. +# +log_must zfs share -a + +# +# We need to unset __ZFS_POOL_EXCLUDE so that we include all file systems +# in the os-specific zfs exports file. This will be reset by the next test. 
+# +unset __ZFS_POOL_EXCLUDE + +for fs in $(seq 0 50) +do + is_shared $TESTPOOL/$TESTFS1/$fs || \ + log_fail "File system $TESTPOOL/$TESTFS1/$fs is not shared" + is_shared $TESTPOOL/$TESTFS2/$fs || \ + log_fail "File system $TESTPOOL/$TESTFS2/$fs is not shared" + is_shared $TESTPOOL/$TESTFS3/$fs || \ + log_fail "File system $TESTPOOL/$TESTFS3/$fs is not shared" + + is_exported $TESTPOOL/$TESTFS1/$fs || \ + log_fail "File system $TESTPOOL/$TESTFS1/$fs is not exported" + is_exported $TESTPOOL/$TESTFS2/$fs || \ + log_fail "File system $TESTPOOL/$TESTFS2/$fs is not exported" + is_exported $TESTPOOL/$TESTFS3/$fs || \ + log_fail "File system $TESTPOOL/$TESTFS3/$fs is not exported" +done + +log_pass "'zfs share [ -a ] ' succeeds as root." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh index 8708d8b624..2b89af9e5a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh @@ -89,7 +89,7 @@ function cleanup_all while (( i < ${#args[*]} )); do for snap in ${args[i]}; do - snapexists $snap && log_must zfs destroy -f $snap + snapexists $snap && destroy_dataset $snap -f done (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh index 2efcf1cceb..4ae68d411d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh @@ -53,12 +53,11 @@ function cleanup for snap in $TESTPOOL/$TESTCTR/$TESTFS1@$TESTSNAP \ $TESTPOOL/$TESTCTR/$TESTVOL@$TESTSNAP; do - snapexists $snap && \ - log_must zfs destroy $snap + snapexists $snap && destroy_dataset $snap done datasetexists 
$TESTPOOL/$TESTCTR/$TESTVOL && \ - log_must zfs destroy -rf $TESTPOOL/$TESTCTR/$TESTVOL + destroy_dataset $TESTPOOL/$TESTCTR/$TESTVOL -rf } @@ -82,7 +81,7 @@ while (( i < ${#args[*]} )); do ((i = i + 1)) done -# Testing the invalid senario: the child volume already has an +# Testing the invalid scenario: the child volume already has an # identical name snapshot, zfs snapshot -r should fail when # creating snapshot with -r for the parent log_must zfs destroy $TESTPOOL/$TESTCTR/$TESTFS1@$TESTSNAP diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh index 96121f1c13..16926a48dd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh @@ -44,9 +44,7 @@ verify_runnable "both" function cleanup { - if datasetexists $initfs ; then - log_must zfs destroy -rf $initfs - fi + datasetexists $initfs && destroy_dataset $initfs -rf } log_assert "Verify recursive snapshotting could not break ZFS." 
@@ -70,9 +68,7 @@ while ((ret == 0)); do # is incorrect # if ((len >= 255)); then - if datasetexists $basefs; then - log_must zfs destroy -r $basefs - fi + datasetexists $basefs && destroy_dataset $basefs -r basefs=${basefs%/*} len=$(echo $basefs| wc -c) fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh index d97dc0f822..c133403ac8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh @@ -45,9 +45,7 @@ verify_runnable "both" function cleanup { - if datasetexists $initfs ; then - log_must zfs destroy -rf $initfs - fi + datasetexists $initfs && destroy_dataset $initfs -rf } log_assert "Verify long name filesystem with snapshot should not break ZFS." @@ -71,9 +69,7 @@ while ((ret == 0)); do # is incorrect # if ((len >= 255)); then - if datasetexists $basefs; then - log_must zfs destroy -r $basefs - fi + datasetexists $basefs && destroy_dataset $basefs -r basefs=${basefs%/*} len=$(echo $basefs| wc -c) fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh index 089ebdb979..6b711286c6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh @@ -46,9 +46,7 @@ function cleanup { for fs in $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL $TESTPOOL ; do typeset fssnap=$fs@snap - if datasetexists $fssnap ; then - log_must zfs destroy -rf $fssnap - fi + datasetexists $fssnap && destroy_dataset $fssnap -rf done cleanup_user_prop $TESTPOOL } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh index 590d56ec80..9499dca21e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh @@ -46,9 +46,7 @@ function cleanup { for fs in $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL $TESTPOOL/$TESTCTR $TESTPOOL ; do typeset fssnap=$fs@snap - if datasetexists $fssnap ; then - log_must zfs destroy -rf $fssnap - fi + datasetexists $fssnap && destroy_dataset $fssnap -rf done cleanup_user_prop $TESTPOOL } @@ -74,7 +72,12 @@ typeset ro_props="type used available avail creation referenced refer compressra mounted origin" typeset snap_ro_props="volsize recordsize recsize quota reservation reserv mountpoint \ sharenfs checksum compression compress atime devices exec readonly rdonly \ - setuid zoned" + setuid" +if is_freebsd; then + snap_ro_props+=" jailed" +else + snap_ro_props+=" zoned" +fi zfs upgrade -v > /dev/null 2>&1 if [[ $? -eq 0 ]]; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh index 3779100132..627910abd6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh @@ -34,7 +34,7 @@ # STRATEGY: # 1. Create 2 separate zpools, zpool name lengths must be the same. # 2. Attempt to simultaneously create a snapshot of each pool. -# 3. Veriy the snapshot creation failed. +# 3. Verify the snapshot creation failed. 
# verify_runnable "both" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh index 4cd98af0c6..6fedba9e5b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh @@ -12,6 +12,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Datto Inc. All rights reserved. # # @@ -22,7 +23,9 @@ # 1. Create multiple datasets # 2. Create multiple snapshots with a list of valid and invalid # snapshot names -# 3. Verify the valid snpashot creation +# 3. Verify the valid snapshot creation +# 4. Verify creation of snapshots report the correct numbers by +# performing a snapshot directory listing . $STF_SUITE/include/libtest.shlib @@ -31,9 +34,10 @@ ZFS_MAX_DATASET_NAME_LEN=256 function cleanup { for ds in $datasets; do - datasetexists $ds && log_must zfs destroy -r $ds + datasetexists $ds && destroy_dataset $ds -r done - zfs destroy -r $TESTPOOL/TESTFS4 + destroy_dataset $TESTPOOL/TESTFS4 -r + destroy_dataset $TESTPOOL/TESTFS5 -r } datasets="$TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3" @@ -66,8 +70,7 @@ i=0 while (( i < ${#valid_args[*]} )); do log_must zfs snapshot ${valid_args[i]} for token in ${valid_args[i]}; do - log_must snapexists $token && \ - log_must zfs destroy $token + snapexists $token && destroy_dataset $token done ((i = i + 1)) done @@ -86,7 +89,7 @@ for i in 1 2 3; do txg_tag=$(echo "$txg_group" | nawk -v j=$i 'FNR == j {print}') [[ $txg_tag != $(echo "$txg_group" | \ nawk -v j=$i 'FNR == j {print}') ]] \ - && log_fail "snapshots belong to differnt transaction groups" + && log_fail "snapshots belong to different transaction groups" done log_note "verify snapshot contents" for ds in $datasets; do @@ -112,4 +115,17 @@ log_must zfs rename $TESTPOOL/$TESTFS3/TESTFSA$DATASET_XXX \ 
log_must zfs snapshot -r $TESTPOOL/$TESTFS1@snap1 $TESTPOOL/$TESTFS2@snap1 \ $TESTPOOL/$TESTFS3@snap1 $TESTPOOL/TESTFS4@snap1 +MYTEST="TESTFS5" +ITERATIONS=10 +NUM_SNAPS=5 +for x in {1..$ITERATIONS}; do + log_must zfs create $TESTPOOL/$MYTEST + for y in {1..$NUM_SNAPS}; do + log_must zfs snapshot $TESTPOOL/$MYTEST@$y + done; + n=$(ls -1 /$TESTPOOL/$MYTEST/.zfs/snapshot | wc -l) + verify_eq $n $NUM_SNAPS "count" + zfs destroy -r $TESTPOOL/$MYTEST; +done; + log_pass "zfs multiple snapshot verified correctly" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/cleanup.ksh index 79cd6e9f90..7d6a7e13db 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/cleanup.ksh @@ -27,4 +27,8 @@ . $STF_SUITE/include/libtest.shlib +if ! is_linux ; then + log_unsupported "sysfs is linux-only" +fi + default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/setup.ksh index 9692385996..261bce4386 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/setup.ksh @@ -27,6 +27,10 @@ . $STF_SUITE/include/libtest.shlib +if ! 
is_linux ; then + log_unsupported "sysfs is linux-only" +fi + DISK=${DISKS%% *} default_container_volume_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key.ksh index 9e08ac69d4..55cfb5cade 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key.ksh @@ -43,7 +43,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh index ecb98d1894..55da682620 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh @@ -38,9 +38,8 @@ verify_runnable "both" function cleanup { - datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 - datasetexists $TESTPOOL/zvol && log_must zfs destroy $TESTPOOL/zvol + datasetexists $TESTPOOL/$TESTFS1 && destroy_dataset $TESTPOOL/$TESTFS1 -r + datasetexists $TESTPOOL/zvol && destroy_dataset $TESTPOOL/zvol poolexists $TESTPOOL1 && log_must destroy_pool $TESTPOOL1 } log_onexit cleanup @@ -62,7 +61,7 @@ log_must zpool create -O encryption=on -O keyformat=passphrase \ log_must zfs unmount $TESTPOOL/$TESTFS1 log_must zfs unmount $TESTPOOL1 -log_must zfs unload-key -a +log_must_busy zfs unload-key -a log_must key_unavailable $TESTPOOL/$TESTFS1 log_must key_unavailable $TESTPOOL/$TESTFS1/child diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh 
index 9766b59058..01c720c04b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh @@ -41,7 +41,7 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile.am index 34cbb17ae4..6507b094df 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile.am @@ -12,7 +12,8 @@ dist_pkgdata_SCRIPTS = \ zfs_unmount_008_neg.ksh \ zfs_unmount_009_pos.ksh \ zfs_unmount_all_001_pos.ksh \ - zfs_unmount_nested.ksh + zfs_unmount_nested.ksh \ + zfs_unmount_unload_keys.ksh dist_pkgdata_DATA = \ zfs_unmount.cfg \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh index fb4d1d9378..6036eb27a0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh @@ -55,7 +55,7 @@ function cleanup log_must zfs umount -f $TESTDIR2 datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 [[ -d $TESTDIR2 ]] && \ log_must rm -rf $TESTDIR2 diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh index e83e8d5165..e85a0f3cbf 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh @@ -53,14 +53,10 
@@ verify_runnable "both" function cleanup { for ds in $vol $fs1; do - if datasetexists $ds; then - log_must zfs destroy -f $ds - fi + datasetexists $ds && destroy_dataset $ds -f done - if snapexists $snap; then - log_must zfs destroy $snap - fi + snapexists $snap && destroy_dataset $snap if [[ -e /tmp/$file ]]; then rm -f /tmp/$file @@ -95,15 +91,14 @@ for arg in ${badargs[@]}; do log_mustnot eval "zfs unmount $arg $fs >/dev/null 2>&1" done - -#Testing invalid datasets +# Testing invalid datasets for ds in $snap $vol "blah"; do for opt in "" "-f"; do log_mustnot eval "zfs unmount $opt $ds >/dev/null 2>&1" done done -#Testing invalid mountpoint +# Testing invalid mountpoint dir=foodir.$$ file=foo.$$ fs1=$TESTPOOL/fs.$$ @@ -119,22 +114,22 @@ for mpt in "./$dir" "./$file" "/tmp"; do done cd $curpath -#Testing null argument and too many arguments +# Testing null argument and too many arguments for opt in "" "-f"; do log_mustnot eval "zfs unmount $opt >/dev/null 2>&1" log_mustnot eval "zfs unmount $opt $fs $fs1 >/dev/null 2>&1" done -#Testing already unmounted filesystem +# Testing already unmounted filesystem log_must zfs unmount $fs1 for opt in "" "-f"; do log_mustnot eval "zfs unmount $opt $fs1 >/dev/null 2>&1" log_mustnot eval "zfs unmount /tmp/$dir >/dev/null 2>&1" done -#Testing legacy mounted filesystem +# Testing legacy mounted filesystem log_must zfs set mountpoint=legacy $fs1 -if is_linux; then +if is_linux || is_freebsd; then log_must mount -t zfs $fs1 /tmp/$dir else log_must mount -F zfs $fs1 /tmp/$dir diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh index 0ed14a99fc..814d603db5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh @@ -55,9 +55,7 @@ function cleanup for fs in $TESTPOOL/$TESTFS $TESTPOOL ; do typeset 
snap=$fs@$TESTSNAP - if snapexists $snap; then - log_must zfs destroy $snap - fi + snapexists $snap && destroy_dataset $snap done if ! poolexists $TESTPOOL && is_global_zone; then @@ -83,7 +81,7 @@ function restore_dataset } -log_assert "zfs fource unmount and destroy in snapshot directory will not cause error." +log_assert "zfs force unmount and destroy in snapshot directory will not cause error." log_onexit cleanup for fs in $TESTPOOL/$TESTFS $TESTPOOL ; do @@ -139,4 +137,4 @@ log_must eval zpool list > /dev/null 2>&1 log_must eval zpool status > /dev/null 2>&1 zpool iostat > /dev/null 2>&1 -log_pass "zfs fource unmount and destroy in snapshot directory will not cause error." +log_pass "zfs force unmount and destroy in snapshot directory will not cause error." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh index 73eae6a250..7da8be3d17 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh @@ -45,20 +45,24 @@ function nesting_cleanup log_onexit nesting_cleanup set -A test_depths 30 16 3 +typeset mountpoint=/$TESTPOOL/mnt dsA32=$(printf 'a/%.0s' {1..32})"a" log_must zfs create -p $TESTPOOL/$dsA32 dsB32=$(printf 'b/%.0s' {1..32})"b" log_must zfs create -o mountpoint=none -p $TESTPOOL/$dsB32 -log_mustnot mount -t zfs $TESTPOOL/$dsB32 /mnt +# FreeBSD's mount command ignores the mountpoint property. +if ! 
is_freebsd; then + log_mustnot mount -t zfs $TESTPOOL/$dsB32 /mnt +fi dsC32=$(printf 'c/%.0s' {1..32})"c" log_must zfs create -o mountpoint=legacy -p $TESTPOOL/$dsC32 log_must mount -t zfs $TESTPOOL/$dsC32 /mnt dsD32=$(printf 'd/%.0s' {1..32})"d" -log_must zfs create -o mountpoint=/$TESTPOOL/mnt -p $TESTPOOL/$dsD32 +log_must zfs create -o mountpoint=$mountpoint -p $TESTPOOL/$dsD32 for d in ${test_depths[@]}; do @@ -79,7 +83,7 @@ for d in ${test_depths[@]}; do log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$ds_pre is not mounted" @@ -109,7 +113,7 @@ for d in ${test_depths[@]}; do log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not mounted" @@ -139,7 +143,7 @@ for d in ${test_depths[@]}; do log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not mounted" @@ -152,7 +156,7 @@ for d in ${test_depths[@]}; do fi - # mountpoint=testpool/mnt + # mountpoint=/testpool/mnt ds_pre=$(printf 'd/%.0s' {1..$(($d-2))})"d" ds=$(printf 'd/%.0s' {1..$(($d-1))})"d" ds_post=$(printf 'd/%.0s' {1..$(($d))})"d" @@ -169,7 +173,7 @@ for d in ${test_depths[@]}; do log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! 
ismounted $TESTPOOL/$ds_pre; then log_fail "$ds_pre is not mounted" @@ -182,8 +186,8 @@ for d in ${test_depths[@]}; do fi done +log_must rmdir $mountpoint # remove the mountpoint we created log_must zpool export $TESTPOOL -log_must rmdir /testpool/mnt # remove the mountpoint we created log_must zpool import $TESTPOOL log_pass "Verified nested dataset are unmounted." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh new file mode 100755 index 0000000000..c92287ad75 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_unmount/zfs_unmount.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# "zfs unmount -u" should allow the user to unload their encryption +# keys while unmounting one or more datasets +# +# STRATEGY: +# 1. 
Create a hierarchy of encrypted datasets +# 2. Test that 'zfs unmount -u' unloads keys as it unmounts a dataset +# 3. Test that 'zfs unmount -u' unloads keys as it unmounts multiple datasets +# 4. Test that 'zfs unmount -u' returns an error if the key is still in +# use by a clone. +# + +verify_runnable "both" + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS2 && \ + destroy_dataset $TESTPOOL/$TESTFS2 -r + datasetexists $TESTPOOL/$TESTFS2/newroot && \ + destroy_dataset $TESTPOOL/$TESTFS2/newroot -r + datasetexists $TESTPOOL/$TESTFS2/child && \ + destroy_dataset $TESTPOOL/$TESTFS2/child -r + +} +log_onexit cleanup + +log_assert "'zfs unmount -u' should unload keys for datasets as they are unmounted" +log_must eval "echo 'password' | zfs create -o encryption=on -o keyformat=passphrase $TESTPOOL/$TESTFS2" +log_must eval "echo 'password' | zfs create -o encryption=on -o keyformat=passphrase $TESTPOOL/$TESTFS2/newroot" +log_must zfs create $TESTPOOL/$TESTFS2/child + +log_must zfs umount -u $TESTPOOL/$TESTFS2/newroot +log_must key_unavailable $TESTPOOL/$TESTFS2/newroot +log_must eval "echo 'password' | zfs mount -l $TESTPOOL/$TESTFS2/newroot" + +log_must zfs umount -u $TESTPOOL/$TESTFS2 +log_must key_unavailable $TESTPOOL/$TESTFS2 +log_must key_unavailable $TESTPOOL/$TESTFS2/newroot +log_must key_unavailable $TESTPOOL/$TESTFS2/child +log_must eval "echo 'password' | zfs mount -l $TESTPOOL/$TESTFS2/newroot" + +log_must zfs snap $TESTPOOL/$TESTFS2/newroot@1 +log_must zfs clone $TESTPOOL/$TESTFS2/newroot@1 $TESTPOOL/$TESTFS2/clone +log_mustnot zfs umount -u $TESTPOOL/$TESTFS2/newroot +log_must key_available $TESTPOOL/$TESTFS2/newroot +log_must mounted $TESTPOOL/$TESTFS2/newroot + +log_pass "'zfs unmount -u' unloads keys for datasets as they are unmounted" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh index 7bb1cd4a37..ac16fe97b9 100755 --- 
a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh @@ -62,17 +62,14 @@ function cleanup [[ -d $TESTDIR2 ]] && \ log_must rm -rf $TESTDIR2 - if datasetexists "$TESTPOOL/$TESTCLONE"; then - log_must zfs destroy -f $TESTPOOL/$TESTCLONE - fi + datasetexists "$TESTPOOL/$TESTCLONE" && \ + destroy_dataset $TESTPOOL/$TESTCLONE -f - if snapexists "$TESTPOOL/$TESTFS2@snapshot"; then - log_must zfs destroy -f $TESTPOOL/$TESTFS2@snapshot - fi + snapexists "$TESTPOOL/$TESTFS2@snapshot" && \ + destroy_dataset $TESTPOOL/$TESTFS2@snapshot -f - if datasetexists "$TESTPOOL/$TESTFS2"; then - log_must zfs destroy -f $TESTPOOL/$TESTFS2 - fi + datasetexists "$TESTPOOL/$TESTFS2" && \ + destroy_dataset $TESTPOOL/$TESTFS2 -f } # @@ -140,7 +137,7 @@ while (( i < ${#mntp_fs[*]} )); do ((i = i + 2)) done -log_note "Verify 'zfs unshare -a' succeds as root." +log_note "Verify 'zfs unshare -a' succeeds as root." 
i=0 typeset sharenfs_val diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh index 6a9c72311c..1ded1b42c7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh @@ -65,17 +65,14 @@ function cleanup [[ -d $TESTDIR2 ]] && \ log_must rm -rf $TESTDIR2 - if datasetexists "$TESTPOOL/$TESTCLONE"; then - log_must zfs destroy -f $TESTPOOL/$TESTCLONE - fi + datasetexists "$TESTPOOL/$TESTCLONE" && \ + destroy_dataset $TESTPOOL/$TESTCLONE -f - if snapexists "$TESTPOOL/$TESTFS2@snapshot"; then - log_must zfs destroy -f $TESTPOOL/$TESTFS2@snapshot - fi + snapexists "$TESTPOOL/$TESTFS2@snapshot" && \ + destroy_dataset $TESTPOOL/$TESTFS2@snapshot -f - if datasetexists "$TESTPOOL/$TESTFS2"; then - log_must zfs destroy -f $TESTPOOL/$TESTFS2 - fi + datasetexists "$TESTPOOL/$TESTFS2" && \ + destroy_dataset $TESTPOOL/$TESTFS2 -f } # diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh index 66a7e80eb7..6e66deda9b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh @@ -49,9 +49,8 @@ verify_runnable "global" function cleanup { - if snapexists $TESTPOOL/$TESTFS@snapshot; then - log_must zfs destroy $TESTPOOL/$TESTFS@snapshot - fi + snapexists $TESTPOOL/$TESTFS@snapshot && \ + destroy_dataset $TESTPOOL/$TESTFS@snapshot log_must zfs set sharenfs=off $TESTPOOL/$TESTFS } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh index e92581c7c9..fd916040b1 100755 --- 
a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh @@ -46,7 +46,7 @@ verify_runnable "global" export NONEXISTFSNAME="nonexistfs50charslong_0123456789012345678901234567" export NONEXISTMOUNTPOINT="/nonexistmountpoint_0123456789" -set -A opts "" "$TESTPOOL/$NONEXISTFSNAME" "$NONEEXISTMOUNTPOINT" "-?" "-1" \ +set -A opts "" "$TESTPOOL/$NONEXISTFSNAME" "$NONEXISTMOUNTPOINT" "-?" "-1" \ "-a blah" "$TESTPOOL/$TESTFS $TESTPOOL/$TESTFS1" \ "-f $TESTPOOL/$TESTFS $TESTPOOL/$TESTFS1" \ "$TESTPOOL/$TESTFS $TESTDIR" "-f $TESTPOOL/$TESTFS $TESTDIR" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh index 0749dc1b86..36817a0920 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh @@ -42,9 +42,8 @@ verify_runnable "global" function cleanup { - if datasetexists "$TESTPOOL/$TESTFS/shared1"; then - log_must zfs destroy -f $TESTPOOL/$TESTFS/shared1 - fi + datasetexists "$TESTPOOL/$TESTFS/shared1" && \ + destroy_dataset $TESTPOOL/$TESTFS/shared1 -f } log_assert "Verify 'zfs destroy' will unshare the dataset" @@ -57,16 +56,12 @@ log_must zfs create \ # # 2. Verify the datasets is shared. # -# The "non-impl" variant of "is_shared" requires the dataset to exist. -# Thus, we can only use the "impl" variant in step 4, below. To be -# consistent with step 4, we also use the "impl" variant here. -# -log_must eval "is_shared_impl $TESTDIR/1" +log_must is_shared $TESTDIR/1 # 3. Invoke 'zfs destroy' on the dataset. log_must zfs destroy -f $TESTPOOL/$TESTFS/shared1 # 4. Verify the dataset is not shared. -log_mustnot eval "is_shared_impl $TESTDIR/1" +log_mustnot is_shared $TESTDIR/1 log_pass "'zfs destroy' will unshare the dataset." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/setup.ksh index 4c1348a192..c9f36017df 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/setup.ksh @@ -39,4 +39,4 @@ else log_note "This machine is running ZFS Filesystem version $ZFS_VERSION" fi -default_setup $DISKS +default_setup "$DISKS" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh index e37b4f81ab..ab76461638 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh @@ -50,9 +50,7 @@ verify_runnable "both" function cleanup { - if datasetexists $rootfs ; then - log_must zfs destroy -Rf $rootfs - fi + datasetexists $rootfs && destroy_dataset $rootfs -Rf log_must zfs create $rootfs for file in $output $oldoutput ; do @@ -116,9 +114,7 @@ if (( i != COUNT - OLDCOUNT )); then fi for fs in $old_datasets ; do - if datasetexists $fs ; then - log_must zfs destroy -Rf $fs - fi + datasetexists $fs && destroy_dataset $fs -Rf done log_must eval 'zfs upgrade > $output 2>&1' @@ -133,7 +129,7 @@ COUNT=$( wc -l $output | awk '{print $1}' ) if (( COUNT != OLDCOUNT )); then cat $output - log_fail "Unexpect old-version filesystems print out." + log_fail "Unexpected old-version filesystems print out." fi log_pass "Executing 'zfs upgrade' command succeeds." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh index 6df47b450d..57f74ca285 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh @@ -47,9 +47,7 @@ verify_runnable "both" function cleanup { - if datasetexists $rootfs ; then - log_must zfs destroy -Rf $rootfs - fi + datasetexists $rootfs && destroy_dataset $rootfs -Rf log_must zfs create $rootfs } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh index e3ff4f4b90..0b8fef5cd0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh @@ -47,9 +47,7 @@ verify_runnable "both" function cleanup { - if datasetexists $rootfs ; then - log_must zfs destroy -Rf $rootfs - fi + datasetexists $rootfs && destroy_dataset $rootfs -Rf log_must zfs create $rootfs } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh index 1a929918bf..5fcdc6e268 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh @@ -47,9 +47,7 @@ verify_runnable "both" function cleanup { - if datasetexists $rootfs ; then - log_must zfs destroy -Rf $rootfs - fi + datasetexists $rootfs && destroy_dataset $rootfs -Rf log_must zfs create $rootfs } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am new file mode 100644 index 0000000000..d401fe68b1 --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_wait +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zfs_wait_deleteq.ksh + +dist_pkgdata_DATA = \ + zfs_wait.kshlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh similarity index 86% rename from tests/zfs-tests/tests/functional/cli_root/zfs_remap/cleanup.ksh rename to tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh index e78deacd5b..456d2d0c2d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh @@ -1,5 +1,6 @@ #!/bin/ksh -p # +# # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version @@ -11,7 +12,7 @@ # # -# Copyright 2018, loli10K . All rights reserved. +# Copyright (c) 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh new file mode 100755 index 0000000000..cca05fee72 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh @@ -0,0 +1,21 @@ +#!/bin/ksh -p +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib new file mode 100644 index 0000000000..9f62a7c92e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib @@ -0,0 +1,80 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018, 2019 by Delphix. All rights reserved. +# + +typeset -a disk_array=($(find_disks $DISKS)) + +typeset -r DISK1=${disk_array[0]} +typeset -r DISK2=${disk_array[1]} +typeset -r DISK3=${disk_array[2]} + +# +# When the condition it is waiting for becomes true, 'zfs wait' should return +# promptly. We want to enforce this, but any check will be racy because it will +# take some small but indeterminate amount of time for the waiting thread to be +# woken up and for the process to exit. +# +# To deal with this, we provide a grace period after the condition becomes true +# during which 'zfs wait' can exit. If it hasn't exited by the time the grace +# period expires we assume something is wrong and fail the test. While there is +# no value that can really be correct, the idea is we choose something large +# enough that it shouldn't cause issues in practice. 
+# +typeset -r WAIT_EXIT_GRACE=2.0 + +function proc_exists # pid +{ + ps -p $1 >/dev/null +} + +function proc_must_exist # pid +{ + proc_exists $1 || log_fail "zpool process exited too soon" +} + +function proc_must_not_exist # pid +{ + proc_exists $1 && log_fail "zpool process took too long to exit" +} + +function get_time +{ + date +'%H:%M:%S' +} + +function kill_if_running +{ + typeset pid=$1 + [[ $pid ]] && proc_exists $pid && log_must kill -s TERM $pid +} + +# Log a command and then start it running in the background +function log_bkgrnd +{ + log_note "$(get_time) Starting cmd in background '$@'" + "$@" & +} + +# Check that a background process has completed and exited with a status of 0 +function bkgrnd_proc_succeeded +{ + typeset pid=$1 + + log_must sleep $WAIT_EXIT_GRACE + + proc_must_not_exist $pid + wait $pid || log_fail "process exited with status $?" + log_note "$(get_time) wait completed successfully" +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh new file mode 100755 index 0000000000..00c5a109c0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh @@ -0,0 +1,57 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib + +# +# DESCRIPTION: +# 'zfs wait' works when waiting for the delete queue to empty. +# +# STRATEGY: +# 1. Create a file +# 2. 
Open a file descriptor pointing to that file. +# 3. Delete the file. +# 4. Start a background process waiting for the delete queue to empty. +# 5. Verify that the command doesn't return immediately. +# 6. Close the open file descriptor. +# 7. Verify that the command returns soon after the descriptor is closed. +# + +function cleanup +{ + kill_if_running $pid + exec 3<&- +} + + +typeset -r TESTFILE="/$TESTPOOL/testfile" +typeset pid + +log_onexit cleanup + +log_must touch $TESTFILE +exec 3<> $TESTFILE +log_must rm $TESTFILE +log_bkgrnd zfs wait -t deleteq $TESTPOOL +pid=$! +proc_must_exist $pid + +exec 3<&- +log_must sleep 0.5 +bkgrnd_proc_succeeded $pid + +log_pass "'zfs wait -t discard' works." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool/Makefile.am index 2d0046c53a..327f236211 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool/Makefile.am @@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_001_neg.ksh \ zpool_002_pos.ksh \ - zpool_003_pos.ksh + zpool_003_pos.ksh \ + zpool_colors.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool/setup.ksh index 6a9af3bc28..4e3b6b0e9f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool/setup.ksh @@ -29,4 +29,4 @@ DISK=${DISKS%% *} -default_setup $DISK +default_mirror_setup $DISKS diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_001_neg.ksh index a3158bd578..25decd7886 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_001_neg.ksh @@ -37,7 +37,7 @@ # return an error. # # STRATEGY: -# 1. Create an array containg each zpool sub-command name. +# 1. 
Create an array containing each zpool sub-command name. # 2. For each element, execute the sub-command. # 3. Verify it returns an error. # diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh index 4cdc71123a..caf8a9a2d0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh @@ -47,31 +47,32 @@ function cleanup { unset ZFS_ABORT - if [[ -d $corepath ]]; then - rm -rf $corepath + if is_freebsd && [ -n "$old_corefile" ]; then + sysctl kern.corefile=$old_corefile fi - if poolexists $pool; then - log_must zpool destroy -f $pool - fi + # Clean up the pool created if we failed to abort. + poolexists $pool && destroy_pool $pool + + rm -rf $corepath $vdev1 $vdev2 $vdev3 } log_assert "With ZFS_ABORT set, all zpool commands can abort and generate a core file." log_onexit cleanup -#preparation work for testing corepath=$TESTDIR/core +corefile=$corepath/zpool.core if [[ -d $corepath ]]; then - rm -rf $corepath + log_must rm -rf $corepath fi -mkdir $corepath +log_must mkdir $corepath pool=pool.$$ vdev1=$TESTDIR/file1 vdev2=$TESTDIR/file2 vdev3=$TESTDIR/file3 for vdev in $vdev1 $vdev2 $vdev3; do - mkfile $MINVDEVSIZE $vdev + log_must mkfile $MINVDEVSIZE $vdev done set -A cmds "create $pool mirror $vdev1 $vdev2" "list $pool" "iostat $pool" \ @@ -86,23 +87,25 @@ set -A badparams "" "create" "destroy" "add" "remove" "list *" "iostat" "status" "import" "export" "upgrade" "history -?" 
"get" "set" if is_linux; then - ulimit -c unlimited - echo "$corepath/core.zpool" >/proc/sys/kernel/core_pattern + echo $corefile >/proc/sys/kernel/core_pattern echo 0 >/proc/sys/kernel/core_uses_pid - export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" -else - coreadm -p ${corepath}/core.%f +elif is_freebsd; then + old_corefile=$(sysctl -n kern.corefile) + log_must sysctl kern.corefile=$corefile fi +ulimit -c unlimited +export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" export ZFS_ABORT=yes for subcmd in "${cmds[@]}" "${badparams[@]}"; do - corefile=${corepath}/core.zpool zpool $subcmd >/dev/null 2>&1 if [[ ! -e $corefile ]]; then - log_fail "zpool $subcmd cannot generate core file with ZFS_ABORT set." + log_fail "zpool $subcmd cannot generate core file with ZFS_ABORT set." fi rm -f $corefile done +unset ZFS_ABORT + log_pass "With ZFS_ABORT set, zpool command can abort and generate core file as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh index 0f04f0c046..71d73c0f80 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh @@ -42,11 +42,34 @@ # 3. Verify it run successfully. # +function cleanup +{ + unset ZFS_ABORT + + if is_freebsd && [ -n "$old_corefile" ]; then + sysctl kern.corefile=$old_corefile + fi + + rm -rf $corepath + + # Don't leave the pool frozen. + destroy_pool $TESTPOOL + default_mirror_setup $DISKS +} + verify_runnable "both" log_assert "Debugging features of zpool should succeed." +log_onexit cleanup -log_must zpool -? > /dev/null 2>&1 +corepath=$TESTDIR/core +corefile=$corepath/zpool.core +if [[ -d $corepath ]]; then + log_must rm -rf $corepath +fi +log_must mkdir $corepath + +log_must eval "zpool -? 
>/dev/null 2>&1" if is_global_zone ; then log_must zpool freeze $TESTPOOL @@ -57,21 +80,22 @@ fi log_mustnot zpool freeze fakepool -# Remove corefile possibly left by previous failing run of this test. -[[ -f core ]] && log_must rm -f core - if is_linux; then - ulimit -c unlimited - echo "core" >/proc/sys/kernel/core_pattern + echo $corefile >/proc/sys/kernel/core_pattern echo 0 >/proc/sys/kernel/core_uses_pid - export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" +elif is_freebsd; then + old_corefile=$(sysctl -n kern.corefile) + log_must sysctl kern.corefile=$corefile fi +ulimit -c unlimited + +export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" +export ZFS_ABORT=yes + +zpool >/dev/null 2>&1 -ZFS_ABORT=1; export ZFS_ABORT -zpool > /dev/null 2>&1 unset ZFS_ABORT -[[ -f core ]] || log_fail "zpool did not dump core by request." -[[ -f core ]] && log_must rm -f core +[[ -f $corefile ]] || log_fail "zpool did not dump core by request." log_pass "Debugging features of zpool succeed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_colors.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_colors.ksh new file mode 100755 index 0000000000..18f2383863 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_colors.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2019 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test that zpool status colored output works. +# +# STRATEGY: +# 1. 
Create a pool with a bunch of errors and force fault one of the vdevs. +# 2. Look for 'pool:' in bold. +# 3. Look for 'DEGRADED' in yellow +# 4. Look for 'FAULTED' in red +# + +verify_runnable "both" + +function cleanup +{ + zinject -c all +} + +log_onexit cleanup + +log_assert "Test colorized zpool status output" + +DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" + +log_must dd if=/dev/urandom of=/$TESTDIR/testfile bs=10M count=1 + +log_must zpool sync + +log_must zpool offline -f $TESTPOOL $DISK3 +log_must wait_for_degraded $TESTPOOL +log_must zinject -d $DISK2 -e io -T read -f 20 $TESTPOOL +log_must zinject -d $DISK2 -e io -T write -f 20 $TESTPOOL + + +log_must zpool scrub -w $TESTPOOL +log_must zinject -c all + + +# Use 'script' to fake zpool status into thinking it's running in a tty. +# Log the output here in case it's needed for postmortem. +log_note "$(faketty TERM=xterm-256color ZFS_COLOR=1 zpool status)" + +# Replace the escape codes with "ESC" so they're easier to grep +out="$(faketty TERM=xterm-256color ZFS_COLOR=1 zpool status | \ + grep -E 'pool:|DEGRADED' | \ + sed -r 's/[[:space:]]+//g;'$(echo -e 's/\033/ESC/g'))" + +log_note "$(echo $out)" + +log_note "Look for 'pool:' in bold" +log_must eval "echo \"$out\" | grep -q 'ESC\[1mpool:ESC\[0m' " + +log_note "Look for 'DEGRADED' in yellow" +log_must eval "echo \"$out\" | grep -q 'ESC\[0;33mDEGRADEDESC\[0m'" + +# +# The escape code for 'FAULTED' is a little more tricky. The line starts like +# this: +# +# loop2 FAULTED +# +# Luckily, awk counts the start and end escape codes as separate fields, so +# we can easily remove the vdev field to get what we want. 
+# +out="$(faketty TERM=xterm-256color ZFS_COLOR=1 zpool status \ + | awk '/FAULTED/{print $1$3$4}' | sed -r $(echo -e 's/\033/ESC/g'))" + +log_note "Look for 'FAULTED' in red" +log_must eval "echo \"$out\" | grep -q 'ESC\[0;31mFAULTEDESC\[0m'" + +log_pass "zpool status displayed colors" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am index a7f62b6f9f..8d54d13f72 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am @@ -14,7 +14,8 @@ dist_pkgdata_SCRIPTS = \ zpool_add_010_pos.ksh \ add-o_ashift.ksh \ add_prop_ashift.ksh \ - add_nested_replacing_spare.ksh + add_nested_replacing_spare.ksh \ + zpool_add_dryrun_output.ksh dist_pkgdata_DATA = \ zpool_add.cfg \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 8556f298e7..89cc4b0d30 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -22,10 +22,11 @@ # # Copyright 2017, loli10K. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib # # DESCRIPTION: @@ -35,25 +36,29 @@ # STRATEGY: # 1. Create a pool with default values. # 2. Verify 'zpool add -o ashift=' works with allowed values (9-16). -# 3. Verify 'zpool add -o ashift=' doesn't accept other invalid values. +# 3. Verify setting kernel tunable for file vdevs works correctly. +# 4. Verify 'zpool add -o ashift=' doesn't accept other invalid values. 
# verify_runnable "global" function cleanup { + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift poolexists $TESTPOOL && destroy_pool $TESTPOOL - log_must rm -f $disk1 $disk2 + rm -f $disk1 $disk2 } log_assert "zpool add -o ashift=' works with different ashift values" log_onexit cleanup -disk1=$TEST_BASE_DIR/$FILEDISK0 -disk2=$TEST_BASE_DIR/$FILEDISK1 +disk1=$TEST_BASE_DIR/disk1 +disk2=$TEST_BASE_DIR/disk2 log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk2 +orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) + typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do @@ -69,13 +74,31 @@ do log_must zpool destroy $TESTPOOL log_must zpool labelclear $disk1 log_must zpool labelclear $disk2 + + # + # Make sure we can also set the ashift using the tunable. + # + log_must zpool create $TESTPOOL $disk1 + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift + log_must zpool add $TESTPOOL $disk2 + verify_ashift $disk2 $ashift + if [[ $? -ne 0 ]] + then + log_fail "Device was added without setting ashift value to "\ + "$ashift" + fi + # clean things for the next run + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift + log_must zpool destroy $TESTPOOL + log_must zpool labelclear $disk1 + log_must zpool labelclear $disk2 done typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") for badval in ${badvals[@]} do log_must zpool create $TESTPOOL $disk1 - log_mustnot zpool add $TESTPOOL -o ashift="$badval" $disk2 + log_mustnot zpool add -o ashift="$badval" $TESTPOOL $disk2 # clean things for the next run log_must zpool destroy $TESTPOOL log_must zpool labelclear $disk1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_nested_replacing_spare.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_nested_replacing_spare.ksh index ec94df8356..61f5f6d1ce 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_nested_replacing_spare.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_nested_replacing_spare.ksh @@ -25,7 +25,6 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib # # DESCRIPTION: diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh index 29debe1065..4637fe0d84 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh @@ -22,6 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -43,6 +44,7 @@ verify_runnable "global" function cleanup { + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift poolexists $TESTPOOL && destroy_pool $TESTPOOL log_must rm -f $disk1 $disk2 } @@ -50,11 +52,19 @@ function cleanup log_assert "'zpool add' uses the ashift pool property value as default." log_onexit cleanup -disk1=$TEST_BASE_DIR/$FILEDISK0 -disk2=$TEST_BASE_DIR/$FILEDISK1 +disk1=$TEST_BASE_DIR/disk1 +disk2=$TEST_BASE_DIR/disk2 log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk2 +orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +# +# Set the file vdev's ashift to the max. Overriding +# the ashift using the -o ashift property should still +# be honored. +# +log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16 + typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do @@ -77,7 +87,7 @@ do for cmdval in ${ashifts[@]} do log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add $TESTPOOL -o ashift=$cmdval $disk2 + log_must zpool add -o ashift=$cmdval $TESTPOOL $disk2 verify_ashift $disk2 $cmdval if [[ $? 
-ne 0 ]] then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh index 48a6bc3019..33bd94fdc4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh @@ -32,11 +32,4 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib -DISK=${DISKS%% *} -if is_mpath_device $DISK; then - delete_partitions -fi - -cleanup_devices $DISKS - log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh index 4b5f44a2a6..13bd33ee42 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh @@ -34,28 +34,4 @@ verify_runnable "global" -if ! $(is_physical_device $DISKS) ; then - log_unsupported "This directory cannot be run on raw files." -fi - -disk1=${DISKS%% *} -if is_mpath_device $disk1; then - delete_partitions -fi - -if [[ -n $DISK ]]; then - # - # Use 'zpool create' to clean up the information in - # in the given disk to avoid slice overlapping. - # - cleanup_devices $DISK - - partition_disk $SIZE $DISK 7 -else - for disk in `echo $DISKSARRAY`; do - cleanup_devices $disk - partition_disk $SIZE $disk 7 - done -fi - log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg index e4429b2a83..a634b8b3c6 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg @@ -28,59 +28,12 @@ # Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
# -export DISK_ARRAY_NUM=0 -export DISK_ARRAY_LIMIT=4 -export DISKSARRAY="" - -function set_disks -{ - set -A disk_array $(find_disks $DISKS) - - if (( ${#disk_array[*]} <= 1 )); then - export DISK=${DISKS%% *} - else - export DISK="" - typeset -i i=0 - while (( i < ${#disk_array[*]} )); do - export DISK${i}="${disk_array[$i]}" - DISKSARRAY="$DISKSARRAY ${disk_array[$i]}" - (( i = i + 1 )) - (( i>$DISK_ARRAY_LIMIT )) && break - done - export DISK_ARRAY_NUM=$i - export DISKSARRAY - fi - - if (( $DISK_ARRAY_NUM == 0 )); then - export disk=$DISK - else - export disk=$DISK0 - fi - -} - -set_disks - export SIZE="$(((MINVDEVSIZE / (1024 * 1024)) * 2))m" +export VOLSIZE=$MINVDEVSIZE + +echo $DISKS | read DISK0 DISK1 DISK2 if is_linux; then + export DISK_ARRAY_NUM=3 set_device_dir - set_slice_prefix - export SLICE0=1 - export SLICE1=2 - export SLICE3=4 - export SLICE4=5 - export SLICE5=6 - export SLICE6=7 -else - export DEV_DSKDIR="/dev" - export SLICE_PREFIX="s" - export SLICE0=0 - export SLICE1=1 - export SLICE3=3 - export SLICE4=4 - export SLICE5=5 - export SLICE6=6 fi - -export VOLSIZE=$MINVDEVSIZE diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib index f80a2a864e..a7a1fb3302 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib @@ -40,12 +40,12 @@ function find_vfstab_dev typeset vfstabdevs="" typeset line - if is_linux; then - vfstab="/etc/fstab" - tmpfile="$TEST_BASE_DIR/fstab.tmp" - else + if is_illumos; then vfstab="/etc/vfstab" tmpfile="$TEST_BASE_DIR/vfstab.tmp" + else + vfstab="/etc/fstab" + tmpfile="$TEST_BASE_DIR/fstab.tmp" fi cat $vfstab | grep "^${DEV_DSKDIR}" >$tmpfile @@ -69,7 +69,12 @@ function find_mnttab_dev typeset mnttabdevs="" typeset line - if is_linux; then + if is_freebsd; then + # FreeBSD doesn't have a mnttab file. 
+ mount -p | awk -v dir="^${DEV_DSKDIR}" \ + '$1 ~ dir { print $1 }' | xargs + return 0 + elif is_linux; then typeset mnttab="/etc/mtab" typeset tmpfile="$TEST_BASE_DIR/mtab.tmp" else @@ -90,36 +95,17 @@ function find_mnttab_dev } # -# Save the systme current dump device configuration +# Save the system current dump device configuration # function save_dump_dev { - typeset dumpdev + typeset dumpdev="" - if is_linux; then - dumpdev="" - else + if is_illumos; then typeset fnd="Dump device" dumpdev=`dumpadm | grep "$fnd" | cut -f2 -d : | \ awk '{print $1}'` fi echo $dumpdev } - -# -# Common cleanup routine for partitions used in testing -# -function partition_cleanup -{ - - if [[ -n $DISK ]]; then - partition_disk $SIZE $DISK 7 - else - typeset disk="" - for disk in $DISK0 $DISK1; do - partition_disk $SIZE $disk 7 - done - fi - -} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh index a0fc0eadeb..191ec839a9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh @@ -47,61 +47,41 @@ verify_runnable "global" function cleanup { - poolexists $TESTPOOL && \ - destroy_pool $TESTPOOL - - partition_cleanup + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -f $disk0 $disk1 } log_assert "'zpool add ...' can add devices to the pool." 
log_onexit cleanup -set -A keywords "" "mirror" "raidz" "raidz1" "spare" +set -A keywords "" "mirror" "raidz" "raidz1" "draid:1s" "draid1:1s" "spare" -case $DISK_ARRAY_NUM in -0|1) - pooldevs="${disk}${SLICE_PREFIX}${SLICE0} \ - ${DEV_DSKDIR}/${disk}${SLICE_PREFIX}${SLICE0} \ - \"${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1}\"" - mirrordevs="\"${DEV_DSKDIR}/${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1}\"" - raidzdevs="\"${DEV_DSKDIR}/${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1}\"" +pooldevs="${DISK0} \ + \"${DISK0} ${DISK1}\" \ + \"${DISK0} ${DISK1} ${DISK2}\"" +mirrordevs="\"${DISK0} ${DISK1}\"" +raidzdevs="\"${DISK0} ${DISK1}\"" +draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" - ;; -2|*) - pooldevs="${DISK0}${SLICE_PREFIX}${SLICE0} \ - \"${DEV_DSKDIR}/${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0}\" \ - \"${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK0}${SLICE_PREFIX}${SLICE1} \ - ${DISK1}${SLICE_PREFIX}${SLICE1}\"\ - \"${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0} \ - ${DISK0}${SLICE_PREFIX}${SLICE1}\ - ${DISK1}${SLICE_PREFIX}${SLICE1}\"" - mirrordevs="\"${DEV_DSKDIR}/${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0}\"" - raidzdevs="\"${DEV_DSKDIR}/${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0}\"" - - ;; -esac +disk0=$TEST_BASE_DIR/disk0 +disk1=$TEST_BASE_DIR/disk1 +disk2=$TEST_BASE_DIR/disk2 +truncate -s $MINVDEVSIZE $disk0 $disk1 $disk2 typeset -i i=0 typeset vdev eval set -A poolarray $pooldevs eval set -A mirrorarray $mirrordevs eval set -A raidzarray $raidzdevs +eval set -A draidarray $draiddevs while (( $i < ${#keywords[*]} )); do case ${keywords[i]} in ""|spare) for vdev in "${poolarray[@]}"; do - create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE6}" + create_pool "$TESTPOOL" "$disk0" log_must poolexists "$TESTPOOL" log_must zpool add -f "$TESTPOOL" ${keywords[i]} $vdev log_must vdevs_in_pool "$TESTPOOL" 
"$vdev" @@ -112,8 +92,7 @@ while (( $i < ${#keywords[*]} )); do mirror) for vdev in "${mirrorarray[@]}"; do create_pool "$TESTPOOL" "${keywords[i]}" \ - "${disk}${SLICE_PREFIX}${SLICE4}" \ - "${disk}${SLICE_PREFIX}${SLICE5}" + "$disk0" "$disk1" log_must poolexists "$TESTPOOL" log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev log_must vdevs_in_pool "$TESTPOOL" "$vdev" @@ -124,14 +103,26 @@ while (( $i < ${#keywords[*]} )); do raidz|raidz1) for vdev in "${raidzarray[@]}"; do create_pool "$TESTPOOL" "${keywords[i]}" \ - "${disk}${SLICE_PREFIX}${SLICE4}" \ - "${disk}${SLICE_PREFIX}${SLICE5}" + "$disk0" "$disk1" log_must poolexists "$TESTPOOL" log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev log_must vdevs_in_pool "$TESTPOOL" "$vdev" destroy_pool "$TESTPOOL" done + ;; + draid:1s|draid1:1s) + for vdev in "${draidarray[@]}"; do + create_pool "$TESTPOOL" "${keywords[i]}" \ + "$disk0" "$disk1" "$disk2" + log_must poolexists "$TESTPOOL" + log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev + log_must vdevs_in_pool "$TESTPOOL" "$vdev" + log_must vdevs_in_pool "$TESTPOOL" "draid1-0-0" + log_must vdevs_in_pool "$TESTPOOL" "draid1-1-0" + destroy_pool "$TESTPOOL" + done + ;; esac diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh index eb492311a3..67810bbf98 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh @@ -48,10 +48,7 @@ verify_runnable "global" function cleanup { - poolexists $TESTPOOL && \ - destroy_pool $TESTPOOL - - partition_cleanup + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_assert "'zpool add -f ...' can successfully add" \ @@ -59,14 +56,13 @@ log_assert "'zpool add -f ...' 
can successfully add" \ log_onexit cleanup -create_pool "$TESTPOOL" mirror "${disk}${SLICE_PREFIX}${SLICE0}" \ - "${disk}${SLICE_PREFIX}${SLICE1}" -log_must poolexists "$TESTPOOL" +create_pool $TESTPOOL mirror $DISK0 $DISK1 +log_must poolexists $TESTPOOL -log_mustnot zpool add "$TESTPOOL" ${disk}${SLICE_PREFIX}${SLICE3} -log_mustnot vdevs_in_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE3}" +log_mustnot zpool add $TESTPOOL $DISK2 +log_mustnot vdevs_in_pool $TESTPOOL $DISK2 -log_must zpool add -f "$TESTPOOL" ${disk}${SLICE_PREFIX}${SLICE3} -log_must vdevs_in_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE3}" +log_must zpool add -f $TESTPOOL $DISK2 +log_must vdevs_in_pool $TESTPOOL $DISK2 log_pass "'zpool add -f ...' executes successfully." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh index cfdc29d95d..a6b03ff325 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh @@ -34,26 +34,23 @@ # # DESCRIPTION: -# 'zpool add -n ...' can display the configuration without -# adding the specified devices to given pool +# 'zpool add -n ...' can display the configuration without adding +# the specified devices to given pool # # STRATEGY: -# 1. Create a storage pool -# 2. Use -n to add a device to the pool -# 3. Verify the device is not added actually +# 1. Create a storage pool +# 2. Use -n to add devices to the pool +# 3. Verify the devices are not added actually +# 4. Add devices to the pool for real this time, verify the vdev tree is the +# same printed by the dryrun iteration # verify_runnable "global" function cleanup { - poolexists $TESTPOOL && \ - destroy_pool $TESTPOOL - - partition_cleanup - - [[ -e $tmpfile ]] && \ - log_must rm -f $tmpfile + destroy_pool $TESTPOOL + rm -f $TMPFILE_PREFIX* $VDEV_PREFIX* } log_assert "'zpool add -n ...' 
can display the configuration" \ @@ -61,18 +58,40 @@ log_assert "'zpool add -n ...' can display the configuration" \ log_onexit cleanup -tmpfile="$TEST_BASE_DIR/zpool_add_003.tmp$$" +typeset TMPFILE_PREFIX="$TEST_BASE_DIR/zpool_add_003" +typeset STR_DRYRUN="would update '$TESTPOOL' to the following configuration:" +typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" +typeset -a VDEV_TYPES=("" "dedup" "special" "log" "cache" "spare") -create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE0}" +vdevs="" +config="" + +# 1. Create a storage pool +log_must truncate -s $SPA_MINDEVSIZE "$VDEV_PREFIX-root" +log_must zpool create "$TESTPOOL" "$VDEV_PREFIX-root" log_must poolexists "$TESTPOOL" +for vdevtype in "${VDEV_TYPES[@]}"; do + log_must truncate -s $SPA_MINDEVSIZE "$VDEV_PREFIX-$vdevtype" + vdevs="$vdevs $VDEV_PREFIX-$vdevtype" + config="$config $vdevtype $VDEV_PREFIX-$vdevtype" +done -zpool add -n "$TESTPOOL" ${disk}${SLICE_PREFIX}${SLICE1} > $tmpfile +# 2. Use -n to add devices to the pool +log_must eval "zpool add -f -n $TESTPOOL $config > $TMPFILE_PREFIX-dryrun" +log_must grep -q "$STR_DRYRUN" "$TMPFILE_PREFIX-dryrun" -log_mustnot vdevs_in_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE1}" +# 3. Verify the devices are not added actually +for vdev in $vdevs; do + log_mustnot vdevs_in_pool "$TESTPOOL" "$vdev" +done -str="would update '$TESTPOOL' to the following configuration:" -cat $tmpfile | grep "$str" >/dev/null 2>&1 -(( $? != 0 )) && \ - log_fail "'zpool add -n ...' is executed as unexpected" +# 4. 
Add devices to the pool for real this time, verify the vdev tree is the +# same printed by the dryrun iteration +log_must zpool add -f $TESTPOOL $config +zpool status $TESTPOOL | awk 'NR == 1, /NAME/ { next } /^$/ {exit} + {print $1}' > "$TMPFILE_PREFIX-vdevtree" +cat "$TMPFILE_PREFIX-dryrun" | awk 'NR == 1, /would/ {next} + /^$/ {next} {print $1}' > "$TMPFILE_PREFIX-vdevtree-n" +log_must eval "diff $TMPFILE_PREFIX-vdevtree-n $TMPFILE_PREFIX-vdevtree" -log_pass "'zpool add -n ...'executes successfully." +log_pass "'zpool add -n ...' executes successfully." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh index 61ce4ec69c..64e52960d3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh @@ -47,31 +47,30 @@ verify_runnable "global" function cleanup { - poolexists $TESTPOOL && \ - destroy_pool "$TESTPOOL" - - datasetexists $TESTPOOL1/$TESTVOL && \ - log_must zfs destroy -f $TESTPOOL1/$TESTVOL - poolexists $TESTPOOL1 && \ - destroy_pool "$TESTPOOL1" - - partition_cleanup - + poolexists $TESTPOOL && destroy_pool $TESTPOOL + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + if [ -n "$recursive" ]; then + set_tunable64 VOL_RECURSIVE $recursive + fi } log_assert "'zpool add ...' can add zfs volume to the pool." 
log_onexit cleanup -create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE0}" -log_must poolexists "$TESTPOOL" +create_pool $TESTPOOL $DISK0 +log_must poolexists $TESTPOOL -create_pool "$TESTPOOL1" "${disk}${SLICE_PREFIX}${SLICE1}" -log_must poolexists "$TESTPOOL1" +create_pool $TESTPOOL1 $DISK1 +log_must poolexists $TESTPOOL1 log_must zfs create -V $VOLSIZE $TESTPOOL1/$TESTVOL block_device_wait -log_must zpool add "$TESTPOOL" $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL +if is_freebsd; then + recursive=$(get_tunable VOL_RECURSIVE) + log_must set_tunable64 VOL_RECURSIVE 1 +fi +log_must zpool add $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL log_must vdevs_in_pool "$TESTPOOL" "$ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh index 1516cb20af..c40f8db6f0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh @@ -50,16 +50,12 @@ verify_runnable "global" function cleanup { - poolexists "$TESTPOOL" && \ - destroy_pool "$TESTPOOL" - poolexists "$TESTPOOL1" && \ - destroy_pool "$TESTPOOL1" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 if [[ -n $saved_dump_dev ]]; then log_must eval "dumpadm -u -d $saved_dump_dev > /dev/null" fi - - partition_cleanup } log_assert "'zpool add' should fail with inapplicable scenarios." 
@@ -69,27 +65,27 @@ log_onexit cleanup mnttab_dev=$(find_mnttab_dev) vfstab_dev=$(find_vfstab_dev) saved_dump_dev=$(save_dump_dev) -dump_dev=${disk}${SLICE_PREFIX}${SLICE3} +dump_dev=$DISK2 -create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE0}" -log_must poolexists "$TESTPOOL" +create_pool $TESTPOOL $DISK0 +log_must poolexists $TESTPOOL -create_pool "$TESTPOOL1" "${disk}${SLICE_PREFIX}${SLICE1}" -log_must poolexists "$TESTPOOL1" +create_pool $TESTPOOL1 $DISK1 +log_must poolexists $TESTPOOL1 unset NOINUSE_CHECK -log_mustnot zpool add -f "$TESTPOOL" ${disk}${SLICE_PREFIX}${SLICE1} -log_mustnot zpool add -f "$TESTPOOL" $mnttab_dev +log_mustnot zpool add -f $TESTPOOL $DISK1 +log_mustnot zpool add -f $TESTPOOL $mnttab_dev if is_linux; then - log_mustnot zpool add "$TESTPOOL" $vfstab_dev + log_mustnot zpool add $TESTPOOL $vfstab_dev else - log_mustnot zpool add -f "$TESTPOOL" $vfstab_dev + log_mustnot zpool add -f $TESTPOOL $vfstab_dev fi -if ! is_linux; then - log_must echo "y" | newfs ${DEV_DSKDIR}/$dump_dev > /dev/null 2>&1 - log_must dumpadm -u -d ${DEV_DSKDIR}/$dump_dev > /dev/null - log_mustnot zpool add -f "$TESTPOOL" $dump_dev +if is_illumos; then + log_must eval "new_fs ${DEV_DSKDIR}/$dump_dev > /dev/null 2>&1" + log_must eval "dumpadm -u -d ${DEV_DSKDIR}/$dump_dev > /dev/null" + log_mustnot zpool add -f $TESTPOOL $dump_dev fi log_pass "'zpool add' should fail with inapplicable scenarios." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh index 6d47365ed9..2c3f488ea2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh @@ -46,14 +46,8 @@ verify_runnable "global" function cleanup { - poolexists $TESTPOOL1 && \ - destroy_pool $TESTPOOL1 - - poolexists $TESTPOOL && \ - destroy_pool $TESTPOOL - - [[ -d $TESTDIR ]] && log_must rm -rf $TESTDIR - partition_cleanup + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + rm -rf $TESTDIR } log_assert "Adding a large number of file based vdevs to a zpool works." @@ -66,12 +60,12 @@ create_pool "$TESTPOOL1" "$TESTDIR/file.00" vdevs_list=$(echo $TESTDIR/file.{01..16}) log_must truncate -s $MINVDEVSIZE $vdevs_list -log_must zpool add -f "$TESTPOOL1" $vdevs_list -log_must vdevs_in_pool "$TESTPOOL1" "$vdevs_list" +log_must zpool add -f $TESTPOOL1 $vdevs_list +log_must vdevs_in_pool $TESTPOOL1 "$vdevs_list" # Attempt to add a file based vdev that's too small. log_must truncate -s 32m $TESTDIR/broken_file -log_mustnot zpool add -f "$TESTPOOL1" ${TESTDIR}/broken_file -log_mustnot vdevs_in_pool "$TESTPOOL1" "${TESTDIR}/broken_file" +log_mustnot zpool add -f $TESTPOOL1 ${TESTDIR}/broken_file +log_mustnot vdevs_in_pool $TESTPOOL1 ${TESTDIR}/broken_file log_pass "Adding a large number of file based vdevs to a zpool works." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_007_neg.ksh index 081815bd02..4e9535c1c6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_007_neg.ksh @@ -46,10 +46,7 @@ verify_runnable "global" function cleanup { - poolexists "$TESTPOOL" && \ - destroy_pool "$TESTPOOL" - - partition_cleanup + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_assert "'zpool add' should return an error with badly-formed parameters." @@ -57,10 +54,10 @@ log_assert "'zpool add' should return an error with badly-formed parameters." log_onexit cleanup set -A args "" "-f" "-n" "-?" "-nf" "-fn" "-f -n" "--f" "-blah" \ - "-? $TESTPOOL ${disk}${SLICE_PREFIX}${SLICE1}" + "-? $TESTPOOL $DISK1" -create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE0}" -log_must poolexists "$TESTPOOL" +create_pool $TESTPOOL $DISK0 +log_must poolexists $TESTPOOL typeset -i i=0 while (( $i < ${#args[*]} )); do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_008_neg.ksh index edcdd32c93..77a899f70c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_008_neg.ksh @@ -46,22 +46,18 @@ verify_runnable "global" function cleanup { - - poolexists "$TESTPOOL" && \ - destroy_pool "$TESTPOOL" - - partition_cleanup + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_assert "'zpool add' should return an error with nonexistent pools and vdevs" log_onexit cleanup -set -A args "" "-f nonexistent_pool ${disk}${SLICE_PREFIX}${SLICE1}" \ +set -A args "" "-f nonexistent_pool $DISK1" \ "-f $TESTPOOL nonexistent_vdev" -create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE0}" -log_must poolexists "$TESTPOOL" +create_pool 
$TESTPOOL $DISK0 +log_must poolexists $TESTPOOL typeset -i i=0 while (( $i < ${#args[*]} )); do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh index 1fc1a046ab..7ffe9512af 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh @@ -47,12 +47,7 @@ verify_runnable "global" function cleanup { - - poolexists "$TESTPOOL" && \ - destroy_pool "$TESTPOOL" - - partition_cleanup - + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_assert "'zpool add' should fail if vdevs are the same or vdev is " \ @@ -60,12 +55,11 @@ log_assert "'zpool add' should fail if vdevs are the same or vdev is " \ log_onexit cleanup -create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE0}" -log_must poolexists "$TESTPOOL" +create_pool $TESTPOOL $DISK0 +log_must poolexists $TESTPOOL -log_mustnot zpool add -f "$TESTPOOL" ${disk}${SLICE_PREFIX}${SLICE1} \ - ${disk}${SLICE_PREFIX}${SLICE1} -log_mustnot zpool add -f "$TESTPOOL" ${disk}${SLICE_PREFIX}${SLICE0} +log_mustnot zpool add -f $TESTPOOL $DISK1 $DISK1 +log_mustnot zpool add -f $TESTPOOL $DISK0 log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \ "contained in the given pool." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh index 8b8eade48d..771b689c93 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh @@ -51,7 +51,7 @@ function cleanup typeset -i i=0 while ((i < 10)); do - log_must rm -f $TEST_BASE_DIR/vdev$i + rm -f $TEST_BASE_DIR/vdev$i ((i += 1)) done } diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh new file mode 100755 index 0000000000..73dec92403 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh @@ -0,0 +1,175 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Attila Fülöp +# + +. $STF_SUITE/include/libtest.shlib + +typeset STR_DRYRUN="would update '$TESTPOOL' to the following configuration:" +typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" + +# +# DESCRIPTION: +# 'zpool add -n ...' 
can display the correct configuration +# +# STRATEGY: +# 1. Create different storage pools, use -n to add devices to the pool and +# verify the output is as expected. +# 2. Create a pool with a hole vdev and verify it's not listed with add -n. +# + +typeset -a dev=( + "${VDEV_PREFIX}00" "${VDEV_PREFIX}01" "${VDEV_PREFIX}02" + "${VDEV_PREFIX}03" "${VDEV_PREFIX}04" "${VDEV_PREFIX}05" + "${VDEV_PREFIX}06" "${VDEV_PREFIX}07" "${VDEV_PREFIX}08" + "${VDEV_PREFIX}09" "${VDEV_PREFIX}10" "${VDEV_PREFIX}11" +) + +typeset -a tests=( + ( + tree="'${dev[0]}' log '${dev[1]}' special '${dev[2]}' dedup '${dev[3]}'" + add="spare '${dev[4]}' cache '${dev[5]}'" + want="$STR_DRYRUN + + $TESTPOOL + ${dev[0]} + dedup + ${dev[3]} + special + ${dev[2]} + logs + ${dev[1]} + cache + ${dev[5]} + spares + ${dev[4]}" + ) + ( + tree="'${dev[0]}' log '${dev[1]}' special '${dev[2]}' dedup '${dev[3]}' \ + spare '${dev[4]}' cache '${dev[5]}'" + + add="'${dev[6]}' log '${dev[7]}' special '${dev[8]}' dedup '${dev[9]}' \ + spare '${dev[10]}' cache '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + ${dev[0]} + ${dev[6]} + dedup + ${dev[3]} + ${dev[9]} + special + ${dev[2]} + ${dev[8]} + logs + ${dev[1]} + ${dev[7]} + cache + ${dev[5]} + ${dev[11]} + spares + ${dev[4]} + ${dev[10]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' \ + log mirror '${dev[2]}' '${dev[3]}' \ + dedup mirror '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}'" + + add="special mirror '${dev[4]}' '${dev[5]}' \ + spare '${dev[9]}' cache '${dev[10]}' '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + mirror-0 + ${dev[0]} + ${dev[1]} + dedup + mirror + ${dev[6]} + ${dev[7]} + special + mirror + ${dev[4]} + ${dev[5]} + logs + mirror + ${dev[2]} + ${dev[3]} + cache + ${dev[10]} + ${dev[11]} + spares + ${dev[8]} + ${dev[9]}" + ) +) + +verify_runnable "global" + +function cleanup +{ + destroy_pool "$TESTPOOL" + rm -f "$VDEV_PREFIX"* +} + +log_assert "'zpool add -n ...' 
can display the configuration" + +log_onexit cleanup + +# Create needed file vdevs. +for (( i=0; i < ${#dev[@]}; i+=1 )); do + log_must truncate -s $SPA_MINDEVSIZE "${dev[$i]}" +done + +# Foreach test create pool, add -n devices and check output. +for (( i=0; i < ${#tests[@]}; i+=1 )); do + typeset tree="${tests[$i].tree}" + typeset add="${tests[$i].add}" + typeset want="${tests[$i].want}" + + log_must eval zpool create "$TESTPOOL" $tree + log_must poolexists "$TESTPOOL" + typeset out="$(log_must eval "zpool add -n '$TESTPOOL' $add" | \ + sed /^SUCCESS/d)" + + if [[ "$out" != "$want" ]]; then + log_fail "Got:\n" "$out" "\nbut expected:\n" "$want" + fi + log_must destroy_pool "$TESTPOOL" +done + +# Make sure hole vdevs are skipped in output. +log_must eval "zpool create '$TESTPOOL' '${dev[0]}' log '${dev[1]}' \ + cache '${dev[2]}'" + +# Create a hole vdev. +log_must eval "zpool remove '$TESTPOOL' '${dev[1]}'" +log_mustnot eval "zpool add -n '$TESTPOOL' '${dev[1]}' | \ + grep -qE '[[:space:]]+hole'" + +log_pass "'zpool add -n ...' displays config correctly." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh index fd33fb9506..618c6992ed 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh @@ -22,6 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . 
$STF_SUITE/include/libtest.shlib @@ -41,19 +42,27 @@ verify_runnable "global" function cleanup { + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 - log_must rm -f $disk1 - log_must rm -f $disk2 + rm -f $disk1 $disk2 } log_assert "zpool attach -o ashift=' works with different ashift values" log_onexit cleanup -disk1=$TEST_BASE_DIR/$FILEDISK0 -disk2=$TEST_BASE_DIR/$FILEDISK1 +disk1=$TEST_BASE_DIR/disk1 +disk2=$TEST_BASE_DIR/disk2 log_must truncate -s $SIZE $disk1 log_must truncate -s $SIZE $disk2 +orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +# +# Set the file vdev's ashift to the max. Overriding +# the ashift using the -o ashift property should still +# be honored. +# +log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16 + typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do @@ -92,7 +101,7 @@ typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") for badval in ${badvals[@]} do log_must zpool create $TESTPOOL1 $disk1 - log_mustnot zpool attach $TESTPOOL1 -o ashift=$badval $disk1 $disk2 + log_mustnot zpool attach -o ashift=$badval $TESTPOOL1 $disk1 $disk2 log_must zpool destroy $TESTPOOL1 log_must zpool labelclear $disk1 log_mustnot zpool labelclear $disk2 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh index 79ceaabd0d..98b4140727 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh @@ -176,11 +176,7 @@ function do_testing # dd if=/dev/zero of=$fbase.$i seek=512 bs=1024 count=$wcount conv=notrunc \ > /dev/null 2>&1 log_must sync - log_must zpool scrub $TESTPOOL1 - # Wait for the completion of scrub operation - while is_pool_scrubbing $TESTPOOL1; do - sleep 1 - done + log_must zpool scrub -w $TESTPOOL1 check_err 
$TESTPOOL1 && \ log_fail "No error generated." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am index 3c595935a1..5ffaae5b15 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am @@ -27,14 +27,24 @@ dist_pkgdata_SCRIPTS = \ zpool_create_024_pos.ksh \ zpool_create_encrypted.ksh \ zpool_create_crypt_combos.ksh \ + zpool_create_draid_001_pos.ksh \ + zpool_create_draid_002_pos.ksh \ + zpool_create_draid_003_pos.ksh \ + zpool_create_draid_004_pos.ksh \ zpool_create_features_001_pos.ksh \ zpool_create_features_002_pos.ksh \ zpool_create_features_003_pos.ksh \ zpool_create_features_004_neg.ksh \ zpool_create_features_005_pos.ksh \ + zpool_create_features_006_pos.ksh \ + zpool_create_features_007_pos.ksh \ + zpool_create_features_008_pos.ksh \ + zpool_create_features_009_pos.ksh \ create-o_ashift.ksh \ - zpool_create_tempname.ksh + zpool_create_tempname.ksh \ + zpool_create_dryrun_output.ksh dist_pkgdata_DATA = \ + draidcfg.gz \ zpool_create.cfg \ zpool_create.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh index d3134a795b..a504877540 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh @@ -32,8 +32,6 @@ . $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib -clean_blockfile "$TESTDIR $TESTDIR0 $TESTDIR1" - cleanup_devices $DISKS log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/create-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/create-o_ashift.ksh index 6a9c3e28c3..2c1f6e0ca6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/create-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/create-o_ashift.ksh @@ -44,8 +44,8 @@ verify_runnable "global" function cleanup { - destroy_pool $TESTPOOL - log_must rm -f $disk + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -f $disk } # @@ -73,15 +73,21 @@ function verify_device_uberblocks # typeset device=$1 typeset ubcount=$2 - zdb -quuul $device | egrep '^(\s+)?Uberblock' | - awk -v ubcount=$ubcount 'BEGIN { count=0 } { uberblocks[$0]++; } + zdb -quuul $device | awk -v ubcount=$ubcount ' + /Uberblock/ && ! /invalid/ { uberblocks[$0]++ } END { + count = 0 for (i in uberblocks) { - if (i ~ /invalid/) { continue; } - if (uberblocks[i] != 4) { exit 1; } + if (uberblocks[i] != 4) { + printf "%s count: %s != 4\n", i, uberblocks[i] + exit 1 + } count++; } - if (count != ubcount) { exit 1; } + if (count != ubcount) { + printf "Total uberblock count: %s != %s\n", count, ubcount + exit 1 + } }' return $? 
@@ -90,8 +96,7 @@ function verify_device_uberblocks # log_assert "zpool create -o ashift=' works with different ashift values" log_onexit cleanup -disk=$TEST_BASE_DIR/$FILEDISK0 -log_must mkfile $SIZE $disk +disk=$(create_blockfile $SIZE) typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") # since Illumos 4958 the largest uberblock is 8K so we have at least of 16/label @@ -117,7 +122,7 @@ do # clean things for the next run log_must zpool destroy $TESTPOOL log_must zpool labelclear $disk - log_must eval "verify_device_uberblocks $disk 0" + log_must verify_device_uberblocks $disk 0 ((i = i + 1)) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/draidcfg.gz b/tests/zfs-tests/tests/functional/cli_root/zpool_create/draidcfg.gz new file mode 100644 index 0000000000..b8c0a583c0 Binary files /dev/null and b/tests/zfs-tests/tests/functional/cli_root/zpool_create/draidcfg.gz differ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh index efdafe51a0..115126b1ac 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh @@ -34,24 +34,4 @@ verify_runnable "global" -if ! $(is_physical_device $DISKS) ; then - log_unsupported "This directory cannot be run on raw files." -fi - -if [[ -n $DISK ]]; then - # - # Use 'zpool create' to clean up the information in - # in the given disk to avoid slice overlapping. 
- # - cleanup_devices $DISK - - partition_disk $((($MINVDEVSIZE / (1024 * 1024)) * 2))m $DISK 7 -else - for disk in `echo $DISKSARRAY`; do - cleanup_devices $disk - - partition_disk $((($MINVDEVSIZE / (1024 * 1024)) * 2))m $disk 7 - done -fi - log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg index d58cece040..976570d621 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg @@ -30,71 +30,23 @@ . $STF_SUITE/include/libtest.shlib -export DISK_ARRAY_NUM=0 -export DISK_ARRAY_LIMIT=4 -export DISKSARRAY="" - -function set_disks -{ - typeset -a disk_array=($(find_disks $DISKS)) - - if (( ${#disk_array[*]} <= 1 )); then - export DISK=${DISKS%% *} - export DISK_ARRAY_NUM=1 - else - export DISK="" - typeset -i i=0 - while (( i < ${#disk_array[*]} )); do - export DISK${i}="${disk_array[$i]}" - DISKSARRAY="$DISKSARRAY ${disk_array[$i]}" - (( i = i + 1 )) - (( i>$DISK_ARRAY_LIMIT )) && break - done - export DISK_ARRAY_NUM=$i - export DISKSARRAY - fi -} - -set_disks +typeset -a disk_array=($(find_disks $DISKS)) +typeset DISKSARRAY="" +typeset -i DISK_ARRAY_LIMIT=4 +typeset -i i=0 +while (( i < ${#disk_array[*]} && i <= $DISK_ARRAY_LIMIT )); do + export DISK${i}="${disk_array[$i]}" + DISKSARRAY="$DISKSARRAY ${disk_array[$i]}" + (( i = i + 1 )) +done +export DISK_ARRAY_NUM=$i +export DISKSARRAY export FILESIZE="$MINVDEVSIZE" export FILESIZE1="$(($MINVDEVSIZE * 2))" export SIZE="$((MINVDEVSIZE / (1024 * 1024)))"m export SIZE1="$(($MINVDEVSIZE * 2 / (1024 * 1024)))m" -if is_linux; then - set_device_dir - set_slice_prefix - export SLICE0=1 - export SLICE1=2 - export SLICE2=3 - export SLICE3=4 - export SLICE4=5 - export SLICE5=6 - export SLICE6=7 - export SLICE7=8 - disk1=${DISKS%% *} - if is_mpath_device $disk1; then - delete_partitions - fi -else - export 
SLICE_PREFIX="s" - export SLICE0=0 - export SLICE1=1 - export SLICE2=2 - export SLICE3=3 - export SLICE4=4 - export SLICE5=5 - export SLICE6=6 - export SLICE7=7 -fi - -export FILEDISK=filedisk_create -export FILEDISK0=filedisk0_create -export FILEDISK1=filedisk1_create -export FILEDISK2=filedisk2_create -export FILEDISK3=filedisk3_create - export BYND_MAX_NAME="byondmaxnamelength\ 012345678901234567890123456789\ 012345678901234567890123456789\ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib index 9e68748320..c98e495187 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib @@ -45,7 +45,7 @@ function create_pool_test typeset vdevs eval "typeset -a diskarray=($3)" - for vdevs in "${diskarray[@]}";do + for vdevs in "${diskarray[@]}"; do create_pool $pool $keywd $vdevs log_must poolexists $pool destroy_pool $pool @@ -53,67 +53,15 @@ function create_pool_test } # -# Create a ufs|ext file system and make a file within the file -# system for storage pool vdev +# Create a file for storage pool vdev # $1, file size -# $2, file name -# $3, disk name to create ufs|ext file system # function create_blockfile { typeset size=$1 - typeset file=$2 - typeset disk=$3 - typeset dir=`dirname $file` - - if [[ -d $dir ]]; then - ismounted $dir $NEWFS_DEFAULT_FS - (( $? == 0 )) && \ - log_must umount -f $dir - else - log_must mkdir -p $dir - fi - - echo "y" | newfs ${DEV_RDSKDIR}/$disk >/dev/null 2>&1 - (( $? != 0 )) && - log_fail "Create file system fail." 
- - log_must mount ${DEV_DSKDIR}/$disk $dir - log_must truncate -s $size $file -} - -# -# Umount the ufs|ext filesystem and remove the mountpoint -# $1, the mount point -# -function clean_blockfile -{ - typeset dirs=$1 - - for dir in $dirs; do - if [[ -d $dir ]]; then - if is_linux; then - if ismounted $dir ext2; then - typeset dev=$(df -lht ext2 | \ - grep "$dir" | \ - awk '{print $1}') - log_must umount -f $dir - create_pool ${TESTPOOL}.tmp $dev - destroy_pool ${TESTPOOL}.tmp - fi - else - if ismounted $dir ufs; then - typeset dev=$(df -lhF ufs | \ - grep "$dir" | \ - awk '{print $1}') - log_must umount -f $dir - create_pool ${TESTPOOL}.tmp $dev - destroy_pool ${TESTPOOL}.tmp - fi - fi - log_must rm -rf $dir - fi - done + typeset file=$(mktemp) + truncate -s $size $file + echo $file } # @@ -125,12 +73,12 @@ function find_vfstab_dev typeset vfstabdevs="" typeset line - if is_linux; then - vfstab="/etc/fstab" - tmpfile="$TEST_BASE_DIR/fstab.tmp" - else + if is_illumos; then vfstab="/etc/vfstab" tmpfile="$TEST_BASE_DIR/vfstab.tmp" + else + vfstab="/etc/fstab" + tmpfile="$TEST_BASE_DIR/fstab.tmp" fi cat $vfstab | grep "^${DEV_DSKDIR}" >$tmpfile @@ -146,18 +94,97 @@ function find_vfstab_dev } # -# Save the systme current dump device configuration +# Save the system current dump device configuration # function save_dump_dev { - typeset dumpdev + typeset dumpdev="" - if is_linux; then - dumpdev="" - else + if is_illumos; then typeset fnd="Dump device" dumpdev=`dumpadm | grep "$fnd" | cut -f2 -d : | \ awk '{print $1}'` fi echo $dumpdev } + +# +# Verify a pools enabled features match the provided feature set. +# $1, pool name +# $2, feature set(s) +# +# check_feature_set $TESTPOOL set1 set2 set3 ... 
+# +function check_feature_set +{ + typeset pool=$1 + typeset feature_set=$2 + shift + + for set in "$@"; do + if test -e "$ZPOOL_COMPAT_DIR/$set"; then + file="$ZPOOL_COMPAT_DIR/$set" + else + log_fail "Missing feature file: $ZPOOL_COMPAT_DIR/$set" + fi + done + + # + # Create a temporary file which contains all features which are + # common to the listed feature sets. This is used for comparison + # below to determine which features should be enabled. + # + typeset tmpfile=$(mktemp) + + while read line; do + typeset flag=1 + + if [[ "$line" == "#*" ]]; then + continue + fi + + for set in "$@"; do + if ! grep -q "$line" $ZPOOL_COMPAT_DIR/$set; then + flag=0 + break; + fi + done + + if [[ $flag -eq 1 ]]; then + echo "$line" >>$tmpfile + fi + done <"$file" + + # + # Verify every enabled feature appears in the merged feature set. + # Verify every disabled feature does not. + # + for feature in $(zpool get all $pool | \ + awk '$2 ~ /feature@/ { print $2 }'); do + state=$(get_pool_prop $feature $pool) + name=$(cut -d'@' -f2 <<<"$feature") + + if [[ "$state" = "enabled" || "$state" = "active" ]]; then + if ! 
grep -q $name $tmpfile; then + cat $tmpfile + rm -f $tmpfile + log_fail "Enabled feature $name not " \ + "in feature set file" + fi + elif [[ "$state" = "disabled" ]]; then + if grep -q $name $tmpfile; then + cat $tmpfile + rm -f $tmpfile + log_fail "Disabled feature $name is " \ + "in feature set file" + fi + else + rm -f $tmpfile + log_fail "Feature $name in unknown state $state" + fi + done + + log_note "Checked all features" + + rm -f $tmpfile +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh index 2a975edc51..42f57beae2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh @@ -49,17 +49,7 @@ function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL - clean_blockfile "$TESTDIR0 $TESTDIR1" - - if [[ -n $DISK ]]; then - partition_disk $((($MINVDEVSIZE / (1024 * 1024)) * 2))m $DISK 7 - else - typeset disk="" - for disk in $DISK0 $DISK1; do - partition_disk \ - $((($MINVDEVSIZE / (1024 * 1024)) * 2))m $disk 7 - done - fi + rm -f $disk1 $disk2 } log_assert "'zpool create ...' can successfully create" \ @@ -67,80 +57,23 @@ log_assert "'zpool create ...' 
can successfully create" \ log_onexit cleanup -set -A keywords "" "mirror" "raidz" "raidz1" +typeset disk1=$(create_blockfile $FILESIZE) +typeset disk2=$(create_blockfile $FILESIZE) -case $DISK_ARRAY_NUM in -0|1) - typeset disk="" - if (( $DISK_ARRAY_NUM == 0 )); then - disk=$DISK - else - disk=$DISK0 - fi - create_blockfile $FILESIZE $TESTDIR0/$FILEDISK0 \ - ${disk}${SLICE_PREFIX}${SLICE5} - create_blockfile $FILESIZE $TESTDIR1/$FILEDISK1 \ - ${disk}${SLICE_PREFIX}${SLICE6} +pooldevs="${DISK0} \ + \"${DISK0} ${DISK1}\" \ + \"${DISK0} ${DISK1} ${DISK2}\" \ + \"$disk1 $disk2\"" +mirrordevs="\"${DISK0} ${DISK1}\" \ + $raidzdevs \ + \"$disk1 $disk2\"" +raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\"" +draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" - pooldevs="${disk}${SLICE_PREFIX}${SLICE0} \ - ${DEV_DSKDIR}/${disk}${SLICE_PREFIX}${SLICE0} \ - \"${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1}\" \ - $TESTDIR0/$FILEDISK0" - raidzdevs="\"${DEV_DSKDIR}/${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1}\" \ - \"${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1} \ - ${disk}${SLICE_PREFIX}${SLICE3}\" \ - \"${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1} \ - ${disk}${SLICE_PREFIX}${SLICE3} \ - ${disk}${SLICE_PREFIX}${SLICE4}\"\ - \"$TESTDIR0/$FILEDISK0 $TESTDIR1/$FILEDISK1\"" - mirrordevs=$raidzdevs - ;; -2|*) - create_blockfile $FILESIZE $TESTDIR0/$FILEDISK0 \ - ${DISK0}${SLICE_PREFIX}${SLICE5} - create_blockfile $FILESIZE $TESTDIR1/$FILEDISK1 \ - ${DISK1}${SLICE_PREFIX}${SLICE5} - - pooldevs="${DISK0}${SLICE_PREFIX}${SLICE0} \ - \"${DEV_DSKDIR}/${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0}\" \ - \"${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK0}${SLICE_PREFIX}${SLICE1} \ - ${DISK1}${SLICE_PREFIX}${SLICE1}\"\ - \"${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0} \ - ${DISK0}${SLICE_PREFIX}${SLICE1}\ - ${DISK1}${SLICE_PREFIX}${SLICE1}\" \ - \"$TESTDIR0/$FILEDISK0 
$TESTDIR1/$FILEDISK1\"" - raidzdevs="\"${DEV_DSKDIR}/${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0}\" \ - \"${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK0}${SLICE_PREFIX}${SLICE1} \ - ${DISK1}${SLICE_PREFIX}${SLICE1}\" \ - \"${DISK0}${SLICE_PREFIX}${SLICE0} \ - ${DISK1}${SLICE_PREFIX}${SLICE0} \ - ${DISK0}${SLICE_PREFIX}${SLICE1} \ - ${DISK1}${SLICE_PREFIX}${SLICE1}\" \ - \"$TESTDIR0/$FILEDISK0 $TESTDIR1/$FILEDISK1\"" - mirrordevs=$raidzdevs - ;; -esac - -typeset -i i=0 -while (( $i < ${#keywords[*]} )); do - case ${keywords[i]} in - "") - create_pool_test "$TESTPOOL" "${keywords[i]}" "$pooldevs";; - mirror) - create_pool_test "$TESTPOOL" "${keywords[i]}" "$mirrordevs";; - raidz|raidz1) - create_pool_test "$TESTPOOL" "${keywords[i]}" "$raidzdevs" ;; - esac - (( i = i+1 )) -done +create_pool_test "$TESTPOOL" "" "$pooldevs" +create_pool_test "$TESTPOOL" "mirror" "$mirrordevs" +create_pool_test "$TESTPOOL" "raidz" "$raidzdevs" +create_pool_test "$TESTPOOL" "raidz1" "$raidzdevs" +create_pool_test "$TESTPOOL" "draid" "$draiddevs" log_pass "'zpool create ...' success." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_002_pos.ksh index b98e5ac92f..2f709086f7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_002_pos.ksh @@ -47,22 +47,15 @@ verify_runnable "global" function cleanup { - for pool in $TESTPOOL $TESTPOOL1 $TESTPOOL2 $TESTPOOL3 $TESTPOOL4 \ - $TESTPOOL5 $TESTPOOL6 - do - destroy_pool $pool + for pool in $TESTPOOL $TESTPOOL1; do + poolexists $pool && destroy_pool $pool done - clean_blockfile "$TESTDIR0 $TESTDIR1" - - for file in $FILEDISK0 $FILEDISK1 $FILEDISK2 - do - if [[ -e $TEST_BASE_DIR/$file ]]; then - rm -f $TEST_BASE_DIR/$file - fi - done - - partition_disk $SIZE $disk 6 + rm -f $disk1 $disk2 + if is_freebsd; then + umount -f $TESTDIR + rm -rf $TESTDIR + fi } log_onexit cleanup @@ -70,57 +63,66 @@ log_onexit cleanup log_assert "'zpool create -f ...' can successfully create" \ "a new pool in some cases." -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi -create_pool "$TESTPOOL" "${disk}${SLICE_PREFIX}${SLICE0}" -log_must echo "y" | newfs \ - ${DEV_RDSKDIR}/${disk}${SLICE_PREFIX}${SLICE1} >/dev/null 2>&1 -create_blockfile $FILESIZE $TESTDIR0/$FILEDISK0 ${disk}${SLICE_PREFIX}${SLICE4} -create_blockfile $FILESIZE1 $TESTDIR1/$FILEDISK1 ${disk}${SLICE_PREFIX}${SLICE5} -log_must truncate -s $SIZE $TEST_BASE_DIR/$FILEDISK0 -log_must truncate -s $SIZE $TEST_BASE_DIR/$FILEDISK1 -log_must truncate -s $SIZE $TEST_BASE_DIR/$FILEDISK2 +create_pool $TESTPOOL $DISK0 +log_must eval "new_fs ${DEV_RDSKDIR}/${DISK1} >/dev/null 2>&1" +typeset disk1=$(create_blockfile $FILESIZE) +typeset disk2=$(create_blockfile $FILESIZE1) unset NOINUSE_CHECK log_must zpool export $TESTPOOL log_note "'zpool create' without '-f' will fail " \ - "while device is belong to an exported pool." 
-log_mustnot zpool create "$TESTPOOL1" "${disk}${SLICE_PREFIX}${SLICE0}" -create_pool "$TESTPOOL1" "${disk}${SLICE_PREFIX}${SLICE0}" + "while device belongs to an exported pool." +log_mustnot zpool create $TESTPOOL1 $DISK0 +create_pool $TESTPOOL1 $DISK0 log_must poolexists $TESTPOOL1 +log_must destroy_pool $TESTPOOL1 + log_note "'zpool create' without '-f' will fail " \ - "while device is using by an ufs filesystem." -log_mustnot zpool create "$TESTPOOL2" "${disk}${SLICE_PREFIX}${SLICE1}" -create_pool "$TESTPOOL2" "${disk}${SLICE_PREFIX}${SLICE1}" -log_must poolexists $TESTPOOL2 + "while device is in use by a ufs filesystem." +if is_freebsd; then + # fs must be mounted for create to fail on FreeBSD + log_must mkdir -p $TESTDIR + log_must mount ${DEV_DSKDIR}/${DISK1} $TESTDIR +fi +log_mustnot zpool create $TESTPOOL $DISK1 +if is_freebsd; then + # fs must not be mounted to create pool even with -f + log_must umount -f $TESTDIR + log_must rm -rf $TESTDIR +fi +create_pool $TESTPOOL $DISK1 +log_must poolexists $TESTPOOL + +log_must destroy_pool $TESTPOOL log_note "'zpool create' mirror without '-f' will fail " \ "while devices have different size." -log_mustnot zpool create "$TESTPOOL3" "mirror" $TESTDIR0/$FILEDISK0 \ - $TESTDIR1/$FILEDISK1 -create_pool "$TESTPOOL3" "mirror" $TESTDIR0/$FILEDISK0 $TESTDIR1/$FILEDISK1 -log_must poolexists $TESTPOOL3 +log_mustnot zpool create $TESTPOOL mirror $disk1 $disk2 +create_pool $TESTPOOL mirror $disk1 $disk2 +log_must poolexists $TESTPOOL -log_note "'zpool create' mirror without '-f' will fail " \ - "while devices are of different types." -log_mustnot zpool create "$TESTPOOL4" "mirror" $TEST_BASE_DIR/$FILEDISK0 \ - ${disk}${SLICE_PREFIX}${SLICE3} -create_pool "$TESTPOOL4" "mirror" \ - $TEST_BASE_DIR/$FILEDISK0 ${disk}${SLICE_PREFIX}${SLICE3} -log_must poolexists $TESTPOOL4 +log_must destroy_pool $TESTPOOL + +if ! is_freebsd; then + log_note "'zpool create' mirror without '-f' will fail " \ + "while devices are of different types." 
+ log_mustnot zpool create $TESTPOOL mirror $disk1 $DISK0 + create_pool $TESTPOOL mirror $disk1 $DISK0 + log_must poolexists $TESTPOOL + + log_must destroy_pool $TESTPOOL +fi log_note "'zpool create' without '-f' will fail " \ - "while device is part of potentially active pool." -create_pool "$TESTPOOL5" "mirror" $TEST_BASE_DIR/$FILEDISK1 \ - $TEST_BASE_DIR/$FILEDISK2 -log_must zpool offline $TESTPOOL5 $TEST_BASE_DIR/$FILEDISK2 -log_must zpool export $TESTPOOL5 -log_mustnot zpool create "$TESTPOOL6" $TEST_BASE_DIR/$FILEDISK2 -create_pool $TESTPOOL6 $TEST_BASE_DIR/$FILEDISK2 -log_must poolexists $TESTPOOL6 + "while a device is part of a potentially active pool." +create_pool $TESTPOOL mirror $DISK0 $DISK1 +log_must zpool offline $TESTPOOL $DISK0 +log_must zpool export $TESTPOOL +log_mustnot zpool create $TESTPOOL1 $DISK0 +create_pool $TESTPOOL1 $DISK0 +log_must poolexists $TESTPOOL1 + +log_must destroy_pool $TESTPOOL1 log_pass "'zpool create -f ...' success." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_003_pos.ksh index 100a24c500..dd8d0107ae 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_003_pos.ksh @@ -47,7 +47,8 @@ verify_runnable "global" function cleanup { - [[ -e $tmpfile ]] && log_must rm -f $tmpfile + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -f $tmpfile } tmpfile="$TEST_BASE_DIR/zpool_create_003.tmp$$" @@ -57,18 +58,6 @@ log_assert "'zpool create -n ...' 
can display the configuration" log_onexit cleanup -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - -DISK=${DISKS%% *} -if is_mpath_device $DISK; then - partition_disk $SIZE $disk 1 -fi - -typeset vspec="${disk}${SLICE_PREFIX}${SLICE0}" typeset goodprops=('' '-o comment=text' '-O checksum=on' '-O ns:prop=value') typeset badprops=('-o ashift=9999' '-O doesnotexist=on' '-O volsize=10M') @@ -78,10 +67,10 @@ do # # Make sure disk is clean before we use it # - create_pool $TESTPOOL $vspec > $tmpfile + create_pool $TESTPOOL $DISK0 > $tmpfile destroy_pool $TESTPOOL - log_must eval "zpool create -n $prop $TESTPOOL $vspec > $tmpfile" + log_must eval "zpool create -n $prop $TESTPOOL $DISK0 > $tmpfile" poolexists $TESTPOOL && \ log_fail "'zpool create -n ...' fail." @@ -97,10 +86,10 @@ do # # Make sure disk is clean before we use it # - create_pool $TESTPOOL $vspec > $tmpfile + create_pool $TESTPOOL $DISK0 > $tmpfile destroy_pool $TESTPOOL - log_mustnot zpool create -n $prop $TESTPOOL $vspec + log_mustnot zpool create -n $prop $TESTPOOL $DISK0 done log_pass "'zpool create -n ...' success." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh index 2697562742..835cd1f547 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh @@ -45,27 +45,23 @@ verify_runnable "global" function cleanup { - typeset pool="" - poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 poolexists $TESTPOOL && destroy_pool $TESTPOOL - [[ -d $TESTDIR ]] && log_must rm -rf $TESTDIR - partition_disk $SIZE $disk 6 + rm -rf $TESTDIR } log_assert "Storage pools with 16 file based vdevs can be created." 
log_onexit cleanup -disk=${DISKS%% *} -create_pool $TESTPOOL $disk +create_pool $TESTPOOL $DISK0 log_must zfs create -o mountpoint=$TESTDIR $TESTPOOL/$TESTFS vdevs_list=$(echo $TESTDIR/file.{01..16}) log_must truncate -s $MINVDEVSIZE $vdevs_list -create_pool "$TESTPOOL1" $vdevs_list -log_must vdevs_in_pool "$TESTPOOL1" "$vdevs_list" +create_pool $TESTPOOL1 $vdevs_list +log_must vdevs_in_pool $TESTPOOL1 "$vdevs_list" if poolexists $TESTPOOL1; then destroy_pool $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh index 2afbec37dc..e1d8cc4745 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh @@ -46,41 +46,35 @@ verify_runnable "global" function cleanup { - poolexists $TESTPOOL && \ - log_must zpool destroy -f $TESTPOOL - - for dir in $TESTDIR $TESTDIR1; do - [[ -d $dir ]] && rm -rf $dir - done + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $TESTDIR $TESTDIR1 } log_assert "'zpool create [-R root][-m mountpoint] ...' can create" \ "an alternate pool or a new pool mounted at the specified mountpoint." 
log_onexit cleanup -set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" +set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "draid" "draid2" # # cleanup the pools created in previous case if zpool_create_004_pos timedout # for pool in $TESTPOOL2 $TESTPOOL1 $TESTPOOL; do - if poolexists $pool; then - destroy_pool $pool - fi + poolexists $pool && destroy_pool $pool done #prepare raw file for file disk -[[ -d $TESTDIR ]] && rm -rf $TESTDIR +rm -rf $TESTDIR log_must mkdir -p $TESTDIR typeset -i i=1 -while (( i < 4 )); do - log_must mkfile $FILESIZE $TESTDIR/file.$i +while (( i < 5 )); do + log_must truncate -s $FILESIZE $TESTDIR/file.$i (( i = i + 1 )) done #Remove the directory with name as pool name if it exists -[[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL +rm -rf /$TESTPOOL file=$TESTDIR/file for opt in "-R $TESTDIR1" "-m $TESTDIR1" \ @@ -93,9 +87,9 @@ do log_must zpool destroy -f $TESTPOOL [[ -d $TESTDIR1 ]] && rm -rf $TESTDIR1 log_must zpool create $opt $TESTPOOL ${pooltype[i]} \ - $file.1 $file.2 $file.3 + $file.1 $file.2 $file.3 $file.4 ! poolexists $TESTPOOL && \ - log_fail "Createing pool with $opt fails." + log_fail "Creating pool with $opt fails." mpt=`zfs mount | egrep "^$TESTPOOL[^/]" | awk '{print $2}'` (( ${#mpt} == 0 )) && \ log_fail "$TESTPOOL created with $opt is not mounted." @@ -105,12 +99,12 @@ do from the output of zfs mount" if [[ "$opt" == "-m $TESTDIR1" ]]; then [[ ! -d $TESTDIR1 ]] && \ - log_fail "$TESTDIR1 is not created auotmatically." + log_fail "$TESTDIR1 is not created automatically." [[ "$mpt" != "$TESTDIR1" ]] && \ log_fail "$TESTPOOL is not mounted on $TESTDIR1." elif [[ "$opt" == "-R $TESTDIR1" ]]; then [[ ! -d $TESTDIR1/$TESTPOOL ]] && \ - log_fail "$TESTDIR1/$TESTPOOL is not created auotmatically." + log_fail "$TESTDIR1/$TESTPOOL is not created automatically." [[ "$mpt" != "$TESTDIR1/$TESTPOOL" ]] && \ log_fail "$TESTPOOL is not mounted on $TESTDIR1/$TESTPOOL." 
else diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh index 79a0060c9b..79b41fdaec 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh @@ -46,8 +46,8 @@ verify_runnable "global" function cleanup { - datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 - datasetexists $TESTPOOL && destroy_pool $TESTPOOL + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + poolexists $TESTPOOL && destroy_pool $TESTPOOL } @@ -97,6 +97,20 @@ set -A valid_args \ "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 \ mirror $vdev4 $vdev5 $vdev6 $vdev7" \ + "draid $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4" \ + "draid $vdev0 $vdev1 $vdev2 raidz1 $vdev3 $vdev4 $vdev5" \ + "draid $vdev0 $vdev1 $vdev2 draid1 $vdev3 $vdev4 $vdev5" \ + "draid $vdev0 $vdev1 $vdev2 special mirror $vdev3 $vdev4" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 mirror $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 draid2 $vdev4 $vdev5 $vdev6 $vdev7"\ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 \ + special mirror $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 \ + special mirror $vdev4 $vdev5 $vdev6 \ + cache $vdev7 log mirror $vdev8 $vdev9" \ + "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 $vdev7 \ + special mirror $vdev8 $vdev9" \ "spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 raidz $vdev5 $vdev6" set -A forced_args \ @@ -109,11 +123,19 @@ set -A forced_args \ "raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4" \ "raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4 spare $vdev5" \ "raidz $vdev0 $vdev1 spare $vdev2 raidz2 $vdev3 $vdev4 $vdev5" \ + "raidz $vdev0 $vdev1 draid2 $vdev2 $vdev3 $vdev4 $vdev5" \ + "raidz $vdev0 
$vdev1 draid3 $vdev2 $vdev3 $vdev4 $vdev5 $vdev6" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 \ raidz2 $vdev4 $vdev5 $vdev6 spare $vdev7" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 \ spare $vdev4 raidz2 $vdev5 $vdev6 $vdev7" \ + "mirror $vdev0 $vdev1 draid $vdev2 $vdev3 $vdev4 \ + draid2 $vdev5 $vdev6 $vdev7 $vdev8 spare $vdev9" \ + "draid $vdev0 $vdev1 $vdev2 $vdev3 \ + draid2 $vdev4 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 \ + special mirror $vdev7 $vdev8 $vdev9" \ "spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 \ raidz2 $vdev5 $vdev6 $vdev7" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh index a7ae5c090a..2873202cce 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh @@ -44,38 +44,29 @@ verify_runnable "global" -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - set -A args "" "-?" 
"-n" "-f" "-nf" "-fn" "-f -n" "--f" "-e" "-s" \ "-m" "-R" "-m -R" "-Rm" "-mR" "-m $TESTDIR $TESTPOOL" \ - "-R $TESTDIR $TESTPOOL" "-m nodir $TESTPOOL $disk" \ - "-R nodir $TESTPOOL $disk" "-m nodir -R nodir $TESTPOOL $disk" \ - "-R nodir -m nodir $TESTPOOL $disk" "-R $TESTDIR -m nodir $TESTPOOL $disk" \ - "-R nodir -m $TESTDIR $TESTPOOL $disk" \ + "-R $TESTDIR $TESTPOOL" "-m nodir $TESTPOOL $DISK0" \ + "-R nodir $TESTPOOL $DISK0" "-m nodir -R nodir $TESTPOOL $DISK0" \ + "-R nodir -m nodir $TESTPOOL $DISK0" "-R $TESTDIR -m nodir $TESTPOOL $DISK0" \ + "-R nodir -m $TESTDIR $TESTPOOL $DISK0" \ "-blah" "$TESTPOOL" "$TESTPOOL blah" "$TESTPOOL c?t0d0" \ "$TESTPOOL c0txd0" "$TESTPOOL c0t0dx" "$TESTPOOL cxtxdx" \ "$TESTPOOL mirror" "$TESTPOOL raidz" "$TESTPOOL mirror raidz" \ "$TESTPOOL raidz1" "$TESTPOOL mirror raidz1" \ - "$TESTPOOL mirror c?t?d?" "$TESTPOOL mirror $disk c0t1d?" \ - "$TESTPOOL RAIDZ ${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1}" \ - "$TESTPOOL ${disk}${SLICE_PREFIX}${SLICE0} \ - log ${disk}${SLICE_PREFIX}${SLICE1} \ - log ${disk}${SLICE_PREFIX}${SLICE3}" \ - "$TESTPOOL ${disk}${SLICE_PREFIX}${SLICE0} \ - spare ${disk}${SLICE_PREFIX}${SLICE1} \ - spare ${disk}${SLICE_PREFIX}${SLICE3}" \ - "$TESTPOOL RAIDZ1 ${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1}" \ - "$TESTPOOL MIRROR $disk" "$TESTPOOL raidz $disk" \ - "$TESTPOOL raidz1 $disk" \ - "1tank $disk" "1234 $disk" "?tank $disk" \ - "tan%k $disk" "ta@# $disk" "tan+k $disk" \ - "$BYND_MAX_NAME $disk" + "$TESTPOOL draid1" "$TESTPOOL mirror draid1" \ + "$TESTPOOL mirror c?t?d?" "$TESTPOOL mirror $DISK0 c0t1d?" 
\ + "$TESTPOOL RAIDZ $DISK0 $DISK1" \ + "$TESTPOOL $DISK0 log $DISK1 log $DISK2" \ + "$TESTPOOL $DISK0 spare $DISK1 spare $DISK2" \ + "$TESTPOOL RAIDZ1 $DISK0 $DISK1" "$TESTPOOL MIRROR $DISK0" \ + "$TESTPOOL DRAID $DISK1 $DISK2 $DISK3" "$TESTPOOL raidz $DISK0" \ + "$TESTPOOL raidz1 $DISK0" "$TESTPOOL draid $DISK0" \ + "$TESTPOOL draid2 $DISK0 $DISK1" \ + "$TESTPOOL draid $DISK0 $DISK1 $DISK2 spare s0-draid1-0" \ + "1tank $DISK0" "1234 $DISK0" "?tank $DISK0" \ + "tan%k $DISK0" "ta@# $DISK0" "tan+k $DISK0" \ + "$BYND_MAX_NAME $DISK0" log_assert "'zpool create' should return an error with badly-formed parameters." log_onexit default_cleanup_noexit diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_008_pos.ksh index 5c5c1d94dc..56bb64c640 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_008_pos.ksh @@ -44,24 +44,11 @@ verify_runnable "global" -if is_linux; then - # Versions of libblkid older than 2.27.0 will not always detect member - # devices of a pool, therefore skip this test case for old versions. 
- currentver="$(blkid -v | tr ',' ' ' | awk '/libblkid/ { print $6 }')" - requiredver="2.27.0" - - if [ "$(printf "$requiredver\n$currentver" | sort -V | head -n1)" == \ - "$currentver" ] && [ "$currentver" != "$requiredver" ]; then - log_unsupported "libblkid ($currentver) may not detect pools" - fi -fi - function cleanup { if [[ $exported_pool == true ]]; then if [[ $force_pool == true ]]; then - log_must zpool create \ - -f $TESTPOOL ${disk}${SLICE_PREFIX}${SLICE0} + log_must zpool create -f $TESTPOOL $DISK0 else log_must zpool import $TESTPOOL fi @@ -74,49 +61,6 @@ function cleanup if poolexists $TESTPOOL1 ; then destroy_pool $TESTPOOL1 fi - - # - # recover it back to EFI label - # - create_pool $TESTPOOL $disk - destroy_pool $TESTPOOL - - partition_disk $SIZE $disk 6 -} - -# -# create overlap slice 0 and 1 on $disk -# -function create_overlap_slice -{ - typeset format_file=$TEST_BASE_DIR/format_overlap.$$ - typeset disk=$1 - - echo "partition" >$format_file - echo "0" >> $format_file - echo "" >> $format_file - echo "" >> $format_file - echo "0" >> $format_file - echo "200m" >> $format_file - echo "1" >> $format_file - echo "" >> $format_file - echo "" >> $format_file - echo "0" >> $format_file - echo "400m" >> $format_file - echo "label" >> $format_file - echo "" >> $format_file - echo "q" >> $format_file - echo "q" >> $format_file - - format -e -s -d $disk -f $format_file - typeset -i ret=$? - rm -fr $format_file - - if (( ret != 0 )); then - log_fail "unable to create overlap slice." - fi - - return 0 } log_assert "'zpool create' have to use '-f' scenarios" @@ -125,42 +69,21 @@ log_onexit cleanup typeset exported_pool=false typeset force_pool=false -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - # overlapped slices as vdev need -f to create pool # Make the disk is EFI labeled first via pool creation -create_pool $TESTPOOL $disk +create_pool $TESTPOOL $DISK0 destroy_pool $TESTPOOL -if ! 
is_linux; then - # Make the disk is VTOC labeled since only VTOC label supports overlap - log_must labelvtoc $disk - log_must create_overlap_slice $disk - - unset NOINUSE_CHECK - log_mustnot zpool create $TESTPOOL ${disk}${SLICE_PREFIX}${SLICE0} - log_must zpool create -f $TESTPOOL ${disk}${SLICE_PREFIX}${SLICE0} - destroy_pool $TESTPOOL -fi - # exported device to be as spare vdev need -f to create pool -log_must zpool create -f $TESTPOOL $disk +log_must zpool create -f $TESTPOOL $DISK0 destroy_pool $TESTPOOL -log_must partition_disk $SIZE $disk 6 -create_pool $TESTPOOL ${disk}${SLICE_PREFIX}${SLICE0} \ - ${disk}${SLICE_PREFIX}${SLICE1} +create_pool $TESTPOOL $DISK0 $DISK1 log_must zpool export $TESTPOOL exported_pool=true -log_mustnot zpool create $TESTPOOL1 ${disk}${SLICE_PREFIX}${SLICE3} \ - spare ${disk}${SLICE_PREFIX}${SLICE1} -create_pool $TESTPOOL1 ${disk}${SLICE_PREFIX}${SLICE3} \ - spare ${disk}${SLICE_PREFIX}${SLICE1} +log_mustnot zpool create $TESTPOOL1 $DISK1 spare $DISK2 +create_pool $TESTPOOL1 $DISK1 spare $DISK2 force_pool=true destroy_pool $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh index 0b1b18aebb..e2f3899031 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh @@ -50,15 +50,10 @@ verify_runnable "global" function cleanup { - typeset dtst - typeset disk + typeset pool - for dtst in $TESTPOOL $TESTPOOL1; do - poolexists $dtst && destroy_pool $dtst - done - - for disk in $DISKS; do - partition_disk $SIZE $disk 6 + for pool in $TESTPOOL $TESTPOOL1; do + poolexists $pool && destroy_pool $pool done } @@ -68,27 +63,25 @@ log_onexit cleanup unset NOINUSE_CHECK typeset opt -for opt in "" "mirror" "raidz" "raidz1"; do - typeset disk="$DISKS" - (( ${#opt} == 0 )) && disk=${DISKS%% *} - - 
typeset -i count=$(get_word_count $disk) - if (( count < 2 && ${#opt} != 0 )) ; then - continue +for opt in "" "mirror" "raidz" "draid"; do + if [[ $opt == "" ]]; then + typeset disks=$DISK0 + else + typeset disks=$DISKS fi # Create two pools but using the same disks. - create_pool $TESTPOOL $opt $disk - log_mustnot zpool create -f $TESTPOOL1 $opt $disk + create_pool $TESTPOOL $opt $disks + log_mustnot zpool create -f $TESTPOOL1 $opt $disks destroy_pool $TESTPOOL # Create two pools and part of the devices were overlapped - create_pool $TESTPOOL $opt $disk - log_mustnot zpool create -f $TESTPOOL1 $opt ${DISKS% *} + create_pool $TESTPOOL $opt $disks + log_mustnot zpool create -f $TESTPOOL1 $opt $DISK0 destroy_pool $TESTPOOL # Create one pool but using the same disks twice. - log_mustnot zpool create -f $TESTPOOL $opt $disk $disk + log_mustnot zpool create -f $TESTPOOL $opt $disks $disks done log_pass "Using overlapping or in-use disks to create a new pool fails as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh index 165939786d..36bbaa7de3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh @@ -48,37 +48,31 @@ verify_runnable "global" function cleanup { - poolexists $TOOSMALL && destroy_pool $TOOSMALL - poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + typeset pool - poolexists $TESTPOOL && destroy_pool $TESTPOOL + for pool in $TOOSMALL $TESTPOOL1 $TESTPOOL; do + poolexists $pool && destroy_pool $pool + done - [[ -d $TESTDIR ]] && rm -rf $TESTDIR - - partition_disk $SIZE $disk 6 + rm -rf $TESTDIR } log_onexit cleanup -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - -create_pool $TESTPOOL $disk +create_pool $TESTPOOL $DISK0 log_must zfs create $TESTPOOL/$TESTFS log_must zfs set 
mountpoint=$TESTDIR $TESTPOOL/$TESTFS typeset -l devsize=$(($SPA_MINDEVSIZE - 1024 * 1024)) -for files in $TESTDIR/file1 $TESTDIR/file2 +for files in $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3 do - log_must mkfile $devsize $files + log_must truncate -s $devsize $files done set -A args \ "$TOOSMALL $TESTDIR/file1" "$TESTPOOL1 $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL mirror $TESTDIR/file1 $TESTDIR/file2" \ - "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" + "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" \ + "$TOOSMALL draid $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3" typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh index 8ade2561fe..9437033ae5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh @@ -54,62 +54,61 @@ function cleanup destroy_pool $pool done + rm -rf $disk1 $disk2 $disk3 $disk4 + if [[ -n $saved_dump_dev ]]; then log_must dumpadm -u -d $saved_dump_dev fi - - partition_disk $SIZE $disk 7 } log_assert "'zpool create' should be failed with inapplicable scenarios." 
log_onexit cleanup -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi -pooldev1=${disk}${SLICE_PREFIX}${SLICE0} -pooldev2=${disk}${SLICE_PREFIX}${SLICE1} -mirror1="${disk}${SLICE_PREFIX}${SLICE1} ${disk}${SLICE_PREFIX}${SLICE3}" -mirror2="${disk}${SLICE_PREFIX}${SLICE4} ${disk}${SLICE_PREFIX}${SLICE5}" +disk1=$(create_blockfile $FILESIZE) +disk2=$(create_blockfile $FILESIZE) +disk3=$(create_blockfile $FILESIZE) +disk4=$(create_blockfile $FILESIZE1) +mirror1="$DISK0 $DISK1" +mirror2="$disk1 $disk2" raidz1=$mirror1 raidz2=$mirror2 -diff_size_dev="${disk}${SLICE_PREFIX}${SLICE6} ${disk}${SLICE_PREFIX}${SLICE7}" +draid1="$DISK0 $DISK1 $DISK2" +draid2="$disk1 $disk2 $disk3" +diff_size_dev="$disk2 $disk4" +draid_diff_size_dev="$disk1 $disk2 $disk4" vfstab_dev=$(find_vfstab_dev) -if is_linux; then - partition_disk $SIZE $disk 7 - cyl=$(get_endslice $disk $SLICE5) - log_must set_partition $SLICE6 "$cyl" $SIZE1 $disk -else - specified_dump_dev=${disk}${SLICE_PREFIX}${SLICE0} +if is_illumos; then + specified_dump_dev=${DISK0}s0 saved_dump_dev=$(save_dump_dev) - cyl=$(get_endslice $disk $SLICE6) - log_must set_partition $SLICE7 "$cyl" $SIZE1 $disk + cyl=$(get_endslice $DISK0 6) + log_must set_partition 7 "$cyl" $SIZE1 $DISK0 fi -create_pool "$TESTPOOL" "$pooldev1" +create_pool $TESTPOOL $DISK0 # # Set up the testing scenarios parameters # -set -A arg "$TESTPOOL $pooldev2" \ - "$TESTPOOL1 $pooldev1" \ - "$TESTPOOL1 $TESTDIR0/$FILEDISK0" \ +set -A arg \ + "$TESTPOOL1 $DISK0" \ "$TESTPOOL1 mirror mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 raidz raidz $raidz1 raidz $raidz2" \ "$TESTPOOL1 raidz1 raidz1 $raidz1 raidz1 $raidz2" \ + "$TESTPOOL1 draid draid $draid draid $draid2" \ "$TESTPOOL1 mirror raidz $raidz1 raidz $raidz2" \ "$TESTPOOL1 mirror raidz1 $raidz1 raidz1 $raidz2" \ + "$TESTPOOL1 mirror draid $draid1 draid $draid2" \ "$TESTPOOL1 raidz mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 raidz1 mirror $mirror1 mirror $mirror2" \ + "$TESTPOOL1 draid1 mirror 
$mirror1 mirror $mirror2" \ "$TESTPOOL1 mirror $diff_size_dev" \ "$TESTPOOL1 raidz $diff_size_dev" \ "$TESTPOOL1 raidz1 $diff_size_dev" \ + "$TESTPOOL1 draid1 $draid_diff_size_dev" \ "$TESTPOOL1 mirror $mirror1 spare $mirror2 spare $diff_size_dev" \ "$TESTPOOL1 $vfstab_dev" \ - "$TESTPOOL1 ${disk}s10" \ + "$TESTPOOL1 ${DISK0}s10" \ "$TESTPOOL1 spare $pooldev2" unset NOINUSE_CHECK @@ -122,10 +121,10 @@ done # now destroy the pool to be polite log_must zpool destroy -f $TESTPOOL -if ! is_linux; then +if is_illumos; then # create/destroy a pool as a simple way to set the partitioning # back to something normal so we can use this $disk as a dump device - log_must zpool create -f $TESTPOOL3 $disk + log_must zpool create -f $TESTPOOL3 $DISK1 log_must zpool destroy -f $TESTPOOL3 log_must dumpadm -d ${DEV_DSKDIR}/$specified_dump_dev @@ -134,7 +133,7 @@ if ! is_linux; then # Also check to see that in-use checking prevents us from creating # a zpool from just the first slice on the disk. log_mustnot zpool create \ - -f $TESTPOOL1 ${specified_dump_dev}${SLICE_PREFIX}${SLICE0} + -f $TESTPOOL1 ${specified_dump_dev}s0 fi log_pass "'zpool create' is failed as expected with inapplicable scenarios." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_012_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_012_neg.ksh index 347fdfea49..36888e4973 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_012_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_012_neg.ksh @@ -47,12 +47,12 @@ verify_runnable "global" function cleanup { - if poolexists $TESTPOOL; then - destroy_pool $TESTPOOL - fi + poolexists $TESTPOOL && destroy_pool $TESTPOOL } -if is_linux; then +if is_freebsd; then + typeset swap_disks=$(swapinfo -l | grep "/dev" | awk '{print $1}') +elif is_linux; then typeset swap_disks=`swapon -s | grep "/dev" | awk '{print $1}'` else typeset swap_disks=`swap -l | grep "c[0-9].*d[0-9].*s[0-9]" | \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_014_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_014_neg.ksh index fc383be9b5..44ed950f78 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_014_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_014_neg.ksh @@ -54,35 +54,26 @@ function cleanup zfs destroy $vol_name fi - if poolexists $TESTPOOL; then - destroy_pool $TESTPOOL - fi + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_assert "'zpool create' should fail with regular file in swap." 
log_onexit cleanup -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - if is_linux; then set -A options "" "-f" else set -A options "-n" "" "-f" fi -typeset pool_dev=${disk}${SLICE_PREFIX}${SLICE0} typeset vol_name=$TESTPOOL/$TESTVOL typeset mntp=/mnt typeset TMP_FILE=$mntp/tmpfile.$$ -create_pool $TESTPOOL $pool_dev +create_pool $TESTPOOL $DISK0 log_must zfs create -V 100m $vol_name block_device_wait -log_must echo "y" | newfs ${ZVOL_DEVDIR}/$vol_name > /dev/null 2>&1 +log_must eval "new_fs ${ZVOL_DEVDIR}/$vol_name > /dev/null 2>&1" log_must mount ${ZVOL_DEVDIR}/$vol_name $mntp log_must mkfile 50m $TMP_FILE diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_015_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_015_neg.ksh index 4f605d3ba8..babf5ca9c6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_015_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_015_neg.ksh @@ -56,20 +56,11 @@ function cleanup fi for pool in $TESTPOOL1 $TESTPOOL; do - if poolexists $pool; then - destroy_pool $pool - fi + poolexists $pool && destroy_pool $pool done } unset NOINUSE_CHECK -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - -typeset pool_dev=${disk}${SLICE_PREFIX}${SLICE0} typeset vol_name=$TESTPOOL/$TESTVOL log_assert "'zpool create' should fail with zfs vol device in swap." @@ -78,12 +69,17 @@ log_onexit cleanup # # use zfs vol device in swap to create pool which should fail. 
# -create_pool $TESTPOOL $pool_dev +create_pool $TESTPOOL $DISK0 log_must zfs create -V 100m $vol_name block_device_wait swap_setup ${ZVOL_DEVDIR}/$vol_name -for opt in "-n" "" "-f"; do +if is_freebsd; then + typeset -a opts=("" "-f") +else + typeset -a opts=("-n" "" "-f") +fi +for opt in "${opts[@]}"; do log_mustnot zpool create $opt $TESTPOOL1 ${ZVOL_DEVDIR}/${vol_name} done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_016_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_016_pos.ksh index 3fca607b1f..1fa205b0f2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_016_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_016_pos.ksh @@ -41,20 +41,14 @@ # STRATEGY: # 1. delete all devices in the swap # 2. create a zpool -# 3. Verify the creation is successed. +# 3. Verify the creation was successful # verify_runnable "global" -if is_linux; then - log_unsupported "Test case isn't useful under Linux." -fi - function cleanup { - if poolexists $TESTPOOL; then - destroy_pool $TESTPOOL - fi + poolexists $TESTPOOL && destroy_pool $TESTPOOL #recover swap devices FSTAB=$TEST_BASE_DIR/fstab_$$ @@ -73,12 +67,6 @@ function cleanup fi } -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi -typeset pool_dev=${disk}${SLICE_PREFIX}${SLICE0} typeset swap_disks=$(swap -l | grep -v "swapfile" | awk '{print $1}') typeset dump_device=$(dumpadm | grep "Dump device" | awk '{print $3}') @@ -94,7 +82,7 @@ for sdisk in $swap_disks; do fi done -log_must zpool create $TESTPOOL $pool_dev +log_must zpool create $TESTPOOL $DISK0 log_must zpool destroy $TESTPOOL log_pass "'zpool create' passed as expected with applicable scenario." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_017_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_017_neg.ksh index 7e75e74c29..ded1e3c396 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_017_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_017_neg.ksh @@ -47,23 +47,10 @@ verify_runnable "global" function cleanup { - if poolexists $TESTPOOL; then - destroy_pool $TESTPOOL - fi - - if [[ -d $TESTDIR ]]; then - log_must rm -rf $TESTDIR - fi + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $TESTDIR } -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - -typeset pool_dev=${disk}${SLICE_PREFIX}${SLICE0} - log_assert "'zpool create' should fail with mountpoint exists and not empty." log_onexit cleanup @@ -81,7 +68,7 @@ while (( i < 2 )); do log_must touch $TESTDIR/testfile fi - log_mustnot zpool create -m $TESTDIR -f $TESTPOOL $pool_dev + log_mustnot zpool create -m $TESTDIR -f $TESTPOOL $DISK0 log_mustnot poolexists $TESTPOOL (( i = i + 1 )) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_018_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_018_pos.ksh index 1de51e0414..6ad662f950 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_018_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_018_pos.ksh @@ -47,18 +47,12 @@ function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL - [[ -f $CPATH ]] && log_must rm $CPATH + rm -f $CPATH } log_onexit cleanup log_assert "zpool create can create pools with specified properties" -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - # # we don't include "root" property in this list, as it requires both "cachefile" # and "root" to be set at the same time. 
A test for this is included in @@ -70,7 +64,7 @@ typeset vals=("off" "off" "$CPATH" "3" "on") typeset -i i=0; while [ $i -lt "${#props[@]}" ] do - log_must zpool create -o ${props[$i]}=${vals[$i]} $TESTPOOL $disk + log_must zpool create -o ${props[$i]}=${vals[$i]} $TESTPOOL $DISK0 RESULT=$(get_pool_prop ${props[$i]} $TESTPOOL) if [[ $RESULT != ${vals[$i]} ]] then @@ -86,7 +80,7 @@ done poolexists $TESTPOOL && destroy_pool $TESTPOOL # pick two properties, and verify we can create with those as well -log_must zpool create -o delegation=off -o cachefile=$CPATH $TESTPOOL $disk +log_must zpool create -o delegation=off -o cachefile=$CPATH $TESTPOOL $DISK0 RESULT=$(get_pool_prop delegation $TESTPOOL) if [[ $RESULT != off ]] then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_019_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_019_pos.ksh index 9cf6081ff7..694ea2163c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_019_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_019_pos.ksh @@ -43,21 +43,13 @@ function cleanup { - if poolexists $TESTPOOL ; then - destroy_pool $TESTPOOL - fi + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_onexit cleanup log_assert "zpool create cannot create pools specifying readonly properties" -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - set -A props "available" "capacity" "guid" "health" "size" "used" set -A vals "100" "10" "12345" "HEALTHY" "10" "10" @@ -65,7 +57,7 @@ typeset -i i=0; while [ $i -lt "${#props[@]}" ] do # try to set each property in the prop list with it's corresponding val - log_mustnot zpool create -o ${props[$i]}=${vals[$i]} $TESTPOOL $disk + log_mustnot zpool create -o ${props[$i]}=${vals[$i]} $TESTPOOL $DISK0 if poolexists $TESTPOOL then log_fail "$TESTPOOL was created when setting ${props[$i]}!" 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_020_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_020_pos.ksh index ae069606ea..104b5ec986 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_020_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_020_pos.ksh @@ -46,14 +46,9 @@ function cleanup { - if poolexists $TESTPOOL ; then - destroy_pool $TESTPOOL - fi - if [ -d /${TESTPOOL}.root ] - then - log_must rmdir /${TESTPOOL}.root - fi - [[ -e $values ]] && log_must rm -f $values + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf /${TESTPOOL}.root + rm -f $values } log_onexit cleanup @@ -62,18 +57,12 @@ log_assert "zpool create -R works as expected" typeset values=$TEST_BASE_DIR/values.$$ -if [[ -n $DISK ]]; then - disk=$DISK -else - disk=$DISK0 -fi - log_must rm -f /etc/zfs/zpool.cache -log_must mkdir /${TESTPOOL}.root -log_must zpool create -R /${TESTPOOL}.root $TESTPOOL $disk +log_must rm -rf /${TESTPOOL}.root +log_must zpool create -R /${TESTPOOL}.root $TESTPOOL $DISK0 if [ ! -d /${TESTPOOL}.root ] then - log_fail "Mountpoint was not create when using zpool with -R flag!" + log_fail "Mountpoint was not created when using zpool with -R flag!" 
fi FS=$(zfs list $TESTPOOL) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_021_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_021_pos.ksh index 8f64c9d44c..655f887b60 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_021_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_021_pos.ksh @@ -68,10 +68,14 @@ set -A RW_FS_PROP "quota=536870912" \ "setuid=off" \ "readonly=on" \ "snapdir=visible" \ - "acltype=posixacl" \ + "acltype=posix" \ "aclinherit=discard" \ - "canmount=off" \ - "zoned=on" + "canmount=off" +if is_freebsd; then + RW_FS_PROP+=("jailed=on") +else + RW_FS_PROP+=("zoned=on") +fi typeset -i i=0 while (( $i < ${#RW_FS_PROP[*]} )); do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_022_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_022_pos.ksh index 4e6d255129..4a918c0a68 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_022_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_022_pos.ksh @@ -48,7 +48,7 @@ verify_runnable "global" function cleanup { - datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_onexit cleanup @@ -68,7 +68,7 @@ set -A RW_FS_PROP "quota=536870912" \ "setuid=off" \ "readonly=on" \ "snapdir=visible" \ - "acltype=posixacl" \ + "acltype=posix" \ "aclinherit=discard" \ "canmount=off" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh index fb0d480642..f101521bd3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh @@ -45,7 +45,7 @@ verify_runnable "global" function cleanup { - 
datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL + poolexists $TESTPOOL && destroy_pool $TESTPOOL } log_onexit cleanup @@ -63,7 +63,6 @@ set -A args "QuOta=none" "quota=non" "quota=abcd" "quota=0" "quota=" \ "deviCes=on" "devices=OFF" "devices=aaa" \ "exec=ON" "EXec=off" "exec=aaa" \ "readonly=ON" "reADOnly=off" "rdonly=OFF" "rdonly=aaa" \ - "zoned=ON" "ZoNed=off" "zoned=aaa" \ "snapdIR=hidden" "snapdir=VISible" "snapdir=aaa" \ "acltype=DIScard" "acltYPE=groupmask" "acltype=aaa" \ "aclinherit=deny" "aclinHerit=secure" "aclinherit=aaa" \ @@ -72,12 +71,25 @@ set -A args "QuOta=none" "quota=non" "quota=abcd" "quota=0" "quota=" \ "referenced=10K" "compressratio=1.00x" \ "version=0" "version=1.234" "version=10K" "version=-1" \ "version=aaa" "version=999" +if is_freebsd; then + args+=("jailed=ON" "JaiLed=off" "jailed=aaa") +else + args+=("zoned=ON" "ZoNed=off" "zoned=aaa") +fi log_assert "'zpool create -O' should return an error with badly formed parameters." typeset -i i=0 while (( $i < ${#args[*]} )); do - log_mustnot zpool create -O ${args[i]} -f $TESTPOOL $DISKS + typeset arg=${args[i]} + if is_freebsd; then + # FreeBSD does not strictly validate share opts (yet). 
+ if [[ $arg == "sharenfs="* ]]; then + ((i = i + 1)) + continue + fi + fi + log_mustnot zpool create -O $arg -f $TESTPOOL $DISKS ((i = i + 1)) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh index d28d5953c5..63391e8adb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh @@ -48,7 +48,7 @@ set -A ENCRYPTION_ALGS "encryption=on" \ "encryption=aes-192-gcm" \ "encryption=aes-256-gcm" -set -A ENCRYPTION_PROPS "encryption=aes-256-ccm" \ +set -A ENCRYPTION_PROPS "encryption=aes-256-gcm" \ "encryption=aes-128-ccm" \ "encryption=aes-192-ccm" \ "encryption=aes-256-ccm" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh new file mode 100755 index 0000000000..9717af5052 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create a variety of dRAID pools using the minimal dRAID vdev syntax. +# +# STRATEGY: +# 1) Create the required number of allowed dRAID vdevs. +# 2) Create few pools of various sizes using the draid1|draid2|draid3 syntax. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $all_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create ...' can create a pool." + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/file.{01..84}) + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +# Verify all configurations up to 24 vdevs. +for parity in {1..3}; do + for children in {$((parity + 2))..24}; do + vdevs=$(echo $TESTDIR/file.{01..${children}}) + log_must zpool create $TESTPOOL draid$parity $vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + done +done + +# Spot check a few large configurations. +children_counts="53 84" +for children in $children_counts; do + vdevs=$(echo $TESTDIR/file.{01..${children}}) + log_must zpool create $TESTPOOL draid $vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL +done + +log_pass "'zpool create ...' success." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh new file mode 100755 index 0000000000..2e1ff39311 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh @@ -0,0 +1,82 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create dRAID pool using the maximum number of vdevs (255). Then verify +# that creating a pool with 256 fails as expected. +# +# STRATEGY: +# 1) Verify a pool with fewer than the required vdevs fails. +# 2) Verify pools with a valid number of vdevs succeed. +# 3) Verify a pool which exceeds the maximum number of vdevs fails. 
+# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $all_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create draid '" + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/file.{01..256}) + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +# Below minimum dRAID vdev count for specified parity level. +log_mustnot zpool create $TESTPOOL draid1 $(echo $TESTDIR/file.{01..01}) +log_mustnot zpool create $TESTPOOL draid2 $(echo $TESTDIR/file.{01..02}) +log_mustnot zpool create $TESTPOOL draid3 $(echo $TESTDIR/file.{01..03}) + +# Verify pool sizes from 2-10. Values in between are skipped to speed +# up the test case but will be exercised by the random pool creation +# done in zpool_create_draid_003_pos.ksh. +for (( i=2; i<=10; i++ )); do + log_must zpool create $TESTPOOL draid:${i}c \ + $(echo $TESTDIR/file.{01..$i}) + log_must destroy_pool $TESTPOOL +done + +# Verify pool sizes from 254-255. +for (( i=254; i<=255; i++ )); do + log_must zpool create $TESTPOOL draid:${i}c \ + $(echo $TESTDIR/file.{01..$i}) + log_must destroy_pool $TESTPOOL +done + +# Exceeds maximum dRAID vdev count (256). +log_mustnot zpool create $TESTPOOL draid $(echo $TESTDIR/file.{01..256}) + +log_pass "'zpool create draid '" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh new file mode 100755 index 0000000000..52cd00cf4e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License.
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify allowed stripe widths (data+parity) and hot spares may be +# configured at pool creation time. +# +# STRATEGY: +# 1) Test valid stripe/spare combinations given the number of children. +# 2) Test invalid stripe/spare/children combinations outside the allowed limits. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $draid_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create draid:#d:#c:#s '" + +log_onexit cleanup + +mkdir $TESTDIR + +# Generate 10 random valid configurations to test.
+for (( i=0; i<10; i++ )); do + parity=$(random_int_between 1 3) + spares=$(random_int_between 0 3) + data=$(random_int_between 1 16) + + (( min_children = (data + parity + spares) )) + children=$(random_int_between $min_children 32) + + draid="draid${parity}:${data}d:${children}c:${spares}s" + + draid_vdevs=$(echo $TESTDIR/file.{01..$children}) + log_must truncate -s $MINVDEVSIZE $draid_vdevs + + log_must zpool create $TESTPOOL $draid $draid_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + rm -f $draid_vdevs +done + +children=32 +draid_vdevs=$(echo $TESTDIR/file.{01..$children}) +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +# Out of order and unknown suffixes should fail. +log_mustnot zpool create $TESTPOOL draid:d8 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:s3 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:c32 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:10x $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:x10 $draid_vdevs + +# Exceeds maximum data disks (limited by total children) +log_must zpool create $TESTPOOL draid2:30d $draid_vdevs +log_must destroy_pool $TESTPOOL +log_mustnot zpool create $TESTPOOL draid2:31d $draid_vdevs + +# At least one data disk must be requested. +log_mustnot zpool create $TESTPOOL draid2:0d $draid_vdevs + +# Check invalid parity levels. +log_mustnot zpool create $TESTPOOL draid0 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid4 $draid_vdevs + +# Spares are limited: spares <= children - (parity + data). +log_must zpool create $TESTPOOL draid2:20d:10s $draid_vdevs +log_must destroy_pool $TESTPOOL +log_mustnot zpool create $TESTPOOL draid2:20d:11s $draid_vdevs + +# The required children argument is enforced.
+log_mustnot zpool create $TESTPOOL draid2:0c $draid_vdevs +log_mustnot zpool create $TESTPOOL draid2:31c $draid_vdevs +log_must zpool create $TESTPOOL draid2:32c $draid_vdevs +destroy_pool $TESTPOOL + +log_pass "'zpool create draid:#d:#c:#s '" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh new file mode 100755 index 0000000000..6b700fa362 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify generated dRAID permutation maps against the authoritative +# reference file, which contains the full permutations.
+# + +verify_runnable "global" + +log_assert "'draid verify'" + +DRAIDCFG="$STF_SUITE/tests/functional/cli_root/zpool_create/draidcfg.gz" + +log_must draid verify $DRAIDCFG + +log_pass "'draid verify'" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh new file mode 100755 index 0000000000..1e4db20cfe --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh @@ -0,0 +1,138 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Attila Fülöp +# + +. $STF_SUITE/include/libtest.shlib + +typeset STR_DRYRUN="would create '$TESTPOOL' with the following layout:" +typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" + +# +# DESCRIPTION: +# 'zpool create -n ...' can display the correct configuration +# +# STRATEGY: +# 1. Create -n a storage pool and verify the output is as expected. 
+# + +typeset -a dev=( + "${VDEV_PREFIX}00" "${VDEV_PREFIX}01" "${VDEV_PREFIX}02" + "${VDEV_PREFIX}03" "${VDEV_PREFIX}04" "${VDEV_PREFIX}05" + "${VDEV_PREFIX}06" "${VDEV_PREFIX}07" "${VDEV_PREFIX}08" + "${VDEV_PREFIX}09" "${VDEV_PREFIX}10" "${VDEV_PREFIX}11" +) + +typeset -a tests=( + ( + tree="'${dev[0]}' '${dev[1]}' log '${dev[2]}' '${dev[3]}' \ + special '${dev[4]}' '${dev[5]}' dedup '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}' cache '${dev[10]}' '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + ${dev[0]} + ${dev[1]} + dedup + ${dev[6]} + ${dev[7]} + special + ${dev[4]} + ${dev[5]} + logs + ${dev[2]} + ${dev[3]} + cache + ${dev[10]} + ${dev[11]} + spares + ${dev[8]} + ${dev[9]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' \ + log mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}' \ + dedup mirror '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}' \ + cache '${dev[10]}' '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + mirror + ${dev[0]} + ${dev[1]} + dedup + mirror + ${dev[6]} + ${dev[7]} + special + mirror + ${dev[4]} + ${dev[5]} + logs + mirror + ${dev[2]} + ${dev[3]} + cache + ${dev[10]} + ${dev[11]} + spares + ${dev[8]} + ${dev[9]}" + ) +) + +verify_runnable "global" + +function cleanup +{ + rm -f "$VDEV_PREFIX"* +} + +log_assert "'zpool create -n ...' can display the configuration" + +log_onexit cleanup + +# Create needed file vdevs. +for (( i=0; i < ${#dev[@]}; i+=1 )); do + log_must truncate -s $SPA_MINDEVSIZE "${dev[$i]}" +done + +# Foreach test, create -n a pool and check the output. +for (( i=0; i < ${#tests[@]}; i+=1 )); do + typeset tree="${tests[$i].tree}" + typeset want="${tests[$i].want}" + + typeset out="$(log_must eval "zpool create -n '$TESTPOOL' $tree" | \ + sed /^SUCCESS/d)" + + if [[ "$out" != "$want" ]]; then + log_fail "Got:\n" "$out" "\nbut expected:\n" "$want" + fi +done + +log_pass "'zpool create -n ...' displays config correctly." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh new file mode 100755 index 0000000000..fe98434d1b --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh @@ -0,0 +1,58 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify '-o compatibility' reserved values 'off, legacy' +# +# STRATEGY: +# 1. Create a pool with '-o compatibility=off' +# 2. Create a pool with '-o compatibility=legacy' +# 3. 
Cannot create a pool with '-o compatibility=unknown' +# + +verify_runnable "global" + +function cleanup +{ + datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL +} + +log_onexit cleanup + +log_assert "verify '-o compatibility' reserved values 'off, legacy'" + +log_must zpool create -f -o compatibility=off $TESTPOOL $DISKS +log_must zpool destroy -f $TESTPOOL + +log_must zpool create -f -o compatibility=legacy $TESTPOOL $DISKS +log_must zpool destroy -f $TESTPOOL + +log_mustnot zpool create -f -o compatibility=unknown $TESTPOOL $DISKS + +log_pass "verify '-o compatibility' reserved values 'off, legacy'" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh new file mode 100755 index 0000000000..8c812911b3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh @@ -0,0 +1,54 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib + +# +# DESCRIPTION: +# Verify pools can be created with the expected feature set enabled. +# +# STRATEGY: +# 1. Create a pool with a known feature set. +# 2. Verify only those features are active/enabled. +# + +verify_runnable "global" + +function cleanup +{ + datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL +} + +log_onexit cleanup + +log_assert "creates a pool with a specified feature set enabled" + +log_must zpool create -f -o compatibility=compat-2020 $TESTPOOL $DISKS +check_feature_set $TESTPOOL compat-2020 +log_must zpool destroy -f $TESTPOOL + +log_pass "creates a pool with a specified feature set enabled" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh new file mode 100755 index 0000000000..0580d444e7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh @@ -0,0 +1,54 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib + +# +# DESCRIPTION: +# Verify pools can be created with multiple feature sets. +# +# STRATEGY: +# 1. Create a pool with multiple feature sets. +# 2. Verify only the features common to both sets are enabled. +# + +verify_runnable "global" + +function cleanup +{ + datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL +} + +log_onexit cleanup + +log_assert "creates a pool with multiple feature sets enabled" + +log_must zpool create -f -o compatibility=freebsd-11.0,zol-0.8 $TESTPOOL $DISKS +check_feature_set $TESTPOOL freebsd-11.0 zol-0.8 +log_must zpool destroy -f $TESTPOOL + +log_pass "creates a pool with multiple feature sets enabled" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh new file mode 100755 index 0000000000..052c18dcee --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify '-o compatibility' property is updated in both the +# pool config MOS object and the cache file. +# +# STRATEGY: +# 1. Create a pool with '-o compatibility=legacy', then verify +# the property exists in the MOS config and cache file. +# 2. Create a pool, set the 'compatibility=off' property, then +# verify the property exists in the MOS config and cache file. +# + +verify_runnable "global" + +function cleanup +{ + datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL + rm -f $CACHE_FILE +} + +function check_config +{ + typeset propval=$1 + + poolval="$(zpool get -H -o value compatibility $TESTPOOL)" + if [ "$poolval" != "$propval" ]; then + log_fail "compatibility property set incorrectly $poolval" + fi + + if ! zdb -C -U $CACHE_FILE | grep "compatibility: '$propval'"; then + log_fail "compatibility property missing in cache file" + fi + + if ! zdb -C -U $CACHE_FILE $TESTPOOL | grep "compatibility: '$propval'"; then + log_fail "compatibility property missing from MOS object" + fi +} + +log_onexit cleanup + +log_assert "verify '-o compatibility' in MOS object and cache file" + +CACHE_FILE=$TEST_BASE_DIR/cachefile.$$ + +# 1. Create a pool with '-o compatibility=legacy', then verify +# the property exists in the MOS config and cache file. +log_must zpool create -f -o cachefile=$CACHE_FILE -o compatibility=legacy $TESTPOOL $DISKS +log_must check_config legacy +log_must zpool export -F $TESTPOOL +log_must zpool import -c $CACHE_FILE $TESTPOOL +log_must check_config legacy +log_must zpool destroy -f $TESTPOOL + +# 2. 
Create a pool, set the 'compatibility=off' property, then +# verify the property exists in the MOS config and cache file. +log_must zpool create -f -o cachefile=$CACHE_FILE $TESTPOOL $DISKS +log_must zpool set compatibility=legacy $TESTPOOL +log_must check_config legacy +log_must zpool export -F $TESTPOOL +log_must zpool import -c $CACHE_FILE $TESTPOOL +log_must check_config legacy +log_must zpool destroy -f $TESTPOOL + +log_pass "verify '-o compatibility' in MOS object and cache file" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_tempname.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_tempname.ksh index 1e6fcea03b..8fd1cea36e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_tempname.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_tempname.ksh @@ -30,9 +30,11 @@ verify_runnable "global" function cleanup { - destroy_pool $TESTPOOL - destroy_pool $TEMPPOOL + typeset pool + for pool in $TESTPOOL $TEMPPOOL; do + poolexists $pool && destroy_pool $pool + done } log_assert "'zpool create -t ' can create a pool with the specified" \ @@ -48,8 +50,8 @@ typeset fsprops=('canmount=off' 'mountpoint=none' 'utf8only=on' for poolprop in "${poolprops[@]}"; do for fsprop in "${fsprops[@]}"; do # 1. Create a pool with '-t' option - log_must zpool create $TESTPOOL -t $TEMPPOOL \ - -O $fsprop -o $poolprop $DISKS + log_must zpool create -t $TEMPPOOL -O $fsprop -o $poolprop \ + $TESTPOOL $DISKS # 2. 
Verify the pool is created with the specified temporary name log_must poolexists $TEMPPOOL log_mustnot poolexists $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy.cfg index 65b43da2da..bf6026747f 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy.cfg @@ -28,19 +28,10 @@ # Copyright (c) 2012 by Delphix. All rights reserved. # -export DISK=${DISKS%% *} export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}') export DISKSARRAY=$DISKS +echo $DISKS | read DISK0 DISK1 if is_linux; then set_device_dir - set_slice_prefix - export SLICE0=1 - export SLICE1=2 -else - export SLICE_PREFIX="s" - export SLICE0=0 - export SLICE1=1 - fi -export SLICE_SIZE=500m diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh index 2d9ec78211..c25b6c9230 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh @@ -48,34 +48,31 @@ verify_runnable "global" function cleanup { poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 - datasetexists $TESTPOOL1/$TESTVOL && \ - log_must zfs destroy -f $TESTPOOL1/$TESTVOL + datasetexists $TESTPOOL1/$TESTVOL && destroy_dataset $TESTPOOL1/$TESTVOL -f typeset pool for pool in $TESTPOOL1 $TESTPOOL; do poolexists $pool && destroy_pool $pool done - zero_partitions $DISK + [ -n "$recursive" ] && set_tunable64 VOL_RECURSIVE $recursive } set -A datasets "$TESTPOOL" "$TESTPOOL2" -if ! $(is_physical_device $DISKS) ; then - log_unsupported "This case cannot be run on raw files." -fi - log_assert "'zpool destroy ' can destroy a specified pool." 
log_onexit cleanup -partition_disk $SLICE_SIZE $DISK 2 - -create_pool "$TESTPOOL" "${DISK}${SLICE_PREFIX}${SLICE0}" -create_pool "$TESTPOOL1" "${DISK}${SLICE_PREFIX}${SLICE1}" +create_pool $TESTPOOL $DISK0 +create_pool $TESTPOOL1 $DISK1 log_must zfs create -s -V $VOLSIZE $TESTPOOL1/$TESTVOL block_device_wait -create_pool "$TESTPOOL2" "${ZVOL_DEVDIR}/$TESTPOOL1/$TESTVOL" +if is_freebsd; then + typeset recursive=$(get_tunable VOL_RECURSIVE) + log_must set_tunable64 VOL_RECURSIVE 1 +fi +create_pool $TESTPOOL2 $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL typeset -i i=0 while (( i < ${#datasets[*]} )); do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh index ad9425795c..a634f10f11 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh @@ -59,7 +59,7 @@ function cleanup typeset -i i=0 while (( $i < ${#datasets[*]} )); do datasetexists ${datasets[i]} && \ - log_must zfs destroy ${datasets[i]} + destroy_dataset ${datasets[i]} (( i = i + 1 )) done @@ -73,9 +73,7 @@ log_assert "'zpool destroy -f ' can forcely destroy the specified pool" log_onexit cleanup -typeset cwd="" - -create_pool "$TESTPOOL" "$DISK" +create_pool $TESTPOOL $DISK0 log_must zfs create $TESTPOOL/$TESTFS log_must mkdir -p $TESTDIR log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS @@ -90,7 +88,6 @@ while (( $i < ${#datasets[*]} )); do ((i = i + 1)) done -cwd=$PWD log_note "'zpool destroy' without '-f' will fail " \ "while pool is busy." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/.gitignore b/tests/zfs-tests/tests/functional/cli_root/zpool_events/.gitignore new file mode 100644 index 0000000000..a1f8c14838 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/.gitignore @@ -0,0 +1 @@ +/ereports diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am index 7fb6e4f7a5..765df10222 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am @@ -1,4 +1,8 @@ +include $(top_srcdir)/config/Rules.am + pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events + dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ @@ -6,8 +10,17 @@ dist_pkgdata_SCRIPTS = \ zpool_events_cliargs.ksh \ zpool_events_follow.ksh \ zpool_events_poolname.ksh \ - zpool_events_errors.ksh + zpool_events_errors.ksh \ + zpool_events_duplicates.ksh \ + zpool_events_clear_retained.ksh dist_pkgdata_DATA = \ zpool_events.cfg \ zpool_events.kshlib + +ereports_LDADD = \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libzfs/libzfs.la + +pkgexec_PROGRAMS = ereports +ereports_SOURCES = ereports.c diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/ereports.c b/tests/zfs-tests/tests/functional/cli_root/zpool_events/ereports.c new file mode 100644 index 0000000000..f825240000 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/ereports.c @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Command to output io and checksum ereport values, one per line. + * Used by zpool_events_duplicates.ksh to check for duplicate events. + * + * example output line: + * + * checksum "error_pool" 0x856dd01ce52e336 0x000034 0x000400 0x000a402c00 + * 0x000004 0x000000 0x000000 0x000000 0x000001 + */ + +/* + * Our ereport duplicate criteria + * + * When the class and all of these values match, then an ereport is + * considered to be a duplicate. + */ +static const char *criteria_name[] = { + FM_EREPORT_PAYLOAD_ZFS_POOL, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, + + /* logical zio criteria (optional) */ + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, +}; + +#define CRITERIA_NAMES_COUNT ARRAY_SIZE(criteria_name) + +static void +print_ereport_line(nvlist_t *nvl) +{ + char *class; + int last = CRITERIA_NAMES_COUNT - 1; + + /* + * For the test case context, we only want to see 'io' and + * 'checksum' subclass. We skip 'data' to minimize the output. 
+ */ + if (nvlist_lookup_string(nvl, FM_CLASS, &class) != 0 || + strstr(class, "ereport.fs.zfs.") == NULL || + strcmp(class, "ereport.fs.zfs.data") == 0) { + return; + } + + (void) printf("%s\t", class + strlen("ereport.fs.zfs.")); + + for (int i = 0; i < CRITERIA_NAMES_COUNT; i++) { + nvpair_t *nvp; + uint32_t i32 = 0; + uint64_t i64 = 0; + char *str = NULL; + + if (nvlist_lookup_nvpair(nvl, criteria_name[i], &nvp) != 0) { + /* print a proxy for optional criteria */ + (void) printf("--------"); + (void) printf("%c", i == last ? '\n' : '\t'); + continue; + } + + switch (nvpair_type(nvp)) { + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + (void) printf("\"%s\"", str ? str : ""); + break; + + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (void *)&i32); + (void) printf("0x%06x", i32); + break; + + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + (void) printf("0x%06x", i32); + break; + + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (void *)&i64); + (void) printf("0x%06llx", (u_longlong_t)i64); + break; + + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + if (strcmp(FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + criteria_name[i]) == 0) + (void) printf("0x%010llx", (u_longlong_t)i64); + else + (void) printf("0x%06llx", (u_longlong_t)i64); + break; + default: + (void) printf(""); + break; + } + (void) printf("%c", i == last ? 
'\n' : '\t'); + } +} + +static void +ereports_dump(libzfs_handle_t *zhdl, int zevent_fd) +{ + nvlist_t *nvl; + int ret, dropped; + + while (1) { + ret = zpool_events_next(zhdl, &nvl, &dropped, ZEVENT_NONBLOCK, + zevent_fd); + if (ret || nvl == NULL) + break; + if (dropped > 0) + (void) fprintf(stdout, "dropped %d events\n", dropped); + print_ereport_line(nvl); + (void) fflush(stdout); + nvlist_free(nvl); + } +} + +/* ARGSUSED */ +int +main(int argc, char **argv) +{ + libzfs_handle_t *hdl; + int fd; + + hdl = libzfs_init(); + if (hdl == NULL) { + (void) fprintf(stderr, "libzfs_init: %s\n", strerror(errno)); + exit(2); + } + fd = open(ZFS_DEV, O_RDWR); + if (fd < 0) { + (void) fprintf(stderr, "open: %s\n", strerror(errno)); + libzfs_fini(hdl); + exit(2); + } + + ereports_dump(hdl, fd); + + (void) close(fd); + libzfs_fini(hdl); + + return (0); +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear.ksh index ab862354b8..054d39be3f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear.ksh @@ -34,7 +34,7 @@ log_assert "'zpool events -c' should successfully clear events." # 1. Clear all ZFS events # This is needed because we may already over the max number or events queued # (zfs_zevent_len_max) generated by previous tests: generating $EVENTS_NUM new -# events and then counting them is racy and leads to failues, so start from 0. +# events and then counting them is racy and leads to failures, so start from 0. log_must zpool events -c # 2. Generate some new ZFS events @@ -43,14 +43,14 @@ for i in `seq 1 $EVENTS_NUM`; do done # wait a bit to allow the kernel module to process new events zpool_events_settle -EVENTS_NUM="$(zpool events -H | wc -l)" +EVENTS_NUM=$(zpool events -H | wc -l | xargs) # 3. 
Verify 'zpool events -c' successfully clear new events -CLEAR_OUTPUT="$(zpool events -c)" +CLEAR_OUTPUT=$(zpool events -c) if [[ "$CLEAR_OUTPUT" != "cleared $EVENTS_NUM events" ]]; then log_fail "Failed to clear $EVENTS_NUM events: $CLEAR_OUTPUT" fi -EVENTS_NUM="$(zpool events -H | wc -l)" +EVENTS_NUM=$(zpool events -H | wc -l) if [[ $EVENTS_NUM -ne 0 ]]; then log_fail "Unexpected events number: $EVENTS_NUM != 0" fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear_retained.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear_retained.ksh new file mode 100755 index 0000000000..fdf56b2cf9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear_retained.ksh @@ -0,0 +1,135 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2021 by Delphix. All rights reserved. +# + +# DESCRIPTION: +# Verify that new errors after a pool scrub are considered a duplicate +# +# STRATEGY: +# 1. Create a raidz pool with a file +# 2. Inject garbage into one of the vdevs +# 3. Scrub the pool +# 4. Observe the checksum error counts +# 5. 
Repeat inject and pool scrub +# 6. Verify that second pass also produces similar errors (i.e. not +# treated as a duplicate) +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +MOUNTDIR=$TEST_BASE_DIR/mount +FILEPATH=$MOUNTDIR/target +VDEV1=$TEST_BASE_DIR/vfile1 +VDEV2=$TEST_BASE_DIR/vfile2 +VDEV3=$TEST_BASE_DIR/vfile3 +SUPPLY=$TEST_BASE_DIR/supply +POOL=test_pool +FILESIZE="15M" +DAMAGEBLKS=10 + +OLD_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX) +RETAIN_MAX=$(get_tunable ZEVENT_RETAIN_MAX) +OLD_CHECKSUMS=$(get_tunable CHECKSUM_EVENTS_PER_SECOND) + +EREPORTS="$STF_SUITE/tests/functional/cli_root/zpool_events/ereports" + +function cleanup +{ + log_must set_tunable64 CHECKSUM_EVENTS_PER_SECOND $OLD_CHECKSUMS + log_must set_tunable64 ZEVENT_LEN_MAX $OLD_LEN_MAX + + zpool events -c + if poolexists $POOL ; then + zpool export $POOL + fi + log_must rm -f $VDEV1 $VDEV2 $VDEV3 $SUPPLY +} + +function damage_and_repair +{ + log_must zpool clear $POOL $VDEV1 + log_must zpool events -c + + log_note injecting damage to $VDEV1 + log_must dd conv=notrunc if=$SUPPLY of=$VDEV1 bs=1M seek=4 count=$DAMAGEBLKS + log_must zpool scrub $POOL + log_must zpool wait -t scrub $POOL + log_note "pass $1 observed $($EREPORTS | grep -c checksum) checksum ereports" + + repaired=$(zpool status $POOL | grep "scan: scrub repaired" | awk '{print $4}') + if [ "$repaired" == "0B" ]; then + log_fail "INVALID TEST -- expected scrub to repair some blocks" + else + log_note "$repaired repaired during scrub" + fi +} + +function checksum_error_count +{ + zpool status -p $POOL | grep $VDEV1 | awk '{print $5}' +} + +assertion="Damage to recently repaired blocks should be reported/counted" +log_assert "$assertion" +log_note "zevent retain max setting: $RETAIN_MAX" + +log_onexit cleanup + +# Set our threshold high to avoid dropping events. 
+set_tunable64 ZEVENT_LEN_MAX 20000 +set_tunable64 CHECKSUM_EVENTS_PER_SECOND 20000 + +# Initialize resources for the test +log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 $VDEV3 +log_must dd if=/dev/urandom of=$SUPPLY bs=1M count=$DAMAGEBLKS +log_must mkdir -p $MOUNTDIR +log_must zpool create -f -m $MOUNTDIR -o failmode=continue $POOL raidz $VDEV1 $VDEV2 $VDEV3 +log_must zfs set compression=off recordsize=16k $POOL +# create a file full of zeros +log_must mkfile -v $FILESIZE $FILEPATH +log_must zpool sync $POOL + +# run once and observe the checksum errors +damage_and_repair 1 +errcnt=$(checksum_error_count) +log_note "$errcnt errors observed" +# set expectation of at least 75% of what we observed in first pass +(( expected = (errcnt * 75) / 100 )) + +# run again and we should observe new checksum errors +damage_and_repair 2 +errcnt=$(checksum_error_count) + +log_must zpool destroy $POOL + +if (( errcnt < expected )); then + log_fail "FAILED -- expecting at least $expected checksum errors but only observed $errcnt" +else + log_note observed $errcnt new checksum errors after a scrub + log_pass "$assertion" +fi + diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh new file mode 100755 index 0000000000..595eacf5b4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh @@ -0,0 +1,143 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +# DESCRIPTION: +# Verify that duplicate I/O ereport errors are not posted +# +# STRATEGY: +# 1. Create a mirror pool +# 2. Inject duplicate read/write IO errors and checksum errors +# 3. Verify there are no duplicate events being posted +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +MOUNTDIR=$TEST_BASE_DIR/mount +FILEPATH=$MOUNTDIR/badfile +VDEV1=$TEST_BASE_DIR/vfile1 +VDEV2=$TEST_BASE_DIR/vfile2 +POOL=error_pool +FILESIZE="10M" +OLD_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX) +RETAIN_MAX=$(get_tunable ZEVENT_RETAIN_MAX) + +EREPORTS="$STF_SUITE/tests/functional/cli_root/zpool_events/ereports" + +duplicates=false + +function cleanup +{ + log_must set_tunable64 ZEVENT_LEN_MAX $OLD_LEN_MAX + + log_must zinject -c all + if poolexists $POOL ; then + destroy_pool $POOL + fi + log_must rm -f $VDEV1 $VDEV2 +} + +log_assert "Duplicate I/O ereport errors are not posted" +log_note "zevent retain max setting: $RETAIN_MAX" + +log_onexit cleanup + +# Set our threshold high to avoid dropping events. 
+set_tunable64 ZEVENT_LEN_MAX 20000 + +log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 +log_must mkdir -p $MOUNTDIR + +# +# $1: test type - corrupt (checksum error), io +# $2: read, write +function do_dup_test +{ + ERR=$1 + RW=$2 + + log_note "Testing $ERR $RW ereports" + log_must zpool create -f -m $MOUNTDIR -o failmode=continue $POOL mirror $VDEV1 $VDEV2 + log_must zpool events -c + log_must zfs set compression=off $POOL + + if [ "$RW" == "read" ] ; then + log_must mkfile $FILESIZE $FILEPATH + + # unmount and mount filesystems to purge file from ARC + # to force reads to go through error inject handler + log_must zfs unmount $POOL + log_must zfs mount $POOL + + # all reads from this file get an error + if [ "$ERR" == "corrupt" ] ; then + log_must zinject -a -t data -e checksum -T read $FILEPATH + else + log_must zinject -a -t data -e io -T read $FILEPATH + fi + + # Read the file a few times to generate some + # duplicate errors of the same blocks + for _ in {1..15}; do + dd if=$FILEPATH of=/dev/null bs=128K > /dev/null 2>&1 + done + log_must zinject -c all + fi + + log_must zinject -d $VDEV1 -e $ERR -T $RW -f 100 $POOL + + if [ "$RW" == "write" ] ; then + log_must mkfile $FILESIZE $FILEPATH + log_must zpool sync $POOL + fi + + log_must zinject -c all + + ereports="$($EREPORTS | sort)" + actual=$(echo "$ereports" | wc -l) + unique=$(echo "$ereports" | uniq | wc -l) + log_note "$actual total $ERR $RW ereports where $unique were unique" + + if [ $actual -gt $unique ] ; then + log_note "UNEXPECTED -- $((actual-unique)) duplicate $ERR $RW ereports" + echo "$ereports" + duplicates=true + fi + + log_must zpool destroy $POOL +} + +do_dup_test "corrupt" "read" +do_dup_test "io" "read" +do_dup_test "io" "write" + +if $duplicates; then + log_fail "FAILED -- Duplicate I/O ereport errors encountered" +else + log_pass "Duplicate I/O ereport errors are not posted" +fi + diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh index 0dc551bbdd..4645e245c9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh @@ -45,13 +45,13 @@ VDEV2=$TEST_BASE_DIR/file2 VDEV3=$TEST_BASE_DIR/file3 POOL=error_pool FILESIZE=$((20 * 1024 * 1024)) -OLD_CHECKSUMS=$(get_tunable zfs_checksum_events_per_second) -OLD_LEN_MAX=$(get_tunable zfs_zevent_len_max) +OLD_CHECKSUMS=$(get_tunable CHECKSUM_EVENTS_PER_SECOND) +OLD_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX) function cleanup { - log_must set_tunable64 zfs_checksum_events_per_second $OLD_CHECKSUMS - log_must set_tunable64 zfs_zevent_len_max $OLD_LEN_MAX + log_must set_tunable64 CHECKSUM_EVENTS_PER_SECOND $OLD_CHECKSUMS + log_must set_tunable64 ZEVENT_LEN_MAX $OLD_LEN_MAX log_must zinject -c all log_must zpool events -c @@ -66,8 +66,8 @@ log_assert "Check that the number of zpool errors match the number of events" log_onexit cleanup # Set our thresholds high so we never ratelimit or drop events. 
-set_tunable64 zfs_checksum_events_per_second 20000 -set_tunable64 zfs_zevent_len_max 20000 +set_tunable64 CHECKSUM_EVENTS_PER_SECOND 20000 +set_tunable64 ZEVENT_LEN_MAX 20000 log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 $VDEV3 log_must mkdir -p $MOUNTDIR @@ -129,11 +129,11 @@ function do_test fi fi - if [ "$val" == "0" ] || [ "$events" == "" ] ; then + if [ -z "$val" -o $val -eq 0 -o -z "$events" -o $events -eq 0 ] ; then log_fail "Didn't see any errors or events ($val/$events)" fi - if [ "$val" != "$events" ] ; then + if [ $val -ne $events ] ; then log_fail "$val $POOLTYPE $str errors != $events events" else log_note "$val $POOLTYPE $str errors == $events events" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh index a996e57c14..258de033b8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh @@ -56,8 +56,8 @@ done zpool_events_settle # 4. 
Verify 'zpool events -f' successfully recorded these new events -EVENTS_LOG="$(cat $EVENTS_FILE | wc -l)" -if [[ "$EVENTS_LOG" != "$EVENTS_NUM" ]]; then +EVENTS_LOG=$(cat $EVENTS_FILE | wc -l) +if [[ $EVENTS_LOG -ne $EVENTS_NUM ]]; then log_fail "Unexpected number of events: $EVENTS_LOG != $EVENTS_NUM" fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh index f39e6267bc..922e35125e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh @@ -72,7 +72,7 @@ log_onexit cleanup log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) @@ -144,6 +144,16 @@ for type in " " mirror raidz raidz2; do if [[ $? -ne 0 ]] ; then log_fail "pool $TESTPOOL1 has not expanded" fi + elif [[ $type == "draid" ]]; then + typeset expansion_size=$((2*($exp_size-$org_size))) + zpool history -il $TESTPOOL1 | \ + grep "pool '$TESTPOOL1' size:" | \ + grep "vdev online" | \ + grep "(+${expansion_size})" >/dev/null 2>&1 + + if [[ $? 
-ne 0 ]]; then + log_fail "pool $TESTPOOL has not expanded" + fi else typeset expansion_size=$((3*($exp_size-$org_size))) zpool history -il $TESTPOOL1 | \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh index a49d4fc170..62843b0622 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh @@ -63,7 +63,7 @@ log_onexit cleanup log_assert "zpool can expand after zpool online -e zvol vdevs on vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid:1s; do # Initialize the file devices and the pool for i in 1 2 3; do log_must truncate -s $org_size ${TEMPFILE}.$i @@ -92,6 +92,8 @@ for type in " " mirror raidz raidz2; do if [[ $type == "mirror" ]]; then typeset expected_zpool_expandsize=$(($exp_size-$org_size)) + elif [[ $type == "draid:1s" ]]; then + typeset expected_zpool_expandsize=$((2*($exp_size-$org_size))) else typeset expected_zpool_expandsize=$((3*($exp_size-$org_size))) fi @@ -147,6 +149,17 @@ for type in " " mirror raidz raidz2; do log_fail "pool $TESTPOOL1 has not expanded " \ "after zpool online -e" fi + elif [[ $type == "draid:1s" ]]; then + typeset expansion_size=$((2*($exp_size-$org_size))) + zpool history -il $TESTPOOL1 | \ + grep "pool '$TESTPOOL1' size:" | \ + grep "vdev online" | \ + grep "(+${expansion_size})" >/dev/null 2>&1 + + if [[ $? 
-ne 0 ]] ; then + log_fail "pool $TESTPOOL1 has not expanded " \ + "after zpool online -e" + fi else typeset expansion_size=$((3*($exp_size-$org_size))) zpool history -il $TESTPOOL1 | \ @@ -160,9 +173,17 @@ for type in " " mirror raidz raidz2; do fi fi else - log_fail "pool $TESTPOOL1 did not expand after vdev expansion " \ - "and zpool online -e" + log_fail "pool $TESTPOOL1 did not expand after vdev " \ + "expansion and zpool online -e" fi + + # For dRAID pools verify the distributed spare was resized after + # expansion and it is large enough to be used to replace a pool vdev. + if [[ $type == "draid:1s" ]]; then + log_must zpool replace -w $TESTPOOL1 $TEMPFILE.3 draid1-0-0 + verify_pool $TESTPOOL1 + fi + log_must zpool destroy $TESTPOOL1 done log_pass "zpool can expand after zpool online -e" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh index 323d0b907b..b3c71b666a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh @@ -73,7 +73,7 @@ log_onexit cleanup log_assert "zpool can not expand if set autoexpand=off after vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh index 8a4db824bc..09e2b6da21 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh @@ -61,7 +61,7 @@ log_onexit cleanup log_assert "After vdev expansion, all 4 labels have the same set of 
uberblocks." -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do for i in 1 2 3; do log_must truncate -s $org_size ${TEMPFILE}.$i done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am index 86452e8acc..1c06d5b59e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am @@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \ zpool_export_004_pos.ksh dist_pkgdata_DATA = \ - zpool_export.cfg + zpool_export.cfg \ + zpool_export.kshlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/setup.ksh index 925f3e4af8..023920dae1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/setup.ksh @@ -30,8 +30,4 @@ DISK=${DISKS%% *} -if ! $(is_physical_device $DISK) ; then - log_unsupported "Only partitionable physical disks can be used" -fi - default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.cfg index 1501c04630..8bfb067c7a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.cfg @@ -30,30 +30,15 @@ . 
$STF_SUITE/include/libtest.shlib -export DISK_ARRAY_NUM=0 -export DISK_ARRAY_LIMIT=4 -export DISKSARRAY="" -export VDEVS_NUM=32 +export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}') +export DISK1=$(echo $DISKS | awk '{print $1}') +export DISK2=$(echo $DISKS | awk '{print $3}') -function set_disks -{ - typeset -a disk_array=($(find_disks $DISKS)) - - if (( ${#disk_array[*]} <= 1 )); then - export DISK=${DISKS%% *} - else - export DISK="" - typeset -i i=0 - while (( i < ${#disk_array[*]} )); do - export DISK${i}="${disk_array[$i]}" - DISKSARRAY="$DISKSARRAY ${disk_array[$i]}" - (( i = i + 1 )) - (( i>$DISK_ARRAY_LIMIT )) && break - done - export DISK_ARRAY_NUM=$i - export DISKSARRAY - fi -} - -set_disks -set_device_dir +if is_linux; then + set_slice_prefix + set_device_dir + devs_id[0]=$(get_persistent_disk_name $DISK1) + devs_id[1]=$(get_persistent_disk_name $DISK2) +else + DEV_DSKDIR="/dev" +fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib new file mode 100644 index 0000000000..5484f20674 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib @@ -0,0 +1,38 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, Klara Systems, Inc. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.cfg
+
+#
+# Common cleanup for the zpool_export tests: remove $TESTDIR0 when it
+# exists as a directory, then hand off to default_cleanup.
+#
+function zpool_export_cleanup
+{
+	if [[ -d $TESTDIR0 ]]; then
+		log_must rm -rf $TESTDIR0
+	fi
+	default_cleanup
+}
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_001_pos.ksh index b6823553d7..111453c7a1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_001_pos.ksh @@ -29,8 +29,7 @@ # Copyright (c) 2016 by Delphix. All rights reserved. # -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib # # DESCRIPTION: @@ -46,19 +45,7 @@ verify_runnable "global" -function cleanup -{ - typeset dir=$(get_device_dir $DISKS) - - datasetexists "$TESTPOOL/$TESTFS" || \ - log_must zpool import -d $dir $TESTPOOL - - ismounted "$TESTPOOL/$TESTFS" - (( $? != 0 )) && \ - log_must zfs mount $TESTPOOL/$TESTFS -} - -log_onexit cleanup +log_onexit zpool_export_cleanup log_assert "Verify a pool can be exported." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_002_pos.ksh index 81473d903a..8040d12b92 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_002_pos.ksh @@ -29,7 +29,7 @@ # Copyright (c) 2016 by Delphix. All rights reserved. # -.
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib # # DESCRIPTION: @@ -45,19 +45,10 @@ verify_runnable "global" function cleanup { - typeset dir=$(get_device_dir $DISKS) cd $olddir || \ log_fail "Couldn't cd back to $olddir" - datasetexists "$TESTPOOL/$TESTFS" || \ - log_must zpool import -d $dir $TESTPOOL - - ismounted "$TESTPOOL/$TESTFS" - (( $? != 0 )) && \ - log_must zfs mount $TESTPOOL/$TESTFS - - [[ -e $TESTDIR/$TESTFILE0 ]] && \ - log_must rm -rf $TESTDIR/$TESTFILE0 + zpool_export_cleanup } olddir=$PWD diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_003_neg.ksh index b188f9c330..a2ee7fbdf9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_003_neg.ksh @@ -29,7 +29,7 @@ # Copyright (c) 2016 by Delphix. All rights reserved. # -. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib # # DESCRIPTION: @@ -43,18 +43,7 @@ verify_runnable "global" -function cleanup -{ - typeset dir=$(get_device_dir $DISKS) - datasetexists "$TESTPOOL/$TESTFS" || \ - log_must zpool import -d $dir $TESTPOOL - - ismounted "$TESTPOOL/$TESTFS" - (( $? != 0 )) && \ - log_must zfs mount $TESTPOOL/$TESTFS -} - -log_onexit cleanup +log_onexit zpool_export_cleanup set -A args "" "-f" "-? $TESTPOOL" "-QWERTYUIO $TESTPOOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh index 0f1a7c624d..9be3f23c4f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh @@ -29,7 +29,7 @@ # Copyright (c) 2012, 2016 by Delphix. 
All rights reserved. # -. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib # # DESCRIPTION: @@ -50,25 +50,8 @@ verify_runnable "global" -function cleanup -{ - mntpnt=$TESTDIR0 - datasetexists $TESTPOOL1 || log_must zpool import -d $mntpnt $TESTPOOL1 - datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 - datasetexists $TESTPOOL2 && destroy_pool $TESTPOOL2 - typeset -i i=0 - while ((i < 5)); do - if [[ -e $mntpnt/vdev$i ]]; then - log_must rm -f $mntpnt/vdev$i - fi - ((i += 1)) - done - log_must rmdir $mntpnt -} - - log_assert "Verify zpool export succeed or fail with spare." -log_onexit cleanup +log_onexit zpool_export_cleanup mntpnt=$TESTDIR0 log_must mkdir -p $mntpnt diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_get/Makefile.am index 36a7f23126..0c87c9b377 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/Makefile.am @@ -5,7 +5,8 @@ dist_pkgdata_SCRIPTS = \ zpool_get_001_pos.ksh \ zpool_get_002_pos.ksh \ zpool_get_003_pos.ksh \ - zpool_get_004_neg.ksh + zpool_get_004_neg.ksh \ + zpool_get_005_pos.ksh dist_pkgdata_DATA = \ - zpool_get.cfg + zpool_get.cfg zpool_get_parsable.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index fdcce8b562..6075e1f1ab 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -46,7 +46,6 @@ typeset -a properties=( "failmode" "listsnapshots" "autoexpand" - "dedupditto" "dedupratio" "free" "allocated" @@ -58,6 +57,7 @@ typeset -a properties=( "leaked" "multihost" "autotrim" + "compatibility" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" @@ -72,15 +72,19 @@ typeset -a properties=( 
"feature@large_blocks" "feature@sha512" "feature@skein" - "feature@edonr" "feature@device_removal" "feature@obsolete_counts" "feature@zpool_checkpoint" "feature@spacemap_v2" + "feature@redaction_bookmarks" + "feature@redacted_datasets" + "feature@bookmark_written" + "feature@log_spacemap" + "feature@device_rebuild" + "feature@draid" ) -# Additional properties added for Linux. -if is_linux; then +if is_linux || is_freebsd; then properties+=( "ashift" "feature@large_dnode" @@ -90,5 +94,13 @@ if is_linux; then "feature@allocation_classes" "feature@resilver_defer" "feature@bookmark_v2" + "feature@livelist" + "feature@zstd_compress" + ) +fi + +if ! is_freebsd; then + properties+=( + "feature@edonr" ) fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_005_pos.ksh new file mode 100755 index 0000000000..ad27d180fd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_005_pos.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+#
+
+#
+# Copyright (c) 2014 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_get/zpool_get_parsable.cfg
+
+#
+# DESCRIPTION:
+#
+# Zpool get returns parsable values for all known parsable properties
+#
+# STRATEGY:
+# 1. For all parsable properties, verify zpool get -p returns a parsable value
+#
+
+if ! is_global_zone ; then
+	TESTPOOL=${TESTPOOL%%/*}
+fi
+
+# Scratch file for 'zpool get' output; removed by the cleanup handler.
+tmpfile=$(mktemp) || log_fail "Unable to create a scratch file"
+
+function cleanup
+{
+	rm -f $tmpfile
+}
+log_onexit cleanup
+
+for prop in "${properties[@]}"; do
+	log_note "Checking for parsable $prop property"
+	log_must eval "zpool get -p $prop $TESTPOOL >$tmpfile"
+	if ! grep "$prop" $tmpfile >/dev/null 2>&1; then
+		log_fail "$prop not seen in output"
+	fi
+
+	typeset v=$(grep "$prop" $tmpfile | awk '{print $3}')
+	log_note "$prop has a value of $v"
+
+	# expr exits with 0 or 1 only when $v is an integer.
+	log_must test -n "$v"
+	expr $v + 0 >/dev/null 2>&1
+	typeset -i rc=$?
+	# Every property must be an integer to be parsable. The only
+	# exception is "expandsize", which may be "-".
+	if (( rc > 1 )) && [[ $prop != "expandsize" || $v != "-" ]]; then
+		log_fail "$prop is not parsable"
+	fi
+done
+
+log_pass "Zpool get returns parsable values for all known parsable properties"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_parsable.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_parsable.cfg new file mode 100644 index 0000000000..e7b95a4722 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_parsable.cfg @@ -0,0 +1,33 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2014 by Delphix. All rights reserved. +# + +# Set the expected properties of zpool +typeset -a properties=("allocated" "capacity" "expandsize" "free" "freeing" + "leaked" "size") diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_history/zpool_history_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_history/zpool_history_001_neg.ksh index dd1be14a06..b5cd8d529e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_history/zpool_history_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_history/zpool_history_001_neg.ksh @@ -38,7 +38,7 @@ # # STRATEGY: # 1. Create pool, volume & snap -# 2. Verify 'zpool history' can cope with incorret arguments. +# 2. Verify 'zpool history' can cope with incorrect arguments. 
# verify_runnable "global" @@ -51,8 +51,8 @@ set -A neg_opt "$TESTPOOL/$TESTCTR" "$TESTPOOL/$TESTVOL" "-t $TESTPOOL" \ function cleanup { - datasetexists $clone && log_must zfs destroy $clone - datasetexists $snap && log_must zfs destroy $snap + datasetexists $clone && destroy_dataset $clone + datasetexists $snap && destroy_dataset $snap } log_assert "Verify 'zpool history' can deal with non-existent pools and " \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index ad0f9c46ed..a8c9a31dcf 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -9,6 +9,7 @@ dist_pkgdata_SCRIPTS = \ import_cachefile_device_replaced.ksh \ import_cachefile_mirror_attached.ksh \ import_cachefile_mirror_detached.ksh \ + import_cachefile_paths_changed.ksh \ import_cachefile_shared_device.ksh \ import_devices_missing.ksh \ import_paths_changed.ksh \ @@ -29,6 +30,8 @@ dist_pkgdata_SCRIPTS = \ zpool_import_013_neg.ksh \ zpool_import_014_pos.ksh \ zpool_import_015_pos.ksh \ + zpool_import_016_pos.ksh \ + zpool_import_017_pos.ksh \ zpool_import_all_001_pos.ksh \ zpool_import_features_001_pos.ksh \ zpool_import_features_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh index 029fa66816..bee0e11a4f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh @@ -34,11 +34,10 @@ verify_runnable "global" -log_must set_tunable32 zfs_scan_suspend_progress 0 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 for pool in "$TESTPOOL" "$TESTPOOL1"; do - datasetexists $pool/$TESTFS && \ - log_must zfs destroy -Rf $pool/$TESTFS + datasetexists $pool/$TESTFS && destroy_dataset $pool/$TESTFS -Rf destroy_pool 
"$pool" done @@ -47,20 +46,4 @@ for dir in "$TESTDIR" "$TESTDIR1" "$DEVICE_DIR" ; do log_must rm -rf $dir done -DISK=${DISKS%% *} -if is_mpath_device $DISK; then - delete_partitions -fi -# recreate and destroy a zpool over the disks to restore the partitions to -# normal -case $DISK_COUNT in -0|1) - log_note "No disk devices to restore" - ;; -*) - log_must cleanup_devices $ZFS_DISK1 - log_must cleanup_devices $ZFS_DISK2 - ;; -esac - log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh index ab72042a21..3238faaa9a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh @@ -69,6 +69,8 @@ test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" \ "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" \ "$VDEV0 raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "draid $VDEV1 $VDEV2 $VDEV3" \ + "$VDEV0 draid $VDEV1 $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "log $VDEV1" "$VDEV0 log $VDEV1" test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" "$VDEV0 $VDEV2 log $VDEV1" test_add_vdevs "$VDEV0" "$VDEV1 log $VDEV2" "$VDEV0 $VDEV1 log $VDEV2" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh index e7edb1a3b0..8a81c18cd8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh @@ -59,7 +59,7 @@ function custom_cleanup [[ -n ZFS_TXG_TIMEOUT ]] && log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 
SCAN_SUSPEND_PROGRESS 0 cleanup } @@ -87,7 +87,7 @@ function test_replacing_vdevs log_must zpool export $TESTPOOL1 log_must cp $CPATHBKP $CPATH log_must zpool import -c $CPATH -o cachefile=$CPATH $TESTPOOL1 - log_must set_tunable32 zfs_scan_suspend_progress 1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 log_must zpool replace $TESTPOOL1 $replacevdev $replaceby # Cachefile: pool in resilvering state @@ -96,7 +96,7 @@ function test_replacing_vdevs # Confirm pool is still replacing log_must pool_is_replacing $TESTPOOL1 log_must zpool export $TESTPOOL1 - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 ( $earlyremove ) && log_must rm $replacevdev @@ -155,6 +155,12 @@ test_replacing_vdevs "raidz $VDEV0 $VDEV1 $VDEV2" \ "$VDEV0 $VDEV1 $VDEV2" \ true 20 +test_replacing_vdevs "draid:1s $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4" \ + "$VDEV1" "$VDEV5" \ + "draid $VDEV0 $VDEV5 $VDEV2 $VDEV3 $VDEV4 spares draid1-0-0" \ + "$VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4" \ + true 30 + set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_pass "zpool import -c cachefile_unaware_of_replace passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh new file mode 100755 index 0000000000..0902bc49f4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh @@ -0,0 +1,117 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable from a cachefile even if device paths
+# have changed.
+#
+# STRATEGY:
+# 1. Create a pool using a cachefile
+# 2. Backup cachefile
+# 3. Export the pool.
+# 4. Change the paths of some of the devices.
+# 5. Verify that we can import the pool using the cachefile.
+#
+
+verify_runnable "global"
+
+log_onexit cleanup
+
+function test_new_paths
+{
+	typeset layout=$1
+	typeset moved=$2
+
+	log_note "$0: pool '$layout', changing paths of $moved."
+
+	# Create the pool, keep a copy of its cachefile, then export it.
+	log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $layout
+	log_must cp $CPATH $CPATHBKP
+	log_must zpool export $TESTPOOL1
+
+	# Rename the devices after the cachefile copy was taken.
+	for dev in $moved; do
+		log_must mv $dev "${dev}_new"
+	done
+
+	log_must zpool import -c $CPATHBKP $TESTPOOL1
+	log_must check_pool_healthy $TESTPOOL1
+
+	# Tear down and give the devices their original names back.
+	log_must zpool destroy $TESTPOOL1
+	log_must rm -f $CPATH $CPATHBKP
+	for dev in $moved; do
+		log_must mv "${dev}_new" $dev
+	done
+
+	log_note ""
+}
+
+function test_duplicate_pools
+{
+	typeset layout=$1
+	typeset copied=$2
+
+	log_note "$0: pool '$layout', creating duplicate pool using $copied."
+
+	# Build the pool once and keep copies of its devices as "<dev>_orig".
+	log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $layout
+	log_must zpool export $TESTPOOL1
+	for dev in $copied; do
+		log_must cp $dev "${dev}_orig"
+	done
+
+	# Recreate the pool over the same devices and save its cachefile.
+	log_must zpool create -f -o cachefile=$CPATH $TESTPOOL1 $layout
+	log_must cp $CPATH $CPATHBKP
+	log_must zpool export $TESTPOOL1
+
+	for dev in $copied; do
+		log_must mv $dev "${dev}_new"
+	done
+
+	log_must zpool import -c $CPATHBKP
+	log_must zpool import -c $CPATHBKP $TESTPOOL1
+	log_must check_pool_healthy $TESTPOOL1
+
+	# Tear down: drop the copies and restore the original device names.
+	log_must zpool destroy $TESTPOOL1
+	log_must rm -f $CPATH $CPATHBKP
+	for dev in $copied; do
+		log_must rm "${dev}_orig"
+		log_must mv "${dev}_new" $dev
+	done
+
+	log_note ""
+}
+
+test_new_paths "$VDEV0 $VDEV1" "$VDEV0 $VDEV1"
+test_new_paths "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1"
+test_new_paths "$VDEV0 log $VDEV1" "$VDEV0 $VDEV1"
+test_new_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2"
+test_new_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2"
+
+test_duplicate_pools "$VDEV0 $VDEV1" "$VDEV0 $VDEV1"
+test_duplicate_pools "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1"
+test_duplicate_pools "$VDEV0 log $VDEV1" "$VDEV0 $VDEV1"
+test_duplicate_pools "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2"
+test_duplicate_pools "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2"
+
+log_pass "zpool import with cachefile succeeded after changing device paths."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh index 23d79c6907..87942b4a52 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh @@ -50,7 +50,7 @@ function dev_checksum log_note "Compute checksum of '$dev'" - checksum=$(md5sum $dev) + checksum=$(md5digest $dev) if [[ $?
-ne 0 ]]; then log_fail "Failed to compute checksum of '$dev'" return 1 @@ -108,6 +108,7 @@ test_shared_device "mirror $VDEV0 $VDEV1" "mirror $VDEV1 $VDEV2" "$VDEV1" test_shared_device "mirror $VDEV0 $VDEV1 $VDEV2" "mirror $VDEV2 $VDEV3" \ "$VDEV2" test_shared_device "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2" +test_shared_device "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2" test_shared_device "$VDEV0 log $VDEV1" "$VDEV2 log $VDEV1" "$VDEV1" "-m" log_pass "Pool doesn't write to a device it doesn't own anymore." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh index 7ee306e26d..15f3a0a7b4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh @@ -89,9 +89,11 @@ test_new_paths "$VDEV0 $VDEV1" "$VDEV0 $VDEV1" test_new_paths "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1" test_new_paths "$VDEV0 log $VDEV1" "$VDEV1" test_new_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV1" +test_new_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV1" test_swap_paths "$VDEV0 $VDEV1" "$VDEV0" "$VDEV1" test_swap_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1" +test_swap_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1" test_swap_paths "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" \ "$VDEV0" "$VDEV2" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh index e8f3937609..3ac8c104f1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh @@ -48,7 +48,7 @@ function custom_cleanup { set_vdev_validate_skip 0 cleanup - log_must set_tunable64 zfs_vdev_min_ms_count 16 + log_must set_tunable64 
VDEV_MIN_MS_COUNT 16 } log_onexit custom_cleanup @@ -115,7 +115,7 @@ function test_common # further than the time that we took the checkpoint. # # Note that, ideally we would want to take a checkpoint - # right after we recond the txg we plan to rewind to. + # right after we record the txg we plan to rewind to. # But since we can't attach, detach or remove devices # while having a checkpoint, we take it after the # operation that changes the config. @@ -201,14 +201,14 @@ function test_remove_vdev } # Record txg history -is_linux && log_must set_tunable32 zfs_txg_history 100 +is_linux && log_must set_tunable32 TXG_HISTORY 100 # Make the devices bigger to reduce chances of overwriting MOS metadata. increase_device_sizes $(( FILE_SIZE * 4 )) # Increase the number of metaslabs for small pools temporarily to # reduce the chance of reusing a metaslab that holds old MOS metadata. -log_must set_tunable64 zfs_vdev_min_ms_count 150 +log_must set_tunable64 VDEV_MIN_MS_COUNT 150 # Part of the rewind test is to see how it reacts to path changes typeset pathstochange="$VDEV0 $VDEV1 $VDEV2 $VDEV3" @@ -220,6 +220,7 @@ test_add_vdevs "$VDEV0 $VDEV1" "$VDEV2" test_add_vdevs "$VDEV0" "$VDEV1 $VDEV2" test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "draid $VDEV1 $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "log $VDEV1" test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh index bc2c611ae0..b03b39d178 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh @@ -60,10 +60,10 @@ ZFS_TXG_TIMEOUT="" function custom_cleanup { # Revert zfs_txg_timeout to defaults - [[ -n ZFS_TXG_TIMEOUT ]] && 
+ [[ -n $ZFS_TXG_TIMEOUT ]] && log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_must rm -rf $BACKUP_DEVICE_DIR - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 cleanup } @@ -102,13 +102,13 @@ function test_replace_vdev log_must zpool import -d $DEVICE_DIR $TESTPOOL1 # Ensure resilvering doesn't complete. - log_must set_tunable32 zfs_scan_suspend_progress 1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 log_must zpool replace $TESTPOOL1 $replacevdev $replaceby # Confirm pool is still replacing log_must pool_is_replacing $TESTPOOL1 log_must zpool export $TESTPOOL1 - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 ############################################################ # Test 1: rewind while device is resilvering. @@ -151,7 +151,7 @@ function test_replace_vdev } # Record txg history -is_linux && log_must set_tunable32 zfs_txg_history 100 +is_linux && log_must set_tunable32 TXG_HISTORY 100 log_must mkdir -p $BACKUP_DEVICE_DIR # Make the devices bigger to reduce chances of overwriting MOS metadata. @@ -176,6 +176,11 @@ test_replace_vdev "raidz $VDEV0 $VDEV1 $VDEV2" \ "raidz $VDEV0 $VDEV3 $VDEV2" \ "$VDEV0 $VDEV1 $VDEV2" 10 +test_replace_vdev "draid $VDEV0 $VDEV1 $VDEV2 $VDEV3" \ + "$VDEV1" "$VDEV4" \ + "draid $VDEV0 $VDEV4 $VDEV2 $VDEV3 spares draid1-0-0" \ + "$VDEV0 $VDEV1 $VDEV2 $VDEV3" 10 + set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_pass "zpool import rewind after device replacement passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh index 9f0ccfb6cb..22e619d741 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh @@ -33,32 +33,8 @@ . $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg verify_runnable "global" -verify_disk_count "$DISKS" 2 -if ! 
$(is_physical_device $ZFS_DISK1) ; then - log_unsupported "Only partitionable physical disks can be used" -fi - -DISK=${DISKS%% *} - -for dev in $ZFS_DISK1 $ZFS_DISK2 ; do - log_must cleanup_devices $dev -done - -typeset -i i=0 -while (( i <= $GROUP_NUM )); do - if ! is_linux; then - if (( i == 2 )); then - (( i = i + 1 )) - continue - fi - fi - log_must set_partition $i "$cyl" $SLICE_SIZE $ZFS_DISK1 - cyl=$(get_endslice $ZFS_DISK1 $i) - (( i = i + 1 )) -done - -create_pool "$TESTPOOL" "$ZFSSIDE_DISK1" +create_pool "$TESTPOOL" "$DISK" if [[ -d $TESTDIR ]]; then rm -rf $TESTDIR || log_unresolved Could not remove $TESTDIR @@ -73,7 +49,7 @@ log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 79423abe25..25f541ebf1 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -30,87 +30,11 @@ . $STF_SUITE/include/libtest.shlib -export DISKSARRAY=$DISKS -export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}') -typeset -a disk_array=($(find_disks $DISKS)) -case "${#disk_array[*]}" in -0) - # - # on stf_configure, disk_freelist returns empty. - # - DISK_COUNT=0 - ;; -1) - # We need to repartition the single disk to two slices. 
- if is_linux; then - set_device_dir - set_slice_prefix - PRIMARY_SLICE=1 - DISK_COUNT=1 - ZFS_DISK1=${disk_array[0]} - ZFS_DISK2=${disk_array[0]} - if is_mpath_device $ZFS_DISK1; then - export DEV_DSKDIR=$DEV_MPATHDIR - else - export DEV_DSKDIR=$DEV_RDSKDIR - fi - if ( is_mpath_device $ZFS_DISK1 ) && [[ -z $(echo $ZFS_DISK1 | awk 'substr($1,18,1)\ - ~ /^[[:digit:]]+$/') ]] || ( is_real_device $ZFS_DISK1 ); then - ZFSSIDE_DISK1=${ZFS_DISK1}1 - elif ( is_mpath_device $ZFS_DISK1 || is_loop_device $ZFS_DISK1 ); then - ZFSSIDE_DISK1=${ZFS_DISK1}p1 - else - log_fail "$ZFS_DISK1 not supported for partitioning." - fi - else - export DEV_DSKDIR="/dev" - PRIMARY_SLICE=2 - DISK_COUNT=1 - ZFS_DISK1=${disk_array[0]} - ZFSSIDE_DISK1=${ZFS_DISK1}s0 - ZFS_DISK2=${disk_array[0]} - fi - ;; -*) - # We need to repartition the single disk to two slices. - if is_linux; then - set_device_dir - set_slice_prefix - PRIMARY_SLICE=1 - DISK_COUNT=2 - ZFS_DISK1=${disk_array[0]} - if is_mpath_device $ZFS_DISK1; then - export DEV_DSKDIR=$DEV_MPATHDIR - else - export DEV_DSKDIR=$DEV_RDSKDIR - fi - if ( is_mpath_device $ZFS_DISK1 ) && [[ -z $(echo $ZFS_DISK1 | awk 'substr($1,18,1)\ - ~ /^[[:digit:]]+$/') ]] || ( is_real_device $ZFS_DISK1 ); then - ZFSSIDE_DISK1=${ZFS_DISK1}1 - elif ( is_mpath_device $ZFS_DISK1 || is_loop_device $ZFS_DISK1 ); then - ZFSSIDE_DISK1=${ZFS_DISK1}p1 - else - log_fail "$ZFS_DISK1 not supported for partitioning." 
- fi - ZFS_DISK2=${disk_array[1]} - else - export DEV_DSKDIR="/dev" - PRIMARY_SLICE=2 - DISK_COUNT=2 - ZFS_DISK1=${disk_array[0]} - ZFSSIDE_DISK1=${ZFS_DISK1}s0 - ZFS_DISK2=${disk_array[1]} - fi - ;; -esac - -export DISK_COUNT ZFS_DISK1 ZFSSIDE_DISK1 ZFS_DISK2 - +export DISK=${DISKS%% *} export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 32))m" export FILE_SIZE="$((MINVDEVSIZE))" export SLICE_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 2))m" -export MAX_NUM=5 -export GROUP_NUM=3 +export MAX_NUM=6 export DEVICE_DIR=$TEST_BASE_DIR/dev_import-test export BACKUP_DEVICE_DIR=$TEST_BASE_DIR/bakdev_import-test export DEVICE_FILE=disk @@ -123,6 +47,7 @@ export CPATHBKP2=$TEST_BASE_DIR/cachefile.$$.bkp2 export MD5FILE=$TEST_BASE_DIR/md5sums.$$ export MD5FILE2=$TEST_BASE_DIR/md5sums.$$.2 +export GROUP_NUM=3 typeset -i num=0 while (( num < $GROUP_NUM )); do DEVICE_FILES="$DEVICE_FILES ${DEVICE_DIR}/${DEVICE_FILE}$num" @@ -135,5 +60,6 @@ export VDEV1=$DEVICE_DIR/${DEVICE_FILE}1 export VDEV2=$DEVICE_DIR/${DEVICE_FILE}2 export VDEV3=$DEVICE_DIR/${DEVICE_FILE}3 export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 +export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index d050145e44..8bbd668a93 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -31,10 +31,10 @@ function cleanup log_must rm -rf $DEVICE_DIR/* typeset i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done - is_linux && set_tunable32 "zfs_txg_history" 0 + is_linux && set_tunable32 TXG_HISTORY 0 } # @@ -79,10 +79,10 @@ function write_some_data # # Create/overwrite a few datasets with files. 
-# Apply md5sum on all the files and store checksums in a file. +# Checksum all the files and store digests in a file. # # newdata: overwrite existing files if false. -# md5file: file where to store md5sums +# md5file: file where to store md5 digests # datasetname: base name for datasets # function _generate_data_common @@ -102,7 +102,10 @@ function _generate_data_common for j in {1..$files}; do typeset file="/$pool/$datasetname$i/file$j" dd if=/dev/urandom of=$file bs=128k count=$blocks > /dev/null - [[ -n $md5file ]] && md5sum $file >> $md5file + if [[ -n $md5file ]]; then + typeset cksum=$(md5digest $file) + echo $cksum $file >> $md5file + fi done ( $newdata ) && sync_pool "$pool" done @@ -140,8 +143,15 @@ function verify_data_md5sums return 1 fi - md5sum -c --quiet $md5file - return $? + cat $md5file | \ + while read digest file; do + typeset digest1=$(md5digest $file) + if [[ "$digest1" != "$digest" ]]; then + return 1 + fi + done + + return 0 } # @@ -153,7 +163,7 @@ function increase_device_sizes typeset -i i=0 while (( i < $MAX_NUM )); do - log_must mkfile $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done } @@ -161,15 +171,18 @@ function increase_device_sizes # # Translate vdev names returned by zpool status into more generic names. # -# eg: mirror-2 --> mirror -# function _translate_vdev { typeset vdev=$1 - typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect" + # + # eg: mirror-2 --> mirror + # eg: draid2:4d:12c:1s-0 --> draid2 + # + typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect draid1 draid2 draid3" for word in $keywords; do - echo $vdev | egrep "^${word}-[0-9]+\$" > /dev/null + echo $vdev | egrep -qE \ + "^${word}-[0-9]+\$|^${word}:[0-9]+d:[0-9]c:[0-9]+s-[0-9]+\$" if [[ $? 
-eq 0 ]]; then vdev=$word break @@ -178,6 +191,7 @@ function _translate_vdev [[ $vdev == "logs" ]] && echo "log" && return 0 [[ $vdev == "raidz1" ]] && echo "raidz" && return 0 + [[ $vdev == "draid1" ]] && echo "draid" && return 0 echo $vdev return 0 @@ -307,71 +321,40 @@ function pool_is_replacing function set_vdev_validate_skip { - set_tunable32 "vdev_validate_skip" "$1" + set_tunable32 VDEV_VALIDATE_SKIP "$1" } function get_zfs_txg_timeout { - get_tunable "zfs_txg_timeout" + get_tunable TXG_TIMEOUT } function set_zfs_txg_timeout { - set_tunable32 "zfs_txg_timeout" "$1" + set_tunable32 TXG_TIMEOUT "$1" } function set_spa_load_verify_metadata { - set_tunable32 "spa_load_verify_metadata" "$1" + set_tunable32 SPA_LOAD_VERIFY_METADATA "$1" } function set_spa_load_verify_data { - set_tunable32 "spa_load_verify_data" "$1" + set_tunable32 SPA_LOAD_VERIFY_DATA "$1" } function set_zfs_max_missing_tvds { - set_tunable32 "zfs_max_missing_tvds" "$1" + set_tunable32 MAX_MISSING_TVDS "$1" } # -# Use mdb to find the last txg that was synced in an active pool. +# Use zdb to find the last txg that was synced in an active pool. # function get_last_txg_synced { typeset pool=$1 - if is_linux; then - txg=$(tail "/proc/spl/kstat/zfs/$pool/txgs" | - awk '$3=="C" {print $1}' | tail -1) - [[ "$txg" ]] || txg=0 - echo $txg - return 0 - fi - - typeset spas - spas=$(mdb -k -e "::spa") - [[ $? -ne 0 ]] && return 1 - - typeset spa="" - print "$spas\n" | while read line; do - typeset poolname=$(echo "$line" | awk '{print $3}') - typeset addr=$(echo "$line" | awk '{print $1}') - if [[ $poolname == $pool ]]; then - spa=$addr - break - fi - done - if [[ -z $spa ]]; then - log_fail "Couldn't find pool '$pool'" - return 1 - fi - typeset mdbcmd="$spa::print spa_t spa_ubsync.ub_txg | ::eval '.=E'" - typeset -i txg - txg=$(mdb -k -e "$mdbcmd") - [[ $? 
-ne 0 ]] && return 1 - - echo $txg - return 0 + zdb -u $pool | awk '$1 == "txg" { print $3 }' | sort -n | tail -n 1 } diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh index 6e93fd4711..928efebdd2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh @@ -63,7 +63,7 @@ log_assert "For raidz, one destroyed pools devices was removed or used by " \ "other pool, it still can be imported correctly." log_onexit cleanup -log_must zpool create $TESTPOOL1 raidz $VDEV0 $VDEV1 $VDEV2 $VDIV3 +log_must zpool create $TESTPOOL1 raidz $VDEV0 $VDEV1 $VDEV2 $VDEV3 typeset guid=$(get_config $TESTPOOL1 pool_guid) typeset target=$TESTPOOL1 if (( RANDOM % 2 == 0 )) ; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh index 096bbe8114..f8da584aad 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh @@ -63,7 +63,7 @@ log_assert "For raidz2, two destroyed pools devices was removed or used by " \ "other pool, it still can be imported correctly." 
log_onexit cleanup -log_must zpool create $TESTPOOL1 raidz2 $VDEV0 $VDEV1 $VDEV2 $VDIV3 +log_must zpool create $TESTPOOL1 raidz2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 typeset guid=$(get_config $TESTPOOL1 pool_guid) typeset target=$TESTPOOL1 if (( RANDOM % 2 == 0 )) ; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh index b337bd00f1..212024dfcb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh @@ -39,7 +39,7 @@ # STRATEGY: # 1. Create a 5 ways mirror pool A with dev0/1/2/3/4, then destroy it. # 2. Create a stripe pool B with dev1. Then destroy it. -# 3. Create a raidz2 pool C with dev2/3/4. Then destroy it. +# 3. Create a draid2 pool C with dev2/3/4/5. Then destroy it. # 4. Create a raidz pool D with dev3/4. Then destroy it. # 5. Create a stripe pool E with dev4. Then destroy it. # 6. Verify 'zpool import -D -a' recover all the pools. 
@@ -74,7 +74,7 @@ log_must zpool destroy $poolA log_must zpool create $poolB $VDEV1 log_must zpool destroy $poolB -log_must zpool create $poolC raidz2 $VDEV2 $VDEV3 $VDEV4 +log_must zpool create $poolC draid2 $VDEV2 $VDEV3 $VDEV4 $VDEV5 log_must zpool destroy $poolC log_must zpool create $poolD raidz $VDEV3 $VDEV4 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh index 53d6fd3052..ec387b2256 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh @@ -84,9 +84,9 @@ function cleanup destroy_pool $TESTPOOL1 - if datasetexists $TESTPOOL/$TESTFS; then - log_must zfs destroy -Rf $TESTPOOL/$TESTFS - fi + datasetexists $TESTPOOL/$TESTFS && \ + destroy_dataset $TESTPOOL/$TESTFS -Rf + log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS @@ -138,7 +138,7 @@ for option in "" "-Df"; do if ((nfs_share_bit == 1)); then log_note "Set sharenfs=on $pool" log_must zfs set sharenfs=on $pool - log_must is_shared $pool + ! is_freebsd && log_must is_shared $pool f_share="true" nfs_flag="sharenfs=on" fi @@ -147,9 +147,9 @@ for option in "" "-Df"; do while ((guid_bit <= 1)); do typeset guid_flag="pool name" if [[ -z $option ]]; then - log_must zpool export $pool + log_must_busy zpool export $pool else - log_must zpool destroy $pool + log_must_busy zpool destroy $pool fi typeset target=$pool @@ -181,19 +181,21 @@ for option in "" "-Df"; do for fs in $mount_fs; do log_must ismounted $pool/$fs [[ -n $f_share ]] && \ + ! is_freebsd && \ log_must is_shared $pool/$fs done for fs in $nomount_fs; do log_mustnot ismounted $pool/$fs - log_mustnot is_shared $pool/$fs + ! 
is_freebsd && \ + log_mustnot is_shared $pool/$fs done ((guid_bit = guid_bit + 1)) done # reset nfsshare=off if [[ -n $f_share ]]; then log_must zfs set sharenfs=off $pool - log_mustnot is_shared $pool + ! is_freebsd && log_mustnot is_shared $pool fi ((nfs_share_bit = nfs_share_bit + 1)) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_013_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_013_neg.ksh index 0a221b8e07..7fef6254fa 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_013_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_013_neg.ksh @@ -60,9 +60,8 @@ function uncompress_pool function cleanup { - poolexists $POOL_NAME && log_must zpool destroy $POOL_NAME - [[ -e /$TESTPOOL/$POOL_FILE ]] && rm /$TESTPOOL/$POOL_FILE - return 0 + poolexists $POOL_NAME && destroy_pool $POOL_NAME + rm -f /$TESTPOOL/$POOL_FILE } log_assert "'zpool import' fails for pool that was not cleanly exported" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh new file mode 100755 index 0000000000..5434625cb9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# For draid, one destroyed pools devices was removed or used by other +# pool, it still can be imported correctly. +# +# STRATEGY: +# 1. Create a draid pool A with N disks. +# 2. Destroy this pool A. +# 3. Create another pool B with 1 disk which was used by pool A. +# 4. Verify import this draid pool can succeed. +# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL2 + destroy_pool $TESTPOOL1 + + log_must rm -rf $DEVICE_DIR/* + typeset i=0 + while (( i < $MAX_NUM )); do + log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + ((i += 1)) + done +} + +log_assert "For draid, one destroyed pools devices was removed or used by " \ + "other pool, it still can be imported correctly." +log_onexit cleanup + +log_must zpool create $TESTPOOL1 draid $VDEV0 $VDEV1 $VDEV2 $VDEV3 +typeset guid=$(get_config $TESTPOOL1 pool_guid) +typeset target=$TESTPOOL1 +if (( RANDOM % 2 == 0 )) ; then + target=$guid + log_note "Import by guid." +fi +log_must zpool destroy $TESTPOOL1 + +log_must zpool create $TESTPOOL2 $VDEV0 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_must zpool destroy $TESTPOOL2 +log_must rm -rf $VDEV0 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_note "For draid, two destroyed pool's devices were used, import failed." 
+log_must mkfile $FILE_SIZE $VDEV0 +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 +log_mustnot zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL2 + +log_pass "zpool import -D draid passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh new file mode 100755 index 0000000000..2e6cef265c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# For draid2, two destroyed pool's devices were removed or used by other +# pool, it still can be imported correctly. +# +# STRATEGY: +# 1. Create a draid2 pool A with N disks. +# 2. Destroy this pool A. +# 3. 
Create another pool B with two disks which were used by pool A. +# 4. Verify import this draid2 pool can succeed. +# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL2 + destroy_pool $TESTPOOL1 + + log_must rm -rf $DEVICE_DIR/* + typeset i=0 + while (( i < $MAX_NUM )); do + log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + ((i += 1)) + done +} + +log_assert "For draid2, two destroyed pools devices was removed or used by " \ + "other pool, it still can be imported correctly." +log_onexit cleanup + +log_must zpool create $TESTPOOL1 draid2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 +typeset guid=$(get_config $TESTPOOL1 pool_guid) +typeset target=$TESTPOOL1 +if (( RANDOM % 2 == 0 )) ; then + target=$guid + log_note "Import by guid." +fi +log_must zpool destroy $TESTPOOL1 + +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_must zpool destroy $TESTPOOL2 +log_must rm -rf $VDEV0 $VDEV1 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_note "For draid2, more than two destroyed pool's devices were used, " \ + "import failed." +log_must mkfile $FILE_SIZE $VDEV0 $VDEV1 +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 $VDEV2 +log_mustnot zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL2 + +log_pass "zpool import -D draid2 passed." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata3.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata3.ksh index 86baf1f6e3..40b6ca1c18 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata3.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata3.ksh @@ -72,7 +72,7 @@ log_must zfs mount -o ro $POOL_NAME/testfs old_mntpnt=$(get_prop mountpoint $POOL_NAME/testfs) log_must eval "ls $old_mntpnt | grep -q testfile" -block_device_wait +block_device_wait /dev/zvol/$POOL_NAME/testvol log_mustnot dd if=/dev/zero of=/dev/zvol/$POOL_NAME/testvol bs=512 count=1 log_must dd if=/dev/zvol/$POOL_NAME/testvol of=/dev/null bs=512 count=1 @@ -90,7 +90,7 @@ log_must eval "zfs send $POOL_NAME/testfs@snap1 | \ zfs recv $POOL_NAME/encroot/testfs" log_must eval "zfs send $POOL_NAME/testvol@snap1 | \ zfs recv $POOL_NAME/encroot/testvol" -block_device_wait +block_device_wait /dev/zvol/$POOL_NAME/encroot/testvol log_must dd if=/dev/zero of=/dev/zvol/$POOL_NAME/encroot/testvol bs=512 count=1 new_mntpnt=$(get_prop mountpoint $POOL_NAME/encroot/testfs) log_must eval "ls $new_mntpnt | grep -q testfile" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata4.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata4.ksh index d06a9cd754..a0f063a8dc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata4.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata4.ksh @@ -51,7 +51,7 @@ function uncompress_pool function cleanup { - log_must set_tunable32 zfs_disable_ivset_guid_check 0 + log_must set_tunable32 DISABLE_IVSET_GUID_CHECK 0 poolexists $POOL_NAME && log_must zpool destroy $POOL_NAME [[ -e /$TESTPOOL/$POOL_FILE ]] && rm /$TESTPOOL/$POOL_FILE return 0 @@ -91,7 +91,7 @@ log_mustnot has_ivset_guid $POOL_NAME/testvol@snap3 # 2. 
Prepare pool to fix existing datasets log_must zpool set feature@bookmark_v2=enabled $POOL_NAME -log_must set_tunable32 zfs_disable_ivset_guid_check 1 +log_must set_tunable32 DISABLE_IVSET_GUID_CHECK 1 log_must zfs create $POOL_NAME/fixed # 3. Use raw sends to fix datasets diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh index 78e9bbf689..3b5167ff03 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh @@ -57,8 +57,8 @@ # Using the various combinations. # - Regular import # - Alternate Root Specified -# It should be succeed with single d/m device upon 'raidz' & 'mirror', -# but failed against 'regular' or more d/m devices. +# It should succeed with single d/m device upon 'raidz', 'mirror', +# 'draid' but failed against 'regular' or more d/m devices. # 6. If import succeed, verify following is true: # - The pool shows up under 'zpool list'. # - The pool's health should be DEGRADED. @@ -67,7 +67,16 @@ verify_runnable "global" -set -A vdevs "" "mirror" "raidz" +# Randomly test a subset of combinations to speed up the test. 
+(( rc=RANDOM % 3 )) +if [[ $rc == 0 ]] ; then + set -A vdevs "" "mirror" "raidz" +elif [[ $rc == 1 ]] ; then + set -A vdevs "" "mirror" "draid" +else + set -A vdevs "" "raidz" "draid" +fi + set -A options "" "-R $ALTER_ROOT" function cleanup @@ -89,7 +98,8 @@ function recreate_files log_must rm -rf $DEVICE_DIR/* typeset i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must rm -f ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done } @@ -157,6 +167,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( count > 1 )) && \ action=log_mustnot ;; + 'draid') (( count > 1 )) && \ + action=log_mustnot + ;; '') action=log_mustnot ;; esac diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh index 7534ebca87..60af3f3219 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh @@ -43,6 +43,8 @@ # before data integrity is compromised # - Raidz could withstand one devices failing # before data integrity is compromised +# - dRAID could withstand one devices failing +# before data integrity is compromised # Verify that is true. # # STRATEGY: @@ -50,11 +52,12 @@ # - Regular pool # - Mirror # - Raidz +# - dRAID # 2. Create necessary filesystem and test files. # 3. Export the test pool. # 4. Move one or more device files to other directory # 5. Verify 'zpool import -d' with the new directory -# will handle moved files successfullly. +# will handle moved files successfully. # Using the various combinations. # - Regular import # - Alternate Root Specified @@ -62,7 +65,16 @@ verify_runnable "global" -set -A vdevs "" "mirror" "raidz" +# Randomly test a subset of combinations to speed up the test. 
+(( rc=RANDOM % 3 )) +if [[ $rc == 0 ]] ; then + set -A vdevs "" "mirror" "raidz" +elif [[ $rc == 1 ]] ; then + set -A vdevs "" "mirror" "draid" +else + set -A vdevs "" "raidz" "draid" +fi + set -A options "" "-R $ALTER_ROOT" function cleanup @@ -88,7 +100,8 @@ function cleanup_all while (( i < $MAX_NUM )); do typeset dev_file=${DEVICE_DIR}/${DEVICE_FILE}$i if [[ ! -e ${dev_file} ]]; then - log_must mkfile $FILE_SIZE ${dev_file} + log_must rm -f ${dev_file} + log_must truncate -s $FILE_SIZE ${dev_file} fi ((i += 1)) done @@ -158,7 +171,8 @@ while (( i < ${#vdevs[*]} )); do # Backup all device files while filesystem prepared. # if [[ -z $backup ]] ; then - log_must tar cf $DEVICE_DIR/$DEVICE_ARCHIVE ${DEVICE_FILE}* + log_must tar cf $DEVICE_DIR/$DEVICE_ARCHIVE \ + ${DEVICE_FILE}0 ${DEVICE_FILE}1 ${DEVICE_FILE}2 backup="true" fi @@ -174,6 +188,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( count == 1 )) && \ action=log_must ;; + 'draid') (( count == 1 )) && \ + action=log_must + ;; esac typeset target=$TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh index 815d409aa1..9d4629a779 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh @@ -59,12 +59,12 @@ verify_runnable "global" -# See issue: https://github.com/zfsonlinux/zfs/issues/6839 -if is_linux; then +# See issue: https://github.com/openzfs/zfs/issues/6839 +if ! 
is_illumos; then log_unsupported "Test case may be slow" fi -set -A vdevs "" "mirror" "raidz" +set -A vdevs "" "mirror" "raidz" "draid" function verify { @@ -207,6 +207,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( overlap > 1 )) && \ action=log_mustnot ;; + 'draid') (( overlap > 1 )) && \ + action=log_mustnot + ;; '') action=log_mustnot ;; esac diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am index 2ebc376d9c..3968902ec3 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am @@ -2,6 +2,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_init dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_initialize_attach_detach_add_remove.ksh \ + zpool_initialize_fault_export_import_online.ksh \ zpool_initialize_import_export.ksh \ zpool_initialize_offline_export_import_online.ksh \ zpool_initialize_online_offline.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh new file mode 100755 index 0000000000..11b8a483e6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Miscellaneous complex sequences of operations function as expected. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start initializing, fault, export, import, online and verify along +# the way that the initializing was cancelled and not restarted. +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 + +log_must zpool initialize $TESTPOOL $DISK1 +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool offline -f $TESTPOOL $DISK1 +log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" +log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" +log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + +log_must zpool online $TESTPOOL $DISK1 +log_must zpool clear $TESTPOOL $DISK1 +log_must check_vdev_state $TESTPOOL $DISK1 "ONLINE" +log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + +log_pass "Initializing behaves as expected at each step of:" \ + "initialize + fault + export + import + online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh index 0fa6a0be90..f774970a71 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh @@ -24,7 +24,6 @@ # 
Copyright (c) 2016 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib # # DESCRIPTION: @@ -33,13 +32,13 @@ # STRATEGY: # 1. Create a one-disk pool. # 2. Initialize the disk to completion. -# 3. Load all metaslabs that don't have a spacemap, and make sure the entire -# metaslab has been filled with the initializing pattern (deadbeef). +# 3. Load all metaslabs and make sure that each contains at least +# once instance of the initializing pattern (deadbeef). # function cleanup { - set_tunable64 zfs_initialize_value $ORIG_PATTERN + set_tunable64 INITIALIZE_VALUE $ORIG_PATTERN zpool import -d $TESTDIR $TESTPOOL if datasetexists $TESTPOOL ; then @@ -54,36 +53,38 @@ log_onexit cleanup PATTERN="deadbeefdeadbeef" SMALLFILE="$TESTDIR/smallfile" -ORIG_PATTERN=$(get_tunable zfs_initialize_value) -log_must set_tunable64 zfs_initialize_value $(printf %llu 0x$PATTERN) +ORIG_PATTERN=$(get_tunable INITIALIZE_VALUE) +log_must set_tunable64 INITIALIZE_VALUE $(printf %llu 0x$PATTERN) log_must mkdir "$TESTDIR" -log_must mkfile $MINVDEVSIZE "$SMALLFILE" +log_must truncate -s $MINVDEVSIZE "$SMALLFILE" log_must zpool create $TESTPOOL "$SMALLFILE" -log_must zpool initialize $TESTPOOL - -while [[ "$(initialize_progress $TESTPOOL $SMALLFILE)" -lt "100" ]]; do - sleep 0.5 -done - +log_must zpool initialize -w $TESTPOOL log_must zpool export $TESTPOOL -spacemaps=0 +metaslabs=0 bs=512 -while read -r sm; do - typeset offset="$(echo $sm | cut -d ' ' -f1)" - typeset size="$(echo $sm | cut -d ' ' -f2)" +zdb -p $TESTDIR -Pme $TESTPOOL | awk '/metaslab[ ]+[0-9]+/ { print $4, $8 }' | +while read -r offset_size; do + typeset offset=$(echo $offset_size | cut -d ' ' -f1) + typeset size=$(echo $offset_size | cut -d ' ' -f2) - spacemaps=$((spacemaps + 1)) - offset=$(((4 * 1024 * 1024) + 16#$offset)) - out=$(dd if=$SMALLFILE skip=$(($offset / $bs)) \ - count=$(($size / $bs)) bs=$bs 2>/dev/null | od -t x8 -Ad) 
- echo "$out" | log_must egrep "$PATTERN|\*|$size" -done <<< "$(zdb -p $TESTDIR -Pme $TESTPOOL | egrep 'spacemap[ ]+0 ' | \ - awk '{print $4, $8}')" + log_note "offset: '$offset'" + log_note "size: '$size'" -if [[ $spacemaps -eq 0 ]];then - log_fail "Did not find any empty space maps to check" + metaslabs=$((metaslabs + 1)) + offset=$(((4 * 1024 * 1024) + 16#$offset)) + log_note "vdev file offset: '$offset'" + + # Note we use '-t x4' instead of '-t x8' here because x8 is not + # a supported format on FreeBSD. + dd if=$SMALLFILE skip=$((offset / bs)) count=$((size / bs)) bs=$bs | + od -t x4 -Ad | egrep -q "deadbeef +deadbeef +deadbeef +deadbeef" || + log_fail "Pattern not found in metaslab free space" +done + +if [[ $metaslabs -eq 0 ]]; then + log_fail "Did not find any metaslabs to check" else - log_pass "Initializing wrote appropriate amount to disk" + log_pass "Initializing wrote to each metaslab" fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/labelclear.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/labelclear.cfg index 85148d6e85..b2a10aa28a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/labelclear.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/labelclear.cfg @@ -16,6 +16,13 @@ . 
$STF_SUITE/include/libtest.shlib typeset disks=(${DISKS[*]}) -typeset disk1=${disks[0]} -typeset disk2=${disks[1]} -typeset disk3=${disks[2]} + +if is_freebsd; then + typeset disk1=/dev/${disks[0]} + typeset disk2=/dev/${disks[1]} + typeset disk3=/dev/${disks[2]} +else + typeset disk1=${disks[0]} + typeset disk2=${disks[1]} + typeset disk3=${disks[2]} +fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh index dcca2e9335..b63d55d7ad 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh @@ -24,8 +24,8 @@ # STRATEGY: # 1. Create the pool with log device. # 2. Try clearing the label on data and log devices. -# 3. Add auxilary (cache/spare) vdevs. -# 4. Try clearing the label on auxilary vdevs. +# 3. Add auxiliary (cache/spare) vdevs. +# 4. Try clearing the label on auxiliary vdevs. # 5. Check that zpool labelclear will return non-zero and # labels are intact. diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh index a5131bdbb7..72a555bebe 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh @@ -26,8 +26,8 @@ # 2. Export the pool. # 3. Check that zpool labelclear returns non-zero when trying to # clear the label on ACTIVE vdevs, and succeeds with -f. -# 4. Add auxilary vdevs (cache/spare). -# 5. Check that zpool labelclear succeeds on auxilary vdevs of +# 4. Add auxiliary vdevs (cache/spare). +# 5. Check that zpool labelclear succeeds on auxiliary vdevs of # exported pool. 
verify_runnable "global" @@ -44,7 +44,7 @@ log_assert "zpool labelclear will fail on ACTIVE vdevs of exported pool and" \ for vdevtype in "" "cache" "spare"; do # Create simple pool, skip any mounts log_must zpool create -O mountpoint=none -f $TESTPOOL $disk1 log $disk2 - # Add auxilary vdevs (cache/spare) + # Add auxiliary vdevs (cache/spare) if [[ -n $vdevtype ]]; then log_must zpool add $TESTPOOL $vdevtype $disk3 fi @@ -63,7 +63,7 @@ for vdevtype in "" "cache" "spare"; do log_must zpool labelclear -f $disk2 log_mustnot zdb -lq $disk2 - # Check that labelclear on auxilary vdevs will succeed + # Check that labelclear on auxiliary vdevs will succeed if [[ -n $vdevtype ]]; then log_must zpool labelclear $disk3 log_mustnot zdb -lq $disk3 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh index 211829d512..31af9fd3f8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh @@ -39,13 +39,15 @@ verify_runnable "global" function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL - rm -f $PATTERN_FILE $DEVICE1 $DEVICE2 $DEVICE3 $DEVICE4 + rm -f $PATTERN_FILE $DISK_PATTERN_FILE \ + $DEVICE1 $DEVICE2 $DEVICE3 $DEVICE4 } log_onexit cleanup log_assert "zpool labelclear will only clear valid labels" PATTERN_FILE=$TEST_BASE_DIR/pattern +DISK_PATTERN_FILE=$TEST_BASE_DIR/disk-pattern DEVICE1="$TEST_BASE_DIR/device-1" DEVICE2="$TEST_BASE_DIR/device-2" @@ -79,7 +81,8 @@ log_mustnot eval "zpool import -d $TEST_BASE_DIR | grep $TESTPOOL" # Verify the original pattern over the first two labels is intact for dev in $DEVICE1 $DEVICE2 $DEVICE3 $DEVICE4; do - log_must cmp -n $((4 * 1048576)) $dev $PATTERN_FILE + log_must dd if=$dev of=$DISK_PATTERN_FILE bs=1048576 count=4 + log_must cmp $DISK_PATTERN_FILE 
$PATTERN_FILE log_mustnot zdb -lq $dev done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/setup.ksh index f3e36066e7..8ce094bcc7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/setup.ksh @@ -34,10 +34,4 @@ verify_runnable "global" -if ! $(is_physical_device $DISKS) ; then - log_unsupported "This directory cannot be run on raw files." -fi - -partition_disk $SIZE $DISK 6 - log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove.cfg index 7def918e8d..1b8312e993 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove.cfg @@ -28,30 +28,4 @@ # Copyright (c) 2012 by Delphix. All rights reserved. # -export DISK=${DISKS%% *} -export SIZE="200m" -export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}') -export DISKSARRAY=$DISKS - -if is_linux; then - set_device_dir - set_slice_prefix - export SLICE0=1 - export SLICE1=2 - export SLICE2=3 - export SLICE3=4 - export SLICE4=5 - export SLICE5=6 - export SLICE6=7 - export SLICE7=8 -else - export SLICE_PREFIX="s" - export SLICE0=0 - export SLICE1=1 - export SLICE2=2 - export SLICE3=3 - export SLICE4=4 - export SLICE5=5 - export SLICE6=6 - export SLICE7=7 -fi +echo $DISKS | read DISK0 DISK1 DISK2 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh index 365e86cfa8..0c098a61e2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh @@ -42,14 +42,13 @@ # 3. Verify that the remove failed. 
# -typeset disk=${DISK} -typeset vdev_devs="${disk}${SLICE_PREFIX}${SLICE0}" -typeset mirror_devs="${disk}${SLICE_PREFIX}${SLICE0} ${disk}${SLICE_PREFIX}${SLICE1}" +typeset vdev_devs="${DISK0}" +typeset mirror_devs="${DISK0} ${DISK1}" typeset raidz_devs=${mirror_devs} typeset raidz1_devs=${mirror_devs} -typeset raidz2_devs="${mirror_devs} ${disk}${SLICE_PREFIX}${SLICE3}" -typeset spare_devs1="${disk}${SLICE_PREFIX}${SLICE0}" -typeset spare_devs2="${disk}${SLICE_PREFIX}${SLICE1}" +typeset raidz2_devs="${mirror_devs} ${DISK2}" +typeset spare_devs1="${DISK0}" +typeset spare_devs2="${DISK1}" function check_remove { diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh index 340735aa00..4ab7ac659b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh @@ -50,10 +50,9 @@ function cleanup } log_onexit cleanup -typeset disk=${DISK} -typeset spare_devs1="${disk}${SLICE_PREFIX}${SLICE0}" -typeset spare_devs2="${disk}${SLICE_PREFIX}${SLICE1}" +typeset spare_devs1="${DISK0}" +typeset spare_devs2="${DISK1}" log_assert "zpool remove can only remove inactive hotspare device from pool" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh index c27c4c7d8f..4e132d9d00 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh @@ -54,17 +54,15 @@ function cleanup log_onexit cleanup typeset disk=${DISK} -typeset spare_devs1="${disk}${SLICE_PREFIX}${SLICE0}" -typeset spare_devs2="${disk}${SLICE_PREFIX}${SLICE1}" -typeset spare_devs3="${disk}${SLICE_PREFIX}${SLICE3}" -typeset 
spare_devs4="${disk}${SLICE_PREFIX}${SLICE4}" +typeset spare_devs1="${DISK0}" +typeset spare_devs2="${DISK1}" +typeset spare_devs3="${DISK2}" log_assert "zpool remove can remove hotspare device which state go though" \ " active to inactive in pool" log_note "Check spare device which state go through active to inactive" -log_must zpool create $TESTPOOL $spare_devs1 $spare_devs2 spare \ - $spare_devs3 $spare_devs4 +log_must zpool create $TESTPOOL $spare_devs1 $spare_devs2 spare $spare_devs3 log_must zpool replace $TESTPOOL $spare_devs2 $spare_devs3 log_mustnot zpool remove $TESTPOOL $spare_devs3 log_must zpool detach $TESTPOOL $spare_devs3 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh index a9fcef7905..25fced1ec1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh @@ -20,6 +20,10 @@ verify_runnable "global" +if ! 
is_linux; then + log_unsupported "scsi debug module unsupported" +fi + cleanup_devices $DISKS # Unplug the disk and remove scsi_debug module diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen.cfg index 3d6a291e06..7451ffd8c5 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen.cfg @@ -40,6 +40,4 @@ if is_linux; then devs_id[1]=$(get_persistent_disk_name $DISK2) devs_id[2]=$(get_persistent_disk_name $DISK3) export devs_id -else - DEV_DSKDIR="/dev" fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh index 6ac7488184..097dd3c71d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh @@ -42,7 +42,6 @@ verify_runnable "global" function cleanup { log_must zinject -c all - rm -f $TESTFILE_MD5 2>/dev/null # bring back removed disk online for further tests insert_disk $REMOVED_DISK $scsi_host poolexists $TESTPOOL && destroy_pool $TESTPOOL @@ -64,9 +63,8 @@ log_must check_state $TESTPOOL "$REMOVED_DISK_ID" "unavail" # 3. Write a test file to the pool and calculate its checksum. TESTFILE=/$TESTPOOL/data -TESTFILE_MD5=$(mktemp --tmpdir=/var/tmp) log_must generate_random_file /$TESTPOOL/data $LARGE_FILE_SIZE -log_must md5sum $TESTFILE > $TESTFILE_MD5 +TESTFILE_MD5=$(md5digest $TESTFILE) # 4. Execute scrub. # add delay to I/O requests for remaining disk in pool @@ -90,12 +88,13 @@ log_must is_scan_restarted $TESTPOOL # 8. Put another device offline and check if the test file checksum is correct. 
log_must zpool offline $TESTPOOL $DISK2 -log_must md5sum -c $TESTFILE_MD5 +CHECK_MD5=$(md5digest $TESTFILE) +[[ $CHECK_MD5 == $TESTFILE_MD5 ]] || \ + log_fail "Checksums differ ($CHECK_MD5 != $TESTFILE_MD5)" log_must zpool online $TESTPOOL $DISK2 sleep 1 # clean up -rm -f $TESTFILE_MD5 2>/dev/null log_must zpool destroy $TESTPOOL log_pass "Zpool reopen test successful" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh index ae415487c7..1b18b1297a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh @@ -22,6 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -41,19 +42,27 @@ verify_runnable "global" function cleanup { + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 - log_must rm -f $disk1 - log_must rm -f $disk2 + rm -f $disk1 $disk2 } log_assert "zpool replace -o ashift=' works with different ashift values" log_onexit cleanup -disk1=$TEST_BASE_DIR/$FILEDISK0 -disk2=$TEST_BASE_DIR/$FILEDISK1 +disk1=$TEST_BASE_DIR/disk1 +disk2=$TEST_BASE_DIR/disk2 log_must truncate -s $SIZE $disk1 log_must truncate -s $SIZE $disk2 +orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +# +# Set the file vdev's ashift to the max. Overriding +# the ashift using the -o ashift property should still +# be honored. 
+# +log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16 + typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh index e740de133a..f076f26818 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh @@ -22,6 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -43,18 +44,27 @@ verify_runnable "global" function cleanup { + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 - log_must rm -f $disk1 $disk2 + rm -f $disk1 $disk2 } log_assert "'zpool replace' uses the ashift pool property value as default." log_onexit cleanup -disk1=$TEST_BASE_DIR/$FILEDISK0 -disk2=$TEST_BASE_DIR/$FILEDISK1 +disk1=$TEST_BASE_DIR/disk1 +disk2=$TEST_BASE_DIR/disk2 log_must truncate -s $SIZE $disk1 log_must truncate -s $SIZE $disk2 +orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +# +# Set the file vdev's ashift to the max. Overriding +# the ashift using the -o ashift property should still +# be honored. 
+# +log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16 + typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh index cfafbb6b54..80fc169126 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh @@ -45,7 +45,7 @@ verify_runnable "global" function cleanup { - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 log_must rm -f $mntpnt/biggerfile1 log_must rm -f $mntpnt/biggerfile2 } @@ -67,7 +67,7 @@ log_must sync log_must zpool detach $TESTPOOL $DISK3 # 3. Reattach the drives, causing the second drive's resilver to be deferred -log_must set_tunable32 zfs_scan_suspend_progress 1 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 log_must zpool attach $TESTPOOL $DISK1 $DISK2 log_must is_pool_resilvering $TESTPOOL true @@ -78,7 +78,7 @@ log_must is_pool_resilvering $TESTPOOL true # 4. 
Manually restart the resilver with all drives log_must zpool resilver $TESTPOOL log_must is_deferred_scan_started $TESTPOOL -log_must set_tunable32 zfs_scan_suspend_progress 0 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 log_must wait_for_resilver_end $TESTPOOL $MAXTIMEOUT log_must check_state $TESTPOOL "$DISK2" "online" log_must check_state $TESTPOOL "$DISK3" "online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh index b3cb58ceb6..03eb9901cb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh @@ -30,5 +30,5 @@ verify_runnable "global" -log_must set_tunable32 zfs_scan_suspend_progress 0 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 destroy_mirrors diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh index 71a204060b..449bb9a822 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh @@ -50,7 +50,7 @@ verify_runnable "global" function cleanup { - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 log_must rm -f $mntpnt/biggerfile } @@ -63,7 +63,7 @@ mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) log_must file_write -b 1048576 -c 1024 -o create -d 0 -f $mntpnt/biggerfile log_must sync -log_must set_tunable32 zfs_scan_suspend_progress 1 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 log_must zpool scrub $TESTPOOL log_must is_pool_scrubbing $TESTPOOL true log_must zpool scrub -p $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh index 56225456b8..12dc044e9e 
100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh @@ -47,14 +47,14 @@ verify_runnable "global" function cleanup { - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 } log_onexit cleanup log_assert "Scrub command fails when there is already a scrub in progress" -log_must set_tunable32 zfs_scan_suspend_progress 1 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 log_must zpool scrub $TESTPOOL log_must is_pool_scrubbing $TESTPOOL true log_mustnot zpool scrub $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh index 9b6274cd10..a7ae7f16b1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh @@ -46,7 +46,7 @@ function cleanup { - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 rm -f $mntpnt/extra } @@ -59,7 +59,7 @@ log_assert "Resilver prevent scrub from starting until the resilver completes" mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) # Temporarily prevent scan progress so our test doesn't race -log_must set_tunable32 zfs_scan_suspend_progress 1 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 while ! is_pool_resilvering $TESTPOOL; do log_must zpool detach $TESTPOOL $DISK2 @@ -72,9 +72,7 @@ done log_must is_pool_resilvering $TESTPOOL log_mustnot zpool scrub $TESTPOOL -log_must set_tunable32 zfs_scan_suspend_progress 0 -while ! 
is_pool_resilvered $TESTPOOL; do - sleep 1 -done +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 +log_must zpool wait -t resilver $TESTPOOL log_pass "Resilver prevent scrub from starting until the resilver completes" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh index 8db6ae9802..69a33983d3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh @@ -48,18 +48,10 @@ log_assert "When scrubbing, detach device should not break system." log_must zpool scrub $TESTPOOL log_must zpool detach $TESTPOOL $DISK2 -log_must zpool attach $TESTPOOL $DISK1 $DISK2 - -while ! is_pool_resilvered $TESTPOOL; do - sleep 1 -done +log_must zpool attach -w $TESTPOOL $DISK1 $DISK2 log_must zpool scrub $TESTPOOL log_must zpool detach $TESTPOOL $DISK1 -log_must zpool attach $TESTPOOL $DISK2 $DISK1 - -while ! is_pool_resilvered $TESTPOOL; do - sleep 1 -done +log_must zpool attach -w $TESTPOOL $DISK2 $DISK1 log_pass "When scrubbing, detach device should not break system." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh index 483a683bd5..b1f7c6264b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh @@ -39,7 +39,7 @@ verify_runnable "global" function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 } log_onexit cleanup @@ -58,11 +58,7 @@ done log_must zfs unmount $TESTPOOL/$TESTFS2 log_must zfs unload-key $TESTPOOL/$TESTFS2 -log_must zpool scrub $TESTPOOL - -while ! 
is_pool_scrubbed $TESTPOOL; do - sleep 1 -done +log_must zpool scrub -w $TESTPOOL log_must check_pool_status $TESTPOOL "scan" "with 0 errors" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh index e4cb2b51eb..4b51cd9625 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh @@ -43,7 +43,7 @@ log_assert "Verify we see '(repairing)' while scrubbing a bad vdev." function cleanup { log_must zinject -c all - log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must set_tunable64 SCAN_VDEV_LIMIT $ZFS_SCAN_VDEV_LIMIT_DEFAULT zpool scrub -s $TESTPOOL || true } @@ -54,7 +54,7 @@ log_must zinject -d $DISK1 -e io -T read -f 100 $TESTPOOL # Make the scrub slow log_must zinject -d $DISK1 -D10:1 $TESTPOOL -log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW +log_must set_tunable64 SCAN_VDEV_LIMIT $ZFS_SCAN_VDEV_LIMIT_SLOW log_must zpool scrub $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_ashift.ksh index 3e7ef33456..09b5f50d5e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_ashift.ksh @@ -22,6 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . 
$STF_SUITE/include/libtest.shlib @@ -41,6 +42,7 @@ verify_runnable "global" function cleanup { + log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift destroy_pool $TESTPOOL1 rm -f $disk } @@ -52,7 +54,15 @@ log_onexit cleanup log_assert "zpool set can modify 'ashift' property" -disk=$TEST_BASE_DIR/$FILEDISK0 +orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +# +# Set the file vdev's ashift to the max. Overriding +# the ashift using the -o ashift property should still +# be honored. +# +log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16 + +disk=$TEST_BASE_DIR/disk log_must mkfile $SIZE $disk log_must zpool create $TESTPOOL1 $disk diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am index d00f39d35d..aac5e0d6e7 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am @@ -11,7 +11,9 @@ dist_pkgdata_SCRIPTS = \ zpool_split_props.ksh \ zpool_split_vdevs.ksh \ zpool_split_resilver.ksh \ - zpool_split_wholedisk.ksh + zpool_split_wholedisk.ksh \ + zpool_split_indirect.ksh \ + zpool_split_dryrun_output.ksh dist_pkgdata_DATA = \ zpool_split.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh new file mode 100755 index 0000000000..2267ea7bd8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Attila Fülöp +# + +. $STF_SUITE/include/libtest.shlib + +typeset NEWPOOL="${TESTPOOL}split" +typeset STR_DRYRUN="would create '$NEWPOOL' with the following layout:" +typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" + +# +# DESCRIPTION: +# 'zpool split -n [ ...]' can display the correct +# configuration +# +# STRATEGY: +# 1. Create a mirrored storage pool, split -n and verify the output is as +# expected. +# + +typeset -a dev=( + "${VDEV_PREFIX}00" "${VDEV_PREFIX}01" "${VDEV_PREFIX}02" + "${VDEV_PREFIX}03" "${VDEV_PREFIX}04" "${VDEV_PREFIX}05" + "${VDEV_PREFIX}06" "${VDEV_PREFIX}07" "${VDEV_PREFIX}08" + "${VDEV_PREFIX}09" "${VDEV_PREFIX}10" "${VDEV_PREFIX}11" +) + +typeset -a tests=( + # Test for hole. + ( + tree="mirror '${dev[0]}' '${dev[1]}' log mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}'" + + devs="" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[1]} + special + ${dev[5]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' log mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}'" + + devs="'${dev[0]}' '${dev[4]}'" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[0]} + special + ${dev[4]}" + ) + + # Full set of vdev types. 
+ ( + tree="mirror '${dev[0]}' '${dev[1]}' + dedup mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}' \ + cache '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}'\ + log mirror '${dev[10]}' '${dev[11]}'" + + devs="" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[1]} + dedup + ${dev[3]} + special + ${dev[5]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' + dedup mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}' \ + cache '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}'\ + log mirror '${dev[10]}' '${dev[11]}'" + + devs="'${dev[0]}' '${dev[2]}' '${dev[4]}'" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[0]} + dedup + ${dev[2]} + special + ${dev[4]}" + ) +) + +verify_runnable "global" + +function cleanup +{ + destroy_pool "$TESTPOOL" + rm -f "$VDEV_PREFIX"* +} + +log_assert \ +"'zpool split -n []...' can display the configuration" + +log_onexit cleanup + +# Create needed file vdevs. +for (( i=0; i < ${#dev[@]}; i+=1 )); do + log_must truncate -s $SPA_MINDEVSIZE "${dev[$i]}" +done + +# Foreach test create pool, add -n devices and check output. +for (( i=0; i < ${#tests[@]}; i+=1 )); do + typeset tree="${tests[$i].tree}" + typeset devs="${tests[$i].devs}" + typeset want="${tests[$i].want}" + + log_must eval zpool create "$TESTPOOL" $tree + log_must poolexists "$TESTPOOL" + typeset out="$(log_must eval "zpool split -n \ + '$TESTPOOL' '$NEWPOOL' $devs" | sed /^SUCCESS/d)" + + if [[ "$out" != "$want" ]]; then + log_fail "Got:\n" "$out" "\nbut expected:\n" "$want" + fi + log_must destroy_pool "$TESTPOOL" +done + +log_pass \ +"'zpool split -n []...' displays config correctly." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh new file mode 100755 index 0000000000..13f0d08b7f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh @@ -0,0 +1,69 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# 'zpool split' should succeed on pools with indirect vdevs. +# +# STRATEGY: +# Create a mirrored pool, add a single device, remove it. `zpool split` +# should succeed. +# + +verify_runnable "global" + +log_assert "'zpool split' works on pools with indirect VDEVs." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + if poolexists $TESTPOOL2 ; then + destroy_pool $TESTPOOL2 + fi + rm -f $VDEV_TEMP $VDEV_M1 $VDEV_M2 +} +log_onexit cleanup + +typeset vdev_m12_mb=400 +typeset vdev_temp_mb=$(( floor($vdev_m12_mb / 2) )) +typeset VDEV_TEMP="$TEST_BASE_DIR/vdev_temp" +typeset VDEV_M1="$TEST_BASE_DIR/vdev_m1" +typeset VDEV_M2="$TEST_BASE_DIR/vdev_m2" +typeset altroot="$TESTDIR/altroot-$TESTPOOL2" + +log_must truncate -s ${vdev_temp_mb}M $VDEV_TEMP +log_must truncate -s ${vdev_m12_mb}M $VDEV_M1 +log_must truncate -s ${vdev_m12_mb}M $VDEV_M2 + +log_must zpool create -f $TESTPOOL $VDEV_TEMP +log_must zpool add -f $TESTPOOL mirror $VDEV_M1 $VDEV_M2 +log_must zpool remove $TESTPOOL $VDEV_TEMP +log_must wait_for_removal $TESTPOOL +log_must zpool split -R $altroot $TESTPOOL $TESTPOOL2 +log_must poolexists $TESTPOOL2 +log_must test "$(get_pool_prop 'altroot' $TESTPOOL2)" == "$altroot" + +log_pass "'zpool split' works on pools with indirect VDEVs." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_props.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_props.ksh index 67dbed6933..1aff8d31d9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_props.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_props.ksh @@ -35,7 +35,7 @@ function cleanup destroy_pool $TESTPOOL destroy_pool $TESTPOOL2 rm -f $DEVICE1 $DEVICE2 - log_must mmp_clear_hostid + ! 
is_freebsd && log_must mmp_clear_hostid } function setup_mirror @@ -48,23 +48,25 @@ function setup_mirror log_assert "'zpool split' can set new property values on the new pool" log_onexit cleanup -if [ -e $HOSTID_FILE ]; then - log_unsupported "System has existing $HOSTID_FILE file" -fi - -typeset good_props=('comment=text' 'ashift=12' 'multihost=on' - 'listsnapshots=on' 'autoexpand=on' 'autoreplace=on' 'dedupditto=1234' - 'delegation=off' 'failmode=continue') -typeset bad_props=("bootfs=$TESTPOOL2/bootfs" 'version=28' 'ashift=4' - 'allocated=1234' 'capacity=5678' 'dedupditto=42' 'multihost=none' - 'feature@async_destroy=disabled' 'feature@xxx_fake_xxx=enabled' - 'propname=propval' 'readonly=on') - DEVICE1="$TEST_BASE_DIR/device-1" DEVICE2="$TEST_BASE_DIR/device-2" -# Needed to set multihost=on -log_must mmp_set_hostid $HOSTID1 +typeset good_props=('comment=text' 'ashift=12' 'multihost=on' + 'listsnapshots=on' 'autoexpand=on' 'autoreplace=on' + 'delegation=off' 'failmode=continue') +typeset bad_props=("bootfs=$TESTPOOL2/bootfs" 'version=28' 'ashift=4' + 'allocated=1234' 'capacity=5678' 'multihost=none' + 'feature@async_destroy=disabled' 'feature@xxx_fake_xxx=enabled' + 'propname=propval' 'readonly=on') +if ! 
is_freebsd; then + good_props+=('multihost=on') + bad_props+=('multihost=none') + if [ -e $HOSTID_FILE ]; then + log_unsupported "System has existing $HOSTID_FILE file" + fi + # Needed to set multihost=on + log_must mmp_set_hostid $HOSTID1 +fi # Verify we can set a combination of valid property values on the new pool for prop in "${good_props[@]}" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh index 1a5c3198f0..99a40ecf2b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh @@ -41,7 +41,7 @@ verify_runnable "both" function cleanup { - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 destroy_pool $TESTPOOL destroy_pool $TESTPOOL2 rm -f $DEVICE1 $DEVICE2 @@ -69,7 +69,7 @@ function zpool_split #disk_to_be_offline/online log_must sync # temporarily prevent resilvering progress, so it will not finish too early - log_must set_tunable32 zfs_scan_suspend_progress 1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 log_must zpool online $TESTPOOL $disk @@ -84,7 +84,7 @@ function zpool_split #disk_to_be_offline/online log_mustnot zpool split $TESTPOOL $TESTPOOL2 - log_must set_tunable32 zfs_scan_suspend_progress 0 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 } log_assert "Verify 'zpool split' will fail if resilver in progress for a disk" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_vdevs.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_vdevs.ksh index b7ebe55cb8..9866cf7a5a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_vdevs.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_vdevs.ksh @@ -125,7 +125,7 @@ do add_config="$(awk '{$1= "";print $0}' <<< $config)" 
log_must zpool create $TESTPOOL $(pool_config $create_config) for vdev in $add_config; do - log_must zpool add $TESTPOOL -f $(pool_config $vdev) + log_must zpool add -f $TESTPOOL $(pool_config $vdev) done log_must zpool split -R $altroot $TESTPOOL $TESTPOOL2 log_must poolexists $TESTPOOL2 @@ -140,7 +140,7 @@ do add_config="$(awk '{$1= "";print $0}' <<< $config)" log_must zpool create $TESTPOOL $(pool_config $create_config) for vdev in $add_config; do - log_must zpool add $TESTPOOL -f $(pool_config $vdev) + log_must zpool add -f $TESTPOOL $(pool_config $vdev) done log_mustnot zpool split -R $altroot $TESTPOOL $TESTPOOL2 log_mustnot poolexists $TESTPOOL2 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am index aab4de0e7c..5553061c67 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am @@ -4,7 +4,4 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_status_001_pos.ksh \ zpool_status_002_pos.ksh \ - zpool_status_003_pos.ksh \ - zpool_status_-c_disable.ksh \ - zpool_status_-c_homedir.ksh \ - zpool_status_-c_searchpath.ksh + zpool_status_features_001_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh new file mode 100755 index 0000000000..635125fc0d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib + +# +# DESCRIPTION: +# Verify zpool status only recommends upgrading the pool when +# the enabled features don't match those in the feature set. +# +# STRATEGY: +# 1. Create a pool with a known feature set. +# 2. Verify there is no `zpool status` notice to upgrade the pool. +# 3. Set the pool compatibility to a newer feature set. +# 4. Verify there is a `zpool status` notice to upgrade the pool. 
+# + +verify_runnable "global" + +function cleanup +{ + datasetexists $TESTPOOL1 && log_must zpool destroy $TESTPOOL1 + rm -f $FILEDEV +} + +FILEDEV="$TEST_BASE_DIR/filedev.$$" + +log_onexit cleanup + +log_assert "check 'zpool status' upgrade notice" + +log_must truncate -s $MINVDEVSIZE $FILEDEV +log_must zpool create -f -o compatibility=compat-2018 $TESTPOOL1 $FILEDEV +log_mustnot check_pool_status $TESTPOOL1 "status" "features are not enabled" + +log_must zpool set compatibility=compat-2020 $TESTPOOL1 +log_must check_pool_status $TESTPOOL1 "status" "features are not enabled" + +log_pass "check 'zpool status' upgrade notice" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am index d2d3b4ae88..0411ab4e00 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am @@ -3,6 +3,7 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ zpool_trim_attach_detach_add_remove.ksh \ + zpool_trim_fault_export_import_online.ksh \ zpool_trim_import_export.ksh \ zpool_trim_multiple.ksh \ zpool_trim_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh index cdcf038ad1..09489600b3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh @@ -23,15 +23,21 @@ verify_runnable "global" -DISK1=${DISKS%% *} +if is_freebsd; then + log_unsupported "FreeBSD has no hole punching mechanism for the time being." 
+ diskinfo -v $DISKS | grep -qE 'No.*# TRIM/UNMAP support' && + log_unsupported "DISKS do not support discard (TRIM/UNMAP)" +else + DISK1=${DISKS%% *} -typeset -i max_discard=0 -if [[ -b $DEV_RDSKDIR/$DISK1 ]]; then - max_discard=$(lsblk -Dbn $DEV_RDSKDIR/$DISK1 | awk '{ print $4; exit }') -fi + typeset -i max_discard=0 + if is_disk_device $DEV_RDSKDIR/$DISK1; then + max_discard=$(lsblk -Dbn $DEV_RDSKDIR/$DISK1 | awk '{ print $4; exit }') + fi -if test $max_discard -eq 0; then - log_unsupported "DISKS do not support discard (TRIM/UNMAP)" + if test $max_discard -eq 0; then + log_unsupported "DISKS do not support discard (TRIM/UNMAP)" + fi fi log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib index 1c54c66c12..e8d43cc8c7 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib @@ -30,6 +30,23 @@ function trim_progress # pool disk trim_prog_line "$1" "$2" | sed 's/.*(\([0-9]\{1,\}\)% trimmed.*/\1/g' } +# +# Write a bit of data and sync several times. 
+# +function sync_and_rewrite_some_data_a_few_times +{ + typeset pool=$1 + typeset -i a_few_times=${2:-20} + + typeset file="/$pool/tmpfile" + for i in {0..$a_few_times}; do + dd if=/dev/urandom of=${file} bs=128k count=10 + sync_pool "$pool" + done + + return 0 +} + function cleanup { if poolexists $TESTPOOL; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh new file mode 100755 index 0000000000..6bb9fc346d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tim Chase. All rights reserved. +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib + +# +# DESCRIPTION: +# Miscellaneous complex sequences of operations function as expected. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start trimming, fault, export, import, online and verify along +# the way that the trim was cancelled and not restarted. 
+# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 + +log_must zpool trim -r 128M $TESTPOOL $DISK1 +progress="$(trim_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Trimming did not start" + +log_must zpool offline -f $TESTPOOL $DISK1 +log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" +log_must eval "zpool status -t $TESTPOOL | grep $DISK1 | grep untrimmed" + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +# Note: the expected state here is unsupported since the faulted device +# cannot be checked to determine if it supports TRIM. +log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" +log_must eval "zpool status -t $TESTPOOL | grep $DISK1 | grep unsupported" + +log_must zpool online $TESTPOOL $DISK1 +log_must zpool clear $TESTPOOL $DISK1 +log_must check_vdev_state $TESTPOOL $DISK1 "ONLINE" +log_must eval "zpool status -t $TESTPOOL | grep $DISK1 | grep untrimmed" + +log_pass "Trimming behaves as expected at each step of:" \ + "trim + fault + export + import + online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh index 681cd12f71..afc9a2ed19 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh @@ -27,7 +27,7 @@ # Trimming automatically resumes across offline/online. # # STRATEGY: -# 1. Create a pool with a two-way mirror. +# 1. Create a pool with a two-way mirror, prepare blocks to trim. # 2. Start trimming one of the disks and verify that trimming is active. # 3. Offline the disk. # 4. Online the disk. 
@@ -39,8 +39,10 @@ DISK1=${DISKS%% *} DISK2="$(echo $DISKS | cut -d' ' -f2)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 -log_must zpool trim -r 128M $TESTPOOL $DISK1 +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 -O recordsize=4k +sync_and_rewrite_some_data_a_few_times $TESTPOOL + +log_must zpool trim -r 1 $TESTPOOL $DISK1 log_must zpool offline $TESTPOOL $DISK1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh index 58e0ef77cc..5d14b74ecc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh @@ -44,9 +44,9 @@ function cleanup rm -rf "$TESTDIR" fi - log_must set_tunable64 zfs_trim_metaslab_skip 0 - log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min - log_must set_tunable64 zfs_vdev_min_ms_count $vdev_min_ms_count + log_must set_tunable64 TRIM_METASLAB_SKIP 0 + log_must set_tunable64 TRIM_EXTENT_BYTES_MIN $trim_extent_bytes_min + log_must set_tunable64 VDEV_MIN_MS_COUNT $vdev_min_ms_count } log_onexit cleanup @@ -55,12 +55,12 @@ LARGEFILE="$TESTDIR/largefile" # The minimum number of metaslabs is increased in order to simulate the # behavior of partial trimming on a more typically sized 1TB disk. -typeset vdev_min_ms_count=$(get_tunable zfs_vdev_min_ms_count) -log_must set_tunable64 zfs_vdev_min_ms_count 64 +typeset vdev_min_ms_count=$(get_tunable VDEV_MIN_MS_COUNT) +log_must set_tunable64 VDEV_MIN_MS_COUNT 64 # Minimum trim size is decreased to verify all trim sizes. 
-typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min) -log_must set_tunable64 zfs_trim_extent_bytes_min 4096 +typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 log_must mkdir "$TESTDIR" log_must truncate -s $LARGESIZE "$LARGEFILE" @@ -85,9 +85,9 @@ log_must test $new_size -gt $((4 * floor(LARGESIZE * 0.70) )) # Perform a partial trim, we expect it to skip most of the new metaslabs # which have never been used and therefore do not need be trimmed. -log_must set_tunable64 zfs_trim_metaslab_skip 1 +log_must set_tunable64 TRIM_METASLAB_SKIP 1 log_must zpool trim $TESTPOOL -log_must set_tunable64 zfs_trim_metaslab_skip 0 +log_must set_tunable64 TRIM_METASLAB_SKIP 0 log_must zpool sync while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh index faf134fbbd..68e9909007 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh @@ -39,8 +39,10 @@ DISK2="$(echo $DISKS | cut -d' ' -f2)" DISK3="$(echo $DISKS | cut -d' ' -f3)" log_must zpool list -v -log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -log_must zpool trim -r 128M $TESTPOOL $DISK1 +log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -O recordsize=4k +sync_and_rewrite_some_data_a_few_times $TESTPOOL + +log_must zpool trim -r 1 $TESTPOOL $DISK1 [[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] && \ log_fail "Trim did not start" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh index eaa4d90444..fbb0c29104 100755 --- 
a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh @@ -20,29 +20,29 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib +. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib # # DESCRIPTION: -# Starting and stopping an initialize works. +# Starting and stopping a trim works. # # STRATEGY: # 1. Create a one-disk pool. -# 2. Start initializing and verify that initializing is active. -# 3. Cancel initializing and verify that initializing is not active. +# 2. Start trimming and verify that trimming is active. +# 3. Cancel trimming and verify that trimming is not active. # DISK1=${DISKS%% *} log_must zpool create -f $TESTPOOL $DISK1 -log_must zpool initialize $TESTPOOL +log_must zpool trim $TESTPOOL -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ - log_fail "Initialize did not start" +[[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "TRIM did not start" -log_must zpool initialize -c $TESTPOOL +log_must zpool trim -c $TESTPOOL -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ - log_fail "Initialize did not stop" +[[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "TRIM did not stop" -log_pass "Initialize start + cancel works" +log_pass "TRIM start + cancel works" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh index a216d132fb..d5aaf49aeb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh @@ -43,7 +43,7 @@ function cleanup rm -rf "$TESTDIR" fi - log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min + log_must set_tunable64 
TRIM_EXTENT_BYTES_MIN $trim_extent_bytes_min } log_onexit cleanup @@ -51,8 +51,8 @@ LARGESIZE=$((MINVDEVSIZE * 4)) LARGEFILE="$TESTDIR/largefile" # Reduce trim size to allow for tighter tolerance below when checking. -typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min) -log_must set_tunable64 zfs_trim_extent_bytes_min 4096 +typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 log_must mkdir "$TESTDIR" log_must truncate -s $LARGESIZE "$LARGEFILE" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile.am index 18311ed2ca..c7f321a2f6 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile.am @@ -12,7 +12,8 @@ dist_pkgdata_SCRIPTS = \ zpool_upgrade_006_neg.ksh \ zpool_upgrade_007_pos.ksh \ zpool_upgrade_008_pos.ksh \ - zpool_upgrade_009_neg.ksh + zpool_upgrade_009_neg.ksh \ + zpool_upgrade_features_001_pos.ksh dist_pkgdata_DATA = \ zpool_upgrade.cfg \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib index 7b018da1b6..783ae54e71 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib @@ -138,22 +138,3 @@ function check_poolversion log_fail "$pool: zpool reported version $actual, expected $vers" fi } - -# A simple function to get a random number between two bounds -# probably not the most efficient for large ranges, but it's okay. -# Note since we're using $RANDOM, 32767 is the largest number we -# can accept as the upper bound. 
-# $1 lower bound -# $2 upper bound -function random -{ - typeset min=$1 - typeset max=$2 - typeset rand=0 - - while [[ $rand -lt $min ]] ; do - rand=$(( $RANDOM % $max + 1)) - done - - echo $rand -} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh index adc1ba47fc..696c8c66cc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh @@ -42,7 +42,7 @@ # # STRATEGY: # 1. Import pools of all versions -# 2. Setup a test enviorment over the old pools. +# 2. Setup a test environment over the old pools. # 3. Verify the commands related to 'zfs upgrade' succeed as expected. # diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh index 173d7f68c8..d930919652 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh @@ -30,6 +30,7 @@ # Copyright 2015 Nexenta Systems, Inc. All rights reserved. # +. $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib # @@ -67,7 +68,7 @@ MAX_VER=15 for ver_old in $VERSIONS; do typeset -n pool_name=ZPOOL_VERSION_${ver_old}_NAME - typeset ver_new=$(random $ver_old $MAX_VER) + typeset -i ver_new=$(random_int_between $ver_old $MAX_VER) create_old_pool $ver_old log_must zpool upgrade -V $ver_new $pool_name > /dev/null diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh new file mode 100755 index 0000000000..5170d31b46 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib + +# +# DESCRIPTION: +# Verify pools can be upgraded to known feature sets. +# +# STRATEGY: +# 1. Create a pool with a known feature set. +# 2. Verify only those features are active/enabled. +# 3. 
Upgrade the pool to a newer feature set. +# 4. Verify only those features are active/enabled. +# + +verify_runnable "global" + +function cleanup +{ + datasetexists $TESTPOOL1 && log_must zpool destroy $TESTPOOL1 + rm -f $FILEDEV +} + +FILEDEV="$TEST_BASE_DIR/filedev.$$" + +log_onexit cleanup + +log_assert "verify pools can be upgraded to known feature sets." + +log_must truncate -s $MINVDEVSIZE $FILEDEV +log_must zpool create -f -o compatibility=compat-2018 $TESTPOOL1 $FILEDEV +check_feature_set $TESTPOOL1 compat-2018 +log_mustnot check_pool_status $TESTPOOL1 "status" "features are not enabled" + +log_must zpool set compatibility=compat-2020 $TESTPOOL1 +log_must check_pool_status $TESTPOOL1 "status" "features are not enabled" + +log_must zpool upgrade $TESTPOOL1 +check_feature_set $TESTPOOL1 compat-2020 +log_mustnot check_pool_status $TESTPOOL1 "status" "features are not enabled" + +log_pass "verify pools can be upgraded to known feature sets." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile.am new file mode 100644 index 0000000000..45ab8e3d4f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile.am @@ -0,0 +1,22 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_wait +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zpool_wait_discard.ksh \ + zpool_wait_freeing.ksh \ + zpool_wait_initialize_basic.ksh \ + zpool_wait_initialize_cancel.ksh \ + zpool_wait_initialize_flag.ksh \ + zpool_wait_multiple.ksh \ + zpool_wait_no_activity.ksh \ + zpool_wait_remove.ksh \ + zpool_wait_remove_cancel.ksh \ + zpool_wait_trim_basic.ksh \ + zpool_wait_trim_cancel.ksh \ + zpool_wait_trim_flag.ksh \ + zpool_wait_usage.ksh + +dist_pkgdata_DATA = \ + zpool_wait.kshlib + +SUBDIRS = scan diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/cleanup.ksh 
similarity index 85% rename from tests/zfs-tests/tests/functional/cli_root/zfs_remap/setup.ksh rename to tests/zfs-tests/tests/functional/cli_root/zpool_wait/cleanup.ksh index 4497dbd746..456d2d0c2d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_remap/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/cleanup.ksh @@ -1,5 +1,6 @@ #!/bin/ksh -p # +# # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version @@ -11,7 +12,9 @@ # # -# Copyright 2018, loli10K . All rights reserved. +# Copyright (c) 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am new file mode 100644 index 0000000000..451d83a79a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am @@ -0,0 +1,11 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_wait/scan +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zpool_wait_replace.ksh \ + zpool_wait_replace_cancel.ksh \ + zpool_wait_rebuild.ksh \ + zpool_wait_resilver.ksh \ + zpool_wait_scrub_basic.ksh \ + zpool_wait_scrub_cancel.ksh \ + zpool_wait_scrub_flag.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/cleanup.ksh new file mode 100755 index 0000000000..456d2d0c2d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/cleanup.ksh @@ -0,0 +1,20 @@ +#!/bin/ksh -p +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/setup.ksh new file mode 100755 index 0000000000..8a6a1a25b1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +verify_runnable "global" +verify_disk_count $DISKS 3 + +# +# Set up a pool for use in the tests that do scrubbing and resilvering. Each +# test leaves the pool in the same state as when it started, so it is safe to +# share the same setup. 
+# +log_must zpool create -f $TESTPOOL $DISK1 +log_must dd if=/dev/urandom of="/$TESTPOOL/testfile" bs=1k count=256k + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh new file mode 100755 index 0000000000..8cd5864597 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for sequential resilvering to complete. +# +# STRATEGY: +# 1. Attach a device to the pool so that sequential resilvering starts. +# 2. Start 'zpool wait'. +# 3. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 4. Repeat 1-3, except using the '-w' flag with 'zpool attach' instead of using +# 'zpool wait'. +# + +function cleanup +{ + remove_io_delay + kill_if_running $pid + get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK2 +} + +typeset -r IN_PROGRESS_CHECK="is_pool_resilvering $TESTPOOL" +typeset pid + +log_onexit cleanup + +add_io_delay $TESTPOOL + +# Test 'zpool wait -t resilver' +log_must zpool attach -s $TESTPOOL $DISK1 $DISK2 +log_bkgrnd zpool wait -t resilver $TESTPOOL +pid=$! 
+check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_must zpool detach $TESTPOOL $DISK2 + +# Test 'zpool attach -w' +log_bkgrnd zpool attach -sw $TESTPOOL $DISK1 $DISK2 +pid=$! +while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do + log_must sleep .5 +done +check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_pass "'zpool wait -t resilver' and 'zpool attach -w' work." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh new file mode 100755 index 0000000000..06df7b51cf --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when replacing disks. +# +# STRATEGY: +# 1. Attach a disk to pool to form two-way mirror. +# 2. Start a replacement of the new disk. +# 3. Start 'zpool wait'. +# 4. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 5. Repeat 2-4, except using the '-w' flag with 'zpool replace' instead of +# using 'zpool wait'. 
+# + +function cleanup +{ + remove_io_delay + kill_if_running $pid + get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK2 + get_disklist $TESTPOOL | grep $DISK3 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK3 +} + +function in_progress +{ + zpool status $TESTPOOL | grep 'replacing-' >/dev/null +} + +typeset pid + +log_onexit cleanup + +log_must zpool attach -w $TESTPOOL $DISK1 $DISK2 + +add_io_delay $TESTPOOL + +# Test 'zpool wait -t replace' +log_must zpool replace $TESTPOOL $DISK2 $DISK3 +log_bkgrnd zpool wait -t replace $TESTPOOL +pid=$! +check_while_waiting $pid in_progress + +# Test 'zpool replace -w' +log_bkgrnd zpool replace -w $TESTPOOL $DISK3 $DISK2 +pid=$! +while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do + log_must sleep .5 +done +check_while_waiting $pid in_progress + +log_pass "'zpool wait -t replace' and 'zpool replace -w' work." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh new file mode 100755 index 0000000000..a899e9f99f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when a replacing disk is detached before the replacement +# completes. 
+# +# STRATEGY: +# 1. Attach a disk to pool to form two-way mirror. +# 2. Modify tunable so that resilver won't complete while test is running. +# 3. Start a replacement of the new disk. +# 4. Start a process that waits for the replace. +# 5. Wait a few seconds and then check that the wait process is actually +# waiting. +# 6. Cancel the replacement by detaching the replacing disk. +# 7. Check that the wait process returns reasonably promptly. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + kill_if_running $pid + get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK2 + get_disklist $TESTPOOL | grep $DISK3 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK3 + log_must zpool sync $TESTPOOL +} + +typeset pid + +log_onexit cleanup + +log_must zpool attach -w $TESTPOOL $DISK1 $DISK2 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool replace $TESTPOOL $DISK2 $DISK3 +log_bkgrnd zpool wait -t replace $TESTPOOL +pid=$! + +log_must sleep 3 +proc_must_exist $pid + +log_must zpool detach $TESTPOOL $DISK3 +bkgrnd_proc_succeeded $pid + +log_pass "'zpool wait -t replace' returns when replacing disk is detached." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh new file mode 100755 index 0000000000..a938901f76 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for resilvering to complete. +# +# STRATEGY: +# 1. Attach a device to the pool so that resilvering starts. +# 2. Start 'zpool wait'. +# 3. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 4. Repeat 1-3, except using the '-w' flag with 'zpool attach' instead of using +# 'zpool wait'. +# + +function cleanup +{ + remove_io_delay + kill_if_running $pid + get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK2 +} + +typeset -r IN_PROGRESS_CHECK="is_pool_resilvering $TESTPOOL" +typeset pid + +log_onexit cleanup + +add_io_delay $TESTPOOL + +# Test 'zpool wait -t resilver' +log_must zpool attach $TESTPOOL $DISK1 $DISK2 +log_bkgrnd zpool wait -t resilver $TESTPOOL +pid=$! +check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_must zpool detach $TESTPOOL $DISK2 + +# Test 'zpool attach -w' +log_bkgrnd zpool attach -w $TESTPOOL $DISK1 $DISK2 +pid=$! +while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do + log_must sleep .5 +done +check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_pass "'zpool wait -t resilver' and 'zpool attach -w' work." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh new file mode 100755 index 0000000000..d4bb170817 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh @@ -0,0 +1,49 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for a scrub to complete. +# +# STRATEGY: +# 1. Start a scrub. +# 2. Start 'zpool wait -t scrub'. +# 3. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# + +function cleanup +{ + remove_io_delay + kill_if_running $pid +} + +typeset pid + +log_onexit cleanup + +# Slow down scrub so that we actually have something to wait for. +add_io_delay $TESTPOOL + +log_must zpool scrub $TESTPOOL +log_bkgrnd zpool wait -t scrub $TESTPOOL +pid=$! +check_while_waiting $pid "is_pool_scrubbing $TESTPOOL" + +log_pass "'zpool wait -t scrub' works." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh new file mode 100755 index 0000000000..7adb3b2b82 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when a scrub is paused or canceled. +# +# STRATEGY: +# 1. Modify tunable so that scrubs won't complete while test is running. +# 2. Start a scrub. +# 3. Start a process that waits for the scrub. +# 4. Wait a few seconds and then check that the wait process is actually +# waiting. +# 5. Pause the scrub. +# 6. Check that the wait process returns reasonably promptly. +# 7. Repeat 2-6, except stop the scrub instead of pausing it. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + kill_if_running $pid + is_pool_scrubbing $TESTPOOL && log_must zpool scrub -s $TESTPOOL +} + +function do_test +{ + typeset stop_cmd=$1 + + log_must zpool scrub $TESTPOOL + log_bkgrnd zpool wait -t scrub $TESTPOOL + pid=$! + + log_must sleep 3 + proc_must_exist $pid + + log_must eval "$stop_cmd" + bkgrnd_proc_succeeded $pid +} + +typeset pid + +log_onexit cleanup + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +do_test "zpool scrub -p $TESTPOOL" +do_test "zpool scrub -s $TESTPOOL" + +log_pass "'zpool wait -t scrub' works when scrub is canceled." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh new file mode 100755 index 0000000000..aac62cf460 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. 
All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool scrub -w' waits while scrub is in progress. +# +# STRATEGY: +# 1. Start a scrub with the -w flag. +# 2. Wait a few seconds and then check that the wait process is actually +# waiting. +# 3. Stop the scrub, make sure that the command returns reasonably promptly. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + kill_if_running $pid +} + +typeset pid + +log_onexit cleanup + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_bkgrnd zpool scrub -w $TESTPOOL +pid=$! + +log_must sleep 3 +proc_must_exist $pid + +log_must zpool scrub -s $TESTPOOL +bkgrnd_proc_succeeded $pid + +log_pass "'zpool scrub -w' works." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/setup.ksh new file mode 100755 index 0000000000..5a9af18464 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/setup.ksh @@ -0,0 +1,23 @@ +#!/bin/ksh -p +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib +verify_runnable "global" + +verify_disk_count $DISKS 3 + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib new file mode 100644 index 0000000000..b413f6e9f9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib @@ -0,0 +1,124 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +typeset -a disk_array=($(find_disks $DISKS)) + +typeset -r DISK1=${disk_array[0]} +typeset -r DISK2=${disk_array[1]} +typeset -r DISK3=${disk_array[2]} + +# +# When the condition it is waiting for becomes true, 'zpool wait' should return +# promptly. We want to enforce this, but any check will be racy because it will +# take some small but indeterminate amount of time for the waiting thread to be +# woken up and for the process to exit. +# +# To deal with this, we provide a grace period after the condition becomes true +# during which 'zpool wait' can exit. If it hasn't exited by the time the grace +# period expires we assume something is wrong and fail the test. While there is +# no value that can really be correct, the idea is we choose something large +# enough that it shouldn't cause issues in practice. 
+# +typeset -r WAIT_EXIT_GRACE=2.0 + +function add_io_delay # pool +{ + for disk in $(get_disklist $1); do + log_must zinject -d $disk -D20:1 $1 + done +} + +function remove_io_delay +{ + log_must zinject -c all +} + +function proc_exists # pid +{ + ps -p $1 >/dev/null +} + +function proc_must_exist # pid +{ + proc_exists $1 || log_fail "zpool process exited too soon" +} + +function proc_must_not_exist # pid +{ + proc_exists $1 && log_fail "zpool process took too long to exit" +} + +function get_time +{ + date +'%H:%M:%S' +} + +function kill_if_running +{ + typeset pid=$1 + [[ $pid ]] && proc_exists $pid && log_must kill -s TERM $pid +} + +# Log a command and then start it running in the background +function log_bkgrnd +{ + log_note "$(get_time) Starting cmd in background '$@'" + "$@" & +} + +# Check that a background process has completed and exited with a status of 0 +function bkgrnd_proc_succeeded +{ + typeset pid=$1 + + log_must sleep $WAIT_EXIT_GRACE + + proc_must_not_exist $pid + wait $pid || log_fail "zpool process exited with status $?" + log_note "$(get_time) wait completed successfully" +} + +# +# Check that 'zpool wait' returns reasonably promptly after the condition +# waited for becomes true, and not before. +# +function check_while_waiting +{ + # The pid of the waiting process + typeset wait_proc_pid=$1 + # A check that should be true while the activity is in progress + typeset activity_check=$2 + + log_note "$(get_time) waiting for process $wait_proc_pid using" \ + "activity check '$activity_check'" + while proc_exists $wait_proc_pid && eval "$activity_check"; do + log_must sleep .5 + done + + # + # If the activity being waited on is still in progress, then zpool wait + # exited too soon. 
+ # + log_mustnot eval "$activity_check" + + bkgrnd_proc_succeeded $wait_proc_pid +} + +# Whether any vdev in the given pool is initializing +function is_vdev_initializing # pool +{ + zpool status -i "$1" | grep 'initialized, started' >/dev/null +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_discard.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_discard.ksh new file mode 100755 index 0000000000..8d5747e090 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_discard.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for checkpoint discard to complete. +# +# STRATEGY: +# 1. Create a pool. +# 2. Add some data to the pool. +# 3. Checkpoint the pool and delete the data so that the space is unique to the +# checkpoint. +# 4. Discard the checkpoint using the '-w' flag. +# 5. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 6. Repeat 2-5, but using 'zpool wait' instead of the '-w' flag. 
+# + +function cleanup +{ + log_must zinject -c all + poolexists $TESTPOOL && destroy_pool $TESTPOOL + kill_if_running $pid + + [[ $default_mem_limit ]] && log_must set_tunable64 \ + SPA_DISCARD_MEMORY_LIMIT $default_mem_limit +} + +function do_test +{ + typeset use_wait_flag=$1 + + log_must dd if=/dev/urandom of="$TESTFILE" bs=128k count=1k + log_must zpool checkpoint $TESTPOOL + + # Make sure bulk of space is unique to checkpoint + log_must rm "$TESTFILE" + + log_must zinject -d $DISK1 -D20:1 $TESTPOOL + + if $use_wait_flag; then + log_bkgrnd zpool checkpoint -dw $TESTPOOL + pid=$! + + while ! is_pool_discarding $TESTPOOL && proc_exists $pid; do + log_must sleep .5 + done + else + log_must zpool checkpoint -d $TESTPOOL + log_bkgrnd zpool wait -t discard $TESTPOOL + pid=$! + fi + + check_while_waiting $pid "is_pool_discarding $TESTPOOL" + log_must zinject -c all +} + +typeset -r TESTFILE="/$TESTPOOL/testfile" +typeset pid default_mem_limit + +log_onexit cleanup + +default_mem_limit=$(get_tunable SPA_DISCARD_MEMORY_LIMIT) +log_must set_tunable64 SPA_DISCARD_MEMORY_LIMIT 32 + +log_must zpool create $TESTPOOL $DISK1 + +do_test true +do_test false + +log_pass "'zpool wait -t discard' and 'zpool checkpoint -dw' work." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_freeing.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_freeing.ksh new file mode 100755 index 0000000000..7f5a9e6a8d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_freeing.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for background freeing to complete. +# +# STRATEGY: +# 1. Create a pool. +# 2. Modify tunables to make sure freeing is slow enough to observe. +# 3. Create a file system with some data. +# 4. Destroy the file system and call 'zpool wait'. +# 5. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 6. Repeat 3-5, except destroy a snapshot instead of a filesystem. +# 7. Repeat 3-5, except destroy a clone. +# + +function cleanup +{ + log_must set_tunable64 ASYNC_BLOCK_MAX_BLOCKS $default_async_block_max_blocks + log_must set_tunable64 LIVELIST_MAX_ENTRIES $default_max_livelist_entries + log_must set_tunable64 LIVELIST_MIN_PERCENT_SHARED $default_min_pct_shared + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + kill_if_running $pid +} + +function test_wait +{ + log_bkgrnd zpool wait -t free $TESTPOOL + pid=$! + check_while_waiting $pid '[[ $(get_pool_prop freeing $TESTPOOL) != "0" ]]' +} + +typeset -r FS="$TESTPOOL/$TESTFS1" +typeset -r SNAP="$FS@snap1" +typeset -r CLONE="$TESTPOOL/clone" +typeset pid default_max_livelist_entries default_min_pct_shared +typeset default_async_block_max_blocks + +log_onexit cleanup + +log_must zpool create $TESTPOOL $DISK1 + +# +# Limit the number of blocks that can be freed in a single txg. This slows down +# freeing so that we actually have something to wait for. +# +default_async_block_max_blocks=$(get_tunable ASYNC_BLOCK_MAX_BLOCKS) +log_must set_tunable64 ASYNC_BLOCK_MAX_BLOCKS 8 +# +# Space from clones gets freed one livelist per txg instead of being controlled +# by zfs_async_block_max_blocks. Limit the rate at which space is freed by +# limiting the size of livelists so that we end up with a number of them. 
+# +default_max_livelist_entries=$(get_tunable LIVELIST_MAX_ENTRIES) +log_must set_tunable64 LIVELIST_MAX_ENTRIES 16 +# Don't disable livelists, no matter how much clone diverges from snapshot +default_min_pct_shared=$(get_tunable LIVELIST_MIN_PERCENT_SHARED) +log_must set_tunable64 LIVELIST_MIN_PERCENT_SHARED -1 + +# +# Test waiting for space from destroyed filesystem to be freed +# +log_must zfs create "$FS" +log_must dd if=/dev/zero of="/$FS/testfile" bs=1M count=128 +log_must zfs destroy "$FS" +test_wait + +# +# Test waiting for space from destroyed snapshot to be freed +# +log_must zfs create "$FS" +log_must dd if=/dev/zero of="/$FS/testfile" bs=1M count=128 +log_must zfs snapshot "$SNAP" +# Make sure bulk of space is unique to snapshot +log_must rm "/$FS/testfile" +log_must zfs destroy "$SNAP" +test_wait + +# +# Test waiting for space from destroyed clone to be freed +# +log_must zfs snapshot "$SNAP" +log_must zfs clone "$SNAP" "$CLONE" +# Add some data to the clone +for i in {1..50}; do + log_must dd if=/dev/urandom of="/$CLONE/testfile$i" bs=1k count=512 + # Force each new file to be tracked by a new livelist + log_must zpool sync $TESTPOOL +done +log_must zfs destroy "$CLONE" +test_wait + +log_pass "'zpool wait -t freeing' works." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh new file mode 100755 index 0000000000..924ae5f0d8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for devices to complete initializing +# +# STRATEGY: +# 1. Create a pool. +# 2. Modify a tunable to make sure initializing is slow enough to observe. +# 3. Start initializing the vdev in the pool. +# 4. Start 'zpool wait'. +# 5. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# + +function cleanup +{ + kill_if_running $pid + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + [[ -d "$TESTDIR" ]] && log_must rm -r "$TESTDIR" + + [[ "$default_chunk_sz" ]] && \ + log_must set_tunable64 INITIALIZE_CHUNK_SIZE $default_chunk_sz +} + +typeset -r FILE_VDEV="$TESTDIR/file_vdev" +typeset pid default_chunk_sz + +log_onexit cleanup + +default_chunk_sz=$(get_tunable INITIALIZE_CHUNK_SIZE) +log_must set_tunable64 INITIALIZE_CHUNK_SIZE 2048 + +log_must mkdir "$TESTDIR" +log_must mkfile 256M "$FILE_VDEV" +log_must zpool create -f $TESTPOOL "$FILE_VDEV" + +log_must zpool initialize $TESTPOOL "$FILE_VDEV" + +log_bkgrnd zpool wait -t initialize $TESTPOOL +pid=$! + +check_while_waiting $pid "is_vdev_initializing $TESTPOOL" + +log_pass "'zpool wait -t initialize' works." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh new file mode 100755 index 0000000000..8b19ee62aa --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when an initialization operation is canceled. +# +# STRATEGY: +# 1. Create a pool. +# 2. Modify a tunable to make sure initializing is slow enough that it won't +# complete before the test finishes. +# 3. Start initializing the vdev in the pool. +# 4. Start 'zpool wait'. +# 5. Wait a few seconds and then check that the wait process is actually +# waiting. +# 6. Cancel the initialization of the device. +# 7. Check that the wait process returns reasonably promptly. +# 8. Repeat 3-7, except pause the initialization instead of canceling it. +# + +function cleanup +{ + kill_if_running $pid + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + [[ "$default_chunk_sz" ]] && + log_must set_tunable64 INITIALIZE_CHUNK_SIZE $default_chunk_sz +} + +function do_test +{ + typeset stop_cmd=$1 + + log_must zpool initialize $TESTPOOL $DISK1 + + log_bkgrnd zpool wait -t initialize $TESTPOOL + pid=$! 
+ + # Make sure that we are really waiting + log_must sleep 3 + proc_must_exist $pid + + # Stop initialization and make sure process returns + log_must eval "$stop_cmd" + bkgrnd_proc_succeeded $pid +} + +typeset pid default_chunk_sz + +log_onexit cleanup + +# Make sure the initialization takes a while +default_chunk_sz=$(get_tunable INITIALIZE_CHUNK_SIZE) +log_must set_tunable64 INITIALIZE_CHUNK_SIZE 512 + +log_must zpool create $TESTPOOL $DISK1 + +do_test "zpool initialize -c $TESTPOOL $DISK1" +do_test "zpool initialize -s $TESTPOOL $DISK1" + +log_pass "'zpool wait' works when initialization is stopped before completion." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh new file mode 100755 index 0000000000..8c8c45a51e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# -w flag for 'zpool initialize' waits for the completion of all and only those +# initializations kicked off by that invocation. +# +# STRATEGY: +# 1. Create a pool with 3 disks. +# 2. Start initializing disks 1 and 2 with one invocation of +# 'zpool initialize -w' +# 3. Start initializing disk 3 with a second invocation of 'zpool initialize -w' +# 4. Cancel the initialization of disk 1. 
Check that neither waiting process +# exits. +# 5. Cancel the initialization of disk 3. Check that only the second waiting +# process exits. +# 6. Cancel the initialization of disk 2. Check that the first waiting process +# exits. +# + +function cleanup +{ + kill_if_running $init12_pid + kill_if_running $init3_pid + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + [[ "$default_chunk_sz" ]] && + log_must set_tunable64 INITIALIZE_CHUNK_SIZE $default_chunk_sz +} + +typeset init12_pid init3_pid default_chunk_sz + +log_onexit cleanup + +log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 + +# Make sure the initialization takes a while +default_chunk_sz=$(get_tunable INITIALIZE_CHUNK_SIZE) +log_must set_tunable64 INITIALIZE_CHUNK_SIZE 512 + +log_bkgrnd zpool initialize -w $TESTPOOL $DISK1 $DISK2 +init12_pid=$! +log_bkgrnd zpool initialize -w $TESTPOOL $DISK3 +init3_pid=$! + +# Make sure that we are really waiting +log_must sleep 3 +proc_must_exist $init12_pid +proc_must_exist $init3_pid + +# +# Cancel initialization of one of disks started by init12, make sure neither +# process exits +# +log_must zpool initialize -c $TESTPOOL $DISK1 +proc_must_exist $init12_pid +proc_must_exist $init3_pid + +# +# Cancel initialization started by init3, make sure that process exits, but +# init12 doesn't +# +log_must zpool initialize -c $TESTPOOL $DISK3 +proc_must_exist $init12_pid +bkgrnd_proc_succeeded $init3_pid + +# Cancel last initialization started by init12, make sure it returns. +log_must zpool initialize -c $TESTPOOL $DISK2 +bkgrnd_proc_succeeded $init12_pid + +log_pass "'zpool initialize -w' works." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_multiple.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_multiple.ksh new file mode 100755 index 0000000000..a8107b94eb --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_multiple.ksh @@ -0,0 +1,83 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for multiple activities. +# +# STRATEGY: +# 1. Create a pool with some data. +# 2. Alternate running two different activities (scrub and initialize), +# making sure that they overlap such that one of the two is always +# running. +# 3. Wait for both activities with a single invocation of zpool wait. +# 4. Check that zpool wait doesn't return until both activities have +# stopped. 
+# + +function cleanup +{ + kill_if_running $pid + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + [[ "$default_chunk_sz" ]] && log_must set_tunable64 \ + INITIALIZE_CHUNK_SIZE $default_chunk_sz + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 +} + +typeset pid default_chunk_sz + +log_onexit cleanup + +log_must zpool create -f $TESTPOOL $DISK1 +log_must dd if=/dev/urandom of="/$TESTPOOL/testfile" bs=64k count=1k + +default_chunk_sz=$(get_tunable INITIALIZE_CHUNK_SIZE) +log_must set_tunable64 INITIALIZE_CHUNK_SIZE 512 +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool scrub $TESTPOOL + +log_bkgrnd zpool wait -t scrub,initialize $TESTPOOL +pid=$! + +log_must sleep 2 + +log_must zpool initialize $TESTPOOL $DISK1 +log_must zpool scrub -s $TESTPOOL + +log_must sleep 2 + +log_must zpool scrub $TESTPOOL +log_must zpool initialize -s $TESTPOOL $DISK1 + +log_must sleep 2 + +log_must zpool initialize $TESTPOOL $DISK1 +log_must zpool scrub -s $TESTPOOL + +log_must sleep 2 + +proc_must_exist $pid + +# Cancel last activity, zpool wait should return +log_must zpool initialize -s $TESTPOOL $DISK1 +bkgrnd_proc_succeeded $pid + +log_pass "'zpool wait' works when waiting for multiple activities." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh new file mode 100755 index 0000000000..f4819f37ad --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' returns immediately when there is no activity in progress. +# +# STRATEGY: +# 1. Create an empty pool with no activity +# 2. Run zpool wait with various activities, make sure it always returns +# promptly +# + +function cleanup { + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +typeset -r TIMEOUT_SECS=1 + +log_onexit cleanup +log_must zpool create $TESTPOOL $DISK1 + +# Wait for each activity +typeset activities=(free discard initialize replace remove resilver scrub) +for activity in ${activities[@]}; do + log_must timeout $TIMEOUT_SECS zpool wait -t $activity $TESTPOOL +done + +# Wait for multiple activities at the same time +log_must timeout $TIMEOUT_SECS zpool wait -t scrub,initialize $TESTPOOL +log_must timeout $TIMEOUT_SECS zpool wait -t free,remove,discard $TESTPOOL + +# Wait for all activities at the same time +log_must timeout $TIMEOUT_SECS zpool wait $TESTPOOL + +log_pass "'zpool wait' returns immediately when no activity is in progress." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove.ksh new file mode 100755 index 0000000000..19298d193c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for a device to be removed. +# +# STRATEGY: +# 1. Create a pool with two disks and some data. +# 2. Modify a tunable to make sure removal doesn't make any progress. +# 3. Start removing one of the disks. +# 4. Start 'zpool wait'. +# 5. Sleep for a few seconds and check that the process is actually waiting. +# 6. Modify tunable to allow removal to complete. +# 7. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 8. Repeat 1-7, except using the '-w' flag for 'zpool remove' instead of using +# 'zpool wait'. +# + +function cleanup +{ + kill_if_running $pid + log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 0 + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +function do_test +{ + typeset use_flag=$1 + + log_must zpool create -f $TESTPOOL $DISK1 $DISK2 + log_must dd if=/dev/urandom of="/$TESTPOOL/testfile" bs=1k count=16k + + # Start removal, but don't allow it to make any progress at first + log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 1 + + if $use_flag; then + log_bkgrnd zpool remove -w $TESTPOOL $DISK1 + pid=$! + + while ! is_pool_removing $TESTPOOL && proc_exists $pid; do + log_must sleep .5 + done + else + log_must zpool remove $TESTPOOL $DISK1 + log_bkgrnd zpool wait -t remove $TESTPOOL + pid=$! + fi + + # Make sure the 'zpool wait' is actually waiting + log_must sleep 3 + proc_must_exist $pid + + # Unpause removal, and wait for it to finish + log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 0 + check_while_waiting $pid "is_pool_removing $TESTPOOL" + + log_must zpool destroy $TESTPOOL +} + +log_onexit cleanup + +typeset pid + +do_test true +do_test false + +log_pass "'zpool wait -t remove' and 'zpool remove -w' work." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh new file mode 100755 index 0000000000..4373b57779 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when device removal is canceled. +# +# STRATEGY: +# 1. Create a pool with two disks and some data. +# 2. Modify a tunable to make sure removal won't complete while test is running. +# 3. Start removing one of the disks. +# 4. Start 'zpool wait'. +# 5. Sleep for a few seconds and check that the process is actually waiting. +# 6. Cancel the removal of the device. +# 7. Check that the wait process returns reasonably promptly. +# + +function cleanup +{ + kill_if_running $pid + log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 0 + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +typeset pid + +log_must zpool create -f $TESTPOOL $DISK1 $DISK2 + +log_must dd if=/dev/urandom of="/$TESTPOOL/testfile" bs=1k count=16k + +# Start removal, but don't allow it to make any progress +log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 1 +log_must zpool remove $TESTPOOL $DISK1 + +log_bkgrnd zpool wait -t remove $TESTPOOL +pid=$! 
+ +log_must sleep 3 +proc_must_exist $pid + +log_must zpool remove -s $TESTPOOL +bkgrnd_proc_succeeded $pid + +log_pass "'zpool wait -t remove' works when removal is canceled." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh new file mode 100755 index 0000000000..f047050ea0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for devices to finish being trimmed +# +# STRATEGY: +# 1. Create a pool. +# 2. Start trimming the vdev in the pool, making sure the rate is slow enough +# that the trim can be observed. +# 3. Start 'zpool wait'. +# 4. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# + +function cleanup +{ + kill_if_running $pid + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + [[ -d "$TESTDIR" ]] && log_must rm -r "$TESTDIR" +} + +# Check whether any vdevs in given pool are being trimmed +function trim_in_progress +{ + typeset pool="$1" + zpool status -t "$pool" | grep "trimmed, started" +} + +if is_freebsd; then + log_unsupported "FreeBSD has no hole punching mechanism for the time being." 
+fi + +typeset -r FILE_VDEV="$TESTDIR/file_vdev" +typeset pid + +log_onexit cleanup + +log_must mkdir "$TESTDIR" +log_must truncate -s 10G "$FILE_VDEV" +log_must zpool create -f $TESTPOOL "$FILE_VDEV" + +log_must zpool trim -r 2G $TESTPOOL "$FILE_VDEV" + +log_bkgrnd zpool wait -t trim $TESTPOOL +pid=$! + +check_while_waiting $pid "trim_in_progress $TESTPOOL" + +log_pass "'zpool wait -t trim' works." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh new file mode 100755 index 0000000000..26e1aa68e6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when a trim operation is canceled. +# +# STRATEGY: +# 1. Create a pool. +# 2. Start trimming the vdev in the pool, setting the rate low enough that the +# operation won't complete before the test finishes. +# 3. Start 'zpool wait'. +# 4. Wait a few seconds and then check that the wait process is actually +# waiting. +# 5. Cancel the trim. +# 6. Check that the wait process returns reasonably promptly. +# 7. Repeat 2-6, except pause the trim instead of canceling it. 
+# + +function cleanup +{ + kill_if_running $pid + poolexists $TESTPOOL && destroy_pool $TESTPOOL + [[ -d "$TESTDIR" ]] && log_must rm -r "$TESTDIR" +} + +function do_test +{ + typeset stop_cmd=$1 + + log_must zpool trim -r 1M $TESTPOOL "$FILE_VDEV" + + log_bkgrnd zpool wait -t trim $TESTPOOL + pid=$! + + # Make sure that we are really waiting + log_must sleep 3 + proc_must_exist $pid + + # Stop trimming and make sure process returns + log_must eval "$stop_cmd" + bkgrnd_proc_succeeded $pid +} + +if is_freebsd; then + log_unsupported "FreeBSD has no hole punching mechanism for the time being." +fi + +typeset pid +typeset -r FILE_VDEV="$TESTDIR/file_vdev1" + +log_onexit cleanup + +log_must mkdir "$TESTDIR" +log_must truncate -s 10G "$FILE_VDEV" +log_must zpool create -f $TESTPOOL "$FILE_VDEV" + +do_test "zpool trim -c $TESTPOOL $FILE_VDEV" +do_test "zpool trim -s $TESTPOOL $FILE_VDEV" + +log_pass "'zpool wait' works when trim is stopped before completion." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh new file mode 100755 index 0000000000..effccc1c34 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# -w flag for 'zpool trim' waits for trimming to complete for all and only those +# vdevs kicked off by that invocation. +# +# STRATEGY: +# 1. Create a pool with 3 vdevs. +# 2. Start trimming vdevs 1 and 2 with one invocation of 'zpool trim -w' +# 3. Start trimming vdev 3 with a second invocation of 'zpool trim -w' +# 4. Cancel the trim of vdev 1. Check that neither waiting process exits. +# 5. Cancel the trim of vdev 3. Check that only the second waiting process +# exits. +# 6. Cancel the trim of vdev 2. Check that the first waiting process exits. +# + +function cleanup +{ + kill_if_running $trim12_pid + kill_if_running $trim3_pid + poolexists $TESTPOOL && destroy_pool $TESTPOOL + [[ -d "$TESTDIR" ]] && log_must rm -r "$TESTDIR" +} + +if is_freebsd; then + log_unsupported "FreeBSD has no hole punching mechanism for the time being." +fi + +typeset trim12_pid trim3_pid +typeset -r VDEV1="$TESTDIR/file_vdev1" +typeset -r VDEV2="$TESTDIR/file_vdev2" +typeset -r VDEV3="$TESTDIR/file_vdev3" + +log_onexit cleanup + +log_must mkdir "$TESTDIR" +log_must truncate -s 10G "$VDEV1" "$VDEV2" "$VDEV3" +log_must zpool create -f $TESTPOOL "$VDEV1" "$VDEV2" "$VDEV3" + +log_bkgrnd zpool trim -r 1M -w $TESTPOOL "$VDEV1" "$VDEV2" +trim12_pid=$! +log_bkgrnd zpool trim -r 1M -w $TESTPOOL "$VDEV3" +trim3_pid=$! + +# Make sure that we are really waiting +log_must sleep 3 +proc_must_exist $trim12_pid +proc_must_exist $trim3_pid + +# +# Cancel trim of one of disks started by trim12, make sure neither +# process exits +# +log_must zpool trim -c $TESTPOOL "$VDEV1" +proc_must_exist $trim12_pid +proc_must_exist $trim3_pid + +# +# Cancel trim started by trim3, make sure that process exits, but +# trim12 doesn't +# +log_must zpool trim -c $TESTPOOL "$VDEV3" +proc_must_exist $trim12_pid +bkgrnd_proc_succeeded $trim3_pid + +# Cancel last trim started by trim12, make sure it returns. 
+log_must zpool trim -c $TESTPOOL "$VDEV2" +bkgrnd_proc_succeeded $trim12_pid + +log_pass "'zpool trim -w' works." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_usage.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_usage.ksh new file mode 100755 index 0000000000..2d6f897092 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_usage.ksh @@ -0,0 +1,47 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' behaves sensibly when invoked incorrectly. +# +# STRATEGY: +# 1. Invoke 'zpool wait' incorrectly and check that it exits with a non-zero +# status. +# 2. Invoke 'zpool wait' with missing or bad arguments and check that it prints +# some sensible error message. +# + +function cleanup { + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup +log_must zpool create $TESTPOOL $DISK1 + +log_mustnot zpool wait + +zpool wait 2>&1 | grep -i usage || \ + log_fail "Usage message did not contain the word 'usage'." +zpool wait -t scrub fakepool 2>&1 | grep -i 'no such pool' || \ + log_fail "Error message did not contain phrase 'no such pool'." +zpool wait -t foo $TESTPOOL 2>&1 | grep -i 'invalid activity' || \ + log_fail "Error message did not contain phrase 'invalid activity'." + +log_pass "'zpool wait' behaves sensibly when invoked incorrectly." 
diff --git a/tests/zfs-tests/tests/functional/cli_user/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/Makefile.am index f1ff32e8d2..119f8ee187 100644 --- a/tests/zfs-tests/tests/functional/cli_user/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_user/Makefile.am @@ -2,4 +2,5 @@ SUBDIRS = \ misc \ zfs_list \ zpool_iostat \ - zpool_list + zpool_list \ + zpool_status diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/misc/Makefile.am index 29c0342909..2d38e65777 100644 --- a/tests/zfs-tests/tests/functional/cli_user/misc/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_user/misc/Makefile.am @@ -46,7 +46,7 @@ dist_pkgdata_SCRIPTS = \ arcstat_001_pos.ksh \ arc_summary_001_pos.ksh \ arc_summary_002_neg.ksh \ - dbufstat_001_pos.ksh + zpool_wait_privilege.ksh dist_pkgdata_DATA = \ misc.cfg diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh index a445fbb48c..befbea986e 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh @@ -48,6 +48,9 @@ else set -A args "" "-a" "-d" "-p 1" fi +# Without this, the below checks aren't going to work the way we hope... 
+set -o pipefail + typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do log_must eval "arc_summary ${args[i]} > /dev/null" diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg index 06d211ce18..1a96ff5d93 100644 --- a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg +++ b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg @@ -41,7 +41,7 @@ if is_linux; then # zfs get/set subcommands - ordered as per the list above so we # can iterate over both sets in an array PROP_VALS="\ - posixacl on \ + posix on \ fletcher2 on on \ on legacy none on \ 128K none on \ @@ -49,11 +49,37 @@ if is_linux; then # these are an alternate set of property values PROP_ALTVALS="\ - noacl off \ + nfsv4 off \ fletcher4 lzjb off \ off /tmp/zfstest 100M off \ 512 10m off \ hidden" +elif is_freebsd; then + PROP_NAMES="\ + acltype atime \ + checksum compression devices \ + exec mountpoint quota readonly \ + recordsize reservation setuid \ + snapdir" + + # these are a set of values we apply, for use when testing the + # zfs get/set subcommands - ordered as per the list above so we + # can iterate over both sets in an array + PROP_VALS="\ + posix on \ + fletcher2 on on \ + on legacy none on \ + 128K none on \ + visible" + + # these are an alternate set of property values + PROP_ALTVALS="\ + nfsv4 off \ + fletcher4 lzjb off \ + off /tmp/zfstest 100M off \ + 512 10m off \ + hidden" + else # these are the set of setable ZFS properties PROP_NAMES="\ diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/setup.ksh index bcf6a2296d..fc0ebde100 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/setup.ksh @@ -34,7 +34,7 @@ # This setup script is moderately complex, as it creates scenarios for all # of the tests included in this directory. 
Usually we'd want each test case -# to setup/teardown it's own configuration, but this would be time consuming +# to setup/teardown its own configuration, but this would be time consuming # given the nature of these tests. However, as a side-effect, one test # leaving the system in an unknown state could impact other test cases. diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zdb_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zdb_001_neg.ksh index 579ab12946..3adfc59f51 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/zdb_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zdb_001_neg.ksh @@ -56,11 +56,7 @@ function check_zdb function cleanup { - if [ -e $TEST_BASE_DIR/zdb_001_neg.$$.txt ] - then - rm $TEST_BASE_DIR/zdb_001_neg.$$.txt - fi - + rm -f $TEST_BASE_DIR/zdb_001_neg.$$.txt $TEST_BASE_DIR/zdb.$$ } verify_runnable "global" diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zfs_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zfs_001_neg.ksh index 46171caf9f..bfe8cf4bb2 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/zfs_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zfs_001_neg.ksh @@ -44,10 +44,7 @@ function cleanup { - if [ -e "$TEMPFILE" ] - then - rm -f "$TEMPFILE" - fi + rm -f "$TEMPFILE" } log_onexit cleanup @@ -55,7 +52,7 @@ log_assert "zfs shows a usage message when run as a user" TEMPFILE="$TEST_BASE_DIR/zfs_001_neg.$$.txt" -eval "zfs > $TEMPFILE 2>&1" +zfs > $TEMPFILE 2>&1 log_must grep "usage: zfs command args" "$TEMPFILE" log_must eval "awk '{if (length(\$0) > 80) exit 1}' < $TEMPFILE" diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zfs_share_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zfs_share_001_neg.ksh index 3f120c2438..14c35b3da6 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/zfs_share_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zfs_share_001_neg.ksh @@ -45,7 +45,7 @@ 
verify_runnable "global" -if is_linux; then +if is_linux || is_freebsd; then log_unsupported "Requires additional dependencies" fi diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zfs_unshare_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zfs_unshare_001_neg.ksh index 72ed1f5d3e..7ae86fc4ec 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/zfs_unshare_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zfs_unshare_001_neg.ksh @@ -45,7 +45,7 @@ verify_runnable "global" -if is_linux; then +if is_linux || is_freebsd; then log_unsupported "Requires additional dependencies" fi diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zpool_online_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zpool_online_001_neg.ksh index b89cf07ac1..cd29051535 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/zpool_online_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zpool_online_001_neg.ksh @@ -49,7 +49,7 @@ function check_for_online | grep ONLINE ) if [ -n "$RESULT" ] then - log_fail "A disk was brough online!" + log_fail "A disk was brought online!" fi } diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zpool_wait_privilege.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zpool_wait_privilege.ksh new file mode 100755 index 0000000000..42a2dd2c63 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zpool_wait_privilege.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# +# zpool wait works when run as an unprivileged user +# + +verify_runnable "global" + +log_must zpool wait $TESTPOOL + +# Make sure printing status works as unprivileged user. +output=$(zpool wait -H $TESTPOOL 1) || \ + log_fail "'zpool wait -H $TESTPOOL 1' failed" +# There should be one line of status output in a pool with no activity. +log_must eval '[[ $(wc -l <<<$output) -ge 1 ]]' + +log_pass "zpool wait works when run as a user" diff --git a/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_002_pos.ksh index 382b2cb7f0..4951097aca 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_002_pos.ksh @@ -74,7 +74,7 @@ else fi # -# datsets ordered by checksum options (note, Orange, Carrot & Banana have the +# datasets ordered by checksum options (note, Orange, Carrot & Banana have the # same checksum options, so ZFS should revert to sorting them alphabetically by # name) # diff --git a/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh index d881b831ff..8e9009bd55 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh @@ -57,9 +57,8 @@ function cleanup log_onexit cleanup log_assert "'zfs list -d ' should get expected output." 
-mntpnt=/var/tmp -DEPTH_OUTPUT="$mntpnt/depth_output" -EXPECT_OUTPUT="$mntpnt/expect_output" +DEPTH_OUTPUT="$TEST_BASE_DIR/depth_output" +EXPECT_OUTPUT="$TEST_BASE_DIR/expect_output" typeset -i old_val=0 typeset -i j=0 typeset -i fs=0 diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh index 5cb50fde6f..22450d89df 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh @@ -30,7 +30,7 @@ # # STRATEGY: # 1. Change HOME to /var/tmp -# 2. Make a simple script that echos a key value pair +# 2. Make a simple script that echoes a key value pair # in /var/tmp/.zpool.d # 3. Make sure it can be run with -c # 4. Remove the script we created diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh index 1197ea2d11..11f51350af 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh @@ -30,7 +30,7 @@ # # STRATEGY: # 1. Set ZPOOL_SCRIPTS_PATH to contain a couple of non-default dirs -# 2. Make a simple script that echos a key value pair in each dir +# 2. Make a simple script that echoes a key value pair in each dir # 3. Make sure scripts can be run with -c # 4. 
Remove the scripts we created diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh index 1ae91c1a84..53652ec11b 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh @@ -68,7 +68,7 @@ for i in $files ; do test_zpool_script "$i" "$testpool" "zpool iostat -Pv -c" done -# Test that we can run multiple scripts separated with a commma by running +# Test that we can run multiple scripts separated with a comma by running # all the scripts in a single -c line. allscripts="$(echo $scripts | sed -r 's/[[:blank:]]+/,/g')" test_zpool_script "$allscripts" "$testpool" "zpool iostat -Pv -c" diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am new file mode 100644 index 0000000000..e1b3396577 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_user/zpool_status +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zpool_status_003_pos.ksh \ + zpool_status_-c_disable.ksh \ + zpool_status_-c_homedir.ksh \ + zpool_status_-c_searchpath.ksh diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh new file mode 100755 index 0000000000..79cd6e9f90 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh new file mode 100755 index 0000000000..6a9af3bc28 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_disable.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_disable.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_disable.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_disable.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_homedir.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh similarity index 97% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_homedir.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh index 4cc3deb6da..5363043a83 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_homedir.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh @@ -30,7 +30,7 @@ # # STRATEGY: # 1. Change HOME to /var/tmp -# 2. Make a simple script that echos a key value pair +# 2. Make a simple script that echoes a key value pair # in /var/tmp/.zpool.d # 3. Make sure it can be run with -c # 4. 
Remove the script we created diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_searchpath.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh similarity index 97% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_searchpath.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh index a075b9a0c1..3f64fdf1a7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-c_searchpath.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh @@ -30,7 +30,7 @@ # # STRATEGY: # 1. Set ZPOOL_SCRIPTS_PATH to contain a couple of non-default dirs -# 2. Make a simple script that echos a key value pair in each dir +# 2. Make a simple script that echoes a key value pair in each dir # 3. Make sure scripts can be run with -c # 4. Remove the scripts we created diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh similarity index 96% rename from tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh rename to tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh index c5e0c6e474..fa7d3f3f2d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh @@ -68,7 +68,7 @@ for i in $files ; do test_zpool_script "$i" "$testpool" "zpool status -P -c" done -# Test that we can run multiple scripts separated with a commma by running +# Test that we can run multiple scripts separated with a comma by running # all the scripts in a single -c line. 
allscripts="$(echo $scripts | sed -r 's/[[:blank:]]+/,/g')" test_zpool_script "$allscripts" "$testpool" "zpool status -P -c" diff --git a/tests/zfs-tests/tests/functional/compression/Makefile.am b/tests/zfs-tests/tests/functional/compression/Makefile.am index 25a5bca232..817bd41e80 100644 --- a/tests/zfs-tests/tests/functional/compression/Makefile.am +++ b/tests/zfs-tests/tests/functional/compression/Makefile.am @@ -5,7 +5,13 @@ dist_pkgdata_SCRIPTS = \ compress_001_pos.ksh \ compress_002_pos.ksh \ compress_003_pos.ksh \ - compress_004_pos.ksh + compress_004_pos.ksh \ + compress_zstd_bswap.ksh \ + l2arc_compressed_arc.ksh \ + l2arc_compressed_arc_disabled.ksh \ + l2arc_encrypted.ksh \ + l2arc_encrypted_no_compressed_arc.ksh dist_pkgdata_DATA = \ - compress.cfg + compress.cfg \ + testpool_zstd.tar.gz diff --git a/tests/zfs-tests/tests/functional/compression/compress_001_pos.ksh b/tests/zfs-tests/tests/functional/compression/compress_001_pos.ksh index b35b1775a7..fe3a3acacc 100755 --- a/tests/zfs-tests/tests/functional/compression/compress_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/compression/compress_001_pos.ksh @@ -21,14 +21,11 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2007, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2013, 2016, Delphix. All rights reserved. # Use is subject to license terms. # -# -# Copyright (c) 2013, 2016 by Delphix. All rights reserved. -# - . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/compression/compress.cfg diff --git a/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh b/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh index 713f206c6a..d5b7256b52 100755 --- a/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh @@ -21,14 +21,14 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+# Copyright (c) 2007, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2013, 2016, Delphix. All rights reserved. +# Copyright (c) 2019, Kjeld Schouten-Lebbing. All rights reserved. # Use is subject to license terms. # -# -# Copyright (c) 2013, 2016 by Delphix. All rights reserved. -# +. $STF_SUITE/include/properties.shlib . $STF_SUITE/include/libtest.shlib # @@ -62,7 +62,7 @@ typeset -i offset=0 for propname in "compression" "compress" do - for value in $(get_compress_opts zfs_compress) + for value in "${compress_prop_vals[@]:1}" do log_must zfs set $propname=$value $fs if [[ $value == "gzip-6" ]]; then diff --git a/tests/zfs-tests/tests/functional/compression/compress_004_pos.ksh b/tests/zfs-tests/tests/functional/compression/compress_004_pos.ksh index 29d4b3a2b0..b924bcd0ba 100755 --- a/tests/zfs-tests/tests/functional/compression/compress_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/compression/compress_004_pos.ksh @@ -21,14 +21,13 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2007, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2013, 2016, Delphix. All rights reserved. +# Copyright (c) 2019, Kjeld Schouten-Lebbing. All Rights Reserved. # Use is subject to license terms. # -# -# Copyright (c) 2013, 2016 by Delphix. All rights reserved. -# - +. $STF_SUITE/include/properties.shlib . 
$STF_SUITE/include/libtest.shlib # @@ -94,7 +93,7 @@ typeset -i blknum=0 for propname in "compression" "compress" do - for value in $(get_compress_opts zfs_compress) + for value in "${compress_prop_vals[@]:1}" do log_must zfs set compression=$value $fs real_val=$(get_prop $propname $fs) diff --git a/tests/zfs-tests/tests/functional/compression/compress_zstd_bswap.ksh b/tests/zfs-tests/tests/functional/compression/compress_zstd_bswap.ksh new file mode 100755 index 0000000000..9726cf0dd5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/compression/compress_zstd_bswap.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2007, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2021, Rich Ercolani. +# Use is subject to license terms. +# + +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Import a pool containing variously-permuted zstd-compressed files, +# then try to copy them out. 
+ +typeset TESTPOOL_ZSTD_FILE=$STF_SUITE/tests/functional/compression/testpool_zstd.tar.gz +verify_runnable "both" + +function cleanup +{ + destroy_pool testpool_zstd + rm -f $TEST_BASE_DIR/testpool_zstd + +} + +log_assert "Trying to read data from variously mangled zstd datasets" +log_onexit cleanup + +log_must tar --directory $TEST_BASE_DIR -xzSf $TESTPOOL_ZSTD_FILE +log_must zpool import -d $TEST_BASE_DIR testpool_zstd +log_must dd if=/testpool_zstd/x86_64/zstd of=/dev/null +log_must dd if=/testpool_zstd/ppc64_fbsd/zstd of=/dev/null + +log_pass "Reading from mangled zstd datasets works as expected." diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh new file mode 100755 index 0000000000..5980ce1569 --- /dev/null +++ b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 The FreeBSD Foundation [1] +# +# [1] Portions of this software were developed by Allan Jude +# under sponsorship from the FreeBSD Foundation. + +. 
$STF_SUITE/include/libtest.shlib + +export SIZE=1G +export VDIR=$TESTDIR/disk.persist_l2arc +export VDEV="$VDIR/a" +export VDEV_CACHE="$VDIR/b" + +# fio options +export DIRECTORY=/$TESTPOOL-l2arc +export NUMJOBS=4 +export RUNTIME=30 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 + +# +# DESCRIPTION: +# System with compressed_arc enabled succeeds at reading from L2ARC +# +# STRATEGY: +# 1. Enable compressed_arc. +# 2. Create pool with a cache device and compression enabled. +# 3. Read the number of L2ARC checksum failures. +# 4. Create a random file in that pool and random read for 30 sec. +# 5. Read the number of L2ARC checksum failures. +# + +verify_runnable "global" + +log_assert "L2ARC with compressed_arc enabled succeeds." + +origin_carc_setting=$(get_tunable COMPRESSED_ARC_ENABLED) + +function cleanup +{ + if poolexists $TESTPOOL-l2arc ; then + destroy_pool $TESTPOOL-l2arc + fi + + log_must set_tunable64 COMPRESSED_ARC_ENABLED $origin_carc_setting +} +log_onexit cleanup + +# Enable Compressed ARC so that in-ARC and on-disk will match +log_must set_tunable64 COMPRESSED_ARC_ENABLED 1 + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must mkfile $SIZE $VDEV + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -O compression=lz4 -f $TESTPOOL-l2arc $VDEV cache $VDEV_CACHE + +l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) + +log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ + "$l2_cksum_bad_end" +log_must test $(( $l2_cksum_bad_end - $l2_cksum_bad_start )) -eq 0 + +log_must zpool destroy -f $TESTPOOL-l2arc + +log_pass "L2ARC with compressed_arc enabled does not result in checksum 
errors." diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh new file mode 100755 index 0000000000..4c3b6a61c2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 The FreeBSD Foundation [1] +# +# [1] Portions of this software were developed by Allan Jude +# under sponsorship from the FreeBSD Foundation. + +. $STF_SUITE/include/libtest.shlib + +export SIZE=1G +export VDIR=$TESTDIR/disk.persist_l2arc +export VDEV="$VDIR/a" +export VDEV_CACHE="$VDIR/b" + +# fio options +export DIRECTORY=/$TESTPOOL-l2arc +export NUMJOBS=4 +export RUNTIME=30 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 + +# +# DESCRIPTION: +# System with compressed_arc disabled succeeds at reading from L2ARC +# +# STRATEGY: +# 1. Disable compressed_arc. +# 2. Create pool with a cache device and compression enabled. +# 3. Read the number of L2ARC checksum failures. +# 4. Create a random file in that pool and random read for 30 sec. +# 5. Read the number of L2ARC checksum failures. +# + +verify_runnable "global" + +log_assert "L2ARC with compressed_arc disabled succeeds." 
+ +origin_carc_setting=$(get_tunable COMPRESSED_ARC_ENABLED) + +function cleanup +{ + if poolexists $TESTPOOL-l2arc ; then + destroy_pool $TESTPOOL-l2arc + fi + + log_must set_tunable64 COMPRESSED_ARC_ENABLED $origin_carc_setting +} +log_onexit cleanup + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must mkfile $SIZE $VDEV + +# Disable Compressed ARC so that in-ARC and on-disk will not match +log_must set_tunable64 COMPRESSED_ARC_ENABLED 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -O compression=lz4 -f $TESTPOOL-l2arc $VDEV cache $VDEV_CACHE + +l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) + +log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ + "$l2_cksum_bad_end" +log_must test $(( $l2_cksum_bad_end - $l2_cksum_bad_start )) -eq 0 + +log_must zpool destroy -f $TESTPOOL-l2arc + +log_pass "L2ARC with compressed_arc disabled does not result in checksum"\ + "errors." diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh new file mode 100755 index 0000000000..fb460daf68 --- /dev/null +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 The FreeBSD Foundation [1] +# +# [1] Portions of this software were developed by Allan Jude +# under sponsorship from the FreeBSD Foundation. + +. $STF_SUITE/include/libtest.shlib + +export SIZE=1G +export VDIR=$TESTDIR/disk.persist_l2arc +export VDEV="$VDIR/a" +export VDEV_CACHE="$VDIR/b" +export PASSPHRASE="password" + +# fio options +export DIRECTORY=/$TESTPOOL-l2arc/encrypted +export NUMJOBS=4 +export RUNTIME=30 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 + +# +# DESCRIPTION: +# System with encryption and compressed_arc enabled succeeds at reading from L2ARC +# +# STRATEGY: +# 1. Enable compressed_arc. +# 2. Create pool with a cache device, encryption, and compression enabled. +# 3. Read the number of L2ARC checksum failures. +# 4. Create a random file in that pool and random read for 30 sec. +# 5. Read the number of L2ARC checksum failures. +# + +verify_runnable "global" + +log_assert "L2ARC with encryption enabled succeeds." 
+ +origin_carc_setting=$(get_tunable COMPRESSED_ARC_ENABLED) + +function cleanup +{ + if poolexists $TESTPOOL-l2arc ; then + destroy_pool $TESTPOOL-l2arc + fi + + log_must set_tunable64 COMPRESSED_ARC_ENABLED $origin_carc_setting +} +log_onexit cleanup + +# Enable Compressed ARC so that in-ARC and on-disk will match +log_must set_tunable64 COMPRESSED_ARC_ENABLED 1 + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must mkfile $SIZE $VDEV + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -O compression=zstd -f $TESTPOOL-l2arc $VDEV cache $VDEV_CACHE + +log_must eval "echo $PASSPHRASE | zfs create -o compression=zstd " \ + "-o encryption=on -o keyformat=passphrase -o keylocation=prompt " \ + "$TESTPOOL-l2arc/encrypted" + +l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) + +log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ + "$l2_cksum_bad_end" +log_must test $(( $l2_cksum_bad_end - $l2_cksum_bad_start )) -eq 0 + +log_must zpool destroy -f $TESTPOOL-l2arc + +log_pass "L2ARC with encryption and compressed_arc enabled does not result in"\ + "checksum errors." diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh new file mode 100755 index 0000000000..45ef489c31 --- /dev/null +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 The FreeBSD Foundation [1] +# +# [1] Portions of this software were developed by Allan Jude +# under sponsorship from the FreeBSD Foundation. + +. $STF_SUITE/include/libtest.shlib + +export SIZE=1G +export VDIR=$TESTDIR/disk.persist_l2arc +export VDEV="$VDIR/a" +export VDEV_CACHE="$VDIR/b" +export PASSPHRASE="password" + +# fio options +export DIRECTORY=/$TESTPOOL-l2arc/encrypted +export NUMJOBS=4 +export RUNTIME=30 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 + +# +# DESCRIPTION: +# System with compressed_arc disabled succeeds at reading from L2ARC +# +# STRATEGY: +# 1. Disable compressed_arc. +# 2. Create pool with a cache device, encryption, and compression enabled. +# 3. Read the number of L2ARC checksum failures. +# 4. Create a random file in that pool and random read for 30 sec. +# 5. Read the number of L2ARC checksum failures. +# + +verify_runnable "global" + +log_assert "L2ARC with compressed_arc disabled succeeds." 
+ +origin_carc_setting=$(get_tunable COMPRESSED_ARC_ENABLED) + +function cleanup +{ + if poolexists $TESTPOOL-l2arc ; then + destroy_pool $TESTPOOL-l2arc + fi + + log_must set_tunable64 COMPRESSED_ARC_ENABLED $origin_carc_setting +} +log_onexit cleanup + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must mkfile $SIZE $VDEV + +# Disable Compressed ARC so that in-ARC and on-disk will not match +log_must set_tunable64 COMPRESSED_ARC_ENABLED 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -O compression=zstd -f $TESTPOOL-l2arc $VDEV cache $VDEV_CACHE + +log_must eval "echo $PASSPHRASE | zfs create -o compression=zstd " \ + "-o encryption=on -o keyformat=passphrase -o keylocation=prompt " \ + "$TESTPOOL-l2arc/encrypted" + +l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) + +log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ + "$l2_cksum_bad_end" +log_must test $(( $l2_cksum_bad_end - $l2_cksum_bad_start )) -eq 0 + +log_must zpool destroy -f $TESTPOOL-l2arc + +log_pass "L2ARC with encryption enabled and compressed_arc disabled does not"\ + "result in checksum errors." 
diff --git a/tests/zfs-tests/tests/functional/compression/testpool_zstd.tar.gz b/tests/zfs-tests/tests/functional/compression/testpool_zstd.tar.gz new file mode 100644 index 0000000000..4096f7fcbe Binary files /dev/null and b/tests/zfs-tests/tests/functional/compression/testpool_zstd.tar.gz differ diff --git a/tests/zfs-tests/tests/functional/crtime/Makefile.am b/tests/zfs-tests/tests/functional/crtime/Makefile.am new file mode 100644 index 0000000000..13e1c2dde3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/crtime/Makefile.am @@ -0,0 +1,5 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/crtime +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + setup.ksh \ + crtime_001_pos.ksh diff --git a/tests/zfs-tests/tests/functional/write_dirs/write_dirs.cfg b/tests/zfs-tests/tests/functional/crtime/cleanup.ksh old mode 100644 new mode 100755 similarity index 74% rename from tests/zfs-tests/tests/functional/write_dirs/write_dirs.cfg rename to tests/zfs-tests/tests/functional/crtime/cleanup.ksh index 400d5bcb1a..3166bd6ec1 --- a/tests/zfs-tests/tests/functional/write_dirs/write_dirs.cfg +++ b/tests/zfs-tests/tests/functional/crtime/cleanup.ksh @@ -1,3 +1,4 @@ +#!/bin/ksh -p # # CDDL HEADER START # @@ -20,7 +21,7 @@ # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -30,18 +31,4 @@ . 
$STF_SUITE/include/libtest.shlib -verify_runnable "global" - -export SIZE="1gb" -export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}') -export DISKSARRAY=$DISKS - -if is_linux; then - set_slice_prefix - set_device_dir - export SLICE=1 -else - DEV_DSKDIR="/dev" - export SLICE_PREFIX="s" - export SLICE=0 -fi +default_cleanup diff --git a/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh b/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh new file mode 100755 index 0000000000..4f9810553f --- /dev/null +++ b/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Portions Copyright 2021 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# +# Verify crtime is functional with xattr=on|sa + +verify_runnable "both" + +# +# The statx system call was first added in the 4.11 Linux kernel. Prior to this +# change there was no mechanism to obtain birth time on Linux. Therefore, this +# test is expected to fail on older kernels and is skipped. 
+# +if is_linux; then + if [[ $(linux_version) -lt $(linux_version "4.11") ]]; then + log_unsupported "Requires statx(2) system call on Linux" + fi + typeset stat_version=$(stat --version | awk '{ print $NF; exit }') + if compare_version_gte "8.30" "${stat_version}"; then + log_unsupported "Requires coreutils stat(1) > 8.30 on Linux" + fi +fi + +log_assert "Verify crtime is functional." + +set -A args "sa" "on" +typeset TESTFILE=$TESTDIR/testfile + +for arg in ${args[*]}; do + log_note "Testing with xattr set to $arg" + log_must zfs set xattr=$arg $TESTPOOL + rm -f $TESTFILE + log_must touch $TESTFILE + typeset -i crtime=$(stat_crtime $TESTFILE) + typeset -i ctime=$(stat_ctime $TESTFILE) + if (( crtime != ctime )); then + log_fail "Incorrect crtime ($crtime != $ctime)" + fi + log_must touch $TESTFILE + typeset -i crtime1=$(stat_crtime $TESTFILE) + if (( crtime1 != crtime )); then + log_fail "touch modified crtime ($crtime1 != $crtime)" + fi +done + +log_pass "Verified crtime is functional." diff --git a/tests/zfs-tests/tests/functional/crtime/setup.ksh b/tests/zfs-tests/tests/functional/crtime/setup.ksh new file mode 100755 index 0000000000..fc5cec3063 --- /dev/null +++ b/tests/zfs-tests/tests/functional/crtime/setup.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/ctime/ctime.c b/tests/zfs-tests/tests/functional/ctime/ctime.c index 1cd1832340..b755be2feb 100644 --- a/tests/zfs-tests/tests/functional/ctime/ctime.c +++ b/tests/zfs-tests/tests/functional/ctime/ctime.c @@ -31,7 +31,9 @@ #include #include +#ifndef __FreeBSD__ #include +#endif #include #include #include @@ -95,6 +97,13 @@ get_file_time(const char *pfile, int what, time_t *ptr) } } +static ssize_t +get_dirnamelen(const char *path) +{ + const char *end = strrchr(path, '/'); + return (end ? end - path : -1); +} + static int do_read(const char *pfile) { @@ -147,22 +156,18 @@ static int do_link(const char *pfile) { int ret = 0; - char link_file[BUFSIZ] = { 0 }; - char pfile_copy[BUFSIZ] = { 0 }; - char *dname; + char link_file[BUFSIZ + 16] = { 0 }; if (pfile == NULL) { return (-1); } - strncpy(pfile_copy, pfile, sizeof (pfile_copy)-1); - pfile_copy[sizeof (pfile_copy) - 1] = '\0'; /* * Figure out source file directory name, and create * the link file in the same directory. 
*/ - dname = dirname((char *)pfile_copy); - (void) snprintf(link_file, BUFSIZ, "%s/%s", dname, "link_file"); + (void) snprintf(link_file, sizeof (link_file), + "%.*s/%s", (int)get_dirnamelen(pfile), pfile, "link_file"); if (link(pfile, link_file) == -1) { (void) fprintf(stderr, "link(%s, %s) failed with errno %d\n", @@ -251,6 +256,7 @@ do_chown(const char *pfile) return (ret); } +#ifndef __FreeBSD__ static int do_xattr(const char *pfile) { @@ -268,6 +274,7 @@ do_xattr(const char *pfile) } return (ret); } +#endif static void cleanup(void) @@ -289,7 +296,9 @@ static timetest_t timetest_table[] = { { ST_CTIME, "st_ctime", do_chown }, { ST_CTIME, "st_ctime", do_link }, { ST_CTIME, "st_ctime", do_utime }, +#ifndef __FreeBSD__ { ST_CTIME, "st_ctime", do_xattr }, +#endif }; #define NCOMMAND (sizeof (timetest_table) / sizeof (timetest_table[0])) @@ -315,7 +324,7 @@ main(int argc, char *argv[]) (void) snprintf(tfile, sizeof (tfile), "%s/%s", penv[0], penv[1]); /* - * If the test file is exists, remove it first. + * If the test file exists, remove it first. 
*/ if (access(tfile, F_OK) == 0) { (void) unlink(tfile); diff --git a/tests/zfs-tests/tests/functional/deadman/Makefile.am b/tests/zfs-tests/tests/functional/deadman/Makefile.am index 7b70ca09df..097f23e884 100644 --- a/tests/zfs-tests/tests/functional/deadman/Makefile.am +++ b/tests/zfs-tests/tests/functional/deadman/Makefile.am @@ -1,5 +1,6 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/deadman dist_pkgdata_SCRIPTS = \ + deadman_ratelimit.ksh \ deadman_sync.ksh \ deadman_zio.ksh diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh new file mode 100755 index 0000000000..469117a56c --- /dev/null +++ b/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Portions Copyright 2021 iXsystems, Inc. +# + +# DESCRIPTION: +# Verify spa deadman events are rate limited +# +# STRATEGY: +# 1. Reduce the zfs_slow_io_events_per_second to 1. +# 2. Reduce the zfs_deadman_ziotime_ms to 1ms. +# 3. Write data to a pool and read it back. +# 4. Verify deadman events have been produced at a reasonable rate. +# + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/deadman/deadman.cfg + +verify_runnable "both" + +function cleanup +{ + zinject -c all + default_cleanup_noexit + + set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + set_tunable64 DEADMAN_ZIOTIME_MS $ZIOTIME_DEFAULT +} + +log_assert "Verify spa deadman events are rate limited" +log_onexit cleanup + +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) +log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1 +log_must set_tunable64 DEADMAN_ZIOTIME_MS 1 + +# Create a new pool in order to use the updated deadman settings. +default_setup_noexit $DISK1 +log_must zpool events -c + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must zinject -d $DISK1 -D 5:1 $TESTPOOL +log_must dd if=$mntpnt/file of=$TEST_BASE_DIR/devnull oflag=sync + +events=$(zpool events $TESTPOOL | grep -c ereport.fs.zfs.deadman) +log_note "events=$events" +if [ "$events" -lt 1 ]; then + log_fail "Expect >= 1 deadman events, $events found" +fi +if [ "$events" -gt 10 ]; then + log_fail "Expect <= 10 deadman events, $events found" +fi + +log_pass "Verify spa deadman events are rate limited" diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh index a5537c4355..fd6e8c858e 100755 --- a/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh +++ b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh @@ -46,17 +46,17 @@ function cleanup log_must zinject -c all default_cleanup_noexit - log_must set_tunable64 zfs_deadman_synctime_ms $SYNCTIME_DEFAULT - log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT - log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT + log_must set_tunable64 DEADMAN_SYNCTIME_MS $SYNCTIME_DEFAULT + log_must set_tunable64 DEADMAN_CHECKTIME_MS $CHECKTIME_DEFAULT + 
log_must set_tunable64 DEADMAN_FAILMODE $FAILMODE_DEFAULT } log_assert "Verify spa deadman detects a hung txg" log_onexit cleanup -log_must set_tunable64 zfs_deadman_synctime_ms 5000 -log_must set_tunable64 zfs_deadman_checktime_ms 1000 -log_must set_tunable64 zfs_deadman_failmode "wait" +log_must set_tunable64 DEADMAN_SYNCTIME_MS 5000 +log_must set_tunable64 DEADMAN_CHECKTIME_MS 1000 +log_must set_tunable64 DEADMAN_FAILMODE "wait" # Create a new pool in order to use the updated deadman settings. default_setup_noexit $DISK1 @@ -73,13 +73,17 @@ log_must zinject -c all log_must zpool sync # Log txg sync times for reference and the zpool event summary. -log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs +if is_freebsd; then + log_must sysctl -n kstat.zfs.$TESTPOOL.txgs +else + log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs +fi log_must zpool events -# Verify at least 5 deadman events were logged. The first after 5 seconds, +# Verify at least 4 deadman events were logged. The first after 5 seconds, # and another each second thereafter until the delay is clearer. 
events=$(zpool events | grep -c ereport.fs.zfs.deadman) -if [ "$events" -lt 5 ]; then +if [ "$events" -lt 4 ]; then log_fail "Expect >=5 deadman events, $events found" fi diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh index a61be995ae..c1cfc11512 100755 --- a/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh +++ b/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh @@ -49,19 +49,19 @@ function cleanup log_must zinject -c all default_cleanup_noexit - log_must set_tunable64 zfs_deadman_ziotime_ms $ZIOTIME_DEFAULT - log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT - log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT + log_must set_tunable64 DEADMAN_ZIOTIME_MS $ZIOTIME_DEFAULT + log_must set_tunable64 DEADMAN_CHECKTIME_MS $CHECKTIME_DEFAULT + log_must set_tunable64 DEADMAN_FAILMODE $FAILMODE_DEFAULT } log_assert "Verify zio deadman detects a hung zio" log_onexit cleanup # 1. Reduce the zfs_deadman_ziotime_ms to 5s. -log_must set_tunable64 zfs_deadman_ziotime_ms 5000 +log_must set_tunable64 DEADMAN_ZIOTIME_MS 5000 # 2. Reduce the zfs_deadman_checktime_ms to 1s. -log_must set_tunable64 zfs_deadman_checktime_ms 1000 -log_must set_tunable64 zfs_deadman_failmode "wait" +log_must set_tunable64 DEADMAN_CHECKTIME_MS 1000 +log_must set_tunable64 DEADMAN_FAILMODE "wait" # Create a new pool in order to use the updated deadman settings. default_setup_noexit $DISK1 diff --git a/tests/zfs-tests/tests/functional/delegate/cleanup.ksh b/tests/zfs-tests/tests/functional/delegate/cleanup.ksh index 31a57590fc..1951c00e2c 100755 --- a/tests/zfs-tests/tests/functional/delegate/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/delegate/cleanup.ksh @@ -43,8 +43,12 @@ if ! 
is_linux; then fi fi +if is_freebsd; then + log_must sysctl vfs.usermount=0 +fi + if is_linux; then - log_must set_tunable64 zfs_admin_snapshot 0 + log_must set_tunable64 ADMIN_SNAPSHOT 0 fi default_cleanup diff --git a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib index d088eaf371..e39b015b21 100644 --- a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib +++ b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib @@ -101,7 +101,7 @@ function verify_perm log_note "Check $type $user $perm $dtst" if ((ret != 0)) ; then - log_note "Fail: $user should have $perm " \ + log_note "Fail: $user should have $perm" \ "on $dtst" return 1 fi @@ -379,7 +379,7 @@ function verify_send typeset dtst=$3 typeset oldval - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset snap=$dtst@snap.$stamp typeset -i ret=1 @@ -408,7 +408,7 @@ function verify_fs_receive typeset fs=$3 typeset dtst - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset newfs=$fs/newfs.$stamp typeset newvol=$fs/newvol.$stamp typeset bak_user=$TEST_BASE_DIR/bak.$user.$stamp @@ -480,9 +480,10 @@ function verify_userprop typeset perm=$2 typeset dtst=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM user_run $user zfs set "$user:ts=$stamp" $dtst + zpool sync ${dtst%%/*} if [[ $stamp != $(get_prop "$user:ts" $dtst) ]]; then return 1 fi @@ -564,7 +565,7 @@ function verify_fs_create typeset perm=$2 typeset fs=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset newfs=$fs/nfs.$stamp typeset newvol=$fs/nvol.$stamp @@ -684,7 +685,7 @@ function verify_fs_destroy # Verify that given the correct delegation, a regular user can: # Take a snapshot of an unmounted dataset -# Take a snapshot of an mounted dataset +# Take a snapshot of a 
mounted dataset # Create a snapshot by making a directory in the .zfs/snapshot directory function verify_fs_snapshot { @@ -692,7 +693,7 @@ function verify_fs_snapshot typeset perm=$2 typeset fs=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset snap=$fs@snap.$stamp typeset mntpt=$(get_prop mountpoint $fs) @@ -716,12 +717,15 @@ function verify_fs_snapshot fi log_must zfs destroy $snap - typeset snapdir=${mntpt}/.zfs/snapshot/snap.$stamp - user_run $user mkdir $snapdir - if ! datasetexists $snap ; then - return 1 + # Creating snaps via mkdir is not supported on FreeBSD + if ! is_freebsd; then + typeset snapdir=${mntpt}/.zfs/snapshot/snap.$stamp + user_run $user mkdir $snapdir + if ! datasetexists $snap ; then + return 1 + fi + log_must zfs destroy $snap fi - log_must zfs destroy $snap return 0 } @@ -733,7 +737,7 @@ function verify_fs_rollback typeset fs=$3 typeset oldval - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset snap=$fs@snap.$stamp typeset mntpt=$(get_prop mountpoint $fs) @@ -766,7 +770,7 @@ function verify_fs_clone typeset perm=$2 typeset fs=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basefs=${fs%/*} typeset snap=$fs@snap.$stamp typeset clone=$basefs/cfs.$stamp @@ -811,7 +815,7 @@ function verify_fs_rename typeset perm=$2 typeset fs=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basefs=${fs%/*} typeset snap=$fs@snap.$stamp typeset renamefs=$basefs/nfs.$stamp @@ -894,7 +898,7 @@ function verify_fs_mount typeset perm=$2 typeset fs=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset mntpt=$(get_prop mountpoint $fs) typeset newmntpt=$TEST_BASE_DIR/mnt.$stamp @@ -962,7 +966,7 @@ function verify_fs_mountpoint typeset perm=$2 typeset fs=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset 
stamp=${perm}.${user}.$RANDOM typeset mntpt=$(get_prop mountpoint $fs) typeset newmntpt=$TEST_BASE_DIR/mnt.$stamp @@ -1001,7 +1005,7 @@ function verify_fs_promote typeset perm=$2 typeset fs=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basefs=${fs%/*} typeset snap=$fs@snap.$stamp typeset clone=$basefs/cfs.$stamp @@ -1057,7 +1061,7 @@ function verify_fs_canmount typeset fs=$3 typeset oldval - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM if ! ismounted $fs ; then set -A modes "on" "off" @@ -1368,7 +1372,7 @@ function verify_vol_snapshot typeset perm=$2 typeset vol=$3 - typeset stamp=${perm}.${user}.$(date +'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basevol=${vol%/*} typeset snap=$vol@snap.$stamp @@ -1393,7 +1397,7 @@ function verify_vol_rollback typeset perm=$2 typeset vol=$3 - typeset stamp=${perm}.${user}.$(date+'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basevol=${vol%/*} typeset snap=$vol@snap.$stamp @@ -1428,7 +1432,7 @@ function verify_vol_clone typeset perm=$2 typeset vol=$3 - typeset stamp=${perm}.${user}.$(date+'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basevol=${vol%/*} typeset snap=$vol@snap.$stamp typeset clone=$basevol/cvol.$stamp @@ -1474,7 +1478,7 @@ function verify_vol_rename typeset perm=$2 typeset vol=$3 - typeset stamp=${perm}.${user}.$(date+'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basevol=${vol%/*} typeset snap=$vol@snap.$stamp typeset clone=$basevol/cvol.$stamp @@ -1521,7 +1525,7 @@ function verify_vol_promote typeset perm=$2 typeset vol=$3 - typeset stamp=${perm}.${user}.$(date+'%F-%T-%N') + typeset stamp=${perm}.${user}.$RANDOM typeset basevol=${vol%/*} typeset snap=$vol@snap.$stamp typeset clone=$basevol/cvol.$stamp diff --git a/tests/zfs-tests/tests/functional/delegate/setup.ksh b/tests/zfs-tests/tests/functional/delegate/setup.ksh index 149cf7869a..2f13da7504 100755 --- 
a/tests/zfs-tests/tests/functional/delegate/setup.ksh +++ b/tests/zfs-tests/tests/functional/delegate/setup.ksh @@ -33,7 +33,7 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/delegate/delegate_common.kshlib -if ! is_linux; then +if is_illumos; then # check svc:/network/nis/client:default state # disable it if the state is ON # and the state will be restored during cleanup.ksh @@ -44,6 +44,11 @@ if ! is_linux; then fi fi +if is_freebsd; then + # To pass user mount tests + log_must sysctl vfs.usermount=1 +fi + cleanup_user_group # Create staff group and add two user to it @@ -71,7 +76,7 @@ fi DISK=${DISKS%% *} if is_linux; then - log_must set_tunable64 zfs_admin_snapshot 1 + log_must set_tunable64 ADMIN_SNAPSHOT 1 fi default_volume_setup $DISK diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_001_pos.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_001_pos.ksh index 3db1af5098..1e0ed80d32 100755 --- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_001_pos.ksh @@ -83,7 +83,7 @@ if ! cat /etc/group | awk -F: '{print $1}' | \ grep -w 'everyone' > /dev/null 2>&1 then group_added="TRUE" - log_must groupadd everyone + log_must add_group everyone fi for dtst in $DATASETS ; do @@ -92,7 +92,7 @@ for dtst in $DATASETS ; do done log_must restore_root_datasets if [[ $group_added == "TRUE" ]]; then - log_must groupdel everyone + log_must del_group everyone fi log_pass "everyone is always interpreted as keyword passed." 
diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_002_pos.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_002_pos.ksh index 23ed806ad7..fc603eae19 100755 --- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_002_pos.ksh @@ -50,6 +50,14 @@ function cleanup { if id $STAFF_GROUP > /dev/null 2>&1; then log_must del_user $STAFF_GROUP + if is_freebsd; then + # pw userdel also deletes the group with the same name + # and has no way to opt out of this behavior (yet). + # Recreate the group as a workaround. + log_must add_group $STAFF_GROUP + log_must add_user $STAFF_GROUP $STAFF1 + log_must add_user $STAFF_GROUP $STAFF2 + fi fi restore_root_datasets @@ -71,6 +79,14 @@ done log_must restore_root_datasets log_must del_user $STAFF_GROUP +if is_freebsd; then + # pw userdel also deletes the group with the same name + # and has no way to opt out of this behavior (yet). + # Recreate the group as a workaround. + log_must add_group $STAFF_GROUP + log_must add_user $STAFF_GROUP $STAFF1 + log_must add_user $STAFF_GROUP $STAFF2 +fi for dtst in $datasets ; do log_must zfs allow $STAFF_GROUP $perms $dtst log_must verify_perm $dtst $perms $STAFF1 $STAFF2 diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_007_pos.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_007_pos.ksh index ea43fcf033..f3213254b9 100755 --- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_007_pos.ksh @@ -66,7 +66,7 @@ log_must zfs create $childfs log_must zfs create $grandchild # -# Setting different permissions to the same set on two level. +# Setting different permissions to the same set on two levels. # But only assign the user at one level. 
# log_must zfs allow -s @set $perms1 $ROOT_TESTFS @@ -74,7 +74,8 @@ log_must zfs allow -s @set $perms2 $childfs log_must zfs allow $STAFF1 @set $childfs # -# Verify only perms2 is valid to user on the level which he was assigned. +# Verify that the user only has the permissions that they were assigned +# in each filesystem. # log_must verify_noperm $ROOT_TESTFS $perms1 $STAFF1 for fs in $childfs $grandchild ; do diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_008_pos.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_008_pos.ksh index 48de842b7e..b0e1df32a6 100755 --- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_008_pos.ksh @@ -33,13 +33,13 @@ # # DESCRIPTION: -# non-root user can allow any permissions which he is holding to -# other else user when it get 'allow' permission. +# A non-root user can use 'zfs allow' to delegate permissions that +# they have, if they also have the 'allow' permission. # # STRATEGY: # 1. Set two set permissions to two datasets locally. -# 2. Verify the non-root user can allow permission if he has allow -# permission. +# 2. Verify the non-root user can use 'zfs allow' if they have +# 'allow' permission. # verify_runnable "both" @@ -69,8 +69,8 @@ for dtst in $DATASETS ; do log_must user_run $STAFF1 zfs allow -l $OTHER1 $perms1 $dtst log_must verify_perm $dtst $perms1 $OTHER1 - # $perms2 was not allow to $STAFF1, so he have no permission to - # delegate permission to other else. + # $perms2 was not allowed to $STAFF1, so they do not have + # permission to delegate permission to other users. 
log_mustnot user_run $STAFF1 zfs allow $OTHER1 $perms2 $dtst log_must verify_noperm $dtst $perms2 $OTHER1 done diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_009_neg.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_009_neg.ksh index c2c9110204..a6f12244ce 100755 --- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_009_neg.ksh @@ -36,7 +36,7 @@ # zfs allow can deal with invalid arguments.(Invalid options or combination) # # STRATEGY: -# 1. Verify invalid argumets will cause error. +# 1. Verify invalid arguments will cause error. # 2. Verify non-optional argument was missing will cause error. # 3. Verify invalid options cause error. # @@ -51,7 +51,6 @@ longset="set123456789012345678901234567890123456789012345678901234567890123" for dtst in $DATASETS ; do log_mustnot eval "zfs allow -s @$longset $dtst" # Create non-existent permission set - typeset timestamp=$(date +'%F-%R:%S') log_mustnot zfs allow -s @non-existent $dtst log_mustnot zfs allow $STAFF "atime,created,mounted" $dtst log_mustnot zfs allow $dtst $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh index 7b70e13224..3a8ef5e625 100755 --- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh @@ -87,6 +87,47 @@ set -A perms create true false \ promote true true \ xattr true false \ receive true false + +elif is_freebsd; then +# Results in Results in +# Permission Filesystem Volume +# +# Removed for FreeBSD +# - jailed - jailing requires superuser privileges +# - sharenfs - sharing requires superuser privileges +# - share - sharing requires superuser privileges +# - xattr - Not supported on FreeBSD +# +set -A perms create true false \ + snapshot true true \ + mount true false \ + send true true \ + allow true true \ + quota true 
false \ + reservation true true \ + dnodesize true false \ + recordsize true false \ + mountpoint true false \ + checksum true true \ + compression true true \ + canmount true false \ + atime true false \ + devices true false \ + exec true false \ + volsize false true \ + setuid true false \ + readonly true true \ + snapdir true false \ + userprop true true \ + aclmode true false \ + aclinherit true false \ + rollback true true \ + clone true true \ + rename true true \ + promote true true \ + receive true false \ + destroy true true + else set -A perms create true false \ diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh index 4da559bfc7..fd95db92e0 100755 --- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh +++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh @@ -61,6 +61,12 @@ set -A perms create snapshot mount send allow quota reservation \ devices exec volsize setuid readonly snapdir userprop \ rollback clone rename promote dnodesize \ zoned xattr receive destroy +elif is_freebsd; then +set -A perms create snapshot mount send allow quota reservation \ + recordsize mountpoint checksum compression canmount atime \ + devices exec volsize setuid readonly snapdir userprop \ + aclmode aclinherit rollback clone rename promote dnodesize \ + jailed receive destroy else set -A perms create snapshot mount send allow quota reservation \ recordsize mountpoint checksum compression canmount atime \ diff --git a/tests/zfs-tests/tests/functional/devices/devices_001_pos.ksh b/tests/zfs-tests/tests/functional/devices/devices_001_pos.ksh index ac031ed6a5..2f2802bc65 100755 --- a/tests/zfs-tests/tests/functional/devices/devices_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/devices/devices_001_pos.ksh @@ -42,7 +42,7 @@ # 1. Create pool and file system. # 2. Set devices=on on this file system. # 3. Separately create block device file and character file. 
-# 4. Separately read from those two device files. +# 4. Separately read and write from those two device files. # 5. Check the return value, and make sure it succeeds. # @@ -55,12 +55,18 @@ log_onexit cleanup log_must zfs set devices=on $TESTPOOL/$TESTFS # -# Separately create block device file and character device file, then try to -# open them and make sure it succeed. +# Create block device file backed by a ZFS volume. +# Verify it can be opened, written, and read. # -create_dev_file b $TESTDIR/$TESTFILE1 -log_must dd if=$TESTDIR/$TESTFILE1 of=$TESTDIR/$TESTFILE1.out count=1 +create_dev_file b $TESTDIR/$TESTFILE1 $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL +log_must dd if=/dev/urandom of=$TESTDIR/$TESTFILE1.out1 count=1 bs=128k +log_must dd if=$TESTDIR/$TESTFILE1.out1 of=$TESTDIR/$TESTFILE1 count=1 bs=128k +log_must dd if=$TESTDIR/$TESTFILE1 of=$TESTDIR/$TESTFILE1.out2 count=1 bs=128k +log_must cmp $TESTDIR/$TESTFILE1.out1 $TESTDIR/$TESTFILE1.out2 + +# Create character device file backed by /dev/null +# Verify it can be opened and written. create_dev_file c $TESTDIR/$TESTFILE2 -log_must dd if=$TESTDIR/$TESTFILE2 of=$TESTDIR/$TESTFILE2.out count=1 +log_must dd if=/dev/urandom of=$TESTDIR/$TESTFILE2 count=1 bs=128k log_pass "Setting devices=on on file system and testing it pass." diff --git a/tests/zfs-tests/tests/functional/devices/devices_002_neg.ksh b/tests/zfs-tests/tests/functional/devices/devices_002_neg.ksh index ce25502b81..a768c4aa6b 100755 --- a/tests/zfs-tests/tests/functional/devices/devices_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/devices/devices_002_neg.ksh @@ -42,7 +42,7 @@ # 1. Create pool and file system. # 2. Set devices=off on this file system. # 3. Separately create block device file and character file. -# 4. Separately read from those two device files. +# 4. Separately read and write from those two device files. # 5. Check the return value, and make sure it failed. 
# @@ -55,12 +55,16 @@ log_onexit cleanup log_must zfs set devices=off $TESTPOOL/$TESTFS # -# Separately create block device file and character device file, then try to -# open them and make sure it failed. +# Create block device file backed by a ZFS volume. +# Verify it cannot be opened, written, and read. # -create_dev_file b $TESTDIR/$TESTFILE1 -log_mustnot dd if=$TESTDIR/$TESTFILE1 of=$TESTDIR/$TESTFILE1.out count=1 +create_dev_file b $TESTDIR/$TESTFILE1 $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL +log_mustnot dd if=/dev/urandom of=$TESTDIR/$TESTFILE1 count=1 bs=128k +log_mustnot dd if=$TESTDIR/$TESTFILE1 of=/dev/null count=1 bs=128k + +# Create character device file backed by /dev/null +# Verify it cannot be opened and written. create_dev_file c $TESTDIR/$TESTFILE2 -log_mustnot dd if=$TESTDIR/$TESTFILE2 of=$TESTDIR/$TESTFILE2.out count=1 +log_mustnot dd if=/dev/urandom of=$TESTDIR/$TESTFILE2 count=1 bs=128k log_pass "Setting devices=off on file system and testing it pass." diff --git a/tests/zfs-tests/tests/functional/devices/devices_common.kshlib b/tests/zfs-tests/tests/functional/devices/devices_common.kshlib index 2c7df8d058..fa7fdbecf5 100644 --- a/tests/zfs-tests/tests/functional/devices/devices_common.kshlib +++ b/tests/zfs-tests/tests/functional/devices/devices_common.kshlib @@ -36,89 +36,74 @@ # # $1 device file type # $2 file name +# $3 device path (used for 'b' device type) # function create_dev_file { typeset filetype=$1 typeset filename=$2 + typeset devstr=$3 case $filetype in - b) - if is_linux; then - major=$(awk '/[hsv]d/ { print $1; exit }' \ - /proc/partitions) - minor=$(awk '/[hsv]d/ { print $2; exit }' \ - /proc/partitions) - log_must mknod $filename b $major $minor - return 0 - fi - - devtype=$(df -n / | awk '{print $3}') - case $devtype in - zfs) - rootpool=$(df / | \ - awk '{print $2}') - rootpool=${rootpool#\(} - rootpool=${rootpool%%/*} - - devstr=$(get_disklist $rootpool) - devstr=$(echo "$devstr" | \ - awk '{print $1}') - [[ -z $devstr ]] && \ - 
log_fail "Can not get block device file." - devstr=$DEV_DSKDIR/${devstr} - ;; - ufs) + b) + case $(uname) in + Linux) # - # Get the existing block device file in current system. - # And bring out the first one. + # stat(1) --format=FORMAT tokens + # %t - major device type in hex + # %T - minor device type in hex # - devstr=$(df-lhF ufs | \ - grep "^${DEV_DSKDIR}" | \ - awk '{print $1}') - devstr=$(echo "$devstr" | \ - awk '{print $1}') - [[ -z $devstr ]] && \ - log_fail "Can not get block device file." - ;; - *) - log_unsupported "Unsupported fstype " \ - "for / ($devtype)," \ - "only ufs|zfs is supported." - ;; - esac - + major=$(stat --dereference --format="%t" "$devstr") + minor=$(stat --dereference --format="%T" "$devstr") + log_must mknod $filename b "0x${major}" "0x${minor}" + ;; + *) # # Get the device file information. i.e: - # $DEV_DSKDIR/c0t0d0s0: block special (28/768) + # $devstr: block special (28/768) # devstr=$(file $devstr) - - # - # Bring out major and minor number. - # major=${devstr##*\(} major=${major%%/*} minor=${devstr##*/} minor=${minor%\)} - log_must mknod $filename b $major $minor ;; - c) + esac + ;; + c) + # + # Create device file '/dev/null', $devstr is unused. + # + case $(uname) in + Linux) + # + # stat(1) --format=FORMAT tokens + # %t - major device type in hex + # %T - minor device type in hex + # + major=$(stat --format="%t" /dev/null) + minor=$(stat --format="%T" /dev/null) + log_must mknod $filename c "0x${major}" "0x${minor}" + ;; + FreeBSD) # # Create device file '/dev/null' # - if is_linux; then - major=$(stat -c %t /dev/null) - minor=$(stat -c %T /dev/null) - log_must mknod $filename c $major $minor - else - log_must mknod $filename c $(getmajor mm) 2 - fi + major=13 + minor=2 + log_must mknod $filename b $major $minor ;; *) - log_fail "'$filetype' is wrong." + major=$(getmajor mm) + minor=2 + log_must mknod $filename b $major $minor ;; + esac + ;; + *) + log_fail "'$filetype' is wrong." 
+ ;; esac return 0 @@ -129,6 +114,6 @@ function cleanup log_must zfs set devices=on $TESTPOOL/$TESTFS log_must rm -f $TESTDIR/$TESTFILE1 log_must rm -f $TESTDIR/$TESTFILE2 - log_must rm -f $TESTDIR/$TESTFILE1.out - log_must rm -f $TESTDIR/$TESTFILE2.out + log_must rm -f $TESTDIR/$TESTFILE1.out1 + log_must rm -f $TESTDIR/$TESTFILE1.out2 } diff --git a/tests/zfs-tests/tests/functional/devices/setup.ksh b/tests/zfs-tests/tests/functional/devices/setup.ksh index fc5cec3063..ee6cf83acb 100755 --- a/tests/zfs-tests/tests/functional/devices/setup.ksh +++ b/tests/zfs-tests/tests/functional/devices/setup.ksh @@ -32,4 +32,4 @@ . $STF_SUITE/include/libtest.shlib DISK=${DISKS%% *} -default_setup $DISK +default_volume_setup $DISK diff --git a/tests/zfs-tests/tests/functional/events/.gitignore b/tests/zfs-tests/tests/functional/events/.gitignore new file mode 100644 index 0000000000..ed5af03a10 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/.gitignore @@ -0,0 +1 @@ +/zed_fd_spill-zedlet diff --git a/tests/zfs-tests/tests/functional/events/Makefile.am b/tests/zfs-tests/tests/functional/events/Makefile.am index e1fe490812..92ce5dbc38 100644 --- a/tests/zfs-tests/tests/functional/events/Makefile.am +++ b/tests/zfs-tests/tests/functional/events/Makefile.am @@ -1,11 +1,18 @@ +include $(top_srcdir)/config/Rules.am + pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/events dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ events_001_pos.ksh \ events_002_pos.ksh \ - zed_rc_filter.ksh + zed_rc_filter.ksh \ + zed_fd_spill.ksh dist_pkgdata_DATA = \ events.cfg \ events_common.kshlib + +pkgexecdir = $(pkgdatadir) +pkgexec_PROGRAMS = zed_fd_spill-zedlet +zed_fd_spill_zedlet_SOURCES = zed_fd_spill-zedlet.c diff --git a/tests/zfs-tests/tests/functional/events/cleanup.ksh b/tests/zfs-tests/tests/functional/events/cleanup.ksh index 4905342b71..699bc28233 100755 --- a/tests/zfs-tests/tests/functional/events/cleanup.ksh +++ 
b/tests/zfs-tests/tests/functional/events/cleanup.ksh @@ -26,6 +26,6 @@ . $STF_SUITE/include/libtest.shlib -zed_cleanup all-debug.sh all-syslog.sh +zed_cleanup all-debug.sh all-syslog.sh all-dumpfds default_cleanup diff --git a/tests/zfs-tests/tests/functional/events/events_001_pos.ksh b/tests/zfs-tests/tests/functional/events/events_001_pos.ksh index 5121f66b78..189cf435e8 100755 --- a/tests/zfs-tests/tests/functional/events/events_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/events/events_001_pos.ksh @@ -94,22 +94,22 @@ run_and_verify -p "$MPOOL"\ -e "resource.fs.zfs.statechange" \ -e "sysevent.fs.zfs.config_sync" \ "zpool offline $MPOOL $VDEV1" -run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "resource.fs.zfs.statechange" \ -e "sysevent.fs.zfs.vdev_online" \ - -e "sysevent.fs.zfs.resilver_start" \ - -e "sysevent.fs.zfs.resilver_finish" \ - -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.resilver_start" \ + -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.resilver_finish" \ "zpool online $MPOOL $VDEV1" # Attach then detach a device from the mirror. 
-run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.vdev_attach" \ -e "sysevent.fs.zfs.resilver_start" \ - -e "sysevent.fs.zfs.resilver_finish" \ - -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.resilver_finish" \ "zpool attach $MPOOL $VDEV1 $VDEV4" run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.vdev_remove" \ @@ -117,20 +117,20 @@ run_and_verify -p "$MPOOL" \ "zpool detach $MPOOL $VDEV4" # Replace a device -run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.vdev_attach" \ -e "sysevent.fs.zfs.resilver_start" \ + -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.resilver_finish" \ -e "sysevent.fs.zfs.vdev_remove" \ - -e "sysevent.fs.zfs.history_event" \ - -e "sysevent.fs.zfs.config_sync" \ "zpool replace -f $MPOOL $VDEV1 $VDEV4" # Scrub a pool. -run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.scrub_start" \ - -e "sysevent.fs.zfs.scrub_finish" \ -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.scrub_finish" \ "zpool scrub $MPOOL" # Export then import a pool @@ -139,9 +139,9 @@ run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.config_sync" \ "zpool export $MPOOL" run_and_verify -p "$MPOOL" \ - -e "sysevent.fs.zfs.pool_import" \ - -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.pool_import" \ "zpool import -d $TEST_BASE_DIR $MPOOL" # Destroy the pool diff --git a/tests/zfs-tests/tests/functional/events/events_002_pos.ksh b/tests/zfs-tests/tests/functional/events/events_002_pos.ksh index 495b2bbade..af2be33dbc 100755 --- a/tests/zfs-tests/tests/functional/events/events_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/events/events_002_pos.ksh @@ -50,11 +50,11 @@ function cleanup [[ -f $file ]] && rm -f $file done - log_must rm -f 
$TMP_EVENTS_ZED $TMP_EVENTS_ZED + log_must rm -f $TMP_EVENTS_ZED log_must zed_stop } -log_assert "Verify ZED handles missed events on when starting" +log_assert "Verify ZED handles missed events when starting" log_onexit cleanup log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 @@ -66,7 +66,7 @@ log_must zpool create $MPOOL mirror $VDEV1 $VDEV2 # 2. Start the ZED and verify it handles missed events. log_must zed_start -log_must file_wait $ZED_DEBUG_LOG +log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.config_sync' 150 log_must cp $ZED_DEBUG_LOG $TMP_EVENTS_ZED awk -v event="sysevent.fs.zfs.pool_create" \ @@ -81,9 +81,7 @@ log_must truncate -s 0 $ZED_DEBUG_LOG # 4. Generate additional events. log_must zpool offline $MPOOL $VDEV1 log_must zpool online $MPOOL $VDEV1 -while ! is_pool_resilvered $MPOOL; do - sleep 1 -done +log_must zpool wait -t resilver $MPOOL log_must zpool scrub $MPOOL @@ -94,12 +92,11 @@ done # 5. Start the ZED and verify it only handled the new missed events. log_must zed_start -log_must file_wait $ZED_DEBUG_LOG 15 +log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.resilver_finish' 150 log_must cp $ZED_DEBUG_LOG $TMP_EVENTS_ZED -log_mustnot grep -q "sysevent.fs.zfs.pool_create" $TMP_EVENTS_ZED +log_mustnot file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.pool_create' 30 log_must grep -q "sysevent.fs.zfs.vdev_online" $TMP_EVENTS_ZED log_must grep -q "sysevent.fs.zfs.resilver_start" $TMP_EVENTS_ZED -log_must grep -q "sysevent.fs.zfs.resilver_finish" $TMP_EVENTS_ZED log_pass "Verify ZED handles missed events on when starting" diff --git a/tests/zfs-tests/tests/functional/events/events_common.kshlib b/tests/zfs-tests/tests/functional/events/events_common.kshlib index 26afc10917..9c5879183b 100644 --- a/tests/zfs-tests/tests/functional/events/events_common.kshlib +++ b/tests/zfs-tests/tests/functional/events/events_common.kshlib @@ -23,10 +23,34 @@ # Copyright (c) 2017 by Lawrence Livermore National Security, LLC. 
# Use is subject to license terms. # +# Copyright (c) 2020 by Delphix. All rights reserved. +# . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/events/events.cfg +# +# wait for 'event' to show up in the log 'file' +function file_wait_event # file event timeout +{ + file=$1 + event=$2 + timeout=${3:-120} + + SECONDS=0 + + until grep -q "^ZEVENT_CLASS=$event" $ZED_DEBUG_LOG ; do + if [[ $SECONDS -gt $timeout ]]; then + echo file_wait_event exceeded $SECONDS seconds + return 1 + fi + + sleep 1 + done + + return 0; +} + # # Wait for up to 'timeout' seconds for the 'file' to settle, i.e. # not be updated for a period of 'delay' seconds. @@ -41,6 +65,7 @@ function file_wait # file delay timeout while [ $(( $(date +%s) - $(stat -c %Y $file) )) -lt $delay ]; do if [[ $SECONDS -gt $timeout ]]; then + echo file_wait exceeded $SECONDS seconds return 1 fi @@ -52,30 +77,22 @@ function file_wait # file delay timeout function run_and_verify { - typeset delay event pool zedlog + typeset event pool set -A events - while getopts "d:e:p:z:" opt; do + while getopts "e:p:" opt; do case $opt in - d) - delay=$OPTARG - ;; e) - events[${#events[*]}+1]=$OPTARG + events+=("$OPTARG") ;; p) pool=$OPTARG ;; - z) - zedlog=$OPTARG - ;; esac done shift $(($OPTIND - 1)) pool=${pool:-$TESTPOOL} - delay=${delay:-3} - zedlog=${zedlog:-$ZED_DEBUG_LOG} fullcmd="$1" cmd=$(echo $fullcmd | awk '{print $1}') @@ -87,21 +104,38 @@ function run_and_verify # Remove any previous events from the logs. log_must zpool events -c - log_must truncate -s 0 $zedlog + log_must truncate -s 0 $ZED_DEBUG_LOG # Run the command as provided. log_must eval "$fullcmd" # Collect the new events and verify there are some. 
log_must zpool sync -f - log_must file_wait $zedlog $delay - log_must cp $zedlog $TMP_EVENTS_ZED log_must eval "zpool events >$TMP_EVENTS 2>/dev/null" log_must eval "zpool events -v > $TMP_EVENTS_FULL 2>/dev/null" log_must test -s $TMP_EVENTS log_must test -s $TMP_EVENTS_FULL - log_must test -s $TMP_EVENTS_ZED + + # If the only event is history then we don't observe zed debug log + if [[ "${events[0]}" != "sysevent.fs.zfs.history_event" ]]; then + # wait for all the non-history events to show up in the + # debug log, all-debug.sh filters history events. + for event in ${events[*]}; do + if [[ "$event" == \ + "sysevent.fs.zfs.history_event" ]]; then + continue + fi + + log_must file_wait_event $ZED_DEBUG_LOG "$event" + done + + log_must cp $ZED_DEBUG_LOG $TMP_EVENTS_ZED + log_must test -s $TMP_EVENTS_ZED + + log_note "Events logged:" + grep "^ZEVENT_CLASS" $TMP_EVENTS_ZED + fi log_note "Events generated:" cat $TMP_EVENTS @@ -118,6 +152,11 @@ function run_and_verify $TMP_EVENTS_FULL >$TMP_EVENT_FULL log_must grep -q "pool = \"$pool\"" $TMP_EVENT_FULL + # all-debug.sh filters history events (seen in ZED_DEBUG_LOG) + if [[ "$event" == "sysevent.fs.zfs.history_event" ]]; then + continue + fi + # Verify the event was received by the ZED and logged. awk -v event="$event" \ 'BEGIN{FS="\n"; RS=""} $0 ~ event { print $0 }' \ diff --git a/tests/zfs-tests/tests/functional/events/zed_fd_spill-zedlet.c b/tests/zfs-tests/tests/functional/events/zed_fd_spill-zedlet.c new file mode 100644 index 0000000000..c072f906d2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_fd_spill-zedlet.c @@ -0,0 +1,36 @@ +/* + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +int main(void) { + if (fork()) { + int err; + wait(&err); + return (err); + } + + char buf[64]; + sprintf(buf, "/tmp/zts-zed_fd_spill-logdir/%d", getppid()); + dup2(creat(buf, 0644), STDOUT_FILENO); + + snprintf(buf, sizeof (buf), "/proc/%d/fd", getppid()); + execlp("ls", "ls", buf, NULL); + _exit(127); +} diff --git a/tests/zfs-tests/tests/functional/events/zed_fd_spill.ksh b/tests/zfs-tests/tests/functional/events/zed_fd_spill.ksh new file mode 100755 index 0000000000..8736a7fdf7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_fd_spill.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# DESCRIPTION: +# Verify ZEDLETs only inherit the fds specified in the manpage +# +# STRATEGY: +# 1. Inject a ZEDLET that dumps the fds it gets to a file. +# 2. 
Generate some events. +# 3. Read back the generated files and assert that there is no fd past 3, +# and there are exactly 4 fds. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/events/events_common.kshlib + +verify_runnable "both" + +function cleanup +{ + log_must rm -rf "$logdir" + log_must rm "/tmp/zts-zed_fd_spill-logdir" + log_must zed_stop +} + +log_assert "Verify ZEDLETs inherit only the fds specified" +log_onexit cleanup + +logdir="$(mktemp -d)" +log_must ln -s "$logdir" /tmp/zts-zed_fd_spill-logdir + +self="$(readlink -f "$0")" +log_must ln -s "${self%/*}/zed_fd_spill-zedlet" "${ZEDLET_DIR}/all-dumpfds" + +log_must zpool events -c +log_must zed_stop +log_must zed_start + +log_must truncate -s 0 $ZED_DEBUG_LOG +log_must zpool scrub $TESTPOOL +log_must zfs set compression=off $TESTPOOL/$TESTFS +log_must wait_scrubbed $TESTPOOL +log_must file_wait $ZED_DEBUG_LOG 3 + +if [ -n "$(find "$logdir" -maxdepth 0 -empty)" ]; then + log_fail "Our ZEDLET didn't run!" +fi +log_must awk ' + !/^[0123]$/ { + print FILENAME ": " $0 + err=1 + } + END { + exit err + } +' "$logdir"/* +wc -l "$logdir"/* | log_must awk '$1 != "4" && $2 != "total" {print; exit 1}' + +log_pass "ZED doesn't leak fds to ZEDLETs" diff --git a/tests/zfs-tests/tests/functional/events/zed_rc_filter.ksh b/tests/zfs-tests/tests/functional/events/zed_rc_filter.ksh index 44652ee4cf..0bef0ef1f9 100755 --- a/tests/zfs-tests/tests/functional/events/zed_rc_filter.ksh +++ b/tests/zfs-tests/tests/functional/events/zed_rc_filter.ksh @@ -49,6 +49,7 @@ log_assert "Verify zpool sub-commands generate expected events" log_onexit cleanup log_must zpool events -c +log_must zed_stop log_must zed_start # Backup our zed.rc diff --git a/tests/zfs-tests/tests/functional/fallocate/Makefile.am b/tests/zfs-tests/tests/functional/fallocate/Makefile.am new file mode 100644 index 0000000000..5ff366d248 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fallocate/Makefile.am @@ -0,0 +1,6 @@ +pkgdatadir = 
$(datadir)/@PACKAGE@/zfs-tests/tests/functional/fallocate +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + fallocate_prealloc.ksh \ + fallocate_punch-hole.ksh diff --git a/tests/zfs-tests/tests/functional/fallocate/cleanup.ksh b/tests/zfs-tests/tests/functional/fallocate/cleanup.ksh new file mode 100755 index 0000000000..bdfa614711 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fallocate/cleanup.ksh @@ -0,0 +1,27 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh b/tests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh new file mode 100755 index 0000000000..7bb020fe5c --- /dev/null +++ b/tests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test fallocate(2) preallocation. +# +# STRATEGY: +# 1. Verify mode 0 fallocate is supported. +# 2. Verify default 10% reserve space is honored by setting a quota. +# + +verify_runnable "global" + +FILE=$TESTDIR/$TESTFILE0 + +function cleanup +{ + log_must zfs set quota=none $TESTPOOL + + [[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/* +} + +log_assert "Ensure sparse files can be preallocated" + +log_onexit cleanup + +# Pre-allocate a sparse 1GB file. +log_must fallocate -l $((1024 * 1024 * 1024)) $FILE +log_must rm -Rf $TESTDIR/* + +# Verify that an additional ~10% reserve space is required. 
+log_must zfs set quota=100M $TESTPOOL +log_mustnot fallocate -l $((150 * 1024 * 1024)) $FILE +log_mustnot fallocate -l $((110 * 1024 * 1024)) $FILE +log_must fallocate -l $((90 * 1024 * 1024)) $FILE + +log_pass "Ensure sparse files can be preallocated" diff --git a/tests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh b/tests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh new file mode 100755 index 0000000000..ed83561bd5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh @@ -0,0 +1,110 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2021 by The FreeBSD Foundation. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test hole-punching functionality +# +# STRATEGY: +# 1. Create a dense file +# 2. Punch an assortment of holes in the file and verify the result. +# + +verify_runnable "global" + +# +# Prior to __FreeBSD_version 1400032 there are no mechanism to punch hole in a +# file on FreeBSD. truncate -d support is required to call fspacectl(2) on +# behalf of the script. 
+# +if is_freebsd; then + if [[ $(uname -K) -lt 1400032 ]]; then + log_unsupported "Requires fspacectl(2) support on FreeBSD" + fi + if truncate -d 2>&1 | grep "illegal option" > /dev/null; then + log_unsupported "Requires truncate(1) -d support on FreeBSD" + fi +fi + +FILE=$TESTDIR/$TESTFILE0 +BLKSZ=$(get_prop recordsize $TESTPOOL) + +function cleanup +{ + [[ -e $TESTDIR ]] && log_must rm -f $FILE +} + +function check_disk_size +{ + typeset expected_size=$1 + + disk_size=$(du $TESTDIR/file | awk '{print $1}') + if [ $disk_size -ne $expected_size ]; then + log_fail "Incorrect size: $disk_size != $expected_size" + fi +} + +function check_apparent_size +{ + typeset expected_size=$1 + + apparent_size=$(stat_size) + if [ $apparent_size -ne $expected_size ]; then + log_fail "Incorrect size: $apparent_size != $expected_size" + fi +} + +log_assert "Ensure holes can be punched in files making them sparse" + +log_onexit cleanup + +# Create a dense file and check it is the correct size. +log_must file_write -o create -f $FILE -b $BLKSZ -c 8 +log_must check_disk_size $((131072 * 8)) + +# Punch a hole for the first full block. +log_must punch_hole 0 $BLKSZ $FILE +log_must check_disk_size $((131072 * 7)) + +# Partially punch a hole in the second block. +log_must punch_hole $BLKSZ $((BLKSZ / 2)) $FILE +log_must check_disk_size $((131072 * 7)) + +# Punch a hole which overlaps the third and forth block. +log_must punch_hole $(((BLKSZ * 2) + (BLKSZ / 2))) $((BLKSZ)) $FILE +log_must check_disk_size $((131072 * 7)) + +# Punch a hole from the fifth block past the end of file. The apparent +# file size should not change since --keep-size is implied. 
+apparent_size=$(stat_size $FILE) +log_must punch_hole $((BLKSZ * 4)) $((BLKSZ * 10)) $FILE +log_must check_disk_size $((131072 * 4)) +log_must check_apparent_size $apparent_size + +log_pass "Ensure holes can be punched in files making them sparse" diff --git a/tests/zfs-tests/tests/functional/fallocate/setup.ksh b/tests/zfs-tests/tests/functional/fallocate/setup.ksh new file mode 100755 index 0000000000..32334d3968 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fallocate/setup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am index f2fc06877d..ba0d7d6992 100644 --- a/tests/zfs-tests/tests/functional/fault/Makefile.am +++ b/tests/zfs-tests/tests/functional/fault/Makefile.am @@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ auto_offline_001_pos.ksh \ auto_online_001_pos.ksh \ + auto_online_002_pos.ksh \ auto_replace_001_pos.ksh \ auto_spare_001_pos.ksh \ auto_spare_002_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh index bd0fd4c879..86916bf906 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh @@ -25,30 +25,36 @@ # # DESCRIPTION: # Testing Fault Management Agent ZED Logic - Physically removed device is -# offlined and onlined when reattached +# made unavail and onlined when reattached # # STRATEGY: # 1. Create a pool # 2. Simulate physical removal of one device -# 3. Verify the device is offlined +# 3. Verify the device is unavailable # 4. Reattach the device # 5. Verify the device is onlined -# 6. Repeat the same tests with a spare device: zed will use the spare to handle -# the removed data device -# 7. Repeat the same tests again with a faulted spare device: zed should offline -# the removed data device if no spare is available +# 6. Repeat the same tests with a spare device: +# zed will use the spare to handle the removed data device +# 7. Repeat the same tests again with a faulted spare device: +# the removed data device should be unavailable # # NOTE: the use of 'block_device_wait' throughout the test helps avoid race # conditions caused by mixing creation/removal events from partitioning the # disk (zpool create) and events from physically removing it (remove_disk). 
# +# NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a +# vdev to the unavailable state. The ZED does receive a removal notification +# but only relies on it to activate a hot spare. Additional work is planned +# to extend an existing ioctl interface to allow the ZED to transition the +# vdev in to a removed state. +# verify_runnable "both" if is_linux; then # Add one 512b scsi_debug device (4Kn would generate IO errors) # NOTE: must be larger than other "file" vdevs and minimum SPA devsize: # add 32m of fudge - load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) 1 1 1 '512b' + load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) 1 1 1 '512b' else log_unsupported "scsi debug module unsupported" fi @@ -73,31 +79,33 @@ filedev3="$TEST_BASE_DIR/file-vdev-3" sparedev="$TEST_BASE_DIR/file-vdev-spare" removedev=$(get_debug_device) -typeset poolconfs=("mirror $filedev1 $removedev" - "raidz $filedev1 $removedev" - "raidz2 $filedev1 $filedev2 $removedev" +typeset poolconfs=( + "mirror $filedev1 $removedev" "raidz3 $filedev1 $filedev2 $filedev3 $removedev" - "$filedev1 cache $removedev" - "mirror $filedev1 $filedev2 cache $removedev" - "raidz $filedev1 $filedev2 $filedev3 cache $removedev" + "mirror $filedev1 $filedev2 special mirror $filedev3 $removedev" ) -log_must truncate -s $SPA_MINDEVSIZE $filedev1 -log_must truncate -s $SPA_MINDEVSIZE $filedev2 -log_must truncate -s $SPA_MINDEVSIZE $filedev3 -log_must truncate -s $SPA_MINDEVSIZE $sparedev +log_must truncate -s $MINVDEVSIZE $filedev1 +log_must truncate -s $MINVDEVSIZE $filedev2 +log_must truncate -s $MINVDEVSIZE $filedev3 +log_must truncate -s $MINVDEVSIZE $sparedev for conf in "${poolconfs[@]}" do # 1. Create a pool log_must zpool create -f $TESTPOOL $conf - block_device_wait + block_device_wait ${DEV_DSKDIR}/${removedev} + + mntpnt=$(get_prop mountpoint /$TESTPOOL) || + log_fail "get_prop mountpoint /$TESTPOOL" # 2. 
Simulate physical removal of one device remove_disk $removedev + log_must mkfile 1m $mntpnt/file + log_must zpool sync $TESTPOOL - # 3. Verify the device is offlined - log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE" + # 3. Verify the device is unavailable. + log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL" # 4. Reattach the device insert_disk $removedev @@ -107,8 +115,8 @@ do # cleanup destroy_pool $TESTPOOL - log_must parted "/dev/${removedev}" -s -- mklabel msdos - block_device_wait + log_must parted "${DEV_DSKDIR}/${removedev}" -s -- mklabel msdos + block_device_wait ${DEV_DSKDIR}/${removedev} done # 6. Repeat the same tests with a spare device: zed will use the spare to handle @@ -117,30 +125,31 @@ for conf in "${poolconfs[@]}" do # 1. Create a pool with a spare log_must zpool create -f $TESTPOOL $conf - block_device_wait + block_device_wait ${DEV_DSKDIR}/${removedev} log_must zpool add $TESTPOOL spare $sparedev - # 3. Simulate physical removal of one device + mntpnt=$(get_prop mountpoint /$TESTPOOL) || + log_fail "get_prop mountpoint /$TESTPOOL" + + # 2. Simulate physical removal of one device remove_disk $removedev + log_must mkfile 1m $mntpnt/file + log_must zpool sync $TESTPOOL - # 4. Verify the device is handled by the spare unless is a l2arc disk - # which can only be offlined - if [[ $(echo "$conf" | grep -c 'cache') -eq 0 ]]; then - log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE" - else - log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE" - fi + # 3. Verify the device is handled by the spare. + log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE" + log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL" - # 5. Reattach the device + # 4. Reattach the device insert_disk $removedev - # 6. Verify the device is onlined + # 5. 
Verify the device is onlined log_must wait_vdev_state $TESTPOOL $removedev "ONLINE" # cleanup destroy_pool $TESTPOOL - log_must parted "/dev/${removedev}" -s -- mklabel msdos - block_device_wait + log_must parted "${DEV_DSKDIR}/${removedev}" -s -- mklabel msdos + block_device_wait ${DEV_DSKDIR}/${removedev} done # 7. Repeat the same tests again with a faulted spare device: zed should offline @@ -149,18 +158,23 @@ for conf in "${poolconfs[@]}" do # 1. Create a pool with a spare log_must zpool create -f $TESTPOOL $conf - block_device_wait + block_device_wait ${DEV_DSKDIR}/${removedev} log_must zpool add $TESTPOOL spare $sparedev + mntpnt=$(get_prop mountpoint /$TESTPOOL) || + log_fail "get_prop mountpoint /$TESTPOOL" + # 2. Fault the spare device making it unavailable log_must zpool offline -f $TESTPOOL $sparedev log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED" # 3. Simulate physical removal of one device remove_disk $removedev + log_must mkfile 1m $mntpnt/file + log_must zpool sync $TESTPOOL - # 4. Verify the device is offlined - log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE" + # 4. Verify the device is unavailable + log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL" # 5. 
Reattach the device insert_disk $removedev @@ -170,8 +184,8 @@ do # cleanup destroy_pool $TESTPOOL - log_must parted "/dev/${removedev}" -s -- mklabel msdos - block_device_wait + log_must parted "${DEV_DSKDIR}/${removedev}" -s -- mklabel msdos + block_device_wait ${DEV_DSKDIR}/${removedev} done log_pass "ZED detects physically removed devices" diff --git a/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh index bc925bc91c..03fc15a8a7 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh @@ -129,7 +129,7 @@ do typeset -i timeout=0 while true; do if ((timeout == $MAXTIMEOUT)); then - log_fail "Timeout occured" + log_fail "Timeout occurred" fi ((timeout++)) diff --git a/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh new file mode 100755 index 0000000000..60185ace34 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. +# Copyright (c) 2019 by Delphix. All rights reserved. +# Portions Copyright 2021 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Online Test. +# Now with partitioned vdevs. +# +# STRATEGY: +# 1. Partition a scsi_debug device for simulating removal +# 2. Create a pool +# 3. Offline disk +# 4. ZED polls for an event change for online disk to be automatically +# added back to the pool. +# +verify_runnable "both" + +function cleanup +{ + poolexists ${TESTPOOL} && destroy_pool ${TESTPOOL} + unload_scsi_debug +} + +log_assert "Testing automated auto-online FMA test with partitioned vdev" + +log_onexit cleanup + +load_scsi_debug ${SDSIZE} ${SDHOSTS} ${SDTGTS} ${SDLUNS} '512b' +SDDEVICE=$(get_debug_device) +zpool labelclear -f ${SDDEVICE} +partition_disk ${SDSIZE} ${SDDEVICE} 1 +part=${SDDEVICE}1 +host=$(get_scsi_host ${SDDEVICE}) + +block_device_wait /dev/${part} +log_must zpool create -f ${TESTPOOL} raidz1 ${part} ${DISKS} + +# Add some data to the pool +log_must mkfile ${FSIZE} /${TESTPOOL}/data + +remove_disk ${SDDEVICE} +check_state ${TESTPOOL} "" "degraded" || \ + log_fail "${TESTPOOL} is not degraded" + +# Clear zpool events +log_must zpool events -c + +# Online disk +insert_disk ${SDDEVICE} ${host} + +log_note "Delay for ZED auto-online" +typeset -i timeout=0 +until is_pool_resilvered ${TESTPOOL}; do + if ((timeout++ == MAXTIMEOUT)); then + log_fail "Timeout occurred" + fi + sleep 1 +done +log_note "Auto-online of ${SDDEVICE} is complete" + +# Validate auto-online was successful +sleep 1 +check_state ${TESTPOOL} "" "online" || \ + 
log_fail "${TESTPOOL} is not back online" + +log_must zpool destroy ${TESTPOOL} + +log_pass "Auto-online with partitioned vdev test successful" diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh index b6af1a3f40..a93267185b 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh @@ -55,36 +55,59 @@ zed_events_drain TESTFILE="/$TESTPOOL/$TESTFS/testfile" -for type in "mirror" "raidz" "raidz2"; do - # 1. Create a pool with hot spares - truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE - log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE +for type in "mirror" "raidz" "raidz2" "draid:1s"; do + if [ "$type" = "draid:1s" ]; then + # 1. Create a dRAID pool with a distributed hot spare + # + # Corruption is injected in the file-2 instead of file-1 + # vdev since the dRAID permutation at these offsets maps + # to distributed spare space and not data devices. + # + log_must truncate -s $MINVDEVSIZE $VDEV_FILES + log_must zpool create -f $TESTPOOL $type $VDEV_FILES + SPARE="draid1-0-0" + FAULT="$TEST_BASE_DIR/file-2" + else + # 1. Create a pool with hot spares + log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ + spare $SPARE_FILE + SPARE=$SPARE_FILE + FAULT=$FAULT_FILE + fi # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS log_must zfs set recordsize=16k $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 # 4. Inject IO ERRORS on read with a zinject error handler - log_must zinject -d $FAULT_FILE -e io -T read $TESTPOOL + log_must zinject -d $FAULT -e io -T read $TESTPOOL log_must cp $TESTFILE /dev/null # 5. 
Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" - log_must wait_vdev_state $TESTPOOL $FAULT_FILE "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_FILE "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "INUSE" + log_must wait_vdev_state $TESTPOOL $FAULT "FAULTED" 60 + log_must wait_vdev_state $TESTPOOL $SPARE "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" + # The ZED will use a sequential resilver for dRAID. Wait for the + # resilver and subsequent scrub to complete before moving on. + if [ "$type" = "draid:1s" ]; then + log_must wait_scrubbed $TESTPOOL + fi + # 6. Clear the fault log_must zinject -c all - log_must zpool clear $TESTPOOL $FAULT_FILE + log_must zpool clear $TESTPOOL $FAULT # 7. Verify the hot spare is available and expected pool/device status - log_must wait_vdev_state $TESTPOOL $FAULT_FILE "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "AVAIL" + log_must wait_vdev_state $TESTPOOL $FAULT "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE "AVAIL" + log_must is_pool_resilvered $TESTPOOL log_must check_state $TESTPOOL "" "ONLINE" diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh index 63aaead08d..e9517bad71 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -50,22 +50,26 @@ log_assert "Testing automated auto-spare FMA test" log_onexit cleanup -# Clear events from previous runs -zed_events_drain +# Events not supported on FreeBSD +if ! is_freebsd; then + # Clear events from previous runs + zed_events_drain +fi TESTFILE="/$TESTPOOL/$TESTFS/testfile" for type in "mirror" "raidz" "raidz2"; do # 1. 
Create a pool with hot spares - truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE - log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE + log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ + spare $SPARE_FILE # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS log_must zfs set recordsize=16k $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 # 4. Inject CHECKSUM ERRORS on read with a zinject error handler log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh index e9857518ed..f4fd21d043 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh @@ -60,7 +60,7 @@ FAIL_DEVICE="$TEST_BASE_DIR/fail-dev" # 1. Create a pool from 512b devices and set "ashift" pool property accordingly for vdev in $SAFE_DEVICE $FAIL_DEVICE; do - truncate -s $SPA_MINDEVSIZE $vdev + truncate -s $MINVDEVSIZE $vdev done log_must zpool create -f $TESTPOOL mirror $SAFE_DEVICE $FAIL_DEVICE # NOTE: file VDEVs should be added as 512b devices, verify this "just in case" @@ -71,7 +71,7 @@ log_must zpool set ashift=9 $TESTPOOL # 2. 
Add one 512e spare device (4Kn would generate IO errors on replace) # NOTE: must be larger than the existing 512b devices, add 32m of fudge -load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) $SDHOSTS $SDTGTS $SDLUNS '512e' +load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) $SDHOSTS $SDTGTS $SDLUNS '512e' SPARE_DEVICE=$(get_debug_device) log_must_busy zpool add $TESTPOOL spare $SPARE_DEVICE diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh index 8650ceff7d..8a9cf6f532 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh @@ -53,22 +53,53 @@ function cleanup log_assert "ZED should be able to handle multiple faulted devices" log_onexit cleanup -# Clear events from previous runs -zed_events_drain +# Events not supported on FreeBSD +if ! is_freebsd; then + # Clear events from previous runs + zed_events_drain +fi FAULT_DEV1="$TEST_BASE_DIR/fault-dev1" FAULT_DEV2="$TEST_BASE_DIR/fault-dev2" SAFE_DEV1="$TEST_BASE_DIR/safe-dev1" SAFE_DEV2="$TEST_BASE_DIR/safe-dev2" -DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2" +SAFE_DEV3="$TEST_BASE_DIR/safe-dev3" +SAFE_DEV4="$TEST_BASE_DIR/safe-dev4" +DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4" SPARE_DEV1="$TEST_BASE_DIR/spare-dev1" SPARE_DEV2="$TEST_BASE_DIR/spare-dev2" SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2" -for type in "mirror" "raidz" "raidz2" "raidz3"; do - # 1. Create a pool with two hot spares - truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS - log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS +for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do + if [ "$type" = "draid2:1s" ]; then + # 1. Create a dRAID pool with a distributed and traditional + # hot spare to provide test coverage for both configurations. 
+ # + # Corruption is injected in the third and fourth vdevs + # since the dRAID permutation at these offsets maps to + # distributed spare space and not data devices. + # + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1 + log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ + $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ + spare $SPARE_DEV1 + SPARE1=$SPARE_DEV1 + SPARE2="draid2-0-0" + elif [ "$type" = "mirror" ]; then + # 1. Create a 3-way mirror pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type \ + $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + else + # 1. Create a raidz pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type $DATA_DEVS \ + spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + fi # 2. Inject IO ERRORS with a zinject error handler on the first device log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL @@ -76,11 +107,11 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 3. Start a scrub log_must zpool scrub $TESTPOOL - # 4. Verify the ZED kicks in a hot spare and expected pool/device status + # 4. Verify the ZED kicks in a hot spare and the pool/device status log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" # 5. Inject IO ERRORS on a second device @@ -95,10 +126,14 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 7. 
Verify the ZED kicks in a second hot spare log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" + while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do + sleep 1 + done + # 8. Clear the fault on both devices log_must zinject -c all log_must zpool clear $TESTPOOL $FAULT_DEV1 @@ -107,8 +142,8 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 9. Verify the hot spares are available and expected pool/device status log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60 log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "AVAIL" - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "AVAIL" + log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL" + log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL" log_must check_state $TESTPOOL "" "ONLINE" # Cleanup @@ -116,12 +151,38 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do done # Rinse and repeat, this time faulting both devices at the same time -# NOTE: "raidz" is exluded since it cannot survive 2 faulted devices -# NOTE: "mirror" is a 4-way mirror here and should survive this test -for type in "mirror" "raidz2" "raidz3"; do - # 1. Create a pool with two hot spares - truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS - log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS +# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices +# NOTE: "mirror" is a 3-way mirror here and should survive this test +for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do + if [ "$type" = "draid2:1s" ]; then + # 1. 
Create a dRAID pool with a distributed and traditional + # hot spare to provide test coverage for both configurations. + # + # Corruption is injected in the third and fourth vdevs + # since the dRAID permutation at these offsets maps to + # distributed spare space and not data devices. + # + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1 + log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ + $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ + spare $SPARE_DEV1 + SPARE1=$SPARE_DEV1 + SPARE2="draid2-0-0" + elif [ "$type" = "mirror" ]; then + # 1. Create a 3-way mirror pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type \ + $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + else + # 1. Create a raidz pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type $DATA_DEVS \ + spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + fi # 2. Inject IO ERRORS with a zinject error handler on two devices log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &" @@ -130,14 +191,14 @@ for type in "mirror" "raidz2" "raidz3"; do # 3. Start a scrub log_must zpool scrub $TESTPOOL - # 4. Verify the ZED kicks in two hot spares and expected pool/device status + # 4. 
Verify the ZED kicks in two hot spares and the pool/device status log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60 log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE" - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60 + log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE" + log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" # 5. Clear the fault on both devices diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh index 467161359d..4229537b39 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh @@ -42,7 +42,7 @@ if is_linux; then # Add one 512b spare device (4Kn would generate IO errors on replace) # NOTE: must be larger than other "file" vdevs and minimum SPA devsize: # add 32m of fudge - load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) 1 1 1 '512b' + load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) 1 1 1 '512b' else log_unsupported "scsi debug module unsupported" fi @@ -72,7 +72,7 @@ SPARE_DISKDEV="$(get_debug_device)" for vdev in $SAFE_FILEDEVPOOL1 $SAFE_FILEDEVPOOL2 $FAIL_FILEDEVPOOL1 \ $FAIL_FILEDEVPOOL2 $SPARE_FILEDEV; do - log_must truncate -s $SPA_MINDEVSIZE $vdev + log_must truncate -s $MINVDEVSIZE $vdev done for spare in $SPARE_FILEDEV $SPARE_DISKDEV; do diff --git a/tests/zfs-tests/tests/functional/fault/decompress_fault.ksh b/tests/zfs-tests/tests/functional/fault/decompress_fault.ksh index ea831efdf4..81eab56666 100755 --- 
a/tests/zfs-tests/tests/functional/fault/decompress_fault.ksh +++ b/tests/zfs-tests/tests/functional/fault/decompress_fault.ksh @@ -33,7 +33,7 @@ log_assert "Testing that injected decompression errors are handled correctly" function cleanup { - log_must set_tunable64 zfs_compressed_arc_enabled 1 + log_must set_tunable64 COMPRESSED_ARC_ENABLED 1 log_must zinject -c all default_cleanup_noexit } @@ -41,15 +41,18 @@ function cleanup log_onexit cleanup default_mirror_setup_noexit $DISK1 $DISK2 -log_must set_tunable64 zfs_compressed_arc_enabled 0 +log_must set_tunable64 COMPRESSED_ARC_ENABLED 0 log_must zfs create -o compression=on $TESTPOOL/fs mntpt=$(get_prop mountpoint $TESTPOOL/fs) -write_compressible $mntpt 32m 1 0 "testfile" +write_compressible $mntpt 32m 1 1024k "testfile" log_must sync log_must zfs umount $TESTPOOL/fs log_must zfs mount $TESTPOOL/fs log_must zinject -a -t data -e decompress -f 20 $mntpt/testfile.0 log_mustnot eval "cat $mntpt/testfile.0 > /dev/null" -log_must eval "zpool events $TESTPOOL | grep -q 'data'" +if ! is_freebsd; then + # Events are not supported on FreeBSD + log_must eval "zpool events $TESTPOOL | grep -q 'data'" +fi log_pass "Injected decompression errors are handled correctly" diff --git a/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh b/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh index ca698f7783..d81c4b2bd2 100755 --- a/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh +++ b/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh @@ -50,6 +50,9 @@ log_must zfs umount $TESTPOOL/fs log_must zfs mount $TESTPOOL/fs log_mustnot eval "cat $mntpt/file1 > /dev/null" -log_must eval "zpool events $TESTPOOL | grep -q 'authentication'" +# Events are not supported on FreeBSD +if ! 
is_freebsd; then + log_must eval "zpool events $TESTPOOL | grep -q 'authentication'" +fi log_pass "Injected decryption errors are handled correctly" diff --git a/tests/zfs-tests/tests/functional/fault/fault.cfg b/tests/zfs-tests/tests/functional/fault/fault.cfg index 25601a71a3..839330ed47 100644 --- a/tests/zfs-tests/tests/functional/fault/fault.cfg +++ b/tests/zfs-tests/tests/functional/fault/fault.cfg @@ -47,8 +47,6 @@ if is_linux; then devs_id[1]=$(get_persistent_disk_name $DISK2) devs_id[2]=$(get_persistent_disk_name $DISK3) export devs_id -else - DEV_DSKDIR="/dev" fi export VDEV_FILES="$TEST_BASE_DIR/file-1 $TEST_BASE_DIR/file-2 \ diff --git a/tests/zfs-tests/tests/functional/fault/scrub_after_resilver.ksh b/tests/zfs-tests/tests/functional/fault/scrub_after_resilver.ksh index a5b58ec8ff..db4a4ad55e 100755 --- a/tests/zfs-tests/tests/functional/fault/scrub_after_resilver.ksh +++ b/tests/zfs-tests/tests/functional/fault/scrub_after_resilver.ksh @@ -42,6 +42,7 @@ function cleanup # Restore our zed.rc log_must zed_rc_restore $zedrc_backup default_cleanup_noexit + log_must zpool labelclear -f $DISK1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh b/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh index b6a3e71fdf..85f0083a0e 100755 --- a/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh +++ b/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh @@ -41,14 +41,14 @@ DISK=${DISKS%% *} verify_runnable "both" -log_must zpool create $TESTPOOL mirror ${DISKS} +default_mirror_setup_noexit $DISKS function cleanup { log_must zinject -c all - log_must set_tunable64 zio_slow_io_ms $OLD_SLOW_IO - log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS - log_must destroy_pool $TESTPOOL + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + default_cleanup_noexit } log_onexit cleanup @@ -56,10 +56,10 @@ log_onexit cleanup 
log_must zpool events -c # Mark any IOs greater than 10ms as slow IOs -OLD_SLOW_IO=$(get_tunable zio_slow_io_ms) -OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second) -log_must set_tunable64 zio_slow_io_ms 10 -log_must set_tunable64 zfs_slow_io_events_per_second 1000 +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) +log_must set_tunable64 ZIO_SLOW_IO_MS 10 +log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 # Create 20ms IOs log_must zinject -d $DISK -D20:100 $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh b/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh index 20b61da92d..ad0e49f8fb 100755 --- a/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh @@ -48,8 +48,8 @@ verify_runnable "both" function cleanup { - datasetexists $TEST_FS && log_must zfs destroy $TEST_FS - log_must set_tunable64 zfs_async_block_max_blocks 100000 + datasetexists $TEST_FS && destroy_dataset $TEST_FS + log_must set_tunable64 ASYNC_BLOCK_MAX_BLOCKS 100000 } log_onexit cleanup @@ -64,7 +64,7 @@ log_must dd bs=1024k count=128 if=/dev/zero of=/$TEST_FS/file # Decrease the max blocks to free each txg, so that freeing takes # long enough that we can observe it. # -log_must set_tunable64 zfs_async_block_max_blocks 100 +log_must set_tunable64 ASYNC_BLOCK_MAX_BLOCKS 100 log_must sync log_must zfs destroy $TEST_FS @@ -88,7 +88,7 @@ done # per txg. # sleep 10 -log_must set_tunable64 zfs_async_block_max_blocks 100000 +log_must set_tunable64 ASYNC_BLOCK_MAX_BLOCKS 100000 # Wait for everything to be freed. 
while [[ "0" != "$(zpool list -Ho freeing $TESTPOOL)" ]]; do diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh index d3530292e8..cb1e940a7d 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh @@ -47,7 +47,7 @@ verify_runnable "both" function cleanup { - datasetexists $TEST_FS && log_must zfs destroy $TEST_FS + datasetexists $TEST_FS && destroy_dataset $TEST_FS } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh index c2b32ad662..9a00ceeb3c 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh @@ -48,7 +48,7 @@ verify_runnable "both" function cleanup { - datasetexists $TEST_FS && log_must zfs destroy $TEST_FS + datasetexists $TEST_FS && destroy_dataset $TEST_FS } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh index 3fa1cabe06..2cc587b478 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh @@ -44,9 +44,7 @@ TEST_STREAM=$TESTDIR/ldnsnap function cleanup { - if datasetexists $TEST_FS ; then - log_must zfs destroy -r $TEST_FS - fi + datasetexists $TEST_FS && destroy_dataset $TEST_FS -r if datasetexists $LGCYPOOL ; then log_must zpool destroy -f $LGCYPOOL diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh 
b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh index a2d92673b1..2be9894263 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh @@ -40,13 +40,8 @@ TEST_FILEINCR=bar function cleanup { - if datasetexists $TEST_SEND_FS ; then - log_must zfs destroy -r $TEST_SEND_FS - fi - - if datasetexists $TEST_RECV_FS ; then - log_must zfs destroy -r $TEST_RECV_FS - fi + datasetexists $TEST_SEND_FS && destroy_dataset $TEST_SEND_FS -r + datasetexists $TEST_RECV_FS && destroy_dataset $TEST_RECV_FS -r rm -f $TEST_STREAM rm -f $TEST_STREAMINCR diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh index 38b4ac52e5..3727bd5c11 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh @@ -49,7 +49,7 @@ verify_runnable "both" function cleanup { - datasetexists $TEST_FS && log_must zfs destroy $TEST_FS + datasetexists $TEST_FS && destroy_dataset $TEST_FS } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh index eac292cbe0..71e1751713 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh @@ -39,7 +39,7 @@ verify_runnable "both" function cleanup { - datasetexists $TEST_FS && log_must zfs destroy $TEST_FS + datasetexists $TEST_FS && destroy_dataset $TEST_FS } function verify_dnode_packing @@ -71,6 +71,7 @@ for ((i=0; i < 100; i++)); do done log_must wait +sync_pool $TESTPOOL verify_dnode_packing diff --git 
a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh index fa746c52e5..1e42202069 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh @@ -39,7 +39,7 @@ verify_runnable "both" function cleanup { - datasetexists $TEST_FS && log_must zfs destroy $TEST_FS + datasetexists $TEST_FS && destroy_dataset $TEST_FS } log_onexit cleanup @@ -64,7 +64,7 @@ done log_must wait -log_must zpool export $TESTPOOL +log_must_busy zpool export $TESTPOOL log_must zpool import $TESTPOOL log_must ls -lR "/$TEST_FS/" >/dev/null 2>&1 log_must zdb -d $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/history/history.cfg b/tests/zfs-tests/tests/functional/history/history.cfg index bbbd612a66..e9200a2b50 100644 --- a/tests/zfs-tests/tests/functional/history/history.cfg +++ b/tests/zfs-tests/tests/functional/history/history.cfg @@ -37,7 +37,11 @@ export TMP_HISTORY=$TEST_BASE_DIR/tmp_history.$$ export NEW_HISTORY=$TEST_BASE_DIR/new_history.$$ export MIGRATEDPOOLNAME=${MIGRATEDPOOLNAME:-history_pool} -export TIMEZONE=${TIMEZONE:-US/Mountain} +if is_freebsd; then + export TIMEZONE=${TIMEZONE:-America/Denver} +else + export TIMEZONE=${TIMEZONE:-US/Mountain} +fi export HIST_USER="huser" export HIST_GROUP="hgroup" diff --git a/tests/zfs-tests/tests/functional/history/history_001_pos.ksh b/tests/zfs-tests/tests/functional/history/history_001_pos.ksh index e22aaa33db..f33265185d 100755 --- a/tests/zfs-tests/tests/functional/history/history_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/history/history_001_pos.ksh @@ -115,7 +115,7 @@ import_dir=$TEST_BASE_DIR/import_dir.$$ log_must mkdir $import_dir log_must cp $STF_SUITE/tests/functional/history/zfs-pool-v4.dat.Z $import_dir log_must uncompress $import_dir/zfs-pool-v4.dat.Z -upgrade_pool=$(zpool import -d $import_dir | 
grep "pool:" | awk '{print $2}') +upgrade_pool=$(zpool import -d $import_dir | awk '/pool:/ { print $2 }') log_must zpool import -d $import_dir $upgrade_pool run_and_verify -p "$upgrade_pool" "zpool upgrade $upgrade_pool" diff --git a/tests/zfs-tests/tests/functional/history/history_002_pos.ksh b/tests/zfs-tests/tests/functional/history/history_002_pos.ksh index 5533287457..b431cdc5f1 100755 --- a/tests/zfs-tests/tests/functional/history/history_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/history/history_002_pos.ksh @@ -49,7 +49,7 @@ function cleanup [[ -f $tmpfile ]] && rm -f $tmpfile [[ -f $tmpfile2 ]] && rm -f $tmpfile2 for dataset in $fs $newfs $fsclone $vol $newvol $volclone; do - datasetexists $dataset && zfs destroy -Rf $dataset + datasetexists $dataset && destroy_dataset $dataset -Rf done rm -rf /history.$$ } @@ -72,8 +72,8 @@ props=( mountpoint /history.$$ mountpoint legacy mountpoint none compression lz4 compression on compression off - compression lzjb acltype noacl - acltype posixacl xattr sa + compression lzjb acltype off + acltype posix acltype nfsv4 atime on atime off devices on devices off exec on exec off @@ -84,9 +84,39 @@ props=( aclinherit discard aclinherit noallow aclinherit secure aclinherit passthrough canmount off canmount on - xattr on xattr off compression gzip compression gzip-$((RANDOM%9 + 1)) - copies $((RANDOM%3 + 1)) + compression zstd compression zstd-$((RANDOM%9 + 1)) + compression zstd-fast copies $((RANDOM%3 + 1)) + compression zstd-fast-$((RANDOM%9 + 1)) xattr sa + xattr on xattr off +) +elif is_freebsd; then +# property value property value +# +props=( + quota 64M recordsize 512 + reservation 32M reservation none + mountpoint /history.$$ mountpoint legacy + mountpoint none sharenfs on + sharenfs off + compression on compression off + compression lzjb aclmode discard + aclmode groupmask aclmode passthrough + atime on atime off + devices on devices off + exec on exec off + setuid on setuid off + readonly on readonly off + 
jailed on jailed off + snapdir hidden snapdir visible + aclinherit discard aclinherit noallow + aclinherit secure aclinherit passthrough + canmount off canmount on + compression gzip compression gzip-$((RANDOM%9 + 1)) + compression zstd compression zstd-$((RANDOM%9 + 1)) + compression zstd-fast copies $((RANDOM%3 + 1)) + compression zstd-fast-$((RANDOM%9 + 1)) acltype off + acltype posix acltype nfsv4 ) else # property value property value diff --git a/tests/zfs-tests/tests/functional/history/history_003_pos.ksh b/tests/zfs-tests/tests/functional/history/history_003_pos.ksh index 4ecee3ba0c..46af53f8af 100755 --- a/tests/zfs-tests/tests/functional/history/history_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/history/history_003_pos.ksh @@ -65,9 +65,7 @@ log_must zpool create $spool $VDEV0 log_must zfs create $spool/$sfs typeset -i orig_count=$(zpool history $spool | wc -l) -typeset orig_md5=$(zpool history $spool | head -2 | md5sum | \ - awk '{print $1}') - +typeset orig_md5=$(zpool history $spool | head -2 | md5digest) typeset -i i=0 while ((i < 300)); do zfs set compression=off $spool/$sfs @@ -82,7 +80,7 @@ done TMPFILE=$TEST_BASE_DIR/spool.$$ zpool history $spool >$TMPFILE typeset -i entry_count=$(wc -l $TMPFILE | awk '{print $1}') -typeset final_md5=$(head -2 $TMPFILE | md5sum | awk '{print $1}') +typeset final_md5=$(head -2 $TMPFILE | md5digest) grep 'zpool create' $TMPFILE >/dev/null 2>&1 || log_fail "'zpool create' was not found in pool history" diff --git a/tests/zfs-tests/tests/functional/history/history_005_neg.ksh b/tests/zfs-tests/tests/functional/history/history_005_neg.ksh index f6a81a4ac5..297a701cc5 100755 --- a/tests/zfs-tests/tests/functional/history/history_005_neg.ksh +++ b/tests/zfs-tests/tests/functional/history/history_005_neg.ksh @@ -42,9 +42,9 @@ # zpool iostat # # STRATEGY: -# 1. Create a test pool. +# 1. Create a test pool # 2. Separately invoke zpool list|status|iostat -# 3. Verify they was not recored in pool history. +# 3. 
Verify they were not recorded in pool history # verify_runnable "global" diff --git a/tests/zfs-tests/tests/functional/history/history_006_neg.ksh b/tests/zfs-tests/tests/functional/history/history_006_neg.ksh index a2da831c5c..19b7114faf 100755 --- a/tests/zfs-tests/tests/functional/history/history_006_neg.ksh +++ b/tests/zfs-tests/tests/functional/history/history_006_neg.ksh @@ -40,16 +40,14 @@ # STRATEGY: # 1. Create a test pool. # 2. Separately invoke zfs list|get|holds|mount|unmount|share|unshare|send -# 3. Verify they were not recored in pool history. +# 3. Verify they were not recorded in pool history. # verify_runnable "global" function cleanup { - if datasetexists $fs ; then - log_must zfs destroy -rf $fs - fi + datasetexists $fs && destroy_dataset $fs -rf log_must zfs create $fs } diff --git a/tests/zfs-tests/tests/functional/history/history_007_pos.ksh b/tests/zfs-tests/tests/functional/history/history_007_pos.ksh index b65e855d8c..591d5b85e8 100755 --- a/tests/zfs-tests/tests/functional/history/history_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/history/history_007_pos.ksh @@ -62,7 +62,7 @@ migratedpoolname=$MIGRATEDPOOLNAME typeset -i RET=1 typeset -i linenum=0 -[[ ! -d $import_dir ]] && log_must mkdir $import_dir +[[ ! -d $import_dir ]] && log_must mkdir -p $import_dir # We test the migrations on both uniform platform and cross platform for arch in "i386" "sparc"; do @@ -73,7 +73,7 @@ for arch in "i386" "sparc"; do cat $orig_cmds_f | grep -v "^$" > $orig_cmds_f1 log_must cp $tst_dir/${arch}.migratedpool.DAT.Z $import_dir - log_must uncompress $import_dir/${arch}.migratedpool.DAT.Z + log_must uncompress -f $import_dir/${arch}.migratedpool.DAT.Z # destroy the pool with same name, so that import operation succeeds. poolexists $migratedpoolname && \ @@ -83,7 +83,7 @@ for arch in "i386" "sparc"; do TZ=$TIMEZONE zpool history $migratedpoolname | grep -v "^$" \ >$migrated_cmds_f RET=$? 
- (( $RET != 0 )) && log_fail "zpool histroy $migratedpoolname fails." + (( $RET != 0 )) && log_fail "zpool history $migratedpoolname fails." # The migrated history file should differ with original history file on # two commands -- 'export' and 'import', which are included in migrated diff --git a/tests/zfs-tests/tests/functional/history/history_008_pos.ksh b/tests/zfs-tests/tests/functional/history/history_008_pos.ksh index 996c7658c3..8e174dcb7e 100755 --- a/tests/zfs-tests/tests/functional/history/history_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/history/history_008_pos.ksh @@ -47,9 +47,7 @@ verify_runnable "global" function cleanup { - if datasetexists $root_testfs; then - log_must zfs destroy -rf $root_testfs - fi + datasetexists $root_testfs && destroy_dataset $root_testfs -rf log_must zfs create $root_testfs } diff --git a/tests/zfs-tests/tests/functional/history/history_010_pos.ksh b/tests/zfs-tests/tests/functional/history/history_010_pos.ksh index 31fe8ec54d..2c32b1b6ce 100755 --- a/tests/zfs-tests/tests/functional/history/history_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/history/history_010_pos.ksh @@ -47,7 +47,7 @@ function cleanup { del_user $HIST_USER del_group $HIST_GROUP - datasetexists $root_testfs && log_must zfs destroy -rf $root_testfs + datasetexists $root_testfs && destroy_dataset $root_testfs -rf } log_assert "Verify internal long history information are correct." 
diff --git a/tests/zfs-tests/tests/functional/history/history_common.kshlib b/tests/zfs-tests/tests/functional/history/history_common.kshlib index 80af2e903d..ff3260f3c0 100644 --- a/tests/zfs-tests/tests/functional/history/history_common.kshlib +++ b/tests/zfs-tests/tests/functional/history/history_common.kshlib @@ -51,11 +51,11 @@ function run_and_verify fullcmd="$1" flags="$2" - if is_linux; then + if is_illumos; then + histcmd=$(echo $fullcmd | sed 's/\/usr\/sbin\///g') + else histcmd=$(echo $fullcmd | sed 's/^.*\/\(zpool .*\).*$/\1/') histcmd=$(echo $histcmd | sed 's/^.*\/\(zfs .*\).*$/\1/') - else - histcmd=$(echo $fullcmd | sed 's/\/usr\/sbin\///g') fi cmd=$(echo $histcmd | awk '{print $1}') @@ -72,9 +72,9 @@ function run_and_verify # Run the command as the specified user, and find the new history. zpool history $flags $pool > $OLD_HISTORY 2>/dev/null if [[ $user == "root" ]]; then - log_must eval "$fullcmd" + log_must_busy eval "$fullcmd" else - log_must user_run $user "$fullcmd" + log_must_busy user_run $user "$fullcmd" fi zpool history $flags $pool > $TMP_HISTORY 2>/dev/null diff $OLD_HISTORY $TMP_HISTORY | grep "^> " | sed 's/^> //g' \ @@ -110,12 +110,13 @@ function verify_long fi typeset suffix="" - if [ is_linux ]; then + if is_linux; then suffix=":linux" + elif is_freebsd; then + suffix=":freebsd" fi - grep "$cmd \[user $uid ($user) on $hname$suffix\]" \ - $NEW_HISTORY >/dev/null 2>&1 + grep -q "$cmd \[user $uid ($user) on $hname$suffix\]" $NEW_HISTORY if [[ $? != 0 ]]; then log_note "Couldn't find long information for \"$cmd\"" return 1 @@ -224,7 +225,7 @@ function verify_allow # # Here, we determine three things: - # - Whether we're operating on a set or an indivdual permission (which + # - Whether we're operating on a set or an individual permission (which # dictates the case of the first character in the code) # - The name of the dataset we're operating on. 
# - Whether the operation applies locally or to descendent datasets (or @@ -364,7 +365,7 @@ function verify_destroy typeset cmd=$1 typeset flags=$3 - # This function doesn't currently verifiy the zpool command. + # This function doesn't currently verify the zpool command. [[ ${cmd%% *} == "zfs" ]] || return 1 [[ $flags =~ "i" ]] || return 1 diff --git a/tests/zfs-tests/tests/functional/hkdf/Makefile.am b/tests/zfs-tests/tests/functional/hkdf/Makefile.am index d0a68f442f..8ac9053223 100644 --- a/tests/zfs-tests/tests/functional/hkdf/Makefile.am +++ b/tests/zfs-tests/tests/functional/hkdf/Makefile.am @@ -1,12 +1,5 @@ include $(top_srcdir)/config/Rules.am -AM_CPPFLAGS += -I$(top_srcdir)/include -AM_CPPFLAGS += -I$(top_srcdir)/lib/libspl/include -LDADD = $(top_srcdir)/lib/libicp/libicp.la -LDADD += $(top_srcdir)/lib/libzpool/libzpool.la - -AUTOMAKE_OPTIONS = subdir-objects - pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/hkdf dist_pkgdata_SCRIPTS = \ @@ -20,3 +13,5 @@ pkgexec_PROGRAMS = \ hkdf_test hkdf_test_SOURCES = hkdf_test.c +hkdf_test_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la diff --git a/tests/zfs-tests/tests/functional/inheritance/Makefile.am b/tests/zfs-tests/tests/functional/inheritance/Makefile.am index 82de708d63..3c624621f2 100644 --- a/tests/zfs-tests/tests/functional/inheritance/Makefile.am +++ b/tests/zfs-tests/tests/functional/inheritance/Makefile.am @@ -4,6 +4,8 @@ dist_pkgdata_SCRIPTS = \ inherit_001_pos.ksh dist_pkgdata_DATA = \ + README.config \ + README.state \ inherit.kshlib \ config001.cfg \ config002.cfg \ diff --git a/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh b/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh index 76bd05ce57..7c5b812877 100755 --- a/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh @@ -401,20 +401,20 @@ set -A local_val "off" "on" "off" \ # # Add system specific values # - 
-if ! is_linux; then - prop+=("aclmode" "" \ - "mountpoint" "") - def_val+=("discard" \ - "") - local_val+=("groupmask" \ - "$TESTDIR") -else +if is_linux; then prop+=("acltype" "") def_val+=("off") local_val+=("off") +else + prop+=("aclmode" "") + def_val+=("discard") + local_val+=("groupmask") +fi +if is_illumos; then + prop+=("mountpoint" "") + def_val+=("") + local_val+=("$TESTDIR") fi - # # Global flag indicating whether the default record size had been diff --git a/tests/zfs-tests/tests/functional/inuse/inuse.cfg b/tests/zfs-tests/tests/functional/inuse/inuse.cfg index bbc32f1f10..631ace7ab7 100644 --- a/tests/zfs-tests/tests/functional/inuse/inuse.cfg +++ b/tests/zfs-tests/tests/functional/inuse/inuse.cfg @@ -30,101 +30,25 @@ . $STF_SUITE/include/libtest.shlib -if is_linux; then - export DISKSARRAY=$DISKS - export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}') - set_device_dir - set_slice_prefix - export SLICE0=1 - export SLICE1=2 -else - export SLICE_PREFIX="s" - export SLICE0=0 - export SLICE1=1 -fi - -verify_disk_count "$DISKS" 2 set -A disk_array $(find_disks $DISKS) -case "${#disk_array[@]}" in -2) - FS_DISK0=${disk_array[0]} - FS_DISK1=${disk_array[1]} - FS_DISK2=${disk_array[0]} - FS_DISK3=${disk_array[1]} - FS_SIDE0=${FS_DISK0}${SLICE_PREFIX}${SLICE0} - FS_SIDE1=${FS_DISK0}${SLICE_PREFIX}${SLICE1} - FS_SIDE2=${FS_DISK1}${SLICE_PREFIX}${SLICE0} - FS_SIDE3=${FS_DISK1}${SLICE_PREFIX}${SLICE1} - disk0="${DEV_DSKDIR}/$FS_SIDE0" - disk1="${DEV_DSKDIR}/$FS_SIDE1" - disk2="${DEV_DSKDIR}/$FS_SIDE2" - disk3="${DEV_DSKDIR}/$FS_SIDE3" - disktargets="$disk0 $disk2" - rawdisk0="${DEV_RDSKDIR}/$FS_SIDE0" - rawdisk1="${DEV_RDSKDIR}/$FS_SIDE1" - rawdisk2="${DEV_RDSKDIR}/$FS_SIDE2" - rawdisk3="${DEV_RDSKDIR}/$FS_SIDE3" - rawtargets="$rawdisk0 $rawdisk2" - vdisks="$FS_DISK0" - sdisks="$FS_DISK1" - vslices="$FS_SIDE0 $FS_SIDE1 $FS_SIDE2" - sslices="$FS_SIDE3" - ;; -3) - FS_DISK0=${disk_array[0]} - FS_DISK1=${disk_array[1]} - FS_DISK2=${disk_array[2]} - 
FS_DISK3=${disk_array[0]} - FS_SIDE0=${FS_DISK0}${SLICE_PREFIX}${SLICE0} - FS_SIDE1=${FS_DISK0}${SLICE_PREFIX}${SLICE1} - FS_SIDE2=${FS_DISK1}${SLICE_PREFIX}${SLICE0} - FS_SIDE3=${FS_DISK2}${SLICE_PREFIX}${SLICE0} - disk0="${DEV_DSKDIR}/$FS_SIDE0" - disk1="${DEV_DSKDIR}/$FS_SIDE1" - disk2="${DEV_DSKDIR}/$FS_SIDE2" - disk3="${DEV_DSKDIR}/$FS_SIDE3" - disktargets="$disk0 $disk2 $disk3" - rawdisk0="${DEV_RDSKDIR}/$FS_SIDE0" - rawdisk1="${DEV_RDSKDIR}/$FS_SIDE1" - rawdisk2="${DEV_RDSKDIR}/$FS_SIDE2" - rawdisk3="${DEV_RDSKDIR}/$FS_SIDE3" - rawtargets="$rawdisk0 $rawdisk2 $rawdisk3" - vdisks="$FS_DISK0 $FS_DISK1" - sdisks="$FS_DISK2" - vslices="$FS_SIDE0 $FS_SIDE2 $FS_SIDE3" - sslices="$FS_SIDE1" - ;; -*) - FS_DISK0=${disk_array[0]} - FS_DISK1=${disk_array[1]} - FS_DISK2=${disk_array[2]} - FS_DISK3=${disk_array[3]} - FS_SIDE0=${FS_DISK0}${SLICE_PREFIX}${SLICE0} - FS_SIDE1=${FS_DISK1}${SLICE_PREFIX}${SLICE0} - FS_SIDE2=${FS_DISK2}${SLICE_PREFIX}${SLICE0} - FS_SIDE3=${FS_DISK3}${SLICE_PREFIX}${SLICE0} - disk0="${DEV_DSKDIR}/$FS_SIDE0" - disk1="${DEV_DSKDIR}/$FS_SIDE1" - disk2="${DEV_DSKDIR}/$FS_SIDE2" - disk3="${DEV_DSKDIR}/$FS_SIDE3" - disktargets="$disk0 $disk1 $disk2 $disk3" - rawdisk0="${DEV_RDSKDIR}/$FS_SIDE0" - rawdisk1="${DEV_RDSKDIR}/$FS_SIDE1" - rawdisk2="${DEV_RDSKDIR}/$FS_SIDE2" - rawdisk3="${DEV_RDSKDIR}/$FS_SIDE3" - rawtargets="$rawdisk0 $rawdisk1 $rawdisk2 $rawdisk3" - vdisks="$FS_DISK0 $FS_DISK1 $FS_DISK2" - sdisks="$FS_DISK3" - vslices="$FS_SIDE0 $FS_SIDE1 $FS_SIDE2" - sslices="$FS_SIDE3" - ;; -esac +FS_DISK0=${disk_array[0]} +FS_DISK1=${disk_array[1]} +FS_DISK2=${disk_array[2]} +disk0="${DEV_DSKDIR}/$FS_DISK0" +disk1="${DEV_DSKDIR}/$FS_DISK1" +disk2="${DEV_DSKDIR}/$FS_DISK2" +disktargets="$disk0 $disk1 $disk2" +rawdisk0="${DEV_RDSKDIR}/$FS_DISK0" +rawdisk1="${DEV_RDSKDIR}/$FS_DISK1" +rawdisk2="${DEV_RDSKDIR}/$FS_DISK2" +rawtargets="$rawdisk0 $rawdisk1 $rawdisk2" +vdisks="$FS_DISK0 $FS_DISK1" +sdisks="$FS_DISK2" -export FS_DISK0 FS_DISK1 FS_DISK2 FS_DISK3 
SINGLE_DISK -export FS_SIDE0 FS_SIDE1 FS_SIDE2 FS_SIDE3 -export disk0 disk1 disk2 disk3 disktargets -export rawdisk0 rawdisk1 rawdisk2 rawdisk3 rawtargets -export vdisks sdisks vslices sslices +export FS_DISK0 FS_DISK1 FS_DISK2 +export disk0 disk1 disk2 disktargets +export rawdisk0 rawdisk1 rawdisk2 rawtargets +export vdisks sdisks export UFSMP=$TESTDIR/testinuseufsdump export FS_SIZE=1g diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_001_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_001_pos.ksh index 63c68e66e4..f824661c00 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_001_pos.ksh @@ -43,10 +43,6 @@ verify_runnable "global" -if is_linux; then - log_unsupported "Test case isn't applicable to Linux" -fi - function cleanup { # @@ -64,27 +60,25 @@ log_assert "Ensure ZFS cannot use a device designated as a dump device" log_onexit cleanup typeset dumpdev="" -typeset diskslice="" PREVDUMPDEV=`dumpadm | grep "Dump device" | awk '{print $3}'` -log_note "Zero $FS_DISK0 and place free space in to slice 0" +log_note "Zero $FS_DISK0" log_must cleanup_devices $FS_DISK0 -diskslice="${DEV_DSKDIR}/${FS_DISK0}${SLICE0}" -log_note "Configuring $diskslice as dump device" -log_must dumpadm -d $diskslice > /dev/null +log_note "Configuring $rawdisk0 as dump device" +log_must dumpadm -d $rawdisk0 > /dev/null log_note "Confirm that dump device has been setup" dumpdev=`dumpadm | grep "Dump device" | awk '{print $3}'` [[ -z "$dumpdev" ]] && log_untested "No dump device has been configured" -[[ "$dumpdev" != "$diskslice" ]] && \ - log_untested "Dump device has not been been configured to $diskslice" +[[ "$dumpdev" != "$rawdisk0" ]] && \ + log_untested "Dump device has not been configured to $rawdisk0" log_note "Attempt to zpool the dump device" unset NOINUSE_CHECK -log_mustnot zpool create $TESTPOOL "$diskslice" +log_mustnot zpool create $TESTPOOL "$rawdisk0" log_mustnot poolexists $TESTPOOL log_pass 
"Unable to zpool a device in use by dumpadm" diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_003_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_003_pos.ksh index bdd79d9c4c..07d6ac1755 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_003_pos.ksh @@ -50,10 +50,6 @@ verify_runnable "global" -if is_linux; then - log_unsupported "Test case isn't applicable to Linux" -fi - function cleanup { poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 @@ -98,18 +94,9 @@ typeset restored_files="${UFSMP}/restored_files" typeset -i dirnum=0 typeset -i filenum=0 typeset cwd="" -typeset cyl="" - -for num in 0 1 2; do - eval typeset slice=\${FS_SIDE$num} - disk=${slice%s*} - slice=${slice##*${SLICE_PREFIX}} - log_must set_partition $slice "$cyl" $FS_SIZE $disk - cyl=$(get_endslice $disk $slice) -done log_note "Make a ufs filesystem on source $rawdisk1" -echo "y" | newfs -v $rawdisk1 > /dev/null 2>&1 +new_fs $rawdisk1 > /dev/null 2>&1 (($? 
!= 0)) && log_untested "Unable to create ufs filesystem on $rawdisk1" log_must mkdir -p $UFSMP @@ -149,7 +136,7 @@ log_mustnot zpool create $TESTPOOL1 "$disk1" log_mustnot poolexists $TESTPOOL1 log_note "Attempt to take the source device in use by ufsdump as spare device" -log_mustnot zpool create $TESTPOOL1 "$FS_SIDE2" spare "$disk1" +log_mustnot zpool create $TESTPOOL1 "$FS_DISK2" spare "$disk1" log_mustnot poolexists $TESTPOOL1 wait $PIDUFSDUMP @@ -175,7 +162,7 @@ log_mustnot poolexists $TESTPOOL2 log_note "Attempt to take the restored device in use by ufsrestore as spare" \ "device" -log_mustnot zpool create -f $TESTPOOL2 "$FS_SIDE2" spare "$disk1" +log_mustnot zpool create -f $TESTPOOL2 "$FS_DISK2" spare "$disk1" log_mustnot poolexists $TESTPOOL2 log_pass "Unable to zpool over a device in use by ufsdump or ufsrestore" diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh index 95d505f35b..a9725e06dc 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh @@ -48,8 +48,8 @@ verify_runnable "global" function cleanup { # - # Essentailly this is the default_cleanup routine but I cannot get it - # to work correctly. So its reproduced below. Still need to full + # Essentially this is the default_cleanup routine but I cannot get it + # to work correctly. So it's reproduced below. Still need to fully # understand why default_cleanup does not work correctly from here. # log_must zfs umount $TESTPOOL/$TESTFS @@ -72,6 +72,9 @@ function mini_format if is_linux; then parted $disk -s -- mklabel gpt typeset -i retval=$? + elif is_freebsd; then + gpart create -s gpt $disk + typeset -i retval=$? 
else typeset format_file=$TEST_BASE_DIR/format_in.$$.1 echo "partition" > $format_file diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_005_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_005_pos.ksh index 6b0abf429d..afe30d0599 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_005_pos.ksh @@ -58,15 +58,15 @@ function cleanup cleanup_devices $vdisks $sdisks } -function verify_assertion #slices +function verify_assertion #disks { typeset targets=$1 for t in $targets; do - echo "y" | newfs -v $t > /dev/null 2>&1 - (( $? !=0 )) || \ + if new_fs $t; then log_fail "newfs over active pool " \ - "unexpected return code of 0" + "unexpected return code of 0" + fi done return 0 @@ -82,39 +82,11 @@ typeset -i i=0 unset NOINUSE_CHECK while (( i < ${#vdevs[*]} )); do - for num in 0 1 2 3 ; do - eval typeset disk=\${FS_DISK$num} - zero_partitions $disk - done - typeset cyl="" - for num in 0 1 2 3 ; do - eval typeset slice=\${FS_SIDE$num} - disk=${slice%${SLICE_PREFIX}*} - [[ -z $SLICE_PREFIX ]] && eval typeset disk=\${FS_DISK$num} - slice=$(echo $slice | awk '{ print substr($1,length($1),1) }') - log_must set_partition $slice "$cyl" $FS_SIZE $disk - [[ $num < 3 ]] && cyl=$(get_endslice $disk $slice) - done + typeset spare="spare $sdisks" - if [[ -n $SINGLE_DISK && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - create_pool $TESTPOOL1 ${vdevs[i]} $vslices spare $sslices - verify_assertion "$rawtargets" - destroy_pool $TESTPOOL1 - - if [[ ( $FS_DISK0 == $FS_DISK2 ) && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - if [[ ( $FS_DISK0 == $FS_DISK3 ) && ( ${vdevs[i]} == "raidz2" ) ]]; then - (( i = i + 1 )) - continue - fi - create_pool $TESTPOOL1 ${vdevs[i]} $vdisks spare $sdisks + # If this is for raidz2, use 3 disks for the pool. 
+ [[ ${vdevs[i]} = "raidz2" ]] && spare="$sdisks" + create_pool $TESTPOOL1 ${vdevs[i]} $vdisks $spare verify_assertion "$rawtargets" destroy_pool $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_006_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_006_pos.ksh index 0ce45a661c..9657322526 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_006_pos.ksh @@ -44,10 +44,6 @@ verify_runnable "global" -if is_linux; then - log_unsupported "Test case isn't applicable to Linux" -fi - function cleanup { if [[ -n $PREVDUMPDEV ]]; then @@ -62,7 +58,7 @@ function cleanup cleanup_devices $vdisks $sdisks } -function verify_assertion #slices +function verify_assertion # disks { typeset targets=$1 @@ -85,39 +81,11 @@ PREVDUMPDEV=`dumpadm | grep "Dump device" | awk '{print $3}'` unset NOINUSE_CHECK while (( i < ${#vdevs[*]} )); do + typeset spare="spare $sdisks" - for num in 0 1 2 3 ; do - eval typeset disk=\${FS_DISK$num} - zero_partitions $disk - done - - for num in 0 1 2 3 ; do - eval typeset slice=\${FS_SIDE$num} - disk=${slice%${SLICE_PREFIX}*} - slice=${slice##*${SLICE_PREFIX}} - log_must set_partition $slice "" $FS_SIZE $disk - done - - if [[ -n $SINGLE_DISK && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - create_pool $TESTPOOL1 ${vdevs[i]} $vslices spare $sslices - verify_assertion "$disktargets" - destroy_pool $TESTPOOL1 - - if [[ ( $FS_DISK0 == $FS_DISK2 ) && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - if [[ ( $FS_DISK0 == $FS_DISK3 ) && ( ${vdevs[i]} == "raidz2" ) ]]; then - (( i = i + 1 )) - continue - fi - - create_pool $TESTPOOL1 ${vdevs[i]} $vdisks spare $sdisks + # If this is for raidz2, use 3 disks for the pool. 
+ [[ ${vdevs[i]} = "raidz2" ]] && spare="$sdisks" + create_pool $TESTPOOL1 ${vdevs[i]} $vdisks $spare verify_assertion "$disktargets" destroy_pool $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_007_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_007_pos.ksh index 22ac064ef3..b96b80890e 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_007_pos.ksh @@ -45,10 +45,6 @@ verify_runnable "global" -if is_linux; then - log_unsupported "Test case isn't applicable to Linux" -fi - function cleanup { if [[ -n $PREVDUMPDEV ]]; then @@ -65,7 +61,7 @@ function cleanup cleanup_devices $vdisks $sdisks } -function verify_assertion #slices +function verify_assertion # disks { typeset targets=$1 @@ -89,41 +85,11 @@ typeset -i i=0 PREVDUMPDEV=`dumpadm | grep "Dump device" | awk '{print $3}'` while (( i < ${#vdevs[*]} )); do + typeset spare="spare $sdisks" - for num in 0 1 2 3 ; do - eval typeset disk=\${FS_DISK$num} - zero_partitions $disk - done - - for num in 0 1 2 3 ; do - eval typeset slice=\${FS_SIDE$num} - disk=${slice%${SLICE_PREFIX}*} - slice=${slice##*${SLICE_PREFIX}} - log_must set_partition $slice "" $FS_SIZE $disk - done - - if [[ -n $SINGLE_DISK && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - create_pool $TESTPOOL1 ${vdevs[i]} $vslices spare $sslices - log_must zpool export $TESTPOOL1 - verify_assertion "$disktargets" - log_must zpool import $TESTPOOL1 - destroy_pool $TESTPOOL1 - - if [[ ( $FS_DISK0 == $FS_DISK2 ) && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - if [[ ( $FS_DISK0 == $FS_DISK3 ) && ( ${vdevs[i]} == "raidz2" ) ]]; then - (( i = i + 1 )) - continue - fi - - create_pool $TESTPOOL1 ${vdevs[i]} $vdisks spare $sdisks + # If this is for raidz2, use 3 disks for the pool. 
+ [[ ${vdevs[i]} = "raidz2" ]] && spare="$sdisks" + create_pool $TESTPOOL1 ${vdevs[i]} $vdisks $spare log_must zpool export $TESTPOOL1 verify_assertion "$disktargets" log_must zpool import $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_008_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_008_pos.ksh index ddc8fa7a49..d60ebcee15 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_008_pos.ksh @@ -61,15 +61,15 @@ function cleanup cleanup_devices $vdisks $sdisks } -function verify_assertion #slices +function verify_assertion # disks { typeset targets=$1 for t in $targets; do - echo "y" | newfs -v $t > /dev/null 2>&1 - (( $? !=0 )) && \ + if ! new_fs $t; then log_fail "newfs over exported pool " \ - "failes unexpected." + "fails unexpectedly." + fi done return 0 @@ -82,32 +82,14 @@ log_onexit cleanup set -A vdevs "" "mirror" "raidz" "raidz1" "raidz2" typeset -i i=0 -typeset cyl="" - -for num in 0 1 2 3 ; do - eval typeset disk=\${FS_DISK$num} - zero_partitions $disk -done - -for num in 0 1 2 3 ; do - eval typeset slice=\${FS_SIDE$num} - disk=${slice%${SLICE_PREFIX}*} - [[ -z $SLICE_PREFIX ]] && eval typeset disk=\${FS_DISK$num} - slice=$(echo $slice | awk '{ print substr($1,length($1),1) }') - log_must set_partition $slice "$cyl" $FS_SIZE $disk - [[ $num < 3 ]] && cyl=$(get_endslice $disk $slice) -done - while (( i < ${#vdevs[*]} )); do - if [[ -n $SINGLE_DISK && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi + typeset spare="spare $sdisks" - create_pool $TESTPOOL1 ${vdevs[i]} $vslices spare $sslices + # If this is for raidz2, use 3 disks for the pool. 
+ [[ ${vdevs[i]} = "raidz2" ]] && spare="$sdisks" + create_pool $TESTPOOL1 ${vdevs[i]} $vdisks $spare log_must zpool export $TESTPOOL1 verify_assertion "$rawtargets" - cleanup_devices $vslices $sslices (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/inuse/inuse_009_pos.ksh b/tests/zfs-tests/tests/functional/inuse/inuse_009_pos.ksh index a5e9fda596..54d201ad62 100755 --- a/tests/zfs-tests/tests/functional/inuse/inuse_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/inuse/inuse_009_pos.ksh @@ -61,7 +61,7 @@ function cleanup cleanup_devices $vdisks $sdisks } -function verify_assertion #disks +function verify_assertion # disks { typeset targets=$1 @@ -79,44 +79,12 @@ log_onexit cleanup set -A vdevs "" "mirror" "raidz" "raidz1" "raidz2" typeset -i i=0 - while (( i < ${#vdevs[*]} )); do + typeset spare="spare $sdisks" - for num in 0 1 2 3 ; do - eval typeset disk=\${FS_DISK$num} - zero_partitions $disk - done - - typeset cyl="" - for num in 0 1 2 3 ; do - eval typeset slice=\${FS_SIDE$num} - disk=${slice%${SLICE_PREFIX}*} - [[ -z $SLICE_PREFIX ]] && eval typeset disk=\${FS_DISK$num} - slice=$(echo $slice | awk '{ print substr($1,length($1),1) }') - log_must set_partition $slice "$cyl" $FS_SIZE $disk - [[ $num < 3 ]] && cyl=$(get_endslice $disk $slice) - done - - if [[ -n $SINGLE_DISK && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - create_pool $TESTPOOL1 ${vdevs[i]} $vslices spare $sslices - log_must zpool export $TESTPOOL1 - verify_assertion "$vdisks $sdisks" - - if [[ ( $FS_DISK0 == $FS_DISK2 ) && -n ${vdevs[i]} ]]; then - (( i = i + 1 )) - continue - fi - - if [[ ( $FS_DISK0 == $FS_DISK3 ) && ( ${vdevs[i]} == "raidz2" ) ]]; then - (( i = i + 1 )) - continue - fi - - create_pool $TESTPOOL1 ${vdevs[i]} $vdisks spare $sdisks + # If this is for raidz2, use 3 disks for the pool. 
+ [[ ${vdevs[i]} = "raidz2" ]] && spare="$sdisks" + create_pool $TESTPOOL1 ${vdevs[i]} $vdisks $spare log_must zpool export $TESTPOOL1 verify_assertion "$vdisks $sdisks" diff --git a/tests/zfs-tests/tests/functional/io/Makefile.am b/tests/zfs-tests/tests/functional/io/Makefile.am index 5253f08a05..44c0d02d6e 100644 --- a/tests/zfs-tests/tests/functional/io/Makefile.am +++ b/tests/zfs-tests/tests/functional/io/Makefile.am @@ -5,6 +5,7 @@ dist_pkgdata_SCRIPTS = \ sync.ksh \ psync.ksh \ libaio.ksh \ + io_uring.ksh \ posixaio.ksh \ mmap.ksh diff --git a/tests/zfs-tests/tests/functional/io/io_uring.ksh b/tests/zfs-tests/tests/functional/io/io_uring.ksh new file mode 100755 index 0000000000..2d2b18f8bb --- /dev/null +++ b/tests/zfs-tests/tests/functional/io/io_uring.ksh @@ -0,0 +1,72 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/io/io.cfg + +# +# DESCRIPTION: +# Verify Linux io_uring. +# +# STRATEGY: +# 1. Use fio(1) in verify mode to perform write, read, +# random read, and random write workloads. +# 2. Repeat the test with additional fio(1) options. 
+# + +verify_runnable "global" + + +if [[ $(linux_version) -lt $(linux_version "5.1") ]]; then + log_unsupported "Requires io_uring support" +fi + +fio --ioengine=io_uring --parse-only || log_unsupported "io_uring support required" + +function cleanup +{ + log_must rm -f "$mntpnt/rw*" +} + +log_assert "Verify Linux io_uring" + +log_onexit cleanup + +ioengine="--ioengine=io_uring" +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +dir="--directory=$mntpnt" + +set -A fio_arg -- "--sync=0" "--sync=1" "--direct=0" "--direct=1" + +for arg in "${fio_arg[@]}"; do + log_must fio $dir $ioengine $arg $FIO_WRITE_ARGS + log_must fio $dir $ioengine $arg $FIO_READ_ARGS + log_must fio $dir $ioengine $arg $FIO_RANDWRITE_ARGS + log_must fio $dir $ioengine $arg $FIO_RANDREAD_ARGS + log_must rm -f "$mntpnt/rw*" +done + +log_pass "Verified Linux io_uring" diff --git a/tests/zfs-tests/tests/functional/l2arc/Makefile.am b/tests/zfs-tests/tests/functional/l2arc/Makefile.am new file mode 100644 index 0000000000..09f4c1d0d7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/Makefile.am @@ -0,0 +1,15 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/l2arc +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + setup.ksh \ + l2arc_arcstats_pos.ksh \ + l2arc_l2miss_pos.ksh \ + l2arc_mfuonly_pos.ksh \ + persist_l2arc_001_pos.ksh \ + persist_l2arc_002_pos.ksh \ + persist_l2arc_003_neg.ksh \ + persist_l2arc_004_pos.ksh \ + persist_l2arc_005_pos.ksh + +dist_pkgdata_DATA = \ + l2arc.cfg diff --git a/tests/zfs-tests/tests/functional/l2arc/cleanup.ksh b/tests/zfs-tests/tests/functional/l2arc/cleanup.ksh new file mode 100755 index 0000000000..c3d88e3ffc --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +verify_runnable "global" + +if poolexists $TESTPOOL ; then + log_must destroy_pool $TESTPOOL +fi + +log_must rm -rf $VDIR + +log_pass diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_remap.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg old mode 100755 new mode 100644 similarity index 54% rename from tests/zfs-tests/tests/functional/removal/removal_with_remap.ksh rename to tests/zfs-tests/tests/functional/l2arc/l2arc.cfg index 6f56740b82..0302392f4c --- a/tests/zfs-tests/tests/functional/removal/removal_with_remap.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg @@ -1,4 +1,4 @@ -#! /bin/ksh -p +#!/bin/ksh -p # # CDDL HEADER START # @@ -15,18 +15,24 @@ # # -# Copyright (c) 2015, 2017 by Delphix. All rights reserved. +# Copyright (c) 2020, George Amanakis. All rights reserved. # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/removal/removal.kshlib -# N.B. The 'zfs remap' command has been disabled and may be removed. 
-export ZFS_REMAP_ENABLED=YES +export SIZE=1G +export VDIR=$TESTDIR/disk.l2arc +export VDEV="$VDIR/a" +export VDEV_CACHE="$VDIR/b" +export VDEV1="$VDIR/c" -default_setup_noexit "$DISKS" -log_onexit default_cleanup_noexit - -test_removal_with_operation zfs remap $TESTPOOL/$TESTFS - -log_pass "Can remap a filesystem during removal" +# fio options +export DIRECTORY=/$TESTPOOL +export NUMJOBS=4 +export RUNTIME=10 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh new file mode 100755 index 0000000000..3e76347b02 --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# L2ARC MFU/MRU arcstats do not leak +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool, smaller than the cache device +# and random read for 10 sec. +# 3. Read l2arc_mfu_asize and l2arc_mru_asize +# 4. Export pool. +# 5. Verify l2arc_mfu_asize and l2arc_mru_asize are 0. +# 6. Import pool. +# 7. Random read for 10 sec. +# 8. Read l2arc_mfu_asize and l2arc_mru_asize +# 9. Verify that L2ARC MFU increased and MFU+MRU = L2_asize. 
+# + +verify_runnable "global" + +log_assert "L2ARC MFU/MRU arcstats do not leak." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +log_must set_tunable32 L2ARC_NOPREFETCH 0 + +typeset fill_mb=800 +typeset cache_sz=$(( 1.4 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +arcstat_quiescence_noecho l2_size +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +typeset l2_mfu_init=$(get_arcstat l2_mfu_asize) +typeset l2_mru_init=$(get_arcstat l2_mru_asize) +typeset l2_prefetch_init=$(get_arcstat l2_prefetch_asize) +typeset l2_asize_init=$(get_arcstat l2_asize) + +log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size +log_must zpool export $TESTPOOL +arcstat_quiescence_noecho l2_feeds + +log_must test $(get_arcstat l2_mfu_asize) -eq 0 +log_must test $(get_arcstat l2_mru_asize) -eq 0 +log_must zpool import -d $VDIR $TESTPOOL +arcstat_quiescence_noecho l2_size + +log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +typeset l2_mfu_end=$(get_arcstat l2_mfu_asize) +typeset l2_mru_end=$(get_arcstat l2_mru_asize) +typeset l2_prefetch_end=$(get_arcstat l2_prefetch_asize) +typeset l2_asize_end=$(get_arcstat l2_asize) + +log_must test $(( $l2_mru_end + $l2_mfu_end + $l2_prefetch_end - \ + $l2_asize_end )) -eq 0 +log_must test $(( $l2_mru_init + $l2_mfu_init + $l2_prefetch_init - \ + $l2_asize_init )) -eq 0 + +log_must zpool destroy -f $TESTPOOL + +log_pass "L2ARC MFU/MRU 
arcstats do not leak." diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh new file mode 100755 index 0000000000..783484f52c --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Adam Moss. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# l2arc_misses does not increment upon reads from a pool without l2arc +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create pool without a cache device. +# 3. Create a random file in the no-cache-device pool, +# and random read for 10 sec. +# 4. Check that l2arc_misses hasn't risen +# 5. Create a random file in the pool with the cache device, +# and random read for 10 sec. +# 6. Check that l2arc_misses has risen +# + +verify_runnable "global" + +log_assert "l2arc_misses does not increment upon reads from a pool without l2arc." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + if poolexists $TESTPOOL1 ; then + destroy_pool $TESTPOOL1 + fi +} +log_onexit cleanup + +typeset fill_mb=800 +typeset cache_sz=$(( 1.4 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -O compression=off -f $TESTPOOL $VDEV cache $VDEV_CACHE +log_must zpool create -O compression=off -f $TESTPOOL1 $VDEV1 + +# I/O to pool without l2arc - expect that l2_misses stays constant +export DIRECTORY=/$TESTPOOL1 +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio +# attempt to remove entries for pool from ARC so we would try +# to hit the nonexistent L2ARC for subsequent reads +log_must zpool export $TESTPOOL1 +log_must zpool import $TESTPOOL1 -d $VDEV1 + +typeset starting_miss_count=$(get_arcstat l2_misses) + +log_must fio $FIO_SCRIPTS/random_reads.fio +log_must test $(get_arcstat l2_misses) -eq $starting_miss_count + +# I/O to pool with l2arc - expect that l2_misses rises +export DIRECTORY=/$TESTPOOL +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio +# wait for L2ARC writes to actually happen +arcstat_quiescence_noecho l2_size +# attempt to remove entries for pool from ARC so we would try +# to hit L2ARC for subsequent reads +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL -d $VDEV + +log_must fio $FIO_SCRIPTS/random_reads.fio +log_must test $(get_arcstat l2_misses) -gt $starting_miss_count + +log_must zpool destroy -f $TESTPOOL +log_must zpool destroy -f $TESTPOOL1 + +log_pass "l2arc_misses does not increment upon reads from a pool without l2arc." 
diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh new file mode 100755 index 0000000000..5d0198c90c --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# l2arc_mfuonly does not cache MRU buffers +# +# STRATEGY: +# 1. Set l2arc_mfuonly=yes +# 2. Create pool with a cache device. +# 3. Create a random file in that pool, smaller than the cache device +# and random read for 10 sec. +# 4. Export and re-import the pool. This is necessary as some MFU ghost +# buffers with prefetch status may transition to MRU eventually. +# By re-importing the pool the l2 arcstats reflect the ARC state +# of L2ARC buffers upon their caching in L2ARC. +# 5. Verify l2arc_mru_asize is 0. +# + +verify_runnable "global" + +log_assert "l2arc_mfuonly does not cache MRU buffers." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch + log_must set_tunable32 L2ARC_MFUONLY $mfuonly + log_must set_tunable32 PREFETCH_DISABLE $zfsprefetch +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 1 as some prefetched buffers may +# transition to MRU. 
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +log_must set_tunable32 L2ARC_NOPREFETCH 1 + +typeset mfuonly=$(get_tunable L2ARC_MFUONLY) +log_must set_tunable32 L2ARC_MFUONLY 1 + +typeset zfsprefetch=$(get_tunable PREFETCH_DISABLE) +log_must set_tunable32 PREFETCH_DISABLE 1 + +typeset fill_mb=800 +typeset cache_sz=$(( 1.4 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +typeset log_blk_start=$(get_arcstat l2_log_blk_writes) + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL +log_must zpool import -d $VDIR $TESTPOOL + +# Regardless of l2arc_noprefetch, some MFU buffers might be evicted +# from ARC, accessed later on as prefetches and transition to MRU as +# prefetches. +# If accessed again they are counted as MRU and the l2arc_mru_asize arcstat +# will not be 0 (mentioned also in zfs.4) +# For the purposes of this test we mitigate this by disabling (predictive) +# ZFS prefetches with zfs_prefetch_disable=1. +log_must test $(get_arcstat l2_mru_asize) -eq 0 + +log_must zpool destroy -f $TESTPOOL + +log_pass "l2arc_mfuonly does not cache MRU buffers." diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh new file mode 100755 index 0000000000..0a9049490c --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# Persistent L2ARC with an unencrypted ZFS file system succeeds +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Export and re-import pool without writing any data. +# 3. Create a random file in that pool and random read for 10 sec. +# 4. Export pool. +# 5. Read the amount of log blocks written from the header of the +# L2ARC device. +# 6. Import pool. +# 7. Read the amount of log blocks rebuilt in arcstats and compare to +# (5). +# 8. Check if the labels of the L2ARC device are intact. +# +# * We can predict the minimum bytes of L2ARC restored if we subtract +# from the effective size of the cache device the bytes l2arc_evict() +# evicts: +# l2: L2ARC device size - VDEV_LABEL_START_SIZE - l2ad_dev_hdr_asize +# wr_sz: l2arc_write_max + l2arc_write_boost (worst case) +# blk_overhead: wr_sz / SPA_MINBLOCKSIZE / (l2 / SPA_MAXBLOCKSIZE) * +# sizeof (l2arc_log_blk_phys_t) +# min restored size: l2 - (wr_sz + blk_overhead) +# + +verify_runnable "global" + +log_assert "Persistent L2ARC with an unencrypted ZFS file system succeeds." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch + log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE) +log_must set_tunable32 L2ARC_NOPREFETCH 0 +log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must zpool export $TESTPOOL +log_must zpool import -d $VDIR $TESTPOOL + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +arcstat_quiescence_noecho l2_size +log_must zpool export $TESTPOOL +arcstat_quiescence_noecho l2_feeds + +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool import -d $VDIR $TESTPOOL +arcstat_quiescence_noecho l2_size + +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - + $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -gt 0 + +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +log_must zdb -lllq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Persistent L2ARC with an unencrypted ZFS file system succeeds." 
diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_002_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_002_pos.ksh new file mode 100755 index 0000000000..93982e6c60 --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_002_pos.ksh @@ -0,0 +1,115 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# Persistent L2ARC with an encrypted ZFS file system succeeds +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create an encrypted ZFS file system. +# 3. Create a random file in the encrypted file system and random +# read for 10 sec. +# 4. Export pool. +# 5. Read the amount of log blocks written from the header of the +# L2ARC device. +# 6. Import pool. +# 7. Mount the encrypted ZFS file system. +# 8. Read the amount of log blocks rebuilt in arcstats and compare to +# (5). +# 9. Check if the labels of the L2ARC device are intact. 
+# +# * We can predict the minimum bytes of L2ARC restored if we subtract +# from the effective size of the cache device the bytes l2arc_evict() +# evicts: +# l2: L2ARC device size - VDEV_LABEL_START_SIZE - l2ad_dev_hdr_asize +# wr_sz: l2arc_write_max + l2arc_write_boost (worst case) +# blk_overhead: wr_sz / SPA_MINBLOCKSIZE / (l2 / SPA_MAXBLOCKSIZE) * +# sizeof (l2arc_log_blk_phys_t) +# min restored size: l2 - (wr_sz + blk_overhead) +# + +verify_runnable "global" + +log_assert "Persistent L2ARC with an encrypted ZFS file system succeeds." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch + log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE) +log_must set_tunable32 L2ARC_NOPREFETCH 0 +log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase $TESTPOOL/$TESTFS1" + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +arcstat_quiescence_noecho l2_size +log_must zpool export $TESTPOOL +arcstat_quiescence_noecho l2_feeds + +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool import -d $VDIR $TESTPOOL +log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1" +arcstat_quiescence_noecho l2_size + +typeset 
l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -gt 0 + +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Persistent L2ARC with an encrypted ZFS file system succeeds." diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh new file mode 100755 index 0000000000..fe35c8fc45 --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# Persistent L2ARC fails as expected when L2ARC_REBUILD_ENABLED = 0 +# +# STRATEGY: +# 1. Set L2ARC_REBUILD_ENABLED = 0 +# 2. Create pool with a cache device. +# 3. Create a random file in that pool and random read for 10 sec. +# 4. Export pool. +# 5. Import pool. +# 6. Check in zpool iostat if the cache device has space allocated. +# 7. Check that l2_rebuild_success in arcstats did not increase +# across the import. +# + +verify_runnable "global" + +log_assert "Persistent L2ARC fails as expected when L2ARC_REBUILD_ENABLED = 0." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_REBUILD_ENABLED $rebuild_enabled + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +log_must set_tunable32 L2ARC_NOPREFETCH 0 + +# disable L2ARC rebuild +typeset rebuild_enabled=$(get_tunable L2ARC_REBUILD_ENABLED) +log_must set_tunable32 L2ARC_REBUILD_ENABLED 0 + +typeset fill_mb=800 +typeset cache_sz=$(( 2 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL + +typeset l2_success_start=$(get_arcstat l2_rebuild_success) + +log_must zpool import -d $VDIR $TESTPOOL +log_mustnot test "$(zpool iostat -Hpv $TESTPOOL $VDEV_CACHE | awk '{print $2}')" -gt 80000000 + +typeset l2_success_end=$(get_arcstat l2_rebuild_success) + +log_mustnot test $l2_success_end -gt $l2_success_start + +log_must zpool destroy -f $TESTPOOL +log_must set_tunable32 L2ARC_REBUILD_ENABLED $rebuild_enabled + +log_pass "Persistent L2ARC fails as expected when L2ARC_REBUILD_ENABLED = 0." diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh new file mode 100755 index 0000000000..b407031806 --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh @@ -0,0 +1,101 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not +# present. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool and random read for 10 sec. +# 3. Offline the L2ARC device and export pool. +# 4. Read the amount of log blocks written from the header of the +# L2ARC device. +# 5. Import pool and online the L2ARC device. +# 6. Read the amount of log blocks rebuilt in arcstats and compare to +# (4). +# 7. Check if the labels of the L2ARC device are intact. +# + +verify_runnable "global" + +log_assert "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch + log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE) +log_must set_tunable32 L2ARC_NOPREFETCH 0 +log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +arcstat_quiescence_noecho l2_size +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size +log_must zpool export $TESTPOOL +arcstat_quiescence_noecho l2_feeds + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +log_must zpool import -d $VDIR $TESTPOOL +log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -gt 0 + +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present." 
diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh new file mode 100755 index 0000000000..8ad648519f --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool and random read for 10 sec. +# 3. Offline the L2ARC device. +# 4. Read the amount of log blocks written from the header of the +# L2ARC device. +# 5. Online the L2ARC device. +# 6. Read the amount of log blocks rebuilt in arcstats and compare to +# (4). +# 7. Check if the labels of the L2ARC device are intact. +# + +verify_runnable "global" + +log_assert "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch + log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE) +log_must set_tunable32 L2ARC_NOPREFETCH 0 +log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +arcstat_quiescence_noecho l2_size +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) + +# Upon onlining the cache device we might write additional blocks to it +# before it is marked for rebuild as the l2ad_* parameters are not cleared +# when offlining the device. See comment in l2arc_rebuild_vdev(). +# So we cannot compare the amount of rebuilt log blocks to the amount of log +# blocks read from the header of the device. 
+log_must test $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) -gt 0 +log_must test $l2_dh_log_blk -gt 0 + +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present." diff --git a/tests/zfs-tests/tests/functional/l2arc/setup.ksh b/tests/zfs-tests/tests/functional/l2arc/setup.ksh new file mode 100755 index 0000000000..0df61a9d27 --- /dev/null +++ b/tests/zfs-tests/tests/functional/l2arc/setup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +verify_runnable "global" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must mkfile $SIZE $VDEV +log_must mkfile $SIZE $VDEV1 + +log_pass diff --git a/tests/zfs-tests/tests/functional/large_files/large_files_001_pos.ksh b/tests/zfs-tests/tests/functional/large_files/large_files_001_pos.ksh index 3be20356ea..f59603724e 100755 --- a/tests/zfs-tests/tests/functional/large_files/large_files_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/large_files/large_files_001_pos.ksh @@ -38,7 +38,7 @@ # STRATEGY: # 1. largest_file will write to a file and increase its size # to the maximum allowable. -# 2. The last byte of the file should be accessbile without error. +# 2. The last byte of the file should be accessible without error. # 3. 
Writing beyond the maximum file size generates an 'errno' of # EFBIG. # diff --git a/tests/zfs-tests/tests/functional/large_files/large_files_002_pos.ksh b/tests/zfs-tests/tests/functional/large_files/large_files_002_pos.ksh index f4d4e5afbb..255a8f8b5a 100755 --- a/tests/zfs-tests/tests/functional/large_files/large_files_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/large_files/large_files_002_pos.ksh @@ -49,7 +49,11 @@ log_must rm $TESTDIR/ulimit_write_file $TESTDIR/ulimit_trunc_file # Verify 'ulimit -f ' works log_must ulimit -f 1024 log_mustnot sh -c 'dd if=/dev/zero of=$TESTDIR/ulimit_write_file bs=1M count=2' -log_mustnot sh -c 'truncate -s2M $TESTDIR/ulimit_trunc_file' -log_must rm $TESTDIR/ulimit_write_file $TESTDIR/ulimit_trunc_file +log_must rm $TESTDIR/ulimit_write_file +# FreeBSD allows the sparse file because space has not been allocated. +if ! is_freebsd; then + log_mustnot sh -c 'truncate -s2M $TESTDIR/ulimit_trunc_file' + log_must rm $TESTDIR/ulimit_trunc_file +fi log_pass "Successfully enforced 'ulimit -f' maximum file size" diff --git a/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh b/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh index 1bc8f72d6a..6b51598d7c 100755 --- a/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh @@ -91,13 +91,13 @@ function cleanup if ismounted $TESTPOOL/$TESTFS ; then log_must zfs unmount $TESTPOOL/$TESTFS fi - log_must zfs destroy $TESTPOOL/$TESTFS + destroy_dataset $TESTPOOL/$TESTFS fi destroy_pool $TESTPOOL datasetexists $TESTPOOL2/$TESTVOL && \ - log_must zfs destroy $TESTPOOL2/$TESTVOL + destroy_dataset $TESTPOOL2/$TESTVOL destroy_pool $TESTPOOL2 @@ -154,8 +154,8 @@ for volsize in $VOLSIZES; do log_note "Destroy zfs, volume & zpool" log_must zfs destroy $TESTPOOL/$TESTFS destroy_pool $TESTPOOL - log_must zfs destroy $TESTPOOL2/$TESTVOL + log_must_busy zfs destroy 
$TESTPOOL2/$TESTVOL destroy_pool $TESTPOOL2 done -log_pass "Dateset can be created, mounted & destroy in largest pool succeeded." +log_pass "Dataset can be created, mounted & destroy in largest pool succeeded." diff --git a/tests/zfs-tests/tests/functional/libzfs/Makefile.am b/tests/zfs-tests/tests/functional/libzfs/Makefile.am index ae9be5097a..53cb635444 100644 --- a/tests/zfs-tests/tests/functional/libzfs/Makefile.am +++ b/tests/zfs-tests/tests/functional/libzfs/Makefile.am @@ -9,12 +9,8 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ libzfs_input.ksh -DEFAULT_INCLUDES += \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/lib/libspl/include - many_fds_LDADD = \ - $(top_builddir)/lib/libzfs/libzfs.la + $(abs_top_builddir)/lib/libzfs/libzfs.la pkgexec_PROGRAMS = many_fds many_fds_SOURCES = many_fds.c diff --git a/tests/zfs-tests/tests/functional/limits/cleanup.ksh b/tests/zfs-tests/tests/functional/limits/cleanup.ksh index e78deacd5b..2a84ab4438 100755 --- a/tests/zfs-tests/tests/functional/limits/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/limits/cleanup.ksh @@ -15,5 +15,7 @@ # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib +cleanup_user_group default_cleanup diff --git a/tests/zfs-tests/tests/functional/limits/filesystem_limit.ksh b/tests/zfs-tests/tests/functional/limits/filesystem_limit.ksh index a659792541..fbfc141be3 100755 --- a/tests/zfs-tests/tests/functional/limits/filesystem_limit.ksh +++ b/tests/zfs-tests/tests/functional/limits/filesystem_limit.ksh @@ -15,10 +15,12 @@ # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib # # DESCRIPTION: # ZFS 'filesystem_limit' is enforced when executing various actions +# NOTE: the limit should *not* be enforced if the user is allowed to change it. # # STRATEGY: # 1. 
Verify 'zfs create' and 'zfs clone' cannot exceed the filesystem_limit @@ -28,14 +30,47 @@ verify_runnable "both" +# +# The has_capability() function was first exported in the 4.10 Linux kernel +# then backported to some LTS kernels. Prior to this change there was no +# mechanism to perform the needed permission check. Therefore, this test +# is expected to fail on older kernels and is skipped. +# +if is_linux; then + if [[ $(linux_version) -lt $(linux_version "4.10") ]]; then + log_unsupported "Requires has_capability() kernel function" + fi +fi + function setup { - log_must zfs create "$DATASET_TEST" - log_must zfs create "$DATASET_UTIL" + # We can't delegate 'mount' privs under Linux: to avoid issues with + # commands that may need to (re)mount datasets we set mountpoint=none + if is_linux; then + log_must zfs create -o mountpoint=none "$DATASET_TEST" + log_must zfs create -o mountpoint=none "$DATASET_UTIL" + else + log_must zfs create "$DATASET_TEST" + log_must zfs create "$DATASET_UTIL" + fi + if is_freebsd; then + # Ensure our non-root user has the permission to create the + # mountpoints and mount the filesystems. + sysctl vfs.usermount=1 + log_must chmod 777 $(get_prop mountpoint "$DATASET_TEST") + log_must chmod 777 $(get_prop mountpoint "$DATASET_UTIL") + fi + log_must zfs allow -d -l $STAFF1 'create,mount,rename,clone,receive' \ + "$DATASET_TEST" + log_must zfs allow -d -l $STAFF1 'create,mount,rename,clone,receive' \ + "$DATASET_UTIL" } function cleanup { + if is_freebsd; then + sysctl vfs.usermount=0 + fi destroy_dataset "$DATASET_TEST" "-Rf" destroy_dataset "$DATASET_UTIL" "-Rf" rm -f $ZSTREAM @@ -50,25 +85,39 @@ ZSTREAM="$TEST_BASE_DIR/filesystem_limit.$$" # 1. 
Verify 'zfs create' and 'zfs clone' cannot exceed the filesystem_limit setup +# NOTE: we allow 'canmount' to the non-root user so we can use 'log_must' with +# 'user_run zfs create -o canmount=off' successfully +log_must zfs allow -d -l $STAFF1 'canmount' "$DATASET_TEST" log_must zfs set filesystem_limit=1 "$DATASET_TEST" -log_must zfs create "$DATASET_TEST/create" -log_mustnot zfs create "$DATASET_TEST/create_exceed" +log_must user_run $STAFF1 zfs create -o canmount=off "$DATASET_TEST/create" +log_mustnot user_run $STAFF1 zfs create -o canmount=off "$DATASET_TEST/create_exceed" log_mustnot datasetexists "$DATASET_TEST/create_exceed" log_must zfs set filesystem_limit=2 "$DATASET_TEST" log_must zfs snapshot "$DATASET_TEST/create@snap" -log_must zfs clone "$DATASET_TEST/create@snap" "$DATASET_TEST/clone" -log_mustnot zfs clone "$DATASET_TEST/create@snap" "$DATASET_TEST/clone_exceed" +log_must user_run $STAFF1 zfs clone -o canmount=off "$DATASET_TEST/create@snap" "$DATASET_TEST/clone" +log_mustnot user_run $STAFF1 zfs clone -o canmount=off "$DATASET_TEST/create@snap" "$DATASET_TEST/clone_exceed" log_mustnot datasetexists "$DATASET_TEST/clone_exceed" log_must test "$(get_prop 'filesystem_count' "$DATASET_TEST")" == "2" +# Verify filesystem_limit is *not* enforced for users allowed to change it +log_must zfs create "$DATASET_TEST/create_notenforced_root" +log_must zfs allow -l $STAFF1 'filesystem_limit' "$DATASET_TEST" +log_must user_run $STAFF1 zfs create -o canmount=off "$DATASET_TEST/create_notenforced_user" +log_must test "$(get_prop 'filesystem_count' "$DATASET_TEST")" == "4" cleanup # 2. 
Verify 'zfs rename' cannot move filesystems exceeding the limit setup log_must zfs set filesystem_limit=0 "$DATASET_UTIL" log_must zfs create "$DATASET_TEST/rename" -log_mustnot zfs rename "$DATASET_TEST/rename" "$DATASET_UTIL/renamed" +log_mustnot user_run $STAFF1 zfs rename "$DATASET_TEST/rename" "$DATASET_UTIL/renamed" log_mustnot datasetexists "$DATASET_UTIL/renamed" log_must test "$(get_prop 'filesystem_count' "$DATASET_UTIL")" == "0" +# Verify filesystem_limit is *not* enforced for users allowed to change it +log_must zfs rename "$DATASET_TEST/rename" "$DATASET_UTIL/renamed_notenforced_root" +log_must zfs rename "$DATASET_UTIL/renamed_notenforced_root" "$DATASET_TEST/rename" +log_must zfs allow -l $STAFF1 'filesystem_limit' "$DATASET_UTIL" +log_must user_run $STAFF1 zfs rename "$DATASET_TEST/rename" "$DATASET_UTIL/renamed_notenforced_user" +log_must datasetexists "$DATASET_UTIL/renamed_notenforced_user" cleanup # 3. Verify 'zfs receive' cannot exceed the limit @@ -77,8 +126,14 @@ log_must zfs set filesystem_limit=0 "$DATASET_TEST" log_must zfs create "$DATASET_UTIL/send" log_must zfs snapshot "$DATASET_UTIL/send@snap1" log_must eval "zfs send $DATASET_UTIL/send@snap1 > $ZSTREAM" -log_mustnot eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_mustnot user_run $STAFF1 eval "zfs receive $DATASET_TEST/received < $ZSTREAM" log_mustnot datasetexists "$DATASET_TEST/received" log_must test "$(get_prop 'filesystem_count' "$DATASET_TEST")" == "0" +# Verify filesystem_limit is *not* enforced for users allowed to change it +log_must eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_must zfs destroy -r "$DATASET_TEST/received" +log_must zfs allow -l $STAFF1 'filesystem_limit' "$DATASET_TEST" +log_must user_run $STAFF1 eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_must datasetexists "$DATASET_TEST/received" log_pass "'filesystem_limit' property is enforced" diff --git a/tests/zfs-tests/tests/functional/limits/setup.ksh 
b/tests/zfs-tests/tests/functional/limits/setup.ksh index af6edbe2bd..94f3e7b4d4 100755 --- a/tests/zfs-tests/tests/functional/limits/setup.ksh +++ b/tests/zfs-tests/tests/functional/limits/setup.ksh @@ -15,7 +15,14 @@ # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib DISK=${DISKS%% *} +cleanup_user_group + +# Create staff group and user +log_must add_group $STAFF_GROUP +log_must add_user $STAFF_GROUP $STAFF1 + default_volume_setup $DISK diff --git a/tests/zfs-tests/tests/functional/limits/snapshot_limit.ksh b/tests/zfs-tests/tests/functional/limits/snapshot_limit.ksh index fa4b6e8f23..62f14466e6 100755 --- a/tests/zfs-tests/tests/functional/limits/snapshot_limit.ksh +++ b/tests/zfs-tests/tests/functional/limits/snapshot_limit.ksh @@ -15,10 +15,12 @@ # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib # # DESCRIPTION: # ZFS 'snapshot_limit' is enforced when executing various actions +# NOTE: the limit should *not* be enforced if the user is allowed to change it. # # STRATEGY: # 1. Verify 'zfs snapshot' cannot exceed the snapshot_limit @@ -29,14 +31,47 @@ verify_runnable "both" +# +# The has_capability() function was first exported in the 4.10 Linux kernel +# then backported to some LTS kernels. Prior to this change there was no +# mechanism to perform the needed permission check. Therefore, this test +# is expected to fail on older kernels and is skipped. 
+# +if is_linux; then + if [[ $(linux_version) -lt $(linux_version "4.10") ]]; then + log_unsupported "Requires has_capability() kernel function" + fi +fi + function setup { - log_must zfs create "$DATASET_TEST" - log_must zfs create "$DATASET_UTIL" + # We can't delegate 'mount' privs under Linux: to avoid issues with + # commands that may need to (re)mount datasets we set mountpoint=none + if is_linux; then + log_must zfs create -o mountpoint=none "$DATASET_TEST" + log_must zfs create -o mountpoint=none "$DATASET_UTIL" + else + log_must zfs create "$DATASET_TEST" + log_must zfs create "$DATASET_UTIL" + fi + if is_freebsd; then + # Ensure our non-root user has the permission to create the + # mountpoints and mount the filesystems. + sysctl vfs.usermount=1 + log_must chmod 777 $(get_prop mountpoint "$DATASET_TEST") + log_must chmod 777 $(get_prop mountpoint "$DATASET_UTIL") + fi + log_must zfs allow -d -l $STAFF1 \ + 'create,snapshot,rename,mount,promote,receive' "$DATASET_TEST" + log_must zfs allow -d -l $STAFF1 \ + 'create,snapshot,rename,mount,promote,receive' "$DATASET_UTIL" } function cleanup { + if is_freebsd; then + sysctl vfs.usermount=0 + fi destroy_dataset "$DATASET_TEST" "-Rf" destroy_dataset "$DATASET_UTIL" "-Rf" rm -f $ZSTREAM @@ -52,10 +87,15 @@ ZSTREAM="$TEST_BASE_DIR/snapshot_limit.$$" # 1. 
Verify 'zfs snapshot' cannot exceed the snapshot_limit setup log_must zfs set snapshot_limit=1 "$DATASET_TEST" -log_must zfs snapshot "$DATASET_TEST@snap" -log_mustnot zfs snapshot "$DATASET_TEST@snap_exceed" +log_must user_run $STAFF1 zfs snapshot "$DATASET_TEST@snap" +log_mustnot user_run $STAFF1 zfs snapshot "$DATASET_TEST@snap_exceed" log_mustnot datasetexists "$DATASET_TEST@snap_exceed" log_must test "$(get_prop 'snapshot_count' "$DATASET_TEST")" == "1" +# Verify snapshot_limit is *not* enforced for users allowed to change it +log_must zfs snapshot "$DATASET_TEST@snap_notenforced_root" +log_must zfs allow -l $STAFF1 'snapshot_limit' "$DATASET_TEST" +log_must user_run $STAFF1 zfs snapshot "$DATASET_TEST@snap_notenforced_user" +log_must test "$(get_prop 'snapshot_count' "$DATASET_TEST")" == "3" cleanup # 2. Verify 'zfs rename' cannot move snapshots exceeding the limit @@ -63,9 +103,19 @@ setup log_must zfs set snapshot_limit=0 "$DATASET_UTIL" log_must zfs create "$DATASET_TEST/rename" log_must zfs snapshot "$DATASET_TEST/rename@snap" -log_mustnot zfs rename "$DATASET_TEST/rename" "$DATASET_UTIL/renamed" +log_mustnot user_run $STAFF1 \ + zfs rename "$DATASET_TEST/rename" "$DATASET_UTIL/renamed" log_mustnot datasetexists "$DATASET_UTIL/renamed" log_must test "$(get_prop 'snapshot_count' "$DATASET_UTIL")" == "0" +# Verify snapshot_limit is *not* enforced for users allowed to change it +log_must zfs rename "$DATASET_TEST/rename" \ + "$DATASET_UTIL/renamed_notenforced_root" +log_must zfs create "$DATASET_TEST/rename" +log_must zfs snapshot "$DATASET_TEST/rename@snap" +log_must zfs allow -l $STAFF1 'snapshot_limit' "$DATASET_UTIL" +log_must user_run $STAFF1 \ + zfs rename "$DATASET_TEST/rename" "$DATASET_UTIL/renamed_notenforced_user" +log_must test "$(get_prop 'snapshot_count' "$DATASET_UTIL")" == "2" cleanup # 3. 
Verify 'zfs promote' cannot exceed the limit @@ -74,9 +124,15 @@ log_must zfs set snapshot_limit=0 "$DATASET_UTIL" log_must zfs create "$DATASET_TEST/promote" log_must zfs snapshot "$DATASET_TEST/promote@snap" log_must zfs clone "$DATASET_TEST/promote@snap" "$DATASET_UTIL/promoted" -log_mustnot zfs promote "$DATASET_UTIL/promoted" +log_mustnot user_run $STAFF1 zfs promote "$DATASET_UTIL/promoted" log_mustnot datasetexists "$DATASET_UTIL/promoted@snap" log_must test "$(get_prop 'snapshot_count' "$DATASET_UTIL")" == "0" +# Verify snapshot_limit is *not* enforced for users allowed to change it +log_must zfs promote "$DATASET_UTIL/promoted" +log_must zfs promote "$DATASET_TEST/promote" +log_must zfs allow -l $STAFF1 'snapshot_limit' "$DATASET_UTIL" +log_must user_run $STAFF1 zfs promote "$DATASET_UTIL/promoted" +log_must test "$(get_prop 'snapshot_count' "$DATASET_UTIL")" == "1" cleanup # 4. Verify 'zfs receive' cannot exceed the limit @@ -85,15 +141,26 @@ log_must zfs set snapshot_limit=0 "$DATASET_TEST" log_must zfs create "$DATASET_UTIL/send" log_must zfs snapshot "$DATASET_UTIL/send@snap1" log_must eval "zfs send $DATASET_UTIL/send@snap1 > $ZSTREAM" -log_mustnot eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_mustnot user_run $STAFF1 \ + eval "zfs receive $DATASET_TEST/received < $ZSTREAM" log_mustnot datasetexists "$DATASET_TEST/received" log_must test "$(get_prop 'snapshot_count' "$DATASET_TEST")" == "0" log_must zfs set snapshot_limit=1 "$DATASET_TEST" -log_must eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_must user_run $STAFF1 \ + eval "zfs receive $DATASET_TEST/received < $ZSTREAM" log_must zfs snapshot "$DATASET_UTIL/send@snap2" log_must eval "zfs send -i @snap1 $DATASET_UTIL/send@snap2 > $ZSTREAM" -log_mustnot eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_mustnot user_run $STAFF1 \ + eval "zfs receive $DATASET_TEST/received < $ZSTREAM" log_mustnot datasetexists "$DATASET_TEST/received@snap2" log_must test "$(get_prop 
'snapshot_count' "$DATASET_TEST")" == "1" +# Verify snapshot_limit is *not* enforced for users allowed to change it +log_must eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_must zfs snapshot "$DATASET_UTIL/send@snap3" +log_must eval "zfs send -i @snap2 $DATASET_UTIL/send@snap3 > $ZSTREAM" +log_must zfs allow -l $STAFF1 'snapshot_limit' "$DATASET_TEST" +log_must user_run $STAFF1 \ + eval "zfs receive $DATASET_TEST/received < $ZSTREAM" +log_must test "$(get_prop 'snapshot_count' "$DATASET_TEST")" == "3" log_pass "'snapshot_limit' property is enforced" diff --git a/tests/zfs-tests/tests/functional/link_count/Makefile.am b/tests/zfs-tests/tests/functional/link_count/Makefile.am index 669f3c142c..bfb7154a65 100644 --- a/tests/zfs-tests/tests/functional/link_count/Makefile.am +++ b/tests/zfs-tests/tests/functional/link_count/Makefile.am @@ -2,4 +2,5 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/link_count dist_pkgdata_SCRIPTS = \ cleanup.ksh \ setup.ksh \ - link_count_001.ksh + link_count_001.ksh \ + link_count_root_inode.ksh diff --git a/tests/zfs-tests/tests/functional/link_count/link_count_001.ksh b/tests/zfs-tests/tests/functional/link_count/link_count_001.ksh index e121787cab..3ab3fbef8d 100755 --- a/tests/zfs-tests/tests/functional/link_count/link_count_001.ksh +++ b/tests/zfs-tests/tests/functional/link_count/link_count_001.ksh @@ -49,6 +49,10 @@ log_assert "Verify file link count is zero on zfs" export ITERS=10 export NUMFILES=10000 +if is_freebsd; then + log_unsupported "Not applicable on FreeBSD" +fi + # Detect and make sure this test must be executed on a multi-process system if ! is_mp; then log_unsupported "This test requires a multi-processor system." 
diff --git a/tests/zfs-tests/tests/functional/link_count/link_count_root_inode.ksh b/tests/zfs-tests/tests/functional/link_count/link_count_root_inode.ksh
new file mode 100755
index 0000000000..d2bf30ac37
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/link_count/link_count_root_inode.ksh
@@ -0,0 +1,111 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify root inode (directory) has correct link count.
+#
+# STRATEGY:
+# 1. Create pool and fs.
+# 2. Test link count of root inode.
+# 3. Create directories and test link count of root inode.
+# 4. Delete directories and test link count of root inode.
+# 5. Create regular file and test link count of root inode.
+# 6. Delete regular file and test link count of root inode.
+#
+
+#
+# Assert that the link count 'ls -ld' reports for directory $1 equals $2.
+#
+function assert_link_count
+{
+	typeset dirpath="$1"
+	typeset value="$2"
+
+	log_must test "$(ls -ld "$dirpath" | awk '{ print $2 }')" == "$value"
+}
+
+#
+# Check the root directory link count of $TESTPOOL and $TESTPOOL/$TESTFS
+# with the snapdir property set to $1. $2 is the count an empty root
+# directory is expected to report in that mode. A dataset whose root
+# directory is not empty is skipped, since the expected counts would
+# not hold.
+#
+function verify_root_link_count
+{
+	typeset snapdir="$1"
+	typeset -i base="$2"
+	typeset dst mtpt
+
+	for dst in "$TESTPOOL" "$TESTPOOL/$TESTFS"; do
+		mtpt=$(get_prop mountpoint "$dst")
+		log_must zfs set snapdir="$snapdir" "$dst"
+		log_must test -d "$mtpt/.zfs"
+		if [[ -n "$(ls "$mtpt")" ]]; then
+			ls "$mtpt"
+			log_note "$mtpt not empty, skipping"
+			continue
+		fi
+		assert_link_count "$mtpt" $base
+
+		# A subdirectory must raise the count by exactly one ...
+		log_must mkdir "$mtpt/a"
+		assert_link_count "$mtpt" $((base + 1))
+		log_must rmdir "$mtpt/a"
+		assert_link_count "$mtpt" $base
+
+		# ... and a nested one must not raise it any further.
+		log_must mkdir -p "$mtpt/a/b"
+		assert_link_count "$mtpt" $((base + 1))
+		log_must rmdir "$mtpt/a/b"
+		log_must rmdir "$mtpt/a"
+		assert_link_count "$mtpt" $base
+
+		# A regular file must leave the count unchanged.
+		log_must touch "$mtpt/a"
+		assert_link_count "$mtpt" $base
+		log_must rm "$mtpt/a"
+		assert_link_count "$mtpt" $base
+	done
+}
+
+verify_runnable "both"
+
+log_note "Verify root inode (directory) has correct link count."
+
+# Delete a directory from link_count_001.ksh.
+if [[ -d "${TESTDIR}" && -d "${TESTDIR}/tmp" ]]; then
+	log_must rm -rf "${TESTDIR}/tmp"
+fi
+
+#
+# Test with hidden '.zfs' directory.
+# This also tests general directories.
+#
+log_note "Testing with snapdir set to hidden (default)"
+verify_root_link_count hidden 2
+
+#
+# Test with visible '.zfs' directory; the root directory is expected
+# to report one more link than in the hidden case.
+#
+log_note "Testing with snapdir set to visible"
+verify_root_link_count visible 3
+
+log_pass "Verify root inode (directory) has correct link count passed"
diff --git a/tests/zfs-tests/tests/functional/log_spacemap/Makefile.am b/tests/zfs-tests/tests/functional/log_spacemap/Makefile.am
new file mode 100644
index 0000000000..a1e523426c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/log_spacemap/Makefile.am
@@ -0,0 +1,2 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/log_spacemap
+dist_pkgdata_SCRIPTS = log_spacemap_import_logs.ksh
diff --git
a/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh b/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh
new file mode 100755
index 0000000000..fca0e8e4a1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh
@@ -0,0 +1,89 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Log spacemaps are generally destroyed at export in order to
+# not induce performance overheads at import time. As a result,
+# the log spacemap codepaths that read the logs in import times
+# are not tested outside of ztest and pools with DEBUG bits doing
+# many imports/exports while running the test suite.
+#
+# This test uses an internal tunable and forces ZFS to keep the
+# log spacemaps at export, and then re-imports the pool, thus
+# providing explicit testing of those codepaths. It also uses
+# another tunable to load all the metaslabs when the pool is
+# re-imported so more assertions and verifications will be hit.
+#
+# STRATEGY:
+# 1. Create pool.
+# 2. Do a couple of writes to generate some data for spacemap logs.
+# 3. Set tunable to keep logs after export.
+# 4. Export pool and verify that there are logs with zdb.
+# 5. Set tunable to load all metaslabs at import.
+# 6. Import pool.
+# 7. Reset tunables.
+#
+
+verify_runnable "global"
+
+LOGSM_POOL="logsm_import"
+TESTDISK="$(echo $DISKS | cut -d' ' -f1)"
+
+#
+# Set both tunables used by this test back to 0 and destroy the test
+# pool if it still exists.
+#
+function cleanup
+{
+	log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
+	log_must set_tunable64 METASLAB_DEBUG_LOAD 0
+	if poolexists "$LOGSM_POOL"; then
+		log_must zpool destroy -f "$LOGSM_POOL"
+	fi
+}
+log_onexit cleanup
+
+log_must zpool create -o cachefile=none -f "$LOGSM_POOL" "$TESTDISK"
+log_must zfs create "$LOGSM_POOL/fs"
+
+# Write the same file twice, syncing after each write.
+log_must dd if=/dev/urandom of=/$LOGSM_POOL/fs/00 bs=128k count=10
+log_must sync
+log_must dd if=/dev/urandom of=/$LOGSM_POOL/fs/00 bs=128k count=10
+log_must sync
+
+# Keep the log spacemaps on disk when the pool is exported.
+log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
+log_must zpool export "$LOGSM_POOL"
+
+# Count the log spacemap objects zdb reports for the exported pool.
+LOGSM_COUNT=$(zdb -m -e "$LOGSM_POOL" | grep -c "Log Spacemap object")
+if (( LOGSM_COUNT == 0 )); then
+	log_fail "Pool does not have any log spacemaps after being exported"
+fi
+
+# Load all metaslabs at import so more verifications are hit.
+log_must set_tunable64 METASLAB_DEBUG_LOAD 1
+log_must zpool import "$LOGSM_POOL"
+
+log_pass "Log spacemaps imported with no errors"
diff --git a/tests/zfs-tests/tests/functional/migration/migration.cfg b/tests/zfs-tests/tests/functional/migration/migration.cfg index 7d86436709..12a5a7799b 100644 --- a/tests/zfs-tests/tests/functional/migration/migration.cfg +++ b/tests/zfs-tests/tests/functional/migration/migration.cfg @@ -60,7 +60,6 @@ case "${#disk_array[*]}" in log_fail "$ZFS_DISK not supported for partitioning." fi else - export DEV_DSKDIR="/dev" ZFSSIDE_DISK=${SINGLE_DISK}s0 NONZFSSIDE_DISK=${SINGLE_DISK}s1 fi @@ -93,7 +92,6 @@ case "${#disk_array[*]}" in log_fail "$NONZFS_DISK not supported for partitioning." 
fi else - export DEV_DSKDIR="/dev" ZFSSIDE_DISK=${ZFS_DISK}s0 NONZFSSIDE_DISK=${NONZFS_DISK}s0 fi diff --git a/tests/zfs-tests/tests/functional/migration/setup.ksh b/tests/zfs-tests/tests/functional/migration/setup.ksh index cae66aa5b1..58edc0a929 100755 --- a/tests/zfs-tests/tests/functional/migration/setup.ksh +++ b/tests/zfs-tests/tests/functional/migration/setup.ksh @@ -57,9 +57,9 @@ log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS rm -rf $NONZFS_TESTDIR || log_unresolved Could not remove $NONZFS_TESTDIR mkdir -p $NONZFS_TESTDIR || log_unresolved Could not create $NONZFS_TESTDIR -echo "y" | newfs -v ${DEV_DSKDIR}/$NONZFS_DISK +new_fs ${DEV_DSKDIR}/$NONZFS_DISK (( $? != 0 )) && - log_untested "Unable to setup a UFS file system" + log_untested "Unable to setup a $NEWFS_DEFAULT_FS file system" log_must mount ${DEV_DSKDIR}/$NONZFS_DISK $NONZFS_TESTDIR diff --git a/tests/zfs-tests/tests/functional/mmap/Makefile.am b/tests/zfs-tests/tests/functional/mmap/Makefile.am index 2adc398b8c..b26791ee7c 100644 --- a/tests/zfs-tests/tests/functional/mmap/Makefile.am +++ b/tests/zfs-tests/tests/functional/mmap/Makefile.am @@ -4,7 +4,8 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ mmap_read_001_pos.ksh \ mmap_write_001_pos.ksh \ - mmap_libaio_001_pos.ksh + mmap_libaio_001_pos.ksh \ + mmap_seek_001_pos.ksh dist_pkgdata_DATA = \ mmap.cfg diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_read_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_read_001_pos.ksh index 42e1f73202..470f10d937 100755 --- a/tests/zfs-tests/tests/functional/mmap/mmap_read_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/mmap/mmap_read_001_pos.ksh @@ -40,7 +40,7 @@ # 1. Create a pool & dataset # 2. Call readmmap binary # 3. unmount this file system -# 4. Verify the integrity of this pool & dateset +# 4. 
Verify the integrity of this pool & dataset # verify_runnable "global"
diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_seek_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_seek_001_pos.ksh
new file mode 100755
index 0000000000..6188549ad8
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmap/mmap_seek_001_pos.ksh
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmap/mmap.cfg
+
+#
+# DESCRIPTION:
+# lseek() data/holes for an mmap()'d file.
+#
+# STRATEGY:
+# 1. Enable compression and hole reporting for dirty files.
+# 2. Call mmap_seek binary test case for various record sizes.
+#
+
+verify_runnable "global"
+
+#
+# Put compression and recordsize on the test filesystem back to off and
+# 128k, remove the test file, and restore the tunable value saved below.
+#
+function cleanup
+{
+	log_must zfs set compression=off $TESTPOOL/$TESTFS
+	log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
+	log_must rm -f $TESTDIR/test-mmap-file
+	log_must set_tunable64 DMU_OFFSET_NEXT_SYNC $saved_next_sync
+}
+
+log_assert "lseek() data/holes for an mmap()'d file."
+
+log_onexit cleanup
+
+# Remember the current setting, then enable hole reporting for dirty files.
+typeset saved_next_sync=$(get_tunable DMU_OFFSET_NEXT_SYNC)
+log_must set_tunable64 DMU_OFFSET_NEXT_SYNC 1
+
+# Zero'd blocks are only converted to holes when compression is enabled;
+# the mmap_seek test checks for this behavior.
+log_must zfs set compression=on $TESTPOOL/$TESTFS
+
+# Run the check for every power-of-two record size from 4096 to 131072.
+typeset -i recsize=4096
+while (( recsize <= 131072 )); do
+	log_must zfs set recordsize=$recsize $TESTPOOL/$TESTFS
+	log_must mmap_seek $TESTDIR/test-mmap-file $((1024 * 1024)) $recsize
+	log_must rm $TESTDIR/test-mmap-file
+	(( recsize *= 2 ))
+done
+
+log_pass "lseek() data/holes for an mmap()'d file succeeded."
diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh index 24150b827f..2f4257993d 100755 --- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh @@ -33,7 +33,7 @@ # # DESCRIPTION: -# Writing to a file and mmaping that file at the +# Writing to a file and mmapping that file at the # same time does not result in a deadlock. 
# # STRATEGY: diff --git a/tests/zfs-tests/tests/functional/mmp/Makefile.am b/tests/zfs-tests/tests/functional/mmp/Makefile.am index e39a0a5aac..2848fd4ce6 100644 --- a/tests/zfs-tests/tests/functional/mmp/Makefile.am +++ b/tests/zfs-tests/tests/functional/mmp/Makefile.am @@ -12,6 +12,7 @@ dist_pkgdata_SCRIPTS = \ mmp_reset_interval.ksh \ mmp_on_zdb.ksh \ mmp_write_distribution.ksh \ + mmp_hostid.ksh \ setup.ksh \ cleanup.ksh diff --git a/tests/zfs-tests/tests/functional/mmp/cleanup.ksh b/tests/zfs-tests/tests/functional/mmp/cleanup.ksh index 8146f773a2..b41d6ccbeb 100755 --- a/tests/zfs-tests/tests/functional/mmp/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/mmp/cleanup.ksh @@ -23,6 +23,6 @@ verify_runnable "global" -log_must set_tunable64 zfs_multihost_history $MMP_HISTORY_OFF +log_must set_tunable64 MULTIHOST_HISTORY $MMP_HISTORY_OFF log_pass "mmp cleanup passed" diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib index fda57c002c..661cbf3a52 100644 --- a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib +++ b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib @@ -173,8 +173,8 @@ function seconds_mmp_waits_for_activity typeset seconds=0 typeset devices=${#DISK[@]} - typeset import_intervals=$(get_tunable zfs_multihost_import_intervals) - typeset import_interval=$(get_tunable zfs_multihost_interval) + typeset import_intervals=$(get_tunable MULTIHOST_IMPORT_INTERVALS) + typeset import_interval=$(get_tunable MULTIHOST_INTERVAL) typeset tmpfile=$(mktemp) typeset mmp_fail typeset mmp_write @@ -241,8 +241,8 @@ function import_activity_check # pool opts act_test_duration function clear_mmp_history { - log_must set_tunable64 zfs_multihost_history $MMP_HISTORY_OFF - log_must set_tunable64 zfs_multihost_history $MMP_HISTORY + log_must set_tunable64 MULTIHOST_HISTORY $MMP_HISTORY_OFF + log_must set_tunable64 MULTIHOST_HISTORY $MMP_HISTORY } function count_skipped_mmp_writes # pool duration diff --git 
a/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh
new file mode 100755
index 0000000000..e3c6e34f4b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh
@@ -0,0 +1,99 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
+#
+
+# DESCRIPTION:
+# Verify the hostid file can reside on a ZFS dataset.
+#
+# STRATEGY:
+# 1. Create a non-redundant pool
+# 2. Create an 'etc' dataset containing a valid hostid file
+# 3. Create a file so the pool will have some contents
+# 4. Verify multihost cannot be enabled until the /etc/hostid is linked
+# 5. Verify vdevs may be attached and detached
+# 6. Verify normal, cache, log and special vdevs can be added
+# 7. Verify normal, cache, and log vdevs can be removed
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmp/mmp.cfg
+. $STF_SUITE/tests/functional/mmp/mmp.kshlib
+
+verify_runnable "both"
+
+#
+# Run the default pool cleanup, remove the file vdevs and their
+# directory, and clear the hostid along with any leftover symlink.
+#
+function cleanup
+{
+	default_cleanup_noexit
+	# -f: the vdev files do not exist if the test failed before truncate.
+	log_must rm -f $MMP_DIR/file.{0,1,2,3,4,5}
+	log_must rmdir $MMP_DIR
+	log_must mmp_clear_hostid
+	if [[ -L $HOSTID_FILE ]]; then
+		rm -f $HOSTID_FILE
+	fi
+}
+
+log_assert "Verify hostid file can reside on a ZFS dataset"
+log_onexit cleanup
+
+log_must mkdir -p $MMP_DIR
+log_must truncate -s $MINVDEVSIZE $MMP_DIR/file.{0,1,2,3,4,5}
+
+# 1. Create a non-redundant pool
+log_must zpool create $MMP_POOL $MMP_DIR/file.0
+
+# 2. Create an 'etc' dataset containing a valid hostid file; caching is
+# disabled on the dataset to force the hostid to be read from disk.
+log_must zfs create -o primarycache=none -o secondarycache=none $MMP_POOL/etc
+mntpnt_etc=$(get_prop mountpoint $MMP_POOL/etc)
+log_must mmp_set_hostid $HOSTID1
+log_must mv $HOSTID_FILE $mntpnt_etc/hostid
+
+# 3. Create a file so the pool will have some contents
+log_must zfs create $MMP_POOL/fs
+mntpnt_fs=$(get_prop mountpoint $MMP_POOL/fs)
+log_must mkfile 1M $mntpnt_fs/file
+
+# 4. Verify multihost cannot be enabled until the /etc/hostid is linked
+log_mustnot zpool set multihost=on $MMP_POOL
+log_mustnot ls -l $HOSTID_FILE
+log_must ln -s $mntpnt_etc/hostid $HOSTID_FILE
+log_must zpool set multihost=on $MMP_POOL
+
+# 5. Verify vdevs may be attached and detached
+log_must zpool attach $MMP_POOL $MMP_DIR/file.0 $MMP_DIR/file.1
+log_must zpool detach $MMP_POOL $MMP_DIR/file.1
+
+# 6. Verify normal, cache, log and special vdevs can be added
+log_must zpool add $MMP_POOL $MMP_DIR/file.1
+log_must zpool add $MMP_POOL $MMP_DIR/file.2
+log_must zpool add $MMP_POOL cache $MMP_DIR/file.3
+log_must zpool add $MMP_POOL log $MMP_DIR/file.4
+log_must zpool add $MMP_POOL special $MMP_DIR/file.5
+
+# 7. Verify normal, cache, and log vdevs can be removed
+log_must zpool remove $MMP_POOL $MMP_DIR/file.2
+log_must zpool remove $MMP_POOL $MMP_DIR/file.3
+log_must zpool remove $MMP_POOL $MMP_DIR/file.4
+
+log_pass "Verify hostid file can reside on a ZFS dataset."
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh index 64ed9bf974..6e7bb63754 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh @@ -43,7 +43,7 @@ function cleanup { default_cleanup_noexit log_must mmp_clear_hostid - log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT + log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT } log_assert "multihost=on|off inactive pool activity checks" @@ -103,7 +103,7 @@ log_mustnot import_no_activity_check $TESTPOOL "-f" # 9. Verify activity check duration based on mmp_write and mmp_fail # Specify a short test via tunables but import pool imported while # tunables set to default duration. -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_MIN +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN log_must mmp_clear_hostid log_must mmp_set_hostid $HOSTID1 log_must import_activity_check $TESTPOOL "-f" $MMP_TEST_DURATION_DEFAULT diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_interval.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_interval.ksh index fb44d6191b..0c080ab5d3 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_interval.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_interval.ksh @@ -19,11 +19,11 @@ # # DESCRIPTION: -# zfs_multihost_interval should only accept valid values. +# MULTIHOST_INTERVAL should only accept valid values. # # STRATEGY: -# 1. Set zfs_multihost_interval to invalid values (negative). -# 2. Set zfs_multihost_interval to valid values. +# 1. Set MULTIHOST_INTERVAL to invalid values (negative). +# 2. Set MULTIHOST_INTERVAL to valid values. # . 
$STF_SUITE/include/libtest.shlib @@ -34,14 +34,14 @@ verify_runnable "both" function cleanup { - log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT + log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT } -log_assert "zfs_multihost_interval cannot be set to an invalid value" +log_assert "MULTIHOST_INTERVAL cannot be set to an invalid value" log_onexit cleanup -log_mustnot set_tunable64 zfs_multihost_interval -1 -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_MIN -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT +log_mustnot set_tunable64 MULTIHOST_INTERVAL -1 +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT -log_pass "zfs_multihost_interval cannot be set to an invalid value" +log_pass "MULTIHOST_INTERVAL cannot be set to an invalid value" diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_on_off.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_on_off.ksh index 8bef86a0ff..29d771de8f 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_on_off.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_on_off.ksh @@ -23,7 +23,7 @@ # # STRATEGY: # 1. Set multihost=off (disables mmp) -# 2. Set zfs_txg_timeout to large value +# 2. Set TXG_TIMEOUT to large value # 3. Create a zpool # 4. Find the current "best" uberblock # 5. 
Sleep for enough time for uberblocks to change @@ -44,8 +44,8 @@ verify_runnable "both" function cleanup { default_cleanup_noexit - log_must set_tunable64 zfs_txg_timeout $TXG_TIMEOUT_DEFAULT - log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT + log_must set_tunable64 TXG_TIMEOUT $TXG_TIMEOUT_DEFAULT + log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT log_must rm -f $PREV_UBER $CURR_UBER log_must mmp_clear_hostid } @@ -53,8 +53,8 @@ function cleanup log_assert "mmp thread won't write uberblocks with multihost=off" log_onexit cleanup -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_MIN -log_must set_tunable64 zfs_txg_timeout $TXG_TIMEOUT_LONG +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN +log_must set_tunable64 TXG_TIMEOUT $TXG_TIMEOUT_LONG log_must mmp_set_hostid $HOSTID1 default_setup_noexit $DISK diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_on_thread.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_on_thread.ksh index 07384c6231..01cca61c3c 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_on_thread.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_on_thread.ksh @@ -39,7 +39,7 @@ verify_runnable "both" function cleanup { default_cleanup_noexit - log_must set_tunable64 zfs_txg_timeout $TXG_TIMEOUT_DEFAULT + log_must set_tunable64 TXG_TIMEOUT $TXG_TIMEOUT_DEFAULT log_must rm -f $PREV_UBER $CURR_UBER log_must mmp_clear_hostid } @@ -47,7 +47,7 @@ function cleanup log_assert "mmp thread writes uberblocks (MMP)" log_onexit cleanup -log_must set_tunable64 zfs_txg_timeout $TXG_TIMEOUT_LONG +log_must set_tunable64 TXG_TIMEOUT $TXG_TIMEOUT_LONG log_must mmp_set_hostid $HOSTID1 default_setup_noexit $DISK diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_on_uberblocks.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_on_uberblocks.ksh index bf1eb54a73..007288a78f 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_on_uberblocks.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_on_uberblocks.ksh @@ 
-22,7 +22,7 @@ # Ensure that MMP updates uberblocks with MMP info at expected intervals. # # STRATEGY: -# 1. Set zfs_txg_timeout to large value +# 1. Set TXG_TIMEOUT to large value # 2. Create a zpool # 3. Clear multihost history # 4. Sleep, then collect count of uberblocks written @@ -47,15 +47,15 @@ MIN_SEQ_VALUES=7 function cleanup { default_cleanup_noexit - log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT - set_tunable64 zfs_txg_timeout $TXG_TIMEOUT_DEFAULT + log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT + set_tunable64 TXG_TIMEOUT $TXG_TIMEOUT_DEFAULT log_must mmp_clear_hostid } log_assert "Ensure MMP uberblocks update at the correct interval" log_onexit cleanup -log_must set_tunable64 zfs_txg_timeout $TXG_TIMEOUT_LONG +log_must set_tunable64 TXG_TIMEOUT $TXG_TIMEOUT_LONG log_must mmp_set_hostid $HOSTID1 default_setup_noexit "$DISKS" @@ -66,14 +66,14 @@ UBER_CHANGES=$(count_mmp_writes $TESTPOOL 10) log_note "Uberblock changed $UBER_CHANGES times" if [ $UBER_CHANGES -lt $MIN_UB_WRITES ]; then - log_fail "Fewer uberblock writes occured than expected ($EXPECTED)" + log_fail "Fewer uberblock writes occurred than expected ($EXPECTED)" fi if [ $UBER_CHANGES -gt $MAX_UB_WRITES ]; then - log_fail "More uberblock writes occured than expected ($EXPECTED)" + log_fail "More uberblock writes occurred than expected ($EXPECTED)" fi -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_MIN +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN SEQ_BEFORE=$(zdb -luuuu ${DISK[0]} | awk '/mmp_seq/ {if ($NF>max) max=$NF}; END {print max}') sleep 1 SEQ_AFTER=$(zdb -luuuu ${DISK[0]} | awk '/mmp_seq/ {if ($NF>max) max=$NF}; END {print max}') diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh index 842df284b8..6e3d1fe34d 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh +++ 
b/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh @@ -19,15 +19,15 @@ # # DESCRIPTION: -# Ensure that the MMP thread is notified when zfs_multihost_interval is -# reduced, and that changes to zfs_multihost_interval and -# zfs_multihost_fail_intervals do not trigger pool suspensions. +# Ensure that the MMP thread is notified when MULTIHOST_INTERVAL is +# reduced, and that changes to MULTIHOST_INTERVAL and +# MULTIHOST_FAIL_INTERVALS do not trigger pool suspensions. # # STRATEGY: -# 1. Set zfs_multihost_interval to much longer than the test duration +# 1. Set MULTIHOST_INTERVAL to much longer than the test duration # 2. Create a zpool and enable multihost # 3. Verify no MMP writes occurred -# 4. Set zfs_multihost_interval to 1 second +# 4. Set MULTIHOST_INTERVAL to 1 second # 5. Sleep briefly # 6. Verify MMP writes began # 7. Verify mmp_fail and mmp_write in uberblock reflect tunables @@ -43,34 +43,34 @@ verify_runnable "both" function cleanup { default_cleanup_noexit - log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT - log_must set_tunable64 zfs_multihost_fail_intervals \ + log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT + log_must set_tunable64 MULTIHOST_FAIL_INTERVALS \ $MMP_FAIL_INTERVALS_DEFAULT log_must mmp_clear_hostid } -log_assert "mmp threads notified when zfs_multihost_interval reduced" +log_assert "mmp threads notified when MULTIHOST_INTERVAL reduced" log_onexit cleanup -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_HOUR +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_HOUR log_must mmp_set_hostid $HOSTID1 default_setup_noexit $DISK log_must zpool set multihost=on $TESTPOOL clear_mmp_history -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT uber_count=$(count_mmp_writes $TESTPOOL 1) if [ $uber_count -eq 0 ]; then - log_fail "ERROR: mmp writes did not start when zfs_multihost_interval reduced" + log_fail 
"ERROR: mmp writes did not start when MULTIHOST_INTERVAL reduced" fi # 7. Verify mmp_write and mmp_fail are written for fails in $(seq $MMP_FAIL_INTERVALS_MIN $((MMP_FAIL_INTERVALS_MIN*2))); do for interval in $(seq $MMP_INTERVAL_MIN 200 $MMP_INTERVAL_DEFAULT); do - log_must set_tunable64 zfs_multihost_fail_intervals $fails - log_must set_tunable64 zfs_multihost_interval $interval + log_must set_tunable64 MULTIHOST_FAIL_INTERVALS $fails + log_must set_tunable64 MULTIHOST_INTERVAL $interval log_must sync_pool $TESTPOOL typeset mmp_fail=$(zdb $TESTPOOL 2>/dev/null | awk '/mmp_fail/ {print $NF}') @@ -86,10 +86,10 @@ for fails in $(seq $MMP_FAIL_INTERVALS_MIN $((MMP_FAIL_INTERVALS_MIN*2))); do done -# 8. Repeatedly change zfs_multihost_interval and fail_intervals +# 8. Repeatedly change MULTIHOST_INTERVAL and fail_intervals for x in $(seq 10); do typeset new_interval=$(( (RANDOM % 20 + 1) * $MMP_INTERVAL_MIN )) - log_must set_tunable64 zfs_multihost_interval $new_interval + log_must set_tunable64 MULTIHOST_INTERVAL $new_interval typeset action=$((RANDOM %10)) if [ $action -eq 0 ]; then log_must zpool export -a @@ -106,14 +106,14 @@ for x in $(seq 10); do log_must zpool import -f $TESTPOOL elif [ $action -eq 3 ]; then log_must zpool export -F $TESTPOOL - log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_MIN + log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN log_must zpool import $TESTPOOL elif [ $action -eq 4 ]; then - log_must set_tunable64 zfs_multihost_fail_intervals \ + log_must set_tunable64 MULTIHOST_FAIL_INTERVALS \ $((RANDOM % MMP_FAIL_INTERVALS_DEFAULT)) fi sleep 5 done -log_pass "mmp threads notified when zfs_multihost_interval reduced" +log_pass "mmp threads notified when MULTIHOST_INTERVAL reduced" diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh index 7504caa4d1..b6bdc68116 100755 --- 
a/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh @@ -57,8 +57,8 @@ log_must zpool create -f $MMP_POOL mirror $MMP_DIR/file.{0,1} mirror $MMP_DIR/fi # Step 2 log_must mmp_set_hostid $HOSTID1 log_must zpool set multihost=on $MMP_POOL -set_tunable64 zfs_multihost_history 0 -set_tunable64 zfs_multihost_history 40 +set_tunable64 MULTIHOST_HISTORY 0 +set_tunable64 MULTIHOST_HISTORY 40 # Step 3 # default settings, every leaf written once/second diff --git a/tests/zfs-tests/tests/functional/mmp/setup.ksh b/tests/zfs-tests/tests/functional/mmp/setup.ksh index c91f61979c..b1e5431c84 100755 --- a/tests/zfs-tests/tests/functional/mmp/setup.ksh +++ b/tests/zfs-tests/tests/functional/mmp/setup.ksh @@ -27,8 +27,8 @@ if [ -e $HOSTID_FILE ]; then log_unsupported "System has existing $HOSTID_FILE file" fi -log_must set_tunable64 zfs_multihost_history $MMP_HISTORY -log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT -log_must set_tunable64 zfs_multihost_fail_intervals $MMP_FAIL_INTERVALS_DEFAULT +log_must set_tunable64 MULTIHOST_HISTORY $MMP_HISTORY +log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_DEFAULT +log_must set_tunable64 MULTIHOST_FAIL_INTERVALS $MMP_FAIL_INTERVALS_DEFAULT log_pass "mmp setup pass" diff --git a/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh b/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh index 0d2628079c..6130e2c828 100755 --- a/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh +++ b/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh @@ -32,7 +32,7 @@ function cleanup { - log_must set_tunable32 zfs_unlink_suspend_progress $default_unlink_sp + log_must set_tunable32 UNLINK_SUSPEND_PROGRESS $default_unlink_sp for fs in $(seq 1 3); do mounted $TESTDIR.$fs || zfs mount $TESTPOOL/$TESTFS.$fs rm -f $TESTDIR.$fs/file-* @@ -66,8 +66,7 @@ function unlinked_size_is } 
-UNLINK_SP_PARAM=/sys/module/zfs/parameters/zfs_unlink_suspend_progress -default_unlink_sp=$(get_tunable zfs_unlink_suspend_progress) +default_unlink_sp=$(get_tunable UNLINK_SUSPEND_PROGRESS) log_onexit cleanup @@ -89,7 +88,7 @@ for fs in 1 2 3; do log_must xattrtest -f 175 -x 3 -r -k -p $TESTDIR.$fs fi - log_must set_tunable32 zfs_unlink_suspend_progress 1 + log_must set_tunable32 UNLINK_SUSPEND_PROGRESS 1 log_must unlinked_size_is 0 $TESTPOOL $TESTPOOL/$TESTFS.$fs # build up unlinked set @@ -106,7 +105,7 @@ for fs in 1 2 3; do log_must unlinked_size_is 100 $TESTPOOL $TESTPOOL/$TESTFS.$fs # confirm we can drain and add to unlinked set at the same time - log_must set_tunable32 zfs_unlink_suspend_progress 0 + log_must set_tunable32 UNLINK_SUSPEND_PROGRESS 0 log_must zfs umount $TESTPOOL/$TESTFS.$fs log_must zfs mount $TESTPOOL/$TESTFS.$fs for fn in $(seq 101 175); do diff --git a/tests/zfs-tests/tests/functional/mount/umountall_001.ksh b/tests/zfs-tests/tests/functional/mount/umountall_001.ksh index b8c89c623a..814c831e40 100755 --- a/tests/zfs-tests/tests/functional/mount/umountall_001.ksh +++ b/tests/zfs-tests/tests/functional/mount/umountall_001.ksh @@ -45,6 +45,8 @@ zfs_list="/ /lib /sbin /tmp /usr /var /var/adm /var/run" # Append our ZFS filesystems to the list, not worrying about duplicates. 
if is_linux; then typeset mounts=$(mount | awk '{if ($5 == "zfs") print $3}') +elif is_freebsd; then + typeset mounts=$(mount -p | awk '{if ($3 == "zfs") print $2}') else typeset mounts=$(mount -p | awk '{if ($4 == "zfs") print $3}') fi @@ -60,6 +62,9 @@ if is_linux; then if [[ -z $mounts ]]; then mounts=$(awk '/zfs/ { print $2 }' /proc/mounts) fi +elif is_freebsd; then + # Umountall and umount not supported on FreeBSD + mounts=$(mount -t zfs | sort -r | awk '{print $3}') else mounts=$(umountall -n -F zfs 2>&1 | awk '{print $2}') fi diff --git a/tests/zfs-tests/tests/functional/mv_files/mv_files_common.kshlib b/tests/zfs-tests/tests/functional/mv_files/mv_files_common.kshlib index 24b3fab38e..6b925501b0 100644 --- a/tests/zfs-tests/tests/functional/mv_files/mv_files_common.kshlib +++ b/tests/zfs-tests/tests/functional/mv_files/mv_files_common.kshlib @@ -152,8 +152,7 @@ function generate_files # function mv_files { - - find $1 -type f -print | xargs -i \ + find $1 -type f -print | xargs -I "{}" \ mv {} $2 > /dev/null 2>&1 } diff --git a/tests/zfs-tests/tests/functional/mv_files/random_creation.ksh b/tests/zfs-tests/tests/functional/mv_files/random_creation.ksh index 45c46f83c0..05ddf62984 100755 --- a/tests/zfs-tests/tests/functional/mv_files/random_creation.ksh +++ b/tests/zfs-tests/tests/functional/mv_files/random_creation.ksh @@ -11,7 +11,7 @@ DIR="${TESTDIR}/RANDOM_SMALL" log_must mkdir "${DIR}" count=0 -for i in $(shuf -i 1-"${RC_PASS1}") ; do +for i in $(range_shuffle 1 "${RC_PASS1}") ; do if ! 
touch "${DIR}/${i}" ; then log_fail "error creating ${i} after ${count} files" fi diff --git a/tests/zfs-tests/tests/functional/no_space/enospc_003_pos.ksh b/tests/zfs-tests/tests/functional/no_space/enospc_003_pos.ksh index 40aa500249..496e2a029c 100755 --- a/tests/zfs-tests/tests/functional/no_space/enospc_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/no_space/enospc_003_pos.ksh @@ -44,25 +44,28 @@ verify_runnable "both" function cleanup { - log_must zpool destroy $TESTPOOL1 + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + rm -f $testfile0 } log_onexit cleanup log_assert "ENOSPC is returned on pools with large physical block size" -log_must zpool create $TESTPOOL1 -o ashift=13 $DISK_LARGE +typeset testfile0=${TESTDIR}/testfile0 + +log_must zpool create -o ashift=13 $TESTPOOL1 $DISK_LARGE log_must zfs set mountpoint=$TESTDIR $TESTPOOL1 log_must zfs set compression=off $TESTPOOL1 log_must zfs set recordsize=512 $TESTPOOL1 log_must zfs set copies=3 $TESTPOOL1 -log_note "Writing file: $TESTFILE0 until ENOSPC." -file_write -o create -f $TESTDIR/$TESTFILE0 -b $BLOCKSZ \ +log_note "Writing file: $testfile0 until ENOSPC." +file_write -o create -f $testfile0 -b $BLOCKSZ \ -c $NUM_WRITES -d $DATA ret=$? (( $ret != $ENOSPC )) && \ - log_fail "$TESTFILE0 returned: $ret rather than ENOSPC." + log_fail "$testfile0 returned: $ret rather than ENOSPC." log_pass "ENOSPC returned as expected." diff --git a/tests/zfs-tests/tests/functional/no_space/enospc_df.ksh b/tests/zfs-tests/tests/functional/no_space/enospc_df.ksh index b3df69141f..b1eeaf2cc5 100755 --- a/tests/zfs-tests/tests/functional/no_space/enospc_df.ksh +++ b/tests/zfs-tests/tests/functional/no_space/enospc_df.ksh @@ -58,7 +58,7 @@ log_must zfs umount $TESTPOOL/$TESTFS # Ensure the pool root filesystem shows in df output. 
# If the pool was full (available == 0) and the pool -# root filesytem had very little in it (used < 1 block), +# root filesystem had very little in it (used < 1 block), # the size reported to df was zero (issue #8253) and # df skipped the filesystem in its output. log_must eval "df -h | grep $TESTPOOL" diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_copies.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_copies.ksh index 3971820966..2a61f605b2 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_copies.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_copies.ksh @@ -34,7 +34,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $origin + datasetexists $origin && destroy_dataset $origin -R log_must zfs create -o mountpoint=$TESTDIR $origin } diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_mtime.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_mtime.ksh index 4d06cfe4a2..0422bbaca1 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_mtime.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_mtime.ksh @@ -34,7 +34,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $origin + datasetexists $origin && destroy_dataset $origin -R log_must zfs create -o mountpoint=$TESTDIR $origin } @@ -51,6 +51,10 @@ if is_linux; then o_atime=$(stat -c %X $TESTDIR/clone/file) o_ctime=$(stat -c %Z $TESTDIR/clone/file) o_mtime=$(stat -c %Y $TESTDIR/clone/file) +elif is_freebsd; then + o_atime=$(stat -f "%a" $TESTDIR/clone/file) + o_ctime=$(stat -f "%c" $TESTDIR/clone/file) + o_mtime=$(stat -f "%m" $TESTDIR/clone/file) else o_atime=$(ls -E% all $TESTDIR/clone/file | awk '/atime/ {print $4}') o_ctime=$(ls -E% all $TESTDIR/clone/file | awk '/ctime/ {print $4}') @@ -66,6 +70,10 @@ if is_linux; then atime=$(stat -c %X $TESTDIR/clone/file) ctime=$(stat -c %Z $TESTDIR/clone/file) mtime=$(stat -c %Y $TESTDIR/clone/file) +elif 
is_freebsd; then + atime=$(stat -f "%a" $TESTDIR/clone/file) + ctime=$(stat -f "%c" $TESTDIR/clone/file) + mtime=$(stat -f "%m" $TESTDIR/clone/file) else atime=$(ls -E% all $TESTDIR/clone/file | awk '/atime/ {print $4}') ctime=$(ls -E% all $TESTDIR/clone/file | awk '/ctime/ {print $4}') diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_negative.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_negative.ksh index 8b0b9b5499..617c34602b 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_negative.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_negative.ksh @@ -36,7 +36,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $origin + datasetexists $origin && destroy_dataset $origin -R log_must zfs create -o mountpoint=$TESTDIR $origin } diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_promoted_clone.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_promoted_clone.ksh index f9e6e83b7a..057c59a380 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_promoted_clone.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_promoted_clone.ksh @@ -36,7 +36,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $TESTPOOL/clone + datasetexists $origin && destroy_dataset $TESTPOOL/clone -R log_must zfs create -o mountpoint=$TESTDIR $origin } diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_recsize.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_recsize.ksh index 14caedbf20..38e7ec1ff2 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_recsize.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_recsize.ksh @@ -34,7 +34,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $origin + datasetexists $origin && destroy_dataset $origin -R log_must zfs create -o mountpoint=$TESTDIR $origin } @@ -50,7 +50,7 @@ log_must zfs clone $origin@a 
$origin/clone for rs in 512 1024 2048 4096 8192 16384 32768 65536 131072 ; do log_must zfs set recsize=$rs $origin/clone dd if=/$TESTDIR/file of=/$TESTDIR/clone/file bs=1024k count=$MEGS \ - conv=notrunc > $TEST_BASE_DIR/null 2>&1 || log_fail "dd failed." + conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." log_must verify_nopwrite $origin $origin@a $origin/clone done diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_sync.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_sync.ksh index c9d7b59b34..e0721cac19 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_sync.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_sync.ksh @@ -24,7 +24,7 @@ # # Strategy: # 1. Create an origin fs with compression and sha256. -# 2. Clone origin such that it inherits the properies. +# 2. Clone origin such that it inherits the properties. # 3. Use dd with the sync flag to test the sync write path. # @@ -34,7 +34,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $origin + datasetexists $origin && destroy_dataset $origin -R log_must zfs create -o mountpoint=$TESTDIR $origin } diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_varying_compression.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_varying_compression.ksh index d91d5536f8..190bdbd6c6 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_varying_compression.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_varying_compression.ksh @@ -12,11 +12,12 @@ # # -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2016, Delphix. All rights reserved. +# Copyright (c) 2019, Kjeld Schouten-Lebbing. All Rights Reserved. # -. $STF_SUITE/include/libtest.shlib . $STF_SUITE/include/properties.shlib +. $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/nopwrite/nopwrite.shlib # @@ -40,7 +41,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $origin + datasetexists $origin && destroy_dataset $origin -R log_must zfs create -o mountpoint=$TESTDIR $origin } @@ -51,8 +52,8 @@ log_must zfs set checksum=sha256 $origin dd if=/dev/urandom of=$TESTDIR/file bs=1024k count=$MEGS conv=notrunc \ >/dev/null 2>&1 || log_fail "initial dd failed." -# Verify nop_write for 4 random compression algorithms -for i in $(get_rand_compress 4); do +# Verify nop_write for all compression algorithms except "off" +for i in "${compress_prop_vals[@]:1}"; do zfs snapshot $origin@a || log_fail "zfs snap failed" log_must zfs clone -o compress=$i $origin@a $origin/clone dd if=/$TESTDIR/file of=/$TESTDIR/clone/file bs=1024k count=$MEGS \ diff --git a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh index 126a00ea1c..e0422a3acf 100755 --- a/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh +++ b/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh @@ -37,7 +37,7 @@ log_onexit cleanup function cleanup { - datasetexists $origin && log_must zfs destroy -R $origin + datasetexists $origin && destroy_dataset $origin -R # No need to recreate the volume as no other tests expect it. } @@ -45,13 +45,14 @@ log_assert "nopwrite works on volumes" log_must zfs set compress=on $origin log_must zfs set checksum=sha256 $origin -dd if=/dev/urandom of=$vol bs=8192 count=4096 conv=notrunc >/dev/null \ +dd if=/dev/urandom of=$vol bs=16384 count=2048 conv=notrunc >/dev/null \ 2>&1 || log_fail "dd into $origin failed." 
zfs snapshot $origin@a || log_fail "zfs snap failed" log_must zfs clone $origin@a $clone log_must zfs set compress=on $clone log_must zfs set checksum=sha256 $clone -dd if=$vol of=$volclone bs=8192 count=4096 conv=notrunc >/dev/null 2>&1 || \ +block_device_wait +dd if=$vol of=$volclone bs=16384 count=2048 conv=notrunc >/dev/null 2>&1 || \ log_fail "dd into $clone failed." log_must verify_nopwrite $origin $origin@a $clone diff --git a/tests/zfs-tests/tests/functional/online_offline/online_offline_002_neg.ksh b/tests/zfs-tests/tests/functional/online_offline/online_offline_002_neg.ksh index 99b9d6bf1e..19576a8210 100755 --- a/tests/zfs-tests/tests/functional/online_offline/online_offline_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/online_offline/online_offline_002_neg.ksh @@ -90,10 +90,7 @@ while [[ $i -lt ${#disks[*]} ]]; do log_must zpool online $TESTPOOL ${disks[$i]} check_state $TESTPOOL ${disks[$i]} "online" || \ log_fail "Failed to set ${disks[$i]} online" - # Delay for resilver to complete - while ! is_pool_resilvered $TESTPOOL; do - log_must sleep 1 - done + log_must zpool wait -t resilver $TESTPOOL log_must zpool clear $TESTPOOL while [[ $j -lt ${#disks[*]} ]]; do if [[ $j -eq $i ]]; then @@ -125,10 +122,7 @@ while [[ $i -lt ${#disks[*]} ]]; do log_must zpool online $TESTPOOL ${disks[$i]} check_state $TESTPOOL ${disks[$i]} "online" || \ log_fail "Failed to set ${disks[$i]} online" - # Delay for resilver to complete - while ! 
is_pool_resilvered $TESTPOOL; do - log_must sleep 1 - done + log_must zpool wait -t resilver $TESTPOOL log_must zpool clear $TESTPOOL fi ((i++)) diff --git a/tests/zfs-tests/tests/functional/pam/Makefile.am b/tests/zfs-tests/tests/functional/pam/Makefile.am new file mode 100644 index 0000000000..4d9ae17084 --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/Makefile.am @@ -0,0 +1,7 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/pam +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + pam_basic.ksh \ + pam_nounmount.ksh \ + utilities.kshlib diff --git a/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/tests/zfs-tests/tests/functional/pam/cleanup.ksh new file mode 100755 index 0000000000..62131c6d68 --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. 
$STF_SUITE/tests/functional/pam/utilities.kshlib + +destroy_pool $TESTPOOL +del_user ${username} +del_group pamtestgroup + +rm -rf "$runstatedir" +for dir in $TESTDIRS; do + rm -rf $dir +done diff --git a/tests/zfs-tests/tests/functional/pam/pam_basic.ksh b/tests/zfs-tests/tests/functional/pam/pam_basic.ksh new file mode 100755 index 0000000000..96ac594536 --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/pam_basic.ksh @@ -0,0 +1,49 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. 
$STF_SUITE/tests/functional/pam/utilities.kshlib + +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir}" +echo "testpass" | pamtester pam_zfs_key_test ${username} open_session +references 1 +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +echo "testpass" | pamtester pam_zfs_key_test ${username} open_session +references 2 +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +log_must pamtester pam_zfs_key_test ${username} close_session +references 1 +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +log_must pamtester pam_zfs_key_test ${username} close_session +references 0 +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +log_pass "done." diff --git a/tests/zfs-tests/tests/functional/pam/pam_nounmount.ksh b/tests/zfs-tests/tests/functional/pam/pam_nounmount.ksh new file mode 100755 index 0000000000..8179f398df --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/pam_nounmount.ksh @@ -0,0 +1,51 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. 
$STF_SUITE/tests/functional/pam/utilities.kshlib + +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir} nounmount" +echo "testpass" | pamtester pam_zfs_key_test ${username} open_session +references 1 +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +echo "testpass" | pamtester pam_zfs_key_test ${username} open_session +references 2 +keystatus available +log_must ismounted "$TESTPOOL/pam/${username}" + +log_must pamtester pam_zfs_key_test ${username} close_session +references 1 +keystatus available +log_must ismounted "$TESTPOOL/pam/${username}" + +log_must pamtester pam_zfs_key_test ${username} close_session +references 0 +keystatus available +log_must ismounted "$TESTPOOL/pam/${username}" +log_must zfs unmount "$TESTPOOL/pam/${username}" +log_must zfs unload-key "$TESTPOOL/pam/${username}" + +log_pass "done." diff --git a/tests/zfs-tests/tests/functional/pam/setup.ksh b/tests/zfs-tests/tests/functional/pam/setup.ksh new file mode 100755 index 0000000000..23515a598e --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/setup.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/tests/functional/pam/utilities.kshlib + +if ! which pamtester; then + log_unsupported "pam tests require the pamtester utility to be installed" +fi + +DISK=${DISKS%% *} +create_pool $TESTPOOL "$DISK" + +log_must zfs create -o mountpoint="$TESTDIR" "$TESTPOOL/pam" +log_must add_group pamtestgroup +log_must add_user pamtestgroup ${username} +log_must mkdir -p "$runstatedir" + +echo "testpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase -o keylocation=prompt "$TESTPOOL/pam/${username}" +log_must zfs unmount "$TESTPOOL/pam/${username}" +log_must zfs unload-key "$TESTPOOL/pam/${username}" + +log_pass diff --git a/tests/zfs-tests/tests/functional/pam/utilities.kshlib b/tests/zfs-tests/tests/functional/pam/utilities.kshlib new file mode 100644 index 0000000000..ef80f5a4f1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/utilities.kshlib @@ -0,0 +1,40 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. 
$STF_SUITE/include/libtest.shlib + +username="pamtestuser" +runstatedir="${TESTDIR}_run" +function keystatus { + log_must [ "$(zfs list -Ho keystatus "$TESTPOOL/pam/${username}")" == "$1" ] +} + +function genconfig { + for i in password auth session; do + printf "%s\trequired\tpam_permit.so\n%s\toptional\tpam_zfs_key.so\t%s\n" "$i" "$i" "$1" + done > /etc/pam.d/pam_zfs_key_test +} + +function references { + log_must [ "$(cat "${runstatedir}/$(id -u ${username})")" == "$1" ] +} + diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh index f915d2ad41..7e523ef908 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh @@ -52,6 +52,7 @@ fragment_after_checkpoint_and_verify log_must zpool export $NESTEDPOOL log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL -log_must zdb $NESTEDPOOL +log_must zpool export $NESTEDPOOL +log_must zdb -e -p $FILEDISKDIR $NESTEDPOOL log_pass "Rewind to checkpoint on a stressed pool." 
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh index c473451c2e..b6d34307b3 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh @@ -46,7 +46,7 @@ function test_cleanup { poolexists $NESTEDPOOL && destroy_pool $NESTEDPOOL - log_must set_tunable32 spa_asize_inflation 24 + set_tunable32 SPA_ASIZE_INFLATION 24 cleanup_test_pool } @@ -54,7 +54,7 @@ verify_runnable "global" setup_test_pool log_onexit test_cleanup -log_must set_tunable32 spa_asize_inflation 4 +log_must set_tunable32 SPA_ASIZE_INFLATION 4 log_must zfs create $DISKFS @@ -80,13 +80,14 @@ log_mustnot dd if=/dev/urandom of=$NESTEDFS0FILE bs=1M count=300 # log_must zpool list $NESTEDPOOL -log_must zdb -kc $NESTEDPOOL - log_must zpool export $NESTEDPOOL +log_must zdb -e -p $FILEDISKDIR -kc $NESTEDPOOL + log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL log_must [ "$(head -c 100 $NESTEDFS0FILE)" = "$FILE0INTRO" ] -log_must zdb $NESTEDPOOL +log_must zpool export $NESTEDPOOL +log_must zdb -e -p $FILEDISKDIR $NESTEDPOOL log_pass "Do not reuse checkpointed space at low capacity." diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index f1abad063d..f970935f5b 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -41,7 +41,7 @@ verify_runnable "global" function test_cleanup { # reset memory limit to 16M - set_tunable64 zfs_spa_discard_memory_limit 1000000 + set_tunable64 SPA_DISCARD_MEMORY_LIMIT 1000000 cleanup_nested_pools } @@ -67,7 +67,7 @@ log_onexit test_cleanup # map, we should have even more time to # verify this. 
# -set_tunable64 zfs_spa_discard_memory_limit 128 +set_tunable64 SPA_DISCARD_MEMORY_LIMIT 128 log_must zpool checkpoint $NESTEDPOOL @@ -100,11 +100,12 @@ log_mustnot zpool remove $NESTEDPOOL $FILEDISK1 log_mustnot zpool reguid $NESTEDPOOL # reset memory limit to 16M -set_tunable64 zfs_spa_discard_memory_limit 16777216 +set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 nested_wait_discard_finish -log_must zdb $NESTEDPOOL +log_must zpool export $NESTEDPOOL +log_must zdb -e -p $FILEDISKDIR $NESTEDPOOL log_pass "Can export/import but not rewind/checkpoint/discard or " \ "change pool's config while discarding." diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh index ad96d5dcb6..514a059841 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh @@ -52,7 +52,7 @@ populate_test_pool # # Create big empty file and do some writes at random # offsets to ensure that it takes up space. Note that -# the implcitly created filesystem ($FS0) does not +# the implicitly created filesystem ($FS0) does not # have compression enabled. # log_must mkfile $BIGFILESIZE $FS0FILE diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib b/tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib index 6e410e0c85..bb8bab6cdf 100644 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib @@ -27,7 +27,7 @@ # This is why these tests run directly on pools that use a # "real disk vdev" (meaning not a file based one). These tests # use the $TESTPOOL pool that is created on top of $TESTDISK. 
-# This pool is refered to as the "test pool" and thus all +# This pool is referred to as the "test pool" and thus all # the tests of this group use the testpool-related functions of # this file (not the nested_pools ones). # @@ -154,13 +154,18 @@ function setup_nested_pools function cleanup_nested_pool { - log_must zpool destroy $NESTEDPOOL + if poolexists $NESTEDPOOL; then + log_must zpool destroy $NESTEDPOOL + fi + log_must rm -f $FILEDISKS } function cleanup_test_pool { - log_must zpool destroy $TESTPOOL + if poolexists $TESTPOOL; then + log_must zpool destroy $TESTPOOL + fi # # We always clear the labels of all disks diff --git a/tests/zfs-tests/tests/functional/pool_names/pool_names_002_neg.ksh b/tests/zfs-tests/tests/functional/pool_names/pool_names_002_neg.ksh index 0c96e1999e..4b6744563d 100755 --- a/tests/zfs-tests/tests/functional/pool_names/pool_names_002_neg.ksh +++ b/tests/zfs-tests/tests/functional/pool_names/pool_names_002_neg.ksh @@ -106,14 +106,14 @@ do done log_note "Verify invalid pool names fail" -set -A POOLNAME "c0t0d0s0" "c0t0d0" "c0t0d19" "c0t50000E0108D279d0" \ +set -A POOLNAME \ "mirror" "raidz" ",," ",,,,,,,,,,,,,,,,,,,,,,,,," \ "2222222222222222222" "mirror_pool" "raidz_pool" \ "mirror-pool" "raidz-pool" "spare" "spare_pool" \ "spare-pool" "raidz1-" "raidz2:" ":aaa" "-bbb" "_ccc" ".ddd" -if verify_slog_support ; then - POOLNAME[${#POOLNAME[@]}]='log' -fi + +POOLNAME[${#POOLNAME[@]}]='log' + typeset -i i=0 while ((i < ${#POOLNAME[@]})); do log_mustnot zpool create -m $TESTDIR ${POOLNAME[$i]} $DISK diff --git a/tests/zfs-tests/tests/functional/privilege/cleanup.ksh b/tests/zfs-tests/tests/functional/privilege/cleanup.ksh index 45a6a0f762..99985c670f 100755 --- a/tests/zfs-tests/tests/functional/privilege/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/privilege/cleanup.ksh @@ -31,7 +31,7 @@ . 
$STF_SUITE/include/libtest.shlib -if is_linux; then +if is_linux || is_freebsd; then log_unsupported "Privilege tests require pfexec command" fi diff --git a/tests/zfs-tests/tests/functional/privilege/privilege_001_pos.ksh b/tests/zfs-tests/tests/functional/privilege/privilege_001_pos.ksh index ae869380d0..af4f705679 100755 --- a/tests/zfs-tests/tests/functional/privilege/privilege_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/privilege/privilege_001_pos.ksh @@ -57,7 +57,7 @@ # We can only run this in the global zone verify_runnable "global" -if is_linux; then +if is_linux || is_freebsd; then log_unsupported "Requires pfexec command" fi diff --git a/tests/zfs-tests/tests/functional/privilege/privilege_002_pos.ksh b/tests/zfs-tests/tests/functional/privilege/privilege_002_pos.ksh index 22cfaf55d7..ab00e32952 100755 --- a/tests/zfs-tests/tests/functional/privilege/privilege_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/privilege/privilege_002_pos.ksh @@ -60,7 +60,7 @@ verify_runnable "both" -if is_linux; then +if is_linux || is_freebsd; then log_unsupported "Requires pfexec command" fi diff --git a/tests/zfs-tests/tests/functional/privilege/setup.ksh b/tests/zfs-tests/tests/functional/privilege/setup.ksh index 94576d835e..4eb0693944 100755 --- a/tests/zfs-tests/tests/functional/privilege/setup.ksh +++ b/tests/zfs-tests/tests/functional/privilege/setup.ksh @@ -31,10 +31,6 @@ . $STF_SUITE/include/libtest.shlib -if is_linux; then - log_unsupported "Requires pfexec command" -fi - ZFS_USER=zfsrbac USES_NIS=false diff --git a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh index a3afe0c429..080fdddb2d 100755 --- a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh +++ b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh @@ -61,16 +61,18 @@ function cleanup log_must rm -f $BACKUP fi - # Our disk is back. Now we can clear errors and destroy the - # pool cleanly. 
- log_must zpool clear $TESTPOOL2 + if poolexists $TESTPOOL2 ; then + # Our disk is back. Now we can clear errors and destroy the + # pool cleanly. + log_must zpool clear $TESTPOOL2 - # Now that the disk is back and errors cleared, wait for our - # hung 'zpool scrub' to finish. - wait + # Now that the disk is back and errors cleared, wait for our + # hung 'zpool scrub' to finish. + wait - destroy_pool $TESTPOOL2 - log_must rm $REALDISK + destroy_pool $TESTPOOL2 + fi + log_must rm -f $REALDISK unload_scsi_debug fi } @@ -105,8 +107,10 @@ check_all $TESTPOOL "ONLINE" # Fault one of the disks, and check that pool is degraded DISK1=$(echo "$DISKS" | awk '{print $2}') -zpool offline -tf $TESTPOOL $DISK1 +log_must zpool offline -tf $TESTPOOL $DISK1 check_all $TESTPOOL "DEGRADED" +log_must zpool online $TESTPOOL $DISK1 +log_must zpool clear $TESTPOOL # Create a new pool out of a scsi_debug disk TESTPOOL2=testpool2 @@ -137,7 +141,7 @@ remove_disk $SDISK # background since the command will hang when the pool gets suspended. The # command will resume and exit after we restore the missing disk later on. 
zpool scrub $TESTPOOL2 & -sleep 1 # Give the scrub some time to run before we check if it fails +sleep 3 # Give the scrub some time to run before we check if it fails log_must check_all $TESTPOOL2 "SUSPENDED" diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh index c9eff3649c..dfc1f1ee04 100755 --- a/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh +++ b/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh @@ -42,13 +42,13 @@ function cleanup { - datasetexists $FS && log_must zfs destroy -r $FS + datasetexists $FS && destroy_dataset $FS -r } function count_snap_cmds { typeset expected_count=$1 - count=$(grep "command: zfs snapshot $FS@testsnapshot" | wc -l) + count=$(grep -E "command: (lt-)?zfs snapshot $FS@testsnapshot" | wc -l) log_must eval "[[ $count -eq $expected_count ]]" } diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh index 473de5c84e..1af1c2c070 100755 --- a/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh +++ b/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh @@ -43,7 +43,7 @@ function cleanup { [[ -z $msgs1 ]] || log_must rm $msgs1 [[ -z $msgs2 ]] || log_must rm $msgs2 - datasetexists $FS && log_must zfs destroy -r $FS + datasetexists $FS && destroy_dataset $FS -r } typeset -r ZFS_DBGMSG=/proc/spl/kstat/zfs/dbgmsg @@ -75,7 +75,7 @@ msgs2=$(mktemp) || log_fail # Truncate the result of the read that completed second in case it picked up an # extra message that was logged after the first read completed. 
# -log_must truncate -s $(stat -c "%s" $msgs1) $msgs2 +log_must truncate -s $(stat_size $msgs1) $msgs2 log_must diff $msgs1 $msgs2 diff --git a/tests/zfs-tests/tests/functional/procfs/setup.ksh b/tests/zfs-tests/tests/functional/procfs/setup.ksh index b3812dbdc6..79fa28f4f1 100755 --- a/tests/zfs-tests/tests/functional/procfs/setup.ksh +++ b/tests/zfs-tests/tests/functional/procfs/setup.ksh @@ -26,9 +26,4 @@ . $STF_SUITE/include/libtest.shlib -if ! is_linux ; then - log_unsupported "procfs is only used on Linux" -fi - default_mirror_setup $DISKS -log_pass diff --git a/tests/zfs-tests/tests/functional/projectquota/projectid_001_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectid_001_pos.ksh index 44af9941b9..46e79062a0 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectid_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectid_001_pos.ksh @@ -38,8 +38,8 @@ # # # STRATEGY: -# 1. Create a regular file and a directroy. -# 2. Set project ID on both directroy and regular file. +# 1. Create a regular file and a directory. +# 2. Set project ID on both directory and regular file. # 3. New created subdir or regular file should inherit its parent's # project ID if its parent has project inherit flag. # 4. New created subdir should inherit its parent project's inherit flag. diff --git a/tests/zfs-tests/tests/functional/projectquota/projectid_002_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectid_002_pos.ksh index 1a402e298b..e382f46404 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectid_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectid_002_pos.ksh @@ -41,7 +41,7 @@ # 1. Create three directories # 2. Set tdir1 and tdir3 project ID as PRJID1, # set tdir2 project ID as PRJID2. -# 3. Create regular file under tdir1. It inherits tdir1 proejct ID. +# 3. Create regular file under tdir1. It inherits tdir1 project ID. # 4. 
Hardlink from tdir1's child to tdir2 should be denied, # move tdir1's child to tdir2 will be object recreated. # 5. Hardlink from tdir1's child to tdir3 should succeed. diff --git a/tests/zfs-tests/tests/functional/projectquota/projectquota_004_neg.ksh b/tests/zfs-tests/tests/functional/projectquota/projectquota_004_neg.ksh index df0eda7d77..7ca81c3fbc 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectquota_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectquota_004_neg.ksh @@ -43,9 +43,7 @@ function cleanup { - if datasetexists $snap_fs; then - log_must zfs destroy $snap_fs - fi + datasetexists $snap_fs && destroy_dataset $snap_fs log_must cleanup_projectquota } @@ -62,7 +60,7 @@ for prj in "${no_prjs[@]}"; do log_mustnot zfs set projectquota@$prj=100m $QFS done -log_note "can set all numberic id even that id is not existed" +log_note "can set all numeric id even if that id does not exist" log_must zfs set projectquota@12345678=100m $QFS set -A sizes "100mfsd" "m0.12m" "GGM" "-1234-m" "123m-m" diff --git a/tests/zfs-tests/tests/functional/projectquota/projectquota_005_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectquota_005_pos.ksh index b52f302f78..0736648f1e 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectquota_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectquota_005_pos.ksh @@ -43,9 +43,7 @@ function cleanup { - if datasetexists $snap_fs; then - log_must zfs destroy $snap_fs - fi + datasetexists $snap_fs && destroy_dataset $snap_fs log_must cleanup_projectquota } diff --git a/tests/zfs-tests/tests/functional/projectquota/projectquota_008_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectquota_008_pos.ksh index 365b5627e8..b045b2c5fc 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectquota_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectquota_008_pos.ksh @@ -48,9 +48,7 @@ function cleanup { - if datasetexists $snap_fs; 
then - log_must zfs destroy $snap_fs - fi + datasetexists $snap_fs && destroy_dataset $snap_fs log_must cleanup_projectquota } diff --git a/tests/zfs-tests/tests/functional/projectquota/projectquota_009_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectquota_009_pos.ksh index a867b538c1..da44e731a9 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectquota_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectquota_009_pos.ksh @@ -49,9 +49,7 @@ function cleanup { for ds in $TESTPOOL/fs $TESTPOOL/fs-rename $TESTPOOL/fs-clone; do - if datasetexists $ds; then - log_must zfs destroy -rRf $ds - fi + datasetexists $ds && destroy_dataset $ds -rRf done } diff --git a/tests/zfs-tests/tests/functional/projectquota/projectspace_001_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectspace_001_pos.ksh index a84ff9f89a..b7707ea522 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectspace_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectspace_001_pos.ksh @@ -45,9 +45,7 @@ function cleanup { - if datasetexists $snap_fs; then - log_must zfs destroy $snap_fs - fi + datasetexists $snap_fs && destroy_dataset $snap_fs log_must cleanup_projectquota } diff --git a/tests/zfs-tests/tests/functional/projectquota/projectspace_002_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectspace_002_pos.ksh index 216855e94d..10edae771e 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectspace_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectspace_002_pos.ksh @@ -44,9 +44,7 @@ function cleanup { - if datasetexists $snapfs; then - log_must zfs destroy $snapfs - fi + datasetexists $snapfs && destroy_dataset $snapfs log_must cleanup_projectquota } diff --git a/tests/zfs-tests/tests/functional/projectquota/projectspace_003_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectspace_003_pos.ksh index 629b3b3e57..8db5d0d899 100755 --- 
a/tests/zfs-tests/tests/functional/projectquota/projectspace_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectspace_003_pos.ksh @@ -45,9 +45,7 @@ function cleanup { - if datasetexists $snapfs; then - log_must zfs destroy $snapfs - fi + datasetexists $snapfs && destroy_dataset $snapfs log_must cleanup_projectquota } diff --git a/tests/zfs-tests/tests/functional/projectquota/projectspace_004_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projectspace_004_pos.ksh index 494d7f3b7a..fc4a93f044 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projectspace_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projectspace_004_pos.ksh @@ -38,15 +38,13 @@ # # STRATEGY: # 1. set project [obj]quota on the directory -# 2. set project ID and inherit flag on the directoty +# 2. set project ID and inherit flag on the directory # 3. run 'df [-i]' on the directory and check the result # function cleanup { - if datasetexists $snap_fs; then - log_must zfs destroy $snap_fs - fi + datasetexists $snap_fs && destroy_dataset $snap_fs log_must cleanup_projectquota } diff --git a/tests/zfs-tests/tests/functional/projectquota/projecttree_002_pos.ksh b/tests/zfs-tests/tests/functional/projectquota/projecttree_002_pos.ksh index 4008811a19..d610192427 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projecttree_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projecttree_002_pos.ksh @@ -39,7 +39,7 @@ # # STRATEGY: # 1. Create a tree with 4 level directories. -# 2. Set project ID on both directroy and regular file via +# 2. Set project ID on both directory and regular file via # "zfs project -p". # 3. Check the project ID via "zfs project". # 4. 
Set project inherit flag on kinds of level directories (and its diff --git a/tests/zfs-tests/tests/functional/projectquota/projecttree_003_neg.ksh b/tests/zfs-tests/tests/functional/projectquota/projecttree_003_neg.ksh index 33382fdbe9..cbc45857f7 100755 --- a/tests/zfs-tests/tests/functional/projectquota/projecttree_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/projectquota/projecttree_003_neg.ksh @@ -43,8 +43,8 @@ # 2. "-C" only supports "-r" and "-k". # 3. "-s" only supports "-r" and "-p". # 4. "-c", "-C" and "-s" can NOT be specified together. -# 5. "-d" can overwirte former "-r". -# 6. "-r" can overwirte former "-d". +# 5. "-d" can overwrite former "-r". +# 6. "-r" can overwrite former "-d". # 7. "-0" must be together with "-c". # 8. "-d" must be on directory. # 9. "-r" must be on directory. diff --git a/tests/zfs-tests/tests/functional/pyzfs/Makefile.am b/tests/zfs-tests/tests/functional/pyzfs/Makefile.am index 0a27adecca..26c5ac595a 100644 --- a/tests/zfs-tests/tests/functional/pyzfs/Makefile.am +++ b/tests/zfs-tests/tests/functional/pyzfs/Makefile.am @@ -1,18 +1,7 @@ +include $(top_srcdir)/config/Substfiles.am + pkgpyzfsdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/pyzfs pkgpyzfs_SCRIPTS = \ pyzfs_unittest.ksh -EXTRA_DIST = \ - pyzfs_unittest.ksh.in - -# -# The pyzfs module is built either for Python 2 or Python 3. In order -# to properly test it the unit tests must be updated to the matching vesion. 
-# -$(pkgpyzfs_SCRIPTS):%:%.in - -$(SED) -e 's,@PYTHON\@,$(PYTHON),g' \ - $< >'$@' - -chmod 775 $@ - -distclean-local:: - -$(RM) $(pkgpyzfs_SCRIPTS) +SUBSTFILES += $(pkgpyzfs_SCRIPTS) diff --git a/tests/zfs-tests/tests/functional/quota/quota.kshlib b/tests/zfs-tests/tests/functional/quota/quota.kshlib index 082a77c033..0ffe6394b5 100644 --- a/tests/zfs-tests/tests/functional/quota/quota.kshlib +++ b/tests/zfs-tests/tests/functional/quota/quota.kshlib @@ -33,6 +33,8 @@ # BLOCK_SIZE, QUOTA_VALUE and TOLERANCE set in quota.cfg if is_linux; then readonly EDQUOT=122 +elif is_freebsd; then + readonly EDQUOT=69 else readonly EDQUOT=49 fi diff --git a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh index 9364a9f60f..d6783e9a43 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh @@ -62,7 +62,8 @@ function cleanup # pool, otherwise next test will fail trying to set a # quota which is less than the space used. 
# - sleep 5 + wait_freeing $TESTPOOL + sync_pool $TESTPOOL } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh index c54968bbcc..2f34072dd1 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh @@ -61,6 +61,9 @@ function cleanup [[ -e $TESTDIR/$TESTFILE2 ]] && \ log_must rm $TESTDIR/$TESTFILE2 + + wait_freeing $TESTPOOL + sync_pool $TESTPOOL } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh index bec2243f9f..6ab25cf2d4 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh @@ -61,11 +61,12 @@ function cleanup log_must rm $TESTDIR1/$TESTFILE1 # - # Need to allow time for space to be released back to - # pool, otherwise next test will fail trying to set a - # quota which is less than the space used. - # - sleep 5 + # Need to allow time for space to be released back to + # pool, otherwise next test will fail trying to set a + # quota which is less than the space used. 
+ # + wait_freeing $TESTPOOL + sync_pool $TESTPOOL } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh index cc62d8fa96..3733544389 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh @@ -62,6 +62,9 @@ function cleanup [[ -e $TESTDIR1/$TESTFILE2 ]] && \ log_must rm $TESTDIR1/$TESTFILE2 + + wait_freeing $TESTPOOL + sync_pool $TESTPOOL } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh index ed28cc1f0c..e87139a58f 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh @@ -48,8 +48,7 @@ verify_runnable "both" function cleanup { - datasetexists $fs_child && \ - log_must zfs destroy $fs_child + datasetexists $fs_child && destroy_dataset $fs_child log_must zfs set quota=$quota_val $fs } diff --git a/tests/zfs-tests/tests/functional/raidz/Makefile.am b/tests/zfs-tests/tests/functional/raidz/Makefile.am index 694de18a6c..d93eb73cf8 100644 --- a/tests/zfs-tests/tests/functional/raidz/Makefile.am +++ b/tests/zfs-tests/tests/functional/raidz/Makefile.am @@ -3,4 +3,6 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ raidz_001_neg.ksh \ - raidz_002_pos.ksh + raidz_002_pos.ksh \ + raidz_003_pos.ksh \ + raidz_004_pos.ksh diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh index 4c105b9411..0f88a1a514 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh @@ -35,4 +35,4 @@ log_mustnot raidz_test -T -log_pass "raidz_test detects errors as espected." +log_pass "raidz_test detects errors as expected." 
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh new file mode 100755 index 0000000000..bf22632c7e --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with -S and -e to test all supported raidz +# implementations with expanded map and default reflow offset. +# These options test several raidz block geometries and several zio +# parameters that affect raidz block layout. Data reconstruction performs +# all combinations of failed disks. Wall time is set to 60 seconds, but actual +# runtime might be longer. +# + +log_must raidz_test -S -e -t 60 + +log_pass "raidz_test parameter sweep test with expanded map succeeded."
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh new file mode 100755 index 0000000000..6cd2bf7c9f --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with -S and -e to test all supported raidz +# implementations with expanded map and zero reflow offset. +# These options test several raidz block geometries and several zio +# parameters that affect raidz block layout. Data reconstruction performs +# all combinations of failed disks. Wall time is set to 60 seconds, but actual +# runtime might be longer. +# + +log_must raidz_test -S -e -r 0 -t 60 + +log_pass "raidz_test parameter sweep test with expanded map succeeded."
diff --git a/tests/zfs-tests/tests/functional/redacted_send/Makefile.am b/tests/zfs-tests/tests/functional/redacted_send/Makefile.am new file mode 100644 index 0000000000..61d0ea2135 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/Makefile.am @@ -0,0 +1,26 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/redacted_send +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + redacted_compressed.ksh \ + redacted_contents.ksh \ + redacted_deleted.ksh \ + redacted_disabled_feature.ksh \ + redacted_embedded.ksh \ + redacted_holes.ksh \ + redacted_incrementals.ksh \ + redacted_largeblocks.ksh \ + redacted_many_clones.ksh \ + redacted_mixed_recsize.ksh \ + redacted_mounts.ksh \ + redacted_negative.ksh \ + redacted_origin.ksh \ + redacted_panic.ksh \ + redacted_props.ksh \ + redacted_resume.ksh \ + redacted_size.ksh \ + redacted_volume.ksh + +dist_pkgdata_DATA = \ + redacted.cfg \ + redacted.kshlib diff --git a/tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh b/tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh new file mode 100755 index 0000000000..1a7c142b85 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +destroy_pool $POOL +destroy_pool $POOL2 +log_must set_tunable32 ALLOW_REDACTED_DATASET_MOUNT 0 + +log_pass diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted.cfg b/tests/zfs-tests/tests/functional/redacted_send/redacted.cfg new file mode 100644 index 0000000000..f964b37bad --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted.cfg @@ -0,0 +1,86 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +export DISK1=$(echo $DISKS | awk '{print $1}') +export DISK2=$(echo $DISKS | awk '{print $2}') + +export POOL=$TESTPOOL +export POOL2=$TESTPOOL2 +export FS=$TESTFS +export FS2=$TESTFS2 + +# +# These are the byte ranges that differ between files and their redacted +# counterparts. See compare_files() for more detail. 
+# +typeset RANGE0="0,2097152" +typeset RANGE1="0,131072" +typeset RANGE2="1048576,2097152" +typeset RANGE3="0,131072 +1966080,131072 +3932160,131072" +typeset RANGE4="0,131072 +262144,131072 +524288,131072 +786432,131072" +typeset RANGE5="0,1048576 +7340032,1048576" +typeset RANGE6="393216,131072 +655360,131072 +917504,131072 +1179648,131072 +1441792,393216 +1966080,393216 +2621440,262144 +3145728,262144 +3670016,262144 +4194304,262144 +4718592,262144 +5242880,262144" +typeset RANGE7="1048576,6291456" +typeset RANGE8="4063232,131072" +typeset RANGE9="0,131072 +262144,131072 +524288,131072 +786432,131072 +1048576,131072 +1310720,131072 +1572864,131072 +1835008,131072 +2097152,131072 +2359296,131072 +2621440,131072 +2883584,131072 +3145728,131072 +3407872,131072 +3670016,131072 +3932160,131072" +typeset RANGE10="0,393216" +typeset RANGE11="0,1048576" +typeset RANGE12="0,2097152" +typeset RANGE13="0,16384" +typeset RANGE14="" +typeset RANGE15="0,4194304" +typeset RANGE16="0,6291456" \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib b/tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib new file mode 100644 index 0000000000..30101939db --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib @@ -0,0 +1,266 @@ +#!/bin/ksh + +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016, 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/tests/functional/redacted_send/redacted.cfg + +function setup_dataset +{ + typeset ds_name=$1 + typeset opts=$2 + typeset file_create_func=$3 + typeset sendfs="$POOL/$ds_name" + [[ -n $file_create_func ]] || file_create_func=setup_common + + log_must zfs create $opts $sendfs + + $file_create_func $sendfs + + log_must zfs snapshot $sendfs@snap + log_must zfs clone $opts $sendfs@snap $POOL/${ds_name}_clone + log_must zfs snapshot $POOL/${ds_name}_clone@snap +} + +function setup_common +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + typeset bs=$(get_prop recsize $sendfs) + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=$bs count=16 + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=$bs count=32 +} + +function setup_embedded +{ + typeset sendfs=$1 + + typeset recsize + typeset mntpnt=$(get_prop mountpoint $sendfs) + for recsize in 512 1024 2048 4096 8192 16384; do + if is_illumos; then + log_must mkholes -d $((recsize - 8)):8 $mntpnt/$recsize + else + log_must dd if=/dev/urandom of=$mntpnt/$recsize bs=8 \ + count=1 seek=$(((recsize / 8) - 1)) + fi + done +} + +function setup_holes +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + typeset M=$((1024 * 1024)) + + if is_illumos; then + log_must mkholes -d 0:$((8 * M)) $mntpnt/f1 + log_must mkholes -d 0:$M -d $((7 * M)):$M $mntpnt/f2 + log_must mkholes -d $M:$((6 * M)) -h $((7 * M)):$M $mntpnt/f3 + log_must mkholes -h 0:$((8 * M)) $mntpnt/f4 + else + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=8M count=1 + + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=1M count=1 + log_must dd 
if=/dev/urandom of=$mntpnt/f2 bs=1M count=1 seek=7 \ + conv=notrunc + + log_must dd if=/dev/urandom of=$mntpnt/f3 bs=1M count=6 seek=1 + log_must truncate -s $((8 * M)) $mntpnt/f3 + + log_must truncate -s $((8 * M)) $mntpnt/f4 + fi + + log_must zfs create $sendfs/manyrm + for i in {1..256}; do + log_must stride_dd -i /dev/urandom -o $mntpnt/manyrm/f$i -b 512 \ + -c $(random_int_between 1 100) -s $(random_int_between 1 4) + done + + log_must zfs snapshot $sendfs/manyrm@snap + log_must zfs clone $sendfs/manyrm@snap $sendfs/manyrm_clone + log_must zfs snapshot $sendfs/manyrm_clone@snap +} + +function setup_incrementals +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + typeset bs=$(get_prop recsize $sendfs) + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=$bs count=16 + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=$bs count=32 + log_must mkdir $mntpnt/d1 + log_must eval "cat $mntpnt/f1 $mntpnt/f2 >$mntpnt/d1/f1" + log_must zfs snapshot $sendfs@snap0 + + log_must zfs clone $sendfs@snap0 $POOL/hole + mntpnt=$(get_prop mountpoint $POOL/hole) + log_must dd if=/dev/zero of=$mntpnt/f2 bs=$bs count=16 conv=notrunc + log_must zfs snapshot $POOL/hole@snap + + log_must zfs clone $sendfs@snap0 $POOL/stride3 + mntpnt=$(get_prop mountpoint $POOL/stride3) + log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $bs -c 11 -s 3 + log_must zfs snapshot $POOL/stride3@snap + + log_must zfs clone $sendfs@snap0 $POOL/stride5 + mntpnt=$(get_prop mountpoint $POOL/stride5) + log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $bs -c 7 -s 5 + log_must zfs snapshot $POOL/stride5@snap + + log_must zfs clone $sendfs@snap0 $POOL/int + log_must zfs snapshot $POOL/int@snap + + log_must zfs clone $POOL/int@snap $POOL/rm + mntpnt=$(get_prop mountpoint $POOL/rm) + log_must rm -rf $mntpnt/[df][12] + log_must zfs snapshot $POOL/rm@snap + + log_must zfs clone $POOL/int@snap $POOL/write + mntpnt=$(get_prop mountpoint $POOL/write) + log_must dd if=/dev/urandom of=$mntpnt/f1 bs=512 
count=16 conv=notrunc + log_must dd if=/dev/urandom of=$mntpnt/d1/f1 bs=512 count=16 seek=16 \ + conv=notrunc + log_must zfs snapshot $POOL/write@snap +} + +function setup_mounts +{ + typeset sendfs=$1 + + typeset mntpnt=$(get_prop mountpoint $sendfs) + log_must touch $mntpnt/empty + log_must dd if=/dev/urandom of=$mntpnt/contents1 bs=512 count=2 + log_must dd if=/dev/urandom of=$mntpnt/contents2 bs=512 count=2 + log_must mkdir $mntpnt/dir1 + log_must touch $mntpnt/dir1/empty + log_must dd if=/dev/urandom of=$mntpnt/dir1/contents1 bs=512 count=2 + log_must dd if=/dev/urandom of=$mntpnt/dir1/contents2 bs=512 count=2 + log_must mkdir $mntpnt/dir1/dir2 + log_must touch $mntpnt/dir1/dir2/empty + log_must dd if=/dev/urandom of=$mntpnt/dir1/dir2/file bs=512 count=2 + + log_must zfs create -s -V 16p $sendfs/vol + log_must zfs snapshot $sendfs/vol@snap + log_must zfs clone $sendfs/vol@snap $sendfs/vol_clone + log_must zfs snapshot $sendfs/vol_clone@snap +} + +function mount_redacted +{ + typeset flag='' + while getopts "f" opt; do + case $opt in + f) + flag='-f' + ;; + esac + done + shift $(($OPTIND - 1)) + + typeset ds=$1 + log_must set_tunable32 ALLOW_REDACTED_DATASET_MOUNT 1 + zfs mount $flag -oro $ds || return 1 + log_must set_tunable32 ALLOW_REDACTED_DATASET_MOUNT 0 + return 0 +} + +function unmount_redacted +{ + typeset ds=$1 + + zfs unmount $ds +} + +# +# This function calls a utility that prints out the ranges where a file +# and its redacted counterpart differ, each range on a new line like this: +# +# 0,131072 +# 1966080,131072 +# 3932160,131072 +# +# The output is then checked against a variable containing the expected +# output to verify the redacted ranges are the ones expected. 
+# +function compare_files +{ + typeset sendfs=$1 + typeset recvfs=$2 + typeset file=$3 + typeset expected="$4" + typeset tmpfile="$tmpdir/get_file.out" + + log_must mount_redacted -f $recvfs + + typeset file1="$(get_prop mountpoint $sendfs)/$file" + typeset file2="$(get_prop mountpoint $recvfs)/$file" + log_note "Comparing $file1 and $file2" + [[ -f $file1 ]] || log_fail "File $file1 does not exist." + [[ -f $file2 ]] || log_fail "File $file2 does not exist." + + log_must eval "get_diff $file1 $file2 >$tmpfile" + typeset range="$(cat $tmpfile)" + log_must unmount_redacted $recvfs + [[ "$expected" = "$range" ]] || log_fail "Unexpected range: $range" +} + +function redacted_cleanup +{ + typeset ds_list=$@ + typeset ds + + for ds in $ds_list; do + zfs destroy -R $ds + done + + set_tunable32 ALLOW_REDACTED_DATASET_MOUNT 0 + rm -f $(get_prop mountpoint $POOL)/tmp/* +} + +# Retrieve the redaction list of a bookmark or snapshot, using +# the property or zdb output, as requested. +function get_guid_list +{ + typeset filename=$1 + typeset dataset=$2 + typeset use_zdb=${3:-false} + + if $use_zdb; then + guid_list=$(zdb -vvvv $dataset | sed -e 's/,//g' \ + -ne 's/^.*Snapshots: \[\(.*\)\]/\1/p') + else + guid_list=$(get_prop redact_snaps $dataset) + fi + + for guid in $(echo $guid_list | tr ',' ' '); do + echo $guid + done | sort >$filename +} diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh new file mode 100755 index 0000000000..0a8bf3903c --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that compressed send streams are redacted correctly. +# +# Strategy: +# 1. Receive a redacted compressed send stream, verifying compression and +# redaction. +# 2. Receive an incremental on the full receive, verifying compression and +# redaction. +# + +typeset ds_name="compressed" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name "-o compress=lz4" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset clone_mnt="$(get_prop mountpoint $clone)" + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must stride_dd -i /dev/urandom -o $clone_mnt/f1 -b $((128 * 1024)) -c 4 -s 2 +log_must zfs snapshot $clone@snap1 +log_must rm $clone_mnt/f2 +log_must zfs snapshot $clone@snap2 + +log_must zfs redact $sendfs@snap book1 $clone@snap1 $clone@snap2 +log_must eval "zfs send -c --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must stream_has_features $stream compressed lz4 redacted +compare_files $sendfs $recvfs "f1" "$RANGE4" +verify_stream_size $stream $sendfs +log_must mount_redacted -f $recvfs +verify_stream_size $stream $recvfs +log_must unmount_redacted $recvfs + +log_must eval "zfs send -c -i $sendfs@snap $clone@snap1 >$stream" +log_must eval "zfs recv $POOL2/inc1 <$stream" +log_must stream_has_features $stream compressed lz4 +typeset mntpnt=$(get_prop mountpoint $POOL2) +log_must diff $clone_mnt/f1 $mntpnt/inc1/f1 +log_must diff $send_mnt/f2 $mntpnt/inc1/f2 + +log_must eval "zfs send -c -i $sendfs@snap $clone@snap2 >$stream" +log_must eval "zfs recv $POOL2/inc2 <$stream" +log_must stream_has_features 
$stream compressed lz4 +log_must diff $clone_mnt/f1 $mntpnt/inc1/f1 +[[ -f $mntpnt/inc2/f2 ]] && log_fail "File f2 should not exist." + +log_pass "Compressed send streams are redacted correctly." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh new file mode 100755 index 0000000000..fb12862c95 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh @@ -0,0 +1,162 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redaction works as expected for various scenarios. +# +# Strategy: +# 1. An unmodified file does not get redacted at all. +# 2. Empty redaction list redacts everything. +# 3. A file removed in the clone redacts the whole file. +# 4. A file moved in the clone does not redact the file. +# 5. A copied, then removed file in the clone redacts the whole file. +# 6. Overwriting a file with identical contents redacts the file. +# 7. A partially modified block redacts the entire block. +# 8. Only overlapping areas of modified ranges are redacted. +# 9. Send from the root dataset of a pool works correctly.
+# + +typeset ds_name="contents" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +# An unmodified file does not get redacted at all. +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must diff $send_mnt/f2 $recv_mnt/f2 +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Removing a file in the clone redacts the entire file. +log_must rm "$clone_mnt/f1" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book3 $clone@snap1 +log_must eval "zfs send --redact book3 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE0" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Moving a file in the clone does not redact the file. +log_must mv "$clone_mnt/f1" "$clone_mnt/f1.moved" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book4 $clone@snap1 +log_must eval "zfs send --redact book4 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +[[ -f $recv_mnt/f1.moved ]] && log_fail "Found moved file in redacted receive." +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Copying, then removing a file in the clone does redact the file. 
+log_must cp "$clone_mnt/f1" "$clone_mnt/f1.copied" +log_must rm "$clone_mnt/f1" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book5 $clone@snap1 +log_must eval "zfs send --redact book5 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE0" +log_must mount_redacted -f $recvfs +[[ -f $recv_mnt/f1.copied ]] && log_fail "Found moved file in redacted receive." +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Overwriting the contents of a block with identical contents redacts the file. +log_must cp "$clone_mnt/f1" "$clone_mnt/f1.copied" +log_must cp "$clone_mnt/f1.copied" "$clone_mnt/f1" +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book6 $clone@snap1 +log_must eval "zfs send --redact book6 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE0" +log_must mount_redacted -f $recvfs +[[ -f $recv_mnt/f1.copied ]] && log_fail "Found moved file in redacted receive." +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Modifying some of a block redacts the whole block. +log_must dd if=/dev/urandom of=$clone_mnt/f1 conv=notrunc seek=2 count=1 bs=32k +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book7 $clone@snap1 +log_must eval "zfs send --redact book7 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE1" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Only overlapping areas of modified ranges are redacted. 
+log_must dd if=/dev/urandom of=$clone_mnt/f2 bs=1024k count=3 conv=notrunc +log_must zfs snapshot $clone@snap1 +log_must zfs clone $sendfs@snap $clone/new +typeset mntpnt="$(get_prop mountpoint $clone/new)" +log_must dd if=/dev/urandom of=$mntpnt/f2 bs=1024k seek=1 count=3 \ + conv=notrunc +log_must zfs snapshot $clone/new@snap +log_must zfs redact $sendfs@snap book8 $clone@snap1 $clone/new@snap +log_must eval "zfs send --redact book8 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE2" +log_must zfs destroy -R $clone/new +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# FizzBuzz version +log_must zfs clone $sendfs@snap $POOL/stride3 +mntpnt="$(get_prop mountpoint $POOL/stride3)" +log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $((128 * 1024)) -c 11 -s 3 +log_must zfs snapshot $POOL/stride3@snap +log_must zfs clone $sendfs@snap $POOL/stride5 +mntpnt="$(get_prop mountpoint $POOL/stride5)" +log_must stride_dd -i /dev/urandom -o $mntpnt/f2 -b $((128 * 1024)) -c 7 -s 5 +log_must zfs snapshot $POOL/stride5@snap +log_must zfs redact $sendfs@snap book8a $POOL/stride3@snap $POOL/stride5@snap +log_must eval "zfs send --redact book8a $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE3" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Sends from the root dataset of a pool work correctly.
+log_must dd if=/dev/urandom of=/$POOL/f1 bs=128k count=4 +log_must zfs snapshot $POOL@snap +log_must zfs clone $POOL@snap $POOL/clone +log_must dd if=/dev/urandom of=/$POOL/clone/f1 bs=128k count=1 conv=notrunc +log_must zfs snapshot $POOL/clone@snap +log_must zfs redact $POOL@snap book9 $POOL/clone@snap +log_must eval "zfs send --redact book9 $POOL@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $POOL $recvfs "f1" "$RANGE1" +log_must zfs destroy -R $POOL@snap + +log_pass "Redaction works as expected for various scenarios." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh new file mode 100755 index 0000000000..3e2aeb7335 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redaction works as expected with respect to deleted files +# +# Strategy: +# 1. A file on the delete queue counts as deleted when using it to calculate +# redaction. +# 2. A file that is removed in the tosnap of an incremental, where the fromsnap +# is a redaction bookmark that contains references to that file, does not +# result in records for that file. 
+# + +typeset ds_name="deleted" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset clone2="$POOL/${ds_name}_clone2" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +# +# A file on the delete queue counts as deleted when using it to calculate +# redaction. +# + +# +# Open file descriptor 5 for appending to $clone_mnt/f1 so that it will go on +# the delete queue when we rm it. +# +exec 5>>$clone_mnt/f1 +log_must dd if=/dev/urandom of=$clone_mnt/f1 bs=512 count=1 conv=notrunc +log_must rm $clone_mnt/f1 +log_must zfs snapshot $clone@snap1 +# Close file descriptor 5 +exec 5>&- +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +# +# We have temporarily disabled redaction blkptrs, so this will not +# fail as was originally intended. We should uncomment this line +# when we re-enable redaction blkptrs. +# +#log_mustnot dd if=$recv_mnt/f1 of=/dev/null bs=512 count=1 +log_must diff $send_mnt/f2 $recv_mnt/f2 +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# +# A file that is removed in the tosnap of an incremental, where the fromsnap +# is a redaction bookmark that contains references to that file, does not +# result in records for that file. 
+# +log_must zfs clone $sendfs@snap $clone2 +typeset clone2_mnt="$(get_prop mountpoint $clone2)" +log_must rm -rf $clone2_mnt/* +log_must zfs snapshot $clone2@snap +log_must zfs redact $sendfs@snap book2 $clone2@snap +log_must zfs destroy -R $clone2 +log_must eval "zfs send --redact book2 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must rm $send_mnt/f1 +log_must zfs snapshot $sendfs@snap2 +log_must zfs clone $sendfs@snap2 $clone2 +typeset clone2_mnt="$(get_prop mountpoint $clone2)" +log_must rm $clone2_mnt/* +log_must zfs snapshot $clone2@snap +log_must zfs redact $sendfs@snap2 book3 $clone2@snap +log_must zfs destroy -R $clone2 +log_must eval "zfs send -i $sendfs#book2 --redact book3 $sendfs@snap2 >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +log_must diff <(ls $send_mnt) <(ls $recv_mnt) +log_must zfs destroy -R $recvfs +log_must zfs rollback -R $sendfs@snap + +log_pass "Verify Redaction works as expected with respect to deleted files." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh new file mode 100755 index 0000000000..3cf73f0016 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify the functionality of the redaction_bookmarks and redacted_datasets +# features. +# +# Strategy: +# 1. Create a pool with all features disabled. +# 2. Verify redacted send fails. +# 3. Enable redaction_bookmarks and verify redacted sends work. +# 4. Verify receipt of a redacted stream fails. +# 5. Enable redacted_datasets and verify zfs receive works. +# + +typeset ds_name="disabled" +typeset sendfs="$POOL/$ds_name" +typeset sendfs1="$POOL2/${ds_name}1" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset clone1="$POOL2/${ds_name}_clone1" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' + +function cleanup +{ + destroy_pool $POOL2 + create_pool $POOL2 $DISK2 + log_must zfs snapshot $POOL2@init + redacted_cleanup $sendfs $recvfs +} + +log_onexit cleanup + +destroy_pool $POOL2 +log_must zpool create -d $POOL2 $DISK2 + +log_must zfs create $sendfs1 +log_must zfs snapshot $sendfs1@snap +log_must zfs clone $sendfs1@snap $clone1 +log_must zfs snapshot $clone1@snap + +log_mustnot zfs redact $sendfs1@snap book1 $clone1@snap +log_must zpool set feature@redaction_bookmarks=enabled $POOL2 +log_must zfs redact $sendfs1@snap book1 $clone1@snap + +log_must zfs redact $sendfs@snap book1 $clone@snap +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_mustnot eval "zfs recv $recvfs <$stream" +log_must zpool set feature@redacted_datasets=enabled $POOL2 +log_must eval "zfs recv $recvfs <$stream" + +log_pass "The redacted send/recv features work correctly."
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh new file mode 100755 index 0000000000..1c5b503a9b --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify embedded blocks and redacted send work correctly together. +# +# Strategy: +# 1. Create recsize sized files with embedded blocks from size 512b to 16k. +# 2. Receive a redacted send stream with nothing redacted. +# 3. Verify the received files match the source, contain embedded blocks, and +# that the stream has the redacted and embedded data features. +# 4. Receive a redacted send stream with files 512, 2048 and 8192 redacted. +# 5. Verify that the redacted files no longer match, but the others still +# contain embedded blocks and the stream has the redacted and embedded +# data features. 
+# + +typeset ds_name="embedded" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '-o compress=lz4' setup_embedded +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" +typeset recsize send_obj recv_obj + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must zfs redact $sendfs@snap book1 $clone@snap +log_must eval "zfs send -e --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must stream_has_features $stream redacted embed_data + +log_must mount_redacted -f $recvfs +for recsize in 512 1024 2048 4096 8192 16384; do + send_obj=$(get_objnum $send_mnt/$recsize) + recv_obj=$(get_objnum $recv_mnt/$recsize) + + log_must diff $send_mnt/$recsize $recv_mnt/$recsize + log_must eval "zdb -ddddd $sendfs $send_obj >$tmpdir/send.zdb" + log_must eval "zdb -ddddd $recvfs $recv_obj >$tmpdir/recv.zdb" + + grep -q "EMBEDDED" $tmpdir/send.zdb || \ + log_fail "Obj $send_obj not embedded in $sendfs" + grep -q "EMBEDDED" $tmpdir/recv.zdb || \ + log_fail "Obj $recv_obj not embedded in $recvfs" + + cat $stream | zstream dump -v | log_must grep -q \ + "WRITE_EMBEDDED object = $send_obj offset = 0" +done + +log_must zfs destroy -R $recvfs +for recsize in 512 2048 8192; do + log_must dd if=/dev/urandom of=$clone_mnt/$recsize bs=$recsize count=1 +done +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book2 $clone@snap1 +log_must eval "zfs send -e --redact book2 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must stream_has_features $stream redacted embed_data + +log_must mount_redacted -f $recvfs +for recsize in 512 2048 8192; do + log_mustnot diff $send_mnt/$recsize $recv_mnt/$recsize +done +for recsize in 1024 4096 16384; do + send_obj=$(get_objnum 
$send_mnt/$recsize) + recv_obj=$(get_objnum $recv_mnt/$recsize) + + log_must diff $send_mnt/$recsize $recv_mnt/$recsize + log_must eval "zdb -ddddd $sendfs $send_obj >$tmpdir/send.zdb" + log_must eval "zdb -ddddd $recvfs $recv_obj >$tmpdir/recv.zdb" + + grep -q "EMBEDDED" $tmpdir/send.zdb || \ + log_fail "Obj $send_obj not embedded in $sendfs" + grep -q "EMBEDDED" $tmpdir/recv.zdb || \ + log_fail "Obj $recv_obj not embedded in $recvfs" + + cat $stream | zstream dump -v | log_must grep -q \ + "WRITE_EMBEDDED object = $send_obj offset = 0" +done + +log_pass "Embedded blocks and redacted send work correctly together." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh new file mode 100755 index 0000000000..d111aa0ef6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh @@ -0,0 +1,120 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redacted send streams reliably handle holes. +# +# Strategy: +# 1. Holes written at the beginning and end of a non-sparse file in the +# redacted list are correctly redacted. +# 2. Holes written throughout a non-sparse file in the redacted list are +# correctly redacted. +# 3. Data written into a hole in a sparse file in the redacted list are +# correctly redacted. +# 4. Holes in metadata blocks. 
+# + +typeset ds_name="holes" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' setup_holes +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" +typeset M=$((1024 * 1024)) + +log_onexit redacted_cleanup $sendfs $recvfs + +# Write holes at the start and end of a non-sparse file. +if is_illumos; then + log_must mkholes -h 0:$M -h $((7 * M)):$M $clone_mnt/f1 +else + log_must dd if=/dev/zero of=$clone_mnt/f1 bs=1M count=1 conv=notrunc + log_must dd if=/dev/zero of=$clone_mnt/f1 bs=1M count=1 conv=notrunc seek=7 +fi +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE5" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Write two overlapping sets of holes into the same non-sparse file. +log_must stride_dd -i /dev/zero -o $clone_mnt/f1 -b $((128 * 1024)) -c 8 -s 2 -k 3 +log_must stride_dd -i /dev/zero -o $clone_mnt/f1 -b $((256 * 1024)) -c 8 -s 2 -k 6 +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book2 $clone@snap1 +log_must eval "zfs send --redact book2 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE6" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Write data into the middle of a hole. 
+if is_illumos; then + log_must mkholes -d $((3 * M)):$((2 * M)) $clone_mnt/f2 +else + log_must dd if=/dev/urandom of=$clone_mnt/f2 bs=1M count=2 seek=3 \ + conv=notrunc +fi +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book3 $clone@snap1 +log_must eval "zfs send --redact book3 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE14" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Remove a file with holes. +log_must rm $clone_mnt/f3 +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendfs@snap book4 $clone@snap1 +log_must eval "zfs send --redact book4 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f3" "$RANGE7" +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +# Create a hole in a L0 metadata block by removing files. +log_must rm $send_mnt/manyrm_clone/f{32..96} +log_must zfs snapshot $sendfs/manyrm_clone@snap1 + +log_must zfs redact $sendfs/manyrm@snap book6 $sendfs/manyrm_clone@snap1 +log_must eval "zfs send --redact book6 $sendfs/manyrm@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_must mount_redacted -f $recvfs +for i in {1..31} {97..256}; do + diff $send_mnt/manyrm/f$i $recv_mnt/f$i || log_fail \ + "File f$i did not match in the send and recv datasets." +done +for i in {32..96}; do + file_size=$(stat_size $send_mnt/manyrm/f$i) + redacted_size=$(stat_size $recv_mnt/f$i) + [[ $file_size -eq $redacted_size ]] || log_fail \ + "File f$i has size $file_size and redacted size $redacted_size" +done +log_must zfs rollback -R $clone@snap +log_must zfs destroy -R $recvfs + +log_pass "Redacted send streams reliably handle holes." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh new file mode 100755 index 0000000000..1d2ed3a687 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that incrementals (redacted and normal) work with redacted datasets. +# +# Strategy: +# 1. Test normal incrementals from the original snap to a subset of the +# redaction list. +# 2. Test receipt of intermediate clones, and their children. +# 3. Test receipt with origin snap specified by '-o origin='. +# 4. Test incrementals from redaction bookmarks. +# + +typeset ds_name="incrementals" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' setup_incrementals +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs $POOL2/rfs + +# Setup a redacted send using a redaction list at varying depth. 
+log_must zfs redact $sendfs@snap0 book1 $POOL/rm@snap $POOL/stride3@snap \ + $POOL/stride5@snap +log_must eval "zfs send --redact book1 $sendfs@snap0 >$stream" +log_must eval "zfs receive $POOL2/rfs <$stream" + +# Verify receipt of normal incrementals to redaction list members. +log_must eval "zfs send -i $sendfs@snap0 $POOL/stride3@snap >$stream" +log_must eval "zfs recv $POOL2/rstride3 <$stream" +log_must diff -r /$POOL/stride3 /$POOL2/rstride3 +log_must eval "zfs send -i $sendfs@snap0 $POOL/stride5@snap >$stream" +log_must eval "zfs recv $POOL2/rstride5 <$stream" +log_must diff -r /$POOL/stride5 /$POOL2/rstride5 + +# But not a normal child that we weren't redacted with respect to. +log_must eval "zfs send -i $sendfs@snap0 $POOL/hole@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rhole@snap <$stream" + +# Verify we can receive an intermediate clone redacted with respect to a +# subset of the original redaction list. +log_must zfs redact $POOL/int@snap book2 $POOL/rm@snap +log_must eval "zfs send -i $sendfs@snap0 --redact book2 $POOL/int@snap >$stream" +log_must eval "zfs recv $POOL2/rint <$stream" +compare_files $POOL/int $POOL2/rint "f1" "$RANGE0" +compare_files $POOL/int $POOL2/rint "f2" "$RANGE15" +compare_files $POOL/int $POOL2/rint "d1/f1" "$RANGE16" +log_must mount_redacted -f $POOL2/rint + +# Verify we can receive grandchildren on the child. +log_must eval "zfs send -i $POOL/int@snap $POOL/rm@snap >$stream" +log_must eval "zfs receive $POOL2/rrm <$stream" +log_must diff -r /$POOL/rm /$POOL2/rrm + +# But not a grandchild that the received child wasn't redacted with respect to. +log_must eval "zfs send -i $POOL/int@snap $POOL/write@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rwrite<$stream" + +# Verify we cannot receive an intermediate clone that isn't redacted with +# respect to a subset of the original redaction list. 
+log_must zfs redact $POOL/int@snap book4 $POOL/rm@snap $POOL/write@snap +log_must eval "zfs send -i $sendfs@snap0 --redact book4 $POOL/int@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rint <$stream" +log_must zfs redact $POOL/int@snap book5 $POOL/write@snap +log_must eval "zfs send -i $sendfs@snap0 --redact book5 $POOL/int@snap >$stream" +log_mustnot eval "zfs recv $POOL2/rint <$stream" +log_mustnot zfs redact $POOL/int@snap book6 $POOL/hole@snap + +# Verify we can receive a full clone of the grandchild on the child. +log_must eval "zfs send $POOL/write@snap >$stream" +log_must eval "zfs recv -o origin=$POOL2/rint@snap $POOL2/rwrite <$stream" +log_must diff -r /$POOL/write /$POOL2/rwrite + +# Along with other origins. +log_must eval "zfs recv -o origin=$POOL2/rfs@snap0 $POOL2/rwrite1 <$stream" +log_must diff -r /$POOL/write /$POOL2/rwrite1 +log_must eval "zfs recv -o origin=$POOL2@init $POOL2/rwrite2 <$stream" +log_must diff -r /$POOL/write /$POOL2/rwrite2 +log_must zfs destroy -R $POOL2/rwrite2 + +log_must zfs destroy -R $POOL2/rfs + +# Write some data for tests of incremental sends from bookmarks +log_must zfs snapshot $sendfs@snap1 +log_must zfs clone $sendfs@snap1 $POOL/hole1 +typeset mntpnt=$(get_prop mountpoint $POOL/hole1) +log_must dd if=/dev/zero of=$mntpnt/f2 bs=128k count=16 conv=notrunc +log_must zfs snapshot $POOL/hole1@snap +log_must zfs clone $sendfs@snap1 $POOL/write1 +mntpnt=$(get_prop mountpoint $POOL/write1) +log_must dd if=/dev/urandom of=$mntpnt/f2 bs=128k count=16 conv=notrunc +log_must zfs snapshot $POOL/write1@snap +log_must zfs clone $POOL/int@snap $POOL/write2 +mntpnt=$(get_prop mountpoint $POOL/write2) +log_must dd if=/dev/urandom of=$mntpnt/f2 bs=128k count=16 conv=notrunc +log_must zfs snapshot $POOL/write2@snap + +# Setup a redacted send using a redaction list at varying depth. 
+log_must zfs redact $sendfs@snap0 book7 $POOL/rm@snap $POOL/stride3@snap \ + $POOL/stride5@snap +log_must eval "zfs send --redact book7 $sendfs@snap0 >$stream" +log_must eval "zfs receive $POOL2/rfs <$stream" + +# Verify we can receive a redacted incremental sending from the bookmark. +log_must zfs redact $sendfs@snap1 book8 $POOL/write1@snap +log_must eval "zfs send -i $sendfs#book7 --redact book8 $sendfs@snap1 >$stream" +log_must eval "zfs receive $POOL2/rfs <$stream" +# The stride3 and stride5 snaps redact 3 128k blocks at block offsets 0 15 and +# 30 of f2. The write1 snap only covers the first two of those three blocks. +compare_files $sendfs $POOL2/rfs "f2" "$RANGE12" +log_must mount_redacted -f $POOL2/rfs +log_must diff $send_mnt/f1 /$POOL2/rfs/f1 +log_must diff $send_mnt/d1/f1 /$POOL2/rfs/d1/f1 +unmount_redacted $POOL2/rfs + +# Verify we can receive a normal child we weren't redacted with respect to by +# sending from the bookmark. +log_must eval "zfs send -i $sendfs#book7 $POOL/hole1@snap >$stream" +log_must eval "zfs recv $POOL2/rhole1 <$stream" +log_must diff -r /$POOL/hole1 /$POOL2/rhole1 + +# Verify we can receive an intermediate clone redacted with respect to a +# non-subset if we send from the bookmark. +log_must zfs redact $POOL/int@snap book9 $POOL/write2@snap +log_must eval "zfs send -i $sendfs#book7 --redact book9 $POOL/int@snap >$stream" +log_must eval "zfs receive $POOL2/rint <$stream" +compare_files $sendfs $POOL2/rint "f2" "$RANGE12" + +log_pass "Incrementals (redacted and normal) work with redacted datasets." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh new file mode 100755 index 0000000000..caccdd3600 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify large blocks and redacted send work correctly together. +# +# Strategy: +# 1. Create a dataset and clone with a 1m recordsize, modifying a few k +# within the first 1m of a 16m file. +# 2. Verify that the whole first 1m of the file is redacted. +# 3. Receive an incremental stream from the original snap to the snap it +# was redacted with respect to. +# 4. 
Verify that the received dataset matches the clone +# + +typeset ds_name="largeblocks" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '-o recsize=1m' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must dd if=/dev/urandom of=$clone_mnt/f1 bs=32k count=3 seek=8 conv=notrunc +log_must zfs snapshot $clone@snap1 + +log_must zfs redact $sendfs@snap book1 $clone@snap1 +log_must eval "zfs send -L --redact book1 $sendfs@snap >$stream" +log_must stream_has_features $stream redacted large_blocks +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f1" "$RANGE11" +log_must mount_redacted -f $recvfs +log_must diff $send_mnt/f2 $recv_mnt/f2 +unmount_redacted $recvfs + +log_must eval "zfs send -L -i $sendfs@snap $clone@snap1 >$stream" +log_must stream_has_features $stream large_blocks +log_must eval "zfs recv $recvfs/new <$stream" +log_must diff -r $clone_mnt $recv_mnt/new + +log_pass "Large blocks and redacted send work correctly together." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh new file mode 100755 index 0000000000..3386643b29 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redacted send can deal with a large redaction list. +# +# Strategy: +# 1. Create 64 clones of sendfs each of which modifies two blocks in a file. +# The first modification is at an offset unique to each clone, and the +# second (the last block in the file) is common to them all. +# 2. Verify a redacted stream with a reasonable redaction list length can +# be correctly processed. +# 3. Verify that if the list is too long, the send fails gracefully. +# + +typeset ds_name="many_clones" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" +typeset redaction_list='' +typeset mntpnt + +log_onexit redacted_cleanup $sendfs $recvfs + +# Fill in both the last block, and a different block in every clone. +for i in {1..64}; do + log_must zfs clone $sendfs@snap ${clone}$i + mntpnt=$(get_prop mountpoint ${clone}$i) + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=64k count=1 seek=$i \ + conv=notrunc + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=64k count=1 seek=63 \ + conv=notrunc + log_must zfs snapshot ${clone}$i@snap +done + +# The limit isn't necessarily 32 snapshots. The maximum number of snapshots in +# the redacted list is determined in dsl_bookmark_create_redacted_check(). 
+log_must zfs redact $sendfs@snap book1 $clone{1..32}@snap +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" +compare_files $sendfs $recvfs "f2" "$RANGE8" + +log_mustnot zfs redact $sendfs@snap book2 $clone{1..64}@snap + +log_pass "Redacted send can deal with a large redaction list." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh new file mode 100755 index 0000000000..e1cd09e17d --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify redacted send works with datasets of different record sizes. +# +# Strategy: +# 1. Create two datasets, one with recsize 512 and one 1m, each with a 2m file. +# 2. For each dataset, create a clone with the other recsize and modify +# the first 16k of the file. +# 3. Send each original dataset, redacted with respect to each of the clones +# into both a dataset inheriting a 512 recsize and a 1m one. +# 4. Verify that the smallest unit of redaction is that of the origin fs. +# + +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +typeset mntpnt + +log_onexit redacted_cleanup $POOL/512 $POOL/1m $POOL2/512 $POOL2/1m + +# Set up the datasets we'll send and redact from.
+log_must zfs create -o recsize=512 $POOL/512 +mntpnt=$(get_prop mountpoint $POOL/512) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=1024k count=2 +log_must zfs snapshot $POOL/512@snap +log_must zfs clone -o recsize=1m $POOL/512@snap $POOL/1mclone +mntpnt=$(get_prop mountpoint $POOL/1mclone) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=512 count=32 conv=notrunc +log_must zfs snapshot $POOL/1mclone@snap + +log_must zfs create -o recsize=1m $POOL/1m +mntpnt=$(get_prop mountpoint $POOL/1m) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=1024k count=2 +log_must zfs snapshot $POOL/1m@snap +log_must zfs clone -o recsize=512 $POOL/1m@snap $POOL/512clone +mntpnt=$(get_prop mountpoint $POOL/512clone) +log_must dd if=/dev/urandom of=$mntpnt/f1 bs=512 count=32 conv=notrunc +log_must zfs snapshot $POOL/512clone@snap + +# Create datasets that allow received datasets to inherit recordsize. +log_must zfs create -o recsize=512 $POOL2/512 +log_must zfs create -o recsize=1m $POOL2/1m + +# Do the sends and verify the contents. +log_must zfs redact $POOL/512@snap book1 $POOL/1mclone@snap +log_must eval "zfs send --redact book1 $POOL/512@snap>$stream" +log_must eval "zfs recv $POOL2/512/recva <$stream" +compare_files $POOL/512 $POOL2/512/recva "f1" "$RANGE13" +log_must eval "zfs recv $POOL2/1m/recvb <$stream" +compare_files $POOL/512 $POOL2/1m/recvb "f1" "$RANGE13" + +log_must zfs redact $POOL/1m@snap book2 $POOL/512clone@snap +log_must eval "zfs send --redact book2 $POOL/1m@snap >$stream" +log_must eval "zfs recv $POOL2/512/recvc <$stream" +compare_files $POOL/1m $POOL2/512/recvc "f1" "$RANGE11" +log_must eval "zfs recv $POOL2/1m/recvd <$stream" +compare_files $POOL/1m $POOL2/1m/recvd "f1" "$RANGE11" + +log_pass "Redaction works correctly with different recordsizes." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh new file mode 100755 index 0000000000..0bc4bf4617 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that received redacted datasets are not mounted by default, but +# can still be mounted after setting ALLOW_REDACTED_DATASET_MOUNT. +# +# Strategy: +# 1. Verify a received redacted stream isn't mounted by default. +# 2. Set ALLOW_REDACTED_DATASET_MOUNT and verify it can't be mounted +# without the -f flag, but can with -f. +# 3. Receive a redacted volume. +# 4. Verify the device file isn't present until the kernel variable is set. +# 5. Verify the files in the send fs are also present in the recv fs. 
+# + +typeset ds_name="mounts" +typeset sendfs="$POOL/$ds_name" +typeset sendvol="$sendfs/vol" +typeset recvfs="$POOL2/$ds_name" +typeset recvvol="$POOL2/vol" +typeset clone="$POOL/${ds_name}_clone" +typeset clonevol="${sendvol}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' setup_mounts +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" +typeset recv_vol_file="/dev/zvol/$recvvol" + +log_onexit redacted_cleanup $sendfs $recvfs $recvvol + +log_must rm $clone_mnt/empty $clone_mnt/contents1 +log_must dd if=/dev/urandom of=$clone_mnt/contents2 bs=512 count=1 conv=notrunc +log_must rm $clone_mnt/dir1/contents1 +log_must rm -rf $clone_mnt/dir1/dir2 +log_must dd if=/dev/urandom of=$clone_mnt/dir1/contents2 bs=512 count=1 \ + conv=notrunc +log_must dd if=/dev/urandom of=$clone_mnt/dir1/empty bs=512 count=1 +log_must zfs snapshot $clone@snap1 + +log_must zfs redact $sendfs@snap book1 $clone@snap +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs receive $recvfs <$stream" +log_mustnot ismounted $recvfs +log_mustnot mount_redacted $recvfs +log_mustnot ismounted $recvfs +log_must mount_redacted -f $recvfs +log_must ismounted $recvfs + +# Verify that the send and recv fs both have the same files under their +# mountpoints by comparing find output with the name of the mountpoint +# deleted. +contents=$(log_must find $recv_mnt) +contents_orig=$(log_must find $send_mnt) +log_must diff <(echo ${contents//$recv_mnt/}) \ + <(echo ${contents_orig//$send_mnt/}) +log_must zfs redact $sendvol@snap book2 $clonevol@snap +log_must eval "zfs send --redact book2 $sendvol@snap >$stream" +log_must eval "zfs receive $recvvol <$stream" +is_disk_device $recv_vol_file && log_fail "Volume device file should not exist." 
+log_must set_tunable32 ALLOW_REDACTED_DATASET_MOUNT 1 +log_must zpool export $POOL2 +log_must zpool import $POOL2 +udevadm settle + +# The device file isn't guaranteed to show up right away. +if ! is_disk_device $recv_vol_file; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." + udevadm settle + sleep $t + is_disk_device $recv_vol_file && break + done +fi +is_disk_device $recv_vol_file || log_fail "Volume device file should exist." + +log_must dd if=/dev/urandom of=$send_mnt/dir1/contents1 bs=512 count=2 +log_must rm $send_mnt/dir1/dir2/empty +log_must zfs snapshot $sendfs@snap2 +log_must eval "zfs send -i $sendfs#book1 $sendfs@snap2 >$stream" +log_must eval "zfs receive $recvfs <$stream" +log_must mount_redacted -f $recvfs +log_must ismounted $recvfs +contents=$(log_must find $recv_mnt) +contents_orig=$(log_must find $send_mnt) +log_must diff <(echo ${contents//$recv_mnt/}) \ + <(echo ${contents_orig//$send_mnt/}) + +log_pass "Received redacted streams can be mounted." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh new file mode 100755 index 0000000000..e591cca0bb --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Test that redacted send correctly detects invalid arguments. 
+# + +typeset sendfs="$POOL2/sendfs" +typeset recvfs="$POOL2/recvfs" +typeset clone1="$POOL2/clone1" +typeset clone2="$POOL2/clone2" +typeset clone3="$POOL2/clone3" +typeset clone3="$POOL2/clone4" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) + +log_onexit redacted_cleanup $sendfs $recvfs $clone3 + +log_must zfs create $sendfs +log_must zfs snapshot $sendfs@snap1 +log_must zfs snapshot $sendfs@snap2 +log_must zfs snapshot $sendfs@snap3 +log_must zfs clone $sendfs@snap2 $clone1 +log_must zfs snapshot $clone1@snap +log_must zfs bookmark $clone1@snap $clone1#book +log_must zfs clone $sendfs@snap2 $clone2 +log_must zfs snapshot $clone2@snap + +# Incompatible flags +log_must zfs redact $sendfs@snap2 book $clone1@snap +log_mustnot eval "zfs send -R --redact book $sendfs@snap2 >$TEST_BASE_DIR/devnull" + +typeset arg +for arg in "$sendfs" "$clone1#book"; do + log_mustnot eval "zfs send --redact book $arg >$TEST_BASE_DIR/devnull" +done + +# Bad redaction list arguments +log_mustnot zfs redact $sendfs@snap1 +log_mustnot zfs redact $sendfs@snap1 book +log_mustnot zfs redact $sendfs#book1 book4 $clone1 +log_mustnot zfs redact $sendfs@snap1 book snap2 snap3 +log_mustnot zfs redact $sendfs@snap1 book @snap2 @snap3 +log_mustnot eval "zfs send --redact $sendfs#book $sendfs@snap >$TEST_BASE_DIR/devnull" + +# Redaction snapshots not a descendant of tosnap +log_mustnot zfs redact $sendfs@snap2 book $sendfs@snap2 +log_must zfs redact $sendfs@snap2 book2 $clone1@snap $clone2@snap +log_must eval "zfs send --redact book2 $sendfs@snap2 >$stream" +log_must zfs redact $sendfs@snap2 book3 $clone1@snap $clone2@snap +log_must eval "zfs send -i $sendfs@snap1 --redact book3 $sendfs@snap2 \ + >$TEST_BASE_DIR/devnull" +log_mustnot zfs redact $sendfs@snap3 $sendfs@snap3 $clone1@snap + +# Full redacted sends of redacted datasets are not allowed. 
+log_must eval "zfs recv $recvfs <$stream" +log_must zfs snapshot $recvfs@snap +log_must zfs clone $recvfs@snap $clone3 +log_must zfs snapshot $clone3@snap +log_mustnot zfs redact $recvfs@snap book5 $clone3@snap + +# Nor may a redacted dataset appear in the redaction list. +log_mustnot zfs redact testpool2/recvfs@snap2 book7 testpool2/recvfs@snap + +# Non-redaction bookmark cannot be sent and produces invalid argument error +log_must zfs bookmark "$sendfs@snap1" "$sendfs#book8" +log_must eval "zfs send --redact book8 -i $sendfs@snap1 $sendfs@snap2 2>&1 | head -n 100 | grep 'not a redaction bookmark'" + +# Error messages for common usage errors +log_mustnot_expect "not contain '#'" zfs redact $sendfs@snap1 \#book $sendfs@snap2 +log_mustnot_expect "not contain '#'" zfs redact $sendfs@snap1 $sendfs#book $sendfs@snap2 +log_mustnot_expect "full dataset names" zfs redact $sendfs@snap1 book @snap2 +log_mustnot_expect "full dataset names" zfs redact $sendfs@snap1 book @snap2 +log_mustnot_expect "full dataset names" zfs redact $sendfs@snap1 \#book @snap2 +log_mustnot_expect "descendent of snapshot" zfs redact $sendfs@snap2 book $sendfs@snap1 + +log_pass "Verify that redacted send correctly detects invalid arguments." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh new file mode 100755 index 0000000000..74e5914f2d --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. 
All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Test that receiving sends from redaction bookmarks and redacted datasets +# works correctly in certain edge cases. +# 1. Send A(B,C,D) to pool2. +# 2. Verify send from A(B, C, D) can be received onto it. +# 3. Verify send from A(B, C) can be received onto it. +# 4. Verify send from A() can be received onto it. +# 5. Verify send from A(E) cannot be received onto it. +# 6. Verify send from redaction bookmark for A(B, C) can be received onto it. +# 7. Verify send from redaction bookmark for A() can be received onto it. +# 8. Verify send from redaction bookmark for A(E) cannot be received onto it. +# + +typeset ds_name="origin" +typeset sendfs="$POOL/$ds_name" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' setup_incrementals +typeset dsA=$sendfs@snap0 +typeset dsB=$POOL/hole@snap +typeset dsC=$POOL/rm@snap +typeset dsD=$POOL/write@snap +typeset dsE=$POOL/stride3@snap +typeset dsF=$POOL/stride5@snap +typeset targ=$POOL2/targfs@snap + +log_onexit redacted_cleanup $sendfs $POOL2/rBCD $POOL2/targfs \ + $POOL2/rBC $POOL2/rE + +# Set up all the filesystems and clones. 
+log_must zfs redact $dsA BCD $dsB $dsC $dsD +log_must eval "zfs send --redact BCD $dsA >$stream" +log_must eval "zfs receive $POOL2/rBCD <$stream" +log_must eval "zfs receive $targ <$stream" + +log_must zfs redact $dsA BC $dsB $dsC +log_must eval "zfs send --redact BC $dsA >$stream" +log_must eval "zfs receive $POOL2/rBC <$stream" + +log_must zfs redact $dsA E $dsE +log_must eval "zfs send --redact E $dsA >$stream" +log_must eval "zfs receive $POOL2/rE <$stream" + +log_must eval "zfs send $dsF >$stream" +log_must eval "zfs receive -o origin=$POOL2/rBCD@snap0 $POOL2/BCDrF <$stream" +log_must eval "zfs receive -o origin=$POOL2/rBC@snap0 $POOL2/BCrF <$stream" +log_must eval "zfs receive -o origin=$POOL2/rE@snap0 $POOL2/ErF <$stream" + +# Run tests from redacted datasets. +log_must eval "zfs send -i $POOL2/rBCD@snap0 $POOL2/BCDrF@snap >$stream" +log_must eval "zfs receive -o origin=$targ $POOL2/tdBCD <$stream" + +log_must eval "zfs send -i $POOL2/rBC@snap0 $POOL2/BCrF@snap >$stream" +log_must eval "zfs receive -o origin=$targ $POOL2/tdBC <$stream" + +log_must eval "zfs send -i $POOL2/rE@snap0 $POOL2/ErF@snap >$stream" +log_mustnot eval "zfs receive -o origin=$targ $POOL2/tdE <$stream" + +# Run tests from redaction bookmarks. +log_must eval "zfs send -i $sendfs#BC $dsF >$stream" +log_must eval "zfs receive -o origin=$targ $POOL2/tbBC <$stream" + +log_must eval "zfs send -i $sendfs#E $dsF >$stream" +log_mustnot eval "zfs receive -o origin=$targ $POOL2/tbE <$stream" + +log_pass "Verify sends from redacted datasets and bookmarks work correctly." 
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh new file mode 100755 index 0000000000..032d1fb91a --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify edge case when midbufid is equal to minbufid for the bug fixed by +# https://github.com/openzfs/zfs/pull/11297 (Fix kernel panic induced by +# redacted send) +# + +typeset ds_name="panic" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset stream=$(mktemp $TEST_BASE_DIR/stream.XXXX) + +function cleanup +{ + redacted_cleanup $sendfs $recvfs + rm -f $stream +} + +log_onexit cleanup + +log_must zfs create -o recsize=8k $sendfs +log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=2048 +log_must zfs snapshot $sendfs@init +log_must zfs clone $sendfs@init $clone +log_must stride_dd -i /dev/urandom -o /$clone/file -b 8192 -s 2 -c 7226 +log_must zfs snapshot $clone@init +log_must zfs redact $sendfs@init book_init $clone@init +log_must eval "zfs send --redact $sendfs#book_init $sendfs@init >$stream" +log_must eval "zfs recv $recvfs <$stream" +log_pass diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh new file mode 100755 index 
0000000000..e4163c4ef8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify the redaction snapshot guid list exposed via redact_snaps. +# +# Strategy: +# 1. Create a redacted dataset and receive it into another pool. +# 2. Verify that the redaction list in the bookmark (according to zdb) +# matches the list shown in the redact_snaps property. +# 3. Verify that the received snapshot has a matching redaction list. +# + +typeset ds_name="props" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset mntpnt + +log_onexit redacted_cleanup $sendfs $recvfs + +# Verify a plain dataset, snapshot or bookmark has an empty list. +log_must zfs snapshot $sendfs@empty_snapshot +log_must zfs bookmark $sendfs@empty_snapshot $sendfs#empty_bookmark +found_list=$(get_prop redact_snaps $sendfs) +[[ $found_list = "-" ]] || log_fail "Unexpected dataset list: $found_list" +found_list=$(get_prop redact_snaps $sendfs@empty_snapshot) +[[ $found_list = "-" ]] || log_fail "Unexpected snapshot list: $found_list" +found_list=$(get_prop redact_snaps $sendfs#empty_bookmark) +[[ $found_list = "-" ]] || log_fail "Unexpected bookmark list: $found_list" + +# Fill in a different block in every clone. 
+for i in {1..16}; do + log_must zfs clone $sendfs@snap ${clone}$i + mntpnt=$(get_prop mountpoint ${clone}$i) + log_must dd if=/dev/urandom of=$mntpnt/f2 bs=64k count=1 seek=$i \ + conv=notrunc + log_must zfs snapshot ${clone}$i@snap +done + +log_must zfs redact $sendfs@snap book1 $clone{1..16}@snap +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +log_must eval "zfs recv $recvfs <$stream" + +get_guid_list $tmpdir/prop_list $sendfs#book1 +get_guid_list $tmpdir/zdb_list $sendfs#book1 true +get_guid_list $tmpdir/recvd_prop_list $recvfs@snap + +count=$(wc -l $tmpdir/prop_list | awk '{print $1}') +[[ $count -eq 16 ]] || log_fail "Found incorrect number of redaction snapshots." + +diff $tmpdir/prop_list $tmpdir/zdb_list || \ + log_fail "Property list differed from zdb output" +diff $tmpdir/prop_list $tmpdir/recvd_prop_list || \ + log_fail "Received property list differed from sent" + +log_pass "The redaction list is consistent between sent and received datasets." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh new file mode 100755 index 0000000000..4ab04a0e57 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that resumable send works correctly with redacted streams. +# +# Strategy: +# 1. Do a full redacted resumable send. +# 2. 
Verify the received contents are correct. +# 3. Do an incremental redacted resumable send. +# 4. Verify the received contents are correct. +# 5. Verify that recv -A removes a partially received dataset. +# + +typeset ds_name="resume" +typeset sendfs="$POOL/$ds_name" +typeset recvfs="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset clone1="$POOL/${ds_name}_clone1" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +setup_dataset $ds_name '' +typeset clone_mnt="$(get_prop mountpoint $clone)" +typeset send_mnt="$(get_prop mountpoint $sendfs)" +typeset recv_mnt="/$POOL2/$ds_name" + +log_onexit redacted_cleanup $sendfs $recvfs + +log_must stride_dd -i /dev/urandom -o $clone_mnt/f2 -b 512 -c 64 -s 512 +log_must zfs snapshot $clone@snap1 + +# Do the full resumable send +log_must zfs redact $sendfs@snap book1 $clone@snap1 +resume_test "zfs send --redact book1 $sendfs@snap" $tmpdir $recvfs +log_must mount_redacted -f $recvfs +log_must set_tunable32 ALLOW_REDACTED_DATASET_MOUNT 1 +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must eval "get_diff $send_mnt/f2 $recv_mnt/f2 >$tmpdir/get_diff.out" +typeset range=$(cat $tmpdir/get_diff.out) +[[ "$RANGE9" = "$range" ]] || log_fail "Unexpected range: $range" + +log_must dd if=/dev/urandom of=$send_mnt/f3 bs=1024k count=3 +log_must zfs snapshot $sendfs@snap2 +log_must zfs clone $sendfs@snap2 $clone1 +typeset clone1_mnt="$(get_prop mountpoint $clone1)" +log_must dd if=/dev/urandom of=$clone1_mnt/f3 bs=128k count=3 conv=notrunc +log_must zfs snapshot $clone1@snap + +# Do the incremental resumable send +log_must zfs redact $sendfs@snap2 book2 $clone1@snap +resume_test "zfs send --redact book2 -i $sendfs#book1 $sendfs@snap2" \ + $tmpdir $recvfs +log_must diff $send_mnt/f1 $recv_mnt/f1 +log_must diff $send_mnt/f2 $recv_mnt/f2 +log_must eval "get_diff $send_mnt/f3 $recv_mnt/f3 >$tmpdir/get_diff.out" +range=$(cat $tmpdir/get_diff.out) +[[ "$RANGE10" = "$range" ]] || log_fail 
"Unexpected range: $range" + +# Test recv -A works properly and verify saved sends are not allowed +log_mustnot zfs recv -A $recvfs +log_must zfs destroy -R $recvfs +log_mustnot zfs recv -A $recvfs +log_must eval "zfs send --redact book1 $sendfs@snap >$stream" +dd if=$stream bs=64k count=1 | log_mustnot zfs receive -s $recvfs +[[ "-" = $(get_prop receive_resume_token $recvfs) ]] && \ + log_fail "Receive token not found." +log_mustnot eval "zfs send --saved --redact book1 $recvfs >$TEST_BASE_DIR/devnull" +log_must zfs recv -A $recvfs +log_must datasetnonexists $recvfs + +log_pass "Resumable send works correctly with redacted streams." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh new file mode 100755 index 0000000000..7456084b04 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that send size estimates of redacted sends work correctly. +# +# Strategy: +# 1. Perform a redacted send with -nv and without, and verify the +# size estimate is the same as the size of the actual send. +# 2. Perform an incremental send from the redaction bookmark with +# -nv and without, and verify the size estimate is the same as +# the size of the actual send. 
+# + +ds_name="sizes" +typeset sendfs="$POOL/$ds_name" +typeset clone="$POOL/${ds_name}_clone2" +setup_dataset $ds_name "-o compress=lz4" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset size=$(mktemp $tmpdir/size.XXXX) +typeset size2=$(mktemp $tmpdir/size.XXXX) + +log_onexit redacted_cleanup $sendfs $clone +log_must zfs clone $sendfs@snap $clone +typeset clone_mnt="$(get_prop mountpoint $clone)" +log_must rm -rf $clone_mnt/* +log_must zfs snapshot $clone@snap +log_must zfs redact $sendfs@snap book $clone@snap +log_must eval "zfs send -nvP --redact book $sendfs@snap | \ + grep '^size' | awk '{print \$2}' >$size" +log_must eval "zfs send --redact book $sendfs@snap | wc -c \ + >$size2" +bytes1=$(cat $size | tr -d '[[:space:]]') +bytes2=$(cat $size2 | tr -d '[[:space:]]') +[[ "$bytes1" -eq "$bytes2" ]] || \ + log_fail "Full sizes differ: estimate $bytes1 and actual $bytes2" + +log_must zfs snapshot $sendfs@snap2 +log_must eval "zfs send -nvP -i $sendfs#book $sendfs@snap2 | \ + grep '^size' | awk '{print \$2}' >$size" +log_must eval "zfs send -i $sendfs#book $sendfs@snap2 | wc -c >$size2" +bytes1=$(cat $size | tr -d '[[:space:]]') +bytes2=$(cat $size2 | tr -d '[[:space:]]') +[[ "$bytes1" -eq "$bytes2" ]] || \ + log_fail "Incremental sizes differ: estimate $bytes1 and actual $bytes2" + +log_pass "Size estimates of redacted sends estimate accurately." diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh new file mode 100755 index 0000000000..2ea10638ce --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh @@ -0,0 +1,105 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +# +# Description: +# Verify that redacted send works on volumes. +# +# Strategy: +# 1. Write to a volume, then make a clone of that volume. +# 2. Receive a redacted stream that sends all blocks. +# 3. Receive a redacted stream that redacts the first half of the written area. +# + +typeset ds_name="volume" +typeset sendvol="$POOL/$ds_name" +typeset recvvol="$POOL2/$ds_name" +typeset clone="$POOL/${ds_name}_clone" +typeset tmpdir="$(get_prop mountpoint $POOL)/tmp" +typeset stream=$(mktemp $tmpdir/stream.XXXX) +typeset send_file="/dev/zvol/$sendvol" +typeset recv_file="/dev/zvol/$recvvol" +typeset clone_file="/dev/zvol/$clone" + +log_onexit redacted_cleanup $sendvol $recvvol + +log_must zfs create -b 8k -V 1g $sendvol +sleep 10 +log_must zpool export $POOL +log_must zpool import $POOL +udevadm settle +if ! is_disk_device $send_file; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." + udevadm settle + sleep $t + is_disk_device $send_file && break + done +fi +log_must dd if=/dev/urandom of=$send_file bs=8k count=64 +log_must zfs snapshot $sendvol@snap +log_must zfs clone $sendvol@snap $clone +log_must zfs snapshot $clone@snap + +log_must set_tunable32 ALLOW_REDACTED_DATASET_MOUNT 1 +log_must zfs redact $sendvol@snap book1 $clone@snap +log_must eval "zfs send --redact book1 $sendvol@snap >$stream" +log_must eval "zfs recv $recvvol <$stream" +sleep 10 +log_must zpool export $POOL2 +log_must zpool import $POOL2 +udevadm settle +if ! is_disk_device $recv_file; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." 
+ udevadm settle + sleep $t + is_disk_device $recv_file && break + done +fi +log_must dd if=$send_file of=$tmpdir/send.dd bs=8k count=64 +log_must dd if=$recv_file of=$tmpdir/recv.dd bs=8k count=64 +log_must diff $tmpdir/send.dd $tmpdir/recv.dd +log_must zfs destroy -R $recvvol + +log_must dd if=/dev/urandom of=$clone_file bs=8k count=32 +log_must zfs snapshot $clone@snap1 +log_must zfs redact $sendvol@snap book2 $clone@snap1 +log_must eval "zfs send --redact book2 $sendvol@snap >$stream" +log_must eval "zfs recv $recvvol <$stream" +sleep 10 +log_must zpool export $POOL2 +log_must zpool import $POOL2 +udevadm settle +if ! is_disk_device $recv_file; then + udevadm settle + for t in 10 5 3 2 1; do + log_note "Polling $t seconds for device file." + udevadm settle + sleep $t + is_disk_device $recv_file && break + done +fi +log_must dd if=$send_file of=$tmpdir/send.dd bs=8k count=32 skip=32 +log_must dd if=$recv_file of=$tmpdir/recv.dd bs=8k count=32 skip=32 +log_must diff $tmpdir/send.dd $tmpdir/recv.dd + +log_pass "Redacted send works correctly with volumes." diff --git a/tests/zfs-tests/tests/functional/redacted_send/setup.ksh b/tests/zfs-tests/tests/functional/redacted_send/setup.ksh new file mode 100755 index 0000000000..3f537f813d --- /dev/null +++ b/tests/zfs-tests/tests/functional/redacted_send/setup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/redacted_send/redacted.kshlib + +verify_disk_count "$DISKS" 2 + +create_pool $POOL $DISK1 +log_must zfs snapshot $POOL@init +create_pool $POOL2 $DISK2 +log_must zfs snapshot $POOL2@init +log_must zfs create $POOL/tmp +log_pass diff --git a/tests/zfs-tests/tests/functional/redundancy/Makefile.am b/tests/zfs-tests/tests/functional/redundancy/Makefile.am index 6f6cc405b9..42c11c4aa9 100644 --- a/tests/zfs-tests/tests/functional/redundancy/Makefile.am +++ b/tests/zfs-tests/tests/functional/redundancy/Makefile.am @@ -2,10 +2,20 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/redundancy dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - redundancy_001_pos.ksh \ - redundancy_002_pos.ksh \ - redundancy_003_pos.ksh \ - redundancy_004_neg.ksh + redundancy_draid.ksh \ + redundancy_draid1.ksh \ + redundancy_draid2.ksh \ + redundancy_draid3.ksh \ + redundancy_draid_damaged.ksh \ + redundancy_draid_spare1.ksh \ + redundancy_draid_spare2.ksh \ + redundancy_draid_spare3.ksh \ + redundancy_mirror.ksh \ + redundancy_raidz.ksh \ + redundancy_raidz1.ksh \ + redundancy_raidz2.ksh \ + redundancy_raidz3.ksh \ + redundancy_stripe.ksh dist_pkgdata_DATA = \ redundancy.cfg \ diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index ab36d00de9..baee8269b1 100644 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -66,6 +66,23 @@ function random echo $value } +# +# Get the number of checksum errors for the pool. 
+# +# $1 Pool +# +function cksum_pool +{ + typeset -i cksum=$(zpool status $1 | awk ' + !NF { isvdev = 0 } + isvdev { errors += $NF } + /CKSUM$/ { isvdev = 1 } + END { print errors } + ') + + echo $cksum +} + # # Record the directories construction and checksum all the files which reside # within the specified pool @@ -81,6 +98,7 @@ function record_data [[ -z $pool ]] && log_fail "No specified pool." [[ -f $recordfile ]] && log_must rm -f $recordfile + sync_pool $pool typeset mntpnt mntpnt=$(get_prop mountpoint $pool) log_must eval "du -a $mntpnt > $recordfile 2>&1" @@ -119,22 +137,44 @@ function setup_test_env destroy_pool $pool fi - log_must mkfile $MINVDEVSIZE $vdevs + log_must truncate -s $MINVDEVSIZE $vdevs - log_must zpool create -m $TESTDIR $pool $keyword $vdevs + log_must zpool create -f -m $TESTDIR $pool $keyword $vdevs log_note "Filling up the filesystem ..." typeset -i ret=0 typeset -i i=0 typeset file=$TESTDIR/file + typeset -i limit + (( limit = $(get_prop available $pool) / 2 )) + while true ; do - file_write -o create -f $file.$i \ - -b $BLOCKSZ -c $NUM_WRITES + [[ $(get_prop available $pool) -lt $limit ]] && break + file_write -o create -f $file.$i -b $BLOCKSZ -c $NUM_WRITES + ret=$? + (( $ret != 0 )) && break + (( i = i + 1 )) + done + + record_data $TESTPOOL $PRE_RECORD_FILE +} + +function refill_test_env +{ + log_note "Re-filling the filesystem ..." + typeset pool=$1 + typeset -i ret=0 + typeset -i i=0 + typeset mntpnt + mntpnt=$(get_prop mountpoint $pool) + typeset file=$mntpnt/file + while [[ -e $file.$i ]]; do + log_must rm -f $file.$i + file_write -o create -f $file.$i -b $BLOCKSZ -c $NUM_WRITES ret=$? (( $ret != 0 )) && break (( i = i + 1 )) done - (($ret != 28 )) && log_note "file_write return value($ret) is unexpected." record_data $TESTPOOL $PRE_RECORD_FILE } @@ -178,8 +218,13 @@ function is_data_valid { typeset pool=$1 + log_must zpool scrub -w $pool + record_data $pool $PST_RECORD_FILE if ! 
diff $PRE_RECORD_FILE $PST_RECORD_FILE > /dev/null 2>&1; then + log_must cat $PRE_RECORD_FILE + log_must cat $PST_RECORD_FILE + diff -u $PRE_RECORD_FILE $PST_RECORD_FILE return 1 fi @@ -198,7 +243,7 @@ function get_vdevs #pool cnt typeset -i cnt=$2 typeset all_devs=$(zpool iostat -v $pool | awk '{print $1}'| \ - egrep -v "^pool$|^capacity$|^mirror$|^raidz1$|^raidz2$|---" | \ + egrep -v "^pool$|^capacity$|^mirror$|^raidz1$|^raidz2$|^raidz3$|^draid1.*|^draid2.*|^draid3.*|---" | \ egrep -v "/old$|^$pool$") typeset -i i=0 typeset vdevs @@ -226,17 +271,10 @@ function replace_missing_devs typeset vdev for vdev in $@; do - log_must gnudd if=/dev/zero of=$vdev \ - bs=1024k count=$(($MINDEVSIZE / (1024 * 1024))) \ - oflag=fdatasync - log_must zpool replace -f $pool $vdev $vdev - while true; do - if ! is_pool_resilvered $pool ; then - log_must sleep 2 - else - break - fi - done + log_must dd if=/dev/zero of=$vdev \ + bs=1024k count=$((MINVDEVSIZE / (1024 * 1024))) \ + conv=fdatasync + log_must zpool replace -wf $pool $vdev $vdev done } @@ -254,19 +292,19 @@ function damage_devs typeset -i cnt=$2 typeset label="$3" typeset vdevs - typeset -i bs_count=$((64 * 1024)) + typeset -i bs_count=$(((MINVDEVSIZE / 1024) - 4096)) vdevs=$(get_vdevs $pool $cnt) typeset dev if [[ -n $label ]]; then for dev in $vdevs; do - dd if=/dev/zero of=$dev seek=512 bs=1024 \ + log_must dd if=/dev/zero of=$dev seek=512 bs=1024 \ count=$bs_count conv=notrunc >/dev/null 2>&1 done else for dev in $vdevs; do - dd if=/dev/zero of=$dev bs=1024 count=$bs_count \ - conv=notrunc >/dev/null 2>&1 + log_must dd if=/dev/zero of=$dev bs=1024 \ + count=$bs_count conv=notrunc >/dev/null 2>&1 done fi diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh new file mode 100755 index 0000000000..8015e682c8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh @@ -0,0 +1,248 @@ +#!/bin/ksh -p +# +# CDDL 
HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# dRAID should provide redundancy +# +# STRATEGY: +# 1. Create block device files for the test draid pool +# 2. 
For each parity value [1..3] +# - create draid pool +# - fill it with some directories/files +# - verify self-healing by overwriting devices +# - verify resilver by replacing devices +# - verify scrub by zeroing devices +# - destroy the draid pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + rm -f "$TEST_BASE_DIR/dev-$i" + done + + set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +function test_selfheal # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + # + # Scrub the pool because the find command will only self-heal blocks + # from the files which were read. Before overwriting additional + # devices we need to repair all of the blocks in the pool. 
+ # + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +function test_resilver # pool nparity dir +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool replace -fw $pool $dir/dev-$i + done + + log_must check_pool_status $pool "errors" "No known data errors" + resilver_cksum=$(cksum_pool $pool) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $pool + log_fail "resilver cksum errors: $resilver_cksum" + fi + + log_must zpool clear $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool replace -fw $pool $dir/dev-$i + done + + log_must check_pool_status $pool "errors" "No known data errors" + resilver_cksum=$(cksum_pool $pool) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $pool + log_fail 
"resilver cksum errors: $resilver_cksum" + fi + + log_must zpool clear $pool +} + +function test_scrub # pool nparity dir +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +for nparity in 1 2 3; do + raid=draid$nparity + dir=$TEST_BASE_DIR + + log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} + log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + + typeset pool_size=$(get_pool_prop size $TESTPOOL) + + log_must zpool export $TESTPOOL + log_must zpool import -o cachefile=none -d $dir $TESTPOOL + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + test_selfheal 
$TESTPOOL $nparity $dir + test_resilver $TESTPOOL $nparity $dir + test_scrub $TESTPOOL $nparity $dir + + log_must zpool destroy "$TESTPOOL" +done + +log_pass "draid redundancy test succeeded." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh new file mode 100755 index 0000000000..85d420ab0d --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid pool can withstand at most 1 device failing or missing. +# +# STRATEGY: +# 1. Create N(>3,<6) virtual disk files. +# 2. Create draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. 
Damage one of the virtual disk files. +# 6. Verify the data is correct to prove draid can withstand 1 device +# failing. +# + +verify_runnable "global" + +log_assert "Verify draid pool can withstand one device failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 3 6) +setup_test_env $TESTPOOL draid $cnt + +# +# Inject data corruption error for draid pool +# +damage_devs $TESTPOOL 1 "label" +log_must is_data_valid $TESTPOOL +log_must clear_errors $TESTPOOL + +# +# Inject bad device error for draid pool +# +damage_devs $TESTPOOL 1 +log_must is_data_valid $TESTPOOL +log_must recover_bad_missing_devs $TESTPOOL 1 + +# +# Inject missing device error for draid pool +# +remove_devs $TESTPOOL 1 +log_must is_data_valid $TESTPOOL + +log_pass "draid pool can withstand one device failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh new file mode 100755 index 0000000000..04f1fdfb15 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid2 pool can withstand 2 devices failing or missing. +# +# STRATEGY: +# 1. Create N virtual disk files (N from random_int_between 4 6). +# 2. Create draid2 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most two of the virtual disk files. +# 6. Verify the data is correct to prove draid2 can withstand 2 devices +# failing. +# + +verify_runnable "global" + +log_assert "Verify draid2 pool can withstand two devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 4 6) +setup_test_env $TESTPOOL draid2 $cnt + +# +# Inject data corruption errors for draid2 pool +# +for i in 1 2; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad devices errors for draid2 pool +# +for i in 1 2; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for draid2 pool +# +for i in 1 2; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "draid2 pool can withstand two devices failing passed." 
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh new file mode 100755 index 0000000000..d4c823ed9b --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid3 pool can withstand 3 devices failing or missing. +# +# STRATEGY: +# 1. Create N virtual disk files (N from random_int_between 5 6). +# 2. Create draid3 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most three of the virtual disk files. +# 6. Verify the data is correct to prove draid3 can withstand 3 devices +# failing. 
+# + +verify_runnable "global" + +log_assert "Verify draid3 pool can withstand three devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 5 6) +setup_test_env $TESTPOOL draid3 $cnt + +# +# Inject data corruption errors for draid3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad devices errors for draid3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for draid3 pool +# +for i in 1 2 3; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "draid3 pool can withstand three devices failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh new file mode 100755 index 0000000000..6796cc78a1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh @@ -0,0 +1,153 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# When sequentially resilvering a dRAID pool with multiple vdevs +# that contain silent damage a sequential resilver should never +# introduce additional unrecoverable damage. +# +# STRATEGY: +# 1. Create block device files for the test draid pool +# 2. For each parity value [1..3] +# - create draid pool +# - fill it with some directories/files +# - overwrite the maximum number of repairable devices +# - sequentially resilver each overwritten device one at a time; +# the device will not be correctly repaired because the silent +# damage on the other vdevs will cause the parity calculations +# to generate incorrect data for the resilvering vdev. 
+# - verify that only the resilvering devices had invalid data +# written and that a scrub is still able to repair the pool +# - destroy the draid pool +# + +typeset -r devs=7 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) +rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED) + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + rm -f "$TEST_BASE_DIR/dev-$i" + done + + set_tunable32 PREFETCH_DISABLE $prefetch_disable + set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled +} + +function test_sequential_resilver # pool nparity dir +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=0; i<$nparity; i=i+1 )); do + spare=draid${nparity}-0-$i + log_must zpool replace -fsw $pool $dir/dev-$i $spare + done + + log_must zpool scrub -w $pool + + # When only a single child was overwritten the sequential resilver + # can fully repair the damage from parity and the scrub will have + # nothing to repair. When multiple children are silently damaged + # the sequential resilver will calculate the wrong data since only + # the parity information is used and it cannot be verified with + # the checksum. However, since only the resilvering devices are + # written to with the bad data a subsequent scrub will be able to + # fully repair the pool. 
+ # + if [[ $nparity == 1 ]]; then + log_must check_pool_status $pool "scan" "repaired 0B" + else + log_mustnot check_pool_status $pool "scan" "repaired 0B" + fi + + log_must check_pool_status $pool "errors" "No known data errors" + log_must check_pool_status $pool "scan" "with 0 errors" +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +for nparity in 1 2 3; do + raid=draid${nparity}:${nparity}s + dir=$TEST_BASE_DIR + + log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} + log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + + log_must zpool export $TESTPOOL + log_must zpool import -o cachefile=none -d $dir $TESTPOOL + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + test_sequential_resilver $TESTPOOL $nparity $dir + + log_must zpool destroy "$TESTPOOL" +done + +log_pass "draid damaged device(s) test succeeded." 
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh new file mode 100755 index 0000000000..8acee15679 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify resilver to dRAID distributed spares. +# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can: +# - sustain N failures (1-3), and +# - has N distributed spares to replace all faulted vdevs +# b. Fill the pool with data +# c. Systematically fault a vdev, then replace it with a spare +# d. Scrub the pool to verify no data was lost +# e. 
Verify the contents of files in the pool +# + +log_assert "Verify resilver to dRAID distributed spares" + +function cleanup_tunable +{ + log_must set_tunable32 REBUILD_SCRUB_ENABLED 1 + cleanup +} + +log_onexit cleanup_tunable + +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=$(random_int_between 1 3) + spares=$(random_int_between $parity 3) + data=$(random_int_between 1 8) + + (( min_children = (data + parity + spares) )) + children=$(random_int_between $min_children 16) + + draid="draid${parity}:${data}d:${children}c:${spares}s" + + setup_test_env $TESTPOOL $draid $children + + i=0 + while [[ $i -lt $spares ]]; do + fault_vdev="$BASEDIR/vdev$i" + spare_vdev="draid${parity}-0-${i}" + + log_must zpool offline -f $TESTPOOL $fault_vdev + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + log_must zpool replace -w $flags $TESTPOOL \ + $fault_vdev $spare_vdev + log_must check_vdev_state $TESTPOOL spare-$i "DEGRADED" + log_must check_vdev_state $TESTPOOL $spare_vdev "ONLINE" + log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" + log_must zpool detach $TESTPOOL $fault_vdev + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + (( i += 1 )) + done + + log_must is_data_valid $TESTPOOL + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + cleanup +done + +log_pass "Verify resilver to dRAID distributed spares" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh new file mode 100755 index 0000000000..08fdd558f9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied 
under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify multiple dRAID spares can be used. +# +# STRATEGY: +# 1. Create a pool and fill it with data. +# 2. Engage 3 distributed spares and verify the pool +# 3. Refill the filesystem with new data +# 4. Clear the pool to online previous faulted devices and resilver +# 5. Verify the pool and its contents +# + +log_assert "Verify multiple dRAID spares" + +log_onexit cleanup + +parity=1 +spares=3 +data=$(random_int_between 1 4) +children=10 +draid="draid${parity}:${data}d:${children}c:${spares}s" + +setup_test_env $TESTPOOL $draid $children + +# Replace vdev7 -> draid1-0-0 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev7 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev7 draid1-0-0 + +# Replace vdev8 -> draid1-0-1 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev8 draid1-0-1 + +# Replace vdev9 -> draid1-0-2 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2 + +# Verify, refill and verify the pool contents. +verify_pool $TESTPOOL +refill_test_env $TESTPOOL +verify_pool $TESTPOOL + +# Bring everything back online and check for errors. 
+log_must zpool clear $TESTPOOL +log_must zpool wait -t resilver $TESTPOOL + +log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL" +log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL" +log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL" + +log_must zpool scrub -w $TESTPOOL +log_must check_pool_status $TESTPOOL "scan" "repaired 0B" +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + +log_must is_data_valid $TESTPOOL + +log_pass "Verify multiple dRAID spares" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh new file mode 100755 index 0000000000..28e8e3c6d7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh @@ -0,0 +1,193 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify dRAID resilver to traditional and distributed spares for +# a variety of pool configurations and pool states. +# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can +# sustain 1 failure and has 5 distributed spares. +# b. Fill the pool with data +# c. Systematically fault and replace vdevs in the pools with +# spares to test resilving in common pool states. +# d. Scrub the pool to verify no data was lost +# e. 
Verify the contents of files in the pool +# + +log_assert "Verify dRAID resilver" + +function cleanup_tunable +{ + log_must set_tunable32 REBUILD_SCRUB_ENABLED 1 + cleanup +} + +log_onexit cleanup_tunable + +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + +# +# Disable scrubbing after a sequential resilver to verify the resilver +# alone is able to reconstruct the data without the help of a scrub. +# +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=1 + spares=5 + data=$(random_int_between 1 4) + children=10 + draid="draid${parity}:${data}d:${children}c:${spares}s" + + setup_test_env $TESTPOOL $draid $children + + # + # Perform a variety of replacements to normal and distributed spares + # for a variety of different vdev configurations to exercise different + # resilver code paths. The final configuration is expected to be: + # + # NAME STATE READ WRITE CKSUM + # testpool DEGRADED 0 0 0 + # draid1:1d:10c:5s-0 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/new_vdev0 ONLINE 0 0 0 + # /var/tmp/basedir.28683/new_vdev1 ONLINE 0 0 0 + # spare-2 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev2 FAULTED 0 0 0 + # draid1-0-3 ONLINE 0 0 0 + # spare-3 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev3 FAULTED 0 0 0 + # draid1-0-4 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev4 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev5 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev6 ONLINE 0 0 0 + # draid1-0-0 ONLINE 0 0 0 + # spare-8 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev8 FAULTED 0 0 0 + # draid1-0-1 ONLINE 0 0 0 + # spare-9 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev9 ONLINE 0 0 0 + # draid1-0-2 ONLINE 0 0 0 + # spares + # draid1-0-0 INUSE currently in use + # draid1-0-1 INUSE currently in use + # draid1-0-2 INUSE currently in use + # draid1-0-3 INUSE currently in use + # draid1-0-4 INUSE currently in 
use + # + + # Distributed spare which replaces original online device + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev7 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev7 draid1-0-0 + log_must zpool detach $TESTPOOL $BASEDIR/vdev7 + log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + # Distributed spare in mirror with original device faulted + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev8 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev8 draid1-0-1 + log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED" + log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + # Distributed spare in mirror with original device still online + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev9 draid1-0-2 + log_must check_vdev_state $TESTPOOL spare-9 "ONLINE" + log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + # Normal faulted device replacement + new_vdev0="$BASEDIR/new_vdev0" + log_must truncate -s $MINVDEVSIZE $new_vdev0 + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev0 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0 + log_must check_vdev_state $TESTPOOL $new_vdev0 
"ONLINE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + # Distributed spare faulted device replacement + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev2 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev2 draid1-0-3 + log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED" + log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + # Normal online device replacement + new_vdev1="$BASEDIR/new_vdev1" + log_must truncate -s $MINVDEVSIZE $new_vdev1 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1 + log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + # Distributed spare online device replacement (then fault) + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4 + log_must check_vdev_state $TESTPOOL spare-3 "ONLINE" + log_must check_vdev_state $TESTPOOL draid1-0-4 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-4 "INUSE" + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED" + log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + # Verify the original data is valid + log_must is_data_valid $TESTPOOL + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + cleanup +done + 
+log_pass "Verify resilver to dRAID distributed spares" diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_003_pos.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_mirror.ksh similarity index 96% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_003_pos.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_mirror.ksh index a1ca2cb765..b7b791b248 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_mirror.ksh @@ -29,6 +29,7 @@ # Copyright (c) 2013 by Delphix. All rights reserved. # +. $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/redundancy/redundancy.kshlib # @@ -50,7 +51,7 @@ verify_runnable "global" log_assert "Verify mirrored pool can withstand N-1 devices are failing or missing." log_onexit cleanup -typeset -i cnt=$(random 2 5) +typeset -i cnt=$(random_int_between 2 5) setup_test_env $TESTPOOL mirror $cnt typeset -i i=1 diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh new file mode 100755 index 0000000000..d736883916 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh @@ -0,0 +1,248 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# RAIDZ should provide redundancy +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. For each parity value [1..3] +# - create raidz pool +# - fill it with some directories/files +# - verify self-healing by overwriting devices +# - verify resilver by replacing devices +# - verify scrub by zeroing devices +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + rm -f "$TEST_BASE_DIR/dev-$i" + done + + set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +function test_selfheal # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + # + # Scrub the pool because the find command will only self-heal blocks + # from the files which were read. Before overwriting additional + # devices we need to repair all of the blocks in the pool. 
+ # + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +function test_resilver # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool replace -fw $pool $dir/dev-$i + done + + log_must check_pool_status $pool "errors" "No known data errors" + resilver_cksum=$(cksum_pool $pool) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $pool + log_fail "resilver cksum errors: $resilver_cksum" + fi + + log_must zpool clear $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool replace -fw $pool $dir/dev-$i + done + + log_must check_pool_status $pool "errors" "No known data errors" + resilver_cksum=$(cksum_pool $pool) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $pool + log_fail 
"resilver cksum errors: $resilver_cksum" + fi + + log_must zpool clear $pool +} + +function test_scrub # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +for nparity in 1 2 3; do + raid=raidz$nparity + dir=$TEST_BASE_DIR + + log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} + log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + + typeset pool_size=$(get_pool_prop size $TESTPOOL) + + log_must zpool export $TESTPOOL + log_must zpool import -o cachefile=none -d $dir $TESTPOOL + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + test_selfheal 
$TESTPOOL $nparity $dir + test_resilver $TESTPOOL $nparity $dir + test_scrub $TESTPOOL $nparity $dir + + log_must zpool destroy "$TESTPOOL" +done + +log_pass "raidz redundancy test succeeded." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz1.ksh similarity index 90% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_raidz1.ksh index e25a48be8d..a73890e4cc 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz1.ksh @@ -29,6 +29,7 @@ # Copyright (c) 2013 by Delphix. All rights reserved. # +. $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/redundancy/redundancy.kshlib # @@ -41,16 +42,16 @@ # 3. Fill the filesystem with directories and files. # 4. Record all the files and directories checksum information. # 5. Damaged one of the virtual disk file. -# 6. Verify the data is correct to prove raidz can withstand 1 devicd is +# 6. Verify the data is correct to prove raidz can withstand 1 device is # failing. # verify_runnable "global" -log_assert "Verify raidz pool can withstand one device is failing." +log_assert "Verify raidz pool can withstand one device failing." log_onexit cleanup -typeset -i cnt=$(random 2 5) +typeset -i cnt=$(random_int_between 2 5) setup_test_env $TESTPOOL raidz $cnt # @@ -73,4 +74,4 @@ log_must recover_bad_missing_devs $TESTPOOL 1 remove_devs $TESTPOOL 1 log_must is_data_valid $TESTPOOL -log_pass "Raidz pool can withstand one devices is failing passed." +log_pass "raidz pool can withstand one device failing passed." 
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz2.ksh similarity index 91% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_raidz2.ksh index b16687dbe8..94b9b88251 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz2.ksh @@ -29,6 +29,7 @@ # Copyright (c) 2013 by Delphix. All rights reserved. # +. $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/redundancy/redundancy.kshlib # @@ -47,10 +48,10 @@ verify_runnable "global" -log_assert "Verify raidz2 pool can withstand two devices are failing." +log_assert "Verify raidz2 pool can withstand two devices failing." log_onexit cleanup -typeset -i cnt=$(random 3 5) +typeset -i cnt=$(random_int_between 3 5) setup_test_env $TESTPOOL raidz2 $cnt # @@ -80,4 +81,4 @@ for i in 1 2; do log_must recover_bad_missing_devs $TESTPOOL $i done -log_pass "Raidz2 pool can withstand two devices are failing passed." +log_pass "raidz2 pool can withstand two devices failing passed." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh new file mode 100755 index 0000000000..0a01c47106 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A raidz3 pool can withstand 3 devices failing or missing. +# +# STRATEGY: +# 1. Create N (4 or 5) virtual disk files. +# 2. Create raidz3 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most three of the virtual disk files. +# 6. Verify the data is correct to prove raidz3 can withstand 3 devices +# failing. +# + +verify_runnable "global" + +log_assert "Verify raidz3 pool can withstand three devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 4 5) +setup_test_env $TESTPOOL raidz3 $cnt + +# +# Inject data corruption errors for raidz3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad device errors for raidz3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for raidz3 pool +# +for i in 1 2 3; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "raidz3 pool can withstand three devices failing passed."
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_004_neg.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_stripe.ksh similarity index 89% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_004_neg.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_stripe.ksh index 01b819dc62..b2c4a85feb 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_004_neg.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_stripe.ksh @@ -29,6 +29,7 @@ # Copyright (c) 2013, 2016 by Delphix. All rights reserved. # +. $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/redundancy/redundancy.kshlib # @@ -50,17 +51,14 @@ verify_runnable "global" log_assert "Verify striped pool have no data redundancy." log_onexit cleanup -typeset -i cnt=$(random 2 5) +typeset -i cnt=$(random_int_between 2 5) setup_test_env $TESTPOOL "" $cnt damage_devs $TESTPOOL 1 "keep_label" -log_must zpool scrub $TESTPOOL +log_must zpool scrub -w $TESTPOOL -# Wait for the scrub to wrap, or is_healthy will be wrong. -while ! is_pool_scrubbed $TESTPOOL; do - sleep 1 -done - -log_mustnot is_healthy $TESTPOOL +if is_healthy $TESTPOOL ; then + log_fail "$TESTPOOL should not be healthy." +fi log_pass "Striped pool has no data redundancy as expected."
diff --git a/tests/zfs-tests/tests/functional/refquota/Makefile.am b/tests/zfs-tests/tests/functional/refquota/Makefile.am index 5f7c7b6869..1d8418fbbe 100644 --- a/tests/zfs-tests/tests/functional/refquota/Makefile.am +++ b/tests/zfs-tests/tests/functional/refquota/Makefile.am @@ -7,4 +7,6 @@ dist_pkgdata_SCRIPTS = \ refquota_003_pos.ksh \ refquota_004_pos.ksh \ refquota_005_pos.ksh \ - refquota_006_neg.ksh + refquota_006_neg.ksh \ + refquota_007_neg.ksh \ + refquota_008_neg.ksh diff --git a/tests/zfs-tests/tests/functional/refquota/refquota_007_neg.ksh b/tests/zfs-tests/tests/functional/refquota/refquota_007_neg.ksh new file mode 100755 index 0000000000..4f0393883b --- /dev/null +++ b/tests/zfs-tests/tests/functional/refquota/refquota_007_neg.ksh @@ -0,0 +1,61 @@ +#!/bin/ksh +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. + +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# refquota limits the amount of space a dataset can consume, +# snapshot rollback should be limited by refquota. +# +# STRATEGY: +# 1. Create a file in a filesystem +# 2. Create a snapshot of the filesystem +# 3. Remove the file +# 4. Set a refquota of size half of the file +# 5. Rollback the filesystem from the snapshot +# 6. 
Rollback should fail +# + +verify_runnable "both" + +function cleanup +{ + log_must zfs destroy -rf $TESTPOOL/$TESTFS + log_must zfs create $TESTPOOL/$TESTFS + log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS +} + +log_onexit cleanup + +TESTFILE='testfile' +FS=$TESTPOOL/$TESTFS + +mntpnt=$(get_prop mountpoint $FS) +log_must mkfile 20M $mntpnt/$TESTFILE +log_must zfs snapshot $FS@snap20M +log_must rm $mntpnt/$TESTFILE + +log_must sync + +log_must zfs set refquota=10M $FS +log_mustnot zfs rollback $FS@snap20M + +log_pass "The rollback to the snapshot was restricted by refquota." diff --git a/tests/zfs-tests/tests/functional/refquota/refquota_008_neg.ksh b/tests/zfs-tests/tests/functional/refquota/refquota_008_neg.ksh new file mode 100755 index 0000000000..6e4da3621c --- /dev/null +++ b/tests/zfs-tests/tests/functional/refquota/refquota_008_neg.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. + +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# refquota limits the amount of space a dataset can consume, +# This test verifies that zfs receive does not override +# refquota. +# +# STRATEGY: +# 1. Create a sub-filesystem $TESTSUBFS1 +# 2. Create a file in the sub-filesystem $TESTSUBFS1 +# 3. Create a snapshot of the sub-filesystem $TESTSUBFS1 +# 4. Create another sub-filesystem $TESTSUBFS2 +# 5. Apply a refquota value to $TESTSUBFS2, +# half the sub-filesystem $TESTSUBFS1 file size +# 6. 
Verify that zfs receive of the snapshot of $TESTSUBFS1 +# fails due to refquota +# + +verify_runnable "both" + +oldvalue=$(get_tunable SPA_ASIZE_INFLATION) +function cleanup +{ + set_tunable32 SPA_ASIZE_INFLATION $oldvalue + log_must zfs destroy -rf $TESTPOOL/$TESTFS + log_must zfs create $TESTPOOL/$TESTFS + log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS +} + +log_onexit cleanup + +set_tunable32 SPA_ASIZE_INFLATION 2 + +TESTFILE='testfile' +FS=$TESTPOOL/$TESTFS +log_must zfs create $FS/$TESTSUBFS1 +log_must zfs create $FS/$TESTSUBFS2 + +mntpnt1=$(get_prop mountpoint $FS/$TESTSUBFS1) +mntpnt2=$(get_prop mountpoint $FS/$TESTSUBFS2) + +log_must mkfile 200M $mntpnt1/$TESTFILE +log_must zfs snapshot $FS/$TESTSUBFS1@snap200m + +log_must zfs set refquota=10M $FS/$TESTSUBFS2 +log_mustnot eval "zfs send $FS/$TESTSUBFS1@snap200m |" \ + "zfs receive -F $FS/$TESTSUBFS2" + +log_pass "ZFS receive does not override refquota" + diff --git a/tests/zfs-tests/tests/functional/refreserv/Makefile.am b/tests/zfs-tests/tests/functional/refreserv/Makefile.am index 96f25d444e..bd760a1f06 100644 --- a/tests/zfs-tests/tests/functional/refreserv/Makefile.am +++ b/tests/zfs-tests/tests/functional/refreserv/Makefile.am @@ -6,7 +6,9 @@ dist_pkgdata_SCRIPTS = \ refreserv_002_pos.ksh \ refreserv_003_pos.ksh \ refreserv_004_pos.ksh \ - refreserv_005_pos.ksh + refreserv_005_pos.ksh \ + refreserv_multi_raidz.ksh \ + refreserv_raidz.ksh dist_pkgdata_DATA = \ refreserv.cfg diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_002_pos.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_002_pos.ksh index d4c0a4faeb..a8f58631f7 100755 --- a/tests/zfs-tests/tests/functional/refreserv/refreserv_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_002_pos.ksh @@ -50,11 +50,9 @@ function cleanup if is_global_zone ; then log_must zfs set refreservation=none $TESTPOOL - if datasetexists $TESTPOOL@snap ; then - log_must zfs destroy -f $TESTPOOL@snap - fi + 
datasetexists $TESTPOOL@snap && destroy_dataset $TESTPOOL@snap -f fi - log_must zfs destroy -rf $TESTPOOL/$TESTFS + destroy_dataset $TESTPOOL/$TESTFS -rf log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_003_pos.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_003_pos.ksh index da36609f2c..3e5a78cf94 100755 --- a/tests/zfs-tests/tests/functional/refreserv/refreserv_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_003_pos.ksh @@ -38,7 +38,7 @@ # space outside of this refreservation. # # STRATEGY: -# 1. Setting quota and refservation +# 1. Setting quota and refreservation # 2. Verify snapshot can be created, when used =< quota - refreserv # 3. Verify failed to create snapshot, when used > quota - refreserv # diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh index 8c044eca59..1ccc9828d4 100755 --- a/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh @@ -45,9 +45,9 @@ verify_runnable "global" function cleanup { - log_must zfs destroy -rf $TESTPOOL/$TESTFS - log_must zfs create $TESTPOOL/$TESTFS - log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS + destroy_dataset "$fs" "-rf" + log_must zfs create $fs + log_must zfs set mountpoint=$TESTDIR $fs } log_assert "Volume (ref)reservation is not limited by volsize" diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh new file mode 100755 index 0000000000..ff79764bab --- /dev/null +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh @@ -0,0 +1,201 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License 
("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/refreserv/refreserv.cfg + +# +# DESCRIPTION: +# raidz refreservation=auto picks worst raidz vdev +# +# STRATEGY: +# 1. Create a pool with a single raidz vdev +# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k] +# - create a volume +# - remember its refreservation +# - destroy the volume +# 3. Destroy the pool +# 4. Recreate the pool with one more disk in the vdev, then repeat steps +# 2 and 3. +# +# NOTES: +# 1. This test will use up to 14 disks but can cover the key concepts with +# 5 disks. +# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely. +# + +verify_runnable "global" + +typeset -a alldisks=($DISKS) + +# The larger the volsize, the better zvol_volsize_to_reservation() is at +# guessing the right number - though it is horrible with tiny blocks. At 10M on +# ashift=12, the estimate may be over 26% too high. +volsize=100 + +function cleanup +{ + default_cleanup_noexit + default_setup_noexit "${alldisks[0]}" +} + +log_assert "raidz refreservation=auto picks worst raidz vdev" +log_onexit cleanup + +poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + +# Testing tiny block sizes on ashift=12 pools causes so much size inflation +# that small test disks may fill before creating small volumes. However, +# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for +# testing the problems that arise from 4K and 8K blocks on ashift=12 pools. 
+if is_freebsd; then + bps=$(diskinfo -v ${alldisks[0]} | awk '/sectorsize/ { print $1 }') +elif is_linux; then + bps=$(lsblk -nrdo min-io /dev/${alldisks[0]}) +fi +case "$bps" in +512) + allshifts=(9 10 17) + ;; +4096) + allshifts=(12 13 17) + ;; +*) + log_fail "bytes/sector: $bps != (512|4096)" + ;; +esac +log_note "Testing in ashift=${allshifts[0]} mode" + +typeset -A sizes= + +# +# Determine the refreservation for a $volsize MiB volume on each raidz type at +# various block sizes. +# +for parity in 1 2 3; do + raid=raidz$parity + typeset -A sizes["$raid"] + + # Ensure we hit scenarios with and without skip blocks + for ndisks in $((parity * 2)) $((parity * 2 + 1)); do + typeset -a disks=(${alldisks[0..$((ndisks - 1))]}) + + if (( ${#disks[@]} < ndisks )); then + log_note "Too few disks to test $raid-$ndisks" + continue + fi + + typeset -A sizes["$raid"]["$ndisks"] + + log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}" + + for bits in "${allshifts[@]}"; do + vbs=$((1 << bits)) + log_note "Gathering refreservation for $raid-$ndisks" \ + "volblocksize=$vbs" + + vol=$TESTPOOL/$TESTVOL + log_must zfs create -V ${volsize}m \ + -o volblocksize=$vbs "$vol" + + refres=$(zfs get -Hpo value refreservation "$vol") + log_must test -n "$refres" + sizes["$raid"]["$ndisks"]["$vbs"]=$refres + + log_must_busy zfs destroy "$vol" + done + + log_must_busy zpool destroy "$TESTPOOL" + done +done + +# A little extra info is always helpful when diagnosing problems. To +# pretty-print what you find in the log, do this in ksh: +# typeset -A sizes=(...) +# print -v sizes +log_note "sizes=$(print -C sizes)" + +# +# Helper function for checking that refreservation is calculated properly in +# multi-vdev pools. "Properly" is defined as assuming that all vdevs are as +# space inefficient as the worst one. 
+# +function check_vdevs { + typeset raid=$1 + typeset nd1=$2 + typeset nd2=$3 + typeset -a disks1 disks2 + typeset vbs vol refres refres1 refres2 bits + + disks1=(${alldisks[0..$((nd1 - 1))]}) + disks2=(${alldisks[$nd1..$((nd1 + nd2 - 1))]}) + if (( ${#disks2[@]} < nd2 )); then + log_note "Too few disks to test $raid-$nd1 + $raid-$nd2" + return + fi + + log_must zpool create -f "$TESTPOOL" \ + "$raid" "${disks1[@]}" "$raid" "${disks2[@]}" + + for bits in "${allshifts[@]}"; do + vbs=$((1 << bits)) + log_note "Verifying $raid-$nd1 $raid-$nd2 volblocksize=$vbs" + + vol=$TESTPOOL/$TESTVOL + log_must zfs create -V ${volsize}m -o volblocksize=$vbs "$vol" + refres=$(zfs get -Hpo value refreservation "$vol") + log_must test -n "$refres" + + refres1=${sizes["$raid"]["$nd1"]["$vbs"]} + refres2=${sizes["$raid"]["$nd2"]["$vbs"]} + + if (( refres1 > refres2 )); then + log_note "Expecting refres ($refres) to match refres" \ + "from $raid-$nd1 ($refres1)" + log_must test "$refres" -eq "$refres1" + else + log_note "Expecting refres ($refres) to match refres" \ + "from $raid-$nd2 ($refres2)" + log_must test "$refres" -eq "$refres2" + fi + + log_must zfs destroy "$vol" + done + + log_must zpool destroy "$TESTPOOL" +} + +# +# Verify that multi-vdev pools use the least optimistic size for all the +# permutations within a particular raidz variant. +# +for raid in "${!sizes[@]}"; do + # ksh likes to create a [0] item for us. Thanks, ksh! + [[ $raid == "0" ]] && continue + + for nd1 in "${!sizes["$raid"][@]}"; do + # And with an empty array we get one key, ''. Thanks, ksh!
+ [[ $nd1 == "0" || -z "$nd1" ]] && continue + + for nd2 in "${!sizes["$raid"][@]}"; do + [[ $nd2 == "0" || -z "$nd2" ]] && continue + + check_vdevs "$raid" "$nd1" "$nd2" + done + done +done + +log_pass "raidz refreservation=auto picks worst raidz vdev" diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh new file mode 100755 index 0000000000..22891ef1d5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh @@ -0,0 +1,135 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/refreserv/refreserv.cfg + +# +# DESCRIPTION: +# raidz refreservation=auto accounts for extra parity and skip blocks +# +# STRATEGY: +# 1. Create a pool with a single raidz vdev +# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k] +# - create a volume +# - fully overwrite it +# - verify that referenced is less than or equal to reservation +# - destroy the volume +# 3. Destroy the pool +# 4. Recreate the pool with one more disk in the vdev, then repeat steps +# 2 and 3. +# 5. Repeat all steps above for raidz2 and raidz3. +# +# NOTES: +# 1. This test will use up to 14 disks but can cover the key concepts with +# 5 disks. +# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely. +# + +verify_runnable "global" + +typeset -a alldisks=($DISKS) + +# The larger the volsize, the better zvol_volsize_to_reservation() is at +# guessing the right number. 
At 10M on ashift=12, the estimate may be over 26% +# too high. +volsize=100 + +function cleanup +{ + default_cleanup_noexit + default_setup_noexit "${alldisks[0]}" +} + +log_assert "raidz refreservation=auto accounts for extra parity and skip blocks" +log_onexit cleanup + +poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + +# Testing tiny block sizes on ashift=12 pools causes so much size inflation +# that small test disks may fill before creating small volumes. However, +# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for +# testing the problems that arise from 4K and 8K blocks on ashift=12 pools. +if is_freebsd; then + bps=$(diskinfo -v ${alldisks[0]} | awk '/sectorsize/ { print $1 }') +elif is_linux; then + bps=$(lsblk -nrdo min-io /dev/${alldisks[0]}) +fi +log_must test "$bps" -eq 512 -o "$bps" -eq 4096 +case "$bps" in +512) + allshifts=(9 10 17) + maxpct=151 + ;; +4096) + allshifts=(12 13 17) + maxpct=110 + ;; +*) + log_fail "bytes/sector: $bps != (512|4096)" + ;; +esac +log_note "Testing in ashift=${allshifts[0]} mode" + +# This loop handles all iterations of steps 1 through 4 described in strategy +# comment above, +for parity in 1 2 3; do + raid=raidz$parity + + # Ensure we hit scenarios with and without skip blocks + for ndisks in $((parity * 2)) $((parity * 2 + 1)); do + typeset -a disks=(${alldisks[0..$((ndisks - 1))]}) + + if (( ${#disks[@]} < ndisks )); then + log_note "Too few disks to test $raid-$ndisks" + continue + fi + + log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}" + + for bits in "${allshifts[@]}"; do + vbs=$((1 << bits)) + log_note "Testing $raid-$ndisks volblocksize=$vbs" + + vol=$TESTPOOL/$TESTVOL + log_must zfs create -V ${volsize}m \ + -o volblocksize=$vbs "$vol" + block_device_wait "/dev/zvol/$vol" + log_must dd if=/dev/zero of=/dev/zvol/$vol \ + bs=1024k count=$volsize + sync + + ref=$(zfs get -Hpo value referenced "$vol") + refres=$(zfs get -Hpo value refreservation "$vol") + log_must test 
-n "$ref" + log_must test -n "$refres" + + typeset -F2 deltapct=$((refres * 100.0 / ref)) + log_note "$raid-$ndisks refreservation $refres" \ + "is $deltapct% of referenced $ref" + + log_must test "$ref" -le "$refres" + log_must test "$deltapct" -le $maxpct + + log_must_busy zfs destroy "$vol" + block_device_wait + done + + log_must_busy zpool destroy "$TESTPOOL" + done +done + +log_pass "raidz refreservation=auto accounts for extra parity and skip blocks" diff --git a/tests/zfs-tests/tests/functional/removal/Makefile.am b/tests/zfs-tests/tests/functional/removal/Makefile.am index ba42b899ac..878935b96d 100644 --- a/tests/zfs-tests/tests/functional/removal/Makefile.am +++ b/tests/zfs-tests/tests/functional/removal/Makefile.am @@ -10,25 +10,27 @@ # # -# Copyright (c) 2014, 2015 by Delphix. All rights reserved. +# Copyright (c) 2014, 2019 by Delphix. All rights reserved. # pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/removal dist_pkgdata_SCRIPTS = \ - cleanup.ksh removal_all_vdev.ksh removal_check_space.ksh \ - removal_condense_export.ksh removal_multiple_indirection.ksh \ - removal_remap_deadlists.ksh removal_remap.ksh \ + cleanup.ksh removal_all_vdev.ksh removal_cancel.ksh \ + removal_check_space.ksh removal_condense_export.ksh \ + removal_multiple_indirection.ksh \ + removal_nopwrite.ksh removal_remap_deadlists.ksh \ removal_reservation.ksh removal_resume_export.ksh \ removal_sanity.ksh removal_with_add.ksh removal_with_create_fs.ksh \ removal_with_dedup.ksh removal_with_errors.ksh \ removal_with_export.ksh removal_with_faulted.ksh \ - removal_with_ganging.ksh removal_with_remap.ksh \ + removal_with_ganging.ksh \ removal_with_remove.ksh removal_with_scrub.ksh \ removal_with_send.ksh removal_with_send_recv.ksh \ removal_with_snapshot.ksh removal_with_write.ksh \ removal_with_zdb.ksh remove_mirror.ksh remove_mirror_sanity.ksh \ - remove_raidz.ksh remove_expanded.ksh + remove_raidz.ksh remove_expanded.ksh remove_indirect.ksh \ +
remove_attach_mirror.ksh dist_pkgdata_DATA = \ removal.kshlib diff --git a/tests/zfs-tests/tests/functional/removal/removal.kshlib b/tests/zfs-tests/tests/functional/removal/removal.kshlib index fa0174db05..140ac38ad8 100644 --- a/tests/zfs-tests/tests/functional/removal/removal.kshlib +++ b/tests/zfs-tests/tests/functional/removal/removal.kshlib @@ -28,15 +28,13 @@ function wait_for_removal # pool typeset pool=$1 typeset callback=$2 - while is_pool_removing $pool; do - sleep 1 - done + log_must zpool wait -t remove $pool # # The pool state changes before the TXG finishes syncing; wait for # the removal to be completed on disk. # - sync_pool + sync_pool $pool log_must is_pool_removed $pool return 0 @@ -62,7 +60,8 @@ function attempt_during_removal # pool disk callback [args] typeset callback=$3 shift 3 - set_tunable32 zfs_removal_suspend_progress 1 + log_onexit_push set_tunable32 REMOVAL_SUSPEND_PROGRESS 0 + set_tunable32 REMOVAL_SUSPEND_PROGRESS 1 log_must zpool remove $pool $disk @@ -81,7 +80,8 @@ function attempt_during_removal # pool disk callback [args] # log_must is_pool_removing $pool - set_tunable32 zfs_removal_suspend_progress 0 + set_tunable32 REMOVAL_SUSPEND_PROGRESS 0 + log_onexit_pop log_must wait_for_removal $pool log_mustnot vdevs_in_pool $pool $disk @@ -99,7 +99,7 @@ function random_write # file write_size { typeset file=$1 typeset block_size=$2 - typeset file_size=$(stat -c%s $file 2>/dev/null) + typeset file_size=$(stat_size $file 2>/dev/null) typeset nblocks=$((file_size / block_size)) [[ -w $file ]] || return 1 diff --git a/tests/zfs-tests/tests/functional/removal/removal_cancel.ksh b/tests/zfs-tests/tests/functional/removal/removal_cancel.ksh new file mode 100755 index 0000000000..e97dc5e77a --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/removal_cancel.ksh @@ -0,0 +1,94 @@ +#! 
/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# +# Ensure that cancelling a removal midway does not cause any +# issues, such as a panic. +# +# STRATEGY: +# +# 1. Create a pool with one vdev and do some writes on it. +# 2. Add a new vdev to the pool and start the removal of +# the first vdev. +# 3. Cancel the removal after some segments have been copied +# over to the new vdev. +# 4. Run zdb to ensure the on-disk state of the pool is ok. +# + +function cleanup +{ + # + # Reset tunable. + # + log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 0 +} +log_onexit cleanup + +SAMPLEFILE=/$TESTDIR/00 + +# +# Create pool with one disk. +# +log_must default_setup_noexit "$REMOVEDISK" + +# +# Create a file of size 1GB and then do some random writes. +# Since randwritecomp does 8K writes we do 25000 writes +# which means we write ~200MB to the vdev. +# +log_must mkfile -n 1g $SAMPLEFILE +log_must randwritecomp $SAMPLEFILE 25000 + +# +# Add second device where all the data will be evacuated. +# +log_must zpool add -f $TESTPOOL $NOTREMOVEDISK + +# +# Block removal. +# +log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 1 + +# +# Start removal. +# +log_must zpool remove $TESTPOOL $REMOVEDISK + +# +# Only for debugging purposes in test logs. +# +log_must zpool status $TESTPOOL + +# +# Cancel removal. +# +log_must zpool remove -s $TESTPOOL + +# +# Verify on-disk state.
+# +log_must zdb $TESTPOOL + +log_pass "Device removal thread cancelled successfully." diff --git a/tests/zfs-tests/tests/functional/removal/removal_condense_export.ksh b/tests/zfs-tests/tests/functional/removal/removal_condense_export.ksh index ad33caec84..8de17ff2e8 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_condense_export.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_condense_export.ksh @@ -21,21 +21,19 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/removal/removal.kshlib -if is_linux; then - log_unsupported "ZDB fails during concurrent pool activity." -fi - function reset { - log_must set_tunable64 zfs_condense_indirect_commit_entry_delay_ms 0 - log_must set_tunable64 zfs_condense_min_mapping_bytes 131072 + log_must set_tunable64 CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS 0 + log_must set_tunable64 CONDENSE_INDIRECT_OBSOLETE_PCT 25 + log_must set_tunable64 CONDENSE_MIN_MAPPING_BYTES 131072 default_cleanup_noexit } default_setup_noexit "$DISKS" "true" log_onexit reset -log_must set_tunable64 zfs_condense_indirect_commit_entry_delay_ms 1000 -log_must set_tunable64 zfs_condense_min_mapping_bytes 1 +log_must set_tunable64 CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS 5000 +log_must set_tunable64 CONDENSE_INDIRECT_OBSOLETE_PCT 5 +log_must set_tunable64 CONDENSE_MIN_MAPPING_BYTES 1 log_must zfs set recordsize=512 $TESTPOOL/$TESTFS @@ -77,9 +75,16 @@ log_must zpool remove $TESTPOOL $REMOVEDISK log_must wait_for_removal $TESTPOOL log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK -log_must zfs remap $TESTPOOL/$TESTFS +# +# Touch one block under each L1 indirect block, so that the other data blocks +# will be remapped to their concrete locations. These parameters assume +# recordsize=512, indirect block size of 128K (1024 block pointers per +# indirect block), and file size of less than 20*1024 blocks (10MB). 
+# +log_must stride_dd -i /dev/urandom -o $TESTDIR/file -b 512 -c 20 -s 1024 + sync_pool $TESTPOOL -sleep 5 +sleep 4 sync_pool $TESTPOOL log_must zpool export $TESTPOOL zdb -e -p $REMOVEDISKPATH $TESTPOOL | grep 'Condensing indirect vdev' || \ diff --git a/tests/zfs-tests/tests/functional/removal/removal_multiple_indirection.ksh b/tests/zfs-tests/tests/functional/removal/removal_multiple_indirection.ksh index 97b67a4626..6c52fd7819 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_multiple_indirection.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_multiple_indirection.ksh @@ -57,14 +57,14 @@ function cleanup default_cleanup_noexit log_must rm -f $DISKS - # reset zfs_remove_max_segment to 1M - set_tunable32 zfs_remove_max_segment 1048576 + # reset REMOVE_MAX_SEGMENT to 1M + set_tunable32 REMOVE_MAX_SEGMENT 1048576 } log_onexit cleanup -# set zfs_remove_max_segment to 32k -log_must set_tunable32 zfs_remove_max_segment 32768 +# set REMOVE_MAX_SEGMENT to 32k +log_must set_tunable32 REMOVE_MAX_SEGMENT 32768 log_must dd if=/dev/urandom of=$TESTDIR/$TESTFILE0 bs=128k count=1 FILE_CONTENTS=$(<$TESTDIR/$TESTFILE0) diff --git a/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh b/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh new file mode 100755 index 0000000000..cede81ad60 --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh @@ -0,0 +1,87 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib +. $STF_SUITE/tests/functional/nopwrite/nopwrite.shlib + +default_setup_noexit "$DISKS" +log_onexit default_cleanup_noexit +BLOCKSIZE=8192 + +origin="$TESTPOOL/$TESTFS" + +log_must zfs set compress=on $origin +log_must zfs set checksum=skein $origin + +log_must zfs set recordsize=8k $origin +dd if=/dev/urandom of=$TESTDIR/file_8k bs=1024k count=$MEGS oflag=sync \ + conv=notrunc >/dev/null 2>&1 || log_fail "dd into $TESTDIR/file failed." +log_must zfs set recordsize=128k $origin +dd if=/dev/urandom of=$TESTDIR/file_128k bs=1024k count=$MEGS oflag=sync \ + conv=notrunc >/dev/null 2>&1 || log_fail "dd into $TESTDIR/file failed." + +zfs snapshot $origin@a || log_fail "zfs snap failed" +log_must zfs clone $origin@a $origin/clone + +# +# Verify that nopwrites work prior to removal +# +log_must zfs set recordsize=8k $origin/clone +dd if=/$TESTDIR/file_8k of=/$TESTDIR/clone/file_8k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." +log_must verify_nopwrite $origin $origin@a $origin/clone + +log_must zfs set recordsize=128k $origin/clone +dd if=/$TESTDIR/file_128k of=/$TESTDIR/clone/file_128k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." +log_must verify_nopwrite $origin $origin@a $origin/clone + +# +# Remove a device before testing nopwrites again +# +log_must zpool remove $TESTPOOL $REMOVEDISK +log_must wait_for_removal $TESTPOOL +log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK + +# +# Normally, we expect nopwrites to avoid allocating new blocks, but +# after a device has been removed the DVAs will get remapped when +# a L0's indirect block is written. This will negate the effects +# of nopwrite and should result in new allocations. 
+# + +# +# Perform a direct zil nopwrite test +# +log_must zfs set recordsize=8k $origin/clone +dd if=/$TESTDIR/file_8k of=/$TESTDIR/clone/file_8k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." +log_mustnot verify_nopwrite $origin $origin@a $origin/clone + +# +# Perform an indirect zil nopwrite test +# +log_must zfs set recordsize=128k $origin/clone +dd if=/$TESTDIR/file_128k of=/$TESTDIR/clone/file_128k bs=1024k \ + oflag=sync conv=notrunc >/dev/null 2>&1 || log_fail "dd failed." +log_mustnot verify_nopwrite $origin $origin@a $origin/clone + +log_pass "Remove works with nopwrite." diff --git a/tests/zfs-tests/tests/functional/removal/removal_remap.ksh b/tests/zfs-tests/tests/functional/removal/removal_remap.ksh deleted file mode 100755 index 5239ef3a5e..0000000000 --- a/tests/zfs-tests/tests/functional/removal/removal_remap.ksh +++ /dev/null @@ -1,126 +0,0 @@ -#! /bin/ksh -p -# -# CDDL HEADER START -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# -# CDDL HEADER END -# - -# -# Copyright (c) 2015, 2016 by Delphix. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/removal/removal.kshlib - -# N.B. The 'zfs remap' command has been disabled and may be removed. 
-export ZFS_REMAP_ENABLED=YES - -default_setup_noexit "$DISKS" - - -function cleanup -{ - set_tunable64 zfs_condense_min_mapping_bytes 131072 - default_cleanup_noexit -} - -log_onexit cleanup - -log_must set_tunable64 zfs_condense_min_mapping_bytes 1 - -log_must zfs set recordsize=512 $TESTPOOL/$TESTFS - -# -# Create a large file so that we know some of the blocks will be on the -# removed device, and hence eligible for remapping. -# -log_must dd if=/dev/urandom of=$TESTDIR/file bs=$((2**12)) count=$((2**9)) - -# -# Randomly rewrite some of blocks in the file so that there will be holes and -# we will not be able to remap the entire file in a few huge chunks. -# -for i in $(seq $((2**12))); do - # - # We have to sync periodically so that all the writes don't end up in - # the same txg. If they were all in the same txg, only the last write - # would go through and we would not have as many allocations to - # fragment the file. - # - ((i % 100 > 0 )) || sync_pool || log_fail "Could not sync." - random_write $TESTDIR/file $((2**9)) || \ - log_fail "Could not random write." -done - -# -# Remap should quietly succeed as a noop before a removal. -# -log_must zfs remap $TESTPOOL/$TESTFS -remaptxg_before=$(zfs get -H -o value remaptxg $TESTPOOL/$TESTFS) -(( $? == 0 )) || log_fail "Could not get remaptxg." -[[ $remaptxg_before == "-" ]] || \ - log_fail "remaptxg ($remaptxg_before) had value before a removal" - -log_must zpool remove $TESTPOOL $REMOVEDISK -log_must wait_for_removal $TESTPOOL -log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK - -# -# remaptxg should not be set if we haven't done a remap. -# -remaptxg_before=$(zfs get -H -o value remaptxg $TESTPOOL/$TESTFS) -(( $? == 0 )) || log_fail "Could not get remaptxg." -[[ $remaptxg_before == "-" ]] || \ - log_fail "remaptxg ($remaptxg_before) had value before a removal" - -mapping_size_before=$(indirect_vdev_mapping_size $TESTPOOL) -log_must zfs remap $TESTPOOL/$TESTFS - -# Try to wait for a condense to finish. 
-for i in {1..5}; do - sleep 5 - sync_pool -done -mapping_size_after=$(indirect_vdev_mapping_size $TESTPOOL) - -# -# After the remap, there should not be very many blocks referenced. The reason -# why our threshold is as high as 512 is because our ratio of metadata to -# user data is relatively high, with only 64M of user data on the file system. -# -(( mapping_size_after < mapping_size_before )) || \ - log_fail "Mapping size did not decrease after remap: " \ - "$mapping_size_before before to $mapping_size_after after." -(( mapping_size_after < 512 )) || \ - log_fail "Mapping size not small enough after remap: " \ - "$mapping_size_before before to $mapping_size_after after." - -# -# After a remap, the remaptxg should be set to a non-zero value. -# -remaptxg_after=$(zfs get -H -o value remaptxg $TESTPOOL/$TESTFS) -(( $? == 0 )) || log_fail "Could not get remaptxg." -log_note "remap txg after remap is $remaptxg_after" -(( remaptxg_after > 0 )) || log_fail "remaptxg not increased" - -# -# Remap should quietly succeed as a noop if there have been no removals since -# the last remap. -# -log_must zfs remap $TESTPOOL/$TESTFS -remaptxg_again=$(zfs get -H -o value remaptxg $TESTPOOL/$TESTFS) -(( $? == 0 )) || log_fail "Could not get remaptxg." -log_note "remap txg after second remap is $remaptxg_again" -(( remaptxg_again == remaptxg_after )) || \ - log_fail "remap not noop if there has been no removal" - -log_pass "Remapping a fs caused mapping size to decrease." diff --git a/tests/zfs-tests/tests/functional/removal/removal_remap_deadlists.ksh b/tests/zfs-tests/tests/functional/removal/removal_remap_deadlists.ksh index a2f6580b4f..9348022866 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_remap_deadlists.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_remap_deadlists.ksh @@ -21,9 +21,6 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/removal/removal.kshlib -# N.B. The 'zfs remap' command has been disabled and may be removed. 
-export ZFS_REMAP_ENABLED=YES - default_setup_noexit "$DISKS" log_onexit default_cleanup_noexit @@ -37,7 +34,7 @@ log_must zfs snapshot $TESTPOOL/$TESTFS@snap-pre2 log_must dd if=/dev/zero of=$TESTDIR/file bs=1024k count=100 \ conv=notrunc seek=200 -if is_linux; then +if is_linux || is_freebsd; then log_must attempt_during_removal $TESTPOOL $REMOVEDISK zdb -cd $TESTPOOL else log_must attempt_during_removal $TESTPOOL $REMOVEDISK @@ -45,7 +42,14 @@ fi log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK log_must zdb -cd $TESTPOOL -log_must zfs remap $TESTPOOL/$TESTFS +# +# Touch one block under each L1 indirect block, so that the other data blocks +# will be remapped to their concrete locations. These parameters assume +# recordsize=128K, indirect block size of 128K (1024 block pointers per +# indirect block), and file size of less than 3*1024 blocks (384MB). +# +log_must stride_dd -i /dev/urandom -o $TESTDIR/file -b 131072 -c 3 -s 1024 + log_must zdb -cd $TESTPOOL log_must zfs snapshot $TESTPOOL/$TESTFS@snap-post3 diff --git a/tests/zfs-tests/tests/functional/removal/removal_resume_export.ksh b/tests/zfs-tests/tests/functional/removal/removal_resume_export.ksh index 4f1e63cd06..142e72754b 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_resume_export.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_resume_export.ksh @@ -43,13 +43,28 @@ function cleanup { - log_must zinject -c all + zinject -c all default_cleanup_noexit } function callback { + # + # Inject an error so export fails after having just suspended + # the removal thread. [spa_inject_ref gets incremented] + # + log_must zinject -d $REMOVEDISK -D 10:1 $TESTPOOL + + # + # Because of the above error export should fail. + # log_mustnot zpool export $TESTPOOL + + # + # Let the removal finish. 
+ # + log_must zinject -c all + return 0 } @@ -78,13 +93,7 @@ log_must dd if=/dev/urandom of=$TESTDIR/$TESTFILE0 bs=64M count=32 log_must zpool add -f $TESTPOOL $NOTREMOVEDISK # -# Inject an error so export fails after having just suspended -# the removal thread. [spa_inject_ref gets incremented] -# -log_must zinject -d $REMOVEDISK -D 10:1 $TESTPOOL - -# -# Because of the above error export should fail. +# Attempt the export with errors injected. # log_must attempt_during_removal $TESTPOOL $REMOVEDISK callback diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh index 2ef56706a2..9d5143ef8b 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh @@ -64,9 +64,7 @@ function wait_for_removing_cancel { typeset pool=$1 - while is_pool_removing $pool; do - sleep 1 - done + log_must zpool wait -t remove $pool # # The pool state changes before the TXG finishes syncing; wait for @@ -85,8 +83,11 @@ FILE_CONTENTS="Leeloo Dallas mul-ti-pass." echo $FILE_CONTENTS >$TESTDIR/$TESTFILE0 log_must [ "x$(<$TESTDIR/$TESTFILE0)" = "x$FILE_CONTENTS" ] -log_must file_write -o create -f $TESTDIR/$TESTFILE1 -b $((2**20)) -c $((2**7)) -sync_pool $TESTPOOL +log_must file_write -o create -f $TESTDIR/$TESTFILE1 -b $((2**20)) -c $((2**8)) + +# Flush the ARC to minimize cache effects. +log_must zpool export $TESTPOOL +log_must zpool import -d $TMPDIR $TESTPOOL # Verify that unexpected read errors automatically cancel the removal. log_must zinject -d $DISK0 -e io -T all -f 100 $TESTPOOL @@ -95,6 +96,10 @@ log_must wait_for_removing_cancel $TESTPOOL log_must vdevs_in_pool $TESTPOOL mirror-0 log_must zinject -c all +# Flush the ARC to minimize cache effects. +log_must zpool export $TESTPOOL +log_must zpool import -d $TMPDIR $TESTPOOL + # Verify that unexpected write errors automatically cancel the removal. 
log_must zinject -d $DISK3 -e io -T all -f 100 $TESTPOOL log_must zpool remove $TESTPOOL mirror-0 diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh index 0ec358aadb..f76f76d34f 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh @@ -26,7 +26,7 @@ log_onexit default_cleanup_noexit function callback { - is_linux && test_removal_with_operation_kill + test_removal_with_operation_kill log_must zpool export $TESTPOOL # diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_ganging.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_ganging.ksh index 35c90e6a56..e3e635998e 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_with_ganging.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_with_ganging.ksh @@ -23,12 +23,12 @@ function cleanup { - log_must set_tunable64 metaslab_force_ganging $((2**17 + 1)) + log_must set_tunable64 METASLAB_FORCE_GANGING $((2**17 + 1)) default_cleanup_noexit } default_setup_noexit "$DISKS" -log_must set_tunable64 metaslab_force_ganging $((2**14)) +log_must set_tunable64 METASLAB_FORCE_GANGING $((2**14)) log_onexit cleanup FILE_CONTENTS="Leeloo Dallas mul-ti-pass." 
diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_send.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_send.ksh index 59e66aca52..a082478381 100755 --- a/tests/zfs-tests/tests/functional/removal/removal_with_send.ksh +++ b/tests/zfs-tests/tests/functional/removal/removal_with_send.ksh @@ -28,7 +28,7 @@ function callback { create_snapshot $TESTPOOL/$TESTFS $TESTSNAP log_must ksh -c \ - "zfs send $TESTPOOL/$TESTFS@$TESTSNAP >/dev/null" + "zfs send $TESTPOOL/$TESTFS@$TESTSNAP >$TEST_BASE_DIR/devnull" return 0 } diff --git a/tests/zfs-tests/tests/functional/removal/remove_attach_mirror.ksh b/tests/zfs-tests/tests/functional/removal/remove_attach_mirror.ksh new file mode 100755 index 0000000000..9bbb07cd94 --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/remove_attach_mirror.ksh @@ -0,0 +1,73 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# Resilvering results in no CKSUM errors in pools with indirect vdevs. +# +# STRATEGY: +# 1. Create a pool with two top-vdevs +# 2. Write some files +# 3. Remove one of the top-vdevs +# 4. 
Reattach it to make a mirror +# + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} + +DISK1="$TMPDIR/dsk1" +DISK2="$TMPDIR/dsk2" +DISKS="$DISK1 $DISK2" + +# fio options +export DIRECTORY=/$TESTPOOL +export NUMJOBS=16 +export RUNTIME=10 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=4K +export SYNC_TYPE=0 +export DIRECT=1 +export FILE_SIZE=128M + +log_must mkfile 4g $DISK1 +log_must mkfile 4g $DISK2 + +function cleanup +{ + default_cleanup_noexit + log_must rm -f $DISKS +} + +log_must zpool create -O recordsize=4k $TESTPOOL $DISK1 $DISK2 +log_onexit cleanup + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/sequential_reads.fio + +log_must zpool remove -w $TESTPOOL $DISK2 +log_must zpool attach -w $TESTPOOL $DISK1 $DISK2 + +verify_pool $TESTPOOL + +log_pass "Resilvering results in no CKSUM errors with indirect vdevs" diff --git a/tests/zfs-tests/tests/functional/removal/remove_indirect.ksh b/tests/zfs-tests/tests/functional/removal/remove_indirect.ksh new file mode 100755 index 0000000000..c4ba0d9ac5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/remove_indirect.ksh @@ -0,0 +1,58 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2019, loli10K . All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# Device removal cannot remove non-concrete vdevs +# +# STRATEGY: +# 1. Create a pool with removable devices +# 2. Remove a top-level device +# 3. 
Verify we can't remove the "indirect" vdev created by the first removal +# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL + log_must rm -f $TEST_BASE_DIR/device-{1,2,3} +} + +log_assert "Device removal should not be able to remove non-concrete vdevs" +log_onexit cleanup + +# 1. Create a pool with removable devices +log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/device-{1,2,3} +log_must zpool create $TESTPOOL $TEST_BASE_DIR/device-{1,2,3} + +# 2. Remove a top-level device +log_must zpool remove $TESTPOOL $TEST_BASE_DIR/device-1 +log_must wait_for_removal $TESTPOOL + +# 3. Verify we can't remove the "indirect" vdev created by the first removal +INDIRECT_VDEV=$(zpool list -v -g $TESTPOOL | awk '{if ($2 == "-") { print $1; exit} }') +log_must test -n "$INDIRECT_VDEV" +log_mustnot zpool remove $TESTPOOL $INDIRECT_VDEV + +log_pass "Device removal cannot remove non-concrete vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/Makefile.am b/tests/zfs-tests/tests/functional/replacement/Makefile.am index d47fcd5e1b..fe6e491219 100644 --- a/tests/zfs-tests/tests/functional/replacement/Makefile.am +++ b/tests/zfs-tests/tests/functional/replacement/Makefile.am @@ -2,9 +2,20 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/replacement dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - replacement_001_pos.ksh \ - replacement_002_pos.ksh \ - replacement_003_pos.ksh + attach_import.ksh \ + attach_multiple.ksh \ + attach_rebuild.ksh \ + attach_resilver.ksh \ + detach.ksh \ + rebuild_disabled_feature.ksh \ + rebuild_multiple.ksh \ + rebuild_raidz.ksh \ + replace_import.ksh \ + replace_rebuild.ksh \ + replace_resilver.ksh \ + resilver_restart_001.ksh \ + resilver_restart_002.ksh \ + scrub_cancel.ksh dist_pkgdata_DATA = \ replacement.cfg diff --git a/tests/zfs-tests/tests/functional/replacement/attach_import.ksh b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh new file mode 100755 index 0000000000..e2749b164e --- /dev/null +++
b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that on import an in progress attach operation is resumed. +# +# Strategy: +# 1. For both healing and sequential resilvering. +# a. Create a pool +# b. Add a vdev with 'zpool attach' and resilver (-s) it. +# c. Export the pool +# d. Import the pool +# e. Verify the 'zpool attach' resumed resilvering +# f. Destroy the pool +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} +} + +log_assert "Verify attach is resumed on import" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} + +# Verify healing and sequential resilver resume on import. 
+for arg in "" "-s"; do + log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]} + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + log_must zpool attach $arg $TESTPOOL1 ${VDEV_FILES[0]} ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1 + log_must is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 + destroy_pool $TESTPOOL1 +done + +log_pass "Verify attach is resumed on import" diff --git a/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh new file mode 100755 index 0000000000..5c38353491 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that attach/detach work while resilvering and attaching +# multiple vdevs. +# +# Strategy: +# 1. Create a single vdev pool +# 2. While healing or sequential resilvering: +# a. Attach a vdev to convert the pool to a mirror. +# b. Attach a vdev to convert the pool to a 3-way mirror. +# c. Verify the original vdev cannot be removed (no redundant copies) +# d. Detach a vdev. 
Healing and sequential resilver remain running. +# e. Detach a vdev. Healing resilver remains running, sequential +# resilver is canceled. +# f. Wait for resilver to complete. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} +} + +log_assert "Verify attach/detach with multiple vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} + +# Create the single-vdev pool shared by both resilver modes. +log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]} + +for replace_mode in "healing" "sequential"; do + # + # Resilvers abort the dsl_scan and reconfigure it for resilvering. + # Rebuilds cancel the dsl_scan and start the vdev_rebuild thread. + # + if [[ "$replace_mode" = "healing" ]]; then + flags="" + else + flags="-s" + fi + + log_mustnot is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + + # Attach first vdev (stripe -> mirror) + log_must zpool attach $flags $TESTPOOL1 \ + ${VDEV_FILES[0]} ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + + # Attach second vdev (2-way -> 3-way mirror) + log_must zpool attach $flags $TESTPOOL1 \ + ${VDEV_FILES[1]} ${VDEV_FILES[2]} + log_must is_pool_resilvering $TESTPOOL1 + + # Original vdev cannot be detached until there is sufficient redundancy. + log_mustnot zpool detach $TESTPOOL1 ${VDEV_FILES[0]} + + # Detach first vdev (resilver keeps running) + log_must zpool detach $TESTPOOL1 ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + + # + # Detach second vdev. There's a difference in behavior between + # healing and sequential resilvers. A healing resilver will not be + # cancelled even though there's nothing on the original vdev which + # needs to be rebuilt. A sequential resilver on the other hand is + # canceled when returning to a non-redundant striped layout.
At + # some point the healing resilver behavior should be updated to match + # the sequential resilver behavior. + # + log_must zpool detach $TESTPOOL1 ${VDEV_FILES[2]} + + if [[ "$replace_mode" = "healing" ]]; then + log_must is_pool_resilvering $TESTPOOL1 + else + log_mustnot is_pool_resilvering $TESTPOOL1 + fi + + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait $TESTPOOL1 +done + +log_pass "Verify attach/detach with multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh new file mode 100755 index 0000000000..998d3eec7c --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh @@ -0,0 +1,173 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Attaching disks during I/O should pass for supported pools. +# +# STRATEGY: +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and +# start some random I/O +# 2. Attach a disk to the pool. +# 3. Verify the integrity of the file system and the resilvering. +# +# NOTE: Raidz does not support the sequential resilver (-s) option. +# + +verify_runnable "global" + +function cleanup +{ + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids; do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk during I/O completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function attach_test +{ + typeset -i iters=2 + typeset -i index=0 + typeset opt=$1 + typeset disk1=$2 + typeset disk2=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! 
+ + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + log_must zpool attach -sw $opt $TESTPOOL1 $disk1 $disk2 + + for wait_pid in $child_pids; do + kill $wait_pid + done + child_pids="" + + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +specials_list="" +i=0 +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +# +# Create a replacement disk special file. +# +truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE + +for op in "" "-f"; do + create_pool $TESTPOOL1 mirror $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + attach_test "$op" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -ne 0 ]]; then + log_fail "$REPLACEFILE is not present." + fi + + destroy_pool $TESTPOOL1 +done + +log_note "Verify 'zpool attach' fails with non-mirrors." + +for type in "" "raidz" "raidz1" "draid" "draid1"; do + for op in "" "-f"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + log_mustnot zpool attach -s "$opt" $TESTDIR/$TESTFILE1.1 \ + $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -eq 0 ]]; then + log_fail "$REPLACEFILE should not be present."
+ fi + + destroy_pool $TESTPOOL1 + done +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh similarity index 90% rename from tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh rename to tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh index 391aa5cf0d..e99d681bb2 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh @@ -37,7 +37,7 @@ # Attaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Attach a disk to the pool. # 3. Verify the integrity of the file system and the resilvering. @@ -104,9 +104,7 @@ function attach_test ((i = i + 1)) done - log_must zpool attach $opt $TESTPOOL1 $disk1 $disk2 - - sleep 10 + log_must zpool attach -w $opt $TESTPOOL1 $disk1 $disk2 for wait_pid in $child_pids do @@ -119,13 +117,13 @@ function attach_test log_must zfs umount $TESTPOOL1/$TESTFS1 log_must zdb -cdui $TESTPOOL1/$TESTFS1 log_must zfs mount $TESTPOOL1/$TESTFS1 - + verify_pool $TESTPOOL1 } specials_list="" i=0 -while [[ $i != 2 ]]; do - mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -134,7 +132,7 @@ done # # Create a replacement disk special file. # -mkfile $MINVDEVSIZE $TESTDIR/$REPLACEFILE +truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE for op in "" "-f"; do create_pool $TESTPOOL1 mirror $specials_list @@ -143,7 +141,7 @@ for op in "" "-f"; do attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? 
-ne 0 ]]; then log_fail "$REPLACEFILE is not present." fi @@ -153,7 +151,7 @@ done log_note "Verify 'zpool attach' fails with non-mirrors." -for type in "" "raidz" "raidz1"; do +for type in "" "raidz" "raidz1" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 @@ -162,7 +160,7 @@ for type in "" "raidz" "raidz1"; do log_mustnot zpool attach "$opt" $TESTDIR/$TESTFILE1.1 \ $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? -eq 0 ]]; then log_fail "$REPLACEFILE should not be present." fi diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh b/tests/zfs-tests/tests/functional/replacement/detach.ksh similarity index 92% rename from tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh rename to tests/zfs-tests/tests/functional/replacement/detach.ksh index 71b9602ee1..f049c639d8 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/detach.ksh @@ -37,7 +37,7 @@ # Detaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Detach a disk from the pool. # 3. Verify the integrity of the file system and the resilvering. @@ -121,8 +121,8 @@ function detach_test specials_list="" i=0 -while [[ $i != 2 ]]; do - mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -134,7 +134,7 @@ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 detach_test $TESTDIR/$TESTFILE1.1 -zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1" +zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1" if [[ $? 
-eq 0 ]]; then log_fail "$TESTFILE1.1 should no longer be present." fi @@ -143,14 +143,14 @@ destroy_pool $TESTPOOL1 log_note "Verify 'zpool detach' fails with non-mirrors." -for type in "" "raidz" "raidz1" ; do +for type in "" "raidz" "raidz1" "draid"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 log_mustnot zpool detach $TESTDIR/$TESTFILE1.1 - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1" + zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1" if [[ $? -ne 0 ]]; then log_fail "$TESTFILE1.1 is not present." fi diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh new file mode 100755 index 0000000000..d17d83b783 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify device_rebuild feature flags. +# +# Strategy: +# 1. Create a pool with all features disabled. +# 2. Verify 'zpool replace -s' fails and the feature is disabled. +# 3. Enable the device_rebuild feature. +# 4. Verify 'zpool replace -s' works and the feature is active. +# 5. Wait for the feature to return to enabled. 
+# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +function check_feature_flag +{ + feature=$1 + pool=$2 + expected_value=$3 + + value="$(zpool get -H -o property,value all $pool | \ + egrep "$feature" | awk '{print $2}')" + if [ "$value" = "$expected_value" ]; then + log_note "$feature verified to be $value" + else + log_fail "$feature should be $expected_value but is $value" + fi +} + +log_assert "Verify device_rebuild feature flags." + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE +log_must zpool create -d $TESTPOOL1 ${VDEV_FILES[@]} + +log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "disabled" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must zpool set feature@device_rebuild=enabled $TESTPOOL1 +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "active" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "enabled" + +log_pass "Verify device_rebuild feature flags." diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh new file mode 100755 index 0000000000..7775cbff4d --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh @@ -0,0 +1,126 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Sequential reconstruction (unlike healing reconstruction) operates on the +# top-level vdev. This means that a sequential resilver operation can be +# started/stopped on a different top-level vdev without impacting other +# sequential resilvers. +# +# STRATEGY: +# 1. Create a mirrored pool. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE $SPARE_VDEV_FILE2 +} + +function check_history +{ + pool=$1 + msg=$2 + exp=$3 + + count=$(zpool history -i $pool | grep "rebuild" | grep -c "$msg") + if [[ "$count" -ne "$exp" ]]; then + log_fail "Expected $exp rebuild '$msg' messages, found $count" + else + log_note "Found $count/$exp rebuild '$msg' messages" + fi +} + +log_assert "Rebuilds operate on the top-level vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} \ + $SPARE_VDEV_FILE $SPARE_VDEV_FILE2 + +# Verify two sequential resilvers can run concurrently.
+log_must zpool create -f $TESTPOOL1 \ + mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \ + mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32 +log_must zpool sync $TESTPOOL1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "started" 2 +check_history $TESTPOOL1 "reset" 0 +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 0 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 + +check_history $TESTPOOL1 "complete" 2 +destroy_pool $TESTPOOL1 + +# Verify canceling one resilver (zpool detach) does not impact others. +log_must zpool create -f $TESTPOOL1 \ + mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \ + mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32 +log_must zpool sync $TESTPOOL1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "started" 2 +check_history $TESTPOOL1 "reset" 0 +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 0 + +log_must zpool detach $TESTPOOL1 $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 + +check_history $TESTPOOL1 "complete" 1 +check_history $TESTPOOL1 "canceled" 1 +destroy_pool $TESTPOOL1 + +log_pass "Rebuilds operate on the top-level vdevs" 
diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh new file mode 100755 index 0000000000..26dc6f87b2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Executing 'zpool replace -s' for raidz vdevs failed. Sequential +# resilvers are only allowed for stripe/mirror/dRAID pools. +# +# STRATEGY: +# 1. Create a raidz pool, verify 'zpool replace -s' fails +# 2. 
Create a stripe/mirror pool, verify 'zpool replace -s' passes +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Sequential resilver is not allowed for raidz vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +# raidz[1-3] +for vdev_type in "raidz" "raidz2" "raidz3"; do + log_must zpool create -f $TESTPOOL1 $vdev_type ${VDEV_FILES[@]} + log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} \ + $SPARE_VDEV_FILE + destroy_pool $TESTPOOL1 +done + +# stripe +log_must zpool create $TESTPOOL1 ${VDEV_FILES[@]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + +# mirror +log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + +# draid +log_must zpool create $TESTPOOL1 draid ${VDEV_FILES[@]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + +log_pass "Sequential resilver is not allowed for raidz vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/replace_import.ksh b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh new file mode 100755 index 0000000000..37d3c6645c --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that on import an in progress replace operation is resumed. +# +# Strategy: +# 1. For both healing and sequential resilvering replace: +# a. Create a pool +# b. Replace a vdev with 'zpool replace' to resilver (-s) it. +# c. Export the pool +# d. Import the pool +# e. Verify the 'zpool replace' resumed resilvering. +# f. Destroy the pool +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Verify replace is resumed on import" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +# Verify healing and sequential resilver resume on import. 
+for arg in "" "-s"; do + log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]} + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + log_must zpool replace $arg $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE + log_must is_pool_resilvering $TESTPOOL1 + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1 + log_must is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 + destroy_pool $TESTPOOL1 +done + +log_pass "Verify replace is resumed on import" diff --git a/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh new file mode 100755 index 0000000000..b3c7995fd6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh @@ -0,0 +1,158 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +.
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Replacing disks during I/O should pass for supported pools. +# +# STRATEGY: +# 1. Create multidisk pools (stripe/mirror/draid) and +# start some random I/O +# 2. Replace a disk in the pool with another disk. +# 3. Verify the integrity of the file system and the rebuilding. +# +# NOTE: Raidz does not support the sequential resilver (-s) option. +# + +verify_runnable "global" + +function cleanup +{ + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids + do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk with -r during I/O completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function replace_test +{ + typeset -i iters=2 + typeset -i index=0 + typeset opt=$1 + typeset disk1=$2 + typeset disk2=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! 
+ + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + log_must zpool replace -sw $opt $TESTPOOL1 $disk1 $disk2 + + for wait_pid in $child_pids + do + kill $wait_pid + done + child_pids="" + + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +specials_list="" +i=0 +while [[ $i != 3 ]]; do + log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +# +# Create a replacement disk special file. +# +log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE + +for type in "" "mirror" "draid"; do + for op in "" "-f"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + replace_test "$op" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -ne 0 ]]; then + log_fail "$REPLACEFILE is not present." + fi + + destroy_pool $TESTPOOL1 + log_must rm -rf /$TESTPOOL1 + done +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh similarity index 92% rename from tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh rename to tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh index 8f40436ffb..2585397bba 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh @@ -37,7 +37,7 @@ # Replacing disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2.
Replace a disk in the pool with another disk. # 3. Verify the integrity of the file system and the resilvering. @@ -104,9 +104,7 @@ function replace_test ((i = i + 1)) done - log_must zpool replace $opt $TESTPOOL1 $disk1 $disk2 - - sleep 10 + log_must zpool replace -w $opt $TESTPOOL1 $disk1 $disk2 for wait_pid in $child_pids do @@ -119,11 +117,12 @@ function replace_test log_must zfs umount $TESTPOOL1/$TESTFS1 log_must zdb -cdui $TESTPOOL1/$TESTFS1 log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 } specials_list="" i=0 -while [[ $i != 2 ]]; do +while [[ $i != 3 ]]; do log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" @@ -135,7 +134,7 @@ done # log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE -for type in "" "raidz" "mirror"; do +for type in "" "raidz" "mirror" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 @@ -143,7 +142,7 @@ for type in "" "raidz" "mirror"; do replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? -ne 0 ]]; then log_fail "$REPLACEFILE is not present." 
fi diff --git a/tests/zfs-tests/tests/functional/replacement/replacement.cfg b/tests/zfs-tests/tests/functional/replacement/replacement.cfg index b2ba1b8851..271317b1c9 100644 --- a/tests/zfs-tests/tests/functional/replacement/replacement.cfg +++ b/tests/zfs-tests/tests/functional/replacement/replacement.cfg @@ -36,3 +36,8 @@ export HOLES_SEED=${HOLES_SEED-""} export HOLES_FILEOFFSET=${HOLES_FILEOFFSET-""} export HOLES_COUNT=${HOLES_COUNT-"16384"} # FILESIZE/BLKSIZE/8 export REPLACEFILE="sparedisk" + +set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4} +export VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 )) +export SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1 +export SPARE_VDEV_FILE2=$TEST_BASE_DIR/spare-2 diff --git a/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh new file mode 100755 index 0000000000..7e96ab5187 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh @@ -0,0 +1,187 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Testing resilver restart logic both with and without the deferred resilver +# feature enabled, verifying that resilver is not restarted when it is +# unnecessary. +# +# STRATEGY: +# 1. Create a pool +# 2. Create four filesystems with the primary cache disabled to force reads +# 3.
Write four files simultaneously, one to each filesystem +# 4. Do with and without deferred resilvers enabled +# a. Replace a vdev with a spare & suspend resilver immediately +# b. Verify resilver starts properly +# c. Offline / online another vdev to introduce a new DTL range +# d. Verify resilver restart or defer +# e. Inject read errors on vdev that was offlined / onlined +# f. Verify that resilver did not restart +# g. Unsuspend resilver and wait for it to finish +# h. Verify that there are two resilvers and nothing is deferred +# + +function cleanup +{ + log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX + log_must zinject -c all + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +# count resilver events in zpool and number of deferred resilvers on vdevs +function verify_restarts # msg cnt defer +{ + msg=$1 + cnt=$2 + defer=$3 + + # check the number of resilver start in events log + RESILVERS=$(zpool events | grep -c sysevent.fs.zfs.resilver_start) + log_note "expected $cnt resilver start(s)$msg, found $RESILVERS" + [[ "$RESILVERS" -ne "$cnt" ]] && + log_fail "expected $cnt resilver start(s)$msg, found $RESILVERS" + + [[ -z "$defer" ]] && return + + # use zdb to find which vdevs have the resilver defer flag + VDEV_DEFERS=$(zdb -C $TESTPOOL1 | awk ' + /children/ { gsub(/[^0-9]/, ""); child = $0 } + /com\.datto:resilver_defer$/ { print child } + ') + + if [[ "$defer" == "-" ]] + then + [[ -n $VDEV_DEFERS ]] && + log_fail "didn't expect any vdevs to have resilver deferred" + return + fi + + [[ $VDEV_DEFERS -eq $defer ]] || + log_fail "resilver deferred set on unexpected vdev: $VDEV_DEFERS" +} + +log_assert "Check for unnecessary resilver restarts" + +ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS) +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+ORIG_ZFS_ZEVENT_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX) + +set -A RESTARTS -- '1' '2' '2' '2' +set -A VDEVS -- '' '' '' '' +set -A DEFER_RESTARTS -- '1' '1' '1' '2' +set -A DEFER_VDEVS -- '-' '2' '2' '-' + +VDEV_REPLACE="${VDEV_FILES[1]} $SPARE_VDEV_FILE" + +log_onexit cleanup + +# ensure that enough events will be saved +log_must set_tunable32 ZEVENT_LEN_MAX 512 + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL1 \ + raidz ${VDEV_FILES[@]} + +# create 4 filesystems +for fs in fs{0..3} +do + log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL1/$fs +done + +# simultaneously write 16M to each of them +set -A DATAPATHS /$TESTPOOL1/fs{0..3}/dat.0 +log_note "Writing data files" +for path in ${DATAPATHS[@]} +do + dd if=/dev/urandom of=$path bs=1M count=16 > /dev/null 2>&1 & +done +wait + +# test without and with deferred resilver feature enabled +for test in "without" "with" +do + log_note "Testing $test deferred resilvers" + + if [[ $test == "with" ]] + then + log_must zpool set feature@resilver_defer=enabled $TESTPOOL1 + RESTARTS=( "${DEFER_RESTARTS[@]}" ) + VDEVS=( "${DEFER_VDEVS[@]}" ) + VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}" + fi + + # clear the events + log_must zpool events -c + + # limit scanning time + log_must set_tunable32 RESILVER_MIN_TIME_MS 50 + + # initiate a resilver and suspend the scan as soon as possible + log_must zpool replace $TESTPOOL1 $VDEV_REPLACE + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + + # there should only be 1 resilver start + verify_restarts '' "${RESTARTS[0]}" "${VDEVS[0]}" + + # offline then online a vdev to introduce a new DTL range after current + # scan, which should restart (or defer) the resilver + log_must zpool offline $TESTPOOL1 ${VDEV_FILES[2]} + log_must zpool sync $TESTPOOL1 + log_must zpool online $TESTPOOL1 ${VDEV_FILES[2]} + log_must zpool sync $TESTPOOL1 + + # there should now be 2 resilver starts 
w/o defer, 1 with defer + verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}" + + # inject read io errors on vdev and verify resilver does not restart + log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL1 + log_must cat ${DATAPATHS[1]} > /dev/null + log_must zinject -c all + + # there should still be 2 resilver starts w/o defer, 1 with defer + verify_restarts ' after zinject' "${RESTARTS[2]}" "${VDEVS[2]}" + + # unsuspend resilver + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must set_tunable32 RESILVER_MIN_TIME_MS 3000 + + # wait for resilver to finish + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 + + # wait for a few txg's to see if a resilver happens + log_must zpool sync $TESTPOOL1 + log_must zpool sync $TESTPOOL1 + + # there should now be 2 resilver starts + verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}" +done + +log_pass "Resilver did not restart unnecessarily" diff --git a/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh new file mode 100755 index 0000000000..48763f9b2d --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Datto Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Testing resilver completes when scan errors are encountered, but relevant +# DTL's have not been lost. +# +# STRATEGY: +# 1. Create a pool (1k recordsize) +# 2. Create a 32m file (32k records) +# 3. Inject an error halfway through the file +# 4. Start a resilver, ensure the error is triggered and that the resilver +# does not restart after finishing +# +# NB: use legacy scanning to ensure scan of specific block causes error +# + +function cleanup +{ + log_must zinject -c all + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE + log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY +} + +log_assert "Check for resilver restarts caused by scan errors" + +ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY) + +log_onexit cleanup + +# use legacy scan to ensure injected error will be triggered +log_must set_tunable32 SCAN_LEGACY 1 + + # create the pool and a 32M file (32k blocks) +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE +log_must zpool create -f -O recordsize=1k $TESTPOOL1 ${VDEV_FILES[0]} +log_must dd if=/dev/urandom of=/$TESTPOOL1/file bs=1M count=32 > /dev/null 2>&1 + +# determine objset/object +objset=$(zdb -d $TESTPOOL1/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') +object=$(ls -i /$TESTPOOL1/file | awk '{print $1}') + +# inject event to cause error during resilver +log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL1 + +# clear events and start resilver +log_must zpool events -c +log_must zpool attach $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE + +log_note "waiting for read errors to start showing up" +for iter in {0..59} +do + zpool sync $TESTPOOL1 + err=$(zpool status $TESTPOOL1 | grep ${VDEV_FILES[0]} | awk '{print $3}') + (( $err > 0 )) && break + sleep 1 +done + +(( $err == 0 )) && log_fail "Unable to induce errors in resilver" + +log_note "waiting for resilver to finish" +for iter in {0..59} +do + finish=$(zpool events | grep 
"sysevent.fs.zfs.resilver_finish" | wc -l) + (( $finish > 0 )) && break + sleep 1 +done + +(( $finish == 0 )) && log_fail "resilver took too long to finish" + +# wait a few syncs to ensure that zfs does not restart the resilver +log_must zpool sync $TESTPOOL1 +log_must zpool sync $TESTPOOL1 + +# check if resilver was restarted +start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l) +(( $start != 1 )) && log_fail "resilver restarted unnecessarily" + +log_pass "Resilver did not restart unnecessarily from scan errors" diff --git a/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh new file mode 100755 index 0000000000..da8a0a26e3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Verify scrub behaves as intended when contending with a healing or +# sequential resilver. +# +# STRATEGY: +# 1. Create a pool +# 2. Add a modest amount of data to the pool. +# 3. For healing and sequential resilver: +# a. Start scrubbing. +# b. Verify a resilver can be started and it cancels the scrub. +# c. 
Verify a scrub cannot be started when resilvering +# + +function cleanup +{ + log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Scrub was cancelled by resilver" + +ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS) +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=64 +log_must zpool sync $TESTPOOL1 + +# Request a healing or sequential resilver +for replace_mode in "healing" "sequential"; do + + # + # Healing resilvers abort the dsl_scan and reconfigure it for + # resilvering. Sequential resilvers cancel the dsl_scan and start + # the vdev_rebuild thread. + # + if [[ "$replace_mode" = "healing" ]]; then + history_msg="scan aborted, restarting" + flags="" + else + history_msg="scan cancelled" + flags="-s" + fi + + # Limit scanning time and suspend the scan as soon as possible. + log_must set_tunable32 RESILVER_MIN_TIME_MS 50 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + + # Initiate a scrub. + log_must zpool scrub $TESTPOOL1 + + # Initiate a resilver to cancel the scrub. + log_must zpool replace $flags $TESTPOOL1 ${VDEV_FILES[1]} \ + $SPARE_VDEV_FILE + + # Verify the scrub was canceled, it may take a few seconds to exit. + while is_pool_scrubbing $TESTPOOL1; do + sleep 1 + done + log_mustnot is_pool_scrubbing $TESTPOOL1 + + # Verify a scrub cannot be started while resilvering. + log_must is_pool_resilvering $TESTPOOL1 + log_mustnot zpool scrub $TESTPOOL1 + + # Unsuspend resilver. 
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must set_tunable32 RESILVER_MIN_TIME_MS 3000 + + # Wait for resilver to finish then put the original back. + log_must zpool wait $TESTPOOL1 + log_must zpool replace $flags -w $TESTPOOL1 $SPARE_VDEV_FILE \ + ${VDEV_FILES[1]} +done +log_pass "Scrub was cancelled by resilver" + diff --git a/tests/zfs-tests/tests/functional/reservation/reservation.shlib b/tests/zfs-tests/tests/functional/reservation/reservation.shlib index 49ee3b992d..47bd70f7cb 100644 --- a/tests/zfs-tests/tests/functional/reservation/reservation.shlib +++ b/tests/zfs-tests/tests/functional/reservation/reservation.shlib @@ -108,7 +108,7 @@ function create_multiple_fs # num_fs base_fs_name base_mnt_name # # This function compute the largest volume size which is multiple of volume -# block size (default 8K) and not greater than the largest expected volsize. +# block size (default 16K) and not greater than the largest expected volsize. # # $1 The largest expected volume size. # $2 The volume block size @@ -116,7 +116,7 @@ function create_multiple_fs # num_fs base_fs_name base_mnt_name function floor_volsize # [volblksize] { typeset largest_volsize=$1 - typeset volblksize=${2:-8192} + typeset volblksize=${2:-16384} if ((largest_volsize < volblksize)); then log_fail "The largest_volsize must be greater than volblksize." 
@@ -157,7 +157,7 @@ function volsize_to_reservation typeset volblocksize=$(get_prop volblocksize $vol) else typeset ncopies=1 - typeset volblocksize=8192 + typeset volblocksize=16384 fi typeset nblocks=$((volsize / volblocksize)) diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_001_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_001_pos.ksh index b72b8e4a38..b8220791f1 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_001_pos.ksh @@ -115,7 +115,7 @@ for obj in $TESTPOOL/$TESTFS $OBJ_LIST; do # # Due to the way space is consumed and released by metadata we - # can't do an exact check here, but we do do a basic sanity + # can't do an exact check here, but we do a basic sanity # check. # log_must within_limits $space_avail $new_space_avail $RESV_TOLERANCE diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_002_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_002_pos.ksh index 8ae3593613..e0fed6389c 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_002_pos.ksh @@ -54,7 +54,7 @@ verify_runnable "both" function cleanup { for obj in $OBJ_LIST; do - datasetexists $obj && log_must_busy zfs destroy -f $obj + datasetexists $obj && destroy_dataset $obj -f done log_must zero_reservation $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_003_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_003_pos.ksh index 48adabe72f..ee303b53be 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_003_pos.ksh @@ -59,7 +59,7 @@ function cleanup log_must zero_reservation $TESTPOOL/$TESTFS for obj in $OBJ_LIST; do - datasetexists $obj && log_must zfs destroy -f $obj + datasetexists $obj && 
destroy_dataset $obj -f done } diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_004_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_004_pos.ksh index f8342ff294..eb606a7624 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_004_pos.ksh @@ -56,7 +56,7 @@ verify_runnable "both" function cleanup { for obj in $OBJ_LIST; do - datasetexists $obj && log_must_busy zfs destroy -f $obj + datasetexists $obj && destroy_dataset $obj -f done } diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_005_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_005_pos.ksh index 4047fab0d7..535d652daf 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_005_pos.ksh @@ -59,7 +59,7 @@ log_assert "Verify space released when reservation on a dataset is set "\ function cleanup { for obj in $OBJ_LIST; do - datasetexists $obj && log_must zfs destroy -f $obj + datasetexists $obj && destroy_dataset $obj -f done } diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_006_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_006_pos.ksh index ec1986c454..da0d36a35d 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_006_pos.ksh @@ -39,7 +39,7 @@ # for a dataset. Unlike quotas however there should be no restrictions # on accessing space outside of the limits of the reservation (if the # space is available in the pool). Verify that in a filesystem with a -# reservation set that its possible to create files both within the +# reservation set that it's possible to create files both within the # reserved space and also outside. 
# # STRATEGY: diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.ksh index 48d6b40ad0..a1fffd3624 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.ksh @@ -56,10 +56,10 @@ log_assert "Verify reservations on data sets doesn't affect other data sets " \ function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -f datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup @@ -100,7 +100,7 @@ function create_resv_destroy { # args1 dataset1 args2 dataset2 # available totals should revert back to the values they # had after creating the first dataset. # - log_must zfs destroy -f $dataset2 + log_must_busy zfs destroy -f $dataset2 avail_dest_dset2=`get_prop available $TESTPOOL` used_dest_dset2=`get_prop used $TESTPOOL` @@ -112,7 +112,7 @@ function create_resv_destroy { # args1 dataset1 args2 dataset2 # After destroying the first dataset the space used and # space available totals should revert back to the values # they had when the pool was first created. 
- log_must zfs destroy -f $dataset1 + log_must_busy zfs destroy -f $dataset1 avail_dest_dset1=`get_prop available $TESTPOOL` used_dest_dset1=`get_prop used $TESTPOOL` diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_008_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_008_pos.ksh index fbf4276e8b..cfc30f4742 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_008_pos.ksh @@ -57,7 +57,7 @@ function cleanup typeset -i loop=0 while (($loop < $RESV_NUM_FS)); do datasetexists $TESTPOOL/${TESTFS}$loop && \ - log_must zfs destroy -f $TESTPOOL/${TESTFS}$loop + destroy_dataset $TESTPOOL/${TESTFS}$loop -f [[ -d ${TESTDIR}$loop ]] && log_must rm -r ${TESTDIR}$loop @@ -85,7 +85,7 @@ resv_size_set=`expr $resv_space_avail / $num_resv_fs` # # We set the reservations now, rather than when we created the filesystems -# to allow us to take into account space used by the filsystem metadata +# to allow us to take into account space used by the filesystem metadata # # Note we don't set a reservation on the first filesystem we created, # hence num=1 rather than zero below. 
diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_009_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_009_pos.ksh index 171577def5..a639abf896 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_009_pos.ksh @@ -58,7 +58,9 @@ function cleanup { log_must rm -rf $TESTDIR/$TESTFILE1 log_must rm -rf $TESTDIR/$TESTFILE2 - log_must zfs destroy -f $TESTPOOL/$TESTFS1 + + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -f } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_010_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_010_pos.ksh index 2ca279a4a4..f3a64a0bea 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_010_pos.ksh @@ -57,7 +57,7 @@ log_assert "Destroying top level filesystem with reservation allows more " \ function cleanup { datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 [[ -e $TESTDIR/$TESTFILE1 ]] && log_must rm -rf $TESTDIR/$TESTFILE1 [[ -e $TESTDIR/$TESTFILE2 ]] && log_must rm -rf $TESTDIR/$TESTFILE2 diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.ksh index 6a80bb575d..bf09552234 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.ksh @@ -58,7 +58,7 @@ log_assert "Reservation properties preserved across exports and imports" function cleanup { for obj in $OBJ_LIST; do - datasetexists $obj && log_must zfs destroy -f $obj + datasetexists $obj && destroy_dataset $obj -f done log_must zero_reservation $TESTPOOL/$TESTFS @@ -91,7 +91,7 @@ log_must zfs set reservation=$resv_set 
$TESTPOOL/$TESTFS1 log_must zfs set reservation=$resv_set $TESTPOOL/$TESTFS1/$TESTFS2 log_must zfs set reservation=$resv_set $TESTPOOL/$TESTVOL2 -log_must zpool export $TESTPOOL +log_must_busy zpool export $TESTPOOL log_must zpool import $TESTPOOL for obj in $TESTPOOL/$TESTFS $OBJ_LIST; do diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_014_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_014_pos.ksh index e8bd91d00e..3b7f384da3 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_014_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_014_pos.ksh @@ -54,7 +54,7 @@ function cleanup # # Note we don't destroy $TESTFS as it's used by other tests for obj in $OBJ_LIST ; do - datasetexists $obj && log_must zfs destroy -f $obj + datasetexists $obj && destroy_dataset $obj -f done log_must zero_reservation $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_015_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_015_pos.ksh index d67f8c7ec2..7067a78105 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_015_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_015_pos.ksh @@ -57,7 +57,7 @@ log_assert "Setting volume reservation to 'none' allows more data to be " \ function cleanup { datasetexists $TESTPOOL/$TESTVOL && \ - log_must zfs destroy $TESTPOOL/$TESTVOL + destroy_dataset $TESTPOOL/$TESTVOL [[ -e $TESTDIR/$TESTFILE1 ]] && log_must rm -rf $TESTDIR/$TESTFILE1 [[ -e $TESTDIR/$TESTFILE2 ]] && log_must rm -rf $TESTDIR/$TESTFILE2 @@ -76,6 +76,7 @@ space_avail=$(largest_volsize_from_pool $TESTPOOL) resv_size_set=$(floor_volsize $resv_size_set) log_must zfs create -V $resv_size_set $TESTPOOL/$TESTVOL +block_device_wait $TESTPOOL/$TESTVOL space_avail_still=`get_prop available $TESTPOOL` diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_016_pos.ksh 
b/tests/zfs-tests/tests/functional/reservation/reservation_016_pos.ksh index cbb1db658b..82bbcde4a3 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_016_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_016_pos.ksh @@ -56,7 +56,7 @@ log_assert "Destroying a regular volume with reservation allows more data to" \ function cleanup { datasetexists $TESTPOOL/$TESTVOL && \ - log_must zfs destroy $TESTPOOL/$TESTVOL + destroy_dataset $TESTPOOL/$TESTVOL [[ -e $TESTDIR/$TESTFILE1 ]] && log_must rm -rf $TESTDIR/$TESTFILE1 [[ -e $TESTDIR/$TESTFILE2 ]] && log_must rm -rf $TESTDIR/$TESTFILE2 @@ -76,6 +76,7 @@ vol_set_size=$(floor_volsize $vol_set_size) # Creating a regular volume implicitly sets its reservation # property to the same value. log_must zfs create -V $vol_set_size $TESTPOOL/$TESTVOL +block_device_wait $TESTPOOL/$TESTVOL space_avail_still=$(get_prop available $TESTPOOL) fill_size=$((space_avail_still + $RESV_TOLERANCE)) diff --git a/tests/zfs-tests/tests/functional/reservation/reservation_018_pos.ksh b/tests/zfs-tests/tests/functional/reservation/reservation_018_pos.ksh index 0969a68773..1f92c88985 100755 --- a/tests/zfs-tests/tests/functional/reservation/reservation_018_pos.ksh +++ b/tests/zfs-tests/tests/functional/reservation/reservation_018_pos.ksh @@ -47,7 +47,7 @@ verify_runnable "both" function cleanup { - datasetexists $fs_child && log_must zfs destroy $fs_child + datasetexists $fs_child && destroy_dataset $fs_child log_must zfs set reservation=$reserv_val $fs } diff --git a/tests/zfs-tests/tests/functional/rootpool/rootpool_007_pos.ksh b/tests/zfs-tests/tests/functional/rootpool/rootpool_007_pos.ksh index e355a0f1f0..e4d4268cc0 100755 --- a/tests/zfs-tests/tests/functional/rootpool/rootpool_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/rootpool/rootpool_007_pos.ksh @@ -45,9 +45,13 @@ verify_runnable "global" function cleanup { - log_must zfs set compression=$orig_compress $rootfs + [[ -n "$orig_compress" ]] && \ 
+ log_must zfs set compression=$orig_compress $rootfs } +typeset assert_msg="the zfs rootfs's compression property can be set to \ + gzip and gzip[1-9]" + log_onexit cleanup log_assert $assert_msg @@ -55,9 +59,6 @@ typeset rootpool=$(get_rootpool) typeset rootfs=$(get_pool_prop bootfs $rootpool) typeset orig_compress=$(get_prop compression $rootfs) -typeset assert_msg="the zfs rootfs's compression property can be set to \ - gzip and gzip[1-9]" - set -A gtype "gzip" "gzip-1" "gzip-2" "gzip-3" "gzip-4" "gzip-5" \ "gzip-6" "gzip-7" "gzip-8" "gzip-9" diff --git a/tests/zfs-tests/tests/functional/rootpool/setup.ksh b/tests/zfs-tests/tests/functional/rootpool/setup.ksh index 5c3e56b91a..8d80971081 100755 --- a/tests/zfs-tests/tests/functional/rootpool/setup.ksh +++ b/tests/zfs-tests/tests/functional/rootpool/setup.ksh @@ -37,6 +37,6 @@ verify_runnable "global" # This functionality is supported under Linux, but these test cases # are disabled by default since they manipulate the systems root pool. 
# -if is_linux; then +if is_linux || is_freebsd; then log_unsupported "Supported but disabled by default" fi diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am index 585018ac25..94bdd26745 100644 --- a/tests/zfs-tests/tests/functional/rsend/Makefile.am +++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am @@ -2,6 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/rsend dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ + recv_dedup.ksh \ + recv_dedup_encrypted_zvol.ksh \ rsend_001_pos.ksh \ rsend_002_pos.ksh \ rsend_003_pos.ksh \ @@ -16,6 +18,7 @@ dist_pkgdata_SCRIPTS = \ rsend_012_pos.ksh \ rsend_013_pos.ksh \ rsend_014_pos.ksh \ + rsend_016_neg.ksh \ rsend_019_pos.ksh \ rsend_020_pos.ksh \ rsend_021_pos.ksh \ @@ -25,7 +28,6 @@ dist_pkgdata_SCRIPTS = \ send_encrypted_hierarchy.ksh \ send_encrypted_props.ksh \ send_encrypted_truncated_files.ksh \ - send-cD.ksh \ send-c_embedded_blocks.ksh \ send-c_incremental.ksh \ send-c_lz4_disabled.ksh \ @@ -40,16 +42,25 @@ dist_pkgdata_SCRIPTS = \ send-c_volume.ksh \ send-c_zstreamdump.ksh \ send-cpL_varied_recsize.ksh \ + send-L_toggle.ksh \ send_freeobjects.ksh \ + send_partial_dataset.ksh \ send_realloc_dnode_size.ksh \ send_realloc_files.ksh \ send_realloc_encrypted_files.ksh \ send_spill_block.ksh \ send_holds.ksh \ send_hole_birth.ksh \ + send_invalid.ksh \ send_mixed_raw.ksh \ - send-wDR_encrypted_zvol.ksh + send-wR_encrypted_zvol.ksh \ + send_doall.ksh dist_pkgdata_DATA = \ + dedup.zsend.bz2 \ + dedup_encrypted_zvol.bz2 \ + dedup_encrypted_zvol.zsend.bz2 \ + fs.tar.gz \ rsend.cfg \ rsend.kshlib + diff --git a/tests/zfs-tests/tests/functional/rsend/dedup.zsend.bz2 b/tests/zfs-tests/tests/functional/rsend/dedup.zsend.bz2 new file mode 100644 index 0000000000..585e148526 Binary files /dev/null and b/tests/zfs-tests/tests/functional/rsend/dedup.zsend.bz2 differ diff --git a/tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.bz2 
b/tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.bz2 new file mode 100644 index 0000000000..73a5742fc3 Binary files /dev/null and b/tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.bz2 differ diff --git a/tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.zsend.bz2 b/tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.zsend.bz2 new file mode 100644 index 0000000000..04a6cb53f0 Binary files /dev/null and b/tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.zsend.bz2 differ diff --git a/tests/zfs-tests/tests/functional/rsend/fs.tar.gz b/tests/zfs-tests/tests/functional/rsend/fs.tar.gz new file mode 100644 index 0000000000..cb6861c155 Binary files /dev/null and b/tests/zfs-tests/tests/functional/rsend/fs.tar.gz differ diff --git a/tests/zfs-tests/tests/functional/rsend/recv_dedup.ksh b/tests/zfs-tests/tests/functional/rsend/recv_dedup.ksh new file mode 100755 index 0000000000..e6e282a1c6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/recv_dedup.ksh @@ -0,0 +1,53 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# DESCRIPTION: +# Verifies that we can receive a dedup send stream by processing it with +# "zstream redup". 
+# + +verify_runnable "both" + +function cleanup +{ + destroy_dataset $TESTPOOL/recv "-r" + rm -r /$TESTPOOL/tar + rm $sendfile +} +log_onexit cleanup + +log_assert "Verify zfs can receive dedup send streams with 'zstream redup'" + +typeset sendfile_compressed=$STF_SUITE/tests/functional/rsend/dedup.zsend.bz2 +typeset sendfile=/$TESTPOOL/dedup.zsend +typeset tarfile=$STF_SUITE/tests/functional/rsend/fs.tar.gz + +log_must eval "bzcat <$sendfile_compressed >$sendfile" +log_must zfs create $TESTPOOL/recv +log_must eval "zstream redup $sendfile | zfs recv -d $TESTPOOL/recv" + +log_must mkdir /$TESTPOOL/tar +log_must tar --directory /$TESTPOOL/tar -xzf $tarfile +log_must diff -r /$TESTPOOL/tar /$TESTPOOL/recv + +log_pass "zfs can receive dedup send streams with 'zstream redup'" diff --git a/tests/zfs-tests/tests/functional/rsend/recv_dedup_encrypted_zvol.ksh b/tests/zfs-tests/tests/functional/rsend/recv_dedup_encrypted_zvol.ksh new file mode 100755 index 0000000000..daf559d264 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/recv_dedup_encrypted_zvol.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# DESCRIPTION: +# Verifies that we can receive a dedup send stream of a zvol by processing it +# with "zstream redup". 
+# + +verify_runnable "both" + +function cleanup +{ + destroy_dataset $TESTPOOL/recv "-r" + rm $sendfile + rm $volfile + rm $keyfile +} +log_onexit cleanup + +log_assert "Verify zfs can receive raw, recursive, and deduplicated send streams" + +typeset keyfile=/$TESTPOOL/pkey +typeset recvdev=$ZVOL_DEVDIR/$TESTPOOL/recv +typeset sendfile_compressed=$STF_SUITE/tests/functional/rsend/dedup_encrypted_zvol.zsend.bz2 +typeset sendfile=/$TESTPOOL/dedup_encrypted_zvol.zsend +typeset volfile_compressed=$STF_SUITE/tests/functional/rsend/dedup_encrypted_zvol.bz2 +typeset volfile=/$TESTPOOL/dedup_encrypted_zvol + +log_must eval "echo 'password' > $keyfile" + +log_must eval "bzcat <$sendfile_compressed >$sendfile" +log_must eval "zstream redup $sendfile | zfs recv $TESTPOOL/recv" + +log_must zfs load-key $TESTPOOL/recv +block_device_wait $volfile + +log_must eval "bzcat <$volfile_compressed >$volfile" +log_must diff $volfile $recvdev + +log_pass "zfs can receive raw, recursive, and deduplicated send streams" diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib index 521a1c7eb6..d06bd39b4d 100644 --- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib +++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib @@ -25,7 +25,8 @@ # # -# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2013, 2018 by Delphix. All rights reserved. +# Copyright (c) 2020 by Datto Inc. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -158,14 +159,9 @@ function cmp_md5s { typeset file1=$1 typeset file2=$2 - eval md5sum $file1 | awk '{ print $1 }' > $BACKDIR/md5_file1 - eval md5sum $file2 | awk '{ print $1 }' > $BACKDIR/md5_file2 - diff $BACKDIR/md5_file1 $BACKDIR/md5_file2 - typeset -i ret=$? 
- - rm -f $BACKDIR/md5_file1 $BACKDIR/md5_file2 - - return $ret + typeset sum1=$(md5digest $file1) + typeset sum2=$(md5digest $file2) + test "$sum1" = "$sum2" } # @@ -222,15 +218,21 @@ function cmp_ds_prop { typeset dtst1=$1 typeset dtst2=$2 - - for item in "type" "origin" "volblocksize" "acltype" "dnodesize" \ + typeset -a props=("type" "origin" "volblocksize" "acltype" "dnodesize" \ "atime" "canmount" "checksum" "compression" "copies" "devices" \ "exec" "quota" "readonly" "recordsize" "reservation" "setuid" \ - "snapdir" "version" "volsize" "xattr" "zoned" "mountpoint"; + "snapdir" "version" "volsize" "xattr" "mountpoint"); + if is_freebsd; then + props+=("jailed") + else + props+=("zoned") + fi + + for prop in "${props[@]}"; do - zfs get -H -o property,value,source $item $dtst1 >> \ + zfs get -H -o property,value,source $prop $dtst1 >> \ $BACKDIR/dtst1 - zfs get -H -o property,value,source $item $dtst2 >> \ + zfs get -H -o property,value,source $prop $dtst2 >> \ $BACKDIR/dtst2 done @@ -343,7 +345,7 @@ function getds_with_suffix } # -# Output inherited properties whitch is edited for file system +# Output inherited properties which is edited for file system # function fs_inherit_prop { @@ -522,10 +524,9 @@ function churn_files attrlen="$(((RANDOM % 1000) + 1))" attrvalue="$(random_string VALID_NAME_CHAR \ $attrlen)" - attr -qr $attrname $file_name || \ + rm_xattr $attrname $file_name || \ log_fail "Failed to remove $attrname" - attr -qs $attrname \ - -V "$attrvalue" $file_name || \ + set_xattr $attrname "$attrvalue" $file_name || \ log_fail "Failed to set $attrname" elif [ $value -eq 1 ]; then dd if=/dev/urandom of=$file_name \ @@ -556,8 +557,8 @@ function churn_files attrlen="$(((RANDOM % 1000) + 1))" attrvalue="$(random_string \ VALID_NAME_CHAR $attrlen)" - attr -qs $attrname \ - -V "$attrvalue" $file_name || \ + set_xattr $attrname \ + "$attrvalue" $file_name || \ log_fail "Failed to set $attrname" done fi @@ -568,16 +569,31 @@ function churn_files } # -# Mess up 
file contents +# Mess up a send file's contents # -# $1 The file path +# $1 The send file path # -function mess_file +function mess_send_file { file=$1 - filesize=$(stat -c '%s' $file) + filesize=$(stat_size $file) + offset=$(($RANDOM * $RANDOM % $filesize)) + + # The random offset might truncate the send stream to be + # smaller than the DRR_BEGIN record. If this happens, then + # the receiving system won't have enough info to create the + # partial dataset at all. We use zstream dump to check for + # this and retry in this case. + nr_begins=$(head -c $offset $file | zstream dump | \ + grep DRR_BEGIN | awk '{ print $5 }') + while [ "$nr_begins" -eq 0 ]; do + offset=$(($RANDOM * $RANDOM % $filesize)) + nr_begins=$(head -c $offset $file | zstream dump | \ + grep DRR_BEGIN | awk '{ print $5 }') + done + if (($RANDOM % 7 <= 1)); then # # We corrupt 2 bytes to minimize the chance that we @@ -619,29 +635,51 @@ function file_check # $1 The ZFS send command # $2 The filesystem where the streams are sent # $3 The receive filesystem +# $4 Test dry-run (optional) # function resume_test { - sendcmd=$1 - streamfs=$2 - recvfs=$3 + typeset sendcmd=$1 + typeset streamfs=$2 + typeset recvfs=$3 + typeset dryrun=${4:-1} stream_num=1 log_must eval "$sendcmd >/$streamfs/$stream_num" for ((i=0; i<2; i=i+1)); do - mess_file /$streamfs/$stream_num + mess_send_file /$streamfs/$stream_num log_mustnot zfs recv -suv $recvfs /$streamfs/$stream_num" + + # Do a dry-run + [ $dryrun -ne 0 ] && \ + log_must eval "zfs send -nvt $token > /dev/null" + + log_must eval "zfs send -t $token >/$streamfs/$stream_num" [[ -f /$streamfs/$stream_num ]] || \ log_fail "NO FILE /$streamfs/$stream_num" done log_must zfs recv -suv $recvfs /$streamfs/1" + mess_send_file /$streamfs/1 + log_mustnot zfs recv -suv $recvfs < /$streamfs/1 2>&1 + token=$(zfs get -Hp -o value receive_resume_token $recvfs) + echo "$token" > /$streamfs/resume_token + + return 0 +} + # # Setup filesystems for the resumable send/receive tests # 
@@ -660,7 +698,7 @@ function test_fs_setup datasetexists $recvfs && log_must_busy zfs destroy -r $recvpool datasetexists $streamfs && log_must_busy zfs destroy -r $streamfs - if $(datasetexists $sendfs || zfs create -o compress=lz4 $sendfs); then + if datasetexists $sendfs || zfs create -o compress=lz4 $sendfs; then mk_files 1000 256 0 $sendfs & mk_files 1000 131072 0 $sendfs & mk_files 100 1048576 0 $sendfs & @@ -703,7 +741,7 @@ function stream_has_features shift [[ -f $file ]] || log_fail "Couldn't find file: $file" - typeset flags=$(cat $file | zstreamdump | \ + typeset flags=$(cat $file | zstream dump | \ awk '/features =/ {features = $3} END {print features}') typeset -A feature feature[dedup]="1" @@ -736,7 +774,7 @@ function stream_has_features # comparing. This function does not currently handle incremental streams # that remove data. # -# $1 The zstreamdump output file +# $1 The zstream dump output file # $2 The dataset to compare against # This can be a source of a send or recv target (fs, not snapshot) # $3 The percentage below which verification is deemed a failure @@ -753,8 +791,8 @@ function verify_stream_size [[ -f $stream ]] || log_fail "No such file: $stream" datasetexists $ds || log_fail "No such dataset: $ds" - typeset stream_size=$(cat $stream | zstreamdump | sed -n \ - 's/ Total write size = \(.*\) (0x.*)/\1/p') + typeset stream_size=$(cat $stream | zstream dump | sed -n \ + 's/ Total payload size = \(.*\) (0x.*)/\1/p') typeset inc_size=0 if [[ -n $inc_src ]]; then @@ -800,10 +838,20 @@ function rand_set_prop } # Generate a recursive checksum of a filesystem which includes the file -# contents and any associated xattrs. +# contents and any associated extended attributes. 
function recursive_cksum { - find $1 -type f -exec sh -c 'sha256sum {}; getfattr \ - --absolute-names --only-values -d {} | sha256sum' \; | \ - sort -k 2 | awk '{ print $1 }' | sha256sum + case "$(uname)" in + FreeBSD) + find $1 -type f -exec sh -c 'sha256 -q {}; lsextattr -q \ + system {} | sha256 -q; lsextattr -q user {} | sha256 -q' \ + \; | sort | sha256 -q + ;; + *) + find $1 -type f -exec sh -c 'sha256sum {}; getfattr \ + --absolute-names --only-values -d {} | sha256sum' \; | \ + sort -k 2 | awk '{ print $1 }' | sha256sum | \ + awk '{ print $1 }' + ;; + esac } diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh index 5e657a898f..8e1821d88a 100755 --- a/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh @@ -38,12 +38,12 @@ # STRATEGY: # 1. Separately promote pool clone, filesystem clone and volume clone. # 2. Recursively backup all the POOL and restore in POOL2 -# 3. Verify all the datesets and property be properly received. +# 3. Verify all the datasets and properties were properly received. 
# verify_runnable "both" -# See issue: https://github.com/zfsonlinux/zfs/issues/6066 +# See issue: https://github.com/openzfs/zfs/issues/6066 log_unsupported "Occasionally hangs" # Origin Clone diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_011_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_011_pos.ksh index 9ecd18d87d..68f0e13927 100755 --- a/tests/zfs-tests/tests/functional/rsend/rsend_011_pos.ksh +++ b/tests/zfs-tests/tests/functional/rsend/rsend_011_pos.ksh @@ -63,7 +63,7 @@ for prop in $(fs_inherit_prop); do done # -# Inherit propertes in sub-datasets +# Inherit properties in sub-datasets # for ds in "$POOL/$FS/fs1" "$POOL/$FS/fs1/fs2" "$POOL/$FS/fs1/fclone" ; do for prop in $(fs_inherit_prop) ; do diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh index 57d58b9bab..594357dc4b 100755 --- a/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh @@ -21,14 +21,12 @@ # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2009, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2013, 2016, Delphix. All rights reserved. # Use is subject to license terms. # -# -# Copyright (c) 2013, 2016 by Delphix. All rights reserved. -# - +. $STF_SUITE/include/properties.shlib . $STF_SUITE/tests/functional/rsend/rsend.kshlib # @@ -39,7 +37,7 @@ # 1. Setting properties for all the filesystem and volumes randomly # 2. Backup all the data from POOL by send -R # 3. Restore all the data in POOL2 -# 4. Verify all the perperties in two pools are same +# 4. 
Verify all the properties in the two pools are the same # verify_runnable "global" @@ -118,12 +116,10 @@ for fs in "$POOL" "$POOL/pclone" "$POOL/$FS" "$POOL/$FS/fs1" \ "$POOL/$FS/fs1/fs2" "$POOL/$FS/fs1/fclone" ; do rand_set_prop $fs aclinherit "discard" "noallow" "secure" "passthrough" rand_set_prop $fs checksum "on" "off" "fletcher2" "fletcher4" "sha256" - rand_set_prop $fs acltype "off" "noacl" "posixacl" + rand_set_prop $fs acltype "off" "posix" "nfsv4" "noacl" "posixacl" rand_set_prop $fs atime "on" "off" rand_set_prop $fs checksum "on" "off" "fletcher2" "fletcher4" "sha256" - rand_set_prop $fs compression "on" "off" "lzjb" "gzip" \ - "gzip-1" "gzip-2" "gzip-3" "gzip-4" "gzip-5" "gzip-6" \ - "gzip-7" "gzip-8" "gzip-9" + rand_set_prop $fs compression "${compress_prop_vals[@]}" rand_set_prop $fs copies "1" "2" "3" rand_set_prop $fs devices "on" "off" rand_set_prop $fs exec "on" "off" @@ -132,15 +128,15 @@ for fs in "$POOL" "$POOL/pclone" "$POOL/$FS" "$POOL/$FS/fs1" \ rand_set_prop $fs dnodesize "legacy" "auto" "1k" "2k" "4k" "8k" "16k" rand_set_prop $fs setuid "on" "off" rand_set_prop $fs snapdir "hidden" "visible" - rand_set_prop $fs xattr "on" "off" + if ! is_freebsd; then + rand_set_prop $fs xattr "on" "off" + fi rand_set_prop $fs user:prop "aaa" "bbb" "23421" "()-+?" done for vol in "$POOL/vol" "$POOL/$FS/vol" ; do rand_set_prop $vol checksum "on" "off" "fletcher2" "fletcher4" "sha256" - rand_set_prop $vol compression "on" "off" "lzjb" "gzip" \ - "gzip-1" "gzip-2" "gzip-3" "gzip-4" "gzip-5" "gzip-6" \ - "gzip-7" "gzip-8" "gzip-9" + rand_set_prop $vol compression "${compress_prop_vals[@]}" rand_set_prop $vol readonly "on" "off" rand_set_prop $vol copies "1" "2" "3" rand_set_prop $vol user:prop "aaa" "bbb" "23421" "()-+?" 
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh new file mode 100755 index 0000000000..26573bfb59 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh @@ -0,0 +1,45 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014, 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# Description: +# Verify that error conditions don't cause panics in zfs send +# +# Strategy: +# 1. Perform a zfs incremental send from a bookmark that doesn't exist +# 2. Perform a zfs incremental replication send with incremental source +# same as target (#11121) +# + +verify_runnable "both" + +function cleanup +{ + rm -f $TEST_BASE_DIR/devnull +} + +log_onexit cleanup + +log_mustnot eval "zfs send -i \#bla $POOl/$FS@final > $TEST_BASE_DIR/devnull" + +log_must eval "zfs send -R -i snapA $POOL/vol@snapA 2>&1 " \ + "> $TEST_BASE_DIR/devnull | grep -q WARNING" + +log_pass "Ensure that error conditions cause appropriate failures." diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh index 60be67328e..cb68b1c3b2 100755 --- a/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh +++ b/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh @@ -25,7 +25,7 @@ # # Strategy: # 1. Bookmark a ZFS snapshot -# 2. Destroy the ZFS sanpshot +# 2. Destroy the ZFS snapshot # 3. Destroy the filesystem for the receive # 4. 
Verify receive of the full send stream # 5. Start an incremental ZFS send of the ZFS bookmark, redirect output to a diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh index 20f0bee155..c44985ae8c 100755 --- a/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh +++ b/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh @@ -25,7 +25,7 @@ # # Strategy: # 1. Destroy the filesystem for the receive -# 2. Unmount the source filsesystem +# 2. Unmount the source filesystem # 3. Start a full ZFS send, redirect output to a file # 4. Mess up the contents of the stream state file on disk # 5. Try ZFS receive, which should fail with a checksum mismatch error @@ -46,7 +46,7 @@ log_onexit resume_cleanup $sendfs $streamfs test_fs_setup $sendfs $recvfs $streamfs log_must zfs unmount -f $sendfs -resume_test "zfs send $sendfs" $streamfs $recvfs +resume_test "zfs send $sendfs" $streamfs $recvfs 0 file_check $sendfs $recvfs log_pass "Verify resumability of a full ZFS send/receive with the source " \ diff --git a/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh b/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh new file mode 100755 index 0000000000..483efcc605 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# Description: +# Verify that send -L can be changed to on in an incremental. 
+# Verify that send -L can not be turned off in an incremental. +# + +function cleanup +{ + log_must_busy zfs destroy -r $TESTPOOL/fs + log_must_busy zfs destroy -r $TESTPOOL/recv +} + +verify_runnable "both" + +log_assert "Verify toggling send -L works as expected" +log_onexit cleanup + +log_must zfs create -o compression=on -o recordsize=1m $TESTPOOL/fs + +log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file bs=1024 count=1500 + +log_must zfs snapshot $TESTPOOL/fs@snap + +log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file bs=1024 count=1500 conv=notrunc seek=2048 + +log_must zfs snapshot $TESTPOOL/fs@snap2 + +log_must zfs create $TESTPOOL/recv + +log_must zfs send -c $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/noL-noL +log_must zfs send -c -i @snap $TESTPOOL/fs@snap2| zfs recv $TESTPOOL/recv/noL-noL +log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/noL-noL/file + +log_must zfs send -c -L $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/L-L +log_must zfs send -c -L -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/L-L +log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/L-L/file + +log_must zfs send -c $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/noL-L +log_must zfs send -c -L -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/noL-L +log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/noL-L/file + +log_must zfs send -c -L $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/L-noL +log_mustnot zfs send -c -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/L-noL +log_must diff /$TESTPOOL/fs/.zfs/snapshot/snap/file /$TESTPOOL/recv/L-noL/file + +log_pass "Verify toggling send -L works as expected" diff --git a/tests/zfs-tests/tests/functional/rsend/send-cD.ksh b/tests/zfs-tests/tests/functional/rsend/send-cD.ksh deleted file mode 100755 index ceface9dbc..0000000000 --- a/tests/zfs-tests/tests/functional/rsend/send-cD.ksh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/ksh -p - -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License 
("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright (c) 2015, 2018 by Delphix. All rights reserved. -# - -. $STF_SUITE/tests/functional/rsend/rsend.kshlib - -# -# Description: -# Verify that the -c and -D flags do not interfere with each other. -# -# Strategy: -# 1. Write unique data to a filesystem and create a compressed, deduplicated -# full stream. -# 2. Verify that the stream and send dataset show the same size -# 3. Make several copies of the original data, and create both full and -# incremental compressed, deduplicated send streams -# 4. Verify the full stream is no bigger than the stream from step 1 -# 5. Verify the streams can be received correctly. -# - -verify_runnable "both" - -log_assert "Verify that the -c and -D flags do not interfere with each other" -log_onexit cleanup_pool $POOL2 - -typeset sendfs=$POOL2/sendfs -typeset recvfs=$POOL2/recvfs -typeset stream0=$BACKDIR/stream.0 -typeset stream1=$BACKDIR/stream.1 -typeset inc=$BACKDIR/stream.inc - -log_must zfs create -o compress=lz4 $sendfs -log_must zfs create -o compress=lz4 $recvfs -typeset dir=$(get_prop mountpoint $sendfs) -# Don't use write_compressible: we want compressible but undedupable data here. 
-log_must eval "dd if=/dev/urandom bs=1024k count=4 | base64 >$dir/file" -log_must zfs snapshot $sendfs@snap0 -log_must eval "zfs send -D -c $sendfs@snap0 >$stream0" - -# The stream size should match at this point because the data is all unique -verify_stream_size $stream0 $sendfs - -for i in {0..3}; do - log_must cp $dir/file $dir/file.$i -done -log_must zfs snapshot $sendfs@snap1 - -# The stream sizes should match, since the second stream contains no new blocks -log_must eval "zfs send -D -c $sendfs@snap1 >$stream1" -typeset size0=$(stat -c %s $stream0) -typeset size1=$(stat -c %s $stream1) -within_percent $size0 $size1 90 || log_fail "$size0 and $size1" - -# Finally, make sure the receive works correctly. -log_must eval "zfs send -D -c -i snap0 $sendfs@snap1 >$inc" -log_must eval "zfs recv -d $recvfs <$stream0" -log_must eval "zfs recv -d $recvfs <$inc" -cmp_ds_cont $sendfs $recvfs - -# The size of the incremental should be the same as the initial send. -typeset size2=$(stat -c %s $inc) -within_percent $size0 $size2 90 || log_fail "$size0 and $size1" - -log_pass "The -c and -D flags do not interfere with each other" diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh index 1983a3ea18..3dce217d89 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh @@ -53,7 +53,7 @@ for recsize in "${recsize_prop_vals[@]}"; do # For lz4, this method works for blocks up to 16k, but not larger [[ $recsize -eq $((32 * 1024)) ]] && break - if is_linux; then + if is_linux || is_freebsd; then log_must truncate -s $recsize $dir/$recsize log_must dd if=/dev/urandom of=$dir/$recsize \ seek=$((recsize - 8)) bs=1 count=8 conv=notrunc @@ -63,17 +63,17 @@ for recsize in "${recsize_prop_vals[@]}"; do fi done -# Generate the streams and zstreamdump output. +# Generate the streams and zstream dump output. 
log_must zfs snapshot $sendfs@now log_must eval "zfs send -c $sendfs@now >$stream" -log_must eval "zstreamdump -v <$stream >$dump" +log_must eval "zstream dump -v <$stream >$dump" log_must eval "zfs recv -d $recvfs <$stream" cmp_ds_cont $sendfs $recvfs verify_stream_size $stream $sendfs log_mustnot stream_has_features $stream embed_data log_must eval "zfs send -c -e $sendfs@now >$stream2" -log_must eval "zstreamdump -v <$stream2 >$dump2" +log_must eval "zstream dump -v <$stream2 >$dump2" log_must eval "zfs recv -d $recvfs2 <$stream2" cmp_ds_cont $sendfs $recvfs2 verify_stream_size $stream2 $sendfs @@ -101,9 +101,9 @@ for recsize in "${recsize_prop_vals[@]}"; do log_fail "Obj $recv2_obj not embedded in $recvfs2" grep -q "WRITE_EMBEDDED object = $send_obj offset = 0" $dump && \ - log_fail "Obj $obj embedded in zstreamdump output" + log_fail "Obj $obj embedded in zstream dump output" grep -q "WRITE_EMBEDDED object = $send_obj offset = 0" $dump2 || \ - log_fail "Obj $obj not embedded in zstreamdump output" + log_fail "Obj $obj not embedded in zstream dump output" done log_pass "Compressed streams can contain embedded blocks." 
diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh index 666e11f702..bc706bab25 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh @@ -52,8 +52,8 @@ for compress in off gzip; do poolexists $POOL3 && destroy_pool $POOL3 log_must zpool create $pool_opt $POOL3 $DISK3 - datasetexists $send_ds && log_must_busy zfs destroy -r $send_ds - datasetexists $recv_ds && log_must_busy zfs destroy -r $recv_ds + datasetexists $send_ds && destroy_dataset $send_ds -r + datasetexists $recv_ds && destroy_dataset $recv_ds -r log_must zfs create -o compress=$compress $send_ds typeset dir=$(get_prop mountpoint $send_ds) diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh index 614394e526..15873ed12f 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh @@ -12,10 +12,11 @@ # # -# Copyright (c) 2015 by Delphix. All rights reserved. +# Copyright (c) 2015, Delphix. All rights reserved. # . $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/include/properties.shlib # # Description: @@ -34,7 +35,6 @@ verify_runnable "both" log_assert "Verify compressed streams are rejected if incompatible." 
-typeset compress_types="off gzip lz4" typeset send_ds=$POOL2/testds typeset recv_ds=$POOL3/testds @@ -49,9 +49,9 @@ log_onexit cleanup datasetexists $POOL3 && log_must zpool destroy $POOL3 log_must zpool create -d $POOL3 $DISK3 -for compress in $compress_types; do - datasetexists $send_ds && log_must_busy zfs destroy -r $send_ds - datasetexists $recv_ds && log_must_busy zfs destroy -r $recv_ds +for compress in "${compress_prop_vals[@]}"; do + datasetexists $send_ds && destroy_dataset $send_ds -r + datasetexists $recv_ds && destroy_dataset $recv_ds -r log_must zfs create -o compress=$compress $send_ds typeset dir=$(get_prop mountpoint $send_ds) diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_resume.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_resume.ksh index d8d7c40e49..05ba5ed244 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_resume.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_resume.ksh @@ -28,7 +28,7 @@ # 2. Mess up the contents of the stream state file on disk # 3. Try ZFS receive, which should fail with a checksum mismatch error # 4. ZFS send to the stream state file again using the receive_resume_token -# 5. ZFS receieve and verify the receive completes successfully +# 5. ZFS receive and verify the receive completes successfully # 6. Repeat steps on an incremental ZFS send # diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh index 130bc3dbc9..056fc2cc25 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh @@ -12,10 +12,11 @@ # # -# Copyright (c) 2015 by Delphix. All rights reserved. +# Copyright (c) 2015, Delphix. All rights reserved. # . $STF_SUITE/tests/functional/rsend/rsend.kshlib +. 
$STF_SUITE/include/properties.shlib # # Description: @@ -28,7 +29,6 @@ # verify_runnable "both" -typeset compress_types="off gzip lz4" typeset send_ds="$POOL2/testfs" typeset send_vol="$POOL2/vol" typeset send_voldev="$ZVOL_DEVDIR/$POOL2/vol" @@ -40,7 +40,12 @@ function get_estimated_size { typeset cmd=$1 typeset ds=${cmd##* } - typeset tmpfile=$(mktemp -p $BACKDIR) + if is_freebsd; then + mkdir -p $BACKDIR + typeset tmpfile=$(TMPDIR=$BACKDIR mktemp) + else + typeset tmpfile=$(mktemp -p $BACKDIR) + fi eval "$cmd >$tmpfile" [[ $? -eq 0 ]] || log_fail "get_estimated_size: $cmd" @@ -55,12 +60,12 @@ log_onexit cleanup_pool $POOL2 write_compressible $BACKDIR ${megs}m -for compress in $compress_types; do - datasetexists $send_ds && log_must_busy zfs destroy -r $send_ds - datasetexists $send_vol && log_must_busy zfs destroy -r $send_vol +for compress in "${compress_prop_vals[@]}"; do + datasetexists $send_ds && destroy_dataset $send_ds -r + datasetexists $send_vol && destroy_dataset $send_vol -r log_must zfs create -o compress=$compress $send_ds log_must zfs create -V 1g -o compress=$compress $send_vol - block_device_wait + block_device_wait $send_voldev typeset dir=$(get_prop mountpoint $send_ds) log_must cp $file $dir @@ -89,4 +94,4 @@ for compress in $compress_types; do "$vol_csize and $vol_refer differed by too much" done -log_pass "The the stream size given by -P accounts for compressed send." +log_pass "The stream size given by -P accounts for compressed send." diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh index 9b886f8157..b7d978624f 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh @@ -12,7 +12,8 @@ # # -# Copyright (c) 2015 by Delphix. All rights reserved. +# Copyright (c) 2015, Delphix. All rights reserved. +# Copyright (c) 2019, Kjeld Schouten-Lebbing. All rights reserved. 
# . $STF_SUITE/tests/functional/rsend/rsend.kshlib @@ -37,7 +38,7 @@ log_onexit cleanup_pool $POOL2 typeset sendfs=$POOL2/$FS typeset megs=128 -for prop in $(get_rand_compress_any 6); do +for prop in "${compress_prop_vals[@]}"; do for compressible in 'yes' 'no'; do log_must zfs create -o compress=$prop $sendfs diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh index caaf07ccb7..988ed91b99 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh @@ -49,8 +49,8 @@ typeset megs=8 log_must zfs create -V 256m -o compress=lz4 $vol write_compressible $BACKDIR ${megs}m 2 -md5_1=$(md5sum $data1 | awk '{print $1}') -md5_2=$(md5sum $data2 | awk '{print $1}') +md5_1=$(md5digest $data1) +md5_2=$(md5digest $data2) log_must dd if=$data1 of=$voldev bs=1024k log_must zfs snapshot $vol@snap @@ -60,8 +60,7 @@ log_must eval "zfs recv -d $POOL2 <$BACKDIR/full" verify_stream_size $BACKDIR/full $vol verify_stream_size $BACKDIR/full $vol2 -md5=$(dd if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5sum | \ - awk '{print $1}') +md5=$(dd if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5digest) [[ $md5 = $md5_1 ]] || log_fail "md5 mismatch: $md5 != $md5_1" # Repeat, for an incremental send @@ -73,8 +72,7 @@ log_must eval "zfs recv -d $POOL2 <$BACKDIR/inc" verify_stream_size $BACKDIR/inc $vol 90 $vol@snap verify_stream_size $BACKDIR/inc $vol2 90 $vol2@snap -md5=$(dd skip=$megs if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5sum | \ - awk '{print $1}') +md5=$(dd skip=$megs if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5digest) [[ $md5 = $md5_2 ]] || log_fail "md5 mismatch: $md5 != $md5_2" log_pass "Verify compressed send works with volumes" diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh index 52abfe7edc..5b9939c6a6 100755 --- 
a/tests/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh @@ -13,6 +13,7 @@ # # Copyright (c) 2015 by Delphix. All rights reserved. +# Copyright (c) 2020 by Datto, Inc. All rights reserved. # . $STF_SUITE/tests/functional/rsend/rsend.kshlib @@ -20,29 +21,35 @@ # # Description: -# Verify compression features show up in zstreamdump +# Verify compression features show up in zstream dump # # Strategy: # 1. Create a full compressed send stream -# 2. Verify zstreamdump shows this stream has the relevant features -# 3. Verify zstreamdump's accounting of logical and compressed size is correct +# 2. Verify zstream dump shows this stream has the relevant features +# 3. Verify zstream dump's accounting of logical and compressed size is correct +# 4. Verify the toname from a resume token +# 5. Verify it fails with corrupted resume token +# 6. Verify it fails with missing resume token # verify_runnable "both" -log_assert "Verify zstreamdump correctly interprets compressed send streams." +log_assert "Verify zstream dump correctly interprets compressed send streams." log_onexit cleanup_pool $POOL2 typeset sendfs=$POOL2/fs +typeset streamfs=$POOL2/fs2 +typeset recvfs=$POOL2/fs3 log_must zfs create -o compress=lz4 $sendfs +log_must zfs create -o compress=lz4 $streamfs typeset dir=$(get_prop mountpoint $sendfs) write_compressible $dir 16m log_must zfs snapshot $sendfs@full log_must eval "zfs send -c $sendfs@full >$BACKDIR/full" log_must stream_has_features $BACKDIR/full lz4 compressed -cat $BACKDIR/full | zstreamdump -v > $BACKDIR/dump.out +cat $BACKDIR/full | zstream dump -v > $BACKDIR/dump.out lsize=$(awk '/^WRITE [^0]/ {lsize += $24} END {printf("%d", lsize)}' \ $BACKDIR/dump.out) @@ -56,4 +63,13 @@ csize_prop=$(get_prop used $sendfs) within_percent $csize $csize_prop 90 || log_fail \ "$csize and $csize_prop differed by too much" -log_pass "zstreamdump correctly interprets compressed send streams." 
+x=$(get_resume_token "zfs send -c $sendfs@full" $streamfs $recvfs) +resume_token=$(cat /$streamfs/resume_token) +to_name_fs=$sendfs +log_must eval "zstream token $resume_token | grep $to_name_fs" + +bad_resume_token="1-1162e8285b-100789c6360" +log_mustnot eval "zstream token $bad_resume_token 2>&1" +log_mustnot eval "zstream token 2>&1" + +log_pass "zstream dump correctly interprets compressed send streams." diff --git a/tests/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh b/tests/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh index 84c0a5e3c3..25ad8e0820 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh @@ -55,8 +55,8 @@ verify_runnable "both" function cleanup { - datasetexists $TESTPOOL/128k && log_must_busy zfs destroy $TESTPOOL/128k - datasetexists $TESTPOOL/1m && log_must_busy zfs destroy $TESTPOOL/1m + datasetexists $TESTPOOL/128k && destroy_dataset $TESTPOOL/128k + datasetexists $TESTPOOL/1m && destroy_dataset $TESTPOOL/1m cleanup_pool $POOL2 destroy_pool $POOL3 } @@ -72,8 +72,12 @@ function check_recsize [[ -f $file ]] || log_fail "file '$file' doesn't exist" typeset read_recsize=$(get_prop recsize $recv_ds) - typeset read_file_bs=$(stat $file | sed -n \ - 's/.*IO Block: \([0-9]*\).*/\1/p') + if is_freebsd; then + typeset read_file_bs=$(stat -f "%k" $file) + else + typeset read_file_bs=$(stat $file | sed -n \ + 's/.*IO Block: \([0-9]*\).*/\1/p') + fi [[ $read_recsize = $expected_recsize ]] || log_fail \ "read_recsize: $read_recsize expected_recsize: $expected_recsize" @@ -130,7 +134,7 @@ function check [[ -f $stream ]] && log_must rm $stream log_must eval "zfs send $flags $send_snap >$stream" $verify eval "zfs recv $recv_ds <$stream" - typeset stream_size=$(cat $stream | zstreamdump | sed -n \ + typeset stream_size=$(cat $stream | zstream dump | sed -n \ 's/ Total write size = \(.*\) (0x.*)/\1/p') # diff --git 
a/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh b/tests/zfs-tests/tests/functional/rsend/send-wR_encrypted_zvol.ksh similarity index 66% rename from tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh rename to tests/zfs-tests/tests/functional/rsend/send-wR_encrypted_zvol.ksh index 49b846e9c3..b95fc3da30 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-wDR_encrypted_zvol.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-wR_encrypted_zvol.ksh @@ -16,20 +16,21 @@ # # Copyright (c) 2018 by Datto Inc. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . $STF_SUITE/tests/functional/rsend/rsend.kshlib # # DESCRIPTION: -# Verify that zvols with dedup=on and encryption=on can be sent and received -# with a deduplicated raw send stream. +# Verify that zvols with encryption=on can be sent and received with a raw +# send stream. # # STRATEGY: -# 1. Create a zvol with dedup and encryption on and put a filesystem on it +# 1. Create a zvol with encryption on and put a filesystem on it # 2. Copy a file into the zvol a few times and take a snapshot # 3. Repeat step 2 a few times to create more snapshots -# 4. Send all snapshots in a recursive, raw, deduplicated send stream +# 4. Send all snapshots in a recursive, raw send stream # 5. 
Mount the received zvol and verify that all of the data there is correct # @@ -37,8 +38,8 @@ verify_runnable "both" function cleanup { - ismounted $recvmnt ext4 && log_must umount $recvmnt - ismounted $mntpnt ext4 && log_must umount $mntpnt + ismounted $recvmnt $fstype && log_must umount $recvmnt + ismounted $mntpnt $fstype && log_must umount $mntpnt [[ -d $recvmnt ]] && log_must rm -rf $keyfile [[ -d $mntpnt ]] && log_must rm -rf $keyfile destroy_dataset $TESTPOOL/recv "-r" @@ -48,7 +49,7 @@ function cleanup } log_onexit cleanup -log_assert "Verify zfs can receive raw, recursive, and deduplicated send streams" +log_assert "Verify zfs can receive raw, recursive send streams" typeset keyfile=/$TESTPOOL/pkey typeset snap_count=5 @@ -57,14 +58,26 @@ typeset mntpnt=$TESTDIR/$TESTVOL typeset recvdev=$ZVOL_DEVDIR/$TESTPOOL/recv typeset recvmnt=$TESTDIR/recvmnt typeset sendfile=$TESTDIR/sendfile +typeset fstype=none log_must eval "echo 'password' > $keyfile" log_must zfs create -o dedup=on -o encryption=on -o keyformat=passphrase \ -o keylocation=file://$keyfile -V 128M $TESTPOOL/$TESTVOL -log_must block_device_wait +block_device_wait -log_must eval "echo 'y' | newfs -t ext4 -v $zdev" +if is_linux; then + # ext4 only supported on Linux + log_must new_fs -t ext4 $zdev + fstype=ext4 + typeset remount_ro="-o remount,ro" + typeset remount_rw="-o remount,rw" +else + log_must new_fs $zdev + fstype=$NEWFS_DEFAULT_FS + typeset remount_ro="-ur" + typeset remount_rw="-uw" +fi log_must mkdir -p $mntpnt log_must mkdir -p $recvmnt log_must mount $zdev $mntpnt @@ -76,18 +89,20 @@ for ((i = 1; i <= $snap_count; i++)); do done log_must sync + log_must mount $remount_ro $zdev $mntpnt log_must zfs snap $TESTPOOL/$TESTVOL@snap$i + log_must mount $remount_rw $zdev $mntpnt done -log_must eval "zfs send -wDR $TESTPOOL/$TESTVOL@snap$snap_count > $sendfile" +log_must eval "zfs send -wR $TESTPOOL/$TESTVOL@snap$snap_count > $sendfile" log_must eval "zfs recv $TESTPOOL/recv < $sendfile" log_must 
zfs load-key $TESTPOOL/recv -log_must block_device_wait +block_device_wait log_must mount $recvdev $recvmnt -md5_1=$(cat $mntpnt/* | md5sum | awk '{print $1}') -md5_2=$(cat $recvmnt/* | md5sum | awk '{print $1}') +md5_1=$(cat $mntpnt/* | md5digest) +md5_2=$(cat $recvmnt/* | md5digest) [[ "$md5_1" == "$md5_2" ]] || log_fail "md5 mismatch: $md5_1 != $md5_2" -log_pass "zfs can receive raw, recursive, and deduplicated send streams" +log_pass "zfs can receive raw, recursive send streams" diff --git a/tests/zfs-tests/tests/functional/rsend/send_doall.ksh b/tests/zfs-tests/tests/functional/rsend/send_doall.ksh new file mode 100755 index 0000000000..e5c3490b32 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/send_doall.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# Description: +# Verify send_doall stream is properly received +# +# Strategy: +# 1) Create a set of snapshots. +# 2) Send these snapshots (from origin to the last one) to a file using send_doall. +# 3) Receive the file to newfs to test if the stream is properly handled. +# + +verify_runnable "both" + +log_assert "Verify send_doall stream is correct" + +function cleanup +{ + rm -f $BACKDIR/fs@* + destroy_dataset $POOL/fs "-rR" + destroy_dataset $POOL/newfs "-rR" +} + +log_onexit cleanup + +log_must zfs create $POOL/fs +log_must zfs create $POOL/fs/child + +# Create 3 files and a snapshot between each file creation. 
+for i in {1..3}; do + file="/$POOL/fs/file$i" + log_must mkfile 16384 $file + + file="/$POOL/fs/child/file$i" + log_must mkfile 16384 $file + + log_must zfs snapshot -r $POOL/fs@snap$i +done + +# Send the last snapshot with send_doall and receive it into a new dataset. +log_must eval "send_doall $POOL/fs@snap3 >$BACKDIR/fs@snap3" +log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs@snap3" + +zfs list $POOL/newfs/child +if [[ $? -eq 0 ]]; then + log_fail "Children dataset should not have been received" +fi + +log_pass "Verify send_doall stream is correct" diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh index 6288178f89..370f5382eb 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh @@ -46,9 +46,9 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r datasetexists $TESTPOOL/recv && \ - log_must zfs destroy -r $TESTPOOL/recv + destroy_dataset $TESTPOOL/recv -r [[ -f $keyfile ]] && log_must rm $keyfile [[ -f $sendfile ]] && log_must rm $sendfile } @@ -84,11 +84,11 @@ log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrsadir log_must zfs set xattr=sa $TESTPOOL/$TESTFS2 log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrsadir -# ZoL issue #7432 +# OpenZFS issue #7432 log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2 log_must touch /$TESTPOOL/$TESTFS2/attrs log_must eval "python -c 'print \"a\" * 4096' | \ - attr -s bigval /$TESTPOOL/$TESTFS2/attrs" + set_xattr_stdin bigval /$TESTPOOL/$TESTFS2/attrs" log_must zfs set compression=off xattr=on $TESTPOOL/$TESTFS2 log_must zfs snapshot $TESTPOOL/$TESTFS2@snap1 diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh index a216f1c5ff..793904db91 
100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh @@ -58,7 +58,8 @@ log_assert "'zfs recv' must properly handle encryption properties" typeset keyfile=/$TESTPOOL/pkey typeset sendfile=/$TESTPOOL/sendfile -typeset snap=$TESTPOOL/ds@snap +typeset snap=$TESTPOOL/ds@snap1 +typeset snap2=$TESTPOOL/ds@snap2 typeset esnap=$TESTPOOL/crypt@snap1 typeset esnap2=$TESTPOOL/crypt@snap2 @@ -75,9 +76,10 @@ log_must zfs create -o keyformat=passphrase -o keylocation=file://$keyfile \ log_must mkfile 1M /$TESTPOOL/ds/$TESTFILE0 log_must cp /$TESTPOOL/ds/$TESTFILE0 /$TESTPOOL/crypt/$TESTFILE0 -typeset cksum=$(md5sum /$TESTPOOL/ds/$TESTFILE0 | awk '{ print $1 }') +typeset cksum=$(md5digest /$TESTPOOL/ds/$TESTFILE0) log_must zfs snap -r $snap +log_must zfs snap -r $snap2 log_must zfs snap -r $esnap log_must zfs snap -r $esnap2 @@ -122,12 +124,12 @@ ds=$TESTPOOL/recv log_must eval "zfs send $snap > $sendfile" log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ "-o keylocation=file://$keyfile $ds < $sendfile" -log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" +log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" log_must test "$(get_prop 'keylocation' $ds)" == "file://$keyfile" log_must test "$(get_prop 'mounted' $ds)" == "yes" -recv_cksum=$(md5sum /$ds/$TESTFILE0 | awk '{ print $1 }') +recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds @@ -138,12 +140,12 @@ ds=$TESTPOOL/recv log_must eval "zfs send -p $snap > $sendfile" log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ "-o keylocation=file://$keyfile $ds < $sendfile" -log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" +log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" log_must test "$(get_prop 
'encryptionroot' $ds)" == "$ds" log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" log_must test "$(get_prop 'keylocation' $ds)" == "file://$keyfile" log_must test "$(get_prop 'mounted' $ds)" == "yes" -recv_cksum=$(md5sum /$ds/$TESTFILE0 | awk '{ print $1 }') +recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds @@ -156,12 +158,12 @@ ds=$TESTPOOL/recv log_must eval "zfs send -R $snap > $sendfile" log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ "-o keylocation=file://$keyfile $ds < $sendfile" -log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" +log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" log_must test "$(get_prop 'keylocation' $ds)" == "file://$keyfile" log_must test "$(get_prop 'mounted' $ds)" == "yes" -recv_cksum=$(md5sum /$ds/$TESTFILE0 | awk '{ print $1 }') +recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds @@ -172,10 +174,10 @@ ds=$TESTPOOL/crypt/recv log_must eval "zfs send -p $snap > $sendfile" log_must eval "zfs recv -x encryption $ds < $sendfile" log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" -log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" +log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" log_must test "$(get_prop 'mounted' $ds)" == "yes" -recv_cksum=$(md5sum /$ds/$TESTFILE0 | awk '{ print $1 }') +recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds @@ -186,10 +188,24 @@ ds=$TESTPOOL/crypt/recv log_must eval "zfs send -R $snap > $sendfile" log_must eval "zfs recv -x encryption $ds < $sendfile" log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" -log_must test "$(get_prop 'encryption' $ds)" == 
"aes-256-ccm" +log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" log_must test "$(get_prop 'mounted' $ds)" == "yes" -recv_cksum=$(md5sum /$ds/$TESTFILE0 | awk '{ print $1 }') +recv_cksum=$(md5digest /$ds/$TESTFILE0) +log_must test "$recv_cksum" == "$cksum" +log_must zfs destroy -r $ds + +# Test that we can override an unencrypted, incremental, recursive stream's +# encryption settings, receiving all datasets as encrypted children. +log_note "Must be able to receive recursive stream to encrypted child" +ds=$TESTPOOL/crypt/recv +log_must eval "zfs send -R $snap2 > $sendfile" +log_must eval "zfs recv -x encryption $ds < $sendfile" +log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" +log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" +log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" +log_must test "$(get_prop 'mounted' $ds)" == "yes" +recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_truncated_files.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_truncated_files.ksh index d701bcecb9..5760bf9b90 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_truncated_files.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_truncated_files.ksh @@ -42,9 +42,9 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r datasetexists $TESTPOOL/recv && \ - log_must zfs destroy -r $TESTPOOL/recv + destroy_dataset $TESTPOOL/recv -r [[ -f $keyfile ]] && log_must rm $keyfile [[ -f $sendfile ]] && log_must rm $sendfile } @@ -52,8 +52,16 @@ log_onexit cleanup function recursive_cksum { - find $1 -type f -exec sha256sum {} \; | \ - sort -k 2 | awk '{ print $1 }' | sha256sum + case "$(uname)" in + FreeBSD) + find 
$1 -type f -exec sha256 -q {} \; | \ + sort | sha256digest + ;; + *) + find $1 -type f -exec sha256sum {} \; | \ + sort -k 2 | awk '{ print $1 }' | sha256digest + ;; + esac } log_assert "Verify 'zfs send -w' works with many different file layouts" diff --git a/tests/zfs-tests/tests/functional/rsend/send_freeobjects.ksh b/tests/zfs-tests/tests/functional/rsend/send_freeobjects.ksh index 6533352a9a..925f667ee9 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_freeobjects.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_freeobjects.ksh @@ -21,7 +21,7 @@ # # Description: # Verify FREEOBJECTS record frees sequential objects (See -# https://github.com/zfsonlinux/zfs/issues/6694) +# https://github.com/openzfs/zfs/issues/6694) # # Strategy: # 1. Create three files with sequential object numbers, f1 f2 and f3 diff --git a/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh index c2b5ff7a05..1dfa97e773 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh @@ -53,7 +53,7 @@ function cleanup { cleanup_pool $sendpool cleanup_pool $recvpool - set_tunable64 send_holes_without_birth_time 1 + set_tunable64 SEND_HOLES_WITHOUT_BIRTH_TIME 1 } function send_and_verify @@ -72,7 +72,7 @@ function send_and_verify # to be re-enabled for this test case to verify correctness. Once we're # comfortable that all hole_birth bugs has been resolved this behavior may # be re-enabled by default. -log_must set_tunable64 send_holes_without_birth_time 0 +log_must set_tunable64 SEND_HOLES_WITHOUT_BIRTH_TIME 0 # Incremental send truncating the file and adding new data. 
log_must zfs create -o recordsize=4k $sendfs @@ -81,7 +81,7 @@ log_must truncate -s 1G /$sendfs/file1 log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=11264 seek=1152 log_must zfs snapshot $sendfs@snap1 -log_must truncate -s 4194304 /$sendfs/file1 +log_must truncate -s 4M /$sendfs/file1 log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=152 seek=384 \ conv=notrunc log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=10 seek=1408 \ diff --git a/tests/zfs-tests/tests/functional/rsend/send_invalid.ksh b/tests/zfs-tests/tests/functional/rsend/send_invalid.ksh new file mode 100755 index 0000000000..2ce7ee4a08 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/send_invalid.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Portions Copyright 2020 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# Description: +# Verify that send with invalid options will fail gracefully. +# +# Strategy: +# 1. Perform zfs send on the cli with the order of the snapshots reversed +# 2. Perform zfs send using libzfs with the order of the snapshots reversed +# + +verify_runnable "both" + +log_assert "Verify that send with invalid options will fail gracefully." 
+ +function cleanup +{ + datasetexists $testfs && destroy_dataset $testfs -r +} +log_onexit cleanup + +testfs=$POOL/fs + +log_must zfs create $testfs +log_must zfs snap $testfs@snap0 +log_must zfs snap $testfs@snap1 + +# Test bad send with the CLI +log_mustnot eval "zfs send -i $testfs@snap1 $testfs@snap0 >$TEST_BASE_DIR/devnull" + +# Test bad send with libzfs/libzfs_core +log_must badsend $testfs@snap0 $testfs@snap1 + +log_pass "Send with invalid options fails gracefully." diff --git a/tests/zfs-tests/tests/functional/rsend/send_mixed_raw.ksh b/tests/zfs-tests/tests/functional/rsend/send_mixed_raw.ksh index eea535af11..59b08ccf72 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_mixed_raw.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_mixed_raw.ksh @@ -49,11 +49,11 @@ verify_runnable "both" function cleanup { datasetexists $TESTPOOL/$TESTFS3 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS3 + destroy_dataset $TESTPOOL/$TESTFS3 -r datasetexists $TESTPOOL/$TESTFS2 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS2 + destroy_dataset $TESTPOOL/$TESTFS2 -r datasetexists $TESTPOOL/$TESTFS1 && \ - log_must zfs destroy -r $TESTPOOL/$TESTFS1 + destroy_dataset $TESTPOOL/$TESTFS1 -r } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/rsend/send_partial_dataset.ksh b/tests/zfs-tests/tests/functional/rsend/send_partial_dataset.ksh new file mode 100755 index 0000000000..c390327a5b --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/send_partial_dataset.ksh @@ -0,0 +1,110 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 Datto Inc. 
+# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# Description: +# Verify that a partially received dataset can be sent with +# 'zfs send --saved'. +# +# Strategy: +# 1. Setup a pool with partially received filesystem +# 2. Perform saved send without incremental +# 3. Perform saved send with incremental +# 4. Perform saved send with incremental, resuming from a token +# 5. Perform negative tests for invalid command inputs +# + +verify_runnable "both" + +log_assert "Verify that a partially received dataset can be sent with " \ + "'zfs send --saved'." + +function cleanup +{ + destroy_dataset $POOL/testfs2 "-r" + destroy_dataset $POOL/stream "-r" + destroy_dataset $POOL/recvfs "-r" + destroy_dataset $POOL/partialfs "-r" +} +log_onexit cleanup + +log_must zfs create $POOL/testfs2 +log_must zfs create $POOL/stream +mntpnt=$(get_prop mountpoint $POOL/testfs2) + +# Setup a pool with partially received filesystems +log_must mkfile 1m $mntpnt/filea +log_must zfs snap $POOL/testfs2@a +log_must mkfile 1m $mntpnt/fileb +log_must zfs snap $POOL/testfs2@b +log_must eval "zfs send $POOL/testfs2@a | zfs recv $POOL/recvfs" +log_must eval "zfs send -i $POOL/testfs2@a $POOL/testfs2@b > " \ + "/$POOL/stream/inc.send" +log_must eval "zfs send $POOL/testfs2@b > /$POOL/stream/full.send" +mess_send_file /$POOL/stream/full.send +mess_send_file /$POOL/stream/inc.send +log_mustnot zfs recv -s $POOL/recvfullfs < /$POOL/stream/full.send +log_mustnot zfs recv -s $POOL/recvfs < /$POOL/stream/inc.send + +# Perform saved send without incremental +log_mustnot eval "zfs send --saved $POOL/recvfullfs | zfs recv -s " \ + "$POOL/partialfs" +token=$(zfs get -Hp -o value receive_resume_token $POOL/partialfs) +log_must eval "zfs send -t $token | zfs recv -s $POOL/partialfs" +file_check $POOL/recvfullfs $POOL/partialfs +log_must zfs destroy -r $POOL/partialfs + +# Perform saved send with incremental 
+log_must eval "zfs send $POOL/recvfs@a | zfs recv $POOL/partialfs" +log_mustnot eval "zfs send --saved $POOL/recvfs | " \ + "zfs recv -s $POOL/partialfs" +token=$(zfs get -Hp -o value receive_resume_token $POOL/partialfs) +log_must eval "zfs send -t $token | zfs recv -s $POOL/partialfs" +file_check $POOL/recvfs $POOL/partialfs +log_must zfs destroy -r $POOL/partialfs + +# Perform saved send with incremental, resuming from token +log_must eval "zfs send $POOL/recvfs@a | zfs recv $POOL/partialfs" +log_must eval "zfs send --saved $POOL/recvfs > " \ + "/$POOL/stream/partial.send" +mess_send_file /$POOL/stream/partial.send +log_mustnot zfs recv -s $POOL/partialfs < /$POOL/stream/partial.send +token=$(zfs get -Hp -o value receive_resume_token $POOL/partialfs) +log_must eval "zfs send -t $token | zfs recv -s $POOL/partialfs" +file_check $POOL/recvfs $POOL/partialfs + +# Perform negative tests for invalid command inputs +set -A badargs \ + "" \ + "$POOL/recvfs@a" \ + "-i $POOL/recvfs@a $POOL/recvfs@b" \ + "-R $POOL/recvfs" \ + "-p $POOL/recvfs" \ + "-I $POOL/recvfs" \ + "-h $POOL/recvfs" + +while (( i < ${#badargs[*]} )) +do + log_mustnot eval "zfs send --saved ${badargs[i]} >$TEST_BASE_DIR/devnull" + (( i = i + 1 )) +done + +log_pass "A partially received dataset can be sent with 'zfs send --saved'." 
diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh index 12a72fa092..551ed15db2 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh @@ -49,13 +49,8 @@ function cleanup rm -f $BACKDIR/fs-dn-2k rm -f $BACKDIR/fs-attr - if datasetexists $POOL/fs ; then - log_must zfs destroy -rR $POOL/fs - fi - - if datasetexists $POOL/newfs ; then - log_must zfs destroy -rR $POOL/newfs - fi + datasetexists $POOL/fs && destroy_dataset $POOL/fs -rR + datasetexists $POOL/newfs && destroy_dataset $POOL/newfs -rR } log_onexit cleanup @@ -93,7 +88,8 @@ log_must zfs snapshot $POOL/fs@c # 4. Create an empty file and add xattrs to it to exercise reclaiming a # dnode that requires more than 1 slot for its bonus buffer (Zol #7433) log_must zfs set compression=on xattr=sa $POOL/fs -log_must eval "python -c 'print \"a\" * 512' | attr -s bigval /$POOL/fs/attrs" +log_must eval "python -c 'print \"a\" * 512' | + set_xattr_stdin bigval /$POOL/fs/attrs" log_must zfs snapshot $POOL/fs@d # 5. Generate initial and incremental streams diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_encrypted_files.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_encrypted_files.ksh index 3c3de86d91..a653f8b3f1 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_realloc_encrypted_files.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_encrypted_files.ksh @@ -12,10 +12,12 @@ # # -# Copyright (c) 2019 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2019, Lawrence Livermore National Security LLC. +# Use is subject to license terms. # . $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib . $STF_SUITE/tests/functional/rsend/rsend.kshlib # @@ -25,7 +27,7 @@ # Strategy: # 1. Create a pool containing an encrypted filesystem. # 2. 
Use 'zfs send -wp' to perform a raw send of the initial filesystem. -# 3. Repeat the followings steps N times to verify raw incremental receives. +# 3. Repeat the following steps N times to verify raw incremental receives. # a) Randomly change several key dataset properties. # b) Modify the contents of the filesystem such that dnode reallocation # is likely during the 'zfs receive', and receive_object() exercises @@ -65,10 +67,15 @@ log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs@snap${last_snap}" # Set atime=off to prevent the recursive_cksum from modifying newfs. log_must zfs set atime=off $POOL/newfs -# Due to reduced performance on debug kernels use fewer files by default. if is_kmemleak; then + # Use fewer files and passes on debug kernels + # to avoid timeout due to reduced performance. nr_files=100 passes=2 +elif is_freebsd; then + # Use fewer files and passes on FreeBSD to avoid timeout. + nr_files=500 + passes=2 else nr_files=1000 passes=3 @@ -78,7 +85,7 @@ for i in {1..$passes}; do # Randomly modify several dataset properties in order to generate # more interesting incremental send streams. rand_set_prop $POOL/fs checksum "off" "fletcher4" "sha256" - rand_set_prop $POOL/fs compression "off" "lzjb" "gzip" "lz4" + rand_set_prop $POOL/fs compression "${compress_prop_vals[@]}" rand_set_prop $POOL/fs recordsize "32K" "128K" rand_set_prop $POOL/fs dnodesize "legacy" "auto" "4k" rand_set_prop $POOL/fs xattr "on" "sa" diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_files.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_files.ksh index 4b89a73d80..083a2bec9d 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_realloc_files.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_files.ksh @@ -15,6 +15,7 @@ # Copyright (c) 2019 by Lawrence Livermore National Security, LLC. # +. $STF_SUITE/include/properties.shlib . $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/rsend/rsend.kshlib @@ -25,7 +26,7 @@ # Strategy: # 1. Create a pool containing an encrypted filesystem. # 2. Use 'zfs send -wp' to perform a raw send of the initial filesystem. -# 3. Repeat the followings steps N times to verify raw incremental receives. +# 3. Repeat the following steps N times to verify raw incremental receives. # a) Randomly change several key dataset properties. # b) Modify the contents of the filesystem such that dnode reallocation # is likely during the 'zfs receive', and receive_object() exercises @@ -58,10 +59,15 @@ log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs@snap${last_snap}" # Set atime=off to prevent the recursive_cksum from modifying newfs. log_must zfs set atime=off $POOL/newfs -# Due to reduced performance on debug kernels use fewer files by default. if is_kmemleak; then + # Use fewer files and passes on debug kernels + # to avoid timeout due to reduced performance. nr_files=100 passes=2 +elif is_freebsd; then + # Use fewer passes and files on FreeBSD to avoid timeout. + nr_files=500 + passes=2 else nr_files=1000 passes=3 @@ -71,7 +77,7 @@ for i in {1..$passes}; do # Randomly modify several dataset properties in order to generate # more interesting incremental send streams. 
rand_set_prop $POOL/fs checksum "off" "fletcher4" "sha256" - rand_set_prop $POOL/fs compression "off" "lzjb" "gzip" "lz4" + rand_set_prop $POOL/fs compression "${compress_prop_vals[@]}" rand_set_prop $POOL/fs recordsize "32K" "128K" rand_set_prop $POOL/fs dnodesize "legacy" "auto" "4k" rand_set_prop $POOL/fs xattr "on" "sa" diff --git a/tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh b/tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh index 9de732e223..73f164852f 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh @@ -59,7 +59,7 @@ for i in {1..40}; do log_must mkfile 16384 $file for j in {1..20}; do - log_must attr -qs "testattr$j" -V "$attrvalue" $file + log_must set_xattr "testattr$j" "$attrvalue" $file done done @@ -103,7 +103,7 @@ log_must truncate -s 1073741824 /$POOL/fs/file15 log_must truncate -s 50 /$POOL/fs/file16 for i in {11..20}; do - log_must attr -qr testattr1 /$POOL/fs/file$i + log_must rm_xattr testattr1 /$POOL/fs/file$i done # @@ -125,7 +125,7 @@ log_must truncate -s 50 /$POOL/fs/file26 for i in {21..30}; do for j in {1..20}; do - log_must attr -qr testattr$j /$POOL/fs/file$i + log_must rm_xattr testattr$j /$POOL/fs/file$i done done @@ -134,8 +134,8 @@ done # for i in {31..40}; do file="/$POOL/fs/file$i" - log_must attr -qr testattr$(((RANDOM % 20) + 1)) $file - log_must attr -qs testattr$(((RANDOM % 20) + 1)) -V "$attrvalue" $file + log_must rm_xattr testattr$(((RANDOM % 20) + 1)) $file + log_must set_xattr testattr$(((RANDOM % 20) + 1)) "$attrvalue" $file done # Calculate the expected recursive checksum for the source. 
diff --git a/tests/zfs-tests/tests/functional/slog/Makefile.am b/tests/zfs-tests/tests/functional/slog/Makefile.am index 4548ce63b4..33e3a6d3a4 100644 --- a/tests/zfs-tests/tests/functional/slog/Makefile.am +++ b/tests/zfs-tests/tests/functional/slog/Makefile.am @@ -17,7 +17,8 @@ dist_pkgdata_SCRIPTS = \ slog_013_pos.ksh \ slog_014_pos.ksh \ slog_015_neg.ksh \ - slog_replay_fs.ksh \ + slog_replay_fs_001.ksh \ + slog_replay_fs_002.ksh \ slog_replay_volume.ksh dist_pkgdata_DATA = \ diff --git a/tests/zfs-tests/tests/functional/slog/cleanup.ksh b/tests/zfs-tests/tests/functional/slog/cleanup.ksh index ac301f386f..92bc4aa59d 100755 --- a/tests/zfs-tests/tests/functional/slog/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/slog/cleanup.ksh @@ -34,10 +34,6 @@ verify_runnable "global" -if ! verify_slog_support ; then - log_unsupported "This system doesn't support separate intent logs" -fi - if datasetexists $TESTPOOL ; then log_must zpool destroy -f $TESTPOOL fi diff --git a/tests/zfs-tests/tests/functional/slog/setup.ksh b/tests/zfs-tests/tests/functional/slog/setup.ksh index f30824d3ee..4278fc6978 100755 --- a/tests/zfs-tests/tests/functional/slog/setup.ksh +++ b/tests/zfs-tests/tests/functional/slog/setup.ksh @@ -34,17 +34,4 @@ verify_runnable "global" -if ! verify_slog_support ; then - log_unsupported "This system doesn't support separate intent logs" -fi - -if [[ -d $VDEV ]]; then - log_must rm -rf $VDIR -fi -if [[ -d $VDEV2 ]]; then - log_must rm -rf $VDIR2 -fi -log_must mkdir -p $VDIR $VDIR2 -log_must truncate -s $MINVDEVSIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2 - log_pass diff --git a/tests/zfs-tests/tests/functional/slog/slog.kshlib b/tests/zfs-tests/tests/functional/slog/slog.kshlib index 6ed7e4e050..75cfec2d83 100644 --- a/tests/zfs-tests/tests/functional/slog/slog.kshlib +++ b/tests/zfs-tests/tests/functional/slog/slog.kshlib @@ -31,11 +31,20 @@ . $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/slog/slog.cfg +function setup +{ + log_must rm -rf $VDIR $VDIR2 + log_must mkdir -p $VDIR $VDIR2 + log_must truncate -s $MINVDEVSIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2 + + return 0 +} + function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 - rm -rf $TESTDIR + rm -rf $TESTDIR $VDIR $VDIR2 } # diff --git a/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh index 3d3daf5f9c..a4c35ed9e9 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh @@ -45,6 +45,7 @@ verify_runnable "global" log_assert "Creating a pool with a log device succeeds." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh index b056f19cdb..91904aa612 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Adding a log device to normal pool works." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh index c647b8f54b..0b4d6ede3e 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Adding an extra log device works." 
log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh index 4b0b3439a2..10f28dcc00 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Attaching a log device passes." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh index cbbb948691..4836f6f279 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Detaching a log device passes." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh index 53e8c67ca0..24143196fd 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Replacing a log device passes." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh index 4926fb7b31..27ac38606c 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh @@ -48,6 +48,7 @@ verify_runnable "global" log_assert "Exporting and importing pool with log devices passes." 
log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh index 587e0e3212..54587a0c61 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh @@ -44,6 +44,7 @@ verify_runnable "global" log_assert "A raidz/raidz2 log is not supported." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh index e7091f17b7..222f71a999 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh @@ -45,6 +45,7 @@ verify_runnable "global" log_assert "A raidz/raidz2 log can not be added to existed pool." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh index 8fe248ffbc..edd9abea09 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Slog device can not be replaced with spare device." log_onexit cleanup +log_must setup log_must zpool create $TESTPOOL $VDEV spare $SDEV log $LDEV sdev=$(random_get $SDEV) diff --git a/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh index 2dad200b31..3bebc82017 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh @@ -46,6 +46,7 @@ verify_runnable "global" log_assert "Offline and online a log device passes." 
log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh index 45566d427f..8d6fb2bffb 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh @@ -45,6 +45,7 @@ verify_runnable "global" log_assert "Pool can survive when one of mirror log device get corrupted." log_onexit cleanup +log_must setup for type in "" "mirror" "raidz" "raidz2" do diff --git a/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh index bbe5adc241..89b3aeb403 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh @@ -50,6 +50,8 @@ function cleanup_testenv if [[ -n $lofidev ]]; then if is_linux; then losetup -d $lofidev + elif is_freebsd; then + mdconfig -du ${lofidev#md} else lofiadm -d $lofidev fi @@ -60,6 +62,7 @@ log_assert "Verify slog device can be disk, file, lofi device or any device " \ "that presents a block interface." verify_disk_count "$DISKS" 2 log_onexit cleanup_testenv +log_must setup dsk1=${DISKS%% *} log_must zpool create $TESTPOOL ${DISKS#$dsk1} @@ -77,6 +80,8 @@ if is_linux; then lofidev=$(losetup -f) log_must losetup $lofidev ${LDEV2%% *} lofidev=${lofidev##*/} +elif is_freebsd; then + lofidev=$(mdconfig -a ${LDEV2%% *}) else lofidev=${LDEV2%% *} log_must lofiadm -a $lofidev diff --git a/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh b/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh index 0ec96ae1e6..f8530a623d 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh @@ -44,6 +44,7 @@ verify_runnable "global" log_assert "log device can survive when one of the pool device get corrupted." 
+log_must setup for type in "mirror" "raidz" "raidz2"; do for spare in "" "spare"; do @@ -63,7 +64,7 @@ for type in "mirror" "raidz" "raidz2"; do # Corrupt a pool device to make the pool DEGRADED # The oseek value below is to skip past the vdev label. # - if is_linux; then + if is_linux || is_freebsd; then log_must dd if=/dev/urandom of=$VDIR/a bs=1024k \ seek=4 conv=notrunc count=50 else diff --git a/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh index 37821888ea..04fb225ed4 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh @@ -26,7 +26,7 @@ # 3. Concurrently do the following: # 3.1. Perform 8K sync writes # 3.2. Perform log offline/online commands -# 4. Loop to test with growing "zfs_commit_timout_pct" values. +# 4. Loop to test with growing "zfs_commit_timeout_pct" values. # verify_runnable "global" @@ -40,16 +40,17 @@ function cleanup # wait - set_tunable64 zfs_commit_timeout_pct $ORIG_TIMEOUT + set_tunable64 COMMIT_TIMEOUT_PCT $ORIG_TIMEOUT poolexists $TESTPOOL && zpool destroy -f $TESTPOOL } -ORIG_TIMEOUT=$(get_tunable zfs_commit_timeout_pct | tail -1 | awk '{print $NF}') +typeset ORIG_TIMEOUT=$(get_tunable COMMIT_TIMEOUT_PCT) log_onexit cleanup +log_must setup for PCT in 0 1 2 4 8 16 32 64 128 256 512 1024; do - log_must set_tunable64 zfs_commit_timeout_pct $PCT + log_must set_tunable64 COMMIT_TIMEOUT_PCT $PCT log_must zpool create $TESTPOOL $VDEV log $SDEV diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh similarity index 71% rename from tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh rename to tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh index 5f281a756f..0b78a099f0 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh @@ 
-58,14 +58,9 @@ verify_runnable "global" -function cleanup_fs -{ - rm -f $TESTDIR/checksum - cleanup -} - log_assert "Replay of intent log succeeds." -log_onexit cleanup_fs +log_onexit cleanup +log_must setup # # 1. Create an empty file system (TESTFS) @@ -113,12 +108,15 @@ log_must rmdir /$TESTPOOL/$TESTFS/dir_to_delete # Create a simple validation payload log_must mkdir -p $TESTDIR -log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/payload bs=1k count=8 -log_must eval "sha256sum -b /$TESTPOOL/$TESTFS/payload >$TESTDIR/checksum" +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/payload \ + oflag=sync bs=1k count=8 +typeset checksum=$(sha256digest /$TESTPOOL/$TESTFS/payload) # TX_WRITE (small file with ordering) -log_must mkfile 1k /$TESTPOOL/$TESTFS/small_file -log_must mkfile 512b /$TESTPOOL/$TESTFS/small_file +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/small_file \ + oflag=sync bs=1k count=1 +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/small_file \ + oflag=sync bs=512 count=1 # TX_CREATE, TX_MKDIR, TX_REMOVE, TX_RMDIR log_must cp -R /usr/share/dict /$TESTPOOL/$TESTFS @@ -127,7 +125,11 @@ log_must rm -rf /$TESTPOOL/$TESTFS/dict # TX_SETATTR log_must touch /$TESTPOOL/$TESTFS/setattr log_must chmod 567 /$TESTPOOL/$TESTFS/setattr -log_must chgrp root /$TESTPOOL/$TESTFS/setattr +if is_freebsd; then + log_must chgrp wheel /$TESTPOOL/$TESTFS/setattr +else + log_must chgrp root /$TESTPOOL/$TESTFS/setattr +fi log_must touch -cm -t 201311271200 /$TESTPOOL/$TESTFS/setattr # TX_TRUNCATE (to zero) @@ -136,29 +138,42 @@ log_must truncate -s 0 /$TESTPOOL/$TESTFS/truncated_file # TX_WRITE (large file) log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/large \ - bs=128k count=64 oflag=sync + oflag=sync bs=128k count=64 # Write zeros, which compress to holes, in the middle of a file -log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/holes.1 bs=128k count=8 -log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/holes.1 bs=128k count=2 +log_must dd if=/dev/urandom 
of=/$TESTPOOL/$TESTFS/holes.1 \ + oflag=sync bs=128k count=8 +log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/holes.1 \ + oflag=sync bs=128k count=2 -log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/holes.2 bs=128k count=8 -log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/holes.2 bs=128k count=2 seek=2 +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/holes.2 \ + oflag=sync bs=128k count=8 +log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/holes.2 \ + oflag=sync bs=128k count=2 seek=2 -log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/holes.3 bs=128k count=8 -log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/holes.3 bs=128k count=2 \ - seek=2 conv=notrunc +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/holes.3 \ + oflag=sync bs=128k count=8 +log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/holes.3 \ + oflag=sync bs=128k count=2 seek=2 conv=notrunc # TX_MKXATTR log_must mkdir /$TESTPOOL/$TESTFS/xattr.dir -log_must attr -qs fileattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.dir -log_must attr -qs tmpattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.dir -log_must attr -qr tmpattr /$TESTPOOL/$TESTFS/xattr.dir - log_must touch /$TESTPOOL/$TESTFS/xattr.file -log_must attr -qs fileattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.file -log_must attr -qs tmpattr -V HelloWorld /$TESTPOOL/$TESTFS/xattr.file -log_must attr -qr tmpattr /$TESTPOOL/$TESTFS/xattr.file +log_must set_xattr fileattr HelloWorld /$TESTPOOL/$TESTFS/xattr.dir +log_must set_xattr tmpattr HelloWorld /$TESTPOOL/$TESTFS/xattr.dir +log_must rm_xattr fileattr /$TESTPOOL/$TESTFS/xattr.dir + +log_must set_xattr fileattr HelloWorld /$TESTPOOL/$TESTFS/xattr.file +log_must set_xattr tmpattr HelloWorld /$TESTPOOL/$TESTFS/xattr.file +log_must rm_xattr tmpattr /$TESTPOOL/$TESTFS/xattr.file + +# TX_WRITE, TX_LINK, TX_REMOVE +# Make sure TX_REMOVE won't affect TX_WRITE if file is not destroyed +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/link_and_unlink \ + oflag=sync bs=128k count=8 +log_must ln /$TESTPOOL/$TESTFS/link_and_unlink \ 
+ /$TESTPOOL/$TESTFS/link_and_unlink.link +log_must rm /$TESTPOOL/$TESTFS/link_and_unlink.link # # 4. Copy TESTFS to temporary location (TESTDIR/copy) @@ -194,13 +209,15 @@ log_note "Verify current block usage:" log_must zdb -bcv $TESTPOOL log_note "Verify copy of xattrs:" -log_must attr -l /$TESTPOOL/$TESTFS/xattr.dir -log_must attr -l /$TESTPOOL/$TESTFS/xattr.file +log_must ls_xattr /$TESTPOOL/$TESTFS/xattr.dir +log_must ls_xattr /$TESTPOOL/$TESTFS/xattr.file log_note "Verify working set diff:" log_must diff -r /$TESTPOOL/$TESTFS $TESTDIR/copy log_note "Verify file checksum:" -log_must sha256sum -c $TESTDIR/checksum +typeset checksum1=$(sha256digest /$TESTPOOL/$TESTFS/payload) +[[ "$checksum1" == "$checksum" ]] || \ + log_fail "checksum mismatch ($checksum1 != $checksum)" log_pass "Replay of intent log succeeds." diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh new file mode 100755 index 0000000000..3c3ccdf4ad --- /dev/null +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh @@ -0,0 +1,137 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/tests/functional/slog/slog.kshlib + +# +# DESCRIPTION: +# Verify slog replay correctly when TX_REMOVEs are followed by +# TX_CREATEs. +# +# STRATEGY: +# 1. Create a file system (TESTFS) with a lot of files +# 2. Freeze TESTFS +# 3. Remove all files then create a lot of files +# 4. Copy TESTFS to temporary location (TESTDIR/copy) +# 5. Unmount filesystem +# +# 6. Remount TESTFS +# 7. Compare TESTFS against the TESTDIR/copy +# + +verify_runnable "global" + +function cleanup_fs +{ + cleanup +} + +log_assert "Replay of intent log succeeds." +log_onexit cleanup_fs +log_must setup + +# +# 1. Create a file system (TESTFS) with a lot of files +# +log_must zpool create $TESTPOOL $VDEV log mirror $LDEV +log_must zfs set compression=on $TESTPOOL +log_must zfs create $TESTPOOL/$TESTFS + +# Prep for the test of TX_REMOVE followed by TX_CREATE +dnsize=(legacy auto 1k 2k 4k 8k 16k) +NFILES=200 +log_must mkdir /$TESTPOOL/$TESTFS/dir0 +log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done' + +# +# Reimport to reset dnode allocation pointer. +# This is to make sure we will have TX_REMOVE and TX_CREATE on same id +# +log_must zpool export $TESTPOOL +log_must zpool import -f -d $VDIR $TESTPOOL + +# +# This dd command works around an issue where ZIL records aren't created +# after freezing the pool unless a ZIL header already exists. Create a file +# synchronously to force ZFS to write one out. +# +log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/sync \ + conv=fdatasync,fsync bs=1 count=1 + +# +# 2. Freeze TESTFS +# +log_must zpool freeze $TESTPOOL + +# +# 3. 
Remove all files then create a lot of files +# +# TX_REMOVE followed by TX_CREATE +log_must eval 'rm -f /$TESTPOOL/$TESTFS/dir0/*' +log_must eval 'for i in $(seq $NFILES); do zfs set dnodesize=${dnsize[$RANDOM % ${#dnsize[@]}]} $TESTPOOL/$TESTFS; touch /$TESTPOOL/$TESTFS/dir0/file.$i; done' + +# +# 4. Copy TESTFS to temporary location (TESTDIR/copy) +# +log_must mkdir -p $TESTDIR/copy +log_must cp -a /$TESTPOOL/$TESTFS/* $TESTDIR/copy/ + +# +# 5. Unmount filesystem and export the pool +# +# At this stage TESTFS is empty again and frozen, the intent log contains +# a complete set of deltas to replay. +# +log_must zfs unmount /$TESTPOOL/$TESTFS + +log_note "Verify transactions to replay:" +log_must zdb -iv $TESTPOOL/$TESTFS + +log_must zpool export $TESTPOOL + +# +# 6. Remount TESTFS +# +# Import the pool to unfreeze it and claim log blocks. It has to be +# `zpool import -f` because we can't write a frozen pool's labels! +# +log_must zpool import -f -d $VDIR $TESTPOOL + +# +# 7. Compare TESTFS against the TESTDIR/copy +# +log_note "Verify current block usage:" +log_must zdb -bcv $TESTPOOL + +log_note "Verify number of files" +log_must test "$(ls /$TESTPOOL/$TESTFS/dir0 | wc -l)" -eq $NFILES + +log_note "Verify working set diff:" +log_must diff -r /$TESTPOOL/$TESTFS $TESTDIR/copy + +log_pass "Replay of intent log succeeds." diff --git a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh index 2cdcb38dc2..d39c6ded55 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh @@ -61,10 +61,11 @@ verify_runnable "global" VOLUME=$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL MNTPNT=$TESTDIR/$TESTVOL +FSTYPE=none function cleanup_volume { - if ismounted $MNTPNT ext4; then + if ismounted $MNTPNT $FSTYPE; then log_must umount $MNTPNT rmdir $MNTPNT fi @@ -76,6 +77,7 @@ function cleanup_volume log_assert "Replay of intent log succeeds." 
log_onexit cleanup_volume +log_must setup # # 1. Create an empty volume (TESTVOL), set sync=always, and format @@ -86,11 +88,20 @@ log_must zfs create -V 128M $TESTPOOL/$TESTVOL log_must zfs set compression=on $TESTPOOL/$TESTVOL log_must zfs set sync=always $TESTPOOL/$TESTVOL log_must mkdir -p $TESTDIR -log_must block_device_wait -echo "y" | newfs -t ext4 -v $VOLUME -log_must mkdir -p $MNTPNT -log_must mount -o discard $VOLUME $MNTPNT -log_must rmdir $MNTPNT/lost+found +block_device_wait +if is_linux; then + # ext4 only on Linux + log_must new_fs -t ext4 -v $VOLUME + log_must mkdir -p $MNTPNT + log_must mount -o discard $VOLUME $MNTPNT + FSTYPE=ext4 + log_must rmdir $MNTPNT/lost+found +else + log_must new_fs $VOLUME + log_must mkdir -p $MNTPNT + log_must mount $VOLUME $MNTPNT + FSTYPE=$NEWFS_DEFAULT_FS +fi log_must zpool sync # @@ -115,19 +126,21 @@ log_must dd if=/dev/urandom of=$MNTPNT/throughput-128k bs=128k count=1 log_must dd if=/dev/urandom of=$MNTPNT/holes bs=128k count=8 log_must dd if=/dev/zero of=$MNTPNT/holes bs=128k count=2 seek=2 conv=notrunc -# TX_TRUNCATE -if fallocate --punch-hole 2>&1 | grep -q "unrecognized option"; then - log_note "fallocate(1) does not support --punch-hole" -else - log_must dd if=/dev/urandom of=$MNTPNT/discard bs=128k count=16 - log_must fallocate --punch-hole -l 128K -o 512K $MNTPNT/discard - log_must fallocate --punch-hole -l 512K -o 1M $MNTPNT/discard +if is_linux; then + # TX_TRUNCATE + if fallocate --punch-hole 2>&1 | grep -q "unrecognized option"; then + log_note "fallocate(1) does not support --punch-hole" + else + log_must dd if=/dev/urandom of=$MNTPNT/discard bs=128k count=16 + log_must fallocate --punch-hole -l 128K -o 512K $MNTPNT/discard + log_must fallocate --punch-hole -l 512K -o 1M $MNTPNT/discard + fi fi # # 4. Generate checksums for all ext4 files. # -log_must sha256sum -b $MNTPNT/* >$TESTDIR/checksum +typeset checksum=$(cat $MNTPNT/* | sha256digest) # # 5. 
Unmount filesystem and export the pool @@ -149,7 +162,7 @@ log_must zpool export $TESTPOOL # `zpool import -f` because we can't write a frozen pool's labels! # log_must zpool import -f $TESTPOOL -log_must block_device_wait +block_device_wait log_must mount $VOLUME $MNTPNT # @@ -159,6 +172,8 @@ log_note "Verify current block usage:" log_must zdb -bcv $TESTPOOL log_note "Verify checksums" -log_must sha256sum -c $TESTDIR/checksum +typeset checksum1=$(cat $MNTPNT/* | sha256digest) +[[ "$checksum1" == "$checksum" ]] || \ + log_fail "checksum mismatch ($checksum1 != $checksum)" log_pass "Replay of intent log succeeds." diff --git a/tests/zfs-tests/tests/functional/snapshot/cleanup.ksh b/tests/zfs-tests/tests/functional/snapshot/cleanup.ksh index 12d950999c..530a785330 100755 --- a/tests/zfs-tests/tests/functional/snapshot/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/cleanup.ksh @@ -32,7 +32,7 @@ . $STF_SUITE/include/libtest.shlib if is_linux; then - log_must set_tunable64 zfs_admin_snapshot 0 + log_must set_tunable64 ADMIN_SNAPSHOT 0 fi default_container_cleanup diff --git a/tests/zfs-tests/tests/functional/snapshot/clone_001_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/clone_001_pos.ksh index 5268971932..1c8a3b2a6c 100755 --- a/tests/zfs-tests/tests/functional/snapshot/clone_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/clone_001_pos.ksh @@ -61,7 +61,17 @@ set -A args "$SNAPFS" "$SNAPDIR" "$TESTPOOL/$TESTCLONE" "$TESTDIR.0" \ function setup_all { + if is_freebsd; then + # Pool creation on zvols is forbidden by default. + # Save the current setting. + typeset _saved=$(get_tunable VOL_RECURSIVE) + log_must set_tunable64 VOL_RECURSIVE 1 + fi create_pool $TESTPOOL1 ${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL + if is_freebsd; then + # Restore the previous setting. 
+ log_must set_tunable64 VOL_RECURSIVE $_saved + fi log_must zfs create $TESTPOOL1/$TESTFS log_must zfs set mountpoint=$TESTDIR2 $TESTPOOL1/$TESTFS @@ -86,8 +96,8 @@ function cleanup_all (( i = i + 4 )) done - datasetexists $TESTPOOL1/$TESTFS && \ - log_must zfs destroy -f $TESTPOOL1/$TESTFS + datasetexists $TESTPOOL1/$TESTFS && \ + destroy_dataset $TESTPOOL1/$TESTFS -f destroy_pool $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/snapshot/rollback_003_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/rollback_003_pos.ksh index 342e7df58f..59e7c110dd 100755 --- a/tests/zfs-tests/tests/functional/snapshot/rollback_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/rollback_003_pos.ksh @@ -48,10 +48,6 @@ verify_runnable "both" -if is_linux; then - log_unsupported "Test case is known to fail on Linux" -fi - function cleanup { typeset snap="" @@ -61,18 +57,16 @@ function cleanup log_must zfs mount -a unset __ZFS_POOL_RESTRICT - for snap in "$SNAPPOOL.1" "$SNAPPOOL" - do - snapexists $snap - [[ $? -eq 0 ]] && \ - log_must zfs destroy $snap + for snap in "$SNAPPOOL.1" "$SNAPPOOL"; do + if snapexists $snap; then + destroy_snapshot $snap + fi done - for fs in "$TESTPOOL/$TESTFILE/$TESTFILE.1" "$TESTPOOL/$TESTFILE" - do - datasetexists $fs - [[ $? -eq 0 ]] && \ - log_must zfs destroy -r $fs + for fs in "$TESTPOOL/$TESTFILE/$TESTFILE.1" "$TESTPOOL/$TESTFILE"; do + if datasetexists $fs; then + destroy_dataset $fs -r + fi done [[ -e /$TESTPOOL ]] && \ @@ -107,4 +101,11 @@ log_must touch /$TESTPOOL/$TESTFILE/$TESTFILE.1 log_must zfs rollback $SNAPPOOL.1 +# +# Workaround for issue #6143. Issuing a `df` seems to properly force any +# negative dcache entries to be invalidated preventing subsequent failures +# when accessing the mount point. Additional investigation required. +# +log_must df + log_pass "Rollbacks succeed when nested file systems are present." 
diff --git a/tests/zfs-tests/tests/functional/snapshot/setup.ksh b/tests/zfs-tests/tests/functional/snapshot/setup.ksh index 6f0646737e..a73d1aff3c 100755 --- a/tests/zfs-tests/tests/functional/snapshot/setup.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/setup.ksh @@ -34,7 +34,7 @@ DISK=${DISKS%% *} if is_linux; then - log_must set_tunable64 zfs_admin_snapshot 1 + log_must set_tunable64 ADMIN_SNAPSHOT 1 fi default_container_volume_setup ${DISK} diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh index b404ffbd50..124a7db9c6 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh @@ -36,7 +36,7 @@ # DESCRIPTION: # An archive of a zfs file system and an archive of its snapshot # is identical even though the original file system has -# changed sinced the snapshot was taken. +# changed since the snapshot was taken. # # STRATEGY: # 1) Create files in all of the zfs file systems diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh index dc50e46933..68a616c02a 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh @@ -35,7 +35,7 @@ # # DESCRIPTION: # An archive of a zfs dataset and an archive of its snapshot -# changed sinced the snapshot was taken. +# changed since the snapshot was taken. 
# # STRATEGY: # 1) Create some files in a ZFS dataset diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh index 6607d4ca49..1ee7e33c2a 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh @@ -88,7 +88,7 @@ else fi log_must zfs snapshot -r $snappool -log_must block_device_wait +block_device_wait #verify the snapshot -r results for snap in $snappool $snapfs $snapvol $snapctr $snapctrvol \ diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh index 0f876ad6d6..128b443c6f 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh @@ -83,7 +83,7 @@ else fi log_must zfs snapshot -r $snappool -log_must block_device_wait +block_device_wait #select the $TESTCTR as destroy point, $TESTCTR is a child of $TESTPOOL log_must zfs destroy -r $snapctr @@ -92,7 +92,7 @@ for snap in $snapctr $snapctrvol $snapctrclone $snapctrfs; do log_fail "The snapshot $snap is not destroyed correctly." done -for snap in $snappool $snapfs $snapvol $ctrfs@$TESTSNAP1;do +for snap in $snappool $snapfs $snapvol $ctrfs@$TESTSNAP1; do ! snapexists $snap && \ log_fail "The snapshot $snap should be not destroyed." 
done diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_011_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_011_pos.ksh index 44e5943bcd..7e0a7f4ce1 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_011_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_011_pos.ksh @@ -51,8 +51,7 @@ verify_runnable "both" function cleanup { - snapexists $SNAPPOOL && \ - log_must zfs destroy -r $SNAPPOOL + snapexists $SNAPPOOL && destroy_dataset $SNAPPOOL -r [[ -e $TESTDIR ]] && \ log_must rm -rf $TESTDIR/* > /dev/null 2>&1 diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_012_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_012_pos.ksh index c5717e4526..92db9b53a7 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_012_pos.ksh @@ -55,15 +55,14 @@ function cleanup { if datasetexists $clone1; then log_must zfs promote $ctrfs - log_must zfs destroy $clone1 + destroy_dataset $clone1 fi - snapexists $snapctr && \ - log_must zfs destroy -r $snapctr + snapexists $snapctr && destroy_dataset $snapctr -r if snapexists $clone@$TESTSNAP1; then log_must zfs promote $ctrfs - log_must zfs destroy -rR $ctrfs@$TESTSNAP1 + destroy_dataset $ctrfs@$TESTSNAP1 -rR fi } diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_013_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_013_pos.ksh index 31aedb2245..e02f6eb300 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_013_pos.ksh @@ -48,11 +48,8 @@ verify_runnable "both" function cleanup { - datasetexists $ctrfs && \ - zfs destroy -r $ctrfs - - snapexists $snappool && \ - log_must zfs destroy -r $snappool + datasetexists $ctrfs && destroy_dataset $ctrfs -r + snapexists $snappool && destroy_dataset $snappool -r [[ -e $TESTDIR ]] && \ log_must rm -rf $TESTDIR/* > /dev/null 2>&1 diff --git 
a/tests/zfs-tests/tests/functional/snapshot/snapshot_014_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_014_pos.ksh index 3579fbebb4..d48d404b6d 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_014_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_014_pos.ksh @@ -51,8 +51,7 @@ function cleanup [[ -e $TESTDIR1 ]] && \ log_must rm -rf $TESTDIR1/* > /dev/null 2>&1 - snapexists $SNAPCTR && \ - log_must zfs destroy $SNAPCTR + snapexists $SNAPCTR && destroy_dataset $SNAPCTR datasetexists $TESTPOOL/$TESTCTR/$TESTFS1 && \ log_must zfs set quota=none $TESTPOOL/$TESTCTR/$TESTFS1 diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_015_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_015_pos.ksh index 1091bcb13e..5a4d2ccaf6 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_015_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_015_pos.ksh @@ -54,7 +54,7 @@ function cleanup typeset -i i=0 while ((i < snap_cnt)); do typeset snap=$fs@snap.$i - datasetexists $snap && log_must zfs destroy -f $snap + datasetexists $snap && destroy_dataset $snap -f ((i += 1)) done diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_016_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_016_pos.ksh index b460c2b0c5..b66023cc85 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_016_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_016_pos.ksh @@ -47,19 +47,12 @@ verify_runnable "both" function cleanup { - datasetexists $SNAPFS && \ - log_must zfs destroy -Rf $SNAPFS - datasetexists $TESTPOOL/$TESTFS@snap_a && \ - log_must zfs destroy -Rf $TESTPOOL/$TESTFS@snap_a - datasetexists $TESTPOOL/$TESTFS@snap_b && \ - log_must zfs destroy -Rf $TESTPOOL/$TESTFS@snap_b - datasetexists $TESTPOOL/$TESTCLONE@snap_a && \ - log_must zfs destroy -Rf $TESTPOOL/$TESTCLONE@snap_a - - datasetexists $TESTPOOL/$TESTCLONE && \ - log_must zfs destroy $TESTPOOL/$TESTCLONE - 
datasetexists $TESTPOOL/$TESTFS && \ - log_must zfs destroy $TESTPOOL/$TESTFS + datasetexists $SNAPFS && destroy_dataset $SNAPFS -Rf + datasetexists $TESTPOOL/$TESTFS@snap_a && destroy_dataset $TESTPOOL/$TESTFS@snap_a -Rf + datasetexists $TESTPOOL/$TESTFS@snap_b && destroy_dataset $TESTPOOL/$TESTFS@snap_b -Rf + datasetexists $TESTPOOL/$TESTCLONE@snap_a && destroy_dataset $TESTPOOL/$TESTCLONE@snap_a -Rf + datasetexists $TESTPOOL/$TESTCLONE && destroy_dataset $TESTPOOL/$TESTCLONE + datasetexists $TESTPOOL/$TESTFS && destroy_dataset $TESTPOOL/$TESTFS log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_017_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_017_pos.ksh index a21f8750d6..6e5b8973cf 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_017_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_017_pos.ksh @@ -56,9 +56,8 @@ function cleanup { cd $SAVED_DIR - if datasetexists $TESTPOOL/$TESTFS ; then - log_must zfs destroy -Rf $TESTPOOL/$TESTFS - fi + datasetexists $TESTPOOL/$TESTFS && \ + destroy_dataset $TESTPOOL/$TESTFS -Rf log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/snapused/snapused_001_pos.ksh b/tests/zfs-tests/tests/functional/snapused/snapused_001_pos.ksh index 302ba40c38..c1277f2b4e 100755 --- a/tests/zfs-tests/tests/functional/snapused/snapused_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapused/snapused_001_pos.ksh @@ -51,7 +51,7 @@ verify_runnable "both" function cleanup { - log_must zfs destroy -rR $USEDTEST + datasetexists $USEDTEST && destroy_dataset $USEDTEST -rR } log_assert "Verify used is correct." 
diff --git a/tests/zfs-tests/tests/functional/snapused/snapused_002_pos.ksh b/tests/zfs-tests/tests/functional/snapused/snapused_002_pos.ksh index 96d2df6c65..a41ca1d70f 100755 --- a/tests/zfs-tests/tests/functional/snapused/snapused_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapused/snapused_002_pos.ksh @@ -49,7 +49,7 @@ verify_runnable "both" function cleanup { - log_must zfs destroy -rR $USEDTEST + datasetexists $USEDTEST && destroy_dataset $USEDTEST -rR } log_assert "Verify usedbychildren is correct." diff --git a/tests/zfs-tests/tests/functional/snapused/snapused_003_pos.ksh b/tests/zfs-tests/tests/functional/snapused/snapused_003_pos.ksh index d4726ff40c..ff54cbaa1a 100755 --- a/tests/zfs-tests/tests/functional/snapused/snapused_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapused/snapused_003_pos.ksh @@ -48,7 +48,7 @@ verify_runnable "both" function cleanup { - log_must zfs destroy -rR $USEDTEST + datasetexists $USEDTEST && destroy_dataset $USEDTEST -rR } log_assert "Verify usedbydataset is correct." diff --git a/tests/zfs-tests/tests/functional/snapused/snapused_004_pos.ksh b/tests/zfs-tests/tests/functional/snapused/snapused_004_pos.ksh index 64ca3e2b35..8fb8b6be5b 100755 --- a/tests/zfs-tests/tests/functional/snapused/snapused_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapused/snapused_004_pos.ksh @@ -53,7 +53,7 @@ verify_runnable "both" function cleanup { - log_must zfs destroy -rR $USEDTEST + datasetexists $USEDTEST && destroy_dataset $USEDTEST -rR } log_assert "Verify usedbyrefreservation is correct." 
diff --git a/tests/zfs-tests/tests/functional/snapused/snapused_005_pos.ksh b/tests/zfs-tests/tests/functional/snapused/snapused_005_pos.ksh index ac5224caf6..9d21e1d23d 100755 --- a/tests/zfs-tests/tests/functional/snapused/snapused_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapused/snapused_005_pos.ksh @@ -47,7 +47,7 @@ verify_runnable "both" function cleanup { - log_must zfs destroy -rR $USEDTEST + datasetexists $USEDTEST && destroy_dataset $USEDTEST -rR } log_assert "Verify usedbysnapshots is correct." diff --git a/tests/zfs-tests/tests/functional/suid/.gitignore b/tests/zfs-tests/tests/functional/suid/.gitignore new file mode 100644 index 0000000000..a9a3db79ba --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/.gitignore @@ -0,0 +1 @@ +/suid_write_to_file diff --git a/tests/zfs-tests/tests/functional/suid/Makefile.am b/tests/zfs-tests/tests/functional/suid/Makefile.am new file mode 100644 index 0000000000..594d2b77ca --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/Makefile.am @@ -0,0 +1,16 @@ +include $(top_srcdir)/config/Rules.am + +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/suid + +dist_pkgdata_SCRIPTS = \ + suid_write_to_suid.ksh \ + suid_write_to_sgid.ksh \ + suid_write_to_suid_sgid.ksh \ + suid_write_to_none.ksh \ + cleanup.ksh \ + setup.ksh + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/suid + +pkgexec_PROGRAMS = suid_write_to_file +suid_write_to_file_SOURCES = suid_write_to_file.c diff --git a/tests/zfs-tests/tests/functional/suid/cleanup.ksh b/tests/zfs-tests/tests/functional/suid/cleanup.ksh new file mode 100755 index 0000000000..6e41e02faf --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/suid/setup.ksh b/tests/zfs-tests/tests/functional/suid/setup.ksh new file mode 100755 index 0000000000..d04d5568c0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/setup.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+# Use is subject to license terms. +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c b/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c new file mode 100644 index 0000000000..571dc553be --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_file.c @@ -0,0 +1,133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +static void +test_stat_mode(mode_t extra) +{ + struct stat st; + int i, fd; + char fpath[1024]; + char *penv[] = {"TESTDIR", "TESTFILE0"}; + char buf[] = "test"; + mode_t res; + mode_t mode = 0777 | extra; + + /* + * Get the environment variable values. 
+ */ + for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { + if ((penv[i] = getenv(penv[i])) == NULL) { + fprintf(stderr, "getenv(penv[%d])\n", i); + exit(1); + } + } + + umask(0); + if (stat(penv[0], &st) == -1 && mkdir(penv[0], mode) == -1) { + perror("mkdir"); + exit(2); + } + + snprintf(fpath, sizeof (fpath), "%s/%s", penv[0], penv[1]); + unlink(fpath); + if (stat(fpath, &st) == 0) { + fprintf(stderr, "%s exists\n", fpath); + exit(3); + } + + fd = creat(fpath, mode); + if (fd == -1) { + perror("creat"); + exit(4); + } + close(fd); + + if (setuid(65534) == -1) { + perror("setuid"); + exit(5); + } + + fd = open(fpath, O_RDWR); + if (fd == -1) { + perror("open"); + exit(6); + } + + if (write(fd, buf, sizeof (buf)) == -1) { + perror("write"); + exit(7); + } + close(fd); + + if (stat(fpath, &st) == -1) { + perror("stat"); + exit(8); + } + unlink(fpath); + + /* Verify SUID/SGID are dropped */ + res = st.st_mode & (0777 | S_ISUID | S_ISGID); + if (res != (mode & 0777)) { + fprintf(stderr, "stat(2) %o\n", res); + exit(9); + } +} + +int +main(int argc, char *argv[]) +{ + const char *name; + mode_t extra; + + if (argc < 2) { + fprintf(stderr, "Invalid argc\n"); + exit(1); + } + + name = argv[1]; + if (strcmp(name, "SUID") == 0) { + extra = S_ISUID; + } else if (strcmp(name, "SGID") == 0) { + extra = S_ISGID; + } else if (strcmp(name, "SUID_SGID") == 0) { + extra = S_ISUID | S_ISGID; + } else if (strcmp(name, "NONE") == 0) { + extra = 0; + } else { + fprintf(stderr, "Invalid name %s\n", name); + exit(1); + } + + test_stat_mode(extra); + + return (0); +} diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh new file mode 100755 index 0000000000..dd01978619 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh @@ -0,0 +1,52 @@ +#! 
/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to regular file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. +# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to regular file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "NONE" + +log_pass "Verify write(2) to regular file by non-owner passed" diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh new file mode 100755 index 0000000000..49ae2bd1b3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh @@ -0,0 +1,52 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to SGID file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file with SGID. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. +# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to SGID file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SGID" + +log_pass "Verify write(2) to SGID file by non-owner passed" diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh new file mode 100755 index 0000000000..3983aad2e5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh @@ -0,0 +1,52 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to SUID file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file with SUID. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. +# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to SUID file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SUID" + +log_pass "Verify write(2) to SUID file by non-owner passed" diff --git a/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh new file mode 100755 index 0000000000..a058c7e7d4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh @@ -0,0 +1,52 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify write(2) to SUID/SGID file by non-owner. +# Also see https://github.com/pjd/pjdfstest/blob/master/tests/chmod/12.t +# +# STRATEGY: +# 1. creat(2) a file with SUID/SGID. +# 2. write(2) to the file with uid=65534. +# 3. stat(2) the file and verify .st_mode value. +# + +verify_runnable "both" + +function cleanup +{ + rm -f $TESTDIR/$TESTFILE0 +} + +log_onexit cleanup +log_note "Verify write(2) to SUID/SGID file by non-owner" + +log_must $STF_SUITE/tests/functional/suid/suid_write_to_file "SUID_SGID" + +log_pass "Verify write(2) to SUID/SGID file by non-owner passed" diff --git a/tests/zfs-tests/tests/functional/tmpfile/.gitignore b/tests/zfs-tests/tests/functional/tmpfile/.gitignore index b7a19481ad..de014c5256 100644 --- a/tests/zfs-tests/tests/functional/tmpfile/.gitignore +++ b/tests/zfs-tests/tests/functional/tmpfile/.gitignore @@ -2,3 +2,4 @@ /tmpfile_001_pos /tmpfile_002_pos /tmpfile_003_pos +/tmpfile_stat_mode diff --git a/tests/zfs-tests/tests/functional/tmpfile/Makefile.am b/tests/zfs-tests/tests/functional/tmpfile/Makefile.am index 411445217a..35a1f44c16 100644 --- a/tests/zfs-tests/tests/functional/tmpfile/Makefile.am +++ b/tests/zfs-tests/tests/functional/tmpfile/Makefile.am @@ -8,7 +8,8 @@ dist_pkgdata_SCRIPTS = \ pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/tmpfile -pkgexec_PROGRAMS = tmpfile_test tmpfile_001_pos tmpfile_002_pos tmpfile_003_pos +pkgexec_PROGRAMS = tmpfile_test tmpfile_001_pos tmpfile_002_pos \ + tmpfile_003_pos 
tmpfile_stat_mode tmpfile_test_SOURCES= tmpfile_test.c tmpfile_001_pos_SOURCES = tmpfile_001_pos.c tmpfile_002_pos_SOURCES = tmpfile_002_pos.c diff --git a/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c b/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c index c2c02c5d4f..b0c2360819 100644 --- a/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c +++ b/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c @@ -28,7 +28,7 @@ #define BSZ 64 -void +static void fill_random(char *buf, int len) { int i; diff --git a/tests/zfs-tests/tests/functional/tmpfile/tmpfile_stat_mode.c b/tests/zfs-tests/tests/functional/tmpfile/tmpfile_stat_mode.c new file mode 100644 index 0000000000..bf71d429c3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/tmpfile/tmpfile_stat_mode.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +/* backward compat in case it's not defined */ +#ifndef O_TMPFILE +#define O_TMPFILE (020000000|O_DIRECTORY) +#endif + +/* + * DESCRIPTION: + * Verify stat(2) for O_TMPFILE file considers umask. 
+ * + * STRATEGY: + * 1. open(2) with O_TMPFILE. + * 2. linkat(2). + * 3. fstat(2)/stat(2) and verify .st_mode value. + */ + +static void +test_stat_mode(mode_t mask) +{ + struct stat st, fst; + int i, fd; + char spath[1024], dpath[1024]; + char *penv[] = {"TESTDIR", "TESTFILE0"}; + mode_t masked = 0777 & ~mask; + mode_t mode; + + /* + * Get the environment variable values. + */ + for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { + if ((penv[i] = getenv(penv[i])) == NULL) { + fprintf(stderr, "getenv(penv[%d])\n", i); + exit(1); + } + } + + umask(mask); + fd = open(penv[0], O_RDWR|O_TMPFILE, 0777); + if (fd == -1) { + perror("open"); + exit(2); + } + + if (fstat(fd, &fst) == -1) { + perror("fstat"); + close(fd); + exit(3); + } + + snprintf(spath, sizeof (spath), "/proc/self/fd/%d", fd); + snprintf(dpath, sizeof (dpath), "%s/%s", penv[0], penv[1]); + + unlink(dpath); + if (linkat(AT_FDCWD, spath, AT_FDCWD, dpath, AT_SYMLINK_FOLLOW) == -1) { + perror("linkat"); + close(fd); + exit(4); + } + close(fd); + + if (stat(dpath, &st) == -1) { + perror("stat"); + exit(5); + } + unlink(dpath); + + /* Verify fstat(2) result */ + mode = fst.st_mode & 0777; + if (mode != masked) { + fprintf(stderr, "fstat(2) %o != %o\n", mode, masked); + exit(6); + } + + /* Verify stat(2) result */ + mode = st.st_mode & 0777; + if (mode != masked) { + fprintf(stderr, "stat(2) %o != %o\n", mode, masked); + exit(7); + } +} + +int +main(int argc, char *argv[]) +{ + fprintf(stdout, "Verify stat(2) for O_TMPFILE file considers umask.\n"); + + test_stat_mode(0022); + test_stat_mode(0077); + + return (0); +} diff --git a/tests/zfs-tests/tests/functional/trim/Makefile.am b/tests/zfs-tests/tests/functional/trim/Makefile.am index 4f260a8e47..8917ed726e 100644 --- a/tests/zfs-tests/tests/functional/trim/Makefile.am +++ b/tests/zfs-tests/tests/functional/trim/Makefile.am @@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \ autotrim_config.ksh \ autotrim_trim_integrity.ksh \ trim_integrity.ksh \ - trim_config.ksh + 
trim_config.ksh \ + trim_l2arc.ksh diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index 6ce396a380..924b56935d 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -49,35 +49,41 @@ function cleanup log_must rm -f $TRIM_VDEVS - log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min - log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch - log_must set_tunable64 zfs_vdev_min_ms_count $vdev_min_ms_count + log_must set_tunable64 TRIM_EXTENT_BYTES_MIN $trim_extent_bytes_min + log_must set_tunable64 TRIM_TXG_BATCH $trim_txg_batch + log_must set_tunable64 VDEV_MIN_MS_COUNT $vdev_min_ms_count } log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. -typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min) -log_must set_tunable64 zfs_trim_extent_bytes_min 4096 +typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 -# Reduced zfs_trim_txg_batch to make trimming more frequent. -typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch) -log_must set_tunable64 zfs_trim_txg_batch 8 +# Reduced TRIM_TXG_BATCH to make trimming more frequent. +typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) +log_must set_tunable64 TRIM_TXG_BATCH 8 # Increased metaslabs to better simulate larger more realistic devices. 
-typeset vdev_min_ms_count=$(get_tunable zfs_vdev_min_ms_count) -log_must set_tunable64 zfs_vdev_min_ms_count 32 +typeset vdev_min_ms_count=$(get_tunable VDEV_MIN_MS_COUNT) +log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) -for type in "" "mirror" "raidz2"; do +for type in "" "mirror" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" - else + elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "draid" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # reserved for the distributed spare. + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS @@ -89,7 +95,7 @@ for type in "" "mirror" "raidz2"; do # Fill the pool, verify the vdevs are no longer sparse. file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R - verify_vdevs "-gt" "$VDEV_MAX_MB" $VDEVS + verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS # Remove the file, wait for trim, verify the vdevs are now sparse. 
log_must rm /$TESTPOOL/file diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh index c7b3da7c09..78fe18fa69 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh @@ -47,20 +47,20 @@ function cleanup log_must rm -f $TRIM_VDEVS - log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min - log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch + log_must set_tunable64 TRIM_EXTENT_BYTES_MIN $trim_extent_bytes_min + log_must set_tunable64 TRIM_TXG_BATCH $trim_txg_batch } log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. -typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min) -log_must set_tunable64 zfs_trim_extent_bytes_min 4096 +typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 -# Reduced zfs_trim_txg_batch to make trimming more frequent. -typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch) -log_must set_tunable64 zfs_trim_txg_batch 8 +# Reduced TRIM_TXG_BATCH to make trimming more frequent. 
+typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) +log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh index c0e850c48f..13c9b95e06 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh @@ -48,20 +48,20 @@ function cleanup log_must rm -f $TRIM_VDEVS - log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min - log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch + log_must set_tunable64 TRIM_EXTENT_BYTES_MIN $trim_extent_bytes_min + log_must set_tunable64 TRIM_TXG_BATCH $trim_txg_batch } log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. -typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min) -log_must set_tunable64 zfs_trim_extent_bytes_min 4096 +typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 -# Reduced zfs_trim_txg_batch to make trimming more frequent. -typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch) -log_must set_tunable64 zfs_trim_txg_batch 8 +# Reduced TRIM_TXG_BATCH to make trimming more frequent. 
+typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) +log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "raidz2" "draid" "draid2"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS @@ -77,8 +77,7 @@ for type in "" "mirror" "raidz" "raidz2" "raidz3"; do zpool sync if [[ $((n % 4)) -eq 0 ]]; then - log_must zpool trim $TESTPOOL - wait_trim $TESTPOOL $TRIM_VDEVS + log_must timeout 120 zpool trim -w $TESTPOOL fi done log_must du -hs /$TESTPOOL diff --git a/tests/zfs-tests/tests/functional/trim/setup.ksh b/tests/zfs-tests/tests/functional/trim/setup.ksh index cdcf038ad1..09489600b3 100755 --- a/tests/zfs-tests/tests/functional/trim/setup.ksh +++ b/tests/zfs-tests/tests/functional/trim/setup.ksh @@ -23,15 +23,21 @@ verify_runnable "global" -DISK1=${DISKS%% *} +if is_freebsd; then + log_unsupported "FreeBSD has no hole punching mechanism for the time being." + diskinfo -v $DISKS | grep -qE 'No.*# TRIM/UNMAP support' && + log_unsupported "DISKS do not support discard (TRIM/UNMAP)" +else + DISK1=${DISKS%% *} -typeset -i max_discard=0 -if [[ -b $DEV_RDSKDIR/$DISK1 ]]; then - max_discard=$(lsblk -Dbn $DEV_RDSKDIR/$DISK1 | awk '{ print $4; exit }') -fi + typeset -i max_discard=0 + if is_disk_device $DEV_RDSKDIR/$DISK1; then + max_discard=$(lsblk -Dbn $DEV_RDSKDIR/$DISK1 | awk '{ print $4; exit }') + fi -if test $max_discard -eq 0; then - log_unsupported "DISKS do not support discard (TRIM/UNMAP)" + if test $max_discard -eq 0; then + log_unsupported "DISKS do not support discard (TRIM/UNMAP)" + fi fi log_pass diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib index 02802d8c91..bede946a09 100644 --- a/tests/zfs-tests/tests/functional/trim/trim.kshlib +++ b/tests/zfs-tests/tests/functional/trim/trim.kshlib @@ -18,7 +18,7 @@ . 
$STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib # -# Get the actual on disk disk for the provided file. +# Get the actual size on disk for the provided file. # function get_size_mb { @@ -33,17 +33,18 @@ function get_trim_io { typeset pool="${1-:$TESTPOOL}" typeset type="${2-:ind}" + typeset vdev="${3}" typeset rval # Sum the ind or agg columns of the trim request size histogram. case "$type" in "ind") - rval=$(zpool iostat -pr $pool | awk \ + rval=$(zpool iostat -pr $pool $vdev | awk \ '$1 ~ /[0-9].*/ { sum += $12 } END { print sum }') echo -n "$rval" ;; "agg") - rval=$(zpool iostat -pr $pool | awk \ + rval=$(zpool iostat -pr $pool $vdev | awk \ '$1 ~ /[0-9].*/ { sum += $13 } END { print sum }') echo -n "$rval" ;; @@ -61,9 +62,10 @@ function verify_trim_io typeset pool="${1:-$TESTPOOL}" typeset type="${2:-ind}" typeset min_trim_ios=${3:-100} + typeset vdev="${4}" typeset ios - ios=$(get_trim_io $pool $type) + ios=$(get_trim_io $pool $type $vdev) if [[ $ios -ge $min_trim_ios ]]; then log_note "Issued $ios $type trim IOs for pool $pool" else @@ -118,37 +120,3 @@ function verify_vdevs # op size vdevs fi done } - -# -# Wait for up to 120 seconds for trimming of the listed vdevs to complete. -# -function wait_trim # pool vdevs -{ - typeset stop_time=$(( $(date +%s) + 120 )) - typeset pool="$1" - shift - typeset vdevs=$@ - typeset complete - - while [[ $complete -eq 0 ]]; do - complete=1 - - for vdev in $vdevs; do - if [[ "$(trim_progress $pool $vdev)" -lt "100" ]]; then - complete=0 - break - else - log_must eval "trim_prog_line $pool $vdev | \ - grep complete" - fi - done - - if [ "$(date +%s)" -ge $stop_time ]; then - log_fail "Exceeded trim time limit of 120s" - fi - - sleep 0.5 - done - - log_note "Pool completed trim successfully." 
-} diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh index e56bd6248f..9a6e19e1c0 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_config.ksh @@ -49,35 +49,41 @@ function cleanup log_must rm -f $TRIM_VDEVS - log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min - log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch - log_must set_tunable64 zfs_vdev_min_ms_count $vdev_min_ms_count + log_must set_tunable64 TRIM_EXTENT_BYTES_MIN $trim_extent_bytes_min + log_must set_tunable64 TRIM_TXG_BATCH $trim_txg_batch + log_must set_tunable64 VDEV_MIN_MS_COUNT $vdev_min_ms_count } log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. -typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min) -log_must set_tunable64 zfs_trim_extent_bytes_min 4096 +typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 -# Reduced zfs_trim_txg_batch to make trimming more frequent. -typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch) -log_must set_tunable64 zfs_trim_txg_batch 8 +# Reduced TRIM_TXG_BATCH to make trimming more frequent. +typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) +log_must set_tunable64 TRIM_TXG_BATCH 8 # Increased metaslabs to better simulate larger more realistic devices. 
-typeset vdev_min_ms_count=$(get_tunable zfs_vdev_min_ms_count) -log_must set_tunable64 zfs_vdev_min_ms_count 32 +typeset vdev_min_ms_count=$(get_tunable VDEV_MIN_MS_COUNT) +log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) -for type in "" "mirror" "raidz2"; do +for type in "" "mirror" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" - else + elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "draid" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # reserved for the distributed spare. + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS @@ -88,12 +94,11 @@ for type in "" "mirror" "raidz2"; do # Fill the pool, verify the vdevs are no longer sparse. file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R - verify_vdevs "-gt" "$VDEV_MAX_MB" $VDEVS + verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS # Remove the file, issue trim, verify the vdevs are now sparse. 
log_must rm /$TESTPOOL/file - log_must zpool trim $TESTPOOL - wait_trim $TESTPOOL $VDEVS + log_must timeout 120 zpool trim -w $TESTPOOL verify_vdevs "-le" "$VDEV_MIN_MB" $VDEVS log_must zpool destroy $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh index 0bbc439ee8..38f226d7f8 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh @@ -47,20 +47,20 @@ function cleanup log_must rm -f $TRIM_VDEVS - log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min - log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch + log_must set_tunable64 TRIM_EXTENT_BYTES_MIN $trim_extent_bytes_min + log_must set_tunable64 TRIM_TXG_BATCH $trim_txg_batch } log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. -typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min) -log_must set_tunable64 zfs_trim_extent_bytes_min 4096 +typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 -# Reduced zfs_trim_txg_batch to make trimming more frequent. -typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch) -log_must set_tunable64 zfs_trim_txg_batch 8 +# Reduced TRIM_TXG_BATCH to make trimming more frequent. 
+typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) +log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS @@ -76,8 +76,7 @@ for type in "" "mirror" "raidz" "raidz2" "raidz3"; do done log_must du -hs /$TESTPOOL - log_must zpool trim $TESTPOOL - wait_trim $TESTPOOL $TRIM_VDEVS + log_must timeout 120 zpool trim -w $TESTPOOL verify_trim_io $TESTPOOL "ind" 10 verify_pool $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh new file mode 100755 index 0000000000..ecf9f3424e --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.kshlib +. $STF_SUITE/tests/functional/trim/trim.cfg + +# +# DESCRIPTION: +# Verify trimming of L2ARC +# +# STRATEGY: +# 1. Set 'l2arc_trim_ahead = 1' and 'l2arc_write_max = 64MB'. +# 2. Create a pool on file vdevs to trim. +# 3. Verify the cache device was trimmed. +# 4. Fill the pool with a file larger than the L2ARC vdev. +# 5. Randomly read the previously written file long enough for the +# L2ARC vdev to be filled and overwritten 5 times. +# 6. Verify trim IOs of the expected type were issued for the pool. +# 7. Verify the allocated space on the cache device is less than +# its size. 
+# + +verify_runnable "global" + +log_assert "Trim of L2ARC succeeds." + +function cleanup +{ + if poolexists $TESTPOOL; then + destroy_pool $TESTPOOL + fi + + log_must rm -f $VDEVS + log_must set_tunable32 L2ARC_TRIM_AHEAD $l2arc_trimahead + log_must set_tunable32 L2ARC_WRITE_MAX $l2arc_writemax +} +log_onexit cleanup + +# The cache device $TRIM_VDEV2 has to be small enough, so that +# dev->l2ad_hand loops around and dev->l2ad_first=0. Otherwise +# l2arc_evict() exits before evicting/trimming. +typeset l2arc_trimahead=$(get_tunable L2ARC_TRIM_AHEAD) +typeset l2arc_writemax=$(get_tunable L2ARC_WRITE_MAX) +log_must set_tunable32 L2ARC_TRIM_AHEAD 1 +log_must set_tunable32 L2ARC_WRITE_MAX $((64 * 1024 * 1024)) +VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" +log_must truncate -s $((MINVDEVSIZE)) $TRIM_VDEV2 +log_must truncate -s $((4 * MINVDEVSIZE)) $TRIM_VDEV1 +typeset VDEV_MIN_MB=$((MINVDEVSIZE * 0.30 / 1024 / 1024)) + +log_must zpool create -f $TESTPOOL $TRIM_VDEV1 cache $TRIM_VDEV2 +verify_vdevs "-le" "$VDEV_MIN_MB" $TRIM_VDEV2 + +typeset fill_mb=$(( floor(2 * MINVDEVSIZE) )) +export DIRECTORY=/$TESTPOOL +export NUMJOBS=1 +export FILE_SIZE=${fill_mb} +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export RUNTIME=30 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 + +# Write to the pool. +log_must fio $FIO_SCRIPTS/mkfiles.fio + +# Read randomly from the pool to fill L2ARC. 
+export RUNTIME=30 +log_must fio $FIO_SCRIPTS/random_reads.fio + +export RUNTIME=1 +typeset do_once=true +while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do + typeset l2_size1=$(get_arcstat l2_size) + log_must fio $FIO_SCRIPTS/random_reads.fio + typeset l2_size2=$(get_arcstat l2_size) + do_once=false +done + +verify_trim_io $TESTPOOL "ind" 5 $TRIM_VDEV2 + +typeset cache_size=$(zpool list -vp | grep $TRIM_VDEV2 | awk '{print $2}') +typeset cache_alloc=$(zpool list -vp | grep $TRIM_VDEV2 | awk '{print $3}') + +log_must test $cache_alloc -lt $cache_size + +log_must zpool destroy $TESTPOOL +log_must rm -f $VDEVS + +log_pass "Trim of L2ARC succeeds." diff --git a/tests/zfs-tests/tests/functional/truncate/truncate_timestamps.ksh b/tests/zfs-tests/tests/functional/truncate/truncate_timestamps.ksh index c365c7415e..27b28e82eb 100755 --- a/tests/zfs-tests/tests/functional/truncate/truncate_timestamps.ksh +++ b/tests/zfs-tests/tests/functional/truncate/truncate_timestamps.ksh @@ -38,13 +38,23 @@ function verify_truncate #